static void validate_bpfpath_is_safe(const char *path) { if (path == NULL || strlen(path) == 0 || path[0] != '/') { die("valid_bpfpath_is_safe needs an absolute path as input"); } // strtok_r() modifies its first argument, so work on a copy char *tokenized SC_CLEANUP(sc_cleanup_string) = NULL; tokenized = strdup(path); if (tokenized == NULL) { die("cannot allocate memory for copy of path"); } // allocate a string large enough to hold path, and initialize it to // '/' size_t checked_path_size = strlen(path) + 1; char *checked_path SC_CLEANUP(sc_cleanup_string) = NULL; checked_path = calloc(checked_path_size, 1); if (checked_path == NULL) { die("cannot allocate memory for checked_path"); } checked_path[0] = '/'; checked_path[1] = '\0'; // validate '/' validate_path_has_strict_perms(checked_path); // strtok_r needs a pointer to keep track of where it is in the // string. char *buf_saveptr = NULL; // reconstruct the path from '/' down to profile_name char *buf_token = strtok_r(tokenized, "/", &buf_saveptr); while (buf_token != NULL) { char *prev SC_CLEANUP(sc_cleanup_string) = NULL; prev = strdup(checked_path); // needed by vsnprintf in sc_must_snprintf if (prev == NULL) { die("cannot allocate memory for copy of checked_path"); } // append '<buf_token>' if checked_path is '/', otherwise '/<buf_token>' if (strlen(checked_path) == 1) { sc_must_snprintf(checked_path, checked_path_size, "%s%s", prev, buf_token); } else { sc_must_snprintf(checked_path, checked_path_size, "%s/%s", prev, buf_token); } validate_path_has_strict_perms(checked_path); buf_token = strtok_r(NULL, "/", &buf_saveptr); } }
static bool should_discard_current_ns(dev_t base_snap_dev) { // Inspect the namespace and check if we should discard it. // // The namespace may become "stale" when the rootfs is not the same // device we found above. This will happen whenever the base snap is // refreshed since the namespace was first created. struct sc_mountinfo_entry *mie; struct sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; mi = sc_parse_mountinfo(NULL); if (mi == NULL) { die("cannot parse mountinfo of the current process"); } for (mie = sc_first_mountinfo_entry(mi); mie != NULL; mie = sc_next_mountinfo_entry(mie)) { if (!sc_streq(mie->mount_dir, "/")) { continue; } // NOTE: we want the initial rootfs just in case overmount // was used to do something weird. The initial rootfs was // set up by snap-confine and that is the one we want to // measure. debug("found root filesystem inside the mount namespace %d:%d", mie->dev_major, mie->dev_minor); return base_snap_dev != MKDEV(mie->dev_major, mie->dev_minor); } die("cannot find mount entry of the root filesystem inside snap namespace"); }
static dev_t find_base_snap_device(const char *base_snap_name, const char *base_snap_rev) { // Find the backing device of the base snap. // TODO: add support for "try mode" base snaps that also need // consideration of the mie->root component. dev_t base_snap_dev = 0; char base_squashfs_path[PATH_MAX]; sc_must_snprintf(base_squashfs_path, sizeof base_squashfs_path, "%s/%s/%s", SNAP_MOUNT_DIR, base_snap_name, base_snap_rev); struct sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; mi = sc_parse_mountinfo(NULL); if (mi == NULL) { die("cannot parse mountinfo of the current process"); } bool found = false; for (struct sc_mountinfo_entry * mie = sc_first_mountinfo_entry(mi); mie != NULL; mie = sc_next_mountinfo_entry(mie)) { if (sc_streq(mie->mount_dir, base_squashfs_path)) { base_snap_dev = MKDEV(mie->dev_major, mie->dev_minor); debug("found base snap filesystem device %d:%d", mie->dev_major, mie->dev_minor); // Don't break when found, we are interested in the last // entry as this is the "effective" one. found = true; } } if (!found) { die("cannot find device backing the base snap %s", base_snap_name); } return base_snap_dev; }
int sc_nonfatal_mkpath(const char *const path, mode_t mode) { // If asked to create an empty path, return immediately. if (strlen(path) == 0) { return 0; } // We're going to use strtok_r, which needs to modify the path, so we'll // make a copy of it. char *path_copy SC_CLEANUP(sc_cleanup_string) = NULL; path_copy = strdup(path); if (path_copy == NULL) { return -1; } // Open flags to use while we walk the user data path: // - Don't follow symlinks // - Don't allow child access to file descriptor // - Only open a directory (fail otherwise) const int open_flags = O_NOFOLLOW | O_CLOEXEC | O_DIRECTORY; // We're going to create each path segment via openat/mkdirat calls instead // of mkdir calls, to avoid following symlinks and placing the user data // directory somewhere we never intended for it to go. The first step is to // get an initial file descriptor. int fd SC_CLEANUP(sc_cleanup_close) = AT_FDCWD; if (path_copy[0] == '/') { fd = open("/", open_flags); if (fd < 0) { return -1; } } // strtok_r needs a pointer to keep track of where it is in the string. char *path_walker = NULL; // Initialize tokenizer and obtain first path segment. char *path_segment = strtok_r(path_copy, "/", &path_walker); while (path_segment) { // Try to create the directory. It's okay if it already existed, but // return with error on any other error. Reset errno before attempting // this as it may stay stale (errno is not reset if mkdirat(2) returns // successfully). errno = 0; if (mkdirat(fd, path_segment, mode) < 0 && errno != EEXIST) { return -1; } // Open the parent directory we just made (and close the previous one // (but not the special value AT_FDCWD) so we can continue down the // path. int previous_fd = fd; fd = openat(fd, path_segment, open_flags); if (previous_fd != AT_FDCWD && close(previous_fd) != 0) { return -1; } if (fd < 0) { return -1; } // Obtain the next path segment. path_segment = strtok_r(NULL, "/", &path_walker); } return 0; }
/** * Create a writable mimic directory based on reference directory. * * The mimic directory is a tmpfs populated with bind mounts to the (possibly * read only) directories in the reference directory. While all the read-only * content stays read-only the actual mimic directory is writable so additional * content can be placed there. * * Flags are forwarded to sc_quirk_mkdir_bind() **/ static void sc_quirk_create_writable_mimic(const char *mimic_dir, const char *ref_dir, unsigned flags) { debug("creating writable mimic directory %s based on %s", mimic_dir, ref_dir); sc_quirk_setup_tmpfs(mimic_dir); // Now copy the ownership and permissions of the mimicked directory struct stat stat_buf; if (stat(ref_dir, &stat_buf) < 0) { die("cannot stat %s", ref_dir); } if (chown(mimic_dir, stat_buf.st_uid, stat_buf.st_gid) < 0) { die("cannot chown for %s", mimic_dir); } if (chmod(mimic_dir, stat_buf.st_mode) < 0) { die("cannot chmod for %s", mimic_dir); } debug("bind-mounting all the files from the reference directory"); DIR *dirp SC_CLEANUP(sc_cleanup_closedir) = NULL; dirp = opendir(ref_dir); if (dirp == NULL) { die("cannot open reference directory %s", ref_dir); } struct dirent *entryp = NULL; do { char src_name[PATH_MAX * 2] = { 0 }; char dest_name[PATH_MAX * 2] = { 0 }; // Set errno to zero, if readdir fails it will not only return null but // set errno to a non-zero value. This is how we can differentiate // end-of-directory from an actual error. 
errno = 0; entryp = readdir(dirp); if (entryp == NULL && errno != 0) { die("cannot read another directory entry"); } if (entryp == NULL) { break; } if (strcmp(entryp->d_name, ".") == 0 || strcmp(entryp->d_name, "..") == 0) { continue; } if (entryp->d_type != DT_DIR && entryp->d_type != DT_REG) { die("unsupported entry type of file %s (%d)", entryp->d_name, entryp->d_type); } sc_must_snprintf(src_name, sizeof src_name, "%s/%s", ref_dir, entryp->d_name); sc_must_snprintf(dest_name, sizeof dest_name, "%s/%s", mimic_dir, entryp->d_name); sc_quirk_mkdir_bind(src_name, dest_name, flags); } while (entryp != NULL); }
struct sc_mountinfo *sc_parse_mountinfo(const char *fname) { struct sc_mountinfo *info = calloc(1, sizeof *info); if (info == NULL) { return NULL; } if (fname == NULL) { fname = "/proc/self/mountinfo"; } FILE *f SC_CLEANUP(sc_cleanup_file) = NULL; f = fopen(fname, "rt"); if (f == NULL) { free(info); return NULL; } char *line SC_CLEANUP(sc_cleanup_string) = NULL; size_t line_size = 0; struct sc_mountinfo_entry *entry, *last = NULL; for (;;) { errno = 0; if (getline(&line, &line_size, f) == -1) { if (errno != 0) { sc_free_mountinfo(info); return NULL; } break; }; entry = sc_parse_mountinfo_entry(line); if (entry == NULL) { sc_free_mountinfo(info); return NULL; } if (last != NULL) { last->next = entry; } else { info->first = entry; } last = entry; } return info; }
sc_distro sc_classify_distro(void) { FILE *f SC_CLEANUP(sc_cleanup_file) = fopen(os_release, "r"); if (f == NULL) { return SC_DISTRO_CLASSIC; } bool is_core = false; int core_version = 0; char buf[255] = { 0 }; while (fgets(buf, sizeof buf, f) != NULL) { size_t len = strlen(buf); if (len > 0 && buf[len - 1] == '\n') { buf[len - 1] = '\0'; } if (sc_streq(buf, "ID=\"ubuntu-core\"") || sc_streq(buf, "ID=ubuntu-core")) { is_core = true; } else if (sc_streq(buf, "VERSION_ID=\"16\"") || sc_streq(buf, "VERSION_ID=16")) { core_version = 16; } else if (sc_streq(buf, "VARIANT_ID=\"snappy\"") || sc_streq(buf, "VARIANT_ID=snappy")) { is_core = true; } } if (!is_core) { /* Since classic systems don't have a /meta/snap.yaml file the simple presence of that file qualifies as SC_DISTRO_CORE_OTHER. */ if (access(meta_snap_yaml, F_OK) == 0) { is_core = true; } } if (is_core) { if (core_version == 16) { return SC_DISTRO_CORE16; } return SC_DISTRO_CORE_OTHER; } else { return SC_DISTRO_CLASSIC; } }
/** * Read /proc/self/mountinfo and check if /run/snapd/ns is a private bind mount. * * We do this because /run/snapd/ns cannot be shared with any other peers as per: * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt **/ static bool sc_is_ns_group_dir_private(void) { struct sc_mountinfo *info SC_CLEANUP(sc_cleanup_mountinfo) = NULL; info = sc_parse_mountinfo(NULL); if (info == NULL) { die("cannot parse /proc/self/mountinfo"); } struct sc_mountinfo_entry *entry = sc_first_mountinfo_entry(info); while (entry != NULL) { const char *mount_dir = entry->mount_dir; const char *optional_fields = entry->optional_fields; if (strcmp(mount_dir, sc_ns_dir) == 0 && strcmp(optional_fields, "") == 0) { // If /run/snapd/ns has no optional fields, we know it is mounted // private and there is nothing else to do. return true; } entry = sc_next_mountinfo_entry(entry); } return false; }
/**
 * Join the preserved mount namespace of a namespace group, or start a new one.
 *
 * The file "<group->name><SC_NS_MNT_FILE>" is opened (and created if needed)
 * relative to group->dir_fd and its filesystem type is inspected:
 *
 * - If it lives on nsfs or procfs it is a preserved mount namespace. It is
 *   passed to sc_inspect_and_maybe_discard_stale_ns(); when that returns
 *   EAGAIN this function returns EAGAIN as well. Otherwise the process
 *   enters the namespace with setns() and tries to return to its previous
 *   working directory, falling back to SC_VOID_DIR.
 *
 * - Otherwise a new namespace is set up. An eventfd is stored in
 *   group->event_fd and a helper process is forked. The helper waits on the
 *   eventfd and then bind-mounts /proc/<parent>/ns/mnt over the namespace
 *   file. The parent records the helper in group->child, unshares its mount
 *   namespace and sets group->should_populate.
 *
 * Returns 0 on success or EAGAIN as described above. Every other failure is
 * fatal (die).
 **/
int sc_create_or_join_ns_group(struct sc_ns_group *group, struct sc_apparmor *apparmor, const char *base_snap_name, const char *snap_name) {
    // Open the mount namespace file.
    char mnt_fname[PATH_MAX] = { 0 };
    sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name, SC_NS_MNT_FILE);
    int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
    // NOTE: There is no O_EXCL here because the file can be around but
    // doesn't have to be a mounted namespace.
    //
    // If the mounted namespace is discarded with
    // sc_discard_preserved_ns_group() it will revert to a regular file. If
    // snap-confine is killed for whatever reason after the file is created
    // but before the file is bind-mounted it will also be a regular file.
    mnt_fd = openat(group->dir_fd, mnt_fname, O_CREAT | O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
    if (mnt_fd < 0) {
        die("cannot open mount namespace file for namespace group %s", group->name);
    }
    // Check if we got an nsfs-based or procfs file or a regular file. This
    // can be reliably tested because nsfs has a unique filesystem type,
    // NSFS_MAGIC. On older kernels that don't support nsfs yet we can look
    // for PROC_SUPER_MAGIC instead.
    // We can just ensure that this is the case thanks to fstatfs.
    struct statfs ns_statfs_buf;
    if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
        die("cannot perform fstatfs() on the mount namespace file descriptor");
    }
    // Stat the mount namespace as well. The original intent was to use this
    // to check if the namespace is used by other processes when considering
    // discarding a stale namespace.
    // NOTE(review): ns_stat_buf is not read anywhere else in this function;
    // only the success of fstat() matters here.
    struct stat ns_stat_buf;
    if (fstat(mnt_fd, &ns_stat_buf) < 0) {
        die("cannot perform fstat() on the mount namespace file descriptor");
    }
#ifndef NSFS_MAGIC
// Account for kernel headers old enough to not know about NSFS_MAGIC.
#define NSFS_MAGIC 0x6e736673
#endif
    if (ns_statfs_buf.f_type == NSFS_MAGIC || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
        // Inspect and perhaps discard the preserved mount namespace. EAGAIN
        // is passed straight back to the caller.
        if (sc_inspect_and_maybe_discard_stale_ns (mnt_fd, snap_name, base_snap_name) == EAGAIN) {
            return EAGAIN;
        }
        // Remember the vanilla working directory so that we may attempt to
        // restore it later.
        char *vanilla_cwd SC_CLEANUP(sc_cleanup_string) = NULL;
        vanilla_cwd = get_current_dir_name();
        if (vanilla_cwd == NULL) {
            die("cannot get the current working directory");
        }
        // Move to the mount namespace of the snap we're trying to start.
        debug ("attempting to re-associate the mount namespace with the namespace group %s", group->name);
        if (setns(mnt_fd, CLONE_NEWNS) < 0) {
            die("cannot re-associate the mount namespace with namespace group %s", group->name);
        }
        debug ("successfully re-associated the mount namespace with the namespace group %s", group->name);
        // Try to re-locate back to vanilla working directory. This can fail
        // because that directory is no longer present.
        if (chdir(vanilla_cwd) != 0) {
            debug ("cannot remain in %s, moving to the void directory", vanilla_cwd);
            if (chdir(SC_VOID_DIR) != 0) {
                die("cannot change directory to %s", SC_VOID_DIR);
            }
            debug("successfully moved to %s", SC_VOID_DIR);
        }
        return 0;
    }
    debug("initializing new namespace group %s", group->name);
    // Create a new namespace and ask the caller to populate it.
    // For rationale of forking see this:
    // https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
    //
    // The eventfd created here is used to synchronize the child and the
    // parent processes. It effectively tells the child to perform the
    // capture operation.
    group->event_fd = eventfd(0, EFD_CLOEXEC);
    if (group->event_fd < 0) {
        die("cannot create eventfd for mount namespace capture");
    }
    debug("forking support process for mount namespace capture");
    // Store the PID of the "parent" process. This is done instead of calls
    // to getppid() because then we can reliably track the PID of the parent
    // even if the child process is re-parented.
    pid_t parent = getpid();
    // Glibc defines pid as a signed 32bit integer. There's no standard way
    // to print pid's portably so this is the best we can do.
    pid_t pid = fork();
    // NOTE(review): this message is emitted before the failure check and in
    // both processes, so it can report -1 or 0. If debug() is able to modify
    // errno, the die() below may describe the wrong error - confirm and
    // consider checking pid < 0 first.
    debug("forked support process has pid %d", (int)pid);
    if (pid < 0) {
        die("cannot fork support process for mount namespace capture");
    }
    if (pid == 0) {
        // This is the child process which will capture the mount namespace.
        //
        // It will do so by bind-mounting the SC_NS_MNT_FILE after the parent
        // process calls unshare() and finishes setting up the namespace
        // completely.
        // Change the hat to a sub-profile that has limited permissions
        // necessary to accomplish the capture of the mount namespace.
        debug ("changing apparmor hat of the support process for mount namespace capture");
        sc_maybe_aa_change_hat(apparmor, "mount-namespace-capture-helper", 0);
        // Configure the child to die as soon as the parent dies. In an odd
        // case where the parent is killed then we don't want to complete our
        // task or wait for anything.
        if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
            die("cannot set parent process death notification signal to SIGINT");
        }
        // Check that parent process is still alive. If this is the case then
        // we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to
        // wake us up from eventfd_read() below. In the rare case that the
        // PID numbers overflow and the now-dead parent PID is recycled we
        // will still hang forever on the read from eventfd below.
        debug("ensuring that parent process is still alive");
        if (kill(parent, 0) < 0) {
            switch (errno) {
            case ESRCH:
                // The parent is gone, there is nothing left to capture.
                debug("parent process has already terminated");
                abort();
            default:
                die("cannot ensure that parent process is still alive");
                break;
            }
        }
        // Move into the namespace group directory so that the relative
        // destination name used for the bind mount below resolves there.
        if (fchdir(group->dir_fd) < 0) {
            die("cannot move process for mount namespace capture to namespace group directory");
        }
        debug ("waiting for a eventfd data from the parent process to continue");
        eventfd_t value = 0;
        // The blocking read is bracketed by the sanity timeout helpers.
        sc_enable_sanity_timeout();
        if (eventfd_read(group->event_fd, &value) < 0) {
            die("cannot read expected data from eventfd");
        }
        sc_disable_sanity_timeout();
        debug ("capturing mount namespace of process %d in namespace group %s", (int)parent, group->name);
        char src[PATH_MAX] = { 0 };
        char dst[PATH_MAX] = { 0 };
        sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent);
        sc_must_snprintf(dst, sizeof dst, "%s%s", group->name, SC_NS_MNT_FILE);
        if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
            die("cannot bind-mount the mount namespace file %s -> %s", src, dst);
        }
        debug ("successfully captured mount namespace in namespace group %s", group->name);
        // The helper never returns to the caller.
        exit(0);
    } else {
        // Parent: remember the helper so that it can be dealt with later.
        group->child = pid;
        // Unshare the mount namespace and set a flag instructing the caller
        // that the namespace is pristine and needs to be populated now.
        debug("unsharing the mount namespace");
        if (unshare(CLONE_NEWNS) < 0) {
            die("cannot unshare the mount namespace");
        }
        group->should_populate = true;
    }
    return 0;
}
bool sc_cgroup_freezer_occupied(const char *snap_name) { // Format the name of the cgroup hierarchy. char buf[PATH_MAX] = { 0 }; sc_must_snprintf(buf, sizeof buf, "snap.%s", snap_name); // Open the freezer cgroup directory. int cgroup_fd SC_CLEANUP(sc_cleanup_close) = -1; cgroup_fd = open(freezer_cgroup_dir, O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC); if (cgroup_fd < 0) { die("cannot open freezer cgroup (%s)", freezer_cgroup_dir); } // Open the proc directory. int proc_fd SC_CLEANUP(sc_cleanup_close) = -1; proc_fd = open("/proc", O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC); if (proc_fd < 0) { die("cannot open /proc"); } // Open the hierarchy directory for the given snap. int hierarchy_fd SC_CLEANUP(sc_cleanup_close) = -1; hierarchy_fd = openat(cgroup_fd, buf, O_PATH | O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC); if (hierarchy_fd < 0) { if (errno == ENOENT) { return false; } die("cannot open freezer cgroup hierarchy for snap %s", snap_name); } // Open the "cgroup.procs" file. Alternatively we could open the "tasks" // file and see per-thread data but we don't need that. int cgroup_procs_fd SC_CLEANUP(sc_cleanup_close) = -1; cgroup_procs_fd = openat(hierarchy_fd, "cgroup.procs", O_RDONLY | O_NOFOLLOW | O_CLOEXEC); if (cgroup_procs_fd < 0) { die("cannot open cgroup.procs file for freezer cgroup hierarchy for snap %s", snap_name); } FILE *cgroup_procs SC_CLEANUP(sc_cleanup_file) = NULL; cgroup_procs = fdopen(cgroup_procs_fd, "r"); if (cgroup_procs == NULL) { die("cannot convert tasks file descriptor to FILE"); } cgroup_procs_fd = -1; // cgroup_procs_fd will now be closed by fclose. 
char *line_buf SC_CLEANUP(sc_cleanup_string) = NULL; size_t line_buf_size = 0; ssize_t num_read; struct stat statbuf; do { num_read = getline(&line_buf, &line_buf_size, cgroup_procs); if (num_read < 0 && errno != 0) { die("cannot read next PID belonging to snap %s", snap_name); } if (num_read <= 0) { break; } else { if (line_buf[num_read - 1] == '\n') { line_buf[num_read - 1] = '\0'; } else { die("could not find newline in cgroup.procs"); } } debug("found process id: %s\n", line_buf); if (fstatat(proc_fd, line_buf, &statbuf, AT_SYMLINK_NOFOLLOW) < 0) { // The process may have died already. if (errno != ENOENT) { die("cannot stat /proc/%s", line_buf); } } debug("found process %s belonging to user %d", line_buf, statbuf.st_uid); return true; } while (num_read > 0); return false; }