static bool should_discard_current_ns(dev_t base_snap_dev) { // Inspect the namespace and check if we should discard it. // // The namespace may become "stale" when the rootfs is not the same // device we found above. This will happen whenever the base snap is // refreshed since the namespace was first created. struct sc_mountinfo_entry *mie; struct sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; mi = sc_parse_mountinfo(NULL); if (mi == NULL) { die("cannot parse mountinfo of the current process"); } for (mie = sc_first_mountinfo_entry(mi); mie != NULL; mie = sc_next_mountinfo_entry(mie)) { if (!sc_streq(mie->mount_dir, "/")) { continue; } // NOTE: we want the initial rootfs just in case overmount // was used to do something weird. The initial rootfs was // set up by snap-confine and that is the one we want to // measure. debug("found root filesystem inside the mount namespace %d:%d", mie->dev_major, mie->dev_minor); return base_snap_dev != MKDEV(mie->dev_major, mie->dev_minor); } die("cannot find mount entry of the root filesystem inside snap namespace"); }
static dev_t find_base_snap_device(const char *base_snap_name, const char *base_snap_rev) { // Find the backing device of the base snap. // TODO: add support for "try mode" base snaps that also need // consideration of the mie->root component. dev_t base_snap_dev = 0; char base_squashfs_path[PATH_MAX]; sc_must_snprintf(base_squashfs_path, sizeof base_squashfs_path, "%s/%s/%s", SNAP_MOUNT_DIR, base_snap_name, base_snap_rev); struct sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; mi = sc_parse_mountinfo(NULL); if (mi == NULL) { die("cannot parse mountinfo of the current process"); } bool found = false; for (struct sc_mountinfo_entry * mie = sc_first_mountinfo_entry(mi); mie != NULL; mie = sc_next_mountinfo_entry(mie)) { if (sc_streq(mie->mount_dir, base_squashfs_path)) { base_snap_dev = MKDEV(mie->dev_major, mie->dev_minor); debug("found base snap filesystem device %d:%d", mie->dev_major, mie->dev_minor); // Don't break when found, we are interested in the last // entry as this is the "effective" one. found = true; } } if (!found) { die("cannot find device backing the base snap %s", base_snap_name); } return base_snap_dev; }
// Unit test for sc_streq, grouped by the kind of input being compared.
static void test_sc_streq()
{
	// Identical contents compare equal, including two empty strings.
	g_assert_true(sc_streq("", ""));
	g_assert_true(sc_streq("text", "text"));

	// Different contents never compare equal, whether the lengths
	// match or either side is the longer one.
	g_assert_false(sc_streq("foo", "bar"));
	g_assert_false(sc_streq("foofoo", "bar"));
	g_assert_false(sc_streq("foo", "barbar"));

	// A NULL on either side (or both) is expected to compare unequal.
	g_assert_false(sc_streq("text", NULL));
	g_assert_false(sc_streq(NULL, "text"));
	g_assert_false(sc_streq(NULL, NULL));
}
sc_distro sc_classify_distro(void) { FILE *f SC_CLEANUP(sc_cleanup_file) = fopen(os_release, "r"); if (f == NULL) { return SC_DISTRO_CLASSIC; } bool is_core = false; int core_version = 0; char buf[255] = { 0 }; while (fgets(buf, sizeof buf, f) != NULL) { size_t len = strlen(buf); if (len > 0 && buf[len - 1] == '\n') { buf[len - 1] = '\0'; } if (sc_streq(buf, "ID=\"ubuntu-core\"") || sc_streq(buf, "ID=ubuntu-core")) { is_core = true; } else if (sc_streq(buf, "VERSION_ID=\"16\"") || sc_streq(buf, "VERSION_ID=16")) { core_version = 16; } else if (sc_streq(buf, "VARIANT_ID=\"snappy\"") || sc_streq(buf, "VARIANT_ID=snappy")) { is_core = true; } } if (!is_core) { /* Since classic systems don't have a /meta/snap.yaml file the simple presence of that file qualifies as SC_DISTRO_CORE_OTHER. */ if (access(meta_snap_yaml, F_OK) == 0) { is_core = true; } } if (is_core) { if (core_version == 16) { return SC_DISTRO_CORE16; } return SC_DISTRO_CORE_OTHER; } else { return SC_DISTRO_CLASSIC; } }
int sc_apply_seccomp_bpf(const char *filter_profile) { debug("loading bpf program for security tag %s", filter_profile); char profile_path[PATH_MAX] = { 0 }; sc_must_snprintf(profile_path, sizeof(profile_path), "%s/%s.bin", filter_profile_dir, filter_profile); // Wait some time for the security profile to show up. When // the system boots snapd will created security profiles, but // a service snap (e.g. network-manager) starts in parallel with // snapd so for such snaps, the profiles may not be generated // yet long max_wait = 120; const char *MAX_PROFILE_WAIT = getenv("SNAP_CONFINE_MAX_PROFILE_WAIT"); if (MAX_PROFILE_WAIT != NULL) { char *endptr = NULL; errno = 0; long env_max_wait = strtol(MAX_PROFILE_WAIT, &endptr, 10); if (errno != 0 || MAX_PROFILE_WAIT == endptr || *endptr != '\0' || env_max_wait <= 0) { die("SNAP_CONFINE_MAX_PROFILE_WAIT invalid"); } max_wait = env_max_wait > 0 ? env_max_wait : max_wait; } if (max_wait > 3600) { max_wait = 3600; } for (long i = 0; i < max_wait; ++i) { if (access(profile_path, F_OK) == 0) { break; } sleep(1); } // validate '/' down to profile_path are root-owned and not // 'other' writable to avoid possibility of privilege // escalation via bpf program load when paths are incorrectly // set on the system. 
validate_bpfpath_is_safe(profile_path); // load bpf char bpf[MAX_BPF_SIZE + 1] = { 0 }; // account for EOF FILE *fp = fopen(profile_path, "rb"); if (fp == NULL) { die("cannot read %s", profile_path); } // set 'size' to 1 to get bytes transferred size_t num_read = fread(bpf, 1, sizeof(bpf), fp); if (ferror(fp) != 0) { die("cannot read seccomp profile %s", profile_path); } else if (feof(fp) == 0) { die("seccomp profile %s exceeds %zu bytes", profile_path, sizeof(bpf)); } fclose(fp); debug("read %zu bytes from %s", num_read, profile_path); if (sc_streq(bpf, "@unrestricted\n")) { return 0; } uid_t real_uid, effective_uid, saved_uid; if (getresuid(&real_uid, &effective_uid, &saved_uid) < 0) { die("cannot call getresuid"); } // If we can, raise privileges so that we can load the BPF into the // kernel via 'prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, ...)'. debug("raising privileges to load seccomp profile"); if (effective_uid != 0 && saved_uid == 0) { if (seteuid(0) != 0) { die("seteuid failed"); } if (geteuid() != 0) { die("raising privs before seccomp_load did not work"); } } // Load filter into the kernel. Importantly we are // intentionally *not* setting NO_NEW_PRIVS because it // interferes with exec transitions in AppArmor with certain // snappy interfaces. Not setting NO_NEW_PRIVS does mean that // applications can adjust their sandbox if they have // CAP_SYS_ADMIN or, if running on < 4.8 kernels, break out of // the seccomp via ptrace. Both CAP_SYS_ADMIN and 'ptrace // (trace)' are blocked by AppArmor with typical snappy // interfaces. 
struct sock_fprog prog = { .len = num_read / sizeof(struct sock_filter), .filter = (struct sock_filter *)bpf, }; if (seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, &prog) != 0) { if (errno == ENOSYS) { debug("kernel doesn't support the seccomp(2) syscall"); } else if (errno == EINVAL) { debug ("kernel may not support the SECCOMP_FILTER_FLAG_LOG flag"); } debug ("falling back to prctl(2) syscall to load seccomp filter"); if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) { die("cannot apply seccomp profile"); } } // drop privileges again debug("dropping privileges after loading seccomp profile"); if (geteuid() == 0) { unsigned real_uid = getuid(); if (seteuid(real_uid) != 0) { die("seteuid failed"); } if (real_uid != 0 && geteuid() == 0) { die("dropping privs after seccomp_load did not work"); } } return 0; }
// Report whether normal mode should be used.
//
// Normal mode applies in every case except one: the distribution was
// classified as SC_DISTRO_CORE16 and the base snap is named "core".
bool sc_should_use_normal_mode(sc_distro distro, const char *base_snap_name)
{
	const bool core16_on_core_base =
	    distro == SC_DISTRO_CORE16 && sc_streq(base_snap_name, "core");
	return !core16_on_core_base;
}