// IPC isolation on Linux just requires that a process be placed in an IPC // namespace. Neither /proc, nor any of the special SVIPC filesystem need // to be remounted for this to work. IPC namespaces are disjoint. That is, // once you enter an IPC namespace, IPC objects from the host namespace are // no longer visible (and vice versa). Since IPC namespaces do not nest, // we always place nested containers into the IPC namespace of the parent // container. That is, containers in the same group share an IPC namespace, // but groups are isolated from each other. Future<Option<ContainerLaunchInfo>> NamespacesIPCIsolatorProcess::prepare( const ContainerID& containerId, const ContainerConfig& containerConfig) { ContainerLaunchInfo launchInfo; if (containerId.has_parent()) { launchInfo.add_enter_namespaces(CLONE_NEWIPC); } else { launchInfo.add_clone_namespaces(CLONE_NEWIPC); } return launchInfo; }
Option<Error> validateContainerId(const ContainerID& containerId) { // Slashes are disallowed as these IDs are mapped to directories. // // Periods are disallowed because our string representation of // ContainerID uses periods: <uuid>.<child>.<grandchild>. // For example: <uuid>.redis.backup // // Spaces are disallowed as they can render logs confusing and // need escaping on terminals when dealing with paths. // // TODO(bmahler): Add common/validation.hpp to share ID validation. // Note that this however is slightly stricter than other IDs in // that we do not allow periods or spaces. auto invalidCharacter = [](char c) { return iscntrl(c) || c == os::POSIX_PATH_SEPARATOR || c == os::WINDOWS_PATH_SEPARATOR || c == '.' || c == ' '; }; const string& id = containerId.value(); if (id.empty()) { return Error("'ContainerID.value' must be non-empty"); } if (std::any_of(id.begin(), id.end(), invalidCharacter)) { return Error("'ContainerID.value' '" + id + "'" " contains invalid characters"); } // TODO(bmahler): Print the invalid field nicely within the error // (e.g. 'parent.parent.parent.value'). For now we only have one // level of nesting so it's ok. if (containerId.has_parent()) { Option<Error> parentError = validateContainerId(containerId.parent()); if (parentError.isSome()) { return Error("'ContainerID.parent' is invalid: " + parentError->message); } } return None(); }
Option<Error> validateContainerId(const ContainerID& containerId) { const string& id = containerId.value(); // Check common Mesos ID rules. Option<Error> error = common::validation::validateID(id); if (error.isSome()) { return Error(error->message); } // Check ContainerID specific rules. // // Periods are disallowed because our string representation of // ContainerID uses periods: <uuid>.<child>.<grandchild>. // For example: <uuid>.redis.backup // // Spaces are disallowed as they can render logs confusing and // need escaping on terminals when dealing with paths. auto invalidCharacter = [](char c) { return c == '.' || c == ' '; }; if (std::any_of(id.begin(), id.end(), invalidCharacter)) { return Error("'ContainerID.value' '" + id + "'" " contains invalid characters"); } // TODO(bmahler): Print the invalid field nicely within the error // (e.g. 'parent.parent.parent.value'). For now we only have one // level of nesting so it's ok. if (containerId.has_parent()) { Option<Error> parentError = validateContainerId(containerId.parent()); if (parentError.isSome()) { return Error("'ContainerID.parent' is invalid: " + parentError->message); } } return None(); }
Future<bool> launch( const ContainerID& containerId, const ContainerConfig& containerConfig, const map<string, string>& environment, const Option<string>& pidCheckpointPath) { CHECK(!terminatedContainers.contains(containerId)) << "Failed to launch nested container " << containerId << " for executor '" << containerConfig.executor_info().executor_id() << "' of framework " << containerConfig.executor_info().framework_id() << " because this ContainerID is being re-used with" << " a previously terminated container"; CHECK(!containers_.contains(containerId)) << "Failed to launch container " << containerId << " for executor '" << containerConfig.executor_info().executor_id() << "' of framework " << containerConfig.executor_info().framework_id() << " because it is already launched"; containers_[containerId] = Owned<ContainerData>(new ContainerData()); if (containerId.has_parent()) { // Launching a nested container via the test containerizer is a // no-op for now. return true; } CHECK(executors.contains(containerConfig.executor_info().executor_id())) << "Failed to launch executor '" << containerConfig.executor_info().executor_id() << "' of framework " << containerConfig.executor_info().framework_id() << " because it is unknown to the containerizer"; containers_.at(containerId)->executorId = containerConfig.executor_info().executor_id(); containers_.at(containerId)->frameworkId = containerConfig.executor_info().framework_id(); // We need to synchronize all reads and writes to the environment // as this is global state. // // TODO(jmlvanre): Even this is not sufficient, as other aspects // of the code may read an environment variable while we are // manipulating it. The better solution is to pass the environment // variables into the fork, or to set them on the command line. // See MESOS-3475. 
static std::mutex mutex; synchronized(mutex) { // Since the constructor for `MesosExecutorDriver` reads // environment variables to load flags, even it needs to // be within this synchronization section. // // Prepare additional environment variables for the executor. // TODO(benh): Need to get flags passed into the TestContainerizer // in order to properly use here. slave::Flags flags; flags.recovery_timeout = Duration::zero(); // We need to save the original set of environment variables so we // can reset the environment after calling 'driver->start()' below. hashmap<string, string> original = os::environment(); foreachpair (const string& name, const string& variable, environment) { os::setenv(name, variable); } // TODO(benh): Can this be removed and done exclusively in the // 'executorEnvironment()' function? There are other places in the // code where we do this as well and it's likely we can do this once // in 'executorEnvironment()'. foreach (const Environment::Variable& variable, containerConfig.executor_info() .command().environment().variables()) { os::setenv(variable.name(), variable.value()); } os::setenv("MESOS_LOCAL", "1"); const Owned<ExecutorData>& executorData = executors.at(containerConfig.executor_info().executor_id()); if (executorData->executor != nullptr) { executorData->driver = Owned<MesosExecutorDriver>( new MesosExecutorDriver(executorData->executor)); executorData->driver->start(); } else { shared_ptr<v1::MockHTTPExecutor> executor = executorData->v1ExecutorMock; executorData->v1Library = Owned<v1::executor::TestMesos>( new v1::executor::TestMesos(ContentType::PROTOBUF, executor)); } os::unsetenv("MESOS_LOCAL"); // Unset the environment variables we set by resetting them to their // original values and also removing any that were not part of the // original environment. foreachpair (const string& name, const string& value, original) { os::setenv(name, value); }
// Brings the set of GPUs held by a top-level container in line with
// the `gpus` scalar found in `resources`.
//
// When more GPUs are requested than are currently held, the missing
// ones are requested from the allocator and the result is forwarded
// to `_update()`. When fewer are requested, cgroups device access is
// denied for each surplus GPU and those GPUs are handed back to the
// allocator. The returned future fails for nested or unknown
// containers, for a fractional `gpus` value, and when denying a
// device fails.
Future<Nothing> NvidiaGpuIsolatorProcess::update(
    const ContainerID& containerId,
    const Resources& resources)
{
  if (containerId.has_parent()) {
    return Failure("Not supported for nested containers");
  }

  if (!infos.contains(containerId)) {
    return Failure("Unknown container");
  }

  Info* info = CHECK_NOTNULL(infos[containerId]);

  const double gpus = resources.gpus().getOrElse(0.0);

  // Reject fractional `gpus` values. Scalar resources are relied upon
  // to carry only 3 digits of precision, so scaling by 1000 exposes
  // any fractional part.
  if (static_cast<long long>(gpus * 1000.0) % 1000 != 0) {
    return Failure("The 'gpus' resource must be an unsigned integer");
  }

  const size_t requested = static_cast<size_t>(gpus);
  const size_t held = info->allocated.size();

  // Nothing to do when the allocation already has the right size.
  if (requested == held) {
    return Nothing();
  }

  // Grow: obtain the missing GPUs and continue in `_update()`.
  if (requested > held) {
    return allocator.allocate(requested - held)
      .then(defer(PID<NvidiaGpuIsolatorProcess>(this),
                  &NvidiaGpuIsolatorProcess::_update,
                  containerId,
                  lambda::_1));
  }

  // Shrink: revoke device access for the surplus GPUs one at a time,
  // always taking the first element of the allocated set.
  set<Gpu> released;

  for (size_t remaining = held - requested; remaining > 0; --remaining) {
    const auto gpu = info->allocated.begin();

    cgroups::devices::Entry entry;
    entry.selector.type = Entry::Selector::Type::CHARACTER;
    entry.selector.major = gpu->major;
    entry.selector.minor = gpu->minor;
    entry.access.read = true;
    entry.access.write = true;
    entry.access.mknod = true;

    Try<Nothing> deny =
      cgroups::devices::deny(hierarchy, info->cgroup, entry);

    if (deny.isError()) {
      return Failure("Failed to deny cgroups access to GPU device"
                     " '" + stringify(entry) + "': " + deny.error());
    }

    released.insert(*gpu);
    info->allocated.erase(gpu);
  }

  return allocator.deallocate(released);
}
// Two ContainerIDs are equal when their values match, they agree on
// whether a parent is present and, if it is, their parents compare
// equal (checked recursively through this same operator).
bool operator==(const ContainerID& left, const ContainerID& right)
{
  if (!(left.value() == right.value())) {
    return false;
  }

  if (left.has_parent() != right.has_parent()) {
    return false;
  }

  // Both sides agree on parent presence at this point; without a
  // parent there is nothing left to compare.
  if (!left.has_parent()) {
    return true;
  }

  return left.parent() == right.parent();
}
// Reconciles the persistent volume mounts under the container's
// directory with the persistent volumes listed in `resources`.
//
// Volumes that were recorded for the container but are absent from
// `resources` are unmounted and their mount points removed. Volumes
// that are new are bind mounted (and remounted read-only if the
// volume mode is RO). All of this happens in the host namespace; mount
// propagation is relied upon to make the mounts visible inside the
// container. On success the container's recorded resources are
// replaced with `resources`. The returned future fails for nested or
// unknown containers and whenever one of the filesystem operations
// reports an error.
Future<Nothing> LinuxFilesystemIsolatorProcess::update(
    const ContainerID& containerId,
    const Resources& resources)
{
  if (containerId.has_parent()) {
    return Failure("Not supported for nested containers");
  }

  if (!infos.contains(containerId)) {
    return Failure("Unknown container");
  }

  const Owned<Info>& info = infos[containerId];

  // Snapshot of what the container had before this update.
  Resources previous = info->resources;

  // Absolute and nested container paths are ignored by both passes
  // below. Returns true (after logging a warning) if `containerPath`
  // has to be skipped.
  auto skipForSlash = [&containerId](
      const Resource& resource,
      const string& containerPath) -> bool {
    if (!strings::contains(containerPath, "/")) {
      return false;
    }

    LOG(WARNING) << "Skipping updating mount for persistent volume "
                 << resource << " of container " << containerId
                 << " because the container path '" << containerPath
                 << "' contains slash";

    return true;
  };

  // Pass 1: drop the persistent volumes that are no longer needed.
  foreach (const Resource& resource, previous.persistentVolumes()) {
    // This is enforced by the master.
    CHECK(resource.disk().has_volume());

    const string& containerPath = resource.disk().volume().container_path();

    if (skipForSlash(resource, containerPath)) {
      continue;
    }

    // Still wanted, keep the mount as is.
    if (resources.contains(resource)) {
      continue;
    }

    const string target = path::join(info->directory, containerPath);

    LOG(INFO) << "Removing mount '" << target << "' for persistent volume "
              << resource << " of container " << containerId;

    // Unmounting fails while the task/executor still uses files or
    // directories under 'target'.
    Try<Nothing> unmount = fs::unmount(target);
    if (unmount.isError()) {
      return Failure(
          "Failed to unmount unneeded persistent volume at '" +
          target + "': " + unmount.error());
    }

    // NOTE: The rmdir is deliberately non-recursive.
    Try<Nothing> rmdir = os::rmdir(target, false);
    if (rmdir.isError()) {
      return Failure(
          "Failed to remove persistent volume mount point at '" +
          target + "': " + rmdir.error());
    }
  }

  // The owner of the container's directory determines the user and
  // group that new persistent volumes are chown'ed to.
  struct stat sandboxStat;
  if (::stat(info->directory.c_str(), &sandboxStat) < 0) {
    return Failure("Failed to get ownership for '" + info->directory +
                   "': " + os::strerror(errno));
  }

  const uid_t uid = sandboxStat.st_uid;
  const gid_t gid = sandboxStat.st_gid;

  // Pass 2: mount the persistent volumes that are new.
  foreach (const Resource& resource, resources.persistentVolumes()) {
    // This is enforced by the master.
    CHECK(resource.disk().has_volume());

    const string& containerPath = resource.disk().volume().container_path();

    if (skipForSlash(resource, containerPath)) {
      continue;
    }

    // Already handled by an earlier update.
    if (previous.contains(resource)) {
      continue;
    }

    // Where the volume lives on the host.
    const string source =
      paths::getPersistentVolumePath(flags.work_dir, resource);

    // Is the volume recorded in the resources of any known container?
    bool inUse = false;
    foreachvalue (const Owned<Info>& other, infos) {
      if (other->resources.contains(resource)) {
        inUse = true;
        break;
      }
    }

    // Only a volume that nobody uses yet gets its ownership aligned
    // with the sandbox directory. If other containers already use the
    // volume it is left untouched, which means tasks in this container
    // may fail to read from or write to it due to incompatible
    // ownership and file system permissions.
    if (!inUse) {
      LOG(INFO) << "Changing the ownership of the persistent volume at '"
                << source << "' with uid " << uid << " and gid " << gid;

      Try<Nothing> chown = os::chown(uid, gid, source, false);
      if (chown.isError()) {
        return Failure(
            "Failed to change the ownership of the persistent volume at '" +
            source + "' with uid " + stringify(uid) +
            " and gid " + stringify(gid) + ": " + chown.error());
      }
    }

    // Where the volume is going to show up for the container.
    const string target = path::join(info->directory, containerPath);

    // NOTE: The mount target may exist already for two reasons:
    //   1. 'info->resources' is reset when the slave restarts and
    //      recovers. Once the executor re-registers, the slave calls
    //      'containerizer->update' and we end up trying to re-mount
    //      volumes that are mounted already.
    //   2. The mount target of the persistent volume may be referenced
    //      more than once, e.g. when both a host volume and a
    //      persistent volume are specified and the source of the host
    //      volume equals the container path of the persistent volume.
    //
    // In that case the mount table is consulted: if an entry with the
    // same target exists the volume is considered mounted and skipped,
    // otherwise it is mounted as usual. The latter can happen because
    // the slave could crash after unmounting the volume but before
    // deleting the mount point.
    bool alreadyMounted = false;

    if (os::exists(target)) {
      Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
      if (table.isError()) {
        return Failure("Failed to get mount table: " + table.error());
      }

      // TODO(gilbert): Check source of the mount matches the entry's
      // root. Note that the root is relative to the root of its parent
      // mount. See:
      // http://man7.org/linux/man-pages/man5/proc.5.html
      foreach (const fs::MountInfoTable::Entry& entry, table->entries) {
        if (entry.target == target) {
          alreadyMounted = true;
          break;
        }
      }
    }

    if (alreadyMounted) {
      continue;
    }

    Try<Nothing> mkdir = os::mkdir(target);
    if (mkdir.isError()) {
      return Failure(
          "Failed to create persistent volume mount point at '" +
          target + "': " + mkdir.error());
    }

    LOG(INFO) << "Mounting '" << source << "' to '" << target
              << "' for persistent volume " << resource
              << " of container " << containerId;

    Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, nullptr);
    if (mount.isError()) {
      return Failure(
          "Failed to mount persistent volume from '" +
          source + "' to '" + target + "': " + mount.error());
    }

    // A read-only volume needs a second, remounting pass.
    if (resource.disk().volume().mode() == Volume::RO) {
      Try<Nothing> remount = fs::mount(
          None(),
          target,
          None(),
          MS_BIND | MS_RDONLY | MS_REMOUNT,
          nullptr);

      if (remount.isError()) {
        return Failure(
            "Failed to remount persistent volume as read-only from '" +
            source + "' to '" + target + "': " + remount.error());
      }
    }
  }

  // Remember the new resources for the next update.
  info->resources = resources;

  return Nothing();
}