Ejemplo n.º 1
0
// IPC isolation on Linux just requires that a process be placed in an IPC
// namespace. Neither /proc, nor any of the special SVIPC filesystem need
// to be remounted for this to work. IPC namespaces are disjoint. That is,
// once you enter an IPC namespace, IPC objects from the host namespace are
// no longer visible (and vice versa). Since IPC namespaces do not nest,
// we always place nested containers into the IPC namespace of the parent
// container. That is, containers in the same group share an IPC namespace,
// but groups are isolated from each other.
Future<Option<ContainerLaunchInfo>> NamespacesIPCIsolatorProcess::prepare(
    const ContainerID& containerId,
    const ContainerConfig& containerConfig)
{
  ContainerLaunchInfo launchInfo;

  if (containerId.has_parent()) {
    launchInfo.add_enter_namespaces(CLONE_NEWIPC);
  } else {
    launchInfo.add_clone_namespaces(CLONE_NEWIPC);
  }

  return launchInfo;
}
Ejemplo n.º 2
0
Option<Error> validateContainerId(const ContainerID& containerId)
{
  // Slashes are disallowed as these IDs are mapped to directories.
  //
  // Periods are disallowed because our string representation of
  // ContainerID uses periods: <uuid>.<child>.<grandchild>.
  // For example: <uuid>.redis.backup
  //
  // Spaces are disallowed as they can render logs confusing and
  // need escaping on terminals when dealing with paths.
  //
  // TODO(bmahler): Add common/validation.hpp to share ID validation.
  // Note that this however is slightly stricter than other IDs in
  // that we do not allow periods or spaces.
  auto invalidCharacter = [](char c) {
    return iscntrl(c) ||
      c == os::POSIX_PATH_SEPARATOR ||
      c == os::WINDOWS_PATH_SEPARATOR ||
      c == '.' ||
      c == ' ';
  };

  const string& id = containerId.value();

  if (id.empty()) {
    return Error("'ContainerID.value' must be non-empty");
  }

  if (std::any_of(id.begin(), id.end(), invalidCharacter)) {
    return Error("'ContainerID.value' '" + id + "'"
                 " contains invalid characters");
  }

  // TODO(bmahler): Print the invalid field nicely within the error
  // (e.g. 'parent.parent.parent.value'). For now we only have one
  // level of nesting so it's ok.
  if (containerId.has_parent()) {
    Option<Error> parentError = validateContainerId(containerId.parent());

    if (parentError.isSome()) {
      return Error("'ContainerID.parent' is invalid: " + parentError->message);
    }
  }

  return None();
}
Ejemplo n.º 3
0
Option<Error> validateContainerId(const ContainerID& containerId)
{
  const string& id = containerId.value();

  // Check common Mesos ID rules.
  Option<Error> error = common::validation::validateID(id);
  if (error.isSome()) {
    return Error(error->message);
  }

  // Check ContainerID specific rules.
  //
  // Periods are disallowed because our string representation of
  // ContainerID uses periods: <uuid>.<child>.<grandchild>.
  // For example: <uuid>.redis.backup
  //
  // Spaces are disallowed as they can render logs confusing and
  // need escaping on terminals when dealing with paths.
  auto invalidCharacter = [](char c) {
    return  c == '.' || c == ' ';
  };

  if (std::any_of(id.begin(), id.end(), invalidCharacter)) {
    return Error("'ContainerID.value' '" + id + "'"
                 " contains invalid characters");
  }

  // TODO(bmahler): Print the invalid field nicely within the error
  // (e.g. 'parent.parent.parent.value'). For now we only have one
  // level of nesting so it's ok.
  if (containerId.has_parent()) {
    Option<Error> parentError = validateContainerId(containerId.parent());

    if (parentError.isSome()) {
      return Error("'ContainerID.parent' is invalid: " + parentError->message);
    }
  }

  return None();
}
Ejemplo n.º 4
0
  Future<bool> launch(
      const ContainerID& containerId,
      const ContainerConfig& containerConfig,
      const map<string, string>& environment,
      const Option<string>& pidCheckpointPath)
  {
    CHECK(!terminatedContainers.contains(containerId))
      << "Failed to launch nested container " << containerId
      << " for executor '" << containerConfig.executor_info().executor_id()
      << "' of framework " << containerConfig.executor_info().framework_id()
      << " because this ContainerID is being re-used with"
      << " a previously terminated container";

    CHECK(!containers_.contains(containerId))
      << "Failed to launch container " << containerId
      << " for executor '" << containerConfig.executor_info().executor_id()
      << "' of framework " << containerConfig.executor_info().framework_id()
      << " because it is already launched";

    containers_[containerId] = Owned<ContainerData>(new ContainerData());

    if (containerId.has_parent()) {
      // Launching a nested container via the test containerizer is a
      // no-op for now.
      return true;
    }

    CHECK(executors.contains(containerConfig.executor_info().executor_id()))
      << "Failed to launch executor '"
      << containerConfig.executor_info().executor_id()
      << "' of framework " << containerConfig.executor_info().framework_id()
      << " because it is unknown to the containerizer";

    containers_.at(containerId)->executorId =
      containerConfig.executor_info().executor_id();

    containers_.at(containerId)->frameworkId =
      containerConfig.executor_info().framework_id();

    // We need to synchronize all reads and writes to the environment
    // as this is global state.
    //
    // TODO(jmlvanre): Even this is not sufficient, as other aspects
    // of the code may read an environment variable while we are
    // manipulating it. The better solution is to pass the environment
    // variables into the fork, or to set them on the command line.
    // See MESOS-3475.
    static std::mutex mutex;

    synchronized(mutex) {
      // Since the constructor for `MesosExecutorDriver` reads
      // environment variables to load flags, even it needs to
      // be within this synchronization section.
      //
      // Prepare additional environment variables for the executor.
      // TODO(benh): Need to get flags passed into the TestContainerizer
      // in order to properly use here.
      slave::Flags flags;
      flags.recovery_timeout = Duration::zero();

      // We need to save the original set of environment variables so we
      // can reset the environment after calling 'driver->start()' below.
      hashmap<string, string> original = os::environment();

      foreachpair (const string& name, const string& variable, environment) {
        os::setenv(name, variable);
      }

      // TODO(benh): Can this be removed and done exclusively in the
      // 'executorEnvironment()' function? There are other places in the
      // code where we do this as well and it's likely we can do this once
      // in 'executorEnvironment()'.
      foreach (const Environment::Variable& variable,
               containerConfig.executor_info()
                 .command().environment().variables()) {
        os::setenv(variable.name(), variable.value());
      }

      os::setenv("MESOS_LOCAL", "1");

      const Owned<ExecutorData>& executorData =
        executors.at(containerConfig.executor_info().executor_id());

      if (executorData->executor != nullptr) {
        executorData->driver = Owned<MesosExecutorDriver>(
            new MesosExecutorDriver(executorData->executor));
        executorData->driver->start();
      } else {
        shared_ptr<v1::MockHTTPExecutor> executor =
          executorData->v1ExecutorMock;
        executorData->v1Library = Owned<v1::executor::TestMesos>(
          new v1::executor::TestMesos(ContentType::PROTOBUF, executor));
      }

      os::unsetenv("MESOS_LOCAL");

      // Unset the environment variables we set by resetting them to their
      // original values and also removing any that were not part of the
      // original environment.
      foreachpair (const string& name, const string& value, original) {
        os::setenv(name, value);
      }
Ejemplo n.º 5
0
Future<Nothing> NvidiaGpuIsolatorProcess::update(
    const ContainerID& containerId,
    const Resources& resources)
{
  if (containerId.has_parent()) {
    return Failure("Not supported for nested containers");
  }

  if (!infos.contains(containerId)) {
    return Failure("Unknown container");
  }

  Info* info = CHECK_NOTNULL(infos[containerId]);

  Option<double> gpus = resources.gpus();

  // Make sure that the `gpus` resource is not fractional.
  // We rely on scalar resources only having 3 digits of precision.
  if (static_cast<long long>(gpus.getOrElse(0.0) * 1000.0) % 1000 != 0) {
    return Failure("The 'gpus' resource must be an unsigned integer");
  }

  size_t requested = static_cast<size_t>(resources.gpus().getOrElse(0.0));

  // Update the GPU allocation to reflect the new total.
  if (requested > info->allocated.size()) {
    size_t additional = requested - info->allocated.size();

    return allocator.allocate(additional)
      .then(defer(PID<NvidiaGpuIsolatorProcess>(this),
                  &NvidiaGpuIsolatorProcess::_update,
                  containerId,
                  lambda::_1));
  } else if (requested < info->allocated.size()) {
    size_t fewer = info->allocated.size() - requested;

    set<Gpu> deallocated;

    for (size_t i = 0; i < fewer; i++) {
      const auto gpu = info->allocated.begin();

      cgroups::devices::Entry entry;
      entry.selector.type = Entry::Selector::Type::CHARACTER;
      entry.selector.major = gpu->major;
      entry.selector.minor = gpu->minor;
      entry.access.read = true;
      entry.access.write = true;
      entry.access.mknod = true;

      Try<Nothing> deny = cgroups::devices::deny(
          hierarchy, info->cgroup, entry);

      if (deny.isError()) {
        return Failure("Failed to deny cgroups access to GPU device"
                       " '" + stringify(entry) + "': " + deny.error());
      }

      deallocated.insert(*gpu);
      info->allocated.erase(gpu);
    }

    return allocator.deallocate(deallocated);
  }

  return Nothing();
}
Ejemplo n.º 6
0
bool operator==(const ContainerID& left, const ContainerID& right)
{
  return left.value() == right.value() &&
    left.has_parent() == right.has_parent() &&
    (!left.has_parent() || left.parent() == right.parent());
}
Ejemplo n.º 7
0
Future<Nothing> LinuxFilesystemIsolatorProcess::update(
    const ContainerID& containerId,
    const Resources& resources)
{
  if (containerId.has_parent()) {
    return Failure("Not supported for nested containers");
  }

  // Mount persistent volumes. We do this in the host namespace and
  // rely on mount propagation for them to be visible inside the
  // container.
  if (!infos.contains(containerId)) {
    return Failure("Unknown container");
  }

  const Owned<Info>& info = infos[containerId];

  Resources current = info->resources;

  // We first remove unneeded persistent volumes.
  foreach (const Resource& resource, current.persistentVolumes()) {
    // This is enforced by the master.
    CHECK(resource.disk().has_volume());

    // Ignore absolute and nested paths.
    const string& containerPath = resource.disk().volume().container_path();
    if (strings::contains(containerPath, "/")) {
      LOG(WARNING) << "Skipping updating mount for persistent volume "
                   << resource << " of container " << containerId
                   << " because the container path '" << containerPath
                   << "' contains slash";
      continue;
    }

    if (resources.contains(resource)) {
      continue;
    }

    // Determine the target of the mount.
    string target = path::join(info->directory, containerPath);

    LOG(INFO) << "Removing mount '" << target << "' for persistent volume "
              << resource << " of container " << containerId;

    // The unmount will fail if the task/executor is still using files
    // or directories under 'target'.
    Try<Nothing> unmount = fs::unmount(target);
    if (unmount.isError()) {
      return Failure(
          "Failed to unmount unneeded persistent volume at '" +
          target + "': " + unmount.error());
    }

    // NOTE: This is a non-recursive rmdir.
    Try<Nothing> rmdir = os::rmdir(target, false);
    if (rmdir.isError()) {
      return Failure(
          "Failed to remove persistent volume mount point at '" +
          target + "': " + rmdir.error());
    }
  }

  // Get user and group info for this task based on the task's sandbox.
  struct stat s;
  if (::stat(info->directory.c_str(), &s) < 0) {
    return Failure("Failed to get ownership for '" + info->directory +
                   "': " + os::strerror(errno));
  }

  const uid_t uid = s.st_uid;
  const gid_t gid = s.st_gid;

  // We then mount new persistent volumes.
  foreach (const Resource& resource, resources.persistentVolumes()) {
    // This is enforced by the master.
    CHECK(resource.disk().has_volume());

    // Ignore absolute and nested paths.
    const string& containerPath = resource.disk().volume().container_path();
    if (strings::contains(containerPath, "/")) {
      LOG(WARNING) << "Skipping updating mount for persistent volume "
                   << resource << " of container " << containerId
                   << " because the container path '" << containerPath
                   << "' contains slash";
      continue;
    }

    if (current.contains(resource)) {
      continue;
    }

    // Determine the source of the mount.
    string source = paths::getPersistentVolumePath(flags.work_dir, resource);

    bool isVolumeInUse = false;

    foreachvalue (const Owned<Info>& info, infos) {
      if (info->resources.contains(resource)) {
        isVolumeInUse = true;
        break;
      }
    }

    // Set the ownership of the persistent volume to match that of the sandbox
    // directory if the volume is not already in use. If the volume is
    // currently in use by other containers, tasks in this container may fail
    // to read from or write to the persistent volume due to incompatible
    // ownership and file system permissions.
    if (!isVolumeInUse) {
      LOG(INFO) << "Changing the ownership of the persistent volume at '"
                << source << "' with uid " << uid << " and gid " << gid;

      Try<Nothing> chown = os::chown(uid, gid, source, false);

      if (chown.isError()) {
        return Failure(
            "Failed to change the ownership of the persistent volume at '" +
            source + "' with uid " + stringify(uid) +
            " and gid " + stringify(gid) + ": " + chown.error());
      }
    }

    // Determine the target of the mount.
    string target = path::join(info->directory, containerPath);

    if (os::exists(target)) {
      // NOTE: There are two scenarios that we may have the mount
      // target existed:
      // 1. This is possible because 'info->resources' will be reset
      //    when slave restarts and recovers. When the slave calls
      //    'containerizer->update' after the executor re-registers,
      //    we'll try to re-mount all the already mounted volumes.
      // 2. There may be multiple references to the persistent
      //    volume's mount target. E.g., a host volume and a
      //    persistent volume are both specified, and the source
      //    of the host volume is the same as the container path
      //    of the persistent volume.

      // Check the source of the mount matches the entry with the
      // same target in the mount table if one can be found. If
      // not, mount the persistent volume as we did below. This is
      // possible because the slave could crash after it unmounts the
      // volume but before it is able to delete the mount point.
      Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
      if (table.isError()) {
        return Failure("Failed to get mount table: " + table.error());
      }

      // Check a particular persistent volume is mounted or not.
      bool volumeMounted = false;

      foreach (const fs::MountInfoTable::Entry& entry, table->entries) {
        // TODO(gilbert): Check source of the mount matches the entry's
        // root. Note that the root is relative to the root of its parent
        // mount. See:
        // http://man7.org/linux/man-pages/man5/proc.5.html
        if (target == entry.target) {
          volumeMounted = true;
          break;
        }
      }

      if (volumeMounted) {
        continue;
      }
    }

    Try<Nothing> mkdir = os::mkdir(target);
    if (mkdir.isError()) {
      return Failure(
          "Failed to create persistent volume mount point at '" +
          target + "': " + mkdir.error());
    }

    LOG(INFO) << "Mounting '" << source << "' to '" << target
              << "' for persistent volume " << resource
              << " of container " << containerId;

    Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, nullptr);
    if (mount.isError()) {
      return Failure(
          "Failed to mount persistent volume from '" +
          source + "' to '" + target + "': " + mount.error());
    }

    // If the mount needs to be read-only, do a remount.
    if (resource.disk().volume().mode() == Volume::RO) {
      mount = fs::mount(
          None(), target, None(), MS_BIND | MS_RDONLY | MS_REMOUNT, nullptr);

      if (mount.isError()) {
        return Failure(
            "Failed to remount persistent volume as read-only from '" +
            source + "' to '" + target + "': " + mount.error());
      }
    }
  }

  // Store the new resources;
  info->resources = resources;

  return Nothing();
}