Ejemplo n.º 1
0
  void killTask(ExecutorDriver* driver, const TaskID& taskId)
  {
    LOG(INFO) << "Received killTask for task " << taskId.value();

    // Using shutdown grace period as a default is backwards compatible
    // with the `stop_timeout` flag, deprecated in 1.0.
    Duration gracePeriod = shutdownGracePeriod;

    if (killPolicy.isSome() && killPolicy->has_grace_period()) {
      gracePeriod = Nanoseconds(killPolicy->grace_period().nanoseconds());
    }

    killTask(driver, taskId, gracePeriod);
  }
Ejemplo n.º 2
0
    void killTask (ExecutorDriver* driver, const TaskID& taskId) override {
      const string& ti = taskId.value();
      pid_t pid;

      {
        lock_guard<mutex> lock(TaskId2PidLock);

        auto iter = TaskId2Pid.find(ti);

        if (iter == TaskId2Pid.end()) {
          LOG(WARNING)
          << "unknown task id '" << ti << "'";
          return;
        }

        pid = iter->second;
      }

      // TODO(fc) be graceful
      kill(pid, 9);
    }
Ejemplo n.º 3
0
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const UUID& uuid,
    bool strict)
{
  RunState state;
  state.id = uuid;
  string message;

  // Find the tasks.
  const Try<list<string> >& tasks = os::glob(strings::format(
      paths::TASK_PATH,
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      uuid.toString(),
      "*").get());

  if (tasks.isError()) {
    return Error("Failed to find tasks for executor run " + uuid.toString() +
                 ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(os::basename(path).get());

    const Try<TaskState>& task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task.get().errors;
  }

  // Read the forked pid.
  string path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);
  if (!os::exists(path)) {
    // This could happen if the slave died before the isolator
    // checkpointed the forked pid.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  if (!os::exists(path)) {
    // This could happen if the slave died before the executor
    // registered with the slave.
    LOG(WARNING)
      << "Failed to find executor libprocess pid file '" << path << "'";
    return state;
  }

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor libprocess pid file '" << path << "'";
    return state;
  }

  state.libprocessPid = process::UPID(pid.get());

  // See if the sentinel file exists.
  path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  state.completed = os::exists(path);

  return state;
}
Ejemplo n.º 4
0
inline bool operator<(const TaskID& left, const TaskID& right)
{
  return left.value() < right.value();
}
Ejemplo n.º 5
0
inline bool operator==(const TaskID& left, const std::string& right)
{
  return left.value() == right;
}
Ejemplo n.º 6
0
inline std::size_t hash_value(const TaskID& taskId)
{
  size_t seed = 0;
  boost::hash_combine(seed, taskId.value());
  return seed;
}
Ejemplo n.º 7
0
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool strict,
    bool rebooted)
{
  RunState state;
  state.id = containerId;
  string message;

  // See if the sentinel file exists. This is done first so it is
  // known even if partial state is returned, e.g., if the libprocess
  // pid file is not recovered. It indicates the slave removed the
  // executor.
  string path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  state.completed = os::exists(path);

  // Find the tasks.
  Try<list<string>> tasks = paths::getTaskPaths(
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      containerId);

  if (tasks.isError()) {
    return Error(
        "Failed to find tasks for executor run " + containerId.value() +
        ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(Path(path).basename());

    Try<TaskState> task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, containerId, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task->errors;
  }

  path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  // If agent host is rebooted, we do not read the forked pid and libprocess pid
  // since those two pids are obsolete after reboot. And we remove the forked
  // pid file to make sure we will not read it in the case the agent process is
  // restarted after we checkpoint the new boot ID in `Slave::__recover` (i.e.,
  // agent recovery is done after the reboot).
  if (rebooted) {
    if (os::exists(path)) {
      Try<Nothing> rm = os::rm(path);
      if (rm.isError()) {
        return Error(
            "Failed to remove executor forked pid file '" + path + "': " +
            rm.error());
      }
    }

    return state;
  }

  if (!os::exists(path)) {
    // This could happen if the slave died before the containerizer checkpointed
    // the forked pid or agent process is restarted after agent host is rebooted
    // since we remove this file in the above code.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  // Read the forked pid.
  Result<string> pid = state::read<string>(path);
  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid->empty()) {
    // This could happen if the slave is hard rebooted after the file is created
    // but before the data is synced on disk.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid '" + pid.get() + "' "
                 "from pid file '" + path + "': " +
                 forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (os::exists(path)) {
    pid = state::read<string>(path);

    if (pid.isError()) {
      message = "Failed to read executor libprocess pid from '" + path +
                "': " + pid.error();

      if (strict) {
        return Error(message);
      } else {
        LOG(WARNING) << message;
        state.errors++;
        return state;
      }
    }

    if (pid->empty()) {
      // This could happen if the slave is hard rebooted after the file is
      // created but before the data is synced on disk.
      LOG(WARNING) << "Found empty executor libprocess pid file '" << path
                   << "'";
      return state;
    }

    state.libprocessPid = process::UPID(pid.get());
    state.http = false;

    return state;
  }

  path = paths::getExecutorHttpMarkerPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  // The marker could be absent if the slave died before the executor
  // registered with the slave.
  if (!os::exists(path)) {
    LOG(WARNING) << "Failed to find '" << paths::LIBPROCESS_PID_FILE
                 << "' or '" << paths::HTTP_MARKER_FILE
                 << "' for container " << containerId
                 << " of executor '" << executorId
                 << "' of framework " << frameworkId;
    return state;
  }

  state.http = true;
  return state;
}
Ejemplo n.º 8
0
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool strict)
{
  RunState state;
  state.id = containerId;
  string message;

  // See if the sentinel file exists. This is done first so it is
  // known even if partial state is returned, e.g., if the libprocess
  // pid file is not recovered. It indicates the slave removed the
  // executor.
  string path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  state.completed = os::exists(path);

  // Find the tasks.
  Try<list<string> > tasks = paths::getTaskPaths(
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      containerId);

  if (tasks.isError()) {
    return Error(
        "Failed to find tasks for executor run " + containerId.value() +
        ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(Path(path).basename());

    Try<TaskState> task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, containerId, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task.get().errors;
  }

  // Read the forked pid.
  path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);
  if (!os::exists(path)) {
    // This could happen if the slave died before the isolator
    // checkpointed the forked pid.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (!os::exists(path)) {
    // This could happen if the slave died before the executor
    // registered with the slave.
    LOG(WARNING)
      << "Failed to find executor libprocess pid file '" << path << "'";
    return state;
  }

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor libprocess pid file '" << path << "'";
    return state;
  }

  state.libprocessPid = process::UPID(pid.get());

  return state;
}
Ejemplo n.º 9
0
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const UUID& uuid,
    bool strict)
{
  RunState state;
  state.id = uuid;
  string message;

  // Find the tasks.
  const Try<list<string> >& tasks = os::glob(strings::format(
      paths::TASK_PATH,
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      uuid.toString(),
      "*").get());

  if (tasks.isError()) {
    return Error("Failed to find tasks for executor run " + uuid.toString() +
                 ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(os::basename(path).get());

    const Try<TaskState>& task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
  }

  // Read the forked pid.
  string path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor's forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      return state;
    }
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor's libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      return state;
    }
  }

  state.libprocessPid = process::UPID(pid.get());

  return state;
}