// Handles a kill request for `taskId`: picks the grace period to apply
// and forwards to the three-argument `killTask` overload.
//
// The grace period is `shutdownGracePeriod` unless a kill policy is
// set and carries its own grace period, in which case that one wins.
void killTask(ExecutorDriver* driver, const TaskID& taskId)
{
  LOG(INFO) << "Received killTask for task " << taskId.value();

  const bool policyHasGracePeriod =
    killPolicy.isSome() && killPolicy->has_grace_period();

  // Defaulting to the shutdown grace period keeps backwards
  // compatibility with the `stop_timeout` flag, deprecated in 1.0.
  Duration gracePeriod = shutdownGracePeriod;

  if (policyHasGracePeriod) {
    gracePeriod = Nanoseconds(killPolicy->grace_period().nanoseconds());
  }

  killTask(driver, taskId, gracePeriod);
}
// Kills the process recorded for `taskId`.
//
// The task id is looked up in `TaskId2Pid` while holding
// `TaskId2PidLock`; the lock is released before the signal is sent.
// An unknown task id is logged and otherwise ignored. `driver` is not
// used.
void killTask (ExecutorDriver* driver, const TaskID& taskId) override
{
  const string& ti = taskId.value();

  pid_t pid;
  {
    lock_guard<mutex> lock(TaskId2PidLock);

    auto iter = TaskId2Pid.find(ti);
    if (iter == TaskId2Pid.end()) {
      LOG(WARNING) << "unknown task id '" << ti << "'";
      return;
    }

    pid = iter->second;
  }

  // TODO(fc) be graceful
  //
  // Use the named signal rather than a bare `9`, and surface a failed
  // delivery instead of silently dropping it. SIGKILL comes from the
  // same header (<signal.h>) that declares kill().
  if (kill(pid, SIGKILL) != 0) {
    LOG(WARNING) << "failed to send SIGKILL to pid " << pid
                 << " of task '" << ti << "'";
  }
}
// Recovers the checkpointed state of one executor run.
//
// Steps, in order:
//   1. Record whether the executor sentinel file exists (`completed`).
//   2. Recover every task found under the run's task path.
//   3. Read the forked pid file.
//   4. Read the libprocess pid file.
//
// Returns an Error if the tasks cannot be listed, a task fails to
// recover, or the forked pid cannot be parsed. A pid file that cannot
// be read yields an Error when `strict` is set; otherwise it is
// logged, counted in `state.errors`, and the state recovered so far
// is returned. A missing or empty pid file is logged and the state
// recovered so far is returned.
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const UUID& uuid,
    bool strict)
{
  RunState state;
  state.id = uuid;
  string message;

  // See if the sentinel file exists. This is done first so that
  // `completed` is set even when one of the early returns below hands
  // back partial state (e.g., the libprocess pid file is missing).
  string path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  state.completed = os::exists(path);

  // Find the tasks.
  const Try<list<string> >& tasks = os::glob(strings::format(
      paths::TASK_PATH,
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      uuid.toString(),
      "*").get());

  if (tasks.isError()) {
    return Error("Failed to find tasks for executor run " + uuid.toString() +
                 ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& taskPath, tasks.get()) {
    TaskID taskId;
    taskId.set_value(os::basename(taskPath).get());

    const Try<TaskState>& task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task.get().errors;
  }

  // Read the forked pid.
  path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  if (!os::exists(path)) {
    // This could happen if the slave died before the isolator
    // checkpointed the forked pid.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  if (!os::exists(path)) {
    // This could happen if the slave died before the executor
    // registered with the slave.
    LOG(WARNING)
      << "Failed to find executor libprocess pid file '" << path << "'";
    return state;
  }

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING)
      << "Found empty executor libprocess pid file '" << path << "'";
    return state;
  }

  state.libprocessPid = process::UPID(pid.get());

  return state;
}
// Orders two TaskIDs by comparing their string values.
inline bool operator<(const TaskID& left, const TaskID& right)
{
  return left.value().compare(right.value()) < 0;
}
// True when the TaskID's string value equals `right`.
inline bool operator==(const TaskID& left, const std::string& right)
{
  return right == left.value();
}
// Hashes a TaskID by folding its string value into a zero seed with
// boost::hash_combine.
inline std::size_t hash_value(const TaskID& taskId)
{
  std::size_t result = 0;
  boost::hash_combine(result, taskId.value());
  return result;
}
// Recovers the checkpointed state of one executor run (container).
//
// Steps, in order:
//   1. Record whether the executor sentinel file exists (`completed`).
//   2. Recover every task returned by `paths::getTaskPaths`.
//   3. If `rebooted`, remove the forked pid file (if present) and stop.
//   4. Read the forked pid file.
//   5. Read the libprocess pid file if it exists (`http = false`);
//      otherwise look for the HTTP marker file (`http = true`).
//
// Returns an Error if the tasks cannot be listed, a task fails to
// recover, the forked pid file cannot be removed after a reboot, or
// the forked pid cannot be parsed. A pid file that cannot be read
// yields an Error when `strict` is set; otherwise it is logged,
// counted in `state.errors`, and the state recovered so far is
// returned. Missing or empty files are logged and the state recovered
// so far is returned.
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool strict,
    bool rebooted)
{
  RunState state;
  state.id = containerId;

  // The sentinel is probed before anything else so that `completed`
  // is known even if partial state is returned, e.g., if the
  // libprocess pid file is not recovered. Its presence indicates the
  // slave removed the executor.
  const string sentinelPath = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  state.completed = os::exists(sentinelPath);

  // Enumerate the checkpointed tasks of this run.
  Try<list<string>> taskPaths = paths::getTaskPaths(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (taskPaths.isError()) {
    return Error(
        "Failed to find tasks for executor run " + containerId.value() +
        ": " + taskPaths.error());
  }

  // Recover each task; the first failure aborts the whole recovery.
  for (const string& taskPath : taskPaths.get()) {
    TaskID taskId;
    taskId.set_value(Path(taskPath).basename());

    Try<TaskState> recovered = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, containerId, taskId, strict);

    if (recovered.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() +
          ": " + recovered.error());
    }

    state.tasks[taskId] = recovered.get();
    state.errors += recovered.get().errors;
  }

  const string forkedPidPath = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  // If agent host is rebooted, we do not read the forked pid and
  // libprocess pid since those two pids are obsolete after reboot. And
  // we remove the forked pid file to make sure we will not read it in
  // the case the agent process is restarted after we checkpoint the
  // new boot ID in `Slave::__recover` (i.e., agent recovery is done
  // after the reboot).
  if (rebooted) {
    if (os::exists(forkedPidPath)) {
      Try<Nothing> removal = os::rm(forkedPidPath);

      if (removal.isError()) {
        return Error(
            "Failed to remove executor forked pid file '" + forkedPidPath +
            "': " + removal.error());
      }
    }

    return state;
  }

  if (!os::exists(forkedPidPath)) {
    // This could happen if the slave died before the containerizer
    // checkpointed the forked pid or agent process is restarted after
    // agent host is rebooted since we remove this file in the above
    // code.
    LOG(WARNING) << "Failed to find executor forked pid file '"
                 << forkedPidPath << "'";
    return state;
  }

  // Read the forked pid.
  Result<string> forkedPidData = state::read<string>(forkedPidPath);

  if (forkedPidData.isError()) {
    const string failure =
      "Failed to read executor forked pid from '" + forkedPidPath +
      "': " + forkedPidData.error();

    if (strict) {
      return Error(failure);
    }

    LOG(WARNING) << failure;
    state.errors++;
    return state;
  }

  if (forkedPidData->empty()) {
    // This could happen if the slave is hard rebooted after the file
    // is created but before the data is synced on disk.
    LOG(WARNING) << "Found empty executor forked pid file '"
                 << forkedPidPath << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(forkedPidData.get());

  if (forkedPid.isError()) {
    return Error(
        "Failed to parse forked pid '" + forkedPidData.get() + "' "
        "from pid file '" + forkedPidPath + "': " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  const string libprocessPidPath = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (os::exists(libprocessPidPath)) {
    Result<string> libprocessPidData = state::read<string>(libprocessPidPath);

    if (libprocessPidData.isError()) {
      const string failure =
        "Failed to read executor libprocess pid from '" + libprocessPidPath +
        "': " + libprocessPidData.error();

      if (strict) {
        return Error(failure);
      }

      LOG(WARNING) << failure;
      state.errors++;
      return state;
    }

    if (libprocessPidData->empty()) {
      // This could happen if the slave is hard rebooted after the file
      // is created but before the data is synced on disk.
      LOG(WARNING) << "Found empty executor libprocess pid file '"
                   << libprocessPidPath << "'";
      return state;
    }

    state.libprocessPid = process::UPID(libprocessPidData.get());
    state.http = false;

    return state;
  }

  const string httpMarkerPath = paths::getExecutorHttpMarkerPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (os::exists(httpMarkerPath)) {
    state.http = true;
  } else {
    // The marker could be absent if the slave died before the executor
    // registered with the slave.
    LOG(WARNING) << "Failed to find '" << paths::LIBPROCESS_PID_FILE
                 << "' or '" << paths::HTTP_MARKER_FILE
                 << "' for container " << containerId
                 << " of executor '" << executorId
                 << "' of framework " << frameworkId;
  }

  return state;
}
// Recovers the checkpointed state of one executor run (container).
//
// Steps, in order:
//   1. Record whether the executor sentinel file exists (`completed`).
//   2. Recover every task returned by `paths::getTaskPaths`.
//   3. Read the forked pid file.
//   4. Read the libprocess pid file.
//
// Returns an Error if the tasks cannot be listed, a task fails to
// recover, or the forked pid cannot be parsed. A pid file that cannot
// be read yields an Error when `strict` is set; otherwise it is
// logged, counted in `state.errors`, and the state recovered so far
// is returned. Missing or empty pid files are logged and the state
// recovered so far is returned.
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool strict)
{
  RunState state;
  state.id = containerId;

  // The sentinel is probed before anything else so that `completed`
  // is known even if partial state is returned, e.g., if the
  // libprocess pid file is not recovered. Its presence indicates the
  // slave removed the executor.
  const string sentinelPath = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  state.completed = os::exists(sentinelPath);

  // Enumerate the checkpointed tasks of this run.
  Try<list<string> > taskPaths = paths::getTaskPaths(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (taskPaths.isError()) {
    return Error(
        "Failed to find tasks for executor run " + containerId.value() +
        ": " + taskPaths.error());
  }

  // Recover each task; the first failure aborts the whole recovery.
  foreach (const string& taskPath, taskPaths.get()) {
    TaskID taskId;
    taskId.set_value(Path(taskPath).basename());

    Try<TaskState> recovered = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, containerId, taskId, strict);

    if (recovered.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() +
          ": " + recovered.error());
    }

    state.tasks[taskId] = recovered.get();
    state.errors += recovered->errors;
  }

  // Read the forked pid.
  const string forkedPidPath = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (!os::exists(forkedPidPath)) {
    // This could happen if the slave died before the isolator
    // checkpointed the forked pid.
    LOG(WARNING) << "Failed to find executor forked pid file '"
                 << forkedPidPath << "'";
    return state;
  }

  Try<string> forkedPidData = os::read(forkedPidPath);

  if (forkedPidData.isError()) {
    const string failure =
      "Failed to read executor forked pid from '" + forkedPidPath +
      "': " + forkedPidData.error();

    if (strict) {
      return Error(failure);
    }

    LOG(WARNING) << failure;
    state.errors++;
    return state;
  }

  if (forkedPidData.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor forked pid file '"
                 << forkedPidPath << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(forkedPidData.get());

  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + forkedPidData.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  const string libprocessPidPath = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (!os::exists(libprocessPidPath)) {
    // This could happen if the slave died before the executor
    // registered with the slave.
    LOG(WARNING) << "Failed to find executor libprocess pid file '"
                 << libprocessPidPath << "'";
    return state;
  }

  Try<string> libprocessPidData = os::read(libprocessPidPath);

  if (libprocessPidData.isError()) {
    const string failure =
      "Failed to read executor libprocess pid from '" + libprocessPidPath +
      "': " + libprocessPidData.error();

    if (strict) {
      return Error(failure);
    }

    LOG(WARNING) << failure;
    state.errors++;
    return state;
  }

  if (libprocessPidData.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor libprocess pid file '"
                 << libprocessPidPath << "'";
    return state;
  }

  state.libprocessPid = process::UPID(libprocessPidData.get());

  return state;
}
// Recovers the checkpointed state of one executor run.
//
// Steps, in order:
//   1. Recover every task found under the run's task path.
//   2. Read the forked pid file.
//   3. Read the libprocess pid file.
//
// Returns an Error if the tasks cannot be listed, a task fails to
// recover, or the forked pid cannot be parsed. A pid file that cannot
// be read yields an Error when `strict` is set; otherwise the failure
// is logged and the state recovered so far is returned. An empty pid
// file is logged and the state recovered so far is returned.
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const UUID& uuid,
    bool strict)
{
  RunState state;
  state.id = uuid;
  string message;

  // Find the tasks.
  const Try<list<string> >& tasks = os::glob(strings::format(
      paths::TASK_PATH,
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      uuid.toString(),
      "*").get());

  if (tasks.isError()) {
    return Error("Failed to find tasks for executor run " + uuid.toString() +
                 ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(os::basename(path).get());

    const Try<TaskState>& task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
  }

  // Read the forked pid.
  string path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor's forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      return state;
    }
  }

  if (pid.get().empty()) {
    // An empty file is what is left behind if the slave died after
    // opening the file for writing but before checkpointing anything.
    // Without this guard `numify` fails on the empty string and the
    // whole recovery is aborted, even in non-strict mode.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor's libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      return state;
    }
  }

  if (pid.get().empty()) {
    // Same partially-written-checkpoint case as above: do not build a
    // UPID from an empty string, leave `libprocessPid` untouched.
    LOG(WARNING)
      << "Found empty executor libprocess pid file '" << path << "'";
    return state;
  }

  state.libprocessPid = process::UPID(pid.get());

  return state;
}