void ChapleScheduler::terminateAllTasks(SchedulerDriver* driver) {
  for (map<string, TaskInfo>::iterator i = launchedTsks.begin();
       i != launchedTsks.end();
       i++) {
    cout << "\tChapel Task " << i->first << " notified to terminate" << endl;
    TaskID tid;
    tid.set_value(i->first);
    driver->killTask(tid);
  }
}
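// A minimal sketch (assumed, not from the original source) of the
// statusUpdate() callback that would pair with terminateAllTasks() above:
// once a killed task reaches a terminal state, it is dropped from
// launchedTsks. Only the launchedTsks member is taken from the snippet
// above; the rest is illustrative.
void ChapleScheduler::statusUpdate(SchedulerDriver* driver,
                                   const TaskStatus& status) {
  const string& taskId = status.task_id().value();

  if (status.state() == TASK_KILLED ||
      status.state() == TASK_FINISHED ||
      status.state() == TASK_FAILED ||
      status.state() == TASK_LOST) {
    cout << "\tChapel Task " << taskId << " reached a terminal state" << endl;
    launchedTsks.erase(taskId);
  }
}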
void killTask(ExecutorDriver* driver, const TaskID& taskId)
{
  LOG(INFO) << "Received killTask for task " << taskId.value();

  // Using shutdown grace period as a default is backwards compatible
  // with the `stop_timeout` flag, deprecated in 1.0.
  Duration gracePeriod = shutdownGracePeriod;

  if (killPolicy.isSome() && killPolicy->has_grace_period()) {
    gracePeriod = Nanoseconds(killPolicy->grace_period().nanoseconds());
  }

  killTask(driver, taskId, gracePeriod);
}
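// A small illustrative sketch (assumed, not part of the snippet above): the
// kill policy that killTask() consults is supplied by the framework on the
// TaskInfo. Setting a custom grace period looks roughly like this, and is
// what killPolicy->grace_period().nanoseconds() reads back.
TaskInfo task;
task.set_name("task-with-kill-policy");
task.mutable_task_id()->set_value("1");
task.mutable_kill_policy()
  ->mutable_grace_period()
  ->set_nanoseconds(Seconds(30).ns());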
jobject convert(JNIEnv* env, const TaskID& taskId)
{
  string data;
  taskId.SerializeToString(&data);

  // byte[] data = ..;
  jbyteArray jdata = env->NewByteArray(data.size());
  env->SetByteArrayRegion(jdata, 0, data.size(), (jbyte*) data.data());

  // TaskID taskId = TaskID.parseFrom(data);
  jclass clazz = FindMesosClass(env, "org/apache/mesos/Protos$TaskID");

  jmethodID parseFrom = env->GetStaticMethodID(
      clazz, "parseFrom", "([B)Lorg/apache/mesos/Protos$TaskID;");

  jobject jtaskId = env->CallStaticObjectMethod(clazz, parseFrom, jdata);

  return jtaskId;
}
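// A hedged sketch of the reverse direction (assumed here for illustration;
// it is not part of the snippet above): rebuilding a native TaskID from a
// Java Protos.TaskID object by asking the Java side for its serialized
// bytes and parsing them back with protobuf.
TaskID construct(JNIEnv* env, jobject jtaskId)
{
  jclass clazz = env->GetObjectClass(jtaskId);

  // byte[] data = jtaskId.toByteArray();
  jmethodID toByteArray = env->GetMethodID(clazz, "toByteArray", "()[B");
  jbyteArray jdata = (jbyteArray) env->CallObjectMethod(jtaskId, toByteArray);

  jbyte* data = env->GetByteArrayElements(jdata, nullptr);
  jsize length = env->GetArrayLength(jdata);

  TaskID taskId;
  taskId.ParseFromArray(data, length);

  env->ReleaseByteArrayElements(jdata, data, JNI_ABORT);

  return taskId;
}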
void killTask(ExecutorDriver* driver, const TaskID& taskId) override
{
  const string& ti = taskId.value();
  pid_t pid;

  {
    lock_guard<mutex> lock(TaskId2PidLock);

    auto iter = TaskId2Pid.find(ti);
    if (iter == TaskId2Pid.end()) {
      LOG(WARNING) << "unknown task id '" << ti << "'";
      return;
    }

    pid = iter->second;
  }

  // TODO(fc) be graceful.
  kill(pid, SIGKILL);
}
inline bool operator==(const TaskID& left, const std::string& right)
{
  return left.value() == right;
}
inline std::size_t hash_value(const TaskID& taskId)
{
  std::size_t seed = 0;
  boost::hash_combine(seed, taskId.value());
  return seed;
}
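// A brief usage sketch (assumed for illustration): with hash_value() above
// and an equality operator for TaskID, boost's unordered containers can key
// directly on TaskID, since boost::hash picks up hash_value() via
// argument-dependent lookup.
#include <boost/unordered_map.hpp>

boost::unordered_map<TaskID, TaskInfo> launched;

void remember(const TaskInfo& task)
{
  launched[task.task_id()] = task;
}

bool isLaunched(const TaskID& taskId)
{
  return launched.count(taskId) > 0;
}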
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to send exited executor messages,
// we send a ShutdownMessage to the slave. Why? Because during a
// network partition, the master will remove a partitioned slave, thus
// sending its tasks to LOST. At this point, when the partition is
// removed, the slave may attempt to send exited executor messages if
// it was unaware that the master removed it. We've already notified
// frameworks that the tasks under the executors were LOST, so we have
// to have the slave shut down.
TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(Eq("PING"), _, _);
  DROP_MESSAGES(Eq("PONG"), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  Try<PID<Slave> > slave = StartSlave(&containerizer);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);

  Future<FrameworkID> frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(FutureArg<1>(&frameworkId));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(frameworkId);
  AWAIT_READY(offers);
  ASSERT_NE(0u, offers.get().size());

  // Launch a task. This allows us to have the slave send an
  // ExitedExecutorMessage.
  TaskID taskId;
  taskId.set_value("1");

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->MergeFrom(taskId);
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
  task.mutable_executor()->mutable_command()->set_value("sleep 60");

  vector<TaskInfo> tasks;
  tasks.push_back(task);

  // Set up the expectations for launching the task.
  EXPECT_CALL(exec, registered(_, _, _, _));
  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  // Drop all the status updates from the slave, so that we can
  // ensure the ExitedExecutorMessage is what triggers the slave
  // shutdown.
  DROP_PROTOBUFS(StatusUpdateMessage(), _, master.get());

  driver.launchTasks(offers.get()[0].id(), tasks);

  // Drop the first shutdown message from the master (simulated
  // partition) and allow the second shutdown message to pass when
  // triggered by the ExitedExecutorMessage.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get());

  Future<TaskStatus> lostStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&lostStatus));

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  uint32_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq("PING"), _, _);
    Clock::advance(master::SLAVE_PING_TIMEOUT);
    Clock::settle();
  }

  Clock::advance(master::SLAVE_PING_TIMEOUT);
  Clock::settle();

  // The master will have notified the framework of the lost task.
  AWAIT_READY(lostStatus);
  EXPECT_EQ(TASK_LOST, lostStatus.get().state());

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get());

  // Induce an ExitedExecutorMessage from the slave.
  containerizer.destroy(
      frameworkId.get(), DEFAULT_EXECUTOR_INFO.executor_id());

  // Upon receiving the message, the master will shutdown the slave.
  AWAIT_READY(shutdownMessage);

  Clock::resume();

  driver.stop();
  driver.join();

  Shutdown();
}
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to re-register, we deny the
// re-registration by sending a ShutdownMessage to the slave.
// Why? Because during a network partition, the master will remove a
// partitioned slave, thus sending its tasks to LOST. At this point,
// when the partition is removed, the slave will attempt to
// re-register with its running tasks. We've already notified
// frameworks that these tasks were LOST, so we have to have the
// slave shut down.
TEST_F(PartitionTest, PartitionedSlaveReregistration)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(Eq("PING"), _, _);
  DROP_MESSAGES(Eq("PONG"), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  StandaloneMasterDetector detector(master.get());

  Try<PID<Slave> > slave = StartSlave(&exec, &detector);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&driver, _, _));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(offers);
  ASSERT_NE(0u, offers.get().size());

  // Launch a task. This is to ensure the task is killed by the slave,
  // during shutdown.
  TaskID taskId;
  taskId.set_value("1");

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->MergeFrom(taskId);
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
  task.mutable_executor()->mutable_command()->set_value("sleep 60");

  vector<TaskInfo> tasks;
  tasks.push_back(task);

  // Set up the expectations for launching the task.
  EXPECT_CALL(exec, registered(_, _, _, _));
  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  Future<TaskStatus> runningStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&runningStatus));

  Future<Nothing> statusUpdateAck = FUTURE_DISPATCH(
      slave.get(), &Slave::_statusUpdateAcknowledgement);

  driver.launchTasks(offers.get()[0].id(), tasks);

  AWAIT_READY(runningStatus);
  EXPECT_EQ(TASK_RUNNING, runningStatus.get().state());

  // Wait for the slave to have handled the acknowledgment prior
  // to pausing the clock.
  AWAIT_READY(statusUpdateAck);

  // Drop the first shutdown message from the master (simulated
  // partition), allow the second shutdown message to pass when
  // the slave re-registers.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get());

  Future<TaskStatus> lostStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&lostStatus));

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  uint32_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq("PING"), _, _);
    Clock::advance(master::SLAVE_PING_TIMEOUT);
    Clock::settle();
  }

  Clock::advance(master::SLAVE_PING_TIMEOUT);
  Clock::settle();

  // The master will have notified the framework of the lost task.
  AWAIT_READY(lostStatus);
  EXPECT_EQ(TASK_LOST, lostStatus.get().state());

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  Clock::resume();

  // We now complete the partition on the slave side as well. This
  // is done by simulating a master loss event which would normally
  // occur during a network partition.
  detector.appoint(None());

  Future<Nothing> shutdown;
  EXPECT_CALL(exec, shutdown(_))
    .WillOnce(FutureSatisfy(&shutdown));

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get());

  // Have the slave re-register with the master.
  detector.appoint(master.get());

  // Upon re-registration, the master will shutdown the slave.
  // The slave will then shut down the executor.
  AWAIT_READY(shutdownMessage);
  AWAIT_READY(shutdown);

  driver.stop();
  driver.join();

  Shutdown();
}
// This test ensures we don't break the API when it comes to JSON
// representation of tasks.
TEST(HTTPTest, ModelTask)
{
  TaskID taskId;
  taskId.set_value("t");

  SlaveID slaveId;
  slaveId.set_value("s");

  ExecutorID executorId;
  executorId.set_value("t");

  FrameworkID frameworkId;
  frameworkId.set_value("f");

  TaskState state = TASK_RUNNING;

  vector<TaskStatus> statuses;

  TaskStatus status;
  status.mutable_task_id()->CopyFrom(taskId);
  status.set_state(state);
  status.mutable_slave_id()->CopyFrom(slaveId);
  status.mutable_executor_id()->CopyFrom(executorId);
  status.set_timestamp(0.0);

  statuses.push_back(status);

  Labels labels;
  labels.add_labels()->CopyFrom(createLabel("ACTION", "port:7987 DENY"));

  Ports ports;
  Port* port = ports.add_ports();
  port->set_number(80);
  port->mutable_labels()->CopyFrom(labels);

  DiscoveryInfo discovery;
  discovery.set_visibility(DiscoveryInfo::CLUSTER);
  discovery.set_name("discover");
  discovery.mutable_ports()->CopyFrom(ports);

  TaskInfo taskInfo;
  taskInfo.set_name("task");
  taskInfo.mutable_task_id()->CopyFrom(taskId);
  taskInfo.mutable_slave_id()->CopyFrom(slaveId);
  taskInfo.mutable_command()->set_value("echo hello");
  taskInfo.mutable_discovery()->CopyFrom(discovery);

  Task task = createTask(taskInfo, state, frameworkId);
  task.add_statuses()->CopyFrom(statuses[0]);

  JSON::Value object = model(task);

  Try<JSON::Value> expected = JSON::parse(
      "{"
      " \"executor_id\":\"\","
      " \"framework_id\":\"f\","
      " \"id\":\"t\","
      " \"name\":\"task\","
      " \"resources\":"
      " {"
      "   \"cpus\":0,"
      "   \"disk\":0,"
      "   \"gpus\":0,"
      "   \"mem\":0"
      " },"
      " \"slave_id\":\"s\","
      " \"state\":\"TASK_RUNNING\","
      " \"statuses\":"
      " ["
      "   {"
      "     \"state\":\"TASK_RUNNING\","
      "     \"timestamp\":0"
      "   }"
      " ],"
      " \"discovery\":"
      " {"
      "   \"name\":\"discover\","
      "   \"ports\":"
      "   {"
      "     \"ports\":"
      "     ["
      "       {"
      "         \"number\":80,"
      "         \"labels\":"
      "         {"
      "           \"labels\":"
      "           ["
      "             {"
      "               \"key\":\"ACTION\","
      "               \"value\":\"port:7987 DENY\""
      "             }"
      "           ]"
      "         }"
      "       }"
      "     ]"
      "   },"
      "   \"visibility\":\"CLUSTER\""
      " }"
      "}");

  ASSERT_SOME(expected);
  EXPECT_EQ(expected.get(), object);
}
TEST(MasterTest, KillTask)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  SimpleAllocator a;
  Master m(&a);
  PID<Master> master = process::spawn(&m);

  MockExecutor exec;

  trigger killTaskCall, shutdownCall;

  EXPECT_CALL(exec, registered(_, _, _, _, _, _))
    .Times(1);

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdate(TASK_RUNNING));

  EXPECT_CALL(exec, killTask(_, _))
    .WillOnce(Trigger(&killTaskCall));

  EXPECT_CALL(exec, shutdown(_))
    .WillOnce(Trigger(&shutdownCall));

  map<ExecutorID, Executor*> execs;
  execs[DEFAULT_EXECUTOR_ID] = &exec;

  TestingIsolationModule isolationModule(execs);

  Resources resources = Resources::parse("cpus:2;mem:1024");

  Slave s(resources, true, &isolationModule);
  PID<Slave> slave = process::spawn(&s);

  BasicMasterDetector detector(master, slave, true);

  MockScheduler sched;
  MesosSchedulerDriver driver(&sched, "", DEFAULT_EXECUTOR_INFO, master);

  vector<Offer> offers;
  TaskStatus status;

  trigger resourceOffersCall, statusUpdateCall;

  EXPECT_CALL(sched, registered(&driver, _))
    .Times(1);

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&offers),
                    Trigger(&resourceOffersCall)))
    .WillRepeatedly(Return());

  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&status), Trigger(&statusUpdateCall)));

  driver.start();

  WAIT_UNTIL(resourceOffersCall);

  EXPECT_NE(0, offers.size());

  TaskID taskId;
  taskId.set_value("1");

  TaskDescription task;
  task.set_name("");
  task.mutable_task_id()->MergeFrom(taskId);
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());
  task.mutable_resources()->MergeFrom(offers[0].resources());

  vector<TaskDescription> tasks;
  tasks.push_back(task);

  driver.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(statusUpdateCall);

  EXPECT_EQ(TASK_RUNNING, status.state());

  driver.killTask(taskId);

  WAIT_UNTIL(killTaskCall);

  driver.stop();
  driver.join();

  // To ensure we can deallocate MockExecutor.
  WAIT_UNTIL(shutdownCall);

  process::terminate(slave);
  process::wait(slave);

  process::terminate(master);
  process::wait(master);
}
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool strict,
    bool rebooted)
{
  RunState state;
  state.id = containerId;
  string message;

  // See if the sentinel file exists. This is done first so it is
  // known even if partial state is returned, e.g., if the libprocess
  // pid file is not recovered. It indicates the slave removed the
  // executor.
  string path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  state.completed = os::exists(path);

  // Find the tasks.
  Try<list<string>> tasks = paths::getTaskPaths(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (tasks.isError()) {
    return Error(
        "Failed to find tasks for executor run " + containerId.value() +
        ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(Path(path).basename());

    Try<TaskState> task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, containerId, taskId,
        strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task->errors;
  }

  path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  // If the agent host is rebooted, we do not read the forked pid and
  // libprocess pid since both are obsolete after the reboot. We also remove
  // the forked pid file to make sure we will not read it if the agent
  // process is restarted after we checkpoint the new boot ID in
  // `Slave::__recover` (i.e., agent recovery is done after the reboot).
  if (rebooted) {
    if (os::exists(path)) {
      Try<Nothing> rm = os::rm(path);
      if (rm.isError()) {
        return Error(
            "Failed to remove executor forked pid file '" + path + "': " +
            rm.error());
      }
    }

    return state;
  }

  if (!os::exists(path)) {
    // This could happen if the slave died before the containerizer
    // checkpointed the forked pid, or if the agent process is restarted
    // after the agent host is rebooted, since we remove this file in the
    // code above.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  // Read the forked pid.
  Result<string> pid = state::read<string>(path);

  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid->empty()) {
    // This could happen if the slave is hard rebooted after the file is
    // created but before the data is synced on disk.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid '" + pid.get() + "' "
                 "from pid file '" + path + "': " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (os::exists(path)) {
    pid = state::read<string>(path);

    if (pid.isError()) {
      message = "Failed to read executor libprocess pid from '" + path +
                "': " + pid.error();

      if (strict) {
        return Error(message);
      } else {
        LOG(WARNING) << message;
        state.errors++;
        return state;
      }
    }

    if (pid->empty()) {
      // This could happen if the slave is hard rebooted after the file is
      // created but before the data is synced on disk.
      LOG(WARNING) << "Found empty executor libprocess pid file '" << path
                   << "'";
      return state;
    }

    state.libprocessPid = process::UPID(pid.get());
    state.http = false;

    return state;
  }

  path = paths::getExecutorHttpMarkerPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  // The marker could be absent if the slave died before the executor
  // registered with the slave.
  if (!os::exists(path)) {
    LOG(WARNING) << "Failed to find '" << paths::LIBPROCESS_PID_FILE
                 << "' or '" << paths::HTTP_MARKER_FILE << "' for container "
                 << containerId << " of executor '" << executorId
                 << "' of framework " << frameworkId;
    return state;
  }

  state.http = true;

  return state;
}
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool strict)
{
  RunState state;
  state.id = containerId;
  string message;

  // See if the sentinel file exists. This is done first so it is
  // known even if partial state is returned, e.g., if the libprocess
  // pid file is not recovered. It indicates the slave removed the
  // executor.
  string path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  state.completed = os::exists(path);

  // Find the tasks.
  Try<list<string> > tasks = paths::getTaskPaths(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (tasks.isError()) {
    return Error(
        "Failed to find tasks for executor run " + containerId.value() +
        ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(Path(path).basename());

    Try<TaskState> task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, containerId, taskId,
        strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task.get().errors;
  }

  // Read the forked pid.
  path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (!os::exists(path)) {
    // This could happen if the slave died before the isolator
    // checkpointed the forked pid.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() + ": " +
                 forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (!os::exists(path)) {
    // This could happen if the slave died before the executor
    // registered with the slave.
    LOG(WARNING) << "Failed to find executor libprocess pid file '"
                 << path << "'";
    return state;
  }

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor libprocess pid file '"
                 << path << "'";
    return state;
  }

  state.libprocessPid = process::UPID(pid.get());

  return state;
}
int main(int argc, char** argv)
{
  GOOGLE_PROTOBUF_VERIFY_VERSION;

  Flags flags;
  Try<flags::Warnings> load = flags.load(None(), argc, argv);

  if (load.isError()) {
    cerr << flags.usage(load.error()) << endl;
    return EXIT_FAILURE;
  }

  if (flags.help) {
    cout << flags.usage() << endl;
    return EXIT_SUCCESS;
  }

  // Log any flag warnings.
  foreach (const flags::Warning& warning, load->warnings) {
    LOG(WARNING) << warning.message;
  }

  if (flags.health_check_json.isNone()) {
    cerr << flags.usage("Expected JSON with health check description") << endl;
    return EXIT_FAILURE;
  }

  Try<JSON::Object> parse =
    JSON::parse<JSON::Object>(flags.health_check_json.get());

  if (parse.isError()) {
    cerr << flags.usage("Failed to parse --health_check_json: " +
                        parse.error())
         << endl;
    return EXIT_FAILURE;
  }

  Try<HealthCheck> check = protobuf::parse<HealthCheck>(parse.get());
  if (check.isError()) {
    cerr << flags.usage("Failed to parse --health_check_json: " +
                        check.error())
         << endl;
    return EXIT_FAILURE;
  }

  if (flags.executor.isNone()) {
    cerr << flags.usage("Missing required option --executor") << endl;
    return EXIT_FAILURE;
  }

  if (check.get().has_http() && check.get().has_command()) {
    cerr << flags.usage("Both 'http' and 'command' health check requested")
         << endl;
    return EXIT_FAILURE;
  }

  if (!check.get().has_http() && !check.get().has_command()) {
    cerr << flags.usage("Expecting one of 'http' or 'command' health check")
         << endl;
    return EXIT_FAILURE;
  }

  if (flags.task_id.isNone()) {
    cerr << flags.usage("Missing required option --task_id") << endl;
    return EXIT_FAILURE;
  }

  TaskID taskID;
  taskID.set_value(flags.task_id.get());

  mesos::internal::HealthCheckerProcess process(
      check.get(),
      flags.executor.get(),
      taskID);

  process::spawn(&process);

  process::Future<Nothing> checking = process::dispatch(
      process, &mesos::internal::HealthCheckerProcess::healthCheck);

  checking.await();

  process::terminate(process);
  process::wait(process);

  if (checking.isFailed()) {
    LOG(WARNING) << "Health check failed: " << checking.failure();
    return EXIT_FAILURE;
  }

  return EXIT_SUCCESS;
}
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const UUID& uuid,
    bool strict)
{
  RunState state;
  state.id = uuid;
  string message;

  // Find the tasks.
  const Try<list<string> >& tasks = os::glob(strings::format(
      paths::TASK_PATH,
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      uuid.toString(),
      "*").get());

  if (tasks.isError()) {
    return Error("Failed to find tasks for executor run " + uuid.toString() +
                 ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(os::basename(path).get());

    const Try<TaskState>& task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
  }

  // Read the forked pid.
  string path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor's forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      return state;
    }
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() + ": " +
                 forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor's libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      return state;
    }
  }

  state.libprocessPid = process::UPID(pid.get());

  return state;
}
TEST_F(FaultToleranceTest, ForwardStatusUpdateUnknownExecutor)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  Try<PID<Slave> > slave = StartSlave(&exec);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(&sched, DEFAULT_FRAMEWORK_INFO, master.get());

  FrameworkID frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(SaveArg<1>(&frameworkId));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers));

  driver.start();

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());

  Offer offer = offers.get()[0];

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offer.slave_id());
  task.mutable_resources()->MergeFrom(offer.resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);

  vector<TaskInfo> tasks;
  tasks.push_back(task);

  Future<Nothing> statusUpdate;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureSatisfy(&statusUpdate)); // TASK_RUNNING of task1.

  EXPECT_CALL(exec, registered(_, _, _, _));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  driver.launchTasks(offer.id(), tasks);

  // Wait until TASK_RUNNING of task1 is received.
  AWAIT_READY(statusUpdate);

  // Simulate the slave receiving a status update from an unknown
  // (e.g., exited) executor of the given framework.
  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&status)); // TASK_RUNNING of task2.

  TaskID taskId;
  taskId.set_value("task2");

  StatusUpdate statusUpdate2 = createStatusUpdate(
      frameworkId, offer.slave_id(), taskId, TASK_RUNNING, "Dummy update");

  process::dispatch(slave.get(), &Slave::statusUpdate, statusUpdate2);

  // Ensure that the scheduler receives task2's update.
  AWAIT_READY(status);
  EXPECT_EQ(taskId, status.get().task_id());
  EXPECT_EQ(TASK_RUNNING, status.get().state());

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();

  Shutdown();
}
inline bool operator<(const TaskID& left, const TaskID& right)
{
  return left.value() < right.value();
}
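// A brief usage sketch (assumed for illustration): operator< above orders
// TaskIDs by their string value, so TaskID works as a key in ordered
// standard containers without a custom comparator.
#include <map>
#include <set>

std::map<TaskID, TaskInfo> activeTasks;
std::set<TaskID> terminatedTasks;

void markTerminated(const TaskID& taskId)
{
  activeTasks.erase(taskId);
  terminatedTasks.insert(taskId);
}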
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const UUID& uuid,
    bool strict)
{
  RunState state;
  state.id = uuid;
  string message;

  // Find the tasks.
  const Try<list<string> >& tasks = os::glob(strings::format(
      paths::TASK_PATH,
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      uuid.toString(),
      "*").get());

  if (tasks.isError()) {
    return Error("Failed to find tasks for executor run " + uuid.toString() +
                 ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(os::basename(path).get());

    const Try<TaskState>& task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task.get().errors;
  }

  // Read the forked pid.
  string path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  if (!os::exists(path)) {
    // This could happen if the slave died before the isolator
    // checkpointed the forked pid.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() + ": " +
                 forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  if (!os::exists(path)) {
    // This could happen if the slave died before the executor
    // registered with the slave.
    LOG(WARNING) << "Failed to find executor libprocess pid file '"
                 << path << "'";
    return state;
  }

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor libprocess pid file '"
                 << path << "'";
    return state;
  }

  state.libprocessPid = process::UPID(pid.get());

  // See if the sentinel file exists.
  path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  state.completed = os::exists(path);

  return state;
}
// Ensures the scheduler driver can handle the UPDATE event.
TEST_F(SchedulerDriverEventTest, Update)
{
  Try<PID<Master>> master = StartMaster();
  ASSERT_SOME(master);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&driver, _, _));

  Future<Message> frameworkRegisteredMessage =
    FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);

  driver.start();

  AWAIT_READY(frameworkRegisteredMessage);
  UPID frameworkPid = frameworkRegisteredMessage.get().to;

  FrameworkRegisteredMessage message;
  ASSERT_TRUE(message.ParseFromString(frameworkRegisteredMessage.get().body));

  FrameworkID frameworkId = message.framework_id();

  SlaveID slaveId;
  slaveId.set_value("S");

  TaskID taskId;
  taskId.set_value("T");

  ExecutorID executorId;
  executorId.set_value("E");

  // Generate an update that needs no acknowledgement.
  Event event;
  event.set_type(Event::UPDATE);
  event.mutable_update()->mutable_status()->CopyFrom(
      protobuf::createStatusUpdate(
          frameworkId,
          slaveId,
          taskId,
          TASK_RUNNING,
          TaskStatus::SOURCE_MASTER,
          None(),
          "message",
          None(),
          executorId).status());

  Future<Nothing> statusUpdate;
  Future<Nothing> statusUpdate2;
  EXPECT_CALL(sched, statusUpdate(&driver, event.update().status()))
    .WillOnce(FutureSatisfy(&statusUpdate))
    .WillOnce(FutureSatisfy(&statusUpdate2));

  process::post(master.get(), frameworkPid, event);

  AWAIT_READY(statusUpdate);

  // Generate an update that requires acknowledgement.
  event.mutable_update()->mutable_status()->set_uuid(UUID::random().toBytes());

  Future<mesos::scheduler::Call> acknowledgement = DROP_CALL(
      mesos::scheduler::Call(), mesos::scheduler::Call::ACKNOWLEDGE, _, _);

  process::post(master.get(), frameworkPid, event);

  AWAIT_READY(statusUpdate2);
  AWAIT_READY(acknowledgement);
}
// This test ensures we don't break the API when it comes to JSON
// representation of tasks. Also, we want to ensure that tasks are
// modeled the same way when using 'Task' vs. 'TaskInfo'.
TEST(HTTP, ModelTask)
{
  TaskID taskId;
  taskId.set_value("t");

  SlaveID slaveId;
  slaveId.set_value("s");

  ExecutorID executorId;
  executorId.set_value("t");

  FrameworkID frameworkId;
  frameworkId.set_value("f");

  TaskState state = TASK_RUNNING;

  vector<TaskStatus> statuses;

  TaskStatus status;
  status.mutable_task_id()->CopyFrom(taskId);
  status.set_state(state);
  status.mutable_slave_id()->CopyFrom(slaveId);
  status.mutable_executor_id()->CopyFrom(executorId);
  status.set_timestamp(0.0);

  statuses.push_back(status);

  TaskInfo task;
  task.set_name("task");
  task.mutable_task_id()->CopyFrom(taskId);
  task.mutable_slave_id()->CopyFrom(slaveId);
  task.mutable_command()->set_value("echo hello");

  Task task_ = protobuf::createTask(task, state, frameworkId);
  task_.add_statuses()->CopyFrom(statuses[0]);

  JSON::Value object = model(task, frameworkId, state, statuses);
  JSON::Value object_ = model(task_);

  Try<JSON::Value> expected = JSON::parse(
      "{"
      " \"executor_id\":\"\","
      " \"framework_id\":\"f\","
      " \"id\":\"t\","
      " \"name\":\"task\","
      " \"resources\":"
      " {"
      "   \"cpus\":0,"
      "   \"disk\":0,"
      "   \"mem\":0"
      " },"
      " \"slave_id\":\"s\","
      " \"state\":\"TASK_RUNNING\","
      " \"statuses\":"
      " ["
      "   {"
      "     \"state\":\"TASK_RUNNING\","
      "     \"timestamp\":0"
      "   }"
      " ]"
      "}");

  ASSERT_SOME(expected);
  EXPECT_EQ(expected.get(), object);
  EXPECT_EQ(expected.get(), object_);

  // Ensure both are modeled the same.
  EXPECT_EQ(object, object_);
}
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to send status updates, we send
// a ShutdownMessage to the slave. Why? Because during a network
// partition, the master will remove a partitioned slave, thus sending
// its tasks to LOST. At this point, when the partition is removed,
// the slave may attempt to send updates if it was unaware that the
// master removed it. We've already notified frameworks that these
// tasks were LOST, so we have to have the slave shut down.
TEST_F(PartitionTest, PartitionedSlaveStatusUpdates)
{
  master::Flags masterFlags = CreateMasterFlags();
  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(
      Eq(PingSlaveMessage().GetTypeName()), _, _);
  DROP_PROTOBUFS(PongSlaveMessage(), _, _);

  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  Owned<MasterDetector> detector = master.get()->createDetector();

  Try<Owned<cluster::Slave>> slave =
    StartSlave(detector.get(), &containerizer);
  ASSERT_SOME(slave);

  AWAIT_READY(slaveRegisteredMessage);
  SlaveID slaveId = slaveRegisteredMessage.get().slave_id();

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  Future<FrameworkID> frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(FutureArg<1>(&frameworkId));

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(frameworkId);

  // Drop the first shutdown message from the master (simulated
  // partition), allow the second shutdown message to pass when
  // the slave sends an update.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  EXPECT_CALL(sched, offerRescinded(&driver, _))
    .WillRepeatedly(Return());

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  size_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
    Clock::advance(masterFlags.slave_ping_timeout);
    Clock::settle();
  }

  Clock::advance(masterFlags.slave_ping_timeout);
  Clock::settle();

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  // At this point, the slave still thinks it's registered, so we
  // simulate a status update coming from the slave.
  TaskID taskId;
  taskId.set_value("task_id");

  const StatusUpdate& update = protobuf::createStatusUpdate(
      frameworkId.get(),
      slaveId,
      taskId,
      TASK_RUNNING,
      TaskStatus::SOURCE_SLAVE,
      UUID::random());

  StatusUpdateMessage message;
  message.mutable_update()->CopyFrom(update);
  message.set_pid(stringify(slave.get()->pid));

  process::post(master.get()->pid, message);

  // The master should shutdown the slave upon receiving the update.
  AWAIT_READY(shutdownMessage);

  Clock::resume();

  driver.stop();
  driver.join();
}