// This test verifies that the pending future returned by
// 'Authenticator::authenticate()' is properly failed when the Authenticator is
// destructed in the middle of authentication.
TYPED_TEST(CRAMMD5Authentication, AuthenticatorDestructionRace)
{
  // Launch a dummy process (somebody to send the AuthenticateMessage).
  UPID pid = spawn(new ProcessBase(), true);

  Credential credential1;
  credential1.set_principal("benh");
  credential1.set_secret("secret");

  Credentials credentials;
  Credential* credential2 = credentials.add_credentials();
  credential2->set_principal(credential1.principal());
  credential2->set_secret(credential1.secret());

  secrets::load(credentials);

  Future<Message> message =
    FUTURE_MESSAGE(Eq(AuthenticateMessage().GetTypeName()), _, _);

  Try<Authenticatee*> authenticatee = TypeParam::TypeAuthenticatee::create();
  CHECK_SOME(authenticatee);

  Future<bool> client =
    authenticatee.get()->authenticate(pid, UPID(), credential1);

  AWAIT_READY(message);

  Try<Authenticator*> authenticator = TypeParam::TypeAuthenticator::create();
  CHECK_SOME(authenticator);

  authenticator.get()->initialize(message.get().from);

  // Drop the AuthenticationStepMessage from authenticator to keep
  // the authentication from getting completed.
  Future<AuthenticationStepMessage> authenticationStepMessage =
    DROP_PROTOBUF(AuthenticationStepMessage(), _, _);

  Future<Option<string>> principal = authenticator.get()->authenticate();

  AWAIT_READY(authenticationStepMessage);

  // At this point 'AuthenticatorProcess::authenticate()' has been
  // executed and its promise has been associated with the future
  // returned by 'Authenticator::authenticate()'.
  // Authentication should be pending.
  ASSERT_TRUE(principal.isPending());

  // Now delete the authenticator.
  delete authenticator.get();

  // The future should be failed at this point.
  AWAIT_FAILED(principal);

  terminate(pid);
  delete authenticatee.get();
}
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to send exited executor messages,
// we send a ShutdownMessage to the slave. Why? Because during a
// network partition, the master will remove a partitioned slave, thus
// sending its tasks to LOST. At this point, when the partition is
// removed, the slave may attempt to send exited executor messages if
// it was unaware that the master removed it. We've already
// notified frameworks that the tasks under the executors were LOST,
// so we have to have the slave shut down.
TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
{
  master::Flags masterFlags = CreateMasterFlags();
  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(
      Eq(PingSlaveMessage().GetTypeName()), _, _);
  DROP_PROTOBUFS(PongSlaveMessage(), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  Owned<MasterDetector> detector = master.get()->createDetector();
  Try<Owned<cluster::Slave>> slave =
    StartSlave(detector.get(), &containerizer);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  Future<FrameworkID> frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(FutureArg<1>(&frameworkId));

  Future<vector<Offer>> offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(frameworkId);

  AWAIT_READY(offers);
  ASSERT_NE(0u, offers.get().size());

  // Launch a task. This allows us to have the slave send an
  // ExitedExecutorMessage.
  TaskID taskId;
  taskId.set_value("1");

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->MergeFrom(taskId);
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
  task.mutable_executor()->mutable_command()->set_value("sleep 60");

  // Set up the expectations for launching the task.
  EXPECT_CALL(exec, registered(_, _, _, _));
  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  // Drop all the status updates from the slave, so that we can
  // ensure the ExitedExecutorMessage is what triggers the slave
  // shutdown.
  DROP_PROTOBUFS(StatusUpdateMessage(), _, master.get()->pid);

  driver.launchTasks(offers.get()[0].id(), {task});

  // Drop the first shutdown message from the master (simulated
  // partition) and allow the second shutdown message to pass when
  // triggered by the ExitedExecutorMessage.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  Future<TaskStatus> lostStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&lostStatus));

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  size_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
    Clock::advance(masterFlags.slave_ping_timeout);
    Clock::settle();
  }

  Clock::advance(masterFlags.slave_ping_timeout);
  Clock::settle();

  // The master will have notified the framework of the lost task.
  AWAIT_READY(lostStatus);
  EXPECT_EQ(TASK_LOST, lostStatus.get().state());

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  // Induce an ExitedExecutorMessage from the slave.
  containerizer.destroy(
      frameworkId.get(), DEFAULT_EXECUTOR_INFO.executor_id());

  // Upon receiving the message, the master will shutdown the slave.
  AWAIT_READY(shutdownMessage);

  Clock::resume();

  driver.stop();
  driver.join();
}
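// NOTE (illustrative sketch, not part of the original test suite): the
// "induce a partition" loop above is repeated verbatim in several of the
// partition tests below. Assuming the same libprocess test helpers
// (FUTURE_MESSAGE, AWAIT_READY) and a paused clock, it could be factored
// into a hypothetical helper like the following. Gtest ASSERT macros only
// abort the helper itself, so callers would still need to check for fatal
// failures (e.g. via ASSERT_NO_FATAL_FAILURE).
static void induceSlavePartition(
    const master::Flags& masterFlags,
    Future<Message> ping) // The already-installed expectation for the first ping.
{
  size_t pings = 0;
  while (true) {
    // Wait for the master (SlaveObserver) to ping the slave; the PONGs
    // are being dropped, so each ping counts towards the timeout limit.
    AWAIT_READY(ping);
    pings++;
    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
    Clock::advance(masterFlags.slave_ping_timeout);
    Clock::settle();
  }

  // One final advance pushes the slave past the removal threshold.
  Clock::advance(masterFlags.slave_ping_timeout);
  Clock::settle();
}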
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to send status updates, we send
// a ShutdownMessage to the slave. Why? Because during a network
// partition, the master will remove a partitioned slave, thus sending
// its tasks to LOST. At this point, when the partition is removed,
// the slave may attempt to send updates if it was unaware that the
// master removed it. We've already notified frameworks that these
// tasks were LOST, so we have to have the slave shut down.
TEST_F(PartitionTest, PartitionedSlaveStatusUpdates)
{
  master::Flags masterFlags = CreateMasterFlags();
  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(
      Eq(PingSlaveMessage().GetTypeName()), _, _);
  DROP_PROTOBUFS(PongSlaveMessage(), _, _);

  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  Owned<MasterDetector> detector = master.get()->createDetector();
  Try<Owned<cluster::Slave>> slave =
    StartSlave(detector.get(), &containerizer);
  ASSERT_SOME(slave);

  AWAIT_READY(slaveRegisteredMessage);
  SlaveID slaveId = slaveRegisteredMessage.get().slave_id();

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  Future<FrameworkID> frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(FutureArg<1>(&frameworkId));

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(frameworkId);

  // Drop the first shutdown message from the master (simulated
  // partition), allow the second shutdown message to pass when
  // the slave sends an update.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  EXPECT_CALL(sched, offerRescinded(&driver, _))
    .WillRepeatedly(Return());

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  size_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
    Clock::advance(masterFlags.slave_ping_timeout);
    Clock::settle();
  }

  Clock::advance(masterFlags.slave_ping_timeout);
  Clock::settle();

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  // At this point, the slave still thinks it's registered, so we
  // simulate a status update coming from the slave.
  TaskID taskId;
  taskId.set_value("task_id");

  const StatusUpdate& update = protobuf::createStatusUpdate(
      frameworkId.get(),
      slaveId,
      taskId,
      TASK_RUNNING,
      TaskStatus::SOURCE_SLAVE,
      UUID::random());

  StatusUpdateMessage message;
  message.mutable_update()->CopyFrom(update);
  message.set_pid(stringify(slave.get()->pid));

  process::post(master.get()->pid, message);

  // The master should shutdown the slave upon receiving the update.
  AWAIT_READY(shutdownMessage);

  Clock::resume();

  driver.stop();
  driver.join();
}
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to re-register, we deny the
// re-registration by sending a ShutdownMessage to the slave.
// Why? Because during a network partition, the master will remove a
// partitioned slave, thus sending its tasks to LOST. At this point,
// when the partition is removed, the slave will attempt to
// re-register with its running tasks. We've already notified
// frameworks that these tasks were LOST, so we have to have the
// slave shut down.
TEST_F(PartitionTest, PartitionedSlaveReregistration)
{
  master::Flags masterFlags = CreateMasterFlags();
  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(
      Eq(PingSlaveMessage().GetTypeName()), _, _);
  DROP_PROTOBUFS(PongSlaveMessage(), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  StandaloneMasterDetector detector(master.get()->pid);
  Try<Owned<cluster::Slave>> slave = StartSlave(&detector, &containerizer);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&driver, _, _));

  Future<vector<Offer>> offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(offers);
  ASSERT_NE(0u, offers.get().size());

  // Launch a task. This is to ensure the task is killed by the slave,
  // during shutdown.
  TaskID taskId;
  taskId.set_value("1");

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->MergeFrom(taskId);
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
  task.mutable_executor()->mutable_command()->set_value("sleep 60");

  // Set up the expectations for launching the task.
  EXPECT_CALL(exec, registered(_, _, _, _));
  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  Future<TaskStatus> runningStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&runningStatus));

  Future<Nothing> statusUpdateAck = FUTURE_DISPATCH(
      slave.get()->pid, &Slave::_statusUpdateAcknowledgement);

  driver.launchTasks(offers.get()[0].id(), {task});

  AWAIT_READY(runningStatus);
  EXPECT_EQ(TASK_RUNNING, runningStatus.get().state());

  // Wait for the slave to have handled the acknowledgment prior
  // to pausing the clock.
  AWAIT_READY(statusUpdateAck);

  // Drop the first shutdown message from the master (simulated
  // partition), allow the second shutdown message to pass when
  // the slave re-registers.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  Future<TaskStatus> lostStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&lostStatus));

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  size_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
    Clock::advance(masterFlags.slave_ping_timeout);
    Clock::settle();
  }

  Clock::advance(masterFlags.slave_ping_timeout);
  Clock::settle();

  // The master will have notified the framework of the lost task.
  AWAIT_READY(lostStatus);
  EXPECT_EQ(TASK_LOST, lostStatus.get().state());

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  Clock::resume();

  // We now complete the partition on the slave side as well. This
  // is done by simulating a master loss event which would normally
  // occur during a network partition.
  detector.appoint(None());

  Future<Nothing> shutdown;
  EXPECT_CALL(exec, shutdown(_))
    .WillOnce(FutureSatisfy(&shutdown));

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  // Have the slave re-register with the master.
  detector.appoint(master.get()->pid);

  // Upon re-registration, the master will shutdown the slave.
  // The slave will then shut down the executor.
  AWAIT_READY(shutdownMessage);
  AWAIT_READY(shutdown);

  driver.stop();
  driver.join();
}
// Ensures that the driver can handle an OFFERS event.
// Note that this includes the ability to bypass the
// master when sending framework messages.
TEST_F(SchedulerDriverEventTest, Offers)
{
  Try<PID<Master>> master = StartMaster();
  ASSERT_SOME(master);

  MockScheduler sched;
  MesosSchedulerDriver schedDriver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&schedDriver, _, _));

  Future<Message> frameworkRegisteredMessage =
    FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);

  schedDriver.start();

  AWAIT_READY(frameworkRegisteredMessage);
  UPID frameworkPid = frameworkRegisteredMessage.get().to;

  // Start a slave and capture the offers.
  Future<ResourceOffersMessage> resourceOffersMessage =
    DROP_PROTOBUF(ResourceOffersMessage(), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  Try<PID<Slave>> slave = StartSlave(&exec);
  ASSERT_SOME(slave);

  AWAIT_READY(resourceOffersMessage);

  google::protobuf::RepeatedPtrField<Offer> offers =
    resourceOffersMessage.get().offers();
  ASSERT_EQ(1, offers.size());

  // Ignore future offer messages.
  DROP_PROTOBUFS(ResourceOffersMessage(), _, _);

  // Send the offers event and expect a 'resourceOffers' call.
  Event event;
  event.set_type(Event::OFFERS);
  event.mutable_offers()->mutable_offers()->CopyFrom(offers);

  Future<Nothing> resourceOffers;
  EXPECT_CALL(sched, resourceOffers(&schedDriver, _))
    .WillOnce(FutureSatisfy(&resourceOffers));

  process::post(master.get(), frameworkPid, event);

  AWAIT_READY(resourceOffers);

  // To test that the framework -> executor messages are
  // sent directly to the slave, launch a task and send
  // the executor a message.
  EXPECT_CALL(exec, registered(_, _, _, _));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(&schedDriver, _))
    .WillOnce(FutureArg<1>(&status));

  TaskInfo task = createTask(offers.Get(0), "", DEFAULT_EXECUTOR_ID);

  schedDriver.launchTasks(offers.Get(0).id(), {task});

  AWAIT_READY(status);
  EXPECT_EQ(TASK_RUNNING, status.get().state());

  // This message should skip the master!
  Future<FrameworkToExecutorMessage> frameworkToExecutorMessage =
    FUTURE_PROTOBUF(FrameworkToExecutorMessage(), frameworkPid, slave.get());

  Future<string> data;
  EXPECT_CALL(exec, frameworkMessage(_, _))
    .WillOnce(FutureArg<1>(&data));

  schedDriver.sendFrameworkMessage(
      DEFAULT_EXECUTOR_ID, offers.Get(0).slave_id(), "hello");

  AWAIT_READY(frameworkToExecutorMessage);
  AWAIT_EXPECT_EQ("hello", data);

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  schedDriver.stop();
  schedDriver.join();

  Shutdown();
}
// This test verifies that a re-registering slave sends the terminal
// unacknowledged tasks for a terminal executor. This is required
// for the master to correctly reconcile its view with the slave's
// view of tasks. This test drops a terminal update to the master
// and then forces the slave to re-register.
TEST_F(MasterSlaveReconciliationTest, SlaveReregisterTerminatedExecutor)
{
  Try<Owned<cluster::Master>> master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  StandaloneMasterDetector detector(master.get()->pid);
  Try<Owned<cluster::Slave>> slave = StartSlave(&detector, &containerizer);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  Future<FrameworkID> frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(FutureArg<1>(&frameworkId));

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 512, "*"))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  ExecutorDriver* execDriver;
  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&status));

  Future<StatusUpdateAcknowledgementMessage>
    statusUpdateAcknowledgementMessage = FUTURE_PROTOBUF(
        StatusUpdateAcknowledgementMessage(),
        master.get()->pid,
        slave.get()->pid);

  driver.start();

  AWAIT_READY(status);
  EXPECT_EQ(TASK_RUNNING, status.get().state());

  // Make sure the acknowledgement reaches the slave.
  AWAIT_READY(statusUpdateAcknowledgementMessage);

  // Drop the TASK_FINISHED status update sent to the master.
  Future<StatusUpdateMessage> statusUpdateMessage =
    DROP_PROTOBUF(StatusUpdateMessage(), _, master.get()->pid);

  Future<ExitedExecutorMessage> executorExitedMessage =
    FUTURE_PROTOBUF(ExitedExecutorMessage(), _, _);

  TaskStatus finishedStatus;
  finishedStatus = status.get();
  finishedStatus.set_state(TASK_FINISHED);
  execDriver->sendStatusUpdate(finishedStatus);

  // Ensure the update was sent.
  AWAIT_READY(statusUpdateMessage);

  EXPECT_CALL(sched, executorLost(&driver, DEFAULT_EXECUTOR_ID, _, _));

  // Now kill the executor.
  containerizer.destroy(frameworkId.get(), DEFAULT_EXECUTOR_ID);

  Future<TaskStatus> status2;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&status2));

  // We drop the 'UpdateFrameworkMessage' from the master to slave to
  // stop the status update manager from retrying the update that was
  // already sent due to the new master detection.
  DROP_PROTOBUFS(UpdateFrameworkMessage(), _, _);

  detector.appoint(master.get()->pid);

  AWAIT_READY(status2);
  EXPECT_EQ(TASK_FINISHED, status2.get().state());

  driver.stop();
  driver.join();
}
// This test verifies that when the slave re-registers, the master
// does not send TASK_LOST update for a task that has reached terminal
// state but is waiting for an acknowledgement.
TEST_F(MasterSlaveReconciliationTest, SlaveReregisterTerminalTask)
{
  Try<Owned<cluster::Master>> master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  StandaloneMasterDetector detector(master.get()->pid);
  Try<Owned<cluster::Slave>> slave = StartSlave(&detector, &containerizer);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&driver, _, _));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  driver.start();

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());

  TaskInfo task;
  task.set_name("test task");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);

  EXPECT_CALL(exec, registered(_, _, _, _));

  // Send a terminal update right away.
  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_FINISHED));

  // Drop the status update from slave to the master, so that
  // the slave has a pending terminal update when it re-registers.
  DROP_PROTOBUF(StatusUpdateMessage(), _, master.get()->pid);

  Future<Nothing> _statusUpdate = FUTURE_DISPATCH(_, &Slave::_statusUpdate);

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&status))
    .WillRepeatedly(Return()); // Ignore retried update due to update framework.

  driver.launchTasks(offers.get()[0].id(), {task});

  AWAIT_READY(_statusUpdate);

  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
    FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);

  // Simulate a spurious master change event (e.g., due to ZooKeeper
  // expiration) at the slave to force re-registration.
  detector.appoint(master.get()->pid);

  AWAIT_READY(slaveReregisteredMessage);

  // The master should not send a TASK_LOST after the slave
  // re-registers. We check this by calling Clock::settle() so that
  // the only update the scheduler receives is the retried
  // TASK_FINISHED update.
  // NOTE: The status update manager resends the status update when
  // it detects a new master.
  Clock::pause();
  Clock::settle();

  AWAIT_READY(status);
  ASSERT_EQ(TASK_FINISHED, status.get().state());

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();
}
// This test verifies that the master reconciles tasks that are
// missing from a re-registering slave. In this case, we trigger
// a race between the slave re-registration message and the launch
// message. There should be no TASK_LOST.
// This was motivated by MESOS-1696.
TEST_F(MasterSlaveReconciliationTest, ReconcileRace)
{
  Try<Owned<cluster::Master>> master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  StandaloneMasterDetector detector(master.get()->pid);

  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
    FUTURE_PROTOBUF(SlaveRegisteredMessage(), master.get()->pid, _);

  Try<Owned<cluster::Slave>> slave = StartSlave(&detector, &containerizer);
  ASSERT_SOME(slave);

  AWAIT_READY(slaveRegisteredMessage);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&driver, _, _));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  driver.start();

  // Since the agent may have retried registration, we want to
  // ensure that any duplicate registrations are flushed before
  // we appoint the master again. Otherwise, the agent may
  // receive a stale registration message.
  Clock::pause();
  Clock::settle();
  Clock::resume();

  // Trigger a re-registration of the slave and capture the message
  // so that we can spoof a race with a launch task message.
  DROP_PROTOBUFS(
      ReregisterSlaveMessage(), slave.get()->pid, master.get()->pid);

  Future<ReregisterSlaveMessage> reregisterSlaveMessage = DROP_PROTOBUF(
      ReregisterSlaveMessage(), slave.get()->pid, master.get()->pid);

  detector.appoint(master.get()->pid);

  AWAIT_READY(reregisterSlaveMessage);

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());

  TaskInfo task;
  task.set_name("test task");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);

  ExecutorDriver* executorDriver;
  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&executorDriver));

  // Leave the task in TASK_STAGING.
  Future<Nothing> launchTask;
  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(FutureSatisfy(&launchTask));

  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .Times(0);

  driver.launchTasks(offers.get()[0].id(), {task});

  AWAIT_READY(launchTask);

  // Send the stale re-registration message, which does not contain
  // the task we just launched. This will trigger a reconciliation
  // by the master.
  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
    FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);

  // Prevent this from being dropped per the DROP_PROTOBUFS above.
  FUTURE_PROTOBUF(
      ReregisterSlaveMessage(), slave.get()->pid, master.get()->pid);

  process::post(
      slave.get()->pid, master.get()->pid, reregisterSlaveMessage.get());

  AWAIT_READY(slaveReregisteredMessage);

  // Neither the master nor the slave should send a TASK_LOST
  // as part of the reconciliation. We check this by calling
  // Clock::settle() to flush all pending events.
  Clock::pause();
  Clock::settle();
  Clock::resume();

  // Now send TASK_FINISHED and make sure it's the only message
  // received by the scheduler.
  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&status));

  TaskStatus taskStatus;
  taskStatus.mutable_task_id()->CopyFrom(task.task_id());
  taskStatus.set_state(TASK_FINISHED);
  executorDriver->sendStatusUpdate(taskStatus);

  AWAIT_READY(status);
  ASSERT_EQ(TASK_FINISHED, status.get().state());

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();
}
// This test verifies that the master reconciles tasks that are
// missing from a re-registering slave. In this case, we drop the
// RunTaskMessage so the slave should send TASK_LOST.
TEST_F(MasterSlaveReconciliationTest, ReconcileLostTask)
{
  Try<Owned<cluster::Master>> master = StartMaster();
  ASSERT_SOME(master);

  StandaloneMasterDetector detector(master.get()->pid);
  Try<Owned<cluster::Slave>> slave = StartSlave(&detector);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&driver, _, _));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  driver.start();

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());

  TaskInfo task;
  task.set_name("test task");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);

  // We now launch a task and drop the corresponding RunTaskMessage on
  // the slave, to ensure that only the master knows about this task.
  Future<RunTaskMessage> runTaskMessage =
    DROP_PROTOBUF(RunTaskMessage(), _, _);

  driver.launchTasks(offers.get()[0].id(), {task});

  AWAIT_READY(runTaskMessage);

  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
    FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);

  Future<StatusUpdateMessage> statusUpdateMessage =
    FUTURE_PROTOBUF(StatusUpdateMessage(), _, master.get()->pid);

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&status));

  // Simulate a spurious master change event (e.g., due to ZooKeeper
  // expiration) at the slave to force re-registration.
  detector.appoint(master.get()->pid);

  AWAIT_READY(slaveReregisteredMessage);

  // Make sure the slave generated the TASK_LOST.
  AWAIT_READY(statusUpdateMessage);

  AWAIT_READY(status);

  ASSERT_EQ(task.task_id(), status.get().task_id());
  ASSERT_EQ(TASK_LOST, status.get().state());

  // Before we obtain the metrics, ensure that the master has finished
  // processing the status update so metrics have been updated.
  Clock::pause();
  Clock::settle();
  Clock::resume();

  // Check metrics.
  JSON::Object stats = Metrics();
  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
  EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
  EXPECT_EQ(
      1u,
      stats.values.count(
          "master/task_lost/source_slave/reason_reconciliation"));
  EXPECT_EQ(
      1u,
      stats.values["master/task_lost/source_slave/reason_reconciliation"]);

  driver.stop();
  driver.join();
}
// This test verifies that the status update manager correctly includes
// the latest state of the task in the status update.
TEST_F(StatusUpdateManagerTest, LatestTaskState)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  Try<PID<Slave> > slave = StartSlave(&exec);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(_, _, _));

  EXPECT_CALL(sched, resourceOffers(_, _))
    .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 512, "*"))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  ExecutorDriver* execDriver;
  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  // Signal when the first update is dropped.
  Future<StatusUpdateMessage> statusUpdateMessage =
    DROP_PROTOBUF(StatusUpdateMessage(), _, master.get());

  Future<Nothing> __statusUpdate = FUTURE_DISPATCH(_, &Slave::__statusUpdate);

  driver.start();

  // Wait until TASK_RUNNING is sent to the master.
  AWAIT_READY(statusUpdateMessage);

  // Ensure the status update manager handles the TASK_RUNNING update.
  AWAIT_READY(__statusUpdate);

  // Pause the clock to prevent the status update manager from retrying.
  Clock::pause();

  Future<Nothing> __statusUpdate2 = FUTURE_DISPATCH(_, &Slave::__statusUpdate);

  // Now send TASK_FINISHED update.
  TaskStatus finishedStatus;
  finishedStatus = statusUpdateMessage.get().update().status();
  finishedStatus.set_state(TASK_FINISHED);
  execDriver->sendStatusUpdate(finishedStatus);

  // Ensure the status update manager handles the TASK_FINISHED update.
  AWAIT_READY(__statusUpdate2);

  // Signal when the second update is dropped.
  Future<StatusUpdateMessage> statusUpdateMessage2 =
    DROP_PROTOBUF(StatusUpdateMessage(), _, master.get());

  // Advance the clock for the status update manager to send a retry.
  Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL_MIN);

  AWAIT_READY(statusUpdateMessage2);

  // The update should correspond to TASK_RUNNING.
  ASSERT_EQ(
      TASK_RUNNING, statusUpdateMessage2.get().update().status().state());

  // The update should include TASK_FINISHED as the latest state.
  ASSERT_EQ(TASK_FINISHED, statusUpdateMessage2.get().update().latest_state());

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();

  Shutdown();
}
// This test verifies that the slave and status update manager
// properly handle duplicate status updates, when the second
// update with the same UUID is received before the ACK for the
// first update. The proper behavior here is for the status update
// manager to drop the duplicate update.
TEST_F(StatusUpdateManagerTest, DuplicateUpdateBeforeAck)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  Try<PID<Slave> > slave = StartSlave(&exec);
  ASSERT_SOME(slave);

  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
  frameworkInfo.set_checkpoint(true); // Enable checkpointing.

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL);

  FrameworkID frameworkId;
  EXPECT_CALL(sched, registered(_, _, _))
    .WillOnce(SaveArg<1>(&frameworkId));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(_, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  driver.start();

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());

  ExecutorDriver* execDriver;
  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  // Capture the first status update message.
  Future<StatusUpdateMessage> statusUpdateMessage =
    FUTURE_PROTOBUF(StatusUpdateMessage(), _, _);

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(_, _))
    .WillOnce(FutureArg<1>(&status));

  // Drop the first ACK from the scheduler to the slave.
  Future<StatusUpdateAcknowledgementMessage> statusUpdateAckMessage =
    DROP_PROTOBUF(StatusUpdateAcknowledgementMessage(), _, slave.get());

  Clock::pause();

  driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0]));

  AWAIT_READY(statusUpdateMessage);

  AWAIT_READY(status);
  EXPECT_EQ(TASK_RUNNING, status.get().state());

  AWAIT_READY(statusUpdateAckMessage);

  Future<Nothing> __statusUpdate =
    FUTURE_DISPATCH(slave.get(), &Slave::__statusUpdate);

  // Now resend the TASK_RUNNING update.
  process::post(slave.get(), statusUpdateMessage.get());

  // At this point the status update manager has handled
  // the duplicate status update.
  AWAIT_READY(__statusUpdate);

  // After we advance the clock, the status update manager should
  // retry the TASK_RUNNING update and the scheduler should receive
  // and acknowledge it.
  Future<TaskStatus> update;
  EXPECT_CALL(sched, statusUpdate(_, _))
    .WillOnce(FutureArg<1>(&update));

  Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL_MIN);
  Clock::settle();

  // Ensure the scheduler receives the retried TASK_RUNNING update.
  AWAIT_READY(update);
  EXPECT_EQ(TASK_RUNNING, update.get().state());

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  Clock::resume();

  driver.stop();
  driver.join();

  Shutdown();
}
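// Conceptual sketch only (this is NOT the slave's actual implementation):
// the dedup behavior exercised above amounts to remembering the UUIDs of
// updates that are still awaiting an ACK and ignoring any re-delivery
// carrying a UUID that has already been seen. The type and member names
// below are hypothetical, for illustration.
#include <set>
#include <string>

struct PendingUpdates
{
  std::set<std::string> uuids; // UUIDs of updates awaiting an ACK.

  // Returns true if the update is new and should be forwarded;
  // false if it is a duplicate that should be dropped.
  bool accept(const std::string& uuid)
  {
    return uuids.insert(uuid).second;
  }
};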
// This test verifies that the status update manager ignores a
// duplicate ACK for an earlier update when it is waiting
// for an ACK for a later update. This could happen when the
// duplicate ACK is for a retried update.
TEST_F(StatusUpdateManagerTest, IgnoreDuplicateStatusUpdateAck)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  Try<PID<Slave> > slave = StartSlave(&exec);
  ASSERT_SOME(slave);

  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
  frameworkInfo.set_checkpoint(true); // Enable checkpointing.

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL);

  FrameworkID frameworkId;
  EXPECT_CALL(sched, registered(_, _, _))
    .WillOnce(SaveArg<1>(&frameworkId));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(_, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  driver.start();

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());

  ExecutorDriver* execDriver;
  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  // Drop the first update, so that status update manager
  // resends the update.
  Future<StatusUpdateMessage> statusUpdateMessage =
    DROP_PROTOBUF(StatusUpdateMessage(), master.get(), _);

  Clock::pause();

  driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0]));

  AWAIT_READY(statusUpdateMessage);
  StatusUpdate update = statusUpdateMessage.get().update();

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(_, _))
    .WillOnce(FutureArg<1>(&status));

  // This is the ACK for the retried update.
  Future<Nothing> ack =
    FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement);

  Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL_MIN);

  AWAIT_READY(status);
  EXPECT_EQ(TASK_RUNNING, status.get().state());

  AWAIT_READY(ack);

  // Now send TASK_FINISHED update so that the status update manager
  // is waiting for its ACK, which it never gets because we drop the
  // update.
  DROP_PROTOBUFS(StatusUpdateMessage(), master.get(), _);

  Future<Nothing> update2 = FUTURE_DISPATCH(_, &Slave::_statusUpdate);

  TaskStatus status2 = status.get();
  status2.set_state(TASK_FINISHED);

  execDriver->sendStatusUpdate(status2);

  AWAIT_READY(update2);

  // This is to catch the duplicate ack for TASK_RUNNING.
  Future<Nothing> duplicateAck =
    FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement);

  // Now send a duplicate ACK for the TASK_RUNNING update.
  process::dispatch(
      slave.get(),
      &Slave::statusUpdateAcknowledgement,
      master.get(),
      update.slave_id(),
      frameworkId,
      update.status().task_id(),
      update.uuid());

  AWAIT_READY(duplicateAck);

  Clock::resume();

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();

  Shutdown();
}
// This test verifies that the status update manager retries a status
// update when it does not receive an ACK before the retry interval
// elapses.
TEST_F(StatusUpdateManagerTest, RetryStatusUpdate)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  slave::Flags flags = CreateSlaveFlags();

  Try<PID<Slave> > slave = StartSlave(&exec, flags);
  ASSERT_SOME(slave);

  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
  frameworkInfo.set_checkpoint(true); // Enable checkpointing.

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(_, _, _))
    .Times(1);

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(_, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  driver.start();

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());

  EXPECT_CALL(exec, registered(_, _, _, _))
    .Times(1);

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  Future<StatusUpdateMessage> statusUpdateMessage =
    DROP_PROTOBUF(StatusUpdateMessage(), master.get(), _);

  Clock::pause();

  driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0]));

  AWAIT_READY(statusUpdateMessage);

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(_, _))
    .WillOnce(FutureArg<1>(&status));

  Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL_MIN);

  AWAIT_READY(status);

  EXPECT_EQ(TASK_RUNNING, status.get().state());

  Clock::resume();

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();

  Shutdown();
}