void statusUpdate(const StatusUpdate& update, const UPID& pid) { const TaskStatus& status = update.status(); VLOG(1) << "Status update: task " << status.task_id() << " of framework " << update.framework_id() << " is now in state " << status.state(); CHECK(frameworkId == update.framework_id()); // TODO(benh): Note that this maybe a duplicate status update! // Once we get support to try and have a more consistent view // of what's running in the cluster, we'll just let this one // slide. The alternative is possibly dealing with a scheduler // failover and not correctly giving the scheduler it's status // update, which seems worse than giving a status update // multiple times (of course, if a scheduler re-uses a TaskID, // that could be bad. invoke(bind(&Scheduler::statusUpdate, sched, driver, cref(status))); if (pid) { // Acknowledge the message (we do this last, after we invoked // the scheduler, if we did at all, in case it causes a crash, // since this way the message might get resent/routed after the // scheduler comes back online). StatusUpdateAcknowledgementMessage message; message.mutable_framework_id()->MergeFrom(frameworkId); message.mutable_slave_id()->MergeFrom(update.slave_id()); message.mutable_task_id()->MergeFrom(status.task_id()); message.set_uuid(update.uuid()); send(pid, message); } }
void statusUpdateAcknowledgement(const StatusUpdate& update, const UPID& pid) { if (aborted) { VLOG(1) << "Not sending status update acknowledgment message because " << "the driver is aborted!"; return; } VLOG(2) << "Sending ACK for status update " << update << " to " << pid; StatusUpdateAcknowledgementMessage message; message.mutable_framework_id()->MergeFrom(framework.id()); message.mutable_slave_id()->MergeFrom(update.slave_id()); message.mutable_task_id()->MergeFrom(update.status().task_id()); message.set_uuid(update.uuid()); send(pid, message); }
// This test verifies that the status update manager ignores an
// unexpected ACK for an earlier update when it is waiting
// for an ACK for another update. We do this by dropping ACKs
// for the original update and sending a random ACK to the slave.
TEST_F(StatusUpdateManagerTest, IgnoreUnexpectedStatusUpdateAck)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  slave::Flags flags = CreateSlaveFlags();
  flags.checkpoint = true;

  Try<PID<Slave> > slave = StartSlave(&exec, flags);
  ASSERT_SOME(slave);

  FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line.
  frameworkInfo = DEFAULT_FRAMEWORK_INFO;
  frameworkInfo.set_checkpoint(true); // Enable checkpointing.

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL);

  // Remember the framework id; it is needed below to build the bogus ACK.
  FrameworkID frameworkId;
  EXPECT_CALL(sched, registered(_, _, _))
    .WillOnce(SaveArg<1>(&frameworkId));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(_, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(_, _))
    .WillOnce(FutureArg<1>(&status));

  driver.start();

  AWAIT_READY(offers);

  // This has to be fatal: the code below indexes offers.get()[0], which
  // must not be evaluated when no offer was received.
  ASSERT_NE(0u, offers.get().size());

  ExecutorDriver* execDriver;
  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  Future<StatusUpdateMessage> statusUpdateMessage =
    FUTURE_PROTOBUF(StatusUpdateMessage(), master.get(), _);

  // Drop the ACKs, so that status update manager
  // retries the update.
  DROP_PROTOBUFS(StatusUpdateAcknowledgementMessage(), _, _);

  driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0]));

  AWAIT_READY(statusUpdateMessage);
  StatusUpdate update = statusUpdateMessage.get().update();

  AWAIT_READY(status);

  EXPECT_EQ(TASK_RUNNING, status.get().state());

  // Set up before dispatching so that the handling of the bogus ACK
  // below is observed.
  Future<Nothing> unexpectedAck =
    FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement);

  // Now send an ACK with a random UUID.
  process::dispatch(
      slave.get(),
      &Slave::statusUpdateAcknowledgement,
      update.slave_id(),
      frameworkId,
      update.status().task_id(),
      UUID::random().toBytes());

  AWAIT_READY(unexpectedAck);

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();

  Shutdown();
}
// This test verifies that the status update manager ignores a
// duplicate ACK for an earlier update when it is waiting
// for an ACK for a later update. This could happen when the
// duplicate ACK is for a retried update.
TEST_F(StatusUpdateManagerTest, IgnoreDuplicateStatusUpdateAck)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  slave::Flags flags = CreateSlaveFlags();
  flags.checkpoint = true;

  Try<PID<Slave> > slave = StartSlave(&exec, flags);
  ASSERT_SOME(slave);

  FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line.
  frameworkInfo = DEFAULT_FRAMEWORK_INFO;
  frameworkInfo.set_checkpoint(true); // Enable checkpointing.

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL);

  // Remember the framework id; it is needed below to build the
  // duplicate ACK.
  FrameworkID frameworkId;
  EXPECT_CALL(sched, registered(_, _, _))
    .WillOnce(SaveArg<1>(&frameworkId));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(_, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return()); // Ignore subsequent offers.

  driver.start();

  AWAIT_READY(offers);

  // This has to be fatal: the code below indexes offers.get()[0], which
  // must not be evaluated when no offer was received.
  ASSERT_NE(0u, offers.get().size());

  // Saved so that the test can send a second status update through the
  // executor driver further down.
  ExecutorDriver* execDriver;
  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  // Drop the first update, so that status update manager
  // resends the update.
  Future<StatusUpdateMessage> statusUpdateMessage =
    DROP_PROTOBUF(StatusUpdateMessage(), master.get(), _);

  Clock::pause();

  driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0]));

  AWAIT_READY(statusUpdateMessage);
  StatusUpdate update = statusUpdateMessage.get().update();

  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(_, _))
    .WillOnce(FutureArg<1>(&status));

  // This is the ACK for the retried update.
  Future<Nothing> ack =
    FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement);

  // The clock is paused, so advance it by the retry interval.
  Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL);

  AWAIT_READY(status);

  EXPECT_EQ(TASK_RUNNING, status.get().state());

  AWAIT_READY(ack);

  // Now send TASK_FINISHED update so that the status update manager
  // is waiting for its ACK, which it never gets because we drop the
  // update.
  DROP_PROTOBUFS(StatusUpdateMessage(), master.get(), _);

  Future<Nothing> update2 = FUTURE_DISPATCH(_, &Slave::_statusUpdate);

  TaskStatus status2 = status.get();
  status2.set_state(TASK_FINISHED);

  execDriver->sendStatusUpdate(status2);

  AWAIT_READY(update2);

  // This is to catch the duplicate ack for TASK_RUNNING.
  Future<Nothing> duplicateAck =
    FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement);

  // Now send a duplicate ACK for the TASK_RUNNING update.
  process::dispatch(
      slave.get(),
      &Slave::statusUpdateAcknowledgement,
      update.slave_id(),
      frameworkId,
      update.status().task_id(),
      update.uuid());

  AWAIT_READY(duplicateAck);

  Clock::resume();

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();

  Shutdown();
}