// This test checks that a failed over scheduler gets the retried status update // when the original instance dies without acknowledging the update. TEST_F(HttpFaultToleranceTest, SchedulerFailoverStatusUpdate) { master::Flags flags = CreateMasterFlags(); flags.authenticate_frameworks = false; Try<Owned<cluster::Master>> master = StartMaster(flags); ASSERT_SOME(master); auto scheduler = std::make_shared<v1::MockHTTPScheduler>(); auto executor = std::make_shared<v1::MockHTTPExecutor>(); ExecutorID executorId = DEFAULT_EXECUTOR_ID; TestContainerizer containerizer(executorId, executor); Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), &containerizer); ASSERT_SOME(slave); Future<Nothing> connected; EXPECT_CALL(*scheduler, connected(_)) .WillOnce(FutureSatisfy(&connected)) .WillRepeatedly(Return()); // Ignore future invocations. ContentType contentType = ContentType::PROTOBUF; v1::scheduler::TestMesos schedulerLibrary( master.get()->pid, contentType, scheduler); AWAIT_READY(connected); Future<Event::Subscribed> subscribed; EXPECT_CALL(*scheduler, subscribed(_, _)) .WillOnce(FutureArg<1>(&subscribed)); EXPECT_CALL(*scheduler, heartbeat(_)) .WillRepeatedly(Return()); // Ignore heartbeats. Future<Event::Offers> offers; EXPECT_CALL(*scheduler, offers(_, _)) .WillOnce(FutureArg<1>(&offers)); { Call call; call.set_type(Call::SUBSCRIBE); Call::Subscribe* subscribe = call.mutable_subscribe(); subscribe->mutable_framework_info()->CopyFrom(v1::DEFAULT_FRAMEWORK_INFO); schedulerLibrary.send(call); } AWAIT_READY(subscribed); v1::FrameworkID frameworkId(subscribed->framework_id()); AWAIT_READY(offers); EXPECT_NE(0, offers->offers().size()); EXPECT_CALL(*executor, connected(_)) .WillOnce(v1::executor::SendSubscribe(frameworkId, evolve(executorId))); EXPECT_CALL(*executor, subscribed(_, _)); EXPECT_CALL(*executor, launch(_, _)) .WillOnce(v1::executor::SendUpdateFromTask( frameworkId, evolve(executorId), v1::TASK_RUNNING)); Future<Nothing> acknowledged; EXPECT_CALL(*executor, acknowledged(_, _)) .WillOnce(FutureSatisfy(&acknowledged)); Future<Event::Update> update; EXPECT_CALL(*scheduler, update(_, _)) .WillOnce(FutureArg<1>(&update)); const v1::Offer& offer = offers->offers(0); v1::TaskInfo taskInfo = evolve(createTask(devolve(offer), "", executorId)); { Call call; call.mutable_framework_id()->CopyFrom(frameworkId); call.set_type(Call::ACCEPT); Call::Accept* accept = call.mutable_accept(); accept->add_offer_ids()->CopyFrom(offer.id()); v1::Offer::Operation* operation = accept->add_operations(); operation->set_type(v1::Offer::Operation::LAUNCH); operation->mutable_launch()->add_task_infos()->CopyFrom(taskInfo); schedulerLibrary.send(call); } AWAIT_READY(acknowledged); AWAIT_READY(update); EXPECT_EQ(v1::TASK_RUNNING, update->status().state()); EXPECT_EQ(executorId, devolve(update->status().executor_id())); EXPECT_TRUE(update->status().has_executor_id()); EXPECT_TRUE(update->status().has_uuid()); // Failover the scheduler without acknowledging the status update. auto scheduler2 = std::make_shared<v1::MockHTTPScheduler>(); Future<Nothing> connected2; EXPECT_CALL(*scheduler2, connected(_)) .WillOnce(FutureSatisfy(&connected2)); // Failover to another scheduler instance. v1::scheduler::TestMesos schedulerLibrary2( master.get()->pid, contentType, scheduler2); AWAIT_READY(connected2); // The previously connected scheduler instance should receive an // error/disconnected event. Future<Nothing> error; EXPECT_CALL(*scheduler, error(_, _)) .WillOnce(FutureSatisfy(&error)); Future<Nothing> disconnected; EXPECT_CALL(*scheduler, disconnected(_)) .WillOnce(FutureSatisfy(&disconnected)); EXPECT_CALL(*scheduler2, subscribed(_, _)) .WillOnce(FutureArg<1>(&subscribed)); EXPECT_CALL(*scheduler2, heartbeat(_)) .WillRepeatedly(Return()); // Ignore heartbeats. // Scheduler2 should receive the retried status update. Future<Nothing> update2; EXPECT_CALL(*scheduler2, update(_, _)) .WillOnce(FutureSatisfy(&update2)) .WillRepeatedly(Return()); // Ignore subsequent updates. { Call call; call.mutable_framework_id()->CopyFrom(frameworkId); call.set_type(Call::SUBSCRIBE); Call::Subscribe* subscribe = call.mutable_subscribe(); subscribe->mutable_framework_info()->CopyFrom(v1::DEFAULT_FRAMEWORK_INFO); subscribe->mutable_framework_info()->mutable_id()->CopyFrom(frameworkId); schedulerLibrary2.send(call); } AWAIT_READY(error); AWAIT_READY(disconnected); AWAIT_READY(subscribed); EXPECT_EQ(frameworkId, subscribed->framework_id()); Clock::pause(); // Now advance time enough for the reliable timeout to kick in and // another status update to be sent. Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL_MIN); AWAIT_READY(update2); EXPECT_CALL(*executor, shutdown(_)) .Times(AtMost(1)); EXPECT_CALL(*executor, disconnected(_)) .Times(AtMost(1)); }
// The master reconciles operations that are missing from a re-registering // agent. // // In this case, the `ApplyOperationMessage` is dropped, so the agent should // respond with a OPERATION_DROPPED operation status update. // // This test verifies that if an operation ID is set, the framework receives // the OPERATION_DROPPED operation status update. // // This is a regression test for MESOS-8784. TEST_F( MasterSlaveReconciliationTest, ForwardOperationDroppedAfterExplicitReconciliation) { Clock::pause(); mesos::internal::master::Flags masterFlags = CreateMasterFlags(); Try<Owned<cluster::Master>> master = StartMaster(masterFlags); ASSERT_SOME(master); Future<UpdateSlaveMessage> updateSlaveMessage = FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); auto detector = std::make_shared<StandaloneMasterDetector>(master.get()->pid); mesos::internal::slave::Flags slaveFlags = CreateSlaveFlags(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags); ASSERT_SOME(slave); // Advance the clock to trigger agent registration. Clock::advance(slaveFlags.registration_backoff_factor); // Wait for the agent to register. AWAIT_READY(updateSlaveMessage); // Start and register a resource provider. v1::ResourceProviderInfo resourceProviderInfo; resourceProviderInfo.set_type("org.apache.mesos.rp.test"); resourceProviderInfo.set_name("test"); v1::Resource disk = v1::createDiskResource( "200", "*", None(), None(), v1::createDiskSourceRaw()); Owned<v1::MockResourceProvider> resourceProvider( new v1::MockResourceProvider(resourceProviderInfo, v1::Resources(disk))); // Make the mock resource provider answer to reconciliation events with // OPERATION_DROPPED operation status updates. auto reconcileOperations = [&resourceProvider]( const v1::resource_provider::Event::ReconcileOperations& reconcile) { foreach (const v1::UUID& operationUuid, reconcile.operation_uuids()) { v1::resource_provider::Call call; call.set_type(v1::resource_provider::Call::UPDATE_OPERATION_STATUS); call.mutable_resource_provider_id()->CopyFrom( resourceProvider->info.id()); v1::resource_provider::Call::UpdateOperationStatus* updateOperationStatus = call.mutable_update_operation_status(); updateOperationStatus->mutable_status()->set_state( v1::OPERATION_DROPPED); updateOperationStatus->mutable_operation_uuid()->CopyFrom( operationUuid); resourceProvider->send(call); } }; EXPECT_CALL(*resourceProvider, reconcileOperations(_)) .WillOnce(Invoke(reconcileOperations)); Owned<EndpointDetector> endpointDetector( mesos::internal::tests::resource_provider::createEndpointDetector( slave.get()->pid)); updateSlaveMessage = FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); // NOTE: We need to resume the clock so that the resource provider can // fully register. Clock::resume(); ContentType contentType = ContentType::PROTOBUF; resourceProvider->start(endpointDetector, contentType); // Wait until the agent's resources have been updated to include the // resource provider resources. AWAIT_READY(updateSlaveMessage); Clock::pause(); // Start a v1 framework. auto scheduler = std::make_shared<v1::MockHTTPScheduler>(); v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_roles(0, DEFAULT_TEST_ROLE); EXPECT_CALL(*scheduler, connected(_)) .WillOnce(v1::scheduler::SendSubscribe(frameworkInfo)); Future<v1::scheduler::Event::Subscribed> subscribed; EXPECT_CALL(*scheduler, subscribed(_, _)) .WillOnce(FutureArg<1>(&subscribed)); // Ignore heartbeats. EXPECT_CALL(*scheduler, heartbeat(_)) .WillRepeatedly(Return()); Future<v1::scheduler::Event::Offers> offers; EXPECT_CALL(*scheduler, offers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(v1::scheduler::DeclineOffers()); v1::scheduler::TestMesos mesos(master.get()->pid, contentType, scheduler); AWAIT_READY(subscribed); v1::FrameworkID frameworkId(subscribed->framework_id()); AWAIT_READY(offers); ASSERT_FALSE(offers->offers().empty()); const v1::Offer& offer = offers->offers(0); // We'll drop the `ApplyOperationMessage` from the master to the agent. Future<ApplyOperationMessage> applyOperationMessage = DROP_PROTOBUF(ApplyOperationMessage(), master.get()->pid, _); v1::Resources resources = v1::Resources(offer.resources()).filter([](const v1::Resource& resource) { return resource.has_provider_id(); }); ASSERT_FALSE(resources.empty()); v1::Resource reserved = *(resources.begin()); reserved.add_reservations()->CopyFrom( v1::createDynamicReservationInfo( frameworkInfo.roles(0), DEFAULT_CREDENTIAL.principal())); v1::OperationID operationId; operationId.set_value("operation"); mesos.send(v1::createCallAccept( frameworkId, offer, {v1::RESERVE(reserved, operationId.value())})); AWAIT_READY(applyOperationMessage); Future<v1::scheduler::Event::UpdateOperationStatus> operationDroppedUpdate; EXPECT_CALL(*scheduler, updateOperationStatus(_, _)) .WillOnce(FutureArg<1>(&operationDroppedUpdate)); // Simulate a spurious master change event (e.g., due to ZooKeeper // expiration) at the slave to force re-registration. detector->appoint(master.get()->pid); // Advance the clock, so that the agent re-registers. Clock::advance(slaveFlags.registration_backoff_factor); // Wait for the framework to receive the OPERATION_DROPPED update. AWAIT_READY(operationDroppedUpdate); EXPECT_EQ(operationId, operationDroppedUpdate->status().operation_id()); EXPECT_EQ(v1::OPERATION_DROPPED, operationDroppedUpdate->status().state()); }