// This test checks that a scheduler gets a slave lost // message for a partitioned slave. TEST_F(PartitionTest, PartitionedSlave) { master::Flags masterFlags = CreateMasterFlags(); Try<Owned<cluster::Master>> master = StartMaster(masterFlags); ASSERT_SOME(master); // Set these expectations up before we spawn the slave so that we // don't miss the first PING. Future<Message> ping = FUTURE_MESSAGE( Eq(PingSlaveMessage().GetTypeName()), _, _); // Drop all the PONGs to simulate slave partition. DROP_PROTOBUFS(PongSlaveMessage(), _, _); Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get()); ASSERT_SOME(slave); MockScheduler sched; MesosSchedulerDriver driver( &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(&driver, _, _)); Future<Nothing> resourceOffers; EXPECT_CALL(sched, resourceOffers(&driver, _)) .WillOnce(FutureSatisfy(&resourceOffers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); // Need to make sure the framework AND slave have registered with // master. Waiting for resource offers should accomplish both. AWAIT_READY(resourceOffers); Clock::pause(); EXPECT_CALL(sched, offerRescinded(&driver, _)) .Times(AtMost(1)); Future<Nothing> slaveLost; EXPECT_CALL(sched, slaveLost(&driver, _)) .WillOnce(FutureSatisfy(&slaveLost)); // Now advance through the PINGs. size_t pings = 0; while (true) { AWAIT_READY(ping); pings++; if (pings == masterFlags.max_slave_ping_timeouts) { break; } ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _); Clock::advance(masterFlags.slave_ping_timeout); } Clock::advance(masterFlags.slave_ping_timeout); AWAIT_READY(slaveLost); slave.get()->terminate(); slave->reset(); JSON::Object stats = Metrics(); EXPECT_EQ(1, stats.values["master/slave_removals"]); EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]); driver.stop(); driver.join(); Clock::resume(); }
// Ensures that the driver can handle the SUBSCRIBED event // after a master failover. TEST_F(SchedulerDriverEventTest, SubscribedMasterFailover) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_failover_timeout(Weeks(2).secs()); // Make sure the initial registration calls 'registered'. MockScheduler sched; StandaloneMasterDetector detector(master.get()->pid); TestingMesosSchedulerDriver driver(&sched, &detector, frameworkInfo); // Intercept the registration message, send a SUBSCRIBED instead. Future<Message> frameworkRegisteredMessage = DROP_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _); // Ensure that there will be no (re-)registration retries // from the scheduler driver. Clock::pause(); driver.start(); AWAIT_READY(frameworkRegisteredMessage); UPID frameworkPid = frameworkRegisteredMessage.get().to; FrameworkRegisteredMessage message; ASSERT_TRUE(message.ParseFromString(frameworkRegisteredMessage.get().body)); FrameworkID frameworkId = message.framework_id(); frameworkInfo.mutable_id()->CopyFrom(frameworkId); Event event; event.set_type(Event::SUBSCRIBED); event.mutable_subscribed()->mutable_framework_id()->CopyFrom(frameworkId); Future<Nothing> registered; EXPECT_CALL(sched, registered(&driver, frameworkId, _)) .WillOnce(FutureSatisfy(®istered)); process::post(master.get()->pid, frameworkPid, event); AWAIT_READY(registered); EXPECT_CALL(sched, disconnected(&driver)); // Fail over the master and expect a 'reregistered' call. // Note that the master sends a registered message for // this case (see MESOS-786). master->reset(); master = StartMaster(); ASSERT_SOME(master); frameworkRegisteredMessage = DROP_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _); detector.appoint(master.get()->pid); AWAIT_READY(frameworkRegisteredMessage); Future<Nothing> reregistered; EXPECT_CALL(sched, reregistered(&driver, _)) .WillOnce(FutureSatisfy(&reregistered)); process::post(master.get()->pid, frameworkPid, event); AWAIT_READY(reregistered); driver.stop(); driver.join(); }
// This test verifies that the provisioner can recover the rootfses // for the child containers if there is no image specified for its // parent container. TEST_F(ProvisionerAppcTest, RecoverNestedContainerNoParentImage) { slave::Flags flags; flags.image_providers = "APPC"; flags.appc_store_dir = path::join(os::getcwd(), "store"); flags.image_provisioner_backend = COPY_BACKEND; flags.work_dir = path::join(sandbox.get(), "work_dir"); Try<Owned<Provisioner>> provisioner = Provisioner::create(flags); ASSERT_SOME(provisioner); Try<string> createImage = createTestImage( flags.appc_store_dir, getManifest()); ASSERT_SOME(createImage); // Recover. This is when the image in the store is loaded. AWAIT_READY(provisioner.get()->recover({})); Image image; image.mutable_appc()->CopyFrom(getTestImage()); ContainerID parent; ContainerID child; parent.set_value(UUID::random().toString()); child.set_value(UUID::random().toString()); child.mutable_parent()->CopyFrom(parent); AWAIT_READY(provisioner.get()->provision(child, image)); provisioner->reset(); // Create a new provisioner to recover the state from the container. provisioner = Provisioner::create(flags); ASSERT_SOME(provisioner); AWAIT_READY(provisioner.get()->recover({parent, child})); AWAIT_READY(provisioner.get()->provision(child, image)); const string provisionerDir = slave::paths::getProvisionerDir(flags.work_dir); string containerDir = slave::provisioner::paths::getContainerDir( provisionerDir, child); Future<bool> destroy = provisioner.get()->destroy(child); AWAIT_READY(destroy); EXPECT_TRUE(destroy.get()); EXPECT_FALSE(os::exists(containerDir)); containerDir = slave::provisioner::paths::getContainerDir( provisionerDir, parent); destroy = provisioner.get()->destroy(parent); AWAIT_READY(destroy); EXPECT_TRUE(destroy.get()); EXPECT_FALSE(os::exists(containerDir)); }
// This test verifies that, after a master failover, reconciliation of an // operation that is still pending on an agent results in `OPERATION_PENDING`. TEST_P(OperationReconciliationTest, AgentPendingOperationAfterMasterFailover) { Clock::pause(); mesos::internal::master::Flags masterFlags = CreateMasterFlags(); Try<Owned<cluster::Master>> master = StartMaster(masterFlags); ASSERT_SOME(master); Future<UpdateSlaveMessage> updateSlaveMessage = FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); auto detector = std::make_shared<StandaloneMasterDetector>(master.get()->pid); mesos::internal::slave::Flags slaveFlags = CreateSlaveFlags(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags); ASSERT_SOME(slave); // Advance the clock to trigger agent registration. Clock::advance(slaveFlags.registration_backoff_factor); // Wait for the agent to register. AWAIT_READY(updateSlaveMessage); // Start and register a resource provider. ResourceProviderInfo resourceProviderInfo; resourceProviderInfo.set_type("org.apache.mesos.rp.test"); resourceProviderInfo.set_name("test"); Resource disk = createDiskResource( "200", "*", None(), None(), createDiskSourceRaw(None(), "profile")); Owned<MockResourceProvider> resourceProvider( new MockResourceProvider( resourceProviderInfo, Resources(disk))); // We override the mock resource provider's default action, so the operation // will stay in `OPERATION_PENDING`. Future<resource_provider::Event::ApplyOperation> applyOperation; EXPECT_CALL(*resourceProvider, applyOperation(_)) .WillOnce(FutureArg<0>(&applyOperation)); Owned<EndpointDetector> endpointDetector( mesos::internal::tests::resource_provider::createEndpointDetector( slave.get()->pid)); updateSlaveMessage = FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); // NOTE: We need to resume the clock so that the resource provider can // fully register. Clock::resume(); ContentType contentType = GetParam(); resourceProvider->start(endpointDetector, contentType); // Wait until the agent's resources have been updated to include the // resource provider resources. AWAIT_READY(updateSlaveMessage); ASSERT_TRUE(updateSlaveMessage->has_resource_providers()); ASSERT_EQ(1, updateSlaveMessage->resource_providers().providers_size()); Clock::pause(); // Start a v1 framework. auto scheduler = std::make_shared<MockHTTPScheduler>(); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_roles(0, DEFAULT_TEST_ROLE); EXPECT_CALL(*scheduler, connected(_)) .WillOnce(scheduler::SendSubscribe(frameworkInfo)); Future<scheduler::Event::Subscribed> subscribed; EXPECT_CALL(*scheduler, subscribed(_, _)) .WillOnce(FutureArg<1>(&subscribed)); // Ignore heartbeats. EXPECT_CALL(*scheduler, heartbeat(_)) .WillRepeatedly(Return()); // Decline offers that do not contain wanted resources. EXPECT_CALL(*scheduler, offers(_, _)) .WillRepeatedly(scheduler::DeclineOffers()); Future<scheduler::Event::Offers> offers; auto isRaw = [](const Resource& r) { return r.has_disk() && r.disk().has_source() && r.disk().source().type() == Resource::DiskInfo::Source::RAW; }; EXPECT_CALL(*scheduler, offers(_, scheduler::OffersHaveAnyResource( std::bind(isRaw, lambda::_1)))) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(scheduler::DeclineOffers()); // Decline successive offers. scheduler::TestMesos mesos( master.get()->pid, contentType, scheduler, detector); AWAIT_READY(subscribed); FrameworkID frameworkId(subscribed->framework_id()); // NOTE: If the framework has not declined an unwanted offer yet when // the master updates the agent with the RAW disk resource, the new // allocation triggered by this update won't generate an allocatable // offer due to no CPU and memory resources. So here we first settle // the clock to ensure that the unwanted offer has been declined, then // advance the clock to trigger another allocation. Clock::settle(); Clock::advance(masterFlags.allocation_interval); AWAIT_READY(offers); ASSERT_FALSE(offers->offers().empty()); const Offer& offer = offers->offers(0); const AgentID& agentId = offer.agent_id(); Option<Resource> source; Option<ResourceProviderID> resourceProviderId; foreach (const Resource& resource, offer.resources()) { if (isRaw(resource)) { source = resource; ASSERT_TRUE(resource.has_provider_id()); resourceProviderId = resource.provider_id(); break; } } ASSERT_SOME(source); ASSERT_SOME(resourceProviderId); OperationID operationId; operationId.set_value("operation"); mesos.send(createCallAccept( frameworkId, offer, {CREATE_DISK( source.get(), Resource::DiskInfo::Source::MOUNT, None(), operationId)})); AWAIT_READY(applyOperation); // Simulate master failover. EXPECT_CALL(*scheduler, disconnected(_)); detector->appoint(None()); master->reset(); master = StartMaster(); ASSERT_SOME(master); // Settle the clock to ensure the master finishes recovering the registry. Clock::settle(); Future<SlaveReregisteredMessage> slaveReregistered = FUTURE_PROTOBUF( SlaveReregisteredMessage(), master.get()->pid, slave.get()->pid); updateSlaveMessage = FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); EXPECT_CALL(*scheduler, connected(_)) .WillOnce(scheduler::SendSubscribe(frameworkInfo, frameworkId)); Future<scheduler::Event::Subscribed> frameworkResubscribed; EXPECT_CALL(*scheduler, subscribed(_, _)) .WillOnce(FutureArg<1>(&frameworkResubscribed)); // Simulate a new master detected event to the agent and the scheduler. detector->appoint(master.get()->pid); // Advance the clock, so that the agent re-registers. Clock::advance(slaveFlags.registration_backoff_factor); // Resume the clock to avoid deadlocks related to agent registration. // See MESOS-8828. Clock::resume(); // Wait for the framework and agent to re-register. AWAIT_READY(slaveReregistered); AWAIT_READY(updateSlaveMessage); AWAIT_READY(frameworkResubscribed); Clock::pause(); // Test explicit reconciliation { scheduler::Call::ReconcileOperations::Operation operation; operation.mutable_operation_id()->CopyFrom(operationId); operation.mutable_agent_id()->CopyFrom(agentId); const Future<scheduler::APIResult> result = mesos.call({createCallReconcileOperations(frameworkId, {operation})}); AWAIT_READY(result); // The master should respond with '200 OK' and with a `scheduler::Response`. ASSERT_EQ(process::http::Status::OK, result->status_code()); ASSERT_TRUE(result->has_response()); const scheduler::Response response = result->response(); ASSERT_EQ(scheduler::Response::RECONCILE_OPERATIONS, response.type()); ASSERT_TRUE(response.has_reconcile_operations()); const scheduler::Response::ReconcileOperations& reconcile = response.reconcile_operations(); ASSERT_EQ(1, reconcile.operation_statuses_size()); const OperationStatus& operationStatus = reconcile.operation_statuses(0); EXPECT_EQ(operationId, operationStatus.operation_id()); EXPECT_EQ(OPERATION_PENDING, operationStatus.state()); EXPECT_FALSE(operationStatus.has_uuid()); } // Test implicit reconciliation { const Future<scheduler::APIResult> result = mesos.call({createCallReconcileOperations(frameworkId, {})}); AWAIT_READY(result); // The master should respond with '200 OK' and with a `scheduler::Response`. ASSERT_EQ(process::http::Status::OK, result->status_code()); ASSERT_TRUE(result->has_response()); const scheduler::Response response = result->response(); ASSERT_EQ(scheduler::Response::RECONCILE_OPERATIONS, response.type()); ASSERT_TRUE(response.has_reconcile_operations()); const scheduler::Response::ReconcileOperations& reconcile = response.reconcile_operations(); ASSERT_EQ(1, reconcile.operation_statuses_size()); const OperationStatus& operationStatus = reconcile.operation_statuses(0); EXPECT_EQ(operationId, operationStatus.operation_id()); EXPECT_EQ(OPERATION_PENDING, operationStatus.state()); EXPECT_FALSE(operationStatus.has_uuid()); } }
// This test verifies that a provisioner can recover the rootfs // provisioned by a previous provisioner and then destroy it. Note // that we use the copy backend in this test so Linux is not required. TEST_F(ProvisionerAppcTest, Recover) { // Create provisioner. slave::Flags flags; flags.image_providers = "APPC"; flags.appc_store_dir = path::join(os::getcwd(), "store"); flags.image_provisioner_backend = COPY_BACKEND; flags.work_dir = path::join(sandbox.get(), "work_dir"); Try<Owned<Provisioner>> provisioner = Provisioner::create(flags); ASSERT_SOME(provisioner); Try<string> createImage = createTestImage( flags.appc_store_dir, getManifest()); ASSERT_SOME(createImage); // Recover. This is when the image in the store is loaded. AWAIT_READY(provisioner.get()->recover({})); Image image; image.mutable_appc()->CopyFrom(getTestImage()); ContainerID containerId; containerId.set_value(UUID::random().toString()); Future<slave::ProvisionInfo> provisionInfo = provisioner.get()->provision(containerId, image); AWAIT_READY(provisionInfo); provisioner->reset(); // Create a new provisioner to recover the state from the container. provisioner = Provisioner::create(flags); ASSERT_SOME(provisioner); AWAIT_READY(provisioner.get()->recover({containerId})); // It's possible for the user to provision two different rootfses // from the same image. AWAIT_READY(provisioner.get()->provision(containerId, image)); string provisionerDir = slave::paths::getProvisionerDir(flags.work_dir); string containerDir = slave::provisioner::paths::getContainerDir( provisionerDir, containerId); Try<hashmap<string, hashset<string>>> rootfses = slave::provisioner::paths::listContainerRootfses( provisionerDir, containerId); ASSERT_SOME(rootfses); // Verify that the rootfs is successfully provisioned. ASSERT_TRUE(rootfses->contains(flags.image_provisioner_backend)); EXPECT_EQ(2u, rootfses->get(flags.image_provisioner_backend)->size()); Future<bool> destroy = provisioner.get()->destroy(containerId); AWAIT_READY(destroy); EXPECT_TRUE(destroy.get()); // The container directory is successfully cleaned up. EXPECT_FALSE(os::exists(containerDir)); }