// TODO(vinod): Using a paused clock in the tests means that // future.await() may wait forever. This makes debugging hard. TEST_F(FrameworksManagerTestFixture, ResurrectExpiringFramework) { // This is the crucial test. // Resurrect an existing framework that is being removed,is being removed, // which should cause the remove to be unsuccessful. // Add a dummy framework. FrameworkID id; id.set_value("id"); FrameworkInfo info; info.set_name("test name"); info.set_user("test user"); // Add the framework. process::dispatch(manager, &FrameworksManager::add, id, info); Clock::pause(); // Remove after 2 secs. Future<Result<bool> > future1 = process::dispatch(manager, &FrameworksManager::remove, id, seconds(2.0)); // Resurrect in the meanwhile. Future<Result<bool> > future2 = process::dispatch(manager, &FrameworksManager::resurrect, id); ASSERT_TRUE(future2.await(2.0)); EXPECT_TRUE(future2.get().get()); Clock::update(Clock::now(manager) + 2.0); ASSERT_TRUE(future1.await(2.0)); EXPECT_FALSE(future1.get().get()); Clock::resume(); }
// This test verifies that a framework registration with authorized // role is successful. TEST_F(MasterAuthorizationTest, AuthorizedRole) { // Setup ACLs so that the framework can receive offers for role // "foo". ACLs acls; mesos::ACL::RegisterFramework* acl = acls.add_register_frameworks(); acl->mutable_principals()->add_values(DEFAULT_FRAMEWORK_INFO.principal()); acl->mutable_roles()->add_values("foo"); master::Flags flags = CreateMasterFlags(); flags.roles = "foo"; flags.acls = acls; Try<PID<Master> > master = StartMaster(flags); ASSERT_SOME(master); FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line. frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_role("foo"); MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); Future<Nothing> registered; EXPECT_CALL(sched, registered(&driver, _, _)) .WillOnce(FutureSatisfy(®istered)); driver.start(); AWAIT_READY(registered); driver.stop(); driver.join(); Shutdown(); }
int main(int argc, char** argv) { if (argc != 2) { cerr << "Usage: " << argv[0] << " <master>" << endl; return -1; } DockerNoExecutorScheduler scheduler; FrameworkInfo framework; framework.set_user(""); // Have Mesos fill in the current user. framework.set_name("Docker No Executor Framework (C++)"); framework.set_checkpoint(true); MesosSchedulerDriver* driver; if (os::getenv("MESOS_AUTHENTICATE_FRAMEWORKS").isSome()) { cout << "Enabling authentication for the framework" << endl; Option<string> value = os::getenv("DEFAULT_PRINCIPAL"); if (value.isNone()) { EXIT(EXIT_FAILURE) << "Expecting authentication principal in the environment"; } Credential credential; credential.set_principal(value.get()); framework.set_principal(value.get()); value = os::getenv("DEFAULT_SECRET"); if (value.isNone()) { EXIT(EXIT_FAILURE) << "Expecting authentication secret in the environment"; } credential.set_secret(value.get()); driver = new MesosSchedulerDriver( &scheduler, framework, argv[1], credential); } else { framework.set_principal("no-executor-framework-cpp"); driver = new MesosSchedulerDriver( &scheduler, framework, argv[1]); } int status = driver->run() == DRIVER_STOPPED ? 0 : 1; // Ensure that the driver process terminates. driver->stop(); delete driver; return status; }
jobject convert(JNIEnv* env, const FrameworkInfo& frameworkInfo) { string data; frameworkInfo.SerializeToString(&data); // byte[] data = ..; jbyteArray jdata = env->NewByteArray(data.size()); env->SetByteArrayRegion(jdata, 0, data.size(), (jbyte*) data.data()); // FrameworkInfo frameworkInfo = FrameworkInfo.parseFrom(data); jclass clazz = FindMesosClass(env, "org/apache/mesos/Protos$FrameworkInfo"); jmethodID parseFrom = env->GetStaticMethodID(clazz, "parseFrom", "([B)Lorg/apache/mesos/Protos$FrameworkInfo;"); jobject jframeworkInfo = env->CallStaticObjectMethod(clazz, parseFrom, jdata); return jframeworkInfo; }
// This test launches a command task which has checkpoint enabled, and // agent is terminated when the task is running, after agent is restarted, // kill the task and then verify we can receive TASK_KILLED for the task. TEST_F(CniIsolatorTest, ROOT_SlaveRecovery) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); slave::Flags flags = CreateSlaveFlags(); flags.isolation = "network/cni"; flags.network_cni_plugins_dir = cniPluginDir; flags.network_cni_config_dir = cniConfigDir; Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags); ASSERT_SOME(slave); MockScheduler sched; // Enable checkpointing for the framework. FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(_, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(&driver, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); ASSERT_EQ(1u, offers->size()); const Offer& offer = offers.get()[0]; CommandInfo command; command.set_value("sleep 1000"); TaskInfo task = createTask( offer.slave_id(), Resources::parse("cpus:1;mem:128").get(), command); ContainerInfo* container = task.mutable_container(); container->set_type(ContainerInfo::MESOS); // Make sure the container join the mock CNI network. container->add_network_infos()->set_name("__MESOS_TEST__"); Future<TaskStatus> statusRunning; Future<TaskStatus> statusKilled; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&statusRunning)) .WillOnce(FutureArg<1>(&statusKilled)); EXPECT_CALL(sched, offerRescinded(&driver, _)) .Times(AtMost(1)); Future<Nothing> ack = FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement); driver.launchTasks(offer.id(), {task}); AWAIT_READY(statusRunning); EXPECT_EQ(task.task_id(), statusRunning->task_id()); EXPECT_EQ(TASK_RUNNING, statusRunning->state()); // Wait for the ACK to be checkpointed. AWAIT_READY(ack); // Stop the slave after TASK_RUNNING is received. slave.get()->terminate(); // Restart the slave. slave = StartSlave(detector.get(), flags); ASSERT_SOME(slave); // Kill the task. driver.killTask(task.task_id()); AWAIT_READY(statusKilled); EXPECT_EQ(task.task_id(), statusKilled->task_id()); EXPECT_EQ(TASK_KILLED, statusKilled->state()); driver.stop(); driver.join(); }
TEST_F(StatusUpdateManagerTest, CheckpointStatusUpdate) { Try<PID<Master> > master = StartMaster(); ASSERT_SOME(master); MockExecutor exec(DEFAULT_EXECUTOR_ID); slave::Flags flags = CreateSlaveFlags(); flags.checkpoint = true; Try<PID<Slave> > slave = StartSlave(&exec, flags); ASSERT_SOME(slave); FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line. frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); // Enable checkpointing. MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(_, _, _)) .Times(1); Future<vector<Offer> > offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); EXPECT_NE(0u, offers.get().size()); EXPECT_CALL(exec, registered(_, _, _, _)) .Times(1); EXPECT_CALL(exec, launchTask(_, _)) .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING)); Future<TaskStatus> status; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&status)); Future<Nothing> _statusUpdateAcknowledgement = FUTURE_DISPATCH(slave.get(), &Slave::_statusUpdateAcknowledgement); driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0])); AWAIT_READY(status); EXPECT_EQ(TASK_RUNNING, status.get().state()); AWAIT_READY(_statusUpdateAcknowledgement); // Ensure that both the status update and its acknowledgement are // correctly checkpointed. Try<list<string> > found = os::find(flags.work_dir, TASK_UPDATES_FILE); ASSERT_SOME(found); ASSERT_EQ(1u, found.get().size()); Try<int> fd = os::open(found.get().front(), O_RDONLY); ASSERT_SOME(fd); int updates = 0; int acks = 0; string uuid; Result<StatusUpdateRecord> record = None(); while (true) { record = ::protobuf::read<StatusUpdateRecord>(fd.get()); ASSERT_FALSE(record.isError()); if (record.isNone()) { // Reached EOF. break; } if (record.get().type() == StatusUpdateRecord::UPDATE) { EXPECT_EQ(TASK_RUNNING, record.get().update().status().state()); uuid = record.get().update().uuid(); updates++; } else { EXPECT_EQ(uuid, record.get().uuid()); acks++; } } ASSERT_EQ(1, updates); ASSERT_EQ(1, acks); close(fd.get()); EXPECT_CALL(exec, shutdown(_)) .Times(AtMost(1)); driver.stop(); driver.join(); Shutdown(); }
// Ensures that the driver can handle the SUBSCRIBED event // after a master failover. TEST_F(SchedulerDriverEventTest, SubscribedMasterFailover) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_failover_timeout(Weeks(2).secs()); // Make sure the initial registration calls 'registered'. MockScheduler sched; StandaloneMasterDetector detector(master.get()->pid); TestingMesosSchedulerDriver driver(&sched, &detector, frameworkInfo); // Intercept the registration message, send a SUBSCRIBED instead. Future<Message> frameworkRegisteredMessage = DROP_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _); // Ensure that there will be no (re-)registration retries // from the scheduler driver. Clock::pause(); driver.start(); AWAIT_READY(frameworkRegisteredMessage); UPID frameworkPid = frameworkRegisteredMessage.get().to; FrameworkRegisteredMessage message; ASSERT_TRUE(message.ParseFromString(frameworkRegisteredMessage.get().body)); FrameworkID frameworkId = message.framework_id(); frameworkInfo.mutable_id()->CopyFrom(frameworkId); Event event; event.set_type(Event::SUBSCRIBED); event.mutable_subscribed()->mutable_framework_id()->CopyFrom(frameworkId); Future<Nothing> registered; EXPECT_CALL(sched, registered(&driver, frameworkId, _)) .WillOnce(FutureSatisfy(®istered)); process::post(master.get()->pid, frameworkPid, event); AWAIT_READY(registered); EXPECT_CALL(sched, disconnected(&driver)); // Fail over the master and expect a 'reregistered' call. // Note that the master sends a registered message for // this case (see MESOS-786). master->reset(); master = StartMaster(); ASSERT_SOME(master); frameworkRegisteredMessage = DROP_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _); detector.appoint(master.get()->pid); AWAIT_READY(frameworkRegisteredMessage); Future<Nothing> reregistered; EXPECT_CALL(sched, reregistered(&driver, _)) .WillOnce(FutureSatisfy(&reregistered)); process::post(master.get()->pid, frameworkPid, event); AWAIT_READY(reregistered); driver.stop(); driver.join(); }
// In this test, the framework is checkpointed so we expect the executor to // persist across the slave restart and to have the same resource usage before // and after. TEST_F(ROOT_XFS_QuotaTest, CheckpointRecovery) { slave::Flags flags = CreateSlaveFlags(); Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), CreateSlaveFlags()); ASSERT_SOME(slave); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(_, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); Offer offer = offers.get()[0]; TaskInfo task = createTask( offer.slave_id(), Resources::parse("cpus:1;mem:128;disk:1").get(), "dd if=/dev/zero of=file bs=1048576 count=1; sleep 1000"); Future<TaskStatus> startingStatus; Future<TaskStatus> runningStatus; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&startingStatus)) .WillOnce(FutureArg<1>(&runningStatus)); driver.launchTasks(offer.id(), {task}); AWAIT_READY(startingStatus); EXPECT_EQ(task.task_id(), startingStatus->task_id()); EXPECT_EQ(TASK_STARTING, startingStatus->state()); AWAIT_READY(startingStatus); EXPECT_EQ(task.task_id(), runningStatus->task_id()); EXPECT_EQ(TASK_RUNNING, runningStatus->state()); Future<ResourceUsage> usage1 = process::dispatch(slave.get()->pid, &Slave::usage); AWAIT_READY(usage1); // We should have 1 executor using resources. ASSERT_EQ(1, usage1->executors().size()); // Restart the slave. slave.get()->terminate(); Future<SlaveReregisteredMessage> slaveReregisteredMessage = FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _); slave = StartSlave(detector.get(), flags); ASSERT_SOME(slave); // Wait for the slave to re-register. AWAIT_READY(slaveReregisteredMessage); Future<ResourceUsage> usage2 = process::dispatch(slave.get()->pid, &Slave::usage); AWAIT_READY(usage2); // We should have still have 1 executor using resources. ASSERT_EQ(1, usage1->executors().size()); Try<std::list<string>> sandboxes = os::glob(path::join( slave::paths::getSandboxRootDir(mountPoint.get()), "*", "frameworks", "*", "executors", "*", "runs", "*")); ASSERT_SOME(sandboxes); // One sandbox and one symlink. ASSERT_EQ(2u, sandboxes->size()); // Scan the remaining sandboxes. We ought to still have project IDs // assigned to them all. foreach (const string& sandbox, sandboxes.get()) { // Skip the "latest" symlink. if (os::stat::islink(sandbox)) { continue; } EXPECT_SOME(xfs::getProjectId(sandbox)); } driver.stop(); driver.join(); }
// This test verifies that the slave and status update manager // properly handle duplicate status updates, when the second // update with the same UUID is received before the ACK for the // first update. The proper behavior here is for the status update // manager to drop the duplicate update. TEST_F(StatusUpdateManagerTest, DuplicateUpdateBeforeAck) { Try<PID<Master> > master = StartMaster(); ASSERT_SOME(master); MockExecutor exec(DEFAULT_EXECUTOR_ID); slave::Flags flags = CreateSlaveFlags(); flags.checkpoint = true; Try<PID<Slave> > slave = StartSlave(&exec, flags); ASSERT_SOME(slave); FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line. frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); // Enable checkpointing. MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); FrameworkID frameworkId; EXPECT_CALL(sched, registered(_, _, _)) .WillOnce(SaveArg<1>(&frameworkId)); Future<vector<Offer> > offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); EXPECT_NE(0u, offers.get().size()); ExecutorDriver* execDriver; EXPECT_CALL(exec, registered(_, _, _, _)) .WillOnce(SaveArg<0>(&execDriver)); EXPECT_CALL(exec, launchTask(_, _)) .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING)); // Capture the first status update message. Future<StatusUpdateMessage> statusUpdateMessage = FUTURE_PROTOBUF(StatusUpdateMessage(), _, _); Future<TaskStatus> status; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&status)); // Drop the first ACK from the scheduler to the slave. Future<StatusUpdateAcknowledgementMessage> statusUpdateAcknowledgementMessage = DROP_PROTOBUF(StatusUpdateAcknowledgementMessage(), _, slave.get()); Clock::pause(); driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0])); AWAIT_READY(statusUpdateMessage); AWAIT_READY(status); EXPECT_EQ(TASK_RUNNING, status.get().state()); AWAIT_READY(statusUpdateAcknowledgementMessage); Future<Nothing> _statusUpdate = FUTURE_DISPATCH(slave.get(), &Slave::_statusUpdate); // Now resend the TASK_RUNNING update. process::post(slave.get(), statusUpdateMessage.get()); // At this point the status update manager has handled // the duplicate status update. AWAIT_READY(_statusUpdate); // After we advance the clock, the status update manager should // retry the TASK_RUNING update and the scheduler should receive // and acknowledge it. Future<TaskStatus> update; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&update)); Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL); Clock::settle(); // Ensure the scheduler receives TASK_FINISHED. AWAIT_READY(update); EXPECT_EQ(TASK_RUNNING, update.get().state()); EXPECT_CALL(exec, shutdown(_)) .Times(AtMost(1)); Clock::resume(); driver.stop(); driver.join(); Shutdown(); }
// Test that memory pressure listening is restarted after recovery. TEST_F(MemoryPressureMesosTest, CGROUPS_ROOT_SlaveRecovery) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); slave::Flags flags = CreateSlaveFlags(); // We only care about memory cgroup for this test. flags.isolation = "cgroups/mem"; flags.agent_subsystems = None(); Fetcher fetcher; Try<MesosContainerizer*> _containerizer = MesosContainerizer::create(flags, true, &fetcher); ASSERT_SOME(_containerizer); Owned<MesosContainerizer> containerizer(_containerizer.get()); Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), containerizer.get(), flags); ASSERT_SOME(slave); MockScheduler sched; // Enable checkpointing for the framework. FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(_, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); EXPECT_NE(0u, offers.get().size()); Offer offer = offers.get()[0]; // Run a task that triggers memory pressure event. We request 1G // disk because we are going to write a 512 MB file repeatedly. TaskInfo task = createTask( offer.slave_id(), Resources::parse("cpus:1;mem:256;disk:1024").get(), "while true; do dd count=512 bs=1M if=/dev/zero of=./temp; done"); Future<TaskStatus> running; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&running)); Future<Nothing> _statusUpdateAcknowledgement = FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement); driver.launchTasks(offers.get()[0].id(), {task}); AWAIT_READY(running); EXPECT_EQ(task.task_id(), running.get().task_id()); EXPECT_EQ(TASK_RUNNING, running.get().state()); // Wait for the ACK to be checkpointed. AWAIT_READY_FOR(_statusUpdateAcknowledgement, Seconds(120)); // We restart the slave to let it recover. slave.get()->terminate(); // Set up so we can wait until the new slave updates the container's // resources (this occurs after the executor has re-registered). Future<Nothing> update = FUTURE_DISPATCH(_, &MesosContainerizerProcess::update); // Use the same flags. _containerizer = MesosContainerizer::create(flags, true, &fetcher); ASSERT_SOME(_containerizer); containerizer.reset(_containerizer.get()); Future<SlaveReregisteredMessage> reregistered = FUTURE_PROTOBUF(SlaveReregisteredMessage(), master.get()->pid, _); slave = StartSlave(detector.get(), containerizer.get(), flags); ASSERT_SOME(slave); AWAIT_READY(reregistered); // Wait until the containerizer is updated. AWAIT_READY(update); Future<hashset<ContainerID>> containers = containerizer->containers(); AWAIT_READY(containers); ASSERT_EQ(1u, containers.get().size()); ContainerID containerId = *(containers.get().begin()); // Wait a while for some memory pressure events to occur. Duration waited = Duration::zero(); do { Future<ResourceStatistics> usage = containerizer->usage(containerId); AWAIT_READY(usage); if (usage.get().mem_low_pressure_counter() > 0) { // We will check the correctness of the memory pressure counters // later, because the memory-hammering task is still active // and potentially incrementing these counters. break; } os::sleep(Milliseconds(100)); waited += Milliseconds(100); } while (waited < Seconds(5)); EXPECT_LE(waited, Seconds(5)); // Pause the clock to ensure that the reaper doesn't reap the exited // command executor and inform the containerizer/slave. Clock::pause(); Clock::settle(); Future<TaskStatus> killed; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&killed)); // Stop the memory-hammering task. driver.killTask(task.task_id()); AWAIT_READY_FOR(killed, Seconds(120)); EXPECT_EQ(task.task_id(), killed->task_id()); EXPECT_EQ(TASK_KILLED, killed->state()); // Now check the correctness of the memory pressure counters. Future<ResourceStatistics> usage = containerizer->usage(containerId); AWAIT_READY(usage); EXPECT_GE(usage.get().mem_low_pressure_counter(), usage.get().mem_medium_pressure_counter()); EXPECT_GE(usage.get().mem_medium_pressure_counter(), usage.get().mem_critical_pressure_counter()); Clock::resume(); driver.stop(); driver.join(); }
int main(int argc, char** argv) { if (argc != 3) { std::cerr << "Usage: " << argv[0] << " <master> <balloon limit in MB>" << std::endl; return -1; } // Verify the balloon limit. Try<size_t> limit = numify<size_t>(argv[2]); if (limit.isError()) { std::cerr << "Balloon limit is not a valid number" << std::endl; return -1; } if (limit.get() < EXECUTOR_MEMORY_MB) { std::cerr << "Please use a balloon limit bigger than " << EXECUTOR_MEMORY_MB << " MB" << std::endl; } // Find this executable's directory to locate executor. std::string path = os::realpath(::dirname(argv[0])).get(); std::string uri = path + "/balloon-executor"; if (getenv("MESOS_BUILD_DIR")) { uri = std::string(::getenv("MESOS_BUILD_DIR")) + "/src/balloon-executor"; } ExecutorInfo executor; executor.mutable_executor_id()->set_value("default"); executor.mutable_command()->set_value(uri); executor.set_name("Balloon Executor"); executor.set_source("balloon_test"); Resource* mem = executor.add_resources(); mem->set_name("mem"); mem->set_type(Value::SCALAR); mem->mutable_scalar()->set_value(EXECUTOR_MEMORY_MB); BalloonScheduler scheduler(executor, limit.get()); FrameworkInfo framework; framework.set_user(""); // Have Mesos fill in the current user. framework.set_name("Balloon Framework (C++)"); // TODO(vinod): Make checkpointing the default when it is default // on the slave. if (os::hasenv("MESOS_CHECKPOINT")) { cout << "Enabling checkpoint for the framework" << endl; framework.set_checkpoint(true); } MesosSchedulerDriver* driver; if (os::hasenv("MESOS_AUTHENTICATE")) { cout << "Enabling authentication for the framework" << endl; if (!os::hasenv("DEFAULT_PRINCIPAL")) { EXIT(1) << "Expecting authentication principal in the environment"; } if (!os::hasenv("DEFAULT_SECRET")) { EXIT(1) << "Expecting authentication secret in the environment"; } Credential credential; credential.set_principal(getenv("DEFAULT_PRINCIPAL")); credential.set_secret(getenv("DEFAULT_SECRET")); framework.set_principal(getenv("DEFAULT_PRINCIPAL")); driver = new MesosSchedulerDriver( &scheduler, framework, argv[1], credential); } else { framework.set_principal("balloon-framework-cpp"); driver = new MesosSchedulerDriver( &scheduler, framework, argv[1]); } int status = driver->run() == DRIVER_STOPPED ? 0 : 1; // Ensure that the driver process terminates. driver->stop(); delete driver; return status; }
// This test verifies that disk quota isolator recovers properly after // the slave restarts. TEST_F(DiskQuotaTest, SlaveRecovery) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); slave::Flags flags = CreateSlaveFlags(); flags.isolation = "posix/cpu,posix/mem,disk/du"; flags.container_disk_watch_interval = Milliseconds(1); Fetcher fetcher(flags); Try<MesosContainerizer*> _containerizer = MesosContainerizer::create(flags, true, &fetcher); ASSERT_SOME(_containerizer); Owned<MesosContainerizer> containerizer(_containerizer.get()); Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), containerizer.get(), flags); ASSERT_SOME(slave); MockScheduler sched; // Enable checkpointing for the framework. FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(_, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); const Offer& offer = offers.get()[0]; // Create a task that uses 2MB disk. TaskInfo task = createTask( offer.slave_id(), Resources::parse("cpus:1;mem:128;disk:3").get(), "dd if=/dev/zero of=file bs=1048576 count=2 && sleep 1000"); Future<TaskStatus> status; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&status)) .WillRepeatedly(Return()); // Ignore subsequent updates. driver.launchTasks(offer.id(), {task}); AWAIT_READY(status); EXPECT_EQ(task.task_id(), status->task_id()); EXPECT_EQ(TASK_RUNNING, status->state()); Future<hashset<ContainerID>> containers = containerizer->containers(); AWAIT_READY(containers); ASSERT_EQ(1u, containers->size()); const ContainerID& containerId = *(containers->begin()); // Stop the slave. slave.get()->terminate(); Future<ReregisterExecutorMessage> reregisterExecutorMessage = FUTURE_PROTOBUF(ReregisterExecutorMessage(), _, _); Future<Nothing> _recover = FUTURE_DISPATCH(_, &Slave::_recover); _containerizer = MesosContainerizer::create(flags, true, &fetcher); ASSERT_SOME(_containerizer); containerizer.reset(_containerizer.get()); detector = master.get()->createDetector(); slave = StartSlave(detector.get(), containerizer.get(), flags); ASSERT_SOME(slave); Clock::pause(); AWAIT_READY(_recover); // Wait for slave to schedule reregister timeout. Clock::settle(); // Ensure the executor re-registers before completing recovery. AWAIT_READY(reregisterExecutorMessage); // Ensure the slave considers itself recovered. Clock::advance(flags.executor_reregistration_timeout); // NOTE: We resume the clock because we need the reaper to reap the // 'du' subprocess. Clock::resume(); // Wait until disk usage can be retrieved. Duration elapsed = Duration::zero(); while (true) { Future<ResourceStatistics> usage = containerizer->usage(containerId); AWAIT_READY(usage); ASSERT_TRUE(usage->has_disk_limit_bytes()); EXPECT_EQ(Megabytes(3), Bytes(usage->disk_limit_bytes())); if (usage->has_disk_used_bytes()) { EXPECT_LE(usage->disk_used_bytes(), usage->disk_limit_bytes()); // NOTE: This is to capture the regression in MESOS-2452. The data // stored in the executor meta directory should be less than 64K. if (usage->disk_used_bytes() > Kilobytes(64).bytes()) { break; } } ASSERT_LT(elapsed, Seconds(15)); os::sleep(Milliseconds(1)); elapsed += Milliseconds(1); } driver.stop(); driver.join(); }
TEST_F(StatusUpdateManagerTest, RetryStatusUpdate) { Try<PID<Master> > master = StartMaster(); ASSERT_SOME(master); MockExecutor exec(DEFAULT_EXECUTOR_ID); slave::Flags flags = CreateSlaveFlags(); flags.checkpoint = true; Try<PID<Slave> > slave = StartSlave(&exec, flags); ASSERT_SOME(slave); FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line. frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); // Enable checkpointing. MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(_, _, _)) .Times(1); Future<vector<Offer> > offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); EXPECT_NE(0u, offers.get().size()); EXPECT_CALL(exec, registered(_, _, _, _)) .Times(1); EXPECT_CALL(exec, launchTask(_, _)) .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING)); Future<StatusUpdateMessage> statusUpdateMessage = DROP_PROTOBUF(StatusUpdateMessage(), master.get(), _); Clock::pause(); driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0])); AWAIT_READY(statusUpdateMessage); Future<TaskStatus> status; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&status)); Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL); AWAIT_READY(status); EXPECT_EQ(TASK_RUNNING, status.get().state()); Clock::resume(); EXPECT_CALL(exec, shutdown(_)) .Times(AtMost(1)); driver.stop(); driver.join(); Shutdown(); }
// This test verifies that the container will be killed if the volume // usage exceeds its quota. TEST_F(DiskQuotaTest, VolumeUsageExceedsQuota) { FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_role("role1"); master::Flags masterFlags = CreateMasterFlags(); Try<Owned<cluster::Master>> master = StartMaster(masterFlags); ASSERT_SOME(master); slave::Flags slaveFlags = CreateSlaveFlags(); slaveFlags.isolation = "posix/cpu,posix/mem,disk/du"; // NOTE: We can't pause the clock because we need the reaper to reap // the 'du' subprocess. slaveFlags.container_disk_watch_interval = Milliseconds(1); slaveFlags.enforce_container_disk_quota = true; slaveFlags.resources = "cpus:2;mem:128;disk(role1):128"; Try<Resources> initialResources = Resources::parse(slaveFlags.resources.get()); ASSERT_SOME(initialResources); Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags); ASSERT_SOME(slave); MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); Future<FrameworkID> frameworkId; EXPECT_CALL(sched, registered(&driver, _, _)) .WillOnce(FutureArg<1>(&frameworkId)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(&driver, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(frameworkId); AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); const Offer& offer = offers.get()[0]; // Create a task that requests a 1 MB persistent volume but attempts // to use 2MB. Resources volume = createPersistentVolume( Megabytes(1), "role1", "id1", "volume_path", None(), None(), frameworkInfo.principal()); // We intentionally request a sandbox that is much bugger (16MB) than // the file the task writes (2MB) to the persistent volume (1MB). This // makes sure that the quota is indeed enforced on the persistent volume. Resources taskResources = Resources::parse("cpus:1;mem:64;disk(role1):16").get() + volume; TaskInfo task = createTask( offer.slave_id(), taskResources, "dd if=/dev/zero of=volume_path/file bs=1048576 count=2 && sleep 1000"); Future<TaskStatus> status1; Future<TaskStatus> status2; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&status1)) .WillOnce(FutureArg<1>(&status2)); // Create the volume and launch the task. driver.acceptOffers( {offer.id()}, {CREATE(volume), LAUNCH({task})}); AWAIT_READY(status1); EXPECT_EQ(task.task_id(), status1->task_id()); EXPECT_EQ(TASK_RUNNING, status1->state()); AWAIT_READY(status2); EXPECT_EQ(task.task_id(), status1->task_id()); EXPECT_EQ(TASK_FAILED, status2->state()); driver.stop(); driver.join(); }
TEST_F(DiskQuotaTest, ResourceStatistics) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); slave::Flags flags = CreateSlaveFlags(); flags.isolation = "posix/cpu,posix/mem,disk/du"; flags.resources = strings::format("disk(%s):10", DEFAULT_TEST_ROLE).get(); // NOTE: We can't pause the clock because we need the reaper to reap // the 'du' subprocess. flags.container_disk_watch_interval = Milliseconds(1); Fetcher fetcher(flags); Try<MesosContainerizer*> _containerizer = MesosContainerizer::create(flags, true, &fetcher); ASSERT_SOME(_containerizer); Owned<MesosContainerizer> containerizer(_containerizer.get()); Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), containerizer.get(), flags); ASSERT_SOME(slave); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_role(DEFAULT_TEST_ROLE); MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(_, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); const Offer& offer = offers.get()[0]; Resource volume = createPersistentVolume( Megabytes(4), DEFAULT_TEST_ROLE, "id1", "path1", None(), None(), DEFAULT_CREDENTIAL.principal()); Resources taskResources = Resources::parse("cpus:1;mem:128").get(); taskResources += createDiskResource( "3", DEFAULT_TEST_ROLE, None(), None()); taskResources += volume; // Create a task that uses 2MB disk. TaskInfo task = createTask( offer.slave_id(), taskResources, "dd if=/dev/zero of=file bs=1048576 count=2 && " "dd if=/dev/zero of=path1/file bs=1048576 count=2 && " "sleep 1000"); Future<TaskStatus> status1; Future<TaskStatus> status2; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&status1)) .WillOnce(FutureArg<1>(&status2)) .WillRepeatedly(Return()); // Ignore subsequent updates. driver.acceptOffers( {offer.id()}, {CREATE(volume), LAUNCH({task})}); AWAIT_READY(status1); EXPECT_EQ(task.task_id(), status1->task_id()); EXPECT_EQ(TASK_RUNNING, status1->state()); Future<hashset<ContainerID>> containers = containerizer->containers(); AWAIT_READY(containers); ASSERT_EQ(1u, containers->size()); const ContainerID& containerId = *(containers->begin()); // Wait until disk usage can be retrieved. Duration elapsed = Duration::zero(); while (true) { Future<ResourceStatistics> usage = containerizer->usage(containerId); AWAIT_READY(usage); ASSERT_TRUE(usage->has_disk_limit_bytes()); EXPECT_EQ(Megabytes(3), Bytes(usage->disk_limit_bytes())); if (usage->has_disk_used_bytes()) { EXPECT_LE(usage->disk_used_bytes(), usage->disk_limit_bytes()); } ASSERT_EQ(2u, usage->disk_statistics().size()); bool done = true; foreach (const DiskStatistics& statistics, usage->disk_statistics()) { ASSERT_TRUE(statistics.has_limit_bytes()); EXPECT_EQ( statistics.has_persistence() ? Megabytes(4) : Megabytes(3), statistics.limit_bytes()); if (!statistics.has_used_bytes()) { done = false; } else { EXPECT_GT( statistics.has_persistence() ? Megabytes(4) : Megabytes(3), statistics.used_bytes()); } } if (done) { break; } ASSERT_LT(elapsed, Seconds(5)); os::sleep(Milliseconds(1)); elapsed += Milliseconds(1); } driver.killTask(task.task_id()); AWAIT_READY(status2); EXPECT_EQ(task.task_id(), status2->task_id()); EXPECT_EQ(TASK_KILLED, status2->state()); driver.stop(); driver.join(); }
int main(int argc, char** argv) { string master = MASTER; string k3binary; string paramfile; int total_peers; YAML::Node k3vars; // Parse Command Line options string path = os::realpath(dirname(argv[0])).get(); namespace po = boost::program_options; vector <string> optionalVars; po::options_description desc("K3 Run options"); desc.add_options() ("program", po::value<string>(&k3binary)->required(), "K3 executable program filename") ("numpeers", po::value<int>(&total_peers)->required(), "# of K3 Peers to launch") ("help,h", "Print help message") ("param,p", po::value<string>(¶mfile)->required(), "YAML Formatted input file"); po::positional_options_description positionalOptions; positionalOptions.add("program", 1); positionalOptions.add("numpeers", 1); po::variables_map vm; try { po::store(po::command_line_parser(argc, argv).options(desc) .positional(positionalOptions).run(), vm); if (vm.count("help") || vm.empty()) { cout << "K3 Distributed program framework backed by Mesos cluser" << endl; cout << desc << endl; return 0; } po::notify(vm); k3vars = YAML::LoadFile(paramfile); } catch (boost::program_options::required_option& e) { cerr << " ERROR: " << e.what() << endl << endl; cout << desc << endl; return 1; } catch (boost::program_options::error& e) { cerr << " ERROR: " << e.what() << endl << endl; cout << desc << endl; return 1; } KDScheduler scheduler(k3binary, total_peers, k3vars, path); FrameworkInfo framework; framework.set_user(""); // Have Mesos fill in the current user. framework.set_name(k3binary + "-" + stringify(total_peers)); framework.mutable_id()->set_value(k3binary); if (os::hasenv("MESOS_CHECKPOINT")) { cout << "Enabling checkpoint for the framework" << endl; framework.set_checkpoint(true); } MesosSchedulerDriver* driver; if (os::hasenv("MESOS_AUTHENTICATE")) { cout << "Enabling authentication for the framework" << endl; if (!os::hasenv("DEFAULT_PRINCIPAL")) { EXIT(1) << "Expecting authentication principal in the environment"; } if (!os::hasenv("DEFAULT_SECRET")) { EXIT(1) << "Expecting authentication secret in the environment"; } Credential credential; credential.set_principal(getenv("DEFAULT_PRINCIPAL")); credential.set_secret(getenv("DEFAULT_SECRET")); framework.set_principal(getenv("DEFAULT_PRINCIPAL")); driver = new MesosSchedulerDriver( &scheduler, framework, master, credential); } else { framework.set_principal("k3-docker-no-executor-framework-cpp"); driver = new MesosSchedulerDriver(&scheduler, framework, master); } int status = driver->run() == DRIVER_STOPPED ? 0 : 1; // Ensure that the driver process terminates. driver->stop(); delete driver; return status; }
// This test verifies that, after a master failover, reconciliation of an // operation that is still pending on an agent results in `OPERATION_PENDING`. TEST_P(OperationReconciliationTest, AgentPendingOperationAfterMasterFailover) { Clock::pause(); mesos::internal::master::Flags masterFlags = CreateMasterFlags(); Try<Owned<cluster::Master>> master = StartMaster(masterFlags); ASSERT_SOME(master); Future<UpdateSlaveMessage> updateSlaveMessage = FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); auto detector = std::make_shared<StandaloneMasterDetector>(master.get()->pid); mesos::internal::slave::Flags slaveFlags = CreateSlaveFlags(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags); ASSERT_SOME(slave); // Advance the clock to trigger agent registration. Clock::advance(slaveFlags.registration_backoff_factor); // Wait for the agent to register. AWAIT_READY(updateSlaveMessage); // Start and register a resource provider. ResourceProviderInfo resourceProviderInfo; resourceProviderInfo.set_type("org.apache.mesos.rp.test"); resourceProviderInfo.set_name("test"); Resource disk = createDiskResource( "200", "*", None(), None(), createDiskSourceRaw(None(), "profile")); Owned<MockResourceProvider> resourceProvider( new MockResourceProvider( resourceProviderInfo, Resources(disk))); // We override the mock resource provider's default action, so the operation // will stay in `OPERATION_PENDING`. Future<resource_provider::Event::ApplyOperation> applyOperation; EXPECT_CALL(*resourceProvider, applyOperation(_)) .WillOnce(FutureArg<0>(&applyOperation)); Owned<EndpointDetector> endpointDetector( mesos::internal::tests::resource_provider::createEndpointDetector( slave.get()->pid)); updateSlaveMessage = FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); // NOTE: We need to resume the clock so that the resource provider can // fully register. Clock::resume(); ContentType contentType = GetParam(); resourceProvider->start(endpointDetector, contentType); // Wait until the agent's resources have been updated to include the // resource provider resources. AWAIT_READY(updateSlaveMessage); ASSERT_TRUE(updateSlaveMessage->has_resource_providers()); ASSERT_EQ(1, updateSlaveMessage->resource_providers().providers_size()); Clock::pause(); // Start a v1 framework. auto scheduler = std::make_shared<MockHTTPScheduler>(); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_roles(0, DEFAULT_TEST_ROLE); EXPECT_CALL(*scheduler, connected(_)) .WillOnce(scheduler::SendSubscribe(frameworkInfo)); Future<scheduler::Event::Subscribed> subscribed; EXPECT_CALL(*scheduler, subscribed(_, _)) .WillOnce(FutureArg<1>(&subscribed)); // Ignore heartbeats. EXPECT_CALL(*scheduler, heartbeat(_)) .WillRepeatedly(Return()); // Decline offers that do not contain wanted resources. EXPECT_CALL(*scheduler, offers(_, _)) .WillRepeatedly(scheduler::DeclineOffers()); Future<scheduler::Event::Offers> offers; auto isRaw = [](const Resource& r) { return r.has_disk() && r.disk().has_source() && r.disk().source().type() == Resource::DiskInfo::Source::RAW; }; EXPECT_CALL(*scheduler, offers(_, scheduler::OffersHaveAnyResource( std::bind(isRaw, lambda::_1)))) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(scheduler::DeclineOffers()); // Decline successive offers. scheduler::TestMesos mesos( master.get()->pid, contentType, scheduler, detector); AWAIT_READY(subscribed); FrameworkID frameworkId(subscribed->framework_id()); // NOTE: If the framework has not declined an unwanted offer yet when // the master updates the agent with the RAW disk resource, the new // allocation triggered by this update won't generate an allocatable // offer due to no CPU and memory resources. So here we first settle // the clock to ensure that the unwanted offer has been declined, then // advance the clock to trigger another allocation. Clock::settle(); Clock::advance(masterFlags.allocation_interval); AWAIT_READY(offers); ASSERT_FALSE(offers->offers().empty()); const Offer& offer = offers->offers(0); const AgentID& agentId = offer.agent_id(); Option<Resource> source; Option<ResourceProviderID> resourceProviderId; foreach (const Resource& resource, offer.resources()) { if (isRaw(resource)) { source = resource; ASSERT_TRUE(resource.has_provider_id()); resourceProviderId = resource.provider_id(); break; } } ASSERT_SOME(source); ASSERT_SOME(resourceProviderId); OperationID operationId; operationId.set_value("operation"); mesos.send(createCallAccept( frameworkId, offer, {CREATE_DISK( source.get(), Resource::DiskInfo::Source::MOUNT, None(), operationId)})); AWAIT_READY(applyOperation); // Simulate master failover. EXPECT_CALL(*scheduler, disconnected(_)); detector->appoint(None()); master->reset(); master = StartMaster(); ASSERT_SOME(master); // Settle the clock to ensure the master finishes recovering the registry. Clock::settle(); Future<SlaveReregisteredMessage> slaveReregistered = FUTURE_PROTOBUF( SlaveReregisteredMessage(), master.get()->pid, slave.get()->pid); updateSlaveMessage = FUTURE_PROTOBUF(UpdateSlaveMessage(), _, _); EXPECT_CALL(*scheduler, connected(_)) .WillOnce(scheduler::SendSubscribe(frameworkInfo, frameworkId)); Future<scheduler::Event::Subscribed> frameworkResubscribed; EXPECT_CALL(*scheduler, subscribed(_, _)) .WillOnce(FutureArg<1>(&frameworkResubscribed)); // Simulate a new master detected event to the agent and the scheduler. detector->appoint(master.get()->pid); // Advance the clock, so that the agent re-registers. Clock::advance(slaveFlags.registration_backoff_factor); // Resume the clock to avoid deadlocks related to agent registration. // See MESOS-8828. Clock::resume(); // Wait for the framework and agent to re-register. AWAIT_READY(slaveReregistered); AWAIT_READY(updateSlaveMessage); AWAIT_READY(frameworkResubscribed); Clock::pause(); // Test explicit reconciliation { scheduler::Call::ReconcileOperations::Operation operation; operation.mutable_operation_id()->CopyFrom(operationId); operation.mutable_agent_id()->CopyFrom(agentId); const Future<scheduler::APIResult> result = mesos.call({createCallReconcileOperations(frameworkId, {operation})}); AWAIT_READY(result); // The master should respond with '200 OK' and with a `scheduler::Response`. ASSERT_EQ(process::http::Status::OK, result->status_code()); ASSERT_TRUE(result->has_response()); const scheduler::Response response = result->response(); ASSERT_EQ(scheduler::Response::RECONCILE_OPERATIONS, response.type()); ASSERT_TRUE(response.has_reconcile_operations()); const scheduler::Response::ReconcileOperations& reconcile = response.reconcile_operations(); ASSERT_EQ(1, reconcile.operation_statuses_size()); const OperationStatus& operationStatus = reconcile.operation_statuses(0); EXPECT_EQ(operationId, operationStatus.operation_id()); EXPECT_EQ(OPERATION_PENDING, operationStatus.state()); EXPECT_FALSE(operationStatus.has_uuid()); } // Test implicit reconciliation { const Future<scheduler::APIResult> result = mesos.call({createCallReconcileOperations(frameworkId, {})}); AWAIT_READY(result); // The master should respond with '200 OK' and with a `scheduler::Response`. ASSERT_EQ(process::http::Status::OK, result->status_code()); ASSERT_TRUE(result->has_response()); const scheduler::Response response = result->response(); ASSERT_EQ(scheduler::Response::RECONCILE_OPERATIONS, response.type()); ASSERT_TRUE(response.has_reconcile_operations()); const scheduler::Response::ReconcileOperations& reconcile = response.reconcile_operations(); ASSERT_EQ(1, reconcile.operation_statuses_size()); const OperationStatus& operationStatus = reconcile.operation_statuses(0); EXPECT_EQ(operationId, operationStatus.operation_id()); EXPECT_EQ(OPERATION_PENDING, operationStatus.state()); EXPECT_FALSE(operationStatus.has_uuid()); } }
// In this test, the agent initially doesn't enable disk isolation // but then restarts with XFS disk isolation enabled. We verify that // the old container launched before the agent restart is // successfully recovered. TEST_F(ROOT_XFS_QuotaTest, RecoverOldContainers) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); Owned<MasterDetector> detector = master.get()->createDetector(); slave::Flags flags = CreateSlaveFlags(); // `CreateSlaveFlags()` enables `disk/xfs` so here we reset // `isolation` to empty. flags.isolation.clear(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags); ASSERT_SOME(slave); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(_, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); Offer offer = offers.get()[0]; TaskInfo task = createTask( offer.slave_id(), Resources::parse("cpus:1;mem:128;disk:1").get(), "dd if=/dev/zero of=file bs=1024 count=1; sleep 1000"); Future<TaskStatus> startingStatus; Future<TaskStatus> runningstatus; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&startingStatus)) .WillOnce(FutureArg<1>(&runningstatus)); driver.launchTasks(offer.id(), {task}); AWAIT_READY(startingStatus); EXPECT_EQ(task.task_id(), startingStatus->task_id()); EXPECT_EQ(TASK_STARTING, startingStatus->state()); AWAIT_READY(runningstatus); EXPECT_EQ(task.task_id(), runningstatus->task_id()); EXPECT_EQ(TASK_RUNNING, runningstatus->state()); { Future<ResourceUsage> usage = process::dispatch(slave.get()->pid, &Slave::usage); AWAIT_READY(usage); // We should have 1 executor using resources but it doesn't have // disk limit enabled. ASSERT_EQ(1, usage->executors().size()); const ResourceUsage_Executor& executor = usage->executors().Get(0); ASSERT_TRUE(executor.has_statistics()); ASSERT_FALSE(executor.statistics().has_disk_limit_bytes()); } // Restart the slave. slave.get()->terminate(); Future<SlaveReregisteredMessage> slaveReregisteredMessage = FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _); // This time use the agent flags that include XFS disk isolation. slave = StartSlave(detector.get(), CreateSlaveFlags()); ASSERT_SOME(slave); // Wait for the slave to re-register. AWAIT_READY(slaveReregisteredMessage); { Future<ResourceUsage> usage = process::dispatch(slave.get()->pid, &Slave::usage); AWAIT_READY(usage); // We should still have 1 executor using resources but it doesn't // have disk limit enabled. ASSERT_EQ(1, usage->executors().size()); const ResourceUsage_Executor& executor = usage->executors().Get(0); ASSERT_TRUE(executor.has_statistics()); ASSERT_FALSE(executor.statistics().has_disk_limit_bytes()); } driver.stop(); driver.join(); }
int main(int argc, char** argv) { if (argc != 2) { cerr << "Usage: " << argv[0] << " <master>" << endl; return -1; } // Find this executable's directory to locate executor. string path = os::realpath(dirname(argv[0])).get(); string uri = path + "/long-lived-executor"; if (getenv("MESOS_BUILD_DIR")) { uri = string(getenv("MESOS_BUILD_DIR")) + "/src/long-lived-executor"; } ExecutorInfo executor; executor.mutable_executor_id()->set_value("default"); executor.mutable_command()->set_value(uri); executor.set_name("Long Lived Executor (C++)"); executor.set_source("cpp_long_lived_framework"); LongLivedScheduler scheduler(executor); FrameworkInfo framework; framework.set_user(""); // Have Mesos fill in the current user. framework.set_name("Long Lived Framework (C++)"); // TODO(vinod): Make checkpointing the default when it is default // on the slave. if (os::hasenv("MESOS_CHECKPOINT")) { cout << "Enabling checkpoint for the framework" << endl; framework.set_checkpoint(true); } MesosSchedulerDriver* driver; if (os::hasenv("MESOS_AUTHENTICATE")) { cout << "Enabling authentication for the framework" << endl; if (!os::hasenv("DEFAULT_PRINCIPAL")) { EXIT(1) << "Expecting authentication principal in the environment"; } if (!os::hasenv("DEFAULT_SECRET")) { EXIT(1) << "Expecting authentication secret in the environment"; } Credential credential; credential.set_principal(getenv("DEFAULT_PRINCIPAL")); credential.set_secret(getenv("DEFAULT_SECRET")); framework.set_principal(getenv("DEFAULT_PRINCIPAL")); driver = new MesosSchedulerDriver( &scheduler, framework, argv[1], credential); } else { framework.set_principal("long-lived-framework-cpp"); driver = new MesosSchedulerDriver( &scheduler, framework, argv[1]); } int status = driver->run() == DRIVER_STOPPED ? 0 : 1; // Ensure that the driver process terminates. driver->stop(); delete driver; return status; }
inline bool operator==(const FrameworkInfo& left, const FrameworkInfo& right) { return (left.name() == right.name()) && (left.user() == right.user()); }
// This test verifies that persistent volumes are unmounted properly // after a checkpointed framework disappears and the slave restarts. // // TODO(jieyu): Even though the command task specifies a new // filesystem root, the executor (command executor) itself does not // change filesystem root (uses the host filesystem). We need to add a // test to test the scenario that the executor itself changes rootfs. TEST_F(LinuxFilesystemIsolatorMesosTest, ROOT_RecoverOrphanedPersistentVolume) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); string registry = path::join(sandbox.get(), "registry"); AWAIT_READY(DockerArchive::create(registry, "test_image")); slave::Flags flags = CreateSlaveFlags(); flags.resources = "cpus:2;mem:1024;disk(role1):1024"; flags.isolation = "filesystem/linux,docker/runtime"; flags.docker_registry = registry; flags.docker_store_dir = path::join(sandbox.get(), "store"); flags.image_providers = "docker"; Fetcher fetcher(flags); Try<MesosContainerizer*> create = MesosContainerizer::create(flags, true, &fetcher); ASSERT_SOME(create); Owned<Containerizer> containerizer(create.get()); Owned<MasterDetector> detector = master.get()->createDetector(); Try<Owned<cluster::Slave>> slave = StartSlave( detector.get(), containerizer.get(), flags); ASSERT_SOME(slave); MockScheduler sched; FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_roles(0, "role1"); frameworkInfo.set_checkpoint(true); MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(&driver, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(&driver, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); ASSERT_FALSE(offers->empty()); Offer offer = offers.get()[0]; string dir1 = path::join(sandbox.get(), "dir1"); ASSERT_SOME(os::mkdir(dir1)); Resource persistentVolume = createPersistentVolume( Megabytes(64), "role1", "id1", "path1", None(), None(), frameworkInfo.principal()); // Create a task that does nothing for a long time. TaskInfo task = createTask( offer.slave_id(), Resources::parse("cpus:1;mem:512").get() + persistentVolume, "sleep 1000"); task.mutable_container()->CopyFrom(createContainerInfo( "test_image", {createVolumeHostPath("/tmp", dir1, Volume::RW)})); Future<TaskStatus> statusStarting; Future<TaskStatus> statusRunning; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&statusStarting)) .WillOnce(FutureArg<1>(&statusRunning)) .WillRepeatedly(DoDefault()); Future<Nothing> ack = FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement); // Create the persistent volumes and launch task via `acceptOffers`. driver.acceptOffers( {offer.id()}, {CREATE(persistentVolume), LAUNCH({task})}); AWAIT_READY(statusStarting); EXPECT_EQ(TASK_STARTING, statusStarting->state()); AWAIT_READY(statusRunning); EXPECT_EQ(TASK_RUNNING, statusRunning->state()); // Wait for the ACK to be checkpointed. AWAIT_READY(ack); Future<hashset<ContainerID>> containers = containerizer->containers(); AWAIT_READY(containers); ASSERT_EQ(1u, containers->size()); ContainerID containerId = *containers->begin(); // Restart the slave. slave.get()->terminate(); // Wipe the slave meta directory so that the slave will treat the // above running task as an orphan. ASSERT_SOME(os::rmdir(slave::paths::getMetaRootDir(flags.work_dir))); Future<Nothing> _recover = FUTURE_DISPATCH(_, &Slave::_recover); // Recreate the containerizer using the same helper as above. containerizer.reset(); create = MesosContainerizer::create(flags, true, &fetcher); ASSERT_SOME(create); containerizer.reset(create.get()); slave = StartSlave(detector.get(), containerizer.get(), flags); ASSERT_SOME(slave); // Wait until slave recovery is complete. AWAIT_READY(_recover); // Wait until the orphan containers are cleaned up. AWAIT_READY(containerizer->wait(containerId)); Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); ASSERT_SOME(table); // All mount targets should be under this directory. string directory = slave::paths::getSandboxRootDir(flags.work_dir); // Verify that the orphaned container's persistent volume and // the rootfs are unmounted. foreach (const fs::MountInfoTable::Entry& entry, table->entries) { EXPECT_FALSE(strings::contains(entry.target, directory)) << "Target was not unmounted: " << entry.target; } driver.stop(); driver.join(); }
// This test ensures that a task will transition straight from `TASK_KILLING` to // `TASK_KILLED`, even if the health check begins to fail during the kill policy // grace period. // // TODO(gkleiman): this test takes about 7 seconds to run, consider using mock // tasks and health checkers to speed it up. TEST_P(CommandExecutorTest, NoTransitionFromKillingToRunning) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); Owned<MasterDetector> detector = master.get()->createDetector(); slave::Flags flags = CreateSlaveFlags(); flags.http_command_executor = GetParam(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags); ASSERT_SOME(slave); // Start the framework with the task killing capability. FrameworkInfo::Capability capability; capability.set_type(FrameworkInfo::Capability::TASK_KILLING_STATE); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.add_capabilities()->CopyFrom(capability); MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(&driver, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(&driver, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); EXPECT_EQ(1u, offers->size()); const string command = strings::format( "%s %s --sleep_duration=15", getTestHelperPath("test-helper"), KillPolicyTestHelper::NAME).get(); TaskInfo task = createTask(offers->front(), command); // Create a health check that succeeds until a temporary file is removed. Try<string> temporaryPath = os::mktemp(path::join(os::getcwd(), "XXXXXX")); ASSERT_SOME(temporaryPath); const string tmpPath = temporaryPath.get(); HealthCheck healthCheck; healthCheck.set_type(HealthCheck::COMMAND); healthCheck.mutable_command()->set_value("ls " + tmpPath + " >/dev/null"); healthCheck.set_delay_seconds(0); healthCheck.set_grace_period_seconds(0); healthCheck.set_interval_seconds(0); task.mutable_health_check()->CopyFrom(healthCheck); // Set the kill policy grace period to 5 seconds. KillPolicy killPolicy; killPolicy.mutable_grace_period()->set_nanoseconds(Seconds(5).ns()); task.mutable_kill_policy()->CopyFrom(killPolicy); vector<TaskInfo> tasks; tasks.push_back(task); Future<TaskStatus> statusRunning; Future<TaskStatus> statusHealthy; Future<TaskStatus> statusKilling; Future<TaskStatus> statusKilled; EXPECT_CALL(sched, statusUpdate(&driver, _)) .WillOnce(FutureArg<1>(&statusRunning)) .WillOnce(FutureArg<1>(&statusHealthy)) .WillOnce(FutureArg<1>(&statusKilling)) .WillOnce(FutureArg<1>(&statusKilled)); driver.launchTasks(offers->front().id(), tasks); AWAIT_READY(statusRunning); EXPECT_EQ(TASK_RUNNING, statusRunning.get().state()); AWAIT_READY(statusHealthy); EXPECT_EQ(TASK_RUNNING, statusHealthy.get().state()); EXPECT_TRUE(statusHealthy.get().has_healthy()); EXPECT_TRUE(statusHealthy.get().healthy()); driver.killTask(task.task_id()); AWAIT_READY(statusKilling); EXPECT_EQ(TASK_KILLING, statusKilling->state()); EXPECT_FALSE(statusKilling.get().has_healthy()); // Remove the temporary file, so that the health check fails. os::rm(tmpPath); AWAIT_READY(statusKilled); EXPECT_EQ(TASK_KILLED, statusKilled->state()); EXPECT_FALSE(statusKilled.get().has_healthy()); driver.stop(); driver.join(); }
// Ensures that the driver can handle the SUBSCRIBED event // after a scheduler failover. TEST_F(SchedulerDriverEventTest, SubscribedSchedulerFailover) { Try<PID<Master>> master = StartMaster(); ASSERT_SOME(master); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_failover_timeout(Weeks(2).secs()); // Make sure the initial registration calls 'registered'. MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); // Intercept the registration message, send a SUBSCRIBED instead. Future<Message> frameworkRegisteredMessage = DROP_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _); // Ensure that there will be no (re-)registration retries // from the scheduler driver. Clock::pause(); driver.start(); AWAIT_READY(frameworkRegisteredMessage); UPID frameworkPid = frameworkRegisteredMessage.get().to; FrameworkRegisteredMessage message; ASSERT_TRUE(message.ParseFromString(frameworkRegisteredMessage.get().body)); FrameworkID frameworkId = message.framework_id(); frameworkInfo.mutable_id()->CopyFrom(frameworkId); Event event; event.set_type(Event::SUBSCRIBED); event.mutable_subscribed()->mutable_framework_id()->CopyFrom(frameworkId); Future<Nothing> registered; EXPECT_CALL(sched, registered(&driver, frameworkId, _)) .WillOnce(FutureSatisfy(®istered)); process::post(master.get(), frameworkPid, event); AWAIT_READY(registered); // Fail over the scheduler and expect a 'registered' call. driver.stop(true); MockScheduler sched2; MesosSchedulerDriver driver2( &sched2, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); frameworkRegisteredMessage = DROP_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _); driver2.start(); AWAIT_READY(frameworkRegisteredMessage); UPID frameworkPid2 = frameworkRegisteredMessage.get().to; Future<Nothing> registered2; EXPECT_CALL(sched2, registered(&driver2, frameworkId, _)) .WillOnce(FutureSatisfy(®istered2)); process::post(master.get(), frameworkPid2, event); AWAIT_READY(registered2); }
int main(int argc, char** argv) { if (argc != 2) { cerr << "Usage: " << argv[0] << " <master>" << endl; return -1; } // Find this executable's directory to locate executor. string uri; Option<string> value = os::getenv("MESOS_BUILD_DIR"); if (value.isSome()) { uri = path::join(value.get(), "src", "long-lived-executor"); } else { uri = path::join( os::realpath(Path(argv[0]).dirname()).get(), "long-lived-executor"); } ExecutorInfo executor; executor.mutable_executor_id()->set_value("default"); executor.mutable_command()->set_value(uri); executor.set_name("Long Lived Executor (C++)"); executor.set_source("cpp_long_lived_framework"); LongLivedScheduler scheduler(executor); FrameworkInfo framework; framework.set_user(""); // Have Mesos fill in the current user. framework.set_name("Long Lived Framework (C++)"); value = os::getenv("MESOS_CHECKPOINT"); if (value.isSome()) { framework.set_checkpoint( numify<bool>(value.get()).get()); } MesosSchedulerDriver* driver; if (os::getenv("MESOS_AUTHENTICATE").isSome()) { cout << "Enabling authentication for the framework" << endl; value = os::getenv("DEFAULT_PRINCIPAL"); if (value.isNone()) { EXIT(1) << "Expecting authentication principal in the environment"; } Credential credential; credential.set_principal(value.get()); framework.set_principal(value.get()); value = os::getenv("DEFAULT_SECRET"); if (value.isNone()) { EXIT(1) << "Expecting authentication secret in the environment"; } credential.set_secret(value.get()); driver = new MesosSchedulerDriver( &scheduler, framework, argv[1], credential); } else { framework.set_principal("long-lived-framework-cpp"); driver = new MesosSchedulerDriver( &scheduler, framework, argv[1]); } int status = driver->run() == DRIVER_STOPPED ? 0 : 1; // Ensure that the driver process terminates. driver->stop(); delete driver; return status; }
// This test verifies that status update manager ignores // duplicate ACK for an earlier update when it is waiting // for an ACK for a later update. This could happen when the // duplicate ACK is for a retried update. TEST_F(StatusUpdateManagerTest, IgnoreDuplicateStatusUpdateAck) { Try<PID<Master> > master = StartMaster(); ASSERT_SOME(master); MockExecutor exec(DEFAULT_EXECUTOR_ID); slave::Flags flags = CreateSlaveFlags(); flags.checkpoint = true; Try<PID<Slave> > slave = StartSlave(&exec, flags); ASSERT_SOME(slave); FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line. frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); // Enable checkpointing. MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); FrameworkID frameworkId; EXPECT_CALL(sched, registered(_, _, _)) .WillOnce(SaveArg<1>(&frameworkId)); Future<vector<Offer> > offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); EXPECT_NE(0u, offers.get().size()); ExecutorDriver* execDriver; EXPECT_CALL(exec, registered(_, _, _, _)) .WillOnce(SaveArg<0>(&execDriver)); EXPECT_CALL(exec, launchTask(_, _)) .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING)); // Drop the first update, so that status update manager // resends the update. Future<StatusUpdateMessage> statusUpdateMessage = DROP_PROTOBUF(StatusUpdateMessage(), master.get(), _); Clock::pause(); driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0])); AWAIT_READY(statusUpdateMessage); StatusUpdate update = statusUpdateMessage.get().update(); Future<TaskStatus> status; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&status)); // This is the ACK for the retried update. Future<Nothing> ack = FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement); Clock::advance(slave::STATUS_UPDATE_RETRY_INTERVAL); AWAIT_READY(status); EXPECT_EQ(TASK_RUNNING, status.get().state()); AWAIT_READY(ack); // Now send TASK_FINISHED update so that the status update manager // is waiting for its ACK, which it never gets because we drop the // update. DROP_PROTOBUFS(StatusUpdateMessage(), master.get(), _); Future<Nothing> update2 = FUTURE_DISPATCH(_, &Slave::_statusUpdate); TaskStatus status2 = status.get(); status2.set_state(TASK_FINISHED); execDriver->sendStatusUpdate(status2); AWAIT_READY(update2); // This is to catch the duplicate ack for TASK_RUNNING. Future<Nothing> duplicateAck = FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement); // Now send a duplicate ACK for the TASK_RUNNING update. process::dispatch( slave.get(), &Slave::statusUpdateAcknowledgement, update.slave_id(), frameworkId, update.status().task_id(), update.uuid()); AWAIT_READY(duplicateAck); Clock::resume(); EXPECT_CALL(exec, shutdown(_)) .Times(AtMost(1)); driver.stop(); driver.join(); Shutdown(); }
int main(int argc, char** argv) { Flags flags; Try<flags::Warnings> load = flags.load("MESOS_EXAMPLE_", argc, argv); if (load.isError()) { std::cerr << flags.usage(load.error()) << std::endl; return EXIT_FAILURE; } if (flags.help) { std::cout << flags.usage() << std::endl; return EXIT_SUCCESS; } mesos::internal::logging::initialize(argv[0], false); // Log any flag warnings (after logging is initialized). foreach (const flags::Warning& warning, load->warnings) { LOG(WARNING) << warning.message; } if (flags.qps <= 0.0) { EXIT(EXIT_FAILURE) << "Flag '--qps' needs to be greater than zero"; } LoadGeneratorScheduler scheduler(flags.qps, flags.duration); FrameworkInfo framework; framework.set_user(""); // Have Mesos fill in the current user. framework.set_principal(flags.principal); framework.set_name(FRAMEWORK_NAME); framework.set_checkpoint(flags.checkpoint); framework.add_roles(flags.role); framework.add_capabilities()->set_type( FrameworkInfo::Capability::RESERVATION_REFINEMENT); framework.set_checkpoint(flags.checkpoint); if (flags.master == "local") { // Configure master. os::setenv("MESOS_ROLES", flags.role); os::setenv("MESOS_AUTHENTICATE_FRAMEWORKS", stringify(flags.authenticate)); ACLs acls; ACL::RegisterFramework* acl = acls.add_register_frameworks(); acl->mutable_principals()->set_type(ACL::Entity::ANY); acl->mutable_roles()->add_values("*"); os::setenv("MESOS_ACLS", stringify(JSON::protobuf(acls))); } MesosSchedulerDriver* driver; if (flags.authenticate) { LOG(INFO) << "Enabling authentication for the framework"; Credential credential; credential.set_principal(flags.principal); if (flags.secret.isSome()) { credential.set_secret(flags.secret.get()); } driver = new MesosSchedulerDriver( &scheduler, framework, flags.master, credential); } else { driver = new MesosSchedulerDriver( &scheduler, framework, flags.master); } int status = driver->run() == DRIVER_STOPPED ? EXIT_SUCCESS : EXIT_FAILURE; // Ensure that the driver process terminates. driver->stop(); delete driver; return status; }
// This test ensures that the command executor sends TASK_KILLING // to frameworks that support the capability. // TODO(hausdorff): Enable test. The executor tests use the replicated log // by default. This is not currently supported on Windows, so they will all // fail until that changes. TEST_P_TEMP_DISABLED_ON_WINDOWS(CommandExecutorTest, TaskKillingCapability) { Try<Owned<cluster::Master>> master = StartMaster(); ASSERT_SOME(master); Owned<MasterDetector> detector = master.get()->createDetector(); slave::Flags flags = CreateSlaveFlags(); flags.http_command_executor = GetParam(); Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags); ASSERT_SOME(slave); // Start the framework with the task killing capability. FrameworkInfo::Capability capability; capability.set_type(FrameworkInfo::Capability::TASK_KILLING_STATE); FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.add_capabilities()->CopyFrom(capability); MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL); EXPECT_CALL(sched, registered(&driver, _, _)); Future<vector<Offer>> offers; EXPECT_CALL(sched, resourceOffers(&driver, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); EXPECT_EQ(1u, offers->size()); // Launch a task with the command executor. TaskInfo task = createTask( offers->front().slave_id(), offers->front().resources(), "sleep 1000"); Future<TaskStatus> statusRunning; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&statusRunning)); driver.launchTasks(offers->front().id(), {task}); AWAIT_READY(statusRunning); EXPECT_EQ(TASK_RUNNING, statusRunning->state()); Future<TaskStatus> statusKilling, statusKilled; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&statusKilling)) .WillOnce(FutureArg<1>(&statusKilled)); driver.killTask(task.task_id()); AWAIT_READY(statusKilling); EXPECT_EQ(TASK_KILLING, statusKilling->state()); AWAIT_READY(statusKilled); EXPECT_EQ(TASK_KILLED, statusKilled->state()); driver.stop(); driver.join(); }
// This test verifies that status update manager ignores // unexpected ACK for an earlier update when it is waiting // for an ACK for another update. We do this by dropping ACKs // for the original update and sending a random ACK to the slave. TEST_F(StatusUpdateManagerTest, IgnoreUnexpectedStatusUpdateAck) { Try<PID<Master> > master = StartMaster(); ASSERT_SOME(master); MockExecutor exec(DEFAULT_EXECUTOR_ID); slave::Flags flags = CreateSlaveFlags(); flags.checkpoint = true; Try<PID<Slave> > slave = StartSlave(&exec, flags); ASSERT_SOME(slave); FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line. frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); // Enable checkpointing. MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); FrameworkID frameworkId; EXPECT_CALL(sched, registered(_, _, _)) .WillOnce(SaveArg<1>(&frameworkId)); Future<vector<Offer> > offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. Future<TaskStatus> status; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&status)); driver.start(); AWAIT_READY(offers); EXPECT_NE(0u, offers.get().size()); ExecutorDriver* execDriver; EXPECT_CALL(exec, registered(_, _, _, _)) .WillOnce(SaveArg<0>(&execDriver)); EXPECT_CALL(exec, launchTask(_, _)) .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING)); Future<StatusUpdateMessage> statusUpdateMessage = FUTURE_PROTOBUF(StatusUpdateMessage(), master.get(), _); // Drop the ACKs, so that status update manager // retries the update. DROP_PROTOBUFS(StatusUpdateAcknowledgementMessage(), _, _); driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0])); AWAIT_READY(statusUpdateMessage); StatusUpdate update = statusUpdateMessage.get().update(); AWAIT_READY(status); EXPECT_EQ(TASK_RUNNING, status.get().state()); Future<Nothing> unexpectedAck = FUTURE_DISPATCH(_, &Slave::_statusUpdateAcknowledgement); // Now send an ACK with a random UUID. process::dispatch( slave.get(), &Slave::statusUpdateAcknowledgement, update.slave_id(), frameworkId, update.status().task_id(), UUID::random().toBytes()); AWAIT_READY(unexpectedAck); EXPECT_CALL(exec, shutdown(_)) .Times(AtMost(1)); driver.stop(); driver.join(); Shutdown(); }
int main(int argc, char** argv) { string execpath, master, remotecmd; uint64_t mem = 64; double cpus = -1.0, cores = -1.0; shift; while(true) { const string s = argc > 0 ? argv[0] : "--help"; if(argc > 1 && s == "--exec-uri") { execpath = argv[1]; shift; shift; } else if(argc > 1 && s == "--master") { master = argv[1]; shift; shift; } else if(argc > 1 && s == "--cpus") { cpus = (double)stof(argv[1]); assert(cpus > 0.0); shift; shift; } else if(argc > 1 && s == "--mem") { mem = Bytes::parse(argv[1]).get().megabytes(); shift; shift; } else if(argc > 1 && s == "--cores") { cores = (double)stof(argv[1]); assert(cores> -1.0); shift; shift; } else if(argc > 1 && s == "--remote-cmd") { remotecmd = argv[1]; shift; shift; } else { break; } } if(master.length() == 0 || execpath.length() == 0 || cpus == -1) { printf("Usage: chapel-scheduler --master <ip:port|required> --exec-uri <uri-to-the-location-of-a.out_real|required> --cpus <number-of-nodes|%N> --cores <number-of-cores to use|optional default is '2'> --mem <amt-of-ram-per-node|optional default is '64MB'> --remote-cmd %C\n"); exit(1); } const string& hostname = getHostname(); const uid_t uid = geteuid(); const struct passwd* pw = getpwuid(uid); // don't free this pointer! check the docs of this function to confirm! FrameworkInfo framework; framework.set_user(pw->pw_name); framework.set_name("Chapel Framework"); //framework.set_role(role); framework.set_principal("cpp"); ChapelScheduler scheduler(cpus, mem, cores, hostname, execpath, remotecmd, pw->pw_name); // set up signal handler for clean shutdown // struct sigaction action; action.sa_handler = SIGINTHandler; sigemptyset(&action.sa_mask); action.sa_flags = 0; sigaction(SIGINT, &action, NULL); sigaction(SIGHUP, &action, NULL); sigaction(SIGQUIT, &action, NULL); sigaction(SIGCHLD, &action, NULL); sigaction(SIGSTOP, &action, NULL); sigaction(SIGABRT, &action, NULL); schedulerDriver = new MesosSchedulerDriver(&scheduler, framework, master); const uint8_t status = schedulerDriver->run() == DRIVER_STOPPED ? 0 : 1; if(status) { cout << "Driver Stopped!" << endl; scheduler.terminateAllTasks(schedulerDriver); schedulerDriver.stop(); shutdown(); if(schedulerDriver != NULL) { delete schedulerDriver; schedulerDriver = NULL; } } return status; }
// This test verifies that the slave and status update manager // properly handle duplicate terminal status updates, when the // second update is received after the ACK for the first update. // The proper behavior here is for the status update manager to // forward the duplicate update to the scheduler. TEST_F(StatusUpdateManagerTest, DuplicateTerminalUpdateAfterAck) { Try<PID<Master> > master = StartMaster(); ASSERT_SOME(master); MockExecutor exec(DEFAULT_EXECUTOR_ID); slave::Flags flags = CreateSlaveFlags(); flags.checkpoint = true; Try<PID<Slave> > slave = StartSlave(&exec, flags); ASSERT_SOME(slave); FrameworkInfo frameworkInfo; // Bug in gcc 4.1.*, must assign on next line. frameworkInfo = DEFAULT_FRAMEWORK_INFO; frameworkInfo.set_checkpoint(true); // Enable checkpointing. MockScheduler sched; MesosSchedulerDriver driver( &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL); FrameworkID frameworkId; EXPECT_CALL(sched, registered(_, _, _)) .WillOnce(SaveArg<1>(&frameworkId)); Future<vector<Offer> > offers; EXPECT_CALL(sched, resourceOffers(_, _)) .WillOnce(FutureArg<1>(&offers)) .WillRepeatedly(Return()); // Ignore subsequent offers. driver.start(); AWAIT_READY(offers); EXPECT_NE(0u, offers.get().size()); ExecutorDriver* execDriver; EXPECT_CALL(exec, registered(_, _, _, _)) .WillOnce(SaveArg<0>(&execDriver)); // Send a terminal update right away. EXPECT_CALL(exec, launchTask(_, _)) .WillOnce(SendStatusUpdateFromTask(TASK_FINISHED)); Future<TaskStatus> status; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&status)); Future<Nothing> _statusUpdateAcknowledgement = FUTURE_DISPATCH(slave.get(), &Slave::_statusUpdateAcknowledgement); driver.launchTasks(offers.get()[0].id(), createTasks(offers.get()[0])); AWAIT_READY(status); EXPECT_EQ(TASK_FINISHED, status.get().state()); AWAIT_READY(_statusUpdateAcknowledgement); Future<TaskStatus> update; EXPECT_CALL(sched, statusUpdate(_, _)) .WillOnce(FutureArg<1>(&update)); Future<Nothing> _statusUpdateAcknowledgement2 = FUTURE_DISPATCH(slave.get(), &Slave::_statusUpdateAcknowledgement); Clock::pause(); // Now send a TASK_KILLED update for the same task. TaskStatus status2 = status.get(); status2.set_state(TASK_KILLED); execDriver->sendStatusUpdate(status2); // Ensure the scheduler receives TASK_KILLED. AWAIT_READY(update); EXPECT_EQ(TASK_KILLED, update.get().state()); // Ensure the slave properly handles the ACK. // Clock::settle() ensures that the slave successfully // executes Slave::_statusUpdateAcknowledgement(). AWAIT_READY(_statusUpdateAcknowledgement2); Clock::settle(); Clock::resume(); EXPECT_CALL(exec, shutdown(_)) .Times(AtMost(1)); driver.stop(); driver.join(); Shutdown(); }