void update(const TaskStatus& status)
{
  CHECK_EQ(SUBSCRIBED, state);

  LOG(INFO) << "Task " << status.task_id().value()
            << " is in state " << TaskState_Name(status.state())
            << (status.has_message()
                  ? " with message: " + status.message()
                  : "");

  if (status.has_uuid()) {
    Call call;
    call.set_type(Call::ACKNOWLEDGE);

    CHECK(framework.has_id());
    call.mutable_framework_id()->CopyFrom(framework.id());

    Call::Acknowledge* acknowledge = call.mutable_acknowledge();
    acknowledge->mutable_agent_id()->CopyFrom(status.agent_id());
    acknowledge->mutable_task_id()->CopyFrom(status.task_id());
    acknowledge->set_uuid(status.uuid());

    mesos->send(call);
  }

  if (status.state() == TaskState::TASK_KILLED ||
      status.state() == TaskState::TASK_LOST ||
      status.state() == TaskState::TASK_FAILED ||
      status.state() == TaskState::TASK_ERROR) {
    ++metrics.abnormal_terminations;
  }
}
void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  if (!flags.long_running) {
    if (status.state() == TASK_FAILED &&
        status.reason() == TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY) {
      // NOTE: We expect TASK_FAILED when this scheduler is launched by the
      // balloon_framework_test.sh shell script. The abort here ensures the
      // script considers the test result as "PASS".
      driver->abort();
    } else if (status.state() == TASK_FAILED ||
               status.state() == TASK_FINISHED ||
               status.state() == TASK_KILLED ||
               status.state() == TASK_LOST ||
               status.state() == TASK_ERROR) {
      driver->stop();
    }
  }

  if (stringify(tasksLaunched - 1) != status.task_id().value()) {
    // We might receive messages from older tasks. Ignore them.
    LOG(INFO) << "Ignoring status update from older task "
              << status.task_id();
    return;
  }

  switch (status.state()) {
    case TASK_FINISHED:
      taskActive = false;
      ++metrics.tasks_finished;
      break;
    case TASK_FAILED:
      taskActive = false;
      if (status.reason() == TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY) {
        ++metrics.tasks_oomed;
        break;
      }

      // NOTE: Fetching the executor (e.g. `--executor_uri`) may fail
      // occasionally if the URI is rate limited. This case is common
      // enough that it makes sense to track this failure metric separately.
      if (status.reason() == TaskStatus::REASON_CONTAINER_LAUNCH_FAILED) {
        ++metrics.launch_failures;
        break;
      }

      // NOTE: Intentional fallthrough; any other failure reason is
      // counted as an abnormal termination below.
    case TASK_KILLED:
    case TASK_LOST:
    case TASK_ERROR:
      taskActive = false;
      ++metrics.abnormal_terminations;
      break;
    default:
      break;
  }
}
void ChapelScheduler::statusUpdate(
    SchedulerDriver* driver,
    const TaskStatus& status)
{
  if (status.state() == TASK_FINISHED) {
    tasksFinished += 1;
    cout << "ChapelScheduler::statusUpdate\tTask "
         << status.task_id().value()
         << " finished of # tasksLaunched " << tasksLaunched
         << " # finished " << tasksFinished << endl;
  }

  if (status.state() == TASK_FAILED) {
    cout << "ChapelScheduler::statusUpdate\tTask "
         << status.task_id().value() << " FAILED!" << endl;
    terminateAllTasks(schedulerDriver);
    taskExecError = true;
    driver->stop();
  }

  if (status.state() == TASK_LOST) {
    cout << "ChapelScheduler::statusUpdate\tTask "
         << status.task_id().value() << " LOST!" << endl;
    terminateAllTasks(schedulerDriver);
    taskExecError = true;
    map<string, TaskInfo>::iterator rm =
      launchedTsks.find(status.task_id().value());
    if (rm != launchedTsks.end()) {
      launchedTsks.erase(rm);
    }
  }

  if (status.state() == TASK_KILLED) {
    cout << "ChapelScheduler::statusUpdate\tTask "
         << status.task_id().value() << " KILLED!" << endl;
    terminateAllTasks(schedulerDriver);
    taskExecError = true;
    map<string, TaskInfo>::iterator rm =
      launchedTsks.find(status.task_id().value());
    if (rm != launchedTsks.end()) {
      launchedTsks.erase(rm);
    }
  }

  cout << "ChapelScheduler::statusUpdate\tMet termination criteria?\t"
       << (tasksFinished == tasksLaunched) << " " << tasksFinished << " "
       << tasksLaunched << " " << taskExecError << endl;

  if (taskExecError ||
      (tasksFinished == tasksLaunched) ||
      (tasksFinished == cpusReq)) {
    if (tasksLaunched < tasksFinished) {
      cout << "ChapelScheduler::statusUpdate\tError getting nodes launched"
           << " for the batch job! Try re-running the code!" << endl;
    }

    // Wait to receive any pending framework messages.
    //
    // If some framework messages are lost, this wait could otherwise
    // hang indefinitely; capping the number of attempts at numAttempts
    // bounds the wait and terminates the while loop.
    int attempts = 0;
    while (tasksFinished != tasksLaunched && attempts < numAttempts) {
      cout << "ChapelScheduler::statusUpdate\tExecution halted! Waiting for"
           << " remote nodes to catch up! Attempts\t" << attempts << endl;
      sleep(1);
      attempts += 1;
    }

    cout << "All Chapel tasks for this framework instance are complete!"
         << " Shutting down!" << endl;
    driver->stop();
  }
}
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  CHECK_EQ(name, status.task_id().value());

  cout << "Received status update " << status.state()
       << " for task " << status.task_id() << endl;

  if (protobuf::isTerminalState(status.state())) {
    driver->stop();
  }
}
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  int taskId = lexical_cast<int>(status.task_id().value());

  cout << "Task " << taskId << " is in state " << status.state() << endl;

  if (status.state() == TASK_FINISHED) {
    tasksFinished++;
  }

  if (tasksFinished == totalTasks) {
    driver->stop();
  }
}
bool operator == (const TaskStatus& left, const TaskStatus& right)
{
  return left.task_id() == right.task_id() &&
    left.state() == right.state() &&
    left.data() == right.data() &&
    left.message() == right.message() &&
    left.slave_id() == right.slave_id() &&
    left.timestamp() == right.timestamp() &&
    left.executor_id() == right.executor_id() &&
    left.healthy() == right.healthy() &&
    left.source() == right.source() &&
    left.reason() == right.reason() &&
    left.uuid() == right.uuid();
}
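// For illustration (not from the original source): a minimal usage sketch
// of the comparator above, with hypothetical field values. A copy compares
// equal field-by-field.
void exampleTaskStatusEquality()
{
  TaskStatus expected;
  expected.mutable_task_id()->set_value("1");
  expected.set_state(TASK_RUNNING);

  TaskStatus actual = expected; // Copy.
  CHECK(expected == actual);    // Uses the operator defined above.
}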
template <typename T>
void CephSchedulerAgent<T>::statusUpdate(
    T* driver,
    const TaskStatus& status)
{
  LOG(INFO) << "Got status update from " << status.source();
  string taskId = status.task_id().value();

  if (status.state() == TASK_RUNNING) {
    LOG(INFO) << taskId << " is running!";
    stateMachine->updateTaskToRunning(taskId);

    if (status.has_message()) {
      vector<string> tokens = StringUtil::explode(status.message(), '.');
      if ((MessageToScheduler)lexical_cast<int>(tokens[0]) ==
          MessageToScheduler::CONSUMED_OSD_ID) {
        string consumedOSDId = tokens[1];
        LOG(INFO) << "Got message of \"consumed_OSD_ID\": " << consumedOSDId;
      }
    }
  } else if (status.state() == TASK_STARTING) {
    LOG(INFO) << taskId << " is waiting for an OSD ID to be assigned!";
    stateMachine->updateTaskToWaitingOSDID(taskId);
  } else if (status.state() == TASK_FAILED) {
    LOG(INFO) << taskId << " failed";
    stateMachine->updateTaskToFailed(taskId);
    // TODO: If the update has a message, add the OSD ID back to the
    // state machine.
  } else if (status.state() == TASK_FINISHED) {
    // Only the disk executor will send this finished status.
    if (status.has_message()) {
      vector<string> tokens = StringUtil::explode(status.message(), '.');
      if ((MessageToScheduler)lexical_cast<int>(tokens[0]) ==
          MessageToScheduler::DISK_READY) {
        string failedDevsStr = tokens[1];
        LOG(INFO) << "Got message of \"DISK_READY\": " << failedDevsStr;

        vector<string> failedDevs = StringUtil::explode(failedDevsStr, ':');
        string hostname = failedDevs[0];

        // NOTE: Assign to the outer `devs` here; the original code
        // redeclared `devs` inside the `if`, shadowing this variable
        // and always passing an empty vector below.
        vector<string> devs;
        if ("-" != failedDevs[1]) {
          devs = StringUtil::explode(failedDevs[1], ',');
        }

        HostConfig* hostconfig = stateMachine->getConfig(hostname);
        // TODO: Get this "4" from the YAML config.
        hostconfig->updateDiskPartition(devs, lexical_cast<int>("4"));
        hostconfig->setDiskPreparationDone();
      }
    }
  }
}
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  string taskId = status.task_id().value();

  cout << "Container " << taskId << " is in state "
       << taskState[status.state()] << endl;

  if (status.state() == TASK_FINISHED) {
    containersFinished++;
  }

  if (status.state() == TASK_FAILED) {
    driver->stop();
  }

  cout << "Total complete: " << stringify(containersFinished)
       << " out of " << stringify(containersAssigned) << endl;

  if (containersFinished == containersAssigned) {
    driver->stop();
  }
}
// Returns a JSON object modeled on a TaskStatus.
JSON::Object model(const TaskStatus& status)
{
  JSON::Object object;
  object.values["state"] = TaskState_Name(status.state());
  object.values["timestamp"] = status.timestamp();
  return object;
}
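// For illustration (not from the original source): given a status in
// state TASK_RUNNING with a hypothetical timestamp of 1397471196.0,
// stringifying model(status) would yield JSON along the lines of:
//
//   {"state":"TASK_RUNNING","timestamp":1397471196.0}
//
// (Exact number formatting depends on the JSON stringifier.)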
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  std::cout << "Task in state " << status.state() << std::endl;

  if (status.has_message()) {
    std::cout << "Reason: " << status.message() << std::endl;
  }

  if (protobuf::isTerminalState(status.state())) {
    // NOTE: We expect TASK_FAILED here. The abort ensures the shell
    // script invoking this test considers the test result a "PASS".
    if (status.state() == TASK_FAILED) {
      driver->abort();
    } else {
      driver->stop();
    }
  }
}
TEST(ResourceOffersTest, TaskUsesNoResources)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  PID<Master> master = local::launch(1, 2, 1 * Gigabyte, false);

  MockScheduler sched;
  MesosSchedulerDriver driver(&sched, "", DEFAULT_EXECUTOR_INFO, master);

  vector<Offer> offers;
  trigger resourceOffersCall;

  EXPECT_CALL(sched, registered(&driver, _))
    .Times(1);

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&offers),
                    Trigger(&resourceOffersCall)))
    .WillRepeatedly(Return());

  driver.start();

  WAIT_UNTIL(resourceOffersCall);

  EXPECT_NE(0, offers.size());

  TaskDescription task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());

  vector<TaskDescription> tasks;
  tasks.push_back(task);

  TaskStatus status;
  trigger statusUpdateCall;

  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&status),
                    Trigger(&statusUpdateCall)));

  driver.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(statusUpdateCall);

  EXPECT_EQ(task.task_id(), status.task_id());
  EXPECT_EQ(TASK_LOST, status.state());
  EXPECT_TRUE(status.has_message());
  EXPECT_EQ("Task uses no resources", status.message());

  driver.stop();
  driver.join();

  local::shutdown();
}
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  CHECK_EQ(name, status.task_id().value());

  cout << "Received status update " << status.state()
       << " for task " << status.task_id() << endl;
  cout << "RECEIVED UPDATE:" << endl;
  cout << "Message: " << status.message() << endl;

  if (status.state() == TASK_FINISHED) {
    cout << "=== Dumping data ===" << endl;
    cout << status.data() << endl;
    cout << "====================" << endl;
  }

  if (mesos::internal::protobuf::isTerminalState(status.state())) {
    driver->stop();
  }
}
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  int taskId = lexical_cast<int>(status.task_id().value());

  cout << "Task " << taskId << " is in state " << status.state() << endl;

  if (status.state() == TASK_FINISHED ||
      status.state() == TASK_FAILED ||
      status.state() == TASK_KILLED ||
      status.state() == TASK_LOST) {
    tasks[taskId].finished = true;
    tasksFinished++;

    if (status.state() == TASK_FINISHED) {
      successfulTasks++;
    }

    if (tasksFinished == tasks.size()) {
      driver->stop();
    }
  }
}
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  if (status.state() == TASK_FINISHED) {
    cout << "Task " << status.task_id().value() << " finished" << endl;
    tasksFinished++;
  }

  if (tasksFinished == tasksLaunched &&
      crawlQueue.empty() &&
      renderQueue.empty()) {
    // Wait to receive any pending framework messages.
    // If some framework messages are lost, it may hang indefinitely.
    while (frameworkMessagesReceived != tasksFinished) {
      sleep(1);
    }
    shutdown();
    driver->stop();
  }
}
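// A possible hardening of the unbounded wait above (a sketch, not the
// original code), following the capped-attempts pattern used by the
// ChapelScheduler snippet earlier in this section. The cap of 30
// attempts is an assumption; tune as needed.
int attempts = 0;
const int maxAttempts = 30;
while (frameworkMessagesReceived != tasksFinished &&
       attempts < maxAttempts) {
  sleep(1);
  attempts += 1;
}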
void sendStatusUpdate(const TaskStatus& status)
{
  if (status.state() == TASK_STAGING) {
    VLOG(1) << "Executor is not allowed to send "
            << "TASK_STAGING status update. Aborting!";

    driver->abort();

    Stopwatch stopwatch;
    if (FLAGS_v >= 1) {
      stopwatch.start();
    }

    executor->error(driver, "Attempted to send TASK_STAGING status update");

    VLOG(1) << "Executor::error took " << stopwatch.elapsed();

    return;
  }

  StatusUpdateMessage message;
  StatusUpdate* update = message.mutable_update();
  update->mutable_framework_id()->MergeFrom(frameworkId);
  update->mutable_executor_id()->MergeFrom(executorId);
  update->mutable_slave_id()->MergeFrom(slaveId);
  update->mutable_status()->MergeFrom(status);
  update->set_timestamp(Clock::now().secs());
  update->set_uuid(UUID::random().toBytes());
  message.set_pid(self());

  VLOG(1) << "Executor sending status update " << *update;

  // Capture the status update.
  updates[UUID::fromBytes(update->uuid())] = *update;

  send(slave, message);
}
TEST(FaultToleranceTest, SchedulerFailoverFrameworkMessage)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  SimpleAllocator a;
  Master m(&a);
  PID<Master> master = process::spawn(&m);

  MockExecutor exec;
  ExecutorDriver* execDriver;

  EXPECT_CALL(exec, registered(_, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdate(TASK_RUNNING));

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  map<ExecutorID, Executor*> execs;
  execs[DEFAULT_EXECUTOR_ID] = &exec;

  TestingIsolationModule isolationModule(execs);
  Resources resources = Resources::parse("cpus:2;mem:1024");
  Slave s(resources, true, &isolationModule);
  PID<Slave> slave = process::spawn(&s);

  BasicMasterDetector detector(master, slave, true);

  MockScheduler sched1;
  MesosSchedulerDriver driver1(&sched1, DEFAULT_FRAMEWORK_INFO, master);

  FrameworkID frameworkId;
  vector<Offer> offers;
  TaskStatus status;
  trigger sched1ResourceOfferCall, sched1StatusUpdateCall;

  EXPECT_CALL(sched1, registered(&driver1, _, _))
    .WillOnce(SaveArg<1>(&frameworkId));

  EXPECT_CALL(sched1, statusUpdate(&driver1, _))
    .WillOnce(DoAll(SaveArg<1>(&status),
                    Trigger(&sched1StatusUpdateCall)));

  EXPECT_CALL(sched1, resourceOffers(&driver1, _))
    .WillOnce(DoAll(SaveArg<1>(&offers),
                    Trigger(&sched1ResourceOfferCall)))
    .WillRepeatedly(Return());

  EXPECT_CALL(sched1, error(&driver1, "Framework failed over"))
    .Times(1);

  driver1.start();

  WAIT_UNTIL(sched1ResourceOfferCall);

  EXPECT_NE(0, offers.size());

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());
  task.mutable_resources()->MergeFrom(offers[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);

  vector<TaskInfo> tasks;
  tasks.push_back(task);

  driver1.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(sched1StatusUpdateCall);

  EXPECT_EQ(TASK_RUNNING, status.state());

  MockScheduler sched2;

  FrameworkInfo framework2; // Bug in gcc 4.1.*, must assign on next line.
  framework2 = DEFAULT_FRAMEWORK_INFO;
  framework2.mutable_id()->MergeFrom(frameworkId);

  MesosSchedulerDriver driver2(&sched2, framework2, master);

  trigger sched2RegisteredCall, sched2FrameworkMessageCall;

  EXPECT_CALL(sched2, registered(&driver2, frameworkId, _))
    .WillOnce(Trigger(&sched2RegisteredCall));

  EXPECT_CALL(sched2, frameworkMessage(&driver2, _, _, _))
    .WillOnce(Trigger(&sched2FrameworkMessageCall));

  driver2.start();

  WAIT_UNTIL(sched2RegisteredCall);

  execDriver->sendFrameworkMessage("");

  WAIT_UNTIL(sched2FrameworkMessageCall);

  driver1.stop();
  driver2.stop();

  driver1.join();
  driver2.join();

  process::terminate(slave);
  process::wait(slave);

  process::terminate(master);
  process::wait(master);
}
TEST(MasterTest, TaskRunning)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  SimpleAllocator a;
  Master m(&a);
  PID<Master> master = process::spawn(&m);

  MockExecutor exec;
  trigger shutdownCall;

  EXPECT_CALL(exec, registered(_, _, _, _, _, _))
    .Times(1);

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdate(TASK_RUNNING));

  EXPECT_CALL(exec, shutdown(_))
    .WillOnce(Trigger(&shutdownCall));

  map<ExecutorID, Executor*> execs;
  execs[DEFAULT_EXECUTOR_ID] = &exec;

  TestingIsolationModule isolationModule(execs);
  Resources resources = Resources::parse("cpus:2;mem:1024");
  Slave s(resources, true, &isolationModule);
  PID<Slave> slave = process::spawn(&s);

  BasicMasterDetector detector(master, slave, true);

  MockScheduler sched;
  MesosSchedulerDriver driver(&sched, "", DEFAULT_EXECUTOR_INFO, master);

  vector<Offer> offers;
  TaskStatus status;
  trigger resourceOffersCall, statusUpdateCall, resourcesChangedCall;

  EXPECT_CALL(sched, registered(&driver, _))
    .Times(1);

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&offers),
                    Trigger(&resourceOffersCall)))
    .WillRepeatedly(Return());

  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&status),
                    Trigger(&statusUpdateCall)));

  driver.start();

  WAIT_UNTIL(resourceOffersCall);

  EXPECT_NE(0, offers.size());

  TaskDescription task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());
  task.mutable_resources()->MergeFrom(offers[0].resources());

  vector<TaskDescription> tasks;
  tasks.push_back(task);

  EXPECT_CALL(isolationModule,
              resourcesChanged(_, _, Resources(offers[0].resources())))
    .WillOnce(Trigger(&resourcesChangedCall));

  driver.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(statusUpdateCall);

  EXPECT_EQ(TASK_RUNNING, status.state());

  WAIT_UNTIL(resourcesChangedCall);

  driver.stop();
  driver.join();

  WAIT_UNTIL(shutdownCall); // To ensure we can deallocate MockExecutor.

  process::terminate(slave);
  process::wait(slave);

  process::terminate(master);
  process::wait(master);
}
TEST(MasterTest, FrameworkMessage)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  SimpleAllocator a;
  Master m(&a);
  PID<Master> master = process::spawn(&m);

  MockExecutor exec;
  ExecutorDriver* execDriver;
  string execData;
  trigger execFrameworkMessageCall, shutdownCall;

  EXPECT_CALL(exec, registered(_, _, _, _, _, _))
    .WillOnce(SaveArg<0>(&execDriver));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdate(TASK_RUNNING));

  EXPECT_CALL(exec, frameworkMessage(_, _))
    .WillOnce(DoAll(SaveArg<1>(&execData),
                    Trigger(&execFrameworkMessageCall)));

  EXPECT_CALL(exec, shutdown(_))
    .WillOnce(Trigger(&shutdownCall));

  map<ExecutorID, Executor*> execs;
  execs[DEFAULT_EXECUTOR_ID] = &exec;

  TestingIsolationModule isolationModule(execs);
  Resources resources = Resources::parse("cpus:2;mem:1024");
  Slave s(resources, true, &isolationModule);
  PID<Slave> slave = process::spawn(&s);

  BasicMasterDetector detector(master, slave, true);

  // Launch the scheduler and wait until the first status update for the
  // launched task reaches it.
  MockScheduler sched;
  MesosSchedulerDriver schedDriver(&sched, "", DEFAULT_EXECUTOR_INFO, master);

  vector<Offer> offers;
  TaskStatus status;
  string schedData;
  trigger resourceOffersCall, statusUpdateCall, schedFrameworkMessageCall;

  EXPECT_CALL(sched, registered(&schedDriver, _))
    .Times(1);

  EXPECT_CALL(sched, resourceOffers(&schedDriver, _))
    .WillOnce(DoAll(SaveArg<1>(&offers),
                    Trigger(&resourceOffersCall)))
    .WillRepeatedly(Return());

  EXPECT_CALL(sched, statusUpdate(&schedDriver, _))
    .WillOnce(DoAll(SaveArg<1>(&status),
                    Trigger(&statusUpdateCall)));

  EXPECT_CALL(sched, frameworkMessage(&schedDriver, _, _, _))
    .WillOnce(DoAll(SaveArg<3>(&schedData),
                    Trigger(&schedFrameworkMessageCall)));

  schedDriver.start();

  WAIT_UNTIL(resourceOffersCall);

  EXPECT_NE(0, offers.size());

  TaskDescription task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());
  task.mutable_resources()->MergeFrom(offers[0].resources());

  vector<TaskDescription> tasks;
  tasks.push_back(task);

  schedDriver.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(statusUpdateCall);

  EXPECT_EQ(TASK_RUNNING, status.state());

  string hello = "hello";
  schedDriver.sendFrameworkMessage(
      offers[0].slave_id(), DEFAULT_EXECUTOR_ID, hello);

  WAIT_UNTIL(execFrameworkMessageCall);

  EXPECT_EQ(hello, execData);

  string reply = "reply";
  execDriver->sendFrameworkMessage(reply);

  WAIT_UNTIL(schedFrameworkMessageCall);

  EXPECT_EQ(reply, schedData);

  schedDriver.stop();
  schedDriver.join();

  WAIT_UNTIL(shutdownCall); // To ensure we can deallocate MockExecutor.

  process::terminate(slave);
  process::wait(slave);

  process::terminate(master);
  process::wait(master);
}
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  int taskId = lexical_cast<int>(status.task_id().value());

  cout << "Task " << taskId << " is in state " << status.state() << endl;
}
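// For illustration (not from the original source): lexical_cast<int>
// throws boost::bad_lexical_cast when the task ID is not numeric. A
// defensive variant of the handler above might guard the parse:
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  int taskId = 0;
  try {
    taskId = lexical_cast<int>(status.task_id().value());
  } catch (const bad_lexical_cast&) {
    cout << "Ignoring update with non-numeric task id "
         << status.task_id().value() << endl;
    return;
  }

  cout << "Task " << taskId << " is in state " << status.state() << endl;
}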
TEST(ResourceOffersTest, ResourcesGetReofferedAfterTaskDescriptionError)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  PID<Master> master = local::launch(1, 2, 1 * Gigabyte, false);

  MockScheduler sched1;
  MesosSchedulerDriver driver1(&sched1, "", DEFAULT_EXECUTOR_INFO, master);

  vector<Offer> offers;
  trigger sched1ResourceOffersCall;

  EXPECT_CALL(sched1, registered(&driver1, _))
    .Times(1);

  EXPECT_CALL(sched1, resourceOffers(&driver1, _))
    .WillOnce(DoAll(SaveArg<1>(&offers),
                    Trigger(&sched1ResourceOffersCall)))
    .WillRepeatedly(Return());

  driver1.start();

  WAIT_UNTIL(sched1ResourceOffersCall);

  EXPECT_NE(0, offers.size());

  TaskDescription task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());

  Resource* cpus = task.add_resources();
  cpus->set_name("cpus");
  cpus->set_type(Value::SCALAR);
  cpus->mutable_scalar()->set_value(0);

  Resource* mem = task.add_resources();
  mem->set_name("mem");
  mem->set_type(Value::SCALAR);
  mem->mutable_scalar()->set_value(1 * Gigabyte);

  vector<TaskDescription> tasks;
  tasks.push_back(task);

  TaskStatus status;
  trigger sched1StatusUpdateCall;

  EXPECT_CALL(sched1, statusUpdate(&driver1, _))
    .WillOnce(DoAll(SaveArg<1>(&status),
                    Trigger(&sched1StatusUpdateCall)));

  driver1.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(sched1StatusUpdateCall);

  EXPECT_EQ(task.task_id(), status.task_id());
  EXPECT_EQ(TASK_LOST, status.state());
  EXPECT_TRUE(status.has_message());
  EXPECT_EQ("Task uses invalid resources", status.message());

  driver1.stop();
  driver1.join();

  MockScheduler sched2;
  MesosSchedulerDriver driver2(&sched2, "", DEFAULT_EXECUTOR_INFO, master);

  trigger sched2ResourceOffersCall;

  EXPECT_CALL(sched2, registered(&driver2, _))
    .Times(1);

  EXPECT_CALL(sched2, resourceOffers(&driver2, _))
    .WillOnce(Trigger(&sched2ResourceOffersCall))
    .WillRepeatedly(Return());

  EXPECT_CALL(sched2, offerRescinded(&driver2, _))
    .Times(AtMost(1));

  driver2.start();

  WAIT_UNTIL(sched2ResourceOffersCall);

  driver2.stop();
  driver2.join();

  local::shutdown();
}
// TODO(vinod): Disabling this test for now because of the following
// race condition breaking this test: we do a driver.launchTasks()
// after post(noMasterDetected), but since dispatch (which is used by
// launchTasks()) uses a different queue than post, it might so happen
// that the latter message is dequeued before the former, thus
// breaking the test.
TEST(FaultToleranceTest, DISABLED_TaskLost)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  MockFilter filter;
  process::filter(&filter);

  EXPECT_MESSAGE(filter, _, _, _)
    .WillRepeatedly(Return(false));

  SimpleAllocator a;
  Master m(&a);
  PID<Master> master = process::spawn(&m);

  MockExecutor exec;

  EXPECT_CALL(exec, registered(_, _, _, _))
    .Times(0);

  EXPECT_CALL(exec, launchTask(_, _))
    .Times(0);

  EXPECT_CALL(exec, shutdown(_))
    .Times(0);

  map<ExecutorID, Executor*> execs;
  execs[DEFAULT_EXECUTOR_ID] = &exec;

  TestingIsolationModule isolationModule(execs);
  Resources resources = Resources::parse("cpus:2;mem:1024");
  Slave s(resources, true, &isolationModule);
  PID<Slave> slave = process::spawn(&s);

  BasicMasterDetector detector(master, slave, true);

  MockScheduler sched;
  MesosSchedulerDriver driver(&sched, DEFAULT_FRAMEWORK_INFO, master);

  vector<Offer> offers;
  trigger statusUpdateCall, resourceOffersCall;
  TaskStatus status;

  EXPECT_CALL(sched, registered(&driver, _, _))
    .Times(1);

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&offers),
                    Trigger(&resourceOffersCall)))
    .WillRepeatedly(Return());

  EXPECT_CALL(sched, offerRescinded(&driver, _))
    .Times(AtMost(1));

  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&status),
                    Trigger(&statusUpdateCall)));

  process::Message message;

  EXPECT_MESSAGE(filter, Eq(FrameworkRegisteredMessage().GetTypeName()), _, _)
    .WillOnce(DoAll(SaveArgField<0>(&process::MessageEvent::message, &message),
                    Return(false)));

  driver.start();

  WAIT_UNTIL(resourceOffersCall);

  // Simulate a spurious noMasterDetected event at the scheduler.
  NoMasterDetectedMessage noMasterDetectedMsg;
  process::post(message.to, noMasterDetectedMsg);

  EXPECT_NE(0, offers.size());

  TaskInfo task;
  task.set_name("test task");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());
  task.mutable_resources()->MergeFrom(offers[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);

  vector<TaskInfo> tasks;
  tasks.push_back(task);

  driver.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(statusUpdateCall);

  EXPECT_EQ(TASK_LOST, status.state());

  driver.stop();
  driver.join();

  process::terminate(slave);
  process::wait(slave);

  process::terminate(master);
  process::wait(master);

  process::filter(NULL);
}
void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  if (stringify(tasksLaunched - 1) != status.task_id().value()) {
    // We might receive messages from older tasks. Ignore them.
    LOG(INFO) << "Ignoring status update from older task "
              << status.task_id();
    return;
  }

  switch (status.state()) {
    case TASK_FINISHED:
      if (flags.run_once) {
        driver->stop();
        break;
      }

      taskActive = false;
      ++metrics.tasks_finished;
      break;
    case TASK_FAILED:
      if (flags.run_once) {
        driver->abort();
        break;
      }

      taskActive = false;

      if (status.reason() == TaskStatus::REASON_CONTAINER_LIMITATION_DISK) {
        ++metrics.tasks_disk_full;

        // Increment the abnormal_terminations metric counter in case the
        // task wasn't supposed to consume beyond its disk quota but still
        // got terminated because of disk overuse.
        if (flags.disk_use_limit >= DISK_PER_TASK) {
          ++metrics.abnormal_terminations;
        }
        break;
      }

      ++metrics.abnormal_terminations;
      break;
    case TASK_KILLED:
    case TASK_LOST:
    case TASK_ERROR:
    case TASK_DROPPED:
    case TASK_UNREACHABLE:
    case TASK_GONE:
    case TASK_GONE_BY_OPERATOR:
      if (flags.run_once) {
        driver->abort();
      }

      taskActive = false;
      ++metrics.abnormal_terminations;
      break;
    case TASK_STARTING:
    case TASK_RUNNING:
    case TASK_STAGING:
    case TASK_KILLING:
    case TASK_UNKNOWN:
      break;
  }
}