void update(const TaskStatus& status) { CHECK_EQ(SUBSCRIBED, state); LOG(INFO) << "Task " << status.task_id().value() << " is in state " << TaskState_Name(status.state()) << (status.has_message() ? " with message: " + status.message() : ""); if (status.has_uuid()) { Call call; call.set_type(Call::ACKNOWLEDGE); CHECK(framework.has_id()); call.mutable_framework_id()->CopyFrom(framework.id()); Call::Acknowledge* acknowledge = call.mutable_acknowledge(); acknowledge->mutable_agent_id()->CopyFrom(status.agent_id()); acknowledge->mutable_task_id()->CopyFrom(status.task_id()); acknowledge->set_uuid(status.uuid()); mesos->send(call); } if (status.state() == TaskState::TASK_KILLED || status.state() == TaskState::TASK_LOST || status.state() == TaskState::TASK_FAILED || status.state() == TaskState::TASK_ERROR) { ++metrics.abnormal_terminations; } }
// Status-update callback: in non-long-running mode terminates the driver on
// terminal states; otherwise tracks per-state metrics for the current task.
//
// Fix: the TASK_FAILED case previously fell through (unannotated) into the
// TASK_KILLED/TASK_LOST/TASK_ERROR group when the failure reason was neither
// an OOM nor a launch failure. The fallthrough's net effect — counting the
// update as an abnormal termination — is now explicit, matching the sibling
// disk-full scheduler's structure. Behavior is unchanged.
void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  if (!flags.long_running) {
    if (status.state() == TASK_FAILED &&
        status.reason() == TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY) {
      // NOTE: We expect TASK_FAILED when this scheduler is launched by the
      // balloon_framework_test.sh shell script. The abort here ensures the
      // script considers the test result as "PASS".
      driver->abort();
    } else if (status.state() == TASK_FAILED ||
               status.state() == TASK_FINISHED ||
               status.state() == TASK_KILLED ||
               status.state() == TASK_LOST ||
               status.state() == TASK_ERROR) {
      driver->stop();
    }
  }

  if (stringify(tasksLaunched - 1) != status.task_id().value()) {
    // We might receive messages from older tasks. Ignore them.
    LOG(INFO) << "Ignoring status update from older task "
              << status.task_id();
    return;
  }

  switch (status.state()) {
    case TASK_FINISHED:
      taskActive = false;
      ++metrics.tasks_finished;
      break;
    case TASK_FAILED:
      taskActive = false;
      if (status.reason() == TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY) {
        ++metrics.tasks_oomed;
        break;
      }

      // NOTE: Fetching the executor (e.g. `--executor_uri`) may fail
      // occasionally if the URI is rate limited. This case is common
      // enough that it makes sense to track this failure metric separately.
      if (status.reason() == TaskStatus::REASON_CONTAINER_LAUNCH_FAILED) {
        ++metrics.launch_failures;
        break;
      }

      // Any other failure reason counts as an abnormal termination
      // (previously expressed as an implicit fallthrough into the
      // TASK_KILLED group below).
      ++metrics.abnormal_terminations;
      break;
    case TASK_KILLED:
    case TASK_LOST:
    case TASK_ERROR:
      taskActive = false;
      ++metrics.abnormal_terminations;
      break;
    default:
      break;
  }
}
// Logs the status update for the single expected task and stops the
// driver once the task reaches a terminal state.
virtual void statusUpdate(
    SchedulerDriver* driver,
    const TaskStatus& status)
{
  // This scheduler manages exactly one task, named `name`.
  CHECK_EQ(name, status.task_id().value());

  cout << "Received status update " << status.state()
       << " for task " << status.task_id() << endl;

  const bool terminal = protobuf::isTerminalState(status.state());
  if (terminal) {
    driver->stop();
  }
}
// Field-by-field equality for TaskStatus; checks each field in the same
// order as before and short-circuits on the first mismatch.
bool operator == (const TaskStatus& left, const TaskStatus& right)
{
  if (!(left.task_id() == right.task_id())) return false;
  if (!(left.state() == right.state())) return false;
  if (!(left.data() == right.data())) return false;
  if (!(left.message() == right.message())) return false;
  if (!(left.slave_id() == right.slave_id())) return false;
  if (!(left.timestamp() == right.timestamp())) return false;
  if (!(left.executor_id() == right.executor_id())) return false;
  if (!(left.healthy() == right.healthy())) return false;
  if (!(left.source() == right.source())) return false;
  if (!(left.reason() == right.reason())) return false;
  return left.uuid() == right.uuid();
}
// Verifies that launching a task that declares no resources is rejected:
// the master should answer with TASK_LOST and the message
// "Task uses no resources" rather than running the task.
TEST(ResourceOffersTest, TaskUsesNoResources)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  // Local cluster: 1 slave with 2 CPUs and 1 GB of memory.
  PID<Master> master = local::launch(1, 2, 1 * Gigabyte, false);

  MockScheduler sched;
  MesosSchedulerDriver driver(&sched, "", DEFAULT_EXECUTOR_INFO, master);

  vector<Offer> offers;
  trigger resourceOffersCall;

  EXPECT_CALL(sched, registered(&driver, _))
    .Times(1);

  // Capture the first offer batch and fire the trigger; ignore later offers.
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&offers), Trigger(&resourceOffersCall)))
    .WillRepeatedly(Return());

  driver.start();

  WAIT_UNTIL(resourceOffersCall);

  EXPECT_NE(0, offers.size());

  // Build a task with NO resources attached — the condition under test.
  TaskDescription task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());

  vector<TaskDescription> tasks;
  tasks.push_back(task);

  TaskStatus status;
  trigger statusUpdateCall;

  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&status), Trigger(&statusUpdateCall)));

  driver.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(statusUpdateCall);

  // The master must reject the task as lost, with an explanatory message.
  EXPECT_EQ(task.task_id(), status.task_id());
  EXPECT_EQ(TASK_LOST, status.state());
  EXPECT_TRUE(status.has_message());
  EXPECT_EQ("Task uses no resources", status.message());

  driver.stop();
  driver.join();

  local::shutdown();
}
// Logs each task's state transition, counts finished tasks, and stops
// the driver once every launched task has finished.
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  const int taskId = lexical_cast<int>(status.task_id().value());

  cout << "Task " << taskId << " is in state " << status.state() << endl;

  if (status.state() == TASK_FINISHED) {
    tasksFinished++;
  }

  // All work is done — shut the framework down.
  if (tasksFinished == totalTasks) {
    driver->stop();
  }
}
// Logs every update (including its message), dumps the payload carried by
// a TASK_FINISHED update, and stops the driver on any terminal state.
virtual void statusUpdate(
    SchedulerDriver* driver,
    const TaskStatus& status)
{
  // This scheduler manages exactly one task, named `name`.
  CHECK_EQ(name, status.task_id().value());

  cout << "Received status update " << status.state()
       << " for task " << status.task_id() << endl;
  cout << "RECEIVED UPDATE:" << endl;
  cout << "Message: " << status.message() << endl;

  const bool finished = (status.state() == TASK_FINISHED);
  if (finished) {
    // A finished task carries its result in the update's data field.
    cout << "=== Dumping data ===" << endl;
    cout << status.data() << endl;
    cout << "====================" << endl;
  }

  if (mesos::internal::protobuf::isTerminalState(status.state())) {
    driver->stop();
  }
}
// Routes a task status update into the scheduler's state machine and decodes
// the executor-to-scheduler messages piggybacked on the update's message
// field (tokens separated by '.').
//
// Fix: in the TASK_FINISHED branch the inner declaration
// `vector<string> devs = StringUtil::explode(...)` shadowed the outer `devs`,
// so `updateDiskPartition` was always called with an empty vector. The inner
// declaration is now an assignment to the outer variable.
template <typename T>
void CephSchedulerAgent<T>::statusUpdate(
    T* driver,
    const TaskStatus& status)
{
  LOG(INFO) << "Got status update from " << status.source();
  string taskId = status.task_id().value();

  if (status.state() == TASK_RUNNING) {
    LOG(INFO) << taskId << " is Running!";
    stateMachine->updateTaskToRunning(taskId);

    if (status.has_message()) {
      // Message format: "<MessageToScheduler>.<payload>".
      vector<string> tokens = StringUtil::explode(status.message(), '.');
      if ((MessageToScheduler)lexical_cast<int>(tokens[0])
          == MessageToScheduler::CONSUMED_OSD_ID) {
        string consumedOSDId = tokens[1];
        LOG(INFO) << "Got message of \"consumed_OSD_ID\": " << consumedOSDId;
      }
    }
  } else if (status.state() == TASK_STARTING) {
    LOG(INFO) << taskId << " is Waiting OSDID, ready for assign osd id!";
    stateMachine->updateTaskToWaitingOSDID(taskId);
  } else if (status.state() == TASK_FAILED) {
    LOG(INFO) << taskId << " failed";
    stateMachine->updateTaskToFailed(taskId);
    //TODO: if has message , add the OSD ID back to StateMachine
  } else if (status.state() == TASK_FINISHED) {
    // Only the disk executor reports TASK_FINISHED.
    if (status.has_message()) {
      vector<string> tokens = StringUtil::explode(status.message(), '.');
      if ((MessageToScheduler)lexical_cast<int>(tokens[0])
          == MessageToScheduler::DISK_READY) {
        // Payload format: "<hostname>:<dev1,dev2,...>" with "-" meaning
        // no failed devices.
        string failedDevsStr = tokens[1];
        LOG(INFO) << "Got message of \"DISK_READY\": " << failedDevsStr;
        vector<string> failedDevs = StringUtil::explode(failedDevsStr, ':');
        string hostname = failedDevs[0];

        vector<string> devs;
        if ("-" != failedDevs[1]) {
          // Assign to the outer `devs` (previously a shadowing declaration
          // that left the outer vector empty).
          devs = StringUtil::explode(failedDevs[1], ',');
        }

        HostConfig* hostconfig = stateMachine->getConfig(hostname);
        //TODO: get this "4" from yml config
        hostconfig->updateDiskPartition(devs, lexical_cast<int>("4"));
        hostconfig->setDiskPreparationDone();
      }
    }
  }
}
// Tracks per-task terminal states for the Chapel batch job and decides when
// the whole framework run is complete.
//
// Fix: the "Execution halted!" log message contained a raw newline inside
// the string literal (invalid C++, most likely a copy/paste artifact); the
// literal is now split into adjacent concatenated strings.
void ChapelScheduler::statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  if (status.state() == TASK_FINISHED) {
    tasksFinished+=1;
    cout << "ChapelScheduler::statusUpdate\tTask "
         << status.task_id().value()
         << " finished of # tasksLaunched " << tasksLaunched
         << " # finished " << tasksFinished << endl;
  }

  if (status.state() == TASK_FAILED) {
    cout << "ChapelScheduler::statusUpdate\tTask "
         << status.task_id().value() << " FAILED!" << endl;
    terminateAllTasks(schedulerDriver);
    taskExecError=true;
    driver->stop();
  }

  if (status.state() == TASK_LOST) {
    cout << "ChapelScheduler::statusUpdate\tTask "
         << status.task_id().value() << " LOST!" << endl;
    terminateAllTasks(schedulerDriver);
    taskExecError=true;
    // Forget the lost task so it is no longer tracked as launched.
    map<string, TaskInfo>::iterator rm =
      launchedTsks.find(status.task_id().value());
    if(rm != launchedTsks.end()) { launchedTsks.erase(rm); }
  }

  if (status.state() == TASK_KILLED) {
    cout << "ChapelScheduler::statusUpdate\tTask "
         << status.task_id().value() << " KILLED!" << endl;
    terminateAllTasks(schedulerDriver);
    taskExecError=true;
    map<string, TaskInfo>::iterator rm =
      launchedTsks.find(status.task_id().value());
    if(rm != launchedTsks.end()) { launchedTsks.erase(rm); }
  }

  cout << "ChapelScheduler::statusUpdate\tMet termination criteria?\t"
       << (tasksFinished == tasksLaunched) << " " << tasksFinished
       << " " << tasksLaunched << " " << taskExecError << endl;

  if( taskExecError ||
      ((tasksFinished == tasksLaunched) || (tasksFinished == cpusReq))) {
    // NOTE(review): this comparison looks inverted — the message talks about
    // failing to get nodes launched, which suggests tasksFinished <
    // tasksLaunched. Preserved as-is; confirm against upstream intent.
    if(tasksLaunched < tasksFinished) {
      cout << "ChapelScheduler::statusUpdate\tError getting nodes launched"
           << " for the batch job! Try re-running the code!" << endl;
    }

    // Wait to receive any pending framework messages
    //
    // If some framework messages are lost, it may hang indefinitely
    // to solve the indefinite "hang", numAttempts caps out and then
    // terminates the while loop.
    //
    int attempts = 0;
    while(tasksFinished != tasksLaunched && attempts < numAttempts) {
      cout << "ChapelScheduler::statusUpdate\tExecution halted! "
              "Waiting for remote nodes to catch up! Attempts\t"
           << attempts << endl;
      sleep(1);
      attempts+=1;
    }

    cout << "All Chapel task for this framework instance are complete!"
         << " Shutting down!" << endl;
    driver->stop();
  }
}
// Reports container progress, aborts the run on any failure, and stops
// the driver once every assigned container has finished.
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  string taskId = status.task_id().value();

  cout << "Container " << taskId << " is in state "
       << taskState[status.state()] << endl;

  switch (status.state()) {
    case TASK_FINISHED:
      containersFinished++;
      break;
    case TASK_FAILED:
      // A single failed container aborts the whole run.
      driver->stop();
      break;
    default:
      break;
  }

  cout << "Total complete: " << stringify(containersFinished)
       << " out of " << stringify(containersAssigned) << endl;

  if (containersFinished == containersAssigned) {
    driver->stop();
  }
}
// Counts finished tasks and, when all launched tasks are done and both work
// queues are drained, waits for stragglers' framework messages before
// shutting down.
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  const bool finished = (status.state() == TASK_FINISHED);
  if (finished) {
    cout << "Task " << status.task_id().value() << " finished" << endl;
    tasksFinished++;
  }

  const bool allDone =
    tasksFinished == tasksLaunched &&
    crawlQueue.empty() &&
    renderQueue.empty();

  if (allDone) {
    // Wait to receive any pending framework messages
    // If some framework messages are lost, it may hang indefinitely.
    while (frameworkMessagesReceived != tasksFinished) {
      sleep(1);
    }
    shutdown();
    driver->stop();
  }
}
// Marks terminal tasks finished in the task table, counts successes, and
// stops the driver once every task has reached a terminal state.
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  const int taskId = lexical_cast<int>(status.task_id().value());

  cout << "Task " << taskId << " is in state " << status.state() << endl;

  const bool terminal =
    status.state() == TASK_FINISHED ||
    status.state() == TASK_FAILED ||
    status.state() == TASK_KILLED ||
    status.state() == TASK_LOST;

  if (!terminal) {
    return;
  }

  tasks[taskId].finished = true;
  tasksFinished++;

  if (status.state() == TASK_FINISHED) {
    successfulTasks++;
  }

  if (tasksFinished == tasks.size()) {
    driver->stop();
  }
}
// Logging-only status callback: prints the task's id and new state.
virtual void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  const int taskId = lexical_cast<int>(status.task_id().value());
  cout << "Task " << taskId << " is in state " << status.state() << endl;
}
// Status-update callback for the disk-full framework: ignores stale tasks,
// then dispatches on the new state to update metrics and — in run-once
// mode — terminate the driver. Break placement inside TASK_FINISHED /
// TASK_FAILED is significant: the run-once branches exit before any metric
// is touched.
void statusUpdate(SchedulerDriver* driver, const TaskStatus& status)
{
  // The current task's id is stringify(tasksLaunched - 1); anything else
  // is an update from a previously launched task.
  if (stringify(tasksLaunched - 1) != status.task_id().value()) {
    // We might receive messages from older tasks. Ignore them.
    LOG(INFO) << "Ignoring status update from older task "
              << status.task_id();
    return;
  }

  switch (status.state()) {
    case TASK_FINISHED:
      if (flags.run_once) {
        // Normal completion in run-once mode: stop without counting.
        driver->stop();
        break;
      }

      taskActive = false;
      ++metrics.tasks_finished;
      break;
    case TASK_FAILED:
      if (flags.run_once) {
        // Failure in run-once mode: abort without counting.
        driver->abort();
        break;
      }

      taskActive = false;

      if (status.reason() == TaskStatus::REASON_CONTAINER_LIMITATION_DISK) {
        ++metrics.tasks_disk_full;

        // Increment abnormal_termination metric counter in case the task
        // wasn't supposed to consume beyond its disk quota but still got
        // terminated because of disk overuse.
        if (flags.disk_use_limit >= DISK_PER_TASK) {
          ++metrics.abnormal_terminations;
        }

        break;
      }

      // Failed for some reason other than the disk limitation.
      ++metrics.abnormal_terminations;
      break;
    // Every other terminal state counts as an abnormal termination
    // (and aborts the driver in run-once mode).
    case TASK_KILLED:
    case TASK_LOST:
    case TASK_ERROR:
    case TASK_DROPPED:
    case TASK_UNREACHABLE:
    case TASK_GONE:
    case TASK_GONE_BY_OPERATOR:
      if (flags.run_once) {
        driver->abort();
      }

      taskActive = false;
      ++metrics.abnormal_terminations;
      break;
    // Non-terminal states require no bookkeeping.
    case TASK_STARTING:
    case TASK_RUNNING:
    case TASK_STAGING:
    case TASK_KILLING:
    case TASK_UNKNOWN:
      break;
  }
}
// Verifies that resources consumed by an invalid task description (zero
// CPUs here) are rejected with TASK_LOST and become available again: a
// second framework must subsequently receive an offer for them.
TEST(ResourceOffersTest, ResourcesGetReofferedAfterTaskDescriptionError)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  // Local cluster: 1 slave with 2 CPUs and 1 GB of memory.
  PID<Master> master = local::launch(1, 2, 1 * Gigabyte, false);

  MockScheduler sched1;
  MesosSchedulerDriver driver1(&sched1, "", DEFAULT_EXECUTOR_INFO, master);

  vector<Offer> offers;
  trigger sched1ResourceOffersCall;

  EXPECT_CALL(sched1, registered(&driver1, _))
    .Times(1);

  // Capture the first offer batch; ignore subsequent offers.
  EXPECT_CALL(sched1, resourceOffers(&driver1, _))
    .WillOnce(DoAll(SaveArg<1>(&offers), Trigger(&sched1ResourceOffersCall)))
    .WillRepeatedly(Return());

  driver1.start();

  WAIT_UNTIL(sched1ResourceOffersCall);

  EXPECT_NE(0, offers.size());

  // Build a task whose resources are invalid: 0 CPUs is not allowed.
  TaskDescription task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());

  Resource* cpus = task.add_resources();
  cpus->set_name("cpus");
  cpus->set_type(Value::SCALAR);
  cpus->mutable_scalar()->set_value(0);

  Resource* mem = task.add_resources();
  mem->set_name("mem");
  mem->set_type(Value::SCALAR);
  mem->mutable_scalar()->set_value(1 * Gigabyte);

  vector<TaskDescription> tasks;
  tasks.push_back(task);

  TaskStatus status;
  trigger sched1StatusUpdateCall;

  EXPECT_CALL(sched1, statusUpdate(&driver1, _))
    .WillOnce(DoAll(SaveArg<1>(&status), Trigger(&sched1StatusUpdateCall)));

  driver1.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(sched1StatusUpdateCall);

  // The invalid task must be reported lost with an explanatory message.
  EXPECT_EQ(task.task_id(), status.task_id());
  EXPECT_EQ(TASK_LOST, status.state());
  EXPECT_TRUE(status.has_message());
  EXPECT_EQ("Task uses invalid resources", status.message());

  driver1.stop();
  driver1.join();

  // A second framework should now be offered the freed resources.
  MockScheduler sched2;
  MesosSchedulerDriver driver2(&sched2, "", DEFAULT_EXECUTOR_INFO, master);

  trigger sched2ResourceOffersCall;

  EXPECT_CALL(sched2, registered(&driver2, _))
    .Times(1);

  EXPECT_CALL(sched2, resourceOffers(&driver2, _))
    .WillOnce(Trigger(&sched2ResourceOffersCall))
    .WillRepeatedly(Return());

  // The offer may be rescinded while shutting down; tolerate at most one.
  EXPECT_CALL(sched2, offerRescinded(&driver2, _))
    .Times(AtMost(1));

  driver2.start();

  WAIT_UNTIL(sched2ResourceOffersCall);

  driver2.stop();
  driver2.join();

  local::shutdown();
}