Example #1
void ChapleScheduler::terminateAllTasks(SchedulerDriver* driver) {
   for(map<string, TaskInfo>::iterator i = launchedTsks.begin(); i != launchedTsks.end(); i++) {
      cout << "\tChapel Task " << i->first << " notified to terminate" << endl;
      TaskID tid;
      tid.set_value(i->first);
      driver->killTask(tid);
   }
}
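// A hypothetical companion sketch (assumed, not part of the snippet
// above): a statusUpdate() override that prunes launchedTsks as
// terminal updates arrive, so terminateAllTasks() only signals tasks
// that are still live. Assumes launchedTsks is keyed by the TaskID
// value, as the loop above suggests.
void ChapleScheduler::statusUpdate(SchedulerDriver* driver,
                                   const TaskStatus& status) {
   switch (status.state()) {
      case TASK_FINISHED:
      case TASK_FAILED:
      case TASK_KILLED:
      case TASK_LOST:
         launchedTsks.erase(status.task_id().value());
         break;
      default:
         break; // Non-terminal updates leave the bookkeeping untouched.
   }
}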
Example #2
  void killTask(ExecutorDriver* driver, const TaskID& taskId)
  {
    LOG(INFO) << "Received killTask for task " << taskId.value();

    // Using shutdown grace period as a default is backwards compatible
    // with the `stop_timeout` flag, deprecated in 1.0.
    Duration gracePeriod = shutdownGracePeriod;

    if (killPolicy.isSome() && killPolicy->has_grace_period()) {
      gracePeriod = Nanoseconds(killPolicy->grace_period().nanoseconds());
    }

    killTask(driver, taskId, gracePeriod);
  }
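// Scheduler-side counterpart: a minimal sketch, assuming the post-1.0
// protos in which TaskInfo carries an optional KillPolicy. Setting a
// per-task grace period is what makes the executor above prefer it
// over the agent-wide shutdown grace period.
TaskInfo task;
task.set_name("graceful-task");
task.mutable_task_id()->set_value("task-1");
task.mutable_kill_policy()
    ->mutable_grace_period()
    ->set_nanoseconds(Seconds(5).ns());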
Example #3
jobject convert(JNIEnv* env, const TaskID& taskId)
{
  string data;
  taskId.SerializeToString(&data);

  // byte[] data = ..;
  jbyteArray jdata = env->NewByteArray(data.size());
  env->SetByteArrayRegion(jdata, 0, data.size(), (jbyte*) data.data());

  // TaskID taskId = TaskID.parseFrom(data);
  jclass clazz = FindMesosClass(env, "org/apache/mesos/Protos$TaskID");

  jmethodID parseFrom =
    env->GetStaticMethodID(clazz, "parseFrom",
                           "([B)Lorg/apache/mesos/Protos$TaskID;");

  jobject jtaskId = env->CallStaticObjectMethod(clazz, parseFrom, jdata);

  return jtaskId;
}
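// A hedged sketch of the reverse conversion (jobject -> TaskID). The
// helper name `construct` is illustrative; the JNI calls themselves
// are standard.
TaskID construct(JNIEnv* env, jobject jtaskId)
{
  jclass clazz = env->GetObjectClass(jtaskId);

  // byte[] data = taskId.toByteArray();
  jmethodID toByteArray = env->GetMethodID(clazz, "toByteArray", "()[B");
  jbyteArray jdata = (jbyteArray) env->CallObjectMethod(jtaskId, toByteArray);

  // Parse the TaskID back out of the serialized bytes.
  jbyte* data = env->GetByteArrayElements(jdata, nullptr);
  jsize length = env->GetArrayLength(jdata);

  TaskID taskId;
  taskId.ParseFromArray(data, length);

  env->ReleaseByteArrayElements(jdata, data, JNI_ABORT);

  return taskId;
}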
Example #4
    void killTask (ExecutorDriver* driver, const TaskID& taskId) override {
      const string& ti = taskId.value();
      pid_t pid;

      {
        lock_guard<mutex> lock(TaskId2PidLock);

        auto iter = TaskId2Pid.find(ti);

        if (iter == TaskId2Pid.end()) {
          LOG(WARNING)
          << "unknown task id '" << ti << "'";
          return;
        }

        pid = iter->second;
      }

      // TODO(fc) be graceful
      kill(pid, SIGKILL);
    }
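// Hypothetical counterpart (assumed, not shown in the source): the
// executor records the forked pid under the TaskID value at launch
// time, which is exactly what killTask() above looks up.
    void launchTask(ExecutorDriver* driver, const TaskInfo& task) override {
      pid_t pid = fork();

      if (pid == 0) {
        // Child: run the task's command (a fixed command for this sketch).
        execlp("sleep", "sleep", "60", (char*) nullptr);
        _exit(EXIT_FAILURE); // Only reached if execlp fails.
      }

      {
        lock_guard<mutex> lock(TaskId2PidLock);
        TaskId2Pid[task.task_id().value()] = pid;
      }
    }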
Example #5
inline bool operator==(const TaskID& left, const std::string& right)
{
  return left.value() == right;
}
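// Usage sketch (illustrative): the overload lets call sites compare a
// TaskID against a plain string without spelling out .value().
TaskID taskId;
taskId.set_value("task-42");

bool matched = (taskId == std::string("task-42")); // true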
Example #6
inline std::size_t hash_value(const TaskID& taskId)
{
  size_t seed = 0;
  boost::hash_combine(seed, taskId.value());
  return seed;
}
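// Usage sketch: hash_value() is the ADL hook Boost looks for, so a
// TaskID can key a boost::unordered_map (this also assumes an
// operator== over two TaskIDs, defined alongside the comparisons
// shown in these examples).
#include <boost/unordered_map.hpp>

boost::unordered_map<TaskID, int> retries;

TaskID taskId;
taskId.set_value("task-42");
retries[taskId]++; // Hashed via hash_value(taskId).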
Example #7
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to send exited executor messages,
// we send a ShutdownMessage to the slave. Why? Because during a
// network partition, the master will remove a partitioned slave, thus
// sending its tasks to LOST. At this point, when the partition is
// removed, the slave may attempt to send exited executor messages if
// it was unaware that the master removed it. We've already
// notified frameworks that the tasks under the executors were LOST,
// so we have to have the slave shut down.
TEST_F(PartitionTest, PartitionedSlaveExitedExecutor)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(Eq("PING"), _, _);
  DROP_MESSAGES(Eq("PONG"), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  Try<PID<Slave> > slave = StartSlave(&containerizer);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);

  Future<FrameworkID> frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(FutureArg<1>(&frameworkId));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(frameworkId);
  AWAIT_READY(offers);
  ASSERT_NE(0u, offers.get().size());

  // Launch a task. This allows us to have the slave send an
  // ExitedExecutorMessage.
  TaskID taskId;
  taskId.set_value("1");

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->MergeFrom(taskId);
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
  task.mutable_executor()->mutable_command()->set_value("sleep 60");

  vector<TaskInfo> tasks;
  tasks.push_back(task);

  // Set up the expectations for launching the task.
  EXPECT_CALL(exec, registered(_, _, _, _));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  // Drop all the status updates from the slave, so that we can
  // ensure the ExitedExecutorMessage is what triggers the slave
  // shutdown.
  DROP_PROTOBUFS(StatusUpdateMessage(), _, master.get());

  driver.launchTasks(offers.get()[0].id(), tasks);

  // Drop the first shutdown message from the master (simulated
  // partition) and allow the second shutdown message to pass when
  // triggered by the ExitedExecutorMessage.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get());

  Future<TaskStatus> lostStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&lostStatus));

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  uint32_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq("PING"), _, _);
    Clock::advance(master::SLAVE_PING_TIMEOUT);
    Clock::settle();
  }

  Clock::advance(master::SLAVE_PING_TIMEOUT);
  Clock::settle();

  // The master will have notified the framework of the lost task.
  AWAIT_READY(lostStatus);
  EXPECT_EQ(TASK_LOST, lostStatus.get().state());

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get());

  // Induce an ExitedExecutorMessage from the slave.
  containerizer.destroy(
      frameworkId.get(), DEFAULT_EXECUTOR_INFO.executor_id());

  // Upon receiving the message, the master will shutdown the slave.
  AWAIT_READY(shutdownMessage);

  Clock::resume();

  driver.stop();
  driver.join();

  Shutdown();
}
Example #8
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to re-register, we deny the
// re-registration by sending a ShutdownMessage to the slave.
// Why? Because during a network partition, the master will remove a
// partitioned slave, thus sending its tasks to LOST. At this point,
// when the partition is removed, the slave will attempt to
// re-register with its running tasks. We've already notified
// frameworks that these tasks were LOST, so we have to have the slave
// shut down.
TEST_F(PartitionTest, PartitionedSlaveReregistration)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(Eq("PING"), _, _);
  DROP_MESSAGES(Eq("PONG"), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  StandaloneMasterDetector detector(master.get());

  Try<PID<Slave> > slave = StartSlave(&exec, &detector);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&driver, _, _));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(offers);
  ASSERT_NE(0u, offers.get().size());

  // Launch a task. This is to ensure the task is killed by the slave,
  // during shutdown.
  TaskID taskId;
  taskId.set_value("1");

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->MergeFrom(taskId);
  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
  task.mutable_executor()->mutable_command()->set_value("sleep 60");

  vector<TaskInfo> tasks;
  tasks.push_back(task);

  // Set up the expectations for launching the task.
  EXPECT_CALL(exec, registered(_, _, _, _));
  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  Future<TaskStatus> runningStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&runningStatus));

  Future<Nothing> statusUpdateAck = FUTURE_DISPATCH(
      slave.get(), &Slave::_statusUpdateAcknowledgement);

  driver.launchTasks(offers.get()[0].id(), tasks);

  AWAIT_READY(runningStatus);
  EXPECT_EQ(TASK_RUNNING, runningStatus.get().state());

  // Wait for the slave to have handled the acknowledgment prior
  // to pausing the clock.
  AWAIT_READY(statusUpdateAck);

  // Drop the first shutdown message from the master (simulated
  // partition), allow the second shutdown message to pass when
  // the slave re-registers.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get());

  Future<TaskStatus> lostStatus;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&lostStatus));

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  uint32_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == master::MAX_SLAVE_PING_TIMEOUTS) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq("PING"), _, _);
    Clock::advance(master::SLAVE_PING_TIMEOUT);
    Clock::settle();
  }

  Clock::advance(master::SLAVE_PING_TIMEOUT);
  Clock::settle();

  // The master will have notified the framework of the lost task.
  AWAIT_READY(lostStatus);
  EXPECT_EQ(TASK_LOST, lostStatus.get().state());

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  Clock::resume();

  // We now complete the partition on the slave side as well. This
  // is done by simulating a master loss event which would normally
  // occur during a network partition.
  detector.appoint(None());

  Future<Nothing> shutdown;
  EXPECT_CALL(exec, shutdown(_))
    .WillOnce(FutureSatisfy(&shutdown));

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get());

  // Have the slave re-register with the master.
  detector.appoint(master.get());

  // Upon re-registration, the master will shutdown the slave.
  // The slave will then shut down the executor.
  AWAIT_READY(shutdownMessage);
  AWAIT_READY(shutdown);

  driver.stop();
  driver.join();

  Shutdown();
}
Example #9
// This test ensures we don't break the API when it comes to JSON
// representation of tasks.
TEST(HTTPTest, ModelTask)
{
  TaskID taskId;
  taskId.set_value("t");

  SlaveID slaveId;
  slaveId.set_value("s");

  ExecutorID executorId;
  executorId.set_value("t");

  FrameworkID frameworkId;
  frameworkId.set_value("f");

  TaskState state = TASK_RUNNING;

  vector<TaskStatus> statuses;

  TaskStatus status;
  status.mutable_task_id()->CopyFrom(taskId);
  status.set_state(state);
  status.mutable_slave_id()->CopyFrom(slaveId);
  status.mutable_executor_id()->CopyFrom(executorId);
  status.set_timestamp(0.0);

  statuses.push_back(status);

  Labels labels;
  labels.add_labels()->CopyFrom(createLabel("ACTION", "port:7987 DENY"));

  Ports ports;
  Port* port = ports.add_ports();
  port->set_number(80);
  port->mutable_labels()->CopyFrom(labels);

  DiscoveryInfo discovery;
  discovery.set_visibility(DiscoveryInfo::CLUSTER);
  discovery.set_name("discover");
  discovery.mutable_ports()->CopyFrom(ports);

  TaskInfo taskInfo;
  taskInfo.set_name("task");
  taskInfo.mutable_task_id()->CopyFrom(taskId);
  taskInfo.mutable_slave_id()->CopyFrom(slaveId);
  taskInfo.mutable_command()->set_value("echo hello");
  taskInfo.mutable_discovery()->CopyFrom(discovery);

  Task task = createTask(taskInfo, state, frameworkId);
  task.add_statuses()->CopyFrom(statuses[0]);

  JSON::Value object = model(task);

  Try<JSON::Value> expected = JSON::parse(
      "{"
      "  \"executor_id\":\"\","
      "  \"framework_id\":\"f\","
      "  \"id\":\"t\","
      "  \"name\":\"task\","
      "  \"resources\":"
      "  {"
      "    \"cpus\":0,"
      "    \"disk\":0,"
      "    \"gpus\":0,"
      "    \"mem\":0"
      "  },"
      "  \"slave_id\":\"s\","
      "  \"state\":\"TASK_RUNNING\","
      "  \"statuses\":"
      "  ["
      "    {"
      "      \"state\":\"TASK_RUNNING\","
      "      \"timestamp\":0"
      "    }"
      "  ],"
      " \"discovery\":"
      " {"
      "   \"name\":\"discover\","
      "   \"ports\":"
      "   {"
      "     \"ports\":"
      "     ["
      "       {"
      "         \"number\":80,"
      "         \"labels\":"
      "         {"
      "           \"labels\":"
      "           ["
      "             {"
      "              \"key\":\"ACTION\","
      "              \"value\":\"port:7987 DENY\""
      "             }"
      "           ]"
      "         }"
      "       }"
      "     ]"
      "   },"
      "   \"visibility\":\"CLUSTER\""
      " }"
      "}");

  ASSERT_SOME(expected);

  EXPECT_EQ(expected.get(), object);
}
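// A hedged sketch of the createLabel() helper used above; the real
// helper lives in the Mesos test utilities, so the exact signature is
// an assumption here.
Label createLabel(const std::string& key, const std::string& value)
{
  Label label;
  label.set_key(key);
  label.set_value(value);
  return label;
}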
Example #10
TEST(MasterTest, KillTask)
{
  ASSERT_TRUE(GTEST_IS_THREADSAFE);

  SimpleAllocator a;
  Master m(&a);
  PID<Master> master = process::spawn(&m);

  MockExecutor exec;

  trigger killTaskCall, shutdownCall;

  EXPECT_CALL(exec, registered(_, _, _, _, _, _))
    .Times(1);

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdate(TASK_RUNNING));

  EXPECT_CALL(exec, killTask(_, _))
    .WillOnce(Trigger(&killTaskCall));

  EXPECT_CALL(exec, shutdown(_))
    .WillOnce(Trigger(&shutdownCall));

  map<ExecutorID, Executor*> execs;
  execs[DEFAULT_EXECUTOR_ID] = &exec;

  TestingIsolationModule isolationModule(execs);

  Resources resources = Resources::parse("cpus:2;mem:1024");

  Slave s(resources, true, &isolationModule);
  PID<Slave> slave = process::spawn(&s);

  BasicMasterDetector detector(master, slave, true);

  MockScheduler sched;
  MesosSchedulerDriver driver(&sched, "", DEFAULT_EXECUTOR_INFO, master);

  vector<Offer> offers;
  TaskStatus status;

  trigger resourceOffersCall, statusUpdateCall;

  EXPECT_CALL(sched, registered(&driver, _))
    .Times(1);

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&offers),
                    Trigger(&resourceOffersCall)))
    .WillRepeatedly(Return());

  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(DoAll(SaveArg<1>(&status), Trigger(&statusUpdateCall)));

  driver.start();

  WAIT_UNTIL(resourceOffersCall);

  EXPECT_NE(0u, offers.size());

  TaskID taskId;
  taskId.set_value("1");

  TaskDescription task;
  task.set_name("");
  task.mutable_task_id()->MergeFrom(taskId);
  task.mutable_slave_id()->MergeFrom(offers[0].slave_id());
  task.mutable_resources()->MergeFrom(offers[0].resources());

  vector<TaskDescription> tasks;
  tasks.push_back(task);

  driver.launchTasks(offers[0].id(), tasks);

  WAIT_UNTIL(statusUpdateCall);

  EXPECT_EQ(TASK_RUNNING, status.state());

  driver.killTask(taskId);

  WAIT_UNTIL(killTaskCall);

  driver.stop();
  driver.join();

  WAIT_UNTIL(shutdownCall); // To ensure we can deallocate the MockExecutor.

  process::terminate(slave);
  process::wait(slave);

  process::terminate(master);
  process::wait(master);
}
Example #11
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool strict,
    bool rebooted)
{
  RunState state;
  state.id = containerId;
  string message;

  // See if the sentinel file exists. This is done first so it is
  // known even if partial state is returned, e.g., if the libprocess
  // pid file is not recovered. It indicates the slave removed the
  // executor.
  string path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  state.completed = os::exists(path);

  // Find the tasks.
  Try<list<string>> tasks = paths::getTaskPaths(
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      containerId);

  if (tasks.isError()) {
    return Error(
        "Failed to find tasks for executor run " + containerId.value() +
        ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(Path(path).basename());

    Try<TaskState> task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, containerId, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task->errors;
  }

  path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  // If agent host is rebooted, we do not read the forked pid and libprocess pid
  // since those two pids are obsolete after reboot. And we remove the forked
  // pid file to make sure we will not read it in the case the agent process is
  // restarted after we checkpoint the new boot ID in `Slave::__recover` (i.e.,
  // agent recovery is done after the reboot).
  if (rebooted) {
    if (os::exists(path)) {
      Try<Nothing> rm = os::rm(path);
      if (rm.isError()) {
        return Error(
            "Failed to remove executor forked pid file '" + path + "': " +
            rm.error());
      }
    }

    return state;
  }

  if (!os::exists(path)) {
    // This could happen if the slave died before the containerizer checkpointed
    // the forked pid or agent process is restarted after agent host is rebooted
    // since we remove this file in the above code.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  // Read the forked pid.
  Result<string> pid = state::read<string>(path);
  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid->empty()) {
    // This could happen if the slave is hard rebooted after the file is created
    // but before the data is synced on disk.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid '" + pid.get() + "' "
                 "from pid file '" + path + "': " +
                 forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (os::exists(path)) {
    pid = state::read<string>(path);

    if (pid.isError()) {
      message = "Failed to read executor libprocess pid from '" + path +
                "': " + pid.error();

      if (strict) {
        return Error(message);
      } else {
        LOG(WARNING) << message;
        state.errors++;
        return state;
      }
    }

    if (pid->empty()) {
      // This could happen if the slave is hard rebooted after the file is
      // created but before the data is synced on disk.
      LOG(WARNING) << "Found empty executor libprocess pid file '" << path
                   << "'";
      return state;
    }

    state.libprocessPid = process::UPID(pid.get());
    state.http = false;

    return state;
  }

  path = paths::getExecutorHttpMarkerPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  // The marker could be absent if the slave died before the executor
  // registered with the slave.
  if (!os::exists(path)) {
    LOG(WARNING) << "Failed to find '" << paths::LIBPROCESS_PID_FILE
                 << "' or '" << paths::HTTP_MARKER_FILE
                 << "' for container " << containerId
                 << " of executor '" << executorId
                 << "' of framework " << frameworkId;
    return state;
  }

  state.http = true;
  return state;
}
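// A hedged call-site sketch for the overload above; `metaDir` and the
// ID variables are illustrative placeholders.
Try<RunState> run = RunState::recover(
    metaDir, slaveId, frameworkId, executorId, containerId,
    true,    // strict: treat checkpoint corruption as an error.
    false);  // rebooted: the agent host has not been rebooted.

if (run.isError()) {
  LOG(ERROR) << "Failed to recover run state: " << run.error();
} else if (run->completed) {
  LOG(INFO) << "Executor run " << run->id.value() << " already completed";
}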
Example #12
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const ContainerID& containerId,
    bool strict)
{
  RunState state;
  state.id = containerId;
  string message;

  // See if the sentinel file exists. This is done first so it is
  // known even if partial state is returned, e.g., if the libprocess
  // pid file is not recovered. It indicates the slave removed the
  // executor.
  string path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  state.completed = os::exists(path);

  // Find the tasks.
  Try<list<string> > tasks = paths::getTaskPaths(
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      containerId);

  if (tasks.isError()) {
    return Error(
        "Failed to find tasks for executor run " + containerId.value() +
        ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(Path(path).basename());

    Try<TaskState> task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, containerId, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task.get().errors;
  }

  // Read the forked pid.
  path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);
  if (!os::exists(path)) {
    // This could happen if the slave died before the isolator
    // checkpointed the forked pid.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, containerId);

  if (!os::exists(path)) {
    // This could happen if the slave died before the executor
    // registered with the slave.
    LOG(WARNING)
      << "Failed to find executor libprocess pid file '" << path << "'";
    return state;
  }

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor libprocess pid file '" << path << "'";
    return state;
  }

  state.libprocessPid = process::UPID(pid.get());

  return state;
}
Example #13
int main(int argc, char** argv)
{
  GOOGLE_PROTOBUF_VERIFY_VERSION;

  Flags flags;

  Try<flags::Warnings> load = flags.load(None(), argc, argv);

  if (load.isError()) {
    cerr << flags.usage(load.error()) << endl;
    return EXIT_FAILURE;
  }

  if (flags.help) {
    cout << flags.usage() << endl;
    return EXIT_SUCCESS;
  }

  // Log any flag warnings.
  foreach (const flags::Warning& warning, load->warnings) {
    LOG(WARNING) << warning.message;
  }

  if (flags.health_check_json.isNone()) {
    cerr << flags.usage("Expected JSON with health check description") << endl;
    return EXIT_FAILURE;
  }

  Try<JSON::Object> parse =
    JSON::parse<JSON::Object>(flags.health_check_json.get());

  if (parse.isError()) {
    cerr << flags.usage("Failed to parse --health_check_json: " + parse.error())
         << endl;
    return EXIT_FAILURE;
  }

  Try<HealthCheck> check = protobuf::parse<HealthCheck>(parse.get());

  if (check.isError()) {
    cerr << flags.usage("Failed to parse --health_check_json: " + check.error())
         << endl;
    return EXIT_FAILURE;
  }

  if (flags.executor.isNone()) {
    cerr << flags.usage("Missing required option --executor") << endl;
    return EXIT_FAILURE;
  }

  if (check.get().has_http() && check.get().has_command()) {
    cerr << flags.usage("Both 'http' and 'command' health check requested")
         << endl;
    return EXIT_FAILURE;
  }

  if (!check.get().has_http() && !check.get().has_command()) {
    cerr << flags.usage("Expecting one of 'http' or 'command' health check")
         << endl;
    return EXIT_FAILURE;
  }

  if (flags.task_id.isNone()) {
    cerr << flags.usage("Missing required option --task_id") << endl;
    return EXIT_FAILURE;
  }

  TaskID taskID;
  taskID.set_value(flags.task_id.get());

  mesos::internal::HealthCheckerProcess process(
    check.get(),
    flags.executor.get(),
    taskID);

  process::spawn(&process);

  process::Future<Nothing> checking =
    process::dispatch(
      process, &mesos::internal::HealthCheckerProcess::healthCheck);

  checking.await();

  process::terminate(process);
  process::wait(process);

  if (checking.isFailed()) {
    LOG(WARNING) << "Health check failed " << checking.failure();
    return EXIT_FAILURE;
  }

  return EXIT_SUCCESS;
}
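// A sketch of a HealthCheck message that satisfies the validation in
// main() above: exactly one of 'http' or 'command' is set. Field names
// follow the Mesos HealthCheck proto; the values are illustrative.
HealthCheck check;
check.mutable_command()->set_value("curl -f http://localhost:8080/health");
check.set_interval_seconds(5);
check.set_grace_period_seconds(10);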
Example #14
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const UUID& uuid,
    bool strict)
{
  RunState state;
  state.id = uuid;
  string message;

  // Find the tasks.
  const Try<list<string> >& tasks = os::glob(strings::format(
      paths::TASK_PATH,
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      uuid.toString(),
      "*").get());

  if (tasks.isError()) {
    return Error("Failed to find tasks for executor run " + uuid.toString() +
                 ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(os::basename(path).get());

    const Try<TaskState>& task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
  }

  // Read the forked pid.
  string path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor's forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      return state;
    }
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor's libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      return state;
    }
  }

  state.libprocessPid = process::UPID(pid.get());

  return state;
}
Example #15
TEST_F(FaultToleranceTest, ForwardStatusUpdateUnknownExecutor)
{
  Try<PID<Master> > master = StartMaster();
  ASSERT_SOME(master);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);

  Try<PID<Slave> > slave = StartSlave(&exec);
  ASSERT_SOME(slave);

  MockScheduler sched;
  MesosSchedulerDriver driver(&sched, DEFAULT_FRAMEWORK_INFO, master.get());

  FrameworkID frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(SaveArg<1>(&frameworkId));

  Future<vector<Offer> > offers;
  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillOnce(FutureArg<1>(&offers));

  driver.start();

  AWAIT_READY(offers);
  EXPECT_NE(0u, offers.get().size());
  Offer offer = offers.get()[0];

  TaskInfo task;
  task.set_name("");
  task.mutable_task_id()->set_value("1");
  task.mutable_slave_id()->MergeFrom(offer.slave_id());
  task.mutable_resources()->MergeFrom(offer.resources());
  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);

  vector<TaskInfo> tasks;
  tasks.push_back(task);

  Future<Nothing> statusUpdate;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureSatisfy(&statusUpdate));    // TASK_RUNNING of task1.

  EXPECT_CALL(exec, registered(_, _, _, _));

  EXPECT_CALL(exec, launchTask(_, _))
    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));

  driver.launchTasks(offer.id(), tasks);

  // Wait until TASK_RUNNING of task1 is received.
  AWAIT_READY(statusUpdate);

  // Simulate the slave receiving status update from an unknown
  // (e.g. exited) executor of the given framework.
  Future<TaskStatus> status;
  EXPECT_CALL(sched, statusUpdate(&driver, _))
    .WillOnce(FutureArg<1>(&status));           // TASK_RUNNING of task2.

  TaskID taskId;
  taskId.set_value("task2");

  StatusUpdate statusUpdate2 = createStatusUpdate(
      frameworkId, offer.slave_id(), taskId, TASK_RUNNING, "Dummy update");

  process::dispatch(slave.get(), &Slave::statusUpdate, statusUpdate2);

  // Ensure that the scheduler receives task2's update.
  AWAIT_READY(status);
  EXPECT_EQ(taskId, status.get().task_id());
  EXPECT_EQ(TASK_RUNNING, status.get().state());

  EXPECT_CALL(exec, shutdown(_))
    .Times(AtMost(1));

  driver.stop();
  driver.join();

  Shutdown();
}
Example #16
inline bool operator<(const TaskID& left, const TaskID& right)
{
  return left.value() < right.value();
}
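// Usage sketch: operator< is what lets TaskID key ordered containers,
// e.g. the state.tasks maps populated in the RunState::recover examples.
#include <map>

std::map<TaskID, int> attempts;

TaskID taskId;
taskId.set_value("task-42");
attempts[taskId]++; // Ordered lookup via the comparison above.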
Example #17
Try<RunState> RunState::recover(
    const string& rootDir,
    const SlaveID& slaveId,
    const FrameworkID& frameworkId,
    const ExecutorID& executorId,
    const UUID& uuid,
    bool strict)
{
  RunState state;
  state.id = uuid;
  string message;

  // Find the tasks.
  const Try<list<string> >& tasks = os::glob(strings::format(
      paths::TASK_PATH,
      rootDir,
      slaveId,
      frameworkId,
      executorId,
      uuid.toString(),
      "*").get());

  if (tasks.isError()) {
    return Error("Failed to find tasks for executor run " + uuid.toString() +
                 ": " + tasks.error());
  }

  // Recover tasks.
  foreach (const string& path, tasks.get()) {
    TaskID taskId;
    taskId.set_value(os::basename(path).get());

    const Try<TaskState>& task = TaskState::recover(
        rootDir, slaveId, frameworkId, executorId, uuid, taskId, strict);

    if (task.isError()) {
      return Error(
          "Failed to recover task " + taskId.value() + ": " + task.error());
    }

    state.tasks[taskId] = task.get();
    state.errors += task.get().errors;
  }

  // Read the forked pid.
  string path = paths::getForkedPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);
  if (!os::exists(path)) {
    // This could happen if the slave died before the isolator
    // checkpointed the forked pid.
    LOG(WARNING) << "Failed to find executor forked pid file '" << path << "'";
    return state;
  }

  Try<string> pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor forked pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor forked pid file '" << path << "'";
    return state;
  }

  Try<pid_t> forkedPid = numify<pid_t>(pid.get());
  if (forkedPid.isError()) {
    return Error("Failed to parse forked pid " + pid.get() +
                 ": " + forkedPid.error());
  }

  state.forkedPid = forkedPid.get();

  // Read the libprocess pid.
  path = paths::getLibprocessPidPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  if (!os::exists(path)) {
    // This could happen if the slave died before the executor
    // registered with the slave.
    LOG(WARNING)
      << "Failed to find executor libprocess pid file '" << path << "'";
    return state;
  }

  pid = os::read(path);

  if (pid.isError()) {
    message = "Failed to read executor libprocess pid from '" + path +
              "': " + pid.error();

    if (strict) {
      return Error(message);
    } else {
      LOG(WARNING) << message;
      state.errors++;
      return state;
    }
  }

  if (pid.get().empty()) {
    // This could happen if the slave died after opening the file for
    // writing but before it checkpointed anything.
    LOG(WARNING) << "Found empty executor libprocess pid file '" << path << "'";
    return state;
  }

  state.libprocessPid = process::UPID(pid.get());

  // See if the sentinel file exists.
  path = paths::getExecutorSentinelPath(
      rootDir, slaveId, frameworkId, executorId, uuid);

  state.completed = os::exists(path);

  return state;
}
Example #18
// Ensures the scheduler driver can handle the UPDATE event.
TEST_F(SchedulerDriverEventTest, Update)
{
  Try<PID<Master>> master = StartMaster();
  ASSERT_SOME(master);

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);

  EXPECT_CALL(sched, registered(&driver, _, _));

  Future<Message> frameworkRegisteredMessage =
    FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);

  driver.start();

  AWAIT_READY(frameworkRegisteredMessage);
  UPID frameworkPid = frameworkRegisteredMessage.get().to;

  FrameworkRegisteredMessage message;
  ASSERT_TRUE(message.ParseFromString(frameworkRegisteredMessage.get().body));

  FrameworkID frameworkId = message.framework_id();

  SlaveID slaveId;
  slaveId.set_value("S");

  TaskID taskId;
  taskId.set_value("T");

  ExecutorID executorId;
  executorId.set_value("E");

  // Generate an update that needs no acknowledgement.
  Event event;
  event.set_type(Event::UPDATE);
  event.mutable_update()->mutable_status()->CopyFrom(
      protobuf::createStatusUpdate(
          frameworkId,
          slaveId,
          taskId,
          TASK_RUNNING,
          TaskStatus::SOURCE_MASTER,
          None(),
          "message",
          None(),
          executorId).status());

  Future<Nothing> statusUpdate;
  Future<Nothing> statusUpdate2;
  EXPECT_CALL(sched, statusUpdate(&driver, event.update().status()))
    .WillOnce(FutureSatisfy(&statusUpdate))
    .WillOnce(FutureSatisfy(&statusUpdate2));

  process::post(master.get(), frameworkPid, event);

  AWAIT_READY(statusUpdate);

  // Generate an update that requires acknowledgement.
  event.mutable_update()->mutable_status()->set_uuid(UUID::random().toBytes());

  Future<mesos::scheduler::Call> acknowledgement = DROP_CALL(
      mesos::scheduler::Call(), mesos::scheduler::Call::ACKNOWLEDGE, _, _);

  process::post(master.get(), frameworkPid, event);

  AWAIT_READY(statusUpdate2);
  AWAIT_READY(acknowledgement);
}
Example #19
// This test ensures we don't break the API when it comes to JSON
// representation of tasks. Also, we want to ensure that tasks are
// modeled the same way when using 'Task' vs. 'TaskInfo'.
TEST(HTTP, ModelTask)
{
  TaskID taskId;
  taskId.set_value("t");

  SlaveID slaveId;
  slaveId.set_value("s");

  ExecutorID executorId;
  executorId.set_value("t");

  FrameworkID frameworkId;
  frameworkId.set_value("f");

  TaskState state = TASK_RUNNING;

  vector<TaskStatus> statuses;

  TaskStatus status;
  status.mutable_task_id()->CopyFrom(taskId);
  status.set_state(state);
  status.mutable_slave_id()->CopyFrom(slaveId);
  status.mutable_executor_id()->CopyFrom(executorId);
  status.set_timestamp(0.0);

  statuses.push_back(status);

  TaskInfo task;
  task.set_name("task");
  task.mutable_task_id()->CopyFrom(taskId);
  task.mutable_slave_id()->CopyFrom(slaveId);
  task.mutable_command()->set_value("echo hello");

  Task task_ = protobuf::createTask(task, state, frameworkId);
  task_.add_statuses()->CopyFrom(statuses[0]);

  JSON::Value object = model(task, frameworkId, state, statuses);
  JSON::Value object_ = model(task_);

  Try<JSON::Value> expected = JSON::parse(
      "{"
      "  \"executor_id\":\"\","
      "  \"framework_id\":\"f\","
      "  \"id\":\"t\","
      "  \"name\":\"task\","
      "  \"resources\":"
      "  {"
      "    \"cpus\":0,"
      "    \"disk\":0,"
      "    \"mem\":0"
      "  },"
      "  \"slave_id\":\"s\","
      "  \"state\":\"TASK_RUNNING\","
      "  \"statuses\":"
      "  ["
      "    {"
      "      \"state\":\"TASK_RUNNING\","
      "      \"timestamp\":0"
      "    }"
      "  ]"
      "}");

  ASSERT_SOME(expected);

  EXPECT_EQ(expected.get(), object);
  EXPECT_EQ(expected.get(), object_);

  // Ensure both are modeled the same.
  EXPECT_EQ(object, object_);
}
Example #20
// The purpose of this test is to ensure that when slaves are removed
// from the master, and then attempt to send status updates, we send
// a ShutdownMessage to the slave. Why? Because during a network
// partition, the master will remove a partitioned slave, thus sending
// its tasks to LOST. At this point, when the partition is removed,
// the slave may attempt to send updates if it was unaware that the
// master removed it. We've already notified frameworks that these
// tasks were LOST, so we have to have the slave shut down.
TEST_F(PartitionTest, PartitionedSlaveStatusUpdates)
{
  master::Flags masterFlags = CreateMasterFlags();
  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
  ASSERT_SOME(master);

  // Allow the master to PING the slave, but drop all PONG messages
  // from the slave. Note that we don't match on the master / slave
  // PIDs because it's actually the SlaveObserver Process that sends
  // the pings.
  Future<Message> ping = FUTURE_MESSAGE(
      Eq(PingSlaveMessage().GetTypeName()), _, _);

  DROP_PROTOBUFS(PongSlaveMessage(), _, _);

  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);

  MockExecutor exec(DEFAULT_EXECUTOR_ID);
  TestContainerizer containerizer(&exec);

  Owned<MasterDetector> detector = master.get()->createDetector();
  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), &containerizer);
  ASSERT_SOME(slave);

  AWAIT_READY(slaveRegisteredMessage);
  SlaveID slaveId = slaveRegisteredMessage.get().slave_id();

  MockScheduler sched;
  MesosSchedulerDriver driver(
      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);

  Future<FrameworkID> frameworkId;
  EXPECT_CALL(sched, registered(&driver, _, _))
    .WillOnce(FutureArg<1>(&frameworkId));

  EXPECT_CALL(sched, resourceOffers(&driver, _))
    .WillRepeatedly(Return());

  driver.start();

  AWAIT_READY(frameworkId);

  // Drop the first shutdown message from the master (simulated
  // partition), allow the second shutdown message to pass when
  // the slave sends an update.
  Future<ShutdownMessage> shutdownMessage =
    DROP_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  EXPECT_CALL(sched, offerRescinded(&driver, _))
    .WillRepeatedly(Return());

  Future<Nothing> slaveLost;
  EXPECT_CALL(sched, slaveLost(&driver, _))
    .WillOnce(FutureSatisfy(&slaveLost));

  Clock::pause();

  // Now, induce a partition of the slave by having the master
  // timeout the slave.
  size_t pings = 0;
  while (true) {
    AWAIT_READY(ping);
    pings++;
    if (pings == masterFlags.max_slave_ping_timeouts) {
      break;
    }
    ping = FUTURE_MESSAGE(Eq(PingSlaveMessage().GetTypeName()), _, _);
    Clock::advance(masterFlags.slave_ping_timeout);
    Clock::settle();
  }

  Clock::advance(masterFlags.slave_ping_timeout);
  Clock::settle();

  // Wait for the master to attempt to shut down the slave.
  AWAIT_READY(shutdownMessage);

  // The master will notify the framework that the slave was lost.
  AWAIT_READY(slaveLost);

  shutdownMessage = FUTURE_PROTOBUF(ShutdownMessage(), _, slave.get()->pid);

  // At this point, the slave still thinks it's registered, so we
  // simulate a status update coming from the slave.
  TaskID taskId;
  taskId.set_value("task_id");
  const StatusUpdate& update = protobuf::createStatusUpdate(
      frameworkId.get(),
      slaveId,
      taskId,
      TASK_RUNNING,
      TaskStatus::SOURCE_SLAVE,
      UUID::random());

  StatusUpdateMessage message;
  message.mutable_update()->CopyFrom(update);
  message.set_pid(stringify(slave.get()->pid));

  process::post(master.get()->pid, message);

  // The master should shutdown the slave upon receiving the update.
  AWAIT_READY(shutdownMessage);

  Clock::resume();

  driver.stop();
  driver.join();
}