Exemple #1
0
// Opens a fresh connection to the coordinator, sends the single-character
// user command `c`, and reports the coordinator's answer.
//
// Parameters:
//   c              - command character forwarded in DmtcpMessage::coordCmd.
//                    For 'i', the interval from the ENV_VAR_CKPT_INTR
//                    environment variable (if set) is sent along.
//   coordCmdStatus - optional out: status of the command.
//   numPeers       - optional out: reply.numPeers.
//   isRunning      - optional out: reply.isRunning.
//   ckptInterval   - optional out: reply.theCheckpointInterval.
// Any out-parameter may be NULL when the caller does not need that value.
//
// Returns the extra reply data filled in by recvMsgFromCoordinatorRaw(), or
// NULL when no reply is read (coordinator not found, or a quit command).
// NOTE(review): the returned buffer looks caller-owned -- confirm against
// recvMsgFromCoordinatorRaw() how it must be released.
char*
connectAndSendUserCommand(char c,
                          int *coordCmdStatus,
                          int *numPeers,
                          int *isRunning,
                          int *ckptInterval)
{
  char *replyData = NULL;
  int coordFd = createNewSocketToCoordinator(COORD_ANY);
  if (coordFd == -1) {
    // coordCmdStatus is optional everywhere else in this function, so it
    // must not be dereferenced blindly here either.
    if (coordCmdStatus != NULL) {
      *coordCmdStatus = CoordCmdStatus::ERROR_COORDINATOR_NOT_FOUND;
    }
    return replyData;
  }

  // Tell the coordinator to run given user command
  DmtcpMessage msg(DMT_USER_CMD);
  msg.coordCmd = c;

  if (c == 'i') {
    const char *interval = getenv(ENV_VAR_CKPT_INTR);
    if (interval != NULL) {
      msg.theCheckpointInterval = jalib::StringToInt(interval);
    }
  }
  JASSERT(Util::writeAll(coordFd, &msg, sizeof(msg)) == sizeof(msg));

  // The coordinator will violently close our socket...
  if (c == 'q' || c == 'Q') {
    if (coordCmdStatus != NULL) {
      *coordCmdStatus = CoordCmdStatus::NOERROR;
    }
    // No reply is expected for a quit command; release our end of the
    // connection instead of leaking the descriptor.
    _real_close(coordFd);
    return replyData;
  }

  // Receive REPLY
  DmtcpMessage reply;
  reply.poison();
  recvMsgFromCoordinatorRaw(coordFd, &reply, (void**)&replyData);
  reply.assertValid();
  JASSERT(reply.type == DMT_USER_CMD_RESULT);

  if (coordCmdStatus != NULL) {
    *coordCmdStatus = reply.coordCmdStatus;
  }
  if (numPeers != NULL) {
    *numPeers = reply.numPeers;
  }
  if (isRunning != NULL) {
    *isRunning = reply.isRunning;
  }
  if (ckptInterval != NULL) {
    *ckptInterval = reply.theCheckpointInterval;
  }

  _real_close(coordFd);

  return replyData;
}
Exemple #2
0
// Performs the initial handshake with the coordinator over `fd`.
//
// The outgoing message `msg` is completed with this process's real pid and
// the checkpoint interval, and is followed by an extra payload of the form
// "<hostname>\0<progname>\0".  The coordinator's answer is then read back
// into `msg`.
//
// Parameters:
//   fd       - socket connected to the coordinator.
//   msg      - handshake message to send (passed by value, reused for the
//              reply).
//   progname - program name reported to the coordinator.
//   compId   - only used for diagnostics when the coordinator answers
//              DMT_REJECT_WRONG_COMP; must be non-NULL in that case.
//
// Returns the coordinator's reply, which is asserted to be DMT_ACCEPT.
// Exits the process on DMT_KILL_PEER and fails an assertion on any rejection.
DmtcpMessage
sendRecvHandshake(int fd,
                  DmtcpMessage msg,
                  string progname,
                  UniquePid *compId)
{
  if (dmtcp_virtual_to_real_pid) {
    msg.realPid = dmtcp_virtual_to_real_pid(getpid());
  } else {
    msg.realPid = getpid();
  }

  msg.theCheckpointInterval = getCkptInterval();

  string hostname = jalib::Filesystem::GetCurrentHostname();

  // Build "<hostname>\0<progname>\0" in a heap-backed string rather than a
  // variable-length array: VLAs are not standard C++ and would place a
  // buffer of unbounded size on the stack.
  string payload;
  payload.reserve(hostname.length() + progname.length() + 2);
  payload.append(hostname);
  payload.push_back('\0');
  payload.append(progname);
  payload.push_back('\0');

  sendMsgToCoordinatorRaw(fd, msg, &payload[0], payload.length());

  recvMsgFromCoordinatorRaw(fd, &msg);
  msg.assertValid();
  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _real_exit(0);
  }
  if (msg.type == DMT_REJECT_NOT_RUNNING) {
    JASSERT(false)
    .Text("Connection rejected by the coordinator.\n"
          "Reason: Current computation not in RUNNING state.\n"
          "         Is a checkpoint/restart in progress?");
  } else if (msg.type == DMT_REJECT_WRONG_COMP) {
    JASSERT(compId != NULL);
    JASSERT(false) (*compId)
    .Text("Connection rejected by the coordinator.\n"
          " Reason: This process has a different computation group.");
  }
  // Coordinator also prints this, but its stderr may go to /dev/null
  if (msg.type == DMT_REJECT_NOT_RESTARTING) {
    string coordinatorHost = ""; // C++ magic code; "" to be invisibly replaced
    int coordinatorPort;
    getCoordHostAndPort(COORD_ANY, coordinatorHost, &coordinatorPort);
    JNOTE ("\n\n*** Computation not in RESTARTING or CHECKPOINTED state."
        "\n***Can't join the existing coordinator, as it is serving a"
        "\n***different computation.  Consider launching a new coordinator."
        "\n***Consider, also, checking with:  dmtcp_command --status")
        (coordinatorPort);
  }
  // Any reply other than DMT_ACCEPT (including the NOT_RESTARTING rejection
  // reported above) is fatal.
  JASSERT(msg.type == DMT_ACCEPT)(msg.type);
  return msg;
}
Exemple #3
0
void
DmtcpWorker::waitForSuspendMessage()
{
  SharedData::resetBarrierInfo();

  if (dmtcp_no_coordinator()) {
    string shmFile = jalib::Filesystem::GetDeviceName(PROTECTED_SHM_FD);
    JASSERT(!shmFile.empty());
    unlink(shmFile.c_str());
    CoordinatorAPI::waitForCheckpointCommand();
    ProcessInfo::instance().numPeers(1);
    ProcessInfo::instance().compGroup(SharedData::getCompId());
    return;
  }

  if (ThreadSync::destroyDmtcpWorkerLockTryLock() != 0) {
    JTRACE("User thread is performing exit()."
           " ckpt thread exit()ing as well");
    ckptThreadPerformExit();
  }
  if (exitInProgress()) {
    ThreadSync::destroyDmtcpWorkerLockUnlock();
    ckptThreadPerformExit();
  }

  JTRACE("waiting for SUSPEND message");

  DmtcpMessage msg;
  CoordinatorAPI::recvMsgFromCoordinator(&msg);

  if (exitInProgress()) {
    ThreadSync::destroyDmtcpWorkerLockUnlock();
    ckptThreadPerformExit();
  }

  msg.assertValid();
  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _exit(0);
  }

  JASSERT(msg.type == DMT_DO_SUSPEND) (msg.type);

  // Coordinator sends some computation information along with the SUSPEND
  // message. Extracting that.
  SharedData::updateGeneration(msg.compGroup.computationGeneration());
  JASSERT(SharedData::getCompId() == msg.compGroup.upid())
    (SharedData::getCompId()) (msg.compGroup);

  _exitAfterCkpt = msg.exitAfterCkpt;
}
pid_t dmtcp::DmtcpCoordinatorAPI::getVirtualPidFromCoordinator()
{
  connectToCoordinator();
  DmtcpMessage msg(DMT_GET_VIRTUAL_PID);
  _coordinatorSocket << msg;

  DmtcpMessage reply;
  reply.poison();
  _coordinatorSocket >> reply;
  reply.assertValid();
  JASSERT(reply.type == DMT_GET_VIRTUAL_PID_RESULT) (reply.type);
  JASSERT(reply.virtualPid != -1);

  _coordinatorSocket.close();
  return reply.virtualPid;
}
Exemple #5
0
// Queries the coordinator's name service for the value stored under `key`
// in the database identified by `id`.
//
// On input, val points to a buffer in user memory and *val_len is the maximum
//   size of that buffer (the memory allocated by user).
// On output, we copy data to val, and set *val_len to the actual buffer size
//   (to the size of the data that we copied to the user buffer).
//
// Parameters:
//   id      - name-service database id; copied into the fixed-size
//             DmtcpMessage::nsid field (a warning is issued when it does not
//             fit together with a terminating NUL).
//   key     - key bytes to look up.
//   key_len - number of bytes in key.
//   val     - caller-provided output buffer.
//   val_len - in: capacity of val; out: number of bytes written to val.
//
// Returns the number of value bytes received, or 0 when any argument is
// missing (NULL pointer or zero-length key); in that case nothing is sent
// and *val_len is left untouched.
int CoordinatorAPI::sendQueryToCoordinator(const char *id,
                                           const void *key,
                                           uint32_t key_len,
                                           void *val,
                                           uint32_t *val_len)
{
  // Validate first, before any argument is dereferenced or any message or
  // socket work is done.
  if (id == NULL || key == NULL || key_len == 0 ||
      val == NULL || val_len == NULL) {
    return 0;
  }

  DmtcpMessage msg (DMT_NAME_SERVICE_QUERY);
  JWARNING(strlen(id) < sizeof(msg.nsid));
  // Bound the copy by the destination field itself rather than a magic
  // number.  strncpy() zero-pads short ids and does not NUL-terminate when
  // the id fills the whole field.
  strncpy(msg.nsid, id, sizeof(msg.nsid));
  msg.keyLen = key_len;
  msg.valLen = 0;
  msg.extraBytes = key_len;
  jalib::JSocket sock = _coordinatorSocket;

  // While in the running state, talk over a dedicated name-service socket
  // (created lazily) instead of the main coordinator socket.
  if (dmtcp_is_running_state()) {
    if (!_nsSock.isValid()) {
      _nsSock = createNewSocketToCoordinator(COORD_ANY);
      JASSERT(_nsSock.isValid());
      _nsSock.changeFd(PROTECTED_NS_FD);
      DmtcpMessage m(DMT_NAME_SERVICE_WORKER);
      _nsSock << m;
    }
    sock = _nsSock;
    JASSERT(sock.isValid());
  }

  // Request: header followed by the raw key bytes.
  sock << msg;
  sock.writeAll((const char *)key, key_len);

  // Response: header announcing how many value bytes follow.
  msg.poison();
  sock >> msg;
  msg.assertValid();
  JASSERT(msg.type == DMT_NAME_SERVICE_QUERY_RESPONSE &&
          msg.extraBytes == msg.valLen);

  // The caller's buffer must be able to hold the whole value.
  JASSERT (*val_len >= msg.valLen);
  *val_len = msg.valLen;
  if (*val_len > 0) {
    sock.readAll((char*)val, *val_len);
  }

  return *val_len;
}
Exemple #6
0
void
DmtcpWorker::acknowledgeSuspendMsg()
{
  if (dmtcp_no_coordinator()) {
    return;
  }

  JTRACE("Waiting for DMT_DO_CHECKPOINT message");
  CoordinatorAPI::sendMsgToCoordinator(DmtcpMessage(DMT_OK));

  DmtcpMessage msg;
  CoordinatorAPI::recvMsgFromCoordinator(&msg);
  msg.assertValid();

  JASSERT(msg.type == DMT_COMPUTATION_INFO) (msg.type);
  JTRACE("Computation information") (msg.compGroup) (msg.numPeers);
  ProcessInfo::instance().compGroup(msg.compGroup);
  ProcessInfo::instance().numPeers(msg.numPeers);
}
Exemple #7
0
void waitForBarrier(const string& barrierId)
{
  sendMsgToCoordinator(DmtcpMessage(DMT_OK));

  JTRACE("waiting for DMT_BARRIER_RELEASED message");

  char *extraData = NULL;
  DmtcpMessage msg;
  recvMsgFromCoordinator(&msg, (void**)&extraData);

  msg.assertValid();
  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _exit(0);
  }

  JASSERT(msg.type == DMT_BARRIER_RELEASED) (msg.type);
  JASSERT(extraData != NULL);
  JASSERT(barrierId == extraData) (barrierId) (extraData);

  JALLOC_FREE(extraData);
}
Exemple #8
0
// Performs the initial handshake over the coordinator socket.
//
// `msg` is completed with the real pid and checkpoint interval, then sent
// followed by the NUL-terminated hostname and the NUL-terminated program
// name.  The coordinator's reply is read back into `msg`.
//
// Parameters:
//   msg      - handshake message (by value; reused to hold the reply).
//   progname - program name reported to the coordinator.
//   compId   - used only in the diagnostic for DMT_REJECT_WRONG_COMP; must
//              be non-NULL in that case.
//
// Returns the reply, asserted to be DMT_ACCEPT.  Exits on DMT_KILL_PEER and
// fails an assertion when the coordinator rejects the connection.
DmtcpMessage CoordinatorAPI::sendRecvHandshake(DmtcpMessage msg,
                                               string progname,
                                               UniquePid *compId)
{
  // Translate our pid when the virtual-to-real hook is available.
  msg.realPid = dmtcp_virtual_to_real_pid
                  ? dmtcp_virtual_to_real_pid(getpid())
                  : getpid();
  msg.theCheckpointInterval = getCkptInterval();

  // Both strings travel with their terminating NUL.
  string hostname = jalib::Filesystem::GetCurrentHostname();
  const size_t hostBytes = hostname.length() + 1;
  const size_t progBytes = progname.length() + 1;
  msg.extraBytes = hostBytes + progBytes;

  _coordinatorSocket << msg;
  _coordinatorSocket.writeAll(hostname.c_str(), hostBytes);
  _coordinatorSocket.writeAll(progname.c_str(), progBytes);

  // Reuse msg for the reply; poison it so stale fields cannot pass
  // validation.
  msg.poison();
  _coordinatorSocket >> msg;
  msg.assertValid();

  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _real_exit (0);
  }

  if (msg.type == DMT_REJECT_NOT_RUNNING) {
    JASSERT(false)
      .Text("Connection rejected by the coordinator.\n"
            "Reason: Current computation not in RUNNING state.\n"
            "         Is a checkpoint/restart in progress?");
  } else if (msg.type == DMT_REJECT_WRONG_COMP) {
    JASSERT(compId != NULL);
    JASSERT(false) (*compId)
      .Text("Connection rejected by the coordinator.\n"
            " Reason: This process has a different computation group.");
  }

  JASSERT(msg.type == DMT_ACCEPT)(msg.type);
  return msg;
}
Exemple #9
0
void
DmtcpWorker::waitForSuspendMessage()
{
  SharedData::resetBarrierInfo();

  if (dmtcp_no_coordinator()) {
    string shmFile = jalib::Filesystem::GetDeviceName(PROTECTED_SHM_FD);
    JASSERT(!shmFile.empty());
    unlink(shmFile.c_str());
    CoordinatorAPI::waitForCheckpointCommand();
    ProcessInfo::instance().numPeers(1);
    ProcessInfo::instance().compGroup(SharedData::getCompId());
    return;
  }

  JTRACE("waiting for SUSPEND message");

  DmtcpMessage msg;
  CoordinatorAPI::recvMsgFromCoordinator(&msg);

  // Before validating message; make sure we are not exiting.
  if (exitInProgress) {
    ckptThreadPerformExit();
  }

  msg.assertValid();

  JASSERT(msg.type == DMT_DO_SUSPEND) (msg.type);

  // Coordinator sends some computation information along with the SUSPEND
  // message. Extracting that.
  SharedData::updateGeneration(msg.compGroup.computationGeneration());
  JASSERT(SharedData::getCompId() == msg.compGroup.upid())
    (SharedData::getCompId()) (msg.compGroup);

  exitAfterCkpt = msg.exitAfterCkpt;
}
// Handles an incoming connection during restore.
//
// Reads a DMT_RESTORE_RECONNECTED message from the new socket, looks up the
// file descriptors pending for msg.restorePid, moves the socket onto the
// first of them and dup2()s it onto all the others.  The pending entry is
// then removed; once nothing is pending, finishup() is called.
//
// Parameters:
//   sock - the newly accepted connection.
//   (the remote address and its length are unused)
void dmtcp::ConnectionRewirer::onConnect(const jalib::JSocket& sock,
                                         const struct sockaddr* /*remoteAddr*/,
                                         socklen_t /*remoteLen*/)
{
  jalib::JSocket peer = sock;

  // Poison before reading so stale contents cannot pass validation.
  DmtcpMessage msg;
  msg.poison();
  peer >> msg;
  msg.assertValid();
  JASSERT(msg.type == DMT_RESTORE_RECONNECTED)(msg.type)
    .Text("unexpected message");

  // Find the descriptors waiting for this peer.
  iterator i = _pendingIncoming.find(msg.restorePid);
  JASSERT(i != _pendingIncoming.end())(msg.restorePid)
    .Text("got unexpected incoming restore request");

  const dmtcp::vector<int>& fds = i->second;
  JASSERT(fds.size() > 0);

  // The first descriptor receives the socket itself ...
  int fd0 = fds[0];
  peer.changeFd(fd0);

  JTRACE("restoring incoming connection")(msg.restorePid)(fd0)(fds.size());

  // ... and every further descriptor becomes a duplicate of it.
  // Note: the index `i` below shadows the iterator `i` inside the loop only.
  for (size_t i = 1; i < fds.size(); ++i) {
    JTRACE("restoring extra fd")(fd0)(fds[i]);
    JASSERT ( _real_dup2 ( fd0,fds[i] ) == fds[i] )(fd0)(fds[i])(msg.restorePid)
      .Text("dup2() failed");
  }

  // Back to the iterator `i`: this request has been served.
  _pendingIncoming.erase(i);

  if (pendingCount() == 0) {
    finishup();
  }
#ifdef DEBUG
  else {
    debugPrint();
  }
#endif
}
Exemple #11
0
// Waits for a message of the given `type` from the coordinator.
//
// Parameters:
//   msgStr - human-readable name of the expected message; used only in the
//            trace output.
//   type   - message type that the coordinator is expected to send.
//
// For DMT_DO_SUSPEND nothing is sent first; for every other type a DMT_OK
// message carrying the current worker state is sent before waiting.  A
// DMT_KILL_PEER reply terminates the process; any other unexpected type
// fails an assertion.
void DmtcpWorker::waitForCoordinatorMsg(string msgStr,
                                               DmtcpMessageType type)
{
  // Coordinator-less mode: only the suspend request has work to do, namely
  // unlinking the shared-memory file and waiting for a local checkpoint
  // command.
  if (dmtcp_no_coordinator()) {
    if (type == DMT_DO_SUSPEND) {
      string shmFile = jalib::Filesystem::GetDeviceName(PROTECTED_SHM_FD);
      JASSERT(!shmFile.empty());
      unlink(shmFile.c_str());
      CoordinatorAPI::instance().waitForCheckpointCommand();
      ProcessInfo::instance().numPeers(1);
      ProcessInfo::instance().compGroup(SharedData::getCompId());
    }
    return;
  }

  // Before waiting for a suspend request, make sure the process is not
  // already exiting: either the try-lock fails, or the lock is obtained but
  // an exit is already in progress (then the lock is released again).
  if (type == DMT_DO_SUSPEND) {
    if (ThreadSync::destroyDmtcpWorkerLockTryLock() != 0) {
      JTRACE("User thread is performing exit()."
               " ckpt thread exit()ing as well");
      ckptThreadPerformExit();
    }
    if (exitInProgress()) {
      ThreadSync::destroyDmtcpWorkerLockUnlock();
      ckptThreadPerformExit();
    }
  }

  DmtcpMessage msg;

  if (type == DMT_DO_SUSPEND) {
    // Make a dummy syscall to inform superior of our status before we go into
    // select. If ptrace is disabled, this call has no significant effect.
    _real_syscall(DMTCP_FAKE_SYSCALL);
  } else {
    // Report the current worker state to the coordinator before waiting for
    // its next message.
    msg.type = DMT_OK;
    msg.state = WorkerState::currentState();
    CoordinatorAPI::instance().sendMsgToCoordinator(msg);
  }

  JTRACE("waiting for " + msgStr + " message");
  CoordinatorAPI::instance().recvMsgFromCoordinator(&msg);
  // Re-check for an exit that started while we were waiting, before the
  // received message is validated.
  if (type == DMT_DO_SUSPEND && exitInProgress()) {
    ThreadSync::destroyDmtcpWorkerLockUnlock();
    ckptThreadPerformExit();
  }

  msg.assertValid();
  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _exit (0);
  }

  JASSERT(msg.type == type) (msg.type) (type);

  // Coordinator sends some computation information along with the SUSPEND
  // message. Extracting that.
  if (type == DMT_DO_SUSPEND) {
    SharedData::updateGeneration(msg.compGroup.computationGeneration());
    JASSERT(SharedData::getCompId() == msg.compGroup.upid())
      (SharedData::getCompId()) (msg.compGroup);
  } else if (type == DMT_DO_FD_LEADER_ELECTION) {
    // The leader-election message carries the computation group and the
    // number of peers; store both in ProcessInfo.
    JTRACE("Computation information") (msg.compGroup) (msg.numPeers);
    ProcessInfo::instance().compGroup(msg.compGroup);
    ProcessInfo::instance().numPeers(msg.numPeers);
  }
}