Example #1
0
/**
 * Locate the connection that owned standard input at checkpoint time.
 *
 * Walks every (connection-id, fd-list) entry of _conToFd; as soon as one of
 * the listed descriptors equals STDIN_FILENO, the descriptor that slidingFd
 * reports for that connection id is returned.
 *
 * @param slidingFd  table queried (via getFdFor) for the connection's fd.
 * @return the fd from slidingFd.getFdFor() for the matching connection,
 *         or -1 when no connection lists STDIN_FILENO.
 */
int RestoreTarget::find_stdin(SlidingFdTable& slidingFd)
{
  // The names i, x and fds are kept as-is because they appear as JTRACE
  // argument expressions (presumably echoed verbatim in the trace output).
  for (ConnectionToFds::const_iterator i = _conToFd.begin();
       i != _conToFd.end();
       ++i) {
    const dmtcp::vector<int>& fds = i->second;
    size_t x = 0;
    while (x < fds.size()) {
      if (fds[x] != STDIN_FILENO) {
        // Not stdin; look at the next descriptor of this connection.
        ++x;
        continue;
      }
      JTRACE("Found stdin: fds[x] <---> slidingFd.getFdfor ()")
        (x) (fds[x]) (slidingFd.getFdFor (i->first));
      return slidingFd.getFdFor (i->first);
    }
  }

  // No connection was attached to stdin.
  return -1;
}
Example #2
0
/**
 * Move every restored connection onto the descriptor numbers recorded in
 * _conToFd, then close the descriptors that are no longer needed.
 *
 * Steps:
 *  1. For each connection in _conToFd that is not of type
 *     Connection::INVALID, and for each target fd recorded for it: free the
 *     target fd in slidingFd, then let the connection duplicate itself from
 *     the fd slidingFd holds for it onto the target fd (restartDup2).
 *  2. Close every descriptor below slidingFd.startFd() that was not used as
 *     a target fd in step 1.
 *  3. Ask slidingFd to close everything it still holds (closeAll).
 *
 * @param slidingFd  table of temporary descriptors holding the restored
 *                   connections.
 */
void RestoreTarget::dupAllSockets (SlidingFdTable& slidingFd)
{
  // Target descriptors that must survive the cleanup pass below.
  dmtcp::vector<int> fdlist;

  ConnectionToFds::const_iterator i;
  for (i = _conToFd.begin(); i!=_conToFd.end(); ++i) {
    Connection& con = ConnectionList::instance() [i->first];
    if (con.conType() == Connection::INVALID) {
      JWARNING(false)(i->first).Text("Can't restore invalid Connection");
      continue;
    }

    const dmtcp::vector<int>& fds = i->second;
    for (size_t x=0; x<fds.size(); ++x) {
      int fd = fds[x];
      fdlist.push_back (fd);
      // Make sure the target number is not occupied by the sliding table.
      slidingFd.freeUpFd (fd);
      int oldFd = slidingFd.getFdFor (i->first);
      JTRACE ("restoring fd") (i->first) (oldFd) (fd);
      //let connection do custom dup2 handling
      con.restartDup2(oldFd, fd);
    }
  }

  // Close every low descriptor that is not one of the restored targets.
  for (int candidate = 0 ; candidate < slidingFd.startFd() ; candidate++) {
    bool isRestoredFd = false;
    for (size_t j = 0 ; j < fdlist.size() && !isRestoredFd ; j++) {
      isRestoredFd = (fdlist[j] == candidate);
    }
    if (!isRestoredFd) {
      close (candidate);
    }
  }

  slidingFd.closeAll();
}
Example #3
0
/**
 * Turn the calling process into the checkpointed process described by this
 * RestoreTarget, forking the processes that hang below it first.
 *
 * Sequence:
 *  1. Reset the UniquePid to upid() and initialise the log file.
 *  2. Restore FILE_PROCFS connections whose path starts with
 *     "/proc/<checkpointed pid>/".
 *  3. Fork the children:
 *     - not a session leader: restore group information, then fork every
 *       child;
 *     - session leader: fork the children whose session map lacks this
 *       process' sid, call setsid(), restore group information, fork the
 *       children whose session map contains the sid, and finally fork every
 *       dependent root through an intermediate process that exits at once
 *       (the root therefore is not a direct child of this process).
 *     Each forked process calls CreateProcess() on its own target.
 *  4. Connect to the coordinator and exchange handshakes, move the
 *     connections to their final descriptors (dupAllSockets) and call
 *     mtcpRestart().
 *
 * The trailing JASSERT(false) marks the end of the function as unreachable,
 * i.e. mtcpRestart() is not expected to return.
 *
 * @param coordinatorAPI  coordinator connection used for the handshake.
 * @param slidingFd       table of temporary descriptors holding the
 *                        restored connections.
 */
void RestoreTarget::CreateProcess(CoordinatorAPI& coordinatorAPI,
                                  SlidingFdTable& slidingFd)
{
  //change UniquePid
  UniquePid::resetOnFork(upid());
  //UniquePid::ThisProcess(true) = _conToFd.upid();

  Util::initializeLogFile(procname());
  JTRACE("Creating process during restart") (upid()) (procname());

  JTRACE("")(getpid())(getppid())(getsid(0));
  ProcessInfo &pInfo = _processInfo;
  pid_t psid = pInfo.sid();

  JTRACE("Restore /proc/self/* fds");
  ConnectionList& connections = ConnectionList::instance();
  ConnectionList::iterator it;
  for (it = connections.begin(); it != connections.end(); ++it) {
    dmtcp::Connection *con = it->second;
    if (con->subType() == FileConnection::FILE_PROCFS) {
      dmtcp::FileConnection *filecon = (dmtcp::FileConnection*) con;
      char buf[32];
      dmtcp::vector<int> fds;
      fds.push_back(slidingFd.getFdFor(con->id()));
      // Bounded write: never run past buf, whatever pid() yields.
      snprintf(buf, sizeof(buf), "/proc/%d/", pInfo.pid());
      // Only files under this process' own /proc directory are restored here.
      if (dmtcp::Util::strStartsWith(filecon->filePath(), buf)) {
        filecon->restore(fds);
      }
    }
  }


  if (!isSessionLeader()) {

    // Restore Group information
    restoreGroup(slidingFd);

    // If process is not session leader, restore it and all children.
    t_iterator it = _children.begin();
    for (; it != _children.end(); it++) {
      JTRACE ("Forking Child Process") ((*it)->upid());
      pid_t cid = fork();

      if (cid == 0) {
        (*it)->CreateProcess (coordinatorAPI, slidingFd);
        JASSERT (false) . Text ("Unreachable");
      }
      JASSERT (cid > 0);
    }
  } else {
    // Process is session leader.
    // There may be not setsid-ed children.
    for (t_iterator it = _children.begin(); it != _children.end(); it++) {
      s_iterator sit = (*it)->getSmap().find(psid);
      JTRACE("Restore processes that were created before their parent called setsid()");
      if (sit == (*it)->getSmap().end()) {
        JTRACE ("Forking Child Process") ((*it)->upid());
        pid_t cid = fork();
        if (cid == 0) {
          (*it)->CreateProcess (coordinatorAPI, slidingFd);
          JASSERT (false) . Text ("Unreachable");
        }
        JASSERT (cid > 0);
      }
    }

    pid_t nsid = setsid();
    JTRACE("change SID")(nsid);

    // Restore Group information
    restoreGroup(slidingFd);

    for (t_iterator it = _children.begin(); it != _children.end(); it++) {
      JTRACE("Restore processes that was created after their parent called setsid()");
      s_iterator sit = (*it)->getSmap().find(psid);
      if (sit != (*it)->getSmap().end()) {
        JTRACE ("Forking Child Process") ((*it)->upid());
        pid_t cid = fork();
        if (cid == 0) {
          (*it)->CreateProcess (coordinatorAPI, slidingFd);
          JASSERT (false) . Text ("Unreachable");
        }
        JASSERT (cid> 0);
      }
    }

    for (t_iterator it = _roots.begin() ; it != _roots.end(); it++) {
      JTRACE ("Forking Dependent Root Process") ((*it)->upid());
      pid_t cid = fork();
      // fork() returns -1 on failure; without this check the failure would
      // be taken for the parent branch and waitpid(-1, ...) would wait for
      // an arbitrary child instead.
      JASSERT (cid >= 0);
      if (cid > 0) {
        // Reap the short-lived intermediate process.
        waitpid(cid, NULL, 0);
      } else {
        // Intermediate process: fork the real root and leave immediately.
        pid_t rootPid = fork();
        // A failed second fork must not be mistaken for "I am the parent":
        // that would exit(0) quietly and the root would never be restored.
        JASSERT (rootPid >= 0);
        // NOTE(review): main() uses _exit(0) for the same double-fork
        // pattern; confirm whether exit(0) is intended here.
        if (rootPid > 0)
          exit(0);
        (*it)->CreateProcess(coordinatorAPI, slidingFd);
        JASSERT (false) . Text("Unreachable");
      }
    }
  }

  bool isTheGroupLeader = isGroupLeader(); // Calls JTRACE;avoid recursion
  JTRACE("Child and dependent root processes forked, restoring process")
    (upid())(getpid())(isTheGroupLeader);

  //Reconnect to dmtcp_coordinator
  WorkerState::setCurrentState (WorkerState::RESTARTING);

  coordinatorAPI.connectToCoordinator();
  coordinatorAPI.sendCoordinatorHandshake(procname(), _processInfo.compGroup());
  coordinatorAPI.recvCoordinatorHandshake();

  //restart targets[i]
  dupAllSockets (slidingFd);

  mtcpRestart();

  JASSERT (false).Text ("unreachable");
}
int main(int argc, char** argv)
{
  bool autoStartCoordinator=true;
  bool isRestart = true;
  int allowedModes = dmtcp::DmtcpCoordinatorAPI::COORD_ANY;

  initializeJalib();

  if (!getenv(ENV_VAR_QUIET)) {
    setenv(ENV_VAR_QUIET, "0", 0);
  }

  if (argc == 1) {
    JASSERT_STDERR << DMTCP_VERSION_AND_COPYRIGHT_INFO;
    JASSERT_STDERR << "(For help:  " << argv[0] << " --help)\n\n";
    return DMTCP_FAIL_RC;
  }

  //process args
  shift;
  while (true) {
    dmtcp::string s = argc>0 ? argv[0] : "--help";
    if (s == "--help" && argc == 1) {
      JASSERT_STDERR << theUsage;
      return DMTCP_FAIL_RC;
    } else if ((s == "--version") && argc == 1) {
      JASSERT_STDERR << DMTCP_VERSION_AND_COPYRIGHT_INFO;
      return DMTCP_FAIL_RC;
    } else if (s == "--no-check") {
      autoStartCoordinator = false;
      shift;
    } else if (s == "-j" || s == "--join") {
      allowedModes = dmtcp::DmtcpCoordinatorAPI::COORD_JOIN;
      shift;
    } else if (s == "-n" || s == "--new") {
      allowedModes = dmtcp::DmtcpCoordinatorAPI::COORD_NEW;
      shift;
    } else if (s == "--new-coordinator") {
      allowedModes = dmtcp::DmtcpCoordinatorAPI::COORD_FORCE_NEW;
      shift;
    } else if (s == "-b" || s == "--batch") {
      allowedModes = dmtcp::DmtcpCoordinatorAPI::COORD_BATCH;
      shift;
    } else if (s == "-i" || s == "--interval" ||
               (s.c_str()[0] == '-' && s.c_str()[1] == 'i' &&
                isdigit(s.c_str()[2]))) {
      if (isdigit(s.c_str()[2])) { // if -i5, for example
        setenv(ENV_VAR_CKPT_INTR, s.c_str()+2, 1);
        shift;
      } else { // else -i 5
        setenv(ENV_VAR_CKPT_INTR, argv[1], 1);
        shift; shift;
      }
    } else if (argc > 1 && (s == "-h" || s == "--host")) {
      setenv(ENV_VAR_NAME_HOST, argv[1], 1);
      shift; shift;
    } else if (argc > 1 && (s == "-p" || s == "--port")) {
      setenv(ENV_VAR_NAME_PORT, argv[1], 1);
      shift; shift;
    } else if (argc > 1 && (s == "-t" || s == "--tmpdir")) {
      setenv(ENV_VAR_TMPDIR, argv[1], 1);
      shift; shift;
    } else if (s == "-q" || s == "--quiet") {
      *getenv(ENV_VAR_QUIET) = *getenv(ENV_VAR_QUIET) + 1;
      // Just in case a non-standard version of setenv is being used:
      setenv(ENV_VAR_QUIET, getenv(ENV_VAR_QUIET), 1);
      shift;
    } else if ((s.length() > 2 && s.substr(0, 2) == "--") ||
               (s.length() > 1 && s.substr(0, 1) == "-")) {
      JASSERT_STDERR << "Invalid Argument\n";
      JASSERT_STDERR << theUsage;
      return DMTCP_FAIL_RC;
    } else if (argc > 1 && s == "--") {
      shift;
      break;
    } else {
      break;
    }
  }

  dmtcp::UniquePid::setTmpDir(getenv(ENV_VAR_TMPDIR));
  dmtcpTmpDir = dmtcp::UniquePid::getTmpDir();

  jassert_quiet = *getenv(ENV_VAR_QUIET) - '0';

  //make sure JASSERT initializes now, rather than during restart
  Util::initializeLogFile();

  if (jassert_quiet == 0)
    JASSERT_STDERR << DMTCP_BANNER;

  if (autoStartCoordinator)
    dmtcp::DmtcpCoordinatorAPI::startCoordinatorIfNeeded(allowedModes,
                                                         isRestart);

  JTRACE("New dmtcp_restart process; _argc_ ckpt images") (argc);

  bool doAbort = false;
  for (; argc > 0; shift) {
    dmtcp::string restorename(argv[0]);
    struct stat buf;
    int rc = stat(restorename.c_str(), &buf);
    if (Util::strStartsWith(restorename, "ckpt_") &&
        Util::strEndsWith(restorename, "_files")) {
      continue;
    } else if (!Util::strEndsWith(restorename, ".dmtcp")) {
      JNOTE("File doesn't have .dmtcp extension. Check Usage.")
        (restorename);
      JASSERT_STDERR << theUsage;
      doAbort = true;
    } else if (rc == -1) {
      char error_msg[1024];
      sprintf(error_msg, "\ndmtcp_restart: ckpt image %s", restorename.c_str());
      perror(error_msg);
      doAbort = true;
    } else if (buf.st_uid != getuid()) { /*Could also run if geteuid() matches*/
      printf("\nProcess uid (%d) doesn't match uid (%lu) of\n" \
             "checkpoint image (%s).\n" \
	     "This is dangerous.  Aborting for security reasons.\n" \
             "If you still want to do this (at your own risk),\n" \
             "  then modify dmtcp/src/%s:%d and re-compile.\n",
             getuid(), buf.st_uid, restorename.c_str(), __FILE__, __LINE__ - 6);
      doAbort = true;
    }
    if (doAbort) {
      exit(DMTCP_FAIL_RC);
    }

    JTRACE("Will restart ckpt image _argv[0]_") (argv[0]);
    targets.push_back (RestoreTarget (argv[0]));
  }

  if (targets.size() <= 0) {
    JNOTE("ERROR: No DMTCP checkpoint image(s) found. Check Usage.");
    JASSERT_STDERR << theUsage;
    exit(DMTCP_FAIL_RC);
  }

  // Check that all targets belongs to one computation Group
  // If not - abort
  compGroup = targets[0].compGroup();
  numPeers = targets[0].numPeers();
  for(size_t i=0; i<targets.size(); i++) {
    JTRACE ("Check targets: ")
      (targets[i].path()) (targets[i].compGroup()) (targets[i].numPeers());
    if (compGroup != targets[i].compGroup()) {
      JASSERT(false)(compGroup)(targets[i].compGroup())
	.Text("ERROR: Restored programs belong to different computation IDs");
    } else if (numPeers != targets[i].numPeers()) {
      JASSERT(false)(numPeers)(targets[i].numPeers())
	.Text("ERROR: Different number of processes saved in checkpoint images");
    }
  }

  SlidingFdTable slidingFd;
  ConnectionToFds conToFd;

  ostringstream out;
  out << "will restore:\n";
  out << "\tfd  -> connection-id\n";
  ConnectionList& connections = ConnectionList::instance();
  ConnectionList::iterator it;
  for (it = connections.begin(); it != connections.end(); ++it) {
    int fd = slidingFd.getFdFor(it->first);
    conToFd[it->first].push_back(fd);
    out << "\t" << fd << " -> " << (it->first)
        << " -> " << (it->second)->str() << "\n";
  }
  JTRACE ("Allocating fds for Connections") (out.str());

  //------------------------
  WorkerState::setCurrentState(WorkerState::RESTARTING);
  ConnectionState ckptCoord(conToFd);
  DmtcpCoordinatorAPI coordinatorAPI;
  restoreSockets(coordinatorAPI, ckptCoord);

  /* Create the file to hold the pid/tid maps. */
  openOriginalToCurrentMappingFiles();

#ifndef PID_VIRTUALIZATION
  int i = (int)targets.size();

  //fork into targs.size() processes
  while (--i > 0) {
    int cid = fork();
    if (cid == 0) break;
    else JASSERT(cid > 0);
  }
  RestoreTarget& targ = targets[i];

  JTRACE("forked, restoring process")
    (i) (targets.size()) (targ.upid()) (getpid());

  //change UniquePid
  UniquePid::resetOnFork(targ.upid());

  //Reconnect to dmtcp_coordinator
  WorkerState::setCurrentState (WorkerState::RESTARTING);

  int tmpCoordFd = dup(PROTECTED_COORD_FD);
  JASSERT(tmpCoordFd != -1);
  coordinatorAPI.connectToCoordinator();
  coordinatorAPI.sendCoordinatorHandshake(targ.procname(), targ.compGroup());
  coordinatorAPI.recvCoordinatorHandshake();
  close(tmpCoordFd);

  //restart targets[i]
  targets[i].dupAllSockets (slidingFd);
  targets[i].mtcpRestart();

  JASSERT(false).Text("unreachable");
  return -1;
#endif
  //size_t i = targets.size();

  // Create roots vector, assign children to their parents.
  // Delete children that don't exist.
  BuildProcessTree();

  // Process all checkpoints to find one of them that can switch
  // needed Group to foreground.
  ProcessGroupInfo();
  // Create session meta-information in each node of the process tree.
  // Node contains info about all sessions which exists at lower levels.
  // Also node is aware of session leader existence at lower levels.
  SetupSessions();

  int pgrp_index=-1;
  JTRACE("Creating ROOT Processes") (roots.size());
  for (size_t j = 0 ; j < roots.size(); ++j) {
    if (roots[j].indep == false) {
      // We will restore this process from one of the independent roots.
      continue;
    }
    if (pgrp_index == -1 && !roots[j].t->isInitChild()) {
      pgrp_index = j;
      continue;
    }

    pid_t cid = fork();
    if (cid == 0) {
      JTRACE ("Root of process tree") (getpid()) (getppid());
      if (roots[j].t->isInitChild()) {
        JTRACE ("Create init-child process") (getpid()) (getppid());
        if (fork())
          _exit(0);
      }
      roots[j].t->CreateProcess(coordinatorAPI, slidingFd);
      JASSERT (false) .Text("Unreachable");
    }
    JASSERT (cid > 0);
    if (roots[j].t->isInitChild()) {
      waitpid(cid, NULL, 0);
    }
  }

  JTRACE("Restore processes without corresponding Root Target");
  int flat_index = -1;
  size_t j = 0;
  if (pgrp_index < 0) { // No root processes at all
    // Find first flat process that can replace currently running
    //   dmtcp_restart context.
    for (j = 0; j < targets.size(); ++j) {
      if (!targets[j].isMarkedUsed()) {
        // Save first flat-like process to be restored after all others
        flat_index = j;
        j++;
        break;
      }
    }
  }
  // Use j set to 0 (if at least one root non-init-child process exists),
  // or else j set to some value if no such process found.
  for(; j < targets.size(); ++j) {
    if (!targets[j].isMarkedUsed()) {
      if (pgrp_index < 0) {
        // Save first flat-like process to be restored after all others
        pgrp_index = j;
        continue;
      } else {
        targets[j].CreateProcess(coordinatorAPI, slidingFd);
        JTRACE("Need in flat-like restore for process") (targets[j].upid());
      }
    }
  }

  if (pgrp_index >= 0) {
    JTRACE("Restore first Root Target")(roots[pgrp_index].t->upid());
    roots[pgrp_index].t->CreateProcess(coordinatorAPI, slidingFd);
  } else if (flat_index >= 0) {
    JTRACE("Restore first Flat Target")(targets[flat_index].upid());
    targets[flat_index].CreateProcess(coordinatorAPI, slidingFd);
  } else {
    // FIXME: Under what conditions will this path be exercised?
    JNOTE ("unknown type of target?") (targets[flat_index].path());
  }
// #endif
}