Esempio n. 1
0
/**
 * Entry point of dmtcp_restart.
 *
 * Parses the command-line options, validates every checkpoint image named on
 * the command line, builds the list of independent process-tree roots and
 * finally hands control to one of those roots to recreate the processes.
 *
 * @param argc number of command-line arguments (including the program name)
 * @param argv command-line arguments; consumed front-to-back with `shift`
 * @return DMTCP_FAIL_RC on usage errors; the trailing `return -1` is only
 *         reached if the final "unreachable" assertion does not abort.
 */
int main(int argc, char** argv)
{
  char *tmpdir_arg = NULL;
  char *ckptdir_arg = NULL;

  initializeJalib();

  // The quiet level is kept in the environment as a single digit character;
  // make sure it exists so the later getenv() dereferences are valid.
  if (!getenv(ENV_VAR_QUIET)) {
    setenv(ENV_VAR_QUIET, "0", 0);
  }

  if (getenv(ENV_VAR_DISABLE_STRICT_CHECKING)) {
    noStrictChecking = true;
  }

  // The environment provides the default; -c/--ckptdir below overrides it.
  if (getenv(ENV_VAR_CHECKPOINT_DIR)) {
    ckptdir_arg = getenv(ENV_VAR_CHECKPOINT_DIR);
  }

  if (argc == 1) {
    printf("%s", DMTCP_VERSION_AND_COPYRIGHT_INFO);
    printf("(For help: %s --help)\n\n", argv[0]);
    return DMTCP_FAIL_RC;
  }

  //process args
  shift;
  while (true) {
    // Once every argument has been consumed argv[0] is NULL, so all tests
    // below go through `s` and never dereference argv[0] directly.
    string s = argc>0 ? argv[0] : "--help";
    if (s == "--help" && argc == 1) {
      printf("%s", theUsage);
      return DMTCP_FAIL_RC;
    } else if ((s == "--version") && argc == 1) {
      printf("%s", DMTCP_VERSION_AND_COPYRIGHT_INFO);
      return DMTCP_FAIL_RC;
    } else if (s == "-j" || s == "--join") {
      allowedModes = COORD_JOIN;
      shift;
    } else if (s == "--new-coordinator") {
      allowedModes = COORD_NEW;
      shift;
    } else if (s == "--no-strict-checking") {
      noStrictChecking = true;
      shift;
    } else if (argc > 1 && (s == "-i" || s == "--interval")) {
      // Requires a value; a trailing "-i" falls through to "Invalid Argument"
      // instead of passing a NULL argv[1] to setenv().
      setenv(ENV_VAR_CKPT_INTR, argv[1], 1);
      shift; shift;
    } else if (s.length() > 2 && s[0] == '-' && s[1] == 'i' &&
               isdigit(static_cast<unsigned char>(s[2]))) {
      // -i5, for example
      setenv(ENV_VAR_CKPT_INTR, s.c_str() + 2, 1);
      shift;
    } else if (argc > 1 && (s == "-h" || s == "--coord-host" || s == "--host")){
      setenv(ENV_VAR_NAME_HOST, argv[1], 1);
      shift; shift;
    } else if (argc>1 && (s == "-p" || s == "--coord-port" || s == "--port")) {
      setenv(ENV_VAR_NAME_PORT, argv[1], 1);
      shift; shift;
    } else if (s.length() > 2 && s[0] == '-' && s[1] == 'p' &&
               isdigit(static_cast<unsigned char>(s[2]))) {
      // -p0, for example
      setenv(ENV_VAR_NAME_PORT, s.c_str() + 2, 1);
      shift;
    } else if (argc>1 && s == "--port-file"){
      thePortFile = argv[1];
      shift; shift;
    } else if (argc > 1 && (s == "-c" || s == "--ckptdir")) {
      ckptdir_arg = argv[1];
      shift; shift;
    } else if (argc > 1 && (s == "-t" || s == "--tmpdir")) {
      tmpdir_arg = argv[1];
      shift; shift;
    } else if (s == "-q" || s == "--quiet") {
      // Bump the quiet-level digit in place.
      *getenv(ENV_VAR_QUIET) = *getenv(ENV_VAR_QUIET) + 1;
      // Just in case a non-standard version of setenv is being used:
      setenv(ENV_VAR_QUIET, getenv(ENV_VAR_QUIET), 1);
      shift;
    } else if (argc > 1 && s == "--") {
      // End-of-options marker.  Must be tested before the generic "starts
      // with '-'" check, which would otherwise reject it as invalid.
      shift;
      break;
    } else if ((s.length() > 2 && s.substr(0, 2) == "--") ||
               (s.length() > 1 && s.substr(0, 1) == "-")) {
      printf("Invalid Argument\n%s", theUsage);
      return DMTCP_FAIL_RC;
    } else {
      // First non-option argument: the checkpoint images start here.
      break;
    }
  }

  tmpDir = Util::calcTmpDir(tmpdir_arg);
  if (ckptdir_arg) {
    setNewCkptDir(ckptdir_arg);
  }

  jassert_quiet = *getenv(ENV_VAR_QUIET) - '0';

  //make sure JASSERT initializes now, rather than during restart
  Util::initializeLogFile(tmpDir);

  if (!noStrictChecking && jassert_quiet < 2 &&
      (getuid() == 0 || geteuid() == 0)) {
    JASSERT_STDERR <<
      "WARNING:  Running dmtcp_restart as root can be dangerous.\n"
      "  An unknown checkpoint image or bugs in DMTCP may lead to unforeseen\n"
      "  consequences.  Continuing as root ....\n";
  }

  JTRACE("New dmtcp_restart process; _argc_ ckpt images") (argc);

  // Validate each remaining argument and register it as a restore target.
  bool doAbort = false;
  for (; argc > 0; shift) {
    string restorename(argv[0]);
    struct stat buf;
    int rc = stat(restorename.c_str(), &buf);
    if (Util::strEndsWith(restorename, "_files")) {
      // Arguments ending in "_files" are skipped, not treated as images.
      continue;
    } else if (!Util::strEndsWith(restorename, ".dmtcp")) {
      JNOTE("File doesn't have .dmtcp extension. Check Usage.") (restorename);
      // Don't test for --quiet here.  We're aborting.  We need to say why.
      JASSERT_STDERR << theUsage;
      doAbort = true;
    } else if (rc == -1) {
      // Bounded formatting: the path comes from the command line and may be
      // longer than the buffer; truncating the message is acceptable.
      char error_msg[1024];
      snprintf(error_msg, sizeof(error_msg),
               "\ndmtcp_restart: ckpt image %s", restorename.c_str());
      perror(error_msg);
      doAbort = true;
    } else if (buf.st_uid != getuid() && !noStrictChecking) {
      /*Could also run if geteuid() matches*/
      printf("\nProcess uid (%d) doesn't match uid (%d) of\n" \
             "checkpoint image (%s).\n" \
             "This is dangerous.  Aborting for security reasons.\n" \
             "If you still want to do this, then re-run dmtcp_restart\n" \
             "  with the --no-strict-checking flag.\n",
             getuid(), buf.st_uid, restorename.c_str());
      doAbort = true;
    }
    if (doAbort) {
      exit(DMTCP_FAIL_RC);
    }

    JTRACE("Will restart ckpt image") (argv[0]);
    RestoreTarget *t = new RestoreTarget(argv[0]);
    targets[t->upid()] = t;
  }

  // Prepare list of independent process tree roots: a root qualifies when no
  // *other* target's pid equals the root's session id.
  RestoreTargetMap::iterator i;
  for (i = targets.begin(); i != targets.end(); i++) {
    RestoreTarget *t1 = i->second;
    if (t1->isRootOfProcessTree()) {
      RestoreTargetMap::iterator j;
      for (j = targets.begin(); j != targets.end(); j++) {
        RestoreTarget *t2 = j->second;
        if (t1 == t2) continue;
        if (t1->sid() == t2->pid()) {
          break;
        }
      }
      if (j == targets.end()) {
        independentProcessTreeRoots[t1->upid()] = t1;
      }
    }
  }
  JASSERT(independentProcessTreeRoots.size() > 0)
    .Text("There must be at least one process tree that doesn't have\n"
          "  a different process as session leader.");

  WorkerState::setCurrentState(WorkerState::RESTARTING);

  /* Try to find non-orphaned process in independent procs list */
  // Initialised defensively; the assertion above guarantees the loop below
  // assigns it at least once.
  RestoreTarget *t = NULL;
  bool foundNonOrphan = false;
  RestoreTargetMap::iterator it;
  int size = independentProcessTreeRoots.size();
  printf("size = %d\n", size);
  for (it = independentProcessTreeRoots.begin();
       it != independentProcessTreeRoots.end();
       it++) {
    t = it->second;
    if ( !t->isOrphan() ) {
      foundNonOrphan = true;
      break;
    }
  }

  JASSERT(t->pid() != 0);
  JASSERT(!t->noCoordinator() || allowedModes == COORD_ANY)
    .Text("Process had no coordinator prior to checkpoint;\n"
          "  but either --join or --new-coordinator was specified.");

  if( foundNonOrphan ){
    t->createProcess(true);
  } else {
      /* we were unable to find any non-orphaned procs.
       * pick the first one and orphan it */
      t = independentProcessTreeRoots.begin()->second;
      t->createOrphanedProcess(true);
  }

  JASSERT(false).Text("unreachable");
  return -1;
}
/**
 * Entry point of dmtcp_restart (variant that can auto-start a coordinator).
 *
 * Parses the command-line options, validates every checkpoint image named on
 * the command line, builds the list of independent process-tree roots,
 * optionally starts a coordinator, and then hands control to the first root
 * to recreate the processes.
 *
 * @param argc number of command-line arguments (including the program name)
 * @param argv command-line arguments; consumed front-to-back with `shift`
 * @return DMTCP_FAIL_RC on usage errors; the trailing `return -1` is only
 *         reached if the final "unreachable" assertion does not abort.
 */
int main(int argc, char** argv)
{
  bool autoStartCoordinator=true;
  bool isRestart = true;
  dmtcp::CoordinatorAPI::CoordinatorMode allowedModes =
    dmtcp::CoordinatorAPI::COORD_ANY;

  initializeJalib();

  // The quiet level is kept in the environment as a single digit character;
  // make sure it exists so the later getenv() dereferences are valid.
  if (!getenv(ENV_VAR_QUIET)) {
    setenv(ENV_VAR_QUIET, "0", 0);
  }

  if (argc == 1) {
    JASSERT_STDERR << DMTCP_VERSION_AND_COPYRIGHT_INFO;
    JASSERT_STDERR << "(For help:  " << argv[0] << " --help)\n\n";
    return DMTCP_FAIL_RC;
  }

  //process args
  shift;
  while (true) {
    // Once every argument has been consumed argv[0] is NULL; `s` then holds
    // "--help", which ends up in the "Invalid Argument" branch below.
    dmtcp::string s = argc>0 ? argv[0] : "--help";
    if (s == "--help" && argc == 1) {
      JASSERT_STDERR << theUsage;
      return DMTCP_FAIL_RC;
    } else if ((s == "--version") && argc == 1) {
      JASSERT_STDERR << DMTCP_VERSION_AND_COPYRIGHT_INFO;
      return DMTCP_FAIL_RC;
    } else if (s == "--no-check") {
      autoStartCoordinator = false;
      shift;
    } else if (s == "-j" || s == "--join") {
      allowedModes = dmtcp::CoordinatorAPI::COORD_JOIN;
      shift;
    } else if (s == "-n" || s == "--new") {
      allowedModes = dmtcp::CoordinatorAPI::COORD_NEW;
      shift;
    } else if (s == "--new-coordinator") {
      allowedModes = dmtcp::CoordinatorAPI::COORD_FORCE_NEW;
      shift;
    } else if (s == "-b" || s == "--batch") {
      allowedModes = dmtcp::CoordinatorAPI::COORD_BATCH;
      shift;
    } else if (argc > 1 && (s == "-i" || s == "--interval")) {
      // "-i 5": requires a value.  A trailing "-i" falls through to
      // "Invalid Argument" instead of passing a NULL argv[1] to setenv().
      setenv(ENV_VAR_CKPT_INTR, argv[1], 1);
      shift; shift;
    } else if (s.length() > 2 && s[0] == '-' && s[1] == 'i' &&
               isdigit(static_cast<unsigned char>(s[2]))) {
      // "-i5": the value is glued to the flag.
      setenv(ENV_VAR_CKPT_INTR, s.c_str()+2, 1);
      shift;
    } else if (argc > 1 && (s == "-h" || s == "--host")) {
      setenv(ENV_VAR_NAME_HOST, argv[1], 1);
      shift; shift;
    } else if (argc > 1 && (s == "-p" || s == "--port")) {
      setenv(ENV_VAR_NAME_PORT, argv[1], 1);
      shift; shift;
    } else if (argc > 1 && (s == "-t" || s == "--tmpdir")) {
      setenv(ENV_VAR_TMPDIR, argv[1], 1);
      shift; shift;
    } else if (s == "-q" || s == "--quiet") {
      // Bump the quiet-level digit in place.
      *getenv(ENV_VAR_QUIET) = *getenv(ENV_VAR_QUIET) + 1;
      // Just in case a non-standard version of setenv is being used:
      setenv(ENV_VAR_QUIET, getenv(ENV_VAR_QUIET), 1);
      shift;
    } else if (argc > 1 && s == "--") {
      // End-of-options marker.  Must be tested before the generic "starts
      // with '-'" check, which would otherwise reject it as invalid.
      shift;
      break;
    } else if ((s.length() > 2 && s.substr(0, 2) == "--") ||
               (s.length() > 1 && s.substr(0, 1) == "-")) {
      JASSERT_STDERR << "Invalid Argument\n";
      JASSERT_STDERR << theUsage;
      return DMTCP_FAIL_RC;
    } else {
      // First non-option argument: the checkpoint images start here.
      break;
    }
  }

  // NOTE(review): getenv() yields NULL when neither -t nor the environment
  // set ENV_VAR_TMPDIR; presumably setTmpDir() accepts that -- confirm.
  dmtcp::UniquePid::setTmpDir(getenv(ENV_VAR_TMPDIR));
  dmtcpTmpDir = dmtcp::UniquePid::getTmpDir();

  jassert_quiet = *getenv(ENV_VAR_QUIET) - '0';

  //make sure JASSERT initializes now, rather than during restart
  Util::initializeLogFile();

  if (jassert_quiet == 0)
    JASSERT_STDERR << DMTCP_BANNER;

  JTRACE("New dmtcp_restart process; _argc_ ckpt images") (argc);

  // Validate each remaining argument and register it as a restore target.
  bool doAbort = false;
  for (; argc > 0; shift) {
    dmtcp::string restorename(argv[0]);
    struct stat buf;
    int rc = stat(restorename.c_str(), &buf);
    if (Util::strEndsWith(restorename, "_files")) {
      // Arguments ending in "_files" are skipped, not treated as images.
      continue;
    } else if (!Util::strEndsWith(restorename, ".dmtcp")) {
      JNOTE("File doesn't have .dmtcp extension. Check Usage.")
        (restorename);
      JASSERT_STDERR << theUsage;
      doAbort = true;
    } else if (rc == -1) {
      // Bounded formatting: the path comes from the command line and may be
      // longer than the buffer; truncating the message is acceptable.
      char error_msg[1024];
      snprintf(error_msg, sizeof(error_msg),
               "\ndmtcp_restart: ckpt image %s", restorename.c_str());
      perror(error_msg);
      doAbort = true;
    // The message below reports `__LINE__ - 6`, which must resolve to the
    // `else if` line that follows; keep the layout of this statement intact.
    } else if (buf.st_uid != getuid()) { /*Could also run if geteuid() matches*/
      printf("\nProcess uid (%d) doesn't match uid (%d) of\n" \
             "checkpoint image (%s).\n" \
             "This is dangerous.  Aborting for security reasons.\n" \
             "If you still want to do this (at your own risk),\n" \
             "  then modify dmtcp/src/%s:%d and re-compile.\n",
             getuid(), buf.st_uid, restorename.c_str(), __FILE__, __LINE__ - 6);
      doAbort = true;
    }
    if (doAbort) {
      exit(DMTCP_FAIL_RC);
    }

    JTRACE("Will restart ckpt image") (argv[0]);
    RestoreTarget *t = new RestoreTarget(argv[0]);
    targets[t->upid()] = t;
  }

  // Prepare list of independent process tree roots: a root qualifies when no
  // *other* target's pid equals the root's session id.
  RestoreTargetMap::iterator i;
  for (i = targets.begin(); i != targets.end(); i++) {
    RestoreTarget *t1 = i->second;
    if (t1->isRootOfProcessTree()) {
      RestoreTargetMap::iterator j;
      for (j = targets.begin(); j != targets.end(); j++) {
        RestoreTarget *t2 = j->second;
        if (t1 == t2) continue;
        if (t1->sid() == t2->pid()) {
          break;
        }
      }
      if (j == targets.end()) {
        independentProcessTreeRoots[t1->upid()] = t1;
      }
    }
  }
  JASSERT(independentProcessTreeRoots.size() > 0)
    .Text("There must atleast one process tree which doesn't have a different "
          "process as session leader.");

  if (autoStartCoordinator) {
    dmtcp::CoordinatorAPI::startCoordinatorIfNeeded(allowedModes,
                                                    isRestart);
  }

  // Restart begins from the first independent root.
  RestoreTarget *t = independentProcessTreeRoots.begin()->second;
  JASSERT(t->pid() != 0);
  t->createProcess(true);
  JASSERT(false).Text("unreachable");
  return -1;
}
Esempio n. 3
0
    /**
     * Recreate the process described by this checkpoint image.
     *
     * Sets the UniquePid of the current process from the image, recreates the
     * dependent targets, connects to the coordinator and finally calls
     * runMtcpRestart(); control is not expected to come back afterwards.
     *
     * @param createIndependentRootProcesses when true, this target also
     *        connects to the coordinator up front, writes the port file,
     *        initializes SharedData and recreates every other entry of
     *        independentProcessTreeRoots.
     */
    void createProcess(bool createIndependentRootProcesses = false)
    {
      UniquePid::ThisProcess() = _pInfo.upid();
      UniquePid::ParentProcess() = _pInfo.uppid();
      Util::initializeLogFile(_pInfo.procname());

      if (createIndependentRootProcesses) {
        DmtcpUniqueProcessId compId = _pInfo.compGroup().upid();
        CoordinatorInfo coordInfo;
        struct in_addr localIPAddr;
        if (_pInfo.noCoordinator()) {
          allowedModes = COORD_NONE;
        }

        // dmtcp_restart sets ENV_VAR_NAME_HOST/PORT, even if cmd line flag used
        const char *host = NULL;
        int port = UNINITIALIZED_PORT;
        CoordinatorAPI::getCoordHostAndPort(allowedModes, &host, &port);
        // FIXME:  We will use the new HOST and PORT here, but after restart,
        //           we will use the old HOST and PORT from the ckpt image.
        CoordinatorAPI::instance().connectToCoordOnRestart(allowedModes,
                                                           _pInfo.procname(),
                                                           _pInfo.compGroup(),
                                                           _pInfo.numPeers(),
                                                           &coordInfo,
                                                           host,
                                                           port,
                                                           &localIPAddr);
        // If port was 0, we'll get new random port when coordinator starts up.
        CoordinatorAPI::getCoordHostAndPort(allowedModes, &host, &port);
        Util::writeCoordPortToFile(port, thePortFile.c_str());

        string installDir =
          jalib::Filesystem::DirName(jalib::Filesystem::GetProgramDir());

#if defined(__i386__) || defined(__arm__)
        if (Util::strEndsWith(installDir, "/lib/dmtcp/32")) {
          // If dmtcp_launch was compiled for 32 bits in 64-bit O/S, then note:
          // DMTCP_ROOT/bin/dmtcp_launch is a symbolic link to:
          //    DMTCP_ROOT/bin/dmtcp_launch/lib/dmtcp/32/bin
          // GetProgramDir() followed the link.  So, need to remove the suffix.
          // Truncate through the string API; writing through c_str() is not
          // permitted and would leave the stored length out of sync.
          installDir.erase(installDir.length() - strlen("/lib/dmtcp/32"));
        }
#endif

        /* We need to initialize SharedData here to make sure that it is
         * initialized with the correct coordinator timestamp.  The coordinator
         * timestamp is updated only during postCkpt callback. However, the
         * SharedData area may be initialized earlier (for example, while
         * recreating threads), causing it to use *older* timestamp.
         */
        SharedData::initialize(tmpDir.c_str(),
                               installDir.c_str(),
                               &compId,
                               &coordInfo,
                               &localIPAddr);

        Util::prepareDlsymWrapper();
      }

      JTRACE("Creating process during restart") (upid()) (_pInfo.procname());

      // First recreate children that are not in the session led by this
      // process; those are handled after the setsid() step below.
      RestoreTargetMap::iterator it;
      for (it = targets.begin(); it != targets.end(); it++) {
        RestoreTarget *t = it->second;
        if (_pInfo.upid() == t->_pInfo.upid()) {
          continue;
        } else if (_pInfo.isChild(t->upid()) &&
                   t->_pInfo.sid() != _pInfo.pid()) {
          t->createDependentChildProcess();
        }
      }

      if (createIndependentRootProcesses) {
        // Recreate every other independent root as a non-child process.
        for (it = independentProcessTreeRoots.begin();
             it != independentProcessTreeRoots.end();
             it++) {
          RestoreTarget *t = it->second;
          if (t != this) {
            t->createDependentNonChildProcess();
          }
        }
      }

      // If we were the session leader, become one now.
      if (_pInfo.sid() == _pInfo.pid()) {
        if (getsid(0) != _pInfo.pid()) {
          JWARNING(setsid() != -1) (getsid(0)) (JASSERT_ERRNO)
            .Text("Failed to restore this process as session leader.");
        }
      }

      // Now recreate processes with sid == _pid
      for (it = targets.begin(); it != targets.end(); it++) {
        RestoreTarget *t = it->second;
        if (_pInfo.upid() == t->_pInfo.upid()) {
          continue;
        } else if (t->_pInfo.sid() == _pInfo.pid()) {
          if (_pInfo.isChild(t->upid())) {
            t->createDependentChildProcess();
          } else if (t->isRootOfProcessTree()) {
            t->createDependentNonChildProcess();
          }
        }
      }

      // Now close all open fds except _fd;
      for (it = targets.begin(); it != targets.end(); it++) {
        RestoreTarget *t = it->second;
        if (t != this) {
          close(t->fd());
        }
      }

      string ckptDir = jalib::Filesystem::GetDeviceName(PROTECTED_CKPT_DIR_FD);
      if (ckptDir.length() == 0) {
        // Create the ckpt-dir fd so that the restarted process can know about
        // the abs-path of ckpt-image.
        string dirName = jalib::Filesystem::DirName(_path);
        int dirfd = open(dirName.c_str(), O_RDONLY);
        JASSERT(dirfd != -1) (JASSERT_ERRNO);
        if (dirfd != PROTECTED_CKPT_DIR_FD) {
          JASSERT(dup2(dirfd, PROTECTED_CKPT_DIR_FD) == PROTECTED_CKPT_DIR_FD);
          close(dirfd);
        }
      }

      if (!createIndependentRootProcesses) {
        // Dependent targets connect here; the independent root already did so
        // at the top of this function.
        // dmtcp_restart sets ENV_VAR_NAME_HOST/PORT, even if cmd line flag used
        const char *host = NULL;
        int port = UNINITIALIZED_PORT;
        int *port_p = &port;
        CoordinatorAPI::getCoordHostAndPort(allowedModes, &host, port_p);
        CoordinatorAPI::instance().connectToCoordOnRestart(allowedModes,
                                                           _pInfo.procname(),
                                                           _pInfo.compGroup(),
                                                           _pInfo.numPeers(),
                                                           NULL,
                                                           host,
                                                           port,
                                                           NULL);
      }

      setEnvironFd();
      int is32bitElf = 0;

#if defined(__x86_64__) || defined(__aarch64__)
      is32bitElf = (_pInfo.elfType() == ProcessInfo::Elf_32);
#elif defined(__i386__) || defined(__arm__)
      is32bitElf = true;
#endif


      runMtcpRestart(is32bitElf, _fd, &_pInfo);

      JASSERT ( false ).Text ( "unreachable" );
    }
    void createProcess(bool createIndependentRootProcesses = false)
    {
      //change UniquePid
      UniquePid::resetOnFork(upid());
      dmtcp::Util::initializeLogFile(_pInfo.procname());

      JTRACE("Creating process during restart") (upid()) (_pInfo.procname());

      RestoreTargetMap::iterator it;
      for (it = targets.begin(); it != targets.end(); it++) {
        RestoreTarget *t = it->second;
        if (_pInfo.upid() == t->_pInfo.upid()) {
          continue;
        } else if (_pInfo.isChild(t->upid()) &&
                   t->_pInfo.sid() != _pInfo.pid()) {
          t->createDependentProcess(true);
        }
      }

      if (createIndependentRootProcesses) {
        RestoreTargetMap::iterator it;
        for (it = independentProcessTreeRoots.begin();
             it != independentProcessTreeRoots.end();
             it++) {
          RestoreTarget *t = it->second;
          if (t != this) {
            t->createDependentProcess(false);
          }
        }
      }

      // If we were the session leader, become one now.
      if (_pInfo.sid() == _pInfo.pid()) {
        if (getsid(0) != _pInfo.pid()) {
          JWARNING(setsid() != -1) (getsid(0)) (JASSERT_ERRNO)
            .Text("Failed to restore this process as session leader.");
        }
      }

      // Now recreate processes with sid == _pid
      for (it = targets.begin(); it != targets.end(); it++) {
        RestoreTarget *t = it->second;
        if (_pInfo.upid() == t->_pInfo.upid()) {
          continue;
        } else if (t->_pInfo.sid() == _pInfo.pid()) {
          t->createDependentProcess(_pInfo.isChild(t->upid()));
        }
      }

      // Now close all open fds except _fd;
      for (it = targets.begin(); it != targets.end(); it++) {
        RestoreTarget *t = it->second;
        if (t != this) {
          close(t->fd());
        }
      }

      // Create the ckpt-dir fd so that the restarted process can know about
      // the abs-path of ckpt-image.
      dmtcp::string deviceName = jalib::Filesystem::GetDeviceName(_fd);
      dmtcp::string dirName = jalib::Filesystem::DirName(deviceName);
      int dirfd = open(dirName.c_str(), O_RDONLY);
      JASSERT(dirfd != -1) (JASSERT_ERRNO);
      if (dirfd != PROTECTED_CKPT_DIR_FD) {
        JASSERT(dup2(dirfd, PROTECTED_CKPT_DIR_FD) == PROTECTED_CKPT_DIR_FD);
        close(dirfd);
      }

      dmtcp::CoordinatorAPI coordinatorAPI;
      coordinatorAPI.connectToCoordinator();
      dmtcp::Util::runMtcpRestore(_path.c_str(), _fd, _extDecompPid,
                                  _pInfo.argvSize(), _pInfo.envSize());

      JASSERT ( false ).Text ( "unreachable" );
    }