int main(int argc, char** argv) { char *tmpdir_arg = NULL; char *ckptdir_arg = NULL; initializeJalib(); if (!getenv(ENV_VAR_QUIET)) { setenv(ENV_VAR_QUIET, "0", 0); } if (getenv(ENV_VAR_DISABLE_STRICT_CHECKING)) { noStrictChecking = true; } if (getenv(ENV_VAR_CHECKPOINT_DIR)) { ckptdir_arg = getenv(ENV_VAR_CHECKPOINT_DIR); } if (argc == 1) { printf("%s", DMTCP_VERSION_AND_COPYRIGHT_INFO); printf("(For help: %s --help)\n\n", argv[0]); return DMTCP_FAIL_RC; } //process args shift; while (true) { string s = argc>0 ? argv[0] : "--help"; if (s == "--help" && argc == 1) { printf("%s", theUsage); return DMTCP_FAIL_RC; } else if ((s == "--version") && argc == 1) { printf("%s", DMTCP_VERSION_AND_COPYRIGHT_INFO); return DMTCP_FAIL_RC; } else if (s == "-j" || s == "--join") { allowedModes = COORD_JOIN; shift; } else if (s == "--new-coordinator") { allowedModes = COORD_NEW; shift; } else if (s == "--no-strict-checking") { noStrictChecking = true; shift; } else if (s == "-i" || s == "--interval") { setenv(ENV_VAR_CKPT_INTR, argv[1], 1); shift; shift; } else if (argv[0][0] == '-' && argv[0][1] == 'i' && isdigit(argv[0][2])) { // else if -i5, for example setenv(ENV_VAR_CKPT_INTR, argv[0]+2, 1); shift; } else if (argc > 1 && (s == "-h" || s == "--coord-host" || s == "--host")){ setenv(ENV_VAR_NAME_HOST, argv[1], 1); shift; shift; } else if (argc>1 && (s == "-p" || s == "--coord-port" || s == "--port")) { setenv(ENV_VAR_NAME_PORT, argv[1], 1); shift; shift; } else if (argv[0][0] == '-' && argv[0][1] == 'p' && isdigit(argv[0][2])) { // else if -p0, for example setenv(ENV_VAR_NAME_PORT, argv[0]+2, 1); shift; } else if (argc>1 && s == "--port-file"){ thePortFile = argv[1]; shift; shift; } else if (argc > 1 && (s == "-c" || s == "--ckptdir")) { ckptdir_arg = argv[1]; shift; shift; } else if (argc > 1 && (s == "-t" || s == "--tmpdir")) { tmpdir_arg = argv[1]; shift; shift; } else if (s == "-q" || s == "--quiet") { *getenv(ENV_VAR_QUIET) = *getenv(ENV_VAR_QUIET) + 1; // Just in case a non-standard version of setenv is being used: setenv(ENV_VAR_QUIET, getenv(ENV_VAR_QUIET), 1); shift; } else if ((s.length() > 2 && s.substr(0, 2) == "--") || (s.length() > 1 && s.substr(0, 1) == "-")) { printf("Invalid Argument\n%s", theUsage); return DMTCP_FAIL_RC; } else if (argc > 1 && s == "--") { shift; break; } else { break; } } tmpDir = Util::calcTmpDir(tmpdir_arg); if (ckptdir_arg) { setNewCkptDir(ckptdir_arg); } jassert_quiet = *getenv(ENV_VAR_QUIET) - '0'; //make sure JASSERT initializes now, rather than during restart Util::initializeLogFile(tmpDir); if (!noStrictChecking && jassert_quiet < 2 && (getuid() == 0 || geteuid() == 0)) { JASSERT_STDERR << "WARNING: Running dmtcp_restart as root can be dangerous.\n" " An unknown checkpoint image or bugs in DMTCP may lead to unforeseen\n" " consequences. Continuing as root ....\n"; } JTRACE("New dmtcp_restart process; _argc_ ckpt images") (argc); bool doAbort = false; for (; argc > 0; shift) { string restorename(argv[0]); struct stat buf; int rc = stat(restorename.c_str(), &buf); if (Util::strEndsWith(restorename, "_files")) { continue; } else if (!Util::strEndsWith(restorename, ".dmtcp")) { JNOTE("File doesn't have .dmtcp extension. Check Usage.") (restorename); // Don't test for --quiet here. We're aborting. We need to say why. JASSERT_STDERR << theUsage; doAbort = true; } else if (rc == -1) { char error_msg[1024]; sprintf(error_msg, "\ndmtcp_restart: ckpt image %s", restorename.c_str()); perror(error_msg); doAbort = true; } else if (buf.st_uid != getuid() && !noStrictChecking) { /*Could also run if geteuid() matches*/ printf("\nProcess uid (%d) doesn't match uid (%d) of\n" \ "checkpoint image (%s).\n" \ "This is dangerous. Aborting for security reasons.\n" \ "If you still want to do this, then re-run dmtcp_restart\n" \ " with the --no-strict-checking flag.\n", getuid(), buf.st_uid, restorename.c_str()); doAbort = true; } if (doAbort) { exit(DMTCP_FAIL_RC); } JTRACE("Will restart ckpt image") (argv[0]); RestoreTarget *t = new RestoreTarget(argv[0]); targets[t->upid()] = t; } // Prepare list of independent process tree roots RestoreTargetMap::iterator i; for (i = targets.begin(); i != targets.end(); i++) { RestoreTarget *t1 = i->second; if (t1->isRootOfProcessTree()) { RestoreTargetMap::iterator j; for (j = targets.begin(); j != targets.end(); j++) { RestoreTarget *t2 = j->second; if (t1 == t2) continue; if (t1->sid() == t2->pid()) { break; } } if (j == targets.end()) { independentProcessTreeRoots[t1->upid()] = t1; } } } JASSERT(independentProcessTreeRoots.size() > 0) .Text("There must be at least one process tree that doesn't have\n" " a different process as session leader."); WorkerState::setCurrentState(WorkerState::RESTARTING); /* Try to find non-orphaned process in independent procs list */ RestoreTarget *t; bool foundNonOrphan = false; RestoreTargetMap::iterator it; int size = independentProcessTreeRoots.size(); printf("size = %d\n", size); for (it = independentProcessTreeRoots.begin(); it != independentProcessTreeRoots.end(); it++) { t = it->second; if ( !t->isOrphan() ) { foundNonOrphan = true; break; } } JASSERT(t->pid() != 0); JASSERT(!t->noCoordinator() || allowedModes == COORD_ANY) .Text("Process had no coordinator prior to checkpoint;\n" " but either --join or --new-coordinator was specified."); if( foundNonOrphan ){ t->createProcess(true); } else { /* we were unable to find any non-orphaned procs. * pick the first one and orphan it */ t = independentProcessTreeRoots.begin()->second; t->createOrphanedProcess(true); } JASSERT(false).Text("unreachable"); return -1; }
int main(int argc, char** argv) { bool autoStartCoordinator=true; bool isRestart = true; dmtcp::CoordinatorAPI::CoordinatorMode allowedModes = dmtcp::CoordinatorAPI::COORD_ANY; initializeJalib(); if (!getenv(ENV_VAR_QUIET)) { setenv(ENV_VAR_QUIET, "0", 0); } if (argc == 1) { JASSERT_STDERR << DMTCP_VERSION_AND_COPYRIGHT_INFO; JASSERT_STDERR << "(For help: " << argv[0] << " --help)\n\n"; return DMTCP_FAIL_RC; } //process args shift; while (true) { dmtcp::string s = argc>0 ? argv[0] : "--help"; if (s == "--help" && argc == 1) { JASSERT_STDERR << theUsage; return DMTCP_FAIL_RC; } else if ((s == "--version") && argc == 1) { JASSERT_STDERR << DMTCP_VERSION_AND_COPYRIGHT_INFO; return DMTCP_FAIL_RC; } else if (s == "--no-check") { autoStartCoordinator = false; shift; } else if (s == "-j" || s == "--join") { allowedModes = dmtcp::CoordinatorAPI::COORD_JOIN; shift; } else if (s == "-n" || s == "--new") { allowedModes = dmtcp::CoordinatorAPI::COORD_NEW; shift; } else if (s == "--new-coordinator") { allowedModes = dmtcp::CoordinatorAPI::COORD_FORCE_NEW; shift; } else if (s == "-b" || s == "--batch") { allowedModes = dmtcp::CoordinatorAPI::COORD_BATCH; shift; } else if (s == "-i" || s == "--interval" || (s.c_str()[0] == '-' && s.c_str()[1] == 'i' && isdigit(s.c_str()[2]))) { if (isdigit(s.c_str()[2])) { // if -i5, for example setenv(ENV_VAR_CKPT_INTR, s.c_str()+2, 1); shift; } else { // else -i 5 setenv(ENV_VAR_CKPT_INTR, argv[1], 1); shift; shift; } } else if (argc > 1 && (s == "-h" || s == "--host")) { setenv(ENV_VAR_NAME_HOST, argv[1], 1); shift; shift; } else if (argc > 1 && (s == "-p" || s == "--port")) { setenv(ENV_VAR_NAME_PORT, argv[1], 1); shift; shift; } else if (argc > 1 && (s == "-t" || s == "--tmpdir")) { setenv(ENV_VAR_TMPDIR, argv[1], 1); shift; shift; } else if (s == "-q" || s == "--quiet") { *getenv(ENV_VAR_QUIET) = *getenv(ENV_VAR_QUIET) + 1; // Just in case a non-standard version of setenv is being used: setenv(ENV_VAR_QUIET, getenv(ENV_VAR_QUIET), 1); shift; } else if ((s.length() > 2 && s.substr(0, 2) == "--") || (s.length() > 1 && s.substr(0, 1) == "-")) { JASSERT_STDERR << "Invalid Argument\n"; JASSERT_STDERR << theUsage; return DMTCP_FAIL_RC; } else if (argc > 1 && s == "--") { shift; break; } else { break; } } dmtcp::UniquePid::setTmpDir(getenv(ENV_VAR_TMPDIR)); dmtcpTmpDir = dmtcp::UniquePid::getTmpDir(); jassert_quiet = *getenv(ENV_VAR_QUIET) - '0'; //make sure JASSERT initializes now, rather than during restart Util::initializeLogFile(); if (jassert_quiet == 0) JASSERT_STDERR << DMTCP_BANNER; JTRACE("New dmtcp_restart process; _argc_ ckpt images") (argc); bool doAbort = false; for (; argc > 0; shift) { dmtcp::string restorename(argv[0]); struct stat buf; int rc = stat(restorename.c_str(), &buf); if (Util::strEndsWith(restorename, "_files")) { continue; } else if (!Util::strEndsWith(restorename, ".dmtcp")) { JNOTE("File doesn't have .dmtcp extension. Check Usage.") (restorename); JASSERT_STDERR << theUsage; doAbort = true; } else if (rc == -1) { char error_msg[1024]; sprintf(error_msg, "\ndmtcp_restart: ckpt image %s", restorename.c_str()); perror(error_msg); doAbort = true; } else if (buf.st_uid != getuid()) { /*Could also run if geteuid() matches*/ printf("\nProcess uid (%d) doesn't match uid (%d) of\n" \ "checkpoint image (%s).\n" \ "This is dangerous. Aborting for security reasons.\n" \ "If you still want to do this (at your own risk),\n" \ " then modify dmtcp/src/%s:%d and re-compile.\n", getuid(), buf.st_uid, restorename.c_str(), __FILE__, __LINE__ - 6); doAbort = true; } if (doAbort) { exit(DMTCP_FAIL_RC); } JTRACE("Will restart ckpt image") (argv[0]); RestoreTarget *t = new RestoreTarget(argv[0]); targets[t->upid()] = t; } // Prepare list of independent process tree roots RestoreTargetMap::iterator i; for (i = targets.begin(); i != targets.end(); i++) { RestoreTarget *t1 = i->second; if (t1->isRootOfProcessTree()) { RestoreTargetMap::iterator j; for (j = targets.begin(); j != targets.end(); j++) { RestoreTarget *t2 = j->second; if (t1 == t2) continue; if (t1->sid() == t2->pid()) { break; } } if (j == targets.end()) { independentProcessTreeRoots[t1->upid()] = t1; } } } JASSERT(independentProcessTreeRoots.size() > 0) .Text("There must atleast one process tree which doesn't have a different " "process as session leader."); if (autoStartCoordinator) { dmtcp::CoordinatorAPI::startCoordinatorIfNeeded(allowedModes, isRestart); } RestoreTarget *t = independentProcessTreeRoots.begin()->second; JASSERT(t->pid() != 0); t->createProcess(true); JASSERT(false).Text("unreachable"); return -1; }
void createProcess(bool createIndependentRootProcesses = false) { UniquePid::ThisProcess() = _pInfo.upid(); UniquePid::ParentProcess() = _pInfo.uppid(); Util::initializeLogFile(_pInfo.procname()); if (createIndependentRootProcesses) { DmtcpUniqueProcessId compId = _pInfo.compGroup().upid(); CoordinatorInfo coordInfo; struct in_addr localIPAddr; if (_pInfo.noCoordinator()) { allowedModes = COORD_NONE; } // dmtcp_restart sets ENV_VAR_NAME_HOST/PORT, even if cmd line flag used const char *host = NULL; int port = UNINITIALIZED_PORT; CoordinatorAPI::getCoordHostAndPort(allowedModes, &host, &port); // FIXME: We will use the new HOST and PORT here, but after restart,, // we will use the old HOST and PORT from the ckpt image. CoordinatorAPI::instance().connectToCoordOnRestart(allowedModes, _pInfo.procname(), _pInfo.compGroup(), _pInfo.numPeers(), &coordInfo, host, port, &localIPAddr); // If port was 0, we'll get new random port when coordinator starts up. CoordinatorAPI::getCoordHostAndPort(allowedModes, &host, &port); Util::writeCoordPortToFile(port, thePortFile.c_str()); string installDir = jalib::Filesystem::DirName(jalib::Filesystem::GetProgramDir()); #if defined(__i386__) || defined(__arm__) if (Util::strEndsWith(installDir, "/lib/dmtcp/32")) { // If dmtcp_launch was compiled for 32 bits in 64-bit O/S, then note: // DMTCP_ROOT/bin/dmtcp_launch is a symbolic link to: // DMTCP_ROOT/bin/dmtcp_launch/lib/dmtcp/32/bin // GetProgramDir() followed the link. So, need to remove the suffix. char *str = const_cast<char*>(installDir.c_str()); str[strlen(str) - strlen("/lib/dmtcp/32")] = '\0'; installDir = str; } #endif /* We need to initialize SharedData here to make sure that it is * initialized with the correct coordinator timestamp. The coordinator * timestamp is updated only during postCkpt callback. However, the * SharedData area may be initialized earlier (for example, while * recreating threads), causing it to use *older* timestamp. */ SharedData::initialize(tmpDir.c_str(), installDir.c_str(), &compId, &coordInfo, &localIPAddr); Util::prepareDlsymWrapper(); } JTRACE("Creating process during restart") (upid()) (_pInfo.procname()); RestoreTargetMap::iterator it; for (it = targets.begin(); it != targets.end(); it++) { RestoreTarget *t = it->second; if (_pInfo.upid() == t->_pInfo.upid()) { continue; } else if (_pInfo.isChild(t->upid()) && t->_pInfo.sid() != _pInfo.pid()) { t->createDependentChildProcess(); } } if (createIndependentRootProcesses) { RestoreTargetMap::iterator it; for (it = independentProcessTreeRoots.begin(); it != independentProcessTreeRoots.end(); it++) { RestoreTarget *t = it->second; if (t != this) { t->createDependentNonChildProcess(); } } } // If we were the session leader, become one now. if (_pInfo.sid() == _pInfo.pid()) { if (getsid(0) != _pInfo.pid()) { JWARNING(setsid() != -1) (getsid(0)) (JASSERT_ERRNO) .Text("Failed to restore this process as session leader."); } } // Now recreate processes with sid == _pid for (it = targets.begin(); it != targets.end(); it++) { RestoreTarget *t = it->second; if (_pInfo.upid() == t->_pInfo.upid()) { continue; } else if (t->_pInfo.sid() == _pInfo.pid()) { if (_pInfo.isChild(t->upid())) { t->createDependentChildProcess(); } else if (t->isRootOfProcessTree()) { t->createDependentNonChildProcess(); } } } // Now close all open fds except _fd; for (it = targets.begin(); it != targets.end(); it++) { RestoreTarget *t = it->second; if (t != this) { close(t->fd()); } } string ckptDir = jalib::Filesystem::GetDeviceName(PROTECTED_CKPT_DIR_FD); if (ckptDir.length() == 0) { // Create the ckpt-dir fd so that the restarted process can know about // the abs-path of ckpt-image. string dirName = jalib::Filesystem::DirName(_path); int dirfd = open(dirName.c_str(), O_RDONLY); JASSERT(dirfd != -1) (JASSERT_ERRNO); if (dirfd != PROTECTED_CKPT_DIR_FD) { JASSERT(dup2(dirfd, PROTECTED_CKPT_DIR_FD) == PROTECTED_CKPT_DIR_FD); close(dirfd); } } if (!createIndependentRootProcesses) { // dmtcp_restart sets ENV_VAR_NAME_HOST/PORT, even if cmd line flag used const char *host = NULL; int port = UNINITIALIZED_PORT; int *port_p = &port; CoordinatorAPI::getCoordHostAndPort(allowedModes, &host, port_p); CoordinatorAPI::instance().connectToCoordOnRestart(allowedModes, _pInfo.procname(), _pInfo.compGroup(), _pInfo.numPeers(), NULL, host, port, NULL); } setEnvironFd(); int is32bitElf = 0; #if defined(__x86_64__) || defined(__aarch64__) is32bitElf = (_pInfo.elfType() == ProcessInfo::Elf_32); #elif defined(__i386__) || defined(__arm__) is32bitElf = true; #endif runMtcpRestart(is32bitElf, _fd, &_pInfo); JASSERT ( false ).Text ( "unreachable" ); }
void createProcess(bool createIndependentRootProcesses = false) { //change UniquePid UniquePid::resetOnFork(upid()); dmtcp::Util::initializeLogFile(_pInfo.procname()); JTRACE("Creating process during restart") (upid()) (_pInfo.procname()); RestoreTargetMap::iterator it; for (it = targets.begin(); it != targets.end(); it++) { RestoreTarget *t = it->second; if (_pInfo.upid() == t->_pInfo.upid()) { continue; } else if (_pInfo.isChild(t->upid()) && t->_pInfo.sid() != _pInfo.pid()) { t->createDependentProcess(true); } } if (createIndependentRootProcesses) { RestoreTargetMap::iterator it; for (it = independentProcessTreeRoots.begin(); it != independentProcessTreeRoots.end(); it++) { RestoreTarget *t = it->second; if (t != this) { t->createDependentProcess(false); } } } // If we were the session leader, become one now. if (_pInfo.sid() == _pInfo.pid()) { if (getsid(0) != _pInfo.pid()) { JWARNING(setsid() != -1) (getsid(0)) (JASSERT_ERRNO) .Text("Failed to restore this process as session leader."); } } // Now recreate processes with sid == _pid for (it = targets.begin(); it != targets.end(); it++) { RestoreTarget *t = it->second; if (_pInfo.upid() == t->_pInfo.upid()) { continue; } else if (t->_pInfo.sid() == _pInfo.pid()) { t->createDependentProcess(_pInfo.isChild(t->upid())); } } // Now close all open fds except _fd; for (it = targets.begin(); it != targets.end(); it++) { RestoreTarget *t = it->second; if (t != this) { close(t->fd()); } } // Create the ckpt-dir fd so that the restarted process can know about // the abs-path of ckpt-image. dmtcp::string deviceName = jalib::Filesystem::GetDeviceName(_fd); dmtcp::string dirName = jalib::Filesystem::DirName(deviceName); int dirfd = open(dirName.c_str(), O_RDONLY); JASSERT(dirfd != -1) (JASSERT_ERRNO); if (dirfd != PROTECTED_CKPT_DIR_FD) { JASSERT(dup2(dirfd, PROTECTED_CKPT_DIR_FD) == PROTECTED_CKPT_DIR_FD); close(dirfd); } dmtcp::CoordinatorAPI coordinatorAPI; coordinatorAPI.connectToCoordinator(); dmtcp::Util::runMtcpRestore(_path.c_str(), _fd, _extDecompPid, _pInfo.argvSize(), _pInfo.envSize()); JASSERT ( false ).Text ( "unreachable" ); }