// Opens a new connection to the coordinator ahead of a fork() and performs
// the DMT_NEW_WORKER handshake on it.
//
// progname: program name forwarded to sendRecvHandshake().
//
// Returns the connected socket descriptor.  Asserts (JASSERT) if running in
// --no-coordinator mode, if the connection cannot be established, or if the
// handshake reply does not carry a virtual pid.
int
createNewConnectionBeforeFork(string& progname)
{
  JASSERT(!noCoordinator())
  .Text("Process attempted to call fork() while in --no-coordinator mode\n"
        " Because the coordinator is embedded in a single process,\n"
        " DMTCP will not work with multiple processes.");

  // Look up the coordinator's address through SharedData.
  struct sockaddr_storage coordStorage;
  struct sockaddr *coordAddr = (struct sockaddr *)&coordStorage;
  uint32_t coordAddrLen;
  SharedData::getCoordAddr(coordAddr, &coordAddrLen);

  // getCoordAddr() reports the length as uint32_t; the socket API wants
  // socklen_t.
  socklen_t sockAddrLen = coordAddrLen;
  int sock = jalib::JClientSocket(coordAddr, sockAddrLen);
  JASSERT(sock != -1);

  // Introduce ourselves as a new worker and wait for the reply.
  DmtcpMessage helloRequest(DMT_NEW_WORKER);
  DmtcpMessage hello_remote = sendRecvHandshake(sock, helloRequest, progname);
  JASSERT(hello_remote.virtualPid != -1);

  // Without the pid-translation hook there is nothing more to record.
  if (!dmtcp_virtual_to_real_pid) {
    return sock;
  }

  JTRACE("Got virtual pid from coordinator") (hello_remote.virtualPid);

  pid_t selfPid = getpid();
  pid_t selfRealPid = dmtcp_virtual_to_real_pid(selfPid);
  Util::setVirtualPidEnvVar(hello_remote.virtualPid, selfPid, selfRealPid);

  return sock;
}
// Performs the handshake with the coordinator over `fd`.
//
// fd:       descriptor connected to the coordinator.
// msg:      handshake message; its realPid and theCheckpointInterval fields
//           are filled in here before it is sent, and the same object is
//           reused to receive the coordinator's reply.
// progname: program name, sent together with the current hostname as extra
//           data following the message.
// compId:   only dereferenced when the coordinator answers
//           DMT_REJECT_WRONG_COMP (for the diagnostic); must be non-NULL in
//           that case.
//
// Returns the coordinator's reply, which is guaranteed (by JASSERT) to be of
// type DMT_ACCEPT.  A DMT_KILL_PEER reply terminates the process through
// _real_exit(0); every DMT_REJECT_* reply ends in a failed JASSERT.
DmtcpMessage
sendRecvHandshake(int fd,
                  DmtcpMessage msg,
                  string progname,
                  UniquePid *compId)
{
  if (dmtcp_virtual_to_real_pid) {
    msg.realPid = dmtcp_virtual_to_real_pid(getpid());
  } else {
    msg.realPid = getpid();
  }

  msg.theCheckpointInterval = getCkptInterval();

  // Extra data layout: "<hostname>\0<progname>\0".
  // The buffer is owned by a string rather than a runtime-sized stack array:
  // variable-length arrays are not ISO C++, and progname's length is chosen
  // by the caller, so the stack is the wrong place for it.
  string hostname = jalib::Filesystem::GetCurrentHostname();
  string payload;
  payload.reserve(hostname.length() + progname.length() + 2);
  payload += hostname;
  payload += '\0';
  payload += progname;
  payload += '\0';

  // payload always holds at least the two terminators, so &payload[0] is a
  // valid pointer to payload.size() bytes.
  sendMsgToCoordinatorRaw(fd, msg, &payload[0], payload.size());

  recvMsgFromCoordinatorRaw(fd, &msg);
  msg.assertValid();

  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _real_exit(0);
  }

  if (msg.type == DMT_REJECT_NOT_RUNNING) {
    JASSERT(false)
    .Text("Connection rejected by the coordinator.\n"
          "Reason: Current computation not in RUNNING state.\n"
          " Is a checkpoint/restart in progress?");
  } else if (msg.type == DMT_REJECT_WRONG_COMP) {
    JASSERT(compId != NULL);
    JASSERT(false) (*compId)
    .Text("Connection rejected by the coordinator.\n"
          " Reason: This process has a different computation group.");
  }

  // Coordinator also prints this, but its stderr may go to /dev/null
  if (msg.type == DMT_REJECT_NOT_RESTARTING) {
    string coordinatorHost = ""; // C++ magic code; "" to be invisibly replaced
    int coordinatorPort;
    getCoordHostAndPort(COORD_ANY, coordinatorHost, &coordinatorPort);
    JNOTE ("\n\n*** Computation not in RESTARTING or CHECKPOINTED state."
           "\n***Can't join the existing coordinator, as it is serving a"
           "\n***different computation. Consider launching a new coordinator."
           "\n***Consider, also, checking with: dmtcp_command --status")
      (coordinatorPort);
    // No early exit here: the JASSERT below fails for this reply type.
  }

  JASSERT(msg.type == DMT_ACCEPT)(msg.type);
  return msg;
}
void dmtcp::ProcessInfo::restoreProcessGroupInfo() { // Restore group assignment if (dmtcp_virtual_to_real_pid && dmtcp_virtual_to_real_pid(_gid) != _gid) { pid_t cgid = getpgid(0); // Group ID is known inside checkpointed processes if (_gid != cgid) { JTRACE("Restore Group Assignment") (_gid) (_fgid) (cgid) (_pid) (_ppid) (getppid()); JWARNING(setpgid(0, _gid) == 0) (_gid) (JASSERT_ERRNO) .Text("Cannot change group information"); } else { JTRACE("Group is already assigned") (_gid) (cgid); } } else { JTRACE("SKIP Group information, GID unknown"); } }
void resetOnFork(int sock) { JASSERT(Util::isValidFd(sock)); JASSERT(sock != PROTECTED_COORD_FD); Util::changeFd(sock, PROTECTED_COORD_FD); JASSERT(Util::isValidFd(coordinatorSocket)); JTRACE("Informing coordinator of new process") (UniquePid::ThisProcess()); DmtcpMessage msg(DMT_UPDATE_PROCESS_INFO_AFTER_FORK); if (dmtcp_virtual_to_real_pid) { msg.realPid = dmtcp_virtual_to_real_pid(getpid()); } else { msg.realPid = getpid(); } sendMsgToCoordinator(msg); _real_close(nsSock); nsSock = -1; }
static void restart() { restore_term_settings(); /* If DMTCP_RESTART_PAUSE2 set, sleep 15 seconds to allow gdb attach.*/ if (getenv("MTCP_RESTART_PAUSE2") || getenv("DMTCP_RESTART_PAUSE2")) { #ifdef HAS_PR_SET_PTRACER prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); // For: gdb attach #endif // ifdef HAS_PR_SET_PTRACER struct timespec delay = { 15, 0 }; /* 15 seconds */ printf("Pausing 15 seconds. Do: gdb <PROGNAME> %d\n", dmtcp_virtual_to_real_pid(getpid())); nanosleep(&delay, NULL); #ifdef HAS_PR_SET_PTRACER prctl(PR_SET_PTRACER, 0, 0, 0, 0); // Revert permission to default. #endif // ifdef HAS_PR_SET_PTRACER } }