示例#1
0
/*
 * Opens a new connection to the coordinator and performs the DMT_NEW_WORKER
 * handshake on it.
 *
 * progname: program name forwarded to sendRecvHandshake().
 *
 * Returns the socket descriptor of the new connection.  Asserts if running
 * in --no-coordinator mode, if the socket could not be created, or if the
 * coordinator's reply carries a virtual pid of -1.
 */
int
createNewConnectionBeforeFork(string& progname)
{
  JASSERT(!noCoordinator())
  .Text("Process attempted to call fork() while in --no-coordinator mode\n"
        "  Because the coordinator is embedded in a single process,\n"
        "    DMTCP will not work with multiple processes.");

  // Fetch the coordinator address recorded in SharedData.
  struct sockaddr_storage coordAddr;
  uint32_t coordAddrLen;
  SharedData::getCoordAddr((struct sockaddr *)&coordAddr, &coordAddrLen);
  socklen_t sockLen = coordAddrLen;

  int sock = jalib::JClientSocket((struct sockaddr *)&coordAddr, sockLen);
  JASSERT(sock != -1);

  // Announce ourselves as a new worker and wait for the coordinator's reply.
  DmtcpMessage workerHello(DMT_NEW_WORKER);
  DmtcpMessage hello_remote = sendRecvHandshake(sock, workerHello, progname);
  JASSERT(hello_remote.virtualPid != -1);

  // Nothing more to do when no virtual->real pid translator is available.
  if (!dmtcp_virtual_to_real_pid) {
    return sock;
  }

  JTRACE("Got virtual pid from coordinator") (hello_remote.virtualPid);
  pid_t selfPid = getpid();
  pid_t selfRealPid = dmtcp_virtual_to_real_pid(selfPid);
  Util::setVirtualPidEnvVar(hello_remote.virtualPid, selfPid, selfRealPid);
  return sock;
}
示例#2
0
/*
 * Performs the initial handshake with the coordinator over `fd`.
 *
 * Fills in msg.realPid and msg.theCheckpointInterval, sends `msg` followed
 * by a payload of "<hostname>\0<progname>\0", and then reads the
 * coordinator's reply back into `msg`.
 *
 * fd:       descriptor used for the send and the receive.
 * msg:      outgoing message (taken by value); reused to hold the reply.
 * progname: program name placed in the payload after the hostname.
 * compId:   only dereferenced when the reply is DMT_REJECT_WRONG_COMP, where
 *           it is printed in the failure message; must be non-NULL then.
 *
 * Returns the reply, whose type is DMT_ACCEPT.  A DMT_KILL_PEER reply calls
 * _real_exit(0); every other reply type fails a JASSERT.
 */
DmtcpMessage
sendRecvHandshake(int fd,
                  DmtcpMessage msg,
                  string progname,
                  UniquePid *compId)
{
  if (dmtcp_virtual_to_real_pid) {
    msg.realPid = dmtcp_virtual_to_real_pid(getpid());
  } else {
    msg.realPid = getpid();
  }

  msg.theCheckpointInterval = getCkptInterval();

  string hostname = jalib::Filesystem::GetCurrentHostname();

  // Build the payload "<hostname>\0<progname>\0" in a string rather than a
  // variable-length stack array: VLAs are not ISO C++, and a string keeps
  // arbitrarily long names off the stack.
  string payload;
  payload.reserve(hostname.length() + progname.length() + 2);
  payload += hostname;
  payload += '\0';
  payload += progname;
  payload += '\0';

  // payload always holds at least the two terminators, so &payload[0] is
  // a valid pointer to payload.length() bytes.
  sendMsgToCoordinatorRaw(fd, msg, &payload[0], payload.length());

  recvMsgFromCoordinatorRaw(fd, &msg);
  msg.assertValid();
  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _real_exit(0);
  }
  if (msg.type == DMT_REJECT_NOT_RUNNING) {
    JASSERT(false)
    .Text("Connection rejected by the coordinator.\n"
          "Reason: Current computation not in RUNNING state.\n"
          "         Is a checkpoint/restart in progress?");
  } else if (msg.type == DMT_REJECT_WRONG_COMP) {
    JASSERT(compId != NULL);
    JASSERT(false) (*compId)
    .Text("Connection rejected by the coordinator.\n"
          " Reason: This process has a different computation group.");
  }
  // Coordinator also prints this, but its stderr may go to /dev/null
  if (msg.type == DMT_REJECT_NOT_RESTARTING) {
    string coordinatorHost = ""; // C++ magic code; "" to be invisibly replaced
    // NOTE(review): coordinatorPort is handed over uninitialized -- confirm
    // that getCoordHostAndPort() never reads it before writing it.
    int coordinatorPort;
    getCoordHostAndPort(COORD_ANY, coordinatorHost, &coordinatorPort);
    JNOTE ("\n\n*** Computation not in RESTARTING or CHECKPOINTED state."
        "\n***Can't join the existing coordinator, as it is serving a"
        "\n***different computation.  Consider launching a new coordinator."
        "\n***Consider, also, checking with:  dmtcp_command --status")
        (coordinatorPort);
    // No early return: the JASSERT below reports the rejected reply type.
  }
  JASSERT(msg.type == DMT_ACCEPT)(msg.type);
  return msg;
}
示例#3
0
/*
 * Moves the calling process into process group _gid if it is not already
 * there.  The step is skipped when no virtual->real pid translator is
 * available or when translating _gid yields _gid itself.  A failing
 * setpgid() is reported with JWARNING.
 */
void dmtcp::ProcessInfo::restoreProcessGroupInfo()
{
  // Guard: only proceed when _gid translates to a different real id.
  const bool gidIsTranslated =
    dmtcp_virtual_to_real_pid && dmtcp_virtual_to_real_pid(_gid) != _gid;
  if (!gidIsTranslated) {
    JTRACE("SKIP Group information, GID unknown");
    return;
  }

  // Group ID is known inside checkpointed processes
  pid_t cgid = getpgid(0);
  if (_gid == cgid) {
    JTRACE("Group is already assigned") (_gid) (cgid);
    return;
  }

  // Restore group assignment
  JTRACE("Restore Group Assignment")
    (_gid) (_fgid) (cgid) (_pid) (_ppid) (getppid());
  JWARNING(setpgid(0, _gid) == 0) (_gid) (JASSERT_ERRNO)
    .Text("Cannot change group information");
}
示例#4
0
void resetOnFork(int sock)
{
  JASSERT(Util::isValidFd(sock));
  JASSERT(sock != PROTECTED_COORD_FD);
  Util::changeFd(sock, PROTECTED_COORD_FD);
  JASSERT(Util::isValidFd(coordinatorSocket));

  JTRACE("Informing coordinator of new process") (UniquePid::ThisProcess());

  DmtcpMessage msg(DMT_UPDATE_PROCESS_INFO_AFTER_FORK);
  if (dmtcp_virtual_to_real_pid) {
    msg.realPid = dmtcp_virtual_to_real_pid(getpid());
  } else {
    msg.realPid = getpid();
  }
  sendMsgToCoordinator(msg);
  _real_close(nsSock);
  nsSock = -1;
}
示例#5
0
/*
 * Restart-time hook: restores the terminal settings and, when requested
 * through the environment, pauses so that a debugger can be attached.
 */
static void
restart()
{
  restore_term_settings();

  /* If MTCP_RESTART_PAUSE2 or DMTCP_RESTART_PAUSE2 is set, sleep 15 seconds
   * to allow gdb attach. */
  if (getenv("MTCP_RESTART_PAUSE2") || getenv("DMTCP_RESTART_PAUSE2")) {
#ifdef HAS_PR_SET_PTRACER
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); // For: gdb attach
#endif // ifdef HAS_PR_SET_PTRACER
    struct timespec delay = { 15, 0 }; /* 15 seconds */

    // dmtcp_virtual_to_real_pid may be unresolved (null) -- presumably a weak
    // symbol supplied by a plugin -- so fall back to getpid() rather than
    // calling through a null pointer.
    pid_t realPid = dmtcp_virtual_to_real_pid
                      ? dmtcp_virtual_to_real_pid(getpid())
                      : getpid();
    // pid_t is not guaranteed to be int; cast to match the %d conversion.
    printf("Pausing 15 seconds. Do:  gdb <PROGNAME> %d\n", (int)realPid);
    nanosleep(&delay, NULL);
#ifdef HAS_PR_SET_PTRACER
    prctl(PR_SET_PTRACER, 0, 0, 0, 0); // Revert permission to default.
#endif // ifdef HAS_PR_SET_PTRACER
  }
}