// Adopts coordSock as this object's coordinator socket and sends the
// coordinator a handshake of type DMT_UPDATE_PROCESS_INFO_AFTER_FORK.
//
// coordSock: the socket connected to the coordinator. It must be valid and
//            must not already be using PROTECTED_COORD_FD; both conditions
//            are enforced with JASSERT.
//
// NOTE(review): judging by the name and the "_(forked)" suffix, this is meant
// to be called in the child right after fork() -- confirm against callers.
void dmtcp::DmtcpCoordinatorAPI::informCoordinatorOfNewProcessOnFork
  (jalib::JSocket& coordSock)
{
  JASSERT(coordSock.isValid());
  JASSERT(coordSock.sockfd() != PROTECTED_COORD_FD);
  // Store the socket first, then ask the stored copy to switch to the
  // protected descriptor number. The caller's coordSock object itself is not
  // the one changeFd() is invoked on.
  _coordinatorSocket = coordSock;
  _coordinatorSocket.changeFd(PROTECTED_COORD_FD);

  JTRACE("Informing coordinator of new process") (UniquePid::ThisProcess());
  // The process is reported under its program name with "_(forked)" appended,
  // together with the current computation id.
  // NOTE(review): the meaning of the -1 argument is defined by
  // sendCoordinatorHandshake(), which is not visible here -- confirm.
  sendCoordinatorHandshake(jalib::Filesystem::GetProgramName() + "_(forked)",
                           UniquePid::ComputationId(),
                           -1,
                           DMT_UPDATE_PROCESS_INFO_AFTER_FORK);
}
Example #2
0
// Restores every connection known to ConnectionList and then lets the rewirer
// perform its own reconnect step.
//
// coordinator:   socket to the coordinator; wrapped in a JChunkReader sized to
//                sizeof(DmtcpMessage) and registered with the rewirer as a
//                data socket, and its fd is registered as the coordinator fd.
// restoreListen: socket registered with the rewirer as a listen socket.
//
// Connections are restored in two passes so that anything reporting
// restoreInSecondIteration() is handled only after everything else:
//   Pass 1: all connections except those deferred to the second pass
//           (per the original notes: PTY slaves, whose PTY master must
//           already be restored, and files for which we did not hold the
//           lock / which were not checkpointed).
//   Pass 2: only the deferred connections.
// FILE_PROCFS file connections are skipped in both passes.
void dmtcp::ConnectionState::doReconnect(jalib::JSocket& coordinator,
                                         jalib::JSocket& restoreListen)
{
  // NOTE(review): the JChunkReader is heap-allocated and handed to the
  // rewirer; presumably the rewirer takes ownership -- confirm.
  _rewirer.addDataSocket(new jalib::JChunkReader(coordinator,
                                                 sizeof(DmtcpMessage)));
  _rewirer.addListenSocket(restoreListen);
  _rewirer.setCoordinatorFd(coordinator.sockfd());

  handleDuplicateFilesInSeparateConnections();

  ConnectionList& conList = ConnectionList::instance();

  // Pass 1: everything that is not deferred to the second pass.
  for (ConnectionList::iterator i = conList.begin();
       i != conList.end();
       ++i) {
    ConnectionIdentifier id = i->first;
    Connection *conn = i->second;

    JASSERT(_conToFds[id].size() > 0)
      .Text("stale connections should be gone by now");

    // procfs-backed files are never restored here.
    if (conn->subType() == FileConnection::FILE_PROCFS) {
      continue;
    }

    // A TCP connection that is one end of a socketpair is restored together
    // with its peer, provided the peer is still known to the connection list.
    // If the peer is missing we fall through to the generic path below.
    if (conn->conType() == Connection::TCP) {
      TcpConnection *tcp = static_cast<TcpConnection *>(conn);
      if (tcp->peerType() == TcpConnection::PEER_SOCKETPAIR) {
        ConnectionIdentifier peerId = tcp->getSocketpairPeerId();
        TcpConnection *peer =
          static_cast<TcpConnection *>(conList.getConnection(peerId));
        if (peer != NULL) {
          tcp->restoreSocketPair(_conToFds[id], peer, _conToFds[peerId]);
          continue;
        }
      }
    }

    if (!conn->restoreInSecondIteration()) {
      conn->restore(_conToFds[id], &_rewirer);
    }
  }

  // Pass 2: only the connections that asked to be restored late.
  for (ConnectionList::iterator i = conList.begin();
       i != conList.end();
       ++i) {
    Connection *conn = i->second;

    JASSERT(_conToFds[i->first].size() > 0)
      .Text("stale connections should be gone by now");

    if (conn->subType() == FileConnection::FILE_PROCFS) {
      continue;
    }

    if (conn->restoreInSecondIteration()) {
      conn->restore(_conToFds[i->first], &_rewirer);
    }
  }

  _rewirer.doReconnect();
}