bool dmtcp::DmtcpWorker::waitForStage2bCheckpoint()
{
  waitForCoordinatorMsg ( "PEER_LOOKUP", DMT_DO_PEER_LOOKUP );
  JTRACE ( "Looking up Socket Peers..." );
  theTcpConnections.clear();
  theCheckpointState->preCheckpointPeerLookup(theTcpConnections);
  sendPeerLookupRequest(theTcpConnections);
  JTRACE ( "Done Socket Peer Lookup" );


  WorkerState::setCurrentState ( WorkerState::PEER_LOOKUP_COMPLETE );

  {
    dmtcp::DmtcpMessage msg;

    msg.type = DMT_OK;
    msg.state = WorkerState::currentState();
    _coordinatorSocket << msg;

    JTRACE ( "waiting for DRAIN/RESUME message" );

    do {
      msg.poison();
      _coordinatorSocket >> msg;
      msg.assertValid();

      if ( msg.type == DMT_KILL_PEER ) {
        JTRACE ( "Received KILL message from coordinator, exiting" );
        _exit ( 0 );
      }
      JTRACE ( "received message" ) (msg.type );
      if ( msg.type != DMT_UNKNOWN_PEER )
        break;

      JTRACE ("received DMT_UNKNOWN_PEER message") (msg.conId);

      TcpConnection* con =
        (TcpConnection*) &( ConnectionList::instance() [msg.conId] );
      con->markExternal();
      externalTcpConnections.push_back(msg.conId);
      _waitingForExternalSocketsToClose = true;

    } while ( msg.type == DMT_UNKNOWN_PEER );

    JASSERT ( msg.type == DMT_DO_DRAIN || msg.type == DMT_DO_RESUME )
            ( msg.type );

    ConnectionList& connections = ConnectionList::instance();

    // Tcp Accept and Connect connection with PeerType UNKNOWN should be marked as INTERNAL
    for ( ConnectionList::iterator i = connections.begin()
        ; i!= connections.end()
        ; ++i )
    {
      Connection* con =  i->second;
      if ( con->conType() == Connection::TCP ) {
        TcpConnection* tcpCon = (TcpConnection *) con;
        if ( (tcpCon->tcpType() == TcpConnection::TCP_ACCEPT ||
             tcpCon->tcpType() == TcpConnection::TCP_CONNECT) &&
             tcpCon->peerType() == TcpConnection::PEER_UNKNOWN )
          tcpCon->markInternal();
      }
    }
    if ( msg.type == DMT_DO_RESUME ) {
      JTRACE ( "Peer Lookup not complete, skipping checkpointing \n\n\n\n\n");
      return false;
    }

    JASSERT (msg.type == DMT_DO_DRAIN);
  }
}
Example #2
0
void dmtcp::ConnectionState::doReconnect(jalib::JSocket& coordinator,
                                         jalib::JSocket& restoreListen)
{
  _rewirer.addDataSocket(new jalib::JChunkReader(coordinator,
                                                 sizeof(DmtcpMessage)));
  _rewirer.addListenSocket(restoreListen);
  _rewirer.setCoordinatorFd(coordinator.sockfd());

  handleDuplicateFilesInSeparateConnections();

  ConnectionList& connections = ConnectionList::instance();

  // Here we modify the restore algorithm by splitting it in two parts. In the
  // first part we restore all the connection except the PTY_SLAVE types and in
  // the second part we restore only PTY_SLAVE connections. This is done to
  // make sure that by the time we are trying to restore a PTY_SLAVE
  // connection, its corresponding PTY_MASTER connection has already been
  // restored.
  // UPDATE: We also restore the files for which the we didn't have the lock in
  //         second iteration along with PTY_SLAVEs
  // Part 1: Restore all but Pseudo-terminal slaves and file connection which
  //         were not checkpointed
  ConnectionList::iterator i;
  for (i= connections.begin(); i != connections.end(); ++i) {
    ConnectionIdentifier id = i->first;
    Connection *con = i->second;

    JASSERT(_conToFds[id].size() > 0)
      .Text("stale connections should be gone by now");

    if (con->subType() == FileConnection::FILE_PROCFS) {
      continue;
    }

    if (con->conType() == Connection::TCP) {
      TcpConnection *tcpCon =(TcpConnection *) con;
      if (tcpCon->peerType() == TcpConnection::PEER_SOCKETPAIR) {
        ConnectionIdentifier peerId = tcpCon->getSocketpairPeerId();
        TcpConnection *peerCon =
          (TcpConnection*) connections.getConnection(peerId);
        if (peerCon != NULL) {
          tcpCon->restoreSocketPair(_conToFds[id], peerCon, _conToFds[peerId]);
          continue;
        }
      }
    }

    if (con->restoreInSecondIteration() == false) {
      con->restore(_conToFds[id], &_rewirer);
    }
  }

  // Part 2: Restore all Pseudo-terminal slaves and file connections that were
  //         not checkpointed.
  for (i = connections.begin(); i != connections.end(); ++i) {
    Connection *con = i->second;
    JASSERT(_conToFds[i->first].size() > 0)
      .Text("stale connections should be gone by now");

    if (con->subType() == FileConnection::FILE_PROCFS) {
      continue;
    }

    if (con->restoreInSecondIteration() == true) {
      con->restore(_conToFds[i->first], &_rewirer);
    }
  }
  _rewirer.doReconnect();
}