bool dmtcp::DmtcpWorker::waitForStage2bCheckpoint() { waitForCoordinatorMsg ( "PEER_LOOKUP", DMT_DO_PEER_LOOKUP ); JTRACE ( "Looking up Socket Peers..." ); theTcpConnections.clear(); theCheckpointState->preCheckpointPeerLookup(theTcpConnections); sendPeerLookupRequest(theTcpConnections); JTRACE ( "Done Socket Peer Lookup" ); WorkerState::setCurrentState ( WorkerState::PEER_LOOKUP_COMPLETE ); { dmtcp::DmtcpMessage msg; msg.type = DMT_OK; msg.state = WorkerState::currentState(); _coordinatorSocket << msg; JTRACE ( "waiting for DRAIN/RESUME message" ); do { msg.poison(); _coordinatorSocket >> msg; msg.assertValid(); if ( msg.type == DMT_KILL_PEER ) { JTRACE ( "Received KILL message from coordinator, exiting" ); _exit ( 0 ); } JTRACE ( "received message" ) (msg.type ); if ( msg.type != DMT_UNKNOWN_PEER ) break; JTRACE ("received DMT_UNKNOWN_PEER message") (msg.conId); TcpConnection* con = (TcpConnection*) &( ConnectionList::instance() [msg.conId] ); con->markExternal(); externalTcpConnections.push_back(msg.conId); _waitingForExternalSocketsToClose = true; } while ( msg.type == DMT_UNKNOWN_PEER ); JASSERT ( msg.type == DMT_DO_DRAIN || msg.type == DMT_DO_RESUME ) ( msg.type ); ConnectionList& connections = ConnectionList::instance(); // Tcp Accept and Connect connection with PeerType UNKNOWN should be marked as INTERNAL for ( ConnectionList::iterator i = connections.begin() ; i!= connections.end() ; ++i ) { Connection* con = i->second; if ( con->conType() == Connection::TCP ) { TcpConnection* tcpCon = (TcpConnection *) con; if ( (tcpCon->tcpType() == TcpConnection::TCP_ACCEPT || tcpCon->tcpType() == TcpConnection::TCP_CONNECT) && tcpCon->peerType() == TcpConnection::PEER_UNKNOWN ) tcpCon->markInternal(); } } if ( msg.type == DMT_DO_RESUME ) { JTRACE ( "Peer Lookup not complete, skipping checkpointing \n\n\n\n\n"); return false; } JASSERT (msg.type == DMT_DO_DRAIN); } }
void dmtcp::ConnectionState::doReconnect(jalib::JSocket& coordinator, jalib::JSocket& restoreListen) { _rewirer.addDataSocket(new jalib::JChunkReader(coordinator, sizeof(DmtcpMessage))); _rewirer.addListenSocket(restoreListen); _rewirer.setCoordinatorFd(coordinator.sockfd()); handleDuplicateFilesInSeparateConnections(); ConnectionList& connections = ConnectionList::instance(); // Here we modify the restore algorithm by splitting it in two parts. In the // first part we restore all the connection except the PTY_SLAVE types and in // the second part we restore only PTY_SLAVE connections. This is done to // make sure that by the time we are trying to restore a PTY_SLAVE // connection, its corresponding PTY_MASTER connection has already been // restored. // UPDATE: We also restore the files for which the we didn't have the lock in // second iteration along with PTY_SLAVEs // Part 1: Restore all but Pseudo-terminal slaves and file connection which // were not checkpointed ConnectionList::iterator i; for (i= connections.begin(); i != connections.end(); ++i) { ConnectionIdentifier id = i->first; Connection *con = i->second; JASSERT(_conToFds[id].size() > 0) .Text("stale connections should be gone by now"); if (con->subType() == FileConnection::FILE_PROCFS) { continue; } if (con->conType() == Connection::TCP) { TcpConnection *tcpCon =(TcpConnection *) con; if (tcpCon->peerType() == TcpConnection::PEER_SOCKETPAIR) { ConnectionIdentifier peerId = tcpCon->getSocketpairPeerId(); TcpConnection *peerCon = (TcpConnection*) connections.getConnection(peerId); if (peerCon != NULL) { tcpCon->restoreSocketPair(_conToFds[id], peerCon, _conToFds[peerId]); continue; } } } if (con->restoreInSecondIteration() == false) { con->restore(_conToFds[id], &_rewirer); } } // Part 2: Restore all Pseudo-terminal slaves and file connections that were // not checkpointed. for (i = connections.begin(); i != connections.end(); ++i) { Connection *con = i->second; JASSERT(_conToFds[i->first].size() > 0) .Text("stale connections should be gone by now"); if (con->subType() == FileConnection::FILE_PROCFS) { continue; } if (con->restoreInSecondIteration() == true) { con->restore(_conToFds[i->first], &_rewirer); } } _rewirer.doReconnect(); }