char* connectAndSendUserCommand(char c, int *coordCmdStatus, int *numPeers, int *isRunning, int *ckptInterval) { char *replyData = NULL; int coordFd = createNewSocketToCoordinator(COORD_ANY); if (coordFd == -1) { *coordCmdStatus = CoordCmdStatus::ERROR_COORDINATOR_NOT_FOUND; return replyData; } // Tell the coordinator to run given user command DmtcpMessage msg(DMT_USER_CMD); msg.coordCmd = c; if (c == 'i') { const char *interval = getenv(ENV_VAR_CKPT_INTR); if (interval != NULL) { msg.theCheckpointInterval = jalib::StringToInt(interval); } } JASSERT(Util::writeAll(coordFd, &msg, sizeof(msg)) == sizeof(msg)); // The coordinator will violently close our socket... if (c == 'q' || c == 'Q') { *coordCmdStatus = CoordCmdStatus::NOERROR; return replyData; } // Receive REPLY DmtcpMessage reply; reply.poison(); recvMsgFromCoordinatorRaw(coordFd, &reply, (void**)&replyData); reply.assertValid(); JASSERT(reply.type == DMT_USER_CMD_RESULT); if (coordCmdStatus != NULL) { *coordCmdStatus = reply.coordCmdStatus; } if (numPeers != NULL) { *numPeers = reply.numPeers; } if (isRunning != NULL) { *isRunning = reply.isRunning; } if (ckptInterval != NULL) { *ckptInterval = reply.theCheckpointInterval; } _real_close(coordFd); return replyData; }
/**
 * Performs the initial handshake with the coordinator over `fd`.
 *
 * Fills in the real pid and checkpoint interval on `msg`, sends it together
 * with an extra payload of "<hostname>\0<progname>\0", then reads the
 * coordinator's answer into the same message object.
 *
 * @param fd        Socket connected to the coordinator.
 * @param msg       Handshake message (taken by value and reused for the reply).
 * @param progname  Program name appended to the payload after the hostname.
 * @param compId    Computation id reported when the coordinator answers
 *                  DMT_REJECT_WRONG_COMP; must be non-NULL in that case.
 * @return The coordinator's reply; its type is DMT_ACCEPT (asserted).
 *
 * A DMT_KILL_PEER reply terminates the process via _real_exit(0); the
 * DMT_REJECT_* replies end in a failed JASSERT.
 */
DmtcpMessage sendRecvHandshake(int fd,
                               DmtcpMessage msg,
                               string progname,
                               UniquePid *compId)
{
  if (dmtcp_virtual_to_real_pid) {
    msg.realPid = dmtcp_virtual_to_real_pid(getpid());
  } else {
    msg.realPid = getpid();
  }

  msg.theCheckpointInterval = getCkptInterval();

  string hostname = jalib::Filesystem::GetCurrentHostname();

  // Build "<hostname>\0<progname>\0" in a string-managed buffer rather than a
  // variable-length stack array: VLAs are a compiler extension in C++ and put
  // an unbounded amount of data on the stack.
  string payload;
  payload.reserve(hostname.length() + progname.length() + 2);
  payload.append(hostname.c_str());
  payload.push_back('\0');
  payload.append(progname.c_str());
  payload.push_back('\0');

  sendMsgToCoordinatorRaw(fd, msg, &payload[0], payload.length());

  recvMsgFromCoordinatorRaw(fd, &msg);
  msg.assertValid();

  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _real_exit(0);
  }

  if (msg.type == DMT_REJECT_NOT_RUNNING) {
    JASSERT(false)
    .Text("Connection rejected by the coordinator.\n"
          "Reason: Current computation not in RUNNING state.\n"
          " Is a checkpoint/restart in progress?");
  } else if (msg.type == DMT_REJECT_WRONG_COMP) {
    JASSERT(compId != NULL);
    JASSERT(false) (*compId)
    .Text("Connection rejected by the coordinator.\n"
          " Reason: This process has a different computation group.");
  }

  // Coordinator also prints this, but its stderr may go to /dev/null
  if (msg.type == DMT_REJECT_NOT_RESTARTING) {
    string coordinatorHost = ""; // C++ magic code; "" to be invisibly replaced
    int coordinatorPort;
    getCoordHostAndPort(COORD_ANY, coordinatorHost, &coordinatorPort);
    JNOTE ("\n\n*** Computation not in RESTARTING or CHECKPOINTED state."
           "\n***Can't join the existing coordinator, as it is serving a"
           "\n***different computation. Consider launching a new coordinator."
           "\n***Consider, also, checking with: dmtcp_command --status")
      (coordinatorPort);
  }

  // Anything other than an ACCEPT at this point is fatal.
  JASSERT(msg.type == DMT_ACCEPT)(msg.type);
  return msg;
}
void DmtcpWorker::waitForSuspendMessage() { SharedData::resetBarrierInfo(); if (dmtcp_no_coordinator()) { string shmFile = jalib::Filesystem::GetDeviceName(PROTECTED_SHM_FD); JASSERT(!shmFile.empty()); unlink(shmFile.c_str()); CoordinatorAPI::waitForCheckpointCommand(); ProcessInfo::instance().numPeers(1); ProcessInfo::instance().compGroup(SharedData::getCompId()); return; } if (ThreadSync::destroyDmtcpWorkerLockTryLock() != 0) { JTRACE("User thread is performing exit()." " ckpt thread exit()ing as well"); ckptThreadPerformExit(); } if (exitInProgress()) { ThreadSync::destroyDmtcpWorkerLockUnlock(); ckptThreadPerformExit(); } JTRACE("waiting for SUSPEND message"); DmtcpMessage msg; CoordinatorAPI::recvMsgFromCoordinator(&msg); if (exitInProgress()) { ThreadSync::destroyDmtcpWorkerLockUnlock(); ckptThreadPerformExit(); } msg.assertValid(); if (msg.type == DMT_KILL_PEER) { JTRACE("Received KILL message from coordinator, exiting"); _exit(0); } JASSERT(msg.type == DMT_DO_SUSPEND) (msg.type); // Coordinator sends some computation information along with the SUSPEND // message. Extracting that. SharedData::updateGeneration(msg.compGroup.computationGeneration()); JASSERT(SharedData::getCompId() == msg.compGroup.upid()) (SharedData::getCompId()) (msg.compGroup); _exitAfterCkpt = msg.exitAfterCkpt; }
pid_t dmtcp::DmtcpCoordinatorAPI::getVirtualPidFromCoordinator() { connectToCoordinator(); DmtcpMessage msg(DMT_GET_VIRTUAL_PID); _coordinatorSocket << msg; DmtcpMessage reply; reply.poison(); _coordinatorSocket >> reply; reply.assertValid(); JASSERT(reply.type == DMT_GET_VIRTUAL_PID_RESULT) (reply.type); JASSERT(reply.virtualPid != -1); _coordinatorSocket.close(); return reply.virtualPid; }
// On input, val points to a buffer in user memory and *val_len is the maximum // size of that buffer (the memory allocated by user). // On output, we copy data to val, and set *val_len to the actual buffer size // (to the size of the data that we copied to the user buffer). int CoordinatorAPI::sendQueryToCoordinator(const char *id, const void *key, uint32_t key_len, void *val, uint32_t *val_len) { DmtcpMessage msg (DMT_NAME_SERVICE_QUERY); JWARNING(strlen(id) < sizeof(msg.nsid)); strncpy(msg.nsid, id, 8); msg.keyLen = key_len; msg.valLen = 0; msg.extraBytes = key_len; jalib::JSocket sock = _coordinatorSocket; if (key == NULL || key_len == 0 || val == NULL || val_len == 0) { return 0; } if (dmtcp_is_running_state()) { if (!_nsSock.isValid()) { _nsSock = createNewSocketToCoordinator(COORD_ANY); JASSERT(_nsSock.isValid()); _nsSock.changeFd(PROTECTED_NS_FD); DmtcpMessage m(DMT_NAME_SERVICE_WORKER); _nsSock << m; } sock = _nsSock; JASSERT(sock.isValid()); } sock << msg; sock.writeAll((const char *)key, key_len); msg.poison(); sock >> msg; msg.assertValid(); JASSERT(msg.type == DMT_NAME_SERVICE_QUERY_RESPONSE && msg.extraBytes == msg.valLen); JASSERT (*val_len >= msg.valLen); *val_len = msg.valLen; if (*val_len > 0) { sock.readAll((char*)val, *val_len); } return *val_len; }
void DmtcpWorker::acknowledgeSuspendMsg() { if (dmtcp_no_coordinator()) { return; } JTRACE("Waiting for DMT_DO_CHECKPOINT message"); CoordinatorAPI::sendMsgToCoordinator(DmtcpMessage(DMT_OK)); DmtcpMessage msg; CoordinatorAPI::recvMsgFromCoordinator(&msg); msg.assertValid(); JASSERT(msg.type == DMT_COMPUTATION_INFO) (msg.type); JTRACE("Computation information") (msg.compGroup) (msg.numPeers); ProcessInfo::instance().compGroup(msg.compGroup); ProcessInfo::instance().numPeers(msg.numPeers); }
void waitForBarrier(const string& barrierId) { sendMsgToCoordinator(DmtcpMessage(DMT_OK)); JTRACE("waiting for DMT_BARRIER_RELEASED message"); char *extraData = NULL; DmtcpMessage msg; recvMsgFromCoordinator(&msg, (void**)&extraData); msg.assertValid(); if (msg.type == DMT_KILL_PEER) { JTRACE("Received KILL message from coordinator, exiting"); _exit(0); } JASSERT(msg.type == DMT_BARRIER_RELEASED) (msg.type); JASSERT(extraData != NULL); JASSERT(barrierId == extraData) (barrierId) (extraData); JALLOC_FREE(extraData); }
/**
 * Performs the handshake with the coordinator over _coordinatorSocket.
 *
 * Sends `msg` (with real pid, checkpoint interval and extraBytes filled in),
 * followed by the NUL-terminated hostname and the NUL-terminated program
 * name, then reads the coordinator's answer into the same message object.
 *
 * @param msg       Handshake message (taken by value and reused for the reply).
 * @param progname  Program name sent after the hostname.
 * @param compId    Computation id reported when the coordinator answers
 *                  DMT_REJECT_WRONG_COMP; must be non-NULL in that case.
 * @return The coordinator's reply; its type is DMT_ACCEPT (asserted).
 */
DmtcpMessage CoordinatorAPI::sendRecvHandshake(DmtcpMessage msg,
                                               string progname,
                                               UniquePid *compId)
{
  if (dmtcp_virtual_to_real_pid) {
    msg.realPid = dmtcp_virtual_to_real_pid(getpid());
  } else {
    msg.realPid = getpid();
  }

  msg.theCheckpointInterval = getCkptInterval();

  string hostname = jalib::Filesystem::GetCurrentHostname();

  // Extra payload: hostname and program name, each with its terminating NUL.
  msg.extraBytes = hostname.length() + 1 + progname.length() + 1;

  _coordinatorSocket << msg;
  _coordinatorSocket.writeAll(hostname.c_str(), hostname.length() + 1);
  _coordinatorSocket.writeAll(progname.c_str(), progname.length() + 1);

  // Reuse msg for the reply; poison it so assertValid() catches a bad read.
  msg.poison();
  _coordinatorSocket >> msg;
  msg.assertValid();

  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _real_exit (0);
  }

  if (msg.type == DMT_REJECT_NOT_RUNNING) {
    JASSERT(false)
      .Text("Connection rejected by the coordinator.\n"
            "Reason: Current computation not in RUNNING state.\n"
            " Is a checkpoint/restart in progress?");
  } else if (msg.type == DMT_REJECT_WRONG_COMP) {
    JASSERT(compId != NULL);
    JASSERT(false) (*compId)
      .Text("Connection rejected by the coordinator.\n"
            " Reason: This process has a different computation group.");
  }

  JASSERT(msg.type == DMT_ACCEPT)(msg.type);
  return msg;
}
void DmtcpWorker::waitForSuspendMessage() { SharedData::resetBarrierInfo(); if (dmtcp_no_coordinator()) { string shmFile = jalib::Filesystem::GetDeviceName(PROTECTED_SHM_FD); JASSERT(!shmFile.empty()); unlink(shmFile.c_str()); CoordinatorAPI::waitForCheckpointCommand(); ProcessInfo::instance().numPeers(1); ProcessInfo::instance().compGroup(SharedData::getCompId()); return; } JTRACE("waiting for SUSPEND message"); DmtcpMessage msg; CoordinatorAPI::recvMsgFromCoordinator(&msg); // Before validating message; make sure we are not exiting. if (exitInProgress) { ckptThreadPerformExit(); } msg.assertValid(); JASSERT(msg.type == DMT_DO_SUSPEND) (msg.type); // Coordinator sends some computation information along with the SUSPEND // message. Extracting that. SharedData::updateGeneration(msg.compGroup.computationGeneration()); JASSERT(SharedData::getCompId() == msg.compGroup.upid()) (SharedData::getCompId()) (msg.compGroup); exitAfterCkpt = msg.exitAfterCkpt; }
void dmtcp::ConnectionRewirer::onConnect ( const jalib::JSocket& sock, const struct sockaddr* /*remoteAddr*/,socklen_t /*remoteLen*/ ) { jalib::JSocket remote = sock; DmtcpMessage msg; msg.poison(); remote >> msg; msg.assertValid(); JASSERT ( msg.type == DMT_RESTORE_RECONNECTED ) ( msg.type ).Text ( "unexpected message" ); iterator i = _pendingIncoming.find ( msg.restorePid ); JASSERT ( i != _pendingIncoming.end() ) ( msg.restorePid ) .Text ( "got unexpected incoming restore request" ); const dmtcp::vector<int>& fds = i->second; JASSERT ( fds.size() > 0 ); int fd0 = fds[0]; remote.changeFd ( fd0 ); JTRACE ( "restoring incoming connection" ) ( msg.restorePid ) ( fd0 ) ( fds.size() ); for ( size_t i = 1; i<fds.size(); ++i ) { JTRACE ( "restoring extra fd" ) ( fd0 ) ( fds[i] ); JASSERT ( _real_dup2 ( fd0,fds[i] ) == fds[i] ) ( fd0 ) ( fds[i] ) ( msg.restorePid ) .Text ( "dup2() failed" ); } _pendingIncoming.erase ( i ); if ( pendingCount() ==0 ) finishup(); #ifdef DEBUG else debugPrint(); #endif }
/**
 * Waits for a message of the given type from the coordinator.
 *
 * @param msgStr  Human-readable name of the awaited message (trace output only).
 * @param type    Message type the coordinator is expected to send.
 *
 * Without a coordinator, only DMT_DO_SUSPEND does any work: the file backing
 * PROTECTED_SHM_FD is unlinked, the call waits for a checkpoint command, and
 * the process records itself as the only peer of its own computation group.
 *
 * With a coordinator, every type except DMT_DO_SUSPEND first reports DMT_OK
 * with the current worker state. The received message must match `type`
 * (asserted); DMT_KILL_PEER exits the process instead.
 */
void DmtcpWorker::waitForCoordinatorMsg(string msgStr, DmtcpMessageType type)
{
  // `type` is never modified, so the comparison is evaluated once.
  const bool awaitingSuspend = (type == DMT_DO_SUSPEND);

  if (dmtcp_no_coordinator()) {
    if (awaitingSuspend) {
      string shmFile = jalib::Filesystem::GetDeviceName(PROTECTED_SHM_FD);
      JASSERT(!shmFile.empty());
      unlink(shmFile.c_str());

      CoordinatorAPI::instance().waitForCheckpointCommand();

      ProcessInfo::instance().numPeers(1);
      ProcessInfo::instance().compGroup(SharedData::getCompId());
    }
    return;
  }

  if (awaitingSuspend) {
    if (ThreadSync::destroyDmtcpWorkerLockTryLock() != 0) {
      JTRACE("User thread is performing exit()."
             " ckpt thread exit()ing as well");
      ckptThreadPerformExit();
    }
    if (exitInProgress()) {
      ThreadSync::destroyDmtcpWorkerLockUnlock();
      ckptThreadPerformExit();
    }
  }

  DmtcpMessage msg;

  if (awaitingSuspend) {
    // Make a dummy syscall to inform superior of our status before we go into
    // select. If ptrace is disabled, this call has no significant effect.
    _real_syscall(DMTCP_FAKE_SYSCALL);
  } else {
    msg.type = DMT_OK;
    msg.state = WorkerState::currentState();
    CoordinatorAPI::instance().sendMsgToCoordinator(msg);
  }

  JTRACE("waiting for " + msgStr + " message");
  CoordinatorAPI::instance().recvMsgFromCoordinator(&msg);

  // Check for exit again after the receive, before looking at the message.
  if (awaitingSuspend && exitInProgress()) {
    ThreadSync::destroyDmtcpWorkerLockUnlock();
    ckptThreadPerformExit();
  }

  msg.assertValid();

  if (msg.type == DMT_KILL_PEER) {
    JTRACE("Received KILL message from coordinator, exiting");
    _exit (0);
  }

  JASSERT(msg.type == type) (msg.type) (type);

  // Coordinator sends some computation information along with the SUSPEND
  // message.  Extracting that.
  if (awaitingSuspend) {
    SharedData::updateGeneration(msg.compGroup.computationGeneration());
    JASSERT(SharedData::getCompId() == msg.compGroup.upid())
      (SharedData::getCompId()) (msg.compGroup);
  } else if (type == DMT_DO_FD_LEADER_ELECTION) {
    JTRACE("Computation information") (msg.compGroup) (msg.numPeers);
    ProcessInfo::instance().compGroup(msg.compGroup);
    ProcessInfo::instance().numPeers(msg.numPeers);
  }
}