/*
 * Open a socket to the coordinator according to `mode`, then hand it to
 * Util::changeFd() together with PROTECTED_COORD_FD.
 *
 * Mode flags are tested in priority order:
 *   COORD_JOIN - connect to an existing coordinator; assert if none is found.
 *   COORD_NEW  - start a new coordinator first, then connect to it.
 *   COORD_ANY  - try an existing coordinator; if the connect fails, start a
 *                new one and connect to that.
 * Any other mode trips a "Not Reached" assertion.
 */
void createNewConnToCoord(CoordinatorMode mode)
{
  int sockfd = -1;

  if (mode & COORD_JOIN) {
    // Joining requires a coordinator that is already running.
    sockfd = createNewSocketToCoordinator(mode);
    JASSERT(sockfd != -1) (JASSERT_ERRNO)
      .Text("Coordinator not found, but --join was specified. Exiting.");
  } else if ((mode & COORD_NEW) || (mode & COORD_ANY)) {
    // COORD_NEW takes precedence over COORD_ANY and skips the initial probe.
    const bool skipProbe = (mode & COORD_NEW) ? true : false;

    if (!skipProbe) {
      sockfd = createNewSocketToCoordinator(mode);
      if (sockfd == -1) {
        JTRACE("Coordinator not found, trying to start a new one.");
      }
    }

    // Reached with sockfd == -1 either because no probe was attempted
    // (COORD_NEW) or because the probe failed (COORD_ANY).
    if (sockfd == -1) {
      startNewCoordinator(mode);
      sockfd = createNewSocketToCoordinator(mode);
      JASSERT(sockfd != -1) (JASSERT_ERRNO)
        .Text("Error connecting to newly started coordinator.");
    }
  } else {
    JASSERT(false).Text("Not Reached");
  }

  // NOTE(review): presumably this relocates the socket onto the reserved
  // coordinator descriptor, which the check below then validates -- confirm
  // against Util::changeFd() and the definition of coordinatorSocket.
  Util::changeFd(sockfd, PROTECTED_COORD_FD);
  JASSERT(Util::isValidFd(coordinatorSocket));
}
void dmtcp::DmtcpCoordinatorAPI::startCoordinatorIfNeeded(int modes, int isRestart) { const static int CS_OK = DMTCP_FAIL_RC+1; const static int CS_NO = DMTCP_FAIL_RC+2; int coordinatorStatus = -1; if (modes & COORD_BATCH) { startNewCoordinator ( modes, isRestart ); return; } //fork a child process to probe the coordinator if (fork()==0) { //fork so if we hit an error parent won't die dup2(2,1); //copy stderr to stdout dup2(open("/dev/null",O_RDWR), 2); //close stderr int result[DMTCPMESSAGE_NUM_PARAMS]; DmtcpCoordinatorAPI coordinatorAPI; { if ( coordinatorAPI.tryConnectToCoordinator() == false ) { JTRACE("Coordinator not found. Will try to start a new one."); _real_exit(DMTCP_FAIL_RC); } } coordinatorAPI.sendUserCommand('s',result); coordinatorAPI._coordinatorSocket.close(); // result[0] == numPeers of coord; bool result[1] == computation is running if(result[0]==0 || result[1] ^ isRestart){ if(result[0] != 0) { int num_processes = result[0]; JTRACE("Joining existing computation.") (num_processes); } _real_exit(CS_OK); }else{ JTRACE("Existing computation not in a running state," \ " perhaps checkpoint in progress?"); _real_exit(CS_NO); } } errno = 0; // FIXME: wait() could return -1 if a signal happened before child exits JASSERT(::wait(&coordinatorStatus)>0)(JASSERT_ERRNO); JASSERT(WIFEXITED(coordinatorStatus)); //is coordinator running? if (WEXITSTATUS(coordinatorStatus) != CS_OK) { //is coordinator in funny state? if(WEXITSTATUS(coordinatorStatus) == CS_NO){ JASSERT (false) (isRestart) .Text ("Coordinator in a funny state. Peers exist, not restarting," \ "\n but not in a running state. Often this means you are" \ " connecting to\n a stale coordinator from a previous" \ " computation.\n Try killing the other coordinator," \ " or using a different port for the new comp."); }else if (WEXITSTATUS(coordinatorStatus) == DMTCP_FAIL_RC) { JTRACE("Coordinator not found. Starting a new one."); }else{ JTRACE("Bad result found for coordinator. 
Will try start a new one."); } startNewCoordinator ( modes, isRestart ); }else{ if (modes & COORD_FORCE_NEW) { JTRACE("Forcing new coordinator. --new-coordinator flag given."); startNewCoordinator ( modes, isRestart ); return; } JASSERT( modes & COORD_JOIN ) .Text("Coordinator already running, but '--new' flag was given."); } }