/*
 * Start a dmtcp_coordinator on the local host.
 *
 * The coordinator host/port are obtained from getCoordHostAndPort(mode, ...).
 * The host must name this machine ("localhost", "127.0.0.1", or the current
 * hostname); otherwise the JASSERT below fires, since a coordinator cannot be
 * auto-started on a remote host.
 *
 * A listening socket is created on the requested port, moved to
 * PROTECTED_COORD_FD, and the port actually in use is recorded with
 * setCoordPort().  A child is then forked which exec's
 * "<program dir>/dmtcp_coordinator --quiet --exit-on-last --daemon".
 * The parent closes its copy of PROTECTED_COORD_FD and blocks until the
 * forked child has exited.
 *
 * Failures (remote host, socket creation, fork, exec, waitpid) are reported
 * through JASSERT.
 */
void
startNewCoordinator(CoordinatorMode mode)
{
  string host;
  int port;

  getCoordHostAndPort(mode, host, &port);

  JASSERT(strcmp(host.c_str(), "localhost") == 0 ||
          strcmp(host.c_str(), "127.0.0.1") == 0 ||
          jalib::Filesystem::GetCurrentHostname() == host.c_str())
    (host) (jalib::Filesystem::GetCurrentHostname())
    .Text("Won't automatically start coordinator because DMTCP_HOST"
          " is set to a remote host.");

  // Create the listening socket on the requested port.  errno is cleared
  // *before* the socket is created so that JASSERT_ERRNO below reports the
  // error from the socket setup and not a stale value.
  // NOTE(review): the third argument (128) is presumably the listen backlog.
  errno = 0;
  jalib::JServerSocket coordinatorListenerSocket(jalib::JSockAddr::ANY,
                                                 port,
                                                 128);
  JASSERT(coordinatorListenerSocket.isValid())
    (coordinatorListenerSocket.port()) (JASSERT_ERRNO) (host) (port)
    .Text("Failed to create socket to coordinator port."
          "\nIf msg is \"Address already in use\","
          " this may be an old coordinator."
          "\nEither try again a few seconds or a minute later,"
          "\nOr kill other coordinators on this host and port:"
          "\n dmtcp_command --coord-host XXX --coord-port XXX"
          "\nOr specify --join-coordinator if joining existing computation.");

  // Now move the listener socket to PROTECTED_COORD_FD (presumably so that
  // the exec'd coordinator inherits it at a well-known descriptor).
  coordinatorListenerSocket.changeFd(PROTECTED_COORD_FD);
  setCoordPort(coordinatorListenerSocket.port());

  JTRACE("Starting a new coordinator automatically.")
    (coordinatorListenerSocket.port());

  const pid_t childPid = fork();

  // Without this check a failed fork() would fall into the parent branch,
  // close the listener, and die later in wait with a misleading ECHILD.
  JASSERT(childPid != -1) (JASSERT_ERRNO)
    .Text("fork() failed while starting dmtcp_coordinator");

  if (childPid == 0) {
    /* NOTE:  This code assumes that dmtcp_launch (the current program)
     *  and dmtcp_coordinator are in the same directory.  Namely,
     *  GetProgramDir() gets the dir of the current program (dmtcp_launch).
     *  Hence, if dmtcp_coordinator is in a different directory, then
     *     jalib::Filesystem::GetProgramDir() + "/dmtcp_coordinator"
     *  will not exist, and the child will fail.
     */

    // We can't use Util::getPath() here since the SharedData has not been
    // initialized yet.
    string coordinator =
      jalib::Filesystem::GetProgramDir() + "/dmtcp_coordinator";

    char *modeStr = (char *)"--daemon";
    char *args[] = {
      (char *)coordinator.c_str(),
      (char *)"--quiet",

      /* If we wish to also suppress coordinator warnings, call --quiet twice */
      (char *)"--exit-on-last",
      modeStr,
      NULL
    };
    execv(args[0], args);

    // execv() only returns on failure.
    JASSERT(false)(coordinator)(JASSERT_ERRNO).Text(
      "exec(dmtcp_coordinator) failed");
  } else {
    int status;

    // The parent no longer needs its copy of the listener socket.
    _real_close(PROTECTED_COORD_FD);

    // Wait for the child we just forked (and only that child) to exit.
    JASSERT(waitpid(childPid, &status, 0) == childPid) (JASSERT_ERRNO);
  }
}
void dmtcp::DmtcpCoordinatorAPI::startNewCoordinator(int modes, int isRestart) { int coordinatorStatus = -1; //get location of coordinator const char *coordinatorAddr = getenv ( ENV_VAR_NAME_HOST ); if(coordinatorAddr == NULL) coordinatorAddr = DEFAULT_HOST; const char *coordinatorPortStr = getenv ( ENV_VAR_NAME_PORT ); dmtcp::string s = coordinatorAddr; if(s != "localhost" && s != "127.0.0.1" && s != jalib::Filesystem::GetCurrentHostname()){ JASSERT(false)(s)(jalib::Filesystem::GetCurrentHostname()) .Text("Won't automatically start coordinator because DMTCP_HOST" " is set to a remote host."); _real_exit(DMTCP_FAIL_RC); } if ( modes & COORD_BATCH || modes & COORD_FORCE_NEW ) { // Create a socket and bind it to an unused port. jalib::JServerSocket coordinatorListenerSocket ( jalib::JSockAddr::ANY, 0 ); errno = 0; JASSERT ( coordinatorListenerSocket.isValid() ) ( coordinatorListenerSocket.port() ) ( JASSERT_ERRNO ) .Text ( "Failed to create listen socket." "\nIf msg is \"Address already in use\", this may be an old coordinator." "\nKill other coordinators and try again in a minute or so." ); // Now dup the sockfd to coordinatorListenerSocket.changeFd(PROTECTED_COORD_FD); dmtcp::string coordPort= jalib::XToString(coordinatorListenerSocket.port()); setenv ( ENV_VAR_NAME_PORT, coordPort.c_str(), 1 ); } JTRACE("Starting a new coordinator automatically.") (coordinatorPortStr); if(fork()==0){ dmtcp::string coordinator = jalib::Filesystem::FindHelperUtility("dmtcp_coordinator"); char *modeStr = (char *)"--background"; if ( modes & COORD_BATCH ) { modeStr = (char *)"--batch"; } char * args[] = { (char*)coordinator.c_str(), (char*)"--exit-on-last", modeStr, NULL }; execv(args[0], args); JASSERT(false)(coordinator)(JASSERT_ERRNO).Text("exec(dmtcp_coordinator) failed"); } else { _real_close ( PROTECTED_COORD_FD ); } errno = 0; if ( modes & COORD_BATCH ) { // FIXME: If running in batch Mode, we sleep here for 5 seconds to let // the coordinator get started up. 
We need to fix this in future. sleep(5); } else { JASSERT(wait(&coordinatorStatus)>0)(JASSERT_ERRNO); JASSERT(WEXITSTATUS(coordinatorStatus) == 0) .Text("Failed to start coordinator, port already in use. You may use a different port by running with \'-p 12345\'\n"); } }