/** * Starts the agent process. May throw arbitrary exceptions. */ virtual pid_t start() { this_thread::disable_interruption di; this_thread::disable_syscall_interruption dsi; string exeFilename = getExeFilename(); SocketPair fds; int e, ret; pid_t pid; /* Create feedback fd for this agent process. We'll send some startup * arguments to this agent process through this fd, and we'll receive * startup information through it as well. */ fds = createUnixSocketPair(); pid = syscalls::fork(); if (pid == 0) { // Child /* Make sure file descriptor FEEDBACK_FD refers to the newly created * feedback fd (fds[1]) and close all other file descriptors. * In this child process we don't care about the original FEEDBACK_FD * (which is the Watchdog's communication channel to the agents starter.) * * fds[1] is guaranteed to be != FEEDBACK_FD because the watchdog * is started with FEEDBACK_FD already assigned. */ syscalls::close(fds[0]); if (syscalls::dup2(fds[1], FEEDBACK_FD) == -1) { /* Something went wrong, report error through feedback fd. */ e = errno; try { writeArrayMessage(fds[1], "system error before exec", "dup2() failed", toString(e).c_str(), NULL); _exit(1); } catch (...) { fprintf(stderr, "Passenger Watchdog: dup2() failed: %s (%d)\n", strerror(e), e); fflush(stderr); _exit(1); } } closeAllFileDescriptors(FEEDBACK_FD); /* Become the process group leader so that the watchdog can kill the * agent as well as all its descendant processes. */ setpgid(getpid(), getpid()); setOomScore(oldOomScore); try { execProgram(); } catch (...) { fprintf(stderr, "PassengerWatchdog: execProgram() threw an exception\n"); fflush(stderr); _exit(1); } e = errno; try { writeArrayMessage(FEEDBACK_FD, "exec error", toString(e).c_str(), NULL); } catch (...) 
{ fprintf(stderr, "Passenger Watchdog: could not execute %s: %s (%d)\n", exeFilename.c_str(), strerror(e), e); fflush(stderr); } _exit(1); } else if (pid == -1) { // Error e = errno; throw SystemException("Cannot fork a new process", e); } else { // Parent FileDescriptor feedbackFd = fds[0]; vector<string> args; fds[1].close(); this_thread::restore_interruption ri(di); this_thread::restore_syscall_interruption rsi(dsi); ScopeGuard failGuard(boost::bind(killAndWait, pid)); /* Send startup arguments. Ignore EPIPE and ECONNRESET here * because the child process might have sent an feedback message * without reading startup arguments. */ try { sendStartupArguments(pid, feedbackFd); } catch (const SystemException &ex) { if (ex.code() != EPIPE && ex.code() != ECONNRESET) { throw SystemException(string("Unable to start the ") + name() + ": an error occurred while sending startup arguments", ex.code()); } } // Now read its feedback. try { ret = readArrayMessage(feedbackFd, args); } catch (const SystemException &e) { if (e.code() == ECONNRESET) { ret = false; } else { throw SystemException(string("Unable to start the ") + name() + ": unable to read its startup information", e.code()); } } if (!ret) { this_thread::disable_interruption di2; this_thread::disable_syscall_interruption dsi2; int status; /* The feedback fd was prematurely closed for an unknown reason. * Did the agent process crash? * * We use timedWaitPid() here because if the process crashed * because of an uncaught exception, the file descriptor * might be closed before the process has printed an error * message, so we give it some time to print the error * before we kill it. */ ret = timedWaitPid(pid, &status, 5000); if (ret == 0) { /* Doesn't look like it; it seems it's still running. * We can't do anything without proper feedback so kill * the agent process and throw an exception. 
*/ failGuard.runNow(); throw RuntimeException(string("Unable to start the ") + name() + ": it froze and reported an unknown error during its startup"); } else if (ret != -1 && WIFSIGNALED(status)) { /* Looks like a crash which caused a signal. */ throw RuntimeException(string("Unable to start the ") + name() + ": it seems to have been killed with signal " + getSignalName(WTERMSIG(status)) + " during startup"); } else if (ret == -1) { /* Looks like it exited after detecting an error. */ throw RuntimeException(string("Unable to start the ") + name() + ": it seems to have crashed during startup for an unknown reason"); } else { /* Looks like it exited after detecting an error, but has an exit code. */ throw RuntimeException(string("Unable to start the ") + name() + ": it seems to have crashed during startup for an unknown reason, " "with exit code " + toString(WEXITSTATUS(status))); } } if (args[0] == "system error before exec") { throw SystemException(string("Unable to start the ") + name() + ": " + args[1], atoi(args[2])); } else if (args[0] == "exec error") { e = atoi(args[1]); if (e == ENOENT) { throw RuntimeException(string("Unable to start the ") + name() + " because its executable (" + getExeFilename() + ") " "doesn't exist. This probably means that your " "Phusion Passenger installation is broken or " "incomplete. Please reinstall Phusion Passenger"); } else { throw SystemException(string("Unable to start the ") + name() + " because exec(\"" + getExeFilename() + "\") failed", atoi(args[1])); } } else if (!processStartupInfo(pid, feedbackFd, args)) { throw RuntimeException(string("The ") + name() + " sent an unknown startup info message '" + args[0] + "'"); } lock_guard<boost::mutex> l(lock); this->feedbackFd = feedbackFd; this->pid = pid; failGuard.clear(); return pid; } }
/**
 * Main function of the watcher thread: keeps the agent process running.
 *
 * Loops until thread interruption is requested. In each iteration it starts
 * the agent if no PID is recorded, waits for the agent to exit, resets the
 * recorded PID to 0 and logs why the agent went away before restarting it.
 * The loop (and the thread) ends when the agent exits with status 0.
 *
 * Any exception other than boost::thread_interrupted is recorded in
 * threadExceptionMessage (plus threadExceptionBacktrace for
 * tracable_exception) under `lock`, after which errorEvent is notified.
 *
 * @param self Shared pointer to this watcher. It is not used in the body;
 *             presumably it keeps the object alive for as long as the
 *             thread runs (confirm at the thread creation site).
 */
void threadMain(shared_ptr<AgentWatcher> self) {
	try {
		pid_t pid, ret;
		int status, e;

		while (!this_thread::interruption_requested()) {
			{
				lock_guard<boost::mutex> l(lock);
				pid = this->pid;
			}

			// Process can be started before the watcher thread is launched.
			if (pid == 0) {
				pid = start();
			}
			ret = syscalls::waitpid(pid, &status, 0);
			if (ret == -1 && errno == ECHILD) {
				/* If the agent is attached to gdb then waitpid()
				 * here can return -1 with errno == ECHILD.
				 * Fallback to kill() polling for checking
				 * whether the agent is alive.
				 *
				 * NOTE(review): status is forced to 0 here, which the
				 * classification below presumably treats as a clean exit
				 * with status 0 (stopping the watcher) - confirm that
				 * this is intended.
				 */
				ret = pid;
				status = 0;
				P_WARN("waitpid() on " << name() << " (pid=" << pid <<
					") returned -1 with " <<
					"errno = ECHILD, falling back to kill polling");
				waitpidUsingKillPolling(pid);
				e = 0;
			} else {
				// Only meaningful when ret == -1; saved before errno can be clobbered.
				e = errno;
			}

			{
				lock_guard<boost::mutex> l(lock);
				this->pid = 0;
			}

			this_thread::disable_interruption di;
			this_thread::disable_syscall_interruption dsi;
			if (ret == -1) {
				P_WARN(name() << " (pid=" << pid << ") crashed or killed for "
					"an unknown reason (errno = " <<
					strerror(e) << "), restarting it...");
			} else if (WIFEXITED(status)) {
				if (WEXITSTATUS(status) == 0) {
					/* When the web server is gracefully exiting, it will
					 * tell one or more agents to gracefully exit with exit
					 * status 0. If we see this then it means the watchdog
					 * is gracefully shutting down too and we should stop
					 * watching.
					 */
					return;
				} else {
					P_WARN(name() << " (pid=" << pid <<
						") crashed with exit status " <<
						WEXITSTATUS(status) << ", restarting it...");
				}
			} else {
				P_WARN(name() << " (pid=" << pid <<
					") crashed with signal " <<
					getSignalName(WTERMSIG(status)) <<
					", restarting it...");
			}

			/* Optional delay before restarting, in seconds. sleep() takes an
			 * unsigned argument, so a negative value would be converted into
			 * an enormous delay; only sleep for strictly positive values.
			 */
			const char *sleepTime;
			if ((sleepTime = getenv("PASSENGER_AGENT_RESTART_SLEEP")) != NULL) {
				int seconds = atoi(sleepTime);
				if (seconds > 0) {
					sleep(seconds);
				}
			}
		}
	} catch (const boost::thread_interrupted &) {
		// Interruption is the normal way to stop this thread; nothing to report.
	} catch (const tracable_exception &e) {
		lock_guard<boost::mutex> l(lock);
		threadExceptionMessage = e.what();
		threadExceptionBacktrace = e.backtrace();
		errorEvent->notify();
	} catch (const std::exception &e) {
		lock_guard<boost::mutex> l(lock);
		threadExceptionMessage = e.what();
		errorEvent->notify();
	} catch (...) {
		lock_guard<boost::mutex> l(lock);
		threadExceptionMessage = "Unknown error";
		errorEvent->notify();
	}
}
void execute(void * callbackObj, SignalHeader * const header, Uint8 prio, Uint32 * const theData, LinearSectionPtr ptr[3]){ const Uint32 secCount = header->m_noOfSections; const Uint32 length = header->theLength; #ifdef TRACE_DISTRIBUTED ndbout_c("recv: %s(%d) from (%s, %d)", getSignalName(header->theVerId_signalNumber), header->theVerId_signalNumber, getBlockName(refToBlock(header->theSendersBlockRef)), refToNode(header->theSendersBlockRef)); #endif bool ok = true; Ptr<SectionSegment> secPtr[3]; switch(secCount){ case 3: ok &= import(secPtr[2], ptr[2].p, ptr[2].sz); case 2: ok &= import(secPtr[1], ptr[1].p, ptr[1].sz); case 1: ok &= import(secPtr[0], ptr[0].p, ptr[0].sz); } /** * Check that we haven't received a too long signal */ ok &= (length + secCount <= 25); Uint32 secPtrI[3]; if(ok){ /** * Normal path */ secPtrI[0] = secPtr[0].i; secPtrI[1] = secPtr[1].i; secPtrI[2] = secPtr[2].i; globalScheduler.execute(header, prio, theData, secPtrI); return; } /** * Out of memory */ for(Uint32 i = 0; i<secCount; i++){ if(secPtr[i].p != 0){ g_sectionSegmentPool.releaseList(relSz(ptr[i].sz), secPtr[i].i, secPtr[i].p->m_lastSegment); } } Uint32 gsn = header->theVerId_signalNumber; Uint32 len = header->theLength; Uint32 newLen= (len > 22 ? 22 : len); SignalDroppedRep * rep = (SignalDroppedRep*)theData; memmove(rep->originalData, theData, (4 * newLen)); rep->originalGsn = gsn; rep->originalLength = len; rep->originalSectionCount = secCount; header->theVerId_signalNumber = GSN_SIGNAL_DROPPED_REP; header->theLength = newLen + 3; header->m_noOfSections = 0; globalScheduler.execute(header, prio, theData, secPtrI); }