/*
 * Start service s after first (recursively) starting every service
 * listed in its dependant list. A service is handled at most once per
 * pass: the visited flag is set on entry and checked on re-entry.
 * Monitoring is enabled for s regardless of whether a start command
 * was actually executed.
 * @param s A Service_T object
 */
static void do_start(Service_T s) {
        Dependant_T dep;

        ASSERT(s);

        if (s->visited)
                return;
        s->visited = TRUE;

        /* Walk the dependant list first; an empty list simply skips the loop */
        for (dep = s->dependantlist; dep; dep = dep->next) {
                Service_T prerequisite = Util_getService(dep->dependant);

                ASSERT(prerequisite);
                do_start(prerequisite);
        }

        /* Run the start command unless this is a process which is already running */
        if (s->start && !(s->type == TYPE_PROCESS && Util_isProcessRunning(s))) {
                LogInfo("'%s' start: %s\n", s->name, s->start->arg[0]);
                spawn(s, s->start, NULL);
                /* Only process services are waited for; other service
                 * types do not have a pid file to watch */
                if (s->type == TYPE_PROCESS)
                        wait_start(s);
        }

        Util_monitorSet(s);
}
/*
 * Stop the service s. The service is handled at most once per pass
 * (guarded by the depend_visited flag). Monitoring is switched off
 * "softly" by setting the monitor field directly, then the stop
 * command is executed unless s is a process which is not running.
 * @param s A Service_T object
 * @return FALSE if a process service did not stop (as reported by
 * wait_stop), otherwise TRUE
 */
static int do_stop(Service_T s) {
        ASSERT(s);

        if (s->depend_visited)
                return TRUE;
        s->depend_visited = TRUE;

        /* Soft unmonitor - start counter and error state are kept */
        if (s->monitor != MONITOR_NOT) {
                s->monitor = MONITOR_NOT;
                DEBUG("Monitoring disabled -- service %s\n", s->name);
        }

        if (s->stop && !(s->type == TYPE_PROCESS && !Util_isProcessRunning(s))) {
                LogInfo("'%s' stop: %s\n", s->name, s->stop->arg[0]);
                spawn(s, s->stop, NULL);
                /* Only process service types are waited for; on failure the
                 * service info is left untouched */
                if (s->type == TYPE_PROCESS && !wait_stop(s))
                        return FALSE;
        }

        Util_resetInfo(s);
        return TRUE;
}
/**
 * Validate the process service s and post events reflecting what was
 * found. If the process is not running a failed Event_Nonexist is
 * posted and no further checks are made.
 * @param s A Service_T object
 * @return FALSE if the process is not running, otherwise TRUE
 */
int check_process(Service_T s) {
        pid_t pid = -1;
        Port_T port = NULL;
        Resource_T resource = NULL;

        ASSERT(s);

        /* Test for running process; a zero result means "not running" */
        pid = Util_isProcessRunning(s, FALSE);
        if (pid == 0) {
                Event_post(s, Event_Nonexist, STATE_FAILED, s->action_NONEXIST, "process is not running");
                return FALSE;
        }
        Event_post(s, Event_Nonexist, STATE_SUCCEEDED, s->action_NONEXIST, "process is running with pid %d", (int)pid);

        /* Process-level checks are only done when enabled in the run configuration */
        if (Run.doprocess) {
                if (! update_process_data(s, ptree, ptreesize, pid)) {
                        LogError("'%s' failed to get service data\n", s->name);
                } else {
                        check_process_state(s);
                        check_process_pid(s);
                        check_process_ppid(s);
                        for (resource = s->resourcelist; resource; resource = resource->next)
                                check_process_resources(s, resource);
                }
        }

        /* Test each host:port and protocol in the service's portlist */
        for (port = s->portlist; port; port = port->next)
                check_connection(s, port);

        return TRUE;
}
/*
 * Wait for the process of service s to reach the expected state and
 * post an Event_Exec event describing the outcome. The process is
 * polled with an interval which starts at 50ms and doubles every cycle
 * until it reaches 1s. Elapsed time is tracked by adding each slept
 * interval to the time sampled on entry rather than by re-reading the
 * clock, so wall-clock steps backwards/forwards neither stall the wait
 * nor end it prematurely.
 *
 * NOTE(review): the deadline is always derived from the start (or, if
 * there is no start command, the restart) command's timeout, also when
 * expect is Process_Stopped -- confirm this is intended.
 *
 * @param s A Service to wait for
 * @param expect The expected state (see Process_Status)
 * @return Process_Started if the process was running at the last poll,
 * otherwise Process_Stopped
 */
static Process_Status wait_process(Service_T s, Process_Status expect) {
        /* Validate s before anything dereferences it */
        ASSERT(s);
        assert(s->start || s->restart);

        int debug = Run.debug, isrunning = FALSE;
        unsigned long now = time(NULL) * 1000, wait = 50;
        unsigned long timeout = (s->start ? s->start->timeout : s->restart->timeout) * 1000 + now;

        do {
                Time_usleep(wait * USEC_PER_MSEC);
                now += wait;
                /* Double the wait during each cycle until 1s is reached */
                wait = wait < 1000 ? wait * 2 : 1000;
                isrunning = Util_isProcessRunning(s, TRUE);
                if ((expect == Process_Stopped && ! isrunning) || (expect == Process_Started && isrunning))
                        break;
                /* Turn off debug from the second poll onwards to avoid flooding
                 * the log with "pid file does not exist" messages */
                Run.debug = FALSE;
        } while (now < timeout && ! Run.stopped);
        /* Restore the debug state */
        Run.debug = debug;

        if (isrunning) {
                if (expect == Process_Started)
                        Event_post(s, Event_Exec, STATE_SUCCEEDED, s->action_EXEC, "started");
                else
                        Event_post(s, Event_Exec, STATE_FAILED, s->action_EXEC, "failed to stop");
                return Process_Started;
        } else {
                if (expect == Process_Started)
                        Event_post(s, Event_Exec, STATE_FAILED, s->action_EXEC, "failed to start");
                else
                        Event_post(s, Event_Exec, STATE_SUCCEEDED, s->action_EXEC, "stopped");
                return Process_Stopped;
        }
}
/*
 * Wait for the service s to start running. The process is polled once
 * per second, blocking the caller, until it is running, the start
 * command's timeout has elapsed or Run.stopped is set. An Event_Exec
 * event is then posted: succeeded if the process is running, failed
 * otherwise.
 * @param s A Service to wait for; its start command must be defined
 */
static void wait_start(Service_T s) {
        time_t timeout;

        /* Validate s before it is dereferenced to compute the deadline */
        ASSERT(s);

        timeout = time(NULL) + s->start->timeout;

        while ((time(NULL) < timeout) && !Run.stopped) {
                if (Util_isProcessRunning(s))
                        break;
                sleep(1);
        }

        if (!Util_isProcessRunning(s))
                Event_post(s, Event_Exec, STATE_FAILED, s->action_EXEC, "failed to start");
        else
                Event_post(s, Event_Exec, STATE_SUCCEEDED, s->action_EXEC, "started");
}
/*
 * Wait for the service s to stop running. The process is polled once
 * per second, blocking the caller, until it is no longer running, the
 * stop command's timeout has elapsed or Run.stopped is set. The wait
 * is synchronous on purpose: a restart needs to know whether the stop
 * succeeded before it can do a start. An Event_Exec event describing
 * the outcome is posted.
 * @param s A Service to wait for; its stop command must be defined
 * @return TRUE if the service was stopped otherwise FALSE
 */
static int wait_stop(Service_T s) {
        time_t timeout;

        /* Validate s before it is dereferenced to compute the deadline */
        ASSERT(s);

        timeout = time(NULL) + s->stop->timeout;

        while ((time(NULL) < timeout) && !Run.stopped) {
                if (!Util_isProcessRunning(s))
                        break;
                sleep(1);
        }

        if (Util_isProcessRunning(s)) {
                Event_post(s, Event_Exec, STATE_FAILED, s->action_EXEC, "failed to stop");
                return FALSE;
        }

        Event_post(s, Event_Exec, STATE_SUCCEEDED, s->action_EXEC, "stopped");
        return TRUE;
}
/*
 * Wait for the service s to start running. The process is polled once
 * per second, blocking the caller, until it is running, the start
 * command's timeout has elapsed or Run.stopped is set. The wait is
 * also abandoned early if spawn_result reports a positive exit code.
 * An Event_Exec event is then posted: succeeded if the process is
 * running, failed otherwise.
 * @param s A Service to wait for; its start command must be defined
 * @param spawn_result Result of the spawned start command, or NULL if
 * no exit code should be checked
 */
static void wait_start(Service_T s, SpawnResult *spawn_result) {
        time_t timeout;

        /* Validate s before it is dereferenced to compute the deadline */
        ASSERT(s);

        timeout = time(NULL) + s->start->timeout;

        while ((time(NULL) < timeout) && !Run.stopped) {
                if (Util_isProcessRunning(s))
                        break;
                /* No point in waiting any longer once the start command is
                 * known to have failed */
                if (spawn_result != NULL && spawn_result->exit_code > 0) {
                        LogError("%s start exit with code %d\n", s->name, spawn_result->exit_code);
                        break;
                }
                sleep(1);
        }

        if (!Util_isProcessRunning(s))
                Event_post(s, Event_Exec, STATE_FAILED, s->action_EXEC, "failed to start");
        else
                Event_post(s, Event_Exec, STATE_SUCCEEDED, s->action_EXEC, "started");
}
/*
 * Export the special MONIT_xxx variables into the environment so the
 * executed program may use them. When no event is given, the event and
 * description are derived from the command: "Started" for the service's
 * start command, "Stopped" for its stop command and "No Event" for
 * anything else. For process services the pid, memory, children and
 * cpu values are exported as well.
 * @param S The service the command is executed for
 * @param C The command about to be executed
 * @param E The event which triggered the execution, or NULL
 * @param date The value to export as MONIT_DATE
 */
static void set_monit_environment(Service_T S, command_t C, Event_T E, const char *date) {
        const char *fallback;

        /* Text used for both event and description when there is no event */
        if (C == S->start)
                fallback = "Started";
        else if (C == S->stop)
                fallback = "Stopped";
        else
                fallback = "No Event";

        setenv("MONIT_DATE", date, 1);
        setenv("MONIT_SERVICE", S->name, 1);
        setenv("MONIT_HOST", Run.system->name, 1);
        setenv("MONIT_EVENT", E ? Event_get_description(E) : fallback, 1);
        setenv("MONIT_DESCRIPTION", E ? Event_get_message(E) : fallback, 1);

        if (S->type != TYPE_PROCESS)
                return;

        /* The strings built by Str_cat are handed over to putenv as-is */
        putenv(Str_cat("MONIT_PROCESS_PID=%d", Util_isProcessRunning(S, FALSE)));
        putenv(Str_cat("MONIT_PROCESS_MEMORY=%ld", S->inf->priv.process.mem_kbyte));
        putenv(Str_cat("MONIT_PROCESS_CHILDREN=%d", S->inf->priv.process.children));
        putenv(Str_cat("MONIT_PROCESS_CPU_PERCENT=%d", S->inf->priv.process.cpu_percent));
}
/*
 * Stop the service s. The service is handled at most once per pass
 * (guarded by the depend_visited flag). If a stop command is defined
 * it is executed, unless s is a process which is not running. After
 * that either Util_monitorUnset (flag set) or Util_resetInfo (flag not
 * set) is applied to s, whether or not the stop succeeded.
 * @param s A Service_T object
 * @param flag TRUE if the monitoring should be disabled or FALSE if
 * monitoring should continue (when stop is part of restart)
 * @return FALSE if a process service did not reach the stopped state,
 * otherwise TRUE
 */
static int do_stop(Service_T s, int flag) {
        int stopped = TRUE;

        ASSERT(s);

        if (s->depend_visited)
                return stopped;
        s->depend_visited = TRUE;

        if (! s->stop) {
                LogDebug("'%s' stop skipped -- method not defined\n", s->name);
        } else if (s->type != TYPE_PROCESS || Util_isProcessRunning(s, FALSE)) {
                LogInfo("'%s' stop: %s\n", s->name, s->stop->arg[0]);
                spawn(s, s->stop, NULL);
                /* Only process service types are waited for */
                if (s->type == TYPE_PROCESS) {
                        if (wait_process(s, Process_Stopped) != Process_Stopped)
                                stopped = FALSE;
                }
        }

        if (flag) {
                Util_monitorUnset(s);
        } else {
                Util_resetInfo(s);
        }

        return stopped;
}
/** * Check to see if we should try to start/stop service * @param S A service name as stated in the config file * @param A An action id describing the action to execute * @return FALSE for error, otherwise TRUE */ int control_service(const char *S, int A) { Service_T s = NULL; ASSERT(S); if (! (s = Util_getService(S))) { LogError("%s: service '%s' -- doesn't exist\n", prog, S); return FALSE; } switch(A) { case ACTION_START: if (s->type == TYPE_PROCESS) { if (Util_isProcessRunning(s)) { DEBUG("%s: Process already running -- process %s\n", prog, S); Util_monitorSet(s); return TRUE; } if (!s->start) { LogError("%s: Start method not defined -- process %s\n", prog, S); Util_monitorSet(s); return FALSE; } } do_depend(s, ACTION_STOP); do_start(s); do_depend(s, ACTION_START); break; case ACTION_STOP: if (s->type == TYPE_PROCESS && !s->stop) { LogError("%s: Stop method not defined -- process %s\n", prog, S); Util_monitorUnset(s); return FALSE; } /* soft unmonitor and stop: */ do_depend(s, ACTION_STOP); do_stop(s); /* hard unmonitor - will reset all counters and flags: */ do_depend(s, ACTION_UNMONITOR); do_unmonitor(s); break; case ACTION_RESTART: if (s->type == TYPE_PROCESS && (!s->start || !s->stop)) { LogError("%s: Start or stop method not defined -- process %s\n", prog, S); Util_monitorSet(s); return FALSE; } LogInfo("'%s' trying to restart\n", s->name); do_depend(s, ACTION_STOP); if (do_stop(s)) { /* Only start if stop succeeded */ do_start(s); do_depend(s, ACTION_START); } else { /* enable monitoring of this service again to allow the restart retry * in the next cycle up to timeout limit */ Util_monitorSet(s); } break; case ACTION_MONITOR: /* We only enable monitoring of this service and all prerequisite * services. 
Chain of services which depends on this service keep * its state */ do_monitor(s); break; case ACTION_UNMONITOR: /* We disable monitoring of this service and all services which * depends on it */ do_depend(s, ACTION_UNMONITOR); do_unmonitor(s); break; default: LogError("%s: service '%s' -- invalid action %s\n", prog, S, A); return FALSE; } return TRUE; }
static int _commandExecute(Service_T S, command_t c, char *msg, int msglen, int64_t *timeout) { ASSERT(S); ASSERT(c); ASSERT(msg); msg[0] = 0; int status = -1; Command_T C = NULL; TRY { // May throw exception if the program doesn't exist (was removed while Monit was up) C = Command_new(c->arg[0], NULL); } ELSE { snprintf(msg, msglen, "Program %s failed: %s", c->arg[0], Exception_frame.message); } END_TRY; if (C) { for (int i = 1; i < c->length; i++) Command_appendArgument(C, c->arg[i]); if (c->has_uid) Command_setUid(C, c->uid); if (c->has_gid) Command_setGid(C, c->gid); Command_setEnv(C, "MONIT_DATE", Time_string(Time_now(), (char[26]){})); Command_setEnv(C, "MONIT_SERVICE", S->name); Command_setEnv(C, "MONIT_HOST", Run.system->name); Command_setEnv(C, "MONIT_EVENT", c == S->start ? "Started" : c == S->stop ? "Stopped" : "Restarted"); Command_setEnv(C, "MONIT_DESCRIPTION", c == S->start ? "Started" : c == S->stop ? "Stopped" : "Restarted"); if (S->type == Service_Process) { Command_vSetEnv(C, "MONIT_PROCESS_PID", "%d", Util_isProcessRunning(S, false)); Command_vSetEnv(C, "MONIT_PROCESS_MEMORY", "%ld", S->inf->priv.process.mem_kbyte); Command_vSetEnv(C, "MONIT_PROCESS_CHILDREN", "%d", S->inf->priv.process.children); Command_vSetEnv(C, "MONIT_PROCESS_CPU_PERCENT", "%d", S->inf->priv.process.cpu_percent); } Process_T P = Command_execute(C); Command_free(&C); if (P) { do { Time_usleep(RETRY_INTERVAL); *timeout -= RETRY_INTERVAL; } while ((status = Process_exitStatus(P)) < 0 && *timeout > 0 && ! (Run.flags & Run_Stopped)); if (*timeout <= 0) snprintf(msg, msglen, "Program %s timed out", c->arg[0]); int n, total = 0; char buf[STRLEN]; do { if ((n = _getOutput(Process_getErrorStream(P), buf, sizeof(buf))) <= 0) n = _getOutput(Process_getInputStream(P), buf, sizeof(buf)); if (n > 0) { buf[n] = 0; DEBUG("%s", buf); // Report the first message (override existing plain timeout message if some program output is available) if (! 
total) snprintf(msg, msglen, "%s: %s%s", c->arg[0], *timeout <= 0 ? "Program timed out -- " : "", buf); total += n; } } while (n > 0 && Run.debug && total < 2048); // Limit the debug output (if the program will have endless output, such as 'yes' utility, we have to stop at some point to not spin here forever) Process_free(&P); // Will kill the program if still running } }