// Child watcher callback, invoked when a monitor's command process
// changes state (asserted to be an EV_CHILD event).
//
// Disarms the watcher and the command's timeout timer, clears the
// recorded pid, classifies the exit status, and -- if no result has
// been reported for this run yet -- queues the result for the plugin.
//
//   loop    - event loop that delivered the event
//   w       - the child watcher; w->data points at the owning mon_t
//   revents - event mask (only inspected by the debug assertion)
static void mon_child_cb(struct ev_loop* loop, ev_child* w, int revents V_UNUSED) {
    dmn_assert(loop);
    dmn_assert(w);
    dmn_assert(revents == EV_CHILD);

    // One event per spawned child: stop the watcher right away.
    ev_child_stop(loop, w);

    mon_t* mon = w->data;
    ev_timer_stop(loop, mon->cmd_timeout);
    mon->cmd_pid = 0;

    // Only a normal exit with status zero counts as success.
    const int wstatus = w->rstatus;
    const bool exited = WIFEXITED(wstatus);
    const bool failed = !(exited && WEXITSTATUS(wstatus) == 0);

    if(!exited) {
        if(WIFSIGNALED(wstatus)) {
            dmn_log_warn("Monitor child process for '%s' terminated by signal %u", mon->cmd->desc, WTERMSIG(wstatus));
        }
        else {
            dmn_log_warn("Monitor child process for '%s' terminated abnormally...", mon->cmd->desc);
        }
    }

    // The timeout path may already have reported a failure for this
    // run; when we later reap the SIGKILL'd child there is nothing
    // left to report, so bail out instead of double-sending.
    if(!mon->result_pending)
        return;

    // No results are queued once killed_by has been set.
    if(!killed_by) {
        sendq_enq(emc_encode_mon(mon->cmd->idx, failed));
        ev_io_start(loop, plugin_write_watcher);
    }

    // Guarded so the counter never drops below zero.
    if(num_proc > 0) {
        num_proc--;
    }

    mon->result_pending = false;
}
F_NONNULL static void die_gracefully(struct ev_loop* loop) { dmn_assert(loop); dmn_assert(killed_by); static bool done_once = false; if(!done_once) { // avoid repetition done_once = true; // send friendly death message to plugin sendq_enq(emc_encode_exit()); ev_io_start(loop, plugin_write_watcher); // kill interval timers for future invocations // and immediately clamp the remaining timeout // for any running commands to 2.0s. for(unsigned i = 0; i < num_mons; i++) { ev_timer_stop(loop, mons[i].interval_timer); if(ev_is_active(mons[i].cmd_timeout)) { if(ev_timer_remaining(loop, mons[i].cmd_timeout) > 2.0) { ev_timer_stop(loop, mons[i].cmd_timeout); ev_timer_set(mons[i].cmd_timeout, 2.0, 0.); ev_timer_start(loop, mons[i].cmd_timeout); } } } } }
// Timer callback (asserted to be an EV_TIMER event) for a monitor
// command that has run too long -- presumably attached to the
// monitor's cmd_timeout timer; confirm at the registration site.
//
// Logs the timeout, SIGKILLs the child, reports the command as failed
// to the plugin, and marks the result as already delivered so that
// mon_child_cb does not report it a second time when it reaps the
// killed child.
//
//   loop    - event loop that delivered the event
//   w       - the timer watcher; w->data points at the owning mon_t
//   revents - event mask (only inspected by the debug assertion)
static void mon_timeout_cb(struct ev_loop* loop, ev_timer* w, int revents V_UNUSED) {
    dmn_assert(loop);
    dmn_assert(w);
    dmn_assert(revents == EV_TIMER);

    mon_t* this_mon = w->data;
    dmn_assert(this_mon->result_pending);

    dmn_log_warn("Monitor child process for '%s' timed out after %u seconds. Marking failed and sending SIGKILL...", this_mon->cmd->desc, this_mon->cmd->timeout);
    kill(this_mon->cmd_pid, SIGKILL);

    // Note we don't stop the child watcher here because we still
    // want mon_child_cb to reap the exit status.  Technically, if
    // SIGKILL doesn't work (e.g. the child is stuck in a blocking NFS
    // operation), eventually we'll hit a new interval and restart the
    // child watcher for a new child, effectively giving up on
    // waitpid() of this one.  Not much else we could do in that case
    // anyways.

    // NOTE(review): unlike mon_child_cb, this path queues a result
    // even when killed_by is set -- confirm that is intended.
    sendq_enq(emc_encode_mon(this_mon->cmd->idx, true));
    ev_io_start(loop, plugin_write_watcher);

    // mon_child_cb only decrements num_proc while result_pending is
    // still set.  Since we clear that flag just below, the decrement
    // has to happen here as well; otherwise every timed-out command
    // would leave num_proc permanently one too high.
    if(num_proc > 0) {
        num_proc--;
    }

    this_mon->result_pending = false;
}