/*
 * Remove a timed event from a scheduling queue.
 *
 * sq:    the queue the event is scheduled in
 * event: the event to unschedule
 *
 * When built with USE_EVENT_BROKER the broker is notified first, before
 * any argument validation. A NULL event, or an event that has no queue
 * entry (event->sq_event == NULL), is then silently ignored.
 *
 * A NULL sq is logged as a runtime error, but the event is still marked
 * as unscheduled afterwards.
 */
void remove_event(squeue_t *sq, timed_event *event)
{
#ifdef USE_EVENT_BROKER
	/* send event data to broker */
	broker_timed_event(NEBTYPE_TIMEDEVENT_REMOVE, NEBFLAG_NONE, NEBATTR_NONE, event, NULL);
#endif

	/* nothing to do for a missing or already unscheduled event */
	if (event == NULL || event->sq_event == NULL)
		return;

	if (sq == NULL) {
		logit(NSLOG_RUNTIME_ERROR, TRUE,
		      "Error: remove_event() called for %s event with NULL sq parameter\n",
		      EVENT_TYPE_STR(event->event_type));
	} else {
		squeue_remove(sq, event->sq_event);
	}

	/* only events on the main queue take part in event tracking */
	if (sq == nagios_squeue)
		track_events(event->event_type, -1);

	/* mark this event as unscheduled */
	event->sq_event = NULL;

	/*
	 * The event being removed may be the very one that is currently
	 * being handled (e.g. a downtime event cancelled by a command
	 * that arrives just as it is about to start or expire). In that
	 * case the current event must no longer be treated as scheduled,
	 * or we would end up with segfaults and memory corruption.
	 */
	if (current_event == event)
		current_event = NULL;
}
/*
 * Release every resource tied to a job and free the job itself.
 *
 * Unschedules the job's timeout event, decrements running_jobs, removes
 * the job's pid from ptab, and frees the output buffers, the request
 * vector, the command string, the extra-info struct and finally cp.
 * cp must not be used after this call.
 */
static void destroy_job(child_process *cp)
{
	/*
	 * we must remove the job's timeout ticker,
	 * or we'll end up accessing an already free()'d
	 * pointer, or the pointer to a different child.
	 */
	squeue_remove(sq, cp->ei->sq_event);
	running_jobs--;
	fanout_remove(ptab, cp->ei->pid);

	/* free(NULL) is a no-op, so no guards are needed here */
	free(cp->outstd.buf);
	cp->outstd.buf = NULL;
	free(cp->outerr.buf);
	cp->outerr.buf = NULL;

	kvvec_destroy(cp->request, KVVEC_FREE_ALL);
	free(cp->cmd);
	free(cp->ei);
	free(cp);
}
/*
 * Report a finished (or failed) job to the master and release it.
 *
 * cp:     the job to report
 * reason: 0 if the child exited normally, otherwise an error code that
 *         is sent back as "error_code"
 *
 * The response consists of the original request (minus "env" entries),
 * the wait status, captured stdout/stderr, start/stop timestamps and
 * the runtime. For a normal exit, resource usage figures are appended.
 *
 * Side effects: stamps cp->stop, unschedules the job's timeout event,
 * closes the job's output descriptors, decrements running_jobs and
 * frees cp along with its buffers, request and command string. cp must
 * not be used after this function returns.
 *
 * Always returns 0.
 */
static int finish_job(child_process *cp, int reason)
{
	static struct kvvec resp = KVVEC_INITIALIZER;
	struct rusage *usage = &cp->rusage;
	int idx;

	/* how many key/value pairs do we need? */
	if (!kvvec_init(&resp, 12 + cp->request->kv_pairs)) {
		/* no way to report anything without a response vector */
		exit_worker();
	}

	gettimeofday(&cp->stop, NULL);

	/* sanity check: every running job should own one timeout event */
	if (running_jobs != squeue_size(sq)) {
		wlog("running_jobs(%d) != squeue_size(sq) (%d)\n",
		     running_jobs, squeue_size(sq));
		wlog("started: %d; running: %d; finished: %d\n",
		     started, running_jobs, started - running_jobs);
	}

	/*
	 * we must remove the job's timeout ticker,
	 * or we'll end up accessing an already free()'d
	 * pointer, or the pointer to a different child.
	 */
	squeue_remove(sq, cp->sq_event);

	/* get rid of still open filedescriptors */
	if (cp->outstd.fd != -1)
		iobroker_close(iobs, cp->outstd.fd);
	if (cp->outerr.fd != -1)
		iobroker_close(iobs, cp->outerr.fd);

	cp->runtime = tv_delta_f(&cp->start, &cp->stop);

	/*
	 * Now build the return message.
	 * First comes the request, minus environment variables
	 */
	for (idx = 0; idx < cp->request->kv_pairs; idx++) {
		struct key_value *pair = &cp->request->kv[idx];

		/* copy everything except environment macros */
		if (pair->key_len != 3 || strcmp(pair->key, "env") != 0) {
			kvvec_addkv_wlen(&resp, pair->key, pair->key_len,
			                 pair->value, pair->value_len);
		}
	}

	kvvec_addkv(&resp, "wait_status", (char *)mkstr("%d", cp->ret));
	kvvec_addkv_wlen(&resp, "outstd", 6, cp->outstd.buf, cp->outstd.len);
	kvvec_addkv_wlen(&resp, "outerr", 6, cp->outerr.buf, cp->outerr.len);
	kvvec_add_tv(&resp, "start", cp->start);
	kvvec_add_tv(&resp, "stop", cp->stop);
	kvvec_addkv(&resp, "runtime", (char *)mkstr("%f", cp->runtime));

	if (reason != 0) {
		/* some error happened */
		kvvec_addkv(&resp, "exited_ok", "0");
		kvvec_addkv(&resp, "error_code", (char *)mkstr("%d", reason));
	} else {
		/* child exited nicely */
		kvvec_addkv(&resp, "exited_ok", "1");
		kvvec_add_tv(&resp, "ru_utime", usage->ru_utime);
		kvvec_add_tv(&resp, "ru_stime", usage->ru_stime);
		kvvec_add_long(&resp, "ru_minflt", usage->ru_minflt);
		kvvec_add_long(&resp, "ru_majflt", usage->ru_majflt);
		kvvec_add_long(&resp, "ru_nswap", usage->ru_nswap);
		kvvec_add_long(&resp, "ru_inblock", usage->ru_inblock);
		kvvec_add_long(&resp, "ru_oublock", usage->ru_oublock);
		kvvec_add_long(&resp, "ru_nsignals", usage->ru_nsignals);
	}

	/* a broken pipe to the master means there is nobody left to serve */
	if (send_kvvec(master_sd, &resp) < 0 && errno == EPIPE)
		exit_worker();

	running_jobs--;

	/* free(NULL) is a no-op, so no guards are needed here */
	free(cp->outstd.buf);
	cp->outstd.buf = NULL;
	free(cp->outerr.buf);
	cp->outerr.buf = NULL;

	kvvec_destroy(cp->request, KVVEC_FREE_ALL);
	free(cp->cmd);
	free(cp);

	return 0;
}
/*
 * "What can the harvest hope for, if not for the care
 * of the Reaper Man?"
 *   -- Terry Pratchett, Reaper Man
 *
 * We end up here no matter if the job is stale (ie, the child is
 * stuck in uninterruptible sleep) or if it's the first time we try
 * to kill it.
 * A job is considered reaped once we reap our direct child, in
 * which case init will become parent of our grandchildren.
 * It's also considered reaped if kill() results in ESRCH, or if
 * waitpid() for the child results in ECHILD. Any other kill() error
 * is only logged.
 *
 * cp:     the job to kill; a NULL job is ignored
 * reason: why the job is being killed. ETIME means this is the first
 *         attempt after a timeout; ESTALE means an earlier attempt
 *         failed to reap the child and this is a retry.
 *
 * If the child cannot be reaped right away, a new attempt is scheduled
 * (5 seconds later for stale jobs, 250 milliseconds later otherwise)
 * and the function returns without destroying the job.
 */
static void kill_job(child_process *cp, int reason)
{
	int ret, status, reaped = 0;
	int id, pid;

	/* nothing to kill without a job */
	if (!cp)
		return;

	/*
	 * Capture what we need for logging up front.
	 * NOTE(review): check_completion() presumably releases the job
	 * when it finds the child has exited -- confirm. If so, cp must
	 * not be dereferenced after a successful call.
	 */
	id = cp->id;
	pid = cp->ei->pid;

	/*
	 * first attempt at reaping, so see if we just failed to
	 * notice that things were going wrong here
	 */
	if (reason == ETIME && !check_completion(cp, WNOHANG)) {
		timeouts++;
		wlog("job %d with pid %d reaped at timeout. timeouts=%u; started=%u", id, pid, timeouts, started);
		return;
	}

	/* brutal but efficient: signal the child's whole process group */
	if (kill(-pid, SIGKILL) < 0) {
		if (errno == ESRCH) {
			reaped = 1;
		} else {
			wlog("kill(-%d, SIGKILL) failed: %s\n", pid, strerror(errno));
		}
	}

	/*
	 * we must iterate at least once, in case kill() returns
	 * ESRCH when there's zombies
	 */
	do {
		ret = waitpid(pid, &status, WNOHANG);
		if (ret < 0 && errno == EINTR)
			continue;

		if (ret == pid || (ret < 0 && errno == ECHILD)) {
			reaped = 1;
			break;
		}

		if (!ret) {
			struct timeval tv;

			gettimeofday(&tv, NULL);
			/*
			 * stale process (signal may not have been delivered, or
			 * the child can be stuck in uninterruptible sleep). We
			 * can't hang around forever, so just reschedule a new
			 * reap attempt later.
			 */
			if (reason == ESTALE) {
				tv.tv_sec += 5;
				wlog("Failed to reap child with pid %d. Next attempt @ %lu.%lu", pid, (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
			} else {
				/*
				 * retry 250ms from now; add to the current time
				 * (rather than overwriting the microseconds) and
				 * carry into seconds to keep tv normalized
				 */
				tv.tv_usec += 250000;
				if (tv.tv_usec >= 1000000) {
					tv.tv_usec -= 1000000;
					tv.tv_sec += 1;
				}
				cp->ei->state = ESTALE;
				/*
				 * NOTE(review): if finish_job() frees cp, the
				 * accesses to cp->ei below are use-after-free --
				 * confirm which finish_job() contract applies.
				 */
				finish_job(cp, reason);
			}
			squeue_remove(sq, cp->ei->sq_event);
			cp->ei->sq_event = squeue_add_tv(sq, &tv, cp);
			return;
		}
	} while (!reaped);

	/* stale jobs were already reported when they were marked ESTALE */
	if (cp->ei->state != ESTALE)
		finish_job(cp, reason);
	else
		wlog("job %d (pid=%d): Dormant child reaped", id, pid);

	/*
	 * NOTE(review): destroy_job() frees cp; if finish_job() does so as
	 * well this is a double free -- confirm finish_job()'s ownership.
	 */
	destroy_job(cp);
}
int main(int argc, char **argv) { squeue_t *sq; struct timeval tv; sq_test_event a, b, c, d, *x; t_set_colors(0); t_start("squeue tests"); a.id = 1; b.id = 2; c.id = 3; d.id = 4; gettimeofday(&tv, NULL); /* Order in is a, b, c, d, but we should get b, c, d, a out. */ srand(tv.tv_usec ^ tv.tv_sec); t((sq = squeue_create(1024)) != NULL); t(squeue_size(sq) == 0); /* we fill and empty the squeue completely once before testing */ sq_test_random(sq); t(squeue_size(sq) == 0, "Size should be 0 after first sq_test_random"); t((a.evt = squeue_add(sq, time(NULL) + 9, &a)) != NULL); t(squeue_size(sq) == 1); t((b.evt = squeue_add(sq, time(NULL) + 3, &b)) != NULL); t(squeue_size(sq) == 2); t((c.evt = squeue_add_msec(sq, time(NULL) + 5, 0, &c)) != NULL); t(squeue_size(sq) == 3); t((d.evt = squeue_add_usec(sq, time(NULL) + 5, 1, &d)) != NULL); t(squeue_size(sq) == 4); /* add and remove lots. remainder should be what we have above */ sq_test_random(sq); /* testing squeue_peek() */ t((x = (sq_test_event *)squeue_peek(sq)) != NULL); t(x == &b, "x: %p; a: %p; b: %p; c: %p; d: %p\n", x, &a, &b, &c, &d); t(x->id == b.id); t(squeue_size(sq) == 4); /* testing squeue_remove() and re-add */ t(squeue_remove(sq, b.evt) == 0); t(squeue_size(sq) == 3); t((x = squeue_peek(sq)) != NULL); t(x == &c); t((b.evt = squeue_add(sq, time(NULL) + 3, &b)) != NULL); t(squeue_size(sq) == 4); /* peek should now give us the &b event (again) */ t((x = squeue_peek(sq)) != NULL); if (x != &b) { printf("about to fail pretty f*****g hard...\n"); printf("ea: %p; &b: %p; &c: %p; ed: %p; x: %p\n", &a, &b, &c, &d, x); } t(x == &b); t(x->id == b.id); t(squeue_size(sq) == 4); /* testing squeue_pop(), lifo manner */ t((x = squeue_pop(sq)) != NULL); t(squeue_size(sq) == 3, "squeue_size(sq) = %d\n", squeue_size(sq)); t(x == &b, "x: %p; &b: %p\n", x, &b); t(x->id == b.id, "x->id: %lu; d.id: %lu\n", x->id, d.id); /* Test squeue_pop() */ t((x = squeue_pop(sq)) != NULL); t(squeue_size(sq) == 2); t(x == &c, "x->id: %lu; c.id: 
%lu\n", x->id, c.id); t(x->id == c.id, "x->id: %lu; c.id: %lu\n", x->id, c.id); /* this should fail gracefully (-1 return from squeue_remove()) */ t(squeue_remove(NULL, NULL) == -1); t(squeue_remove(NULL, a.evt) == -1); squeue_foreach(sq, sq_walker, NULL); /* clean up to prevent false valgrind positives */ squeue_destroy(sq, 0); return t_end(); }