static int handle_worker_result(int sd, int events, void *arg) { wproc_object_job *oj = NULL; char *buf, *error_reason = NULL; unsigned long size; int ret; static struct kvvec kvv = KVVEC_INITIALIZER; struct wproc_worker *wp = (struct wproc_worker *)arg; if(iocache_capacity(wp->ioc) == 0) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_capacity() is 0 for worker %s.\n", wp->name); } ret = iocache_read(wp->ioc, wp->sd); if (ret < 0) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_read() from %s returned %d: %s\n", wp->name, ret, strerror(errno)); return 0; } else if (ret == 0) { logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: Socket to worker %s broken, removing", wp->name); wproc_num_workers_online--; iobroker_unregister(nagios_iobs, sd); if (workers.len <= 0) { /* there aren't global workers left, we can't run any more checks * we should try respawning a few of the standard ones */ logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: All our workers are dead, we can't do anything!"); } remove_worker(wp); fanout_destroy(wp->jobs, fo_reassign_wproc_job); wp->jobs = NULL; wproc_destroy(wp, 0); return 0; } while ((buf = worker_ioc2msg(wp->ioc, &size, 0))) { struct wproc_job *job; wproc_result wpres; /* log messages are handled first */ if (size > 5 && !memcmp(buf, "log=", 4)) { logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: %s: %s\n", wp->name, buf + 4); continue; } /* for everything else we need to actually parse */ if (buf2kvvec_prealloc(&kvv, buf, size, '=', '\0', KVVEC_ASSIGN) <= 0) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: Failed to parse key/value vector from worker response with len %lu. First kv=%s", size, buf ? buf : "(NULL)"); continue; } memset(&wpres, 0, sizeof(wpres)); wpres.job_id = -1; wpres.type = -1; wpres.response = &kvv; parse_worker_result(&wpres, &kvv); job = get_job(wp, wpres.job_id); if (!job) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: Job with id '%d' doesn't exist on %s.\n", wpres.job_id, wp->name); continue; } if (wpres.type != job->type) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: %s claims job %d is type %d, but we think it's type %d\n", wp->name, job->id, wpres.type, job->type); break; } oj = (wproc_object_job *)job->arg; /* * ETIME ("Timer expired") doesn't really happen * on any modern systems, so we reuse it to mean * "program timed out" */ if (wpres.error_code == ETIME) { wpres.early_timeout = TRUE; } if (wpres.early_timeout) { asprintf(&error_reason, "timed out after %.2fs", tv_delta_f(&wpres.start, &wpres.stop)); } else if (WIFSIGNALED(wpres.wait_status)) { asprintf(&error_reason, "died by signal %d%s after %.2f seconds", WTERMSIG(wpres.wait_status), WCOREDUMP(wpres.wait_status) ? " (core dumped)" : "", tv_delta_f(&wpres.start, &wpres.stop)); } else if (job->type != WPJOB_CHECK && WEXITSTATUS(wpres.wait_status) != 0) { asprintf(&error_reason, "is a non-check helper but exited with return code %d", WEXITSTATUS(wpres.wait_status)); } if (error_reason) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: %s job %d from worker %s %s", wpjob_type_name(job->type), job->id, wp->name, error_reason); logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: command: %s\n", job->command); if (job->type != WPJOB_CHECK && oj) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: host=%s; service=%s; contact=%s\n", oj->host_name ? oj->host_name : "(none)", oj->service_description ? oj->service_description : "(none)", oj->contact_name ? oj->contact_name : "(none)"); } else if (oj) { struct check_result *cr = (struct check_result *)job->arg; logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: host=%s; service=%s;\n", cr->host_name, cr->service_description); } logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: early_timeout=%d; exited_ok=%d; wait_status=%d; error_code=%d;\n", wpres.early_timeout, wpres.exited_ok, wpres.wait_status, wpres.error_code); wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc: stderr", wpres.outerr); wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc: stdout", wpres.outstd); } my_free(error_reason); switch (job->type) { case WPJOB_CHECK: ret = handle_worker_check(&wpres, wp, job); break; case WPJOB_NOTIFY: if (wpres.early_timeout) { if (oj->service_description) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of service '%s' on host '%s' by command '%s' timed out after %.2f seconds\n", oj->contact_name, oj->service_description, oj->host_name, job->command, tv2float(&wpres.runtime)); } else { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of host '%s' by command '%s' timed out after %.2f seconds\n", oj->contact_name, oj->host_name, job->command, tv2float(&wpres.runtime)); } } break; case WPJOB_OCSP: if (wpres.early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCSP command '%s' for service '%s' on host '%s' timed out after %.2f seconds\n", job->command, oj->service_description, oj->host_name, tv2float(&wpres.runtime)); } break; case WPJOB_OCHP: if (wpres.early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCHP command '%s' for host '%s' timed out after %.2f seconds\n", job->command, oj->host_name, tv2float(&wpres.runtime)); } break; case WPJOB_GLOBAL_SVC_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Global service event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_SVC_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Service event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_GLOBAL_HOST_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Global host event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_HOST_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Host event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_CALLBACK: run_job_callback(job, &wpres, 0); break; default: logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d: Unknown jobtype: %d\n", wp->pid, job->type); break; } destroy_job(job); } return 0; }
static int finish_job(child_process *cp, int reason) { static struct kvvec resp = KVVEC_INITIALIZER; struct rusage *ru = &cp->rusage; int i, ret; /* how many key/value pairs do we need? */ if (kvvec_init(&resp, 12 + cp->request->kv_pairs) == NULL) { /* what the hell do we do now? */ exit_worker(); } gettimeofday(&cp->stop, NULL); if (running_jobs != squeue_size(sq)) { wlog("running_jobs(%d) != squeue_size(sq) (%d)\n", running_jobs, squeue_size(sq)); wlog("started: %d; running: %d; finished: %d\n", started, running_jobs, started - running_jobs); } /* * we must remove the job's timeout ticker, * or we'll end up accessing an already free()'d * pointer, or the pointer to a different child. */ squeue_remove(sq, cp->sq_event); /* get rid of still open filedescriptors */ if (cp->outstd.fd != -1) iobroker_close(iobs, cp->outstd.fd); if (cp->outerr.fd != -1) iobroker_close(iobs, cp->outerr.fd); cp->runtime = tv_delta_f(&cp->start, &cp->stop); /* * Now build the return message. * First comes the request, minus environment variables */ for (i = 0; i < cp->request->kv_pairs; i++) { struct key_value *kv = &cp->request->kv[i]; /* skip environment macros */ if (kv->key_len == 3 && !strcmp(kv->key, "env")) { continue; } kvvec_addkv_wlen(&resp, kv->key, kv->key_len, kv->value, kv->value_len); } kvvec_addkv(&resp, "wait_status", (char *)mkstr("%d", cp->ret)); kvvec_addkv_wlen(&resp, "outstd", 6, cp->outstd.buf, cp->outstd.len); kvvec_addkv_wlen(&resp, "outerr", 6, cp->outerr.buf, cp->outerr.len); kvvec_add_tv(&resp, "start", cp->start); kvvec_add_tv(&resp, "stop", cp->stop); kvvec_addkv(&resp, "runtime", (char *)mkstr("%f", cp->runtime)); if (!reason) { /* child exited nicely */ kvvec_addkv(&resp, "exited_ok", "1"); kvvec_add_tv(&resp, "ru_utime", ru->ru_utime); kvvec_add_tv(&resp, "ru_stime", ru->ru_stime); kvvec_add_long(&resp, "ru_minflt", ru->ru_minflt); kvvec_add_long(&resp, "ru_majflt", ru->ru_majflt); kvvec_add_long(&resp, "ru_nswap", ru->ru_nswap); kvvec_add_long(&resp, "ru_inblock", ru->ru_inblock); kvvec_add_long(&resp, "ru_oublock", ru->ru_oublock); kvvec_add_long(&resp, "ru_nsignals", ru->ru_nsignals); } else { /* some error happened */ kvvec_addkv(&resp, "exited_ok", "0"); kvvec_addkv(&resp, "error_code", (char *)mkstr("%d", reason)); } ret = send_kvvec(master_sd, &resp); if (ret < 0 && errno == EPIPE) exit_worker(); running_jobs--; if (cp->outstd.buf) { free(cp->outstd.buf); cp->outstd.buf = NULL; } if (cp->outerr.buf) { free(cp->outerr.buf); cp->outerr.buf = NULL; } kvvec_destroy(cp->request, KVVEC_FREE_ALL); free(cp->cmd); free(cp); return 0; }
/* handles a timed event */ int handle_timed_event(timed_event *event) { host *temp_host = NULL; service *temp_service = NULL; void (*userfunc)(void *); struct timeval tv; const struct timeval *event_runtime; double latency; log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_timed_event() start\n"); #ifdef USE_EVENT_BROKER /* send event data to broker */ broker_timed_event(NEBTYPE_TIMEDEVENT_EXECUTE, NEBFLAG_NONE, NEBATTR_NONE, event, NULL); #endif log_debug_info(DEBUGL_EVENTS, 0, "** Timed Event ** Type: EVENT_%s, Run Time: %s", EVENT_TYPE_STR(event->event_type), ctime(&event->run_time)); /* get event latency */ gettimeofday(&tv, NULL); event_runtime = squeue_event_runtime(event->sq_event); latency = (double)(tv_delta_f(event_runtime, &tv)); if (latency < 0.0) /* events may run up to 0.1 seconds early */ latency = 0.0; /* how should we handle the event? */ switch (event->event_type) { case EVENT_SERVICE_CHECK: temp_service = (service *)event->event_data; log_debug_info(DEBUGL_EVENTS, 0, "** Service Check Event ==> Host: '%s', Service: '%s', Options: %d, Latency: %f sec\n", temp_service->host_name, temp_service->description, event->event_options, latency); /* run the service check */ run_scheduled_service_check(temp_service, event->event_options, latency); break; case EVENT_HOST_CHECK: temp_host = (host *)event->event_data; log_debug_info(DEBUGL_EVENTS, 0, "** Host Check Event ==> Host: '%s', Options: %d, Latency: %f sec\n", temp_host->name, event->event_options, latency); /* run the host check */ run_scheduled_host_check(temp_host, event->event_options, latency); break; case EVENT_PROGRAM_SHUTDOWN: log_debug_info(DEBUGL_EVENTS, 0, "** Program Shutdown Event. Latency: %.3fs\n", latency); /* set the shutdown flag */ sigshutdown = TRUE; /* log the shutdown */ logit(NSLOG_PROCESS_INFO, TRUE, "PROGRAM_SHUTDOWN event encountered, shutting down...\n"); break; case EVENT_PROGRAM_RESTART: log_debug_info(DEBUGL_EVENTS, 0, "** Program Restart Event. Latency: %.3fs\n", latency); /* set the restart flag */ sigrestart = TRUE; /* log the restart */ logit(NSLOG_PROCESS_INFO, TRUE, "PROGRAM_RESTART event encountered, restarting...\n"); break; case EVENT_CHECK_REAPER: log_debug_info(DEBUGL_EVENTS, 0, "** Check Result Reaper. Latency: %.3fs\n", latency); /* reap host and service check results */ reap_check_results(); break; case EVENT_ORPHAN_CHECK: log_debug_info(DEBUGL_EVENTS, 0, "** Orphaned Host and Service Check Event. Latency: %.3fs\n", latency); /* check for orphaned hosts and services */ if (check_orphaned_hosts == TRUE) check_for_orphaned_hosts(); if (check_orphaned_services == TRUE) check_for_orphaned_services(); break; case EVENT_RETENTION_SAVE: log_debug_info(DEBUGL_EVENTS, 0, "** Retention Data Save Event. Latency: %.3fs\n", latency); /* save state retention data */ save_state_information(TRUE); break; case EVENT_STATUS_SAVE: log_debug_info(DEBUGL_EVENTS, 0, "** Status Data Save Event. Latency: %.3fs\n", latency); /* save all status data (program, host, and service) */ update_all_status_data(); break; case EVENT_SCHEDULED_DOWNTIME: log_debug_info(DEBUGL_EVENTS, 0, "** Scheduled Downtime Event. Latency: %.3fs\n", latency); /* process scheduled downtime info */ if (event->event_data) { handle_scheduled_downtime_by_id(*(unsigned long *)event->event_data); free(event->event_data); event->event_data = NULL; } break; case EVENT_SFRESHNESS_CHECK: log_debug_info(DEBUGL_EVENTS, 0, "** Service Result Freshness Check Event. Latency: %.3fs\n", latency); /* check service result freshness */ check_service_result_freshness(); break; case EVENT_HFRESHNESS_CHECK: log_debug_info(DEBUGL_EVENTS, 0, "** Host Result Freshness Check Event. Latency: %.3fs\n", latency); /* check host result freshness */ check_host_result_freshness(); break; case EVENT_EXPIRE_DOWNTIME: log_debug_info(DEBUGL_EVENTS, 0, "** Expire Downtime Event. Latency: %.3fs\n", latency); /* check for expired scheduled downtime entries */ check_for_expired_downtime(); break; case EVENT_EXPIRE_COMMENT: log_debug_info(DEBUGL_EVENTS, 0, "** Expire Comment Event. Latency: %.3fs\n", latency); /* check for expired comment */ check_for_expired_comment((unsigned long)event->event_data); break; case EVENT_CHECK_PROGRAM_UPDATE: /* this doesn't do anything anymore */ break; case EVENT_USER_FUNCTION: log_debug_info(DEBUGL_EVENTS, 0, "** User Function Event. Latency: %.3fs\n", latency); /* run a user-defined function */ if (event->event_data != NULL) { userfunc = event->event_data; (*userfunc)(event->event_args); } break; default: break; } log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_timed_event() end\n"); return OK; }
int finish_job(child_process *cp, int reason) { static struct kvvec resp = KVVEC_INITIALIZER; struct rusage *ru = &cp->ei->rusage; int i, ret; /* get rid of still open filedescriptors */ if (cp->outstd.fd != -1) { gather_output(cp, &cp->outstd, 1); iobroker_close(iobs, cp->outstd.fd); } if (cp->outerr.fd != -1) { gather_output(cp, &cp->outerr, 1); iobroker_close(iobs, cp->outerr.fd); } /* Make sure network-supplied data doesn't contain nul bytes */ strip_nul_bytes(cp->outstd); strip_nul_bytes(cp->outerr); /* how many key/value pairs do we need? */ if (kvvec_init(&resp, 12 + cp->request->kv_pairs) == NULL) { /* what the hell do we do now? */ exit_worker(1, "Failed to init response key/value vector"); } gettimeofday(&cp->ei->stop, NULL); if (running_jobs != squeue_size(sq)) { wlog("running_jobs(%d) != squeue_size(sq) (%d)\n", running_jobs, squeue_size(sq)); wlog("started: %d; running: %d; finished: %d\n", started, running_jobs, started - running_jobs); } cp->ei->runtime = tv_delta_f(&cp->ei->start, &cp->ei->stop); /* * Now build the return message. * First comes the request, minus environment variables */ for (i = 0; i < cp->request->kv_pairs; i++) { struct key_value *kv = &cp->request->kv[i]; /* skip environment macros */ if (kv->key_len == 3 && !strcmp(kv->key, "env")) { continue; } kvvec_addkv_wlen(&resp, kv->key, kv->key_len, kv->value, kv->value_len); } kvvec_addkv(&resp, "wait_status", mkstr("%d", cp->ret)); kvvec_add_tv(&resp, "start", cp->ei->start); kvvec_add_tv(&resp, "stop", cp->ei->stop); kvvec_addkv(&resp, "runtime", mkstr("%f", cp->ei->runtime)); if (!reason) { /* child exited nicely (or with a signal, so check wait_status) */ kvvec_addkv(&resp, "exited_ok", "1"); kvvec_add_tv(&resp, "ru_utime", ru->ru_utime); kvvec_add_tv(&resp, "ru_stime", ru->ru_stime); kvvec_add_long(&resp, "ru_minflt", ru->ru_minflt); kvvec_add_long(&resp, "ru_majflt", ru->ru_majflt); kvvec_add_long(&resp, "ru_inblock", ru->ru_inblock); kvvec_add_long(&resp, "ru_oublock", ru->ru_oublock); } else { /* some error happened */ kvvec_addkv(&resp, "exited_ok", "0"); kvvec_addkv(&resp, "error_code", mkstr("%d", reason)); } kvvec_addkv_wlen(&resp, "outerr", 6, cp->outerr.buf, cp->outerr.len); kvvec_addkv_wlen(&resp, "outstd", 6, cp->outstd.buf, cp->outstd.len); ret = worker_send_kvvec(master_sd, &resp); if (ret < 0 && errno == EPIPE) exit_worker(1, "Failed to send kvvec struct to master"); return 0; }