int iobroker_close(iobroker_set *iobs, int fd) { int result; result = iobroker_unregister(iobs, fd); if (fd >= 0) /* make sure valgrind shuts up */ (void)close(fd); return result; }
/* * This gets called when a connect() attempt has become writable. * It's entirely possible that the node we're trying to connect * to has connected to us while we were waiting for them, in * which case we need to figure out which of the two connections * we're supposed to use. */ static int conn_writable(int sd, int events, void *node_) { merlin_node *node = (merlin_node *)node_; int result; int sel_sd; /* unregister so we don't peg one cpu at 100% */ ldebug("CONN: In conn_writable(): node=%s; sd=%d; node->conn_sock=%d", node->name, sd, node->conn_sock); iobroker_unregister(nagios_iobs, sd); if (node->sock < 0) { /* no inbound connection accept()'ed yet */ node->sock = sd; node->conn_sock = -1; if (!net_is_connected(node)) { node_disconnect(node, "Connection attempt failed: %s", strerror(errno)); close(sd); return 0; } iobroker_register(nagios_iobs, sd, node, net_input); node_set_state(node, STATE_NEGOTIATING, "Connect completed successfully. Negotiating protocol"); return 0; } sel_sd = net_negotiate_socket(node, node->conn_sock, node->sock); if (sel_sd < 0) { node_disconnect(node, "Failed to negotiate socket"); return 0; } if (sel_sd == node->conn_sock) { iobroker_close(nagios_iobs, node->sock); } else if (sel_sd == node->sock) { iobroker_close(nagios_iobs, node->conn_sock); } node->sock = sel_sd; node->conn_sock = -1; node_set_state(node, STATE_NEGOTIATING, "polled for writability"); /* now re-register for input */ ldebug("IOB: registering %s(%d) for input events", node->name, node->sock); result = iobroker_register(nagios_iobs, node->sock, node, net_input); if (result < 0) { lerr("IOB: Failed to register %s(%d) for input events: %s", node->name, node->sock, iobroker_strerror(result)); } return 0; }
/* a service for registering workers */ static int register_worker(int sd, char *buf, unsigned int len) { int i, is_global = 1; struct kvvec *info; struct wproc_worker *worker; logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: Registry request: %s\n", buf); if (!(worker = calloc(1, sizeof(*worker)))) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: Failed to allocate worker: %s\n", strerror(errno)); return 500; } info = buf2kvvec(buf, len, '=', ';', 0); if (info == NULL) { free(worker); logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: Failed to parse registration request\n"); return 500; } worker->sd = sd; worker->ioc = iocache_create(1 * 1024 * 1024); iobroker_unregister(nagios_iobs, sd); iobroker_register(nagios_iobs, sd, worker, handle_worker_result); for(i = 0; i < info->kv_pairs; i++) { struct key_value *kv = &info->kv[i]; if (!strcmp(kv->key, "name")) { worker->name = strdup(kv->value); } else if (!strcmp(kv->key, "pid")) { worker->pid = atoi(kv->value); } else if (!strcmp(kv->key, "max_jobs")) { worker->max_jobs = atoi(kv->value); } else if (!strcmp(kv->key, "plugin")) { struct wproc_list *command_handlers; is_global = 0; if (!(command_handlers = dkhash_get(specialized_workers, kv->value, NULL))) { command_handlers = calloc(1, sizeof(struct wproc_list)); command_handlers->wps = calloc(1, sizeof(struct wproc_worker**)); command_handlers->len = 1; command_handlers->wps[0] = worker; dkhash_insert(specialized_workers, strdup(kv->value), NULL, command_handlers); } else { command_handlers->len++; command_handlers->wps = realloc(command_handlers->wps, command_handlers->len * sizeof(struct wproc_worker**)); command_handlers->wps[command_handlers->len - 1] = worker; } worker->wp_list = command_handlers; } } if (!worker->max_jobs) { /* * each worker uses two filedescriptors per job, one to * connect to the master and about 13 to handle libraries * and memory allocation, so this guesstimate shouldn't * be too far off (for local workers, at least). */ worker->max_jobs = (iobroker_max_usable_fds() / 2) - 50; } worker->jobs = fanout_create(worker->max_jobs); if (is_global) { workers.len++; workers.wps = realloc(workers.wps, workers.len * sizeof(struct wproc_worker *)); workers.wps[workers.len - 1] = worker; worker->wp_list = &workers; } wproc_num_workers_online++; kvvec_destroy(info, 0); nsock_printf_nul(sd, "OK"); /* signal query handler to release its iocache for this one */ return QH_TAKEOVER; }
static int handle_worker_result(int sd, int events, void *arg) { wproc_object_job *oj = NULL; char *buf, *error_reason = NULL; unsigned long size; int ret; static struct kvvec kvv = KVVEC_INITIALIZER; struct wproc_worker *wp = (struct wproc_worker *)arg; if(iocache_capacity(wp->ioc) == 0) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_capacity() is 0 for worker %s.\n", wp->name); } ret = iocache_read(wp->ioc, wp->sd); if (ret < 0) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_read() from %s returned %d: %s\n", wp->name, ret, strerror(errno)); return 0; } else if (ret == 0) { logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: Socket to worker %s broken, removing", wp->name); wproc_num_workers_online--; iobroker_unregister(nagios_iobs, sd); if (workers.len <= 0) { /* there aren't global workers left, we can't run any more checks * we should try respawning a few of the standard ones */ logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: All our workers are dead, we can't do anything!"); } remove_worker(wp); fanout_destroy(wp->jobs, fo_reassign_wproc_job); wp->jobs = NULL; wproc_destroy(wp, 0); return 0; } while ((buf = worker_ioc2msg(wp->ioc, &size, 0))) { struct wproc_job *job; wproc_result wpres; /* log messages are handled first */ if (size > 5 && !memcmp(buf, "log=", 4)) { logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: %s: %s\n", wp->name, buf + 4); continue; } /* for everything else we need to actually parse */ if (buf2kvvec_prealloc(&kvv, buf, size, '=', '\0', KVVEC_ASSIGN) <= 0) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: Failed to parse key/value vector from worker response with len %lu. First kv=%s", size, buf ? buf : "(NULL)"); continue; } memset(&wpres, 0, sizeof(wpres)); wpres.job_id = -1; wpres.type = -1; wpres.response = &kvv; parse_worker_result(&wpres, &kvv); job = get_job(wp, wpres.job_id); if (!job) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: Job with id '%d' doesn't exist on %s.\n", wpres.job_id, wp->name); continue; } if (wpres.type != job->type) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: %s claims job %d is type %d, but we think it's type %d\n", wp->name, job->id, wpres.type, job->type); break; } oj = (wproc_object_job *)job->arg; /* * ETIME ("Timer expired") doesn't really happen * on any modern systems, so we reuse it to mean * "program timed out" */ if (wpres.error_code == ETIME) { wpres.early_timeout = TRUE; } if (wpres.early_timeout) { asprintf(&error_reason, "timed out after %.2fs", tv_delta_f(&wpres.start, &wpres.stop)); } else if (WIFSIGNALED(wpres.wait_status)) { asprintf(&error_reason, "died by signal %d%s after %.2f seconds", WTERMSIG(wpres.wait_status), WCOREDUMP(wpres.wait_status) ? " (core dumped)" : "", tv_delta_f(&wpres.start, &wpres.stop)); } else if (job->type != WPJOB_CHECK && WEXITSTATUS(wpres.wait_status) != 0) { asprintf(&error_reason, "is a non-check helper but exited with return code %d", WEXITSTATUS(wpres.wait_status)); } if (error_reason) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: %s job %d from worker %s %s", wpjob_type_name(job->type), job->id, wp->name, error_reason); logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: command: %s\n", job->command); if (job->type != WPJOB_CHECK && oj) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: host=%s; service=%s; contact=%s\n", oj->host_name ? oj->host_name : "(none)", oj->service_description ? oj->service_description : "(none)", oj->contact_name ? oj->contact_name : "(none)"); } else if (oj) { struct check_result *cr = (struct check_result *)job->arg; logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: host=%s; service=%s;\n", cr->host_name, cr->service_description); } logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: early_timeout=%d; exited_ok=%d; wait_status=%d; error_code=%d;\n", wpres.early_timeout, wpres.exited_ok, wpres.wait_status, wpres.error_code); wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc: stderr", wpres.outerr); wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc: stdout", wpres.outstd); } my_free(error_reason); switch (job->type) { case WPJOB_CHECK: ret = handle_worker_check(&wpres, wp, job); break; case WPJOB_NOTIFY: if (wpres.early_timeout) { if (oj->service_description) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of service '%s' on host '%s' by command '%s' timed out after %.2f seconds\n", oj->contact_name, oj->service_description, oj->host_name, job->command, tv2float(&wpres.runtime)); } else { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of host '%s' by command '%s' timed out after %.2f seconds\n", oj->contact_name, oj->host_name, job->command, tv2float(&wpres.runtime)); } } break; case WPJOB_OCSP: if (wpres.early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCSP command '%s' for service '%s' on host '%s' timed out after %.2f seconds\n", job->command, oj->service_description, oj->host_name, tv2float(&wpres.runtime)); } break; case WPJOB_OCHP: if (wpres.early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCHP command '%s' for host '%s' timed out after %.2f seconds\n", job->command, oj->host_name, tv2float(&wpres.runtime)); } break; case WPJOB_GLOBAL_SVC_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Global service event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_SVC_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Service event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_GLOBAL_HOST_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Global host event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_HOST_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Host event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_CALLBACK: run_job_callback(job, &wpres, 0); break; default: logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d: Unknown jobtype: %d\n", wp->pid, job->type); break; } destroy_job(job); } return 0; }
int iobroker_deregister(iobroker_set *iobs, int fd) { return iobroker_unregister(iobs, fd); }