static int print_input(int sd, int events, void *wp_) { int ret, pkt = 0; simple_worker *wp = (simple_worker *)wp_; struct kvvec kvv = KVVEC_INITIALIZER; char *buf; unsigned long tot_bytes = 0, size; /* * if some command filled the buffer, we grow it and read some * more until we hit the limit * @todo Define a limit :p */ size = iocache_size(wp->ioc); if (!iocache_capacity(wp->ioc)) { if (iocache_size(wp->ioc) < MAX_IOCACHE_SIZE) { /* double the size */ iocache_grow(wp->ioc, iocache_size(wp->ioc)); printf("Growing iocache for worker %d. sizes old/new %lu/%lu\n", wp->pid, size, iocache_size(wp->ioc)); } else { printf("iocache_size() for worker %d is already at max\n", wp->pid); } } ret = iocache_read(wp->ioc, sd); if (!ret) { printf("Worker with pid %d seems to have crashed. Exiting\n", wp->pid); exit(1); } if (ret < 0) { printf("iocache_read() from worker %d returned %d: %m\n", wp->pid, ret); return 0; } printf("read %d bytes from worker with pid %d::\n", ret, wp->pid); while ((buf = worker_ioc2msg(wp->ioc, &size, 0))) { int i, ret; tot_bytes += size; ret = worker_buf2kvvec_prealloc(&kvv, buf, (unsigned int)size, KVVEC_ASSIGN); if (!ret < 0) { printf("main: Failed to parse buffer of size %lu to key/value vector\n", size); continue; } for (i = 0; i < kvv.kv_pairs; i++) { struct key_value *kv = &kvv.kv[i]; if (!i && memcmp(kv->key, buf, kv->key_len)) { printf("### kv[0]->key doesn't match buf. error in kvvec?\n"); } printf("main: %2d.%02d: %s=%s\n", pkt, i, kv->key, kv->value); } pkt++; } printf("iocache: available: %lu; size: %lu; capacity: %lu\n", iocache_available(wp->ioc), iocache_size(wp->ioc), iocache_capacity(wp->ioc)); printf("Got %d packets in %ld bytes (ret: %d)\n", pkt, tot_bytes, ret); return 0; }
/*
 * Read as much data as we possibly can from the node so
 * that whatever parsing code there is can handle it later.
 * Everything that was read ends up in the node's own io-cache.
 *
 * Returns the number of bytes read (> 0) on success, 0 if the
 * read failed only because no data is available right now
 * (EAGAIN / EWOULDBLOCK), and -1 if the node is invalid or had
 * to be disconnected.
 *
 * The io-cache buffer must be allocated before we get
 * to this point, and if the caller wants to poll the
 * socket for input, it'll have to do so itself.
 */
int node_recv(merlin_node *node)
{
	int bytes_read, read_errno;
	iocache *ioc;

	if (!node || node->sock < 0) {
		return -1;
	}

	/* only touch the node once we know it isn't NULL */
	ioc = node->ioc;

	bytes_read = iocache_read(ioc, node->sock);
	/* stash errno right away; the logging calls below may clobber it */
	read_errno = errno;

	/*
	 * If we read something, update the stat counter
	 * and return. The caller will have to handle the
	 * input as it sees fit
	 */
	if (bytes_read > 0) {
		node->last_action = node->last_recv = time(NULL);
		node->stats.bytes.read += bytes_read;
		return bytes_read;
	}

	/*
	 * No real error, but no new data, so return 0. errno is only
	 * meaningful when the read actually failed; on a zero-read it
	 * would be a stale value from some earlier call and could make
	 * us mistake a disconnect for "try again later".
	 */
	if (bytes_read < 0 && (read_errno == EAGAIN || read_errno == EWOULDBLOCK)) {
		ldebug("No input available from %s node %s.", node_type(node), node->name);
		return 0;
	}

	/*
	 * Remote endpoint shut down, or we ran into some random error
	 * we can't handle any other way than disconnecting the node and
	 * letting the write machinery attempt to reconnect later
	 */
	if (bytes_read < 0) {
		lerr("Failed to read from socket %d into %p for %s node %s: %s",
		     node->sock, ioc, node_type(node), node->name, strerror(read_errno));
	}

	/* zero-read. We've been disconnected for some reason */
	ldebug("bytes_read: %d; errno: %d; strerror(%d): %s",
	       bytes_read, read_errno, read_errno, strerror(read_errno));

	node_disconnect(node, "recv() failed");

	return -1;
}
static int receive_command(int sd, int events, void *discard) { int ioc_ret; char *buf; unsigned long size; if (!ioc) { ioc = iocache_create(512 * 1024); } ioc_ret = iocache_read(ioc, sd); /* master closed the connection, so we exit */ if (ioc_ret == 0) { iobroker_close(iobs, sd); exit_worker(); } if (ioc_ret < 0) { /* XXX: handle this somehow */ } #if 0 /* debug-volley */ buf = iocache_use_size(ioc, ioc_ret); write(master_sd, buf, ioc_ret); return 0; #endif /* * now loop over all inbound messages in the iocache. * Since KV_TERMINATOR is a nul-byte, they're separated by 3 nuls */ while ((buf = iocache_use_delim(ioc, MSG_DELIM, MSG_DELIM_LEN_RECV, &size))) { struct kvvec *kvv; /* we must copy vars here, as we preserve them for the response */ kvv = buf2kvvec(buf, (unsigned int)size, KV_SEP, PAIR_SEP, KVVEC_COPY); if (kvv) spawn_job(kvv); } return 0; }
static int handle_worker_result(int sd, int events, void *arg) { wproc_object_job *oj = NULL; char *buf, *error_reason = NULL; unsigned long size; int ret; static struct kvvec kvv = KVVEC_INITIALIZER; struct wproc_worker *wp = (struct wproc_worker *)arg; if(iocache_capacity(wp->ioc) == 0) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_capacity() is 0 for worker %s.\n", wp->name); } ret = iocache_read(wp->ioc, wp->sd); if (ret < 0) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_read() from %s returned %d: %s\n", wp->name, ret, strerror(errno)); return 0; } else if (ret == 0) { logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: Socket to worker %s broken, removing", wp->name); wproc_num_workers_online--; iobroker_unregister(nagios_iobs, sd); if (workers.len <= 0) { /* there aren't global workers left, we can't run any more checks * we should try respawning a few of the standard ones */ logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: All our workers are dead, we can't do anything!"); } remove_worker(wp); fanout_destroy(wp->jobs, fo_reassign_wproc_job); wp->jobs = NULL; wproc_destroy(wp, 0); return 0; } while ((buf = worker_ioc2msg(wp->ioc, &size, 0))) { struct wproc_job *job; wproc_result wpres; /* log messages are handled first */ if (size > 5 && !memcmp(buf, "log=", 4)) { logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: %s: %s\n", wp->name, buf + 4); continue; } /* for everything else we need to actually parse */ if (buf2kvvec_prealloc(&kvv, buf, size, '=', '\0', KVVEC_ASSIGN) <= 0) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: Failed to parse key/value vector from worker response with len %lu. First kv=%s", size, buf ? 
buf : "(NULL)"); continue; } memset(&wpres, 0, sizeof(wpres)); wpres.job_id = -1; wpres.type = -1; wpres.response = &kvv; parse_worker_result(&wpres, &kvv); job = get_job(wp, wpres.job_id); if (!job) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: Job with id '%d' doesn't exist on %s.\n", wpres.job_id, wp->name); continue; } if (wpres.type != job->type) { logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: %s claims job %d is type %d, but we think it's type %d\n", wp->name, job->id, wpres.type, job->type); break; } oj = (wproc_object_job *)job->arg; /* * ETIME ("Timer expired") doesn't really happen * on any modern systems, so we reuse it to mean * "program timed out" */ if (wpres.error_code == ETIME) { wpres.early_timeout = TRUE; } if (wpres.early_timeout) { asprintf(&error_reason, "timed out after %.2fs", tv_delta_f(&wpres.start, &wpres.stop)); } else if (WIFSIGNALED(wpres.wait_status)) { asprintf(&error_reason, "died by signal %d%s after %.2f seconds", WTERMSIG(wpres.wait_status), WCOREDUMP(wpres.wait_status) ? " (core dumped)" : "", tv_delta_f(&wpres.start, &wpres.stop)); } else if (job->type != WPJOB_CHECK && WEXITSTATUS(wpres.wait_status) != 0) { asprintf(&error_reason, "is a non-check helper but exited with return code %d", WEXITSTATUS(wpres.wait_status)); } if (error_reason) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: %s job %d from worker %s %s", wpjob_type_name(job->type), job->id, wp->name, error_reason); logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: command: %s\n", job->command); if (job->type != WPJOB_CHECK && oj) { logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: host=%s; service=%s; contact=%s\n", oj->host_name ? oj->host_name : "(none)", oj->service_description ? oj->service_description : "(none)", oj->contact_name ? 
oj->contact_name : "(none)"); } else if (oj) { struct check_result *cr = (struct check_result *)job->arg; logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: host=%s; service=%s;\n", cr->host_name, cr->service_description); } logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: early_timeout=%d; exited_ok=%d; wait_status=%d; error_code=%d;\n", wpres.early_timeout, wpres.exited_ok, wpres.wait_status, wpres.error_code); wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc: stderr", wpres.outerr); wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc: stdout", wpres.outstd); } my_free(error_reason); switch (job->type) { case WPJOB_CHECK: ret = handle_worker_check(&wpres, wp, job); break; case WPJOB_NOTIFY: if (wpres.early_timeout) { if (oj->service_description) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of service '%s' on host '%s' by command '%s' timed out after %.2f seconds\n", oj->contact_name, oj->service_description, oj->host_name, job->command, tv2float(&wpres.runtime)); } else { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of host '%s' by command '%s' timed out after %.2f seconds\n", oj->contact_name, oj->host_name, job->command, tv2float(&wpres.runtime)); } } break; case WPJOB_OCSP: if (wpres.early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCSP command '%s' for service '%s' on host '%s' timed out after %.2f seconds\n", job->command, oj->service_description, oj->host_name, tv2float(&wpres.runtime)); } break; case WPJOB_OCHP: if (wpres.early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCHP command '%s' for host '%s' timed out after %.2f seconds\n", job->command, oj->host_name, tv2float(&wpres.runtime)); } break; case WPJOB_GLOBAL_SVC_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Global service event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_SVC_EVTHANDLER: if 
(wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Service event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_GLOBAL_HOST_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Global host event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_HOST_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Host event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_CALLBACK: run_job_callback(job, &wpres, 0); break; default: logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d: Unknown jobtype: %d\n", wp->pid, job->type); break; } destroy_job(job); } return 0; }
static int handle_worker_result(int sd, int events, void *arg) { worker_process *wp = (worker_process *)arg; wproc_object_job *oj; char *buf; unsigned long size; int ret; static struct kvvec kvv = KVVEC_INITIALIZER; ret = iocache_read(wp->ioc, wp->sd); if (ret < 0) { logit(NSLOG_RUNTIME_WARNING, TRUE, "iocache_read() from worker %d returned %d: %s\n", wp->pid, ret, strerror(errno)); return 0; } else if (ret == 0) { /* * XXX FIXME worker exited. spawn a new on to replace it * and distribute all unfinished jobs from this one to others */ return 0; } while ((buf = iocache_use_delim(wp->ioc, MSG_DELIM, MSG_DELIM_LEN, &size))) { int job_id = -1; worker_job *job; wproc_result wpres; /* log messages are handled first */ if (size > 5 && !memcmp(buf, "log=", 4)) { logit(NSLOG_INFO_MESSAGE, TRUE, "worker %d: %s\n", wp->pid, buf + 4); continue; } /* for everything else we need to actually parse */ if (buf2kvvec_prealloc(&kvv, buf, size, '=', '\0', KVVEC_ASSIGN) <= 0) { /* XXX FIXME log an error */ continue; } memset(&wpres, 0, sizeof(wpres)); wpres.job_id = -1; wpres.type = -1; wpres.response = &kvv; parse_worker_result(&wpres, &kvv); job = get_job(wp, wpres.job_id); if (!job) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker job with id '%d' doesn't exist on worker %d.\n", job_id, wp->pid); continue; } if (wpres.type != job->type) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d claims job %d is type %d, but we think it's type %d\n", wp->pid, job->id, wpres.type, job->type); break; } oj = (wproc_object_job *)job->arg; /* * ETIME ("Timer expired") doesn't really happen * on any modern systems, so we reuse it to mean * "program timed out" */ if (wpres.error_code == ETIME) { wpres.early_timeout = TRUE; } switch (job->type) { case WPJOB_CHECK: ret = handle_worker_check(&wpres, wp, job); break; case WPJOB_NOTIFY: if (wpres.early_timeout) { if (oj->service_description) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of service '%s' on host '%s' by command '%s' 
timed out after %.2f seconds\n", oj->contact_name, oj->service_description, oj->host_name, job->command, tv2float(&wpres.runtime)); } else { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of host '%s' by command '%s' timed out after %.2f seconds\n", oj->contact_name, oj->host_name, job->command, tv2float(&wpres.runtime)); } } break; case WPJOB_OCSP: if (wpres.early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCSP command '%s' for service '%s' on host '%s' timed out after %.2f seconds\n", job->command, oj->service_description, oj->host_name, tv2float(&wpres.runtime)); } break; case WPJOB_OCHP: if (wpres.early_timeout) { logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCHP command '%s' for host '%s' timed out after %.2f seconds\n", job->command, oj->host_name, tv2float(&wpres.runtime)); } break; case WPJOB_GLOBAL_SVC_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Global service event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_SVC_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Service event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_GLOBAL_HOST_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Global host event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; case WPJOB_HOST_EVTHANDLER: if (wpres.early_timeout) { logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE, "Warning: Host event handler command '%s' timed out after %.2f seconds\n", job->command, tv2float(&wpres.runtime)); } break; default: logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d: Unknown jobtype: %d\n", wp->pid, job->type); break; } destroy_job(wp, job); } return 0; }