예제 #1
0
파일: wproc.c 프로젝트: rlugojr/nagioscore
/*
 * I/O callback for a worker socket: pulls pending data into the worker's
 * iocache and prints every complete message as a list of key=value pairs.
 *
 * sd     - descriptor to read from
 * events - not used by this function
 * wp_    - the simple_worker whose iocache receives the data
 *
 * Always returns 0. Calls exit(1) when the read returns 0, which is
 * reported as the worker having crashed.
 */
static int print_input(int sd, int events, void *wp_)
{
	int ret, pkt = 0;
	simple_worker *wp = (simple_worker *)wp_;
	struct kvvec kvv = KVVEC_INITIALIZER;
	char *buf;
	unsigned long tot_bytes = 0, size;

	/*
	 * if some command filled the buffer, we grow it and read some
	 * more until we hit the limit
	 */
	size = iocache_size(wp->ioc);
	if (!iocache_capacity(wp->ioc)) {
		if (iocache_size(wp->ioc) < MAX_IOCACHE_SIZE) {
			/* double the size */
			iocache_grow(wp->ioc, iocache_size(wp->ioc));
			printf("Growing iocache for worker %d. sizes old/new %lu/%lu\n",
				   wp->pid, size, iocache_size(wp->ioc));
		} else {
			printf("iocache_size() for worker %d is already at max\n", wp->pid);
		}
	}

	ret = iocache_read(wp->ioc, sd);
	if (!ret) {
		printf("Worker with pid %d seems to have crashed. Exiting\n", wp->pid);
		exit(1);
	}
	if (ret < 0) {
		printf("iocache_read() from worker %d returned %d: %m\n", wp->pid, ret);
		return 0;
	}
	printf("read %d bytes from worker with pid %d::\n", ret, wp->pid);
	while ((buf = worker_ioc2msg(wp->ioc, &size, 0))) {
		/* named differently from 'ret' so the read result survives for the summary */
		int i, parsed;

		tot_bytes += size;
		parsed = worker_buf2kvvec_prealloc(&kvv, buf, (unsigned int)size, KVVEC_ASSIGN);
		/*
		 * The previous test was "!ret < 0", which is "(!ret) < 0" and
		 * therefore never true; a failed parse fell through and printed
		 * whatever the previous message had left in kvv.
		 */
		if (parsed < 0) {
			printf("main: Failed to parse buffer of size %lu to key/value vector\n", size);
			continue;
		}
		for (i = 0; i < kvv.kv_pairs; i++) {
			struct key_value *kv = &kvv.kv[i];
			/* sanity check: the first key should sit at the very start of the message */
			if (!i && memcmp(kv->key, buf, kv->key_len)) {
				printf("### kv[0]->key doesn't match buf. error in kvvec?\n");
			}
			printf("main: %2d.%02d: %s=%s\n", pkt, i, kv->key, kv->value);
		}
		pkt++;
	}

	printf("iocache: available: %lu; size: %lu; capacity: %lu\n",
		   iocache_available(wp->ioc), iocache_size(wp->ioc), iocache_capacity(wp->ioc));
	/* tot_bytes is unsigned long, so %lu rather than %ld */
	printf("Got %d packets in %lu bytes (ret: %d)\n", pkt, tot_bytes, ret);

	return 0;
}
예제 #2
0
파일: node.c 프로젝트: ageric/merlin
/*
 * Read as much data as we possibly can from the node so
 * that whatever parsing code there is can handle it later.
 * The data ends up in the node's own iocache (node->ioc).
 *
 * Returns the number of bytes read, 0 when the read failed
 * with EAGAIN/EWOULDBLOCK (no data right now), or -1 on
 * errors. On a zero-byte read or any other read error the
 * node is disconnected before -1 is returned.
 *
 * The io-cache buffer must be allocated before we get
 * to this point, and if the caller wants to poll the
 * socket for input, it'll have to do so itself.
 */
int node_recv(merlin_node *node)
{
	int bytes_read, saved_errno;
	iocache *ioc;

	/* validate before touching node; node->ioc used to be read ahead of this check */
	if (!node || node->sock < 0) {
		return -1;
	}
	ioc = node->ioc;

	bytes_read = iocache_read(ioc, node->sock);
	/* grab errno right away; the logging calls below may overwrite it */
	saved_errno = errno;

	/*
	 * If we read something, update the stat counter
	 * and return. The caller will have to handle the
	 * input as it sees fit
	 */
	if (bytes_read > 0) {
		node->last_action = node->last_recv = time(NULL);
		node->stats.bytes.read += bytes_read;
		return bytes_read;
	}

	/*
	 * no real error, but no new data, so return 0.
	 * errno is only meaningful when the read actually failed; on a
	 * zero-byte read it holds a stale value from some earlier call
	 * and must not be allowed to mask a disconnect.
	 */
	if (bytes_read < 0 && (saved_errno == EAGAIN || saved_errno == EWOULDBLOCK)) {
		ldebug("No input available from %s node %s.", node_type(node), node->name);
		return 0;
	}

	/*
	 * Remote endpoint shut down, or we ran into some random error
	 * we can't handle any other way than disconnecting the node and
	 * letting the write machinery attempt to reconnect later
	 */
	if (bytes_read < 0) {
		lerr("Failed to read from socket %d into %p for %s node %s: %s",
		     node->sock, (void *)ioc, node_type(node), node->name, strerror(saved_errno));
	}

	/* zero-read or hard error. Either way we drop the connection */
	ldebug("bytes_read: %d; errno: %d; strerror(%d): %s",
		   bytes_read, saved_errno, saved_errno, strerror(saved_errno));
	node_disconnect(node, "recv() failed");
	return -1;
}
예제 #3
0
파일: worker.c 프로젝트: formorer/nagios
/*
 * I/O callback for the worker's command socket: reads pending data from
 * sd into the file-level iocache 'ioc' (created on first use) and spawns
 * one job per complete message found in it.
 *
 * sd      - descriptor to read commands from
 * events  - not used by this function
 * discard - not used by this function
 *
 * Always returns 0. When the read returns 0 the socket is closed and
 * exit_worker() is called.
 */
static int receive_command(int sd, int events, void *discard)
{
	int ioc_ret;
	char *buf;
	unsigned long size;

	if (!ioc) {
		ioc = iocache_create(512 * 1024);
		if (!ioc) {
			/*
			 * Nothing to buffer into. Nothing has been read, so the
			 * data stays on the socket and creation is retried the
			 * next time this callback runs.
			 */
			return 0;
		}
	}
	ioc_ret = iocache_read(ioc, sd);

	/* master closed the connection, so we exit */
	if (ioc_ret == 0) {
		iobroker_close(iobs, sd);
		exit_worker();
	}
	if (ioc_ret < 0) {
		/*
		 * The read added nothing to the cache, so there is no new
		 * message to look for; bail out instead of silently carrying on.
		 */
		return 0;
	}

	/*
	 * now loop over all inbound messages in the iocache.
	 * Since KV_TERMINATOR is a nul-byte, they're separated by 3 nuls
	 */
	while ((buf = iocache_use_delim(ioc, MSG_DELIM, MSG_DELIM_LEN_RECV, &size))) {
		struct kvvec *kvv;
		/* we must copy vars here, as we preserve them for the response */
		kvv = buf2kvvec(buf, (unsigned int)size, KV_SEP, PAIR_SEP, KVVEC_COPY);
		if (kvv) {
			spawn_job(kvv);
		}
	}

	return 0;
}
예제 #4
0
/*
 * I/O callback for a worker process socket: reads the worker's output
 * into its iocache and handles every complete message found there.
 *
 * Messages starting with "log=" are written to the log. Anything else is
 * parsed into the function-static kvvec, matched against the worker's
 * job list by job id, reported if the job failed, dispatched on the job
 * type and finally destroyed.
 *
 * sd     - descriptor that gets unregistered from nagios_iobs when the
 *          read returns 0 (the read itself uses wp->sd)
 * events - not used by this function
 * arg    - the struct wproc_worker the socket belongs to
 *
 * Always returns 0.
 */
static int handle_worker_result(int sd, int events, void *arg)
{
	wproc_object_job *oj = NULL;
	char *buf, *error_reason = NULL;
	unsigned long size;
	int ret;
	static struct kvvec kvv = KVVEC_INITIALIZER;
	struct wproc_worker *wp = (struct wproc_worker *)arg;

	if(iocache_capacity(wp->ioc) == 0) {
		logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_capacity() is 0 for worker %s.\n", wp->name);
	}

	ret = iocache_read(wp->ioc, wp->sd);

	if (ret < 0) {
		logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_read() from %s returned %d: %s\n",
			  wp->name, ret, strerror(errno));
		return 0;
	} else if (ret == 0) {
		/* zero-byte read: the worker is gone, so tear it down and hand its jobs on */
		logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: Socket to worker %s broken, removing", wp->name);
		wproc_num_workers_online--;
		iobroker_unregister(nagios_iobs, sd);
		if (workers.len <= 0) {
			/* there aren't global workers left, we can't run any more checks
			 * we should try respawning a few of the standard ones
			 */
			logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: All our workers are dead, we can't do anything!");
		}
		remove_worker(wp);
		fanout_destroy(wp->jobs, fo_reassign_wproc_job);
		wp->jobs = NULL;
		wproc_destroy(wp, 0);
		return 0;
	}
	while ((buf = worker_ioc2msg(wp->ioc, &size, 0))) {
		struct wproc_job *job;
		wproc_result wpres;

		/* log messages are handled first */
		if (size > 5 && !memcmp(buf, "log=", 4)) {
			logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: %s: %s\n", wp->name, buf + 4);
			continue;
		}

		/* for everything else we need to actually parse */
		if (buf2kvvec_prealloc(&kvv, buf, size, '=', '\0', KVVEC_ASSIGN) <= 0) {
			logit(NSLOG_RUNTIME_ERROR, TRUE,
				  "wproc: Failed to parse key/value vector from worker response with len %lu. First kv=%s",
				  size, buf ? buf : "(NULL)");
			continue;
		}

		memset(&wpres, 0, sizeof(wpres));
		wpres.job_id = -1;
		wpres.type = -1;
		wpres.response = &kvv;
		parse_worker_result(&wpres, &kvv);

		job = get_job(wp, wpres.job_id);
		if (!job) {
			logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: Job with id '%d' doesn't exist on %s.\n",
				  wpres.job_id, wp->name);
			continue;
		}
		if (wpres.type != job->type) {
			logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: %s claims job %d is type %d, but we think it's type %d\n",
				  wp->name, job->id, wpres.type, job->type);
			break;
		}
		oj = (wproc_object_job *)job->arg;

		/*
		 * ETIME ("Timer expired") doesn't really happen
		 * on any modern systems, so we reuse it to mean
		 * "program timed out"
		 */
		if (wpres.error_code == ETIME) {
			wpres.early_timeout = TRUE;
		}

		/*
		 * Build a human-readable failure reason, if there is one.
		 * asprintf() leaves its output pointer undefined when it fails,
		 * so reset it to NULL in that case; otherwise the checks and
		 * the free below would operate on a garbage pointer.
		 */
		if (wpres.early_timeout) {
			if (asprintf(&error_reason, "timed out after %.2fs", tv_delta_f(&wpres.start, &wpres.stop)) < 0) {
				error_reason = NULL;
			}
		}
		else if (WIFSIGNALED(wpres.wait_status)) {
			if (asprintf(&error_reason, "died by signal %d%s after %.2f seconds",
			             WTERMSIG(wpres.wait_status),
			             WCOREDUMP(wpres.wait_status) ? " (core dumped)" : "",
			             tv_delta_f(&wpres.start, &wpres.stop)) < 0) {
				error_reason = NULL;
			}
		}
		else if (job->type != WPJOB_CHECK && WEXITSTATUS(wpres.wait_status) != 0) {
			if (asprintf(&error_reason, "is a non-check helper but exited with return code %d",
			             WEXITSTATUS(wpres.wait_status)) < 0) {
				error_reason = NULL;
			}
		}
		if (error_reason) {
			logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: %s job %d from worker %s %s",
			      wpjob_type_name(job->type), job->id, wp->name, error_reason);
			logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   command: %s\n", job->command);
			if (job->type != WPJOB_CHECK && oj) {
				logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   host=%s; service=%s; contact=%s\n",
				      oj->host_name ? oj->host_name : "(none)",
				      oj->service_description ? oj->service_description : "(none)",
				      oj->contact_name ? oj->contact_name : "(none)");
			} else if (oj) {
				/* for check jobs the job argument is read as a check_result instead */
				struct check_result *cr = (struct check_result *)job->arg;
				logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   host=%s; service=%s;\n",
				      cr->host_name, cr->service_description);
			}
			logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   early_timeout=%d; exited_ok=%d; wait_status=%d; error_code=%d;\n",
			      wpres.early_timeout, wpres.exited_ok, wpres.wait_status, wpres.error_code);
			wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   stderr", wpres.outerr);
			wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   stdout", wpres.outstd);
		}
		my_free(error_reason);

		switch (job->type) {
		case WPJOB_CHECK:
			ret = handle_worker_check(&wpres, wp, job);
			break;
		case WPJOB_NOTIFY:
			if (wpres.early_timeout) {
				if (oj->service_description) {
					logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of service '%s' on host '%s' by command '%s' timed out after %.2f seconds\n",
						  oj->contact_name, oj->service_description,
						  oj->host_name, job->command,
						  tv2float(&wpres.runtime));
				} else {
					logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of host '%s' by command '%s' timed out after %.2f seconds\n",
						  oj->contact_name, oj->host_name,
						  job->command, tv2float(&wpres.runtime));
				}
			}
			break;
		case WPJOB_OCSP:
			if (wpres.early_timeout) {
				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCSP command '%s' for service '%s' on host '%s' timed out after %.2f seconds\n",
					  job->command, oj->service_description, oj->host_name,
					  tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_OCHP:
			if (wpres.early_timeout) {
				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCHP command '%s' for host '%s' timed out after %.2f seconds\n",
					  job->command, oj->host_name, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_GLOBAL_SVC_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Global service event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_SVC_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Service event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_GLOBAL_HOST_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Global host event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_HOST_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Host event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;

		case WPJOB_CALLBACK:
			run_job_callback(job, &wpres, 0);
			break;

		default:
			logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d: Unknown jobtype: %d\n", wp->pid, job->type);
			break;
		}
		destroy_job(job);
	}

	return 0;
}
예제 #5
0
파일: workers.c 프로젝트: atj/nagios
static int handle_worker_result(int sd, int events, void *arg)
{
	worker_process *wp = (worker_process *)arg;
	wproc_object_job *oj;
	char *buf;
	unsigned long size;
	int ret;
	static struct kvvec kvv = KVVEC_INITIALIZER;

	ret = iocache_read(wp->ioc, wp->sd);

	if (ret < 0) {
		logit(NSLOG_RUNTIME_WARNING, TRUE, "iocache_read() from worker %d returned %d: %s\n",
			  wp->pid, ret, strerror(errno));
		return 0;
	} else if (ret == 0) {
		/*
		 * XXX FIXME worker exited. spawn a new on to replace it
		 * and distribute all unfinished jobs from this one to others
		 */
		return 0;
	}

	while ((buf = iocache_use_delim(wp->ioc, MSG_DELIM, MSG_DELIM_LEN, &size))) {
		int job_id = -1;
		worker_job *job;
		wproc_result wpres;

		/* log messages are handled first */
		if (size > 5 && !memcmp(buf, "log=", 4)) {
			logit(NSLOG_INFO_MESSAGE, TRUE, "worker %d: %s\n", wp->pid, buf + 4);
			continue;
		}

		/* for everything else we need to actually parse */
		if (buf2kvvec_prealloc(&kvv, buf, size, '=', '\0', KVVEC_ASSIGN) <= 0) {
			/* XXX FIXME log an error */
			continue;
		}

		memset(&wpres, 0, sizeof(wpres));
		wpres.job_id = -1;
		wpres.type = -1;
		wpres.response = &kvv;
		parse_worker_result(&wpres, &kvv);

		job = get_job(wp, wpres.job_id);
		if (!job) {
			logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker job with id '%d' doesn't exist on worker %d.\n",
				  job_id, wp->pid);
			continue;
		}
		if (wpres.type != job->type) {
			logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d claims job %d is type %d, but we think it's type %d\n",
				  wp->pid, job->id, wpres.type, job->type);
			break;
		}
		oj = (wproc_object_job *)job->arg;

		/*
		 * ETIME ("Timer expired") doesn't really happen
		 * on any modern systems, so we reuse it to mean
		 * "program timed out"
		 */
		if (wpres.error_code == ETIME) {
			wpres.early_timeout = TRUE;
		}
		switch (job->type) {
		case WPJOB_CHECK:
			ret = handle_worker_check(&wpres, wp, job);
			break;
		case WPJOB_NOTIFY:
			if (wpres.early_timeout) {
				if (oj->service_description) {
					logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of service '%s' on host '%s' by command '%s' timed out after %.2f seconds\n",
						  oj->contact_name, oj->service_description,
						  oj->host_name, job->command,
						  tv2float(&wpres.runtime));
				} else {
					logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of host '%s' by command '%s' timed out after %.2f seconds\n",
						  oj->contact_name, oj->host_name,
						  job->command, tv2float(&wpres.runtime));
				}
			}
			break;
		case WPJOB_OCSP:
			if (wpres.early_timeout) {
				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCSP command '%s' for service '%s' on host '%s' timed out after %.2f seconds\n",
					  job->command, oj->service_description, oj->host_name,
					  tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_OCHP:
			if (wpres.early_timeout) {
				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCHP command '%s' for host '%s' timed out after %.2f seconds\n",
					  job->command, oj->host_name, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_GLOBAL_SVC_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Global service event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_SVC_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Service event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_GLOBAL_HOST_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Global host event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_HOST_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Host event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;

		default:
			logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d: Unknown jobtype: %d\n", wp->pid, job->type);
			break;
		}
		destroy_job(wp, job);
	}

	return 0;
}