Esempio n. 1
0
static int handle_worker_result(int sd, int events, void *arg)
{
	wproc_object_job *oj = NULL;
	char *buf, *error_reason = NULL;
	unsigned long size;
	int ret;
	static struct kvvec kvv = KVVEC_INITIALIZER;
	struct wproc_worker *wp = (struct wproc_worker *)arg;

	if(iocache_capacity(wp->ioc) == 0) {
		logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_capacity() is 0 for worker %s.\n", wp->name);
	}

	ret = iocache_read(wp->ioc, wp->sd);

	if (ret < 0) {
		logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: iocache_read() from %s returned %d: %s\n",
			  wp->name, ret, strerror(errno));
		return 0;
	} else if (ret == 0) {
		logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: Socket to worker %s broken, removing", wp->name);
		wproc_num_workers_online--;
		iobroker_unregister(nagios_iobs, sd);
		if (workers.len <= 0) {
			/* there aren't global workers left, we can't run any more checks
			 * we should try respawning a few of the standard ones
			 */
			logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: All our workers are dead, we can't do anything!");
		}
		remove_worker(wp);
		fanout_destroy(wp->jobs, fo_reassign_wproc_job);
		wp->jobs = NULL;
		wproc_destroy(wp, 0);
		return 0;
	}
	while ((buf = worker_ioc2msg(wp->ioc, &size, 0))) {
		struct wproc_job *job;
		wproc_result wpres;

		/* log messages are handled first */
		if (size > 5 && !memcmp(buf, "log=", 4)) {
			logit(NSLOG_INFO_MESSAGE, TRUE, "wproc: %s: %s\n", wp->name, buf + 4);
			continue;
		}

		/* for everything else we need to actually parse */
		if (buf2kvvec_prealloc(&kvv, buf, size, '=', '\0', KVVEC_ASSIGN) <= 0) {
			logit(NSLOG_RUNTIME_ERROR, TRUE,
				  "wproc: Failed to parse key/value vector from worker response with len %lu. First kv=%s",
				  size, buf ? buf : "(NULL)");
			continue;
		}

		memset(&wpres, 0, sizeof(wpres));
		wpres.job_id = -1;
		wpres.type = -1;
		wpres.response = &kvv;
		parse_worker_result(&wpres, &kvv);

		job = get_job(wp, wpres.job_id);
		if (!job) {
			logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: Job with id '%d' doesn't exist on %s.\n",
				  wpres.job_id, wp->name);
			continue;
		}
		if (wpres.type != job->type) {
			logit(NSLOG_RUNTIME_WARNING, TRUE, "wproc: %s claims job %d is type %d, but we think it's type %d\n",
				  wp->name, job->id, wpres.type, job->type);
			break;
		}
		oj = (wproc_object_job *)job->arg;

		/*
		 * ETIME ("Timer expired") doesn't really happen
		 * on any modern systems, so we reuse it to mean
		 * "program timed out"
		 */
		if (wpres.error_code == ETIME) {
			wpres.early_timeout = TRUE;
		}
		if (wpres.early_timeout) {
			asprintf(&error_reason, "timed out after %.2fs", tv_delta_f(&wpres.start, &wpres.stop));
		}
		else if (WIFSIGNALED(wpres.wait_status)) {
			asprintf(&error_reason, "died by signal %d%s after %.2f seconds",
			         WTERMSIG(wpres.wait_status),
			         WCOREDUMP(wpres.wait_status) ? " (core dumped)" : "",
			         tv_delta_f(&wpres.start, &wpres.stop));
		}
		else if (job->type != WPJOB_CHECK && WEXITSTATUS(wpres.wait_status) != 0) {
			asprintf(&error_reason, "is a non-check helper but exited with return code %d",
			         WEXITSTATUS(wpres.wait_status));
		}
		if (error_reason) {
			logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc: %s job %d from worker %s %s",
			      wpjob_type_name(job->type), job->id, wp->name, error_reason);
			logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   command: %s\n", job->command);
			if (job->type != WPJOB_CHECK && oj) {
				logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   host=%s; service=%s; contact=%s\n",
				      oj->host_name ? oj->host_name : "(none)",
				      oj->service_description ? oj->service_description : "(none)",
				      oj->contact_name ? oj->contact_name : "(none)");
			} else if (oj) {
				struct check_result *cr = (struct check_result *)job->arg;
				logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   host=%s; service=%s;\n",
				      cr->host_name, cr->service_description);
			}
			logit(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   early_timeout=%d; exited_ok=%d; wait_status=%d; error_code=%d;\n",
			      wpres.early_timeout, wpres.exited_ok, wpres.wait_status, wpres.error_code);
			wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   stderr", wpres.outerr);
			wproc_logdump_buffer(NSLOG_RUNTIME_ERROR, TRUE, "wproc:   stdout", wpres.outstd);
		}
		my_free(error_reason);

		switch (job->type) {
		case WPJOB_CHECK:
			ret = handle_worker_check(&wpres, wp, job);
			break;
		case WPJOB_NOTIFY:
			if (wpres.early_timeout) {
				if (oj->service_description) {
					logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of service '%s' on host '%s' by command '%s' timed out after %.2f seconds\n",
						  oj->contact_name, oj->service_description,
						  oj->host_name, job->command,
						  tv2float(&wpres.runtime));
				} else {
					logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Notifying contact '%s' of host '%s' by command '%s' timed out after %.2f seconds\n",
						  oj->contact_name, oj->host_name,
						  job->command, tv2float(&wpres.runtime));
				}
			}
			break;
		case WPJOB_OCSP:
			if (wpres.early_timeout) {
				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCSP command '%s' for service '%s' on host '%s' timed out after %.2f seconds\n",
					  job->command, oj->service_description, oj->host_name,
					  tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_OCHP:
			if (wpres.early_timeout) {
				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: OCHP command '%s' for host '%s' timed out after %.2f seconds\n",
					  job->command, oj->host_name, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_GLOBAL_SVC_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Global service event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_SVC_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Service event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_GLOBAL_HOST_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Global host event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;
		case WPJOB_HOST_EVTHANDLER:
			if (wpres.early_timeout) {
				logit(NSLOG_EVENT_HANDLER | NSLOG_RUNTIME_WARNING, TRUE,
					  "Warning: Host event handler command '%s' timed out after %.2f seconds\n",
					  job->command, tv2float(&wpres.runtime));
			}
			break;

		case WPJOB_CALLBACK:
			run_job_callback(job, &wpres, 0);
			break;

		default:
			logit(NSLOG_RUNTIME_WARNING, TRUE, "Worker %d: Unknown jobtype: %d\n", wp->pid, job->type);
			break;
		}
		destroy_job(job);
	}

	return 0;
}
Esempio n. 2
0
static int finish_job(child_process *cp, int reason)
{
	static struct kvvec resp = KVVEC_INITIALIZER;
	struct rusage *ru = &cp->rusage;
	int i, ret;

	/* how many key/value pairs do we need? */
	if (kvvec_init(&resp, 12 + cp->request->kv_pairs) == NULL) {
		/* what the hell do we do now? */
		exit_worker();
	}

	gettimeofday(&cp->stop, NULL);

	if (running_jobs != squeue_size(sq)) {
		wlog("running_jobs(%d) != squeue_size(sq) (%d)\n",
			 running_jobs, squeue_size(sq));
		wlog("started: %d; running: %d; finished: %d\n",
			 started, running_jobs, started - running_jobs);
	}

	/*
	 * we must remove the job's timeout ticker,
	 * or we'll end up accessing an already free()'d
	 * pointer, or the pointer to a different child.
	 */
	squeue_remove(sq, cp->sq_event);

	/* get rid of still open filedescriptors */
	if (cp->outstd.fd != -1)
		iobroker_close(iobs, cp->outstd.fd);
	if (cp->outerr.fd != -1)
		iobroker_close(iobs, cp->outerr.fd);

	cp->runtime = tv_delta_f(&cp->start, &cp->stop);

	/*
	 * Now build the return message.
	 * First comes the request, minus environment variables
	 */
	for (i = 0; i < cp->request->kv_pairs; i++) {
		struct key_value *kv = &cp->request->kv[i];
		/* skip environment macros */
		if (kv->key_len == 3 && !strcmp(kv->key, "env")) {
			continue;
		}
		kvvec_addkv_wlen(&resp, kv->key, kv->key_len, kv->value, kv->value_len);
	}
	kvvec_addkv(&resp, "wait_status", (char *)mkstr("%d", cp->ret));
	kvvec_addkv_wlen(&resp, "outstd", 6, cp->outstd.buf, cp->outstd.len);
	kvvec_addkv_wlen(&resp, "outerr", 6, cp->outerr.buf, cp->outerr.len);
	kvvec_add_tv(&resp, "start", cp->start);
	kvvec_add_tv(&resp, "stop", cp->stop);
	kvvec_addkv(&resp, "runtime", (char *)mkstr("%f", cp->runtime));
	if (!reason) {
		/* child exited nicely */
		kvvec_addkv(&resp, "exited_ok", "1");
		kvvec_add_tv(&resp, "ru_utime", ru->ru_utime);
		kvvec_add_tv(&resp, "ru_stime", ru->ru_stime);
		kvvec_add_long(&resp, "ru_minflt", ru->ru_minflt);
		kvvec_add_long(&resp, "ru_majflt", ru->ru_majflt);
		kvvec_add_long(&resp, "ru_nswap", ru->ru_nswap);
		kvvec_add_long(&resp, "ru_inblock", ru->ru_inblock);
		kvvec_add_long(&resp, "ru_oublock", ru->ru_oublock);
		kvvec_add_long(&resp, "ru_nsignals", ru->ru_nsignals);
	} else {
		/* some error happened */
		kvvec_addkv(&resp, "exited_ok", "0");
		kvvec_addkv(&resp, "error_code", (char *)mkstr("%d", reason));
	}
	ret = send_kvvec(master_sd, &resp);
	if (ret < 0 && errno == EPIPE)
		exit_worker();

	running_jobs--;
	if (cp->outstd.buf) {
		free(cp->outstd.buf);
		cp->outstd.buf = NULL;
	}
	if (cp->outerr.buf) {
		free(cp->outerr.buf);
		cp->outerr.buf = NULL;
	}

	kvvec_destroy(cp->request, KVVEC_FREE_ALL);
	free(cp->cmd);
	free(cp);

	return 0;
}
Esempio n. 3
0
/* handles a timed event */
int handle_timed_event(timed_event *event)
{
	host *temp_host = NULL;
	service *temp_service = NULL;
	void (*userfunc)(void *);
	struct timeval tv;
	const struct timeval *event_runtime;
	double latency;


	log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_timed_event() start\n");

#ifdef USE_EVENT_BROKER
	/* send event data to broker */
	broker_timed_event(NEBTYPE_TIMEDEVENT_EXECUTE, NEBFLAG_NONE, NEBATTR_NONE, event, NULL);
#endif

	log_debug_info(DEBUGL_EVENTS, 0, "** Timed Event ** Type: EVENT_%s, Run Time: %s", EVENT_TYPE_STR(event->event_type), ctime(&event->run_time));

	/* get event latency */
	gettimeofday(&tv, NULL);
	event_runtime = squeue_event_runtime(event->sq_event);
	latency = (double)(tv_delta_f(event_runtime, &tv));
	if (latency < 0.0) /* events may run up to 0.1 seconds early */
		latency = 0.0;

	/* how should we handle the event? */
	switch (event->event_type) {

	case EVENT_SERVICE_CHECK:

		temp_service = (service *)event->event_data;

		log_debug_info(DEBUGL_EVENTS, 0, "** Service Check Event ==> Host: '%s', Service: '%s', Options: %d, Latency: %f sec\n", temp_service->host_name, temp_service->description, event->event_options, latency);

		/* run the service check */
		run_scheduled_service_check(temp_service, event->event_options, latency);
		break;

	case EVENT_HOST_CHECK:

		temp_host = (host *)event->event_data;

		log_debug_info(DEBUGL_EVENTS, 0, "** Host Check Event ==> Host: '%s', Options: %d, Latency: %f sec\n", temp_host->name, event->event_options, latency);

		/* run the host check */
		run_scheduled_host_check(temp_host, event->event_options, latency);
		break;

	case EVENT_PROGRAM_SHUTDOWN:

		log_debug_info(DEBUGL_EVENTS, 0, "** Program Shutdown Event. Latency: %.3fs\n", latency);

		/* set the shutdown flag */
		sigshutdown = TRUE;

		/* log the shutdown */
		logit(NSLOG_PROCESS_INFO, TRUE, "PROGRAM_SHUTDOWN event encountered, shutting down...\n");
		break;

	case EVENT_PROGRAM_RESTART:

		log_debug_info(DEBUGL_EVENTS, 0, "** Program Restart Event. Latency: %.3fs\n", latency);

		/* set the restart flag */
		sigrestart = TRUE;

		/* log the restart */
		logit(NSLOG_PROCESS_INFO, TRUE, "PROGRAM_RESTART event encountered, restarting...\n");
		break;

	case EVENT_CHECK_REAPER:

		log_debug_info(DEBUGL_EVENTS, 0, "** Check Result Reaper. Latency: %.3fs\n", latency);

		/* reap host and service check results */
		reap_check_results();
		break;

	case EVENT_ORPHAN_CHECK:

		log_debug_info(DEBUGL_EVENTS, 0, "** Orphaned Host and Service Check Event. Latency: %.3fs\n", latency);

		/* check for orphaned hosts and services */
		if (check_orphaned_hosts == TRUE)
			check_for_orphaned_hosts();
		if (check_orphaned_services == TRUE)
			check_for_orphaned_services();
		break;

	case EVENT_RETENTION_SAVE:

		log_debug_info(DEBUGL_EVENTS, 0, "** Retention Data Save Event. Latency: %.3fs\n", latency);

		/* save state retention data */
		save_state_information(TRUE);
		break;

	case EVENT_STATUS_SAVE:

		log_debug_info(DEBUGL_EVENTS, 0, "** Status Data Save Event. Latency: %.3fs\n", latency);

		/* save all status data (program, host, and service) */
		update_all_status_data();
		break;

	case EVENT_SCHEDULED_DOWNTIME:

		log_debug_info(DEBUGL_EVENTS, 0, "** Scheduled Downtime Event. Latency: %.3fs\n", latency);

		/* process scheduled downtime info */
		if (event->event_data) {
			handle_scheduled_downtime_by_id(*(unsigned long *)event->event_data);
			free(event->event_data);
			event->event_data = NULL;
		}
		break;

	case EVENT_SFRESHNESS_CHECK:

		log_debug_info(DEBUGL_EVENTS, 0, "** Service Result Freshness Check Event. Latency: %.3fs\n", latency);

		/* check service result freshness */
		check_service_result_freshness();
		break;

	case EVENT_HFRESHNESS_CHECK:

		log_debug_info(DEBUGL_EVENTS, 0, "** Host Result Freshness Check Event. Latency: %.3fs\n", latency);

		/* check host result freshness */
		check_host_result_freshness();
		break;

	case EVENT_EXPIRE_DOWNTIME:

		log_debug_info(DEBUGL_EVENTS, 0, "** Expire Downtime Event. Latency: %.3fs\n", latency);

		/* check for expired scheduled downtime entries */
		check_for_expired_downtime();
		break;

	case EVENT_EXPIRE_COMMENT:

		log_debug_info(DEBUGL_EVENTS, 0, "** Expire Comment Event. Latency: %.3fs\n", latency);

		/* check for expired comment */
		check_for_expired_comment((unsigned long)event->event_data);
		break;

	case EVENT_CHECK_PROGRAM_UPDATE:
		/* this doesn't do anything anymore */
		break;

	case EVENT_USER_FUNCTION:

		log_debug_info(DEBUGL_EVENTS, 0, "** User Function Event. Latency: %.3fs\n", latency);

		/* run a user-defined function */
		if (event->event_data != NULL) {
			userfunc = event->event_data;
			(*userfunc)(event->event_args);
		}
		break;

	default:

		break;
	}

	log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_timed_event() end\n");

	return OK;
}
Esempio n. 4
0
int finish_job(child_process *cp, int reason)
{
	static struct kvvec resp = KVVEC_INITIALIZER;
	struct rusage *ru = &cp->ei->rusage;
	int i, ret;

	/* get rid of still open filedescriptors */
	if (cp->outstd.fd != -1) {
		gather_output(cp, &cp->outstd, 1);
		iobroker_close(iobs, cp->outstd.fd);
	}
	if (cp->outerr.fd != -1) {
		gather_output(cp, &cp->outerr, 1);
		iobroker_close(iobs, cp->outerr.fd);
	}

	/* Make sure network-supplied data doesn't contain nul bytes */
	strip_nul_bytes(cp->outstd);
	strip_nul_bytes(cp->outerr);

	/* how many key/value pairs do we need? */
	if (kvvec_init(&resp, 12 + cp->request->kv_pairs) == NULL) {
		/* what the hell do we do now? */
		exit_worker(1, "Failed to init response key/value vector");
	}

	gettimeofday(&cp->ei->stop, NULL);

	if (running_jobs != squeue_size(sq)) {
		wlog("running_jobs(%d) != squeue_size(sq) (%d)\n",
			 running_jobs, squeue_size(sq));
		wlog("started: %d; running: %d; finished: %d\n",
			 started, running_jobs, started - running_jobs);
	}

	cp->ei->runtime = tv_delta_f(&cp->ei->start, &cp->ei->stop);

	/*
	 * Now build the return message.
	 * First comes the request, minus environment variables
	 */
	for (i = 0; i < cp->request->kv_pairs; i++) {
		struct key_value *kv = &cp->request->kv[i];
		/* skip environment macros */
		if (kv->key_len == 3 && !strcmp(kv->key, "env")) {
			continue;
		}
		kvvec_addkv_wlen(&resp, kv->key, kv->key_len, kv->value, kv->value_len);
	}
	kvvec_addkv(&resp, "wait_status", mkstr("%d", cp->ret));
	kvvec_add_tv(&resp, "start", cp->ei->start);
	kvvec_add_tv(&resp, "stop", cp->ei->stop);
	kvvec_addkv(&resp, "runtime", mkstr("%f", cp->ei->runtime));
	if (!reason) {
		/* child exited nicely (or with a signal, so check wait_status) */
		kvvec_addkv(&resp, "exited_ok", "1");
		kvvec_add_tv(&resp, "ru_utime", ru->ru_utime);
		kvvec_add_tv(&resp, "ru_stime", ru->ru_stime);
		kvvec_add_long(&resp, "ru_minflt", ru->ru_minflt);
		kvvec_add_long(&resp, "ru_majflt", ru->ru_majflt);
		kvvec_add_long(&resp, "ru_inblock", ru->ru_inblock);
		kvvec_add_long(&resp, "ru_oublock", ru->ru_oublock);
	} else {
		/* some error happened */
		kvvec_addkv(&resp, "exited_ok", "0");
		kvvec_addkv(&resp, "error_code", mkstr("%d", reason));
	}
	kvvec_addkv_wlen(&resp, "outerr", 6, cp->outerr.buf, cp->outerr.len);
	kvvec_addkv_wlen(&resp, "outstd", 6, cp->outstd.buf, cp->outstd.len);
	ret = worker_send_kvvec(master_sd, &resp);
	if (ret < 0 && errno == EPIPE)
		exit_worker(1, "Failed to send kvvec struct to master");

	return 0;
}