Esempio n. 1
0
int main(int argc, char *argv[])
{
	char *msg;
	int seq;
	int argi;
	int alertcolors, alertinterval;
	char *configfn = NULL;
	char *checkfn = NULL;
	int checkpointinterval = 900;
	char acklogfn[PATH_MAX];
	FILE *acklogfd = NULL;
	char notiflogfn[PATH_MAX];
	FILE *notiflogfd = NULL;
	char *tracefn = NULL;
	struct sigaction sa;
	int configchanged;
	time_t lastxmit = 0;

	MEMDEFINE(acklogfn);
	MEMDEFINE(notiflogfn);

	libxymon_init(argv[0]);

	/* Dont save the error buffer */
	save_errbuf = 0;

	/* Load alert config */
	alertcolors = colorset(xgetenv("ALERTCOLORS"), ((1 << COL_GREEN) | (1 << COL_BLUE)));
	alertinterval = 60*atoi(xgetenv("ALERTREPEAT"));

	/* Create our loookup-trees */
	hostnames = xtreeNew(strcasecmp);
	testnames = xtreeNew(strcasecmp);
	locations = xtreeNew(strcasecmp);

	for (argi=1; (argi < argc); argi++) {
		if (argnmatch(argv[argi], "--config=")) {
			configfn = strdup(strchr(argv[argi], '=')+1);
		}
		else if (argnmatch(argv[argi], "--checkpoint-file=")) {
			checkfn = strdup(strchr(argv[argi], '=')+1);
		}
		else if (argnmatch(argv[argi], "--checkpoint-interval=")) {
			char *p = strchr(argv[argi], '=') + 1;
			checkpointinterval = atoi(p);
		}
		else if (argnmatch(argv[argi], "--dump-config")) {
			load_alertconfig(configfn, alertcolors, alertinterval);
			dump_alertconfig(1);
			return 0;
		}
		else if (argnmatch(argv[argi], "--cfid")) {
			include_configid = 1;
		}
		else if (argnmatch(argv[argi], "--test")) {
			char *testhost = NULL, *testservice = NULL, *testpage = NULL, 
			     *testcolor = "red", *testgroups = NULL;
			void *hinfo;
			int testdur = 0;
			FILE *logfd = NULL;
			activealerts_t *awalk = NULL;
			int paramno = 0;

			argi++; if (argi < argc) testhost = argv[argi];
			argi++; if (argi < argc) testservice = argv[argi];
			argi++; 
			while (argi < argc) {
				if (strncasecmp(argv[argi], "--duration=", 11) == 0) {
					testdur = durationvalue(strchr(argv[argi], '=')+1);
				}
				else if (strncasecmp(argv[argi], "--color=", 8) == 0) {
					testcolor = strchr(argv[argi], '=')+1;
				}
				else if (strncasecmp(argv[argi], "--group=", 8) == 0) {
					testgroups = strchr(argv[argi], '=')+1;
				}
				else if (strncasecmp(argv[argi], "--time=", 7) == 0) {
					fakestarttime = (time_t)atoi(strchr(argv[argi], '=')+1);
				}
				else {
					paramno++;
					if (paramno == 1) testdur = atoi(argv[argi]);
					else if (paramno == 2) testcolor = argv[argi];
					else if (paramno == 3) fakestarttime = (time_t) atoi(argv[argi]);
				}

				argi++;
			}

			if ((testhost == NULL) || (testservice == NULL)) {
				printf("Usage: xymond_alert --test HOST SERVICE [options]\n");
				printf("Possible options:\n\t[--duration=MINUTES]\n\t[--color=COLOR]\n\t[--group=GROUPNAME]\n\t[--time=TIMESPEC]\n");

				return 1;
			}

			load_hostnames(xgetenv("HOSTSCFG"), NULL, get_fqdn());
			hinfo = hostinfo(testhost);
			if (hinfo) {
				testpage = strdup(xmh_item(hinfo, XMH_ALLPAGEPATHS));
			}
			else {
				errprintf("Host not found in hosts.cfg - assuming it is on the top page\n");
				testpage = "";
			}

			awalk = (activealerts_t *)calloc(1, sizeof(activealerts_t));
			awalk->hostname = find_name(hostnames, testhost);
			awalk->testname = find_name(testnames, testservice);
			awalk->location = find_name(locations, testpage);
			awalk->ip = strdup("127.0.0.1");
			awalk->color = awalk->maxcolor = parse_color(testcolor);
			awalk->pagemessage = "Test of the alert configuration";
			awalk->eventstart = getcurrenttime(NULL) - testdur*60;
			awalk->groups = (testgroups ? strdup(testgroups) : NULL);
			awalk->state = A_PAGING;
			awalk->cookie = 12345;
			awalk->next = NULL;

			logfd = fopen("/dev/null", "w");
			starttrace(NULL);
			testonly = 1;

			load_alertconfig(configfn, alertcolors, alertinterval);
			load_holidays(0);
			send_alert(awalk, logfd);
			return 0;
		}
		else if (argnmatch(argv[argi], "--trace=")) {
			tracefn = strdup(strchr(argv[argi], '=')+1);
			starttrace(tracefn);
		}
		else if (net_worker_option(argv[argi])) {
			/* Handled in the subroutine */
		}
		else if (standardoption(argv[argi])) {
			if (showhelp) return 0;
		}
		else {
			errprintf("Unknown option '%s'\n", argv[argi]);
		}
	}

	/* Do the network stuff if needed */
	net_worker_run(ST_ALERT, LOC_SINGLESERVER, NULL);

	if (checkfn) {
		load_checkpoint(checkfn);
		nextcheckpoint = gettimer() + checkpointinterval;
		dbgprintf("Next checkpoint at %d, interval %d\n", (int) nextcheckpoint, checkpointinterval);
	}

	setup_signalhandler("xymond_alert");
	/* Need to handle these ourselves, so we can shutdown and save state-info */
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = sig_handler;
	sigaction(SIGPIPE, &sa, NULL);
	sigaction(SIGTERM, &sa, NULL);
	sigaction(SIGINT,  &sa, NULL);
	sigaction(SIGCHLD, &sa, NULL);
	sigaction(SIGUSR1, &sa, NULL);
	sigaction(SIGHUP,  &sa, NULL);

	if (xgetenv("XYMONSERVERLOGS")) {
		sprintf(acklogfn, "%s/acknowledge.log", xgetenv("XYMONSERVERLOGS"));
		acklogfd = fopen(acklogfn, "a");
		sprintf(notiflogfn, "%s/notifications.log", xgetenv("XYMONSERVERLOGS"));
		notiflogfd = fopen(notiflogfn, "a");
	}

	/*
	 * The general idea here is that this loop handles receiving of alert-
	 * and ack-messages from the master daemon, and maintains a list of 
	 * host+test combinations that may have alerts going out.
	 *
	 * This module does not deal with any specific alert-configuration, 
	 * it just picks up the alert messages, maintains the list of 
	 * known tests that are in some sort of critical condition, and
	 * periodically pushes alerts to the do_alert.c module for handling.
	 *
	 * The only modification of alerts that happen here is the handling
	 * of when the next alert is due. It calls into the next_alert() 
	 * routine to learn when an alert should be repeated, and also 
	 * deals with Acknowledgments that stop alerts from going out for
	 * a period of time.
	 */
	while (running) {
		char *eoln, *restofmsg;
		char *metadata[20];
		char *p;
		int metacount;
		char *hostname = NULL, *testname = NULL;
		struct timespec timeout;
		time_t now, nowtimer;
		int anytogo;
		activealerts_t *awalk;
		int childstat;

		nowtimer = gettimer();
		if (checkfn && (nowtimer > nextcheckpoint)) {
			dbgprintf("Saving checkpoint\n");
			nextcheckpoint = nowtimer + checkpointinterval;
			save_checkpoint(checkfn);

			if (acklogfd) acklogfd = freopen(acklogfn, "a", acklogfd);
			if (notiflogfd) notiflogfd = freopen(notiflogfn, "a", notiflogfd);
		}

		timeout.tv_sec = 60; timeout.tv_nsec = 0;
		msg = get_xymond_message(C_PAGE, "xymond_alert", &seq, &timeout);
		if (msg == NULL) {
			running = 0;
			continue;
		}

		/* See what time it is - must happen AFTER the timeout */
		now = getcurrenttime(NULL);

		/* Split the message in the first line (with meta-data), and the rest */
 		eoln = strchr(msg, '\n');
		if (eoln) {
			*eoln = '\0';
			restofmsg = eoln+1;
		}
		else {
			restofmsg = "";
		}

		/* 
		 * Now parse the meta-data into elements.
		 * We use our own "gettok()" routine which works
		 * like strtok(), but can handle empty elements.
		 */
		metacount = 0; 
		memset(&metadata, 0, sizeof(metadata));
		p = gettok(msg, "|");
		while (p && (metacount < 19)) {
			metadata[metacount] = p;
			metacount++;
			p = gettok(NULL, "|");
		}
		metadata[metacount] = NULL;

		if (metacount > 3) hostname = metadata[3];
		if (metacount > 4) testname = metadata[4];

		if ((metacount > 10) && (strncmp(metadata[0], "@@page", 6) == 0)) {
			/* @@page|timestamp|sender|hostname|testname|hostip|expiretime|color|prevcolor|changetime|location|cookie|osname|classname|grouplist|modifiers */

			int newcolor, newalertstatus, oldalertstatus;

			dbgprintf("Got page message from %s:%s\n", hostname, testname);
			traceprintf("@@page %s:%s:%s=%s\n", hostname, testname, metadata[10], metadata[7]);

			awalk = find_active(hostname, testname);
			if (awalk == NULL) {
				char *hwalk = find_name(hostnames, hostname);
				char *twalk = find_name(testnames, testname);
				char *pwalk = find_name(locations, metadata[10]);

				awalk = (activealerts_t *)calloc(1, sizeof(activealerts_t));
				awalk->hostname = hwalk;
				awalk->testname = twalk;
				awalk->location = pwalk;
				awalk->cookie = -1;
				awalk->state = A_DEAD;
				/*
				 * Use changetime here, if we restart the alert module then
				 * this gets the duration values more right than using "now".
				 * Also, define this only when a new alert arrives - we should
				 * NOT clear this when a status goes yellow->red, or if it
				 * flaps between yellow and red.
				 */
				awalk->eventstart = atoi(metadata[9]);
				add_active(awalk->hostname, awalk);
				traceprintf("New record\n");
			}

			newcolor = parse_color(metadata[7]);
			oldalertstatus = ((alertcolors & (1 << awalk->color)) != 0);
			newalertstatus = ((alertcolors & (1 << newcolor)) != 0);

			traceprintf("state %d->%d\n", oldalertstatus, newalertstatus);

			if (newalertstatus) {
				/* It's in an alert state. */
				awalk->color = newcolor;
				awalk->state = A_PAGING;

				if (newcolor > awalk->maxcolor) {
					if (awalk->maxcolor != 0) {
						/*
						 * Severity has increased (yellow -> red).
						 * Clear the repeat-interval, and set maxcolor to
						 * the new color. If it drops to yellow again,
						 * maxcolor stays at red, so a test that flaps
						 * between yellow and red will only alert on red
						 * the first time, and then follow the repeat
						 * interval.
						 */
						dbgprintf("Severity increased, cleared repeat interval: %s/%s %s->%s\n",
							awalk->hostname, awalk->testname,
							colorname(awalk->maxcolor), colorname(newcolor));
						clear_interval(awalk);
					}

					awalk->maxcolor = newcolor;
				}
			}
			else {
				/* 
				 * Send one "recovered" message out now, then go to A_DEAD.
				 * Dont update the color here - we want recoveries to go out 
				 * only if the alert color triggered an alert
				 */
				awalk->state = (newcolor == COL_BLUE) ? A_DISABLED : A_RECOVERED;
			}

			if (oldalertstatus != newalertstatus) {
				dbgprintf("Alert status changed from %d to %d\n", oldalertstatus, newalertstatus);
				clear_interval(awalk);
			}

			if (awalk->ip) xfree(awalk->ip);
			awalk->ip = strdup(metadata[5]);
			awalk->cookie = atoi(metadata[11]);
			if (awalk->osname) xfree(awalk->osname);
			awalk->osname    = (metadata[12] ? strdup(metadata[12]) : NULL);
			if (awalk->classname) xfree(awalk->classname);
			awalk->classname = (metadata[13] ? strdup(metadata[13]) : NULL);
			if (awalk->groups) xfree(awalk->groups);
			awalk->groups    = (metadata[14] ? strdup(metadata[14]) : NULL);
			if (awalk->pagemessage) xfree(awalk->pagemessage);
			if (metadata[15]) {
				/* Modifiers are more interesting than the message itself */
				awalk->pagemessage = (char *)malloc(strlen(awalk->hostname) + strlen(awalk->testname) + strlen(colorname(awalk->color)) + strlen(metadata[15]) + strlen(restofmsg) + 10);
				sprintf(awalk->pagemessage, "%s:%s %s\n%s\n%s",
					awalk->hostname, awalk->testname, colorname(awalk->color), metadata[15], restofmsg);
			}
			else {
				awalk->pagemessage = strdup(restofmsg);
			}
		}
		else if ((metacount > 5) && (strncmp(metadata[0], "@@ack", 5) == 0)) {
 			/* @@ack|timestamp|sender|hostname|testname|hostip|expiretime */

			/*
			 * An ack is handled simply by setting the next
			 * alert-time to when the ack expires.
			 */
			time_t nextalert = atoi(metadata[6]);

			dbgprintf("Got ack message from %s:%s\n", hostname, testname);
			traceprintf("@@ack: %s:%s now=%d, ackeduntil %d\n",
				     hostname, testname, (int)now, (int)nextalert);

			awalk = find_active(hostname, testname);

			if (acklogfd) {
				int cookie = (awalk ? awalk->cookie : -1);
				int color  = (awalk ? awalk->color : 0);

				fprintf(acklogfd, "%d\t%d\t%d\t%d\t%s\t%s.%s\t%s\t%s\n",
					(int)now, cookie, 
					(int)((nextalert - now) / 60), cookie,
					"np_filename_not_used", 
					hostname, testname, 
					colorname(color),
					nlencode(restofmsg));
				fflush(acklogfd);
			}

			if (awalk && (awalk->state == A_PAGING)) {
				traceprintf("Record updated\n");
				awalk->state = A_ACKED;
				awalk->nextalerttime = nextalert;
				if (awalk->ackmessage) xfree(awalk->ackmessage);
				awalk->ackmessage = strdup(restofmsg);
			}
			else {
				traceprintf("No record\n");
			}
		}
		else if ((metacount > 4) && (strncmp(metadata[0], "@@notify", 5) == 0)) {
			/* @@notify|timestamp|sender|hostname|testname|pagepath */

			char *hwalk = find_name(hostnames, hostname);
			char *twalk = find_name(testnames, testname);
			char *pwalk = find_name(locations, (metadata[5] ? metadata[5] : ""));

			awalk = (activealerts_t *)calloc(1, sizeof(activealerts_t));
			awalk->hostname = hwalk;
			awalk->testname = twalk;
			awalk->location = pwalk;
			awalk->cookie = -1;
			awalk->pagemessage = strdup(restofmsg);
			awalk->eventstart = getcurrenttime(NULL);
			awalk->state = A_NOTIFY;
			add_active(awalk->hostname, awalk);
		}
		else if ((metacount > 3) && 
			 ((strncmp(metadata[0], "@@drophost", 10) == 0) || (strncmp(metadata[0], "@@dropstate", 11) == 0))) {
			/* @@drophost|timestamp|sender|hostname */
			/* @@dropstate|timestamp|sender|hostname */
			xtreePos_t handle;

			handle = xtreeFind(hostnames, hostname);
			if (handle != xtreeEnd(hostnames)) {
				alertanchor_t *anchor = (alertanchor_t *)xtreeData(hostnames, handle);
				for (awalk = anchor->head; (awalk); awalk = awalk->next) awalk->state = A_DEAD;
			}
		}
		else if ((metacount > 4) && (strncmp(metadata[0], "@@droptest", 10) == 0)) {
			/* @@droptest|timestamp|sender|hostname|testname */

			awalk = find_active(hostname, testname);
			if (awalk) awalk->state = A_DEAD;
		}
		else if ((metacount > 4) && (strncmp(metadata[0], "@@renamehost", 12) == 0)) {
			/* @@renamehost|timestamp|sender|hostname|newhostname */

			/* 
			 * We handle rename's simply by dropping the alert. If there is still an
			 * active alert for the host, it will have to be dealt with when the next
			 * status update arrives.
			 */
			xtreePos_t handle;

			handle = xtreeFind(hostnames, hostname);
			if (handle != xtreeEnd(hostnames)) {
				alertanchor_t *anchor = (alertanchor_t *)xtreeData(hostnames, handle);
				for (awalk = anchor->head; (awalk); awalk = awalk->next) awalk->state = A_DEAD;
			}
		}
		else if ((metacount > 5) && (strncmp(metadata[0], "@@renametest", 12) == 0)) {
			/* @@renametest|timestamp|sender|hostname|oldtestname|newtestname */

			/* 
			 * We handle rename's simply by dropping the alert. If there is still an
			 * active alert for the host, it will have to be dealt with when the next
			 * status update arrives.
			 */
			awalk = find_active(hostname, testname);
			if (awalk) awalk->state = A_DEAD;
		}
		else if (strncmp(metadata[0], "@@shutdown", 10) == 0) {
			running = 0;
			errprintf("Got a shutdown message\n");
			continue;
		}
		else if (strncmp(metadata[0], "@@logrotate", 11) == 0) {
			char *fn = xgetenv("XYMONCHANNEL_LOGFILENAME");
			if (fn && strlen(fn)) {
				reopen_file(fn, "a", stdout);
				reopen_file(fn, "a", stderr);

				if (tracefn) {
					stoptrace();
					starttrace(tracefn);
				}
			}
			continue;
		}
		else if (strncmp(metadata[0], "@@reload", 8) == 0) {
			/* Nothing ... right now */
		}
		else if (strncmp(metadata[0], "@@idle", 6) == 0) {
			/* Timeout */
		}

		/*
		 * When a burst of alerts happen, we get lots of alert messages
		 * coming in quickly. So lets handle them in bunches and only 
		 * do the full alert handling once every 10 secs - that lets us
		 * combine a bunch of alerts into one transmission process.
		 */
		if (nowtimer < (lastxmit+10)) continue;
		lastxmit = nowtimer;

		/* 
		 * Loop through the activealerts list and see if anything is pending.
		 * This is an optimization, we could just as well just fork off the
		 * notification child and let it handle all of it. But there is no
		 * reason to fork a child process unless it is going to do something.
		 */
		configchanged = load_alertconfig(configfn, alertcolors, alertinterval);
		configchanged += load_holidays(0);
		anytogo = 0;
		for (awalk = alistBegin(); (awalk); awalk = alistNext()) {
			int anymatch = 0;

			switch (awalk->state) {
			  case A_NORECIP:
				if (!configchanged) break;

				/* The configuration has changed - switch NORECIP -> PAGING */
				awalk->state = A_PAGING;
				clear_interval(awalk);
				/* Fall through */

			  case A_PAGING:
				if (have_recipient(awalk, &anymatch)) {
					if (awalk->nextalerttime <= now) anytogo++;
				}
				else {
					if (!anymatch) {
						awalk->state = A_NORECIP;
						cleanup_alert(awalk);
					}
				}
				break;

			  case A_ACKED:
				if (awalk->nextalerttime <= now) {
					/* An ack has expired, so drop the ack message and switch to A_PAGING */
					anytogo++;
					if (awalk->ackmessage) xfree(awalk->ackmessage);
					awalk->state = A_PAGING;
				}
				break;

			  case A_RECOVERED:
			  case A_DISABLED:
			  case A_NOTIFY:
				anytogo++;
				break;

			  case A_DEAD:
				break;
			}
		}
		dbgprintf("%d alerts to go\n", anytogo);

		if (anytogo) {
			pid_t childpid;

			childpid = fork();
			if (childpid == 0) {
				/* The child */
				start_alerts();
				for (awalk = alistBegin(); (awalk); awalk = alistNext()) {
					switch (awalk->state) {
					  case A_PAGING:
						if (awalk->nextalerttime <= now) {
							send_alert(awalk, notiflogfd);
						}
						break;

					  case A_ACKED:
						/* Cannot be A_ACKED unless the ack is still valid, so no alert. */
						break;

					  case A_RECOVERED:
					  case A_DISABLED:
					  case A_NOTIFY:
						send_alert(awalk, notiflogfd);
						break;

					  case A_NORECIP:
					  case A_DEAD:
						break;
					}
				}
				finish_alerts();

				/* Child does not continue */
				exit(0);
			}
			else if (childpid < 0) {
				errprintf("Fork failed, cannot send alerts: %s\n", strerror(errno));
			}
		}

		/* Update the state flag and the next-alert timestamp */
		for (awalk = alistBegin(); (awalk); awalk = alistNext()) {
			switch (awalk->state) {
			  case A_PAGING:
				if (awalk->nextalerttime <= now) awalk->nextalerttime = next_alert(awalk);
				break;

			  case A_NORECIP:
				break;

			  case A_ACKED:
				/* Still cannot get here except if ack is still valid */
				break;

			  case A_RECOVERED:
			  case A_DISABLED:
			  case A_NOTIFY:
				awalk->state = A_DEAD;
				/* Fall through */

			  case A_DEAD:
				cleanup_alert(awalk); 
				break;
			}
		}

		clean_all_active();

		/* Pickup any finished child processes to avoid zombies */
		while (wait3(&childstat, WNOHANG, NULL) > 0) ;
	}

	if (checkfn) save_checkpoint(checkfn);
	if (acklogfd) fclose(acklogfd);
	if (notiflogfd) fclose(notiflogfd);
	stoptrace();

	MEMUNDEFINE(notiflogfn);
	MEMUNDEFINE(acklogfn);

	if (termsig >= 0) {
		errprintf("Terminated by signal %d\n", termsig);
	}

	return 0;
}
Esempio n. 2
0
int main(int argc, char *argv[])
{
	char *msg;
	int argi;
	struct sigaction sa;
	char *exthandler = NULL;
	char *extids = NULL;
	char *processor = NULL;
	struct sockaddr_un ctlsockaddr;
	int ctlsocket;

	/* Handle program options. */
	for (argi = 1; (argi < argc); argi++) {
		if (strcmp(argv[argi], "--debug") == 0) {
			debug = 1;
		}
		else if (argnmatch(argv[argi], "--rrddir=")) {
			char *p = strchr(argv[argi], '=');
			rrddir = strdup(p+1);
		}
		else if (argnmatch(argv[argi], "--extra-script=")) {
			char *p = strchr(argv[argi], '=');
			exthandler = strdup(p+1);
		}
		else if (argnmatch(argv[argi], "--extra-tests=")) {
			char *p = strchr(argv[argi], '=');
			extids = strdup(p+1);
		}
		else if (argnmatch(argv[argi], "--processor=")) {
			char *p = strchr(argv[argi], '=');
			processor = strdup(p+1);
		}
		else if (strcmp(argv[argi], "--no-cache") == 0) {
			use_rrd_cache = 0;
		}
		else if (net_worker_option(argv[argi])) {
			/* Handled in the subroutine */
		}
	}

	save_errbuf = 0;

	if ((rrddir == NULL) && xgetenv("XYMONRRDS")) {
		rrddir = strdup(xgetenv("XYMONRRDS"));
	}

	if (exthandler && extids) setup_exthandler(exthandler, extids);

	/* Do the network stuff if needed */
	net_worker_run(ST_RRD, LOC_STICKY, update_locator_hostdata);

	setup_signalhandler("xymond_rrd");
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = sig_handler;
	sigaction(SIGHUP, &sa, NULL);
	sigaction(SIGCHLD, &sa, NULL);
	sigaction(SIGTERM, &sa, NULL);
	sigaction(SIGINT, &sa, NULL);
	signal(SIGPIPE, SIG_DFL);

	/* Setup the control socket that receives cache-flush commands */
	memset(&ctlsockaddr, 0, sizeof(ctlsockaddr));
	sprintf(ctlsockaddr.sun_path, "%s/rrdctl.%d", xgetenv("XYMONTMP"), getpid());
	unlink(ctlsockaddr.sun_path);     /* In case it was accidentally left behind */
	ctlsockaddr.sun_family = AF_UNIX;
	ctlsocket = socket(AF_UNIX, SOCK_DGRAM, 0);
	if (ctlsocket == -1) {
		errprintf("Cannot create cache-control socket (%s)\n", strerror(errno));
		return 1;
	}
	fcntl(ctlsocket, F_SETFL, O_NONBLOCK);
	if (bind(ctlsocket, (struct sockaddr *)&ctlsockaddr, sizeof(ctlsockaddr)) == -1) {
		errprintf("Cannot bind to cache-control socket (%s)\n", strerror(errno));
		return 1;
	}
	/* Linux obeys filesystem permissions on the socket file, so make it world-accessible */
	if (chmod(ctlsockaddr.sun_path, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH) == -1) {
		errprintf("Setting permissions on cache-control socket failed: %s\n", strerror(errno));
	}

	/* Load the RRD definitions */
	load_rrddefs();

	/* If we are passing data to an external processor, create the pipe to it */
	setup_extprocessor(processor);

	running = 1;
	while (running) {
		char *eoln, *restofmsg = NULL;
		char *metadata[MAX_META+1];
		int metacount;
		char *p;
		char *hostname = NULL, *testname = NULL, *sender = NULL, *classname = NULL, *pagepaths = NULL;
		xymonrrd_t *ldef = NULL;
		time_t tstamp;
		int childstat;
                ssize_t n;
		char ctlbuf[PATH_MAX];
		int gotcachectlmessage;
		time_t now;

		/* See if we have any cache-control messages pending */
		do {
			n = recv(ctlsocket, ctlbuf, sizeof(ctlbuf), 0);
			gotcachectlmessage = (n > 0);
			if (gotcachectlmessage) {
				/* We have a control message */
				char *bol, *eol;

				ctlbuf[n] = '\0';
				bol = ctlbuf;
				do {
					eol = strchr(bol, '\n'); if (eol) *eol = '\0';
					rrdcacheflushhost(bol);
					if (eol) { bol = eol+1; } else bol = NULL;
				} while (bol && *bol);
			}
		} while (gotcachectlmessage);

		/* Get next message */
		msg = get_xymond_message(C_LAST, argv[0], &seq, NULL);
		if (msg == NULL) {
			running = 0;
			continue;
		}

		now = gettimer();
		if (reloadtime < now) {
			/* Reload configuration files */
			load_hostnames(xgetenv("HOSTSCFG"), NULL, get_fqdn());
			load_client_config(NULL);
			reloadtime = now + 600;
		}

		/* Split the message in the first line (with meta-data), and the rest */
 		eoln = strchr(msg, '\n');
		if (eoln) {
			*eoln = '\0';
			restofmsg = eoln+1;
		}

		/* Parse the meta-data */
		metacount = 0; 
		memset(&metadata, 0, sizeof(metadata));
		p = gettok(msg, "|");
		while (p && (metacount < MAX_META)) {
			metadata[metacount++] = p;
			p = gettok(NULL, "|");
		}
		metadata[metacount] = NULL;

		if ((metacount >= 14) && (strncmp(metadata[0], "@@status", 8) == 0)) {
			/*
			 * @@status|timestamp|sender|origin|hostname|testname|expiretime|color|testflags|\
			 * prevcolor|changetime|ackexpiretime|ackmessage|disableexpiretime|disablemessage|\
			 * clienttstamp|flapping|classname|pagepaths
			 */
			int color = parse_color(metadata[7]);

			switch (color) {
			  case COL_GREEN:
			  case COL_YELLOW:
			  case COL_RED:
			  case COL_BLUE: /* Blue is OK, because it only arrives here when an update is sent */
				tstamp = atoi(metadata[1]);
				sender = metadata[2];
				hostname = metadata[4]; 
				testname = metadata[5];
				classname = (metadata[17] ? metadata[17] : "");
				pagepaths = (metadata[18] ? metadata[18] : "");
				ldef = find_xymon_rrd(testname, metadata[8]);
				update_rrd(hostname, testname, restofmsg, tstamp, sender, ldef, classname, pagepaths);
				break;

			  default:
				/* Ignore reports with purple, blue or clear - they have no data we want. */
				break;
			}
		}
		else if ((metacount > 5) && (strncmp(metadata[0], "@@data", 6) == 0)) {
			/* @@data|timestamp|sender|origin|hostname|testname|classname|pagepaths */
			tstamp = atoi(metadata[1]);
			sender = metadata[2];
			hostname = metadata[4]; 
			testname = metadata[5];
			classname = (metadata[6] ? metadata[6] : "");
			pagepaths = (metadata[7] ? metadata[7] : "");
			ldef = find_xymon_rrd(testname, "");
			update_rrd(hostname, testname, restofmsg, tstamp, sender, ldef, classname, pagepaths);
		}
		else if (strncmp(metadata[0], "@@shutdown", 10) == 0) {
			running = 0;
			continue;
		}
		else if (strncmp(metadata[0], "@@idle", 6) == 0) {
			/* Ignored */
			continue;
		}
		else if (strncmp(metadata[0], "@@logrotate", 11) == 0) {
			char *fn = xgetenv("XYMONCHANNEL_LOGFILENAME");
			if (fn && strlen(fn)) {
				freopen(fn, "a", stdout);
				freopen(fn, "a", stderr);
			}
			continue;
		}
		else if (strncmp(metadata[0], "@@reload", 8) == 0) {
			reloadtime = 0;
		}
		else if ((metacount > 3) && (strncmp(metadata[0], "@@drophost", 10) == 0)) {
			char hostdir[PATH_MAX];
			hostname = metadata[3];

			MEMDEFINE(hostdir);

			sprintf(hostdir, "%s/%s", rrddir, hostname);
			dropdirectory(hostdir, 1);

			MEMUNDEFINE(hostdir);
		}
		else if ((metacount > 4) && (strncmp(metadata[0], "@@droptest", 10) == 0)) {
			/*
			 * Not implemented. Mappings of testnames -> rrd files is
			 * too complex, so on the rare occasion that a single test
			 * is deleted, they will have to delete the rrd files themselves.
			 */
		}
		else if ((metacount > 4) && (strncmp(metadata[0], "@@renamehost", 12) == 0)) {
			char oldhostdir[PATH_MAX];
			char newhostdir[PATH_MAX];
			char *newhostname;

			MEMDEFINE(oldhostdir);
			MEMDEFINE(newhostdir);

			hostname = metadata[3];
			newhostname = metadata[4];
			sprintf(oldhostdir, "%s/%s", rrddir, hostname);
			sprintf(newhostdir, "%s/%s", rrddir, newhostname);
			rename(oldhostdir, newhostdir);

			if (net_worker_locatorbased()) locator_rename_host(hostname, newhostname, ST_RRD);

			MEMUNDEFINE(newhostdir);
			MEMUNDEFINE(oldhostdir);
		}
		else if ((metacount > 5) && (strncmp(metadata[0], "@@renametest", 12) == 0)) {
			/* Not implemented. See "droptest". */
		}

		/* 
		 * We fork a subprocess when processing drophost requests.
		 * Pickup any finished child processes to avoid zombies
		 */
		while (wait3(&childstat, WNOHANG, NULL) > 0) ;
	}

	/* Flush all cached updates to disk */
	errprintf("Shutting down, flushing cached updates to disk\n");
	rrdcacheflushall();
	errprintf("Cache flush completed\n");

	/* Close the external processor */
	shutdown_extprocessor();

	/* Close the control socket */
	close(ctlsocket);
	unlink(ctlsockaddr.sun_path);

	return 0;
}
Esempio n. 3
0
int main(int argc, char *argv[])
{
	char *msg;
	int running;
	int argi, seq;
	int recentperiod = 3600;
	int maxrecentcount = 5;
	int logdirfull = 0;
	int minlogspace = 5;
	struct sigaction sa;

	/* Handle program options. */
	for (argi = 1; (argi < argc); argi++) {
                if (argnmatch(argv[argi], "--logdir=")) {
			clientlogdir = strchr(argv[argi], '=')+1;
		}
		else if (argnmatch(argv[argi], "--recent-period=")) {
			char *p = strchr(argv[argi], '=');
			recentperiod = 60*atoi(p+1);
		}
		else if (argnmatch(argv[argi], "--recent-count=")) {
			char *p = strchr(argv[argi], '=');
			maxrecentcount = atoi(p+1);
		}
		else if (argnmatch(argv[argi], "--minimum-free=")) {
			minlogspace = atoi(strchr(argv[argi], '=')+1);
		}
		else if (strcmp(argv[argi], "--debug") == 0) {
			/*
			 * A global "debug" variable is available. If
			 * it is set, then "dbgprintf()" outputs debug messages.
			 */
			debug = 1;
		}
		else if (net_worker_option(argv[argi])) {
			/* Handled in the subroutine */
		}
	}

	if (clientlogdir == NULL) clientlogdir = xgetenv("CLIENTLOGS");
	if (clientlogdir == NULL) {
		clientlogdir = (char *)malloc(strlen(xgetenv("XYMONVAR")) + 10);
		sprintf(clientlogdir, "%s/hostdata", xgetenv("XYMONVAR"));
	}

	save_errbuf = 0;

	/* Do the network stuff if needed */
	net_worker_run(ST_HOSTDATA, LOC_STICKY, update_locator_hostdata);

	setup_signalhandler("xymond_hostdata");
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = sig_handler;
	signal(SIGCHLD, SIG_IGN);
	sigaction(SIGHUP, &sa, NULL);
	signal(SIGPIPE, SIG_DFL);

	savetimes = xtreeNew(strcasecmp);

	running = 1;
	while (running) {
		char *eoln, *restofmsg, *p;
		char *metadata[MAX_META+1];
		int metacount;

		msg = get_xymond_message(C_CLICHG, "xymond_hostdata", &seq, NULL);
		if (msg == NULL) {
			/*
			 * get_xymond_message will return NULL if xymond_channel closes
			 * the input pipe. We should shutdown when that happens.
			 */
			running = 0;
			continue;
		}

		if (nextfscheck < gettimer()) {
			logdirfull = (chkfreespace(clientlogdir, minlogspace, minlogspace) != 0);
			if (logdirfull) errprintf("Hostdata directory %s has less than %d%% free space - disabling save of data for 5 minutes\n", clientlogdir, minlogspace);
			nextfscheck = gettimer() + 300;
		}

		/* Split the message in the first line (with meta-data), and the rest */
 		eoln = strchr(msg, '\n');
		if (eoln) {
			*eoln = '\0';
			restofmsg = eoln+1;
		}
		else {
			restofmsg = "";
		}

		metacount = 0; 
		memset(&metadata, 0, sizeof(metadata));
		p = gettok(msg, "|");
		while (p && (metacount < MAX_META)) {
			metadata[metacount++] = p;
			p = gettok(NULL, "|");
		}
		metadata[metacount] = NULL;

		if (strncmp(metadata[0], "@@clichg", 8) == 0) {
			xtreePos_t handle;
			savetimes_t *itm;
			int i, recentcount;
			time_t now = gettimer();
			char hostdir[PATH_MAX];
			char fn[PATH_MAX];
			FILE *fd;

			/* metadata[3] is the hostname */
			handle = xtreeFind(savetimes, metadata[3]);
			if (handle != xtreeEnd(savetimes)) {
				itm = (savetimes_t *)xtreeData(savetimes, handle);
			}
			else {
				itm = (savetimes_t *)calloc(1, sizeof(savetimes_t));
				itm->hostname = strdup(metadata[3]);
				xtreeAdd(savetimes, itm->hostname, itm);
			}

			/* See how many times we've saved the hostdata recently (within the past 'recentperiod' seconds) */
			for (i=0, recentcount=0; ((i < 12) && (itm->tstamp[i] > (now - recentperiod))); i++) recentcount++;
			/* If it's been saved less than 'maxrecentcount' times, then save it. Otherwise just drop it */
			if (!logdirfull && (recentcount < maxrecentcount)) {
				int written, closestatus, ok = 1;

				for (i = 10; (i > 0); i--) itm->tstamp[i+1] = itm->tstamp[i];
				itm->tstamp[0] = now;

				sprintf(hostdir, "%s/%s", clientlogdir, metadata[3]);
				mkdir(hostdir, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
				sprintf(fn, "%s/%s", hostdir, metadata[4]);
				fd = fopen(fn, "w");
				if (fd == NULL) {
					errprintf("Cannot create file %s: %s\n", fn, strerror(errno));
					continue;
				}
				written = fwrite(restofmsg, 1, strlen(restofmsg), fd);
				if (written != strlen(restofmsg)) {
					errprintf("Cannot write hostdata file %s: %s\n", fn, strerror(errno));
					closestatus = fclose(fd);	/* Ignore any close errors */
					ok = 0;
				}
				else {
					closestatus = fclose(fd);
					if (closestatus != 0) {
						errprintf("Cannot write hostdata file %s: %s\n", fn, strerror(errno));
						ok = 0;
					}
				}

				if (!ok) remove(fn);
			}
		}

		/*
		 * A "shutdown" message is sent when the master daemon
		 * terminates. The child workers should shutdown also.
		 */
		else if (strncmp(metadata[0], "@@shutdown", 10) == 0) {
			running = 0;
			continue;
		}
		else if (strncmp(metadata[0], "@@idle", 6) == 0) {
			/* Ignored */
			continue;
		}

		/*
		 * A "logrotate" message is sent when the Xymon logs are
		 * rotated. The child workers must re-open their logfiles,
		 * typically stdin and stderr - the filename is always
		 * provided in the XYMONCHANNEL_LOGFILENAME environment.
		 */
		else if (strncmp(metadata[0], "@@logrotate", 11) == 0) {
			char *fn = xgetenv("XYMONCHANNEL_LOGFILENAME");
			if (fn && strlen(fn)) {
				reopen_file(fn, "a", stdout);
				reopen_file(fn, "a", stderr);
			}
			continue;
		}

		else if ((metacount > 3) && (strncmp(metadata[0], "@@drophost", 10) == 0)) {
			/* @@drophost|timestamp|sender|hostname */
			char hostdir[PATH_MAX];
			snprintf(hostdir, sizeof(hostdir), "%s/%s", clientlogdir, basename(metadata[3]));
			dropdirectory(hostdir, 1);
		}

		else if ((metacount > 4) && (strncmp(metadata[0], "@@renamehost", 12) == 0)) {
			/* @@renamehost|timestamp|sender|hostname|newhostname */
			char oldhostdir[PATH_MAX], newhostdir[PATH_MAX];
			snprintf(oldhostdir, sizeof(oldhostdir), "%s/%s", clientlogdir, basename(metadata[3]));
			snprintf(newhostdir, sizeof(newhostdir), "%s/%s", clientlogdir, basename(metadata[4]));
			rename(oldhostdir, newhostdir);

			if (net_worker_locatorbased()) locator_rename_host(metadata[3], metadata[4], ST_HOSTDATA);
		}
		else if (strncmp(metadata[0], "@@reload", 8) == 0) {
			/* Do nothing */
		}
	}

	return 0;
}