static void print_host(hostlist_t *host, htnames_t *testnames[], int testcount) { int testi, rowcount, netcount; void *hinfo = hostinfo(host->hostname); char *dispname = NULL, *clientalias = NULL, *comment = NULL, *description = NULL, *pagepathtitle = NULL; char *net = NULL, *nkalerts = NULL; char *nktime = NULL, *downtime = NULL, *reporttime = NULL; char *itm; tag_t *taghead = NULL; int contidx = 0, haveping = 0; char contcol[1024]; activealerts_t *alert; strbuffer_t *buf = newstrbuffer(0); fprintf(stdout, "<p style=\"page-break-before: always\">\n"); fprintf(stdout, "<table width=\"100%%\" border=1 summary=\"%s configuration\">\n", host->hostname); pagepathtitle = bbh_item(hinfo, BBH_PAGEPATHTITLE); if (!pagepathtitle || (strlen(pagepathtitle) == 0)) pagepathtitle = "Top page"; dispname = bbh_item(hinfo, BBH_DISPLAYNAME); if (dispname && (strcmp(dispname, host->hostname) == 0)) dispname = NULL; clientalias = bbh_item(hinfo, BBH_CLIENTALIAS); if (clientalias && (strcmp(clientalias, host->hostname) == 0)) clientalias = NULL; comment = bbh_item(hinfo, BBH_COMMENT); description = bbh_item(hinfo, BBH_DESCRIPTION); net = bbh_item(hinfo, BBH_NET); nkalerts = bbh_item(hinfo, BBH_NK); nktime = bbh_item(hinfo, BBH_NKTIME); if (!nktime) nktime = "24x7"; else nktime = strdup(timespec_text(nktime)); downtime = bbh_item(hinfo, BBH_DOWNTIME); if (downtime) downtime = strdup(timespec_text(downtime)); reporttime = bbh_item(hinfo, BBH_REPORTTIME); if (!reporttime) reporttime = "24x7"; else reporttime = strdup(timespec_text(reporttime)); rowcount = 1; if (pagepathtitle) rowcount++; if (dispname || clientalias) rowcount++; if (comment) rowcount++; if (description) rowcount++; if (!newnkconfig && nktime) rowcount++; if (downtime) rowcount++; if (reporttime) rowcount++; fprintf(stdout, "<tr>\n"); fprintf(stdout, "<th rowspan=%d align=left width=\"25%%\" valign=top>Basics</th>\n", rowcount); fprintf(stdout, "<th align=center>%s (%s)</th>\n", (dispname ? dispname : host->hostname), bbh_item(hinfo, BBH_IP)); fprintf(stdout, "</tr>\n"); if (dispname || clientalias) { fprintf(stdout, "<tr><td>Aliases:"); if (dispname) fprintf(stdout, " %s", dispname); if (clientalias) fprintf(stdout, " %s", clientalias); fprintf(stdout, "</td></tr>\n"); } if (pagepathtitle) fprintf(stdout, "<tr><td>Monitoring location: %s</td></tr>\n", pagepathtitle); if (comment) fprintf(stdout, "<tr><td>Comment: %s</td></tr>\n", comment); if (description) fprintf(stdout, "<tr><td>Description: %s</td></tr>\n", description); if (!newnkconfig && nktime) fprintf(stdout, "<tr><td>NK monitoring period: %s</td></tr>\n", nktime); if (downtime) fprintf(stdout, "<tr><td>Planned downtime: %s</td></tr>\n", downtime); if (reporttime) fprintf(stdout, "<tr><td>SLA Reporting Period: %s</td></tr>\n", reporttime); /* Build a list of the network tests */ itm = bbh_item_walk(hinfo); while (itm) { char *visdata = NULL, *colname = NULL, *expdata = NULL; bburl_t bu; int dialuptest = 0, reversetest = 0, alwaystruetest = 0, httpextra = 0; if (*itm == '?') { dialuptest=1; itm++; } if (*itm == '!') { reversetest=1; itm++; } if (*itm == '~') { alwaystruetest=1; itm++; } if ( argnmatch(itm, "http") || argnmatch(itm, "content=http") || argnmatch(itm, "cont;http") || argnmatch(itm, "cont=") || argnmatch(itm, "nocont;http") || argnmatch(itm, "nocont=") || argnmatch(itm, "post;http") || argnmatch(itm, "post=") || argnmatch(itm, "nopost;http") || argnmatch(itm, "nopost=") || argnmatch(itm, "type;http") || argnmatch(itm, "type=") ) { visdata = decode_url(itm, &bu); colname = bu.columnname; if (!colname) { if (bu.expdata) { httpextra = 1; if (contidx == 0) { colname = "content"; contidx++; } else { sprintf(contcol, "content%d", contidx); colname = contcol; contidx++; } } else { colname = "http"; } } expdata = bu.expdata; } else if (strncmp(itm, "rpc=", 4) == 0) { colname = "rpc"; visdata = strdup(itm+4); } else if (strncmp(itm, "dns=", 4) == 0) { colname = "dns"; visdata = strdup(itm+4); } else if (strncmp(itm, "dig=", 4) == 0) { colname = "dns"; visdata = strdup(itm+4); } else if (strncmp(itm, pingplus, strlen(pingplus)) == 0) { haveping = 1; colname = pingcolumn; visdata = strdup(itm+strlen(pingplus)); } else if (is_net_test(itm)) { colname = strdup(itm); } if (colname) { tag_t *newitem; addtolist: for (newitem = taghead; (newitem && strcmp(newitem->columnname, colname)); newitem = newitem->next); if (!newitem) { newitem = (tag_t *)calloc(1, sizeof(tag_t)); newitem->columnname = strdup(colname); newitem->visualdata = (visdata ? strdup(visdata) : NULL); newitem->expdata = (expdata ? strdup(expdata) : NULL); newitem->next = taghead; taghead = newitem; } else { /* Multiple tags for one column - must be http */ newitem->visualdata = (char *)realloc(newitem->visualdata, strlen(newitem->visualdata) + strlen(visdata) + 5); strcat(newitem->visualdata, "<br>"); strcat(newitem->visualdata, visdata); } if (httpextra) { httpextra = 0; colname = "http"; expdata = NULL; goto addtolist; } } itm = bbh_item_walk(NULL); } if (!haveping && !bbh_item(hinfo, BBH_FLAG_NOCONN)) { for (testi = 0; (testi < testcount); testi++) { if (strcmp(testnames[testi]->name, pingcolumn) == 0) { tag_t *newitem = (tag_t *)calloc(1, sizeof(tag_t)); newitem->columnname = strdup(pingcolumn); newitem->next = taghead; taghead = newitem; } } } /* Add the "badFOO" settings */ itm = bbh_item_walk(hinfo); while (itm) { if (strncmp(itm, "bad", 3) == 0) { char *tname, *p; int b1, b2, b3, n = -1; tag_t *tag = NULL; tname = itm+3; p = strchr(tname, ':'); if (p) { *p = '\0'; n = sscanf(p+1, "%d:%d:%d", &b1, &b2, &b3); for (tag = taghead; (tag && strcmp(tag->columnname, tname)); tag = tag->next); *p = ':'; } if (tag && (n == 3)) { tag->b1 = b1; tag->b2 = b2; tag->b3 = b3; } } itm = bbh_item_walk(NULL); } if (taghead) { fprintf(stdout, "<tr>\n"); fprintf(stdout, "<th align=left valign=top>Network tests"); if (net) fprintf(stdout, "<br>(from %s)", net); fprintf(stdout, "</th>\n"); fprintf(stdout, "<td><table border=0 cellpadding=\"3\" cellspacing=\"5\" summary=\"%s network tests\">\n", host->hostname); fprintf(stdout, "<tr><th align=left valign=top>Service</th><th align=left valign=top>NK</th><th align=left valign=top>C/Y/R limits</th><th align=left valign=top>Specifics</th></tr>\n"); } for (testi = 0, netcount = 0; (testi < testcount); testi++) { tag_t *twalk; for (twalk = taghead; (twalk && strcasecmp(twalk->columnname, testnames[testi]->name)); twalk = twalk->next); if (!twalk) continue; use_columndoc(testnames[testi]->name); fprintf(stdout, "<tr>"); fprintf(stdout, "<td valign=top>%s</td>", testnames[testi]->name); fprintf(stdout, "<td valign=top>%s</td>", nkval(host->hostname, testnames[testi]->name, nkalerts)); fprintf(stdout, "<td valign=top>"); if (twalk->b1 || twalk->b2 || twalk->b3) { fprintf(stdout, "%d/%d/%d", twalk->b1, twalk->b2, twalk->b3); } else { fprintf(stdout, "-/-/-"); } fprintf(stdout, "</td>"); fprintf(stdout, "<td valign=top>"); fprintf(stdout, "<i>%s</i>", (twalk->visualdata ? twalk->visualdata : " ")); if (twalk->expdata) fprintf(stdout, " must return <i>'%s'</i>", twalk->expdata); fprintf(stdout, "</td>"); fprintf(stdout, "</tr>"); netcount++; } if (taghead) { fprintf(stdout, "</table></td>\n"); fprintf(stdout, "</tr>\n"); } if (netcount != testcount) { fprintf(stdout, "<tr>\n"); fprintf(stdout, "<th align=left valign=top>Local tests</th>\n"); fprintf(stdout, "<td><table border=0 cellpadding=\"3\" cellspacing=\"5\" summary=\"%s local tests\">\n", host->hostname); fprintf(stdout, "<tr><th align=left valign=top>Service</th><th align=left valign=top>NK</th><th align=left valign=top>C/Y/R limits</th><th align=left valign=top>Configuration <i>(NB: Thresholds on client may differ)</i></th></tr>\n"); } for (testi = 0; (testi < testcount); testi++) { tag_t *twalk; for (twalk = taghead; (twalk && strcasecmp(twalk->columnname, testnames[testi]->name)); twalk = twalk->next); if (twalk) continue; use_columndoc(testnames[testi]->name); fprintf(stdout, "<tr>"); fprintf(stdout, "<td valign=top>%s</td>", testnames[testi]->name); fprintf(stdout, "<td valign=top>%s</td>", nkval(host->hostname, testnames[testi]->name, nkalerts)); fprintf(stdout, "<td valign=top>-/-/-</td>"); /* Make up some default configuration data */ fprintf(stdout, "<td valign=top>"); if (strcmp(testnames[testi]->name, "cpu") == 0) { fprintf(stdout, "UNIX - Yellow: Load average > 1.5, Red: Load average > 3.0<br>"); fprintf(stdout, "Windows - Yellow: CPU utilisation > 80%%, Red: CPU utilisation > 95%%"); } else if (strcmp(testnames[testi]->name, "disk") == 0) { fprintf(stdout, "Default limits: Yellow 90%% full, Red 95%% full<br>\n"); print_disklist(host->hostname); } else if (strcmp(testnames[testi]->name, "memory") == 0) { fprintf(stdout, "Yellow: swap/pagefile use > 80%%, Red: swap/pagefile use > 90%%"); } else if (strcmp(testnames[testi]->name, "procs") == 0) { htnames_t *walk; if (!host->procs) fprintf(stdout, "No processes monitored<br>\n"); for (walk = host->procs; (walk); walk = walk->next) { fprintf(stdout, "%s<br>\n", walk->name); } } else if (strcmp(testnames[testi]->name, "svcs") == 0) { htnames_t *walk; if (!host->svcs) fprintf(stdout, "No services monitored<br>\n"); for (walk = host->svcs; (walk); walk = walk->next) { fprintf(stdout, "%s<br>\n", walk->name); } } else { fprintf(stdout, " "); } fprintf(stdout, "</td>"); fprintf(stdout, "</tr>"); } if (netcount != testcount) { fprintf(stdout, "</table></td>\n"); fprintf(stdout, "</tr>\n"); } /* Do the alerts */ alert = (activealerts_t *)calloc(1, sizeof(activealerts_t)); alert->hostname = host->hostname; alert->location = bbh_item(hinfo, BBH_ALLPAGEPATHS); strcpy(alert->ip, "127.0.0.1"); alert->color = COL_RED; alert->pagemessage = ""; alert->state = A_PAGING; alert->cookie = 12345; alert_printmode(2); for (testi = 0; (testi < testcount); testi++) { alert->testname = testnames[testi]->name; if (have_recipient(alert, NULL)) print_alert_recipients(alert, buf); } xfree(alert); if (STRBUFLEN(buf) > 0) { fprintf(stdout, "<tr>\n"); fprintf(stdout, "<th align=left valign=top>Alerts</th>\n"); fprintf(stdout, "<td><table border=0 cellpadding=\"3\" cellspacing=\"5\" summary=\"%s alerts\">\n", host->hostname); fprintf(stdout, "<tr><th>Service</th><th>Recipient</th><th>1st Delay</th><th>Stop after</th><th>Repeat</th><th>Time of Day</th><th>Colors</th></tr>\n"); fprintf(stdout, "%s", STRBUF(buf)); fprintf(stdout, "</table></td>\n"); fprintf(stdout, "</tr>\n"); } /* Finish off this host */ fprintf(stdout, "</table>\n"); freestrbuffer(buf); }
int main(int argc, char *argv[]) { char *msg; int seq; int argi; int alertcolors, alertinterval; char *configfn = NULL; char *checkfn = NULL; int checkpointinterval = 900; char acklogfn[PATH_MAX]; FILE *acklogfd = NULL; char notiflogfn[PATH_MAX]; FILE *notiflogfd = NULL; char *tracefn = NULL; struct sigaction sa; int configchanged; time_t lastxmit = 0; MEMDEFINE(acklogfn); MEMDEFINE(notiflogfn); libxymon_init(argv[0]); /* Dont save the error buffer */ save_errbuf = 0; /* Load alert config */ alertcolors = colorset(xgetenv("ALERTCOLORS"), ((1 << COL_GREEN) | (1 << COL_BLUE))); alertinterval = 60*atoi(xgetenv("ALERTREPEAT")); /* Create our loookup-trees */ hostnames = xtreeNew(strcasecmp); testnames = xtreeNew(strcasecmp); locations = xtreeNew(strcasecmp); for (argi=1; (argi < argc); argi++) { if (argnmatch(argv[argi], "--config=")) { configfn = strdup(strchr(argv[argi], '=')+1); } else if (argnmatch(argv[argi], "--checkpoint-file=")) { checkfn = strdup(strchr(argv[argi], '=')+1); } else if (argnmatch(argv[argi], "--checkpoint-interval=")) { char *p = strchr(argv[argi], '=') + 1; checkpointinterval = atoi(p); } else if (argnmatch(argv[argi], "--dump-config")) { load_alertconfig(configfn, alertcolors, alertinterval); dump_alertconfig(1); return 0; } else if (argnmatch(argv[argi], "--cfid")) { include_configid = 1; } else if (argnmatch(argv[argi], "--test")) { char *testhost = NULL, *testservice = NULL, *testpage = NULL, *testcolor = "red", *testgroups = NULL; void *hinfo; int testdur = 0; FILE *logfd = NULL; activealerts_t *awalk = NULL; int paramno = 0; argi++; if (argi < argc) testhost = argv[argi]; argi++; if (argi < argc) testservice = argv[argi]; argi++; while (argi < argc) { if (strncasecmp(argv[argi], "--duration=", 11) == 0) { testdur = durationvalue(strchr(argv[argi], '=')+1); } else if (strncasecmp(argv[argi], "--color=", 8) == 0) { testcolor = strchr(argv[argi], '=')+1; } else if (strncasecmp(argv[argi], "--group=", 8) == 0) { testgroups = strchr(argv[argi], '=')+1; } else if (strncasecmp(argv[argi], "--time=", 7) == 0) { fakestarttime = (time_t)atoi(strchr(argv[argi], '=')+1); } else { paramno++; if (paramno == 1) testdur = atoi(argv[argi]); else if (paramno == 2) testcolor = argv[argi]; else if (paramno == 3) fakestarttime = (time_t) atoi(argv[argi]); } argi++; } if ((testhost == NULL) || (testservice == NULL)) { printf("Usage: xymond_alert --test HOST SERVICE [options]\n"); printf("Possible options:\n\t[--duration=MINUTES]\n\t[--color=COLOR]\n\t[--group=GROUPNAME]\n\t[--time=TIMESPEC]\n"); return 1; } load_hostnames(xgetenv("HOSTSCFG"), NULL, get_fqdn()); hinfo = hostinfo(testhost); if (hinfo) { testpage = strdup(xmh_item(hinfo, XMH_ALLPAGEPATHS)); } else { errprintf("Host not found in hosts.cfg - assuming it is on the top page\n"); testpage = ""; } awalk = (activealerts_t *)calloc(1, sizeof(activealerts_t)); awalk->hostname = find_name(hostnames, testhost); awalk->testname = find_name(testnames, testservice); awalk->location = find_name(locations, testpage); awalk->ip = strdup("127.0.0.1"); awalk->color = awalk->maxcolor = parse_color(testcolor); awalk->pagemessage = "Test of the alert configuration"; awalk->eventstart = getcurrenttime(NULL) - testdur*60; awalk->groups = (testgroups ? strdup(testgroups) : NULL); awalk->state = A_PAGING; awalk->cookie = 12345; awalk->next = NULL; logfd = fopen("/dev/null", "w"); starttrace(NULL); testonly = 1; load_alertconfig(configfn, alertcolors, alertinterval); load_holidays(0); send_alert(awalk, logfd); return 0; } else if (argnmatch(argv[argi], "--trace=")) { tracefn = strdup(strchr(argv[argi], '=')+1); starttrace(tracefn); } else if (net_worker_option(argv[argi])) { /* Handled in the subroutine */ } else if (standardoption(argv[argi])) { if (showhelp) return 0; } else { errprintf("Unknown option '%s'\n", argv[argi]); } } /* Do the network stuff if needed */ net_worker_run(ST_ALERT, LOC_SINGLESERVER, NULL); if (checkfn) { load_checkpoint(checkfn); nextcheckpoint = gettimer() + checkpointinterval; dbgprintf("Next checkpoint at %d, interval %d\n", (int) nextcheckpoint, checkpointinterval); } setup_signalhandler("xymond_alert"); /* Need to handle these ourselves, so we can shutdown and save state-info */ memset(&sa, 0, sizeof(sa)); sa.sa_handler = sig_handler; sigaction(SIGPIPE, &sa, NULL); sigaction(SIGTERM, &sa, NULL); sigaction(SIGINT, &sa, NULL); sigaction(SIGCHLD, &sa, NULL); sigaction(SIGUSR1, &sa, NULL); sigaction(SIGHUP, &sa, NULL); if (xgetenv("XYMONSERVERLOGS")) { sprintf(acklogfn, "%s/acknowledge.log", xgetenv("XYMONSERVERLOGS")); acklogfd = fopen(acklogfn, "a"); sprintf(notiflogfn, "%s/notifications.log", xgetenv("XYMONSERVERLOGS")); notiflogfd = fopen(notiflogfn, "a"); } /* * The general idea here is that this loop handles receiving of alert- * and ack-messages from the master daemon, and maintains a list of * host+test combinations that may have alerts going out. * * This module does not deal with any specific alert-configuration, * it just picks up the alert messages, maintains the list of * known tests that are in some sort of critical condition, and * periodically pushes alerts to the do_alert.c module for handling. * * The only modification of alerts that happen here is the handling * of when the next alert is due. It calls into the next_alert() * routine to learn when an alert should be repeated, and also * deals with Acknowledgments that stop alerts from going out for * a period of time. */ while (running) { char *eoln, *restofmsg; char *metadata[20]; char *p; int metacount; char *hostname = NULL, *testname = NULL; struct timespec timeout; time_t now, nowtimer; int anytogo; activealerts_t *awalk; int childstat; nowtimer = gettimer(); if (checkfn && (nowtimer > nextcheckpoint)) { dbgprintf("Saving checkpoint\n"); nextcheckpoint = nowtimer + checkpointinterval; save_checkpoint(checkfn); if (acklogfd) acklogfd = freopen(acklogfn, "a", acklogfd); if (notiflogfd) notiflogfd = freopen(notiflogfn, "a", notiflogfd); } timeout.tv_sec = 60; timeout.tv_nsec = 0; msg = get_xymond_message(C_PAGE, "xymond_alert", &seq, &timeout); if (msg == NULL) { running = 0; continue; } /* See what time it is - must happen AFTER the timeout */ now = getcurrenttime(NULL); /* Split the message in the first line (with meta-data), and the rest */ eoln = strchr(msg, '\n'); if (eoln) { *eoln = '\0'; restofmsg = eoln+1; } else { restofmsg = ""; } /* * Now parse the meta-data into elements. * We use our own "gettok()" routine which works * like strtok(), but can handle empty elements. */ metacount = 0; memset(&metadata, 0, sizeof(metadata)); p = gettok(msg, "|"); while (p && (metacount < 19)) { metadata[metacount] = p; metacount++; p = gettok(NULL, "|"); } metadata[metacount] = NULL; if (metacount > 3) hostname = metadata[3]; if (metacount > 4) testname = metadata[4]; if ((metacount > 10) && (strncmp(metadata[0], "@@page", 6) == 0)) { /* @@page|timestamp|sender|hostname|testname|hostip|expiretime|color|prevcolor|changetime|location|cookie|osname|classname|grouplist|modifiers */ int newcolor, newalertstatus, oldalertstatus; dbgprintf("Got page message from %s:%s\n", hostname, testname); traceprintf("@@page %s:%s:%s=%s\n", hostname, testname, metadata[10], metadata[7]); awalk = find_active(hostname, testname); if (awalk == NULL) { char *hwalk = find_name(hostnames, hostname); char *twalk = find_name(testnames, testname); char *pwalk = find_name(locations, metadata[10]); awalk = (activealerts_t *)calloc(1, sizeof(activealerts_t)); awalk->hostname = hwalk; awalk->testname = twalk; awalk->location = pwalk; awalk->cookie = -1; awalk->state = A_DEAD; /* * Use changetime here, if we restart the alert module then * this gets the duration values more right than using "now". * Also, define this only when a new alert arrives - we should * NOT clear this when a status goes yellow->red, or if it * flaps between yellow and red. */ awalk->eventstart = atoi(metadata[9]); add_active(awalk->hostname, awalk); traceprintf("New record\n"); } newcolor = parse_color(metadata[7]); oldalertstatus = ((alertcolors & (1 << awalk->color)) != 0); newalertstatus = ((alertcolors & (1 << newcolor)) != 0); traceprintf("state %d->%d\n", oldalertstatus, newalertstatus); if (newalertstatus) { /* It's in an alert state. */ awalk->color = newcolor; awalk->state = A_PAGING; if (newcolor > awalk->maxcolor) { if (awalk->maxcolor != 0) { /* * Severity has increased (yellow -> red). * Clear the repeat-interval, and set maxcolor to * the new color. If it drops to yellow again, * maxcolor stays at red, so a test that flaps * between yellow and red will only alert on red * the first time, and then follow the repeat * interval. */ dbgprintf("Severity increased, cleared repeat interval: %s/%s %s->%s\n", awalk->hostname, awalk->testname, colorname(awalk->maxcolor), colorname(newcolor)); clear_interval(awalk); } awalk->maxcolor = newcolor; } } else { /* * Send one "recovered" message out now, then go to A_DEAD. * Dont update the color here - we want recoveries to go out * only if the alert color triggered an alert */ awalk->state = (newcolor == COL_BLUE) ? A_DISABLED : A_RECOVERED; } if (oldalertstatus != newalertstatus) { dbgprintf("Alert status changed from %d to %d\n", oldalertstatus, newalertstatus); clear_interval(awalk); } if (awalk->ip) xfree(awalk->ip); awalk->ip = strdup(metadata[5]); awalk->cookie = atoi(metadata[11]); if (awalk->osname) xfree(awalk->osname); awalk->osname = (metadata[12] ? strdup(metadata[12]) : NULL); if (awalk->classname) xfree(awalk->classname); awalk->classname = (metadata[13] ? strdup(metadata[13]) : NULL); if (awalk->groups) xfree(awalk->groups); awalk->groups = (metadata[14] ? strdup(metadata[14]) : NULL); if (awalk->pagemessage) xfree(awalk->pagemessage); if (metadata[15]) { /* Modifiers are more interesting than the message itself */ awalk->pagemessage = (char *)malloc(strlen(awalk->hostname) + strlen(awalk->testname) + strlen(colorname(awalk->color)) + strlen(metadata[15]) + strlen(restofmsg) + 10); sprintf(awalk->pagemessage, "%s:%s %s\n%s\n%s", awalk->hostname, awalk->testname, colorname(awalk->color), metadata[15], restofmsg); } else { awalk->pagemessage = strdup(restofmsg); } } else if ((metacount > 5) && (strncmp(metadata[0], "@@ack", 5) == 0)) { /* @@ack|timestamp|sender|hostname|testname|hostip|expiretime */ /* * An ack is handled simply by setting the next * alert-time to when the ack expires. */ time_t nextalert = atoi(metadata[6]); dbgprintf("Got ack message from %s:%s\n", hostname, testname); traceprintf("@@ack: %s:%s now=%d, ackeduntil %d\n", hostname, testname, (int)now, (int)nextalert); awalk = find_active(hostname, testname); if (acklogfd) { int cookie = (awalk ? awalk->cookie : -1); int color = (awalk ? awalk->color : 0); fprintf(acklogfd, "%d\t%d\t%d\t%d\t%s\t%s.%s\t%s\t%s\n", (int)now, cookie, (int)((nextalert - now) / 60), cookie, "np_filename_not_used", hostname, testname, colorname(color), nlencode(restofmsg)); fflush(acklogfd); } if (awalk && (awalk->state == A_PAGING)) { traceprintf("Record updated\n"); awalk->state = A_ACKED; awalk->nextalerttime = nextalert; if (awalk->ackmessage) xfree(awalk->ackmessage); awalk->ackmessage = strdup(restofmsg); } else { traceprintf("No record\n"); } } else if ((metacount > 4) && (strncmp(metadata[0], "@@notify", 5) == 0)) { /* @@notify|timestamp|sender|hostname|testname|pagepath */ char *hwalk = find_name(hostnames, hostname); char *twalk = find_name(testnames, testname); char *pwalk = find_name(locations, (metadata[5] ? metadata[5] : "")); awalk = (activealerts_t *)calloc(1, sizeof(activealerts_t)); awalk->hostname = hwalk; awalk->testname = twalk; awalk->location = pwalk; awalk->cookie = -1; awalk->pagemessage = strdup(restofmsg); awalk->eventstart = getcurrenttime(NULL); awalk->state = A_NOTIFY; add_active(awalk->hostname, awalk); } else if ((metacount > 3) && ((strncmp(metadata[0], "@@drophost", 10) == 0) || (strncmp(metadata[0], "@@dropstate", 11) == 0))) { /* @@drophost|timestamp|sender|hostname */ /* @@dropstate|timestamp|sender|hostname */ xtreePos_t handle; handle = xtreeFind(hostnames, hostname); if (handle != xtreeEnd(hostnames)) { alertanchor_t *anchor = (alertanchor_t *)xtreeData(hostnames, handle); for (awalk = anchor->head; (awalk); awalk = awalk->next) awalk->state = A_DEAD; } } else if ((metacount > 4) && (strncmp(metadata[0], "@@droptest", 10) == 0)) { /* @@droptest|timestamp|sender|hostname|testname */ awalk = find_active(hostname, testname); if (awalk) awalk->state = A_DEAD; } else if ((metacount > 4) && (strncmp(metadata[0], "@@renamehost", 12) == 0)) { /* @@renamehost|timestamp|sender|hostname|newhostname */ /* * We handle rename's simply by dropping the alert. If there is still an * active alert for the host, it will have to be dealt with when the next * status update arrives. */ xtreePos_t handle; handle = xtreeFind(hostnames, hostname); if (handle != xtreeEnd(hostnames)) { alertanchor_t *anchor = (alertanchor_t *)xtreeData(hostnames, handle); for (awalk = anchor->head; (awalk); awalk = awalk->next) awalk->state = A_DEAD; } } else if ((metacount > 5) && (strncmp(metadata[0], "@@renametest", 12) == 0)) { /* @@renametest|timestamp|sender|hostname|oldtestname|newtestname */ /* * We handle rename's simply by dropping the alert. If there is still an * active alert for the host, it will have to be dealt with when the next * status update arrives. */ awalk = find_active(hostname, testname); if (awalk) awalk->state = A_DEAD; } else if (strncmp(metadata[0], "@@shutdown", 10) == 0) { running = 0; errprintf("Got a shutdown message\n"); continue; } else if (strncmp(metadata[0], "@@logrotate", 11) == 0) { char *fn = xgetenv("XYMONCHANNEL_LOGFILENAME"); if (fn && strlen(fn)) { reopen_file(fn, "a", stdout); reopen_file(fn, "a", stderr); if (tracefn) { stoptrace(); starttrace(tracefn); } } continue; } else if (strncmp(metadata[0], "@@reload", 8) == 0) { /* Nothing ... right now */ } else if (strncmp(metadata[0], "@@idle", 6) == 0) { /* Timeout */ } /* * When a burst of alerts happen, we get lots of alert messages * coming in quickly. So lets handle them in bunches and only * do the full alert handling once every 10 secs - that lets us * combine a bunch of alerts into one transmission process. */ if (nowtimer < (lastxmit+10)) continue; lastxmit = nowtimer; /* * Loop through the activealerts list and see if anything is pending. * This is an optimization, we could just as well just fork off the * notification child and let it handle all of it. But there is no * reason to fork a child process unless it is going to do something. */ configchanged = load_alertconfig(configfn, alertcolors, alertinterval); configchanged += load_holidays(0); anytogo = 0; for (awalk = alistBegin(); (awalk); awalk = alistNext()) { int anymatch = 0; switch (awalk->state) { case A_NORECIP: if (!configchanged) break; /* The configuration has changed - switch NORECIP -> PAGING */ awalk->state = A_PAGING; clear_interval(awalk); /* Fall through */ case A_PAGING: if (have_recipient(awalk, &anymatch)) { if (awalk->nextalerttime <= now) anytogo++; } else { if (!anymatch) { awalk->state = A_NORECIP; cleanup_alert(awalk); } } break; case A_ACKED: if (awalk->nextalerttime <= now) { /* An ack has expired, so drop the ack message and switch to A_PAGING */ anytogo++; if (awalk->ackmessage) xfree(awalk->ackmessage); awalk->state = A_PAGING; } break; case A_RECOVERED: case A_DISABLED: case A_NOTIFY: anytogo++; break; case A_DEAD: break; } } dbgprintf("%d alerts to go\n", anytogo); if (anytogo) { pid_t childpid; childpid = fork(); if (childpid == 0) { /* The child */ start_alerts(); for (awalk = alistBegin(); (awalk); awalk = alistNext()) { switch (awalk->state) { case A_PAGING: if (awalk->nextalerttime <= now) { send_alert(awalk, notiflogfd); } break; case A_ACKED: /* Cannot be A_ACKED unless the ack is still valid, so no alert. */ break; case A_RECOVERED: case A_DISABLED: case A_NOTIFY: send_alert(awalk, notiflogfd); break; case A_NORECIP: case A_DEAD: break; } } finish_alerts(); /* Child does not continue */ exit(0); } else if (childpid < 0) { errprintf("Fork failed, cannot send alerts: %s\n", strerror(errno)); } } /* Update the state flag and the next-alert timestamp */ for (awalk = alistBegin(); (awalk); awalk = alistNext()) { switch (awalk->state) { case A_PAGING: if (awalk->nextalerttime <= now) awalk->nextalerttime = next_alert(awalk); break; case A_NORECIP: break; case A_ACKED: /* Still cannot get here except if ack is still valid */ break; case A_RECOVERED: case A_DISABLED: case A_NOTIFY: awalk->state = A_DEAD; /* Fall through */ case A_DEAD: cleanup_alert(awalk); break; } } clean_all_active(); /* Pickup any finished child processes to avoid zombies */ while (wait3(&childstat, WNOHANG, NULL) > 0) ; } if (checkfn) save_checkpoint(checkfn); if (acklogfd) fclose(acklogfd); if (notiflogfd) fclose(notiflogfd); stoptrace(); MEMUNDEFINE(notiflogfn); MEMUNDEFINE(acklogfn); if (termsig >= 0) { errprintf("Terminated by signal %d\n", termsig); } return 0; }