/** Post a new job from Nagios to the dnxServer job queue.
 *
 * The check's command line is duplicated for the queued job. If the job
 * cannot be queued, the duplicate is released again before returning, so
 * the caller only has to dispose of @p jdp on failure.
 *
 * @param[in] joblist - the job list to which the new job should be posted.
 * @param[in] serial - the serial number of the new job.
 * @param[in] jdp - a pointer to a job data structure.
 * @param[in] ds - a pointer to the nagios job that's being posted.
 * @param[in] pNode - a dnxClient node request structure that is being
 *    posted with this job. The dispatcher thread will send the job to the
 *    associated node.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxPostNewJob(DnxJobList * joblist, unsigned long serial,
      DnxJobData * jdp, nebstruct_service_check_data * ds,
      DnxNodeRequest * pNode)
{
   DnxNewJob Job;
   int ret;

   assert(ds);
   assert(ds->command_line);

   // start from a zeroed job so that members not assigned below are never
   // copied into the job list with indeterminate values
   memset(&Job, 0, sizeof Job);

   // fill-in the job structure with the necessary information
   dnxMakeXID(&Job.xid, DNX_OBJ_JOB, serial, 0);
   Job.payload    = jdp;
   Job.cmd        = xstrdup(ds->command_line);
   Job.start_time = ds->start_time.tv_sec;
   Job.timeout    = ds->timeout;
   Job.expires    = Job.start_time + Job.timeout + 5;  // 5 second grace period
   Job.pNode      = pNode;

   // without a command line there is nothing to dispatch
   if (Job.cmd == 0)
      return DNX_ERR_MEMORY;

   dnxDebug(2, "DnxNebMain: Posting Job [%lu]: %s.", serial, Job.cmd);

   // post to the Job Queue
   if ((ret = dnxJobListAdd(joblist, &Job)) != DNX_OK)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_SLOTS);
      dnxLog("Failed to post Job [%lu]; \"%s\": %d.",
            Job.xid.objSerial, Job.cmd, ret);

      // the job list did not take a copy of the job, so the duplicated
      // command line is still ours to release
      xfree(Job.cmd);
   }
   else
   {
      dnxStatsInc(0, JOBS_HANDLED);

      // NOTE(review): dnxJobListAdd also audits "ASSIGN" for the same job;
      // confirm whether both audit records are intended.
      dnxAuditJob(&Job, "ASSIGN");
   }
   return ret;
}
/** Launches an external command and waits for it to return a status code. * * @param[in] script - the command line to be launched. * * @return Zero on success, or a non-zero error value. */ static int launchScript(char * script) { int ret; assert(script); // exec the script - system waits till child completes if ((ret = system(script)) == -1) { dnxLog("Failed to exec script: %s.", strerror(errno)); ret = DNX_ERR_INVALID; } else ret = DNX_OK; dnxLog("Sync script returned %d.", WEXITSTATUS(ret)); return ret; }
/** Initialize worker thread communication resources.
 *
 * Two channels are registered and opened for the worker: one to the
 * dispatcher URL and one to the collector URL. Each channel is named
 * after the address of @p ws so that names are unique per worker. On any
 * failure, everything that was set up so far is torn down again.
 *
 * @param[in] ws - a pointer to a worker thread's status data structure.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int initWorkerComm(DnxWorkerStatus * ws)
{
   char szChanDisp[64];
   char szChanColl[64];
   int ret;

   // create a channel for sending job requests (named after its memory
   // address); the pointer is converted explicitly because %lx expects an
   // unsigned long argument
   snprintf(szChanDisp, sizeof szChanDisp, "Dispatch:%lx", (unsigned long)ws);
   if ((ret = dnxChanMapAdd(szChanDisp, ws->iwlm->cfg.dispatcher)) != DNX_OK)
   {
      dnxLog("WLM: Failed to initialize dispatcher channel: %s.",
            dnxErrorString(ret));
      return ret;
   }
   if ((ret = dnxConnect(szChanDisp, 1, &ws->dispatch)) != DNX_OK)
   {
      dnxLog("WLM: Failed to open dispatcher channel: %s.",
            dnxErrorString(ret));
      goto undo_dispatch_map;
   }

   // create a channel for sending job results (named after its memory address)
   snprintf(szChanColl, sizeof szChanColl, "Collect:%lx", (unsigned long)ws);
   if ((ret = dnxChanMapAdd(szChanColl, ws->iwlm->cfg.collector)) != DNX_OK)
   {
      dnxLog("WLM: Failed to initialize collector channel: %s.",
            dnxErrorString(ret));
      goto undo_dispatch_conn;
   }
   if ((ret = dnxConnect(szChanColl, 1, &ws->collect)) != DNX_OK)
   {
      dnxLog("WLM: Failed to open collector channel: %s.",
            dnxErrorString(ret));
      goto undo_collect_map;
   }
   return 0;

   // unwind in reverse order of acquisition
undo_collect_map:
   dnxChanMapDelete(szChanColl);
undo_dispatch_conn:
   dnxDisconnect(ws->dispatch);
undo_dispatch_map:
   dnxChanMapDelete(szChanDisp);
   return ret;
}
/** Add a job to the circular job list.
 *
 * The job is copied into the slot at (or just after) the current tail while
 * the list mutex is held. The chosen slot index is written into the job's
 * XID, and the job state is set to unbound or pending depending on whether
 * its node request carries a valid slot. Waiters on the list condition are
 * signalled only for pending jobs.
 *
 * @param[in] pJobList - the job list to which the job should be added.
 * @param[in,out] pJob - the job to add; its xid.objSlot and state members
 *    are updated before it is copied into the list.
 *
 * @return Zero on success, or DNX_ERR_CAPACITY if no slot is available.
 */
int dnxJobListAdd(DnxJobList * pJobList, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long slot;
   int full = 0;
   int ret = DNX_OK;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // verify space in the job list, this keeps a single empty buffer element
   // to protect us from not knowing a full ring from an empty one: if the
   // tail slot is occupied, step to the next slot and check it against head
   slot = ilist->tail;
   if (ilist->list[slot].state)
   {
      slot = (slot + 1) % ilist->size;
      full = (slot == ilist->head);
   }

   if (full)
   {
      dnxLog("dnxJobListAdd: Out of job slots (max=%lu): %s.",
            ilist->size, pJob->cmd);
      dnxDebug(1, "dnxJobListAdd: Out of job slots (max=%lu): %s.",
            ilist->size, pJob->cmd);
      ret = DNX_ERR_CAPACITY;
   }
   else
   {
      // add the slot index to the Job's XID - this allows us to index
      // the job list using the returned result's XID.objSlot field
      pJob->xid.objSlot = slot;

      // a node request slot of -1 means no dnxClient request was available;
      // the job is queued anyway so that the timer thread can try to find
      // a dnxClient for it later
      pJob->state = (pJob->pNode->xid.objSlot == -1)
            ? DNX_JOB_UNBOUND : DNX_JOB_PENDING;

      dnxAuditJob(pJob, "ASSIGN");

      // add this job to the job list
      memcpy(&ilist->list[slot], pJob, sizeof *pJob);
      ilist->tail = slot;

      dnxDebug(1, "dnxJobListAdd: Job [%lu:%lu]: Head=%lu, Tail=%lu.",
            pJob->xid.objSerial, pJob->xid.objSlot, ilist->head, ilist->tail);

      // signal that a new job is available
      if (pJob->state == DNX_JOB_PENDING)
         pthread_cond_signal(&ilist->cond);
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Process Data Event Handler. * * @param[in] event_type - the event regarding which we were called by Nagios. * @param[in] data - an opaque pointer to an event-specific data structure. * * @return Zero if all is okay, but we want nagios to handle this event; * non-zero if there's a problem of some sort. */ static int ehProcessData(int event_type, void * data) { nebstruct_process_data *procdata = (nebstruct_process_data *)data; // validate our event type - ignore wrong event type assert(event_type == NEBCALLBACK_PROCESS_DATA); if (event_type != NEBCALLBACK_PROCESS_DATA) return OK; // sanity-check our data structure - should never happen assert(procdata); if (!procdata) { dnxLog("Startup handler received NULL process data structure."); return ERROR; } // look for process event loop start event if (procdata->type == NEBTYPE_PROCESS_EVENTLOOPSTART) { dnxDebug(2, "Startup handler received PROCESS_EVENTLOOPSTART event."); // execute sync script, if defined if (cfg.syncScript) { dnxLog("Startup handler executing plugin sync script: %s.", cfg.syncScript); // NB: This halts Nagios execution until the script exits... launchScript(cfg.syncScript); } // if server init fails, do server shutdown if (dnxServerInit() != 0) dnxServerDeInit(); } return OK; }
/** Initialize the management agent and start its server thread.
 *
 * Registers the agent channel under the agent name, opens it in passive
 * mode and then spawns the agent server thread. Each step that fails
 * releases whatever the previous steps acquired.
 *
 * @param[in] agentUrl - the URL to which the agent channel is mapped.
 * @param[in] parser - the configuration parser, stored for later use by
 *    the agent.
 *
 * @return Zero on success, or a non-zero error value.
 */
int dnxInitAgent(char * agentUrl, DnxCfgParser * parser)
{
   int rc;

   // reset module state
   s_shutdown = 0;
   s_agentTid = 0;
   s_parser = parser;

   rc = dnxChanMapAdd(s_agentName, agentUrl);
   if (rc != DNX_OK)
   {
      dnxLog("AGENT channel init failed: %s.", dnxErrorString(rc));
      return rc;
   }

   rc = dnxConnect(s_agentName, DNX_MODE_PASSIVE, &s_agent);
   if (rc != DNX_OK)
   {
      dnxLog("AGENT channel connect failed: %s.", dnxErrorString(rc));
      dnxChanMapDelete(s_agentName);
      return rc;
   }

   // pthread_create reports failure through an errno-style return value
   rc = pthread_create(&s_agentTid, 0, dnxAgentServer, 0);
   if (rc != 0)
   {
      dnxLog("AGENT server init failed: %s.", strerror(rc));
      dnxDisconnect(s_agent);
      dnxChanMapDelete(s_agentName);
      return DNX_ERR_THREAD;
   }

   return rc;
}
/** Validate a configuration data structure in context.
 *
 * Checks that the three channel URLs are present and that the numeric
 * limits are at least one. If a local check pattern is configured, it is
 * compiled into the regex_t object passed through @p passthru.
 *
 * @param[in] dict - the dictionary used by the DnxCfgParser.
 * @param[in] vptrs - an array of opaque objects (either pointers or values)
 *    to be checked.
 * @param[in] passthru - an opaque pointer passed through from
 *    dnxCfgParserCreate. In this routine, it's the regex_t object into which
 *    we should parse the regular expression if one is given.
 *
 * @return Zero on success, or a non-zero error value. This error value is
 *    passed back through dnxCfgParserParse.
 */
static int validateCfg(DnxCfgDict * dict, void ** vptrs, void * passthru)
{
   regex_t * rep = (regex_t *)passthru;
   int err, ret = DNX_ERR_INVALID;
   DnxServerCfg cfg;

   assert(dict && vptrs && passthru);

   // setup data structure so we can use the same functionality we had before
   cfg.agentUrl           = (char *)vptrs[ 0];
   cfg.dispatcherUrl      = (char *)vptrs[ 1];
   cfg.collectorUrl       = (char *)vptrs[ 2];
   cfg.authWorkerNodes    = (char *)vptrs[ 3];
   cfg.maxNodeRequests    = (unsigned)(intptr_t)vptrs[ 4];
   cfg.minServiceSlots    = (unsigned)(intptr_t)vptrs[ 5];
   cfg.expirePollInterval = (unsigned)(intptr_t)vptrs[ 6];
   cfg.localCheckPattern  = (char *)vptrs[ 7];
   cfg.syncScript         = (char *)vptrs[ 8];
   cfg.logFilePath        = (char *)vptrs[ 9];
   cfg.debugFilePath      = (char *)vptrs[10];
   cfg.auditFilePath      = (char *)vptrs[11];
   cfg.debugLevel         = (unsigned)(intptr_t)vptrs[12];

   // validate configuration items in context
   if (!cfg.agentUrl)
      dnxLog("config: Missing channelAgent parameter.");
   else if (!cfg.dispatcherUrl)
      dnxLog("config: Missing channelDispatcher parameter.");
   else if (!cfg.collectorUrl)
      dnxLog("config: Missing channelCollector parameter.");
   else if (cfg.maxNodeRequests < 1)
      dnxLog("config: Invalid maxNodeRequests parameter.");
   else if (cfg.minServiceSlots < 1)
      dnxLog("config: Invalid minServiceSlots parameter.");
   else if (cfg.expirePollInterval < 1)
      dnxLog("config: Invalid expirePollInterval parameter.");
   else if (cfg.localCheckPattern && (err = regcomp(rep,
         cfg.localCheckPattern, REG_EXTENDED | REG_NOSUB)) != 0)
   {
      char buffer[128];

      regerror(err, rep, buffer, sizeof buffer);
      dnxLog("config: Failed to compile localCheckPattern (\"%s\"): %s.",
            cfg.localCheckPattern, buffer);

      // no regfree() here: the contents of a regex_t are undefined after a
      // failed regcomp(), and freeing it can double-free on some C libraries
   }
   else
      ret = 0;

   return ret;
}
/** Convert a socket address into an allocated presentation string.
 *
 * IPv4 and IPv6 addresses are rendered with inet_ntop(); any other address
 * family yields the literal string "127.0.0.1". If memory cannot be
 * obtained, the routine logs, sleeps for one second and tries again until
 * an allocation succeeds.
 *
 * @param[in] sastr - a pointer to a struct sockaddr, passed as a char
 *    pointer.
 *
 * @return A newly allocated string holding the address text. The buffer
 *    comes from xstrdup/xcalloc, so the caller presumably releases it with
 *    xfree - confirm against callers.
 */
char *ntop(const char * sastr)
{
   const struct sockaddr * sa = (const struct sockaddr *)sastr;
   char * buf;

   assert(sa);
   if (!sa)
      return xstrdup("DNX Error: Address Uknown or Corrupt! ");

   // retry in a loop rather than by recursion so that a long out-of-memory
   // condition cannot grow the stack without bound
   for (;;)
   {
      switch (sa->sa_family)
      {
         case AF_INET:
            buf = (char *)xcalloc(INET_ADDRSTRLEN + 1, sizeof(char));
            if (buf)
               inet_ntop(AF_INET,
                     &((const struct sockaddr_in *)sa)->sin_addr,
                     buf, INET_ADDRSTRLEN);
            break;

         case AF_INET6:
            buf = (char *)xcalloc(INET6_ADDRSTRLEN + 1, sizeof(char));
            if (buf)
               inet_ntop(AF_INET6,
                     &((const struct sockaddr_in6 *)sa)->sin6_addr,
                     buf, INET6_ADDRSTRLEN);
            break;

         default:
            buf = xstrdup("127.0.0.1");
            break;
      }

      if (buf)
         return buf;

      dnxLog("ntop: out of memory, sleeping for 1 second before trying again");
      sleep(1);
   }
}
/** Register a new client node "request for work" request.
 *
 * The message is either stored or used to find an existing node request
 * that should be updated. If stored, @p ppMsg is returned as zero so that
 * it will be reallocated by the caller. In all other cases, the same
 * message block can be reused by the caller for the next request.
 *
 * @param[in] ireg - the registrar on which to register a new client request.
 * @param[in,out] ppMsg - the address of the dnx client request node pointer;
 *    set to zero when the registrar keeps the message.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxRegisterNode(iDnxRegistrar * ireg, DnxNodeRequest ** ppMsg)
{
   pthread_t tid = pthread_self();
   time_t now = time(0);
   DnxNodeRequest * pReq;
   int ret = DNX_OK;

   assert(ireg && ppMsg && *ppMsg);

   // compute expiration time of this request
   pReq = *ppMsg;
   pReq->expires = now + pReq->ttl;

   dnxStatsInc(pReq->address, REQUESTS_RECEIVED);

   // an equivalent request is already queued: just refresh its expiration
   // from the incoming message
   if (dnxQueueFind(ireg->rqueue, (void **)&pReq, dnxCompareNodeReq)
         == DNX_QRES_FOUND)
   {
      pReq->expires = (*ppMsg)->expires;
      dnxDebug(2,
            "dnxRegistrar[%lx]: Updated req [%lu,%lu] at %u; expires at %u.",
            tid, pReq->xid.objSerial, pReq->xid.objSlot,
            (unsigned)(now % 1000), (unsigned)(pReq->expires % 1000));
      return ret;
   }

   // otherwise the incoming message itself goes onto the queue
   ret = dnxQueuePut(ireg->rqueue, *ppMsg);
   if (ret != DNX_OK)
   {
      dnxLog("DNX Registrar: Unable to enqueue node request: %s.",
            dnxErrorString(ret));
      return ret;
   }

   *ppMsg = 0;    // we're keeping this message; return NULL
   dnxDebug(2,
         "dnxRegistrar[%lx]: Added req [%lu,%lu] at %u; expires at %u.",
         tid, pReq->xid.objSerial, pReq->xid.objSlot,
         (unsigned)(now % 1000), (unsigned)(pReq->expires % 1000));
   return ret;
}
/** Grow the thread pool to the configured number of threads.
 *
 * This routine calculates an appropriate growth factor. If the current
 * number of threads is less than the requested initial pool size, then the
 * pool is grown to the initial pool size. If the pool is already at (or
 * above) the maximum pool size, it is not grown at all. If the current
 * number of threads is near the maximum pool size, then only grow to the
 * maximum. Otherwise it is grown by the configured pool growth value.
 *
 * @param[in] iwlm - a reference to the work load manager whose thread
 *    pool size is to be increased.
 *
 * @return Zero on success (including when no thread needed to be created),
 *    or the non-zero error value of the first worker creation that failed.
 */
static int growThreadPool(iDnxWlm * iwlm)
{
   unsigned i, add, growsz;
   int ret = 0;   // remains zero when the loop below has nothing to do

   // set additional thread count - keep us between the min and the max
   if (iwlm->threads < iwlm->cfg.poolInitial)
      growsz = iwlm->cfg.poolInitial - iwlm->threads;
   else if (iwlm->threads >= iwlm->cfg.poolMax)
      growsz = 0;    // avoids unsigned wrap-around of poolMax - threads
   else if (iwlm->threads + iwlm->cfg.poolGrow > iwlm->cfg.poolMax)
      growsz = iwlm->cfg.poolMax - iwlm->threads;
   else
      growsz = iwlm->cfg.poolGrow;

   // fill as many empty slots as we can or need to
   for (i = iwlm->threads, add = growsz; i < iwlm->poolsz && add > 0; i++, add--)
   {
      if ((ret = workerCreate(iwlm, &iwlm->pool[i])) != 0)
         break;
      iwlm->threads++;
      iwlm->tcreated++;
   }

   // growsz - add is the number of threads actually created
   dnxLog("WLM: Increased thread pool by %u.", growsz - add);
   return ret;
}
/** Create a work load manager and start its initial worker thread pool.
 *
 * Copies the configuration, allocates the worker pool table, caches the
 * first non-loopback IPv4 interface address that is up and running, and
 * determines the host name (from the configuration, or from gethostname()
 * when the configured value is the literal "NULL"). It then grows the
 * thread pool to its initial size.
 *
 * @param[in] cfg - the work load manager configuration to copy.
 * @param[out] pwlm - the address of storage for the returned work load
 *    manager object; only written on success.
 *
 * @return Zero on success, DNX_ERR_MEMORY if an allocation failed, or the
 *    error value from thread creation if no worker thread could be started.
 */
int dnxWlmCreate(DnxWlmCfgData * cfg, DnxWlm ** pwlm)
{
   iDnxWlm * iwlm;
   struct ifaddrs * ifa = NULL;
   char unset[] = "NULL";
   int ret;

   assert(cfg && pwlm);
   assert(cfg->poolMin > 0);
   assert(cfg->poolMax >= cfg->poolMin);
   assert(cfg->poolInitial >= cfg->poolMin);
   assert(cfg->poolInitial <= cfg->poolMax);

   // allocate and configure the master thread pool data structure
   if ((iwlm = (iDnxWlm *)xmalloc(sizeof *iwlm)) == 0)
      return DNX_ERR_MEMORY;

   memset(iwlm, 0, sizeof *iwlm);
   iwlm->cfg = *cfg;
   iwlm->cfg.dispatcher = xstrdup(iwlm->cfg.dispatcher);
   iwlm->cfg.collector = xstrdup(iwlm->cfg.collector);
   iwlm->poolsz = iwlm->cfg.poolMax;
   iwlm->pool = (DnxWorkerStatus **)xmalloc(iwlm->poolsz * sizeof *iwlm->pool);
   iwlm->minexectm = (unsigned)(-1);   // the largest possible value

   // if any of the above failed, we really can't continue - check before
   // the pool table is touched, and release everything that was obtained
   if (!iwlm->cfg.dispatcher || !iwlm->cfg.collector || !iwlm->pool)
   {
      xfree(iwlm->pool);
      xfree(iwlm->cfg.dispatcher);
      xfree(iwlm->cfg.collector);
      xfree(iwlm);
      return DNX_ERR_MEMORY;
   }
   memset(iwlm->pool, 0, iwlm->poolsz * sizeof *iwlm->pool);

   // cache our (primary?) ip address in binary and string format
   if (getifaddrs(&ifa) == 0)
   {
      unsigned setflags = IFF_UP | IFF_RUNNING;
      unsigned clrflags = IFF_LOOPBACK;
      struct ifaddrs * ifcur = ifa;

      // locate the first proper AF_INET address in our interface list
      while (ifcur && (ifcur->ifa_addr == 0
            || ifcur->ifa_addr->sa_family != AF_INET
            || (ifcur->ifa_flags & setflags) != setflags
            || (ifcur->ifa_flags & clrflags) != 0))
         ifcur = ifcur->ifa_next;

      if (ifcur)
      {
         // cache binary and presentation (string) versions of the ip address
         iwlm->myipaddr = (unsigned long)
               ((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr.s_addr;
         inet_ntop(ifcur->ifa_addr->sa_family,
               &((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr,
               iwlm->myipaddrstr, sizeof iwlm->myipaddrstr);
      }
      freeifaddrs(ifa);
   }

   if (!strnlen(iwlm->myhostname, 1))  // see if the hostname has been set
   {
      char machineName[MAX_HOSTNAME] = "";

      dnxDebug(3, "dnxWlmCreate: Hostname not set in parent thread.");

      if (strcmp(cfg->hostname, unset) == 0)
      {
         dnxDebug(3, "dnxWlmCreate: Hostname undefined in config.");

         // Get our hostname
         if (gethostname(machineName, MAX_HOSTNAME) == 0)
         {
            // gethostname need not terminate a truncated name
            machineName[MAX_HOSTNAME - 1] = 0;
            dnxDebug(3, "dnxWlmCreate: Hostname is [%s].", machineName);
         }
         else
         {
            dnxLog("dnxWlmCreate: Unable to obtain Hostname [%s?],"
                  "please set hostname in config.", machineName);
            snprintf(machineName, sizeof machineName, "localhost");
         }

         // cache hostname (bounded by the size of the destination)
         snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s",
               machineName);
      }
      else
      {
         dnxDebug(3, "dnxWlmCreate: Using hostname in config [%s].",
               cfg->hostname);
         snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s",
               cfg->hostname);
      }
   }
   else
   {
      // NOTE(review): iwlm was zero-filled above, so this branch looks
      // unreachable here - confirm before relying on it.
      dnxDebug(3, "dnxWlmCreate: Using cached hostname [%s].",
            iwlm->myhostname);
      strcpy(iwlm->cfg.hostname, iwlm->myhostname);
   }

   // create initial worker thread pool
   DNX_PT_MUTEX_INIT(&iwlm->mutex);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   if ((ret = growThreadPool(iwlm)) != DNX_OK)
   {
      if (iwlm->threads)
         dnxLog("WLM: Error creating SOME worker threads: %s; "
               "continuing with smaller initial pool.", dnxErrorString(ret));
      else
      {
         dnxLog("WLM: Unable to create ANY worker threads: %s; "
               "terminating.", dnxErrorString(ret));
         DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
         DNX_PT_MUTEX_DESTROY(&iwlm->mutex);

         // no thread exists that could still reference these
         xfree(iwlm->pool);
         xfree(iwlm->cfg.dispatcher);
         xfree(iwlm->cfg.collector);
         xfree(iwlm);
         return ret;
      }
   }
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   dnxLog("WLM: Started worker thread pool.");

   *pwlm = (DnxWlm *)iwlm;

   return DNX_OK;
}
/** The main thread routine for a worker thread. * * @param[in] data - an opaque pointer to a DnxWorkerStatus structure for this * thread. * * @return Always returns 0. */ static void * dnxWorker(void * data) { DnxWorkerStatus * ws = (DnxWorkerStatus *)data; pthread_t tid = pthread_self(); int retries = 0; iDnxWlm * iwlm; assert(data); iwlm = ws->iwlm; pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxWorkerCleanup, data); time(&ws->tstart); // set thread start time (for stats) while (!iwlm->terminate) { DnxNodeRequest msg; DnxJob job; int ret; // setup job request message - use thread id and node address in XID dnxMakeXID(&msg.xid, DNX_OBJ_WORKER, tid, iwlm->myipaddr); msg.reqType = DNX_REQ_REGISTER; msg.jobCap = 1; msg.ttl = iwlm->cfg.reqTimeout - iwlm->cfg.ttlBackoff; msg.hn = iwlm->myhostname; // request a job, and then wait for a job to come in... if ((ret = dnxSendNodeRequest(ws->dispatch, &msg, 0)) != DNX_OK) { dnxLog("Worker[%lx]: Error sending node request: %s.", tid, dnxErrorString(ret)); } else { DNX_PT_MUTEX_LOCK(&iwlm->mutex); iwlm->reqsent++; DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); } // wait for job, even if request was never sent if ((ret = dnxWaitForJob(ws->dispatch, &job, job.address,iwlm->cfg.reqTimeout)) != DNX_OK && ret != DNX_ERR_TIMEOUT) { dnxLog("Worker[%lx]: Error receiving job: %s.", tid, dnxErrorString(ret)); } // Allow thread to be canceled pthread_testcancel(); DNX_PT_MUTEX_LOCK(&iwlm->mutex); cleanThreadPool(iwlm); // ensure counts are accurate before using them if (ret != DNX_OK) { // if above pool minimum and exceeded max retries... 
if (iwlm->threads > iwlm->cfg.poolMin && ++retries > iwlm->cfg.maxRetries) { dnxLog("Worker[%lx]: Exiting - max retries exceeded.", tid); DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); break; } } else { iwlm->jobsrcvd++; iwlm->active++; // dnxSendJobAck(ws->collect, &job, &job.address); // dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] (T/O %d): %s.", // tid, job.xid.objSerial, job.xid.objSlot, job.timeout, job.cmd); // DnxAck ack; // ack.xid = job.xid; // ack.timestamp = job.timestamp; dnxSendJobAck(ws->collect, &job, 0); dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] to channel (%lx) (T/S %lu).", tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timestamp); // check pool size before we get too busy - // if we're not shutting down and we haven't reached the configured // maximum and this is the last thread out, then increase the pool if (!iwlm->terminate && iwlm->threads < iwlm->cfg.poolMax && iwlm->active == iwlm->threads) // Maybe more aggressive here growThreadPool(iwlm); } DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); // if we have a job, execute it and reset retry count if (ret == DNX_OK) { char resData[MAX_RESULT_DATA + 1]; DnxResult result; time_t jobstart; dnxDebug(3, "Worker[%lx]: Received job [%lu:%lu] from (%lx) (T/O %d): %s.", tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timeout, job.cmd); // prepare result structure result.xid = job.xid; // result xid must match job xid result.state = DNX_JOB_COMPLETE; // complete or expired result.delta = 0; result.resCode = DNX_PLUGIN_RESULT_OK; result.resData = 0; /** @todo Allocate result data buffer based on configured buffer size. 
*/ // we want to be able to cancel threads while they're out on a task // in order to obtain timely shutdown for long jobs - move into // async cancel mode, but only for the duration of the check pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0); *resData = 0; jobstart = time(0); dnxPluginExecute(job.cmd, &result.resCode, resData, sizeof resData - 1, job.timeout,iwlm->cfg.showNodeAddr? iwlm->myipaddrstr: 0); result.delta = time(0) - jobstart; pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); // store allocated copy of the result string if (*resData) result.resData = xstrdup(resData); dnxDebug(3, "Worker[%lx]: Job [%lu:%lu] completed in %lu seconds: %d, %s.", tid, job.xid.objSerial, job.xid.objSlot, result.delta, result.resCode, result.resData); // if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) { // dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.", // tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret)); // } // Wait while we wait for an Ack to our Results DnxJob ack; int trys = 1; while(trys < 4) { if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) { dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret)); break; } // Now wait for our Ack if ((ret = dnxWaitForAck(ws->dispatch, &ack, job.address, 3)) != DNX_OK && ret != DNX_ERR_TIMEOUT) { dnxDebug(3, "Worker[%lx]: Error receiving Ack for job [%lu:%lu]: %s. Retry (%i).", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys); } else if (ret == DNX_ERR_TIMEOUT) { // we didn't get our Ack trys++; } else { // We got our Ack dnxDebug(3, "Worker[%lx]: Ack Received for job [%lu:%lu]: %s. 
After (%i) try(s).", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys); break; } } xfree(result.resData); // update all statistics DNX_PT_MUTEX_LOCK(&iwlm->mutex); { // track status if (result.resCode == DNX_PLUGIN_RESULT_OK) iwlm->jobsok++; else iwlm->jobsfail++; // track min/max/avg execution time if (result.delta > iwlm->maxexectm) iwlm->maxexectm = result.delta; if (result.delta < iwlm->minexectm) iwlm->minexectm = result.delta; iwlm->avgexectm = (iwlm->avgexectm + result.delta) / 2; // total job processing time iwlm->jobtm += (unsigned)result.delta; iwlm->active--; // reduce active count } DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); ws->serial++; // increment job serial number for next job retries = 0; } } pthread_cleanup_pop(1); return 0; }
/** Log changes between old and new configuration data sets.
 *
 * Every parameter whose value differs between the two sets is reported.
 * Changes to the dispatcher and collector URLs are reported together with
 * a note that they require a restart.
 *
 * @param[in] ocp - a reference to the old configuration data set.
 * @param[in] ncp - a reference to the new configuration data set.
 */
static void logConfigChanges(DnxWlmCfgData * ocp, DnxWlmCfgData * ncp)
{
   if (strcmp(ocp->dispatcher, ncp->dispatcher) != 0)
      dnxLog("Config parameter 'channelDispatcher' changed from %s to %s. "
            "NOTE: Changing the dispatcher URL requires a restart.",
            ocp->dispatcher, ncp->dispatcher);

   if (strcmp(ocp->collector, ncp->collector) != 0)
      dnxLog("Config parameter 'channelCollector' changed from %s to %s. "
            "NOTE: Changing the collector URL requires a restart.",
            ocp->collector, ncp->collector);

   if (ocp->reqTimeout != ncp->reqTimeout)
      dnxLog("Config parameter 'threadRequestTimeout' changed from %u to %u.",
            ocp->reqTimeout, ncp->reqTimeout);

   if (ocp->ttlBackoff != ncp->ttlBackoff)
      dnxLog("Config parameter 'threadTtlBackoff' changed from %u to %u.",
            ocp->ttlBackoff, ncp->ttlBackoff);

   if (ocp->maxRetries != ncp->maxRetries)
      dnxLog("Config parameter 'threadMaxTimeouts' changed from %u to %u.",
            ocp->maxRetries, ncp->maxRetries);

   if (ocp->poolMin != ncp->poolMin)
      dnxLog("Config parameter 'poolMin' changed from %u to %u.",
            ocp->poolMin, ncp->poolMin);

   if (ocp->poolInitial != ncp->poolInitial)
      dnxLog("Config parameter 'poolInitial' changed from %u to %u.",
            ocp->poolInitial, ncp->poolInitial);

   if (ocp->poolMax != ncp->poolMax)
      dnxLog("Config parameter 'poolMax' changed from %u to %u.",
            ocp->poolMax, ncp->poolMax);

   if (ocp->poolGrow != ncp->poolGrow)
      dnxLog("Config parameter 'poolGrow' changed from %u to %u.",
            ocp->poolGrow, ncp->poolGrow);

   if (ocp->pollInterval != ncp->pollInterval)
      dnxLog("Config parameter 'wlmPollInterval' changed from %u to %u.",
            ocp->pollInterval, ncp->pollInterval);

   if (ocp->shutdownGrace != ncp->shutdownGrace)
      dnxLog("Config parameter 'wlmShutdownGracePeriod' changed from %u to %u.",
            ocp->shutdownGrace, ncp->shutdownGrace);

   if (ocp->maxResults != ncp->maxResults)
      dnxLog("Config parameter 'maxResultBuffer' changed from %u to %u.",
            ocp->maxResults, ncp->maxResults);

   if (ocp->showNodeAddr != ncp->showNodeAddr)
      dnxLog("Config parameter 'showNodeAddr' changed from %s to %s.",
            ocp->showNodeAddr ? "TRUE" : "FALSE",
            ncp->showNodeAddr ? "TRUE" : "FALSE");

   // host names are strings: compare their contents, not their addresses
   if (strcmp(ocp->hostname, ncp->hostname) != 0)
      dnxLog("Config parameter 'hostname' changed from %s to %s.",
            ocp->hostname, ncp->hostname);
}
/** The agent thread control procedure. * * @param[in] data - thread data; not used. * * @return Always returns a null pointer (zero). */ static void * dnxAgentServer(void * data) { int ret; DnxMgmtRequest Msg; Msg.action = 0; dnxLog("DNX Server Agent awaiting commands..."); while (!s_shutdown) { memset(Msg.address, '\0', DNX_MAX_ADDRESS); // wait 2 second for a request; process the request, if valid if ((ret = dnxWaitForMgmtRequest(s_agent, &Msg, Msg.address, 2)) == DNX_OK) { DnxMgmtReply Rsp; char addrstr[DNX_MAX_ADDRSTR]; dnxDebug(2, "Received MgmtRequest from %s.", dnxNtop(Msg.address, addrstr, sizeof addrstr)); // setup some default response values Rsp.xid = Msg.xid; Rsp.status = DNX_REQ_ACK; Rsp.reply = 0; // perform the requested action if (!strcmp(Msg.action, "RESETSTATS")) { dnxStatsResetServerStats(); dnxStatsForEachNode(dnxResetNodeStats, 0); Rsp.reply = xstrdup("OK"); } else if (!strncmp(Msg.action, "GETSTATS ", 9)) { if ((Rsp.reply = buildMgmtStatsReply(Msg.action + 9)) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strncmp(Msg.action, "GETNODESTATS ", 13)) { if ((Rsp.reply = buildMgmtNodeStatsReply(Msg.action + 13)) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETNODELIST")) { if ((Rsp.reply = buildMgmtNodeListReply()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETCONFIG")) { if ((Rsp.reply = buildMgmtCfgReply()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETVERSION")) { if ((Rsp.reply = versionText()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "HELP")) { if ((Rsp.reply = buildHelpReply()) == 0) Rsp.status = DNX_REQ_NAK; } // send response, log response failures if ((ret = dnxSendMgmtReply(s_agent, &Rsp, Msg.address)) != 0) dnxLog("Agent response failure: %s.", dnxErrorString(ret)); // free request and reply message buffers xfree(Rsp.reply); xfree(Msg.action); } else if (ret != DNX_ERR_TIMEOUT) dnxLog("Agent channel failure: %s.", dnxErrorString(ret)); } dnxLog("Agent 
terminating..."); return 0; }
/** Service Check Event Handler. * * @param[in] event_type - the event type for which we're being called. * @param[in] data - an opaque pointer to nagios event-specific data. * * @return Zero if we want Nagios to handle the event; * NEBERROR_CALLBACKOVERRIDE indicates that we want to handle the event * ourselves; any other non-zero value represents an error. */ static int ehSvcCheck(int event_type, void * data) { static unsigned long serial = 0; // the number of service checks processed nebstruct_service_check_data * svcdata = (nebstruct_service_check_data *)data; DnxNodeRequest * pNode; DnxJobData * jdp; int ret; if (event_type != NEBCALLBACK_SERVICE_CHECK_DATA) return OK; if (svcdata == 0) { dnxLog("Service handler received NULL service data structure."); return ERROR; // shouldn't happen - internal Nagios error } if (svcdata->type != NEBTYPE_SERVICECHECK_INITIATE) return OK; // ignore non-initiate service checks // check for local execution pattern on command line if (cfg.localCheckPattern && regexec(®Ex, svcdata->command_line, 0, 0, 0) == 0) { dnxDebug(1, "Service will execute locally: %s.", svcdata->command_line); return OK; // tell nagios execute locally } dnxDebug(3, "ehSvcCheck: Received Job [%lu] at %lu (%lu).", serial, (unsigned long)time(0), (unsigned long)svcdata->start_time.tv_sec); if ((ret = dnxGetNodeRequest(registrar, &pNode)) != DNX_OK) { dnxDebug(3, "ehSvcCheck: No worker nodes requests available: %s.",dnxErrorString(ret)); return OK; // tell nagios execute locally } // allocate and populate a new job payload object if ((jdp = (DnxJobData *)xmalloc(sizeof *jdp)) == 0) { dnxDebug(1, "ehSvcCheck: Out of memory!"); return OK; } memset(jdp, 0, sizeof *jdp); jdp->svc = (service *)svcdata->OBJECT_FIELD_NAME; assert(jdp->svc); #if CURRENT_NEB_API_VERSION == 3 { // a nagios 3.x global variable extern check_result check_result_info; /** @todo patch nagios to pass these values to the event handler. 
*/ jdp->chkopts = check_result_info.check_options; jdp->schedule = check_result_info.scheduled_check; jdp->reschedule = check_result_info.reschedule_check; } #endif if ((ret = dnxPostNewJob(joblist, serial, jdp, svcdata, pNode)) != DNX_OK) { dnxLog("Unable to post job [%lu]: %s.", serial, dnxErrorString(ret)); xfree(jdp); return OK; // tell nagios execute locally } serial++; // bump serial number return NEBERROR_CALLBACKOVERRIDE; // tell nagios we want it }
/** The main NEB module initialization routine.
 *
 * This function gets called when the module is loaded by the event broker.
 * It loads the configuration, starts logging, and registers for process
 * data events; the remainder of server initialization is deferred to the
 * process data handler.
 *
 * @param[in] flags - module flags - not used
 * @param[in] args - module arguments. These come from the nagios
 *    configuration file, and are passed through to the module as it loads.
 *    When empty, the default server configuration file path is used.
 * @param[in] handle - our module handle - passed from the OS to nagios as
 *    nagios loaded us.
 *
 * @return Zero on success, or a non-zero error value.
 */
int nebmodule_init(int flags, char * args, nebmodule * handle)
{
   int ret;

   myHandle = handle;

   // module args string should contain a fully-qualified config file path
   if (!args || !*args)
      args = DNX_DEFAULT_SERVER_CONFIG_FILE;

   if ((ret = initConfig(args)) != 0)
      return ERROR;

   // set configured debug level and syslog log facility code
   dnxLogInit(cfg.logFilePath, cfg.debugFilePath, cfg.auditFilePath,
         &cfg.debugLevel);

   dnxLog("-------- DNX Server Module Version %s Startup --------", VERSION);
   dnxLog("Copyright (c) 2006-2010 Intellectual Reserve. All rights reserved.");
   dnxLog("Configuration file: %s.", args);
   dnxLog("Dispatcher: %s.", cfg.dispatcherUrl);
   dnxLog("Collector: %s.", cfg.collectorUrl);
   dnxLog("Agent: %s.", cfg.agentUrl);

   if (cfg.debugFilePath && cfg.debugLevel != 0)
      dnxLog("Debug logging enabled at level %d to %s.",
            cfg.debugLevel, cfg.debugFilePath);

   if (cfg.auditFilePath)
      dnxLog("Auditing enabled to %s.", cfg.auditFilePath);

   // build-time debug features are reported once, regardless of whether
   // debug logging is enabled
#if DEBUG_HEAP
   dnxLog("Debug heap is enabled.");
#endif
#if DEBUG_LOCKS
   dnxLog("Debug locks are enabled.");
#endif

   // subscribe to PROCESS_DATA call-backs in order to defer initialization
   //    until after Nagios validates its configuration and environment.
   if ((ret = neb_register_callback(NEBCALLBACK_PROCESS_DATA,
         myHandle, 0, ehProcessData)) != OK)
   {
      // NOTE(review): ret comes from the NEB API here, yet it is rendered
      // with dnxErrorString - confirm the text is meaningful.
      dnxLog("PROCESS_DATA event registration failed: %s.",
            dnxErrorString(ret));
      releaseConfig();
      return ERROR;
   }
   start_time = time(0);

   dnxLog("-------- DNX Server Module Startup Complete --------");

   return OK;
}
/** The main timer thread procedure entry point. * * @param[in] data - an opaque pointer to thread data for the timer thread. * This is actually the dnx server global data object. * * @return Always returns 0. */ static void * dnxTimer(void * data) { iDnxTimer * itimer = (iDnxTimer *)data; DnxNewJob ExpiredList[MAX_EXPIRED]; int i, totalExpired; int ret = 0; assert(data); pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxTimerCleanup, data); dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self()); while (1) { pthread_testcancel(); dnxCancelableSleep(itimer->sleepms); // search for expired jobs in the pending queue totalExpired = MAX_EXPIRED; if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, &totalExpired)) == DNX_OK && totalExpired > 0) { for (i = 0; i < totalExpired; i++) { char msg[256]; char addrstr[DNX_MAX_ADDRSTR]; DnxNewJob * job = &ExpiredList[i]; dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.", pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd); dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT); dnxAuditJob(job, "EXPIRE"); // if (job->ack) snprintf(msg, sizeof msg, "(DNX: Service Check [%lu,%lu] Timed Out - " "Node: %s - Failed to return job response in time allowed)", job->xid.objSerial, job->xid.objSlot, addrstr); // else // snprintf(msg, sizeof msg, // "(DNX: Service Check [%lu,%lu] Timed Out - " // "Node: %s - Failed to acknowledge job receipt)", // job->xid.objSerial, job->xid.objSlot, addrstr); dnxDebug(2, msg); // report the expired job to Nagios ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, time(0) - job->start_time, 1, 0, msg); dnxJobCleanup(job); } } if (totalExpired > 0 || ret != DNX_OK) dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d Retcode=%d: %s.", pthread_self(), totalExpired, ret, dnxErrorString(ret)); } dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret)); 
pthread_cleanup_pop(1); return 0; }
/** Initialize the dnxServer. * * @return Zero on success, or a non-zero error value. */ static int dnxServerInit(void) { int ret, joblistsz; // clear globals so we know what to "undo" as we back out joblist = 0; registrar = 0; dispatcher = 0; collector = 0; if ((ret = dnxChanMapInit(0)) != 0) { dnxLog("Failed to initialize channel map: %s.", dnxErrorString(ret)); return ret; } joblistsz = dnxCalculateJobListSize(); dnxLog("Allocating %d service request slots in the DNX job list.", joblistsz); if ((ret = dnxJobListCreate(joblistsz, &joblist)) != 0) { dnxLog("Failed to initialize DNX job list with %d slots.", joblistsz); return ret; } // create and configure collector if ((ret = dnxCollectorCreate("Collect", cfg.collectorUrl, joblist, &collector)) != 0) return ret; // create and configure dispatcher if ((ret = dnxDispatcherCreate("Dispatch", cfg.dispatcherUrl, joblist, &dispatcher)) != 0) return ret; // create worker node registrar if ((ret = dnxRegistrarCreate(joblistsz * 2, dnxDispatcherGetChannel(dispatcher), ®istrar)) != 0) return ret; // initialize server management agent if ((ret = dnxInitAgent(cfg.agentUrl, parser)) != 0) return ret; #if CURRENT_NEB_API_VERSION == 3 && defined(DIRECT_POST) // register for timed event to piggy-back on reaper thread neb_register_callback(NEBCALLBACK_TIMED_EVENT_DATA, myHandle, 0, ehTimedEvent); dnxLog("Registered for TIMEDEVENT_EXECUTE event."); #endif // registration for this event starts everything rolling neb_register_callback(NEBCALLBACK_SERVICE_CHECK_DATA, myHandle, 0, ehSvcCheck); dnxLog("Registered for SERVICE_CHECK_DATA event."); dnxLog("Server initialization completed."); return 0; }