/** Allocate, wire up, and launch one worker thread.
 *
 * A zeroed DnxWorkerStatus is allocated and bound to @p iwlm, its
 * communication channels are opened via initWorkerComm, and finally a
 * thread running dnxWorker is started with the status block as its argument.
 * Every failure path releases whatever was acquired up to that point.
 *
 * @param[in] iwlm - the WLM object whose thread pool is being updated.
 * @param[out] pws - receives the address of the newly allocated and
 *    configured worker status structure; written only on success.
 *
 * @return Zero on success; DNX_ERR_MEMORY if the status block cannot be
 *    allocated, the initWorkerComm error code if the channels cannot be
 *    opened, or DNX_ERR_THREAD if the thread cannot be started.
 */
static int workerCreate(iDnxWlm * iwlm, DnxWorkerStatus ** pws)
{
   DnxWorkerStatus * worker;
   int rc;

   // grab a fresh status block and start from an all-zero state
   worker = (DnxWorkerStatus *)xmalloc(sizeof *worker);
   if (worker == 0)
      return DNX_ERR_MEMORY;

   memset(worker, 0, sizeof *worker);
   worker->iwlm = iwlm;

   // open the dispatch/collect channels for this worker
   rc = initWorkerComm(worker);
   if (rc != 0)
   {
      dnxLog("WLM: Failed to initialize worker comm channels: %s.",dnxErrorString(rc));
      xfree(worker);
      return rc;
   }

   // mark the worker active before the thread starts, then launch it
   worker->state = DNX_THREAD_RUNNING;
   rc = pthread_create(&worker->tid, 0, dnxWorker, worker);
   if (rc == 0)
   {
      *pws = worker;
      return 0;
   }

   // thread launch failed: undo channel setup and drop the status block
   dnxLog("WLM: Failed to create worker thread: %s.", strerror(rc));
   releaseWorkerComm(worker);
   xfree(worker);
   return DNX_ERR_THREAD;
}
/** Create a new job list expiration timer object and start its thread.
 *
 * @param[in] joblist - the job list the timer thread will scan for expired
 *    jobs; must not be null.
 * @param[in] sleeptime - the interval, in milliseconds, between scans.
 *    Values outside the range 100 (1/10th second) to 300000 (5 minutes) are
 *    replaced with DNX_DEF_TIMER_SLEEP.
 * @param[out] ptimer - receives the new timer object on success; must not
 *    be null.
 *
 * @return DNX_OK on success, DNX_ERR_MEMORY if the timer object cannot be
 *    allocated, or DNX_ERR_THREAD if the timer thread cannot be started.
 */
int dnxTimerCreate(DnxJobList * joblist, int sleeptime, DnxTimer ** ptimer)
{
   iDnxTimer * itimer;
   int ret;

   assert(joblist && ptimer);

   // don't allow sleep times outside the range 1/10th sec to 5 minutes
   if (sleeptime < 100 || sleeptime > 300000)
      sleeptime = DNX_DEF_TIMER_SLEEP;

   if ((itimer = (iDnxTimer *)xmalloc(sizeof *itimer)) == 0)
      return DNX_ERR_MEMORY;

   // initialize the itimer
   memset(itimer, 0, sizeof *itimer);
   itimer->joblist = joblist;
   itimer->sleepms = sleeptime;

   // create the timer thread; pthread_create reports failure as an
   // errno-style number (not a DNX error code), so render it with strerror
   if ((ret = pthread_create(&itimer->tid, 0, dnxTimer, itimer)) != 0)
   {
      dnxLog("Timer thread creation failed: %s.", strerror(ret));
      xfree(itimer);
      return DNX_ERR_THREAD;
   }

   *ptimer = (DnxTimer *)itimer;

   return DNX_OK;
}
/** Create a registrar object and start its request-handling thread.
 *
 * The registrar owns a queue of pending node requests (created with xfree
 * as its element destructor) and a thread running dnxRegistrar.
 *
 * @param[in] chan - the channel on which node requests are received; must
 *    not be null.
 * @param[in] queuesz - the capacity of the node request queue; must be
 *    non-zero.
 * @param[out] preg - receives the new registrar object on success; must not
 *    be null.
 *
 * @return DNX_OK on success, DNX_ERR_MEMORY if the object cannot be
 *    allocated, the dnxQueueCreate error code if the queue cannot be
 *    created, or DNX_ERR_THREAD if the thread cannot be started.
 */
int dnxRegistrarCreate(DnxChannel * chan, unsigned queuesz, DnxRegistrar ** preg)
{
   iDnxRegistrar * self;
   int rc;

   assert(chan && queuesz && preg);

   self = (iDnxRegistrar *)xmalloc(sizeof *self);
   if (self == 0)
      return DNX_ERR_MEMORY;

   memset(self, 0, sizeof *self);
   self->channel = chan;

   // the request queue frees its own elements with xfree
   rc = dnxQueueCreate(queuesz, xfree, &self->rqueue);
   if (rc != 0)
   {
      dnxLog("DNX Registrar: Queue creation failed: %s.", dnxErrorString(rc));
      xfree(self);
      return rc;
   }

   // start the registrar thread; tear the queue back down if that fails
   rc = pthread_create(&self->tid, 0, dnxRegistrar, self);
   if (rc != 0)
   {
      dnxLog("DNX Registrar: Thread creation failed: %s.", strerror(rc));
      dnxQueueDestroy(self->rqueue);
      xfree(self);
      return DNX_ERR_THREAD;
   }

   *preg = (DnxRegistrar *)self;
   return DNX_OK;
}
/** The main NEB module initialization routine.
 *
 * This function gets called when the module is loaded by the event broker.
 * It loads the configuration, initializes logging, writes the startup
 * banner, and registers for PROCESS_DATA events so that the remainder of
 * initialization can be deferred.
 *
 * @param[in] flags - module flags - not used
 * @param[in] args - module arguments. These come from the nagios
 *    configuration file, and are passed through to the module as it loads.
 *    A null or empty string selects DNX_DEFAULT_SERVER_CONFIG_FILE.
 * @param[in] handle - our module handle - passed from the OS to nagios as
 *    nagios loaded us.
 *
 * @return OK on success, or ERROR if the configuration cannot be loaded or
 *    the PROCESS_DATA callback cannot be registered.
 */
int nebmodule_init(int flags, char * args, nebmodule * handle)
{
   int ret;

   (void)flags;   // unused, but required by the NEB module interface

   myHandle = handle;

   // module args string should contain a fully-qualified config file path
   if (!args || !*args)
      args = DNX_DEFAULT_SERVER_CONFIG_FILE;

   if ((ret = initConfig(args)) != 0)
      return ERROR;

   // set configured debug level and syslog log facility code
   dnxLogInit(cfg.logFilePath, cfg.debugFilePath, cfg.auditFilePath,
         &cfg.debugLevel);

   dnxLog("-------- DNX Server Module Version %s Startup --------", VERSION);
   dnxLog("Copyright (c) 2006-2010 Intellectual Reserve. All rights reserved.");
   dnxLog("Configuration file: %s.", args);
   dnxLog("Dispatcher: %s.", cfg.dispatcherUrl);
   dnxLog("Collector: %s.", cfg.collectorUrl);
   dnxLog("Agent: %s.", cfg.agentUrl);

   if (cfg.debugFilePath && cfg.debugLevel != 0)
      dnxLog("Debug logging enabled at level %d to %s.",
            cfg.debugLevel, cfg.debugFilePath);

   if (cfg.auditFilePath)
      dnxLog("Auditing enabled to %s.", cfg.auditFilePath);

   // heap and lock debugging are compile-time features that do not depend
   // on the run-time debug level, so report each exactly once, here
#if DEBUG_HEAP
   dnxLog("Debug heap is enabled.");
#endif
#if DEBUG_LOCKS
   dnxLog("Debug locks are enabled.");
#endif

   // subscribe to PROCESS_DATA call-backs in order to defer initialization
   //    until after Nagios validates its configuration and environment.
   if ((ret = neb_register_callback(NEBCALLBACK_PROCESS_DATA,
         myHandle, 0, ehProcessData)) != OK)
   {
      dnxLog("PROCESS_DATA event registration failed: %s.", dnxErrorString(ret));
      releaseConfig();
      return ERROR;
   }
   start_time = time(0);

   dnxLog("-------- DNX Server Module Startup Complete --------");

   return OK;
}
/** The main thread entry point procedure for the registrar thread.
 *
 * This thread handles all inbound requests in a single-threaded fashion,
 * so we can safely call dnxStatsInc here for new nodes.
 *
 * Each pass of the loop makes sure a request buffer is available, waits on
 * the registrar channel for a node request, and dispatches it by request
 * type. The loop has no exit condition of its own; the thread ends only by
 * cancellation (deferred, acted on at pthread_testcancel or another
 * cancellation point).
 *
 * @param[in] data - an opaque pointer to registrar thread data. It is cast
 *    to an iDnxRegistrar pointer; must not be null.
 *
 * @return Always returns NULL (the return statement is not reached in
 *    practice because the loop never terminates).
 */
static void * dnxRegistrar(void * data)
{
   iDnxRegistrar * ireg = (iDnxRegistrar *)data;
   DnxNodeRequest * pMsg = 0;

   assert(data);

   // allow cancellation, but only at well-defined cancellation points
   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);

   dnxLog("DNX Registrar: Awaiting worker node requests...");

   while (1)
   {
      int ret;

      // (re)allocate message block if not consumed in last pass
      if (pMsg == 0 && (pMsg = (DnxNodeRequest *)xmalloc(sizeof *pMsg)) == 0)
      {
         dnxCancelableSleep(10);    // sleep for a while and try again...
         continue;
      }

      // if we're canceled while holding the buffer, have it freed for us;
      // note the handler captures the pointer value as of this push
      pthread_cleanup_push(xfree, pMsg);

      pthread_testcancel();

      // wait on the registrar socket for a request
      if ((ret = dnxWaitForNodeRequest(ireg->channel, pMsg, pMsg->address,
            DNX_REGISTRAR_REQUEST_TIMEOUT)) == DNX_OK)
      {
         switch (pMsg->reqType)
         {
            case DNX_REQ_REGISTER:
               // may zero pMsg if the request was stored in the queue, in
               // which case a new buffer is allocated on the next pass
               ret = dnxRegisterNode(ireg, &pMsg);
               break;

            case DNX_REQ_DEREGISTER:
               ret = dnxDeregisterNode(ireg, pMsg);
               break;

            default:
               ret = DNX_ERR_UNSUPPORTED;
         }
      }

      // remove the cleanup handler without running it; the buffer is
      // either reused on the next pass or now owned by the queue
      pthread_cleanup_pop(0);

      // timeouts are the normal idle case and are not worth logging
      if (ret != DNX_OK && ret != DNX_ERR_TIMEOUT)
         dnxLog("DNX Registrar: Process node request failed: %s.",
               dnxErrorString(ret));
   }
   return 0;
}
/** Initialize worker thread communication resources.
 *
 * Two channels are registered in the channel map and connected: a dispatch
 * channel (to ws->iwlm->cfg.dispatcher) stored in ws->dispatch, and a
 * collect channel (to ws->iwlm->cfg.collector) stored in ws->collect. Each
 * channel is named after the address of @p ws so that names are unique per
 * worker. On failure, everything set up so far is torn down again.
 *
 * @param[in] ws - a pointer to a worker thread's status data structure.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int initWorkerComm(DnxWorkerStatus * ws)
{
   char szChanDisp[64];
   char szChanColl[64];
   int ret;

   // create a channel for sending job requests (named after its memory
   // address); %lx requires an unsigned long argument, so convert the
   // pointer explicitly rather than passing it through the varargs as-is
   snprintf(szChanDisp, sizeof szChanDisp, "Dispatch:%lx", (unsigned long)ws);
   if ((ret = dnxChanMapAdd(szChanDisp, ws->iwlm->cfg.dispatcher)) != DNX_OK)
   {
      dnxLog("WLM: Failed to initialize dispatcher channel: %s.", dnxErrorString(ret));
      return ret;
   }
   if ((ret = dnxConnect(szChanDisp, 1, &ws->dispatch)) != DNX_OK)
   {
      dnxLog("WLM: Failed to open dispatcher channel: %s.", dnxErrorString(ret));
      dnxChanMapDelete(szChanDisp);
      return ret;
   }

   // create a channel for sending job results (named after its memory address)
   snprintf(szChanColl, sizeof szChanColl, "Collect:%lx", (unsigned long)ws);
   if ((ret = dnxChanMapAdd(szChanColl, ws->iwlm->cfg.collector)) != DNX_OK)
   {
      dnxLog("WLM: Failed to initialize collector channel: %s.", dnxErrorString(ret));
      dnxDisconnect(ws->dispatch);
      dnxChanMapDelete(szChanDisp);
      return ret;
   }
   if ((ret = dnxConnect(szChanColl, 1, &ws->collect)) != DNX_OK)
   {
      dnxLog("WLM: Failed to open collector channel: %s.", dnxErrorString(ret));
      dnxChanMapDelete(szChanColl);
      dnxDisconnect(ws->dispatch);
      dnxChanMapDelete(szChanDisp);
      return ret;
   }
   return 0;
}
/** Initialize the management agent and start its server thread.
 *
 * Resets the agent's file-scope state, registers the agent channel under
 * s_agentName for @p agentUrl, connects it using DNX_MODE_PASSIVE, and
 * starts a thread running dnxAgentServer. Each failure is logged and the
 * steps already completed are undone.
 *
 * @param[in] agentUrl - the URL on which the agent channel is created.
 * @param[in] parser - the configuration parser, saved for later use by the
 *    agent.
 *
 * @return Zero on success; the dnxChanMapAdd or dnxConnect error code if
 *    the channel cannot be set up, or DNX_ERR_THREAD if the thread cannot
 *    be started.
 */
int dnxInitAgent(char * agentUrl, DnxCfgParser * parser)
{
   int rc;

   s_shutdown = 0;
   s_agentTid = 0;
   s_parser = parser;

   // step 1: register the agent channel name
   rc = dnxChanMapAdd(s_agentName, agentUrl);
   if (rc != DNX_OK)
   {
      dnxLog("AGENT channel init failed: %s.", dnxErrorString(rc));
      return rc;
   }

   // step 2: open the channel
   rc = dnxConnect(s_agentName, DNX_MODE_PASSIVE, &s_agent);
   if (rc != DNX_OK)
   {
      dnxLog("AGENT channel connect failed: %s.", dnxErrorString(rc));
      dnxChanMapDelete(s_agentName);
      return rc;
   }

   // step 3: start the server thread
   rc = pthread_create(&s_agentTid, 0, dnxAgentServer, 0);
   if (rc == 0)
      return rc;

   dnxLog("AGENT server init failed: %s.", strerror(rc));
   dnxDisconnect(s_agent);
   dnxChanMapDelete(s_agentName);
   return DNX_ERR_THREAD;
}
/** Return an available (unexpired) node request from the registrar queue.
 *
 * Requests are pulled from the queue until one is found whose expiration
 * time is still in the future. Expired requests are counted, recorded in
 * the node statistics, and freed.
 *
 * @param[in] reg - the registrar from which a node request is returned;
 *    must not be null.
 * @param[out] ppNode - receives the node request that was dequeued, or
 *    null if none was available. The caller takes ownership of a returned
 *    request.
 *
 * @return DNX_OK if a request is returned in @p ppNode, or the
 *    dnxQueueGet error code otherwise.
 */
int dnxGetNodeRequest(DnxRegistrar * reg, DnxNodeRequest ** ppNode)
{
   iDnxRegistrar * ireg = (iDnxRegistrar *)reg;
   int ret, discard_count = 0;
   DnxNodeRequest * node = 0;

   assert(reg && ppNode);

   while ((ret = dnxQueueGet(ireg->rqueue, (void **)&node)) == DNX_OK)
   {
      time_t now = time(0);

      // verify that this request's Time-To-Live (TTL) has not expired
      if (node->expires > now)
         break;

      dnxStatsInc(node->address, REQUESTS_EXPIRED);

      dnxDebug(3, "dnxGetNodeRequest: Expired req [%lu,%lu] at %u; expired at %u.",
            node->xid.objSerial, node->xid.objSlot,
            (unsigned)(now % 1000), (unsigned)(node->expires % 1000));

      discard_count++;

      // expired requests are ours to dispose of
      xfree(node);
      node = 0;
   }

   if (discard_count > 0)
      dnxDebug(1, "dnxGetNodeRequest: Discarded %d expired node requests.",
            discard_count);

   // a timeout is not counted as a rejected job
   if (ret != DNX_OK && ret != DNX_ERR_TIMEOUT)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_NODES);
      dnxDebug(2, "dnxGetNodeRequest: Unable to fulfill node request: %s.",
            dnxErrorString(ret));
   }

   *ppNode = node;   // return a node or NULL

   return ret;
}
/** Register a client node's "request for work" message.
 *
 * The incoming message is stamped with an expiration time (now + ttl). If
 * an equivalent request (per dnxCompareNodeReq) is already queued, only its
 * expiration time is refreshed and the caller keeps the message buffer.
 * Otherwise the message itself is queued and @p ppMsg is zeroed so the
 * caller allocates a fresh buffer for the next request.
 *
 * @param[in] ireg - the registrar on which to register a new client request.
 * @param[in,out] ppMsg - the address of the dnx client request node pointer;
 *    set to null when the registrar takes ownership of the message.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxRegisterNode(iDnxRegistrar * ireg, DnxNodeRequest ** ppMsg)
{
   pthread_t self = pthread_self();
   time_t stamp = time(0);
   DnxNodeRequest * req;
   int rc;

   assert(ireg && ppMsg && *ppMsg);

   // stamp the incoming request with its expiration time
   req = *ppMsg;
   req->expires = stamp + req->ttl;

   dnxStatsInc(req->address, REQUESTS_RECEIVED);

   // already queued? then just push out the queued copy's expiration
   if (dnxQueueFind(ireg->rqueue, (void **)&req, dnxCompareNodeReq) == DNX_QRES_FOUND)
   {
      req->expires = (*ppMsg)->expires;
      dnxDebug(2, "dnxRegistrar[%lx]: Updated req [%lu,%lu] at %u; expires at %u.",
            self, req->xid.objSerial, req->xid.objSlot,
            (unsigned)(stamp % 1000), (unsigned)(req->expires % 1000));
      return DNX_OK;
   }

   // not queued yet: hand the message over to the queue
   rc = dnxQueuePut(ireg->rqueue, *ppMsg);
   if (rc != DNX_OK)
   {
      dnxLog("DNX Registrar: Unable to enqueue node request: %s.",
            dnxErrorString(rc));
      return rc;
   }

   *ppMsg = 0;    // we're keeping this message; return NULL
   dnxDebug(2, "dnxRegistrar[%lx]: Added req [%lu,%lu] at %u; expires at %u.",
         self, req->xid.objSerial, req->xid.objSlot,
         (unsigned)(stamp % 1000), (unsigned)(req->expires % 1000));
   return rc;
}
/** The main timer thread procedure entry point. * * @param[in] data - an opaque pointer to thread data for the timer thread. * This is actually the dnx server global data object. * * @return Always returns 0. */ static void * dnxTimer(void * data) { iDnxTimer * itimer = (iDnxTimer *)data; DnxNewJob ExpiredList[MAX_EXPIRED]; int i, totalExpired; int ret = 0; assert(data); pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxTimerCleanup, data); dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self()); while (1) { pthread_testcancel(); dnxCancelableSleep(itimer->sleepms); // search for expired jobs in the pending queue totalExpired = MAX_EXPIRED; if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, &totalExpired)) == DNX_OK && totalExpired > 0) { for (i = 0; i < totalExpired; i++) { char msg[256]; char addrstr[DNX_MAX_ADDRSTR]; DnxNewJob * job = &ExpiredList[i]; dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.", pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd); dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT); dnxAuditJob(job, "EXPIRE"); // if (job->ack) snprintf(msg, sizeof msg, "(DNX: Service Check [%lu,%lu] Timed Out - " "Node: %s - Failed to return job response in time allowed)", job->xid.objSerial, job->xid.objSlot, addrstr); // else // snprintf(msg, sizeof msg, // "(DNX: Service Check [%lu,%lu] Timed Out - " // "Node: %s - Failed to acknowledge job receipt)", // job->xid.objSerial, job->xid.objSlot, addrstr); dnxDebug(2, msg); // report the expired job to Nagios ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, time(0) - job->start_time, 1, 0, msg); dnxJobCleanup(job); } } if (totalExpired > 0 || ret != DNX_OK) dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d Retcode=%d: %s.", pthread_self(), totalExpired, ret, dnxErrorString(ret)); } dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret)); 
pthread_cleanup_pop(1); return 0; }
/** Create a new Work Load Manager object and its initial worker pool.
 *
 * Copies @p cfg (duplicating the dispatcher and collector strings),
 * allocates the worker pool table sized to poolMax, caches the address of
 * the first non-loopback IPv4 interface that is up and running, determines
 * the host name to report, and starts the initial worker threads.
 *
 * @param[in] cfg - WLM configuration; must not be null. poolMin must be
 *    greater than zero and poolMin <= poolInitial <= poolMax.
 * @param[out] pwlm - receives the new WLM object on success; must not be
 *    null.
 *
 * @return DNX_OK on success (including when only some of the initial
 *    threads could be started), DNX_ERR_MEMORY on allocation failure, or
 *    the growThreadPool error code if no worker thread could be started.
 */
int dnxWlmCreate(DnxWlmCfgData * cfg, DnxWlm ** pwlm)
{
   iDnxWlm * iwlm;
   struct ifaddrs * ifa = NULL;
   char unset[] = "NULL";
   int ret;

   assert(cfg && pwlm);
   assert(cfg->poolMin > 0);
   assert(cfg->poolMax >= cfg->poolMin);
   assert(cfg->poolInitial >= cfg->poolMin);
   assert(cfg->poolInitial <= cfg->poolMax);

   // allocate and configure the master thread pool data structure
   if ((iwlm = (iDnxWlm *)xmalloc(sizeof *iwlm)) == 0)
      return DNX_ERR_MEMORY;

   memset(iwlm, 0, sizeof *iwlm);
   iwlm->cfg = *cfg;
   iwlm->cfg.dispatcher = xstrdup(iwlm->cfg.dispatcher);
   iwlm->cfg.collector = xstrdup(iwlm->cfg.collector);
   iwlm->poolsz = iwlm->cfg.poolMax;
   iwlm->pool = (DnxWorkerStatus **)xmalloc(iwlm->poolsz * sizeof *iwlm->pool);
   iwlm->minexectm = (unsigned)(-1);   // the largest possible value

   // if any allocation failed, we really can't continue - check before the
   // pool is touched, and release everything that did get allocated
   if (!iwlm->cfg.dispatcher || !iwlm->cfg.collector || !iwlm->pool)
   {
      xfree(iwlm->pool);
      xfree(iwlm->cfg.dispatcher);
      xfree(iwlm->cfg.collector);
      xfree(iwlm);
      return DNX_ERR_MEMORY;
   }
   memset(iwlm->pool, 0, iwlm->poolsz * sizeof *iwlm->pool);

   // cache our (primary?) ip address in binary and string format
   if (getifaddrs(&ifa) == 0)
   {
      u_int setflags = IFF_UP | IFF_RUNNING;
      u_int clrflags = IFF_LOOPBACK;
      struct ifaddrs * ifcur = ifa;

      // locate the first proper AF_INET address in our interface list
      while (ifcur && (ifcur->ifa_addr == 0
            || ifcur->ifa_addr->sa_family != AF_INET
            || (ifcur->ifa_flags & setflags) != setflags
            || (ifcur->ifa_flags & clrflags) != 0))
         ifcur = ifcur->ifa_next;

      if (ifcur)
      {
         // cache binary and presentation (string) versions of the ip address
         iwlm->myipaddr = (unsigned long)
               ((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr.s_addr;
         inet_ntop(ifcur->ifa_addr->sa_family,
               &((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr,
               iwlm->myipaddrstr, sizeof iwlm->myipaddrstr);
      }
      freeifaddrs(ifa);
   }

   if (!strnlen(iwlm->myhostname, 1))   // See if the global hostname has been set
   {
      dnxDebug(3, "dnxWlmCreate: Hostname not set in parent thread.");
      if (strcmp(cfg->hostname, unset) == 0)
      {
         char machineName[MAX_HOSTNAME];

         // start empty so the buffer is printable even if gethostname fails
         machineName[0] = '\0';

         dnxDebug(3, "dnxWlmCreate: Hostname undefined in config.");

         // Get our hostname
         if (gethostname(machineName, MAX_HOSTNAME) == 0)
         {
            // gethostname need not terminate a truncated name
            machineName[MAX_HOSTNAME - 1] = '\0';
            dnxDebug(3, "dnxWlmCreate: Hostname is [%s].", machineName);
         }
         else
         {
            dnxLog("dnxWlmCreate: Unable to obtain Hostname [%s?],"
                  "please set hostname in config.", machineName);
            snprintf(machineName, sizeof machineName, "localhost");
         }

         // cache hostname (bounded by the destination array)
         snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s", machineName);
      }
      else
      {
         dnxDebug(3, "dnxWlmCreate: Using hostname in config [%s].", cfg->hostname);
         snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s", cfg->hostname);
      }
   }
   else
   {
      // NOTE(review): iwlm was zero-filled above, so myhostname is always
      // empty here and this branch looks unreachable - confirm intent
      dnxDebug(3, "dnxWlmCreate: Using cached hostname [%s].", iwlm->myhostname);
      strcpy(iwlm->cfg.hostname, iwlm->myhostname);
   }

   // create initial worker thread pool
   DNX_PT_MUTEX_INIT(&iwlm->mutex);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   if ((ret = growThreadPool(iwlm)) != DNX_OK)
   {
      if (iwlm->threads)
         dnxLog("WLM: Error creating SOME worker threads: %s; "
               "continuing with smaller initial pool.", dnxErrorString(ret));
      else
      {
         dnxLog("WLM: Unable to create ANY worker threads: %s; "
               "terminating.", dnxErrorString(ret));
         DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
         DNX_PT_MUTEX_DESTROY(&iwlm->mutex);

         // no workers exist, so nothing else references these buffers
         xfree(iwlm->pool);
         xfree(iwlm->cfg.dispatcher);
         xfree(iwlm->cfg.collector);
         xfree(iwlm);
         return ret;
      }
   }
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   dnxLog("WLM: Started worker thread pool.");

   *pwlm = (DnxWlm *)iwlm;

   return DNX_OK;
}
/** The main thread routine for a worker thread. * * @param[in] data - an opaque pointer to a DnxWorkerStatus structure for this * thread. * * @return Always returns 0. */ static void * dnxWorker(void * data) { DnxWorkerStatus * ws = (DnxWorkerStatus *)data; pthread_t tid = pthread_self(); int retries = 0; iDnxWlm * iwlm; assert(data); iwlm = ws->iwlm; pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxWorkerCleanup, data); time(&ws->tstart); // set thread start time (for stats) while (!iwlm->terminate) { DnxNodeRequest msg; DnxJob job; int ret; // setup job request message - use thread id and node address in XID dnxMakeXID(&msg.xid, DNX_OBJ_WORKER, tid, iwlm->myipaddr); msg.reqType = DNX_REQ_REGISTER; msg.jobCap = 1; msg.ttl = iwlm->cfg.reqTimeout - iwlm->cfg.ttlBackoff; msg.hn = iwlm->myhostname; // request a job, and then wait for a job to come in... if ((ret = dnxSendNodeRequest(ws->dispatch, &msg, 0)) != DNX_OK) { dnxLog("Worker[%lx]: Error sending node request: %s.", tid, dnxErrorString(ret)); } else { DNX_PT_MUTEX_LOCK(&iwlm->mutex); iwlm->reqsent++; DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); } // wait for job, even if request was never sent if ((ret = dnxWaitForJob(ws->dispatch, &job, job.address,iwlm->cfg.reqTimeout)) != DNX_OK && ret != DNX_ERR_TIMEOUT) { dnxLog("Worker[%lx]: Error receiving job: %s.", tid, dnxErrorString(ret)); } // Allow thread to be canceled pthread_testcancel(); DNX_PT_MUTEX_LOCK(&iwlm->mutex); cleanThreadPool(iwlm); // ensure counts are accurate before using them if (ret != DNX_OK) { // if above pool minimum and exceeded max retries... 
if (iwlm->threads > iwlm->cfg.poolMin && ++retries > iwlm->cfg.maxRetries) { dnxLog("Worker[%lx]: Exiting - max retries exceeded.", tid); DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); break; } } else { iwlm->jobsrcvd++; iwlm->active++; // dnxSendJobAck(ws->collect, &job, &job.address); // dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] (T/O %d): %s.", // tid, job.xid.objSerial, job.xid.objSlot, job.timeout, job.cmd); // DnxAck ack; // ack.xid = job.xid; // ack.timestamp = job.timestamp; dnxSendJobAck(ws->collect, &job, 0); dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] to channel (%lx) (T/S %lu).", tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timestamp); // check pool size before we get too busy - // if we're not shutting down and we haven't reached the configured // maximum and this is the last thread out, then increase the pool if (!iwlm->terminate && iwlm->threads < iwlm->cfg.poolMax && iwlm->active == iwlm->threads) // Maybe more aggressive here growThreadPool(iwlm); } DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); // if we have a job, execute it and reset retry count if (ret == DNX_OK) { char resData[MAX_RESULT_DATA + 1]; DnxResult result; time_t jobstart; dnxDebug(3, "Worker[%lx]: Received job [%lu:%lu] from (%lx) (T/O %d): %s.", tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timeout, job.cmd); // prepare result structure result.xid = job.xid; // result xid must match job xid result.state = DNX_JOB_COMPLETE; // complete or expired result.delta = 0; result.resCode = DNX_PLUGIN_RESULT_OK; result.resData = 0; /** @todo Allocate result data buffer based on configured buffer size. 
*/ // we want to be able to cancel threads while they're out on a task // in order to obtain timely shutdown for long jobs - move into // async cancel mode, but only for the duration of the check pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0); *resData = 0; jobstart = time(0); dnxPluginExecute(job.cmd, &result.resCode, resData, sizeof resData - 1, job.timeout,iwlm->cfg.showNodeAddr? iwlm->myipaddrstr: 0); result.delta = time(0) - jobstart; pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); // store allocated copy of the result string if (*resData) result.resData = xstrdup(resData); dnxDebug(3, "Worker[%lx]: Job [%lu:%lu] completed in %lu seconds: %d, %s.", tid, job.xid.objSerial, job.xid.objSlot, result.delta, result.resCode, result.resData); // if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) { // dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.", // tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret)); // } // Wait while we wait for an Ack to our Results DnxJob ack; int trys = 1; while(trys < 4) { if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) { dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret)); break; } // Now wait for our Ack if ((ret = dnxWaitForAck(ws->dispatch, &ack, job.address, 3)) != DNX_OK && ret != DNX_ERR_TIMEOUT) { dnxDebug(3, "Worker[%lx]: Error receiving Ack for job [%lu:%lu]: %s. Retry (%i).", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys); } else if (ret == DNX_ERR_TIMEOUT) { // we didn't get our Ack trys++; } else { // We got our Ack dnxDebug(3, "Worker[%lx]: Ack Received for job [%lu:%lu]: %s. 
After (%i) try(s).", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys); break; } } xfree(result.resData); // update all statistics DNX_PT_MUTEX_LOCK(&iwlm->mutex); { // track status if (result.resCode == DNX_PLUGIN_RESULT_OK) iwlm->jobsok++; else iwlm->jobsfail++; // track min/max/avg execution time if (result.delta > iwlm->maxexectm) iwlm->maxexectm = result.delta; if (result.delta < iwlm->minexectm) iwlm->minexectm = result.delta; iwlm->avgexectm = (iwlm->avgexectm + result.delta) / 2; // total job processing time iwlm->jobtm += (unsigned)result.delta; iwlm->active--; // reduce active count } DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); ws->serial++; // increment job serial number for next job retries = 0; } } pthread_cleanup_pop(1); return 0; }
/** The agent thread control procedure. * * @param[in] data - thread data; not used. * * @return Always returns a null pointer (zero). */ static void * dnxAgentServer(void * data) { int ret; DnxMgmtRequest Msg; Msg.action = 0; dnxLog("DNX Server Agent awaiting commands..."); while (!s_shutdown) { memset(Msg.address, '\0', DNX_MAX_ADDRESS); // wait 2 second for a request; process the request, if valid if ((ret = dnxWaitForMgmtRequest(s_agent, &Msg, Msg.address, 2)) == DNX_OK) { DnxMgmtReply Rsp; char addrstr[DNX_MAX_ADDRSTR]; dnxDebug(2, "Received MgmtRequest from %s.", dnxNtop(Msg.address, addrstr, sizeof addrstr)); // setup some default response values Rsp.xid = Msg.xid; Rsp.status = DNX_REQ_ACK; Rsp.reply = 0; // perform the requested action if (!strcmp(Msg.action, "RESETSTATS")) { dnxStatsResetServerStats(); dnxStatsForEachNode(dnxResetNodeStats, 0); Rsp.reply = xstrdup("OK"); } else if (!strncmp(Msg.action, "GETSTATS ", 9)) { if ((Rsp.reply = buildMgmtStatsReply(Msg.action + 9)) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strncmp(Msg.action, "GETNODESTATS ", 13)) { if ((Rsp.reply = buildMgmtNodeStatsReply(Msg.action + 13)) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETNODELIST")) { if ((Rsp.reply = buildMgmtNodeListReply()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETCONFIG")) { if ((Rsp.reply = buildMgmtCfgReply()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETVERSION")) { if ((Rsp.reply = versionText()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "HELP")) { if ((Rsp.reply = buildHelpReply()) == 0) Rsp.status = DNX_REQ_NAK; } // send response, log response failures if ((ret = dnxSendMgmtReply(s_agent, &Rsp, Msg.address)) != 0) dnxLog("Agent response failure: %s.", dnxErrorString(ret)); // free request and reply message buffers xfree(Rsp.reply); xfree(Msg.action); } else if (ret != DNX_ERR_TIMEOUT) dnxLog("Agent channel failure: %s.", dnxErrorString(ret)); } dnxLog("Agent 
terminating..."); return 0; }
/** Service Check Event Handler. * * @param[in] event_type - the event type for which we're being called. * @param[in] data - an opaque pointer to nagios event-specific data. * * @return Zero if we want Nagios to handle the event; * NEBERROR_CALLBACKOVERRIDE indicates that we want to handle the event * ourselves; any other non-zero value represents an error. */ static int ehSvcCheck(int event_type, void * data) { static unsigned long serial = 0; // the number of service checks processed nebstruct_service_check_data * svcdata = (nebstruct_service_check_data *)data; DnxNodeRequest * pNode; DnxJobData * jdp; int ret; if (event_type != NEBCALLBACK_SERVICE_CHECK_DATA) return OK; if (svcdata == 0) { dnxLog("Service handler received NULL service data structure."); return ERROR; // shouldn't happen - internal Nagios error } if (svcdata->type != NEBTYPE_SERVICECHECK_INITIATE) return OK; // ignore non-initiate service checks // check for local execution pattern on command line if (cfg.localCheckPattern && regexec(®Ex, svcdata->command_line, 0, 0, 0) == 0) { dnxDebug(1, "Service will execute locally: %s.", svcdata->command_line); return OK; // tell nagios execute locally } dnxDebug(3, "ehSvcCheck: Received Job [%lu] at %lu (%lu).", serial, (unsigned long)time(0), (unsigned long)svcdata->start_time.tv_sec); if ((ret = dnxGetNodeRequest(registrar, &pNode)) != DNX_OK) { dnxDebug(3, "ehSvcCheck: No worker nodes requests available: %s.",dnxErrorString(ret)); return OK; // tell nagios execute locally } // allocate and populate a new job payload object if ((jdp = (DnxJobData *)xmalloc(sizeof *jdp)) == 0) { dnxDebug(1, "ehSvcCheck: Out of memory!"); return OK; } memset(jdp, 0, sizeof *jdp); jdp->svc = (service *)svcdata->OBJECT_FIELD_NAME; assert(jdp->svc); #if CURRENT_NEB_API_VERSION == 3 { // a nagios 3.x global variable extern check_result check_result_info; /** @todo patch nagios to pass these values to the event handler. 
*/ jdp->chkopts = check_result_info.check_options; jdp->schedule = check_result_info.scheduled_check; jdp->reschedule = check_result_info.reschedule_check; } #endif if ((ret = dnxPostNewJob(joblist, serial, jdp, svcdata, pNode)) != DNX_OK) { dnxLog("Unable to post job [%lu]: %s.", serial, dnxErrorString(ret)); xfree(jdp); return OK; // tell nagios execute locally } serial++; // bump serial number return NEBERROR_CALLBACKOVERRIDE; // tell nagios we want it }
/** Initialize the dnxServer. * * @return Zero on success, or a non-zero error value. */ static int dnxServerInit(void) { int ret, joblistsz; // clear globals so we know what to "undo" as we back out joblist = 0; registrar = 0; dispatcher = 0; collector = 0; if ((ret = dnxChanMapInit(0)) != 0) { dnxLog("Failed to initialize channel map: %s.", dnxErrorString(ret)); return ret; } joblistsz = dnxCalculateJobListSize(); dnxLog("Allocating %d service request slots in the DNX job list.", joblistsz); if ((ret = dnxJobListCreate(joblistsz, &joblist)) != 0) { dnxLog("Failed to initialize DNX job list with %d slots.", joblistsz); return ret; } // create and configure collector if ((ret = dnxCollectorCreate("Collect", cfg.collectorUrl, joblist, &collector)) != 0) return ret; // create and configure dispatcher if ((ret = dnxDispatcherCreate("Dispatch", cfg.dispatcherUrl, joblist, &dispatcher)) != 0) return ret; // create worker node registrar if ((ret = dnxRegistrarCreate(joblistsz * 2, dnxDispatcherGetChannel(dispatcher), ®istrar)) != 0) return ret; // initialize server management agent if ((ret = dnxInitAgent(cfg.agentUrl, parser)) != 0) return ret; #if CURRENT_NEB_API_VERSION == 3 && defined(DIRECT_POST) // register for timed event to piggy-back on reaper thread neb_register_callback(NEBCALLBACK_TIMED_EVENT_DATA, myHandle, 0, ehTimedEvent); dnxLog("Registered for TIMEDEVENT_EXECUTE event."); #endif // registration for this event starts everything rolling neb_register_callback(NEBCALLBACK_SERVICE_CHECK_DATA, myHandle, 0, ehSvcCheck); dnxLog("Registered for SERVICE_CHECK_DATA event."); dnxLog("Server initialization completed."); return 0; }
/** The main program entry point for the dnx management client. * * @param[in] argc - the number of elements in the @p argv array. * @param[in] argv - a null-terminated array of command-line arguments. * * @return Zero on success, or a non-zero error code that is returned to the * shell. Any non-zero codes should be values between 1 and 127. */ int main(int argc, char ** argv) { extern char * optarg; extern int optind, opterr, optopt; gTopDCS = dnxComStatCreateDCS("127.0.0.1"); int ch, ret; char * cp, * prog, * cmdstr; char * hoststr, * portstr; // get program base name prog = (char *)((cp = strrchr(argv[0], '/')) != 0 ? (cp + 1) : argv[0]); // parse arguments hoststr = "localhost"; portstr = "12482"; opterr = 0; cmdstr = 0; while ((ch = getopt(argc, argv, "hvc:s:p:")) != -1) { switch (ch) { case 's': hoststr = optarg; break; case 'p': portstr = optarg; break; case 'c': cmdstr = optarg; break; case 'v': printf("\n %s version %s\n Bug reports: %s.\n\n", prog, VERSION, PACKAGE_BUGREPORT); exit(0); case 'h': default : usage(prog); } } // ensure we've been given a command if (!cmdstr) { fprintf(stderr, "%s: No command string specified.\n", prog); usage(prog); } // init comm sub-system; send command; wait for response if ((ret = dnxChanMapInit(0)) != 0) fprintf(stderr, "%s: Error initializing channel map: %s.\n", prog, dnxErrorString(ret)); else { char url[1024]; snprintf(url, sizeof url, "udp://%s:%s", hoststr, portstr); if ((ret = dnxChanMapAdd("MgmtClient", url)) != 0) fprintf(stderr, "%s: Error adding channel (%s): %s.\n", prog, url, dnxErrorString(ret)); else { DnxChannel * channel; if ((ret = dnxConnect("MgmtClient", 1, &channel)) != 0) fprintf(stderr, "%s: Error connecting to server (%s): %s.\n", prog, url, dnxErrorString(ret)); else { DnxMgmtRequest req; memset(&req, 0, sizeof req); dnxMakeXID(&req.xid, DNX_OBJ_MANAGER, 0, 0); req.action = cmdstr; if ((ret = dnxSendMgmtRequest(channel, &req, 0)) != 0) fprintf(stderr, "%s: Error sending request: %s.\n", prog, 
dnxErrorString(ret)); else { DnxMgmtReply rsp; if ((ret = dnxWaitForMgmtReply(channel, &rsp, 0, 10)) != 0) fprintf(stderr, "%s: Error receiving response: %s.\n", prog, dnxErrorString(ret)); else { if (rsp.status == DNX_REQ_ACK) printf("%s\n", rsp.reply); else fprintf(stderr, "%s: Request failed on server.\nResponse was (%s)\n", prog,rsp.reply); } } dnxDisconnect(channel); } dnxChanMapDelete("MgmtClient"); } dnxChanMapRelease(); } xheapchk(); return ret? -1: 0; }