/** Shut down a work load manager and release everything it owns.
 *
 * Raises the terminate flag, then repeatedly reaps finished workers until
 * either none are left or cfg.shutdownGrace seconds have elapsed. Workers
 * still marked as running are then cancelled, given one more pause, and
 * reaped. Finally the pool array, the mutex, the duplicated dispatcher and
 * collector strings and the manager object itself are freed.
 *
 * @param[in] wlm - the work load manager to destroy; must not be null.
 */
void dnxWlmDestroy(DnxWlm * wlm)
{
   iDnxWlm * iwlm = (iDnxWlm *)wlm;
   time_t deadline;
   unsigned idx = 0;

   assert(wlm);

   dnxLog("WLM: Beginning termination sequence...");

   // ask the workers to stop, and note when our patience runs out
   iwlm->terminate = 1;
   deadline = iwlm->cfg.shutdownGrace + time(0);

   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   for (;;)
   {
      if (!(iwlm->threads > 0 && time(0) < deadline))
         break;

      cleanThreadPool(iwlm);

      // let the workers take the mutex while we nap
      DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
      dnxCancelableSleep(100);
      DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   }

   // report stragglers that outlived the grace period
   if (iwlm->threads)
   {
      dnxDebug(1, "WLM: Termination - %d workers remaining"
            " after grace period.", iwlm->threads);
   }

   // cancel whatever is still marked as running
   while (idx < iwlm->threads)
   {
      if (iwlm->pool[idx]->state == DNX_THREAD_RUNNING)
      {
         dnxDebug(1, "WLMDestroy: Cancelling worker[%lx].",
               iwlm->pool[idx]->tid);
         pthread_cancel(iwlm->pool[idx]->tid);
      }
      idx++;
   }

   // give the cancelled workers a moment to run their cleanup handlers
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
   dnxCancelableSleep(1000);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);

   // everything left should be a zombie by now - join them all
   cleanThreadPool(iwlm);
   assert(iwlm->threads == 0);
   xfree(iwlm->pool);

   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
   DNX_PT_MUTEX_DESTROY(&iwlm->mutex);

   xfree(iwlm->cfg.dispatcher);
   xfree(iwlm->cfg.collector);
   xfree(iwlm);

   dnxLog("WLM: Termination sequence complete.");
}
/** Append a job to the circular job list.
 *
 * The job's XID slot is set to the list index it is stored at, and its state
 * is set to DNX_JOB_UNBOUND when its node request carries slot -1, or to
 * DNX_JOB_PENDING otherwise. Pending jobs wake a waiter on the list's
 * condition variable.
 *
 * @param[in] pJobList - the job list to add to.
 * @param[in,out] pJob - the job to add; its xid.objSlot and state fields
 *    are updated before the job is copied into the list.
 *
 * @return DNX_OK on success, or DNX_ERR_CAPACITY if no slot is free.
 */
int dnxJobListAdd(DnxJobList * pJobList, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long slot;
   int full = 0;
   int ret = DNX_OK;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   slot = ilist->tail;

   // One element is always kept empty so that a full ring can be told from
   // an empty one: if the tail slot is occupied we step forward, and
   // stepping onto the head means the ring is full.
   if (ilist->list[slot].state)
   {
      slot = (slot + 1) % ilist->size;
      full = (slot == ilist->head);
   }

   if (full)
   {
      dnxLog("dnxJobListAdd: Out of job slots (max=%lu): %s.",
            ilist->size, pJob->cmd);
      dnxDebug(1, "dnxJobListAdd: Out of job slots (max=%lu): %s.",
            ilist->size, pJob->cmd);
      ret = DNX_ERR_CAPACITY;
   }
   else
   {
      // storing the slot index in the XID lets a returned result be mapped
      // straight back to its job list entry
      pJob->xid.objSlot = slot;

      // a node request with slot -1 means no client was available; the job
      // is queued anyway as unbound
      pJob->state = (pJob->pNode->xid.objSlot == -1)
            ? DNX_JOB_UNBOUND : DNX_JOB_PENDING;

      dnxAuditJob(pJob, "ASSIGN");

      // store the job and advance the tail
      memcpy(&ilist->list[slot], pJob, sizeof *pJob);
      ilist->tail = slot;

      dnxDebug(1, "dnxJobListAdd: Job [%lu:%lu]: Head=%lu, Tail=%lu.",
            pJob->xid.objSerial, pJob->xid.objSlot, ilist->head, ilist->tail);

      // only a pending job is worth waking a waiter for
      if (pJob->state == DNX_JOB_PENDING)
         pthread_cond_signal(&ilist->cond);
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Worker thread clean-up handler.
 *
 * Logs that the calling thread is terminating and marks its status record
 * as a zombie.
 *
 * @param[in] data - an opaque pointer to the worker's DnxWorkerStatus
 *    structure; must not be null.
 */
static void dnxWorkerCleanup(void * data)
{
   DnxWorkerStatus * status;

   assert(data);

   dnxDebug(2, "Worker[%lx]: Terminating.", pthread_self());

   status = (DnxWorkerStatus *)data;
   status->state = DNX_THREAD_ZOMBIE;
}
/** Clean up zombie threads and compact the thread pool.
 *
 * Every pool entry whose state is DNX_THREAD_ZOMBIE is joined, its
 * communication resources and status record are released, and the pointer
 * array is shifted down over the gap. Thread count, destroyed count and
 * accumulated thread time are updated as entries are removed, and
 * @em lastclean is set to the current time.
 *
 * The visible callers invoke this with the manager's mutex held.
 *
 * @param[in] iwlm - a pointer to the work load manager data structure.
 */
static void cleanThreadPool(iDnxWlm * iwlm)
{
   unsigned i = 0;
   time_t now = time(0);

   iwlm->lastclean = now;  // keep track of when we last cleaned

   // look for zombie threads to join
   while (i < iwlm->threads)
   {
      DnxWorkerStatus * ws = iwlm->pool[i];

      if (ws->state != DNX_THREAD_ZOMBIE)
      {
         i++;
         continue;
      }

      dnxDebug(1, "WLM: Joining worker[%lx]...", ws->tid);
      pthread_join(ws->tid, 0);

      // reduce thread count; update stats
      iwlm->threads--;
      iwlm->tdestroyed++;
      iwlm->threadtm += (unsigned)(now - ws->tstart);

      // release thread resources; delete thread; compact ptr array
      releaseWorkerComm(ws);
      xfree(iwlm->pool[i]);
      memmove(&iwlm->pool[i], &iwlm->pool[i + 1],
            (iwlm->threads - i) * sizeof iwlm->pool[i]);

      // the slot just past the live entries now holds a stale copy of a
      // pointer (or the freed one) - clear it so unused slots stay null

      iwlm->pool[iwlm->threads] = 0;

      // do not advance i: the next candidate has moved into slot i
   }
}
/** Post a new job from Nagios to the dnxServer job queue.
 *
 * Builds a job record from the Nagios service check data and adds it to the
 * job list. The command line is duplicated; if the job cannot be posted the
 * duplicate is released again before returning, since the job list keeps no
 * copy of a rejected job.
 *
 * @param[in] joblist - the job list to which the new job should be posted.
 * @param[in] serial - the serial number of the new job.
 * @param[in] jdp - a pointer to a job data structure.
 * @param[in] ds - a pointer to the nagios job that's being posted.
 * @param[in] pNode - a dnxClient node request structure that is being
 *    posted with this job. The dispatcher thread will send the job to the
 *    associated node.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxPostNewJob(DnxJobList * joblist, unsigned long serial,
      DnxJobData * jdp, nebstruct_service_check_data * ds,
      DnxNodeRequest * pNode)
{
   DnxNewJob Job;
   int ret;

   assert(ds);
   assert(ds->command_line);

   // The whole structure is copied into the job list, so start from zero:
   // fields that are not assigned below must not carry stack garbage.
   memset(&Job, 0, sizeof Job);

   // fill-in the job structure with the necessary information
   dnxMakeXID(&Job.xid, DNX_OBJ_JOB, serial, 0);
   Job.payload    = jdp;
   Job.cmd        = xstrdup(ds->command_line);
   Job.start_time = ds->start_time.tv_sec;
   Job.timeout    = ds->timeout;
   Job.expires    = Job.start_time + Job.timeout + 5;
   Job.pNode      = pNode;

   if (Job.cmd == 0)
      return DNX_ERR_MEMORY;

   dnxDebug(2, "DnxNebMain: Posting Job [%lu]: %s.", serial, Job.cmd);

   // post to the Job Queue
   if ((ret = dnxJobListAdd(joblist, &Job)) != DNX_OK)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_SLOTS);
      dnxLog("Failed to post Job [%lu]; \"%s\": %d.",
            Job.xid.objSerial, Job.cmd, ret);

      // the list did not take the job, so the command copy is still ours
      xfree(Job.cmd);
   }
   else
   {
      dnxStatsInc(0, JOBS_HANDLED);

      // NOTE(review): dnxJobListAdd already audits "ASSIGN" for this job;
      // confirm whether this second audit record is intended.
      dnxAuditJob(&Job, "ASSIGN");
   }
   return ret;
}
/** Match an incoming result XID to its job and hand a copy to the caller.
 *
 * The slot index in @p pxid selects the job list entry. When the entry
 * holds the same XID and has not yet been collected, it is marked
 * DNX_JOB_RECEIVED and copied into @p pJob. Whenever a matching,
 * non-expired entry is found, the list's condition variable is signalled
 * so that an acknowledgement can be sent.
 *
 * @param[in] pJobList - the job list to search.
 * @param[in] pxid - the XID carried by the result.
 * @param[out] pJob - receives a copy of the job; written only when DNX_OK
 *    is returned.
 *
 * @return DNX_OK on success; DNX_ERR_INVALID if the slot index is out of
 *    range; DNX_ERR_NOTFOUND if the slot is empty or holds another XID;
 *    DNX_ERR_EXPIRED if the job has expired; DNX_ERR_ALREADY if the job
 *    was already received or completed (its ack flag is cleared so another
 *    acknowledgement goes out).
 */
int dnxJobListCollect(DnxJobList * pJobList, DnxXID * pxid, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   int ret = DNX_OK;

   assert(pJobList && pxid && pJob);   // parameter validation

   current = pxid->objSlot;

   // head is printed as unsigned long everywhere else in this module
   dnxDebug(4, "dnxJobListCollect: Job serial (%lu) slot (%lu) list head(%lu)",
         pxid->objSerial, pxid->objSlot, (unsigned long)ilist->head);

   if (current >= ilist->size)         // runtime validation requires check
      return DNX_ERR_INVALID;          // corrupt client network message

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // verify that the XID of this result matches the XID of the service check
   if (ilist->list[current].state == DNX_JOB_NULL
         || !dnxEqualXIDs(pxid, &ilist->list[current].xid))
   {
      dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] not found.",
            pxid->objSerial, pxid->objSlot);
      ret = DNX_ERR_NOTFOUND;          // Very old job or we restarted and lost state
   }
   else if (ilist->list[current].state == DNX_JOB_EXPIRED)
   {
      dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] expired before retrieval.",
            pxid->objSerial, pxid->objSlot);
      ret = DNX_ERR_EXPIRED;           // job expired; removed by the timer
   }
   else
   {
      if (ilist->list[current].state == DNX_JOB_COMPLETE
            || ilist->list[current].state == DNX_JOB_RECEIVED)
      {
         dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] already retrieved.",
               pxid->objSerial, pxid->objSlot);
         ilist->list[current].ack = 0;
         ret = DNX_ERR_ALREADY;        // It needs another Ack
      }
      else
      {
         // any remaining state (in progress, pending, unbound) is accepted
         ilist->list[current].state = DNX_JOB_RECEIVED;

         // make a copy to return to the Collector
         memcpy(pJob, &ilist->list[current], sizeof *pJob);
         dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] completed. Copy of result for (%s) assigned to collector.",
               pxid->objSerial, pxid->objSlot, pJob->cmd);
      }

      // Signal to the dispatcher that we need to send an Ack
      pthread_cond_signal(&ilist->cond);
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Fetch the next unexpired node request from the registrar's queue.
 *
 * Requests are taken from the queue one at a time; any whose expiry time
 * has passed are counted, freed and skipped.
 *
 * @param[in] reg - the registrar to take a request from.
 * @param[out] ppNode - receives the request, or null if none was obtained.
 *
 * @return DNX_OK when a request is returned, otherwise the error returned
 *    by the queue.
 */
int dnxGetNodeRequest(DnxRegistrar * reg, DnxNodeRequest ** ppNode)
{
   iDnxRegistrar * ireg = (iDnxRegistrar *)reg;
   DnxNodeRequest * req = 0;
   int expired = 0;
   int ret;

   assert(reg && ppNode);

   for (;;)
   {
      time_t now;

      ret = dnxQueueGet(ireg->rqueue, (void **)&req);
      if (ret != DNX_OK)
         break;

      // a request is usable only while its Time-To-Live has not run out
      now = time(0);
      if (req->expires > now)
         break;

      dnxStatsInc(req->address, REQUESTS_EXPIRED);

      dnxDebug(3, "dnxRegisterNode: Expired req [%lu,%lu] at %u; expired at %u.",
            req->xid.objSerial, req->xid.objSlot,
            (unsigned)(now % 1000), (unsigned)(req->expires % 1000));

      expired++;

      xfree(req);
      req = 0;
   }

   if (expired > 0)
   {
      dnxDebug(1, "dnxGetNodeRequest: Discarded %d expired node requests.",
            expired);
   }

   if (ret != DNX_OK && ret != DNX_ERR_TIMEOUT)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_NODES);
      dnxDebug(2, "dnxGetNodeRequest: Unable to fulfill node request: %s.",
            dnxErrorString(ret));
   }

   *ppNode = req;   // a usable request, or null

   return ret;
}
/** Register a new client node "request for work" request.
 *
 * The request's expiry time is computed from its TTL. If a matching request
 * is already queued, only that entry's expiry time is refreshed and the
 * caller keeps its message block. Otherwise the message itself is queued
 * and @p ppMsg is set to zero so the caller allocates a fresh block.
 *
 * @param[in] ireg - the registrar on which to register a new client request.
 * @param[in,out] ppMsg - the address of the dnx client request node pointer.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxRegisterNode(iDnxRegistrar * ireg, DnxNodeRequest ** ppMsg)
{
   pthread_t tid = pthread_self();
   time_t now = time(0);
   DnxNodeRequest * entry;
   int ret;

   assert(ireg && ppMsg && *ppMsg);

   // stamp the incoming request with its expiration time
   entry = *ppMsg;
   entry->expires = now + entry->ttl;

   dnxStatsInc(entry->address, REQUESTS_RECEIVED);

   // already queued? then just refresh the queued entry's expiration time
   if (dnxQueueFind(ireg->rqueue, (void **)&entry, dnxCompareNodeReq)
         == DNX_QRES_FOUND)
   {
      entry->expires = (*ppMsg)->expires;
      dnxDebug(2,
            "dnxRegistrar[%lx]: Updated req [%lu,%lu] at %u; expires at %u.",
            tid, entry->xid.objSerial, entry->xid.objSlot,
            (unsigned)(now % 1000), (unsigned)(entry->expires % 1000));
      return DNX_OK;
   }

   // not queued yet - try to add the message itself
   ret = dnxQueuePut(ireg->rqueue, *ppMsg);
   if (ret != DNX_OK)
   {
      dnxLog("DNX Registrar: Unable to enqueue node request: %s.",
            dnxErrorString(ret));
      return ret;
   }

   *ppMsg = 0;    // we're keeping this message; return NULL
   dnxDebug(2,
         "dnxRegistrar[%lx]: Added req [%lu,%lu] at %u; expires at %u.",
         tid, entry->xid.objSerial, entry->xid.objSlot,
         (unsigned)(now % 1000), (unsigned)(entry->expires % 1000));

   return ret;
}
/** Find an entry in the queue and remove it.
 *
 * The queue is walked from the head, calling @p Compare on each payload
 * until it returns something other than DNX_QRES_CONTINUE. If that result
 * is DNX_QRES_FOUND, the entry is unlinked, the queue's head, tail, current
 * pointer and size are fixed up, and the entry's payload is returned
 * through @p ppPayload. The entry wrapper (not the payload) is freed.
 *
 * @param[in] queue - the queue to search.
 * @param[in,out] ppPayload - on entry, the payload to compare against; on a
 *    DNX_QRES_FOUND result, the payload of the removed entry.
 * @param[in] Compare - the comparison callback.
 *
 * @return The last value returned by @p Compare, or DNX_QRES_CONTINUE if
 *    the queue is empty.
 */
DnxQueueResult dnxQueueRemove(DnxQueue * queue, void ** ppPayload,
      DnxQueueResult (*Compare)(void * pLeft, void * pRight))
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   DnxQueueResult result = DNX_QRES_CONTINUE;
   iDnxQueueEntry * node;
   iDnxQueueEntry * before = 0;
   int visited = 0;

   assert(queue && ppPayload && Compare);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   // walk the list until the callback tells us to stop
   node = iqueue->head;
   while (node)
   {
      visited++;
      result = Compare(*ppPayload, node->pPayload);
      if (result != DNX_QRES_CONTINUE)
         break;
      before = node;
      node = node->next;
   }

   if (result == DNX_QRES_FOUND)
   {
      *ppPayload = node->pPayload;

      // unlink: bridge the predecessor (or the head) over this entry
      if (before)
         before->next = node->next;
      else
         iqueue->head = node->next;

      // the tail moves back if we removed the last entry
      if (node->next == 0)
         iqueue->tail = before;

      // the circular pointer moves on, wrapping to the head at the end
      if (iqueue->current == node)
      {
         iqueue->current = node->next;
         if (iqueue->current == 0)
            iqueue->current = iqueue->head;
      }

      iqueue->size--;
   }

   dnxDebug(8, "dnxQueueRemove: (%i) elements searched in (%i) sized queue",
         visited, iqueue->size);

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   // the wrapper is ours to free; the payload now belongs to the caller
   if (result == DNX_QRES_FOUND)
      xfree(node);

   return result;
}
/** Mark a received job as complete.
 *
 * The slot index in @p pXid selects the job list entry. The entry is moved
 * to DNX_JOB_COMPLETE only if it holds the same XID and is currently in
 * state DNX_JOB_RECEIVED.
 *
 * @param[in] pJobList - the job list holding the job.
 * @param[in] pXid - the XID of the job to mark.
 *
 * @return DNX_OK if the job was marked complete; DNX_ERR_NOTFOUND if the
 *    slot index is out of range, the XID does not match, or the job is not
 *    in the received state.
 */
int dnxJobListMarkComplete(DnxJobList * pJobList, DnxXID * pXid)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   assert(pJobList && pXid);   // parameter validation

   int ret = DNX_ERR_NOTFOUND;

   dnxDebug(4, "dnxJobListMarkComplete: Job [%lu:%lu]",
         pXid->objSerial, pXid->objSlot);

   unsigned long current = pXid->objSlot;

   // the slot comes from a message, so it must be range-checked before it
   // is used as an index (dnxJobListCollect does the same)
   if (current >= ilist->size)
      return ret;

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   if (dnxEqualXIDs(pXid, &ilist->list[current].xid)
         && ilist->list[current].state == DNX_JOB_RECEIVED)
   {
      ilist->list[current].state = DNX_JOB_COMPLETE;
      ret = DNX_OK;
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Record that a client has acknowledged receipt of a job.
 *
 * The slot index in the result's XID selects the job list entry. The entry
 * is moved to DNX_JOB_INPROGRESS and an "ACK" audit record is written only
 * if it holds the same XID and is currently pending or unbound.
 *
 * @param[in] pJobList - the job list holding the job.
 * @param[in] pRes - the acknowledgement message; its xid identifies the job
 *    and its timestamp is used only for the latency figure in the debug log.
 *
 * @return DNX_OK if the job was marked in progress; DNX_ERR_NOTFOUND if the
 *    slot index is out of range, the XID does not match, or the job is in
 *    any other state.
 */
int dnxJobListMarkAck(DnxJobList * pJobList, DnxResult * pRes)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   assert(pJobList && pRes);   // parameter validation

   time_t now = time(0);
   int ret = DNX_ERR_NOTFOUND;

   // the latency is a time difference; cast it to match the %lu conversion
   dnxDebug(4, "dnxJobListMarkAck: Job [%lu:%lu] serial (%lu) slot (%lu) latency (%lu) sec.",
         pRes->xid.objSerial, pRes->xid.objSlot,
         pRes->xid.objSerial, pRes->xid.objSlot,
         (unsigned long)(now - pRes->timestamp));

   unsigned long current = pRes->xid.objSlot;

   // the slot comes from a message, so it must be range-checked before it
   // is used as an index (dnxJobListCollect does the same)
   if (current >= ilist->size)
      return ret;

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   if (dnxEqualXIDs(&(pRes->xid), &ilist->list[current].xid)
         && (ilist->list[current].state == DNX_JOB_PENDING
            || ilist->list[current].state == DNX_JOB_UNBOUND))
   {
      ilist->list[current].state = DNX_JOB_INPROGRESS;
      dnxAuditJob(&(ilist->list[current]), "ACK");
      ret = DNX_OK;
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Timed Event Handler. * * Nagios calls this routine once each time a timed event needs to execute. * The particular event we care about is the REAPER event. * * @param[in] event_type - the event type for which we're being called. * @param[in] data - an opaque pointer to nagios event-specific data. * * @return Zero, but the value is ignored by Nagios in this event. */ static int ehTimedEvent(int event_type, void * data) { nebstruct_timed_event_data * ted = (nebstruct_timed_event_data *)data; timed_event * event = (timed_event*)data; int ret; // sanity checks if (event_type != NEBCALLBACK_TIMED_EVENT_DATA || ted == 0) return ERROR; // we only care about REAPER events if (ted->event_type != EVENT_CHECK_REAPER) return OK; dnxDebug(3, "Reaper handler called."); dnxMoveResultsToNagios(); return OK; }
/** Remove the head entry from the queue and return its payload.
 *
 * @param[in] queue - the queue to take from.
 * @param[out] ppPayload - receives the payload of the removed entry; left
 *    untouched when the queue is empty.
 *
 * @return DNX_OK if an entry was removed, or DNX_ERR_NOTFOUND if the queue
 *    was empty.
 */
int dnxQueueGet(DnxQueue * queue, void ** ppPayload)
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * first = 0;

   assert(queue && ppPayload);

   dnxDebug(8, "dnxQueueGet: iQueue size(%i)", iqueue->size);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   if (iqueue->size > 0)
   {
      // detach the head entry
      first = iqueue->head;
      iqueue->head = first->next;

      // keep the circular pointer off the entry we are removing
      if (iqueue->current == first)
         iqueue->current = first->next;

      // an empty queue has no tail either
      if (iqueue->head == 0)
         iqueue->tail = 0;

      iqueue->size--;
   }

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   if (!first)
      return DNX_ERR_NOTFOUND;

   // hand the payload to the caller and drop the wrapper
   *ppPayload = first->pPayload;
   xfree(first);

   return DNX_OK;
}
/** Process Data Event Handler. * * @param[in] event_type - the event regarding which we were called by Nagios. * @param[in] data - an opaque pointer to an event-specific data structure. * * @return Zero if all is okay, but we want nagios to handle this event; * non-zero if there's a problem of some sort. */ static int ehProcessData(int event_type, void * data) { nebstruct_process_data *procdata = (nebstruct_process_data *)data; // validate our event type - ignore wrong event type assert(event_type == NEBCALLBACK_PROCESS_DATA); if (event_type != NEBCALLBACK_PROCESS_DATA) return OK; // sanity-check our data structure - should never happen assert(procdata); if (!procdata) { dnxLog("Startup handler received NULL process data structure."); return ERROR; } // look for process event loop start event if (procdata->type == NEBTYPE_PROCESS_EVENTLOOPSTART) { dnxDebug(2, "Startup handler received PROCESS_EVENTLOOPSTART event."); // execute sync script, if defined if (cfg.syncScript) { dnxLog("Startup handler executing plugin sync script: %s.", cfg.syncScript); // NB: This halts Nagios execution until the script exits... launchScript(cfg.syncScript); } // if server init fails, do server shutdown if (dnxServerInit() != 0) dnxServerDeInit(); } return OK; }
/** Service Check Event Handler. * * @param[in] event_type - the event type for which we're being called. * @param[in] data - an opaque pointer to nagios event-specific data. * * @return Zero if we want Nagios to handle the event; * NEBERROR_CALLBACKOVERRIDE indicates that we want to handle the event * ourselves; any other non-zero value represents an error. */ static int ehSvcCheck(int event_type, void * data) { static unsigned long serial = 0; // the number of service checks processed nebstruct_service_check_data * svcdata = (nebstruct_service_check_data *)data; DnxNodeRequest * pNode; DnxJobData * jdp; int ret; if (event_type != NEBCALLBACK_SERVICE_CHECK_DATA) return OK; if (svcdata == 0) { dnxLog("Service handler received NULL service data structure."); return ERROR; // shouldn't happen - internal Nagios error } if (svcdata->type != NEBTYPE_SERVICECHECK_INITIATE) return OK; // ignore non-initiate service checks // check for local execution pattern on command line if (cfg.localCheckPattern && regexec(®Ex, svcdata->command_line, 0, 0, 0) == 0) { dnxDebug(1, "Service will execute locally: %s.", svcdata->command_line); return OK; // tell nagios execute locally } dnxDebug(3, "ehSvcCheck: Received Job [%lu] at %lu (%lu).", serial, (unsigned long)time(0), (unsigned long)svcdata->start_time.tv_sec); if ((ret = dnxGetNodeRequest(registrar, &pNode)) != DNX_OK) { dnxDebug(3, "ehSvcCheck: No worker nodes requests available: %s.",dnxErrorString(ret)); return OK; // tell nagios execute locally } // allocate and populate a new job payload object if ((jdp = (DnxJobData *)xmalloc(sizeof *jdp)) == 0) { dnxDebug(1, "ehSvcCheck: Out of memory!"); return OK; } memset(jdp, 0, sizeof *jdp); jdp->svc = (service *)svcdata->OBJECT_FIELD_NAME; assert(jdp->svc); #if CURRENT_NEB_API_VERSION == 3 { // a nagios 3.x global variable extern check_result check_result_info; /** @todo patch nagios to pass these values to the event handler. 
*/ jdp->chkopts = check_result_info.check_options; jdp->schedule = check_result_info.scheduled_check; jdp->reschedule = check_result_info.reschedule_check; } #endif if ((ret = dnxPostNewJob(joblist, serial, jdp, svcdata, pNode)) != DNX_OK) { dnxLog("Unable to post job [%lu]: %s.", serial, dnxErrorString(ret)); xfree(jdp); return OK; // tell nagios execute locally } serial++; // bump serial number return NEBERROR_CALLBACKOVERRIDE; // tell nagios we want it }
/** Create a work load manager and start its initial worker thread pool.
 *
 * Copies the configuration (duplicating the dispatcher and collector
 * strings), allocates a zeroed pool array of cfg->poolMax entries, caches
 * the address of the first up, running, non-loopback AF_INET interface,
 * determines the host name, and then grows the thread pool. Every failure
 * path releases all memory allocated here.
 *
 * @param[in] cfg - the configuration to use; pool sizes must satisfy
 *    0 < poolMin <= poolInitial <= poolMax.
 * @param[out] pwlm - receives the new work load manager on success; left
 *    untouched on failure.
 *
 * @return DNX_OK on success; DNX_ERR_MEMORY if an allocation fails; or the
 *    error from growing the thread pool if no worker could be created.
 */
int dnxWlmCreate(DnxWlmCfgData * cfg, DnxWlm ** pwlm)
{
   iDnxWlm * iwlm;
   struct ifaddrs * ifa = NULL;

   assert(cfg && pwlm);
   assert(cfg->poolMin > 0);
   assert(cfg->poolMax >= cfg->poolMin);
   assert(cfg->poolInitial >= cfg->poolMin);
   assert(cfg->poolInitial <= cfg->poolMax);

   // allocate and configure the master thread pool data structure
   if ((iwlm = (iDnxWlm *)xmalloc(sizeof *iwlm)) == 0)
      return DNX_ERR_MEMORY;

   memset(iwlm, 0, sizeof *iwlm);
   iwlm->cfg = *cfg;
   iwlm->cfg.dispatcher = xstrdup(iwlm->cfg.dispatcher);
   iwlm->cfg.collector = xstrdup(iwlm->cfg.collector);
   iwlm->poolsz = iwlm->cfg.poolMax;
   iwlm->pool = (DnxWorkerStatus **)xmalloc(iwlm->poolsz * sizeof *iwlm->pool);
   iwlm->minexectm = (unsigned)(-1);   // the largest possible value

   // if any of the above failed, we really can't continue; check before
   // the pool is touched and release whatever did get allocated
   if (!iwlm->cfg.dispatcher || !iwlm->cfg.collector || !iwlm->pool)
   {
      xfree(iwlm->pool);
      xfree(iwlm->cfg.dispatcher);
      xfree(iwlm->cfg.collector);
      xfree(iwlm);
      return DNX_ERR_MEMORY;
   }
   memset(iwlm->pool, 0, iwlm->poolsz * sizeof *iwlm->pool);

   // cache our (primary?) ip address in binary and string format
   if (getifaddrs(&ifa) == 0)
   {
      u_int setflags = IFF_UP | IFF_RUNNING;
      u_int clrflags = IFF_LOOPBACK;
      struct ifaddrs * ifcur = ifa;

      // locate the first proper AF_INET address in our interface list
      while (ifcur && (ifcur->ifa_addr == 0
            || ifcur->ifa_addr->sa_family != AF_INET
            || (ifcur->ifa_flags & setflags) != setflags
            || (ifcur->ifa_flags & clrflags) != 0))
         ifcur = ifcur->ifa_next;

      if (ifcur)
      {
         // cache binary and presentation (string) versions of the ip address
         iwlm->myipaddr = (unsigned long)
               ((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr.s_addr;
         inet_ntop(ifcur->ifa_addr->sa_family,
               &((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr,
               iwlm->myipaddrstr, sizeof iwlm->myipaddrstr);
      }
      freeifaddrs(ifa);
   }

   char unset[] = "NULL";
   if (!strnlen(iwlm->myhostname, 1)) // See if the hostname has been set
   {
      dnxDebug(3, "dnxWlmCreate: Hostname not set in parent thread.");

      char machineName[MAX_HOSTNAME];
      machineName[0] = 0;     // so the failure log below never prints garbage

      if (strcmp(cfg->hostname, unset) == 0)
      {
         dnxDebug(3, "dnxWlmCreate: Hostname undefined in config.");

         // Get our hostname
         if (gethostname(machineName, MAX_HOSTNAME) == 0)
         {
            // gethostname need not terminate a truncated name
            machineName[MAX_HOSTNAME - 1] = 0;
            dnxDebug(3, "dnxWlmCreate: Hostname is [%s].", machineName);

            // cache hostname
            snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s",
                  machineName);
         }
         else
         {
            dnxLog("dnxWlmCreate: Unable to obtain Hostname [%s?],"
                  "please set hostname in config.", machineName);
            snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s",
                  "localhost");
         }
      }
      else
      {
         dnxDebug(3, "dnxWlmCreate: Using hostname in config [%s].",
               cfg->hostname);
         snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s",
               cfg->hostname);
      }
   }
   else
   {
      // NOTE(review): iwlm was zeroed above and myhostname has not been
      // written since, so this branch looks unreachable - confirm.
      dnxDebug(3, "dnxWlmCreate: Using cached hostname [%s].",
            iwlm->myhostname);
      strcpy(iwlm->cfg.hostname, iwlm->myhostname);
   }

   // create initial worker thread pool
   DNX_PT_MUTEX_INIT(&iwlm->mutex);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   {
      int ret;
      if ((ret = growThreadPool(iwlm)) != DNX_OK)
      {
         if (iwlm->threads)
            dnxLog("WLM: Error creating SOME worker threads: %s; "
                  "continuing with smaller initial pool.",
                  dnxErrorString(ret));
         else
         {
            dnxLog("WLM: Unable to create ANY worker threads: %s; "
                  "terminating.", dnxErrorString(ret));
            DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
            DNX_PT_MUTEX_DESTROY(&iwlm->mutex);

            // no worker exists, so nothing else refers to these
            xfree(iwlm->pool);
            xfree(iwlm->cfg.dispatcher);
            xfree(iwlm->cfg.collector);
            xfree(iwlm);
            return ret;
         }
      }
   }
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   dnxLog("WLM: Started worker thread pool.");

   *pwlm = (DnxWlm *)iwlm;

   return DNX_OK;
}
/** The main timer thread procedure entry point. * * @param[in] data - an opaque pointer to thread data for the timer thread. * This is actually the dnx server global data object. * * @return Always returns 0. */ static void * dnxTimer(void * data) { iDnxTimer * itimer = (iDnxTimer *)data; DnxNewJob ExpiredList[MAX_EXPIRED]; int i, totalExpired; int ret = 0; assert(data); pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxTimerCleanup, data); dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self()); while (1) { pthread_testcancel(); dnxCancelableSleep(itimer->sleepms); // search for expired jobs in the pending queue totalExpired = MAX_EXPIRED; if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, &totalExpired)) == DNX_OK && totalExpired > 0) { for (i = 0; i < totalExpired; i++) { char msg[256]; char addrstr[DNX_MAX_ADDRSTR]; DnxNewJob * job = &ExpiredList[i]; dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.", pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd); dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT); dnxAuditJob(job, "EXPIRE"); // if (job->ack) snprintf(msg, sizeof msg, "(DNX: Service Check [%lu,%lu] Timed Out - " "Node: %s - Failed to return job response in time allowed)", job->xid.objSerial, job->xid.objSlot, addrstr); // else // snprintf(msg, sizeof msg, // "(DNX: Service Check [%lu,%lu] Timed Out - " // "Node: %s - Failed to acknowledge job receipt)", // job->xid.objSerial, job->xid.objSlot, addrstr); dnxDebug(2, msg); // report the expired job to Nagios ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, time(0) - job->start_time, 1, 0, msg); dnxJobCleanup(job); } } if (totalExpired > 0 || ret != DNX_OK) dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d Retcode=%d: %s.", pthread_self(), totalExpired, ret, dnxErrorString(ret)); } dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret)); 
pthread_cleanup_pop(1); return 0; }
/** The agent thread control procedure. * * @param[in] data - thread data; not used. * * @return Always returns a null pointer (zero). */ static void * dnxAgentServer(void * data) { int ret; DnxMgmtRequest Msg; Msg.action = 0; dnxLog("DNX Server Agent awaiting commands..."); while (!s_shutdown) { memset(Msg.address, '\0', DNX_MAX_ADDRESS); // wait 2 second for a request; process the request, if valid if ((ret = dnxWaitForMgmtRequest(s_agent, &Msg, Msg.address, 2)) == DNX_OK) { DnxMgmtReply Rsp; char addrstr[DNX_MAX_ADDRSTR]; dnxDebug(2, "Received MgmtRequest from %s.", dnxNtop(Msg.address, addrstr, sizeof addrstr)); // setup some default response values Rsp.xid = Msg.xid; Rsp.status = DNX_REQ_ACK; Rsp.reply = 0; // perform the requested action if (!strcmp(Msg.action, "RESETSTATS")) { dnxStatsResetServerStats(); dnxStatsForEachNode(dnxResetNodeStats, 0); Rsp.reply = xstrdup("OK"); } else if (!strncmp(Msg.action, "GETSTATS ", 9)) { if ((Rsp.reply = buildMgmtStatsReply(Msg.action + 9)) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strncmp(Msg.action, "GETNODESTATS ", 13)) { if ((Rsp.reply = buildMgmtNodeStatsReply(Msg.action + 13)) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETNODELIST")) { if ((Rsp.reply = buildMgmtNodeListReply()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETCONFIG")) { if ((Rsp.reply = buildMgmtCfgReply()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "GETVERSION")) { if ((Rsp.reply = versionText()) == 0) Rsp.status = DNX_REQ_NAK; } else if (!strcmp(Msg.action, "HELP")) { if ((Rsp.reply = buildHelpReply()) == 0) Rsp.status = DNX_REQ_NAK; } // send response, log response failures if ((ret = dnxSendMgmtReply(s_agent, &Rsp, Msg.address)) != 0) dnxLog("Agent response failure: %s.", dnxErrorString(ret)); // free request and reply message buffers xfree(Rsp.reply); xfree(Msg.action); } else if (ret != DNX_ERR_TIMEOUT) dnxLog("Agent channel failure: %s.", dnxErrorString(ret)); } dnxLog("Agent 
terminating..."); return 0; }
/** The main thread routine for a worker thread. * * @param[in] data - an opaque pointer to a DnxWorkerStatus structure for this * thread. * * @return Always returns 0. */ static void * dnxWorker(void * data) { DnxWorkerStatus * ws = (DnxWorkerStatus *)data; pthread_t tid = pthread_self(); int retries = 0; iDnxWlm * iwlm; assert(data); iwlm = ws->iwlm; pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxWorkerCleanup, data); time(&ws->tstart); // set thread start time (for stats) while (!iwlm->terminate) { DnxNodeRequest msg; DnxJob job; int ret; // setup job request message - use thread id and node address in XID dnxMakeXID(&msg.xid, DNX_OBJ_WORKER, tid, iwlm->myipaddr); msg.reqType = DNX_REQ_REGISTER; msg.jobCap = 1; msg.ttl = iwlm->cfg.reqTimeout - iwlm->cfg.ttlBackoff; msg.hn = iwlm->myhostname; // request a job, and then wait for a job to come in... if ((ret = dnxSendNodeRequest(ws->dispatch, &msg, 0)) != DNX_OK) { dnxLog("Worker[%lx]: Error sending node request: %s.", tid, dnxErrorString(ret)); } else { DNX_PT_MUTEX_LOCK(&iwlm->mutex); iwlm->reqsent++; DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); } // wait for job, even if request was never sent if ((ret = dnxWaitForJob(ws->dispatch, &job, job.address,iwlm->cfg.reqTimeout)) != DNX_OK && ret != DNX_ERR_TIMEOUT) { dnxLog("Worker[%lx]: Error receiving job: %s.", tid, dnxErrorString(ret)); } // Allow thread to be canceled pthread_testcancel(); DNX_PT_MUTEX_LOCK(&iwlm->mutex); cleanThreadPool(iwlm); // ensure counts are accurate before using them if (ret != DNX_OK) { // if above pool minimum and exceeded max retries... 
if (iwlm->threads > iwlm->cfg.poolMin && ++retries > iwlm->cfg.maxRetries) { dnxLog("Worker[%lx]: Exiting - max retries exceeded.", tid); DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); break; } } else { iwlm->jobsrcvd++; iwlm->active++; // dnxSendJobAck(ws->collect, &job, &job.address); // dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] (T/O %d): %s.", // tid, job.xid.objSerial, job.xid.objSlot, job.timeout, job.cmd); // DnxAck ack; // ack.xid = job.xid; // ack.timestamp = job.timestamp; dnxSendJobAck(ws->collect, &job, 0); dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] to channel (%lx) (T/S %lu).", tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timestamp); // check pool size before we get too busy - // if we're not shutting down and we haven't reached the configured // maximum and this is the last thread out, then increase the pool if (!iwlm->terminate && iwlm->threads < iwlm->cfg.poolMax && iwlm->active == iwlm->threads) // Maybe more aggressive here growThreadPool(iwlm); } DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); // if we have a job, execute it and reset retry count if (ret == DNX_OK) { char resData[MAX_RESULT_DATA + 1]; DnxResult result; time_t jobstart; dnxDebug(3, "Worker[%lx]: Received job [%lu:%lu] from (%lx) (T/O %d): %s.", tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timeout, job.cmd); // prepare result structure result.xid = job.xid; // result xid must match job xid result.state = DNX_JOB_COMPLETE; // complete or expired result.delta = 0; result.resCode = DNX_PLUGIN_RESULT_OK; result.resData = 0; /** @todo Allocate result data buffer based on configured buffer size. 
*/ // we want to be able to cancel threads while they're out on a task // in order to obtain timely shutdown for long jobs - move into // async cancel mode, but only for the duration of the check pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0); *resData = 0; jobstart = time(0); dnxPluginExecute(job.cmd, &result.resCode, resData, sizeof resData - 1, job.timeout,iwlm->cfg.showNodeAddr? iwlm->myipaddrstr: 0); result.delta = time(0) - jobstart; pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); // store allocated copy of the result string if (*resData) result.resData = xstrdup(resData); dnxDebug(3, "Worker[%lx]: Job [%lu:%lu] completed in %lu seconds: %d, %s.", tid, job.xid.objSerial, job.xid.objSlot, result.delta, result.resCode, result.resData); // if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) { // dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.", // tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret)); // } // Wait while we wait for an Ack to our Results DnxJob ack; int trys = 1; while(trys < 4) { if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) { dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret)); break; } // Now wait for our Ack if ((ret = dnxWaitForAck(ws->dispatch, &ack, job.address, 3)) != DNX_OK && ret != DNX_ERR_TIMEOUT) { dnxDebug(3, "Worker[%lx]: Error receiving Ack for job [%lu:%lu]: %s. Retry (%i).", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys); } else if (ret == DNX_ERR_TIMEOUT) { // we didn't get our Ack trys++; } else { // We got our Ack dnxDebug(3, "Worker[%lx]: Ack Received for job [%lu:%lu]: %s. 
After (%i) try(s).", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys); break; } } xfree(result.resData); // update all statistics DNX_PT_MUTEX_LOCK(&iwlm->mutex); { // track status if (result.resCode == DNX_PLUGIN_RESULT_OK) iwlm->jobsok++; else iwlm->jobsfail++; // track min/max/avg execution time if (result.delta > iwlm->maxexectm) iwlm->maxexectm = result.delta; if (result.delta < iwlm->minexectm) iwlm->minexectm = result.delta; iwlm->avgexectm = (iwlm->avgexectm + result.delta) / 2; // total job processing time iwlm->jobtm += (unsigned)result.delta; iwlm->active--; // reduce active count } DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); ws->serial++; // increment job serial number for next job retries = 0; } } pthread_cleanup_pop(1); return 0; }
/** Hand the dispatcher the next job in the list that needs to be sent.
 *
 * Walks the ring from the current head towards the tail while holding the
 * list mutex. Two kinds of slots cause an immediate return with a copy of the
 * slot in @p pJob:
 *   - a DNX_JOB_PENDING job whose node has never been sent (retry == 0);
 *   - a DNX_JOB_COMPLETE / DNX_JOB_RECEIVED job whose ack flag is not set.
 *
 * All other slots are skipped. When the walk reaches the tail, the call blocks
 * on the list's condition variable for up to DNX_JOBLIST_TIMEOUT seconds and,
 * if woken, restarts the walk from the head.
 *
 * @param[in]  pJobList - the job list to scan.
 * @param[out] pJob     - receives a byte copy of the selected job slot.
 *
 * @return DNX_OK when a job was copied into @p pJob, or ETIMEDOUT when the
 *    end of the queue was reached and no signal arrived within the timeout.
 */
int dnxJobListDispatch(DnxJobList * pJobList, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   int waitrc;
   struct timeval now;
   struct timespec timeout;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // start at current head
   current = ilist->head;

   dnxDebug(6, "dnxJobListDispatch: BEFORE: Head=%lu, Tail=%lu, Queue=%lu.",
         ilist->head, ilist->tail, ilist->size);

   while (1)
   {
      DnxNewJob * slot = &ilist->list[current];

      switch (slot->state)
      {
         case DNX_JOB_INPROGRESS:
            dnxDebug(8, "dnxJobListDispatch: In Progress Item in slot:(%lu) head:(%lu) tail:(%lu).",
                  current, ilist->head, ilist->tail);
            break;

         case DNX_JOB_NULL:
            dnxDebug(8, "dnxJobListDispatch: Null Item in slot:(%lu) head:(%lu) tail:(%lu).",
                  current, ilist->head, ilist->tail);
            break;

         case DNX_JOB_EXPIRED:
            dnxDebug(8, "dnxJobListDispatch: Expired Item in slot:(%lu) head:(%lu) tail:(%lu).",
                  current, ilist->head, ilist->tail);
            break;

         case DNX_JOB_UNBOUND:
            dnxDebug(8, "dnxJobListDispatch: Unbound Item in slot:(%lu) head:(%lu) tail:(%lu).",
                  current, ilist->head, ilist->tail);
            break;

         case DNX_JOB_PENDING:
            gettimeofday(&now, 0);

            // Sent recently: the retry stamp is still in the future, so skip.
            if (slot->pNode->retry > now.tv_sec)
            {
               // cast: the difference is time_t-sized but the format is %i
               dnxDebug(5, "dnxJobListDispatch: Pending job [%lu:%lu] waiting for Ack, resend in (%i) sec.",
                     slot->xid.objSerial, slot->xid.objSlot,
                     (int)(slot->pNode->retry - now.tv_sec));
               break;
            }

            // Sent before (retry != 0) and the retry stamp has passed.
            if (slot->pNode->retry)
            {
               // Make sure the dnxClient service offer is still fresh
               if (slot->pNode->expires < now.tv_sec)
               {
                  dnxDebug(4, "dnxJobListDispatch: Pending job [%lu:%lu] waiting for Ack, client node expired. Resubmitting.",
                        slot->xid.objSerial, slot->xid.objSlot);
                  slot->state = DNX_JOB_UNBOUND;

                  // Original design note: the node request is deliberately
                  // kept rather than deleted and recreated. The same client
                  // will likely service us again, or the job may still come
                  // back in the meantime, so the node (and in particular its
                  // address) is left alone so that late results do not hit a
                  // freed node. Only the flags are reloaded, so that the
                  // affinity is correct when a new node is looked up. If the
                  // original job does come back the acks may get mixed up;
                  // the receiver is expected to tolerate duplicate results.
                  slot->pNode->flags = *(dnxGetAffinity(slot->host_name));
               }
               // Either way, an already-sent job is not re-sent from here.
               break;
            }

            // This is a new job, so dispatch it
            dnxDebug(4, "dnxJobListDispatch: Dispatching new job [%lu:%lu] waiting for Ack",
                  slot->xid.objSerial, slot->xid.objSlot);

            // set our retry interval
            // This should be fairly forgiving in case we just missed the Ack
            // but the client actually got the job and is returning our results.
            slot->pNode->retry = now.tv_sec + 5;

            // make a copy for the Dispatcher to send to client
            memcpy(pJob, slot, sizeof *pJob);

            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return DNX_OK;

         case DNX_JOB_COMPLETE:
         case DNX_JOB_RECEIVED:
            // We have received the response for this job and need to send an
            // ack to the client to let it know we got it.
            if (slot->ack)
               break;   // Only send a single Ack

            // make a copy for the Dispatcher to send an Ack to the client
            memcpy(pJob, slot, sizeof *pJob);
            dnxDebug(4, "dnxJobListDispatch: Received job [%lu:%lu] sending Ack.",
                  slot->xid.objSerial, slot->xid.objSlot);

            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return DNX_OK;

         default:
            break;
      }

      if (current == ilist->tail)
      {
         // End of the queue: wait for a signal or the timeout.
         gettimeofday(&now, 0);
         timeout.tv_sec = now.tv_sec + DNX_JOBLIST_TIMEOUT;
         timeout.tv_nsec = now.tv_usec * 1000;

         // Keep the wait result in its own variable so that a non-timeout
         // error code cannot leak into the value returned by a later,
         // successful dispatch.
         waitrc = pthread_cond_timedwait(&ilist->cond, &ilist->mut, &timeout);
         if (waitrc == ETIMEDOUT)
         {
            // We waited for the time out period and no new jobs arrived.
            // So give control back to caller.
            dnxDebug(5, "dnxJobListDispatch: Reached end of dispatch queue. Thread timer returned.");
            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return waitrc;
         }

         // We were signaled that there is a new job, so lets move back to
         // the head and get it!
         current = ilist->head;
         dnxDebug(5, "dnxJobListDispatch: Reached end of dispatch queue. A new job arrived.");
      }
      else
      {
         // move to next item in queue
         current = ((current + 1) % ilist->size);
      }
   }
}
/** Expire overdue jobs and do housekeeping on the job list.
 *
 * Walks the ring from head to tail while holding the list mutex and, per
 * slot state:
 *   - DNX_JOB_UNBOUND: expires the job if its start_time is at or before
 *     (now - DNX_DISPATCH_TIMEOUT); otherwise asks the registrar for a node
 *     and, on success, marks the job DNX_JOB_PENDING and signals the list's
 *     condition variable.
 *   - DNX_JOB_PENDING / DNX_JOB_INPROGRESS: expires the job if its expires
 *     stamp is at or before now.
 *   - DNX_JOB_COMPLETE (ack already set) / DNX_JOB_EXPIRED: calls
 *     dnxJobCleanup() on the slot, then handles it like DNX_JOB_NULL.
 *   - DNX_JOB_NULL: advances the head past this slot when the slot is the
 *     head and is not also the tail.
 *   - DNX_JOB_RECEIVED: only logged; the head is not advanced.
 *
 * A job that expires is set to DNX_JOB_EXPIRED in the list and a byte copy
 * of it is appended to @p pExpiredJobs.
 *
 * @param[in]     pJobList     - the job list to scan.
 * @param[out]    pExpiredJobs - array that receives copies of expired jobs.
 * @param[in,out] totalJobs    - on entry, the maximum number of jobs to
 *    return (must be > 0); on exit, the number of jobs actually copied.
 *
 * @return Always DNX_OK.
 */
int dnxJobListExpire(DnxJobList * pJobList, DnxNewJob * pExpiredJobs, int * totalJobs)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   unsigned long dispatch_timeout;
   DnxNewJob * pJob;
   int jobCount = 0;
   int state;
   time_t now;

   assert(pJobList && pExpiredJobs && totalJobs && *totalJobs > 0);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // get the current time (after we acquire the lock! In case we had to wait)
   now = time(0);

   // unbound jobs started at or before this stamp are considered expired
   dispatch_timeout = now - DNX_DISPATCH_TIMEOUT;

   // walk the entire job list, starting at the head
   current = ilist->head;

   dnxDebug(6, "dnxJobListExpire: searching for (%i) expired objects. Head(%lu) Tail(%lu)",
         *totalJobs, ilist->head, ilist->tail);

   while (jobCount < *totalJobs)
   {
      pJob = &ilist->list[current];
      state = pJob->state;

      switch (state)
      {
         case DNX_JOB_UNBOUND:
            if (pJob->start_time <= dispatch_timeout)
            {
               dnxDebug(2, "dnxJobListExpire: Expiring Unbound %s Job [%lu:%lu] count(%lu) type(%i) Start Time: (%lu) Now: (%lu) Expire: (%lu)",
                     (pJob->object_check_type ? "Host" : "Service"),
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state,
                     (unsigned long)pJob->start_time, (unsigned long)now, dispatch_timeout);

               // Put the old job in a purgable state
               pJob->state = DNX_JOB_EXPIRED;

               // Add a copy to the expired job list
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            }
            else if (dnxGetNodeRequest(dnxGetRegistrar(), &(pJob->pNode)) == DNX_OK)
            {
               // The job has not expired and a dnxClient request was found
               // for it, so release it to the dispatcher.
               dnxDebug(2, "dnxJobListExpire: Dequeueing DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%li) seconds. Dispatch TO:(%lu) Now: (%lu) count(%lu) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot,
                     (long)(pJob->start_time - dispatch_timeout), dispatch_timeout,
                     (unsigned long)now, current, state);

               pJob->state = DNX_JOB_PENDING;
               pthread_cond_signal(&ilist->cond); // signal that a new job is available
            }
            else
            {
               dnxDebug(6, "dnxJobListExpire: Unable to dequeue DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%li) seconds. Dispatch TO:(%lu) Now: (%lu) count(%lu) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot,
                     (long)(pJob->start_time - dispatch_timeout), dispatch_timeout,
                     (unsigned long)now, current, state);
            }
            break;

         case DNX_JOB_PENDING:
         case DNX_JOB_INPROGRESS:
            // check the job's expiration stamp
            if (pJob->expires <= now)
            {
               // This is an expired job, it was sent out, but never came back
               dnxDebug(1, "dnxJobListExpire: Expiring Job [%lu:%lu] count(%lu) type(%i) Exp: (%lu) Now: (%lu)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state,
                     (unsigned long)pJob->expires, (unsigned long)now);

               // Put the old job in a purgable state
               pJob->state = DNX_JOB_EXPIRED;

               // Add a copy to the expired job list
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            }
            break;

         case DNX_JOB_COMPLETE:
            // If the Ack hasn't been sent out yet, give it time to complete
            if (!pJob->ack)
            {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. count(%lu) type(%i)",
                     current, state);
               break;
            }
            /* fallthrough: acked and complete, purge it like an expired job */

         case DNX_JOB_EXPIRED:
            dnxJobCleanup(pJob);
            dnxDebug(3, "dnxJobListExpire: Nullified Job. count(%lu) type(%i)",
                  current, state);
            /* fallthrough: the slot is now treated like a null slot */

         case DNX_JOB_NULL:
            if (current == ilist->head && current != ilist->tail)
            {
               // we have an old item at the head of the list, so we need to
               // increment the head. It should never be larger than the tail.
               ilist->head = ((current + 1) % ilist->size);
               dnxDebug(2, "dnxJobListExpire: Moving head to (%lu). count(%lu) type(%i)",
                     ilist->head, current, (int)pJob->state);
            }
            else
            {
               dnxDebug(5, "dnxJobListExpire: Null Job. count(%lu) type(%i)",
                     current, (int)pJob->state);
            }
            break;

         case DNX_JOB_RECEIVED:
            // The format names the job id, so the id must be passed as well;
            // previously only count and type were supplied for four
            // conversions.
            if (!pJob->ack)
            {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. job [%lu:%lu] count(%lu) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            }
            else
            {
               dnxDebug(2, "dnxJobListExpire: Ack sent. job [%lu:%lu] count(%lu) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            }
            // The Collector thread will set this to DNX_JOB_COMPLETE once it has
            // replied to Nagios, but we don't advance the head until that happens
            break;

         default:
            break;
      }

      // bail-out if this was the job list tail
      if (current == ilist->tail)
         break;

      // increment the job list index
      current = ((current + 1) % ilist->size);
   }

   // update the total jobs in the expired job list
   *totalJobs = jobCount;

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return DNX_OK;
}