/** Shut down a WLM object and release everything it owns.
 *
 * Raises the terminate flag, then repeatedly calls cleanThreadPool (sleeping
 * between passes with the mutex released) until no threads remain or the
 * configured shutdown grace period has elapsed. Workers still marked as
 * running are then canceled, a further dnxCancelableSleep(1000) is taken,
 * the pool is cleaned one last time, and the pool array, mutex, dispatcher
 * and collector strings and the object itself are freed.
 *
 * @param[in] wlm - the WLM object to be destroyed; must not be null.
 */
void dnxWlmDestroy(DnxWlm * wlm)
{
   iDnxWlm * iwlm = (iDnxWlm *)wlm;
   time_t deadline;
   unsigned idx;

   assert(wlm);

   dnxLog("WLM: Beginning termination sequence...");

   // ask workers to stop and work out when the grace period ends
   iwlm->terminate = 1;
   deadline = iwlm->cfg.shutdownGrace + time(0);

   DNX_PT_MUTEX_LOCK(&iwlm->mutex);

   // wait (lock released while sleeping) until the pool drains or we time out
   while (iwlm->threads > 0 && time(0) < deadline)
   {
      cleanThreadPool(iwlm);

      DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
      dnxCancelableSleep(100);
      DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   }

   // report any stragglers left after the grace period
   if (iwlm->threads)
   {
      dnxDebug(1, "WLM: Termination - %d workers remaining after grace period.",
            iwlm->threads);
   }

   // cancel every worker that is still marked as running
   for (idx = 0; idx < iwlm->threads; idx++)
   {
      DnxWorkerStatus * worker = iwlm->pool[idx];

      if (worker->state != DNX_THREAD_RUNNING)
         continue;

      dnxDebug(1, "WLMDestroy: Cancelling worker[%lx].", worker->tid);
      pthread_cancel(worker->tid);
   }

   // let the canceled workers wind down without holding the lock
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
   dnxCancelableSleep(1000);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);

   // final sweep - nothing should be left after this
   cleanThreadPool(iwlm);
   assert(iwlm->threads == 0);
   xfree(iwlm->pool);

   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
   DNX_PT_MUTEX_DESTROY(&iwlm->mutex);

   xfree(iwlm->cfg.dispatcher);
   xfree(iwlm->cfg.collector);
   xfree(iwlm);

   dnxLog("WLM: Termination sequence complete.");
}
/** Apply a new configuration to a running WLM object.
 *
 * Copies the tunable settings from @p cfg into the object (the dispatcher
 * and collector URLs are deliberately left untouched), waits until the
 * thread count no longer exceeds the new pool maximum, and then resizes the
 * pool array to that maximum.
 *
 * @param[in] wlm - the WLM object to reconfigure.
 * @param[in] cfg - the new configuration; pool limits must satisfy
 *    0 < poolMin <= poolInitial <= poolMax.
 *
 * @return Zero on success, or DNX_ERR_MEMORY if the pool array could not be
 *    resized (the existing pool array and poolsz are left as they were).
 */
int dnxWlmReconfigure(DnxWlm * wlm, DnxWlmCfgData * cfg)
{
   iDnxWlm * iwlm = (iDnxWlm *)wlm;
   DnxWorkerStatus ** resized;
   int status = 0;

   assert(wlm && cfg);
   assert(cfg->poolMin > 0);
   assert(cfg->poolMax >= cfg->poolMin);
   assert(cfg->poolInitial >= cfg->poolMin);
   assert(cfg->poolInitial <= cfg->poolMax);

   DNX_PT_MUTEX_LOCK(&iwlm->mutex);

   // log what is about to change before overwriting the old values
   logConfigChanges(&iwlm->cfg, cfg);

   // request/retry behaviour
   iwlm->cfg.reqTimeout    = cfg->reqTimeout;
   iwlm->cfg.ttlBackoff    = cfg->ttlBackoff;
   iwlm->cfg.maxRetries    = cfg->maxRetries;

   // pool sizing
   iwlm->cfg.poolMin       = cfg->poolMin;
   iwlm->cfg.poolInitial   = cfg->poolInitial;
   iwlm->cfg.poolMax       = cfg->poolMax;
   iwlm->cfg.poolGrow      = cfg->poolGrow;

   // miscellaneous settings
   iwlm->cfg.pollInterval  = cfg->pollInterval;
   iwlm->cfg.shutdownGrace = cfg->shutdownGrace;
   iwlm->cfg.maxResults    = cfg->maxResults;
   iwlm->cfg.showNodeAddr  = cfg->showNodeAddr;
   strcpy(iwlm->cfg.hostname, cfg->hostname);

   // the array cannot shrink while more threads exist than the new maximum;
   // sleep with the lock released until enough of them have gone away
   while (iwlm->threads > iwlm->cfg.poolMax)
   {
      DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
      dnxCancelableSleep(3 * 1000);
      DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   }

   // resize the pool array; only commit the new pointer/size on success
   // NOTE(review): on failure cfg.poolMax has already been updated and may
   // exceed poolsz - confirm that growThreadPool bounds itself by poolsz.
   resized = (DnxWorkerStatus **)xrealloc(iwlm->pool,
         iwlm->cfg.poolMax * sizeof *resized);
   if (resized)
   {
      iwlm->pool = resized;
      iwlm->poolsz = iwlm->cfg.poolMax;
   }
   else
   {
      status = DNX_ERR_MEMORY;
   }

   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   return status;
}
/** Peek at the payload under the queue's circular cursor and advance it.
 *
 * The item stays on the queue, so ownership of the payload does NOT pass to
 * the caller. After the payload is read the cursor moves to the following
 * item, wrapping back to the head when the end of the list is reached.
 *
 * @param[in] queue - the queue to read from.
 * @param[out] ppPayload - receives the payload reference, or null when
 *    there is no current item.
 *
 * @return Zero on success, or DNX_ERR_NOTFOUND if no (non-null) payload
 *    was available.
 *
 * @note Not currently used (or exported by the dnxQueue.h header file).
 */
int dnxQueueNext(DnxQueue * queue, void ** ppPayload)
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * cursor;

   assert(queue && ppPayload);

   *ppPayload = 0;

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   cursor = iqueue->current;
   if (cursor != 0)
   {
      // hand back the payload under the cursor...
      *ppPayload = cursor->pPayload;

      // ...and step the cursor, wrapping to the head at the end of the list
      iqueue->current = cursor->next ? cursor->next : iqueue->head;
   }

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   if (*ppPayload)
      return DNX_OK;

   return DNX_ERR_NOTFOUND;
}
/** Clean up resources allocated by the channel map sub-system. */ void dnxChanMapRelease(void) { if (dnxInit) { int i; DNX_PT_MUTEX_LOCK(&chanMutex); for (i = 0; i < DNX_MAX_CHAN_MAP; i++) { xfree(gChannelMap[i].name); xfree(gChannelMap[i].url); } memset(gChannelMap, 0, sizeof gChannelMap); DNX_PT_MUTEX_UNLOCK(&chanMutex); DNX_PT_MUTEX_DESTROY(&chanMutex); // de-initialize transport module table i = elemcount(gTMList); while (i--) gTMList[i].txExit(); dnxInit = 0; } }
/** Copy a snapshot of the WLM statistics counters into a caller's buffer.
 *
 * All fields are copied while holding the WLM mutex so the snapshot is
 * taken in one locked section.
 *
 * @param[in] wlm - the WLM object to query.
 * @param[out] wsp - the statistics structure to be filled in.
 */
void dnxWlmGetStats(DnxWlm * wlm, DnxWlmStats * wsp)
{
   iDnxWlm * iwlm = (iDnxWlm *)wlm;

   assert(wlm && wsp);

   DNX_PT_MUTEX_LOCK(&iwlm->mutex);

   // job counters
   wsp->jobs_received      = iwlm->jobsrcvd;
   wsp->jobs_succeeded     = iwlm->jobsok;
   wsp->jobs_failed        = iwlm->jobsfail;
   wsp->requests_sent      = iwlm->reqsent;

   // thread counters
   wsp->total_threads      = iwlm->threads;
   wsp->active_threads     = iwlm->active;
   wsp->threads_created    = iwlm->tcreated;
   wsp->threads_destroyed  = iwlm->tdestroyed;
   wsp->avg_total_threads  = iwlm->avgthreads;
   wsp->avg_active_threads = iwlm->avgactive;

   // timing figures
   wsp->min_exec_time      = iwlm->minexectm;
   wsp->avg_exec_time      = iwlm->avgexectm;
   wsp->max_exec_time      = iwlm->maxexectm;
   wsp->thread_time        = iwlm->threadtm;
   wsp->job_time           = iwlm->jobtm;

   // packet counters
   wsp->packets_out        = iwlm->packets_out;
   wsp->packets_in         = iwlm->packets_in;

   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
}
/** Search a queue for an item without removing it.
 *
 * Walks the queue from the head, calling @p Compare with the caller's
 * payload on the left and each queued payload on the right, until the
 * callback returns something other than DNX_QRES_CONTINUE or the list ends.
 *
 * @param[in] queue - the queue to search.
 * @param[in,out] ppPayload - on entry, the payload to compare against; on
 *    a DNX_QRES_FOUND result, replaced with the matching queued payload
 *    (which remains on the queue).
 * @param[in] Compare - the comparison callback.
 *
 * @return The last value returned by @p Compare, or DNX_QRES_CONTINUE if
 *    the queue was empty or no item stopped the search.
 */
DnxQueueResult dnxQueueFind(DnxQueue * queue, void ** ppPayload,
      DnxQueueResult (*Compare)(void * pLeft, void * pRight))
{
   DnxQueueResult verdict = DNX_QRES_CONTINUE;
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * node;

   assert(queue && ppPayload && Compare);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   // advance while the callback keeps asking us to continue
   node = iqueue->head;
   while (node != 0
         && (verdict = Compare(*ppPayload, node->pPayload)) == DNX_QRES_CONTINUE)
      node = node->next;

   // a non-null node here means the callback stopped the walk at that item
   if (node != 0 && verdict == DNX_QRES_FOUND)
      *ppPayload = node->pPayload;

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   return verdict;
}
/** Destroy a queue together with any items still on it.
 *
 * Every remaining entry is unlinked and freed. If the queue has a payload
 * destructor it is called on each payload; if it has none, the payloads are
 * left alone (the same rule dnxQueuePut applies when it evicts an item).
 * Finally the mutex, condition variable and queue object are released.
 *
 * @param[in] queue - the queue to be destroyed; must not be null and must
 *    not be used again after this call.
 */
void dnxQueueDestroy(DnxQueue * queue)
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * item;

   assert(queue);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   // first free any requests that might be on the queue
   item = iqueue->head;
   while (item != 0)
   {
      iDnxQueueEntry * next = item->next;

      // the payload destructor is optional - only call it when supplied
      if (iqueue->freepayload)
         iqueue->freepayload(item->pPayload);

      xfree(item);
      item = next;
   }

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   DNX_PT_MUTEX_DESTROY(&iqueue->mutex);
   pthread_cond_destroy(&iqueue->cv);

   xfree(iqueue);
}
/** Move the DNX N3 results queue to Nagios. * * This function should only be called on the Nagios timed event handler * thread so there are no race conditions between Nagios's processing of it's * results queue and DNX's addition of data to that queue. */ static void dnxMoveResultsToNagios(void) { check_result * local; // safely save off currently local list DNX_PT_MUTEX_LOCK(&dnxResultListMutex); local = dnxResultList; dnxResultList = 0; DNX_PT_MUTEX_UNLOCK(&dnxResultListMutex); // merge local into check_result_list, store in check_result_list check_result_list = dnxMergeLists(local, check_result_list); }
/** Add a job to the job list ring.
 *
 * If the slot at the current tail is in use, the next slot is chosen; when
 * that next slot is the head the ring is full and the job is rejected. One
 * element is always kept empty so a full ring can be told from an empty one.
 *
 * On success the chosen slot index is written into the job's XID, the job
 * state is set to DNX_JOB_UNBOUND (its node request has slot -1) or
 * DNX_JOB_PENDING, the job is copied into the list, and - for pending jobs -
 * the list condition variable is signaled.
 *
 * @param[in] pJobList - the job list to add to.
 * @param[in,out] pJob - the job to add; its xid.objSlot and state fields are
 *    updated before it is copied into the list.
 *
 * @return Zero on success, or DNX_ERR_CAPACITY if no slot is available.
 */
int dnxJobListAdd(DnxJobList * pJobList, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long slot;
   int full = 0;
   int ret = DNX_OK;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // pick the target slot: the tail itself when free, else the one after it
   slot = ilist->tail;
   if (ilist->list[slot].state)
   {
      slot = (slot + 1) % ilist->size;
      full = (slot == ilist->head);
   }

   if (full)
   {
      dnxLog("dnxJobListAdd: Out of job slots (max=%lu): %s.",
            ilist->size, pJob->cmd);
      dnxDebug(1, "dnxJobListAdd: Out of job slots (max=%lu): %s.",
            ilist->size, pJob->cmd);
      ret = DNX_ERR_CAPACITY;
   }
   else
   {
      // record the slot in the XID so a returned result can index the list
      // directly through its XID.objSlot field
      pJob->xid.objSlot = slot;

      // a node slot of -1 means no client request was available; the job is
      // queued anyway and left for the timer thread to bind later
      pJob->state = (pJob->pNode->xid.objSlot == -1)
            ? DNX_JOB_UNBOUND : DNX_JOB_PENDING;

      dnxAuditJob(pJob, "ASSIGN");

      // store the job and publish the new tail
      memcpy(&ilist->list[slot], pJob, sizeof *pJob);
      ilist->tail = slot;

      dnxDebug(1, "dnxJobListAdd: Job [%lu:%lu]: Head=%lu, Tail=%lu.",
            pJob->xid.objSerial, pJob->xid.objSlot, ilist->head, ilist->tail);

      // wake a waiter only when the job is ready to be dispatched
      if (pJob->state == DNX_JOB_PENDING)
      {
         pthread_cond_signal(&ilist->cond);
      }
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Report how many items a queue currently holds.
 *
 * The size is read while holding the queue mutex.
 *
 * @param[in] queue - the queue to be queried for item count.
 *
 * @return The count of items in the queue, converted to int.
 *
 * @note Not currently used (or exported by the dnxQueue.h header file).
 */
int dnxQueueSize(DnxQueue * queue)
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   int nitems;

   assert(queue);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);
   nitems = (int)iqueue->size;
   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   return nitems;
}
/** Search a queue for an item and remove it when found.
 *
 * Walks the queue from the head, calling @p Compare with the caller's
 * payload on the left and each queued payload on the right, until the
 * callback returns something other than DNX_QRES_CONTINUE. On
 * DNX_QRES_FOUND the matching entry is unlinked (head, tail and the
 * circular cursor are fixed up as needed), its payload is handed to the
 * caller and the entry wrapper is freed.
 *
 * @param[in] queue - the queue to search.
 * @param[in,out] ppPayload - on entry, the payload to compare against; on
 *    a DNX_QRES_FOUND result, replaced with the removed item's payload.
 * @param[in] Compare - the comparison callback.
 *
 * @return The last value returned by @p Compare, or DNX_QRES_CONTINUE if
 *    the queue was empty or no item stopped the search.
 */
DnxQueueResult dnxQueueRemove(DnxQueue * queue, void ** ppPayload,
      DnxQueueResult (*Compare)(void * pLeft, void * pRight))
{
   DnxQueueResult verdict = DNX_QRES_CONTINUE;
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * node;
   iDnxQueueEntry * before = 0;
   int visited = 0;

   assert(queue && ppPayload && Compare);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   // locate the first entry for which the callback stops the search
   node = iqueue->head;
   while (node != 0)
   {
      visited++;

      verdict = Compare(*ppPayload, node->pPayload);
      if (verdict != DNX_QRES_CONTINUE)
         break;

      before = node;
      node = node->next;
   }

   // unlink the entry when the callback reported a match
   if (node != 0 && verdict == DNX_QRES_FOUND)
   {
      *ppPayload = node->pPayload;

      if (before != 0)
         before->next = node->next;
      else
         iqueue->head = node->next;       // the head item is going away

      if (node->next == 0)
         iqueue->tail = before;           // the tail item is going away

      // keep the circular cursor off the removed entry, wrapping to head
      if (iqueue->current == node)
      {
         iqueue->current = node->next;
         if (iqueue->current == 0)
            iqueue->current = iqueue->head;
      }

      iqueue->size--;
   }

   dnxDebug(8, "dnxQueueRemove: (%i) elements searched in (%i) sized queue",
         visited, iqueue->size);

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   // the wrapper is released outside the lock; the payload now belongs to
   // the caller
   if (verdict == DNX_QRES_FOUND)
   {
      xfree(node);
   }

   return verdict;
}
/** Reset all WLM statistics counters.
 *
 * Every counter is cleared to zero under the WLM mutex, except the minimum
 * execution time which is set to the largest unsigned value so that the
 * first job measured afterwards becomes the new minimum.
 *
 * @param[in] wlm - the WLM object whose statistics are to be reset.
 */
void dnxWlmResetStats(DnxWlm * wlm)
{
   iDnxWlm * iwlm = (iDnxWlm *)wlm;

   assert(wlm);

   DNX_PT_MUTEX_LOCK(&iwlm->mutex);

   // cumulative times
   iwlm->threadtm = 0;
   iwlm->jobtm = 0;

   // job and thread counters
   iwlm->tdestroyed = 0;
   iwlm->tcreated = 0;
   iwlm->jobsfail = 0;
   iwlm->jobsok = 0;
   iwlm->jobsrcvd = 0;
   iwlm->reqsent = 0;

   // averages and extremes
   iwlm->avgexectm = 0;
   iwlm->avgactive = 0;
   iwlm->avgthreads = 0;
   iwlm->maxexectm = 0;
   iwlm->minexectm = (unsigned)(-1);   // the largest possible value

   // packet counters
   iwlm->packets_out = 0;
   iwlm->packets_in = 0;

   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
}
/** Allocate an unconnected channel for a named channel map entry.
 *
 * Looks the name up in the channel map and, when found, asks that entry's
 * transport allocator to build a channel for the entry's URL. Both steps
 * run while holding the channel map mutex.
 *
 * @param[in] name - the name of the channel to allocate; must be a
 *    non-empty string.
 * @param[out] icpp - the address of storage for the returned object
 *    reference.
 *
 * @return Zero on success, or a non-zero error value from the name lookup
 *    or from the transport allocator.
 */
static int dnxChanMapAllocChannel(char * name, iDnxChannel ** icpp)
{
   DnxChanMap * entry;
   int ret;

   assert(name && *name && icpp);

   DNX_PT_MUTEX_LOCK(&chanMutex);

   ret = dnxChanMapFindName(name, &entry);
   if (ret == DNX_OK)
   {
      ret = entry->txAlloc(entry->url, icpp);
   }

   DNX_PT_MUTEX_UNLOCK(&chanMutex);

   return ret;
}
/** Match an incoming result XID to its job and hand the job to the caller.
 *
 * The slot index carried in the XID is range-checked first because it
 * arrives in a client network message. Under the list mutex the slot is
 * then examined:
 *  - empty slot or XID mismatch: DNX_ERR_NOTFOUND;
 *  - job already expired: DNX_ERR_EXPIRED;
 *  - job already complete/received: its ack flag is cleared so another ack
 *    is sent, and DNX_ERR_ALREADY is returned;
 *  - any other state: the job is marked DNX_JOB_RECEIVED and copied out.
 * In the last two cases the list condition variable is signaled.
 *
 * @param[in] pJobList - the job list to search.
 * @param[in] pxid - the XID of the result being collected.
 * @param[out] pJob - receives a copy of the job on success.
 *
 * @return Zero on success, or DNX_ERR_INVALID, DNX_ERR_NOTFOUND,
 *    DNX_ERR_EXPIRED or DNX_ERR_ALREADY as described above.
 */
int dnxJobListCollect(DnxJobList * pJobList, DnxXID * pxid, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long slot;
   int wake = 0;
   int ret = DNX_OK;

   assert(pJobList && pxid && pJob);   // parameter validation

   slot = pxid->objSlot;

   dnxDebug(4, "dnxJobListCollect: Job serial (%lu) slot (%lu) list head(%i)",
         pxid->objSerial, pxid->objSlot, ilist->head);

   // the slot comes off the wire, so it must be validated at run time
   if (slot >= ilist->size)
      return DNX_ERR_INVALID;          // corrupt client network message

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   if (ilist->list[slot].state == DNX_JOB_NULL
         || !dnxEqualXIDs(pxid, &ilist->list[slot].xid))
   {
      // very old job, or we restarted and lost state
      dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] not found.",
            pxid->objSerial, pxid->objSlot);
      ret = DNX_ERR_NOTFOUND;
   }
   else
   {
      switch (ilist->list[slot].state)
      {
         case DNX_JOB_EXPIRED:
            // the timer got to this job before its result arrived
            dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] expired before retrieval.",
                  pxid->objSerial, pxid->objSlot);
            ret = DNX_ERR_EXPIRED;
            break;

         case DNX_JOB_COMPLETE:
         case DNX_JOB_RECEIVED:
            // duplicate result - clear the ack flag so it gets another ack
            dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] already retrieved.",
                  pxid->objSerial, pxid->objSlot);
            ilist->list[slot].ack = 0;
            ret = DNX_ERR_ALREADY;
            wake = 1;
            break;

         default:
            // remaining states (in progress, and even unbound jobs)
            ilist->list[slot].state = DNX_JOB_RECEIVED;

            // make a copy to return to the Collector
            memcpy(pJob, &ilist->list[slot], sizeof *pJob);

            dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] completed. Copy of result for (%s) assigned to collector.",
                  pxid->objSerial, pxid->objSlot, pJob->cmd);
            wake = 1;
            break;
      }
   }

   // signal to the dispatcher that an ack needs to be sent
   if (wake)
   {
      pthread_cond_signal(&ilist->cond);
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Insert a check result into the DNX result list, keeping it sorted.
 *
 * The list is kept in ascending order of finish time. The new result is
 * placed in front of the first existing entry whose finish time compares
 * greater than or equal to its own, or at the end when there is none. The
 * insertion runs under the DNX result list mutex.
 *
 * @param[in] newcr - the check result to be added to the results list.
 */
static void dnxAddResultToList(check_result * newcr)
{
   check_result ** link;

   assert(newcr);

   DNX_PT_MUTEX_LOCK(&dnxResultListMutex);

   // walk the chain of 'next' links until the insertion point is reached
   link = &dnxResultList;
   while (*link != 0
         && dnxTimeCompare(&(*link)->finish_time, &newcr->finish_time) < 0)
      link = &(*link)->next;

   // splice the new entry in ahead of whatever the link pointed at
   newcr->next = *link;
   *link = newcr;

   DNX_PT_MUTEX_UNLOCK(&dnxResultListMutex);
}
/** Mark a received job as complete.
 *
 * The slot index carried in the XID is range-checked before it is used to
 * index the job list, in the same way dnxJobListCollect validates it. The
 * job is moved to DNX_JOB_COMPLETE only when the XID matches the one stored
 * in the slot and the job is currently in the DNX_JOB_RECEIVED state.
 *
 * @param[in] pJobList - the job list containing the job.
 * @param[in] pXid - the XID of the job to be marked complete.
 *
 * @return Zero on success, DNX_ERR_INVALID if the XID's slot index lies
 *    outside the job list, or DNX_ERR_NOTFOUND if the XID does not match
 *    the slot or the job is not in the received state.
 */
int dnxJobListMarkComplete(DnxJobList * pJobList, DnxXID * pXid)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   int ret = DNX_ERR_NOTFOUND;

   assert(pJobList && pXid);   // parameter validation

   dnxDebug(4, "dnxJobListMarkComplete: Job [%lu:%lu]",
         pXid->objSerial, pXid->objSlot);

   // never index the list with an unchecked slot number
   current = pXid->objSlot;
   if (current >= ilist->size)
      return DNX_ERR_INVALID;

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   if (dnxEqualXIDs(pXid, &ilist->list[current].xid))
   {
      // only a job whose result has been received can be completed
      if (ilist->list[current].state == DNX_JOB_RECEIVED)
      {
         ilist->list[current].state = DNX_JOB_COMPLETE;
         ret = DNX_OK;
      }
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Delete a channel map by name. * * @param[in] name - the name of the channel map to be deleted. */ void dnxChanMapDelete(char * name) { DnxChanMap * chanMap; assert(name && *name); DNX_PT_MUTEX_LOCK(&chanMutex); // locate resource by name if (dnxChanMapFindName(name, &chanMap) == DNX_OK) { // release allocated variables, clear object xfree(chanMap->name); xfree(chanMap->url); memset(chanMap, 0, sizeof *chanMap); } DNX_PT_MUTEX_UNLOCK(&chanMutex); }
/** Remove and return the first item of a queue, waiting if it is empty.
 *
 * The calling thread blocks on the queue's condition variable for as long
 * as the queue holds no items. Once an item is available it is unlinked
 * from the head; its payload and the payload's resources become the
 * property of the caller, and the entry wrapper is freed.
 *
 * @param[in] queue - the queue to be waited on.
 * @param[out] ppPayload - the address of storage in which to return the
 *    payload of the first queue item.
 *
 * @return Zero on success, or DNX_ERR_NOTFOUND if no item was obtained.
 *
 * @note Not currently used (or exported by the dnxQueue.h header file).
 */
int dnxQueueGetWait(DnxQueue * queue, void ** ppPayload)
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * first = 0;

   assert(queue && ppPayload);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   // sleep until a producer has put something on the queue; the size is
   // re-checked after every wake-up
   while (iqueue->size == 0)
      pthread_cond_wait(&iqueue->cv, &iqueue->mutex);

   // unlink the head entry
   first = iqueue->head;
   iqueue->head = first->next;

   if (iqueue->current == first)
      iqueue->current = iqueue->head;

   // an empty queue has no tail either
   if (iqueue->head == 0)
      iqueue->tail = 0;

   iqueue->size--;

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   // not expected: the wait above only ends once an entry can be dequeued
   if (first == 0)
      return DNX_ERR_NOTFOUND;

   // pass the payload to the caller and drop the wrapper
   *ppPayload = first->pPayload;
   xfree(first);

   return DNX_OK;
}
/** Record that a dispatched job has been acknowledged.
 *
 * The slot index carried in the result's XID is range-checked before it is
 * used to index the job list, in the same way dnxJobListCollect validates
 * it. The job is moved to DNX_JOB_INPROGRESS (and audited as "ACK") only
 * when the XID matches the one stored in the slot and the job is currently
 * pending or unbound.
 *
 * @param[in] pJobList - the job list containing the job.
 * @param[in] pRes - the acknowledgement; its xid identifies the job and its
 *    timestamp is used for the latency figure in the debug log.
 *
 * @return Zero on success, DNX_ERR_INVALID if the XID's slot index lies
 *    outside the job list, or DNX_ERR_NOTFOUND if the XID does not match
 *    the slot or the job is not awaiting an acknowledgement.
 */
int dnxJobListMarkAck(DnxJobList * pJobList, DnxResult * pRes)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   time_t now;
   int ret = DNX_ERR_NOTFOUND;

   assert(pJobList && pRes);   // parameter validation

   now = time(0);

   dnxDebug(4, "dnxJobListMarkAck: Job [%lu:%lu] serial (%lu) slot (%lu) latency (%lu) sec.",
         pRes->xid.objSerial, pRes->xid.objSlot, pRes->xid.objSerial,
         pRes->xid.objSlot, (now - pRes->timestamp));

   // never index the list with an unchecked slot number
   current = pRes->xid.objSlot;
   if (current >= ilist->size)
      return DNX_ERR_INVALID;

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   if (dnxEqualXIDs(&(pRes->xid), &ilist->list[current].xid))
   {
      // only jobs that are still waiting to start can be acknowledged
      if (ilist->list[current].state == DNX_JOB_PENDING
            || ilist->list[current].state == DNX_JOB_UNBOUND)
      {
         ilist->list[current].state = DNX_JOB_INPROGRESS;
         dnxAuditJob(&(ilist->list[current]), "ACK");
         ret = DNX_OK;
      }
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Add a new channel to the global channel map. * * @param[in] name - the name of the new channel to be added. * @param[in] url - the URL to associate with this new channel. * * @return Zero on success, or a non-zero error value. */ int dnxChanMapAdd(char * name, char * url) { DnxChanMap tmp, * chanMap; int ret; assert(name && *name && url && strlen(url) < DNX_MAX_URL); // parse and validate the URL if ((ret = dnxChanMapUrlParse(&tmp, url)) != DNX_OK) return ret; // set the name, unless we are overriding an existing channel if ((tmp.name = xstrdup(name)) == 0 || (tmp.url = xstrdup(url)) == 0) { xfree(tmp.name); return DNX_ERR_MEMORY; } DNX_PT_MUTEX_LOCK(&chanMutex); // see if this name already exists, otherwise grab an empty channel slot if ((ret = dnxChanMapFindName(name, &chanMap)) == DNX_OK || (ret = dnxChanMapFindSlot(&chanMap)) == DNX_OK) { xfree(chanMap->name); xfree(chanMap->url); memcpy(chanMap, &tmp, sizeof *chanMap); } DNX_PT_MUTEX_UNLOCK(&chanMutex); // on error, release previously allocated memory if (ret != DNX_OK) { xfree(tmp.name); xfree(tmp.url); } return ret; }
/** Remove and return the first item of a queue without waiting.
 *
 * If the queue holds at least one item, the head entry is unlinked under
 * the queue mutex, its payload is handed to the caller and the entry
 * wrapper is freed. An empty queue is reported immediately.
 *
 * @param[in] queue - the queue to take an item from.
 * @param[out] ppPayload - receives the payload of the removed item; left
 *    untouched when the queue is empty.
 *
 * @return Zero on success, or DNX_ERR_NOTFOUND if the queue was empty.
 */
int dnxQueueGet(DnxQueue * queue, void ** ppPayload)
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * first = 0;

   assert(queue && ppPayload);

   dnxDebug(8, "dnxQueueGet: iQueue size(%i)", iqueue->size);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   if (iqueue->size > 0)
   {
      // detach the head entry
      first = iqueue->head;
      iqueue->head = first->next;

      if (iqueue->current == first)
         iqueue->current = first->next;

      // an empty queue has no tail either
      if (iqueue->head == 0)
         iqueue->tail = 0;

      iqueue->size--;
   }

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   if (first == 0)
      return DNX_ERR_NOTFOUND;

   // hand over the payload and release the wrapper
   *ppPayload = first->pPayload;
   xfree(first);

   return DNX_OK;
}
/** The main thread routine for a worker thread. * * @param[in] data - an opaque pointer to a DnxWorkerStatus structure for this * thread. * * @return Always returns 0. */ static void * dnxWorker(void * data) { DnxWorkerStatus * ws = (DnxWorkerStatus *)data; pthread_t tid = pthread_self(); int retries = 0; iDnxWlm * iwlm; assert(data); iwlm = ws->iwlm; pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxWorkerCleanup, data); time(&ws->tstart); // set thread start time (for stats) while (!iwlm->terminate) { DnxNodeRequest msg; DnxJob job; int ret; // setup job request message - use thread id and node address in XID dnxMakeXID(&msg.xid, DNX_OBJ_WORKER, tid, iwlm->myipaddr); msg.reqType = DNX_REQ_REGISTER; msg.jobCap = 1; msg.ttl = iwlm->cfg.reqTimeout - iwlm->cfg.ttlBackoff; msg.hn = iwlm->myhostname; // request a job, and then wait for a job to come in... if ((ret = dnxSendNodeRequest(ws->dispatch, &msg, 0)) != DNX_OK) { dnxLog("Worker[%lx]: Error sending node request: %s.", tid, dnxErrorString(ret)); } else { DNX_PT_MUTEX_LOCK(&iwlm->mutex); iwlm->reqsent++; DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); } // wait for job, even if request was never sent if ((ret = dnxWaitForJob(ws->dispatch, &job, job.address,iwlm->cfg.reqTimeout)) != DNX_OK && ret != DNX_ERR_TIMEOUT) { dnxLog("Worker[%lx]: Error receiving job: %s.", tid, dnxErrorString(ret)); } // Allow thread to be canceled pthread_testcancel(); DNX_PT_MUTEX_LOCK(&iwlm->mutex); cleanThreadPool(iwlm); // ensure counts are accurate before using them if (ret != DNX_OK) { // if above pool minimum and exceeded max retries... 
if (iwlm->threads > iwlm->cfg.poolMin && ++retries > iwlm->cfg.maxRetries) { dnxLog("Worker[%lx]: Exiting - max retries exceeded.", tid); DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); break; } } else { iwlm->jobsrcvd++; iwlm->active++; // dnxSendJobAck(ws->collect, &job, &job.address); // dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] (T/O %d): %s.", // tid, job.xid.objSerial, job.xid.objSlot, job.timeout, job.cmd); // DnxAck ack; // ack.xid = job.xid; // ack.timestamp = job.timestamp; dnxSendJobAck(ws->collect, &job, 0); dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] to channel (%lx) (T/S %lu).", tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timestamp); // check pool size before we get too busy - // if we're not shutting down and we haven't reached the configured // maximum and this is the last thread out, then increase the pool if (!iwlm->terminate && iwlm->threads < iwlm->cfg.poolMax && iwlm->active == iwlm->threads) // Maybe more aggressive here growThreadPool(iwlm); } DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); // if we have a job, execute it and reset retry count if (ret == DNX_OK) { char resData[MAX_RESULT_DATA + 1]; DnxResult result; time_t jobstart; dnxDebug(3, "Worker[%lx]: Received job [%lu:%lu] from (%lx) (T/O %d): %s.", tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timeout, job.cmd); // prepare result structure result.xid = job.xid; // result xid must match job xid result.state = DNX_JOB_COMPLETE; // complete or expired result.delta = 0; result.resCode = DNX_PLUGIN_RESULT_OK; result.resData = 0; /** @todo Allocate result data buffer based on configured buffer size. 
*/ // we want to be able to cancel threads while they're out on a task // in order to obtain timely shutdown for long jobs - move into // async cancel mode, but only for the duration of the check pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0); *resData = 0; jobstart = time(0); dnxPluginExecute(job.cmd, &result.resCode, resData, sizeof resData - 1, job.timeout,iwlm->cfg.showNodeAddr? iwlm->myipaddrstr: 0); result.delta = time(0) - jobstart; pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); // store allocated copy of the result string if (*resData) result.resData = xstrdup(resData); dnxDebug(3, "Worker[%lx]: Job [%lu:%lu] completed in %lu seconds: %d, %s.", tid, job.xid.objSerial, job.xid.objSlot, result.delta, result.resCode, result.resData); // if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) { // dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.", // tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret)); // } // Wait while we wait for an Ack to our Results DnxJob ack; int trys = 1; while(trys < 4) { if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) { dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret)); break; } // Now wait for our Ack if ((ret = dnxWaitForAck(ws->dispatch, &ack, job.address, 3)) != DNX_OK && ret != DNX_ERR_TIMEOUT) { dnxDebug(3, "Worker[%lx]: Error receiving Ack for job [%lu:%lu]: %s. Retry (%i).", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys); } else if (ret == DNX_ERR_TIMEOUT) { // we didn't get our Ack trys++; } else { // We got our Ack dnxDebug(3, "Worker[%lx]: Ack Received for job [%lu:%lu]: %s. 
After (%i) try(s).", tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys); break; } } xfree(result.resData); // update all statistics DNX_PT_MUTEX_LOCK(&iwlm->mutex); { // track status if (result.resCode == DNX_PLUGIN_RESULT_OK) iwlm->jobsok++; else iwlm->jobsfail++; // track min/max/avg execution time if (result.delta > iwlm->maxexectm) iwlm->maxexectm = result.delta; if (result.delta < iwlm->minexectm) iwlm->minexectm = result.delta; iwlm->avgexectm = (iwlm->avgexectm + result.delta) / 2; // total job processing time iwlm->jobtm += (unsigned)result.delta; iwlm->active--; // reduce active count } DNX_PT_MUTEX_UNLOCK(&iwlm->mutex); ws->serial++; // increment job serial number for next job retries = 0; } } pthread_cleanup_pop(1); return 0; }
/** Append a payload to the end of a queue.
 *
 * A new entry wrapper is allocated for the payload pointer (the queue is
 * generic, so the caller must supply a unique payload object) and linked in
 * at the tail. If the queue was created with a maximum size and the append
 * pushes it over that size, the oldest entry at the head is discarded: its
 * payload is passed to the payload destructor when one was supplied, and
 * its wrapper is freed. One waiter on the queue's condition variable is
 * then signaled.
 *
 * @param[in] queue - the queue to append to.
 * @param[in] pPayload - the payload pointer to be stored.
 *
 * @return Zero on success, or DNX_ERR_MEMORY if the entry wrapper could not
 *    be allocated.
 */
int dnxQueuePut(DnxQueue * queue, void * pPayload)
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * entry;

   assert(queue);

   // build the wrapper before taking the lock
   entry = (iDnxQueueEntry *)xmalloc(sizeof *entry);
   if (entry == 0)
      return DNX_ERR_MEMORY;

   entry->pPayload = pPayload;
   entry->next = 0;

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   // link the wrapper in at the tail
   if (iqueue->size != 0)
   {
      iqueue->tail->next = entry;
      iqueue->tail = entry;
   }
   else
   {
      // first entry of an empty list - every pointer refers to it
      iqueue->current = entry;
      iqueue->tail = entry;
      iqueue->head = entry;
   }
   iqueue->size++;

   // enforce the size limit, if this queue was created with one
   if (iqueue->maxsz > 0 && iqueue->size > iqueue->maxsz)
   {
      // the oldest entry is dropped - whatever it carried is thrown away
      iDnxQueueEntry * oldest = iqueue->head;

      iqueue->head = oldest->next;
      if (iqueue->current == oldest)
         iqueue->current = oldest->next;

      // adjust tail if queue is now empty
      if (iqueue->head == 0)
         iqueue->tail = 0;

      iqueue->size--;

      // call item payload destructor, if one was supplied
      if (iqueue->freepayload)
         iqueue->freepayload(oldest->pPayload);

      xfree(oldest);
   }

   // signal any waiters - there's a new item in the queue
   pthread_cond_signal(&iqueue->cv);

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   return DNX_OK;
}
/** Create a WLM object and start its initial worker thread pool.
 *
 * The configuration is copied into the new object (with private copies of
 * the dispatcher and collector strings), the pool array is allocated and
 * zeroed, the first non-loopback IPv4 address that is up and running is
 * cached in binary and string form, the host name is determined, and
 * finally growThreadPool is called to start the workers.
 *
 * All memory owned by the partially built object is released on every
 * failure path.
 *
 * @param[in] cfg - the configuration to use; pool limits must satisfy
 *    0 < poolMin <= poolInitial <= poolMax.
 * @param[out] pwlm - receives the new WLM object on success; left untouched
 *    on failure.
 *
 * @return Zero on success, DNX_ERR_MEMORY if an allocation failed, or the
 *    error from growThreadPool if not a single worker could be started.
 */
int dnxWlmCreate(DnxWlmCfgData * cfg, DnxWlm ** pwlm)
{
   iDnxWlm * iwlm;
   struct ifaddrs * ifa = NULL;

   assert(cfg && pwlm);
   assert(cfg->poolMin > 0);
   assert(cfg->poolMax >= cfg->poolMin);
   assert(cfg->poolInitial >= cfg->poolMin);
   assert(cfg->poolInitial <= cfg->poolMax);

   // allocate and configure the master thread pool data structure
   if ((iwlm = (iDnxWlm *)xmalloc(sizeof *iwlm)) == 0)
      return DNX_ERR_MEMORY;

   memset(iwlm, 0, sizeof *iwlm);
   iwlm->cfg = *cfg;
   iwlm->cfg.dispatcher = xstrdup(iwlm->cfg.dispatcher);
   iwlm->cfg.collector = xstrdup(iwlm->cfg.collector);
   iwlm->poolsz = iwlm->cfg.poolMax;
   iwlm->pool = (DnxWorkerStatus **)xmalloc(iwlm->poolsz * sizeof *iwlm->pool);
   iwlm->minexectm = (unsigned)(-1);   // the largest possible value

   // if any of the above failed, we really can't continue - check before
   // the pool array is touched, and release everything that did succeed
   if (!iwlm->cfg.dispatcher || !iwlm->cfg.collector || !iwlm->pool)
   {
      xfree(iwlm->pool);
      xfree(iwlm->cfg.dispatcher);
      xfree(iwlm->cfg.collector);
      xfree(iwlm);
      return DNX_ERR_MEMORY;
   }

   memset(iwlm->pool, 0, iwlm->poolsz * sizeof *iwlm->pool);

   // cache our (primary?) ip address in binary and string format
   if (getifaddrs(&ifa) == 0)
   {
      u_int setflags = IFF_UP | IFF_RUNNING;
      u_int clrflags = IFF_LOOPBACK;
      struct ifaddrs * ifcur = ifa;

      // locate the first proper AF_INET address in our interface list
      while (ifcur && (ifcur->ifa_addr == 0
            || ifcur->ifa_addr->sa_family != AF_INET
            || (ifcur->ifa_flags & setflags) != setflags
            || (ifcur->ifa_flags & clrflags) != 0))
         ifcur = ifcur->ifa_next;

      if (ifcur)
      {
         // cache binary and presentation (string) versions of the ip address
         iwlm->myipaddr = (unsigned long)
               ((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr.s_addr;
         inet_ntop(ifcur->ifa_addr->sa_family,
               &((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr,
               iwlm->myipaddrstr, sizeof iwlm->myipaddrstr);
      }
      freeifaddrs(ifa);
   }

   char unset[] = "NULL";
   // NOTE(review): iwlm was zero-filled above, so myhostname is empty here
   // and the "cached hostname" branch below looks unreachable - confirm.
   if (!strnlen(iwlm->myhostname, 1)) // see if the hostname has been set
   {
      dnxDebug(3, "dnxWlmCreate: Hostname not set in parent thread.");

      // start out empty so the buffer is never read uninitialized
      char machineName[MAX_HOSTNAME] = "";

      if (strcmp(cfg->hostname, unset) == 0)
      {
         dnxDebug(3, "dnxWlmCreate: Hostname undefined in config.");

         // Get our hostname
         if (gethostname(machineName, MAX_HOSTNAME) == 0)
         {
            // a truncated name is not guaranteed to be terminated
            machineName[MAX_HOSTNAME - 1] = 0;
            dnxDebug(3, "dnxWlmCreate: Hostname is [%s].", machineName);

            // cache hostname
            strcpy(iwlm->myhostname, machineName);
         }
         else
         {
            dnxLog("dnxWlmCreate: Unable to obtain Hostname [%s?],"
                  "please set hostname in config.", machineName);
            strcpy(iwlm->myhostname, "localhost");
         }
      }
      else
      {
         dnxDebug(3, "dnxWlmCreate: Using hostname in config [%s].",
               cfg->hostname);
         strcpy(iwlm->myhostname, cfg->hostname);
      }
   }
   else
   {
      dnxDebug(3, "dnxWlmCreate: Using cached hostname [%s].",
            iwlm->myhostname);
      strcpy(iwlm->cfg.hostname, iwlm->myhostname);
   }

   // create initial worker thread pool
   DNX_PT_MUTEX_INIT(&iwlm->mutex);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   {
      int ret;
      if ((ret = growThreadPool(iwlm)) != DNX_OK)
      {
         if (iwlm->threads)
            dnxLog("WLM: Error creating SOME worker threads: %s; "
                  "continuing with smaller initial pool.", dnxErrorString(ret));
         else
         {
            dnxLog("WLM: Unable to create ANY worker threads: %s; "
                  "terminating.", dnxErrorString(ret));
            DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
            DNX_PT_MUTEX_DESTROY(&iwlm->mutex);

            // no threads exist, so nothing else refers to these buffers
            xfree(iwlm->pool);
            xfree(iwlm->cfg.dispatcher);
            xfree(iwlm->cfg.collector);
            xfree(iwlm);
            return ret;
         }
      }
   }
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   dnxLog("WLM: Started worker thread pool.");

   *pwlm = (DnxWlm *)iwlm;

   return DNX_OK;
}
/** Sweep the job list: expire overdue jobs, bind unbound ones, reap old ones.
 *
 * Starting at the list head and stopping at the tail (or once the caller's
 * buffer is full), each slot is handled according to its state:
 *  - UNBOUND: expired when older than the dispatch timeout, otherwise an
 *    attempt is made to obtain a node request for it, making it PENDING;
 *  - PENDING / INPROGRESS: expired once its expiration time has passed;
 *  - COMPLETE: left alone until its ack has been sent, then cleaned up;
 *  - EXPIRED: cleaned up;
 *  - NULL (and slots just cleaned up): the head is advanced past the slot
 *    when the slot is the head and not also the tail;
 *  - RECEIVED: only logged; the head is not advanced past it.
 * A copy of every job expired during this sweep is placed in
 * @p pExpiredJobs.
 *
 * @param[in] pJobList - the job list to sweep.
 * @param[out] pExpiredJobs - array receiving copies of the expired jobs.
 * @param[in,out] totalJobs - on entry, the capacity of @p pExpiredJobs
 *    (must be greater than zero); on return, the number of jobs stored.
 *
 * @return Always returns DNX_OK.
 */
int dnxJobListExpire(DnxJobList * pJobList, DnxNewJob * pExpiredJobs,
      int * totalJobs)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   unsigned long dispatch_timeout;
   DnxNewJob * pJob;
   int jobCount = 0;
   int state = 0;
   time_t now;

   assert(pJobList && pExpiredJobs && totalJobs && *totalJobs > 0);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // get the current time (after we acquire the lock! In case we had to wait)
   now = time(0);

   // unbound jobs that started at or before this time have waited too long
   dispatch_timeout = now - DNX_DISPATCH_TIMEOUT;

   // walk the entire job list - InProgress and Pending jobs (in that order)
   current = ilist->head;

   // NOTE(review): several debug formats below use %i for values that are
   // unsigned long in this function (e.g. 'current') - confirm and align
   // the specifiers with the real field types.
   dnxDebug(6, "dnxJobListExpire: searching for (%i) expired objects. Head(%lu) Tail(%i)",
         *totalJobs, ilist->head, ilist->tail);

   while (jobCount < *totalJobs)
   {
      state = (pJob = &ilist->list[current])->state;

      // only examine jobs that are either awaiting dispatch or results
      switch (state)
      {
         case DNX_JOB_UNBOUND:
            if (pJob->start_time <= dispatch_timeout)
            {
               dnxDebug(2, "dnxJobListExpire: Expiring Unbound %s Job [%lu:%lu] count(%i) type(%i) Start Time: (%lu) Now: (%lu) Expire: (%lu)",
                     (pJob->object_check_type ? "Host" : "Service"),
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state,
                     pJob->start_time, now, dispatch_timeout);

               // Put the old job in a purgable state
               pJob->state = DNX_JOB_EXPIRED;

               // Add a copy to the expired job list
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            }
            else
            {
               // If there is a client associated with it, xid.objSlot != -1
               // then it means we may be getting a result coming back to us
               // This job has not expired, try and get a dnxClient for it
               if (dnxGetNodeRequest(dnxGetRegistrar(), &(pJob->pNode)) == DNX_OK)
               {
                  // a client was found, so the job can now be dispatched
                  dnxDebug(2, "dnxJobListExpire: Dequeueing DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%i) seconds. Dispatch TO:(%i) Now: (%lu) count(%i) type(%i)",
                        pJob->xid.objSerial, pJob->xid.objSlot,
                        pJob->start_time - dispatch_timeout, dispatch_timeout,
                        now, current, state);
                  pJob->state = DNX_JOB_PENDING;
                  pthread_cond_signal(&ilist->cond);  // signal that a new job is available
               }
               else
               {
                  dnxDebug(6, "dnxJobListExpire: Unable to dequeue DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%i) seconds. Dispatch TO:(%i) Now: (%lu) count(%i) type(%i)",
                        pJob->xid.objSerial, pJob->xid.objSlot,
                        pJob->start_time - dispatch_timeout, dispatch_timeout,
                        now, current, state);
               }
            }
            break;

         case DNX_JOB_PENDING:
         case DNX_JOB_INPROGRESS:
            // check the job's expiration stamp
            if (pJob->expires <= now)
            {
               // This is an expired job, it was sent out, but never came back
               dnxDebug(1, "dnxJobListExpire: Expiring Job [%lu:%lu] count(%i) type(%i) Exp: (%lu) Now: (%lu)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state,
                     pJob->expires, now);

               // Put the old job in a purgable state
               pJob->state = DNX_JOB_EXPIRED;

               // Add a copy to the expired job list
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            }
            break;

         case DNX_JOB_COMPLETE:
            // If the Ack hasn't been sent out yet, give it time to complete
            if (!pJob->ack)
            {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. count(%i) type(%i)",
                     current, state);
               break;
            }
            /* fallthrough - an acked, complete job is cleaned up like an expired one */

         case DNX_JOB_EXPIRED:
            dnxJobCleanup(pJob);
            dnxDebug(3, "dnxJobListExpire: Nullified Job. count(%i) type(%i)",
                  current, state);
            /* fallthrough - the cleaned-up slot may let the head advance */

         case DNX_JOB_NULL:
            if (current == ilist->head && current != ilist->tail)
            {
               // we have an old item at the head of the list, so we need to
               // increment the head. It should never be larger than the tail.
               ilist->head = ((current + 1) % ilist->size);
               dnxDebug(2, "dnxJobListExpire: Moving head to (%i). count(%i) type(%i)",
                     ilist->head, current, pJob->state);
            }
            else
            {
               dnxDebug(5, "dnxJobListExpire: Null Job. count(%i) type(%i)",
                     current, pJob->state);
            }
            break;

         case DNX_JOB_RECEIVED:
            // both formats name the job, so its serial and slot must be
            // passed ahead of the count and type values
            if (!pJob->ack)
            {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. job [%lu:%lu] count(%i) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            }
            else
            {
               dnxDebug(2, "dnxJobListExpire: Ack sent. job [%lu:%lu] count(%i) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            }
            // The Collector thread will set this to DNX_JOB_COMPLETE once it has
            // replied to Nagios, but we don't advance the head until that happens
            break;
      }

      // bail-out if this was the job list tail
      if (current == ilist->tail)
      {
         break;
      }

      // increment the job list index
      current = ((current + 1) % ilist->size);
   }

   // update the total jobs in the expired job list
   *totalJobs = jobCount;

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return DNX_OK;
}
/** Locate the next job list entry the dispatcher has to act on and copy it out.
 *
 * The list is scanned from the current head towards the tail while holding
 * the job list mutex. Two kinds of entries cause an immediate return, with a
 * copy of the entry stored in @p pJob:
 *
 *  - a DNX_JOB_PENDING entry whose node request has a zero retry stamp (it
 *    has not been handed out by this function yet); its retry stamp is set
 *    to "now + 5 seconds" before the copy is taken;
 *  - a DNX_JOB_COMPLETE or DNX_JOB_RECEIVED entry whose ack flag is still
 *    clear, so the caller can send the acknowledgement.
 *
 * A pending entry whose retry stamp is non-zero is never returned from here:
 * if the stamp is still in the future the entry is skipped; otherwise, if its
 * node request has expired, the entry is put back into DNX_JOB_UNBOUND and
 * the node's flags are refreshed from the host's affinity.
 *
 * When the scan reaches the tail without finding work, the function waits on
 * the list's condition variable for up to DNX_JOBLIST_TIMEOUT seconds and,
 * if woken, restarts the scan at the head.
 *
 * @param[in]  pJobList - the job list to scan.
 * @param[out] pJob     - receives a byte-wise copy of the selected entry.
 *
 * @return ETIMEDOUT if the condition wait timed out with nothing to do.
 *    Otherwise the current value of the local status: DNX_OK if no wait was
 *    needed, or the result of the most recent pthread_cond_timedwait call.
 */
int dnxJobListDispatch(DnxJobList * pJobList, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   int ret = DNX_OK;
   struct timeval now;
   struct timespec timeout;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // start at current head
   current = ilist->head;

   dnxDebug(6, "dnxJobListDispatch: BEFORE: Head=%lu, Tail=%lu, Queue=%lu.", ilist->head, ilist->tail, ilist->size);

   while (1)
   {
      // entry under inspection; recomputed on every pass because 'current'
      // changes (and the mutex may have been released while waiting)
      DnxNewJob * slot = &ilist->list[current];

      switch (slot->state)
      {
         case DNX_JOB_INPROGRESS:
            dnxDebug(8, "dnxJobListDispatch: In Progress Item in slot:(%lu) head:(%lu) tail:(%lu).", current, ilist->head, ilist->tail);
            break;

         case DNX_JOB_NULL:
            dnxDebug(8, "dnxJobListDispatch: Null Item in slot:(%lu) head:(%lu) tail:(%lu).", current, ilist->head, ilist->tail);
            break;

         case DNX_JOB_EXPIRED:
            dnxDebug(8, "dnxJobListDispatch: Expired Item in slot:(%lu) head:(%lu) tail:(%lu).", current, ilist->head, ilist->tail);
            break;

         case DNX_JOB_UNBOUND:
            dnxDebug(8, "dnxJobListDispatch: Unbound Item in slot:(%lu) head:(%lu) tail:(%lu).", current, ilist->head, ilist->tail);
            break;

         case DNX_JOB_PENDING:
            gettimeofday(&now, 0);

            // Recently dispatched and the retry stamp is still in the future
            if (slot->pNode->retry > now.tv_sec)
            {
               // The difference has (at least) type time_t, which is wider
               // than int on LP64 targets; cast so it matches the %i.
               dnxDebug(5, "dnxJobListDispatch: Pending job [%lu:%lu] waiting for Ack, resend in (%i) sec.",
                     slot->xid.objSerial, slot->xid.objSlot,
                     (int)(slot->pNode->retry - now.tv_sec));
               break;
            }

            // Dispatched before (non-zero retry stamp) and the stamp has passed
            if (slot->pNode->retry)
            {
               // Make sure the dnxClient service offer is still fresh
               if (slot->pNode->expires < now.tv_sec)
               {
                  dnxDebug(4, "dnxJobListDispatch: Pending job [%lu:%lu] waiting for Ack, client node expired. Resubmitting.",
                        slot->xid.objSerial, slot->xid.objSlot);

                  slot->state = DNX_JOB_UNBOUND;

                  // The node request is deliberately kept rather than
                  // replaced: the same client is likely to service the job
                  // again, or the original result may still arrive, and the
                  // node address must stay valid for late results. Only the
                  // affinity flags are refreshed, since they are what is used
                  // to find a new node. If the original job does come back,
                  // duplicate results/acks have to be tolerated.
                  slot->pNode->flags = *(dnxGetAffinity(slot->host_name));
               }
               // NOTE(review): when the node offer is still fresh nothing is
               // resent here even though the message above says "resend" --
               // confirm this is intended.
               break;
            }

            // This is a new job, so dispatch it
            dnxDebug(4, "dnxJobListDispatch: Dispatching new job [%lu:%lu] waiting for Ack",
                  slot->xid.objSerial, slot->xid.objSlot);

            // Set the retry interval. This should be fairly forgiving in case
            // we just missed the Ack but the client actually got the job and
            // is returning our results.
            slot->pNode->retry = now.tv_sec + 5;

            // make a copy for the Dispatcher to send to client
            memcpy(pJob, slot, sizeof *pJob);

            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return ret;

         case DNX_JOB_COMPLETE:
         case DNX_JOB_RECEIVED:
            // A response has been received for this job; the client needs an
            // ack so it knows we got it. Only send a single Ack.
            if (slot->ack)
            {
               break;
            }

            // make a copy for the Dispatcher to send an Ack to the client
            memcpy(pJob, slot, sizeof *pJob);
            dnxDebug(4, "dnxJobListDispatch: Received job [%lu:%lu] sending Ack.",
                  slot->xid.objSerial, slot->xid.objSlot);

            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return ret;

         default:
            // any other state needs no dispatcher action; move on
            break;
      }

      if (current == ilist->tail)
      {
         // End of the queue: wait for a signal on the list's condition
         // variable, using an absolute deadline built from the wall clock.
         gettimeofday(&now, 0);
         timeout.tv_sec = now.tv_sec + DNX_JOBLIST_TIMEOUT;
         timeout.tv_nsec = now.tv_usec * 1000;

         if ((ret = pthread_cond_timedwait(&ilist->cond, &ilist->mut, &timeout)) == ETIMEDOUT)
         {
            // We waited for the time out period and no new jobs arrived,
            // so give control back to the caller.
            dnxDebug(5, "dnxJobListDispatch: Reached end of dispatch queue. Thread timer returned.");
            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return ret;
         }

         // Any other wait result is treated as "a new job arrived":
         // go back to the head and rescan.
         current = ilist->head;
         dnxDebug(5, "dnxJobListDispatch: Reached end of dispatch queue. A new job arrived.");
      }
      else
      {
         // move to next item in queue
         current = ((current + 1) % ilist->size);
      }
   }
}