Example #1
void dnxWlmDestroy(DnxWlm * wlm)
{
   iDnxWlm * iwlm = (iDnxWlm *)wlm;
   time_t expires;
   unsigned i;

   assert(wlm);

   dnxLog("WLM: Beginning termination sequence...");

   // sleep till we can't stand it anymore, then kill everyone
   iwlm->terminate = 1;
   expires = iwlm->cfg.shutdownGrace + time(0);

   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   while (iwlm->threads > 0 && time(0) < expires)
   {
      cleanThreadPool(iwlm);
      DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
      dnxCancelableSleep(100);
      DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   }

   // check for workers remaining after grace period
   if (iwlm->threads)
      dnxDebug(1, "WLM: Termination - %d workers remaining"
            " after grace period.", iwlm->threads);
      
   // cancel all remaining workers
   for (i = 0; i < iwlm->threads; i++)
      if (iwlm->pool[i]->state == DNX_THREAD_RUNNING)
      {
         dnxDebug(1, "WLMDestroy: Cancelling worker[%lx].", iwlm->pool[i]->tid);
         pthread_cancel(iwlm->pool[i]->tid);
      }

   // give remaining threads some time to quit
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
   dnxCancelableSleep(1000);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);

   // join all zombies (should be everything left)
   cleanThreadPool(iwlm);
   assert(iwlm->threads == 0);
   xfree(iwlm->pool);
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   DNX_PT_MUTEX_DESTROY(&iwlm->mutex);

   xfree(iwlm->cfg.dispatcher);
   xfree(iwlm->cfg.collector);
   xfree(iwlm);

   dnxLog("WLM: Termination sequence complete.");
}
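
The DNX_PT_MUTEX_* macros used throughout these examples are defined elsewhere in the DNX sources and are not shown here. A minimal sketch of what such wrappers might look like, assuming they simply forward to the corresponding pthread calls and abort on unexpected errors (the real DNX definitions may differ):

#include <pthread.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the DNX_PT_MUTEX_* wrappers used above; the
 * real macros live elsewhere in the DNX tree and may behave differently. */
#define DNX_PT_MUTEX_INIT(mp)    do { if (pthread_mutex_init((mp), 0))  abort(); } while (0)
#define DNX_PT_MUTEX_LOCK(mp)    do { if (pthread_mutex_lock(mp))       abort(); } while (0)
#define DNX_PT_MUTEX_UNLOCK(mp)  do { if (pthread_mutex_unlock(mp))     abort(); } while (0)
#define DNX_PT_MUTEX_DESTROY(mp) do { if (pthread_mutex_destroy(mp))    abort(); } while (0)
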
Example #2
int dnxJobListAdd(DnxJobList * pJobList, DnxNewJob * pJob) {
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long tail;
   int ret = DNX_OK;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   tail = ilist->tail;

   // verify space in the job list; one buffer element is always kept empty
   // so that a full ring can be distinguished from an empty one
   if (ilist->list[tail].state && (tail = (tail + 1) % ilist->size) == ilist->head) {
      dnxLog("dnxJobListAdd: Out of job slots (max=%lu): %s.", 
            ilist->size, pJob->cmd);
      dnxDebug(1, "dnxJobListAdd: Out of job slots (max=%lu): %s.", 
            ilist->size, pJob->cmd);
     ret = DNX_ERR_CAPACITY;
   } else {
      // add the slot index to the Job's XID - this allows us to index 
      //    the job list using the returned result's XID.objSlot field
      pJob->xid.objSlot = tail;
      // if we could not obtain an available dnxClient job request (objSlot
      // is -1), queue the job as UNBOUND anyway and let the timer thread
      // find a dnxClient for it later
      if (pJob->pNode->xid.objSlot == -1) {
         pJob->state = DNX_JOB_UNBOUND;
      } else {
         pJob->state = DNX_JOB_PENDING;
      }
      
      dnxAuditJob(pJob, "ASSIGN");
      
      // add this job to the job list
      memcpy(&ilist->list[tail], pJob, sizeof *pJob);
      
      ilist->tail = tail;
   
      dnxDebug(1, "dnxJobListAdd: Job [%lu:%lu]: Head=%lu, Tail=%lu.", 
            pJob->xid.objSerial, pJob->xid.objSlot, ilist->head, ilist->tail);
      
      if(pJob->state == DNX_JOB_PENDING) {
         pthread_cond_signal(&ilist->cond);  // signal that a new job is available
      }         
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
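
The capacity test above relies on the ring-buffer convention the comment describes: one buffer element is always kept empty, so an empty ring and a full ring can be told apart without a separate element count. A textbook illustration of that idea (generic code, not part of the DNX sources; the DNX list also tracks a per-slot state flag, so its actual test differs slightly):

/* Generic one-empty-slot ring-buffer occupancy checks; illustrative only. */
typedef struct { unsigned long head, tail, size; } Ring;

static int ringIsEmpty(const Ring * r)
{
   return r->head == r->tail;                   /* nothing queued */
}

static int ringIsFull(const Ring * r)
{
   return (r->tail + 1) % r->size == r->head;   /* next slot would hit head */
}
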
Example #3
/** Worker thread clean-up routine, installed via pthread_cleanup_push().
 * 
 * @param[in] data - an opaque pointer to a worker's status data structure.
 */
static void dnxWorkerCleanup(void * data)
{
   DnxWorkerStatus * ws = (DnxWorkerStatus *)data;
   assert(data);
   dnxDebug(2, "Worker[%lx]: Terminating.", pthread_self());
   ws->state = DNX_THREAD_ZOMBIE;
}
Example #4
/** Clean up zombie threads and compact the thread pool.
 * 
 * @param[in] iwlm - a pointer to the work load manager data structure.
 */
static void cleanThreadPool(iDnxWlm * iwlm)
{
   unsigned i = 0;
   time_t now = time(0);

   iwlm->lastclean = now;  // keep track of when we last cleaned

   // look for zombie threads to join
   while (i < iwlm->threads)
   {
      if (iwlm->pool[i]->state == DNX_THREAD_ZOMBIE)
      {
         DnxWorkerStatus * ws = iwlm->pool[i];
         int ret;

         dnxDebug(1, "WLM: Joining worker[%lx]...", ws->tid);
         pthread_join(ws->tid, 0);

         // reduce thread count; update stats
         iwlm->threads--;
         iwlm->tdestroyed++;
         iwlm->threadtm += (unsigned)(now - ws->tstart);

         // release thread resources; delete thread; compact ptr array
         releaseWorkerComm(ws);
         xfree(iwlm->pool[i]);
         memmove(&iwlm->pool[i], &iwlm->pool[i + 1], 
               (iwlm->threads - i) * sizeof iwlm->pool[i]);
         continue;
      }
      i++;
   }
}
Example #5
File: dnxNebMain.c  Project: dnsmichi/DNX
/** Post a new job from Nagios to the dnxServer job queue.
 *
 * @param[in] joblist - the job list to which the new job should be posted.
 * @param[in] serial - the serial number of the new job.
 * @param[in] jdp - a pointer to a job data structure.
 * @param[in] ds - a pointer to the nagios job that's being posted.
 * @param[in] pNode - a dnxClient node request structure that is being
 *    posted with this job. The dispatcher thread will send the job to the
 *    associated node.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxPostNewJob(DnxJobList * joblist, unsigned long serial, 
      DnxJobData * jdp, nebstruct_service_check_data * ds, 
      DnxNodeRequest * pNode)
{
   DnxNewJob Job;
   int ret;

   assert(ds);
   assert(ds->command_line);

   // fill-in the job structure with the necessary information
   dnxMakeXID(&Job.xid, DNX_OBJ_JOB, serial, 0);
   Job.payload    = jdp;
   Job.cmd        = xstrdup(ds->command_line);
   Job.start_time = ds->start_time.tv_sec;
   Job.timeout    = ds->timeout;
   Job.expires    = Job.start_time + Job.timeout + 5;
   Job.pNode      = pNode;

   dnxDebug(2, "DnxNebMain: Posting Job [%lu]: %s.", serial, Job.cmd);

   // post to the Job Queue
   if ((ret = dnxJobListAdd(joblist, &Job)) != DNX_OK)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_SLOTS);
      dnxLog("Failed to post Job [%lu]; \"%s\": %d.",
            Job.xid.objSerial, Job.cmd, ret);
   }
   else
   {
      dnxStatsInc(0, JOBS_HANDLED);
      dnxAuditJob(&Job, "ASSIGN");
   }
   return ret;
}
Example #6
int dnxJobListCollect(DnxJobList * pJobList, DnxXID * pxid, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   int ret = DNX_OK;
   assert(pJobList && pxid && pJob);   // parameter validation

   current = pxid->objSlot;

   dnxDebug(4, "dnxJobListCollect: Job serial (%lu) slot (%lu) list head(%i)", 
        pxid->objSerial, pxid->objSlot, ilist->head);

   if (current >= ilist->size)         // runtime validation requires check
      return DNX_ERR_INVALID;          // corrupt client network message

   DNX_PT_MUTEX_LOCK(&ilist->mut);
   
   // verify that the XID of this result matches the XID of the service check 
   if (ilist->list[current].state == DNX_JOB_NULL 
         || !dnxEqualXIDs(pxid, &ilist->list[current].xid)) {
      dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] not found.", pxid->objSerial, pxid->objSlot);      
      ret = DNX_ERR_NOTFOUND;          // Very old job or we restarted and lost state
   } else if(ilist->list[current].state == DNX_JOB_EXPIRED) {
      dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] expired before retrieval.", pxid->objSerial, pxid->objSlot);      
      ret = DNX_ERR_EXPIRED;          // job expired; removed by the timer
   } else {
      if(ilist->list[current].state == DNX_JOB_COMPLETE || ilist->list[current].state == DNX_JOB_RECEIVED) {
         dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] already retrieved.", pxid->objSerial, pxid->objSlot);      
         ilist->list[current].ack = 0;
         ret = DNX_ERR_ALREADY;           // It needs another Ack
      } else {
         // job is still outstanding (DNX_JOB_INPROGRESS, or possibly still
         // DNX_JOB_PENDING/DNX_JOB_UNBOUND); mark the result as received
         ilist->list[current].state = DNX_JOB_RECEIVED;      
         // make a copy to return to the Collector
         memcpy(pJob, &ilist->list[current], sizeof *pJob);
         dnxDebug(4, "dnxJobListCollect: Job [%lu:%lu] completed. Copy of result for (%s) assigned to collector.",
             pxid->objSerial, pxid->objSlot, pJob->cmd);
      }
      
      // Signal to the dispatcher that we need to send an Ack
      pthread_cond_signal(&ilist->cond);
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
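
The job states referenced throughout these examples (DNX_JOB_NULL, DNX_JOB_UNBOUND, DNX_JOB_PENDING, DNX_JOB_INPROGRESS, DNX_JOB_RECEIVED, DNX_JOB_COMPLETE, DNX_JOB_EXPIRED) come from an enum in the DNX headers that is not shown here. The sketch below is inferred from the code above and is only an approximation: the numeric values and descriptions are assumptions, except that DNX_JOB_NULL must be zero because dnxJobListAdd() tests the state as a boolean to find a free slot.

/* Hedged sketch of the job lifecycle states; not the actual DNX header. */
typedef enum DnxJobState
{
   DNX_JOB_NULL = 0,    // slot is free
   DNX_JOB_UNBOUND,     // queued, but no dnxClient node bound to it yet
   DNX_JOB_PENDING,     // bound to a node; awaiting dispatch/acknowledgement
   DNX_JOB_INPROGRESS,  // acknowledged by the client; awaiting results
   DNX_JOB_RECEIVED,    // results collected; not yet reported to Nagios
   DNX_JOB_COMPLETE,    // results reported to Nagios; awaiting final cleanup
   DNX_JOB_EXPIRED      // timed out; purgable by dnxJobListExpire()
} DnxJobState;
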
Example #7
int dnxGetNodeRequest(DnxRegistrar * reg, DnxNodeRequest ** ppNode)
{
   iDnxRegistrar * ireg = (iDnxRegistrar *)reg;
   int ret, discard_count = 0;
   DnxNodeRequest * node = 0;

   assert(reg && ppNode);

   while ((ret = dnxQueueGet(ireg->rqueue, (void **)&node)) == DNX_OK)
   {
      time_t now = time(0);

      // verify that this request's Time-To-Live (TTL) has not expired
      if (node->expires > now)
         break;

      dnxStatsInc(node->address, REQUESTS_EXPIRED);

      dnxDebug(3, "dnxRegisterNode: Expired req [%lu,%lu] at %u; expired at %u.",
            node->xid.objSerial, node->xid.objSlot, (unsigned)(now % 1000), 
            (unsigned)(node->expires % 1000));

      discard_count++;

      xfree(node);
      node = 0;
   }

   if (discard_count > 0)
      dnxDebug(1, "dnxGetNodeRequest: Discarded %d expired node requests.",
            discard_count);

   if (ret != DNX_OK && ret != DNX_ERR_TIMEOUT)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_NODES);
      dnxDebug(2, "dnxGetNodeRequest: Unable to fulfill node request: %s.",
            dnxErrorString(ret));
   }

   *ppNode = node;   // return a node or NULL

   return ret;
}
Example #8
/** Register a new client node "request for work" message.
 *
 * The message is either stored or used to find an existing node request
 * that should be updated. If stored, @p ppMsg is returned as zero so that
 * it will be reallocated by the caller. In all other cases, the same
 * message block can be reused by the caller for the next request.
 *
 * @param[in] ireg - the registrar on which to register a new client request.
 * @param[in] ppMsg - the address of the dnx client request node pointer.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxRegisterNode(iDnxRegistrar * ireg, DnxNodeRequest ** ppMsg)
{
   pthread_t tid = pthread_self();
   DnxNodeRequest * pReq;
   time_t now = time(0);
   int ret = DNX_OK;

   assert(ireg && ppMsg && *ppMsg);

   // compute expiration time of this request
   pReq = *ppMsg;
   pReq->expires = now + pReq->ttl;

   dnxStatsInc(pReq->address, REQUESTS_RECEIVED);

   // locate existing node: update expiration time, or add to the queue
   if (dnxQueueFind(ireg->rqueue, (void **)&pReq, dnxCompareNodeReq) == DNX_QRES_FOUND)
   {
      pReq->expires = (*ppMsg)->expires;
      dnxDebug(2,
            "dnxRegistrar[%lx]: Updated req [%lu,%lu] at %u; expires at %u.",
            tid, pReq->xid.objSerial, pReq->xid.objSlot,
            (unsigned)(now % 1000), (unsigned)(pReq->expires % 1000));
   }
   else if ((ret = dnxQueuePut(ireg->rqueue, *ppMsg)) == DNX_OK)
   {
      *ppMsg = 0;    // we're keeping this message; return NULL
      dnxDebug(2,
            "dnxRegistrar[%lx]: Added req [%lu,%lu] at %u; expires at %u.",
            tid, pReq->xid.objSerial, pReq->xid.objSlot,
            (unsigned)(now % 1000), (unsigned)(pReq->expires % 1000));
   }
   else
      dnxLog("DNX Registrar: Unable to enqueue node request: %s.",
            dnxErrorString(ret));

   return ret;
}
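
Because dnxRegisterNode() zeroes *ppMsg only when it keeps (enqueues) the message, its caller must allocate a fresh request buffer in exactly that case and can otherwise reuse the old one. A sketch of that calling pattern using a hypothetical loop (the real registrar thread is not shown in these examples and may differ):

/* Hypothetical caller illustrating the *ppMsg ownership contract of
 * dnxRegisterNode(); not the actual DNX registrar thread. */
static void exampleRegistrarLoop(iDnxRegistrar * ireg)
{
   DnxNodeRequest * pMsg = (DnxNodeRequest *)xmalloc(sizeof *pMsg);

   while (pMsg /* && not shutting down */)
   {
      /* ... receive the next client "request for work" into *pMsg ... */

      if (dnxRegisterNode(ireg, &pMsg) == DNX_OK && pMsg == 0)
         pMsg = (DnxNodeRequest *)xmalloc(sizeof *pMsg);   // registrar kept it
   }
   xfree(pMsg);
}
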
Example #9
DnxQueueResult dnxQueueRemove(DnxQueue * queue, void ** ppPayload, 
      DnxQueueResult (*Compare)(void * pLeft, void * pRight)) {
   DnxQueueResult bFound = DNX_QRES_CONTINUE;
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * item, * prev;
   int counter = 0;

   assert(queue && ppPayload && Compare);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);

   prev = 0;
   for (item = iqueue->head; item; item = item->next) {
      counter++;
      if ((bFound = Compare(*ppPayload, item->pPayload)) != DNX_QRES_CONTINUE) {
         if (bFound == DNX_QRES_FOUND) {
            *ppPayload = item->pPayload;

            // cross-link previous to next and free current
            if (prev)
               prev->next = item->next;
            else                          // removing the head item
               iqueue->head = item->next;

            if (item->next == 0)          // removing the tail item
               iqueue->tail = prev;

            if (iqueue->current == item)  // advance circular pointer
               if ((iqueue->current = item->next) == 0)
                  iqueue->current = iqueue->head;

            iqueue->size--;
         }
         break;
      }
      prev = item;
   }

   dnxDebug(8, "dnxQueueRemove: (%i) elements searched in (%i) sized queue", 
      counter, iqueue->size);

   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   if (bFound == DNX_QRES_FOUND) {
      xfree(item);       // free the queue entry wrapper object
   }
   return bFound;
}
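
dnxQueueRemove() (like dnxQueueFind() used in Example #8) walks the queue with a caller-supplied comparator: pLeft is the caller's search key, pRight is each stored payload, and the scan stops as soon as the comparator returns anything other than DNX_QRES_CONTINUE. A minimal comparator sketch matching that contract; the payload type and the XID comparison are assumptions, and the project's real dnxCompareNodeReq (not shown here) may differ:

/* Hypothetical comparator for dnxQueueFind()/dnxQueueRemove(). */
static DnxQueueResult exampleCompareByXid(void * pLeft, void * pRight)
{
   DnxNodeRequest * key  = (DnxNodeRequest *)pLeft;    // caller's search key
   DnxNodeRequest * node = (DnxNodeRequest *)pRight;   // stored queue payload
   return dnxEqualXIDs(&key->xid, &node->xid)
         ? DNX_QRES_FOUND : DNX_QRES_CONTINUE;
}

With such a comparator, a call like dnxQueueRemove(queue, (void **)&key, exampleCompareByXid) unlinks the matching entry and returns its payload through the key pointer.
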
Example #10
int dnxJobListMarkComplete(DnxJobList * pJobList, DnxXID * pXid) {
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   assert(pJobList && pXid);   // parameter validation
   int ret = DNX_ERR_NOTFOUND;
   dnxDebug(4, "dnxJobListMarkComplete: Job [%lu:%lu]", 
        pXid->objSerial, pXid->objSlot);
   unsigned long current = pXid->objSlot;

   DNX_PT_MUTEX_LOCK(&ilist->mut);
   if (dnxEqualXIDs(pXid, &ilist->list[current].xid)) {
      if(ilist->list[current].state == DNX_JOB_RECEIVED) {
         ilist->list[current].state = DNX_JOB_COMPLETE;
         ret = DNX_OK;
      }
   }
   DNX_PT_MUTEX_UNLOCK(&ilist->mut);
   return ret;
}
Example #11
int dnxJobListMarkAck(DnxJobList * pJobList, DnxResult * pRes) {
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   assert(pJobList && pRes);   // parameter validation
   time_t now = time(0);
   int ret = DNX_ERR_NOTFOUND;
   dnxDebug(4, "dnxJobListMarkAck: Job [%lu:%lu] serial (%lu) slot (%lu) latency (%lu) sec.", 
        pRes->xid.objSerial, pRes->xid.objSlot, pRes->xid.objSerial, pRes->xid.objSlot, (now - pRes->timestamp));
   unsigned long current = pRes->xid.objSlot;

   DNX_PT_MUTEX_LOCK(&ilist->mut);
   if (dnxEqualXIDs(&(pRes->xid), &ilist->list[current].xid)) {
      if(ilist->list[current].state == DNX_JOB_PENDING || ilist->list[current].state == DNX_JOB_UNBOUND) {
         ilist->list[current].state = DNX_JOB_INPROGRESS;
         dnxAuditJob(&(ilist->list[current]), "ACK");
         ret = DNX_OK;
      }
   }
   DNX_PT_MUTEX_UNLOCK(&ilist->mut);
   return ret;
}
Example #12
File: dnxNebMain.c  Project: dnsmichi/DNX
/** Timed Event Handler.
 *
 * Nagios calls this routine once each time a timed event needs to execute. 
 * The particular event we care about is the REAPER event. 
 *
 * @param[in] event_type - the event type for which we're being called.
 * @param[in] data - an opaque pointer to nagios event-specific data.
 *
 * @return Zero, but the value is ignored by Nagios in this event.
 */
static int ehTimedEvent(int event_type, void * data)
{
   nebstruct_timed_event_data * ted = (nebstruct_timed_event_data *)data;

   // sanity checks
   if (event_type != NEBCALLBACK_TIMED_EVENT_DATA || ted == 0)
      return ERROR;

   // we only care about REAPER events
   if (ted->event_type != EVENT_CHECK_REAPER)
      return OK;

   dnxDebug(3, "Reaper handler called.");
   
   dnxMoveResultsToNagios();

   return OK;
}
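
Handlers such as ehTimedEvent() and ehProcessData() (Example #14) are normally hooked into Nagios with neb_register_callback() during nebmodule_init(). A sketch of that registration, assuming the standard NEB API and a hypothetical saved module handle; the actual DNX registration code is not included in these examples:

/* Hypothetical registration snippet; myModuleHandle is assumed to hold the
 * handle Nagios passed to nebmodule_init(). */
static void * myModuleHandle;

static int registerDnxEventHandlers(void)
{
   int ret;

   if ((ret = neb_register_callback(NEBCALLBACK_PROCESS_DATA,
         myModuleHandle, 0, ehProcessData)) != OK)
      return ret;

   return neb_register_callback(NEBCALLBACK_TIMED_EVENT_DATA,
         myModuleHandle, 0, ehTimedEvent);
}
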
Example #13
int dnxQueueGet(DnxQueue * queue, void ** ppPayload)
{
   iDnxQueue * iqueue = (iDnxQueue *)queue;
   iDnxQueueEntry * item = 0;
   
   assert(queue && ppPayload);
   
   dnxDebug(8, "dnxQueueGet: iQueue size(%i)", iqueue->size);

   DNX_PT_MUTEX_LOCK(&iqueue->mutex);
   
   if (iqueue->size > 0) 
   {
      // remove the 'head' item from the queue
      item = iqueue->head;
      iqueue->head = item->next;
      if (iqueue->current == item)
         iqueue->current = item->next;

      // adjust tail pointer if queue is now empty
      if (iqueue->head == 0)
         iqueue->tail = 0;
   
      iqueue->size--;
   }
   
   DNX_PT_MUTEX_UNLOCK(&iqueue->mutex);

   // return the payload to the caller, free queue item
   if (item) 
   {
      *ppPayload = item->pPayload;
      xfree(item);
      return DNX_OK;
   }

   return DNX_ERR_NOTFOUND;
}
Example #14
File: dnxNebMain.c  Project: dnsmichi/DNX
/** Process Data Event Handler.
 *
 * @param[in] event_type - the event regarding which we were called by Nagios.
 * @param[in] data - an opaque pointer to an event-specific data structure.
 *
 * @return Zero if all is okay, but we want nagios to handle this event;
 *    non-zero if there's a problem of some sort.
 */
static int ehProcessData(int event_type, void * data)
{
   nebstruct_process_data *procdata = (nebstruct_process_data *)data;

   // validate our event type - ignore wrong event type
   assert(event_type == NEBCALLBACK_PROCESS_DATA);
   if (event_type != NEBCALLBACK_PROCESS_DATA)
      return OK;

   // sanity-check our data structure - should never happen
   assert(procdata);
   if (!procdata)
   {
      dnxLog("Startup handler received NULL process data structure.");
      return ERROR;
   }

   // look for process event loop start event
   if (procdata->type == NEBTYPE_PROCESS_EVENTLOOPSTART)
   {
      dnxDebug(2, "Startup handler received PROCESS_EVENTLOOPSTART event.");

      // execute sync script, if defined
      if (cfg.syncScript)
      {
         dnxLog("Startup handler executing plugin sync script: %s.", cfg.syncScript);

         // NB: This halts Nagios execution until the script exits...
         launchScript(cfg.syncScript);
      }

      // if server init fails, do server shutdown
      if (dnxServerInit() != 0)
         dnxServerDeInit();
   }
   return OK;
}
Example #15
File: dnxNebMain.c  Project: dnsmichi/DNX
/** Service Check Event Handler.
 *
 * @param[in] event_type - the event type for which we're being called.
 * @param[in] data - an opaque pointer to nagios event-specific data.
 *
 * @return Zero if we want Nagios to handle the event;
 *    NEBERROR_CALLBACKOVERRIDE indicates that we want to handle the event
 *    ourselves; any other non-zero value represents an error.
 */
static int ehSvcCheck(int event_type, void * data)
{
   static unsigned long serial = 0; // the number of service checks processed

   nebstruct_service_check_data * svcdata = (nebstruct_service_check_data *)data;
   DnxNodeRequest * pNode;
   DnxJobData * jdp;
   int ret;

   if (event_type != NEBCALLBACK_SERVICE_CHECK_DATA)
      return OK;

   if (svcdata == 0)
   {
      dnxLog("Service handler received NULL service data structure.");
      return ERROR;  // shouldn't happen - internal Nagios error
   }

   if (svcdata->type != NEBTYPE_SERVICECHECK_INITIATE)
      return OK;  // ignore non-initiate service checks

   // check for local execution pattern on command line
   if (cfg.localCheckPattern && regexec(&regEx, svcdata->command_line, 0, 0, 0) == 0)
   {
      dnxDebug(1, "Service will execute locally: %s.", svcdata->command_line);
      return OK;     // tell nagios execute locally
   }

   dnxDebug(3, "ehSvcCheck: Received Job [%lu] at %lu (%lu).",
         serial, (unsigned long)time(0),
         (unsigned long)svcdata->start_time.tv_sec);

   if ((ret = dnxGetNodeRequest(registrar, &pNode)) != DNX_OK)
   {
      dnxDebug(3, "ehSvcCheck: No worker nodes requests available: %s.",dnxErrorString(ret));
      return OK;     // tell nagios execute locally
   }

   // allocate and populate a new job payload object
   if ((jdp = (DnxJobData *)xmalloc(sizeof *jdp)) == 0)
   {
      dnxDebug(1, "ehSvcCheck: Out of memory!");
      return OK;
   }
   memset(jdp, 0, sizeof *jdp);
   jdp->svc = (service *)svcdata->OBJECT_FIELD_NAME;

   assert(jdp->svc);

#if CURRENT_NEB_API_VERSION == 3
   {
      // a nagios 3.x global variable
      extern check_result check_result_info;

      /** @todo patch nagios to pass these values to the event handler. */

      jdp->chkopts    = check_result_info.check_options;
      jdp->schedule   = check_result_info.scheduled_check;
      jdp->reschedule = check_result_info.reschedule_check;
   }
#endif

   if ((ret = dnxPostNewJob(joblist, serial, jdp, svcdata, pNode)) != DNX_OK)
   {
      dnxLog("Unable to post job [%lu]: %s.", serial, dnxErrorString(ret));
      xfree(jdp);
      return OK;     // tell nagios execute locally
   }

   serial++;                           // bump serial number
   return NEBERROR_CALLBACKOVERRIDE;   // tell nagios we want it
}
Example #16
int dnxWlmCreate(DnxWlmCfgData * cfg, DnxWlm ** pwlm)
{
   iDnxWlm * iwlm;
   struct ifaddrs * ifa = NULL;

   assert(cfg && pwlm);
   assert(cfg->poolMin > 0);
   assert(cfg->poolMax >= cfg->poolMin);
   assert(cfg->poolInitial >= cfg->poolMin);
   assert(cfg->poolInitial <= cfg->poolMax);

   // allocate and configure the master thread pool data structure
   if ((iwlm = (iDnxWlm *)xmalloc(sizeof *iwlm)) == 0)
      return DNX_ERR_MEMORY;

   memset(iwlm, 0, sizeof *iwlm);
   iwlm->cfg = *cfg;
   iwlm->cfg.dispatcher = xstrdup(iwlm->cfg.dispatcher);
   iwlm->cfg.collector = xstrdup(iwlm->cfg.collector);
   iwlm->poolsz = iwlm->cfg.poolMax;
   iwlm->pool = (DnxWorkerStatus **)xmalloc(iwlm->poolsz * sizeof *iwlm->pool);
   iwlm->minexectm = (unsigned)(-1);   // the largest possible value
   memset(iwlm->pool, 0, iwlm->poolsz * sizeof *iwlm->pool);

   // cache our (primary?) ip address in binary and string format
   if (getifaddrs(&ifa) == 0)
   {
      u_int setflags = IFF_UP | IFF_RUNNING;
      u_int clrflags = IFF_LOOPBACK;
      struct ifaddrs * ifcur = ifa;

      // locate the first proper AF_INET address in our interface list
      while (ifcur && (ifcur->ifa_addr == 0 
            || ifcur->ifa_addr->sa_family != AF_INET 
            || (ifcur->ifa_flags & setflags) != setflags
            || (ifcur->ifa_flags & clrflags) != 0))
         ifcur = ifcur->ifa_next;

      if (ifcur)
      {
         // cache binary and presentation (string) versions of the ip address
         iwlm->myipaddr = (unsigned long)
               ((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr.s_addr;
         inet_ntop(ifcur->ifa_addr->sa_family,
                &((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr,
                iwlm->myipaddrstr, sizeof iwlm->myipaddrstr);
      }
      freeifaddrs(ifa);
   }
   
   char unset[] = "NULL";
   if(!strnlen(iwlm->myhostname, 1)) //See if the global hostname has been set
   {
      dnxDebug(3, "dnxWlmCreate: Hostname not set in parent thread.");
      char machineName [MAX_HOSTNAME];
      if(strcmp(cfg->hostname, unset)==0)
      {
         dnxDebug(3, "dnxWlmCreate: Hostname undefined in config.");
         // Get our hostname
         if(gethostname(machineName, MAX_HOSTNAME)==0)
         {
            dnxDebug(3, "dnxWlmCreate: Hostname is [%s].", machineName);
            // cache hostname
            strcpy(iwlm->myhostname, machineName);
         } else {
            dnxLog("dnxWlmCreate: Unable to obtain Hostname [%s?],"
               "please set hostname in config.", machineName);
            sprintf( machineName, "localhost");
            strcpy(iwlm->myhostname, machineName);
         }
      } else {
         dnxDebug(3, "dnxWlmCreate: Using hostname in config [%s].", cfg->hostname);
         strcpy(iwlm->myhostname, cfg->hostname);
      }
   } else {
      dnxDebug(3, "dnxWlmCreate: Using cached hostname [%s].", iwlm->myhostname);
      strcpy(iwlm->cfg.hostname, iwlm->myhostname);
   }

   // if any of the above failed, we really can't continue
   if (!iwlm->cfg.dispatcher || !iwlm->cfg.collector || !iwlm->pool)
   {
      xfree(iwlm->cfg.dispatcher);
      xfree(iwlm->cfg.collector);
      xfree(iwlm);
      return DNX_ERR_MEMORY;
   }

   // create initial worker thread pool
   DNX_PT_MUTEX_INIT(&iwlm->mutex);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   {
      int ret;
      if ((ret = growThreadPool(iwlm)) != DNX_OK)
      {
         if (iwlm->threads)
            dnxLog("WLM: Error creating SOME worker threads: %s; "
                  "continuing with smaller initial pool.", dnxErrorString(ret));
         else
         {
            dnxLog("WLM: Unable to create ANY worker threads: %s; "
                  "terminating.", dnxErrorString(ret));
            DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
            DNX_PT_MUTEX_DESTROY(&iwlm->mutex);
            xfree(iwlm);
            return ret;
         }
      }
   }
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   dnxLog("WLM: Started worker thread pool.");

   *pwlm = (DnxWlm *)iwlm;

   return DNX_OK;
}
Example #17
File: dnxTimer.c  Project: dnsmichi/DNX
/** The main timer thread procedure entry point.
 *
 * @param[in] data - an opaque pointer to thread data for the timer thread.
 *    This is actually the dnx server global data object.
 *
 * @return Always returns 0.
 */
static void * dnxTimer(void * data)
{
   iDnxTimer * itimer = (iDnxTimer *)data;
   DnxNewJob ExpiredList[MAX_EXPIRED];
   int i, totalExpired;
   int ret = 0;

   assert(data);

   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);
   pthread_cleanup_push(dnxTimerCleanup, data);

   dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self());

   while (1)
   {
      pthread_testcancel();

      dnxCancelableSleep(itimer->sleepms);

      // search for expired jobs in the pending queue
      totalExpired = MAX_EXPIRED;
      if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, 
            &totalExpired)) == DNX_OK && totalExpired > 0)
      {
         for (i = 0; i < totalExpired; i++)
         {
            char msg[256];
            char addrstr[DNX_MAX_ADDRSTR];
            DnxNewJob * job = &ExpiredList[i];

            dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.",
                  pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd);

            dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT);
            dnxAuditJob(job, "EXPIRE");

            // render the client node address for the timeout message
            dnxNtop(job->pNode->address, addrstr, sizeof addrstr);

//          if (job->ack)
               snprintf(msg, sizeof msg, 
                     "(DNX: Service Check [%lu,%lu] Timed Out - "
                     "Node: %s - Failed to return job response in time allowed)",
                     job->xid.objSerial, job->xid.objSlot, addrstr);
//          else
//             snprintf(msg, sizeof msg, 
//                   "(DNX: Service Check [%lu,%lu] Timed Out - "
//                   "Node: %s - Failed to acknowledge job receipt)",
//                   job->xid.objSerial, job->xid.objSlot, addrstr);

            dnxDebug(2, msg);

            // report the expired job to Nagios
            ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, 
                  time(0) - job->start_time, 1, 0, msg);
            dnxJobCleanup(job);
         }
      }

      if (totalExpired > 0 || ret != DNX_OK)
         dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d  Retcode=%d: %s.",
               pthread_self(), totalExpired, ret, dnxErrorString(ret));
   }

   dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret));

   pthread_cleanup_pop(1);
   return 0;
}
Example #18
File: dnxAgent.c  Project: dnsmichi/DNX
/** The agent thread control procedure.
 *
 * @param[in] data - thread data; not used.
 *
 * @return Always returns a null pointer (zero).
 */
static void * dnxAgentServer(void * data)
{
   int ret;
   DnxMgmtRequest Msg;
   Msg.action = 0;

   dnxLog("DNX Server Agent awaiting commands...");

   while (!s_shutdown)
   {
      memset(Msg.address, '\0', DNX_MAX_ADDRESS);

      // wait up to 2 seconds for a request; process the request, if valid
      if ((ret = dnxWaitForMgmtRequest(s_agent, &Msg, Msg.address, 2)) == DNX_OK)
      {
         DnxMgmtReply Rsp;
         char addrstr[DNX_MAX_ADDRSTR];

         dnxDebug(2, "Received MgmtRequest from %s.", 
               dnxNtop(Msg.address, addrstr, sizeof addrstr));

         // setup some default response values
         Rsp.xid = Msg.xid;
         Rsp.status = DNX_REQ_ACK;
         Rsp.reply = 0;

         // perform the requested action
         if (!strcmp(Msg.action, "RESETSTATS"))
         {
            dnxStatsResetServerStats();
            dnxStatsForEachNode(dnxResetNodeStats, 0);
            Rsp.reply = xstrdup("OK");
         }
         else if (!strncmp(Msg.action, "GETSTATS ", 9))
         {
            if ((Rsp.reply = buildMgmtStatsReply(Msg.action + 9)) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strncmp(Msg.action, "GETNODESTATS ", 13))
         {
            if ((Rsp.reply = buildMgmtNodeStatsReply(Msg.action + 13)) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETNODELIST"))
         {
            if ((Rsp.reply = buildMgmtNodeListReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETCONFIG"))
         {
            if ((Rsp.reply = buildMgmtCfgReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETVERSION"))
         {
            if ((Rsp.reply = versionText()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "HELP"))
         {
            if ((Rsp.reply = buildHelpReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }

         // send response, log response failures
         if ((ret = dnxSendMgmtReply(s_agent, &Rsp, Msg.address)) != 0)
            dnxLog("Agent response failure: %s.", dnxErrorString(ret));

         // free request and reply message buffers
         xfree(Rsp.reply);
         xfree(Msg.action);
      }
      else if (ret != DNX_ERR_TIMEOUT)
         dnxLog("Agent channel failure: %s.", dnxErrorString(ret));
   }

   dnxLog("Agent terminating...");

   return 0;
}
Example #19
/** The main thread routine for a worker thread.
 * 
 * @param[in] data - an opaque pointer to a DnxWorkerStatus structure for this
 *    thread.
 * 
 * @return Always returns 0.
 */
static void * dnxWorker(void * data)
{
   DnxWorkerStatus * ws = (DnxWorkerStatus *)data;
   pthread_t tid = pthread_self();
   int retries = 0;
   iDnxWlm * iwlm;

   assert(data);
   
   iwlm = ws->iwlm;

   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);
   pthread_cleanup_push(dnxWorkerCleanup, data);

   time(&ws->tstart);   // set thread start time (for stats)

   while (!iwlm->terminate)
   {
      DnxNodeRequest msg;
      DnxJob job;
      int ret;
      
      // setup job request message - use thread id and node address in XID
      dnxMakeXID(&msg.xid, DNX_OBJ_WORKER, tid, iwlm->myipaddr);
      msg.reqType = DNX_REQ_REGISTER;
      msg.jobCap = 1;
      msg.ttl = iwlm->cfg.reqTimeout - iwlm->cfg.ttlBackoff;
      msg.hn = iwlm->myhostname;
      // request a job, and then wait for a job to come in...
      if ((ret = dnxSendNodeRequest(ws->dispatch, &msg, 0)) != DNX_OK) {
         dnxLog("Worker[%lx]: Error sending node request: %s.", 
               tid, dnxErrorString(ret));
      } else {
         DNX_PT_MUTEX_LOCK(&iwlm->mutex);
         iwlm->reqsent++;
         DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
      }

      // wait for job, even if request was never sent
      if ((ret = dnxWaitForJob(ws->dispatch, &job, job.address, 
            iwlm->cfg.reqTimeout)) != DNX_OK && ret != DNX_ERR_TIMEOUT) {
         dnxLog("Worker[%lx]: Error receiving job: %s.",
               tid, dnxErrorString(ret));
      }
      
      // Allow thread to be canceled
      pthread_testcancel();

      DNX_PT_MUTEX_LOCK(&iwlm->mutex);
      cleanThreadPool(iwlm); // ensure counts are accurate before using them
      if (ret != DNX_OK)
      {
         // if above pool minimum and exceeded max retries...
         if (iwlm->threads > iwlm->cfg.poolMin 
               && ++retries > iwlm->cfg.maxRetries)
         {
            dnxLog("Worker[%lx]: Exiting - max retries exceeded.", tid);
            DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
            break;
         }
      }
      else
      {
         iwlm->jobsrcvd++;
         iwlm->active++;
//          dnxSendJobAck(ws->collect, &job, &job.address);
//          dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] (T/O %d): %s.", 
//                tid, job.xid.objSerial, job.xid.objSlot, job.timeout, job.cmd);
         
//          DnxAck ack;
//          ack.xid = job.xid;
//          ack.timestamp = job.timestamp;
         
         dnxSendJobAck(ws->collect, &job, 0);
         dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] to channel (%lx) (T/S %lu).", 
               tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timestamp);



         // check pool size before we get too busy -
         // if we're not shutting down and we haven't reached the configured
         // maximum and this is the last thread out, then increase the pool
         if (!iwlm->terminate 
               && iwlm->threads < iwlm->cfg.poolMax
               && iwlm->active == iwlm->threads) // Maybe more aggressive here
            growThreadPool(iwlm);
      }
      DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

      // if we have a job, execute it and reset retry count
      if (ret == DNX_OK)
      {
         char resData[MAX_RESULT_DATA + 1];
         DnxResult result;
         time_t jobstart;


         dnxDebug(3, "Worker[%lx]: Received job [%lu:%lu] from (%lx) (T/O %d): %s.", 
               tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timeout, job.cmd);
               
               
         
         
         // prepare result structure
         result.xid = job.xid;               // result xid must match job xid
         result.state = DNX_JOB_COMPLETE;    // complete or expired
         result.delta = 0;
         result.resCode = DNX_PLUGIN_RESULT_OK;
         result.resData = 0;

         /** @todo Allocate result data buffer based on configured buffer size. */

         // we want to be able to cancel threads while they're out on a task
         // in order to obtain timely shutdown for long jobs - move into
         // async cancel mode, but only for the duration of the check
         pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0);

         *resData = 0;
         jobstart = time(0);
         dnxPluginExecute(job.cmd, &result.resCode, resData, sizeof resData - 1, 
               job.timeout, iwlm->cfg.showNodeAddr ? iwlm->myipaddrstr : 0);
         result.delta = time(0) - jobstart;

         pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);

         // store allocated copy of the result string
         if (*resData) result.resData = xstrdup(resData);

         dnxDebug(3, "Worker[%lx]: Job [%lu:%lu] completed in %lu seconds: %d, %s.",
               tid, job.xid.objSerial, job.xid.objSlot, result.delta, 
               result.resCode, result.resData);

//          if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) {
//             dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.",
//                   tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret));
//          }
         

         // send the results, then wait (with retries) for an Ack
         DnxJob ack;
         int trys = 1;
         while(trys < 4) {
            if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) {
               dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret));
               break;
            }
            // Now wait for our Ack
            if ((ret = dnxWaitForAck(ws->dispatch, &ack, job.address, 3)) != DNX_OK && ret != DNX_ERR_TIMEOUT) {
               dnxDebug(3, "Worker[%lx]: Error receiving Ack for job [%lu:%lu]: %s. Retry (%i).",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys);
            } else if (ret == DNX_ERR_TIMEOUT) {
               // we didn't get our Ack
               trys++;
            } else {
               // We got our Ack
               dnxDebug(3, "Worker[%lx]: Ack Received for job [%lu:%lu]: %s. After (%i) try(s).",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys);
               break;
            }
         }


         xfree(result.resData);
 
         // update all statistics
         DNX_PT_MUTEX_LOCK(&iwlm->mutex);
         {
            // track status
            if (result.resCode == DNX_PLUGIN_RESULT_OK) 
               iwlm->jobsok++;
            else 
               iwlm->jobsfail++;

            // track min/max/avg execution time
            if (result.delta > iwlm->maxexectm)
               iwlm->maxexectm = result.delta;
            if (result.delta < iwlm->minexectm)
               iwlm->minexectm = result.delta;
            iwlm->avgexectm = (iwlm->avgexectm + result.delta) / 2;

            // total job processing time
            iwlm->jobtm += (unsigned)result.delta;
            iwlm->active--;   // reduce active count
         }
         DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

         ws->serial++;     // increment job serial number for next job
         retries = 0;
      }
   }
   pthread_cleanup_pop(1);
   return 0;
}
Example #20
int dnxJobListDispatch(DnxJobList * pJobList, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   int ret = DNX_OK; //DNX_ERR_TIMEOUT;
   struct timeval now;
   struct timespec timeout;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);


   // start at current head
   current = ilist->head;

   dnxDebug(6, "dnxJobListDispatch: BEFORE: Head=%lu, Tail=%lu, Queue=%lu.", 
       ilist->head, ilist->tail, ilist->size);

   while (1) {
 
      switch (ilist->list[current].state) {
         case DNX_JOB_INPROGRESS:
            dnxDebug(8, "dnxJobListDispatch: In Progress Item in slot:(%lu) head:(%lu) tail:(%lu).", 
               current, ilist->head, ilist->tail);
            break;
         case DNX_JOB_NULL:
            dnxDebug(8, "dnxJobListDispatch: Null Item in slot:(%lu) head:(%lu) tail:(%lu).", 
               current, ilist->head, ilist->tail);
            break;
         case DNX_JOB_EXPIRED:
            dnxDebug(8, "dnxJobListDispatch: Expired Item in slot:(%lu) head:(%lu) tail:(%lu).", 
               current, ilist->head, ilist->tail);
            break;
         case DNX_JOB_UNBOUND:
            dnxDebug(8, "dnxJobListDispatch: Unbound Item in slot:(%lu) head:(%lu) tail:(%lu).", 
               current, ilist->head, ilist->tail);
            break;
         case DNX_JOB_PENDING:
            gettimeofday(&now, 0);

            // Check to see if we have recently dispatched this
            if((ilist->list[current].pNode)->retry > now.tv_sec) {
               dnxDebug(5, "dnxJobListDispatch: Pending job [%lu:%lu] waiting for Ack, resend in (%i) sec.",
                  ilist->list[current].xid.objSerial, ilist->list[current].xid.objSlot, ((ilist->list[current].pNode)->retry - now.tv_sec));
               break;
            } else {
                if((ilist->list[current].pNode)->retry) {
                  // Make sure the dnxClient service offer is still fresh
                  if((ilist->list[current].pNode)->expires < now.tv_sec) {
                     dnxDebug(4, "dnxJobListDispatch: Pending job [%lu:%lu] waiting for Ack, client node expired. Resubmitting.",
                     ilist->list[current].xid.objSerial, ilist->list[current].xid.objSlot);
                     ilist->list[current].state = DNX_JOB_UNBOUND;
                     
                     // reset the node?
                     // It's likely that the same client will be servicing us
                     // or that the job might come back in the mean time, so we
                     // should keep this node as long as possible
                     // We just need to make sure that the Affinity is correct and that 
                     // it's only used to find a new node, so if we get as far as 
                     // resubmitting, we will have a valid node anyway
                     
                     // If the original job comes back, the acks will get all messed up
                     // not sure how to deal with that other than to just be graceful
                     // about receiving lots of results...
                   
                     
//                      dnxDeleteNodeReq(ilist->list[current].pNode);
//                      DnxNodeRequest * pNode = dnxCreateNodeReq();
                     ilist->list[current].pNode->flags = *(dnxGetAffinity(ilist->list[current].host_name));
//                      ilist->list[current].pNode->hn = xstrdup(ilist->list[current].host_name);
//                      ilist->list[current].pNode->addr = NULL;

                     // We should leave the address alone so we don't segfault if results come in late
                     // but should we reset these? 
//                      ilist->list[current].pNode->xid.objSlot = -1;
//                      ilist->list[current].pNode->xid.objSerial = ilist->list[current].xid.objSerial;
//                      ilist->list[current].pNode = pNode;
                  }
                  break;                  
               } else {
                  // This is a new job, so dispatch it
                  dnxDebug(4, "dnxJobListDispatch: Dispatching new job [%lu:%lu] waiting for Ack",
                     ilist->list[current].xid.objSerial, ilist->list[current].xid.objSlot);
               }
            }
            
            // set our retry interval. This should be fairly forgiving in case
            // we just missed the Ack but the client actually received the job
            // and is already returning our results.
            (ilist->list[current].pNode)->retry = now.tv_sec + 5; 
            
         
            // make a copy for the Dispatcher to send to client
            memcpy(pJob, &ilist->list[current], sizeof *pJob);
            
            // release the mutex
            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return ret;
         case DNX_JOB_COMPLETE:
         case DNX_JOB_RECEIVED:
            // This is a job that we have received the response and we need to send an ack to
            // the client to let it know we got it
            if(ilist->list[current].ack) {
               // Only send a single Ack
               break;
            }
            // make a copy for the Dispatcher to send an Ack to the client
            memcpy(pJob, &ilist->list[current], sizeof *pJob);
            
            dnxDebug(4, "dnxJobListDispatch: Received job [%lu:%lu] sending Ack.",
               ilist->list[current].xid.objSerial, ilist->list[current].xid.objSlot);
            
            // release the mutex
            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return ret;
      }

      if (current == ilist->tail) {
         // if we are at the end of the queue
         gettimeofday(&now, 0);
         timeout.tv_sec = now.tv_sec + DNX_JOBLIST_TIMEOUT;
         timeout.tv_nsec = now.tv_usec * 1000;
         if ((ret = pthread_cond_timedwait(&ilist->cond, &ilist->mut, &timeout)) == ETIMEDOUT) {
            // We waited for the time out period and no new jobs arrived. So give control back to caller.
            dnxDebug(5, "dnxJobListDispatch: Reached end of dispatch queue. Thread timer returned.");      
            DNX_PT_MUTEX_UNLOCK(&ilist->mut);
            return ret;
         } else {
            // We were signaled that there is a new job, so lets move back to the head and get it!
            current = ilist->head;
            dnxDebug(5, "dnxJobListDispatch: Reached end of dispatch queue. A new job arrived.");      
         }
      } else {
         // move to next item in queue
         current = ((current + 1) % ilist->size);
      }
   }
}
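
The end-of-queue wait above uses pthread_cond_timedwait() with an absolute deadline built from gettimeofday(). The same conversion, isolated into a small illustrative helper (not part of DNX):

#include <sys/time.h>
#include <time.h>
#include <pthread.h>

/* Illustrative helper: block on cond/mut for at most 'seconds' from now,
 * using the same timeval-to-timespec deadline as dnxJobListDispatch().
 * Returns 0 if signaled, ETIMEDOUT on timeout, or another error code. */
static int waitWithDeadline(pthread_cond_t * cond, pthread_mutex_t * mut, int seconds)
{
   struct timeval now;
   struct timespec deadline;

   gettimeofday(&now, 0);
   deadline.tv_sec  = now.tv_sec + seconds;
   deadline.tv_nsec = now.tv_usec * 1000;

   return pthread_cond_timedwait(cond, mut, &deadline);
}
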
Example #21
int dnxJobListExpire(DnxJobList * pJobList, DnxNewJob * pExpiredJobs, int * totalJobs) {
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   DnxNewJob * pJob;
   int jobCount = 0;
   time_t now;

   assert(pJobList && pExpiredJobs && totalJobs && *totalJobs > 0);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // get the current time (after we acquire the lock! In case we had to wait)
   now = time(0);

   // walk the entire job list - InProgress and Pending jobs (in that order)
   current = ilist->head;
   int zero_factor = ilist->size - current; // add this value to normalize the index
   dnxDebug(6, "dnxJobListExpire: searching for (%i) expired objects. Head(%lu) Tail(%i)", *totalJobs, ilist->head, ilist->tail);
   int state = 0;
   while(jobCount < *totalJobs) {
      state = (pJob = &ilist->list[current])->state;
      unsigned long dispatch_timeout = now - DNX_DISPATCH_TIMEOUT;

      // only examine jobs that are either awaiting dispatch or results
      switch (state) {
         case DNX_JOB_UNBOUND:
            if(pJob->start_time <= dispatch_timeout) {
               dnxDebug(2, "dnxJobListExpire: Expiring Unbound %s Job [%lu:%lu] count(%i) type(%i) Start Time: (%lu) Now: (%lu) Expire: (%lu)",
                  (pJob->object_check_type ? "Host" : "Service"),  pJob->xid.objSerial, pJob->xid.objSlot, current, state, pJob->start_time, now, dispatch_timeout);               
               // Put the old job in a purgable state   
               pJob->state = DNX_JOB_EXPIRED;
               
               // Add a copy to the expired job list
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));    
            } else {
               // If there is a client associated with it, xid.objSlot != -1
               // then it means we may be getting a result coming back to us
            
               // This job has not expired, try and get a dnxClient for it
               if (dnxGetNodeRequest(dnxGetRegistrar(), &(pJob->pNode)) == DNX_OK) { 
                  // If OK we have successfully dispatched it so update it's expiration
                  dnxDebug(2, "dnxJobListExpire: Dequeueing DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%i) seconds. Dispatch TO:(%i) Now: (%lu) count(%i) type(%i)", 
                     pJob->xid.objSerial, pJob->xid.objSlot, pJob->start_time - dispatch_timeout, dispatch_timeout, now, current, state);
                  pJob->state = DNX_JOB_PENDING;
                  pthread_cond_signal(&ilist->cond);  // signal that a new job is available
               } else {
                  dnxDebug(6, "dnxJobListExpire: Unable to dequeue DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%i) seconds. Dispatch TO:(%i) Now: (%lu) count(%i) type(%i)", 
                     pJob->xid.objSerial, pJob->xid.objSlot, pJob->start_time - dispatch_timeout, dispatch_timeout, now, current, state);
               }
            }
            break;
         case DNX_JOB_PENDING:
         case DNX_JOB_INPROGRESS:
            // check the job's expiration stamp
            if (pJob->expires <= now) { //  
               // This is an expired job, it was sent out, but never came back
               dnxDebug(1, "dnxJobListExpire: Expiring Job [%lu:%lu] count(%i) type(%i) Exp: (%lu) Now: (%lu)",
                  pJob->xid.objSerial, pJob->xid.objSlot, current, state, pJob->expires, now);               
               // Put the old job in a purgable state   
               pJob->state = DNX_JOB_EXPIRED;
               // Add a copy to the expired job list
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            } 
            break;
         case DNX_JOB_COMPLETE:
            // If the Ack hasn't been sent out yet, give it time to complete
            if(! pJob->ack) {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. count(%i) type(%i)", current, state);
               break;
            }
         case DNX_JOB_EXPIRED:
            dnxJobCleanup(pJob);
            dnxDebug(3, "dnxJobListExpire: Nullified Job. count(%i) type(%i)", current, state);
         case DNX_JOB_NULL:
            if(current == ilist->head && current != ilist->tail) {
               ilist->head = ((current + 1) % ilist->size);
               dnxDebug(2, "dnxJobListExpire: Moving head to (%i). count(%i) type(%i)", ilist->head, current, pJob->state);
               // we have an old item at the head of the list, so we need to
               // increment the head. It should never be larger than the tail.
            } else {
               dnxDebug(5, "dnxJobListExpire: Null Job. count(%i) type(%i)", current, pJob->state);
            }
            break;
         case DNX_JOB_RECEIVED:
            if(! pJob->ack) {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. job [%lu:%lu] count(%i) type(%i)", 
                  pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            } else {
               dnxDebug(2, "dnxJobListExpire: Ack sent. job [%lu:%lu] count(%i) type(%i)", 
                  pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            }
            // The Collector thread will set this to DNX_JOB_COMPLETE once it has 
            // replied to Nagios, but we don't advance the head until that happens
            break;
      }

      // bail-out if this was the job list tail
      if (current == ilist->tail) {
         break;
      }
      // increment the job list index
      current = ((current + 1) % ilist->size);
   }
      
   // update the total jobs in the expired job list
   *totalJobs = jobCount;
   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return DNX_OK;
}