Ejemplo n.º 1
0
/** The main timer thread procedure entry point.
 *
 * Repeats the following cycle: sleep for the configured interval, ask the
 * job list for expired jobs, then audit each expired job and post a
 * timed-out result for it. The loop contains no exit of its own, so the
 * statements after it are reached only to keep the cleanup push/pop pair
 * balanced; the registered cleanup handler runs when the thread is canceled.
 *
 * @param[in] data - an opaque pointer to thread data for the timer thread.
 *    This is actually the timer object (it is cast to iDnxTimer).
 *
 * @return Always returns 0.
 */
static void * dnxTimer(void * data)
{
   iDnxTimer * itimer = (iDnxTimer *)data;
   DnxNewJob ExpiredList[MAX_EXPIRED];
   int i, totalExpired;
   int ret = 0;

   assert(data);

   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);
   pthread_cleanup_push(dnxTimerCleanup, data);

   dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self());

   while (1)
   {
      pthread_testcancel();

      dnxCancelableSleep(itimer->sleepms);

      // search for expired jobs in the pending queue; totalExpired is
      // in/out: capacity of ExpiredList on entry, number filled on return
      totalExpired = MAX_EXPIRED;
      if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, 
            &totalExpired)) == DNX_OK && totalExpired > 0)
      {
         for (i = 0; i < totalExpired; i++)
         {
            char msg[256];
            char addrstr[DNX_MAX_ADDRSTR];
            DnxNewJob * job = &ExpiredList[i];

            // The address buffer was previously formatted with "%s" without
            // ever being written. Give it a defined value so the message
            // below never reads indeterminate memory.
            // NOTE(review): fill this from the job's node address once the
            // type of pNode->address is confirmed.
            snprintf(addrstr, sizeof addrstr, "%s", "unknown");

            dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.",
                  pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd);

            // NOTE(review): a job that expired while still unbound may not
            // have a node attached yet - confirm pNode is NULL in that case.
            if (job->pNode)
               dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT);
            dnxAuditJob(job, "EXPIRE");

            // Every expiry is reported as a missed response; the alternate
            // "failed to acknowledge job receipt" message is not in use.
            snprintf(msg, sizeof msg, 
                  "(DNX: Service Check [%lu,%lu] Timed Out - "
                  "Node: %s - Failed to return job response in time allowed)",
                  job->xid.objSerial, job->xid.objSlot, addrstr);

            // never pass a runtime-built string as the format itself
            dnxDebug(2, "%s", msg);

            // report the expired job to Nagios
            ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, 
                  time(0) - job->start_time, 1, 0, msg);
            dnxJobCleanup(job);
         }
      }

      if (totalExpired > 0 || ret != DNX_OK)
         dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d  Retcode=%d: %s.",
               pthread_self(), totalExpired, ret, dnxErrorString(ret));
   }

   dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret));

   pthread_cleanup_pop(1);
   return 0;
}
Ejemplo n.º 2
0
/** Collect expired jobs from the job list and advance stale slots.
 *
 * Walks the circular job list from head to tail while holding the list
 * mutex. For each slot, depending on its state:
 *  - unbound jobs older than the dispatch timeout are marked expired and
 *    copied out; younger ones are offered a node request and, on success,
 *    moved to the pending state (signalling the list condition);
 *  - pending / in-progress jobs whose expiration stamp has passed are
 *    marked expired and copied out;
 *  - complete (and acknowledged) or already-expired jobs are cleaned up;
 *  - cleaned-up and null slots at the head let the head index advance.
 *
 * Copies placed in the output array are shallow (memcpy) copies of the
 * list entries.
 * NOTE(review): the list entry is later passed to dnxJobCleanup as well;
 * confirm that cleaning both the copy and the original is safe.
 *
 * @param[in] pJobList - the job list to scan.
 * @param[out] pExpiredJobs - array that receives copies of expired jobs.
 * @param[in,out] totalJobs - on entry, the capacity of @p pExpiredJobs
 *    (must be > 0); on exit, the number of jobs copied into it.
 *
 * @return Always returns DNX_OK.
 */
int dnxJobListExpire(DnxJobList * pJobList, DnxNewJob * pExpiredJobs, int * totalJobs) {
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   unsigned long dispatch_timeout;
   DnxNewJob * pJob;
   int jobCount = 0;
   int state = 0;
   time_t now;

   assert(pJobList && pExpiredJobs && totalJobs && *totalJobs > 0);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // get the current time (after we acquire the lock! In case we had to wait)
   now = time(0);

   // unbound jobs started at or before this instant have waited too long;
   // it does not change during the walk, so compute it once
   dispatch_timeout = now - DNX_DISPATCH_TIMEOUT;

   // walk the entire job list - InProgress and Pending jobs (in that order)
   current = ilist->head;
   dnxDebug(6, "dnxJobListExpire: searching for (%i) expired objects. Head(%lu) Tail(%lu)",
      *totalJobs, (unsigned long)ilist->head, (unsigned long)ilist->tail);

   while (jobCount < *totalJobs) {
      pJob = &ilist->list[current];
      state = pJob->state;

      // only examine jobs that are either awaiting dispatch or results
      switch (state) {
         case DNX_JOB_UNBOUND:
            if (pJob->start_time <= dispatch_timeout) {
               dnxDebug(2, "dnxJobListExpire: Expiring Unbound %s Job [%lu:%lu] count(%lu) type(%i) Start Time: (%lu) Now: (%lu) Expire: (%lu)",
                  (pJob->object_check_type ? "Host" : "Service"), pJob->xid.objSerial, pJob->xid.objSlot,
                  current, state, (unsigned long)pJob->start_time, (unsigned long)now, dispatch_timeout);
               // Put the old job in a purgable state
               pJob->state = DNX_JOB_EXPIRED;

               // Add a copy to the expired job list
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            } else {
               // If there is a client associated with it, xid.objSlot != -1
               // then it means we may be getting a result coming back to us

               // This job has not expired, try and get a dnxClient for it
               if (dnxGetNodeRequest(dnxGetRegistrar(), &(pJob->pNode)) == DNX_OK) {
                  // If OK we have successfully dispatched it so update its expiration
                  dnxDebug(2, "dnxJobListExpire: Dequeueing DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%lu) seconds. Dispatch TO:(%lu) Now: (%lu) count(%lu) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot,
                     (unsigned long)(pJob->start_time - dispatch_timeout), dispatch_timeout,
                     (unsigned long)now, current, state);
                  pJob->state = DNX_JOB_PENDING;
                  pthread_cond_signal(&ilist->cond);  // signal that a new job is available
               } else {
                  dnxDebug(6, "dnxJobListExpire: Unable to dequeue DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%lu) seconds. Dispatch TO:(%lu) Now: (%lu) count(%lu) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot,
                     (unsigned long)(pJob->start_time - dispatch_timeout), dispatch_timeout,
                     (unsigned long)now, current, state);
               }
            }
            break;
         case DNX_JOB_PENDING:
         case DNX_JOB_INPROGRESS:
            // check the job's expiration stamp
            if (pJob->expires <= now) {
               // This is an expired job, it was sent out, but never came back
               dnxDebug(1, "dnxJobListExpire: Expiring Job [%lu:%lu] count(%lu) type(%i) Exp: (%lu) Now: (%lu)",
                  pJob->xid.objSerial, pJob->xid.objSlot, current, state,
                  (unsigned long)pJob->expires, (unsigned long)now);
               // Put the old job in a purgable state
               pJob->state = DNX_JOB_EXPIRED;
               // Add a copy to the expired job list
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            }
            break;
         case DNX_JOB_COMPLETE:
            // If the Ack hasn't been sent out yet, give it time to complete
            if (! pJob->ack) {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. count(%lu) type(%i)", current, state);
               break;
            }
            /* fallthrough - an acknowledged complete job is purged like an expired one */
         case DNX_JOB_EXPIRED:
            dnxJobCleanup(pJob);
            dnxDebug(3, "dnxJobListExpire: Nullified Job. count(%lu) type(%i)", current, state);
            /* fallthrough - a purged slot is handled like a null slot */
         case DNX_JOB_NULL:
            if (current == ilist->head && current != ilist->tail) {
               // we have an old item at the head of the list, so we need to
               // increment the head. It should never be larger than the tail.
               ilist->head = ((current + 1) % ilist->size);
               dnxDebug(2, "dnxJobListExpire: Moving head to (%lu). count(%lu) type(%i)",
                  (unsigned long)ilist->head, current, (int)pJob->state);
            } else {
               dnxDebug(5, "dnxJobListExpire: Null Job. count(%lu) type(%i)", current, (int)pJob->state);
            }
            break;
         case DNX_JOB_RECEIVED:
            if (! pJob->ack) {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. job [%lu:%lu] count(%lu) type(%i)",
                  pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            } else {
               dnxDebug(2, "dnxJobListExpire: Ack sent. job [%lu:%lu] count(%lu) type(%i)",
                  pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            }
            // The Collector thread will set this to DNX_JOB_COMPLETE once it has 
            // replied to Nagios, but we don't advance the head until that happens
            break;
         default:
            // any other state is left untouched
            break;
      }

      // bail-out if this was the job list tail
      if (current == ilist->tail) {
         break;
      }
      // increment the job list index
      current = ((current + 1) % ilist->size);
   }

   // update the total jobs in the expired job list
   *totalJobs = jobCount;
   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return DNX_OK;
}