/** The main timer thread procedure entry point. * * @param[in] data - an opaque pointer to thread data for the timer thread. * This is actually the dnx server global data object. * * @return Always returns 0. */ static void * dnxTimer(void * data) { iDnxTimer * itimer = (iDnxTimer *)data; DnxNewJob ExpiredList[MAX_EXPIRED]; int i, totalExpired; int ret = 0; assert(data); pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxTimerCleanup, data); dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self()); while (1) { pthread_testcancel(); dnxCancelableSleep(itimer->sleepms); // search for expired jobs in the pending queue totalExpired = MAX_EXPIRED; if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, &totalExpired)) == DNX_OK && totalExpired > 0) { for (i = 0; i < totalExpired; i++) { char msg[256]; char addrstr[DNX_MAX_ADDRSTR]; DnxNewJob * job = &ExpiredList[i]; dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.", pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd); dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT); dnxAuditJob(job, "EXPIRE"); // if (job->ack) snprintf(msg, sizeof msg, "(DNX: Service Check [%lu,%lu] Timed Out - " "Node: %s - Failed to return job response in time allowed)", job->xid.objSerial, job->xid.objSlot, addrstr); // else // snprintf(msg, sizeof msg, // "(DNX: Service Check [%lu,%lu] Timed Out - " // "Node: %s - Failed to acknowledge job receipt)", // job->xid.objSerial, job->xid.objSlot, addrstr); dnxDebug(2, msg); // report the expired job to Nagios ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, time(0) - job->start_time, 1, 0, msg); dnxJobCleanup(job); } } if (totalExpired > 0 || ret != DNX_OK) dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d Retcode=%d: %s.", pthread_self(), totalExpired, ret, dnxErrorString(ret)); } dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret)); 
pthread_cleanup_pop(1); return 0; }
/** Scan the job list for expired jobs and perform list housekeeping.
 *
 * With the list mutex held, walks the list from head to tail and, per slot:
 *  - UNBOUND jobs older than the dispatch timeout are marked EXPIRED and
 *    copied out; younger ones are offered a node via dnxGetNodeRequest and,
 *    on success, moved to PENDING and signalled on the list condition.
 *  - PENDING / INPROGRESS jobs whose expiration time has passed are marked
 *    EXPIRED and copied out.
 *  - COMPLETE (once acked) and EXPIRED jobs are cleaned up; such slots, and
 *    NULL slots, let the head advance when they sit at the head.
 *  - RECEIVED jobs are only logged.
 * The walk stops at the tail or once the output array is full.
 *
 * @param[in] pJobList - the job list to scan.
 * @param[out] pExpiredJobs - array receiving shallow copies of expired jobs.
 * @param[in,out] totalJobs - on entry, the capacity of @p pExpiredJobs (must
 *    be greater than zero); on exit, the number of jobs stored in it.
 *
 * @return Always returns DNX_OK.
 */
int dnxJobListExpire(DnxJobList * pJobList, DnxNewJob * pExpiredJobs, int * totalJobs)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   unsigned long dispatch_timeout;
   DnxNewJob * pJob;
   int jobCount = 0;
   int state = 0;
   time_t now;

   assert(pJobList && pExpiredJobs && totalJobs && *totalJobs > 0);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // get the current time (after we acquire the lock! In case we had to wait)
   now = time(0);

   // unbound jobs that started at or before this instant have waited too
   // long for dispatch; it does not change during the walk
   dispatch_timeout = now - DNX_DISPATCH_TIMEOUT;

   // walk the entire job list - InProgress and Pending jobs (in that order)
   current = ilist->head;

   dnxDebug(6, "dnxJobListExpire: searching for (%i) expired objects. Head(%lu) Tail(%lu)",
         *totalJobs, (unsigned long)ilist->head, (unsigned long)ilist->tail);

   while (jobCount < *totalJobs)
   {
      pJob = &ilist->list[current];
      state = pJob->state;

      // only examine jobs that are either awaiting dispatch or results
      switch (state)
      {
         case DNX_JOB_UNBOUND:
            if (pJob->start_time <= dispatch_timeout)
            {
               dnxDebug(2, "dnxJobListExpire: Expiring Unbound %s Job [%lu:%lu] count(%lu) type(%i) Start Time: (%lu) Now: (%lu) Expire: (%lu)",
                     (pJob->object_check_type ? "Host" : "Service"),
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state,
                     (unsigned long)pJob->start_time, (unsigned long)now, dispatch_timeout);

               // Put the old job in a purgable state
               pJob->state = DNX_JOB_EXPIRED;

               // Add a copy to the expired job list
               // NOTE(review): this is a shallow copy, and both the caller's
               // copy and this list entry (on a later pass, via the
               // DNX_JOB_EXPIRED case) are handed to dnxJobCleanup; confirm
               // that cannot release the same resources twice.
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            }
            else
            {
               // If there is a client associated with it, xid.objSlot != -1
               // then it means we may be getting a result coming back to us.
               // This job has not expired, try and get a dnxClient for it
               if (dnxGetNodeRequest(dnxGetRegistrar(), &(pJob->pNode)) == DNX_OK)
               {
                  // If OK we have successfully dispatched it so update its expiration
                  dnxDebug(2, "dnxJobListExpire: Dequeueing DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%ld) seconds. Dispatch TO:(%lu) Now: (%lu) count(%lu) type(%i)",
                        pJob->xid.objSerial, pJob->xid.objSlot,
                        (long)(pJob->start_time - dispatch_timeout),
                        dispatch_timeout, (unsigned long)now, current, state);

                  pJob->state = DNX_JOB_PENDING;
                  pthread_cond_signal(&ilist->cond);  // signal that a new job is available
               }
               else
               {
                  dnxDebug(6, "dnxJobListExpire: Unable to dequeue DNX_JOB_UNBOUND job [%lu:%lu] Expires in (%ld) seconds. Dispatch TO:(%lu) Now: (%lu) count(%lu) type(%i)",
                        pJob->xid.objSerial, pJob->xid.objSlot,
                        (long)(pJob->start_time - dispatch_timeout),
                        dispatch_timeout, (unsigned long)now, current, state);
               }
            }
            break;

         case DNX_JOB_PENDING:
         case DNX_JOB_INPROGRESS:
            // check the job's expiration stamp
            if (pJob->expires <= now)
            {
               // This is an expired job, it was sent out, but never came back
               dnxDebug(1, "dnxJobListExpire: Expiring Job [%lu:%lu] count(%lu) type(%i) Exp: (%lu) Now: (%lu)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state,
                     (unsigned long)pJob->expires, (unsigned long)now);

               // Put the old job in a purgable state
               pJob->state = DNX_JOB_EXPIRED;

               // Add a copy to the expired job list (shallow; see the note in
               // the DNX_JOB_UNBOUND case)
               memcpy(&pExpiredJobs[jobCount++], pJob, sizeof(DnxNewJob));
            }
            break;

         case DNX_JOB_COMPLETE:
            // If the Ack hasn't been sent out yet, give it time to complete
            if (!pJob->ack)
            {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. count(%lu) type(%i)",
                     current, state);
               break;
            }
            /* fallthrough: an acked, completed job is purged like an expired one */

         case DNX_JOB_EXPIRED:
            dnxJobCleanup(pJob);
            dnxDebug(3, "dnxJobListExpire: Nullified Job. count(%lu) type(%i)",
                  current, state);
            /* fallthrough: the slot is now free, so the head may advance */

         case DNX_JOB_NULL:
            if (current == ilist->head && current != ilist->tail)
            {
               // we have an old item at the head of the list, so we need to
               // increment the head. It should never be larger than the tail.
               ilist->head = ((current + 1) % ilist->size);
               dnxDebug(2, "dnxJobListExpire: Moving head to (%lu). count(%lu) type(%i)",
                     (unsigned long)ilist->head, current, (int)pJob->state);
            }
            else
            {
               dnxDebug(5, "dnxJobListExpire: Null Job. count(%lu) type(%i)",
                     current, (int)pJob->state);
            }
            break;

         case DNX_JOB_RECEIVED:
            if (!pJob->ack)
            {
               dnxDebug(3, "dnxJobListExpire: Waiting to send Ack. job [%lu:%lu] count(%lu) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            }
            else
            {
               dnxDebug(2, "dnxJobListExpire: Ack sent. job [%lu:%lu] count(%lu) type(%i)",
                     pJob->xid.objSerial, pJob->xid.objSlot, current, state);
            }
            // The Collector thread will set this to DNX_JOB_COMPLETE once it has
            // replied to Nagios, but we don't advance the head until that happens
            break;

         default:
            // any other state is left untouched
            break;
      }

      // bail-out if this was the job list tail
      if (current == ilist->tail)
         break;

      // increment the job list index
      current = ((current + 1) % ilist->size);
   }

   // update the total jobs in the expired job list
   *totalJobs = jobCount;

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return DNX_OK;
}