/** Post a new job from Nagios to the dnxServer job queue.
 *
 * Builds a DnxNewJob on the stack from the Nagios service check data and
 * passes it to dnxJobListAdd(), which copies the structure into the job list.
 *
 * @param[in] joblist - the job list to which the new job should be posted.
 * @param[in] serial - the serial number of the new job.
 * @param[in] jdp - a pointer to a job data structure.
 * @param[in] ds - a pointer to the nagios job that's being posted.
 * @param[in] pNode - a dnxClient node request structure that is being
 *    posted with this job. The dispatcher thread will send the job to the
 *    associated node.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxPostNewJob(DnxJobList * joblist, unsigned long serial,
      DnxJobData * jdp, nebstruct_service_check_data * ds,
      DnxNodeRequest * pNode)
{
   // Zero every member first: dnxJobListAdd() copies the entire structure
   // into the job list, so any member not explicitly assigned below (for
   // example the ack flag) must not be left as indeterminate stack data.
   DnxNewJob Job = {0};
   int ret;

   assert(ds);
   assert(ds->command_line);

   // fill-in the job structure with the necessary information
   dnxMakeXID(&Job.xid, DNX_OBJ_JOB, serial, 0);
   Job.payload    = jdp;
   Job.cmd        = xstrdup(ds->command_line);
   Job.start_time = ds->start_time.tv_sec;
   Job.timeout    = ds->timeout;
   // the job expires 5 seconds after its own timeout would have elapsed
   Job.expires    = Job.start_time + Job.timeout + 5;
   Job.pNode      = pNode;

   dnxDebug(2, "DnxNebMain: Posting Job [%lu]: %s.", serial, Job.cmd);

   // post to the Job Queue
   if ((ret = dnxJobListAdd(joblist, &Job)) != DNX_OK)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_SLOTS);
      dnxLog("Failed to post Job [%lu]; \"%s\": %d.",
            Job.xid.objSerial, Job.cmd, ret);
      // NOTE(review): Job.cmd was duplicated with xstrdup() above and the
      // job list did not take a copy of the job on this path, so the string
      // appears to be leaked here - release it with the routine that pairs
      // with xstrdup() once that is confirmed.
   }
   else
   {
      dnxStatsInc(0, JOBS_HANDLED);
      // NOTE(review): dnxJobListAdd() also audits "ASSIGN" for the job it
      // accepts - confirm whether both audit records are intended.
      dnxAuditJob(&Job, "ASSIGN");
   }
   return ret;
}
/** Add a new job to the job list.
 *
 * The list is used as a ring: a slot is chosen at (or just after) the
 * current tail, the job is stamped with that slot index and an initial
 * state, and the structure is then copied into the list. All of this
 * happens while holding the list mutex.
 *
 * @param[in] pJobList - the job list to which the job should be added.
 * @param[in,out] pJob - the job to add; its xid.objSlot and state members
 *    are updated before it is copied into the list.
 *
 * @return DNX_OK on success, or DNX_ERR_CAPACITY if no slot is available.
 */
int dnxJobListAdd(DnxJobList * pJobList, DnxNewJob * pJob)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long slot;
   int full = 0;
   int status = DNX_OK;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // Start at the current tail. If that slot is occupied, step to the next
   // one; should that land on the head, the ring is full. One element is
   // always kept empty so that a full ring can be told from an empty one.
   slot = ilist->tail;
   if (ilist->list[slot].state)
   {
      slot = (slot + 1) % ilist->size;
      full = (slot == ilist->head);
   }

   if (full)
   {
      dnxLog("dnxJobListAdd: Out of job slots (max=%lu): %s.",
            ilist->size, pJob->cmd);
      dnxDebug(1, "dnxJobListAdd: Out of job slots (max=%lu): %s.",
            ilist->size, pJob->cmd);
      status = DNX_ERR_CAPACITY;
   }
   else
   {
      // Record the slot in the job's XID so that a returned result can be
      // matched back to its list entry through XID.objSlot.
      pJob->xid.objSlot = slot;

      // A node slot of -1 means no dnxClient request was available; the job
      // is queued anyway as unbound so the timer thread can try to find a
      // dnxClient for it later.
      pJob->state = (pJob->pNode->xid.objSlot == -1)
            ? DNX_JOB_UNBOUND : DNX_JOB_PENDING;

      dnxAuditJob(pJob, "ASSIGN");

      // store the job in its slot and publish the new tail
      memcpy(&ilist->list[slot], pJob, sizeof *pJob);
      ilist->tail = slot;

      dnxDebug(1, "dnxJobListAdd: Job [%lu:%lu]: Head=%lu, Tail=%lu.",
            pJob->xid.objSerial, pJob->xid.objSlot, ilist->head, ilist->tail);

      // only pending jobs are announced to waiters on the condition variable
      if (pJob->state == DNX_JOB_PENDING)
         pthread_cond_signal(&ilist->cond);
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return status;
}
/** Flag a job in the job list as having had its acknowledgement sent.
 *
 * Looks up the list entry addressed by the XID's slot index. If the entry's
 * XID matches and the job is in the DNX_JOB_RECEIVED or DNX_JOB_COMPLETE
 * state, its ack member is set to 1 and a "CONFIRMED" audit record is
 * written.
 *
 * @param[in] pJobList - the job list to search.
 * @param[in] pXid - the XID of the job to mark; objSlot selects the entry.
 *
 * @return DNX_OK if the job was marked, or DNX_ERR_NOTFOUND if the slot is
 *    out of range, the XIDs do not match, or the job is in another state.
 */
int dnxJobListMarkAckSent(DnxJobList * pJobList, DnxXID * pXid)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   int ret = DNX_ERR_NOTFOUND;

   assert(pJobList && pXid);   // parameter validation

   dnxDebug(4, "dnxJobListMarkAckSent: Job [%lu:%lu]",
         pXid->objSerial, pXid->objSlot);

   current = pXid->objSlot;

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // The slot index comes from the caller-supplied XID; reject anything
   // outside the list before using it as an array index.
   if (current < (unsigned long)ilist->size
         && dnxEqualXIDs(pXid, &ilist->list[current].xid))
   {
      if (ilist->list[current].state == DNX_JOB_RECEIVED
            || ilist->list[current].state == DNX_JOB_COMPLETE)
      {
         ilist->list[current].ack = 1;
         dnxAuditJob(&(ilist->list[current]), "CONFIRMED");
         ret = DNX_OK;
      }
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** Mark a queued job as in progress in response to an acknowledgement.
 *
 * Looks up the list entry addressed by the result's XID slot index. If the
 * entry's XID matches and the job is in the DNX_JOB_PENDING or
 * DNX_JOB_UNBOUND state, it is moved to DNX_JOB_INPROGRESS and an "ACK"
 * audit record is written.
 *
 * @param[in] pJobList - the job list to search.
 * @param[in] pRes - the result whose xid identifies the job; its timestamp
 *    is used only for the latency figure in the debug message.
 *
 * @return DNX_OK if the job was marked, or DNX_ERR_NOTFOUND if the slot is
 *    out of range, the XIDs do not match, or the job is in another state.
 */
int dnxJobListMarkAck(DnxJobList * pJobList, DnxResult * pRes)
{
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long current;
   time_t now;
   int ret = DNX_ERR_NOTFOUND;

   assert(pJobList && pRes);   // parameter validation

   now = time(0);

   // the cast makes the latency argument match its %lu conversion
   dnxDebug(4,
         "dnxJobListMarkAck: Job [%lu:%lu] serial (%lu) slot (%lu) latency (%lu) sec.",
         pRes->xid.objSerial, pRes->xid.objSlot,
         pRes->xid.objSerial, pRes->xid.objSlot,
         (unsigned long)(now - pRes->timestamp));

   current = pRes->xid.objSlot;

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // The slot index comes from the supplied result; reject anything outside
   // the list before using it as an array index.
   if (current < (unsigned long)ilist->size
         && dnxEqualXIDs(&(pRes->xid), &ilist->list[current].xid))
   {
      if (ilist->list[current].state == DNX_JOB_PENDING
            || ilist->list[current].state == DNX_JOB_UNBOUND)
      {
         ilist->list[current].state = DNX_JOB_INPROGRESS;
         dnxAuditJob(&(ilist->list[current]), "ACK");
         ret = DNX_OK;
      }
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
/** The main timer thread procedure entry point. * * @param[in] data - an opaque pointer to thread data for the timer thread. * This is actually the dnx server global data object. * * @return Always returns 0. */ static void * dnxTimer(void * data) { iDnxTimer * itimer = (iDnxTimer *)data; DnxNewJob ExpiredList[MAX_EXPIRED]; int i, totalExpired; int ret = 0; assert(data); pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0); pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0); pthread_cleanup_push(dnxTimerCleanup, data); dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self()); while (1) { pthread_testcancel(); dnxCancelableSleep(itimer->sleepms); // search for expired jobs in the pending queue totalExpired = MAX_EXPIRED; if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, &totalExpired)) == DNX_OK && totalExpired > 0) { for (i = 0; i < totalExpired; i++) { char msg[256]; char addrstr[DNX_MAX_ADDRSTR]; DnxNewJob * job = &ExpiredList[i]; dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.", pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd); dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT); dnxAuditJob(job, "EXPIRE"); // if (job->ack) snprintf(msg, sizeof msg, "(DNX: Service Check [%lu,%lu] Timed Out - " "Node: %s - Failed to return job response in time allowed)", job->xid.objSerial, job->xid.objSlot, addrstr); // else // snprintf(msg, sizeof msg, // "(DNX: Service Check [%lu,%lu] Timed Out - " // "Node: %s - Failed to acknowledge job receipt)", // job->xid.objSerial, job->xid.objSlot, addrstr); dnxDebug(2, msg); // report the expired job to Nagios ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, time(0) - job->start_time, 1, 0, msg); dnxJobCleanup(job); } } if (totalExpired > 0 || ret != DNX_OK) dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d Retcode=%d: %s.", pthread_self(), totalExpired, ret, dnxErrorString(ret)); } dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret)); 
pthread_cleanup_pop(1); return 0; }