Exemple #1
0
/** Post a new job from Nagios to the dnxServer job queue.
 *
 * Builds a DnxNewJob on the stack from the Nagios service check data and
 * passes it to dnxJobListAdd. The command line is duplicated with xstrdup,
 * so the job carries its own copy of the string.
 *
 * @param[in] joblist - the job list to which the new job should be posted.
 * @param[in] serial - the serial number of the new job.
 * @param[in] jdp - a pointer to a job data structure.
 * @param[in] ds - a pointer to the nagios job that's being posted.
 * @param[in] pNode - a dnxClient node request structure that is being
 *    posted with this job. The dispatcher thread will send the job to the
 *    associated node.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxPostNewJob(DnxJobList * joblist, unsigned long serial, 
      DnxJobData * jdp, nebstruct_service_check_data * ds, 
      DnxNodeRequest * pNode)
{
   // Zero every member up front: only some fields are assigned below, and
   // the structure is handed on as a whole, so the remaining members must
   // not carry indeterminate stack contents.
   DnxNewJob Job = {0};
   int ret;

   assert(ds);
   assert(ds->command_line);

   // fill-in the job structure with the necessary information
   dnxMakeXID(&Job.xid, DNX_OBJ_JOB, serial, 0);
   Job.payload    = jdp;
   Job.cmd        = xstrdup(ds->command_line);
   Job.start_time = ds->start_time.tv_sec;
   Job.timeout    = ds->timeout;
   // expiration is the check timeout plus a fixed 5 second margin
   Job.expires    = Job.start_time + Job.timeout + 5;
   Job.pNode      = pNode;

   dnxDebug(2, "DnxNebMain: Posting Job [%lu]: %s.", serial, Job.cmd);

   // post to the Job Queue
   if ((ret = dnxJobListAdd(joblist, &Job)) != DNX_OK)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_SLOTS);
      dnxLog("Failed to post Job [%lu]; \"%s\": %d.",
            Job.xid.objSerial, Job.cmd, ret);
      // NOTE(review): Job.cmd is still allocated on this path and nothing
      // visible here releases it - confirm whether the caller frees it or
      // whether it should be released here.
   }
   else
   {
      dnxStatsInc(0, JOBS_HANDLED);
      dnxAuditJob(&Job, "ASSIGN");
   }
   return ret;
}
Exemple #2
0
/** Add a job to the circular job list.
 *
 * The job is copied into the chosen slot and the slot index is stored in
 * the job's XID so that later results can index the list directly.
 *
 * @param[in] pJobList - the job list to add to.
 * @param[in,out] pJob - the job to add; its xid.objSlot and state members
 *    are written before the copy is made.
 *
 * @return DNX_OK on success, or DNX_ERR_CAPACITY if no slot is free.
 */
int dnxJobListAdd(DnxJobList * pJobList, DnxNewJob * pJob) {
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   unsigned long slot;
   int full = 0;
   int ret = DNX_OK;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&ilist->mut);

   // Start at the current tail. If that slot is occupied, step to the next
   // one (wrapping around); landing on the head means the ring is full.
   // One element is always left empty so a full ring can be told apart
   // from an empty one.
   slot = ilist->tail;
   if (ilist->list[slot].state) {
      slot = (slot + 1) % ilist->size;
      full = (slot == ilist->head);
   }

   if (full) {
      dnxLog("dnxJobListAdd: Out of job slots (max=%lu): %s.", 
            ilist->size, pJob->cmd);
      dnxDebug(1, "dnxJobListAdd: Out of job slots (max=%lu): %s.", 
            ilist->size, pJob->cmd);
      ret = DNX_ERR_CAPACITY;
   } else {
      // record the slot in the XID so a returned result can be matched
      // to its list entry through xid.objSlot
      pJob->xid.objSlot = slot;

      // A node slot of -1 means no dnxClient request was available; the
      // job is queued anyway as unbound so the timer thread can try to
      // find a client for it later.
      pJob->state = (pJob->pNode->xid.objSlot == -1)
            ? DNX_JOB_UNBOUND
            : DNX_JOB_PENDING;

      dnxAuditJob(pJob, "ASSIGN");

      // store the job and publish the new tail
      memcpy(&ilist->list[slot], pJob, sizeof *pJob);
      ilist->tail = slot;

      dnxDebug(1, "dnxJobListAdd: Job [%lu:%lu]: Head=%lu, Tail=%lu.", 
            pJob->xid.objSerial, pJob->xid.objSlot, ilist->head, ilist->tail);

      // only pending jobs are announced on the condition variable
      if (pJob->state == DNX_JOB_PENDING) {
         pthread_cond_signal(&ilist->cond);
      }
   }

   DNX_PT_MUTEX_UNLOCK(&ilist->mut);

   return ret;
}
Exemple #3
0
/** Flag a job as having had its acknowledgement sent.
 *
 * Looks up the job by the slot stored in the XID and, if the XID matches
 * and the job is in the RECEIVED or COMPLETE state, sets its ack flag.
 *
 * @param[in] pJobList - the job list to search.
 * @param[in] pXid - the XID of the job; objSlot selects the list entry.
 *
 * @return DNX_OK if the job was flagged; DNX_ERR_NOTFOUND if the slot is
 *    out of range, the XID does not match, or the job is in another state.
 */
int dnxJobListMarkAckSent(DnxJobList * pJobList, DnxXID * pXid) {
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   assert(pJobList && pXid);   // parameter validation
   int ret = DNX_ERR_NOTFOUND;
   dnxDebug(4, "dnxJobListMarkAckSent: Job [%lu:%lu]", 
        pXid->objSerial, pXid->objSlot);
   unsigned long current = pXid->objSlot;

   DNX_PT_MUTEX_LOCK(&ilist->mut);
   // The slot index comes straight from the supplied XID, so it must be
   // range-checked before it is used to index the list.
   if (current < ilist->size && dnxEqualXIDs(pXid, &ilist->list[current].xid)) {
      if(ilist->list[current].state == DNX_JOB_RECEIVED || ilist->list[current].state == DNX_JOB_COMPLETE) {
         ilist->list[current].ack = 1;
         dnxAuditJob(&(ilist->list[current]), "CONFIRMED");
         ret = DNX_OK;
      }
   }
   DNX_PT_MUTEX_UNLOCK(&ilist->mut);
   return ret;
}
Exemple #4
0
/** Move a job to the in-progress state when its acknowledgement arrives.
 *
 * Looks up the job by the slot stored in the result's XID and, if the XID
 * matches and the job is PENDING or UNBOUND, marks it INPROGRESS.
 *
 * @param[in] pJobList - the job list to search.
 * @param[in] pRes - the result carrying the job's XID and a timestamp used
 *    only for the latency figure in the debug message.
 *
 * @return DNX_OK if the job was updated; DNX_ERR_NOTFOUND if the slot is
 *    out of range, the XID does not match, or the job is in another state.
 */
int dnxJobListMarkAck(DnxJobList * pJobList, DnxResult * pRes) {
   iDnxJobList * ilist = (iDnxJobList *)pJobList;
   assert(pJobList && pRes);   // parameter validation
   time_t now = time(0);
   int ret = DNX_ERR_NOTFOUND;
   // the latency difference is cast so the argument matches the %lu specifier
   dnxDebug(4, "dnxJobListMarkAck: Job [%lu:%lu] serial (%lu) slot (%lu) latency (%lu) sec.", 
        pRes->xid.objSerial, pRes->xid.objSlot, pRes->xid.objSerial, pRes->xid.objSlot,
        (unsigned long)(now - pRes->timestamp));
   unsigned long current = pRes->xid.objSlot;

   DNX_PT_MUTEX_LOCK(&ilist->mut);
   // The slot index comes straight from the supplied result, so it must be
   // range-checked before it is used to index the list.
   if (current < ilist->size && dnxEqualXIDs(&(pRes->xid), &ilist->list[current].xid)) {
      if(ilist->list[current].state == DNX_JOB_PENDING || ilist->list[current].state == DNX_JOB_UNBOUND) {
         ilist->list[current].state = DNX_JOB_INPROGRESS;
         dnxAuditJob(&(ilist->list[current]), "ACK");
         ret = DNX_OK;
      }
   }
   DNX_PT_MUTEX_UNLOCK(&ilist->mut);
   return ret;
}
Exemple #5
0
/** The main timer thread procedure entry point.
 *
 * Repeatedly sleeps for the configured interval, collects expired jobs from
 * the job list, reports each one to Nagios as timed out, and releases the
 * job's resources. The loop has no exit condition of its own; the thread
 * enables deferred cancellation and tests for it at the top of every pass.
 *
 * @param[in] data - an opaque pointer to thread data for the timer thread.
 *    It is used here as an iDnxTimer object.
 *
 * @return Always returns 0.
 */
static void * dnxTimer(void * data)
{
   iDnxTimer * itimer = (iDnxTimer *)data;
   DnxNewJob ExpiredList[MAX_EXPIRED];
   int i, totalExpired;
   int ret = 0;

   assert(data);

   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);
   pthread_cleanup_push(dnxTimerCleanup, data);

   dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self());

   while (1)
   {
      pthread_testcancel();

      dnxCancelableSleep(itimer->sleepms);

      // search for expired jobs in the pending queue; totalExpired is
      // passed in as the capacity of ExpiredList and read back as a count
      totalExpired = MAX_EXPIRED;
      if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, 
            &totalExpired)) == DNX_OK && totalExpired > 0)
      {
         for (i = 0; i < totalExpired; i++)
         {
            char msg[256];
            char addrstr[DNX_MAX_ADDRSTR];
            DnxNewJob * job = &ExpiredList[i];

            dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.",
                  pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd);

            dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT);
            dnxAuditJob(job, "EXPIRE");

            // NOTE(review): nothing in this loop formats the node address
            // into addrstr, so the message below used to read an
            // uninitialized buffer. A placeholder is used until the address
            // held in job->pNode is rendered here - confirm the intended
            // formatting routine.
            snprintf(addrstr, sizeof addrstr, "%s", "unknown");

            // A separate "Failed to acknowledge job receipt" message keyed
            // on job->ack was disabled here; every expired job currently
            // gets the "failed to return job response" text.
            snprintf(msg, sizeof msg, 
                  "(DNX: Service Check [%lu,%lu] Timed Out - "
                  "Node: %s - Failed to return job response in time allowed)",
                  job->xid.objSerial, job->xid.objSlot, addrstr);

            // never pass a built message as the format string
            dnxDebug(2, "%s", msg);

            // report the expired job to Nagios
            ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, 
                  time(0) - job->start_time, 1, 0, msg);
            dnxJobCleanup(job);
         }
      }

      if (totalExpired > 0 || ret != DNX_OK)
         dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d  Retcode=%d: %s.",
               pthread_self(), totalExpired, ret, dnxErrorString(ret));
   }

   // Not reached by normal control flow (the loop above never breaks), but
   // pthread_cleanup_pop must lexically pair with pthread_cleanup_push.
   dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret));

   pthread_cleanup_pop(1);
   return 0;
}