Exemple #1
0
/** Post a new job from Nagios to the dnxServer job queue.
 *
 * A private copy of the check's command line is made for the job. If the
 * job is accepted by the job list, the list's copy of the job structure
 * carries that pointer from then on. If the job is rejected, the copy is
 * released here so it does not leak. The @p jdp payload is never freed by
 * this routine.
 *
 * @param[in] joblist - the job list to which the new job should be posted.
 * @param[in] serial - the serial number of the new job.
 * @param[in] jdp - a pointer to a job data structure.
 * @param[in] ds - a pointer to the nagios job that's being posted.
 * @param[in] pNode - a dnxClient node request structure that is being
 *    posted with this job. The dispatcher thread will send the job to the
 *    associated node.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxPostNewJob(DnxJobList * joblist, unsigned long serial, 
      DnxJobData * jdp, nebstruct_service_check_data * ds, 
      DnxNodeRequest * pNode)
{
   DnxNewJob Job;
   int ret;

   assert(ds);
   assert(ds->command_line);

   // fill-in the job structure with the necessary information
   dnxMakeXID(&Job.xid, DNX_OBJ_JOB, serial, 0);
   Job.payload    = jdp;
   Job.cmd        = xstrdup(ds->command_line);
   Job.start_time = ds->start_time.tv_sec;
   Job.timeout    = ds->timeout;
   Job.expires    = Job.start_time + Job.timeout + 5;
   Job.pNode      = pNode;

   // never queue (or print) a job whose command line could not be copied
   if (Job.cmd == 0)
   {
      dnxLog("Failed to post Job [%lu]: out of memory.", serial);
      return DNX_ERR_MEMORY;
   }

   dnxDebug(2, "DnxNebMain: Posting Job [%lu]: %s.", serial, Job.cmd);

   // post to the Job Queue
   if ((ret = dnxJobListAdd(joblist, &Job)) != DNX_OK)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_SLOTS);
      dnxLog("Failed to post Job [%lu]; \"%s\": %d.",
            Job.xid.objSerial, Job.cmd, ret);

      // the job list did not take the job, so nobody else owns this copy
      xfree(Job.cmd);
   }
   else
   {
      dnxStatsInc(0, JOBS_HANDLED);
      dnxAuditJob(&Job, "ASSIGN");
   }
   return ret;
}
Exemple #2
0
/** Launches an external command and waits for it to return a status code.
 *
 * The command is run through system(), which blocks until the child
 * completes. The wait status is kept in its own variable so that the value
 * that gets logged is the script's real exit status rather than this
 * routine's own return code.
 *
 * @param[in] script - the command line to be launched.
 *
 * @return Zero (DNX_OK) if the command could be launched, regardless of the
 *    script's own exit status; DNX_ERR_INVALID if system() itself failed.
 */
static int launchScript(char * script)
{
   int status;

   assert(script);

   // exec the script - system waits till child completes
   if ((status = system(script)) == -1)
   {
      dnxLog("Failed to exec script: %s.", strerror(errno));
      return DNX_ERR_INVALID;
   }

   // only a normally-terminated child has a meaningful exit status
   if (WIFEXITED(status))
      dnxLog("Sync script returned %d.", WEXITSTATUS(status));
   else
      dnxLog("Sync script terminated abnormally (wait status 0x%x).",
            (unsigned)status);

   return DNX_OK;
}
Exemple #3
0
/** Initialize worker thread communication resources.
 * 
 * Two channels are registered in the channel map and connected: one for
 * dispatch traffic and one for collector traffic. Each channel name embeds
 * the address of @p ws. On any failure, everything set up so far is torn 
 * down again before returning.
 * 
 * @param[in] ws - a pointer to a worker thread's status data structure.
 * 
 * @return Zero on success, or a non-zero error value.
 */
static int initWorkerComm(DnxWorkerStatus * ws)
{
   char szChanDisp[64];
   char szChanColl[64];
   int ret;

   // create a channel for sending job requests (named after its memory 
   // address); %lx needs an unsigned long argument, so convert explicitly
   snprintf(szChanDisp, sizeof szChanDisp, "Dispatch:%lx", (unsigned long)ws);
   if ((ret = dnxChanMapAdd(szChanDisp, ws->iwlm->cfg.dispatcher)) != DNX_OK)
   {
      dnxLog("WLM: Failed to initialize dispatcher channel: %s.", dnxErrorString(ret));
      return ret;
   }
   if ((ret = dnxConnect(szChanDisp, 1, &ws->dispatch)) != DNX_OK)
   {
      dnxLog("WLM: Failed to open dispatcher channel: %s.", dnxErrorString(ret));
      dnxChanMapDelete(szChanDisp);
      return ret;
   }

   // create a channel for sending job results (named after its memory address)
   snprintf(szChanColl, sizeof szChanColl, "Collect:%lx", (unsigned long)ws);
   if ((ret = dnxChanMapAdd(szChanColl, ws->iwlm->cfg.collector)) != DNX_OK)
   {
      dnxLog("WLM: Failed to initialize collector channel: %s.", dnxErrorString(ret));
      dnxDisconnect(ws->dispatch);
      dnxChanMapDelete(szChanDisp);
      return ret;
   }
   if ((ret = dnxConnect(szChanColl, 1, &ws->collect)) != DNX_OK)
   {
      dnxLog("WLM: Failed to open collector channel: %s.", dnxErrorString(ret));
      dnxChanMapDelete(szChanColl);
      dnxDisconnect(ws->dispatch);
      dnxChanMapDelete(szChanDisp);
      return ret;
   }
   return 0;
}
Exemple #4
0
/** Add a new job to a job list.
 *
 * The whole operation runs with the list mutex held. The candidate slot is
 * the current tail; if that slot is occupied (non-zero state) the next slot
 * in the ring is tried instead, and the list is considered full when that
 * next slot is the head. On success the job is copied into the slot, the
 * list tail is moved to it, and - for jobs in the pending state - the list
 * condition variable is signaled.
 *
 * @param[in] pJobList - the job list to which the job should be added.
 * @param[in,out] pJob - the job to be added; its XID slot and state fields
 *    are filled in before it is copied into the list.
 *
 * @return DNX_OK on success, or DNX_ERR_CAPACITY if no slot is available.
 */
int dnxJobListAdd(DnxJobList * pJobList, DnxNewJob * pJob) {
   iDnxJobList * jl = (iDnxJobList *)pJobList;
   unsigned long slot;
   int full = 0;
   int ret = DNX_OK;

   assert(pJobList && pJob);

   DNX_PT_MUTEX_LOCK(&jl->mut);

   // start at the tail; step to the next ring position only when the tail 
   // slot is in use - landing on the head means there is no room left
   slot = jl->tail;
   if (jl->list[slot].state) {
      slot = (slot + 1) % jl->size;
      full = (slot == jl->head);
   }

   if (full) {
      dnxLog("dnxJobListAdd: Out of job slots (max=%lu): %s.", 
            jl->size, pJob->cmd);
      dnxDebug(1, "dnxJobListAdd: Out of job slots (max=%lu): %s.", 
            jl->size, pJob->cmd);
      ret = DNX_ERR_CAPACITY;
   } else {
      // record the slot index in the job's XID so that a returned result
      // can be mapped straight back to its list entry
      pJob->xid.objSlot = slot;

      // a node request slot of -1 marks a job with no dnxClient request
      // attached yet; it is queued as unbound instead of pending
      pJob->state = (pJob->pNode->xid.objSlot == -1) 
            ? DNX_JOB_UNBOUND : DNX_JOB_PENDING;

      dnxAuditJob(pJob, "ASSIGN");

      // store the job in its slot and make that slot the new tail
      memcpy(&jl->list[slot], pJob, sizeof *pJob);
      jl->tail = slot;

      dnxDebug(1, "dnxJobListAdd: Job [%lu:%lu]: Head=%lu, Tail=%lu.", 
            pJob->xid.objSerial, pJob->xid.objSlot, jl->head, jl->tail);

      // only pending jobs are announced on the condition variable
      if (pJob->state == DNX_JOB_PENDING) {
         pthread_cond_signal(&jl->cond);
      }
   }

   DNX_PT_MUTEX_UNLOCK(&jl->mut);

   return ret;
}
Exemple #5
0
/** Process Data Event Handler.
 *
 * Reacts only to the PROCESS_EVENTLOOPSTART process event: it runs the
 * configured sync script (if any) and then initializes the DNX server,
 * de-initializing it again if initialization fails.
 *
 * @param[in] event_type - the event regarding which we were called by Nagios.
 * @param[in] data - an opaque pointer to an event-specific data structure.
 *
 * @return OK in all cases except a null @p data pointer, for which ERROR
 *    is returned.
 */
static int ehProcessData(int event_type, void * data)
{
   nebstruct_process_data * pd = (nebstruct_process_data *)data;

   // wrong event type: trap in debug builds, ignore in release builds
   assert(event_type == NEBCALLBACK_PROCESS_DATA);
   if (event_type != NEBCALLBACK_PROCESS_DATA)
      return OK;

   // missing event data: trap in debug builds, report in release builds
   assert(pd);
   if (pd == 0)
   {
      dnxLog("Startup handler received NULL process data structure.");
      return ERROR;
   }

   // nothing to do for any process event other than event loop start
   if (pd->type != NEBTYPE_PROCESS_EVENTLOOPSTART)
      return OK;

   dnxDebug(2, "Startup handler received PROCESS_EVENTLOOPSTART event.");

   // the sync script is optional; launchScript blocks until it has exited
   if (cfg.syncScript)
   {
      dnxLog("Startup handler executing plugin sync script: %s.", cfg.syncScript);
      launchScript(cfg.syncScript);
   }

   // a failed server init is followed immediately by a server shutdown
   if (dnxServerInit() != 0)
      dnxServerDeInit();

   return OK;
}
Exemple #6
0
/** Initialize the management agent and start its server thread.
 *
 * Resets the agent's module state, registers and connects the agent
 * channel, and then starts the agent server thread. Each failing step 
 * undoes the steps that succeeded before it.
 *
 * @param[in] agentUrl - the URL registered for the agent channel.
 * @param[in] parser - the configuration parser stored for the agent's use.
 *
 * @return Zero on success, or a non-zero error value; DNX_ERR_THREAD if the
 *    server thread could not be created.
 */
int dnxInitAgent(char * agentUrl, DnxCfgParser * parser)
{
   int rc;

   s_shutdown = 0;
   s_agentTid = 0;
   s_parser = parser;

   // step 1: register the agent channel
   rc = dnxChanMapAdd(s_agentName, agentUrl);
   if (rc != DNX_OK)
   {
      dnxLog("AGENT channel init failed: %s.", dnxErrorString(rc));
      return rc;
   }

   // step 2: open the channel in passive mode
   rc = dnxConnect(s_agentName, DNX_MODE_PASSIVE, &s_agent);
   if (rc != DNX_OK)
   {
      dnxLog("AGENT channel connect failed: %s.", dnxErrorString(rc));
      dnxChanMapDelete(s_agentName);
      return rc;
   }

   // step 3: start the server thread - pthread_create reports an errno value
   rc = pthread_create(&s_agentTid, 0, dnxAgentServer, 0);
   if (rc != 0)
   {
      dnxLog("AGENT server init failed: %s.", strerror(rc));
      dnxDisconnect(s_agent);
      dnxChanMapDelete(s_agentName);
      return DNX_ERR_THREAD;
   }
   return rc;
}
Exemple #7
0
/** Validate a configuration data structure in context.
 *
 * The values in @p vptrs are unpacked by fixed position into a local
 * DnxServerCfg. The channel URLs must be present and the numeric limits
 * must be non-zero. If a local check pattern is given it is compiled into 
 * the regex_t supplied through @p passthru. 
 *
 * When compilation of the pattern fails, the regex_t is NOT passed to
 * regfree(): POSIX leaves its content undefined after a failed regcomp(),
 * so there is nothing valid to release.
 *
 * @param[in] dict - the dictionary used by the DnxCfgParser.
 * @param[in] vptrs - an array of opaque objects (either pointers or values)
 *    to be checked.
 * @param[in] passthru - an opaque pointer passed through from
 *    dnxCfgParserCreate. In this routine, it's the regex_t object into which
 *    we should parse the regular expression if one is given.
 *
 * @return Zero on success, or a non-zero error value. This error value is
 * passed back through dnxCfgParserParse.
 */
static int validateCfg(DnxCfgDict * dict, void ** vptrs, void * passthru)
{
   regex_t * rep = (regex_t *)passthru;
   int err, ret = DNX_ERR_INVALID;
   DnxServerCfg cfg;

   assert(dict && vptrs && passthru);

   // setup data structure so we can use the same functionality we had before
   cfg.agentUrl           = (char *)vptrs[ 0];
   cfg.dispatcherUrl      = (char *)vptrs[ 1];
   cfg.collectorUrl       = (char *)vptrs[ 2];
   cfg.authWorkerNodes    = (char *)vptrs[ 3];
   cfg.maxNodeRequests    = (unsigned)(intptr_t)vptrs[ 4];
   cfg.minServiceSlots    = (unsigned)(intptr_t)vptrs[ 5];
   cfg.expirePollInterval = (unsigned)(intptr_t)vptrs[ 6];
   cfg.localCheckPattern  = (char *)vptrs[ 7];
   cfg.syncScript         = (char *)vptrs[ 8];
   cfg.logFilePath        = (char *)vptrs[ 9];
   cfg.debugFilePath      = (char *)vptrs[10];
   cfg.auditFilePath      = (char *)vptrs[11];
   cfg.debugLevel         = (unsigned)(intptr_t)vptrs[12];

   // validate configuration items in context; the first failing check wins
   if (!cfg.agentUrl)
      dnxLog("config: Missing channelAgent parameter.");
   else if (!cfg.dispatcherUrl)
      dnxLog("config: Missing channelDispatcher parameter.");
   else if (!cfg.collectorUrl)
      dnxLog("config: Missing channelCollector parameter.");
   else if (cfg.maxNodeRequests < 1)
      dnxLog("config: Invalid maxNodeRequests parameter.");
   else if (cfg.minServiceSlots < 1)
      dnxLog("config: Invalid minServiceSlots parameter.");
   else if (cfg.expirePollInterval < 1)
      dnxLog("config: Invalid expirePollInterval parameter.");
   else if (cfg.localCheckPattern && (err = regcomp(rep,
         cfg.localCheckPattern, REG_EXTENDED | REG_NOSUB)) != 0)
   {
      char buffer[128];

      // translate the regcomp error code into readable text for the log
      regerror(err, rep, buffer, sizeof buffer);
      dnxLog("config: Failed to compile localCheckPattern (\"%s\"): %s.",
             cfg.localCheckPattern, buffer);
   }
   else
      ret = 0;

   return ret;
}
Exemple #8
0
/** Convert a socket address into a newly allocated presentation string.
 *
 * AF_INET and AF_INET6 addresses are converted with inet_ntop. Any other
 * address family yields a copy of the string "127.0.0.1". If the buffer
 * cannot be allocated, the routine logs the condition, sleeps for one
 * second and tries again; this is done in a loop so that a long-lasting
 * shortage cannot also exhaust the stack.
 *
 * @param[in] sastr - a pointer to a struct sockaddr, passed as a char 
 *    pointer.
 *
 * @return A heap-allocated string (from xcalloc or xstrdup); presumably the
 *    caller releases it with xfree - confirm against callers. If @p sastr
 *    is null (and assertions are disabled), the result of xstrdup on a
 *    fixed error text is returned.
 */
char *ntop(const char * sastr)
{
    const struct sockaddr * sa = (const struct sockaddr *)sastr;
    char * buf;

    assert(sa);
    if(!sa)
    {
        return xstrdup("DNX Error:  Address Uknown or Corrupt! ");
    }

    for(;;)
    {
        switch(sa->sa_family)
        {
            case AF_INET:
                // one spare byte keeps the zero-filled buffer terminated
                buf = (char *)xcalloc(INET_ADDRSTRLEN +1,sizeof(char));
                if(buf)
                {
                    inet_ntop(AF_INET, &((const struct sockaddr_in *)sa)->sin_addr,
                            buf, INET_ADDRSTRLEN);
                }
            break;

            case AF_INET6:
                buf = (char *)xcalloc(INET6_ADDRSTRLEN +1,sizeof(char));
                if(buf)
                {
                    inet_ntop(AF_INET6, &((const struct sockaddr_in6 *)sa)->sin6_addr,
                            buf, INET6_ADDRSTRLEN);
                }
            break;

            default:
                buf = xstrdup("127.0.0.1");
            break;
        }

        if(buf)
        {
            return buf;
        }

        // allocation failed: back off briefly, then retry the conversion
        dnxLog("ntop: out of memory, sleeping for 1 second before  trying again");
        sleep(1);
    }
}
Exemple #9
0
/** Register a new client node "request for work" request.
 *
 * The incoming message first gets its expiration time computed from the
 * current time and its ttl. The request queue is then searched: if a
 * matching request is found, only that entry's expiration time is
 * refreshed and the caller keeps the message block. Otherwise the message
 * itself is queued and @p ppMsg is set to zero, so that the caller
 * allocates a fresh block for the next request.
 *
 * @param[in] ireg - the registrar on which to register a new client request.
 * @param[in,out] ppMsg - the address of the dnx client request node pointer.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxRegisterNode(iDnxRegistrar * ireg, DnxNodeRequest ** ppMsg)
{
   pthread_t tid = pthread_self();
   DnxNodeRequest * pReq;
   time_t now = time(0);
   int ret = DNX_OK;

   assert(ireg && ppMsg && *ppMsg);

   // stamp the incoming request with its expiration time
   pReq = *ppMsg;
   pReq->expires = now + pReq->ttl;

   dnxStatsInc(pReq->address, REQUESTS_RECEIVED);

   // case 1: already queued - refresh the queued entry, which dnxQueueFind
   // hands back through pReq
   if (dnxQueueFind(ireg->rqueue, (void **)&pReq, dnxCompareNodeReq) == DNX_QRES_FOUND)
   {
      pReq->expires = (*ppMsg)->expires;
      dnxDebug(2,
            "dnxRegistrar[%lx]: Updated req [%lu,%lu] at %u; expires at %u.",
            tid, pReq->xid.objSerial, pReq->xid.objSlot,
            (unsigned)(now % 1000), (unsigned)(pReq->expires % 1000));
      return ret;
   }

   // case 2: not queued yet - try to enqueue the message itself
   ret = dnxQueuePut(ireg->rqueue, *ppMsg);
   if (ret != DNX_OK)
   {
      dnxLog("DNX Registrar: Unable to enqueue node request: %s.",
            dnxErrorString(ret));
      return ret;
   }

   // the queue now holds this message; tell the caller it's gone
   *ppMsg = 0;
   dnxDebug(2,
         "dnxRegistrar[%lx]: Added req [%lu,%lu] at %u; expires at %u.",
         tid, pReq->xid.objSerial, pReq->xid.objSlot,
         (unsigned)(now % 1000), (unsigned)(pReq->expires % 1000));

   return ret;
}
Exemple #10
0
/** Grow the thread pool to the configured number of threads.
 * 
 * This routine calculates an appropriate growth factor. If the current
 * number of threads is less than the requested initial pool size, then the 
 * pool is grown to the initial pool size. If the current number of threads
 * is near the maximum pool size, then only grow to the maximum. Otherwise it 
 * is grown by the configured pool growth value. If the pool is already at
 * (or beyond) the maximum, nothing is added.
 * 
 * Growth stops at the first worker that fails to start; workers started
 * before that point are kept and counted.
 * 
 * @param[in] iwlm - a reference to the work load manager whose thread 
 *    pool size is to be increased.
 * 
 * @return Zero on success (including when there was nothing to add), or 
 *    the non-zero error value returned by the failing worker creation.
 */
static int growThreadPool(iDnxWlm * iwlm)
{
   unsigned i, add, growsz;
   int ret = 0;   // stays zero if the loop below has nothing to do

   // set additional thread count - keep us between the min and the max
   if (iwlm->threads < iwlm->cfg.poolInitial)
      growsz = iwlm->cfg.poolInitial - iwlm->threads;
   else if (iwlm->threads >= iwlm->cfg.poolMax)
      growsz = 0;    // avoids unsigned wrap-around in the subtraction below
   else if (iwlm->threads + iwlm->cfg.poolGrow > iwlm->cfg.poolMax)
      growsz = iwlm->cfg.poolMax - iwlm->threads;
   else
      growsz = iwlm->cfg.poolGrow;

   // fill as many empty slots as we can or need to
   for (i = iwlm->threads, add = growsz; i < iwlm->poolsz && add > 0; i++, add--)
   {
      if ((ret = workerCreate(iwlm, &iwlm->pool[i])) != 0)
         break;
      iwlm->threads++;
      iwlm->tcreated++;
   }

   // growsz - add is the number of workers actually started
   dnxLog("WLM: Increased thread pool by %u.", growsz - add);
   return ret;
}
Exemple #11
0
/** Create a new work load manager and start its initial worker threads.
 *
 * The configuration is copied (with private copies of the dispatcher and
 * collector strings), the worker pool array is allocated with poolMax
 * entries, the first non-loopback, up-and-running AF_INET interface
 * address is cached in binary and string form, the host name is chosen, 
 * and the thread pool is grown for the first time.
 *
 * Every allocation is verified before it is used, and every failure path
 * releases everything this routine allocated.
 *
 * @param[in] cfg - the configuration to be copied into the new manager.
 * @param[out] pwlm - the address of storage for the returned manager; 
 *    written only on success.
 *
 * @return DNX_OK on success; DNX_ERR_MEMORY if an allocation failed; or the
 *    error from growThreadPool if not a single worker thread could be 
 *    started.
 */
int dnxWlmCreate(DnxWlmCfgData * cfg, DnxWlm ** pwlm)
{
   iDnxWlm * iwlm;
   struct ifaddrs * ifa = NULL;

   assert(cfg && pwlm);
   assert(cfg->poolMin > 0);
   assert(cfg->poolMax >= cfg->poolMin);
   assert(cfg->poolInitial >= cfg->poolMin);
   assert(cfg->poolInitial <= cfg->poolMax);

   // allocate and configure the master thread pool data structure
   if ((iwlm = (iDnxWlm *)xmalloc(sizeof *iwlm)) == 0)
      return DNX_ERR_MEMORY;

   memset(iwlm, 0, sizeof *iwlm);
   iwlm->cfg = *cfg;
   iwlm->cfg.dispatcher = xstrdup(iwlm->cfg.dispatcher);
   iwlm->cfg.collector = xstrdup(iwlm->cfg.collector);
   iwlm->poolsz = iwlm->cfg.poolMax;
   iwlm->pool = (DnxWorkerStatus **)xmalloc(iwlm->poolsz * sizeof *iwlm->pool);
   iwlm->minexectm = (unsigned)(-1);   // the largest possible value

   // if any of the above failed, we really can't continue - this has to be
   // checked before the pool array is touched
   if (!iwlm->cfg.dispatcher || !iwlm->cfg.collector || !iwlm->pool)
   {
      xfree(iwlm->cfg.dispatcher);
      xfree(iwlm->cfg.collector);
      xfree(iwlm->pool);
      xfree(iwlm);
      return DNX_ERR_MEMORY;
   }
   memset(iwlm->pool, 0, iwlm->poolsz * sizeof *iwlm->pool);

   // cache our (primary?) ip address in binary and string format
   if (getifaddrs(&ifa) == 0)
   {
      u_int setflags = IFF_UP | IFF_RUNNING;
      u_int clrflags = IFF_LOOPBACK;
      struct ifaddrs * ifcur = ifa;

      // locate the first proper AF_NET address in our interface list
      while (ifcur && (ifcur->ifa_addr == 0 
            || ifcur->ifa_addr->sa_family != AF_INET 
            || (ifcur->ifa_flags & setflags) != setflags
            || (ifcur->ifa_flags & clrflags) != 0))
         ifcur = ifcur->ifa_next;

      if (ifcur)
      {
         // cache binary and presentation (string) versions of the ip address
         iwlm->myipaddr = (unsigned long)
               ((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr.s_addr;
         inet_ntop(ifcur->ifa_addr->sa_family,
                &((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr,
                iwlm->myipaddrstr, sizeof iwlm->myipaddrstr);
      }
      freeifaddrs(ifa);
   }
   
   // choose the host name: the configured one, unless the configuration
   // holds the placeholder "NULL", in which case the system is asked
   char unset[] = "NULL";
   if(!strnlen(iwlm->myhostname, 1)) //See if the global hostname has been set
   {
      dnxDebug(3, "dnxWlmCreate: Hostname not set in parent thread.");
      char machineName [MAX_HOSTNAME];

      // start from an empty string so the buffer is printable in any case
      machineName[0] = 0;
      if(strcmp(cfg->hostname, unset)==0)
      {
         dnxDebug(3, "dnxWlmCreate: Hostname undefined in config.");
         // Get our hostname
         if(gethostname(machineName, MAX_HOSTNAME)==0)
         {
            // gethostname need not terminate a truncated name
            machineName[MAX_HOSTNAME - 1] = 0;
            dnxDebug(3, "dnxWlmCreate: Hostname is [%s].", machineName);
            // cache hostname
            snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s", machineName);
         } else {
            dnxLog("dnxWlmCreate: Unable to obtain Hostname [%s?],"
               "please set hostname in config.", machineName);
            snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "localhost");
         }
      } else {
         dnxDebug(3, "dnxWlmCreate: Using hostname in config [%s].", cfg->hostname);
         snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s", cfg->hostname);
      }
   } else {
      // NOTE(review): iwlm was zero-filled above, so myhostname is always 
      // empty at this point and this branch looks unreachable - confirm
      dnxDebug(3, "dnxWlmCreate: Using cached hostname [%s].", iwlm->myhostname);
      strcpy(iwlm->cfg.hostname, iwlm->myhostname);
   }

   // create initial worker thread pool
   DNX_PT_MUTEX_INIT(&iwlm->mutex);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   {
      int ret;
      if ((ret = growThreadPool(iwlm)) != DNX_OK)
      {
         if (iwlm->threads)
            dnxLog("WLM: Error creating SOME worker threads: %s; "
                  "continuing with smaller initial pool.", dnxErrorString(ret));
         else
         {
            dnxLog("WLM: Unable to create ANY worker threads: %s; "
                  "terminating.", dnxErrorString(ret));
            DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
            DNX_PT_MUTEX_DESTROY(&iwlm->mutex);

            // no worker was started - release everything allocated above
            xfree(iwlm->cfg.dispatcher);
            xfree(iwlm->cfg.collector);
            xfree(iwlm->pool);
            xfree(iwlm);
            return ret;
         }
      }
   }
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   dnxLog("WLM: Started worker thread pool.");

   *pwlm = (DnxWlm *)iwlm;

   return DNX_OK;
}
Exemple #12
0
/** The main thread routine for a worker thread.
 * 
 * Until the work load manager's terminate flag is set, the worker
 * repeatedly: sends a node (job) request on its dispatch channel; waits for
 * a job; acknowledges a received job on its collect channel; executes the
 * job's command; and sends the result, waiting for the result to be
 * acknowledged. A worker leaves the loop early if the pool is above its
 * minimum size and the worker has failed to obtain a job more than the
 * configured number of times in a row.
 * 
 * A result is sent at most three times: every wait for its acknowledgement
 * that does not succeed - whether it timed out or failed for another
 * reason - uses up one attempt.
 * 
 * @param[in] data - an opaque pointer to a DnxWorkerStatus structure for this
 *    thread.
 * 
 * @return Always returns 0.
 */
static void * dnxWorker(void * data)
{
   DnxWorkerStatus * ws = (DnxWorkerStatus *)data;
   pthread_t tid = pthread_self();
   int retries = 0;
   iDnxWlm * iwlm;

   assert(data);
   
   iwlm = ws->iwlm;

   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);
   pthread_cleanup_push(dnxWorkerCleanup, data);

   time(&ws->tstart);   // set thread start time (for stats)

   while (!iwlm->terminate)
   {
      DnxNodeRequest msg;
      DnxJob job;
      int ret;
      
      // setup job request message - use thread id and node address in XID
      dnxMakeXID(&msg.xid, DNX_OBJ_WORKER, tid, iwlm->myipaddr);
      msg.reqType = DNX_REQ_REGISTER;
      msg.jobCap = 1;
      msg.ttl = iwlm->cfg.reqTimeout - iwlm->cfg.ttlBackoff;
      msg.hn = iwlm->myhostname;

      // request a job, and then wait for a job to come in...
      if ((ret = dnxSendNodeRequest(ws->dispatch, &msg, 0)) != DNX_OK) {
         dnxLog("Worker[%lx]: Error sending node request: %s.", 
               tid, dnxErrorString(ret));
      } else {
         DNX_PT_MUTEX_LOCK(&iwlm->mutex);
         iwlm->reqsent++;
         DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
      }

      // wait for job, even if request was never sent
      if ((ret = dnxWaitForJob(ws->dispatch, &job, job.address,iwlm->cfg.reqTimeout)) != DNX_OK && ret != DNX_ERR_TIMEOUT) {
         dnxLog("Worker[%lx]: Error receiving job: %s.",
               tid, dnxErrorString(ret));
      }
      
      // Allow thread to be canceled
      pthread_testcancel();

      DNX_PT_MUTEX_LOCK(&iwlm->mutex);
      cleanThreadPool(iwlm); // ensure counts are accurate before using them
      if (ret != DNX_OK)
      {
         // if above pool minimum and exceeded max retries...
         if (iwlm->threads > iwlm->cfg.poolMin 
               && ++retries > iwlm->cfg.maxRetries)
         {
            dnxLog("Worker[%lx]: Exiting - max retries exceeded.", tid);
            DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
            break;
         }
      }
      else
      {
         iwlm->jobsrcvd++;
         iwlm->active++;

         // let the sender know the job has arrived
         dnxSendJobAck(ws->collect, &job, 0);
         dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] to channel (%lx) (T/S %lu).", 
               tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timestamp);

         // check pool size before we get too busy -
         // if we're not shutting down and we haven't reached the configured
         // maximum and this is the last thread out, then increase the pool
         if (!iwlm->terminate 
               && iwlm->threads < iwlm->cfg.poolMax
               && iwlm->active == iwlm->threads) // Maybe more aggressive here
            growThreadPool(iwlm);
      }
      DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

      // if we have a job, execute it and reset retry count
      if (ret == DNX_OK)
      {
         char resData[MAX_RESULT_DATA + 1];
         DnxResult result;
         time_t jobstart;

         dnxDebug(3, "Worker[%lx]: Received job [%lu:%lu] from (%lx) (T/O %d): %s.", 
               tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timeout, job.cmd);

         // prepare result structure
         result.xid = job.xid;               // result xid must match job xid
         result.state = DNX_JOB_COMPLETE;    // complete or expired
         result.delta = 0;
         result.resCode = DNX_PLUGIN_RESULT_OK;
         result.resData = 0;

         /** @todo Allocate result data buffer based on configured buffer size. */

         // we want to be able to cancel threads while they're out on a task
         // in order to obtain timely shutdown for long jobs - move into
         // async cancel mode, but only for the duration of the check
         pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0);

         *resData = 0;
         jobstart = time(0);
         dnxPluginExecute(job.cmd, &result.resCode, resData, sizeof resData - 1, job.timeout,iwlm->cfg.showNodeAddr? iwlm->myipaddrstr: 0);
         result.delta = time(0) - jobstart;

         pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);

         // store allocated copy of the result string
         if (*resData) result.resData = xstrdup(resData);

         // resData stays null for empty output (or a failed copy); never
         // hand a null pointer to a %s conversion
         dnxDebug(3, "Worker[%lx]: Job [%lu:%lu] completed in %lu seconds: %d, %s.",
               tid, job.xid.objSerial, job.xid.objSlot, result.delta, 
               result.resCode, result.resData? result.resData: "(null)");

         // send the result and wait for it to be acknowledged; resend it
         // whenever the wait fails, up to three sends in total
         DnxJob ack;
         int trys = 1;
         while(trys < 4) {
            if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) {
               dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret));
               break;
            }

            // Now wait for our Ack
            ret = dnxWaitForAck(ws->dispatch, &ack, job.address, 3);
            if (ret == DNX_OK) {
               // We got our Ack
               dnxDebug(3, "Worker[%lx]: Ack Received for job [%lu:%lu]: %s. After (%i) try(s).",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys);
               break;
            }
            if (ret != DNX_ERR_TIMEOUT) {
               dnxDebug(3, "Worker[%lx]: Error receiving Ack for job [%lu:%lu]: %s. Retry (%i).",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys);
            }

            // no Ack - a timeout and a hard error both cost one attempt, 
            // otherwise a persistent error would keep us here forever
            trys++;
         }

         xfree(result.resData);
 
         // update all statistics
         DNX_PT_MUTEX_LOCK(&iwlm->mutex);
         {
            // track status
            if (result.resCode == DNX_PLUGIN_RESULT_OK) 
               iwlm->jobsok++;
            else 
               iwlm->jobsfail++;

            // track min/max/avg execution time
            if (result.delta > iwlm->maxexectm)
               iwlm->maxexectm = result.delta;
            if (result.delta < iwlm->minexectm)
               iwlm->minexectm = result.delta;
            iwlm->avgexectm = (iwlm->avgexectm + result.delta) / 2;

            // total job processing time
            iwlm->jobtm += (unsigned)result.delta;
            iwlm->active--;   // reduce active count
         }
         DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

         ws->serial++;     // increment job serial number for next job
         retries = 0;
      }
   }
   pthread_cleanup_pop(1);
   return 0;
}
Exemple #13
0
/** Log changes between old and new configuration data sets.
 * 
 * One log line is written for each parameter whose value differs between
 * the two sets. Changes to the dispatcher and collector URL's are logged
 * together with a note that they only take effect after a restart. 
 * 
 * String parameters, including the host name, are compared by content.
 * 
 * @param[in] ocp - a reference to the old configuration data set.
 * @param[in] ncp - a reference to the new configuration data set.
 */
static void logConfigChanges(DnxWlmCfgData * ocp, DnxWlmCfgData * ncp)
{
   if (strcmp(ocp->dispatcher, ncp->dispatcher) != 0)
      dnxLog("Config parameter 'channelDispatcher' changed from %s to %s. "
            "NOTE: Changing the dispatcher URL requires a restart.", 
            ocp->dispatcher, ncp->dispatcher);

   if (strcmp(ocp->collector, ncp->collector) != 0)
      dnxLog("Config parameter 'channelCollector' changed from %s to %s. "
            "NOTE: Changing the collector URL requires a restart.", 
            ocp->collector, ncp->collector);

   if (ocp->reqTimeout != ncp->reqTimeout)
      dnxLog("Config parameter 'threadRequestTimeout' changed from %u to %u.", 
            ocp->reqTimeout, ncp->reqTimeout);

   if (ocp->ttlBackoff != ncp->ttlBackoff)
      dnxLog("Config parameter 'threadTtlBackoff' changed from %u to %u.", 
            ocp->ttlBackoff, ncp->ttlBackoff);

   if (ocp->maxRetries != ncp->maxRetries)
      dnxLog("Config parameter 'threadMaxTimeouts' changed from %u to %u.", 
            ocp->maxRetries, ncp->maxRetries);

   if (ocp->poolMin != ncp->poolMin)
      dnxLog("Config parameter 'poolMin' changed from %u to %u.", 
            ocp->poolMin, ncp->poolMin);

   if (ocp->poolInitial != ncp->poolInitial)
      dnxLog("Config parameter 'poolInitial' changed from %u to %u.", 
            ocp->poolInitial, ncp->poolInitial);

   if (ocp->poolMax != ncp->poolMax)
      dnxLog("Config parameter 'poolMax' changed from %u to %u.", 
            ocp->poolMax, ncp->poolMax);

   if (ocp->poolGrow != ncp->poolGrow)
      dnxLog("Config parameter 'poolGrow' changed from %u to %u.", 
            ocp->poolGrow, ncp->poolGrow);

   if (ocp->pollInterval != ncp->pollInterval)
      dnxLog("Config parameter 'wlmPollInterval' changed from %u to %u.", 
            ocp->pollInterval, ncp->pollInterval);

   if (ocp->shutdownGrace != ncp->shutdownGrace)
      dnxLog("Config parameter 'wlmShutdownGracePeriod' changed from %u to %u.", 
            ocp->shutdownGrace, ncp->shutdownGrace);

   if (ocp->maxResults != ncp->maxResults)
      dnxLog("Config parameter 'maxResultBuffer' changed from %u to %u.", 
            ocp->maxResults, ncp->maxResults);

   if (ocp->showNodeAddr != ncp->showNodeAddr)
      dnxLog("Config parameter 'showNodeAddr' changed from %s to %s.", 
            ocp->showNodeAddr? "TRUE" : "FALSE", 
            ncp->showNodeAddr? "TRUE" : "FALSE");

   // compare the host names themselves; comparing the two hostname members
   // with != would only compare where they are stored
   if (strcmp(ocp->hostname, ncp->hostname) != 0)
      dnxLog("Config parameter 'hostname' changed from %s to %s.",
            ocp->hostname, ncp->hostname);
}
Exemple #14
0
/** The agent thread control procedure.
 *
 * @param[in] data - thread data; not used.
 *
 * @return Always returns a null pointer (zero).
 */
static void * dnxAgentServer(void * data)
{
   int ret;
   DnxMgmtRequest Msg;
   Msg.action = 0;

   dnxLog("DNX Server Agent awaiting commands...");

   while (!s_shutdown)
   {
      memset(Msg.address, '\0', DNX_MAX_ADDRESS);

      // wait 2 second for a request; process the request, if valid
      if ((ret = dnxWaitForMgmtRequest(s_agent, &Msg, Msg.address, 2)) == DNX_OK)
      {
         DnxMgmtReply Rsp;
         char addrstr[DNX_MAX_ADDRSTR];

         dnxDebug(2, "Received MgmtRequest from %s.", 
               dnxNtop(Msg.address, addrstr, sizeof addrstr));

         // setup some default response values
         Rsp.xid = Msg.xid;
         Rsp.status = DNX_REQ_ACK;
         Rsp.reply = 0;

         // perform the requested action
         if (!strcmp(Msg.action, "RESETSTATS"))
         {
            dnxStatsResetServerStats();
            dnxStatsForEachNode(dnxResetNodeStats, 0);
            Rsp.reply = xstrdup("OK");
         }
         else if (!strncmp(Msg.action, "GETSTATS ", 9))
         {
            if ((Rsp.reply = buildMgmtStatsReply(Msg.action + 9)) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strncmp(Msg.action, "GETNODESTATS ", 13))
         {
            if ((Rsp.reply = buildMgmtNodeStatsReply(Msg.action + 13)) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETNODELIST"))
         {
            if ((Rsp.reply = buildMgmtNodeListReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETCONFIG"))
         {
            if ((Rsp.reply = buildMgmtCfgReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETVERSION"))
         {
            if ((Rsp.reply = versionText()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "HELP"))
         {
            if ((Rsp.reply = buildHelpReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }

         // send response, log response failures
         if ((ret = dnxSendMgmtReply(s_agent, &Rsp, Msg.address)) != 0)
            dnxLog("Agent response failure: %s.", dnxErrorString(ret));

         // free request and reply message buffers
         xfree(Rsp.reply);
         xfree(Msg.action);
      }
      else if (ret != DNX_ERR_TIMEOUT)
         dnxLog("Agent channel failure: %s.", dnxErrorString(ret));
   }

   dnxLog("Agent terminating...");

   return 0;
}
Exemple #15
0
/** Service Check Event Handler.
 *
 * @param[in] event_type - the event type for which we're being called.
 * @param[in] data - an opaque pointer to nagios event-specific data.
 *
 * @return Zero if we want Nagios to handle the event;
 *    NEBERROR_CALLBACKOVERRIDE indicates that we want to handle the event
 *    ourselves; any other non-zero value represents an error.
 */
static int ehSvcCheck(int event_type, void * data)
{
   static unsigned long serial = 0; // the number of service checks processed

   nebstruct_service_check_data * svcdata = (nebstruct_service_check_data *)data;
   DnxNodeRequest * pNode;
   DnxJobData * jdp;
   int ret;

   if (event_type != NEBCALLBACK_SERVICE_CHECK_DATA)
      return OK;

   if (svcdata == 0)
   {
      dnxLog("Service handler received NULL service data structure.");
      return ERROR;  // shouldn't happen - internal Nagios error
   }

   if (svcdata->type != NEBTYPE_SERVICECHECK_INITIATE)
      return OK;  // ignore non-initiate service checks

   // check for local execution pattern on command line
   if (cfg.localCheckPattern && regexec(&regEx, svcdata->command_line, 0, 0, 0) == 0)
   {
      dnxDebug(1, "Service will execute locally: %s.", svcdata->command_line);
      return OK;     // tell nagios execute locally
   }

   dnxDebug(3, "ehSvcCheck: Received Job [%lu] at %lu (%lu).",
         serial, (unsigned long)time(0),
         (unsigned long)svcdata->start_time.tv_sec);

   if ((ret = dnxGetNodeRequest(registrar, &pNode)) != DNX_OK)
   {
      dnxDebug(3, "ehSvcCheck: No worker nodes requests available: %s.",dnxErrorString(ret));
      return OK;     // tell nagios execute locally
   }

   // allocate and populate a new job payload object
   if ((jdp = (DnxJobData *)xmalloc(sizeof *jdp)) == 0)
   {
      dnxDebug(1, "ehSvcCheck: Out of memory!");
      return OK;
   }
   memset(jdp, 0, sizeof *jdp);
   jdp->svc = (service *)svcdata->OBJECT_FIELD_NAME;

   assert(jdp->svc);

#if CURRENT_NEB_API_VERSION == 3
   {
      // a nagios 3.x global variable
      extern check_result check_result_info;

      /** @todo patch nagios to pass these values to the event handler. */

      jdp->chkopts    = check_result_info.check_options;
      jdp->schedule   = check_result_info.scheduled_check;
      jdp->reschedule = check_result_info.reschedule_check;
   }
#endif

   if ((ret = dnxPostNewJob(joblist, serial, jdp, svcdata, pNode)) != DNX_OK)
   {
      dnxLog("Unable to post job [%lu]: %s.", serial, dnxErrorString(ret));
      xfree(jdp);
      return OK;     // tell nagios execute locally
   }

   serial++;                           // bump serial number
   return NEBERROR_CALLBACKOVERRIDE;   // tell nagios we want it
}
Exemple #16
0
/** The main NEB module initialization routine.
 *
 * This function gets called when the module is loaded by the event broker.
 * It loads the configuration, initializes logging, writes a startup banner
 * to the log, and subscribes to PROCESS_DATA events so that the rest of the
 * initialization can be deferred until Nagios has validated its own
 * configuration and environment.
 *
 * @param[in] flags - module flags - not used
 * @param[in] args - module arguments. These come from the nagios
 *    configuration file, and are passed through to the module as it loads.
 *    A null or empty string selects DNX_DEFAULT_SERVER_CONFIG_FILE.
 * @param[in] handle - our module handle - passed from the OS to nagios as
 *    nagios loaded us.
 *
 * @return OK on success; ERROR if the configuration could not be
 *    initialized or the PROCESS_DATA registration failed.
 */
int nebmodule_init(int flags, char * args, nebmodule * handle)
{
   int ret;

   myHandle = handle;

   // module args string should contain a fully-qualified config file path
   if (!args || !*args)
      args = DNX_DEFAULT_SERVER_CONFIG_FILE;

   if ((ret = initConfig(args)) != 0)
      return ERROR;

   // set configured debug level and syslog log facility code
   dnxLogInit(cfg.logFilePath, cfg.debugFilePath, cfg.auditFilePath,
         &cfg.debugLevel);

   dnxLog("-------- DNX Server Module Version %s Startup --------", VERSION);
   dnxLog("Copyright (c) 2006-2010 Intellectual Reserve. All rights reserved.");
   dnxLog("Configuration file: %s.", args);
   dnxLog("Dispatcher: %s.", cfg.dispatcherUrl);
   dnxLog("Collector: %s.", cfg.collectorUrl);
   dnxLog("Agent: %s.", cfg.agentUrl);
   if (cfg.debugFilePath && cfg.debugLevel != 0)
      dnxLog("Debug logging enabled at level %d to %s.",
            cfg.debugLevel, cfg.debugFilePath);
   if (cfg.auditFilePath)
      dnxLog("Auditing enabled to %s.", cfg.auditFilePath);

   // The debug heap and debug locks are compile-time features, so they are
   //    reported once, independent of the run-time debug logging setting.
#if DEBUG_HEAP
   dnxLog("Debug heap is enabled.");
#endif
#if DEBUG_LOCKS
   dnxLog("Debug locks are enabled.");
#endif

   // subscribe to PROCESS_DATA call-backs in order to defer initialization
   //    until after Nagios validates its configuration and environment.
   if ((ret = neb_register_callback(NEBCALLBACK_PROCESS_DATA,
         myHandle, 0, ehProcessData)) != OK)
   {
      dnxLog("PROCESS_DATA event registration failed: %s.", dnxErrorString(ret));
      releaseConfig();
      return ERROR;
   }
   start_time = time(0);

   dnxLog("-------- DNX Server Module Startup Complete --------");

   return OK;
}
Exemple #17
0
/** The main timer thread procedure entry point.
 *
 * Loops forever: sleeps for the configured interval, asks the job list for
 * expired jobs, and for each expired job bumps the timed-out statistic,
 * writes an audit record, posts a time-out result and cleans up the job.
 * The loop has no exit condition of its own. Deferred cancellation is
 * enabled and dnxTimerCleanup is registered as the cleanup handler.
 *
 * @param[in] data - an opaque pointer to thread data for the timer thread;
 *    it is interpreted as an iDnxTimer object and must not be null.
 *
 * @return Always returns 0.
 */
static void * dnxTimer(void * data)
{
   iDnxTimer * itimer = (iDnxTimer *)data;
   DnxNewJob ExpiredList[MAX_EXPIRED];
   int i, totalExpired;
   int ret = 0;

   assert(data);

   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);
   pthread_cleanup_push(dnxTimerCleanup, data);

   dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self());

   while (1)
   {
      pthread_testcancel();

      dnxCancelableSleep(itimer->sleepms);

      // search for expired jobs in the pending queue; totalExpired carries
      //    the list capacity in and the number of expired jobs out
      totalExpired = MAX_EXPIRED;
      if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList, 
            &totalExpired)) == DNX_OK && totalExpired > 0)
      {
         for (i = 0; i < totalExpired; i++)
         {
            char msg[256];
            char addrstr[DNX_MAX_ADDRSTR];
            DnxNewJob * job = &ExpiredList[i];

            // The node address was never rendered into addrstr, so the
            //    buffer was formatted while still uninitialized. Give it a
            //    defined value.
            // NOTE(review): fill this from job->pNode->address using the
            //    project's address-to-string helper once its signature is
            //    confirmed.
            snprintf(addrstr, sizeof addrstr, "%s", "unknown");

            dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.",
                  pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd);

            dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT);
            dnxAuditJob(job, "EXPIRE");

            // A separate "failed to acknowledge job receipt" message keyed
            //    on job->ack used to be selected here; it is disabled, so
            //    every expiry is reported as a missing job response.
            snprintf(msg, sizeof msg, 
                  "(DNX: Service Check [%lu,%lu] Timed Out - "
                  "Node: %s - Failed to return job response in time allowed)",
                  job->xid.objSerial, job->xid.objSlot, addrstr);

            // msg is built at run time - never use it as the format string
            dnxDebug(2, "%s", msg);

            // report the expired job to Nagios
            ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time, 
                  time(0) - job->start_time, 1, 0, msg);
            dnxJobCleanup(job);
         }
      }

      if (totalExpired > 0 || ret != DNX_OK)
         dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d  Retcode=%d: %s.",
               pthread_self(), totalExpired, ret, dnxErrorString(ret));
   }

   // not reached by normal control flow; pthread_cleanup_pop must still
   //    pair lexically with the pthread_cleanup_push above
   dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret));

   pthread_cleanup_pop(1);
   return 0;
}
Exemple #18
0
/** Initialize the dnxServer.
 *
 * Initializes the channel map, creates the job list, collector, dispatcher
 * and registrar, starts the management agent, and finally registers the
 * Nagios event callbacks. The function returns at the first failing step
 * without undoing the earlier ones; the object globals are cleared on entry
 * so that whatever was created can be identified and backed out afterwards.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxServerInit(void)
{
   int ret, joblistsz;

   // clear globals so we know what to "undo" as we back out
   joblist = 0;
   registrar = 0;
   dispatcher = 0;
   collector = 0;

   if ((ret = dnxChanMapInit(0)) != 0)
   {
      dnxLog("Failed to initialize channel map: %s.", dnxErrorString(ret));
      return ret;
   }

   joblistsz = dnxCalculateJobListSize();

   dnxLog("Allocating %d service request slots in the DNX job list.", joblistsz);

   if ((ret = dnxJobListCreate(joblistsz, &joblist)) != 0)
   {
      dnxLog("Failed to initialize DNX job list with %d slots.", joblistsz);
      return ret;
   }

   // create and configure collector
   if ((ret = dnxCollectorCreate("Collect", cfg.collectorUrl,
               joblist, &collector)) != 0)
      return ret;

   // create and configure dispatcher
   if ((ret = dnxDispatcherCreate("Dispatch", cfg.dispatcherUrl,
               joblist, &dispatcher)) != 0)
      return ret;

   // create worker node registrar
   if ((ret = dnxRegistrarCreate(joblistsz * 2,
               dnxDispatcherGetChannel(dispatcher), &registrar)) != 0)
      return ret;

   // initialize server management agent
   if ((ret = dnxInitAgent(cfg.agentUrl, parser)) != 0)
      return ret;

#if CURRENT_NEB_API_VERSION == 3 && defined(DIRECT_POST)

   // register for timed event to piggy-back on reaper thread
   if ((ret = neb_register_callback(NEBCALLBACK_TIMED_EVENT_DATA,
               myHandle, 0, ehTimedEvent)) != 0)
   {
      dnxLog("TIMED_EVENT_DATA event registration failed: %s.",
            dnxErrorString(ret));
      return ret;
   }
   dnxLog("Registered for TIMEDEVENT_EXECUTE event.");

#endif

   // registration for this event starts everything rolling; if it fails
   //    no service check is ever seen, so report it rather than claiming
   //    success
   if ((ret = neb_register_callback(NEBCALLBACK_SERVICE_CHECK_DATA,
               myHandle, 0, ehSvcCheck)) != 0)
   {
      dnxLog("SERVICE_CHECK_DATA event registration failed: %s.",
            dnxErrorString(ret));
      return ret;
   }

   dnxLog("Registered for SERVICE_CHECK_DATA event.");
   dnxLog("Server initialization completed.");

   return 0;
}