Пример #1
0
/** Allocate a worker status block and launch its worker thread.
 *
 * The status structure is zero-filled, bound to @p iwlm, given its own
 * communication channels and then handed to a newly created thread.
 *
 * @param[in] iwlm - the WLM object whose thread pool is being updated.
 * @param[out] pws - receives the address of the newly allocated and
 *    configured worker status structure; written only on success.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int workerCreate(iDnxWlm * iwlm, DnxWorkerStatus ** pws)
{
   DnxWorkerStatus * worker;
   int rc;

   // obtain storage for the status block and start from a clean slate
   worker = (DnxWorkerStatus *)xmalloc(sizeof *worker);
   if (worker == 0)
      return DNX_ERR_MEMORY;
   memset(worker, 0, sizeof *worker);
   worker->iwlm = iwlm;

   // bring up this worker's communication channels
   rc = initWorkerComm(worker);
   if (rc != 0)
   {
      dnxLog("WLM: Failed to initialize worker comm channels: %s.",dnxErrorString(rc));
      xfree(worker);
      return rc;
   }

   // flag the worker as active before its thread is started
   worker->state = DNX_THREAD_RUNNING;
   rc = pthread_create(&worker->tid, 0, dnxWorker, worker);
   if (rc != 0)
   {
      // thread never started: undo the channels and the allocation
      dnxLog("WLM: Failed to create worker thread: %s.", strerror(rc));
      releaseWorkerComm(worker);
      xfree(worker);
      return DNX_ERR_THREAD;
   }

   *pws = worker;
   return 0;
}
Пример #2
0
/** Create a timer object and start its job-expiration thread.
 *
 * @param[in] joblist - the job list the timer thread will scan; must be
 *    non-null.
 * @param[in] sleeptime - requested pause between scans in milliseconds;
 *    values outside the range 100..300000 are replaced with
 *    DNX_DEF_TIMER_SLEEP.
 * @param[out] ptimer - receives the new timer handle; written only on
 *    success.
 *
 * @return DNX_OK on success, DNX_ERR_MEMORY if the timer object cannot be
 *    allocated, or DNX_ERR_THREAD if the timer thread cannot be created.
 */
int dnxTimerCreate(DnxJobList * joblist, int sleeptime, DnxTimer ** ptimer)
{
   iDnxTimer * itimer;
   int ret;

   assert(joblist && ptimer);

   // don't allow sleep times outside the range 1/10th sec to 5 minutes
   if (sleeptime < 100 || sleeptime > 300000)
      sleeptime = DNX_DEF_TIMER_SLEEP;

   if ((itimer = (iDnxTimer *)xmalloc(sizeof *itimer)) == 0)
      return DNX_ERR_MEMORY;

   // initialize the itimer
   memset(itimer, 0, sizeof *itimer);
   itimer->joblist = joblist;
   itimer->sleepms = sleeptime;

   // create the timer thread
   if ((ret = pthread_create(&itimer->tid, 0, dnxTimer, itimer)) != 0)
   {
      // pthread_create returns an errno-style error number, not a DNX code,
      // so it must be decoded with strerror (as the other creators do)
      dnxLog("Timer thread creation failed: %s.", strerror(ret));
      xfree(itimer);
      return DNX_ERR_THREAD;
   }

   *ptimer = (DnxTimer *)itimer;

   return DNX_OK;
}
Пример #3
0
/** Create a registrar object and start its request-handling thread.
 *
 * @param[in] chan - the channel the registrar thread listens on.
 * @param[in] queuesz - the capacity requested for the node request queue;
 *    must be non-zero.
 * @param[out] preg - receives the new registrar handle; written only on
 *    success.
 *
 * @return DNX_OK on success, DNX_ERR_MEMORY if the object cannot be
 *    allocated, the queue creation error code, or DNX_ERR_THREAD if the
 *    thread cannot be created.
 */
int dnxRegistrarCreate(DnxChannel * chan, unsigned queuesz,
      DnxRegistrar ** preg)
{
   iDnxRegistrar * registrarObj;
   int rc;

   assert(chan && queuesz && preg);

   registrarObj = (iDnxRegistrar *)xmalloc(sizeof *registrarObj);
   if (registrarObj == 0)
      return DNX_ERR_MEMORY;

   memset(registrarObj, 0, sizeof *registrarObj);
   registrarObj->channel = chan;

   // the queue releases its entries with xfree
   rc = dnxQueueCreate(queuesz, xfree, &registrarObj->rqueue);
   if (rc != 0)
   {
      dnxLog("DNX Registrar: Queue creation failed: %s.", dnxErrorString(rc));
      xfree(registrarObj);
      return rc;
   }

   rc = pthread_create(&registrarObj->tid, 0, dnxRegistrar, registrarObj);
   if (rc != 0)
   {
      // no thread was started: tear the queue and the object back down
      dnxLog("DNX Registrar: Thread creation failed: %s.", strerror(rc));
      dnxQueueDestroy(registrarObj->rqueue);
      xfree(registrarObj);
      return DNX_ERR_THREAD;
   }

   *preg = (DnxRegistrar *)registrarObj;
   return DNX_OK;
}
Пример #4
0
/** The main NEB module initialization routine.
 *
 * This function gets called when the module is loaded by the event broker.
 * It loads the configuration, starts logging, prints a startup banner and
 * registers for PROCESS_DATA events; the rest of the initialization is
 * deferred to that event handler.
 *
 * @param[in] flags - module flags - not used
 * @param[in] args - module arguments. These come from the nagios
 *    configuration file, and are passed through to the module as it loads.
 *    A null or empty string selects DNX_DEFAULT_SERVER_CONFIG_FILE.
 * @param[in] handle - our module handle - passed from the OS to nagios as
 *    nagios loaded us.
 *
 * @return OK on success, or ERROR if the configuration cannot be loaded or
 *    the event registration fails.
 */
int nebmodule_init(int flags, char * args, nebmodule * handle)
{
   int ret;

   myHandle = handle;

   // module args string should contain a fully-qualified config file path
   if (!args || !*args)
      args = DNX_DEFAULT_SERVER_CONFIG_FILE;

   if ((ret = initConfig(args)) != 0)
      return ERROR;

   // set configured debug level and syslog log facility code
   dnxLogInit(cfg.logFilePath, cfg.debugFilePath, cfg.auditFilePath,
         &cfg.debugLevel);

   dnxLog("-------- DNX Server Module Version %s Startup --------", VERSION);
   dnxLog("Copyright (c) 2006-2010 Intellectual Reserve. All rights reserved.");
   dnxLog("Configuration file: %s.", args);
   dnxLog("Dispatcher: %s.", cfg.dispatcherUrl);
   dnxLog("Collector: %s.", cfg.collectorUrl);
   dnxLog("Agent: %s.", cfg.agentUrl);
   if (cfg.debugFilePath && cfg.debugLevel != 0)
      dnxLog("Debug logging enabled at level %d to %s.",
            cfg.debugLevel, cfg.debugFilePath);
   if (cfg.auditFilePath)
      dnxLog("Auditing enabled to %s.", cfg.auditFilePath);

   // build-time debug features are reported exactly once, whether or not
   // debug logging is configured (they used to be logged twice when it was)
#if DEBUG_HEAP
   dnxLog("Debug heap is enabled.");
#endif
#if DEBUG_LOCKS
   dnxLog("Debug locks are enabled.");
#endif

   // subscribe to PROCESS_DATA call-backs in order to defer initialization
   //    until after Nagios validates its configuration and environment.
   if ((ret = neb_register_callback(NEBCALLBACK_PROCESS_DATA,
         myHandle, 0, ehProcessData)) != OK)
   {
      dnxLog("PROCESS_DATA event registration failed: %s.", dnxErrorString(ret));
      releaseConfig();
      return ERROR;
   }
   start_time = time(0);

   dnxLog("-------- DNX Server Module Startup Complete --------");

   return OK;
}
Пример #5
0
/** The main thread entry point procedure for the registrar thread.
 *
 * This thread handles all inbound requests in a single-threaded fashion,
 * so we can safely call dnxStatsInc here for new nodes.
 *
 * The loop never exits on its own; the thread enables deferred
 * cancellation and polls for it once per pass.
 *
 * @param[in] data - an opaque pointer to registrar thread data. It is cast
 *    to an iDnxRegistrar pointer (the object passed to pthread_create by
 *    dnxRegistrarCreate).
 *
 * @return Always returns NULL.
 */
static void * dnxRegistrar(void * data)
{
   iDnxRegistrar * ireg = (iDnxRegistrar *)data;
   DnxNodeRequest * pMsg = 0;    // request buffer; null when one must be allocated

   assert(data);

   // allow cancellation, but only at explicit cancellation points
   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);

   dnxLog("DNX Registrar: Awaiting worker node requests...");

   while (1)
   {
      int ret;

      // (re)allocate message block if not consumed in last pass
      if (pMsg == 0 && (pMsg = (DnxNodeRequest *)xmalloc(sizeof *pMsg)) == 0)
      {
         dnxCancelableSleep(10);    // sleep for a while and try again...
         continue;
      }

      // if the thread is cancelled between here and the matching pop, the
      // current message block is released by xfree
      pthread_cleanup_push(xfree, pMsg);

      pthread_testcancel();

      // wait on the registrar socket for a request
      if ((ret = dnxWaitForNodeRequest(ireg->channel, pMsg, pMsg->address,
            DNX_REGISTRAR_REQUEST_TIMEOUT)) == DNX_OK)
      {
         switch (pMsg->reqType)
         {
            case DNX_REQ_REGISTER:
               // passes the address of pMsg so that dnxRegisterNode can
               // null it when it keeps the block (see its contract)
               ret = dnxRegisterNode(ireg, &pMsg);
               break;

            case DNX_REQ_DEREGISTER:
               ret = dnxDeregisterNode(ireg, pMsg);
               break;

            default:
               ret = DNX_ERR_UNSUPPORTED;
         }
      }

      // remove the cleanup handler without running it; pMsg (if still
      // non-null) is reused on the next pass
      pthread_cleanup_pop(0);

      // timeouts are the normal idle case and are not reported
      if (ret != DNX_OK && ret != DNX_ERR_TIMEOUT)
         dnxLog("DNX Registrar: Process node request failed: %s.",
               dnxErrorString(ret));
   }
   return 0;
}
Пример #6
0
/** Initialize worker thread communication resources.
 *
 * Registers and opens two channels for the worker: one built from the
 * configured dispatcher URL and one from the collector URL. Each channel is
 * named after the address of @p ws so names are unique per worker. On any
 * failure, everything set up so far is undone before returning.
 *
 * @param[in] ws - a pointer to a worker thread's status data structure.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int initWorkerComm(DnxWorkerStatus * ws)
{
   char szChanDisp[64];
   char szChanColl[64];
   int ret;

   // create a channel for sending job requests (named after its memory address)
   // %lx requires an unsigned long argument; passing the raw pointer is
   // undefined behaviour, so convert explicitly (the text is unchanged)
   snprintf(szChanDisp, sizeof szChanDisp, "Dispatch:%lx", (unsigned long)ws);
   if ((ret = dnxChanMapAdd(szChanDisp, ws->iwlm->cfg.dispatcher)) != DNX_OK)
   {
      dnxLog("WLM: Failed to initialize dispatcher channel: %s.", dnxErrorString(ret));
      return ret;
   }
   if ((ret = dnxConnect(szChanDisp, 1, &ws->dispatch)) != DNX_OK)
   {
      dnxLog("WLM: Failed to open dispatcher channel: %s.", dnxErrorString(ret));
      dnxChanMapDelete(szChanDisp);
      return ret;
   }

   // create a channel for sending job results (named after its memory address)
   snprintf(szChanColl, sizeof szChanColl, "Collect:%lx", (unsigned long)ws);
   if ((ret = dnxChanMapAdd(szChanColl, ws->iwlm->cfg.collector)) != DNX_OK)
   {
      dnxLog("WLM: Failed to initialize collector channel: %s.", dnxErrorString(ret));
      dnxDisconnect(ws->dispatch);
      dnxChanMapDelete(szChanDisp);
      return ret;
   }
   if ((ret = dnxConnect(szChanColl, 1, &ws->collect)) != DNX_OK)
   {
      dnxLog("WLM: Failed to open collector channel: %s.", dnxErrorString(ret));
      dnxChanMapDelete(szChanColl);
      dnxDisconnect(ws->dispatch);
      dnxChanMapDelete(szChanDisp);
      return ret;
   }
   return 0;
}
Пример #7
0
/** Initialize the management agent: register and open its channel, then
 * start the agent server thread.
 *
 * Each failing step undoes the steps completed before it.
 *
 * @param[in] agentUrl - the URL registered under the agent channel name.
 * @param[in] parser - the configuration parser saved for later use by the
 *    agent.
 *
 * @return Zero on success, the channel error code if the channel cannot be
 *    registered or opened, or DNX_ERR_THREAD if the thread cannot be
 *    created.
 */
int dnxInitAgent(char * agentUrl, DnxCfgParser * parser)
{
   int rc;

   // reset module state
   s_shutdown = 0;
   s_agentTid = 0;
   s_parser = parser;

   rc = dnxChanMapAdd(s_agentName, agentUrl);
   if (rc != DNX_OK)
   {
      dnxLog("AGENT channel init failed: %s.", dnxErrorString(rc));
      return rc;
   }

   rc = dnxConnect(s_agentName, DNX_MODE_PASSIVE, &s_agent);
   if (rc != DNX_OK)
   {
      dnxLog("AGENT channel connect failed: %s.", dnxErrorString(rc));
      dnxChanMapDelete(s_agentName);
      return rc;
   }

   rc = pthread_create(&s_agentTid, 0, dnxAgentServer, 0);
   if (rc != 0)
   {
      // pthread_create reports an errno-style value; map it to a DNX code
      dnxLog("AGENT server init failed: %s.", strerror(rc));
      dnxDisconnect(s_agent);
      dnxChanMapDelete(s_agentName);
      return DNX_ERR_THREAD;
   }

   return rc;
}
Пример #8
0
/** Fetch the next unexpired node request from the registrar's queue.
 *
 * Requests are taken from the queue one at a time; any whose expiration
 * time has passed are counted, freed and skipped. The first request that
 * has not expired is handed to the caller.
 *
 * @param[in] reg - the registrar whose request queue is read.
 * @param[out] ppNode - receives the node request, or NULL when none was
 *    obtained. A returned request is no longer in the queue.
 *
 * @return DNX_OK when a request is returned; otherwise the error code
 *    returned by the queue.
 */
int dnxGetNodeRequest(DnxRegistrar * reg, DnxNodeRequest ** ppNode)
{
   iDnxRegistrar * ireg = (iDnxRegistrar *)reg;
   int ret, discard_count = 0;
   DnxNodeRequest * node = 0;

   assert(reg && ppNode);

   while ((ret = dnxQueueGet(ireg->rqueue, (void **)&node)) == DNX_OK)
   {
      time_t now = time(0);

      // verify that this request's Time-To-Live (TTL) has not expired
      if (node->expires > now)
         break;

      dnxStatsInc(node->address, REQUESTS_EXPIRED);

      // message prefix names this function (it previously said dnxRegisterNode)
      dnxDebug(3, "dnxGetNodeRequest: Expired req [%lu,%lu] at %u; expired at %u.",
            node->xid.objSerial, node->xid.objSlot, (unsigned)(now % 1000),
            (unsigned)(node->expires % 1000));

      discard_count++;

      // the request was removed from the queue, so it is ours to release
      xfree(node);
      node = 0;
   }

   if (discard_count > 0)
      dnxDebug(1, "dnxGetNodeRequest: Discarded %d expired node requests.",
            discard_count);

   if (ret != DNX_OK && ret != DNX_ERR_TIMEOUT)
   {
      dnxStatsInc(0, JOBS_REJECTED_NO_NODES);
      dnxDebug(2, "dnxGetNodeRequest: Unable to fulfill node request: %s.",
            dnxErrorString(ret));
   }

   *ppNode = node;   // return a node or NULL

   return ret;
}
Пример #9
0
/** Record a client node's "request for work" message.
 *
 * The incoming message is stamped with its expiration time. If the queue
 * already holds a matching request, only that request's expiration time is
 * refreshed and the caller keeps its message block. Otherwise the message
 * itself is queued and @p ppMsg is set to zero so the caller allocates a
 * new block for the next request. If queuing fails, the caller keeps the
 * block.
 *
 * @param[in] ireg - the registrar on which to register a new client request.
 * @param[in,out] ppMsg - the address of the dnx client request node pointer.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxRegisterNode(iDnxRegistrar * ireg, DnxNodeRequest ** ppMsg)
{
   pthread_t self = pthread_self();
   time_t stamp = time(0);
   DnxNodeRequest * incoming;
   DnxNodeRequest * entry;
   int rc;

   assert(ireg && ppMsg && *ppMsg);

   // stamp the new message with its expiration time
   incoming = *ppMsg;
   incoming->expires = stamp + incoming->ttl;

   dnxStatsInc(incoming->address, REQUESTS_RECEIVED);

   // search key going in; on a match this refers to the queued request
   entry = incoming;
   if (dnxQueueFind(ireg->rqueue, (void **)&entry, dnxCompareNodeReq) == DNX_QRES_FOUND)
   {
      // already registered: just extend the existing request's lifetime
      entry->expires = incoming->expires;
      dnxDebug(2,
            "dnxRegistrar[%lx]: Updated req [%lu,%lu] at %u; expires at %u.",
            self, entry->xid.objSerial, entry->xid.objSlot,
            (unsigned)(stamp % 1000), (unsigned)(entry->expires % 1000));
      return DNX_OK;
   }

   rc = dnxQueuePut(ireg->rqueue, incoming);
   if (rc != DNX_OK)
   {
      dnxLog("DNX Registrar: Unable to enqueue node request: %s.",
            dnxErrorString(rc));
      return rc;
   }

   *ppMsg = 0;    // the queue owns the message now; caller must reallocate
   dnxDebug(2,
         "dnxRegistrar[%lx]: Added req [%lu,%lu] at %u; expires at %u.",
         self, entry->xid.objSerial, entry->xid.objSlot,
         (unsigned)(stamp % 1000), (unsigned)(entry->expires % 1000));

   return rc;
}
Пример #10
0
/** The main timer thread procedure entry point.
 *
 * Sleeps for the configured interval, then pulls up to MAX_EXPIRED expired
 * jobs from the job list. Each expired job is counted, audited, reported
 * to Nagios as timed out and cleaned up. The loop only ends through thread
 * cancellation, which runs dnxTimerCleanup.
 *
 * @param[in] data - an opaque pointer to thread data for the timer thread.
 *    It is cast to an iDnxTimer pointer (the object passed to
 *    pthread_create by dnxTimerCreate).
 *
 * @return Always returns 0.
 */
static void * dnxTimer(void * data)
{
   iDnxTimer * itimer = (iDnxTimer *)data;
   DnxNewJob ExpiredList[MAX_EXPIRED];
   int i, totalExpired;
   int ret = 0;

   assert(data);

   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);
   pthread_cleanup_push(dnxTimerCleanup, data);

   dnxLog("dnxTimer[%lx]: Watching for expired jobs...", pthread_self());

   while (1)
   {
      pthread_testcancel();

      dnxCancelableSleep(itimer->sleepms);

      // search for expired jobs in the pending queue
      totalExpired = MAX_EXPIRED;   // in: list capacity; out: entries filled
      if ((ret = dnxJobListExpire(itimer->joblist, ExpiredList,
            &totalExpired)) == DNX_OK && totalExpired > 0)
      {
         for (i = 0; i < totalExpired; i++)
         {
            char msg[256];
            char addrstr[DNX_MAX_ADDRSTR];
            DnxNewJob * job = &ExpiredList[i];

            dnxDebug(1, "dnxTimer[%lx]: Expiring Job [%lu,%lu]: %s.",
                  pthread_self(), job->xid.objSerial, job->xid.objSlot, job->cmd);

            dnxStatsInc(job->pNode->address, RESULTS_TIMED_OUT);
            dnxAuditJob(job, "EXPIRE");

            // addrstr was previously printed without ever being filled in;
            // render the node address into it first
            snprintf(msg, sizeof msg,
                  "(DNX: Service Check [%lu,%lu] Timed Out - "
                  "Node: %s - Failed to return job response in time allowed)",
                  job->xid.objSerial, job->xid.objSlot,
                  dnxNtop(job->pNode->address, addrstr, sizeof addrstr));

            // msg holds runtime data, so never use it as the format string
            dnxDebug(2, "%s", msg);

            // report the expired job to Nagios
            ret = dnxPostResult(job->payload, job->xid.objSerial, job->start_time,
                  time(0) - job->start_time, 1, 0, msg);
            dnxJobCleanup(job);
         }
      }

      if (totalExpired > 0 || ret != DNX_OK)
         dnxDebug(2, "dnxTimer[%lx]: Expired job count: %d  Retcode=%d: %s.",
               pthread_self(), totalExpired, ret, dnxErrorString(ret));
   }

   // not reached (the loop has no exit), but pthread_cleanup_pop must
   // lexically pair with the push above
   dnxLog("dnxTimer[%lx]: Terminating: %s.", pthread_self(), dnxErrorString(ret));

   pthread_cleanup_pop(1);
   return 0;
}
Пример #11
0
/** Create a work load manager (WLM) object and start its initial worker
 * thread pool.
 *
 * Copies the configuration (duplicating the dispatcher and collector
 * strings), allocates the worker pool array, caches the first non-loopback
 * IPv4 address that is up and running, determines the host name, and then
 * grows the thread pool.
 *
 * @param[in] cfg - the WLM configuration; the pool limits must satisfy
 *    0 < poolMin <= poolInitial <= poolMax.
 * @param[out] pwlm - receives the new WLM handle; written only on success.
 *
 * @return DNX_OK on success (even if only part of the initial pool could
 *    be started), DNX_ERR_MEMORY on allocation failure, or the error from
 *    growThreadPool when no worker thread at all could be created.
 */
int dnxWlmCreate(DnxWlmCfgData * cfg, DnxWlm ** pwlm)
{
   iDnxWlm * iwlm;
   struct ifaddrs * ifa = NULL;

   assert(cfg && pwlm);
   assert(cfg->poolMin > 0);
   assert(cfg->poolMax >= cfg->poolMin);
   assert(cfg->poolInitial >= cfg->poolMin);
   assert(cfg->poolInitial <= cfg->poolMax);

   // allocate and configure the master thread pool data structure
   if ((iwlm = (iDnxWlm *)xmalloc(sizeof *iwlm)) == 0)
      return DNX_ERR_MEMORY;

   memset(iwlm, 0, sizeof *iwlm);
   iwlm->cfg = *cfg;
   iwlm->cfg.dispatcher = xstrdup(iwlm->cfg.dispatcher);
   iwlm->cfg.collector = xstrdup(iwlm->cfg.collector);
   iwlm->poolsz = iwlm->cfg.poolMax;
   iwlm->pool = (DnxWorkerStatus **)xmalloc(iwlm->poolsz * sizeof *iwlm->pool);
   iwlm->minexectm = (unsigned)(-1);   // the largest possible value

   // if any allocation failed, we really can't continue; check BEFORE the
   // pool is cleared (clearing a null pool would crash) and release
   // everything, including the pool
   if (!iwlm->cfg.dispatcher || !iwlm->cfg.collector || !iwlm->pool)
   {
      xfree(iwlm->pool);
      xfree(iwlm->cfg.dispatcher);
      xfree(iwlm->cfg.collector);
      xfree(iwlm);
      return DNX_ERR_MEMORY;
   }
   memset(iwlm->pool, 0, iwlm->poolsz * sizeof *iwlm->pool);

   // cache our (primary?) ip address in binary and string format
   if (getifaddrs(&ifa) == 0)
   {
      u_int setflags = IFF_UP | IFF_RUNNING;
      u_int clrflags = IFF_LOOPBACK;
      struct ifaddrs * ifcur = ifa;

      // locate the first proper AF_NET address in our interface list
      while (ifcur && (ifcur->ifa_addr == 0
            || ifcur->ifa_addr->sa_family != AF_INET
            || (ifcur->ifa_flags & setflags) != setflags
            || (ifcur->ifa_flags & clrflags) != 0))
         ifcur = ifcur->ifa_next;

      if (ifcur)
      {
         // cache binary and presentation (string) versions of the ip address
         iwlm->myipaddr = (unsigned long)
               ((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr.s_addr;
         inet_ntop(ifcur->ifa_addr->sa_family,
                &((struct sockaddr_in *)ifcur->ifa_addr)->sin_addr,
                iwlm->myipaddrstr, sizeof iwlm->myipaddrstr);
      }
      freeifaddrs(ifa);
   }

   char unset[] = "NULL";
   if(!strnlen(iwlm->myhostname, 1)) //See if the global hostname has been set
   {
      dnxDebug(3, "dnxWlmCreate: Hostname not set in parent thread.");
      // start empty so the failure message below never reads garbage
      char machineName [MAX_HOSTNAME] = "";
      if(strcmp(cfg->hostname, unset)==0)
      {
         dnxDebug(3, "dnxWlmCreate: Hostname undefined in config.");
         // Get our hostname
         if(gethostname(machineName, MAX_HOSTNAME)==0)
         {
            // gethostname may leave a truncated name unterminated
            machineName[MAX_HOSTNAME - 1] = '\0';
            dnxDebug(3, "dnxWlmCreate: Hostname is [%s].", machineName);
            // cache hostname (bounded copy)
            snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s", machineName);
         } else {
            machineName[MAX_HOSTNAME - 1] = '\0';
            dnxLog("dnxWlmCreate: Unable to obtain Hostname [%s?],"
               "please set hostname in config.", machineName);
            snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s", "localhost");
         }
      } else {
         dnxDebug(3, "dnxWlmCreate: Using hostname in config [%s].", cfg->hostname);
         snprintf(iwlm->myhostname, sizeof iwlm->myhostname, "%s", cfg->hostname);
      }
   } else {
      // NOTE(review): iwlm was zero-filled above and nothing has written
      // myhostname since, so this branch looks unreachable - confirm intent
      dnxDebug(3, "dnxWlmCreate: Using cached hostname [%s].", iwlm->myhostname);
      strcpy(iwlm->cfg.hostname, iwlm->myhostname);
   }

   // create initial worker thread pool
   DNX_PT_MUTEX_INIT(&iwlm->mutex);
   DNX_PT_MUTEX_LOCK(&iwlm->mutex);
   {
      int ret;
      if ((ret = growThreadPool(iwlm)) != DNX_OK)
      {
         if (iwlm->threads)
            dnxLog("WLM: Error creating SOME worker threads: %s; "
                  "continuing with smaller initial pool.", dnxErrorString(ret));
         else
         {
            dnxLog("WLM: Unable to create ANY worker threads: %s; "
                  "terminating.", dnxErrorString(ret));
            DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
            DNX_PT_MUTEX_DESTROY(&iwlm->mutex);
            // no worker exists, so release everything this function
            // allocated, not just the top-level object
            xfree(iwlm->pool);
            xfree(iwlm->cfg.dispatcher);
            xfree(iwlm->cfg.collector);
            xfree(iwlm);
            return ret;
         }
      }
   }
   DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

   dnxLog("WLM: Started worker thread pool.");

   *pwlm = (DnxWlm *)iwlm;

   return DNX_OK;
}
Пример #12
0
/** The main thread routine for a worker thread.
 *
 * Until the WLM is flagged for termination, each pass sends a node request
 * on the dispatch channel, waits for a job, acknowledges it, runs the
 * plugin command, sends the result and waits for the result to be
 * acknowledged (up to three attempts), then updates the shared statistics.
 * A thread above the pool minimum exits once it has failed to receive a job
 * more than the configured number of times in a row.
 *
 * @param[in] data - an opaque pointer to a DnxWorkerStatus structure for this
 *    thread.
 *
 * @return Always returns 0.
 */
static void * dnxWorker(void * data)
{
   DnxWorkerStatus * ws = (DnxWorkerStatus *)data;
   pthread_t tid = pthread_self();
   int retries = 0;
   iDnxWlm * iwlm;

   assert(data);

   iwlm = ws->iwlm;

   pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);
   pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);
   pthread_cleanup_push(dnxWorkerCleanup, data);

   time(&ws->tstart);   // set thread start time (for stats)

   while (!iwlm->terminate)
   {
      DnxNodeRequest msg;
      DnxJob job;
      int ret;

      // setup job request message - use thread id and node address in XID
      dnxMakeXID(&msg.xid, DNX_OBJ_WORKER, tid, iwlm->myipaddr);
      msg.reqType = DNX_REQ_REGISTER;
      msg.jobCap = 1;
      msg.ttl = iwlm->cfg.reqTimeout - iwlm->cfg.ttlBackoff;
      msg.hn = iwlm->myhostname;
      // request a job, and then wait for a job to come in...
      if ((ret = dnxSendNodeRequest(ws->dispatch, &msg, 0)) != DNX_OK) {
         dnxLog("Worker[%lx]: Error sending node request: %s.",
               tid, dnxErrorString(ret));
      } else {
         DNX_PT_MUTEX_LOCK(&iwlm->mutex);
         iwlm->reqsent++;
         DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
      }

      // wait for job, even if request was never sent
      if ((ret = dnxWaitForJob(ws->dispatch, &job, job.address,iwlm->cfg.reqTimeout)) != DNX_OK && ret != DNX_ERR_TIMEOUT) {
         dnxLog("Worker[%lx]: Error receiving job: %s.",
               tid, dnxErrorString(ret));
      }

      // Allow thread to be canceled
      pthread_testcancel();

      DNX_PT_MUTEX_LOCK(&iwlm->mutex);
      cleanThreadPool(iwlm); // ensure counts are accurate before using them
      if (ret != DNX_OK)
      {
         // if above pool minimum and exceeded max retries...
         if (iwlm->threads > iwlm->cfg.poolMin
               && ++retries > iwlm->cfg.maxRetries)
         {
            dnxLog("Worker[%lx]: Exiting - max retries exceeded.", tid);
            DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);
            break;
         }
      }
      else
      {
         iwlm->jobsrcvd++;
         iwlm->active++;

         dnxSendJobAck(ws->collect, &job, 0);
         dnxDebug(3, "Worker[%lx]: Acknowledged job [%lu:%lu] to channel (%lx) (T/S %lu).",
               tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timestamp);

         // check pool size before we get too busy -
         // if we're not shutting down and we haven't reached the configured
         // maximum and this is the last thread out, then increase the pool
         if (!iwlm->terminate
               && iwlm->threads < iwlm->cfg.poolMax
               && iwlm->active == iwlm->threads) // Maybe more aggressive here
            growThreadPool(iwlm);
      }
      DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

      // if we have a job, execute it and reset retry count
      if (ret == DNX_OK)
      {
         char resData[MAX_RESULT_DATA + 1];
         DnxResult result;
         time_t jobstart;

         dnxDebug(3, "Worker[%lx]: Received job [%lu:%lu] from (%lx) (T/O %d): %s.",
               tid, job.xid.objSerial, job.xid.objSlot, ws->collect, job.timeout, job.cmd);

         // prepare result structure
         result.xid = job.xid;               // result xid must match job xid
         result.state = DNX_JOB_COMPLETE;    // complete or expired
         result.delta = 0;
         result.resCode = DNX_PLUGIN_RESULT_OK;
         result.resData = 0;

         /** @todo Allocate result data buffer based on configured buffer size. */

         // we want to be able to cancel threads while they're out on a task
         // in order to obtain timely shutdown for long jobs - move into
         // async cancel mode, but only for the duration of the check
         pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0);

         *resData = 0;
         jobstart = time(0);
         dnxPluginExecute(job.cmd, &result.resCode, resData, sizeof resData - 1, job.timeout,iwlm->cfg.showNodeAddr? iwlm->myipaddrstr: 0);
         result.delta = time(0) - jobstart;

         pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, 0);

         // store allocated copy of the result string
         if (*resData) result.resData = xstrdup(resData);

         // resData stays null when the plugin produced no output (or the
         // copy failed); never hand a null pointer to %s
         dnxDebug(3, "Worker[%lx]: Job [%lu:%lu] completed in %lu seconds: %d, %s.",
               tid, job.xid.objSerial, job.xid.objSlot, result.delta,
               result.resCode, result.resData ? result.resData : "");

         // send the result and wait for its Ack, re-sending on failure
         DnxJob ack;
         int trys = 1;
         while(trys < 4) {
            if ((ret = dnxSendResult(ws->collect, &result, 0)) != DNX_OK) {
               dnxDebug(3, "Worker[%lx]: Post job [%lu:%lu] results failed: %s.",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret));
               break;
            }
            // Now wait for our Ack
            ret = dnxWaitForAck(ws->dispatch, &ack, job.address, 3);
            if (ret == DNX_OK) {
               // We got our Ack
               dnxDebug(3, "Worker[%lx]: Ack Received for job [%lu:%lu]: %s. After (%i) try(s).",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys);
               break;
            }
            if (ret != DNX_ERR_TIMEOUT) {
               dnxDebug(3, "Worker[%lx]: Error receiving Ack for job [%lu:%lu]: %s. Retry (%i).",
                     tid, job.xid.objSerial, job.xid.objSlot, dnxErrorString(ret), trys);
            }
            // every failed attempt counts - timeouts AND errors - so a
            // persistent receive error can no longer loop forever
            trys++;
         }

         xfree(result.resData);

         // update all statistics
         DNX_PT_MUTEX_LOCK(&iwlm->mutex);
         {
            // track status
            if (result.resCode == DNX_PLUGIN_RESULT_OK)
               iwlm->jobsok++;
            else
               iwlm->jobsfail++;

            // track min/max/avg execution time
            if (result.delta > iwlm->maxexectm)
               iwlm->maxexectm = result.delta;
            if (result.delta < iwlm->minexectm)
               iwlm->minexectm = result.delta;
            iwlm->avgexectm = (iwlm->avgexectm + result.delta) / 2;

            // total job processing time
            iwlm->jobtm += (unsigned)result.delta;
            iwlm->active--;   // reduce active count
         }
         DNX_PT_MUTEX_UNLOCK(&iwlm->mutex);

         ws->serial++;     // increment job serial number for next job
         retries = 0;
      }
   }
   pthread_cleanup_pop(1);
   return 0;
}
Пример #13
0
/** The agent thread control procedure.
 *
 * @param[in] data - thread data; not used.
 *
 * @return Always returns a null pointer (zero).
 */
static void * dnxAgentServer(void * data)
{
   int ret;
   DnxMgmtRequest Msg;
   Msg.action = 0;

   dnxLog("DNX Server Agent awaiting commands...");

   while (!s_shutdown)
   {
      memset(Msg.address, '\0', DNX_MAX_ADDRESS);

      // wait 2 second for a request; process the request, if valid
      if ((ret = dnxWaitForMgmtRequest(s_agent, &Msg, Msg.address, 2)) == DNX_OK)
      {
         DnxMgmtReply Rsp;
         char addrstr[DNX_MAX_ADDRSTR];

         dnxDebug(2, "Received MgmtRequest from %s.", 
               dnxNtop(Msg.address, addrstr, sizeof addrstr));

         // setup some default response values
         Rsp.xid = Msg.xid;
         Rsp.status = DNX_REQ_ACK;
         Rsp.reply = 0;

         // perform the requested action
         if (!strcmp(Msg.action, "RESETSTATS"))
         {
            dnxStatsResetServerStats();
            dnxStatsForEachNode(dnxResetNodeStats, 0);
            Rsp.reply = xstrdup("OK");
         }
         else if (!strncmp(Msg.action, "GETSTATS ", 9))
         {
            if ((Rsp.reply = buildMgmtStatsReply(Msg.action + 9)) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strncmp(Msg.action, "GETNODESTATS ", 13))
         {
            if ((Rsp.reply = buildMgmtNodeStatsReply(Msg.action + 13)) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETNODELIST"))
         {
            if ((Rsp.reply = buildMgmtNodeListReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETCONFIG"))
         {
            if ((Rsp.reply = buildMgmtCfgReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "GETVERSION"))
         {
            if ((Rsp.reply = versionText()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }
         else if (!strcmp(Msg.action, "HELP"))
         {
            if ((Rsp.reply = buildHelpReply()) == 0)
               Rsp.status = DNX_REQ_NAK;
         }

         // send response, log response failures
         if ((ret = dnxSendMgmtReply(s_agent, &Rsp, Msg.address)) != 0)
            dnxLog("Agent response failure: %s.", dnxErrorString(ret));

         // free request and reply message buffers
         xfree(Rsp.reply);
         xfree(Msg.action);
      }
      else if (ret != DNX_ERR_TIMEOUT)
         dnxLog("Agent channel failure: %s.", dnxErrorString(ret));
   }

   dnxLog("Agent terminating...");

   return 0;
}
Пример #14
0
/** Service Check Event Handler.
 *
 * For a service check that is being initiated and does not match the local
 * execution pattern, takes a waiting worker node request and posts the
 * check to the job list for that node. Whenever the check cannot be handed
 * off, Nagios is told to run it locally.
 *
 * @param[in] event_type - the event type for which we're being called.
 * @param[in] data - an opaque pointer to nagios event-specific data.
 *
 * @return Zero if we want Nagios to handle the event;
 *    NEBERROR_CALLBACKOVERRIDE indicates that we want to handle the event
 *    ourselves; any other non-zero value represents an error.
 */
static int ehSvcCheck(int event_type, void * data)
{
   static unsigned long serial = 0; // the number of service checks processed

   nebstruct_service_check_data * svcdata = (nebstruct_service_check_data *)data;
   DnxNodeRequest * pNode;
   DnxJobData * jdp;
   int ret;

   if (event_type != NEBCALLBACK_SERVICE_CHECK_DATA)
      return OK;

   if (svcdata == 0)
   {
      dnxLog("Service handler received NULL service data structure.");
      return ERROR;  // shouldn't happen - internal Nagios error
   }

   if (svcdata->type != NEBTYPE_SERVICECHECK_INITIATE)
      return OK;  // ignore non-initiate service checks

   // check for local execution pattern on command line
   if (cfg.localCheckPattern && regexec(&regEx, svcdata->command_line, 0, 0, 0) == 0)
   {
      dnxDebug(1, "Service will execute locally: %s.", svcdata->command_line);
      return OK;     // tell nagios execute locally
   }

   dnxDebug(3, "ehSvcCheck: Received Job [%lu] at %lu (%lu).",
         serial, (unsigned long)time(0),
         (unsigned long)svcdata->start_time.tv_sec);

   if ((ret = dnxGetNodeRequest(registrar, &pNode)) != DNX_OK)
   {
      dnxDebug(3, "ehSvcCheck: No worker nodes requests available: %s.",dnxErrorString(ret));
      return OK;     // tell nagios execute locally
   }

   // allocate and populate a new job payload object
   if ((jdp = (DnxJobData *)xmalloc(sizeof *jdp)) == 0)
   {
      dnxDebug(1, "ehSvcCheck: Out of memory!");
      // the node request was taken off the registrar queue for us and
      // nothing else refers to it yet; release it instead of leaking it
      xfree(pNode);
      return OK;
   }
   memset(jdp, 0, sizeof *jdp);
   jdp->svc = (service *)svcdata->OBJECT_FIELD_NAME;

   assert(jdp->svc);

#if CURRENT_NEB_API_VERSION == 3
   {
      // a nagios 3.x global variable
      extern check_result check_result_info;

      /** @todo patch nagios to pass these values to the event handler. */

      jdp->chkopts    = check_result_info.check_options;
      jdp->schedule   = check_result_info.scheduled_check;
      jdp->reschedule = check_result_info.reschedule_check;
   }
#endif

   if ((ret = dnxPostNewJob(joblist, serial, jdp, svcdata, pNode)) != DNX_OK)
   {
      dnxLog("Unable to post job [%lu]: %s.", serial, dnxErrorString(ret));
      // NOTE(review): pNode is not released here - confirm whether
      // dnxPostNewJob takes ownership of it when it fails
      xfree(jdp);
      return OK;     // tell nagios execute locally
   }

   serial++;                           // bump serial number
   return NEBERROR_CALLBACKOVERRIDE;   // tell nagios we want it
}
Пример #15
0
/** Initialize the dnxServer.
 *
 * Brings up, in order: the channel map, the job list, the collector, the
 * dispatcher, the worker node registrar and the management agent. Once all
 * of those exist, the NEB callbacks are registered; the service check
 * callback is registered last because it is what starts work flowing.
 *
 * On any failure this routine returns immediately and does NOT release the
 * objects it already created. The object globals are cleared on entry so
 * that whoever backs out can tell which ones were created.
 * NOTE(review): the back-out itself is presumably done by the caller -
 * confirm against the call site.
 *
 * @return Zero on success, or a non-zero error value.
 */
static int dnxServerInit(void)
{
   int ret, joblistsz;

   // clear globals so we know what to "undo" as we back out
   joblist = 0;
   registrar = 0;
   dispatcher = 0;
   collector = 0;

   if ((ret = dnxChanMapInit(0)) != 0)
   {
      dnxLog("Failed to initialize channel map: %s.", dnxErrorString(ret));
      return ret;
   }

   joblistsz = dnxCalculateJobListSize();

   dnxLog("Allocating %d service request slots in the DNX job list.", joblistsz);

   if ((ret = dnxJobListCreate(joblistsz, &joblist)) != 0)
   {
      dnxLog("Failed to initialize DNX job list with %d slots.", joblistsz);
      return ret;
   }

   // create and configure collector
   if ((ret = dnxCollectorCreate("Collect", cfg.collectorUrl,
               joblist, &collector)) != 0)
      return ret;

   // create and configure dispatcher
   if ((ret = dnxDispatcherCreate("Dispatch", cfg.dispatcherUrl,
               joblist, &dispatcher)) != 0)
      return ret;

   // create worker node registrar (sized at twice the job list)
   if ((ret = dnxRegistrarCreate(joblistsz * 2,
               dnxDispatcherGetChannel(dispatcher), &registrar)) != 0)
      return ret;

   // initialize server management agent
   if ((ret = dnxInitAgent(cfg.agentUrl, parser)) != 0)
      return ret;

#if CURRENT_NEB_API_VERSION == 3 && defined(DIRECT_POST)

   // register for timed event to piggy-back on reaper thread; a failed
   // registration must not be reported as success
   if ((ret = neb_register_callback(NEBCALLBACK_TIMED_EVENT_DATA,
               myHandle, 0, ehTimedEvent)) != 0)
   {
      dnxLog("Failed to register for TIMEDEVENT_EXECUTE event: %d.", ret);
      return ret;
   }
   dnxLog("Registered for TIMEDEVENT_EXECUTE event.");

#endif

   // registration for this event starts everything rolling; without it the
   // module would never receive a service check, so treat failure as fatal
   if ((ret = neb_register_callback(NEBCALLBACK_SERVICE_CHECK_DATA,
               myHandle, 0, ehSvcCheck)) != 0)
   {
      dnxLog("Failed to register for SERVICE_CHECK_DATA event: %d.", ret);
      return ret;
   }

   dnxLog("Registered for SERVICE_CHECK_DATA event.");
   dnxLog("Server initialization completed.");

   return 0;
}
Пример #16
0
/** The main program entry point for the dnx management client.
 *
 * Sends a single management command to a DNX server over UDP and prints
 * the server's reply.
 *
 * Command-line options:
 *  - @c -c <command> : the command string to send (required).
 *  - @c -s <host>    : the server host; defaults to "localhost".
 *  - @c -p <port>    : the server port; defaults to "12482".
 *  - @c -v           : print version information and exit.
 *  - @c -h           : print usage (also used for unrecognized options).
 *
 * @param[in] argc - the number of elements in the @p argv array.
 * @param[in] argv - a null-terminated array of command-line arguments.
 *
 * @return Zero on success, or a non-zero error code that is returned to the
 * shell. Any non-zero codes should be values between 1 and 127. This
 * routine returns 1 when any communication step fails or when the server
 * answers with a status other than DNX_REQ_ACK.
 */
int main(int argc, char ** argv)
{
   extern char * optarg;
   extern int optind, opterr, optopt;
   int ch, ret;
   int rejected = 0;    // set when the server replies with a non-ACK status
   char * cp, * prog, * cmdstr;
   char * hoststr, * portstr;

   gTopDCS = dnxComStatCreateDCS("127.0.0.1");

   // get program base name
   prog = (char *)((cp = strrchr(argv[0], '/')) != 0 ? (cp + 1) : argv[0]);

   // parse arguments
   hoststr = "localhost";
   portstr = "12482";
   opterr = 0;          // suppress getopt's own diagnostics
   cmdstr = 0;
   while ((ch = getopt(argc, argv, "hvc:s:p:")) != -1)
   {
      switch (ch)
      {
         case 's':
            hoststr = optarg;
            break;

         case 'p':
            portstr = optarg;
            break;

         case 'c':
            cmdstr = optarg;
            break;

         case 'v':
            printf("\n  %s version %s\n  Bug reports: %s.\n\n",
                  prog, VERSION, PACKAGE_BUGREPORT);
            exit(0);

         // NOTE(review): no break follows; usage() presumably does not
         // return - confirm against its definition.
         case 'h':
         default :
            usage(prog);
      }
   }

   // ensure we've been given a command
   if (!cmdstr)
   {
      fprintf(stderr, "%s: No command string specified.\n", prog);
      usage(prog);
   }

   // init comm sub-system; send command; wait for response
   if ((ret = dnxChanMapInit(0)) != 0)
      fprintf(stderr, "%s: Error initializing channel map: %s.\n",
            prog, dnxErrorString(ret));
   else
   {
      char url[1024];

      snprintf(url, sizeof url, "udp://%s:%s", hoststr, portstr);

      if ((ret = dnxChanMapAdd("MgmtClient", url)) != 0)
         fprintf(stderr, "%s: Error adding channel (%s): %s.\n",
               prog, url, dnxErrorString(ret));
      else
      {
         DnxChannel * channel;

         if ((ret = dnxConnect("MgmtClient", 1, &channel)) != 0)
            fprintf(stderr, "%s: Error connecting to server (%s): %s.\n",
                  prog, url, dnxErrorString(ret));
         else
         {
            DnxMgmtRequest req;

            memset(&req, 0, sizeof req);
            dnxMakeXID(&req.xid, DNX_OBJ_MANAGER, 0, 0);
            req.action = cmdstr;

            if ((ret = dnxSendMgmtRequest(channel, &req, 0)) != 0)
               fprintf(stderr, "%s: Error sending request: %s.\n",
                     prog, dnxErrorString(ret));
            else
            {
               DnxMgmtReply rsp;

               // NOTE(review): the 10 is presumably a timeout in seconds -
               // confirm against dnxWaitForMgmtReply.
               if ((ret = dnxWaitForMgmtReply(channel, &rsp, 0, 10)) != 0)
                  fprintf(stderr, "%s: Error receiving response: %s.\n",
                        prog, dnxErrorString(ret));
               else
               {
                  // NOTE(review): if rsp.reply is heap-allocated by
                  // dnxWaitForMgmtReply it should be released here - confirm.
                  if (rsp.status == DNX_REQ_ACK)
                     printf("%s\n", rsp.reply);
                  else
                  {
                     fprintf(stderr, "%s: Request failed on server.\nResponse was (%s)\n", prog,rsp.reply);

                     // the transport succeeded (ret == 0) but the request
                     // did not; make sure the shell sees a failure
                     rejected = 1;
                  }
               }
            }
            dnxDisconnect(channel);
         }
         dnxChanMapDelete("MgmtClient");
      }
      dnxChanMapRelease();
   }

   xheapchk();

   // a return of -1 would reach the shell as 255, outside the documented
   // 1..127 range, so report every failure as 1
   return (ret != 0 || rejected) ? 1 : 0;
}