int
status_job (mbdReqType reqType,
	    struct jobCard *jp, int newStatus, sbdReplyType err)
{
  static char fname[] = "status_job()";
  static int seq = 1;
  static char lastHost[MAXHOSTNAMELEN];
  int reply;
  char *request_buf;
  char *reply_buf = NULL;
  XDR xdrs;
  struct LSFHeader hdr;
  int cc;
  struct statusReq statusReq;
  int flags;
  int i;
  int len;
  struct lsfAuth *auth = NULL;

  if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL))
    ls_syslog (LOG_DEBUG, "%s: Entering ... regType %d jobId %s",
	       fname, reqType, lsb_jobid2str (jp->jobSpecs.jobId));

  if (newStatus == JOB_STAT_EXIT)
    {
      jp->userJobSucc = FALSE;
    }

  if (MASK_STATUS (newStatus) == JOB_STAT_DONE)
    {
      jp->userJobSucc = TRUE;
    }

  if (IS_POST_FINISH (newStatus))
    {
      if (jp->userJobSucc != TRUE)
	{
	  return 0;
	}
    }

  if (masterHost == NULL)
    return -1;

  if (jp->notReported < 0)
    {
      jp->notReported = -INFINIT_INT;
      return (0);
    }

  statusReq.jobId = jp->jobSpecs.jobId;
  statusReq.actPid = jp->jobSpecs.actPid;
  statusReq.jobPid = jp->jobSpecs.jobPid;
  statusReq.jobPGid = jp->jobSpecs.jobPGid;
  statusReq.newStatus = newStatus;
  statusReq.reason = jp->jobSpecs.reasons;
  statusReq.subreasons = jp->jobSpecs.subreasons;
  statusReq.sbdReply = err;
  statusReq.lsfRusage = jp->lsfRusage;
  statusReq.execUid = jp->jobSpecs.execUid;
  statusReq.numExecHosts = 0;
  statusReq.execHosts = NULL;
  statusReq.exitStatus = jp->w_status;
  statusReq.execCwd = jp->jobSpecs.execCwd;
  statusReq.execHome = jp->jobSpecs.execHome;
  statusReq.execUsername = jp->execUsername;
  statusReq.queuePostCmd = "";
  statusReq.queuePreCmd = "";
  statusReq.msgId = jp->delieveredMsgId;

  if (IS_FINISH (newStatus))
    {
      if (jp->maxRusage.mem > jp->runRusage.mem)
	jp->runRusage.mem = jp->maxRusage.mem;
      if (jp->maxRusage.swap > jp->runRusage.swap)
	jp->runRusage.swap = jp->maxRusage.swap;
      if (jp->maxRusage.stime > jp->runRusage.stime)
	jp->runRusage.stime = jp->maxRusage.stime;
      if (jp->maxRusage.utime > jp->runRusage.utime)
	jp->runRusage.utime = jp->maxRusage.utime;
    }
  statusReq.runRusage.mem = jp->runRusage.mem;
  statusReq.runRusage.swap = jp->runRusage.swap;
  statusReq.runRusage.utime = jp->runRusage.utime;
  statusReq.runRusage.stime = jp->runRusage.stime;
  statusReq.runRusage.npids = jp->runRusage.npids;
  statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
  statusReq.runRusage.npgids = jp->runRusage.npgids;
  statusReq.runRusage.pgid = jp->runRusage.pgid;
  statusReq.actStatus = jp->actStatus;
  statusReq.sigValue = jp->jobSpecs.actValue;
  statusReq.seq = seq;
  seq++;
  if (seq >= MAX_SEQ_NUM)
    seq = 1;

  len = 1024 + ALIGNWORD_ (sizeof (struct statusReq));

  len += ALIGNWORD_ (strlen (statusReq.execHome)) + 4 +
    ALIGNWORD_ (strlen (statusReq.execCwd)) + 4 +
    ALIGNWORD_ (strlen (statusReq.execUsername)) + 4;

  for (i = 0; i < statusReq.runRusage.npids; i++)
    len += ALIGNWORD_ (sizeof (struct pidInfo)) + 4;

  for (i = 0; i < statusReq.runRusage.npgids; i++)
    len += ALIGNWORD_ (sizeof (int)) + 4;

  if (logclass & (LC_TRACE | LC_COMM))
    ls_syslog (LOG_DEBUG, "%s: The length of the job message is: <%d>", fname,
	       len);

  if ((request_buf = malloc (len)) == NULL)
    {
      ls_syslog (LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
      return (-1);
    }

  xdrmem_create (&xdrs, request_buf, len, XDR_ENCODE);
  initLSFHeader_ (&hdr);
  hdr.opCode = reqType;

  if (!xdr_encodeMsg (&xdrs, (char *) &statusReq, &hdr, xdr_statusReq, 0,
		      auth))
    {
      ls_syslog (LOG_ERR, I18N_JOB_FAIL_S_M,
		 fname, lsb_jobid2str (jp->jobSpecs.jobId), "xdr_statusReq");
      lsb_merr2 (I18N_FUNC_FAIL, fname, "xdr_statusReq");
      xdr_destroy (&xdrs);
      FREEUP (request_buf);
      relife ();
    }

  flags = CALL_SERVER_NO_HANDSHAKE;
  if (statusChan >= 0)
    flags |= CALL_SERVER_USE_SOCKET;

  if (reqType == BATCH_RUSAGE_JOB)
    flags |= CALL_SERVER_NO_WAIT_REPLY;

  if (logclass & LC_COMM)
    ls_syslog (LOG_DEBUG1, "%s: before call_server statusChan=%d flags=%d",
	       fname, statusChan, flags);

  cc = call_server (masterHost,
		    mbd_port,
		    request_buf,
		    XDR_GETPOS (&xdrs),
		    &reply_buf,
		    &hdr,
		    connTimeout, readTimeout, &statusChan, NULL, NULL, flags);
  if (cc < 0)
    {
      statusChan = -1;
      if (!equalHost_ (masterHost, lastHost))
	{
	  if (errno != EINTR)
	    ls_syslog (LOG_DEBUG,
		       "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s",
		       fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId),
		       lsb_sysmsg ());
	  strcpy (lastHost, masterHost);
	}
      xdr_destroy (&xdrs);
      FREEUP (request_buf);
      failcnt++;
      return (-1);
    }
  else if (cc == 0)
    {

    }

  failcnt = 0;
  lastHost[0] = '\0';
  xdr_destroy (&xdrs);
  FREEUP (request_buf);

  if (cc)
    free (reply_buf);

  if (flags & CALL_SERVER_NO_WAIT_REPLY)
    {

      struct timeval timeval;

      timeval.tv_sec = 0;
      timeval.tv_usec = 0;

      if (rd_select_ (chanSock_ (statusChan), &timeval) == 0)
	{
	  jp->needReportRU = FALSE;
	  jp->lastStatusMbdTime = now;
	  return 0;
	}

      CLOSECD (statusChan);

      if (logclass & LC_COMM)
	ls_syslog (LOG_DEBUG1,
		   "%s: Job <%s> rd_select() failed, assume connection broken",
		   fname, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    }
  reply = hdr.opCode;
  switch (reply)
    {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
      jp->needReportRU = FALSE;
      jp->lastStatusMbdTime = now;
      if (reply == LSBE_LOCK_JOB)
	{
	  if (IS_SUSP (jp->jobSpecs.jStatus))
	    jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
	  else
	    ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5204, "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."),	/* catgets 5204 */
		       fname,
		       lsb_jobid2str (jp->jobSpecs.jobId),
		       jp->jobSpecs.jStatus);
	}
      return (0);
    case LSBE_NO_JOB:
      if (!IS_POST_FINISH (jp->jobSpecs.jStatus))
	{
	  ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5205, "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."), fname, lsb_jobid2str (jp->jobSpecs.jobId), masterHost);	/* catgets 5205 */
	}

      jp->notReported = -INFINIT_INT;
      return (0);
    case LSBE_STOP_JOB:
      if (jobsig (jp, SIGSTOP, TRUE) < 0)
	SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_EXIT);
      else
	{
	  SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_USUSP);
	  jp->jobSpecs.reasons |= SUSP_USER_STOP;
	}
      return (-1);
    case LSBE_SBATCHD:
      ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5206, "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"),	/* catgets 5206 */
		 fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    default:
      ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5207, "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"),	/* catgets 5207 */
		 fname,
		 reply, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    }
}
Exemple #2
0
void
die(int sig)
{
    static char fname[] = "die";
    char myhost[MAXHOSTNAMELEN];

    if (debug > 1)
        fprintf(stderr, "%s: signal %d\n",
            fname,
            sig);

    if (masterme) {
        releaseElogLock();
    }

    if (gethostname(myhost, MAXHOSTNAMELEN) <0) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL_M, fname, "gethostname", myhost);
        strcpy(myhost, "localhost");
    }

    if (sig > 0 && sig < 100) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8216,
            "Daemon on host <%s> received signal <%d>; exiting"), /* catgets 8216 */
             myhost, sig);
    } else {
        switch (sig) {
        case MASTER_RESIGN:
            ls_syslog(LOG_INFO, (_i18n_msg_get(ls_catd , NL_SETN, 8272, "Master daemon on host <%s> resigned; exiting")), myhost);    /* catgets 8272 */
            break;
        case MASTER_RECONFIG:
            ls_syslog(LOG_INFO, (_i18n_msg_get(ls_catd , NL_SETN, 8273, "Master daemon on host <%s> exiting for reconfiguration")), myhost);  /* catgets 8273 */
            break;

        case SLAVE_MEM:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8217,
                "Slave daemon on host <%s> failed in memory allocation; fatal error - exiting"), myhost); /* catgets 8217 */
            lsb_merr1(_i18n_msg_get(ls_catd , NL_SETN, 8217,
                "Slave daemon on host <%s> failed in memory allocation; fatal error - exiting"), myhost);
            break;
        case MASTER_MEM:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8218,
                "Master daemon on host <%s> failed in memory allocation; fatal error - exiting"), myhost); /* catgets 8218 */
            break;
        case SLAVE_FATAL:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8219,
                "Slave daemon on host <%s> dying; fatal error - see above messages for reason"), myhost); /* catgets 8219 */
            break;
        case MASTER_FATAL:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8220,
                "Master daemon on host <%s> dying; fatal error - see above messages for reason"), myhost); /* catgets 8220 */
            break;
        case MASTER_CONF:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8221,
                "Master daemon on host <%s> died of bad configuration file"), myhost); /* catgets 8221 */
              break;
        case SLAVE_RESTART:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8222,
                "Slave daemon on host <%s> restarting"), myhost); /* catgets 8222 */
              break;
        case SLAVE_SHUTDOWN:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8223,
                "Slave daemon on host <%s> shutdown"), myhost); /* catgets 8223 */
              break;
        default:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 8224,
                "Daemon on host <%s> exiting; cause code <%d> unknown"), myhost, sig); /* catgets 8224 */
            break;
        }
    }

    shutdown(chanSock_(batchSock), 2);

    exit(sig);

}
Exemple #3
0
int
ls_initrex(int num, int options)
{
    struct servent *sv;

    if (geteuid() == 0)
       rootuid_ = TRUE;

    if (initenv_(NULL, NULL)<0) {
        if (rootuid_ && !(options & KEEPUID))
            lsfSetUid(getuid());
        return(-1);
    }

    inithostsock_();
    lsQueueInit_(&requestQ, lsReqCmp_, NULL);
    if (requestQ == NULL) {
        lserrno = LSE_MALLOC;
        return(-1);
    }

    res_addr_.sin_family = AF_INET;

    if (genParams_[LSF_RES_PORT].paramValue) {
        if ((res_addr_.sin_port = atoi(genParams_[LSF_RES_PORT].paramValue))
            != 0)
            res_addr_.sin_port = htons(res_addr_.sin_port);
        else
            goto res_init_fail;
    } else if (genParams_[LSF_RES_DEBUG].paramValue) {
        res_addr_.sin_port = htons(RES_PORT);
    } else {
#  if defined(_COMPANY_X_)
        if ((res_addr_.sin_port =
                 get_port_number(RES_SERVICE,(char *)NULL)) == -1) {
#  else
        if ((sv = getservbyname("res", "tcp")) != NULL)
            res_addr_.sin_port = sv->s_port;
        else {
#  endif
res_init_fail:
            lserrno = LSE_RES_NREG;
            if (rootuid_ && !(options & KEEPUID))
                lsfSetUid(getuid());
            return (-1);
        }
    }

    initconntbl_();
    FD_ZERO(&connection_ok_);

    if ((rootuid_) && (genParams_[LSF_AUTH].paramValue == NULL)) {
        int i;
        i = opensocks_(num);
        if (!(options & KEEPUID))
            lsfSetUid(getuid());
        return (i);
    } else {
        return (num);
    }
}

int
opensocks_(int num)
{
    static char fname[] = "opensocks_";
    int s;
    int nextdescr;
    int i;

    totsockets_ = (num <= 0 || num > MAXCONNECT) ? LSF_DEFAULT_SOCKS : num;

    if (logclass & LC_COMM)
       ls_syslog(LOG_DEBUG,"%s: try to allocate num <%d> of socks",fname,num);

    nextdescr = FIRST_RES_SOCK;
    for (i = 0; i < totsockets_; i++) {
        if ((s = CreateSock_(SOCK_STREAM)) < 0) {
            if (logclass & LC_COMM)
                ls_syslog(LOG_DEBUG,
                   "%s: CreateSock_ failed, iter:<%d> %s",
                    fname,i,strerror(errno));
            totsockets_ = i;
            if (i > 0) {
                break;
            } else {
               return(-1);
            }
        }

        if (s != nextdescr) {
            if (dup2(s,nextdescr) < 0) {
                if (logclass & LC_COMM)
                    ls_syslog(LOG_DEBUG,
                    "%s: dup2() failed, old:<%d>, new<%d>, iter:<%d>  %s",
                               fname,s,nextdescr,i,strerror(errno));
                close(s);
                lserrno = LSE_SOCK_SYS;
                totsockets_ = i;
                if (i > 0)
                   break;
                else
                   return (-1);
            }

#if defined(FD_CLOEXEC)
            fcntl(nextdescr, F_SETFD, (fcntl(nextdescr, F_GETFD)
                       | FD_CLOEXEC)) ;
#else
#if defined(FIOCLEX)
            (void)ioctl(nextdescr, FIOCLEX, (char *)NULL);
#endif
#endif

            close(s);
        }
        nextdescr++;
    }

    currentsocket_ = FIRST_RES_SOCK;

    if (logclass & LC_COMM)
       ls_syslog(LOG_DEBUG,"%s: returning num=<%d>",fname,totsockets_);

    return (totsockets_);

}

/* ls_fdbusy()
 */
int
ls_fdbusy(int fd)
{
    sTab   hashSearchPtr;
    hEnt   *hEntPtr;

    if (fd == chanSock_(limchans_[PRIMARY])
        || fd == chanSock_(limchans_[MASTER])
        || fd == chanSock_(limchans_[UNBOUND]))
        return TRUE;

    if (fd == cli_nios_fd[0])
        return TRUE;

    hEntPtr = h_firstEnt_(&conn_table, &hashSearchPtr);
    while (hEntPtr) {
        int   *pfd;

        pfd = hEntPtr->hData;
        if (fd == pfd[0]
            || fd == pfd[1])
            return (TRUE);

        hEntPtr = h_nextEnt_(&hashSearchPtr);
    }

    if (rootuid_
        && fd >= currentsocket_
        && fd < FIRST_RES_SOCK + totsockets_)
        return TRUE;

    return FALSE;
}