Example #1
0
void
do_reboot(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_reboot()";
    char               reply_buf[MSGSIZE / 8];
    XDR                xdrs2;
    sbdReplyType       reply;
    struct LSFHeader   replyHdr;

    if (logclass & LC_TRACE)
        ls_syslog(LOG_DEBUG, "%s: Entering this routine...", fname);

    reply = ERR_NO_ERROR;
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE / 8, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    if (reqHdr->opCode == CMD_SBD_REBOOT)
        replyHdr.opCode = LSBE_NO_ERROR;
    else
        replyHdr.opCode = reply;

    if (!xdr_encodeMsg(&xdrs2, (char *) 0, &replyHdr, 0, 0, NULL)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_encodeMsg");
        xdr_destroy(&xdrs2);
        relife();
        return;
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_M, fname, "chanWrite_");
        xdr_destroy(&xdrs2);
        relife();
        return;
    }
    ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5828,
                                        "Slave batch daemon reboot command received")); /* catgets 5828 */
    relife();
    ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5829,
                                     "Unable to relife during rebooting: %m")); /* catgets 5829 */
    xdr_destroy(&xdrs2);

}
Example #2
0
void
do_shutdown(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_shutdown()";
    char               reply_buf[MSGSIZE / 8];
    XDR                xdrs2;
    sbdReplyType       reply;
    struct LSFHeader   replyHdr;

    reply = ERR_NO_ERROR;

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE / 8, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    if (reqHdr->opCode == CMD_SBD_SHUTDOWN)
        replyHdr.opCode = LSBE_NO_ERROR;
    else
        replyHdr.opCode = reply;

    if (!xdr_encodeMsg(&xdrs2, (char *) 0, &replyHdr, 0, 0, NULL)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_encodeMsg");
        xdr_destroy(&xdrs2);
        relife();
        return;
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5835,
                                         "%s: Sending shutdown reply to master failed: %m"), /* catgets 5835 */
                  fname);
        xdr_destroy(&xdrs2);
        relife();
        return;
    }
    xdr_destroy(&xdrs2);
    ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5836,
                                        "Slave batch daemon shutdown command received")); /* catgets 5836 */
    die(SLAVE_SHUTDOWN);

}
Example #3
0
int
status_job (mbdReqType reqType,
	    struct jobCard *jp, int newStatus, sbdReplyType err)
{
  static char fname[] = "status_job()";
  static int seq = 1;
  static char lastHost[MAXHOSTNAMELEN];
  int reply;
  char *request_buf;
  char *reply_buf = NULL;
  XDR xdrs;
  struct LSFHeader hdr;
  int cc;
  struct statusReq statusReq;
  int flags;
  int i;
  int len;
  struct lsfAuth *auth = NULL;

  if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL))
    ls_syslog (LOG_DEBUG, "%s: Entering ... regType %d jobId %s",
	       fname, reqType, lsb_jobid2str (jp->jobSpecs.jobId));

  if (newStatus == JOB_STAT_EXIT)
    {
      jp->userJobSucc = FALSE;
    }

  if (MASK_STATUS (newStatus) == JOB_STAT_DONE)
    {
      jp->userJobSucc = TRUE;
    }

  if (IS_POST_FINISH (newStatus))
    {
      if (jp->userJobSucc != TRUE)
	{
	  return 0;
	}
    }

  if (masterHost == NULL)
    return -1;

  if (jp->notReported < 0)
    {
      jp->notReported = -INFINIT_INT;
      return (0);
    }

  statusReq.jobId = jp->jobSpecs.jobId;
  statusReq.actPid = jp->jobSpecs.actPid;
  statusReq.jobPid = jp->jobSpecs.jobPid;
  statusReq.jobPGid = jp->jobSpecs.jobPGid;
  statusReq.newStatus = newStatus;
  statusReq.reason = jp->jobSpecs.reasons;
  statusReq.subreasons = jp->jobSpecs.subreasons;
  statusReq.sbdReply = err;
  statusReq.lsfRusage = jp->lsfRusage;
  statusReq.execUid = jp->jobSpecs.execUid;
  statusReq.numExecHosts = 0;
  statusReq.execHosts = NULL;
  statusReq.exitStatus = jp->w_status;
  statusReq.execCwd = jp->jobSpecs.execCwd;
  statusReq.execHome = jp->jobSpecs.execHome;
  statusReq.execUsername = jp->execUsername;
  statusReq.queuePostCmd = "";
  statusReq.queuePreCmd = "";
  statusReq.msgId = jp->delieveredMsgId;

  if (IS_FINISH (newStatus))
    {
      if (jp->maxRusage.mem > jp->runRusage.mem)
	jp->runRusage.mem = jp->maxRusage.mem;
      if (jp->maxRusage.swap > jp->runRusage.swap)
	jp->runRusage.swap = jp->maxRusage.swap;
      if (jp->maxRusage.stime > jp->runRusage.stime)
	jp->runRusage.stime = jp->maxRusage.stime;
      if (jp->maxRusage.utime > jp->runRusage.utime)
	jp->runRusage.utime = jp->maxRusage.utime;
    }
  statusReq.runRusage.mem = jp->runRusage.mem;
  statusReq.runRusage.swap = jp->runRusage.swap;
  statusReq.runRusage.utime = jp->runRusage.utime;
  statusReq.runRusage.stime = jp->runRusage.stime;
  statusReq.runRusage.npids = jp->runRusage.npids;
  statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
  statusReq.runRusage.npgids = jp->runRusage.npgids;
  statusReq.runRusage.pgid = jp->runRusage.pgid;
  statusReq.actStatus = jp->actStatus;
  statusReq.sigValue = jp->jobSpecs.actValue;
  statusReq.seq = seq;
  seq++;
  if (seq >= MAX_SEQ_NUM)
    seq = 1;

  len = 1024 + ALIGNWORD_ (sizeof (struct statusReq));

  len += ALIGNWORD_ (strlen (statusReq.execHome)) + 4 +
    ALIGNWORD_ (strlen (statusReq.execCwd)) + 4 +
    ALIGNWORD_ (strlen (statusReq.execUsername)) + 4;

  for (i = 0; i < statusReq.runRusage.npids; i++)
    len += ALIGNWORD_ (sizeof (struct pidInfo)) + 4;

  for (i = 0; i < statusReq.runRusage.npgids; i++)
    len += ALIGNWORD_ (sizeof (int)) + 4;

  if (logclass & (LC_TRACE | LC_COMM))
    ls_syslog (LOG_DEBUG, "%s: The length of the job message is: <%d>", fname,
	       len);

  if ((request_buf = malloc (len)) == NULL)
    {
      ls_syslog (LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
      return (-1);
    }

  xdrmem_create (&xdrs, request_buf, len, XDR_ENCODE);
  initLSFHeader_ (&hdr);
  hdr.opCode = reqType;

  if (!xdr_encodeMsg (&xdrs, (char *) &statusReq, &hdr, xdr_statusReq, 0,
		      auth))
    {
      ls_syslog (LOG_ERR, I18N_JOB_FAIL_S_M,
		 fname, lsb_jobid2str (jp->jobSpecs.jobId), "xdr_statusReq");
      lsb_merr2 (I18N_FUNC_FAIL, fname, "xdr_statusReq");
      xdr_destroy (&xdrs);
      FREEUP (request_buf);
      relife ();
    }

  flags = CALL_SERVER_NO_HANDSHAKE;
  if (statusChan >= 0)
    flags |= CALL_SERVER_USE_SOCKET;

  if (reqType == BATCH_RUSAGE_JOB)
    flags |= CALL_SERVER_NO_WAIT_REPLY;

  if (logclass & LC_COMM)
    ls_syslog (LOG_DEBUG1, "%s: before call_server statusChan=%d flags=%d",
	       fname, statusChan, flags);

  cc = call_server (masterHost,
		    mbd_port,
		    request_buf,
		    XDR_GETPOS (&xdrs),
		    &reply_buf,
		    &hdr,
		    connTimeout, readTimeout, &statusChan, NULL, NULL, flags);
  if (cc < 0)
    {
      statusChan = -1;
      if (!equalHost_ (masterHost, lastHost))
	{
	  if (errno != EINTR)
	    ls_syslog (LOG_DEBUG,
		       "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s",
		       fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId),
		       lsb_sysmsg ());
	  strcpy (lastHost, masterHost);
	}
      xdr_destroy (&xdrs);
      FREEUP (request_buf);
      failcnt++;
      return (-1);
    }
  else if (cc == 0)
    {

    }

  failcnt = 0;
  lastHost[0] = '\0';
  xdr_destroy (&xdrs);
  FREEUP (request_buf);

  if (cc)
    free (reply_buf);

  if (flags & CALL_SERVER_NO_WAIT_REPLY)
    {

      struct timeval timeval;

      timeval.tv_sec = 0;
      timeval.tv_usec = 0;

      if (rd_select_ (chanSock_ (statusChan), &timeval) == 0)
	{
	  jp->needReportRU = FALSE;
	  jp->lastStatusMbdTime = now;
	  return 0;
	}

      CLOSECD (statusChan);

      if (logclass & LC_COMM)
	ls_syslog (LOG_DEBUG1,
		   "%s: Job <%s> rd_select() failed, assume connection broken",
		   fname, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    }
  reply = hdr.opCode;
  switch (reply)
    {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
      jp->needReportRU = FALSE;
      jp->lastStatusMbdTime = now;
      if (reply == LSBE_LOCK_JOB)
	{
	  if (IS_SUSP (jp->jobSpecs.jStatus))
	    jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
	  else
	    ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5204, "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."),	/* catgets 5204 */
		       fname,
		       lsb_jobid2str (jp->jobSpecs.jobId),
		       jp->jobSpecs.jStatus);
	}
      return (0);
    case LSBE_NO_JOB:
      if (!IS_POST_FINISH (jp->jobSpecs.jStatus))
	{
	  ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5205, "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."), fname, lsb_jobid2str (jp->jobSpecs.jobId), masterHost);	/* catgets 5205 */
	}

      jp->notReported = -INFINIT_INT;
      return (0);
    case LSBE_STOP_JOB:
      if (jobsig (jp, SIGSTOP, TRUE) < 0)
	SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_EXIT);
      else
	{
	  SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_USUSP);
	  jp->jobSpecs.reasons |= SUSP_USER_STOP;
	}
      return (-1);
    case LSBE_SBATCHD:
      ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5206, "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"),	/* catgets 5206 */
		 fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    default:
      ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5207, "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"),	/* catgets 5207 */
		 fname,
		 reply, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    }
}
Example #4
0
void
job_checking (void)
{
    static char fname[] = "job_checking";
    struct jobCard *jobCard, *nextJob;
    struct hostLoad *myload, savedLoad;
    char *myhostnm;
    static time_t last_check;
    char preempted = FALSE;
    int i;

    if (last_check == 0)
	last_check = now;
    if (jobcnt <= 0) {
        last_check = now;
        return;
    }

    checkFinish ();

    for (jobCard = jobQueHead->forw; (jobCard != jobQueHead);
         jobCard = nextJob) {

	nextJob = jobCard->forw;
        if (IS_FINISH(jobCard->jobSpecs.jStatus)
              || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND))
            continue;

	ruLimits(jobCard);

	if (IS_RUN_JOB_CMD(jobCard->jobSpecs.jStatus)) {

	    jobCard->runTime += (int) (now - last_check);
	}
	if (jobCard->runTime >
	    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {
            if ((jobCard->jobSpecs.terminateActCmd == NULL)
                || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) {
	        if (jobCard->runTime >
		    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl
		    + WARN_TIME && jobCard->timeExpire) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
		    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
			continue;
                    else {

                        ls_syslog(LOG_INFO, \
                                  "%s: warning period expired killing the job=%d",
			    fname, jobCard->jobSpecs.jobId);
                        jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
			jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
	        } else if (!jobCard->timeExpire) {
		    ls_syslog(LOG_INFO, I18N(5704,
                        "%s: sending warning signal to job=%d"), /* catgets 5704 */
			fname, jobCard->jobSpecs.jobId);
		    jobsig(jobCard, SIGUSR2, FALSE);
		    jobCard->timeExpire = TRUE;
	        }
            } else {
                if (jobCard->runTime >
                    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
                    else {
                        jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
	    continue;
	}

        if (jobCard->jobSpecs.termTime && now > jobCard->jobSpecs.termTime

             && !(jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)) {
            if ((jobCard->jobSpecs.terminateActCmd == NULL)
                 || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) {
                if (now > jobCard->jobSpecs.termTime + WARN_TIME
                                                   && jobCard->timeExpire) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
		    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
			continue;
                    else {

                        jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
			jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
                } else
		    if (!jobCard->timeExpire) {
		        jobsig(jobCard, SIGUSR2, FALSE);
		        jobCard->timeExpire = TRUE;
		    }
            } else {
                if (now > jobCard->jobSpecs.termTime) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
                    else {
                        jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
            continue;
        }


        if (! window_ok (jobCard)
	    && !(jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)) {
	    if (! (jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                || ((jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                          && now - jobCard->windWarnTime >= WARN_TIME)) {


	        jobSuspendAction(jobCard, SIG_SUSP_WINDOW, SUSP_QUEUE_WINDOW, 0);
		continue;

	    }
	} else {

		jobResumeAction(jobCard, SIG_RESUME_WINDOW, SUSP_QUEUE_WINDOW);
                continue;
	}
    }


    if ((myhostnm = ls_getmyhostname()) == NULL) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_MM, fname, "ls_getmyhostname");
        die(SLAVE_FATAL);
    }
    myload = ls_loadofhosts (NULL, 0, EXACT|EFFECTIVE, 0, &myhostnm, 1);
    if (myload == NULL) {
        if (myStatus != NO_LIM)

	    ls_syslog(LOG_INFO, I18N_FUNC_FAIL_MM, fname, "ls_loadofhosts");
	if (lserrno == LSE_LIM_BADHOST)
	    relife();
	if (lserrno == LSE_BAD_XDR)
	    relife();
	if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) {
	    myStatus |= NO_LIM;


            tryChkpntMig();
        }
        last_check = now;
	return;
    } else
	myStatus = 0;



    memcpy ((char *)&savedLoad, (char *)myload, sizeof (struct hostLoad));
    savedLoad.li = (float *) my_malloc (allLsInfo->numIndx * sizeof (float),
				   "job_checking");
    savedLoad.status = (int *) my_malloc
       ((1 + GET_INTNUM(allLsInfo->numIndx)) * sizeof (int), "job_checking");
    for (i = 0; i < allLsInfo->numIndx; i++)
        savedLoad.li[i] = myload->li[i];
    for (i = 0; i < 1 + GET_INTNUM(allLsInfo->numIndx); i++)
        savedLoad.status[i] = myload->status[i];
    tryResume (&savedLoad);

    if (!preempted)
        tryStop (myhostnm, &savedLoad);

    tryChkpntMig();


    FREEUP(savedLoad.li);
    FREEUP(savedLoad.status);
    last_check = now;
    return;

}
Example #5
0
static void
tryStop (char *myhostnm, struct hostLoad *myload)
{
    static char fname[] = "tryStop";
    struct jobCard *jobCard, *next;
    int reasons, subreasons, stopmore = FALSE;
    static int errCount = 0, lastTryStopTime = 0;

    if (now - lastTryStopTime < sbdSleepTime) {

        return;
    }
    lastTryStopTime = now;

    for (jobCard = jobQueHead->forw; jobCard != jobQueHead; jobCard = next) {
        next = jobCard->forw;

	if (jobCard->jobSpecs.numToHosts == 1) {

            if  ((jobCard->jobSpecs.jStatus & JOB_STAT_RUN)
                && (now >= jobCard->jobSpecs.startTime + sbdSleepTime)
                && shouldStop (myload, jobCard, &reasons, &subreasons, 1, &stopmore)) {


    	        jobSuspendAction (jobCard, SIG_SUSP_LOAD, reasons, subreasons);
	        if (stopmore)
                    continue;
                else
	            return;
	    }
	} else {
	    struct hostLoad *load;
            int numh;
            struct nameList *hostList;

            numh = jobCard->jobSpecs.numToHosts;
            hostList = lsb_compressStrList(jobCard->jobSpecs.toHosts, numh);
            numh = hostList->listSize;



           if (hostList->listSize == 1) {
               load = myload;
           } else {
                load = ls_loadofhosts ("-", &numh, EFFECTIVE, 0,
                                    hostList->names, hostList->listSize);
           }

	    if (load == NULL) {
                if (errCount < 3)
		    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_MM, fname,
			      lsb_jobid2str(jobCard->jobSpecs.jobId),
			      "ls_loadofhosts");
		errCount++;
                if (lserrno == LSE_LIM_BADHOST)
                    relife();
		if (lserrno == LSE_BAD_XDR)
	            relife();
		if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT)
		    myStatus |= NO_LIM;
		continue;
	    } else {
                errCount = 0;
		myStatus = 0;
            }

	    if ((jobCard->jobSpecs.jStatus & JOB_STAT_RUN)
                && now >= jobCard->jobSpecs.startTime + sbdSleepTime) {
		if (shouldStop (load, jobCard, &reasons, &subreasons, numh, &stopmore)) {

                    jobSuspendAction (jobCard, SIG_SUSP_LOAD, reasons, subreasons);
                    if (stopmore)
		        break;
                    else
		        return;
		}
            }
        }
    }
    return;

}
Example #6
0
static void
tryResume (struct hostLoad *myload)
{
    char fname[] = "tryResume";
    struct jobCard *jobCard, *next;

    static int errCount = 0, lastTryResumeTime = 0;


    if (now - lastTryResumeTime < sbdSleepTime) {

        return;
    }
    lastTryResumeTime = now;

    for (jobCard = jobQueHead->back; jobCard != jobQueHead; jobCard = next) {
        next = jobCard->back;

	if (!(jobCard->jobSpecs.jStatus & JOB_STAT_SSUSP) ||
	    jobCard->jobSpecs.actPid)
            continue;

        if (jobCard->jobSpecs.numToHosts == 1) {
            if (shouldResume (myload, jobCard, 1)) {


                if (jobResumeAction(jobCard, SIG_RESUME_LOAD, LOAD_REASONS) < 0)
                    continue;
                else
                    return;
            }
	} else {
            int numh;
	    struct hostLoad *load;
            struct nameList *hostList;

	    numh = jobCard->jobSpecs.numToHosts;
            hostList = lsb_compressStrList(jobCard->jobSpecs.toHosts, numh);
            numh = hostList->listSize;
            load = ls_loadofhosts ("-", &numh, EFFECTIVE, 0,
                                   hostList->names, hostList->listSize);

	    if (load == NULL) {
                if (errCount < 3)
		    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
			      lsb_jobid2str(jobCard->jobSpecs.jobId),
			      "ls_loadofhosts");
                errCount++;
		if (lserrno == LSE_LIM_BADHOST)
		    relife();
		if (lserrno == LSE_BAD_XDR)
		    relife();
		if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT)
		    myStatus |= NO_LIM;
		continue;
	    } else {
		myStatus = 0;
                errCount = 0;
            }
            if (!shouldResume (load, jobCard, numh))
                continue;



	    if (jobResumeAction(jobCard, SIG_RESUME_LOAD, LOAD_REASONS) < 0)

		continue;
	    else
	        return;
        }
    }
    return;

}
Example #7
0
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_sigjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSig      jobSig;
    sbdReplyType       reply;
    struct jobReply    jobReply;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct jobCard     *jp = NULL;
    char               found = FALSE;
    int                cc;
    int                sigValue;
    int                savedActReasons;
    int                savedActSubReasons;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
	reply = ERR_BAD_REQ;
	ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");

	goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;

    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }
    if (found == FALSE) {
        reply = ERR_NO_JOB;
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {

        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }


    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        if (isSigTerm(sigValue) == TRUE) {
            if ((cc = jobSigStart (jp, sigValue, jobSig.actFlags, jobSig.chkPeriod, NO_SIGLOG)) < 0)
           	reply = ERR_SIG_RETRY;
    	    else
            	reply = ERR_NO_ERROR;

            goto Reply;
        }

        reply = ERR_SIG_RETRY;

        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {

        if ((cc = jobSigStart(jp,
                              sigValue,
                              jobSig.actFlags,
                              jobSig.chkPeriod,
                              NO_SIGLOG)) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if ((cc = jobSigStart(jp,
                          sigValue,
                          jobSig.actFlags,
                          jobSig.chkPeriod,
                          NO_SIGLOG)) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    sbdlog_newstatus(jp);

Reply1:

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR) {
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        if (reply != ERR_NO_JOB)
            if  ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
                jp->actReasons = savedActReasons;
                jp->actSubReasons = savedActSubReasons;
            }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
                                         "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"), fname, XDR_GETPOS(&xdrs2), lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }
    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);

    return;
}
Example #8
0
void
do_probe(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char         fname[] = "do_probe()";
    char                reply_buf[MSGSIZE];
    XDR                 xdrs2;
    struct LSFHeader    replyHdr;
    struct sbdPackage   sbdPackage;
    struct jobSpecs     *jobSpecs;
    int                 i;
    struct lsfAuth      *auth = NULL;

    if (reqHdr->length == 0)
        return;

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = ERR_NO_ERROR;
    jobSpecs = NULL;

    if (!xdr_sbdPackage(xdrs, &sbdPackage, reqHdr)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_sbdPackage");
        relife();
    } else {
        if (sbdPackage.numJobs) {
            jobSpecs = my_calloc(sbdPackage.numJobs,
                                 sizeof(struct jobSpecs), fname);
            for (i = 0; i < sbdPackage.numJobs; i++) {
                if (!xdr_arrayElement(xdrs, (char *) &(jobSpecs[i]),
                                      reqHdr, xdr_jobSpecs)) {
                    replyHdr.opCode = ERR_BAD_REQ;
                    ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5815,
                                                     "%s: %s(%d) failed for %d jobs"), /* catgets 5815 */
                              fname, "xdr_arrayElement", i, sbdPackage.numJobs);
                    break;
                }
                refreshJob(&(jobSpecs[i]));
                xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs[i], reqHdr);
            }
        }
    }
    if (replyHdr.opCode == ERR_NO_ERROR)
	if (!xdr_sbdPackage1(xdrs, &sbdPackage, reqHdr)) {
	    ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_sbdPackage1");
	    relife();
	}
    if (replyHdr.opCode == ERR_NO_ERROR) {
        if (myStatus & NO_LIM) {
	    replyHdr.opCode = ERR_NO_LIM;
        }
    }
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    if (!xdr_encodeMsg(&xdrs2, NULL, &replyHdr, NULL, 0, auth)) {
	ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_encodeMsg");
	relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
	ls_syslog(LOG_ERR, I18N_FUNC_FAIL_M, fname, "chanWrite_");
    }

    xdr_destroy(&xdrs2);

    if (jobSpecs != NULL)
	free(jobSpecs);


    getManagerId(&sbdPackage);

    mbdPid = sbdPackage.mbdPid;
    sbdSleepTime = sbdPackage.sbdSleepTime;
    retryIntvl = sbdPackage.retryIntvl;
    preemPeriod = sbdPackage.preemPeriod;
    pgSuspIdleT = sbdPackage.pgSuspIdleT;
    maxJobs = sbdPackage.maxJobs;
    uJobLimit = sbdPackage.uJobLimit;
    rusageUpdateRate = sbdPackage.rusageUpdateRate;
    rusageUpdatePercent = sbdPackage.rusageUpdatePercent;
    jobTerminateInterval = sbdPackage.jobTerminateInterval;


    for (i = 0; i < sbdPackage.nAdmins; i++)
	FREEUP(sbdPackage.admins[i]);
    FREEUP(sbdPackage.admins);

    return;
}
Example #9
0
void
do_modifyjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_switchjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSpecs    jobSpecs;
    struct jobReply    jobReply;
    sbdReplyType       reply;
    char               found = FALSE;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct jobCard     *jp;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));

    if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) {
	reply = ERR_BAD_REQ;
	ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs");
	goto sendReply;
    }
    for (jp = jobQueHead->back; jp != jobQueHead; jp = jp->back)
	if (jp->jobSpecs.jobId == jobSpecs.jobId) {
	    found = TRUE;
	    break;
	}
    if (!found) {
	reply = ERR_NO_JOB;
	ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5808,
                                         "%s: mbatchd trying to modify a non-existent job <%s>"), fname, lsb_jobid2str(jobSpecs.jobId)); /* catgets 5808 */
	goto sendReply;
    }
    if (jp->jobSpecs.jStatus & (JOB_STAT_DONE | JOB_STAT_EXIT)) {
	reply = ERR_JOB_FINISH;
	goto sendReply;
    }
    if ((lsbJobCpuLimit != 1) &&
	((jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_maxl
	  != jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_maxl) ||
	 (jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_maxh
	  != jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_maxh) ||
	 (jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_curl
	  != jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_curl) ||
	 (jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_curh
	  != jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_curh)
	    )) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5809, "%s, LSB_JOB_CPULIMIT is not set for the host, job <%s>, CPU limit not modified"), fname, lsb_jobid2str(jobSpecs.jobId));
    } else {
	memcpy((char *) &jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU],
	       (char *) &jobSpecs.lsfLimits[LSF_RLIMIT_CPU],
	       sizeof(struct lsfLimit));
    }
    if ((lsbJobMemLimit != 1) &&
	((jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_maxl
	  != jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_maxl) ||
	 (jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_maxh
	  != jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_maxh) ||
	 (jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_curl
	  != jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_curl) ||
	 (jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_curh
	  != jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_curh)
	    )) {
	ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5810, "%s, LSB_JOB_MEMLIMIT is not set for the host, job <%s>, memory limit not modified"), fname, lsb_jobid2str(jobSpecs.jobId));
    } else {
	memcpy((char *) &jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS],
	       (char *) &jobSpecs.lsfLimits[LSF_RLIMIT_RSS],
	       sizeof(struct lsfLimit));
    }

    memcpy((char *) &jp->jobSpecs.lsfLimits[LSF_RLIMIT_RUN],
	   (char *) &jobSpecs.lsfLimits[LSF_RLIMIT_RUN],
	   sizeof(struct lsfLimit));
    setRunLimit(jp, FALSE);
    if (strcmp(jp->jobSpecs.outFile, jobSpecs.outFile) ||
	!(strcmp(jobSpecs.outFile, "/dev/null")))
    {
	strcpy(jp->jobSpecs.outFile, jobSpecs.outFile);
	if (strcmp(jobSpecs.outFile, "/dev/null") ||
	    (jobSpecs.options & SUB_OUT_FILE)) {
	    jp->jobSpecs.options |= SUB_OUT_FILE;
	}
	else {
	    jp->jobSpecs.options &= ~SUB_OUT_FILE;
	}
    }
    if (strcmp(jp->jobSpecs.errFile, jobSpecs.errFile))
    {
	strcpy(jp->jobSpecs.errFile, jobSpecs.errFile);
	if (!strcmp(jp->jobSpecs.errFile, "/dev/null")
	    && !(jobSpecs.options & SUB_ERR_FILE)) {
            jp->jobSpecs.options &= ~SUB_ERR_FILE;
	}
    }

    if (jobSpecs.options & SUB_RERUNNABLE) {
	jp->jobSpecs.options |= SUB_RERUNNABLE;
    } else {
	jp->jobSpecs.options &= ~SUB_RERUNNABLE;
    }

sendReply:
    xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr);
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR)
	replyStruct = (char *) &jobReply;
    else {
	replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
		  lsb_jobid2str(jp->jobSpecs.jobId),
		  "xdr_jobReply");
	relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
		  lsb_jobid2str(jp->jobSpecs.jobId), "chanWrite_");
    }

    xdr_destroy(&xdrs2);

    return;

}
Example #10
0
void
do_newjob(XDR *xdrs, int chfd, struct LSFHeader *reqHdr)
{
    static char        fname[] = "do_newjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSpecs    jobSpecs;
    struct jobReply    jobReply;
    struct jobCard     *jp;
    sbdReplyType       reply;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));

    if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) {
	reply = ERR_BAD_REQ;
	ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs");
	goto sendReply;
    }


    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId == jobSpecs.jobId) {

	    jobReply.jobId = jp->jobSpecs.jobId;
	    jobReply.jobPid = jp->jobSpecs.jobPid;
	    jobReply.jobPGid = jp->jobSpecs.jobPGid;
	    jobReply.jStatus = jp->jobSpecs.jStatus;
	    reply = ERR_NO_ERROR;
	    goto sendReply;
	}
    }

    jp = calloc(1, sizeof(struct jobCard));
    if (jp == NULL) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSpecs.jobId), "calloc");
	reply = ERR_MEM;
	goto sendReply;
    }

    memcpy((char *) &jp->jobSpecs, (char *) &jobSpecs,
	   sizeof(struct jobSpecs));

    jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
    jp->jobSpecs.startTime = now;
    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;
    /* Initialize the core number
     */
    jp->core_num = -1;

    if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
	if (lockHosts (jp) < 0) {
	    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                      lsb_jobid2str(jp->jobSpecs.jobId), "lockHosts");
            unlockHosts (jp, jp->jobSpecs.numToHosts);
	    reply = ERR_LOCK_FAIL;
	    freeWeek(jp->week);
	    FREEUP(jp);
	    goto sendReply;
        }
    }
    jp->runTime = 0;
    if (initJobCard(jp, &jobSpecs, (int *)&reply) < 0) {

	if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
	    unlockHosts (jp, jp->jobSpecs.numToHosts);
	}
	FREEUP(jp);
	goto sendReply;
    }

    jp->execJobFlag = 0;

    if (jp->runTime < 0) {
        jp->runTime = 0;
    }
    jp->execGid = 0;
    jp->execUsername[0] = '\0';
    jp->jobSpecs.execUid = -1;
    jp->jobSpecs.execUsername[0] = '\0';

    if (jp->jobSpecs.jobSpoolDir[0] != '\0') {
        char *tmp;

        if ((tmp = getUnixSpoolDir (jp->jobSpecs.jobSpoolDir)) == NULL) {

            jp->jobSpecs.jobSpoolDir[0] = '\0';
        }
    }

    if ((logclass & LC_TRACE) && jp->jobSpecs.jobSpoolDir[0] != 0) {
        ls_syslog(LOG_DEBUG,
                  "%s: the SpoolDir for  job <%s>  is %s \n",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.jobSpoolDir);
    }
    if (jp->jobSpecs.options & SUB_PRE_EXEC)
	SBD_SET_STATE(jp, (JOB_STAT_RUN | JOB_STAT_PRE_EXEC))
        else
            SBD_SET_STATE(jp, JOB_STAT_RUN);

    reply = job_exec(jp, chfd);

    if (reply != ERR_NO_ERROR) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "job_exec");
	if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
            unlockHosts (jp, jp->jobSpecs.numToHosts);
	}
	deallocJobCard(jp);
    } else {
	jobReply.jobId = jp->jobSpecs.jobId;
	jobReply.jobPid = jp->jobSpecs.jobPid;
	jobReply.jobPGid = jp->jobSpecs.jobPGid;
	jobReply.jStatus = jp->jobSpecs.jStatus;
    }


sendReply:
    xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr);
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    replyStruct = (reply == ERR_NO_ERROR) ? (char *) &jobReply : (char *) NULL;
    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
	ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobReply");
	lsb_merr(_i18n_msg_get(ls_catd , NL_SETN, 5804,
			       "Fatal error: xdr_jobReply() failed; sbatchd relifing")); /* catgets 5804 */
	relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
	ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5805,
					 "%s: Sending jobReply (len=%d) to master failed: %m"), /* catgets 5805 */
		  fname, XDR_GETPOS(&xdrs2));
    }

    xdr_destroy(&xdrs2);


    if (reply == ERR_NO_ERROR && !daemonParams[LSB_BSUBI_OLD].paramValue &&
	PURE_INTERACTIVE(&jp->jobSpecs)) {
  	if (status_job (BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
		        ERR_NO_ERROR) < 0) {
            jp->notReported++;
	}
    }

}
Example #11
0
void
do_switchjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_switchjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSpecs    jobSpecs;
    struct jobReply    jobReply;
    int                i;
    sbdReplyType       reply;
    char               *cp;
    char               *word;
    char               found = FALSE;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct jobCard     *jp;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));

    if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs");
        goto sendReply;
    }
    for (jp = jobQueHead->back; jp != jobQueHead; jp = jp->back) {
        if (jp->jobSpecs.jobId == jobSpecs.jobId) {
            found = TRUE;
            break;
        }
    }
    if (!found) {
        reply = ERR_NO_JOB;
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5807,
                                         "%s: mbatchd trying to switch a non-existent job <%s>"), fname, lsb_jobid2str(jobSpecs.jobId)); /* catgets 5807 */
        goto sendReply;
    }
    if (jp->jobSpecs.jStatus & (JOB_STAT_DONE | JOB_STAT_EXIT)) {
        reply = ERR_JOB_FINISH;
        goto sendReply;
    }


    cp = jobSpecs.windows;
    freeWeek(jp->week);
    while ((word = getNextWord_(&cp)) != NULL) {
        if (addWindow(word, jp->week, "switchJob jobSpecs") < 0) {
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S_M, fname,
                      lsb_jobid2str(jp->jobSpecs.jobId), "addWindow", word);
            freeWeek(jp->week);
            reply = ERR_BAD_REQ;
            goto sendReply;
        }
    }
    jp->windEdge = now;


    if ((jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE)
	&& !(jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE))
	for (i = 0; i < jp->jobSpecs.numToHosts; i++)
	    if (unlockHost_(jp->jobSpecs.toHosts[i]) < 0
		&& lserrno != LSE_LIM_NLOCKED)
		ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S_MM, fname,
                          lsb_jobid2str(jp->jobSpecs.jobId), "unlockHost_", jp->jobSpecs.toHosts[i]);



    strcpy(jp->jobSpecs.queue, jobSpecs.queue);
    strcpy(jp->jobSpecs.windows, jobSpecs.windows);
    jp->jobSpecs.priority = jobSpecs.priority;
    jp->jobSpecs.nice = jobSpecs.nice;
    jp->jobSpecs.jAttrib = jobSpecs.jAttrib;

    freeThresholds (&jp->jobSpecs.thresholds);
    saveThresholds (&jp->jobSpecs, &jobSpecs.thresholds);


    memcpy((char *) &jp->jobSpecs.lsfLimits[LSF_RLIMIT_RUN],
	   (char *) &jobSpecs.lsfLimits[LSF_RLIMIT_RUN],
	   sizeof(struct lsfLimit));


    strcpy (jp->jobSpecs.requeueEValues, jobSpecs.requeueEValues);
    strcpy (jp->jobSpecs.resumeCond, jobSpecs.resumeCond);
    strcpy (jp->jobSpecs.stopCond, jobSpecs.stopCond);

    lsbFreeResVal (&jp->resumeCondVal);
    if (jobSpecs.resumeCond && jobSpecs.resumeCond[0] != '\0') {
        if ((jp->resumeCondVal = checkThresholdCond (jobSpecs.resumeCond))
            == NULL)
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
		      lsb_jobid2str(jp->jobSpecs.jobId),
		      "checkThresholdCond", jobSpecs.resumeCond);
    }

    lsbFreeResVal (&jp->stopCondVal);
    if (jobSpecs.stopCond && jobSpecs.stopCond[0] != '\0') {
        if ((jp->stopCondVal = checkThresholdCond (jobSpecs.stopCond))
            == NULL)
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
		      lsb_jobid2str(jp->jobSpecs.jobId),
		      "checkThresholdCond", jobSpecs.stopCond);
    }

    if (jobSpecs.options & SUB_LOGIN_SHELL) {
	FREEUP (jp->jobSpecs.loginShell);
	jp->jobSpecs.loginShell = safeSave (jobSpecs.loginShell);
    }

    strcpy (jp->jobSpecs.suspendActCmd, jobSpecs.suspendActCmd);
    strcpy (jp->jobSpecs.resumeActCmd, jobSpecs.resumeActCmd);
    strcpy (jp->jobSpecs.terminateActCmd, jobSpecs.terminateActCmd);

    setRunLimit (jp, FALSE);
    offList ((struct listEntry *)jp);
    inJobLink (jp);

    if (reniceJob(jp) < 0)
	ls_syslog(LOG_DEBUG, "%s: renice job <%s> failed",
		  fname, lsb_jobid2str(jp->jobSpecs.jobId));

    reply = ERR_NO_ERROR;
    jobReply.jobId = jp->jobSpecs.jobId;
    jobReply.jobPid = jp->jobSpecs.jobPid;
    jobReply.jobPGid = jp->jobSpecs.jobPGid;
    jobReply.jStatus = jp->jobSpecs.jStatus;

sendReply:
    xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr);
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR)
	replyStruct = (char *) &jobReply;
    else {
	replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
		  lsb_jobid2str(jp->jobSpecs.jobId),
		  "xdr_jobReply");
	relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
		  lsb_jobid2str(jp->jobSpecs.jobId), "chanWrite_");
    }

    xdr_destroy(&xdrs2);

    return;

}