示例#1
0
/*
 * job_resume - continue a job with SIGCONT and report it as running.
 *
 * Returns 0 without doing anything when jobSpecs.actPid is non-zero,
 * -1 when jobsig() fails to deliver SIGCONT, and 0 otherwise.  A failed
 * status report does not fail the call; it is only counted in
 * jp->notReported.
 */
int
job_resume (struct jobCard *jp)
{
    static char fname[] = "job_resume";
    int reportRc;

    /* Nothing is signalled while actPid is set. */
    if (jp->jobSpecs.actPid != 0) {
        return 0;
    }

    if (jobsig(jp, SIGCONT, FALSE) < 0) {
        return -1;
    }

    /* The job is running again: clear every suspend reason. */
    SBD_SET_STATE(jp, JOB_STAT_RUN);
    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;

    reportRc = status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                          ERR_NO_ERROR);
    if (reportRc < 0) {
        /* Count the report that did not get through. */
        jp->notReported++;
    } else if (jp->notReported > 0) {
        jp->notReported = 0;
    }

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Resume job %s",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId));
    }

    return 0;
}
示例#2
0
/*
 * jobResumeAction - start the resume action for a suspended job.
 *
 * jp          job to resume.
 * sigValue    signal/action value handed to jobSigStart().
 * suspReason  suspend-reason bit(s) that this resume request addresses.
 *
 * Returns -1 when the job carries SUSP_MBD_LOCK, when none of the
 * suspReason bits is set in the job's reasons, or when both jobSigStart()
 * and the follow-up jobsig(jp, 0, ...) probe fail (the job is then put
 * into JOB_STAT_EXIT).  Returns 0 when actPid is already set or after the
 * new status has been logged.
 */
static int
jobResumeAction (struct jobCard *jp, int sigValue, int suspReason)
{
    static char fname[] = "jobResumeAction";

    if (jp->jobSpecs.reasons & SUSP_MBD_LOCK) {
        return -1;
    }

    if (jp->jobSpecs.actPid) {
        return 0;
    }

    /* The job is not suspended for the reason being lifted. */
    if ((jp->jobSpecs.reasons & suspReason) == 0) {
        return -1;
    }

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Try to resume job %s with the current reason %d and the triggered reason %d;",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.reasons, suspReason);
    }

    /*
     * jobsig() with signal 0 is only attempted when jobSigStart() failed;
     * the job is marked as exited only when that probe fails as well.
     */
    if (jobSigStart(jp, sigValue, 0, 0, SIGLOG) < 0
        && jobsig(jp, 0, FALSE) < 0) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        return -1;
    }

    sbdlog_newstatus(jp);
    return 0;
}
示例#3
0
/*
 * window_ok - re-evaluate whether a job is inside its run window.
 *
 * The cached jobPtr->active flag is returned unchanged while
 * jobPtr->windEdge is zero or still later than the check time.  Otherwise
 * the windows configured for the current day are re-scanned through
 * checkWindow(), which updates jobPtr->active and jobPtr->windEdge.
 *
 * For jobs submitted with SUB_WINDOW_SIG that are currently active, the
 * check time is moved WARN_TIME ahead.  When such a job goes from active
 * to inactive, and at least WARN_TIME has passed since the last warning,
 * the job is resumed if it is not running and then sent its sigValue
 * signal.
 *
 * Returns the (possibly updated) jobPtr->active flag.
 */
char
window_ok (struct jobCard *jobPtr)
{
  struct dayhour dayhour;
  windows_t *win;
  time_t now = time (0);
  time_t checkTime = now;
  char wasActive = jobPtr->active;

  if (wasActive && (jobPtr->jobSpecs.options & SUB_WINDOW_SIG))
    {
      checkTime = now + WARN_TIME;
    }

  /* No edge recorded, or the recorded edge has not been reached yet. */
  if (jobPtr->windEdge == 0 || jobPtr->windEdge > checkTime)
    {
      return (jobPtr->active);
    }

  getDayHour (&dayhour, checkTime);

  /* Default edge: the end of the current day. */
  jobPtr->windEdge = now + (24.0 - dayhour.hour) * 3600.0;

  if (jobPtr->week[dayhour.day] == NULL)
    {
      /* No window list for this day: the job is active. */
      jobPtr->active = TRUE;
      return (jobPtr->active);
    }

  jobPtr->active = FALSE;
  win = jobPtr->week[dayhour.day];
  while (win != NULL)
    {
      checkWindow (&dayhour, &jobPtr->active, &jobPtr->windEdge, win, now);
      win = win->nextwind;
    }

  if (wasActive
      && !jobPtr->active
      && now - jobPtr->windWarnTime >= WARN_TIME
      && (jobPtr->jobSpecs.options & SUB_WINDOW_SIG))
    {
      /* Resume a job that is not running before signalling it. */
      if ((jobPtr->jobSpecs.jStatus & JOB_STAT_RUN) == 0)
        {
          job_resume (jobPtr);
        }
      jobsig (jobPtr, sig_decode (jobPtr->jobSpecs.sigValue), TRUE);
      jobPtr->windWarnTime = now;
    }

  return (jobPtr->active);
}
示例#4
0
/*
 * checkFinish - walk the job queue and finalize jobs that have ended.
 *
 * For every job on the jobQueHead list:
 *   - a job that is neither finished nor post-finished is passed to
 *     jobGone() when the jobsig(job, 0, ...) probe fails, or when it
 *     carries JOB_FORCE_KILL and its termTime lies more than
 *     MAX(6, jobTerminateInterval * 3) seconds in the past;
 *   - a job with an action pid is skipped as long as SIGCONT can still be
 *     delivered to that pid's process group, to the pid itself, or to the
 *     cleanup pid; otherwise sigActEnd() is called for it;
 *   - any remaining job that is finished, post-finished or pending is
 *     handed to job_finish().
 */
void
checkFinish (void)
{
    struct jobCard *jp;
    struct jobCard *next;

    for (jp = jobQueHead->forw; jp != jobQueHead; jp = next) {
        /* Fetch the successor first; the calls below may alter the list. */
        next = jp->forw;

        if (!(IS_FINISH(jp->jobSpecs.jStatus)
              || IS_POST_FINISH(jp->jobSpecs.jStatus))) {
            if (jobsig(jp, 0, FALSE) < 0
                || ((jp->jobSpecs.jAttrib & JOB_FORCE_KILL)
                    && (jp->jobSpecs.termTime
                        < time(0) - MAX(6, jobTerminateInterval * 3)))) {
                jobGone(jp);
            }
        }

        if (jp->jobSpecs.actPid) {
            /* The first target that accepts SIGCONT ends the probing. */
            if (killpg(jp->jobSpecs.actPid, SIGCONT) == 0
                || kill(jp->jobSpecs.actPid, SIGCONT) == 0
                || (jp->cleanupPid > 0
                    && kill(jp->cleanupPid, SIGCONT) == 0)) {
                continue;
            }

            sigActEnd(jp);
            continue;
        }

        if (IS_FINISH(jp->jobSpecs.jStatus)
            || IS_POST_FINISH(jp->jobSpecs.jStatus)
            || (jp->jobSpecs.jStatus & JOB_STAT_PEND)) {
            job_finish(jp, TRUE);
        }
    }
}
示例#5
0
/*
 * status_job - report a job's status (or resource usage) to mbatchd.
 *
 * reqType    request opcode stored in the message header.  For
 *            BATCH_RUSAGE_JOB the request is sent without waiting for a
 *            reply.
 * jp         job being reported; several of its fields are updated here
 *            (userJobSucc, runRusage, notReported, needReportRU,
 *            lastStatusMbdTime, and on some replies jStatus/reasons).
 * newStatus  status value placed in the request.
 * err        sbatchd reply code placed in the request.
 *
 * Returns 0 when the report was accepted or deliberately skipped
 * (post-finish report of an unsuccessful job, job already marked as
 * forgotten, LSBE_NO_ERROR / LSBE_LOCK_JOB / LSBE_NO_JOB replies).
 * Returns -1 when there is no master host, memory or encoding fails,
 * mbatchd cannot be reached, the channel looks broken, or mbatchd answers
 * with LSBE_STOP_JOB, LSBE_SBATCHD or an unknown code.
 */
int
status_job (mbdReqType reqType,
            struct jobCard *jp, int newStatus, sbdReplyType err)
{
  static char fname[] = "status_job()";
  static int seq = 1;
  static char lastHost[MAXHOSTNAMELEN];
  int reply;
  char *request_buf;
  char *reply_buf = NULL;
  XDR xdrs;
  struct LSFHeader hdr;
  int cc;
  struct statusReq statusReq;
  int flags;
  int i;
  int len;
  struct lsfAuth *auth = NULL;

  if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL))
    ls_syslog (LOG_DEBUG, "%s: Entering ... regType %d jobId %s",
               fname, reqType, lsb_jobid2str (jp->jobSpecs.jobId));

  /* Remember whether the user job itself succeeded. */
  if (newStatus == JOB_STAT_EXIT)
    {
      jp->userJobSucc = FALSE;
    }

  if (MASK_STATUS (newStatus) == JOB_STAT_DONE)
    {
      jp->userJobSucc = TRUE;
    }

  /* Post-finish states are only reported for successful user jobs. */
  if (IS_POST_FINISH (newStatus))
    {
      if (jp->userJobSucc != TRUE)
        {
          return 0;
        }
    }

  if (masterHost == NULL)
    return -1;

  /* A negative notReported marks a job mbatchd no longer knows about. */
  if (jp->notReported < 0)
    {
      jp->notReported = -INFINIT_INT;
      return (0);
    }

  statusReq.jobId = jp->jobSpecs.jobId;
  statusReq.actPid = jp->jobSpecs.actPid;
  statusReq.jobPid = jp->jobSpecs.jobPid;
  statusReq.jobPGid = jp->jobSpecs.jobPGid;
  statusReq.newStatus = newStatus;
  statusReq.reason = jp->jobSpecs.reasons;
  statusReq.subreasons = jp->jobSpecs.subreasons;
  statusReq.sbdReply = err;
  statusReq.lsfRusage = jp->lsfRusage;
  statusReq.execUid = jp->jobSpecs.execUid;
  statusReq.numExecHosts = 0;
  statusReq.execHosts = NULL;
  statusReq.exitStatus = jp->w_status;
  statusReq.execCwd = jp->jobSpecs.execCwd;
  statusReq.execHome = jp->jobSpecs.execHome;
  statusReq.execUsername = jp->execUsername;
  statusReq.queuePostCmd = "";
  statusReq.queuePreCmd = "";
  statusReq.msgId = jp->delieveredMsgId;

  /* A finished job reports the larger of its current and peak usage. */
  if (IS_FINISH (newStatus))
    {
      if (jp->maxRusage.mem > jp->runRusage.mem)
        jp->runRusage.mem = jp->maxRusage.mem;
      if (jp->maxRusage.swap > jp->runRusage.swap)
        jp->runRusage.swap = jp->maxRusage.swap;
      if (jp->maxRusage.stime > jp->runRusage.stime)
        jp->runRusage.stime = jp->maxRusage.stime;
      if (jp->maxRusage.utime > jp->runRusage.utime)
        jp->runRusage.utime = jp->maxRusage.utime;
    }
  statusReq.runRusage.mem = jp->runRusage.mem;
  statusReq.runRusage.swap = jp->runRusage.swap;
  statusReq.runRusage.utime = jp->runRusage.utime;
  statusReq.runRusage.stime = jp->runRusage.stime;
  statusReq.runRusage.npids = jp->runRusage.npids;
  statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
  statusReq.runRusage.npgids = jp->runRusage.npgids;
  statusReq.runRusage.pgid = jp->runRusage.pgid;
  statusReq.actStatus = jp->actStatus;
  statusReq.sigValue = jp->jobSpecs.actValue;

  /* Sequence numbers wrap back to 1 before reaching MAX_SEQ_NUM. */
  statusReq.seq = seq;
  seq++;
  if (seq >= MAX_SEQ_NUM)
    seq = 1;

  /* Size the encode buffer: fixed part, strings, pid and pgid arrays. */
  len = 1024 + ALIGNWORD_ (sizeof (struct statusReq));

  len += ALIGNWORD_ (strlen (statusReq.execHome)) + 4 +
    ALIGNWORD_ (strlen (statusReq.execCwd)) + 4 +
    ALIGNWORD_ (strlen (statusReq.execUsername)) + 4;

  for (i = 0; i < statusReq.runRusage.npids; i++)
    len += ALIGNWORD_ (sizeof (struct pidInfo)) + 4;

  for (i = 0; i < statusReq.runRusage.npgids; i++)
    len += ALIGNWORD_ (sizeof (int)) + 4;

  if (logclass & (LC_TRACE | LC_COMM))
    ls_syslog (LOG_DEBUG, "%s: The length of the job message is: <%d>", fname,
               len);

  if ((request_buf = malloc (len)) == NULL)
    {
      ls_syslog (LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
      return (-1);
    }

  xdrmem_create (&xdrs, request_buf, len, XDR_ENCODE);
  initLSFHeader_ (&hdr);
  hdr.opCode = reqType;

  if (!xdr_encodeMsg (&xdrs, (char *) &statusReq, &hdr, xdr_statusReq, 0,
                      auth))
    {
      ls_syslog (LOG_ERR, I18N_JOB_FAIL_S_M,
                 fname, lsb_jobid2str (jp->jobSpecs.jobId), "xdr_statusReq");
      lsb_merr2 (I18N_FUNC_FAIL, fname, "xdr_statusReq");
      xdr_destroy (&xdrs);
      FREEUP (request_buf);
      relife ();
      /*
       * NOTE(review): relife() presumably does not return in the normal
       * case - confirm.  If it does return, the stream is destroyed and
       * the buffer is freed, so nothing may be sent from here.
       */
      return (-1);
    }

  flags = CALL_SERVER_NO_HANDSHAKE;
  if (statusChan >= 0)
    flags |= CALL_SERVER_USE_SOCKET;

  if (reqType == BATCH_RUSAGE_JOB)
    flags |= CALL_SERVER_NO_WAIT_REPLY;

  if (logclass & LC_COMM)
    ls_syslog (LOG_DEBUG1, "%s: before call_server statusChan=%d flags=%d",
               fname, statusChan, flags);

  cc = call_server (masterHost,
                    mbd_port,
                    request_buf,
                    XDR_GETPOS (&xdrs),
                    &reply_buf,
                    &hdr,
                    connTimeout, readTimeout, &statusChan, NULL, NULL, flags);
  if (cc < 0)
    {
      statusChan = -1;
      /* lastHost limits the failure log to once per unreachable master. */
      if (!equalHost_ (masterHost, lastHost))
        {
          if (errno != EINTR)
            ls_syslog (LOG_DEBUG,
                       "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s",
                       fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId),
                       lsb_sysmsg ());
          /* Bounded copy: lastHost is a fixed MAXHOSTNAMELEN buffer. */
          strncpy (lastHost, masterHost, sizeof (lastHost) - 1);
          lastHost[sizeof (lastHost) - 1] = '\0';
        }
      xdr_destroy (&xdrs);
      FREEUP (request_buf);
      failcnt++;
      return (-1);
    }

  failcnt = 0;
  lastHost[0] = '\0';
  xdr_destroy (&xdrs);
  FREEUP (request_buf);

  /* A positive return means call_server() handed back a reply buffer. */
  if (cc)
    free (reply_buf);

  if (flags & CALL_SERVER_NO_WAIT_REPLY)
    {
      /* No reply is read; only poll the channel without blocking. */
      struct timeval timeval;

      timeval.tv_sec = 0;
      timeval.tv_usec = 0;

      if (rd_select_ (chanSock_ (statusChan), &timeval) == 0)
        {
          jp->needReportRU = FALSE;
          jp->lastStatusMbdTime = now;
          return 0;
        }

      CLOSECD (statusChan);

      if (logclass & LC_COMM)
        ls_syslog (LOG_DEBUG1,
                   "%s: Job <%s> rd_select() failed, assume connection broken",
                   fname, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    }

  /* The reply code comes back in the header's opCode field. */
  reply = hdr.opCode;
  switch (reply)
    {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
      jp->needReportRU = FALSE;
      jp->lastStatusMbdTime = now;
      if (reply == LSBE_LOCK_JOB)
        {
          /* The lock request is honoured only for a suspended job. */
          if (IS_SUSP (jp->jobSpecs.jStatus))
            jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
          else
            ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5204, "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."),	/* catgets 5204 */
                       fname,
                       lsb_jobid2str (jp->jobSpecs.jobId),
                       jp->jobSpecs.jStatus);
        }
      return (0);
    case LSBE_NO_JOB:
      if (!IS_POST_FINISH (jp->jobSpecs.jStatus))
        {
          ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5205, "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."), fname, lsb_jobid2str (jp->jobSpecs.jobId), masterHost);	/* catgets 5205 */
        }

      /* Mark the job so that later calls return early without sending. */
      jp->notReported = -INFINIT_INT;
      return (0);
    case LSBE_STOP_JOB:
      if (jobsig (jp, SIGSTOP, TRUE) < 0)
        SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_EXIT);
      else
        {
          SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_USUSP);
          jp->jobSpecs.reasons |= SUSP_USER_STOP;
        }
      return (-1);
    case LSBE_SBATCHD:
      ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5206, "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"),	/* catgets 5206 */
                 fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    default:
      ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5207, "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"),	/* catgets 5207 */
                 fname,
                 reply, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    }
}
示例#6
0
void
job_checking (void)
{
    static char fname[] = "job_checking";
    struct jobCard *jobCard, *nextJob;
    struct hostLoad *myload, savedLoad;
    char *myhostnm;
    static time_t last_check;
    char preempted = FALSE;
    int i;

    if (last_check == 0)
	last_check = now;
    if (jobcnt <= 0) {
        last_check = now;
        return;
    }

    checkFinish ();

    for (jobCard = jobQueHead->forw; (jobCard != jobQueHead);
         jobCard = nextJob) {

	nextJob = jobCard->forw;
        if (IS_FINISH(jobCard->jobSpecs.jStatus)
              || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND))
            continue;

	ruLimits(jobCard);

	if (IS_RUN_JOB_CMD(jobCard->jobSpecs.jStatus)) {

	    jobCard->runTime += (int) (now - last_check);
	}
	if (jobCard->runTime >
	    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {
            if ((jobCard->jobSpecs.terminateActCmd == NULL)
                || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) {
	        if (jobCard->runTime >
		    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl
		    + WARN_TIME && jobCard->timeExpire) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
		    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
			continue;
                    else {

                        ls_syslog(LOG_INFO, \
                                  "%s: warning period expired killing the job=%d",
			    fname, jobCard->jobSpecs.jobId);
                        jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
			jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
	        } else if (!jobCard->timeExpire) {
		    ls_syslog(LOG_INFO, I18N(5704,
                        "%s: sending warning signal to job=%d"), /* catgets 5704 */
			fname, jobCard->jobSpecs.jobId);
		    jobsig(jobCard, SIGUSR2, FALSE);
		    jobCard->timeExpire = TRUE;
	        }
            } else {
                if (jobCard->runTime >
                    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
                    else {
                        jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
	    continue;
	}

        if (jobCard->jobSpecs.termTime && now > jobCard->jobSpecs.termTime

             && !(jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)) {
            if ((jobCard->jobSpecs.terminateActCmd == NULL)
                 || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) {
                if (now > jobCard->jobSpecs.termTime + WARN_TIME
                                                   && jobCard->timeExpire) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
		    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
			continue;
                    else {

                        jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
			jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
                } else
		    if (!jobCard->timeExpire) {
		        jobsig(jobCard, SIGUSR2, FALSE);
		        jobCard->timeExpire = TRUE;
		    }
            } else {
                if (now > jobCard->jobSpecs.termTime) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
                    else {
                        jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
            continue;
        }


        if (! window_ok (jobCard)
	    && !(jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)) {
	    if (! (jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                || ((jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                          && now - jobCard->windWarnTime >= WARN_TIME)) {


	        jobSuspendAction(jobCard, SIG_SUSP_WINDOW, SUSP_QUEUE_WINDOW, 0);
		continue;

	    }
	} else {

		jobResumeAction(jobCard, SIG_RESUME_WINDOW, SUSP_QUEUE_WINDOW);
                continue;
	}
    }


    if ((myhostnm = ls_getmyhostname()) == NULL) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_MM, fname, "ls_getmyhostname");
        die(SLAVE_FATAL);
    }
    myload = ls_loadofhosts (NULL, 0, EXACT|EFFECTIVE, 0, &myhostnm, 1);
    if (myload == NULL) {
        if (myStatus != NO_LIM)

	    ls_syslog(LOG_INFO, I18N_FUNC_FAIL_MM, fname, "ls_loadofhosts");
	if (lserrno == LSE_LIM_BADHOST)
	    relife();
	if (lserrno == LSE_BAD_XDR)
	    relife();
	if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) {
	    myStatus |= NO_LIM;


            tryChkpntMig();
        }
        last_check = now;
	return;
    } else
	myStatus = 0;



    memcpy ((char *)&savedLoad, (char *)myload, sizeof (struct hostLoad));
    savedLoad.li = (float *) my_malloc (allLsInfo->numIndx * sizeof (float),
				   "job_checking");
    savedLoad.status = (int *) my_malloc
       ((1 + GET_INTNUM(allLsInfo->numIndx)) * sizeof (int), "job_checking");
    for (i = 0; i < allLsInfo->numIndx; i++)
        savedLoad.li[i] = myload->li[i];
    for (i = 0; i < 1 + GET_INTNUM(allLsInfo->numIndx); i++)
        savedLoad.status[i] = myload->status[i];
    tryResume (&savedLoad);

    if (!preempted)
        tryStop (myhostnm, &savedLoad);

    tryChkpntMig();


    FREEUP(savedLoad.li);
    FREEUP(savedLoad.status);
    last_check = now;
    return;

}
示例#7
0
/*
 * ruLimits - enforce a job's resource limits against its measured usage.
 *
 * Four limits are checked in turn, each decoded with rlimitDecode_():
 *   CPU      (only when lsbJobCpuLimit != 0): utime + stime above the
 *            limit starts SIG_TERM_CPULIMIT and sets JOB_STAT_KILL,
 *            unless JOB_STAT_KILL is already set;
 *   SWAP     runRusage.swap above rlim_cur / 1024 sends SIGQUIT followed
 *            by SIGKILL;
 *   PROCESS  npids above the limit + 2 starts SIG_TERM_PROCESSLIMIT,
 *            unless the job is already suspended for that reason, in
 *            which case the function returns without checking memory;
 *   RSS      (only when lsbJobMemLimit == 1, or it is non-zero with
 *            lsbMemEnforce set): runRusage.mem above rlim_cur / 1024
 *            starts SIG_TERM_MEMLIMIT and sets JOB_STAT_KILL, unless
 *            JOB_STAT_KILL is already set.
 */
static void
ruLimits(struct jobCard *jobCard)
{
    struct rlimit rlimit;

    /* ---- CPU time ---- */
    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_CPU],
                  &rlimit, LSF_RLIMIT_CPU);

    if (rlimit.rlim_cur != RLIM_INFINITY
        && lsbJobCpuLimit != 0
        && (long)rlimit.rlim_cur < ((long)jobCard->runRusage.utime +
                                    (long)jobCard->runRusage.stime)
        && !(jobCard->jobSpecs.jStatus & JOB_STAT_KILL)) {
        jobSigStart (jobCard, SIG_TERM_CPULIMIT, 0, 0, SIGLOG);
        sbdlog_newstatus(jobCard);
        jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
    }

    /* ---- swap ---- */
    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_SWAP],
                  &rlimit, LSF_RLIMIT_SWAP);

    if (rlimit.rlim_cur != RLIM_INFINITY
        && (long)(rlimit.rlim_cur / 1024) < (long)jobCard->runRusage.swap) {
        jobsig(jobCard, SIGQUIT, FALSE);
        jobsig(jobCard, SIGKILL, TRUE);
    }

    /* ---- number of processes ---- */
    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_PROCESS],
                  &rlimit, LSF_RLIMIT_PROCESS);

    if (rlimit.rlim_cur != RLIM_INFINITY
        && (int)rlimit.rlim_cur + 2 < jobCard->runRusage.npids) {
        /* Already suspended for this limit: stop checking altogether. */
        if ((IS_SUSP (jobCard->jobSpecs.jStatus))
            && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
            && (jobCard->jobSpecs.subreasons & SUB_REASON_PROCESSLIMIT)) {
            return;
        }
        jobSigStart (jobCard, SIG_TERM_PROCESSLIMIT, 0, 0, SIGLOG);
        sbdlog_newstatus(jobCard);
    }

    /* ---- resident memory ---- */
    if (!((lsbJobMemLimit == 1)
          || (lsbJobMemLimit != 0 && lsbMemEnforce == TRUE))) {
        return;
    }

    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RSS],
                  &rlimit, LSF_RLIMIT_RSS);

    if (rlimit.rlim_cur != RLIM_INFINITY
        && (long)(rlimit.rlim_cur / 1024) < (long)jobCard->runRusage.mem
        && !(jobCard->jobSpecs.jStatus & JOB_STAT_KILL)) {
        jobSigStart (jobCard, SIG_TERM_MEMLIMIT, 0, 0, SIGLOG);
        sbdlog_newstatus(jobCard);
        jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
    }
}
示例#8
0
/*
 * chkpntEnd - finish a checkpoint action and report the result to mbatchd.
 *
 * jobCard   job whose checkpoint action ended.
 * w_status  exit status of the action; 0 is treated as success.
 * freed     set to TRUE when the job card was deallocated here; it is not
 *           written on any other path.
 *
 * For a migrating job (JOB_STAT_MIG) whose checkpoint succeeded, the
 * function returns early several times before the job is finally set to
 * JOB_STAT_PEND: once to mark the job as missing and request a
 * checkFinish pass, while notReported is still 0, and once when a cleanup
 * process was started.  A failed checkpoint just clears JOB_STAT_MIG.
 * If the status report fails, actPid and jStatus are restored to their
 * saved values.
 */
static void
chkpntEnd (struct jobCard *jobCard, int w_status, bool_t *freed)
{
    static char fname[] = "chkpntEnd()";
    int oldPid, oldStatus;

    /* A suspended job that is not migrating is stopped again. */
    if (IS_SUSP(jobCard->jobSpecs.jStatus)
        && (jobCard->jobSpecs.jStatus & JOB_STAT_MIG) == 0) {
        jobsig(jobCard, SIGSTOP, TRUE);
    }

    oldStatus = jobCard->jobSpecs.jStatus;

    if (jobCard->jobSpecs.jStatus & JOB_STAT_MIG) {
        if (w_status != 0) {
            jobCard->jobSpecs.jStatus &= ~JOB_STAT_MIG;
        } else {
            if (!jobCard->missing) {
                jobCard->missing = TRUE;
                need_checkfinish = TRUE;
                return;
            }
            if (jobCard->notReported == 0) {
                return;
            }

            if (jobCard->cleanupPid == 0) {
                jobCard->cleanupPid = rmJobBufFilesPid(jobCard);
                if (jobCard->cleanupPid > 0) {
                    return;
                }

                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5709,
                    "%s: Unable to cleanup migrating job <%s>"), /* catgets 5709 */
                    fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
            }

            SBD_SET_STATE(jobCard, JOB_STAT_PEND);
        }
    }

    oldPid = jobCard->jobSpecs.actPid;

    if (status_job (BATCH_STATUS_JOB, jobCard, jobCard->jobSpecs.jStatus,
                    w_status == 0 ? ERR_NO_ERROR : ERR_SYSACT_FAIL) < 0) {
        /* The report failed: put back the saved pid and status. */
        jobCard->jobSpecs.actPid = oldPid;
        jobCard->jobSpecs.jStatus = oldStatus;
        return;
    }

    /* The report went through: the action is over. */
    jobCard->lastChkpntTime = now;
    jobCard->jobSpecs.actPid = 0;
    jobCard->actStatus = ACT_NO;
    jobCard->jobSpecs.actValue = SIG_NULL;

    if (w_status == 0) {
        jobCard->migCnt = 1;
        if (oldStatus & JOB_STAT_MIG) {
            /* Migration checkpoint succeeded: the job leaves this host. */
            cleanupMigJob(jobCard);
            deallocJobCard(jobCard);
            *freed = TRUE;
        }
    } else if (oldStatus & JOB_STAT_MIG) {
        /* Migration checkpoint failed: double the migration counter. */
        jobCard->migCnt *= 2;
    }
}