Esempio n. 1
0
/*
 * checkFinish - make one pass over the job queue and deal with jobs and
 * signal-action processes that have ended.
 *
 * For every job card linked on jobQueHead:
 *   1. A job that is neither FINISH nor POST_FINISH is handed to jobGone()
 *      when jobsig(job, 0, FALSE) fails, or when the job carries
 *      JOB_FORCE_KILL and its termTime is more than
 *      MAX(6, jobTerminateInterval * 3) seconds in the past.
 *   2. If an action pid is recorded, SIGCONT is tried on its process
 *      group, then on the pid itself, then on the cleanup pid (if any).
 *      sigActEnd() is called only when none of them can be signalled.
 *      In either case nothing more is done for this job in this pass.
 *   3. Otherwise a finished, post-finished or pending job is passed to
 *      job_finish().
 */
void
checkFinish (void)
{
    struct jobCard *cur;
    struct jobCard *following;

    for (cur = jobQueHead->forw; cur != jobQueHead; cur = following) {
        /* Save the successor before touching the card; presumably the
         * calls below may unlink it from the queue -- TODO confirm. */
        following = cur->forw;

        if (!(IS_FINISH(cur->jobSpecs.jStatus)
              || IS_POST_FINISH(cur->jobSpecs.jStatus))) {
            int vanished;

            /* Signal 0 presumably only probes for existence -- TODO confirm. */
            vanished = (jobsig(cur, 0, FALSE) < 0);

            /* The forced-kill deadline is consulted only if the probe
             * succeeded, exactly as a short-circuit "||" would. */
            if (!vanished && (cur->jobSpecs.jAttrib & JOB_FORCE_KILL)) {
                vanished = (cur->jobSpecs.termTime
                            < time(0) - MAX(6, jobTerminateInterval * 3));
            }

            if (vanished) {
                jobGone(cur);
            }
        }

        if (cur->jobSpecs.actPid) {
            int actionAlive;

            /* First successful SIGCONT wins; later targets are not tried. */
            actionAlive = (killpg(cur->jobSpecs.actPid, SIGCONT) == 0)
                || (kill(cur->jobSpecs.actPid, SIGCONT) == 0)
                || (cur->cleanupPid > 0
                    && kill(cur->cleanupPid, SIGCONT) == 0);

            if (!actionAlive) {
                sigActEnd(cur);
            }
            continue;
        }

        if (IS_FINISH(cur->jobSpecs.jStatus)
            || IS_POST_FINISH(cur->jobSpecs.jStatus)
            || (cur->jobSpecs.jStatus & JOB_STAT_PEND)) {
            job_finish(cur, TRUE);
        }
    }
}
Esempio n. 2
0
/*
 * bpeek entry point.
 *
 * Command line (as parsed below):
 *   -q queue | -m host | -J jobName   select a job; at most one of these
 *                                     may be given, and none of them may be
 *                                     combined with an explicit job id
 *   -f                                passed through to displayOutput()
 *   -V                                print the version string and exit 0
 *   -h                                print usage
 *   jobId                             optional positional job id
 *
 * With no job id, the LAST_JOB option is used for the lookup.  The job
 * must be started, not finished, and not a purely interactive job
 * (interactive without an output or error file); otherwise a message is
 * printed to stderr and the process exits with -1.
 */
int
main (int argc, char **argv, char **environ)
{
  char *queue = NULL, *host = NULL, *jobName = NULL, *user = NULL;
  LS_LONG_INT jobId;
  int options;
  struct jobInfoEnt *jInfo;
  char *outFile;
  char fflag = FALSE;
  int cc;

  /* The return value was stored but never examined; discard it openly. */
  (void) _i18n_init (I18N_CAT_MIN);

  if (lsb_init (argv[0]) < 0)
    {
      lsb_perror ("lsb_init");
      exit (-1);
    }

  while ((cc = getopt (argc, argv, "Vhfq:m:J:")) != EOF)
    {
      switch (cc)
        {
        case 'q':
          /* -q, -m and -J are mutually exclusive. */
          if (queue || host || jobName)
            oneOf (argv[0]);
          queue = optarg;
          break;
        case 'm':
          if (queue || host || jobName)
            oneOf (argv[0]);
          host = optarg;
          break;
        case 'J':
          if (queue || host || jobName)
            oneOf (argv[0]);
          jobName = optarg;
          break;
        case 'V':
          fputs (_LS_VERSION_, stderr);
          exit (0);
        case 'f':
          fflag = TRUE;
          break;
        case 'h':
        default:
          usage (argv[0]);
        }
    }

  jobId = 0;
  options = LAST_JOB;
  if (argc >= optind + 1)
    {
      /* A positional job id cannot be combined with -q/-m/-J, and at
       * most one positional argument is accepted. */
      if (queue || host || jobName)
        {
          oneOf (argv[0]);
        }
      else if ((argc > 2 && !fflag) || (argc > 3 && fflag))
        usage (argv[0]);

      if (getOneJobId (argv[optind], &jobId, 0))
        {
          usage (argv[0]);
        }

      options = 0;
    }

  /* First look the job up with a NULL user; if that fails and the job
   * was named explicitly (id or name), retry across ALL_USERS. */
  if (lsb_openjobinfo (jobId, jobName, NULL, queue, host, options) < 0
      || (jInfo = lsb_readjobinfo (NULL)) == NULL)
    {
      if (jobId != 0 || jobName != NULL)
        {
          user = ALL_USERS;
          if (lsb_openjobinfo (jobId, jobName, user, queue, host, options) < 0
              || (jInfo = lsb_readjobinfo (NULL)) == NULL)
            {
              jobInfoErr (jobId, jobName, NULL, queue, host, options);
              exit (-1);
            }
        }
      else
        {
          jobInfoErr (jobId, jobName, NULL, queue, host, options);
          exit (-1);
        }
    }
  lsb_closejobinfo ();

  /* The record returned must be exactly the job that was asked for;
   * a mismatch is reported as LSBE_JOB_ARRAY. */
  if (jobId && jInfo->jobId != jobId)
    {
      lsberrno = LSBE_JOB_ARRAY;
      lsb_perror ("bpeek");
      exit (-1);
    }

  if ((jInfo->submit.options & SUB_INTERACTIVE) &&
      !(jInfo->submit.options & (SUB_OUT_FILE | SUB_ERR_FILE)))
    {
      fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2456, "Job <%s> : Cannot bpeek an interactive job.\n"),	/* catgets  2456 */
               lsb_jobid2str (jInfo->jobId));
      exit (-1);
    }

  if (IS_PEND (jInfo->status) || jInfo->execUsername[0] == '\0')
    {
      fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2454, "Job <%s> : Not yet started.\n"),	/* catgets  2454 */
               lsb_jobid2str (jInfo->jobId));

      exit (-1);
    }
  if (IS_FINISH (jInfo->status))
    {
      fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2455, "Job <%s> : Already finished.\n"),	/* catgets  2455  */
               lsb_jobid2str (jInfo->jobId));
      exit (-1);
    }

  if ((outFile = lsb_peekjob (jInfo->jobId)) == NULL)
    {
      char msg[50];

      /* I18N_Job is a localised string of unknown length: bound the
       * write so a long translation truncates instead of overflowing. */
      snprintf (msg, sizeof (msg), "%s <%s>", I18N_Job,
                lsb_jobid2str (jInfo->jobId));
      lsb_perror (msg);
      exit (-1);
    }
  displayOutput (outFile, jInfo, fflag, environ);
  _i18n_end (ls_catd);
  exit (0);

}
Esempio n. 3
0
/*
 * prtJobRusage - print the run-time resource usage recorded for a job.
 *
 * Nothing is printed for a finished job.  For a pending job only the
 * accumulated CPU time (utime + stime) is reported, if non-zero.  For any
 * other job the report covers, in order: CPU time, MEM, SWAP and, per
 * process group, the PGID followed by the PIDs belonging to it.
 *
 * Two layouts are produced depending on the global uf_format flag:
 *   - uf_format set:   fields are written with printf() on a single line;
 *   - uf_format clear: text is formatted into a local buffer and emitted
 *                      through prtLine(), with PID lists wrapped once the
 *                      running column count reaches 80.
 * prtLineWUF() is used for text emitted in both layouts (presumably it
 * prints regardless of uf_format -- TODO confirm).
 *
 * job: job record whose status and runRusage fields are read.
 */
void
prtJobRusage(struct jobInfoEnt *job)
{
    char prline[MAXLINELEN];

    int i, j;
    /* Must start defined: in uf_format mode it is incremented below
     * without ever being assigned from strlen() first. */
    int linepos = 0;

    if (IS_FINISH(job->status))
        return;

    if (IS_PEND(job->status)) {
        if (job->runRusage.utime || job->runRusage.stime) {
            if (uf_format)
                printf ("%s: Resource usage collected. The CPU time used is %d seconds.",
                    _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->jRusageUpdateTime),
                    job->runRusage.utime + job->runRusage.stime);
            else {
                snprintf(prline, sizeof(prline), "%s: %s.\n",
                    _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T,
                                &job->jRusageUpdateTime),
                    I18N(644, "Resource usage collected")); /* catgets 644  */
                prtLine(prline);
                snprintf(prline, sizeof(prline), (_i18n_msg_get(ls_catd,NL_SETN,645, "                     The CPU time used is %d seconds.\n")),  /* catgets  645  */
                             job->runRusage.utime + job->runRusage.stime);
                prtLine(prline);
            }
        }
        return;
    }

    /* Header line: printed only if at least one usage figure is present. */
    if (job->runRusage.utime > 0 || job->runRusage.stime > 0
        || job->runRusage.mem > 0 || job->runRusage.swap > 0
        || job->runRusage.npgids > 0 || job->runRusage.npids > 0) {
        if (uf_format)
            printf ("%s: Resource usage collected.",
                 _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->jRusageUpdateTime));
        else {
            snprintf(prline, sizeof(prline), "%s: %s.\n",
                 _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T,
                              &job->jRusageUpdateTime),
                 I18N(646, "Resource usage collected")); /* catgets  646  */
            prtLine(prline);
        }
    } else
        return;

    if (job->runRusage.utime > 0 || job->runRusage.stime > 0) {
        if (uf_format)
            printf (" The CPU time used is %d seconds.",
                job->runRusage.utime + job->runRusage.stime);
        else {
            snprintf(prline, sizeof(prline), (_i18n_msg_get(ls_catd,NL_SETN,647, "                     The CPU time used is %d seconds.\n")), /* catgets  647  */
                             job->runRusage.utime + job->runRusage.stime);
            prtLine(prline);
        }
    }

    /* MEM: values above 1024 are shown divided by 1024 as "Mbytes".
     * In the non-uf layout the line is left open so SWAP can follow. */
    if (job->runRusage.mem > 0) {
        if (uf_format) {
            if (job->runRusage.mem > 1024)
                printf(" MEM: %d Mbytes;", job->runRusage.mem/1024);
            else
                printf(" MEM: %d Kbytes;", job->runRusage.mem);
        }
        else {
            if (job->runRusage.mem > 1024)
                snprintf(prline, sizeof(prline), (_i18n_msg_get(ls_catd,NL_SETN,648, "                     MEM: %d Mbytes")), job->runRusage.mem/1024); /* catgets  648  */
            else
                snprintf(prline, sizeof(prline), (_i18n_msg_get(ls_catd,NL_SETN,649, "                     MEM: %d Kbytes")), job->runRusage.mem); /* catgets  649  */
            prtLine(prline);
        }
    }

    if (job->runRusage.swap > 0) {
        char *space;

        /* Continue the MEM line if there is one, else start a new
         * indented line. */
        if (job->runRusage.mem > 0)
            space = ";  ";
        else
            space = "                     ";

        if (uf_format) {
            if (job->runRusage.swap > 1024)
                printf(" SWAP: %d Mbytes;", job->runRusage.swap/1024);
            else
                printf(" SWAP: %d Kbytes;", job->runRusage.swap);
        }
        else {
            if (job->runRusage.swap > 1024)
                snprintf(prline, sizeof(prline), (_i18n_msg_get(ls_catd,NL_SETN,650, "%sSWAP: %d Mbytes\n")), space, /* catgets  650  */
                    job->runRusage.swap/1024);
            else
                snprintf(prline, sizeof(prline), (_i18n_msg_get(ls_catd,NL_SETN,651, "%sSWAP: %d Kbytes\n")), space, job->runRusage.swap); /* catgets  651  */
            prtLine(prline);
        }
    } else {
        /* No SWAP figure: terminate the open MEM line. */
        if (job->runRusage.mem > 0 && !uf_format) {
            snprintf(prline, sizeof(prline), "\n");
            prtLine(prline);
        }
    }

    if (job->runRusage.npgids <= 0)
        return;

    for (i=0; i < job->runRusage.npgids; i++) {
        if (uf_format)
            printf (" PGID: %d; ", job->runRusage.pgid[i]);
        else {
            snprintf(prline, sizeof(prline), (_i18n_msg_get(ls_catd,NL_SETN,652, "                     PGID: %d;  ")), job->runRusage.pgid[i]); /* catgets  652  */
            linepos = strlen(prline);
            prtLine(prline);
        }
        /* The catalog text is data here, not a format: print it via "%s"
         * so a translated string containing '%' cannot be misread. */
        snprintf(prline, sizeof(prline), "%s", (_i18n_msg_get(ls_catd,NL_SETN,653, "PIDs: "))); /* catgets  653  */
        linepos += 6;
        prtLineWUF(prline);
        for (j=0; j < job->runRusage.npids; j++) {
            if (job->runRusage.pgid[i] == job->runRusage.pidInfo[j].pgid) {
                snprintf(prline, sizeof(prline), "%d ", job->runRusage.pidInfo[j].pid);
                if (uf_format)
                  printf ("%d%s", job->runRusage.pidInfo[j].pid, j==job->runRusage.npids-1?"":" ");
                else {
                  linepos += strlen(prline);

                  /* Wrap onto a fresh indented line at column 80; the
                   * new line starts at the 21-column indent. */
                  if (linepos >= 80) {
                      char *newline ="\n                     ";
                      prtLine(newline);
                      prtLine(prline);
                      linepos = strlen(prline) + 21;
                  }
                  else
                      prtLine(prline);
                }
            }
        }
        if (uf_format)
            printf(";");
        else {
            snprintf(prline, sizeof(prline), "\n");
            prtLine(prline);
        }
    }
    snprintf(prline, sizeof(prline), "\n");
    prtLineWUF(prline);

    if (uf_format && job->runRusage.mem > 0) {
        printf ("\n MEMORY USAGE:\n");
        printf (" MAX MEM: N/A MBytes;  AVG MEM: N/A MBytes\n");
    }
}
Esempio n. 4
0
/*
 * status_job - report a job's status (or resource usage) to mbatchd.
 *
 * reqType:   request opcode placed in the message header; when it is
 *            BATCH_RUSAGE_JOB no reply is waited for.
 * jp:        job card whose specs, rusage and action state are reported.
 * newStatus: status value to report.
 * err:       sbatchd reply code carried in the request.
 *
 * Returns 0 when the report was accepted or deliberately skipped, and -1
 * when it could not be delivered or mbatchd answered with a code that is
 * treated as a failure below.
 *
 * Side effects visible here: updates jp->userJobSucc, may raise the
 * job's runRusage figures to the recorded maxima when reporting a FINISH
 * status, maintains the static sequence number, failcnt, statusChan and
 * the remembered lastHost, and reacts to the reply code (lock, stop,
 * forgotten job).
 */
int
status_job (mbdReqType reqType,
            struct jobCard *jp, int newStatus, sbdReplyType err)
{
  static char fname[] = "status_job()";
  static int seq = 1;
  static char lastHost[MAXHOSTNAMELEN];
  int reply;
  char *request_buf;
  char *reply_buf = NULL;
  XDR xdrs;
  struct LSFHeader hdr;
  int cc;
  struct statusReq statusReq;
  int flags;
  int i;
  int len;
  struct lsfAuth *auth = NULL;

  if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL))
    ls_syslog (LOG_DEBUG, "%s: Entering ... regType %d jobId %s",
               fname, reqType, lsb_jobid2str (jp->jobSpecs.jobId));

  if (newStatus == JOB_STAT_EXIT)
    {
      jp->userJobSucc = FALSE;
    }

  if (MASK_STATUS (newStatus) == JOB_STAT_DONE)
    {
      jp->userJobSucc = TRUE;
    }

  /* A POST_FINISH status is only reported for jobs recorded as
   * successful; otherwise it is silently dropped. */
  if (IS_POST_FINISH (newStatus))
    {
      if (jp->userJobSucc != TRUE)
        {
          return 0;
        }
    }

  if (masterHost == NULL)
    return -1;

  /* A negative notReported marks a job that must not be reported any
   * more (it is set that way on LSBE_NO_JOB below). */
  if (jp->notReported < 0)
    {
      jp->notReported = -INFINIT_INT;
      return (0);
    }

  statusReq.jobId = jp->jobSpecs.jobId;
  statusReq.actPid = jp->jobSpecs.actPid;
  statusReq.jobPid = jp->jobSpecs.jobPid;
  statusReq.jobPGid = jp->jobSpecs.jobPGid;
  statusReq.newStatus = newStatus;
  statusReq.reason = jp->jobSpecs.reasons;
  statusReq.subreasons = jp->jobSpecs.subreasons;
  statusReq.sbdReply = err;
  statusReq.lsfRusage = jp->lsfRusage;
  statusReq.execUid = jp->jobSpecs.execUid;
  statusReq.numExecHosts = 0;
  statusReq.execHosts = NULL;
  statusReq.exitStatus = jp->w_status;
  statusReq.execCwd = jp->jobSpecs.execCwd;
  statusReq.execHome = jp->jobSpecs.execHome;
  statusReq.execUsername = jp->execUsername;
  statusReq.queuePostCmd = "";
  statusReq.queuePreCmd = "";
  statusReq.msgId = jp->delieveredMsgId;

  /* On a final status, report the peak figures rather than the last
   * sample. */
  if (IS_FINISH (newStatus))
    {
      if (jp->maxRusage.mem > jp->runRusage.mem)
        jp->runRusage.mem = jp->maxRusage.mem;
      if (jp->maxRusage.swap > jp->runRusage.swap)
        jp->runRusage.swap = jp->maxRusage.swap;
      if (jp->maxRusage.stime > jp->runRusage.stime)
        jp->runRusage.stime = jp->maxRusage.stime;
      if (jp->maxRusage.utime > jp->runRusage.utime)
        jp->runRusage.utime = jp->maxRusage.utime;
    }
  statusReq.runRusage.mem = jp->runRusage.mem;
  statusReq.runRusage.swap = jp->runRusage.swap;
  statusReq.runRusage.utime = jp->runRusage.utime;
  statusReq.runRusage.stime = jp->runRusage.stime;
  statusReq.runRusage.npids = jp->runRusage.npids;
  statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
  statusReq.runRusage.npgids = jp->runRusage.npgids;
  statusReq.runRusage.pgid = jp->runRusage.pgid;
  statusReq.actStatus = jp->actStatus;
  statusReq.sigValue = jp->jobSpecs.actValue;

  /* Sequence numbers cycle through 1 .. MAX_SEQ_NUM-1. */
  statusReq.seq = seq;
  seq++;
  if (seq >= MAX_SEQ_NUM)
    seq = 1;

  /* Size the encode buffer: fixed slack plus the struct, the three
   * strings and one slot per pid / pgid entry. */
  len = 1024 + ALIGNWORD_ (sizeof (struct statusReq));

  len += ALIGNWORD_ (strlen (statusReq.execHome)) + 4 +
    ALIGNWORD_ (strlen (statusReq.execCwd)) + 4 +
    ALIGNWORD_ (strlen (statusReq.execUsername)) + 4;

  for (i = 0; i < statusReq.runRusage.npids; i++)
    len += ALIGNWORD_ (sizeof (struct pidInfo)) + 4;

  for (i = 0; i < statusReq.runRusage.npgids; i++)
    len += ALIGNWORD_ (sizeof (int)) + 4;

  if (logclass & (LC_TRACE | LC_COMM))
    ls_syslog (LOG_DEBUG, "%s: The length of the job message is: <%d>", fname,
               len);

  if ((request_buf = malloc (len)) == NULL)
    {
      ls_syslog (LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
      return (-1);
    }

  xdrmem_create (&xdrs, request_buf, len, XDR_ENCODE);
  initLSFHeader_ (&hdr);
  hdr.opCode = reqType;

  if (!xdr_encodeMsg (&xdrs, (char *) &statusReq, &hdr, xdr_statusReq, 0,
                      auth))
    {
      ls_syslog (LOG_ERR, I18N_JOB_FAIL_S_M,
                 fname, lsb_jobid2str (jp->jobSpecs.jobId), "xdr_statusReq");
      lsb_merr2 (I18N_FUNC_FAIL, fname, "xdr_statusReq");
      xdr_destroy (&xdrs);
      FREEUP (request_buf);
      relife ();
      /* NOTE(review): relife() presumably restarts the daemon and does
       * not come back.  Should it ever return, the buffer and XDR stream
       * are already gone, so stop here instead of handing them to
       * call_server(). */
      return (-1);
    }

  flags = CALL_SERVER_NO_HANDSHAKE;
  if (statusChan >= 0)
    flags |= CALL_SERVER_USE_SOCKET;

  if (reqType == BATCH_RUSAGE_JOB)
    flags |= CALL_SERVER_NO_WAIT_REPLY;

  if (logclass & LC_COMM)
    ls_syslog (LOG_DEBUG1, "%s: before call_server statusChan=%d flags=%d",
               fname, statusChan, flags);

  cc = call_server (masterHost,
                    mbd_port,
                    request_buf,
                    XDR_GETPOS (&xdrs),
                    &reply_buf,
                    &hdr,
                    connTimeout, readTimeout, &statusChan, NULL, NULL, flags);
  if (cc < 0)
    {
      statusChan = -1;
      /* Log the failure once per master host, not on every retry. */
      if (!equalHost_ (masterHost, lastHost))
        {
          if (errno != EINTR)
            ls_syslog (LOG_DEBUG,
                       "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s",
                       fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId),
                       lsb_sysmsg ());
          /* Bounded copy: lastHost is a fixed MAXHOSTNAMELEN buffer. */
          strncpy (lastHost, masterHost, sizeof (lastHost) - 1);
          lastHost[sizeof (lastHost) - 1] = '\0';
        }
      xdr_destroy (&xdrs);
      FREEUP (request_buf);
      failcnt++;
      return (-1);
    }

  failcnt = 0;
  lastHost[0] = '\0';
  xdr_destroy (&xdrs);
  FREEUP (request_buf);

  /* A positive return means a reply body was handed back. */
  if (cc)
    free (reply_buf);

  if (flags & CALL_SERVER_NO_WAIT_REPLY)
    {
      /* No reply is expected.  Poll the channel with a zero timeout:
       * a result of 0 is taken as "connection still fine"; anything
       * else is treated as a broken connection. */
      struct timeval timeval;

      timeval.tv_sec = 0;
      timeval.tv_usec = 0;

      if (rd_select_ (chanSock_ (statusChan), &timeval) == 0)
        {
          jp->needReportRU = FALSE;
          jp->lastStatusMbdTime = now;
          return 0;
        }

      CLOSECD (statusChan);

      if (logclass & LC_COMM)
        ls_syslog (LOG_DEBUG1,
                   "%s: Job <%s> rd_select() failed, assume connection broken",
                   fname, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    }

  /* call_server() left mbatchd's reply code in the header. */
  reply = hdr.opCode;
  switch (reply)
    {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
      jp->needReportRU = FALSE;
      jp->lastStatusMbdTime = now;
      if (reply == LSBE_LOCK_JOB)
        {
          if (IS_SUSP (jp->jobSpecs.jStatus))
            jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
          else
            ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5204, "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."),	/* catgets 5204 */
                       fname,
                       lsb_jobid2str (jp->jobSpecs.jobId),
                       jp->jobSpecs.jStatus);
        }
      return (0);
    case LSBE_NO_JOB:
      if (!IS_POST_FINISH (jp->jobSpecs.jStatus))
        {
          ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5205, "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."), fname, lsb_jobid2str (jp->jobSpecs.jobId), masterHost);	/* catgets 5205 */
        }

      /* Stop reporting this job from now on. */
      jp->notReported = -INFINIT_INT;
      return (0);
    case LSBE_STOP_JOB:
      if (jobsig (jp, SIGSTOP, TRUE) < 0)
        SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_EXIT);
      else
        {
          SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_USUSP);
          jp->jobSpecs.reasons |= SUSP_USER_STOP;
        }
      return (-1);
    case LSBE_SBATCHD:
      ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5206, "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"),	/* catgets 5206 */
                 fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    default:
      ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5207, "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"),	/* catgets 5207 */
                 fname,
                 reply, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
      return (-1);
    }
}
Esempio n. 5
0
/*
 * job_checking - periodic pass over all jobs known to this daemon.
 *
 * Steps, in order:
 *   1. checkFinish() is called to deal with jobs/actions that have ended.
 *   2. Every job that is neither finished nor pending is checked against
 *      its run-time limit, its termination deadline (termTime) and its
 *      run window, and the matching terminate / suspend / resume action
 *      is started.  At most one of these is acted on per job per pass.
 *   3. The local host's load is fetched; a private copy is passed to
 *      tryResume() and tryStop(), and tryChkpntMig() is called.
 *
 * The static last_check holds the time of the previous pass and is used
 * to advance each running job's runTime.
 */
void
job_checking (void)
{
    static char fname[] = "job_checking";
    struct jobCard *jobCard, *nextJob;
    struct hostLoad *myload, savedLoad;
    char *myhostnm;
    static time_t last_check;
    /* Never assigned anything but FALSE in this function, so the
     * tryStop() call guarded by it below always runs. */
    char preempted = FALSE;
    int i;

    /* First call: no previous pass to measure elapsed time from. */
    if (last_check == 0)
	last_check = now;
    if (jobcnt <= 0) {
        last_check = now;
        return;
    }

    checkFinish ();

    for (jobCard = jobQueHead->forw; (jobCard != jobQueHead);
         jobCard = nextJob) {

	/* Successor is captured before the current card is processed. */
	nextJob = jobCard->forw;
        if (IS_FINISH(jobCard->jobSpecs.jStatus)
              || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND))
            continue;

	ruLimits(jobCard);

	/* Charge the time since the previous pass to jobs for which
	 * IS_RUN_JOB_CMD holds. */
	if (IS_RUN_JOB_CMD(jobCard->jobSpecs.jStatus)) {

	    jobCard->runTime += (int) (now - last_check);
	}
	/* ---- Run-time limit exceeded ---- */
	if (jobCard->runTime >
	    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {
            /* No terminate action command configured: warn first with
             * SIGUSR2, then terminate once WARN_TIME more has elapsed. */
            if ((jobCard->jobSpecs.terminateActCmd == NULL)
                || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) {
	        if (jobCard->runTime >
		    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl
		    + WARN_TIME && jobCard->timeExpire) {

                    /* Already suspended for the run limit, or already
                     * marked JOB_STAT_KILL: nothing more to do. */
                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
		    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
			continue;
                    else {

                        ls_syslog(LOG_INFO, \
                                  "%s: warning period expired killing the job=%d",
			    fname, jobCard->jobSpecs.jobId);
                        jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
			jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
	        } else if (!jobCard->timeExpire) {
		    /* First time over the limit: send the warning signal
		     * and remember that it was sent. */
		    ls_syslog(LOG_INFO, I18N(5704,
                        "%s: sending warning signal to job=%d"), /* catgets 5704 */
			fname, jobCard->jobSpecs.jobId);
		    jobsig(jobCard, SIGUSR2, FALSE);
		    jobCard->timeExpire = TRUE;
	        }
            } else {
                /* A terminate action command is configured: start the
                 * run-limit action at once, with no warning period. */
                if (jobCard->runTime >
                    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
                    else {
                        jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
	    continue;
	}

        /* ---- Termination deadline passed ----
         * Same two-phase scheme as the run limit, keyed on termTime;
         * jobs flagged JOB_FORCE_KILL are not handled here. */
        if (jobCard->jobSpecs.termTime && now > jobCard->jobSpecs.termTime

             && !(jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)) {
            if ((jobCard->jobSpecs.terminateActCmd == NULL)
                 || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) {
                if (now > jobCard->jobSpecs.termTime + WARN_TIME
                                                   && jobCard->timeExpire) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
		    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
			continue;
                    else {

                        jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
			jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
                } else
		    if (!jobCard->timeExpire) {
		        jobsig(jobCard, SIGUSR2, FALSE);
		        jobCard->timeExpire = TRUE;
		    }
            } else {
                if (now > jobCard->jobSpecs.termTime) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
                    else {
                        jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
            continue;
        }


        /* ---- Run window ----
         * When window_ok() fails and the job is not JOB_URGENT_NOSTOP,
         * suspend it: immediately if SUB_WINDOW_SIG is not set, otherwise
         * only once WARN_TIME has passed since windWarnTime.  In every
         * other case the window resume action is invoked. */
        if (! window_ok (jobCard)
	    && !(jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)) {
	    if (! (jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                || ((jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                          && now - jobCard->windWarnTime >= WARN_TIME)) {


	        jobSuspendAction(jobCard, SIG_SUSP_WINDOW, SUSP_QUEUE_WINDOW, 0);
		continue;

	    }
	} else {

		jobResumeAction(jobCard, SIG_RESUME_WINDOW, SUSP_QUEUE_WINDOW);
                continue;
	}
    }


    /* ---- Load-based scheduling for this host ---- */
    if ((myhostnm = ls_getmyhostname()) == NULL) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_MM, fname, "ls_getmyhostname");
        die(SLAVE_FATAL);
    }
    myload = ls_loadofhosts (NULL, 0, EXACT|EFFECTIVE, 0, &myhostnm, 1);
    if (myload == NULL) {
        /* Log only on the first failure, i.e. while NO_LIM is not yet
         * recorded in myStatus. */
        if (myStatus != NO_LIM)

	    ls_syslog(LOG_INFO, I18N_FUNC_FAIL_MM, fname, "ls_loadofhosts");
	if (lserrno == LSE_LIM_BADHOST)
	    relife();
	if (lserrno == LSE_BAD_XDR)
	    relife();
	if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) {
	    myStatus |= NO_LIM;


            tryChkpntMig();
        }
        last_check = now;
	return;
    } else
	myStatus = 0;



    /* Take a private copy of the load record, including its li[] and
     * status[] arrays, and hand that copy to tryResume()/tryStop(). */
    memcpy ((char *)&savedLoad, (char *)myload, sizeof (struct hostLoad));
    savedLoad.li = (float *) my_malloc (allLsInfo->numIndx * sizeof (float),
				   "job_checking");
    savedLoad.status = (int *) my_malloc
       ((1 + GET_INTNUM(allLsInfo->numIndx)) * sizeof (int), "job_checking");
    for (i = 0; i < allLsInfo->numIndx; i++)
        savedLoad.li[i] = myload->li[i];
    for (i = 0; i < 1 + GET_INTNUM(allLsInfo->numIndx); i++)
        savedLoad.status[i] = myload->status[i];
    tryResume (&savedLoad);

    if (!preempted)
        tryStop (myhostnm, &savedLoad);

    tryChkpntMig();


    FREEUP(savedLoad.li);
    FREEUP(savedLoad.status);
    last_check = now;
    return;

}
Esempio n. 6
0
/*
 * do_sigjob - handle a "signal job" request and send back a jobReply.
 *
 * xdrs:   XDR stream the jobSig request is decoded from.
 * chfd:   channel the encoded reply is written to.
 * reqHdr: header of the incoming request, passed to the decoder.
 *
 * The requested signal is decoded, the job is looked up by id on the job
 * queue, and depending on the job's state the signal action is started
 * via jobSigStart(), a retry is requested (ERR_SIG_RETRY), or the request
 * is acknowledged without action.  The reply carries the job's pids,
 * status and reasons only when the outcome is ERR_NO_ERROR.
 *
 * Labels: "Reply" logs the new status before replying; "Reply1" replies
 * without logging.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_sigjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSig      jobSig;
    sbdReplyType       reply;
    struct jobReply    jobReply;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct jobCard     *jp = NULL;
    char               found = FALSE;
    int                cc;
    int                sigValue;
    int                savedActReasons = 0;
    int                savedActSubReasons = 0;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /* jobSig is read on the reply path even when decoding fails (job id
     * in log messages, reasons test), so it must start out defined. */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");

        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;

    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    /* Locate the job card for the requested job id. */
    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }
    if (found == FALSE) {
        reply = ERR_NO_JOB;
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        /* Adopt the requested reasons, keeping the previous action
         * reasons so they can be restored if the request fails. */
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    /* Post-job processing has started, or the job is already finished:
     * acknowledge without signalling. */
    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    /* No process group recorded: mark the job as exited. */
    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Only terminating signals are acted on before the job has
         * started; anything else must be retried later. */
        if (isSigTerm(sigValue) == TRUE) {
            if ((cc = jobSigStart (jp, sigValue, jobSig.actFlags, jobSig.chkPeriod, NO_SIGLOG)) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;

            goto Reply;
        }

        reply = ERR_SIG_RETRY;

        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        /* An action is in progress or the job is migrating: on success
         * the MIG flag is cleared. */
        if ((cc = jobSigStart(jp,
                              sigValue,
                              jobSig.actFlags,
                              jobSig.chkPeriod,
                              NO_SIGLOG)) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if ((cc = jobSigStart(jp,
                          sigValue,
                          jobSig.actFlags,
                          jobSig.chkPeriod,
                          NO_SIGLOG)) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    sbdlog_newstatus(jp);

Reply1:

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR) {
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        /* Request failed: undo the action-reason change made above. */
        if (reply != ERR_NO_JOB)
            if  ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
                jp->actReasons = savedActReasons;
                jp->actSubReasons = savedActSubReasons;
            }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        /* jp is NULL on the bad-request and no-job paths, so the job id
         * is taken from the request rather than from the job card. */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSig.jobId), "xdr_jobReply");
        relife();
        /* NOTE(review): relife() presumably does not return.  If it
         * does, the buffer holds a partial message, so nothing is sent. */
    } else if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
                                         "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"), fname, XDR_GETPOS(&xdrs2), lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }
    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);

    return;
}