void checkFinish (void) { struct jobCard *jobCard, *nextJob; for (jobCard = jobQueHead->forw; (jobCard != jobQueHead); jobCard = nextJob) { nextJob = jobCard->forw; if (!(IS_FINISH(jobCard->jobSpecs.jStatus)) && !(IS_POST_FINISH(jobCard->jobSpecs.jStatus) ) ) { if ( (jobsig(jobCard, 0, FALSE) < 0) || ( (jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL) && (jobCard->jobSpecs.termTime < time(0)-MAX(6,jobTerminateInterval*3)) ) ) { jobGone (jobCard); } } if (jobCard->jobSpecs.actPid) { if (killpg(jobCard->jobSpecs.actPid, SIGCONT) == 0) continue; if (kill(jobCard->jobSpecs.actPid, SIGCONT) == 0) continue; if (jobCard->cleanupPid > 0 && kill(jobCard->cleanupPid, SIGCONT) == 0) continue; sigActEnd(jobCard); continue; } if (IS_FINISH(jobCard->jobSpecs.jStatus) || IS_POST_FINISH(jobCard->jobSpecs.jStatus) || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND)) { job_finish (jobCard, TRUE); } } }
int main (int argc, char **argv, char **environ) { char *queue = NULL, *host = NULL, *jobName = NULL, *user = NULL; LS_LONG_INT jobId; int options; struct jobInfoEnt *jInfo; char *outFile; char fflag = FALSE; int cc; int rc; rc = _i18n_init (I18N_CAT_MIN); if (lsb_init (argv[0]) < 0) { lsb_perror ("lsb_init"); exit (-1); } while ((cc = getopt (argc, argv, "Vhfq:m:J:")) != EOF) { switch (cc) { case 'q': if (queue || host || jobName) oneOf (argv[0]); queue = optarg; break; case 'm': if (queue || host || jobName) oneOf (argv[0]); host = optarg; break; case 'J': if (queue || host || jobName) oneOf (argv[0]); jobName = optarg; break; case 'V': fputs (_LS_VERSION_, stderr); exit (0); case 'f': fflag = TRUE; break; case 'h': default: usage (argv[0]); } } jobId = 0; options = LAST_JOB; if (argc >= optind + 1) { if (queue || host || jobName) { oneOf (argv[0]); } else if ((argc > 2 && !fflag) || (argc > 3 && fflag)) usage (argv[0]); if (getOneJobId (argv[optind], &jobId, 0)) { usage (argv[0]); } options = 0; } if (lsb_openjobinfo (jobId, jobName, NULL, queue, host, options) < 0 || (jInfo = lsb_readjobinfo (NULL)) == NULL) { if (jobId != 0 || jobName != NULL) { user = ALL_USERS; if (lsb_openjobinfo (jobId, jobName, user, queue, host, options) < 0 || (jInfo = lsb_readjobinfo (NULL)) == NULL) { jobInfoErr (jobId, jobName, NULL, queue, host, options); exit (-1); } } else { jobInfoErr (jobId, jobName, NULL, queue, host, options); exit (-1); } } lsb_closejobinfo (); if (jobId && jInfo->jobId != jobId) { lsberrno = LSBE_JOB_ARRAY; lsb_perror ("bpeek"); exit (-1); } if ((jInfo->submit.options & SUB_INTERACTIVE) && !(jInfo->submit.options & (SUB_OUT_FILE | SUB_ERR_FILE))) { fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2456, "Job <%s> : Cannot bpeek an interactive job.\n"), /* catgets 2456 */ lsb_jobid2str (jInfo->jobId)); exit (-1); } if (IS_PEND (jInfo->status) || jInfo->execUsername[0] == '\0') { fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2454, "Job <%s> : Not yet 
started.\n"), /* catgets 2454 */ lsb_jobid2str (jInfo->jobId)); exit (-1); } if (IS_FINISH (jInfo->status)) { fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2455, "Job <%s> : Already finished.\n"), /* catgets 2455 */ lsb_jobid2str (jInfo->jobId)); exit (-1); } if ((outFile = lsb_peekjob (jInfo->jobId)) == NULL) { char msg[50]; sprintf (msg, "%s <%s>", I18N_Job, lsb_jobid2str (jInfo->jobId)); lsb_perror (msg); exit (-1); } displayOutput (outFile, jInfo, fflag, environ); _i18n_end (ls_catd); exit (0); }
void prtJobRusage(struct jobInfoEnt *job) { char prline[MAXLINELEN]; int i, j; int linepos; if (IS_FINISH(job->status)) return; if (IS_PEND(job->status)) { if (job->runRusage.utime || job->runRusage.stime) { if (uf_format) printf ("%s: Resource usage collected. The CPU time used is %d seconds.", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->jRusageUpdateTime), job->runRusage.utime + job->runRusage.stime); else { sprintf(prline, "%s: %s.\n", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->jRusageUpdateTime), I18N(644, "Resource usage collected")); /* catgets 644 */ prtLine(prline); sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,645, " The CPU time used is %d seconds.\n")), /* catgets 645 */ job->runRusage.utime + job->runRusage.stime); prtLine(prline); } } return; }; if (job->runRusage.utime > 0 || job->runRusage.stime > 0 || job->runRusage.mem > 0 || job->runRusage.swap > 0 || job->runRusage.npgids > 0 || job->runRusage.npids > 0) { if (uf_format) printf ("%s: Resource usage collected.", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->jRusageUpdateTime)); else { sprintf(prline, "%s: %s.\n", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->jRusageUpdateTime), I18N(646, "Resource usage collected")); /* catgets 646 */ prtLine(prline); } } else return; if (job->runRusage.utime > 0 || job->runRusage.stime > 0) { if (uf_format) printf (" The CPU time used is %d seconds.", job->runRusage.utime + job->runRusage.stime); else { sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,647, " The CPU time used is %d seconds.\n")), /* catgets 647 */ job->runRusage.utime + job->runRusage.stime); prtLine(prline); } } if (job->runRusage.mem > 0) { if (uf_format) { if (job->runRusage.mem > 1024) printf(" MEM: %d Mbytes;", job->runRusage.mem/1024); else printf(" MEM: %d Kbytes;", job->runRusage.mem); } else { if (job->runRusage.mem > 1024) sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,648, " MEM: %d Mbytes")), job->runRusage.mem/1024); /* catgets 648 */ else sprintf(prline, 
(_i18n_msg_get(ls_catd,NL_SETN,649, " MEM: %d Kbytes")), job->runRusage.mem); /* catgets 649 */ prtLine(prline); } } if (job->runRusage.swap > 0) { char *space; if (job->runRusage.mem > 0) space = "; "; else space = " "; if (uf_format) { if (job->runRusage.swap > 1024) printf(" SWAP: %d Mbytes;", job->runRusage.swap/1024); else printf(" SWAP: %d Kbytes;", job->runRusage.swap); } else { if (job->runRusage.swap > 1024) sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,650, "%sSWAP: %d Mbytes\n")), space, /* catgets 650 */ job->runRusage.swap/1024); else sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,651, "%sSWAP: %d Kbytes\n")), space, job->runRusage.swap); /* catgets 651 */ prtLine(prline); } } else { if (job->runRusage.mem > 0 && !uf_format) { sprintf(prline, "\n"); prtLine(prline); } } if (job->runRusage.npgids <= 0) return; for (i=0; i < job->runRusage.npgids; i++) { if (uf_format) printf (" PGID: %d; ", job->runRusage.pgid[i]); else { sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,652, " PGID: %d; ")), job->runRusage.pgid[i]); /* catgets 652 */ linepos = strlen(prline); prtLine(prline); } sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,653, "PIDs: "))); /* catgets 653 */ linepos += 6; prtLineWUF(prline); for (j=0; j < job->runRusage.npids; j++) { if (job->runRusage.pgid[i] == job->runRusage.pidInfo[j].pgid) { sprintf(prline, "%d ", job->runRusage.pidInfo[j].pid); if (uf_format) printf ("%d%s", job->runRusage.pidInfo[j].pid, j==job->runRusage.npids-1?"":" "); else { linepos += strlen(prline); if (linepos >= 80) { char *newline ="\n "; prtLine(newline); prtLine(prline); linepos = strlen(prline) + 21; } else prtLine(prline); } } } if (uf_format) printf(";"); else { sprintf(prline, "\n"); prtLine(prline); } } sprintf(prline, "\n"); prtLineWUF(prline); if (uf_format && job->runRusage.mem > 0) { printf ("\n MEMORY USAGE:\n"); printf (" MAX MEM: N/A MBytes; AVG MEM: N/A MBytes\n"); } }
/*
 * status_job - report a job's status to mbatchd on masterHost.
 *
 * reqType:   request opcode placed in the message header; with
 *            BATCH_RUSAGE_JOB the reply is not waited for.
 * jp:        job card being reported; several of its fields are updated
 *            depending on the reply.
 * newStatus: status value sent in the request.
 * err:       sbatchd reply code forwarded in the request.
 *
 * Returns 0 when the report was accepted or is intentionally skipped
 * (post-finish report of an unsuccessful job, job flagged as not to be
 * reported, LSBE_NO_ERROR / LSBE_LOCK_JOB / LSBE_NO_JOB replies).
 * Returns -1 when masterHost is unknown, memory or encoding fails, the
 * server cannot be reached, the connection looks broken, or the reply is
 * LSBE_STOP_JOB, LSBE_SBATCHD or an unknown code.
 */
int status_job (mbdReqType reqType, struct jobCard *jp, int newStatus, sbdReplyType err)
{
    static char fname[] = "status_job()";
    static int seq = 1;
    static char lastHost[MAXHOSTNAMELEN];
    int reply;
    char *request_buf;
    char *reply_buf = NULL;
    XDR xdrs;
    struct LSFHeader hdr;
    int cc;
    struct statusReq statusReq;
    int flags;
    int i;
    int len;
    struct lsfAuth *auth = NULL;

    if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL))
        ls_syslog (LOG_DEBUG, "%s: Entering ... regType %d jobId %s",
                   fname, reqType, lsb_jobid2str (jp->jobSpecs.jobId));

    /* Remember whether the user job itself succeeded. */
    if (newStatus == JOB_STAT_EXIT) {
        jp->userJobSucc = FALSE;
    }
    if (MASK_STATUS (newStatus) == JOB_STAT_DONE) {
        jp->userJobSucc = TRUE;
    }
    if (IS_POST_FINISH (newStatus)) {
        if (jp->userJobSucc != TRUE) {
            return 0;
        }
    }

    if (masterHost == NULL)
        return -1;

    if (jp->notReported < 0) {
        jp->notReported = -INFINIT_INT;
        return (0);
    }

    statusReq.jobId = jp->jobSpecs.jobId;
    statusReq.actPid = jp->jobSpecs.actPid;
    statusReq.jobPid = jp->jobSpecs.jobPid;
    statusReq.jobPGid = jp->jobSpecs.jobPGid;
    statusReq.newStatus = newStatus;
    statusReq.reason = jp->jobSpecs.reasons;
    statusReq.subreasons = jp->jobSpecs.subreasons;
    statusReq.sbdReply = err;
    statusReq.lsfRusage = jp->lsfRusage;
    statusReq.execUid = jp->jobSpecs.execUid;
    statusReq.numExecHosts = 0;
    statusReq.execHosts = NULL;
    statusReq.exitStatus = jp->w_status;
    statusReq.execCwd = jp->jobSpecs.execCwd;
    statusReq.execHome = jp->jobSpecs.execHome;
    statusReq.execUsername = jp->execUsername;
    statusReq.queuePostCmd = "";
    statusReq.queuePreCmd = "";
    statusReq.msgId = jp->delieveredMsgId;

    /* A finished job reports the peak of each usage figure. */
    if (IS_FINISH (newStatus)) {
        if (jp->maxRusage.mem > jp->runRusage.mem)
            jp->runRusage.mem = jp->maxRusage.mem;
        if (jp->maxRusage.swap > jp->runRusage.swap)
            jp->runRusage.swap = jp->maxRusage.swap;
        if (jp->maxRusage.stime > jp->runRusage.stime)
            jp->runRusage.stime = jp->maxRusage.stime;
        if (jp->maxRusage.utime > jp->runRusage.utime)
            jp->runRusage.utime = jp->maxRusage.utime;
    }

    statusReq.runRusage.mem = jp->runRusage.mem;
    statusReq.runRusage.swap = jp->runRusage.swap;
    statusReq.runRusage.utime = jp->runRusage.utime;
    statusReq.runRusage.stime = jp->runRusage.stime;
    statusReq.runRusage.npids = jp->runRusage.npids;
    statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
    statusReq.runRusage.npgids = jp->runRusage.npgids;
    statusReq.runRusage.pgid = jp->runRusage.pgid;
    statusReq.actStatus = jp->actStatus;
    statusReq.sigValue = jp->jobSpecs.actValue;

    /* Sequence numbers cycle through 1 .. MAX_SEQ_NUM-1. */
    statusReq.seq = seq;
    seq++;
    if (seq >= MAX_SEQ_NUM)
        seq = 1;

    /* Size the encode buffer: fixed part, three strings, pid/pgid arrays. */
    len = 1024 + ALIGNWORD_ (sizeof (struct statusReq));
    len += ALIGNWORD_ (strlen (statusReq.execHome)) + 4
         + ALIGNWORD_ (strlen (statusReq.execCwd)) + 4
         + ALIGNWORD_ (strlen (statusReq.execUsername)) + 4;
    for (i = 0; i < statusReq.runRusage.npids; i++)
        len += ALIGNWORD_ (sizeof (struct pidInfo)) + 4;
    for (i = 0; i < statusReq.runRusage.npgids; i++)
        len += ALIGNWORD_ (sizeof (int)) + 4;

    if (logclass & (LC_TRACE | LC_COMM))
        ls_syslog (LOG_DEBUG, "%s: The length of the job message is: <%d>",
                   fname, len);

    if ((request_buf = malloc (len)) == NULL) {
        ls_syslog (LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
        return (-1);
    }

    xdrmem_create (&xdrs, request_buf, len, XDR_ENCODE);
    initLSFHeader_ (&hdr);
    hdr.opCode = reqType;

    if (!xdr_encodeMsg (&xdrs, (char *) &statusReq, &hdr, xdr_statusReq, 0, auth)) {
        ls_syslog (LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                   lsb_jobid2str (jp->jobSpecs.jobId), "xdr_statusReq");
        lsb_merr2 (I18N_FUNC_FAIL, fname, "xdr_statusReq");
        xdr_destroy (&xdrs);
        FREEUP (request_buf);
        relife ();
        /*
         * The stream and the buffer are already released. Should relife()
         * ever return, stop here instead of handing them to call_server().
         */
        return (-1);
    }

    flags = CALL_SERVER_NO_HANDSHAKE;
    if (statusChan >= 0)
        flags |= CALL_SERVER_USE_SOCKET;
    if (reqType == BATCH_RUSAGE_JOB)
        flags |= CALL_SERVER_NO_WAIT_REPLY;

    if (logclass & LC_COMM)
        ls_syslog (LOG_DEBUG1, "%s: before call_server statusChan=%d flags=%d",
                   fname, statusChan, flags);

    cc = call_server (masterHost, mbd_port, request_buf, XDR_GETPOS (&xdrs),
                      &reply_buf, &hdr, connTimeout, readTimeout,
                      &statusChan, NULL, NULL, flags);
    if (cc < 0) {
        statusChan = -1;
        /* Log an unreachable master only once per host. */
        if (!equalHost_ (masterHost, lastHost)) {
            if (errno != EINTR)
                ls_syslog (LOG_DEBUG, "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s",
                           fname, masterHost,
                           lsb_jobid2str (jp->jobSpecs.jobId), lsb_sysmsg ());
            strcpy (lastHost, masterHost);
        }
        xdr_destroy (&xdrs);
        FREEUP (request_buf);
        failcnt++;
        return (-1);
    }

    failcnt = 0;
    lastHost[0] = '\0';
    xdr_destroy (&xdrs);
    FREEUP (request_buf);

    /* Only the header's opCode is consulted below; drop any reply body. */
    if (cc)
        free (reply_buf);

    if (flags & CALL_SERVER_NO_WAIT_REPLY) {
        struct timeval timeval;

        timeval.tv_sec = 0;
        timeval.tv_usec = 0;
        if (rd_select_ (chanSock_ (statusChan), &timeval) == 0) {
            jp->needReportRU = FALSE;
            jp->lastStatusMbdTime = now;
            return 0;
        }
        CLOSECD (statusChan);
        if (logclass & LC_COMM)
            ls_syslog (LOG_DEBUG1, "%s: Job <%s> rd_select() failed, assume connection broken",
                       fname, lsb_jobid2str (jp->jobSpecs.jobId));
        return (-1);
    }

    reply = hdr.opCode;
    switch (reply) {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
        jp->needReportRU = FALSE;
        jp->lastStatusMbdTime = now;
        if (reply == LSBE_LOCK_JOB) {
            if (IS_SUSP (jp->jobSpecs.jStatus))
                jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
            else
                ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5204, "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."),   /* catgets 5204 */
                           fname, lsb_jobid2str (jp->jobSpecs.jobId),
                           jp->jobSpecs.jStatus);
        }
        return (0);

    case LSBE_NO_JOB:
        if (!IS_POST_FINISH (jp->jobSpecs.jStatus)) {
            ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5205, "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."),
                       fname, lsb_jobid2str (jp->jobSpecs.jobId), masterHost);   /* catgets 5205 */
        }
        jp->notReported = -INFINIT_INT;
        return (0);

    case LSBE_STOP_JOB:
        if (jobsig (jp, SIGSTOP, TRUE) < 0)
            SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_EXIT);
        else {
            SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_USUSP);
            jp->jobSpecs.reasons |= SUSP_USER_STOP;
        }
        return (-1);

    case LSBE_SBATCHD:
        ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5206, "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"),   /* catgets 5206 */
                   fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
        return (-1);

    default:
        ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5207, "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"),   /* catgets 5207 */
                   fname, reply, masterHost,
                   lsb_jobid2str (jp->jobSpecs.jobId));
        return (-1);
    }
}
void job_checking (void) { static char fname[] = "job_checking"; struct jobCard *jobCard, *nextJob; struct hostLoad *myload, savedLoad; char *myhostnm; static time_t last_check; char preempted = FALSE; int i; if (last_check == 0) last_check = now; if (jobcnt <= 0) { last_check = now; return; } checkFinish (); for (jobCard = jobQueHead->forw; (jobCard != jobQueHead); jobCard = nextJob) { nextJob = jobCard->forw; if (IS_FINISH(jobCard->jobSpecs.jStatus) || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND)) continue; ruLimits(jobCard); if (IS_RUN_JOB_CMD(jobCard->jobSpecs.jStatus)) { jobCard->runTime += (int) (now - last_check); } if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) { if ((jobCard->jobSpecs.terminateActCmd == NULL) || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) { if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl + WARN_TIME && jobCard->timeExpire) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT)) continue; else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) continue; else { ls_syslog(LOG_INFO, \ "%s: warning period expired killing the job=%d", fname, jobCard->jobSpecs.jobId); jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); jobCard->jobSpecs.jStatus |= JOB_STAT_KILL; } } else if (!jobCard->timeExpire) { ls_syslog(LOG_INFO, I18N(5704, "%s: sending warning signal to job=%d"), /* catgets 5704 */ fname, jobCard->jobSpecs.jobId); jobsig(jobCard, SIGUSR2, FALSE); jobCard->timeExpire = TRUE; } } else { if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT)) continue; else { jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); } } } continue; } if (jobCard->jobSpecs.termTime && now 
> jobCard->jobSpecs.termTime && !(jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)) { if ((jobCard->jobSpecs.terminateActCmd == NULL) || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) { if (now > jobCard->jobSpecs.termTime + WARN_TIME && jobCard->timeExpire) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE)) continue; else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) continue; else { jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); jobCard->jobSpecs.jStatus |= JOB_STAT_KILL; } } else if (!jobCard->timeExpire) { jobsig(jobCard, SIGUSR2, FALSE); jobCard->timeExpire = TRUE; } } else { if (now > jobCard->jobSpecs.termTime) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE)) continue; else { jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); } } } continue; } if (! window_ok (jobCard) && !(jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)) { if (! 
(jobCard->jobSpecs.options & SUB_WINDOW_SIG) || ((jobCard->jobSpecs.options & SUB_WINDOW_SIG) && now - jobCard->windWarnTime >= WARN_TIME)) { jobSuspendAction(jobCard, SIG_SUSP_WINDOW, SUSP_QUEUE_WINDOW, 0); continue; } } else { jobResumeAction(jobCard, SIG_RESUME_WINDOW, SUSP_QUEUE_WINDOW); continue; } } if ((myhostnm = ls_getmyhostname()) == NULL) { ls_syslog(LOG_ERR, I18N_FUNC_FAIL_MM, fname, "ls_getmyhostname"); die(SLAVE_FATAL); } myload = ls_loadofhosts (NULL, 0, EXACT|EFFECTIVE, 0, &myhostnm, 1); if (myload == NULL) { if (myStatus != NO_LIM) ls_syslog(LOG_INFO, I18N_FUNC_FAIL_MM, fname, "ls_loadofhosts"); if (lserrno == LSE_LIM_BADHOST) relife(); if (lserrno == LSE_BAD_XDR) relife(); if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) { myStatus |= NO_LIM; tryChkpntMig(); } last_check = now; return; } else myStatus = 0; memcpy ((char *)&savedLoad, (char *)myload, sizeof (struct hostLoad)); savedLoad.li = (float *) my_malloc (allLsInfo->numIndx * sizeof (float), "job_checking"); savedLoad.status = (int *) my_malloc ((1 + GET_INTNUM(allLsInfo->numIndx)) * sizeof (int), "job_checking"); for (i = 0; i < allLsInfo->numIndx; i++) savedLoad.li[i] = myload->li[i]; for (i = 0; i < 1 + GET_INTNUM(allLsInfo->numIndx); i++) savedLoad.status[i] = myload->status[i]; tryResume (&savedLoad); if (!preempted) tryStop (myhostnm, &savedLoad); tryChkpntMig(); FREEUP(savedLoad.li); FREEUP(savedLoad.status); last_check = now; return; }
/*
 * do_sigjob - handle a "signal job" request and send back the reply.
 *
 * xdrs:   decode stream positioned at the jobSig payload.
 * chfd:   channel the reply is written to.
 * reqHdr: header of the incoming request, passed on to xdr_jobSig().
 *
 * The request is decoded, the target job is looked up on the job queue by
 * id, and - depending on the job's state - the signal is started through
 * jobSigStart(). The reply opcode is one of ERR_BAD_REQ, ERR_NO_JOB,
 * ERR_SIG_RETRY or ERR_NO_ERROR; only ERR_NO_ERROR carries a jobReply
 * body. When the request carries SUSP_MBD_LOCK, the job's action reasons
 * are overwritten for the duration of the call and restored if the reply
 * is an error.
 *
 * Paths that jump to Reply also record the new status through
 * sbdlog_newstatus(); paths that jump to Reply1 skip that.
 */
void do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_sigjob()";
    char reply_buf[MSGSIZE];
    XDR xdrs2;
    struct jobSig jobSig;
    sbdReplyType reply;
    struct jobReply jobReply;
    struct LSFHeader replyHdr;
    char *replyStruct;
    struct jobCard *jp = NULL;
    char found = FALSE;
    int sigValue;
    int savedActReasons = 0;
    int savedActSubReasons = 0;
    struct lsfAuth *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /*
     * Start from a zeroed request so the error paths below never read
     * indeterminate fields when decoding fails part-way.
     */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");
        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;
    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }

    if (found == FALSE) {
        reply = ERR_NO_JOB;
        jp = NULL;   /* the loop left jp on the list head, not on a job */
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Only terminating signals are acted on before the job started. */
        if (isSigTerm(sigValue) == TRUE) {
            if (jobSigStart (jp, sigValue, jobSig.actFlags,
                             jobSig.chkPeriod, NO_SIGLOG) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;
            goto Reply;
        }

        reply = ERR_SIG_RETRY;
        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        if (jobSigStart(jp, sigValue, jobSig.actFlags,
                        jobSig.chkPeriod, NO_SIGLOG) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if (jobSigStart(jp, sigValue, jobSig.actFlags,
                    jobSig.chkPeriod, NO_SIGLOG) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    sbdlog_newstatus(jp);

Reply1:
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;

    if (reply == ERR_NO_ERROR) {
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        /* Undo the temporary action reasons on failure. */
        if (reply != ERR_NO_JOB) {
            if ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
                jp->actReasons = savedActReasons;
                jp->actSubReasons = savedActSubReasons;
            }
        }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        /*
         * jp is NULL on the bad-request and no-such-job paths, so report
         * the id from the request rather than dereferencing jp.
         */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSig.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821, "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"),
                  fname, XDR_GETPOS(&xdrs2), lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }

    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);
    return;
}