static int shouldStop (struct hostLoad *loadV, struct jobCard *jobCard, int *reasons, int *subreasons, int num, int *stopmore) { static char fname[] = "shouldStop"; int i, numLoad = -1, j; struct hostLoad *load = NULL; static struct tclHostData tclHostData; static int first = TRUE; *reasons = 0; *subreasons = 0; if( jobCard->postJobStarted ) { return false; } if (jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP) return false; if (now - jobCard->windWarnTime < sbdSleepTime) return FALSE; if (!JOB_STARTED(jobCard)) return FALSE; if (LS_ISUNAVAIL(loadV->status)) return FALSE; if (num <= 0) return FALSE; for (i = 0; i <jobCard->jobSpecs.numToHosts && (*reasons) == 0; i++) { if (i > 0 && !strcmp (jobCard->jobSpecs.toHosts[i], jobCard->jobSpecs.toHosts[i-1])) continue; numLoad++; load = NULL; for (j = 0; j < num; j ++) { if (equalHost_(jobCard->jobSpecs.toHosts[i], loadV[j].hostName)) { load = &(loadV[j]); break; } } if (load == NULL) { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5705, "%s: Can not find load information for host <%s>"), fname, jobCard->jobSpecs.toHosts[i]); /* catgets 5705 */ return FALSE; } if (LS_ISLOCKEDU(load->status) && !(jobCard->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE)) { *reasons = SUSP_HOST_LOCK; *stopmore = TRUE; } else if (LS_ISLOCKEDM(load->status)) { *reasons = SUSP_HOST_LOCK_MASTER; *stopmore = TRUE; } else if (load->li[IT] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][IT] && load->li[IT] != -INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][IT] != -INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = IT; *stopmore = TRUE; } else if (load->li[LS] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][LS] && load->li[LS] != INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][LS] != INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = LS; *stopmore = TRUE; } else if (load->li[UT] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][UT] && load->li[UT] != INFINIT_LOAD && 
jobCard->jobSpecs.thresholds.loadStop[numLoad][UT] != INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = UT; } else if(load->li[PG] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][PG] && load->li[PG] != INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][PG] != INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = PG; } else if(load->li[IO] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][IO] && load->li[IO] != INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][IO] != INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = IO; } else if(load->li[MEM] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][MEM] && load->li[MEM] != -INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][MEM] != -INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = MEM; } else if(load->li[SWP] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][SWP] && load->li[SWP] != -INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][SWP] != -INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = SWP; } else if(load->li[TMP] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][TMP] && load->li[TMP] != -INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][TMP] != -INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = TMP; } for (j = R15S; !(*reasons) && j <= R15M; j++) if ((load->li[j] != INFINIT_LOAD) && (jobCard->jobSpecs.thresholds.loadStop[numLoad][j] != INFINIT_LOAD) && (load->li[j] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][j])) { *reasons |= SUSP_LOAD_REASON; *subreasons = j; break; } for (j = MEM + 1; !(*reasons) && j < MIN(allLsInfo->numIndx, jobCard->jobSpecs.thresholds.nIdx); j++) { if (load->li[j] >= INFINIT_LOAD || load->li[j] <= -INFINIT_LOAD || jobCard->jobSpecs.thresholds.loadStop[numLoad][j] >= INFINIT_LOAD || jobCard->jobSpecs.thresholds.loadStop[numLoad][j] <= -INFINIT_LOAD) { continue; } if (allLsInfo->resTable[j].orderType == INCR) { if (load->li[j] >= 
jobCard->jobSpecs.thresholds.loadStop[numLoad][j]) { *reasons |= SUSP_LOAD_REASON; *subreasons = j; break; } } else { if (load->li[j] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][j]) { *reasons |= SUSP_LOAD_REASON; *subreasons = j; break; } } } if (!(*reasons) && jobCard->stopCondVal != NULL) { int returnCode; if (first == TRUE) { initTclHostData (&tclHostData); returnCode = getTclHostData (load, &tclHostData, FALSE); first = FALSE; } else { returnCode = getTclHostData (load, &tclHostData, TRUE); } if (returnCode >= 0 && evalResReq (jobCard->stopCondVal->selectStr, &tclHostData, DFT_FROMTYPE) == 1) { *reasons |= SUSP_QUE_STOP_COND; break; } } } if (! (*reasons)) return FALSE; if (LS_ISLOCKEDU(load->status) || LS_ISLOCKEDM(load->status)) { return TRUE; } else if (shouldStop1 (load)) { if (logclass & (LC_SCHED | LC_EXEC)) ls_syslog (LOG_DEBUG2, "%s: Should stop job %s; reason=%x, subreasons=%d", fname, lsb_jobid2str(jobCard->jobSpecs.jobId), *reasons, *subreasons); return TRUE; } return FALSE; }
/*
 * do_sigjob - serve a "signal job" request read from xdrs and write the
 * reply on channel chfd.
 *
 * xdrs    XDR stream holding the encoded struct jobSig request
 * chfd    channel the encoded reply is written to
 * reqHdr  header of the incoming request, passed to xdr_jobSig()
 *
 * The reply header carries the sbdReplyType in opCode.  A struct jobReply
 * describing the job follows only when the reply is ERR_NO_ERROR.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_sigjob()";
    char reply_buf[MSGSIZE];
    XDR xdrs2;
    struct jobSig jobSig;
    sbdReplyType reply;
    struct jobReply jobReply;
    struct LSFHeader replyHdr;
    char *replyStruct;
    struct jobCard *jp = NULL;
    char found = FALSE;
    int sigValue;
    int savedActReasons = 0;
    int savedActSubReasons = 0;
    struct lsfAuth *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /* Zero the request so that jobSig.jobId is well defined in the error
     * logs below even when decoding fails. */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");
        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;
    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    /* Look the job up in the job queue. */
    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }
    if (found == FALSE) {
        reply = ERR_NO_JOB;
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        /* Remember the action reasons; they are put back below when the
         * request does not end with ERR_NO_ERROR. */
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Only signals accepted by isSigTerm() are delivered to a job
         * that has not started; anything else is to be retried. */
        if (isSigTerm(sigValue) == TRUE) {
            if (jobSigStart (jp, sigValue, jobSig.actFlags,
                             jobSig.chkPeriod, NO_SIGLOG) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;
            goto Reply;
        }

        reply = ERR_SIG_RETRY;
        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>", fname, getLsbSigSymbol(sigValue), lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        if (jobSigStart(jp, sigValue, jobSig.actFlags,
                        jobSig.chkPeriod, NO_SIGLOG) < 0) {
            reply = ERR_SIG_RETRY;
        } else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if (jobSigStart(jp, sigValue, jobSig.actFlags,
                    jobSig.chkPeriod, NO_SIGLOG) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    /* Every path that reaches this label has a valid jp. */
    sbdlog_newstatus(jp);

Reply1:
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR) {
        /* ERR_NO_ERROR is only ever set after the job was found. */
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        /* Undo the action reasons recorded for SUSP_MBD_LOCK above. */
        if (reply != ERR_NO_JOB
            && jp != NULL
            && (jobSig.reasons & SUSP_MBD_LOCK)) {
            jp->actReasons = savedActReasons;
            jp->actSubReasons = savedActSubReasons;
        }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        /* jp is NULL on the ERR_BAD_REQ and ERR_NO_JOB paths, so report
         * the job id taken from the request instead. */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSig.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
            "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"), fname, XDR_GETPOS(&xdrs2), lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }

    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);
    return;
}