static int jobResumeAction (struct jobCard *jp, int sigValue, int suspReason) { static char fname[] = "jobResumeAction"; if (jp->jobSpecs.reasons & SUSP_MBD_LOCK) { return -1; }; if (jp->jobSpecs.actPid) return 0; if (!(jp->jobSpecs.reasons & suspReason)) return -1; if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) ls_syslog(LOG_DEBUG1, "%s: Try to resume job %s with the current reason %d and the triggered reason %d;", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.reasons, suspReason); if (jobSigStart(jp, sigValue, 0, 0, SIGLOG) < 0) if (jobsig(jp, 0, FALSE) < 0) { SBD_SET_STATE(jp, JOB_STAT_EXIT); return -1; } sbdlog_newstatus(jp); return 0; }
void jobSuspendAction(struct jobCard *jp, int sigValue, int suspReasons, int suspSubReasons) { static char fname[] = "jobSuspendAction"; if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) ls_syslog(LOG_DEBUG1, "%s: Suspend job %s; reasons=%x, subresons=%d, sigValue=%d, status=%x", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.reasons, jp->jobSpecs.subreasons, sigValue, jp->jobSpecs.jStatus); jp->actReasons = suspReasons; jp->actSubReasons = suspSubReasons; if (!JOB_RUNNING(jp)) return; if( jp->postJobStarted ) { return; } if (IS_SUSP (jp->jobSpecs.jStatus)) { if (jp->jobSpecs.reasons & suspReasons) return; else if (jp->jobSpecs.sigMap[-sigValue] == 0) return; } if ((jp->jobSpecs.actPid) && ((jp->jobSpecs.actValue == sigValue) || (jp->jobSpecs.actValue == (sigValue + jp->jobSpecs.sigMap[-sigValue])))) return; if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) ls_syslog(LOG_DEBUG1, "%s: Call jobSigStart(sigValue =%d) to suspend job", fname, sigValue + jp->jobSpecs.sigMap[-(sigValue)]); jobSigStart(jp, sigValue + jp->jobSpecs.sigMap[-(sigValue)], 0, 0, SIGLOG); sbdlog_newstatus(jp); }
void job_checking (void) { static char fname[] = "job_checking"; struct jobCard *jobCard, *nextJob; struct hostLoad *myload, savedLoad; char *myhostnm; static time_t last_check; char preempted = FALSE; int i; if (last_check == 0) last_check = now; if (jobcnt <= 0) { last_check = now; return; } checkFinish (); for (jobCard = jobQueHead->forw; (jobCard != jobQueHead); jobCard = nextJob) { nextJob = jobCard->forw; if (IS_FINISH(jobCard->jobSpecs.jStatus) || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND)) continue; ruLimits(jobCard); if (IS_RUN_JOB_CMD(jobCard->jobSpecs.jStatus)) { jobCard->runTime += (int) (now - last_check); } if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) { if ((jobCard->jobSpecs.terminateActCmd == NULL) || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) { if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl + WARN_TIME && jobCard->timeExpire) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT)) continue; else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) continue; else { ls_syslog(LOG_INFO, \ "%s: warning period expired killing the job=%d", fname, jobCard->jobSpecs.jobId); jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); jobCard->jobSpecs.jStatus |= JOB_STAT_KILL; } } else if (!jobCard->timeExpire) { ls_syslog(LOG_INFO, I18N(5704, "%s: sending warning signal to job=%d"), /* catgets 5704 */ fname, jobCard->jobSpecs.jobId); jobsig(jobCard, SIGUSR2, FALSE); jobCard->timeExpire = TRUE; } } else { if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT)) continue; else { jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); } } } continue; } if (jobCard->jobSpecs.termTime && now 
> jobCard->jobSpecs.termTime && !(jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)) { if ((jobCard->jobSpecs.terminateActCmd == NULL) || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) { if (now > jobCard->jobSpecs.termTime + WARN_TIME && jobCard->timeExpire) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE)) continue; else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) continue; else { jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); jobCard->jobSpecs.jStatus |= JOB_STAT_KILL; } } else if (!jobCard->timeExpire) { jobsig(jobCard, SIGUSR2, FALSE); jobCard->timeExpire = TRUE; } } else { if (now > jobCard->jobSpecs.termTime) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE)) continue; else { jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); } } } continue; } if (! window_ok (jobCard) && !(jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)) { if (! 
(jobCard->jobSpecs.options & SUB_WINDOW_SIG) || ((jobCard->jobSpecs.options & SUB_WINDOW_SIG) && now - jobCard->windWarnTime >= WARN_TIME)) { jobSuspendAction(jobCard, SIG_SUSP_WINDOW, SUSP_QUEUE_WINDOW, 0); continue; } } else { jobResumeAction(jobCard, SIG_RESUME_WINDOW, SUSP_QUEUE_WINDOW); continue; } } if ((myhostnm = ls_getmyhostname()) == NULL) { ls_syslog(LOG_ERR, I18N_FUNC_FAIL_MM, fname, "ls_getmyhostname"); die(SLAVE_FATAL); } myload = ls_loadofhosts (NULL, 0, EXACT|EFFECTIVE, 0, &myhostnm, 1); if (myload == NULL) { if (myStatus != NO_LIM) ls_syslog(LOG_INFO, I18N_FUNC_FAIL_MM, fname, "ls_loadofhosts"); if (lserrno == LSE_LIM_BADHOST) relife(); if (lserrno == LSE_BAD_XDR) relife(); if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) { myStatus |= NO_LIM; tryChkpntMig(); } last_check = now; return; } else myStatus = 0; memcpy ((char *)&savedLoad, (char *)myload, sizeof (struct hostLoad)); savedLoad.li = (float *) my_malloc (allLsInfo->numIndx * sizeof (float), "job_checking"); savedLoad.status = (int *) my_malloc ((1 + GET_INTNUM(allLsInfo->numIndx)) * sizeof (int), "job_checking"); for (i = 0; i < allLsInfo->numIndx; i++) savedLoad.li[i] = myload->li[i]; for (i = 0; i < 1 + GET_INTNUM(allLsInfo->numIndx); i++) savedLoad.status[i] = myload->status[i]; tryResume (&savedLoad); if (!preempted) tryStop (myhostnm, &savedLoad); tryChkpntMig(); FREEUP(savedLoad.li); FREEUP(savedLoad.status); last_check = now; return; }
static void tryChkpntMig(void) { char migrating = FALSE; struct jobCard *jobCard, *nextJob; for (jobCard = jobQueHead->forw; (jobCard != jobQueHead); jobCard = jobCard->forw) { if (jobCard->jobSpecs.jStatus & JOB_STAT_MIG) { migrating = TRUE; break; } } for (jobCard = jobQueHead->forw; jobCard != jobQueHead; jobCard = nextJob) { nextJob = jobCard->forw; if (jobCard->missing) continue; if ((jobCard->jobSpecs.jStatus & JOB_STAT_SSUSP) && !migrating && !(jobCard->jobSpecs.jStatus & JOB_STAT_MIG) && jobCard->jobSpecs.actPid == 0 && (jobCard->jobSpecs.options & (SUB_CHKPNTABLE | SUB_RERUNNABLE)) && (now - jobCard->jobSpecs.lastSSuspTime > jobCard->jobSpecs.migThresh) && (now - jobCard->lastChkpntTime > jobCard->migCnt * sbdSleepTime) && !(jobCard->jobSpecs.reasons & SUSP_QUEUE_WINDOW)) { if (jobSigStart (jobCard, SIG_CHKPNT, LSB_CHKPNT_KILL, jobCard->jobSpecs.chkPeriod, SIGLOG) == 0) { jobCard->jobSpecs.jStatus |= JOB_STAT_MIG; migrating = TRUE; sbdlog_newstatus(jobCard); continue; } } if (!(jobCard->jobSpecs.jStatus & JOB_STAT_MIG) && (jobCard->jobSpecs.jStatus & JOB_STAT_RUN) && jobCard->jobSpecs.actPid == 0 && jobCard->jobSpecs.chkPeriod && now - jobCard->lastChkpntTime > jobCard->jobSpecs.chkPeriod) { if (jobSigStart (jobCard, SIG_CHKPNT, 0, jobCard->jobSpecs.chkPeriod, SIGLOG) == 0) { sbdlog_newstatus(jobCard); continue; } } } }
static void ruLimits(struct jobCard *jobCard) { struct rlimit rlimit; rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_CPU], &rlimit, LSF_RLIMIT_CPU); if (rlimit.rlim_cur != RLIM_INFINITY && lsbJobCpuLimit != 0) { if ((long)rlimit.rlim_cur < ((long)jobCard->runRusage.utime + (long)jobCard->runRusage.stime)) { if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) { } else { jobSigStart (jobCard, SIG_TERM_CPULIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); jobCard->jobSpecs.jStatus |= JOB_STAT_KILL; } } } rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_SWAP], &rlimit, LSF_RLIMIT_SWAP); if (rlimit.rlim_cur != RLIM_INFINITY) { if ((long)(rlimit.rlim_cur / 1024) < (long)jobCard->runRusage.swap) { jobsig(jobCard, SIGQUIT, FALSE); jobsig(jobCard, SIGKILL, TRUE); } } rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_PROCESS], &rlimit, LSF_RLIMIT_PROCESS); if (rlimit.rlim_cur != RLIM_INFINITY) { if ((int)rlimit.rlim_cur + 2 < jobCard->runRusage.npids) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_PROCESSLIMIT)) return; else { jobSigStart (jobCard, SIG_TERM_PROCESSLIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); } } } if ( (lsbJobMemLimit == 1) || (lsbJobMemLimit != 0 && lsbMemEnforce == TRUE)) { rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RSS], &rlimit, LSF_RLIMIT_RSS); if (rlimit.rlim_cur != RLIM_INFINITY) { if ((long)(rlimit.rlim_cur / 1024) < (long)jobCard->runRusage.mem) { if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) { } else { jobSigStart (jobCard, SIG_TERM_MEMLIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); jobCard->jobSpecs.jStatus |= JOB_STAT_KILL; } } } } }
/*
 * do_sigjob - handle a "signal job" request.
 *
 * xdrs    XDR stream the jobSig request is decoded from
 * chfd    channel the reply is written to with chanWrite_()
 * reqHdr  header of the request, passed to xdr_jobSig()
 *
 * The request is decoded, the job is looked up on the job queue by job id
 * and, depending on the job's state, the signal action is started with
 * jobSigStart().  A reply header carrying the result code is always sent
 * back; a jobReply body is attached only when the result is ERR_NO_ERROR.
 *
 * When the request carries SUSP_MBD_LOCK the job's reasons/subreasons and
 * actReasons/actSubReasons are overwritten from the request; on any
 * non-success reply actReasons/actSubReasons are put back to their
 * previous values.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_sigjob()";
    char reply_buf[MSGSIZE];
    XDR xdrs2;
    struct jobSig jobSig;
    sbdReplyType reply;
    struct jobReply jobReply;
    struct LSFHeader replyHdr;
    char *replyStruct;
    struct jobCard *jp = NULL;
    char found = FALSE;
    int sigValue;
    int savedActReasons = 0;
    int savedActSubReasons = 0;
    struct lsfAuth *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /* Zeroed so that jobSig.jobId and jobSig.reasons, which are read on the
     * reply path, are well defined even when decoding fails. */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");
        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;

    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }

    if (found == FALSE) {
        reply = ERR_NO_JOB;
        /* The loop leaves jp at the list head; make "no job" explicit. */
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Only terminating signals are acted on before the job has
         * started; everything else is answered with "retry". */
        if (isSigTerm(sigValue) == TRUE) {
            if (jobSigStart (jp, sigValue, jobSig.actFlags,
                             jobSig.chkPeriod, NO_SIGLOG) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;
            goto Reply;
        }

        reply = ERR_SIG_RETRY;
        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        if (jobSigStart(jp, sigValue, jobSig.actFlags,
                        jobSig.chkPeriod, NO_SIGLOG) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if (jobSigStart(jp, sigValue, jobSig.actFlags,
                    jobSig.chkPeriod, NO_SIGLOG) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    /* Only reached with a valid jp. */
    sbdlog_newstatus(jp);

Reply1:
    /* jp is NULL here for ERR_BAD_REQ and ERR_NO_JOB. */
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;

    if (reply == ERR_NO_ERROR) {
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        /* Undo the actReasons override made above for SUSP_MBD_LOCK. */
        if ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
            jp->actReasons = savedActReasons;
            jp->actSubReasons = savedActSubReasons;
        }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0,
                       auth)) {
        /* jp may be NULL on this path; fall back to the job id from the
         * request instead of dereferencing it. */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jp != NULL ? jp->jobSpecs.jobId
                                           : jobSig.jobId),
                  "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
                  "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"),
                  fname, XDR_GETPOS(&xdrs2),
                  lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }

    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);
    return;
}