/* job_resume():
 * Resume a stopped job: deliver SIGCONT, mark the job RUN, clear the
 * suspend reasons, and report the new status to mbatchd.
 */
int
job_resume(struct jobCard *jp)
{
    static char fname[] = "job_resume";
    int rep;

    if (jp->jobSpecs.actPid)
        return 0;

    if (jobsig(jp, SIGCONT, FALSE) < 0)
        return -1;

    SBD_SET_STATE(jp, JOB_STAT_RUN);
    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;

    rep = status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                     ERR_NO_ERROR);
    if (rep < 0)
        jp->notReported++;
    else if (jp->notReported > 0)
        jp->notReported = 0;

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC))
        ls_syslog(LOG_DEBUG1, "%s: Resume job %s", fname,
                  lsb_jobid2str(jp->jobSpecs.jobId));

    return 0;
}
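/* jobResumeAction():
 * Resume a job that was suspended for the given reason (e.g. a closed
 * run window).  Does nothing if the job is locked by mbatchd, already
 * has an action in progress, or was not suspended for this reason.
 */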
static int
jobResumeAction(struct jobCard *jp, int sigValue, int suspReason)
{
    static char fname[] = "jobResumeAction";

    if (jp->jobSpecs.reasons & SUSP_MBD_LOCK)
        return -1;

    if (jp->jobSpecs.actPid)
        return 0;

    if (!(jp->jobSpecs.reasons & suspReason))
        return -1;

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC))
        ls_syslog(LOG_DEBUG1,
                  "%s: Try to resume job %s with the current reason %d and the triggered reason %d",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.reasons, suspReason);

    if (jobSigStart(jp, sigValue, 0, 0, SIGLOG) < 0) {
        if (jobsig(jp, 0, FALSE) < 0) {
            SBD_SET_STATE(jp, JOB_STAT_EXIT);
            return -1;
        }
    }

    sbdlog_newstatus(jp);

    return 0;
}
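/* window_ok():
 * Evaluate the job's run-window definition against the current time,
 * updating jobPtr->active and jobPtr->windEdge, and return whether the
 * window is open.  With SUB_WINDOW_SIG set, the check looks ahead
 * WARN_TIME seconds and sends the job-specified warning signal before
 * the window closes.
 */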
char
window_ok(struct jobCard *jobPtr)
{
    windows_t *wp;
    struct dayhour dayhour;
    char active;
    time_t ckTime;
    time_t now;

    now = time(0);
    active = jobPtr->active;

    /* With SUB_WINDOW_SIG, look ahead WARN_TIME seconds so the warning
     * signal can be sent before the window actually closes. */
    if (active && (jobPtr->jobSpecs.options & SUB_WINDOW_SIG))
        ckTime = now + WARN_TIME;
    else
        ckTime = now;

    if (jobPtr->windEdge > ckTime || jobPtr->windEdge == 0)
        return (jobPtr->active);

    getDayHour(&dayhour, ckTime);

    if (jobPtr->week[dayhour.day] == NULL) {
        /* No window defined for this day; open until midnight. */
        jobPtr->active = TRUE;
        jobPtr->windEdge = now + (24.0 - dayhour.hour) * 3600.0;
        return (jobPtr->active);
    }

    jobPtr->active = FALSE;
    jobPtr->windEdge = now + (24.0 - dayhour.hour) * 3600.0;

    for (wp = jobPtr->week[dayhour.day]; wp; wp = wp->nextwind)
        checkWindow(&dayhour, &jobPtr->active, &jobPtr->windEdge, wp, now);

    if (active && !jobPtr->active
        && now - jobPtr->windWarnTime >= WARN_TIME
        && (jobPtr->jobSpecs.options & SUB_WINDOW_SIG)) {

        if (!(jobPtr->jobSpecs.jStatus & JOB_STAT_RUN))
            job_resume(jobPtr);

        jobsig(jobPtr, sig_decode(jobPtr->jobSpecs.sigValue), TRUE);
        jobPtr->windWarnTime = now;
    }

    return (jobPtr->active);
}
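/* checkFinish():
 * Walk the job list and reap jobs whose processes have gone away:
 * detect vanished or force-killed jobs, probe outstanding
 * action/cleanup processes and end the action when they are gone, and
 * finish jobs that have reached a terminal (or pending) state.
 */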
void
checkFinish(void)
{
    struct jobCard *jobCard, *nextJob;

    for (jobCard = jobQueHead->forw; jobCard != jobQueHead;
         jobCard = nextJob) {
        nextJob = jobCard->forw;

        if (!IS_FINISH(jobCard->jobSpecs.jStatus)
            && !IS_POST_FINISH(jobCard->jobSpecs.jStatus)) {
            /* Signal 0 probes whether the job's process group still
             * exists; a force-killed job that has outlived the grace
             * period is treated as gone as well. */
            if (jobsig(jobCard, 0, FALSE) < 0
                || ((jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)
                    && (jobCard->jobSpecs.termTime
                        < time(0) - MAX(6, jobTerminateInterval * 3)))) {
                jobGone(jobCard);
            }
        }

        if (jobCard->jobSpecs.actPid) {
            /* An action is still in progress; probe its process group,
             * its process, and the cleanup child in turn. */
            if (killpg(jobCard->jobSpecs.actPid, SIGCONT) == 0)
                continue;
            if (kill(jobCard->jobSpecs.actPid, SIGCONT) == 0)
                continue;
            if (jobCard->cleanupPid > 0
                && kill(jobCard->cleanupPid, SIGCONT) == 0)
                continue;
            sigActEnd(jobCard);
            continue;
        }

        if (IS_FINISH(jobCard->jobSpecs.jStatus)
            || IS_POST_FINISH(jobCard->jobSpecs.jStatus)
            || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND)) {
            job_finish(jobCard, TRUE);
        }
    }
}
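/* status_job():
 * Report a job's status change (and resource usage) to mbatchd on the
 * master host: build a statusReq, XDR-encode it, send it over the
 * status channel, and act on mbatchd's reply (lock, stop, forget, ...).
 * Returns 0 on success, -1 on failure.
 */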
int
status_job(mbdReqType reqType, struct jobCard *jp, int newStatus,
           sbdReplyType err)
{
    static char fname[] = "status_job()";
    static int seq = 1;
    static char lastHost[MAXHOSTNAMELEN];
    int reply;
    char *request_buf;
    char *reply_buf = NULL;
    XDR xdrs;
    struct LSFHeader hdr;
    int cc;
    struct statusReq statusReq;
    int flags;
    int i;
    int len;
    struct lsfAuth *auth = NULL;

    if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL))
        ls_syslog(LOG_DEBUG, "%s: Entering ... reqType %d jobId %s",
                  fname, reqType, lsb_jobid2str(jp->jobSpecs.jobId));

    if (newStatus == JOB_STAT_EXIT)
        jp->userJobSucc = FALSE;

    if (MASK_STATUS(newStatus) == JOB_STAT_DONE)
        jp->userJobSucc = TRUE;

    /* Post-execution status is only reported for successful jobs. */
    if (IS_POST_FINISH(newStatus)) {
        if (jp->userJobSucc != TRUE)
            return 0;
    }

    if (masterHost == NULL)
        return -1;

    if (jp->notReported < 0) {
        jp->notReported = -INFINIT_INT;
        return 0;
    }

    /* Fill in the status request from the job card. */
    statusReq.jobId = jp->jobSpecs.jobId;
    statusReq.actPid = jp->jobSpecs.actPid;
    statusReq.jobPid = jp->jobSpecs.jobPid;
    statusReq.jobPGid = jp->jobSpecs.jobPGid;
    statusReq.newStatus = newStatus;
    statusReq.reason = jp->jobSpecs.reasons;
    statusReq.subreasons = jp->jobSpecs.subreasons;
    statusReq.sbdReply = err;
    statusReq.lsfRusage = jp->lsfRusage;
    statusReq.execUid = jp->jobSpecs.execUid;
    statusReq.numExecHosts = 0;
    statusReq.execHosts = NULL;
    statusReq.exitStatus = jp->w_status;
    statusReq.execCwd = jp->jobSpecs.execCwd;
    statusReq.execHome = jp->jobSpecs.execHome;
    statusReq.execUsername = jp->execUsername;
    statusReq.queuePostCmd = "";
    statusReq.queuePreCmd = "";
    statusReq.msgId = jp->delieveredMsgId;

    /* For a finished job, report the peak resource usage seen. */
    if (IS_FINISH(newStatus)) {
        if (jp->maxRusage.mem > jp->runRusage.mem)
            jp->runRusage.mem = jp->maxRusage.mem;
        if (jp->maxRusage.swap > jp->runRusage.swap)
            jp->runRusage.swap = jp->maxRusage.swap;
        if (jp->maxRusage.stime > jp->runRusage.stime)
            jp->runRusage.stime = jp->maxRusage.stime;
        if (jp->maxRusage.utime > jp->runRusage.utime)
            jp->runRusage.utime = jp->maxRusage.utime;
    }

    statusReq.runRusage.mem = jp->runRusage.mem;
    statusReq.runRusage.swap = jp->runRusage.swap;
    statusReq.runRusage.utime = jp->runRusage.utime;
    statusReq.runRusage.stime = jp->runRusage.stime;
    statusReq.runRusage.npids = jp->runRusage.npids;
    statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
    statusReq.runRusage.npgids = jp->runRusage.npgids;
    statusReq.runRusage.pgid = jp->runRusage.pgid;
    statusReq.actStatus = jp->actStatus;
    statusReq.sigValue = jp->jobSpecs.actValue;
    statusReq.seq = seq;

    seq++;
    if (seq >= MAX_SEQ_NUM)
        seq = 1;

    /* Size the request buffer for the fixed part plus the strings and
     * the per-pid/per-pgid arrays. */
    len = 1024 + ALIGNWORD_(sizeof(struct statusReq));
    len += ALIGNWORD_(strlen(statusReq.execHome)) + 4
         + ALIGNWORD_(strlen(statusReq.execCwd)) + 4
         + ALIGNWORD_(strlen(statusReq.execUsername)) + 4;

    for (i = 0; i < statusReq.runRusage.npids; i++)
        len += ALIGNWORD_(sizeof(struct pidInfo)) + 4;
    for (i = 0; i < statusReq.runRusage.npgids; i++)
        len += ALIGNWORD_(sizeof(int)) + 4;

    if (logclass & (LC_TRACE | LC_COMM))
        ls_syslog(LOG_DEBUG, "%s: The length of the job message is: <%d>",
                  fname, len);

    if ((request_buf = malloc(len)) == NULL) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
        return -1;
    }

    xdrmem_create(&xdrs, request_buf, len, XDR_ENCODE);
    initLSFHeader_(&hdr);
    hdr.opCode = reqType;

    if (!xdr_encodeMsg(&xdrs, (char *) &statusReq, &hdr, xdr_statusReq,
                       0, auth)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "xdr_statusReq");
        lsb_merr2(I18N_FUNC_FAIL, fname, "xdr_statusReq");
        xdr_destroy(&xdrs);
        FREEUP(request_buf);
        relife();
    }

    flags = CALL_SERVER_NO_HANDSHAKE;
    if (statusChan >= 0)
        flags |= CALL_SERVER_USE_SOCKET;
    if (reqType == BATCH_RUSAGE_JOB)
        flags |= CALL_SERVER_NO_WAIT_REPLY;

    if (logclass & LC_COMM)
        ls_syslog(LOG_DEBUG1, "%s: before call_server statusChan=%d flags=%d",
                  fname, statusChan, flags);

    cc = call_server(masterHost, mbd_port, request_buf, XDR_GETPOS(&xdrs),
                     &reply_buf, &hdr, connTimeout, readTimeout,
                     &statusChan, NULL, NULL, flags);
    if (cc < 0) {
        statusChan = -1;
        if (!equalHost_(masterHost, lastHost)) {
            if (errno != EINTR)
                ls_syslog(LOG_DEBUG,
                          "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s",
                          fname, masterHost,
                          lsb_jobid2str(jp->jobSpecs.jobId), lsb_sysmsg());
            strcpy(lastHost, masterHost);
        }
        xdr_destroy(&xdrs);
        FREEUP(request_buf);
        failcnt++;
        return -1;
    }

    failcnt = 0;
    lastHost[0] = '\0';
    xdr_destroy(&xdrs);
    FREEUP(request_buf);
    if (cc)
        free(reply_buf);

    if (flags & CALL_SERVER_NO_WAIT_REPLY) {
        /* No reply expected; just make sure the connection is still up. */
        struct timeval timeval;

        timeval.tv_sec = 0;
        timeval.tv_usec = 0;

        if (rd_select_(chanSock_(statusChan), &timeval) == 0) {
            jp->needReportRU = FALSE;
            jp->lastStatusMbdTime = now;
            return 0;
        }

        CLOSECD(statusChan);
        if (logclass & LC_COMM)
            ls_syslog(LOG_DEBUG1,
                      "%s: Job <%s> rd_select() failed, assume connection broken",
                      fname, lsb_jobid2str(jp->jobSpecs.jobId));
        return -1;
    }

    reply = hdr.opCode;
    switch (reply) {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
        jp->needReportRU = FALSE;
        jp->lastStatusMbdTime = now;
        if (reply == LSBE_LOCK_JOB) {
            if (IS_SUSP(jp->jobSpecs.jStatus))
                jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
            else
                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5204,
                          "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."), /* catgets 5204 */
                          fname, lsb_jobid2str(jp->jobSpecs.jobId),
                          jp->jobSpecs.jStatus);
        }
        return 0;

    case LSBE_NO_JOB:
        if (!IS_POST_FINISH(jp->jobSpecs.jStatus)) {
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5205,
                      "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."), /* catgets 5205 */
                      fname, lsb_jobid2str(jp->jobSpecs.jobId), masterHost);
        }
        jp->notReported = -INFINIT_INT;
        return 0;

    case LSBE_STOP_JOB:
        if (jobsig(jp, SIGSTOP, TRUE) < 0)
            SET_STATE(jp->jobSpecs.jStatus, JOB_STAT_EXIT);
        else {
            SET_STATE(jp->jobSpecs.jStatus, JOB_STAT_USUSP);
            jp->jobSpecs.reasons |= SUSP_USER_STOP;
        }
        return -1;

    case LSBE_SBATCHD:
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5206,
                  "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"), /* catgets 5206 */
                  fname, masterHost, lsb_jobid2str(jp->jobSpecs.jobId));
        return -1;

    default:
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5207,
                  "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"), /* catgets 5207 */
                  fname, reply, masterHost, lsb_jobid2str(jp->jobSpecs.jobId));
        return -1;
    }
}
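/* job_checking():
 * Periodic job policing: reap finished jobs, enforce run-time limits
 * and termination deadlines, apply run-window suspend/resume actions,
 * then fetch the local load and try to resume, stop, or
 * checkpoint/migrate jobs accordingly.
 */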
void
job_checking(void)
{
    static char fname[] = "job_checking";
    struct jobCard *jobCard, *nextJob;
    struct hostLoad *myload, savedLoad;
    char *myhostnm;
    static time_t last_check;
    char preempted = FALSE;
    int i;

    if (last_check == 0)
        last_check = now;

    if (jobcnt <= 0) {
        last_check = now;
        return;
    }

    checkFinish();

    for (jobCard = jobQueHead->forw; jobCard != jobQueHead;
         jobCard = nextJob) {
        nextJob = jobCard->forw;

        if (IS_FINISH(jobCard->jobSpecs.jStatus)
            || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND))
            continue;

        /* Enforce CPU, swap, process and memory limits. */
        ruLimits(jobCard);

        if (IS_RUN_JOB_CMD(jobCard->jobSpecs.jStatus))
            jobCard->runTime += (int) (now - last_check);

        /* Run-time limit. */
        if (jobCard->runTime
            > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {
            if (jobCard->jobSpecs.terminateActCmd == NULL
                || jobCard->jobSpecs.terminateActCmd[0] == '\0') {
                if (jobCard->runTime
                    > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl
                      + WARN_TIME
                    && jobCard->timeExpire) {
                    if (IS_SUSP(jobCard->jobSpecs.jStatus)
                        && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                        && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
                    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
                        continue;
                    else {
                        ls_syslog(LOG_INFO,
                                  "%s: warning period expired killing the job=%d",
                                  fname, jobCard->jobSpecs.jobId);
                        jobSigStart(jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                        jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
                } else if (!jobCard->timeExpire) {
                    /* First time over the limit: send the warning signal. */
                    ls_syslog(LOG_INFO, I18N(5704,
                              "%s: sending warning signal to job=%d"), /* catgets 5704 */
                              fname, jobCard->jobSpecs.jobId);
                    jobsig(jobCard, SIGUSR2, FALSE);
                    jobCard->timeExpire = TRUE;
                }
            } else {
                if (jobCard->runTime
                    > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {
                    if (IS_SUSP(jobCard->jobSpecs.jStatus)
                        && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                        && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
                    else {
                        jobSigStart(jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
            continue;
        }

        /* Termination deadline. */
        if (jobCard->jobSpecs.termTime
            && now > jobCard->jobSpecs.termTime
            && !(jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)) {
            if (jobCard->jobSpecs.terminateActCmd == NULL
                || jobCard->jobSpecs.terminateActCmd[0] == '\0') {
                if (now > jobCard->jobSpecs.termTime + WARN_TIME
                    && jobCard->timeExpire) {
                    if (IS_SUSP(jobCard->jobSpecs.jStatus)
                        && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                        && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
                    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
                        continue;
                    else {
                        jobSigStart(jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                        jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
                } else if (!jobCard->timeExpire) {
                    jobsig(jobCard, SIGUSR2, FALSE);
                    jobCard->timeExpire = TRUE;
                }
            } else {
                if (now > jobCard->jobSpecs.termTime) {
                    if (IS_SUSP(jobCard->jobSpecs.jStatus)
                        && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                        && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
                    else {
                        jobSigStart(jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
            continue;
        }

        /* Run windows: suspend outside the window, resume inside it. */
        if (!window_ok(jobCard)
            && !(jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)) {
            if (!(jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                || ((jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                    && now - jobCard->windWarnTime >= WARN_TIME)) {
                jobSuspendAction(jobCard, SIG_SUSP_WINDOW, SUSP_QUEUE_WINDOW, 0);
                continue;
            }
        } else {
            jobResumeAction(jobCard, SIG_RESUME_WINDOW, SUSP_QUEUE_WINDOW);
            continue;
        }
    }

    if ((myhostnm = ls_getmyhostname()) == NULL) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_MM, fname, "ls_getmyhostname");
        die(SLAVE_FATAL);
    }

    myload = ls_loadofhosts(NULL, 0, EXACT | EFFECTIVE, 0, &myhostnm, 1);
    if (myload == NULL) {
        if (myStatus != NO_LIM)
            ls_syslog(LOG_INFO, I18N_FUNC_FAIL_MM, fname, "ls_loadofhosts");
        if (lserrno == LSE_LIM_BADHOST)
            relife();
        if (lserrno == LSE_BAD_XDR)
            relife();
        if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) {
            myStatus |= NO_LIM;
            tryChkpntMig();
        }
        last_check = now;
        return;
    } else
        myStatus = 0;

    /* Take a private copy of the load vector before acting on it. */
    memcpy((char *) &savedLoad, (char *) myload, sizeof(struct hostLoad));
    savedLoad.li = (float *) my_malloc(allLsInfo->numIndx * sizeof(float),
                                       "job_checking");
    savedLoad.status = (int *) my_malloc((1 + GET_INTNUM(allLsInfo->numIndx))
                                         * sizeof(int), "job_checking");
    for (i = 0; i < allLsInfo->numIndx; i++)
        savedLoad.li[i] = myload->li[i];
    for (i = 0; i < 1 + GET_INTNUM(allLsInfo->numIndx); i++)
        savedLoad.status[i] = myload->status[i];

    tryResume(&savedLoad);
    if (!preempted)
        tryStop(myhostnm, &savedLoad);
    tryChkpntMig();

    FREEUP(savedLoad.li);
    FREEUP(savedLoad.status);

    last_check = now;
    return;
}
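/* ruLimits():
 * Compare the job's current resource usage against its decoded CPU,
 * swap, process-count and (optionally) memory limits, and start the
 * corresponding terminate action or kill signal when a limit is
 * exceeded.
 */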
static void
ruLimits(struct jobCard *jobCard)
{
    struct rlimit rlimit;

    /* CPU time limit. */
    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_CPU],
                  &rlimit, LSF_RLIMIT_CPU);
    if (rlimit.rlim_cur != RLIM_INFINITY && lsbJobCpuLimit != 0) {
        if ((long) rlimit.rlim_cur < ((long) jobCard->runRusage.utime
                                      + (long) jobCard->runRusage.stime)) {
            if (!(jobCard->jobSpecs.jStatus & JOB_STAT_KILL)) {
                jobSigStart(jobCard, SIG_TERM_CPULIMIT, 0, 0, SIGLOG);
                sbdlog_newstatus(jobCard);
                jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
            }
        }
    }

    /* Swap limit (rlim_cur is in bytes, runRusage.swap in KB). */
    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_SWAP],
                  &rlimit, LSF_RLIMIT_SWAP);
    if (rlimit.rlim_cur != RLIM_INFINITY) {
        if ((long) (rlimit.rlim_cur / 1024) < (long) jobCard->runRusage.swap) {
            jobsig(jobCard, SIGQUIT, FALSE);
            jobsig(jobCard, SIGKILL, TRUE);
        }
    }

    /* Process-count limit. */
    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_PROCESS],
                  &rlimit, LSF_RLIMIT_PROCESS);
    if (rlimit.rlim_cur != RLIM_INFINITY) {
        if ((int) rlimit.rlim_cur + 2 < jobCard->runRusage.npids) {
            if (IS_SUSP(jobCard->jobSpecs.jStatus)
                && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                && (jobCard->jobSpecs.subreasons & SUB_REASON_PROCESSLIMIT))
                return;
            else {
                jobSigStart(jobCard, SIG_TERM_PROCESSLIMIT, 0, 0, SIGLOG);
                sbdlog_newstatus(jobCard);
            }
        }
    }

    /* Memory limit, if enforcement is enabled. */
    if (lsbJobMemLimit == 1
        || (lsbJobMemLimit != 0 && lsbMemEnforce == TRUE)) {
        rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RSS],
                      &rlimit, LSF_RLIMIT_RSS);
        if (rlimit.rlim_cur != RLIM_INFINITY) {
            if ((long) (rlimit.rlim_cur / 1024) < (long) jobCard->runRusage.mem) {
                if (!(jobCard->jobSpecs.jStatus & JOB_STAT_KILL)) {
                    jobSigStart(jobCard, SIG_TERM_MEMLIMIT, 0, 0, SIGLOG);
                    sbdlog_newstatus(jobCard);
                    jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                }
            }
        }
    }
}
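/* chkpntEnd():
 * Finish a checkpoint (or migration) action: re-stop a suspended job,
 * handle migration cleanup and state changes, report the result to
 * mbatchd, and on success reset the action bookkeeping.  *freed is set
 * to TRUE if the job card was deallocated.
 */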
static void
chkpntEnd(struct jobCard *jobCard, int w_status, bool_t *freed)
{
    static char fname[] = "chkpntEnd()";
    int savePid, saveStatus;

    /* Make sure a suspended (non-migrating) job is stopped again. */
    if (IS_SUSP(jobCard->jobSpecs.jStatus)
        && !(jobCard->jobSpecs.jStatus & JOB_STAT_MIG))
        jobsig(jobCard, SIGSTOP, TRUE);

    saveStatus = jobCard->jobSpecs.jStatus;

    if (jobCard->jobSpecs.jStatus & JOB_STAT_MIG) {
        if (w_status == 0) {
            if (!jobCard->missing) {
                jobCard->missing = TRUE;
                need_checkfinish = TRUE;
                return;
            } else if (jobCard->notReported == 0)
                return;

            if (jobCard->cleanupPid == 0) {
                if ((jobCard->cleanupPid = rmJobBufFilesPid(jobCard)) > 0)
                    return;
                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5709,
                          "%s: Unable to cleanup migrating job <%s>"), /* catgets 5709 */
                          fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
            }
            SBD_SET_STATE(jobCard, JOB_STAT_PEND);
        } else {
            /* Checkpoint failed; drop the migration flag. */
            jobCard->jobSpecs.jStatus &= ~JOB_STAT_MIG;
        }
    }

    savePid = jobCard->jobSpecs.actPid;
    if (status_job(BATCH_STATUS_JOB, jobCard, jobCard->jobSpecs.jStatus,
                   w_status == 0 ? ERR_NO_ERROR : ERR_SYSACT_FAIL) < 0) {
        /* Could not reach mbatchd; keep the old action state and retry. */
        jobCard->jobSpecs.actPid = savePid;
        jobCard->jobSpecs.jStatus = saveStatus;
    } else {
        jobCard->lastChkpntTime = now;
        jobCard->jobSpecs.actPid = 0;
        jobCard->actStatus = ACT_NO;
        jobCard->jobSpecs.actValue = SIG_NULL;
        if (w_status == 0)
            jobCard->migCnt = 1;

        if (saveStatus & JOB_STAT_MIG) {
            if (w_status == 0) {
                cleanupMigJob(jobCard);
                deallocJobCard(jobCard);
                *freed = TRUE;
            } else
                jobCard->migCnt *= 2;
        }
    }
}