/*
 * do_reboot - acknowledge a reboot command on chfd and then relife().
 *
 * xdrs   - request stream; this handler does not read from it.
 * chfd   - channel the header-only reply is written to.
 * reqHdr - request header; only its opCode is examined.
 *
 * If encoding or sending the reply fails, the error is logged, the encode
 * stream is destroyed and relife() is still invoked.  On the normal path,
 * control returning from relife() is logged as a failure to relife.
 */
void
do_reboot(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_reboot()";
    char outBuf[MSGSIZE / 8];
    XDR outXdr;
    struct LSFHeader outHdr;
    sbdReplyType status = ERR_NO_ERROR;

    if (logclass & LC_TRACE)
        ls_syslog(LOG_DEBUG, "%s: Entering this routine...", fname);

    xdrmem_create(&outXdr, outBuf, MSGSIZE / 8, XDR_ENCODE);
    initLSFHeader_(&outHdr);

    /* Default to the sbd reply code; a genuine reboot request gets the
     * LSBE_ success code instead. */
    outHdr.opCode = status;
    if (reqHdr->opCode == CMD_SBD_REBOOT)
        outHdr.opCode = LSBE_NO_ERROR;

    if (!xdr_encodeMsg(&outXdr, (char *) 0, &outHdr, 0, 0, NULL)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_encodeMsg");
        goto bail;
    }

    if (chanWrite_(chfd, outBuf, XDR_GETPOS(&outXdr)) <= 0) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_M, fname, "chanWrite_");
        goto bail;
    }

    ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5828,
        "Slave batch daemon reboot command received")); /* catgets 5828 */

    relife();

    /* Only reached when relife() came back. */
    ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5829,
        "Unable to relife during rebooting: %m")); /* catgets 5829 */
    xdr_destroy(&outXdr);
    return;

bail:
    xdr_destroy(&outXdr);
    relife();
}
/*
 * do_shutdown - acknowledge a shutdown command on chfd, then die().
 *
 * xdrs   - request stream; this handler does not read from it.
 * chfd   - channel the header-only reply is written to.
 * reqHdr - request header; only its opCode is examined.
 *
 * If the reply cannot be encoded or sent, the error is logged, the encode
 * stream is destroyed, relife() is invoked and the function returns
 * without calling die().
 */
void
do_shutdown(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_shutdown()";
    char outBuf[MSGSIZE / 8];
    XDR outXdr;
    struct LSFHeader outHdr;
    sbdReplyType status = ERR_NO_ERROR;

    xdrmem_create(&outXdr, outBuf, MSGSIZE / 8, XDR_ENCODE);
    initLSFHeader_(&outHdr);

    /* Default to the sbd reply code; a genuine shutdown request gets the
     * LSBE_ success code instead. */
    outHdr.opCode = status;
    if (reqHdr->opCode == CMD_SBD_SHUTDOWN)
        outHdr.opCode = LSBE_NO_ERROR;

    if (!xdr_encodeMsg(&outXdr, (char *) 0, &outHdr, 0, 0, NULL)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_encodeMsg");
        goto bail;
    }

    if (chanWrite_(chfd, outBuf, XDR_GETPOS(&outXdr)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5835,
            "%s: Sending shutdown reply to master failed: %m"), /* catgets 5835 */
            fname);
        goto bail;
    }

    xdr_destroy(&outXdr);
    ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5836,
        "Slave batch daemon shutdown command received")); /* catgets 5836 */
    die(SLAVE_SHUTDOWN);
    return;

bail:
    xdr_destroy(&outXdr);
    relife();
}
/*
 * status_job - report a job's status (or resource usage) to mbatchd.
 *
 * reqType   - request opcode placed in the outgoing header; when it is
 *             BATCH_RUSAGE_JOB the call does not wait for a reply.
 * jp        - job card whose state is reported.
 * newStatus - status value sent as statusReq.newStatus.
 * err       - sbd reply code sent as statusReq.sbdReply.
 *
 * Returns 0 when the report was delivered/accepted or deliberately
 * skipped, and -1 on any failure (no master host, allocation or encoding
 * failure, call_server() failure, or a rejecting reply code).
 *
 * Side effects visible here: updates jp->userJobSucc, may raise
 * jp->runRusage to the recorded maxima when the job is finished, advances
 * a static sequence number, and maintains the globals statusChan/failcnt.
 */
int
status_job (mbdReqType reqType, struct jobCard *jp, int newStatus,
            sbdReplyType err)
{
    static char fname[] = "status_job()";
    static int seq = 1;
    static char lastHost[MAXHOSTNAMELEN];
    int reply;
    char *request_buf;
    char *reply_buf = NULL;
    XDR xdrs;
    struct LSFHeader hdr;
    int cc;
    struct statusReq statusReq;
    int flags;
    int i;
    int len;
    struct lsfAuth *auth = NULL;

    if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL))
        ls_syslog (LOG_DEBUG, "%s: Entering ... regType %d jobId %s",
                   fname, reqType, lsb_jobid2str (jp->jobSpecs.jobId));

    if (newStatus == JOB_STAT_EXIT)
        jp->userJobSucc = FALSE;
    if (MASK_STATUS (newStatus) == JOB_STAT_DONE)
        jp->userJobSucc = TRUE;

    /* Post-finish states are only reported for jobs recorded as
     * successful. */
    if (IS_POST_FINISH (newStatus) && jp->userJobSucc != TRUE)
        return 0;

    if (masterHost == NULL)
        return -1;

    /* A negative counter marks a job that is no longer reported. */
    if (jp->notReported < 0) {
        jp->notReported = -INFINIT_INT;
        return (0);
    }

    statusReq.jobId = jp->jobSpecs.jobId;
    statusReq.actPid = jp->jobSpecs.actPid;
    statusReq.jobPid = jp->jobSpecs.jobPid;
    statusReq.jobPGid = jp->jobSpecs.jobPGid;
    statusReq.newStatus = newStatus;
    statusReq.reason = jp->jobSpecs.reasons;
    statusReq.subreasons = jp->jobSpecs.subreasons;
    statusReq.sbdReply = err;
    statusReq.lsfRusage = jp->lsfRusage;
    statusReq.execUid = jp->jobSpecs.execUid;
    statusReq.numExecHosts = 0;
    statusReq.execHosts = NULL;
    statusReq.exitStatus = jp->w_status;
    statusReq.execCwd = jp->jobSpecs.execCwd;
    statusReq.execHome = jp->jobSpecs.execHome;
    statusReq.execUsername = jp->execUsername;
    statusReq.queuePostCmd = "";
    statusReq.queuePreCmd = "";
    statusReq.msgId = jp->delieveredMsgId;

    /* For a finished job, report the larger of current and max usage. */
    if (IS_FINISH (newStatus)) {
        if (jp->maxRusage.mem > jp->runRusage.mem)
            jp->runRusage.mem = jp->maxRusage.mem;
        if (jp->maxRusage.swap > jp->runRusage.swap)
            jp->runRusage.swap = jp->maxRusage.swap;
        if (jp->maxRusage.stime > jp->runRusage.stime)
            jp->runRusage.stime = jp->maxRusage.stime;
        if (jp->maxRusage.utime > jp->runRusage.utime)
            jp->runRusage.utime = jp->maxRusage.utime;
    }

    statusReq.runRusage.mem = jp->runRusage.mem;
    statusReq.runRusage.swap = jp->runRusage.swap;
    statusReq.runRusage.utime = jp->runRusage.utime;
    statusReq.runRusage.stime = jp->runRusage.stime;
    statusReq.runRusage.npids = jp->runRusage.npids;
    statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
    statusReq.runRusage.npgids = jp->runRusage.npgids;
    statusReq.runRusage.pgid = jp->runRusage.pgid;
    statusReq.actStatus = jp->actStatus;
    statusReq.sigValue = jp->jobSpecs.actValue;

    statusReq.seq = seq;
    seq++;
    if (seq >= MAX_SEQ_NUM)
        seq = 1;

    /* Size the encode buffer from the variable-length parts of the
     * request plus a fixed 1024-byte allowance. */
    len = 1024 + ALIGNWORD_ (sizeof (struct statusReq));
    len += ALIGNWORD_ (strlen (statusReq.execHome)) + 4
        + ALIGNWORD_ (strlen (statusReq.execCwd)) + 4
        + ALIGNWORD_ (strlen (statusReq.execUsername)) + 4;
    for (i = 0; i < statusReq.runRusage.npids; i++)
        len += ALIGNWORD_ (sizeof (struct pidInfo)) + 4;
    for (i = 0; i < statusReq.runRusage.npgids; i++)
        len += ALIGNWORD_ (sizeof (int)) + 4;

    if (logclass & (LC_TRACE | LC_COMM))
        ls_syslog (LOG_DEBUG, "%s: The length of the job message is: <%d>",
                   fname, len);

    if ((request_buf = malloc (len)) == NULL) {
        ls_syslog (LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
        return (-1);
    }

    xdrmem_create (&xdrs, request_buf, len, XDR_ENCODE);
    initLSFHeader_ (&hdr);
    hdr.opCode = reqType;

    if (!xdr_encodeMsg (&xdrs, (char *) &statusReq, &hdr, xdr_statusReq, 0,
                        auth)) {
        ls_syslog (LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                   lsb_jobid2str (jp->jobSpecs.jobId), "xdr_statusReq");
        lsb_merr2 (I18N_FUNC_FAIL, fname, "xdr_statusReq");
        xdr_destroy (&xdrs);
        FREEUP (request_buf);
        relife ();
        /* relife() can return (do_reboot logs that case).  The request
         * buffer and XDR stream are gone, so bail out instead of handing
         * them to call_server(). */
        return (-1);
    }

    flags = CALL_SERVER_NO_HANDSHAKE;
    if (statusChan >= 0)
        flags |= CALL_SERVER_USE_SOCKET;
    if (reqType == BATCH_RUSAGE_JOB)
        flags |= CALL_SERVER_NO_WAIT_REPLY;

    if (logclass & LC_COMM)
        ls_syslog (LOG_DEBUG1,
                   "%s: before call_server statusChan=%d flags=%d",
                   fname, statusChan, flags);

    cc = call_server (masterHost, mbd_port, request_buf,
                      XDR_GETPOS (&xdrs), &reply_buf, &hdr, connTimeout,
                      readTimeout, &statusChan, NULL, NULL, flags);
    if (cc < 0) {
        statusChan = -1;
        /* Log an unreachable master only once per host name. */
        if (!equalHost_ (masterHost, lastHost)) {
            if (errno != EINTR)
                ls_syslog (LOG_DEBUG,
                           "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s",
                           fname, masterHost,
                           lsb_jobid2str (jp->jobSpecs.jobId),
                           lsb_sysmsg ());
            /* Bounded copy: lastHost is a fixed MAXHOSTNAMELEN buffer. */
            strncpy (lastHost, masterHost, sizeof (lastHost) - 1);
            lastHost[sizeof (lastHost) - 1] = '\0';
        }
        xdr_destroy (&xdrs);
        FREEUP (request_buf);
        failcnt++;
        return (-1);
    }

    failcnt = 0;
    lastHost[0] = '\0';
    xdr_destroy (&xdrs);
    FREEUP (request_buf);
    if (cc)
        free (reply_buf);

    if (flags & CALL_SERVER_NO_WAIT_REPLY) {
        struct timeval timeval;

        timeval.tv_sec = 0;
        timeval.tv_usec = 0;
        /* rd_select_() == 0 is treated as success; any other result
         * closes the channel and is reported as a broken connection. */
        if (rd_select_ (chanSock_ (statusChan), &timeval) == 0) {
            jp->needReportRU = FALSE;
            jp->lastStatusMbdTime = now;
            return 0;
        }
        CLOSECD (statusChan);
        if (logclass & LC_COMM)
            ls_syslog (LOG_DEBUG1,
                       "%s: Job <%s> rd_select() failed, assume connection broken",
                       fname, lsb_jobid2str (jp->jobSpecs.jobId));
        return (-1);
    }

    reply = hdr.opCode;
    switch (reply) {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
        jp->needReportRU = FALSE;
        jp->lastStatusMbdTime = now;
        if (reply == LSBE_LOCK_JOB) {
            if (IS_SUSP (jp->jobSpecs.jStatus))
                jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
            else
                ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5204,
                           "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."), /* catgets 5204 */
                           fname, lsb_jobid2str (jp->jobSpecs.jobId),
                           jp->jobSpecs.jStatus);
        }
        return (0);

    case LSBE_NO_JOB:
        if (!IS_POST_FINISH (jp->jobSpecs.jStatus)) {
            ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5205,
                       "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."),
                       fname, lsb_jobid2str (jp->jobSpecs.jobId),
                       masterHost); /* catgets 5205 */
        }
        jp->notReported = -INFINIT_INT;
        return (0);

    case LSBE_STOP_JOB:
        if (jobsig (jp, SIGSTOP, TRUE) < 0)
            SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_EXIT);
        else {
            SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_USUSP);
            jp->jobSpecs.reasons |= SUSP_USER_STOP;
        }
        return (-1);

    case LSBE_SBATCHD:
        ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5206,
                   "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"), /* catgets 5206 */
                   fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
        return (-1);

    default:
        ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5207,
                   "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"), /* catgets 5207 */
                   fname, reply, masterHost,
                   lsb_jobid2str (jp->jobSpecs.jobId));
        return (-1);
    }
}
void job_checking (void) { static char fname[] = "job_checking"; struct jobCard *jobCard, *nextJob; struct hostLoad *myload, savedLoad; char *myhostnm; static time_t last_check; char preempted = FALSE; int i; if (last_check == 0) last_check = now; if (jobcnt <= 0) { last_check = now; return; } checkFinish (); for (jobCard = jobQueHead->forw; (jobCard != jobQueHead); jobCard = nextJob) { nextJob = jobCard->forw; if (IS_FINISH(jobCard->jobSpecs.jStatus) || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND)) continue; ruLimits(jobCard); if (IS_RUN_JOB_CMD(jobCard->jobSpecs.jStatus)) { jobCard->runTime += (int) (now - last_check); } if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) { if ((jobCard->jobSpecs.terminateActCmd == NULL) || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) { if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl + WARN_TIME && jobCard->timeExpire) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT)) continue; else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) continue; else { ls_syslog(LOG_INFO, \ "%s: warning period expired killing the job=%d", fname, jobCard->jobSpecs.jobId); jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); jobCard->jobSpecs.jStatus |= JOB_STAT_KILL; } } else if (!jobCard->timeExpire) { ls_syslog(LOG_INFO, I18N(5704, "%s: sending warning signal to job=%d"), /* catgets 5704 */ fname, jobCard->jobSpecs.jobId); jobsig(jobCard, SIGUSR2, FALSE); jobCard->timeExpire = TRUE; } } else { if (jobCard->runTime > jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT)) continue; else { jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); } } } continue; } if (jobCard->jobSpecs.termTime && now 
> jobCard->jobSpecs.termTime && !(jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)) { if ((jobCard->jobSpecs.terminateActCmd == NULL) || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) { if (now > jobCard->jobSpecs.termTime + WARN_TIME && jobCard->timeExpire) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE)) continue; else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) continue; else { jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); jobCard->jobSpecs.jStatus |= JOB_STAT_KILL; } } else if (!jobCard->timeExpire) { jobsig(jobCard, SIGUSR2, FALSE); jobCard->timeExpire = TRUE; } } else { if (now > jobCard->jobSpecs.termTime) { if ((IS_SUSP (jobCard->jobSpecs.jStatus)) && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT) && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE)) continue; else { jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG); sbdlog_newstatus(jobCard); } } } continue; } if (! window_ok (jobCard) && !(jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)) { if (! 
(jobCard->jobSpecs.options & SUB_WINDOW_SIG) || ((jobCard->jobSpecs.options & SUB_WINDOW_SIG) && now - jobCard->windWarnTime >= WARN_TIME)) { jobSuspendAction(jobCard, SIG_SUSP_WINDOW, SUSP_QUEUE_WINDOW, 0); continue; } } else { jobResumeAction(jobCard, SIG_RESUME_WINDOW, SUSP_QUEUE_WINDOW); continue; } } if ((myhostnm = ls_getmyhostname()) == NULL) { ls_syslog(LOG_ERR, I18N_FUNC_FAIL_MM, fname, "ls_getmyhostname"); die(SLAVE_FATAL); } myload = ls_loadofhosts (NULL, 0, EXACT|EFFECTIVE, 0, &myhostnm, 1); if (myload == NULL) { if (myStatus != NO_LIM) ls_syslog(LOG_INFO, I18N_FUNC_FAIL_MM, fname, "ls_loadofhosts"); if (lserrno == LSE_LIM_BADHOST) relife(); if (lserrno == LSE_BAD_XDR) relife(); if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) { myStatus |= NO_LIM; tryChkpntMig(); } last_check = now; return; } else myStatus = 0; memcpy ((char *)&savedLoad, (char *)myload, sizeof (struct hostLoad)); savedLoad.li = (float *) my_malloc (allLsInfo->numIndx * sizeof (float), "job_checking"); savedLoad.status = (int *) my_malloc ((1 + GET_INTNUM(allLsInfo->numIndx)) * sizeof (int), "job_checking"); for (i = 0; i < allLsInfo->numIndx; i++) savedLoad.li[i] = myload->li[i]; for (i = 0; i < 1 + GET_INTNUM(allLsInfo->numIndx); i++) savedLoad.status[i] = myload->status[i]; tryResume (&savedLoad); if (!preempted) tryStop (myhostnm, &savedLoad); tryChkpntMig(); FREEUP(savedLoad.li); FREEUP(savedLoad.status); last_check = now; return; }
static void tryStop (char *myhostnm, struct hostLoad *myload) { static char fname[] = "tryStop"; struct jobCard *jobCard, *next; int reasons, subreasons, stopmore = FALSE; static int errCount = 0, lastTryStopTime = 0; if (now - lastTryStopTime < sbdSleepTime) { return; } lastTryStopTime = now; for (jobCard = jobQueHead->forw; jobCard != jobQueHead; jobCard = next) { next = jobCard->forw; if (jobCard->jobSpecs.numToHosts == 1) { if ((jobCard->jobSpecs.jStatus & JOB_STAT_RUN) && (now >= jobCard->jobSpecs.startTime + sbdSleepTime) && shouldStop (myload, jobCard, &reasons, &subreasons, 1, &stopmore)) { jobSuspendAction (jobCard, SIG_SUSP_LOAD, reasons, subreasons); if (stopmore) continue; else return; } } else { struct hostLoad *load; int numh; struct nameList *hostList; numh = jobCard->jobSpecs.numToHosts; hostList = lsb_compressStrList(jobCard->jobSpecs.toHosts, numh); numh = hostList->listSize; if (hostList->listSize == 1) { load = myload; } else { load = ls_loadofhosts ("-", &numh, EFFECTIVE, 0, hostList->names, hostList->listSize); } if (load == NULL) { if (errCount < 3) ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_MM, fname, lsb_jobid2str(jobCard->jobSpecs.jobId), "ls_loadofhosts"); errCount++; if (lserrno == LSE_LIM_BADHOST) relife(); if (lserrno == LSE_BAD_XDR) relife(); if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) myStatus |= NO_LIM; continue; } else { errCount = 0; myStatus = 0; } if ((jobCard->jobSpecs.jStatus & JOB_STAT_RUN) && now >= jobCard->jobSpecs.startTime + sbdSleepTime) { if (shouldStop (load, jobCard, &reasons, &subreasons, numh, &stopmore)) { jobSuspendAction (jobCard, SIG_SUSP_LOAD, reasons, subreasons); if (stopmore) break; else return; } } } } return; }
static void tryResume (struct hostLoad *myload) { char fname[] = "tryResume"; struct jobCard *jobCard, *next; static int errCount = 0, lastTryResumeTime = 0; if (now - lastTryResumeTime < sbdSleepTime) { return; } lastTryResumeTime = now; for (jobCard = jobQueHead->back; jobCard != jobQueHead; jobCard = next) { next = jobCard->back; if (!(jobCard->jobSpecs.jStatus & JOB_STAT_SSUSP) || jobCard->jobSpecs.actPid) continue; if (jobCard->jobSpecs.numToHosts == 1) { if (shouldResume (myload, jobCard, 1)) { if (jobResumeAction(jobCard, SIG_RESUME_LOAD, LOAD_REASONS) < 0) continue; else return; } } else { int numh; struct hostLoad *load; struct nameList *hostList; numh = jobCard->jobSpecs.numToHosts; hostList = lsb_compressStrList(jobCard->jobSpecs.toHosts, numh); numh = hostList->listSize; load = ls_loadofhosts ("-", &numh, EFFECTIVE, 0, hostList->names, hostList->listSize); if (load == NULL) { if (errCount < 3) ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname, lsb_jobid2str(jobCard->jobSpecs.jobId), "ls_loadofhosts"); errCount++; if (lserrno == LSE_LIM_BADHOST) relife(); if (lserrno == LSE_BAD_XDR) relife(); if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) myStatus |= NO_LIM; continue; } else { myStatus = 0; errCount = 0; } if (!shouldResume (load, jobCard, numh)) continue; if (jobResumeAction(jobCard, SIG_RESUME_LOAD, LOAD_REASONS) < 0) continue; else return; } } return; }
/*
 * do_sigjob - handle a "signal job" request and send back a jobReply.
 *
 * xdrs   - request stream the jobSig structure is decoded from.
 * chfd   - channel the reply is written to.
 * reqHdr - header of the incoming request.
 *
 * The reply code is ERR_BAD_REQ when the request cannot be decoded,
 * ERR_NO_JOB when no job card matches the job id, ERR_SIG_RETRY when the
 * signal cannot be delivered now, and ERR_NO_ERROR otherwise.  A jobReply
 * body is attached only for ERR_NO_ERROR.  Paths that go through the
 * Reply label also call sbdlog_newstatus() for the job.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_sigjob()";
    char reply_buf[MSGSIZE];
    XDR xdrs2;
    struct jobSig jobSig;
    sbdReplyType reply;
    struct jobReply jobReply;
    struct LSFHeader replyHdr;
    char *replyStruct;
    struct jobCard *jp = NULL;
    char found = FALSE;
    int sigValue;
    int savedActReasons = 0;
    int savedActSubReasons = 0;
    struct lsfAuth *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /* Zero the request so that the error paths below, which still read
     * jobSig.jobId / jobSig.reasons, never see indeterminate values when
     * decoding fails. */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");
        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;
    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }
    if (found == FALSE) {
        reply = ERR_NO_JOB;
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        /* Keep the previous action reasons so they can be restored if the
         * request ends in an error. */
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Only terminating signals are acted on before the job has
         * started; everything else is to be retried. */
        if (isSigTerm(sigValue) == TRUE) {
            if (jobSigStart(jp, sigValue, jobSig.actFlags,
                            jobSig.chkPeriod, NO_SIGLOG) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;
            goto Reply;
        }
        reply = ERR_SIG_RETRY;
        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        if (jobSigStart(jp, sigValue, jobSig.actFlags,
                        jobSig.chkPeriod, NO_SIGLOG) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if (jobSigStart(jp, sigValue, jobSig.actFlags,
                    jobSig.chkPeriod, NO_SIGLOG) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    sbdlog_newstatus(jp);

Reply1:
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;

    if (reply == ERR_NO_ERROR) {
        /* Every ERR_NO_ERROR path is reached with jp pointing at the
         * matched job card. */
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        if (reply != ERR_NO_JOB
            && jp != NULL
            && (jobSig.reasons & SUSP_MBD_LOCK)) {
            jp->actReasons = savedActReasons;
            jp->actSubReasons = savedActSubReasons;
        }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0,
                       auth)) {
        /* jp is NULL on the ERR_BAD_REQ / ERR_NO_JOB paths, so name the
         * job by the id from the request (identical to jp's id whenever
         * a job card was matched). */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSig.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
            "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"),
            fname, XDR_GETPOS(&xdrs2),
            lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }

    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);
    return;
}
/*
 * do_probe - handle a probe request carrying an sbdPackage.
 *
 * xdrs   - request stream; the package header, numJobs jobSpecs entries
 *          and the second package part (xdr_sbdPackage1) are read from it.
 * chfd   - channel the header-only reply is written to.
 * reqHdr - header of the incoming request; a zero length is ignored.
 *
 * Each decoded jobSpecs entry is passed to refreshJob() and then freed.
 * The reply code is ERR_BAD_REQ when any part of the request fails to
 * decode, ERR_NO_LIM when the request is fine but myStatus has NO_LIM
 * set, and ERR_NO_ERROR otherwise.
 *
 * The daemon parameters carried by the package (manager ids, mbdPid,
 * sbdSleepTime, ...) are applied only when the whole request decoded.
 * Previously they were copied even from a package that had not been
 * (fully) decoded, i.e. from uninitialized memory.
 */
void
do_probe(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_probe()";
    char reply_buf[MSGSIZE];
    XDR xdrs2;
    struct LSFHeader replyHdr;
    struct sbdPackage sbdPackage;
    struct jobSpecs *jobSpecs = NULL;
    int decoded;
    int i;
    struct lsfAuth *auth = NULL;

    if (reqHdr->length == 0)
        return;

    /* Start from a zeroed package so that the clean-up at the end is
     * well defined even when decoding stops part way. */
    memset(&sbdPackage, 0, sizeof(struct sbdPackage));

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = ERR_NO_ERROR;

    if (!xdr_sbdPackage(xdrs, &sbdPackage, reqHdr)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_sbdPackage");
        relife();
        /* Only reached when relife() returned. */
        replyHdr.opCode = ERR_BAD_REQ;
    } else if (sbdPackage.numJobs) {
        jobSpecs = my_calloc(sbdPackage.numJobs,
                             sizeof(struct jobSpecs), fname);
        for (i = 0; i < sbdPackage.numJobs; i++) {
            if (!xdr_arrayElement(xdrs, (char *) &(jobSpecs[i]),
                                  reqHdr, xdr_jobSpecs)) {
                replyHdr.opCode = ERR_BAD_REQ;
                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5815,
                    "%s: %s(%d) failed for %d jobs"), /* catgets 5815 */
                    fname, "xdr_arrayElement", i, sbdPackage.numJobs);
                break;
            }
            refreshJob(&(jobSpecs[i]));
            xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs[i], reqHdr);
        }
    }

    if (replyHdr.opCode == ERR_NO_ERROR
        && !xdr_sbdPackage1(xdrs, &sbdPackage, reqHdr)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_sbdPackage1");
        relife();
        /* Only reached when relife() returned. */
        replyHdr.opCode = ERR_BAD_REQ;
    }

    /* Record the decode outcome before the reply code may be changed to
     * ERR_NO_LIM. */
    decoded = (replyHdr.opCode == ERR_NO_ERROR);

    if (decoded && (myStatus & NO_LIM))
        replyHdr.opCode = ERR_NO_LIM;

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    if (!xdr_encodeMsg(&xdrs2, NULL, &replyHdr, NULL, 0, auth)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_encodeMsg");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_M, fname, "chanWrite_");
    }

    xdr_destroy(&xdrs2);

    free(jobSpecs);

    if (decoded) {
        getManagerId(&sbdPackage);
        mbdPid = sbdPackage.mbdPid;
        sbdSleepTime = sbdPackage.sbdSleepTime;
        retryIntvl = sbdPackage.retryIntvl;
        preemPeriod = sbdPackage.preemPeriod;
        pgSuspIdleT = sbdPackage.pgSuspIdleT;
        maxJobs = sbdPackage.maxJobs;
        uJobLimit = sbdPackage.uJobLimit;
        rusageUpdateRate = sbdPackage.rusageUpdateRate;
        rusageUpdatePercent = sbdPackage.rusageUpdatePercent;
        jobTerminateInterval = sbdPackage.jobTerminateInterval;
    }

    /* Release whatever admin names were decoded; the NULL check covers
     * a package whose admins array was never set up. */
    if (sbdPackage.admins != NULL) {
        for (i = 0; i < sbdPackage.nAdmins; i++) {
            FREEUP(sbdPackage.admins[i]);
        }
    }
    FREEUP(sbdPackage.admins);
    return;
}
void do_modifyjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr) { static char fname[] = "do_switchjob()"; char reply_buf[MSGSIZE]; XDR xdrs2; struct jobSpecs jobSpecs; struct jobReply jobReply; sbdReplyType reply; char found = FALSE; struct LSFHeader replyHdr; char *replyStruct; struct jobCard *jp; struct lsfAuth *auth = NULL; memset(&jobReply, 0, sizeof(struct jobReply)); if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) { reply = ERR_BAD_REQ; ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs"); goto sendReply; } for (jp = jobQueHead->back; jp != jobQueHead; jp = jp->back) if (jp->jobSpecs.jobId == jobSpecs.jobId) { found = TRUE; break; } if (!found) { reply = ERR_NO_JOB; ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5808, "%s: mbatchd trying to modify a non-existent job <%s>"), fname, lsb_jobid2str(jobSpecs.jobId)); /* catgets 5808 */ goto sendReply; } if (jp->jobSpecs.jStatus & (JOB_STAT_DONE | JOB_STAT_EXIT)) { reply = ERR_JOB_FINISH; goto sendReply; } if ((lsbJobCpuLimit != 1) && ((jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_maxl != jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_maxl) || (jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_maxh != jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_maxh) || (jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_curl != jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_curl) || (jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_curh != jobSpecs.lsfLimits[LSF_RLIMIT_CPU].rlim_curh) )) { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5809, "%s, LSB_JOB_CPULIMIT is not set for the host, job <%s>, CPU limit not modified"), fname, lsb_jobid2str(jobSpecs.jobId)); } else { memcpy((char *) &jp->jobSpecs.lsfLimits[LSF_RLIMIT_CPU], (char *) &jobSpecs.lsfLimits[LSF_RLIMIT_CPU], sizeof(struct lsfLimit)); } if ((lsbJobMemLimit != 1) && ((jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_maxl != jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_maxl) || (jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_maxh != jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_maxh) || 
(jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_curl != jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_curl) || (jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_curh != jobSpecs.lsfLimits[LSF_RLIMIT_RSS].rlim_curh) )) { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5810, "%s, LSB_JOB_MEMLIMIT is not set for the host, job <%s>, memory limit not modified"), fname, lsb_jobid2str(jobSpecs.jobId)); } else { memcpy((char *) &jp->jobSpecs.lsfLimits[LSF_RLIMIT_RSS], (char *) &jobSpecs.lsfLimits[LSF_RLIMIT_RSS], sizeof(struct lsfLimit)); } memcpy((char *) &jp->jobSpecs.lsfLimits[LSF_RLIMIT_RUN], (char *) &jobSpecs.lsfLimits[LSF_RLIMIT_RUN], sizeof(struct lsfLimit)); setRunLimit(jp, FALSE); if (strcmp(jp->jobSpecs.outFile, jobSpecs.outFile) || !(strcmp(jobSpecs.outFile, "/dev/null"))) { strcpy(jp->jobSpecs.outFile, jobSpecs.outFile); if (strcmp(jobSpecs.outFile, "/dev/null") || (jobSpecs.options & SUB_OUT_FILE)) { jp->jobSpecs.options |= SUB_OUT_FILE; } else { jp->jobSpecs.options &= ~SUB_OUT_FILE; } } if (strcmp(jp->jobSpecs.errFile, jobSpecs.errFile)) { strcpy(jp->jobSpecs.errFile, jobSpecs.errFile); if (!strcmp(jp->jobSpecs.errFile, "/dev/null") && !(jobSpecs.options & SUB_ERR_FILE)) { jp->jobSpecs.options &= ~SUB_ERR_FILE; } } if (jobSpecs.options & SUB_RERUNNABLE) { jp->jobSpecs.options |= SUB_RERUNNABLE; } else { jp->jobSpecs.options &= ~SUB_RERUNNABLE; } sendReply: xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr); xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE); initLSFHeader_(&replyHdr); replyHdr.opCode = reply; if (reply == ERR_NO_ERROR) replyStruct = (char *) &jobReply; else { replyStruct = (char *) 0; } if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) { ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname, lsb_jobid2str(jp->jobSpecs.jobId), "xdr_jobReply"); relife(); } if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) { ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname, lsb_jobid2str(jp->jobSpecs.jobId), "chanWrite_"); } xdr_destroy(&xdrs2); 
return; }
void do_newjob(XDR *xdrs, int chfd, struct LSFHeader *reqHdr) { static char fname[] = "do_newjob()"; char reply_buf[MSGSIZE]; XDR xdrs2; struct jobSpecs jobSpecs; struct jobReply jobReply; struct jobCard *jp; sbdReplyType reply; struct LSFHeader replyHdr; char *replyStruct; struct lsfAuth *auth = NULL; memset(&jobReply, 0, sizeof(struct jobReply)); if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) { reply = ERR_BAD_REQ; ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs"); goto sendReply; } for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) { if (jp->jobSpecs.jobId == jobSpecs.jobId) { jobReply.jobId = jp->jobSpecs.jobId; jobReply.jobPid = jp->jobSpecs.jobPid; jobReply.jobPGid = jp->jobSpecs.jobPGid; jobReply.jStatus = jp->jobSpecs.jStatus; reply = ERR_NO_ERROR; goto sendReply; } } jp = calloc(1, sizeof(struct jobCard)); if (jp == NULL) { ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname, lsb_jobid2str(jobSpecs.jobId), "calloc"); reply = ERR_MEM; goto sendReply; } memcpy((char *) &jp->jobSpecs, (char *) &jobSpecs, sizeof(struct jobSpecs)); jp->jobSpecs.jStatus &= ~JOB_STAT_MIG; jp->jobSpecs.startTime = now; jp->jobSpecs.reasons = 0; jp->jobSpecs.subreasons = 0; /* Initialize the core number */ jp->core_num = -1; if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) { if (lockHosts (jp) < 0) { ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname, lsb_jobid2str(jp->jobSpecs.jobId), "lockHosts"); unlockHosts (jp, jp->jobSpecs.numToHosts); reply = ERR_LOCK_FAIL; freeWeek(jp->week); FREEUP(jp); goto sendReply; } } jp->runTime = 0; if (initJobCard(jp, &jobSpecs, (int *)&reply) < 0) { if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) { unlockHosts (jp, jp->jobSpecs.numToHosts); } FREEUP(jp); goto sendReply; } jp->execJobFlag = 0; if (jp->runTime < 0) { jp->runTime = 0; } jp->execGid = 0; jp->execUsername[0] = '\0'; jp->jobSpecs.execUid = -1; jp->jobSpecs.execUsername[0] = '\0'; if (jp->jobSpecs.jobSpoolDir[0] != '\0') { char *tmp; if ((tmp = getUnixSpoolDir (jp->jobSpecs.jobSpoolDir)) == 
NULL) { jp->jobSpecs.jobSpoolDir[0] = '\0'; } } if ((logclass & LC_TRACE) && jp->jobSpecs.jobSpoolDir[0] != 0) { ls_syslog(LOG_DEBUG, "%s: the SpoolDir for job <%s> is %s \n", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.jobSpoolDir); } if (jp->jobSpecs.options & SUB_PRE_EXEC) SBD_SET_STATE(jp, (JOB_STAT_RUN | JOB_STAT_PRE_EXEC)) else SBD_SET_STATE(jp, JOB_STAT_RUN); reply = job_exec(jp, chfd); if (reply != ERR_NO_ERROR) { ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname, lsb_jobid2str(jp->jobSpecs.jobId), "job_exec"); if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) { unlockHosts (jp, jp->jobSpecs.numToHosts); } deallocJobCard(jp); } else { jobReply.jobId = jp->jobSpecs.jobId; jobReply.jobPid = jp->jobSpecs.jobPid; jobReply.jobPGid = jp->jobSpecs.jobPGid; jobReply.jStatus = jp->jobSpecs.jStatus; } sendReply: xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr); xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE); initLSFHeader_(&replyHdr); replyHdr.opCode = reply; replyStruct = (reply == ERR_NO_ERROR) ? (char *) &jobReply : (char *) NULL; if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) { ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobReply"); lsb_merr(_i18n_msg_get(ls_catd , NL_SETN, 5804, "Fatal error: xdr_jobReply() failed; sbatchd relifing")); /* catgets 5804 */ relife(); } if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5805, "%s: Sending jobReply (len=%d) to master failed: %m"), /* catgets 5805 */ fname, XDR_GETPOS(&xdrs2)); } xdr_destroy(&xdrs2); if (reply == ERR_NO_ERROR && !daemonParams[LSB_BSUBI_OLD].paramValue && PURE_INTERACTIVE(&jp->jobSpecs)) { if (status_job (BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus, ERR_NO_ERROR) < 0) { jp->notReported++; } } }
/*
 * do_switchjob - move an existing job to the queue described by the
 * request and send a jobReply back on chfd.
 *
 * xdrs   - request stream the jobSpecs structure is decoded from.
 * chfd   - channel the reply is written to.
 * reqHdr - header of the incoming request.
 *
 * Queue-derived fields of the job card are replaced: run windows, queue
 * name, priority, nice, jAttrib, thresholds, run limit, requeue values,
 * resume/stop conditions, login shell (when SUB_LOGIN_SHELL is set) and
 * the suspend/resume/terminate action commands.  Hosts are unlocked when
 * the job leaves an exclusive queue for a non-exclusive one.  The card is
 * then re-linked into the job list and reniced.
 *
 * Reply codes: ERR_BAD_REQ (undecodable request or bad window),
 * ERR_NO_JOB, ERR_JOB_FINISH, or ERR_NO_ERROR with a jobReply.
 */
void
do_switchjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_switchjob()";
    char reply_buf[MSGSIZE];
    XDR xdrs2;
    struct jobSpecs jobSpecs;
    struct jobReply jobReply;
    int i;
    sbdReplyType reply;
    char *cp;
    char *word;
    char found = FALSE;
    struct LSFHeader replyHdr;
    char *replyStruct;
    struct jobCard *jp = NULL;
    struct lsfAuth *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /* Zeroed so that xdr_lsffree() and the log messages at sendReply see
     * defined contents even when decoding fails. */
    memset(&jobSpecs, 0, sizeof(struct jobSpecs));

    if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs");
        goto sendReply;
    }

    for (jp = jobQueHead->back; jp != jobQueHead; jp = jp->back) {
        if (jp->jobSpecs.jobId == jobSpecs.jobId) {
            found = TRUE;
            break;
        }
    }
    if (!found) {
        reply = ERR_NO_JOB;
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5807,
            "%s: mbatchd trying to switch a non-existent job <%s>"),
            fname, lsb_jobid2str(jobSpecs.jobId)); /* catgets 5807 */
        goto sendReply;
    }

    if (jp->jobSpecs.jStatus & (JOB_STAT_DONE | JOB_STAT_EXIT)) {
        reply = ERR_JOB_FINISH;
        goto sendReply;
    }

    /* Rebuild the run windows from the new queue's window string. */
    cp = jobSpecs.windows;
    freeWeek(jp->week);
    while ((word = getNextWord_(&cp)) != NULL) {
        if (addWindow(word, jp->week, "switchJob jobSpecs") < 0) {
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S_M, fname,
                      lsb_jobid2str(jp->jobSpecs.jobId), "addWindow", word);
            freeWeek(jp->week);
            reply = ERR_BAD_REQ;
            goto sendReply;
        }
    }
    jp->windEdge = now;

    /* Leaving an exclusive queue for a non-exclusive one: release the
     * host locks; "not locked" is not treated as an error. */
    if ((jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE)
        && !(jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE)) {
        for (i = 0; i < jp->jobSpecs.numToHosts; i++) {
            if (unlockHost_(jp->jobSpecs.toHosts[i]) < 0
                && lserrno != LSE_LIM_NLOCKED) {
                ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S_MM, fname,
                          lsb_jobid2str(jp->jobSpecs.jobId), "unlockHost_",
                          jp->jobSpecs.toHosts[i]);
            }
        }
    }

    strcpy(jp->jobSpecs.queue, jobSpecs.queue);
    strcpy(jp->jobSpecs.windows, jobSpecs.windows);
    jp->jobSpecs.priority = jobSpecs.priority;
    jp->jobSpecs.nice = jobSpecs.nice;
    jp->jobSpecs.jAttrib = jobSpecs.jAttrib;

    freeThresholds (&jp->jobSpecs.thresholds);
    saveThresholds (&jp->jobSpecs, &jobSpecs.thresholds);

    memcpy((char *) &jp->jobSpecs.lsfLimits[LSF_RLIMIT_RUN],
           (char *) &jobSpecs.lsfLimits[LSF_RLIMIT_RUN],
           sizeof(struct lsfLimit));

    strcpy (jp->jobSpecs.requeueEValues, jobSpecs.requeueEValues);
    strcpy (jp->jobSpecs.resumeCond, jobSpecs.resumeCond);
    strcpy (jp->jobSpecs.stopCond, jobSpecs.stopCond);

    lsbFreeResVal (&jp->resumeCondVal);
    if (jobSpecs.resumeCond && jobSpecs.resumeCond[0] != '\0') {
        if ((jp->resumeCondVal = checkThresholdCond (jobSpecs.resumeCond))
            == NULL)
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                      lsb_jobid2str(jp->jobSpecs.jobId),
                      "checkThresholdCond", jobSpecs.resumeCond);
    }

    lsbFreeResVal (&jp->stopCondVal);
    if (jobSpecs.stopCond && jobSpecs.stopCond[0] != '\0') {
        if ((jp->stopCondVal = checkThresholdCond (jobSpecs.stopCond))
            == NULL)
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                      lsb_jobid2str(jp->jobSpecs.jobId),
                      "checkThresholdCond", jobSpecs.stopCond);
    }

    if (jobSpecs.options & SUB_LOGIN_SHELL) {
        FREEUP (jp->jobSpecs.loginShell);
        jp->jobSpecs.loginShell = safeSave (jobSpecs.loginShell);
    }

    strcpy (jp->jobSpecs.suspendActCmd, jobSpecs.suspendActCmd);
    strcpy (jp->jobSpecs.resumeActCmd, jobSpecs.resumeActCmd);
    strcpy (jp->jobSpecs.terminateActCmd, jobSpecs.terminateActCmd);

    setRunLimit (jp, FALSE);

    /* Unlink the card and insert it again via inJobLink(). */
    offList ((struct listEntry *)jp);
    inJobLink (jp);

    if (reniceJob(jp) < 0)
        ls_syslog(LOG_DEBUG, "%s: renice job <%s> failed", fname,
                  lsb_jobid2str(jp->jobSpecs.jobId));

    reply = ERR_NO_ERROR;
    jobReply.jobId = jp->jobSpecs.jobId;
    jobReply.jobPid = jp->jobSpecs.jobPid;
    jobReply.jobPGid = jp->jobSpecs.jobPGid;
    jobReply.jStatus = jp->jobSpecs.jStatus;

sendReply:
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR)
        replyStruct = (char *) &jobReply;
    else
        replyStruct = (char *) 0;

    /* The messages below name the job through the request's id: jp is
     * unset on the decode-failure path and is the list head on
     * ERR_NO_JOB, so it must not be dereferenced here.  When a job was
     * matched both ids agree. */
    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0,
                       auth)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSpecs.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSpecs.jobId), "chanWrite_");
    }

    xdr_destroy(&xdrs2);

    /* Released last so the request stays readable for the logs above. */
    xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr);
    return;
}