/*
 * Fork a child process that removes the job's buffer files.
 *
 * In the parent: returns the child's pid, or the negative fork() result
 * after logging the failure.
 * In the child: closes the batch socket, sets LS_EXEC_T to "END", runs
 * postJobSetup() followed by rmJobBufFiles(), and exits (status 0, or -1
 * when postJobSetup() fails). The child never returns to the caller.
 */
static int
rmJobBufFilesPid(struct jobCard *jp)
{
    static char fname[] = "rmJobBufFilesPid()";
    int childPid;

    childPid = fork();
    if (childPid < 0) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "fork");
        return (childPid);
    }

    if (childPid != 0) {
        /* Parent: hand the child's pid back to the caller. */
        return (childPid);
    }

    /* Child only from here on. */
    closeBatchSocket();
    putEnv(LS_EXEC_T, "END");

    if (postJobSetup(jp) < 0) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "postSetupUser");
        exit(-1);
    }

    rmJobBufFiles(jp);
    exit(0);
}
/*
 * Print a one-line confirmation of the action requested for a job.
 *
 * signalValue selects the verb ("checkpointed", "stopped", ...); any value
 * not listed below is reported as "signaled".  A delete request with a
 * non-zero global runCount prints the "deleted after running next N times"
 * message instead of the generic one.
 */
static void
prtSignaled (int signalValue, LS_LONG_INT jobId)
{
    char *verb;

    if (signalValue == SIGCHK) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,473, "checkpointed")); /* catgets 473 */
    } else if (signalValue == SIGSTOP) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,474, "stopped")); /* catgets 474 */
    } else if (signalValue == SIGCONT) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,475, "resumed")); /* catgets 475 */
    } else if (signalValue == SIGKILL || signalValue == SIGFORCE) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,476, "terminated")); /* catgets 476 */
    } else if (signalValue == SIGDEL) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,477, "deleted")); /* catgets 477 */
    } else {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,478, "signaled")); /* catgets 478 */
    }

    if (signalValue != SIGDEL || runCount == 0) {
        printf((_i18n_msg_get(ls_catd,NL_SETN,480, "Job <%s> is being %s\n")),
               lsb_jobid2str(jobId), verb); /* catgets 480 */
        return;
    }

    /* Deferred delete: the job still runs runCount more times. */
    printf((_i18n_msg_get(ls_catd,NL_SETN,479, "Job <%s> will be deleted after running next %d times\n")), /* catgets 479 */
           lsb_jobid2str(jobId), runCount);
}
/*
 * XDR encode/decode/free routine for jobSpecs->thresholds.
 *
 * Wire layout: nIdx, nThresholds, then for every threshold row j and every
 * index i the pair loadStop[j][i], loadSched[j][i] (in that order).
 *
 * XDR_DECODE: resets both row tables to NULL, reads the two counts and
 *             allocates nThresholds rows of nIdx floats before reading them.
 * XDR_FREE:   releases the rows and the row tables.
 *
 * Returns TRUE on success, FALSE (after logging) on any XDR failure or when
 * a decoded count is negative.
 */
static int
xdr_thresholds(XDR *xdrs, struct jobSpecs *jobSpecs)
{
    static char fname[] = "xdr_thresholds";
    int i, j;

    if (xdrs->x_op == XDR_DECODE) {
        jobSpecs->thresholds.loadSched = NULL;
        jobSpecs->thresholds.loadStop = NULL;
    }

    if (xdrs->x_op == XDR_FREE) {
        /*
         * The row tables can still be NULL here (e.g. a decode that failed
         * before allocation), so never index through a NULL table.
         */
        for (i = 0; i < jobSpecs->thresholds.nThresholds; i++) {
            if (jobSpecs->thresholds.loadSched != NULL) {
                FREEUP(jobSpecs->thresholds.loadSched[i]);
            }
            if (jobSpecs->thresholds.loadStop != NULL) {
                FREEUP(jobSpecs->thresholds.loadStop[i]);
            }
        }
        FREEUP(jobSpecs->thresholds.loadSched);
        FREEUP(jobSpecs->thresholds.loadStop);
        return (TRUE);
    }

    if (!(xdr_int(xdrs, &jobSpecs->thresholds.nIdx) &&
          xdr_int(xdrs, &jobSpecs->thresholds.nThresholds))) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(jobSpecs->jobId),
                  "xdr_int", "nIdx/nThresholds");
        return (FALSE);
    }

    if (xdrs->x_op == XDR_DECODE) {
        /* Counts come off the wire; refuse values that cannot size an array. */
        if (jobSpecs->thresholds.nIdx < 0 ||
            jobSpecs->thresholds.nThresholds < 0) {
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                      lsb_jobid2str(jobSpecs->jobId),
                      "xdr_int", "nIdx/nThresholds");
            return (FALSE);
        }

        jobSpecs->thresholds.loadSched = (float **)
            my_calloc (jobSpecs->thresholds.nThresholds,
                       sizeof(float *), fname);
        jobSpecs->thresholds.loadStop = (float **)
            my_calloc (jobSpecs->thresholds.nThresholds,
                       sizeof(float *), fname);
        for (i = 0; i < jobSpecs->thresholds.nThresholds; i++) {
            jobSpecs->thresholds.loadSched[i] = (float *)
                my_calloc (jobSpecs->thresholds.nIdx,
                           sizeof(float), fname);
            jobSpecs->thresholds.loadStop[i] = (float *)
                my_calloc (jobSpecs->thresholds.nIdx,
                           sizeof(float), fname);
        }
    }

    for (j = 0; j < jobSpecs->thresholds.nThresholds; j++) {
        for (i = 0; i < jobSpecs->thresholds.nIdx; i++) {
            /* loadStop is transferred before loadSched for each cell. */
            if (!(xdr_float(xdrs, &jobSpecs->thresholds.loadStop[j][i]) &&
                  xdr_float(xdrs, &jobSpecs->thresholds.loadSched[j][i]))) {
                ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                          lsb_jobid2str(jobSpecs->jobId),
                          "xdr_float", "loadStop/loadSched");
                return (FALSE);
            }
        }
    }

    return (TRUE);
}
static int jobResumeAction (struct jobCard *jp, int sigValue, int suspReason) { static char fname[] = "jobResumeAction"; if (jp->jobSpecs.reasons & SUSP_MBD_LOCK) { return -1; }; if (jp->jobSpecs.actPid) return 0; if (!(jp->jobSpecs.reasons & suspReason)) return -1; if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) ls_syslog(LOG_DEBUG1, "%s: Try to resume job %s with the current reason %d and the triggered reason %d;", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.reasons, suspReason); if (jobSigStart(jp, sigValue, 0, 0, SIGLOG) < 0) if (jobsig(jp, 0, FALSE) < 0) { SBD_SET_STATE(jp, JOB_STAT_EXIT); return -1; } sbdlog_newstatus(jp); return 0; }
void prtJobSubmit(struct jobInfoEnt *job, int prt_q, int tFormat) { char prline[MAXLINELEN]; char *timestr; timestr = putstr_(_i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->submitTime)); if (tFormat) { sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,569, "%s: Job <%s> submitted from host <%s>")), /* catgets 569 */ timestr, lsb_jobid2str(job->jobId), job->fromHost); } else { sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,570, "%s: Submitted from host <%s>")), /* catgets 570 */ timestr, job->fromHost); } FREEUP(timestr); prtLine(prline); if (job->submit.options2 & SUB2_HOLD) { sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,570, " with hold"))); /* catgets 570 */ prtLine(prline); } if (prt_q) { sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,571, ", to Queue <%s>")), job->submit.queue); /* catgets 571 */ prtLine(prline); } TIMEIT(2, prtBTTime(job), "prtBTTime"); }
/*
 * Print the "submitted from host" part of a job record via prtLineWUF().
 *
 * tFormat selects the variant that includes the job id; prt_q appends the
 * queue name.  A held job (SUB2_HOLD) gets " on hold" appended.  Finishes
 * by printing the begin/termination times through prtBTTime().
 */
void
prtJobSubmit(struct jobInfoEnt *job, int prt_q, int tFormat)
{
    char prline[MAXLINELEN];
    char lead[128];

    if (!tFormat) {
        sprintf(lead, " S");
    } else {
        sprintf(lead, " Job <%s> s", lsb_jobid2str(job->jobId));
    }

    /*
     * The second conversion below is "%s" immediately followed by the text
     * "ubmitted": lead ends in "s"/"S", which completes the word.
     */
    sprintf(prline, "%-12.19s:%submitted from host <%s>",
            ctime(&job->submitTime), lead, job->fromHost);
    prtLineWUF(prline);

    if (job->submit.options2 & SUB2_HOLD) {
        sprintf(prline, " on hold");
        prtLineWUF(prline);
    }

    if (prt_q) {
        sprintf(prline, " to Queue <%s>", job->submit.queue);
        prtLineWUF(prline);
    }

    TIMEIT(2, prtBTTime(job), "prtBTTime");
}
int job_resume (struct jobCard *jp) { static char fname[] = "job_resume"; int rep; if (jp->jobSpecs.actPid) return 0; if (jobsig(jp, SIGCONT, FALSE) < 0) return -1; SBD_SET_STATE(jp, JOB_STAT_RUN); jp->jobSpecs.reasons = 0; jp->jobSpecs.subreasons = 0; rep = status_job (BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus, ERR_NO_ERROR); if (rep < 0) jp->notReported++; else { if (jp->notReported > 0) jp->notReported = 0; } if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) ls_syslog(LOG_DEBUG1, "%s: Resume job %s", fname, lsb_jobid2str(jp->jobSpecs.jobId)); return 0; }
static int cleanupMigJob(struct jobCard *jp) { static char fname[] = "cleanupMigJob()"; int pid; unlockHosts (jp, jp->jobSpecs.numToHosts); if (!jp->jobSpecs.postCmd || jp->jobSpecs.postCmd[0] == '\0') return 0; if ((pid = fork()) < 0) { ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname, lsb_jobid2str(jp->jobSpecs.jobId), "fork"); lsb_merr2(_i18n_msg_get(ls_catd , NL_SETN, 700, "Unable to fork a child to run the queue's post-exec command for job <%s>. Please run <%s> manually if necessary.\n"), /* catgets 700 */ lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.postCmd); return (pid); } if (pid) return (pid); closeBatchSocket(); putEnv(LS_EXEC_T, "END"); if (postJobSetup(jp) == -1) { ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname, lsb_jobid2str(jp->jobSpecs.jobId), "postJobSetup"); lsb_merr2(_i18n_msg_get(ls_catd , NL_SETN, 701, "Unable to setup the environment for job <%s> to run the queue's post exec. Please run <%s> manually if necessary.\n"), /* catgets 701 */ lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.postCmd); exit(-1); } runQPost(jp); exit(0); }
/*
 * Apply the globally selected action (sigValue) to each job in jobIds.
 *
 * SIGCHK, SIGDEL and SIGFORCE are routed to their dedicated library calls;
 * every other value goes through lsb_signaljob().  A failed checkpoint
 * request on a job that has not started still counts as handled when it
 * carried a new checkpoint period.
 *
 * Returns FALSE when no job was handled; otherwise TRUE only when no job
 * failed.
 */
static int
signalJobs (LS_LONG_INT *jobIds, int numJobs)
{
    int anyFailed = FALSE;
    int anyHandled = FALSE;
    int idx;
    int cc;
    char msg[80];

    for (idx = 0; idx < numJobs; idx++) {
        switch (sigValue) {
        case SIGCHK:
            cc = lsb_chkpntjob(jobIds[idx], chkPeriod, chkOptions);
            break;
        case SIGDEL:
            cc = lsb_deletejob(jobIds[idx], runCount, 0);
            break;
        case SIGFORCE:
            cc = lsb_forcekilljob(jobIds[idx]);
            break;
        default:
            cc = lsb_signaljob(jobIds[idx], sigValue);
            break;
        }

        if (cc >= 0) {
            anyHandled = TRUE;
            prtSignaled (sigValue, jobIds[idx]);
            continue;
        }

        if (sigValue == SIGCHK && lsberrno == LSBE_NOT_STARTED
            && chkPeriod != LSB_CHKPERIOD_NOCHNG) {
            /* Only the checkpoint period changed; report that instead. */
            if (chkPeriod) {
                printf((_i18n_msg_get(ls_catd,NL_SETN,470, "Job <%s>: Checkpoint period is now %d min.\n")), /* catgets 470 */
                       lsb_jobid2str(jobIds[idx]),
                       (int) (chkPeriod / 60));
            } else {
                printf((_i18n_msg_get(ls_catd,NL_SETN,471, "Job <%s>: Periodic checkpointing is disabled\n")), /* catgets 471 */
                       lsb_jobid2str(jobIds[idx]));
            }
            anyHandled = TRUE;
            continue;
        }

        anyFailed = TRUE;
        sprintf (msg, "%s <%s>", I18N_Job, lsb_jobid2str(jobIds[idx]));
        lsb_perror (msg);
    }

    if (!anyHandled) {
        return (FALSE);
    }
    return (!anyFailed);
}
/*
 * bpost: post a message (-d <text>) to a job.
 *
 * Options: -V prints the version, -h prints usage, -d supplies the message.
 * Returns 0 when the message was posted and -1 on any error, including a
 * missing -d option or a message longer than LSB_MAX_MSGSIZE.
 */
int
main(int argc, char **argv)
{
    int cc;
    char *msg = NULL;            /* stays NULL when -d is not given */
    LS_LONG_INT *jobIDs = NULL;

    if (lsb_init(argv[0]) < 0) {
        lsb_perror("lsb_init");
        return -1;
    }

    while ((cc = getopt(argc, argv, "Vhd:")) != EOF) {
        switch (cc) {
        case 'V':
            fputs(_LS_VERSION_, stderr);
            return -1;
        case 'h':
            usage();
            return -1;
        case 'd':
            msg = optarg;
            break;
        default:
            usage();
            return -1;
        }
    }

    /* Without -d there is no message; do not hand NULL to strlen(). */
    if (msg == NULL) {
        usage();
        return -1;
    }

    if (strlen(msg) > LSB_MAX_MSGSIZE) {
        fprintf(stderr, "bpost: message bigger than %d\n", LSB_MAX_MSGSIZE);
        return -1;
    }

    getJobIds(argc, argv, NULL, NULL, NULL, NULL, &jobIDs, 0);
    if (jobIDs == NULL) {
        /* getJobIds() produced no job id list to post to. */
        usage();
        return -1;
    }

    cc = lsb_postjobmsg(jobIDs[0], msg);
    if (cc < 0) {
        lsb_perror("lsb_jobmsg()");
        return -1;
    }

    printf("Message to job %s posted all right.\n", lsb_jobid2str(jobIDs[0]));

    return 0;
}
void jobSuspendAction(struct jobCard *jp, int sigValue, int suspReasons, int suspSubReasons) { static char fname[] = "jobSuspendAction"; if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) ls_syslog(LOG_DEBUG1, "%s: Suspend job %s; reasons=%x, subresons=%d, sigValue=%d, status=%x", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.reasons, jp->jobSpecs.subreasons, sigValue, jp->jobSpecs.jStatus); jp->actReasons = suspReasons; jp->actSubReasons = suspSubReasons; if (!JOB_RUNNING(jp)) return; if( jp->postJobStarted ) { return; } if (IS_SUSP (jp->jobSpecs.jStatus)) { if (jp->jobSpecs.reasons & suspReasons) return; else if (jp->jobSpecs.sigMap[-sigValue] == 0) return; } if ((jp->jobSpecs.actPid) && ((jp->jobSpecs.actValue == sigValue) || (jp->jobSpecs.actValue == (sigValue + jp->jobSpecs.sigMap[-sigValue])))) return; if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) ls_syslog(LOG_DEBUG1, "%s: Call jobSigStart(sigValue =%d) to suspend job", fname, sigValue + jp->jobSpecs.sigMap[-(sigValue)]); jobSigStart(jp, sigValue + jp->jobSpecs.sigMap[-(sigValue)], 0, 0, SIGLOG); sbdlog_newstatus(jp); }
int main(int argc, char **argv) { struct submit req; struct submitReply reply; char *job; LS_LONG_INT jobId = -1, *jobIdList = NULL; int numJobIds; time_t beginTime, terminTime; if (lsb_init(argv[0]) < 0) { sub_perror("lsb_init"); fprintf(stderr, ". Job not modified.\n"); exit (-1); } if (fillReq (argc, argv, CMD_BMODIFY, &req) < 0) { fprintf(stderr, ". Job not modified.\n"); exit (-1); } job = req.command; beginTime = req.beginTime; terminTime = req.termTime; if ((numJobIds = getJobIdList(job, &jobIdList)) < 0) { exit(-1); } jobId = jobIdList[0]; if ((jobId = lsb_modify(&req, &reply, jobId)) < 0) { if (lsberrno == LSBE_JOB_ARRAY) { fprintf(stderr, "Options -q and -O cannot be applied on job array"); } else { prtErrMsg (&req, &reply); } fprintf(stderr, ". Job not modified.\n"); if (req.nxf) free(req.xf); exit (-1); } printf("Parameters of job <%s> are being changed\n", lsb_jobid2str(jobId)); if (beginTime > 0 || terminTime > 0) prtBETime_(&req); if (req.nxf) free(req.xf); return 0; }
void shout_err (struct jobCard *jobPtr, char *msg) { char buf[MSGSIZE]; sprintf(buf, \ "We are unable to run your job %s:<%s>. The error is:\n%s.", lsb_jobid2str(jobPtr->jobSpecs.jobId), jobPtr->jobSpecs.command, msg); if (jobPtr->jobSpecs.options & SUB_MAIL_USER) { merr_user(jobPtr->jobSpecs.mailUser, jobPtr->jobSpecs.fromHost, buf, I18N_error); } else { merr_user(jobPtr->jobSpecs.userName, jobPtr->jobSpecs.fromHost, buf, I18N_error); } }
void suspendActEnd (struct jobCard *jobCard, int w_status) { int sbdStartStop = 0; if (logclass & (LC_TRACE | LC_SIGNAL)) ls_syslog(LOG_DEBUG, "suspendActEnd: Suspend job %s; reasons=%x, subresons=%d", lsb_jobid2str(jobCard->jobSpecs.jobId), jobCard->actReasons, jobCard->actReasons); sbdStartStop = (jobCard->actReasons & SUSP_SBD_STARTUP); jobCard->jobSpecs.lastSSuspTime = now; jobCard->jobSpecs.reasons |= jobCard->actReasons & (~SUSP_SBD_STARTUP); jobCard->jobSpecs.subreasons = jobCard->actSubReasons; if ((jobCard->jobSpecs.actValue == SIG_SUSP_USER) || (jobCard->jobSpecs.actValue == SIG_TERM_USER)) SET_STATE(jobCard->jobSpecs.jStatus, JOB_STAT_USUSP); else SET_STATE(jobCard->jobSpecs.jStatus, JOB_STAT_SSUSP); if (w_status == 0) jobCard->actStatus = ACT_DONE; else jobCard->actStatus = ACT_FAIL; if (sbdStartStop) jobCard->actStatus = ACT_NO; if (jobSigLog(jobCard, w_status) == 0) { jobCard->jobSpecs.actValue = SIG_NULL; jobCard->jobSpecs.actPid = 0; } }
void prtHeader(struct jobInfoEnt *job, int prt_q, int tFormat) { char prline[MAXLINELEN]; if (!tFormat) { sprintf(prline, "\nJob%s <%s>,", uf_format?"":" Id", lsb_jobid2str(job->jobId)); prtLineWUF(prline); if (job->submit.options & SUB_JOB_NAME) { char *jobName, *pos; jobName = job->submit.jobName; if ((pos = strchr(jobName, '[')) && LSB_ARRAY_IDX(job->jobId)) { *pos = '\0'; sprintf(jobName, "%s[%d]", jobName, LSB_ARRAY_IDX(job->jobId)); } sprintf(prline, " Job Name <%s>,", jobName); prtLineWUF(prline); } } if (tFormat) { sprintf(prline, ","); prtLine(prline); } sprintf(prline, " User <%s>,", job->user); prtLineWUF(prline); if (lsbMode_ & LSB_MODE_BATCH) { sprintf(prline, " Project <%s>,", job->submit.projectName); prtLineWUF(prline); } if (job->submit.userGroup && job->submit.userGroup[0] != '\0') { sprintf(prline, " User Group <%s>,", job->submit.userGroup); prtLineWUF(prline); } if (job->submit.options & SUB_MAIL_USER) { sprintf(prline, " Mail <%s>,", job->submit.mailUser); prtLineWUF(prline); } if (prt_q) { sprintf(prline, " Status <%s>, Queue <%s>,", get_status(job), job->submit.queue); prtLineWUF(prline); } /* Interactive job */ if (job->submit.options & SUB_INTERACTIVE) { sprintf(prline, " Interactive"); if (job->submit.options & SUB_PTY) { strcat(prline, " pseudo-terminal"); if (job->submit.options & (SUB_PTY_SHELL)) strcat(prline, " shell"); } strcat(prline, " mode,"); prtLineWUF(prline); } if (job->jobPriority > 0) { sprintf(prline, " Job Priority <%d>,", job->jobPriority); prtLineWUF(prline); } if (job->submit.options2 & (SUB2_JOB_CMD_SPOOL)) { if (tFormat) sprintf(prline, " Command(Spooled) <%s>", job->submit.command); else sprintf(prline, " Command(Spooled) <%s>", job->submit.command); } else { if (tFormat) sprintf(prline, " Command <%s>", job->submit.command); else sprintf(prline, " Command <%s>", job->submit.command); } prtLineWUF(prline); if (job->submit.options2 & SUB2_JOB_GROUP) { sprintf(prline, ", Job Group <%s>", job->submit.job_group); 
prtLineWUF(prline); } sprintf(prline, "\n"); prtLineWUF(prline); }
void child_handler(int sig) { int pid; LS_WAIT_T status; struct rusage rusage; register float cpuTime; struct lsfRusage lsfRusage; struct jobCard *jobCard; static short lastMbdExitVal = MASTER_NULL; static int sbd_finish_sleep = -1; cleanRusage (&rusage); now = time(0); while ((pid=wait3(&status, WNOHANG, &rusage)) > 0) { if (pid == mbdPid) { int sig = WTERMSIG(status); if (mbdExitCnt > 150) mbdExitCnt = 150; mbdExitVal = WIFSIGNALED(status); if (mbdExitVal) { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5600, "mbatchd died with signal <%d> termination"), /* catgets 5600 */ sig); if (WCOREDUMP(status)) ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5601, "mbatchd core dumped")); /* catgets 5601 */ mbdExitVal = sig; if (mbdExitVal == lastMbdExitVal) mbdExitCnt++; else { mbdExitCnt = 0; lastMbdExitVal = mbdExitVal; } continue; } else { mbdExitVal = WEXITSTATUS(status); if (mbdExitVal == lastMbdExitVal) mbdExitCnt++; else { mbdExitCnt = 0; lastMbdExitVal = mbdExitVal; } if (mbdExitVal == MASTER_RECONFIG) { ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5602, "mbatchd resigned for reconfiguration")); /* catgets 5602 */ start_master(); } else ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5603, "mbatchd exited with value <%d>"), /* catgets 5603 */ mbdExitVal); continue; } } ls_ruunix2lsf (&rusage, &lsfRusage); cpuTime = lsfRusage.ru_utime + lsfRusage.ru_stime; for (jobCard = jobQueHead->forw; (jobCard != jobQueHead); jobCard = jobCard->forw) { if (jobCard->exitPid == pid) { jobCard->w_status = LS_STATUS(status); jobCard->exitPid = -1; if (logclass & LC_EXEC) { ls_syslog(LOG_DEBUG, I18N(5604, "child_handler: Job <%s> exitPid <%d> status <%d> exitcode <%d>"),/*catgets 5604*/ lsb_jobid2str(jobCard->jobSpecs.jobId), pid, jobCard->w_status, WEXITSTATUS(status)); } } if (jobCard->jobSpecs.jobPid == pid) { jobCard->collectedChild = TRUE; jobCard->cpuTime = cpuTime; jobCard->w_status = LS_STATUS(status); jobCard->exitPid = -1; memcpy ((char *) 
&jobCard->lsfRusage, (char *) &lsfRusage, sizeof (struct lsfRusage)); jobCard->notReported++; if (sbd_finish_sleep < 0) { if (daemonParams[LSB_SBD_FINISH_SLEEP].paramValue) { errno = 0; sbd_finish_sleep = atoi(daemonParams[LSB_SBD_FINISH_SLEEP].paramValue); if (errno) sbd_finish_sleep = 0; } else { sbd_finish_sleep=0; } } if (sbd_finish_sleep > 0) { millisleep_(sbd_finish_sleep); } if (logclass & LC_EXEC) { ls_syslog(LOG_DEBUG, I18N(5605, "child_handler: Job <%s> Pid <%d> status <%d> exitcode <%d>"), /*catgets 5605*/ lsb_jobid2str(jobCard->jobSpecs.jobId), pid, jobCard->w_status, WEXITSTATUS(status)); } need_checkfinish = TRUE; break; } } } }
int main (int argc, char **argv, char **environ) { char *queue = NULL, *host = NULL, *jobName = NULL, *user = NULL; LS_LONG_INT jobId; int options; struct jobInfoEnt *jInfo; char *outFile; char fflag = FALSE; int cc; int rc; rc = _i18n_init (I18N_CAT_MIN); if (lsb_init (argv[0]) < 0) { lsb_perror ("lsb_init"); exit (-1); } while ((cc = getopt (argc, argv, "Vhfq:m:J:")) != EOF) { switch (cc) { case 'q': if (queue || host || jobName) oneOf (argv[0]); queue = optarg; break; case 'm': if (queue || host || jobName) oneOf (argv[0]); host = optarg; break; case 'J': if (queue || host || jobName) oneOf (argv[0]); jobName = optarg; break; case 'V': fputs (_LS_VERSION_, stderr); exit (0); case 'f': fflag = TRUE; break; case 'h': default: usage (argv[0]); } } jobId = 0; options = LAST_JOB; if (argc >= optind + 1) { if (queue || host || jobName) { oneOf (argv[0]); } else if ((argc > 2 && !fflag) || (argc > 3 && fflag)) usage (argv[0]); if (getOneJobId (argv[optind], &jobId, 0)) { usage (argv[0]); } options = 0; } if (lsb_openjobinfo (jobId, jobName, NULL, queue, host, options) < 0 || (jInfo = lsb_readjobinfo (NULL)) == NULL) { if (jobId != 0 || jobName != NULL) { user = ALL_USERS; if (lsb_openjobinfo (jobId, jobName, user, queue, host, options) < 0 || (jInfo = lsb_readjobinfo (NULL)) == NULL) { jobInfoErr (jobId, jobName, NULL, queue, host, options); exit (-1); } } else { jobInfoErr (jobId, jobName, NULL, queue, host, options); exit (-1); } } lsb_closejobinfo (); if (jobId && jInfo->jobId != jobId) { lsberrno = LSBE_JOB_ARRAY; lsb_perror ("bpeek"); exit (-1); } if ((jInfo->submit.options & SUB_INTERACTIVE) && !(jInfo->submit.options & (SUB_OUT_FILE | SUB_ERR_FILE))) { fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2456, "Job <%s> : Cannot bpeek an interactive job.\n"), /* catgets 2456 */ lsb_jobid2str (jInfo->jobId)); exit (-1); } if (IS_PEND (jInfo->status) || jInfo->execUsername[0] == '\0') { fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2454, "Job <%s> : Not yet 
started.\n"), /* catgets 2454 */ lsb_jobid2str (jInfo->jobId)); exit (-1); } if (IS_FINISH (jInfo->status)) { fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2455, "Job <%s> : Already finished.\n"), /* catgets 2455 */ lsb_jobid2str (jInfo->jobId)); exit (-1); } if ((outFile = lsb_peekjob (jInfo->jobId)) == NULL) { char msg[50]; sprintf (msg, "%s <%s>", I18N_Job, lsb_jobid2str (jInfo->jobId)); lsb_perror (msg); exit (-1); } displayOutput (outFile, jInfo, fflag, environ); _i18n_end (ls_catd); exit (0); }
void bmove (int argc, char **argv, int opCode) { int position, reqPos; LS_LONG_INT jobId = 0; int achar; if (lsb_init(argv[0]) < 0) { lsb_perror("lsb_init"); exit(-1); } opterr = 0; while((achar = getopt(argc, argv, "hV")) != EOF) { switch(achar) { case 'V': fputs(_LS_VERSION_, stderr); exit(0); case 'h': default: usage(argv[0]); } } if (argc == optind) { fprintf(stderr, "%s.\n", (_i18n_msg_get(ls_catd,NL_SETN,852, "Job ID must be specified"))); /* catgets 852 */ usage(argv[0]); } if (optind < argc-2) { fprintf(stderr, "%s.\n", (_i18n_msg_get(ls_catd,NL_SETN,853, "Command syntax error: too many arguments"))); /* catgets 853 */ usage(argv[0]); } if (getOneJobId (argv[optind], &jobId, 0)) { usage(argv[0]); } position = 1; if (optind == argc - 2) { if (!isint_(argv[++optind]) || atoi(argv[optind]) <= 0) { fprintf(stderr, "%s: %s.\n", argv[optind], I18N(854, "Position value must be a positive integer")); /* catgets854*/ usage(argv[0]); } position = atoi(argv[optind]); } reqPos = position; if (lsb_movejob(jobId, &position, opCode) <0) { lsb_perror(lsb_jobid2str (jobId)); exit(-1); } if (position != reqPos) fprintf(stderr, (_i18n_msg_get(ls_catd,NL_SETN,855, "Warning: position value <%d> is beyond movable range.\n")), /* catgets 855 */ reqPos); if (opCode == TO_TOP) fprintf(stderr, (_i18n_msg_get(ls_catd,NL_SETN,856, "Job <%s> has been moved to position %d from top.\n")), /* catgets 856 */ lsb_jobid2str (jobId), position); else fprintf(stderr, (_i18n_msg_get(ls_catd,NL_SETN,857, "Job <%s> has been moved to position %d from bottom.\n")), /* catgets 857 */ lsb_jobid2str (jobId), position); exit(0); }
/*
 * Handle a "signal job" request from the master and send back a jobReply.
 *
 * The request is decoded, the job is looked up by id in the job queue and
 * the (decoded) signal is started with jobSigStart().  Outcomes:
 *   ERR_BAD_REQ   - the request could not be decoded
 *   ERR_NO_JOB    - no job card with that id
 *   ERR_SIG_RETRY - the job is not ready for the signal, or jobSigStart()
 *                   failed
 *   ERR_NO_ERROR  - done (also for finished jobs and jobs whose post-job
 *                   processing has started); the reply carries the job's
 *                   pids, status and reasons
 * For a SUSP_MBD_LOCK request the job's reasons are overwritten and the
 * action reasons are restored again when the reply is an error.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_sigjob()";
    char reply_buf[MSGSIZE];
    XDR xdrs2;
    struct jobSig jobSig;
    sbdReplyType reply;
    struct jobReply jobReply;
    struct LSFHeader replyHdr;
    char *replyStruct;
    struct jobCard *jp = NULL;
    char found = FALSE;
    int cc;
    int sigValue;
    int savedActReasons = 0;
    int savedActSubReasons = 0;
    struct lsfAuth *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /* Keep jobSig fields defined even when decoding fails part-way. */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");
        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;
    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }

    if (found == FALSE) {
        reply = ERR_NO_JOB;
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Only terminating signals are delivered before the job started. */
        if (isSigTerm(sigValue) == TRUE) {
            if ((cc = jobSigStart (jp, sigValue, jobSig.actFlags,
                                   jobSig.chkPeriod, NO_SIGLOG)) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;
            goto Reply;
        }

        reply = ERR_SIG_RETRY;
        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        if ((cc = jobSigStart(jp, sigValue, jobSig.actFlags,
                              jobSig.chkPeriod, NO_SIGLOG)) < 0) {
            reply = ERR_SIG_RETRY;
        } else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if ((cc = jobSigStart(jp, sigValue, jobSig.actFlags,
                          jobSig.chkPeriod, NO_SIGLOG)) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    sbdlog_newstatus(jp);

Reply1:
    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;

    if (reply == ERR_NO_ERROR) {
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        if (reply != ERR_NO_JOB) {
            /* Undo the action reasons taken over for an MBD lock request. */
            if ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
                jp->actReasons = savedActReasons;
                jp->actSubReasons = savedActSubReasons;
            }
        }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        /* jp is NULL on the bad-request and no-job paths. */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jp != NULL ? jp->jobSpecs.jobId
                                           : jobSig.jobId),
                  "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821, "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"),
                  fname, XDR_GETPOS(&xdrs2),
                  lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }

    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);
    return;
}
/*
 * Handle a job-setup message: copy the execution details it carries onto
 * the matching job card and acknowledge it.
 *
 * Returns silently when the request cannot be decoded or when the job has
 * an action in progress (actPid set); replies LSBE_NO_JOB when no card has
 * the given id.  When the reported status has JOB_STAT_RUN the job status
 * is sent through status_job(); otherwise the job is finished through
 * job_finish().  If either call fails the job card is restored from the
 * copy taken before any change and no reply is sent.
 */
void
do_jobSetup(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char fname[] = "do_jobSetup()";
    struct jobSetup jsetup;
    struct jobCard *jp = NULL;
    struct jobCard backup;

    if (logclass & LC_EXEC)
        ls_syslog(LOG_DEBUG, "%s: Entering ...", fname);

    if (!xdr_jobSetup(xdrs, &jsetup, reqHdr)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSetup");
        return;
    }

    /* Walk the circular queue; ending back on the head means "not found". */
    jp = jobQueHead->forw;
    while (jp != jobQueHead && jp->jobSpecs.jobId != jsetup.jobId) {
        jp = jp->forw;
    }

    if (jp == jobQueHead) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5838, "%s: Job <%s> is not found"), /* catgets 5838 */
                  fname, lsb_jobid2str(jsetup.jobId));
        replyHdrWithRC(LSBE_NO_JOB, chfd, jsetup.jobId);
        return;
    }

    if (jp->jobSpecs.actPid)
        return;

    /* Snapshot the card so a failed report can be rolled back. */
    memcpy((char *) &backup, (char *) jp, sizeof(backup));

    jp->execJobFlag |= JOB_EXEC_QPRE_KNOWN;
    if (jsetup.execJobFlag & JOB_EXEC_QPRE_OK)
        jp->execJobFlag |= JOB_EXEC_QPRE_OK;

    jp->jobSpecs.jobPid = jsetup.jobPid;
    jp->jobSpecs.jobPGid = jsetup.jobPGid;
    jp->jobSpecs.execUid = jsetup.execUid;
    strcpy(jp->jobSpecs.execUsername, jsetup.execUsername);
    jp->execGid = jsetup.execGid;
    strcpy(jp->execUsername, jsetup.execUsername);
    strcpy(jp->jobSpecs.execCwd, jsetup.execCwd);
    strcpy(jp->jobSpecs.execHome, jsetup.execHome);

    if (jsetup.jStatus & JOB_STAT_RUN) {
        if (!(jsetup.jStatus & JOB_STAT_PRE_EXEC))
            jp->jobSpecs.jStatus &= ~JOB_STAT_PRE_EXEC;

        if (status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                       ERR_NO_ERROR) < 0) {
            memcpy((char *) jp, (char *) &backup, sizeof(backup));
            return;
        }
        jp->execJobFlag |= JOB_EXEC_STARTED;
    } else {
        jp->jobSpecs.reasons = jsetup.reason;
        jp->collectedChild = TRUE;
        jp->notReported = 0;
        jp->exitPid = -1;
        jp->needReportRU = FALSE;
        jp->jobSpecs.jStatus = jsetup.jStatus;
        jp->w_status = jsetup.w_status;
        jp->lsfRusage = jsetup.lsfRusage;
        jp->cpuTime = jsetup.cpuTime;

        if (job_finish(jp, TRUE) < 0) {
            memcpy((char *) jp, (char *) &backup, sizeof(backup));
            return;
        }
    }

    if (replyHdrWithRC(LSBE_NO_ERROR, chfd, jsetup.jobId) < 0) {
        ls_syslog(LOG_DEBUG, "%s: Reply header failed for job <%s>",
                  fname, lsb_jobid2str(jsetup.jobId));
    }

    if (logclass & LC_EXEC) {
        ls_syslog(LOG_DEBUG1, "%s: JobId %s jstatus %d reason %x jobPid %d jobPGid %d execUid %d execGid <%d> execUser <%s> execHome <%s> execCwd <%s> execJobFlag %x cpuTime %f w_status %d",
                  fname, lsb_jobid2str(jsetup.jobId), jsetup.jStatus,
                  jsetup.reason, jsetup.jobPid, jsetup.jobPGid,
                  jsetup.execUid, jsetup.execGid, jsetup.execUsername,
                  jsetup.execHome, jsetup.execCwd, jsetup.execJobFlag,
                  jsetup.cpuTime, jsetup.w_status);
    }
}
int main(int argc, char** argv) { char* hosts = NULL; struct runJobRequest runJobRequest; int cc; int c; bool_t fFlag = FALSE; bool_t bFlag = FALSE; int rc; rc = _i18n_init ( I18N_CAT_MIN ); if (lsb_init(argv[0]) < 0) { lsb_perror("lsb_init"); exit (-1); } while((c = getopt(argc, argv, "m:fbhV")) != EOF) { switch(c) { case 'm': hosts = putstr_(optarg); if (hosts == NULL) { perror("putstr_"); exit(-1); } break; case 'f': fFlag = TRUE; break; case 'b': bFlag = TRUE; break; case 'V': fputs(_LS_VERSION_, stderr); return (0); case 'h': usage(argv[0]); exit(-1); } } if (argc <= optind) { usage(argv[0]); exit(-1); } memset((struct runJobRequest* )&runJobRequest, 0, sizeof(struct runJobRequest)); if (getOneJobId (argv[argc - 1], &(runJobRequest.jobId), 0)) { usage(argv[0]); exit(-1); } runJobRequest.numHosts = countHosts(hosts); if (runJobRequest.numHosts > 1) { int i; runJobRequest.hostname = (char **)calloc(runJobRequest.numHosts, sizeof(char *)); if (runJobRequest.hostname == NULL) { perror("calloc"); exit(-1); } for (i = 0; i < runJobRequest.numHosts; i++) { while (isspace(*hosts)) hosts++; runJobRequest.hostname[i] = hosts; hosts += strlen(hosts) + 1; } } else runJobRequest.hostname = &hosts; runJobRequest.options = (fFlag == TRUE) ? RUNJOB_OPT_NOSTOP : RUNJOB_OPT_NORMAL; if (bFlag) { runJobRequest.options |= RUNJOB_OPT_FROM_BEGIN; } cc = lsb_runjob(&runJobRequest); if (cc < 0) { lsb_perror((_i18n_msg_get(ls_catd,NL_SETN,2755, "Failed to run the job"))); /* catgets 2755 */ exit(-1); } printf((_i18n_msg_get(ls_catd,NL_SETN,2756, "Job <%s> is being forced to run.\n")), /* catgets 2756 */ lsb_jobid2str(runJobRequest.jobId)); _i18n_end ( ls_catd ); return (0); }
static int shouldResume (struct hostLoad *loadV, struct jobCard *jp, int num) { static char fname[] = "shouldResume"; int i, j, numHosts = -1; int resume = TRUE, found; int lastReasons = jp->jobSpecs.reasons; int lastSubreasons = jp->jobSpecs.subreasons; struct hostLoad *loads = NULL; struct tclHostData *tclHostData = NULL; if (logclass & (LC_SCHED | LC_EXEC)) ls_syslog(LOG_DEBUG3, "%s: job=%s; jStatus=%d; reasons=%x, subreasons=%d, numHosts=%d", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.jStatus, jp->jobSpecs.reasons, jp->jobSpecs.subreasons, num); if (num <= 0) return FALSE; if (!(jp->jobSpecs.jStatus & JOB_STAT_SSUSP)) return FALSE; if ((jp->jobSpecs.reasons & SUSP_QUEUE_WINDOW) || (jp->jobSpecs.reasons & SUSP_USER_STOP) || (jp->jobSpecs.reasons & SUSP_MBD_LOCK)) return FALSE; loads = (struct hostLoad *) my_malloc (num * sizeof (struct hostLoad), fname); if (jp->resumeCondVal != NULL) { tclHostData = (struct tclHostData *) my_malloc (num * sizeof (struct tclHostData), fname); for (i = 0; i < num; i++) { initTclHostData (&tclHostData[i]); } } else { tclHostData = NULL; } for (j = 0; j <jp->jobSpecs.numToHosts; j++) { if (j > 0 && !strcmp (jp->jobSpecs.toHosts[j], jp->jobSpecs.toHosts[j-1])) continue; numHosts++; found = FALSE; for (i = 0; i < num; i++) { if (equalHost_(jp->jobSpecs.toHosts[j], loadV[i].hostName)) { loads[numHosts] = loadV[i]; if (tclHostData != NULL) { if (getTclHostData (&loadV[i], &tclHostData[numHosts], FALSE) < 0) { break; } } found = TRUE; break; } } if (found != TRUE) { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5706, "%s: Can not find load information for host <%s> to check resume condiftions for job <%s>"), fname, jp->jobSpecs.toHosts[j], lsb_jobid2str(jp->jobSpecs.jobId)); /* catgets 5706 */ loads[numHosts].li = NULL; continue; } } if (numHosts >= 0) { numHosts++; resume = checkResumeByLoad (jp->jobSpecs.jobId, numHosts, jp->jobSpecs.thresholds, loads, &jp->jobSpecs.reasons, &jp->jobSpecs.subreasons, 
jp->jobSpecs.jAttrib, jp->resumeCondVal, tclHostData); FREEUP (loads); if (tclHostData != NULL) { for (i = 0; i < numHosts; i++) { FREEUP (tclHostData[i].resBitMaps); FREEUP (tclHostData[i].loadIndex); } FREEUP (tclHostData); } } else { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5707, "%s: No valid load information is found for job <%s>"), fname, lsb_jobid2str(jp->jobSpecs.jobId)); /* catgets 5707 */ } if ((logclass & (LC_SCHED | LC_EXEC)) && !resume) ls_syslog(LOG_DEBUG2, "%s: Can't resume job %s; reason=%x, subreasons=%d", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.reasons, jp->jobSpecs.subreasons); if (!resume) { if ((jp->jobSpecs.reasons != lastReasons || (jp->jobSpecs.reasons == lastReasons && jp->jobSpecs.subreasons != lastSubreasons)) && (now - jp->lastStatusMbdTime > rusageUpdateRate * sbdSleepTime)) jp->notReported++; } return (resume); }
/*
 * XDR routine for struct jobSpecs (encode, decode and free).
 *
 * xdrs      XDR stream; xdrs->x_op selects the operation
 * jobSpecs  structure to encode from / decode into / release
 * hdr       message header, forwarded to xdr_lsfLimit() and
 *           xdr_arrayElement()
 *
 * Returns TRUE on success and FALSE as soon as any field fails.
 *
 * On XDR_DECODE all pointer members and their counts are reset first so
 * that a later XDR_FREE is safe even after a partial decode.  The 64-bit
 * job id travels as two ints: the array id near the start and the element
 * id near the end of the record.
 *
 * Fixes over the original:
 *  - a failing xdr_thresholds() now aborts instead of only logging and
 *    continuing on a stream that is no longer in sync;
 *  - the threshold members are reset on decode, like the other pointers;
 *  - negative numToHosts / numEnv values are rejected on decode instead
 *    of being passed to my_calloc();
 *  - the xdr_string() failure messages name the right function and field.
 */
bool_t
xdr_jobSpecs (XDR *xdrs, struct jobSpecs *jobSpecs, struct LSFHeader *hdr)
{
    static char fname[] = "xdr_jobSpecs";
    char *sp[15];
    char *pTemp;
    int i, nLimits;
    int jobArrId, jobArrElemId;
    LS_LONG_INT tmpJobId;

    if (xdrs->x_op == XDR_DECODE) {
        jobSpecs->numToHosts = 0;
        jobSpecs->toHosts = NULL;
        jobSpecs->nxf = 0;
        jobSpecs->xf = NULL;
        jobSpecs->numEnv = 0;
        jobSpecs->env = NULL;
        jobSpecs->eexec.len = 0;
        jobSpecs->eexec.data = NULL;
        jobSpecs->loginShell = NULL;
        jobSpecs->schedHostType= NULL;
        jobSpecs->execHosts = NULL;
        /* Keep the XDR_FREE pass of xdr_thresholds() harmless if decoding
         * fails before the thresholds are reached. */
        jobSpecs->thresholds.nIdx = 0;
        jobSpecs->thresholds.nThresholds = 0;
        jobSpecs->thresholds.loadSched = NULL;
        jobSpecs->thresholds.loadStop = NULL;
    }

    if (xdrs->x_op == XDR_FREE) {
        for (i = 0; i < jobSpecs->numToHosts; i++) {
            FREEUP(jobSpecs->toHosts[i]);
        }
        FREEUP(jobSpecs->toHosts);

        for (i = 0; i < jobSpecs->numEnv; i++) {
            FREEUP(jobSpecs->env[i]);
        }
        FREEUP(jobSpecs->env);

        FREEUP(jobSpecs->xf);
        FREEUP(jobSpecs->loginShell);
        FREEUP(jobSpecs->schedHostType);
        FREEUP(jobSpecs->execHosts);

        if (!xdr_thresholds(xdrs, jobSpecs) ||
            !xdr_lenData(xdrs, &jobSpecs->eexec))
            return(FALSE);
        return(TRUE);
    }

    if (xdrs->x_op == XDR_ENCODE) {
        jobId64To32(jobSpecs->jobId, &jobArrId, &jobArrElemId);
    }
    if (!xdr_int(xdrs, &jobArrId)) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname, "xdr_int", "jobId");
        return(FALSE);
    }

    if (!(xdr_int(xdrs, &jobSpecs->userId) &&
          xdr_int(xdrs, &jobSpecs->options) &&
          xdr_short(xdrs, &jobSpecs->nice) &&
          xdr_int(xdrs, &jobSpecs->priority) &&
          xdr_int(xdrs, &jobSpecs->chkSig) &&
          xdr_int(xdrs, &jobSpecs->actPid) &&
          xdr_time_t(xdrs, &jobSpecs->chkPeriod) &&
          xdr_time_t(xdrs, &jobSpecs->migThresh) &&
          xdr_time_t(xdrs, &jobSpecs->lastSSuspTime) &&
          xdr_float(xdrs, &jobSpecs->lastCpuTime))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname, "xdr_int", "userId");
        return(FALSE);
    }

    nLimits = LSF_RLIM_NLIMITS;
    /* Only the array id is known at this point; it is used for logging. */
    tmpJobId = jobArrId;
    if (!xdr_int(xdrs, &nLimits)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_int", "nLimits");
        return(FALSE);
    }

    for (i = 0; i < nLimits && i < LSF_RLIM_NLIMITS; i++) {
        if (!xdr_lsfLimit(xdrs, &jobSpecs->lsfLimits[i], hdr)) {
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                      lsb_jobid2str(tmpJobId), "xdr_lsfLimit");
            return(FALSE);
        }
    }

    /* The peer announced more limits than this build stores: consume and
     * discard the surplus so the stream stays aligned. */
    if (nLimits > LSF_RLIM_NLIMITS) {
        for (i = LSF_RLIM_NLIMITS; i < nLimits; i++) {
            struct lsfLimit lsfLimit;
            if (!xdr_lsfLimit(xdrs, &lsfLimit, hdr)) {
                ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                          lsb_jobid2str(tmpJobId), "xdr_lsfLimit");
                return(FALSE);
            }
        }
    }

    if (!(xdr_int(xdrs, &jobSpecs->jStatus) &&
          xdr_int(xdrs, &jobSpecs->reasons) &&
          xdr_int(xdrs, &jobSpecs->subreasons) &&
          xdr_time_t(xdrs, &jobSpecs->termTime) &&
          xdr_time_t(xdrs, &jobSpecs->startTime) &&
          xdr_int(xdrs, &jobSpecs->runTime) &&
          xdr_time_t(xdrs, &jobSpecs->submitTime) &&
          xdr_int(xdrs, &jobSpecs->jobPid) &&
          xdr_int(xdrs, &jobSpecs->jobPGid) &&
          xdr_int(xdrs, &jobSpecs->restartPid) &&
          xdr_int(xdrs, &jobSpecs->sigValue) &&
          xdr_int(xdrs, &jobSpecs->umask) &&
          xdr_int(xdrs, &jobSpecs->jAttrib))) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_int", "jStatus");
        return(FALSE);
    }

    /* The string members are handed to xdr_string() through sp[] so that
     * it receives a non-NULL char* to work with. */
    sp[0] = jobSpecs->jobFile;
    sp[1] = jobSpecs->inFile;
    sp[2] = jobSpecs->outFile;
    sp[3] = jobSpecs->errFile;
    sp[4] = jobSpecs->chkpntDir;
    sp[5] = jobSpecs->cwd;
    sp[6] = jobSpecs->subHomeDir;
    sp[7] = jobSpecs->command;
    sp[8] = jobSpecs->jobName;
    sp[9] = jobSpecs->preExecCmd;
    sp[10] = jobSpecs->fromHost;
    sp[11] = jobSpecs->resReq;

    if (xdrs->x_op == XDR_DECODE) {
        for (i = 0; i < 11; i++)
            sp[i][0] = '\0';
    }

    if (!(xdr_string(xdrs, &sp[0], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[1], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[2], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[3], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[4], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[5], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[6], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[7], MAXLINELEN) &&
          xdr_string(xdrs, &sp[8], MAXLINELEN) &&
          xdr_string(xdrs, &sp[9], MAXLINELEN) &&
          xdr_string(xdrs, &sp[10], MAXHOSTNAMELEN))) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_string", "jobFile");
        return(FALSE);
    }

    if (xdrs->x_op == XDR_DECODE)
        sp[11][0] = '\0';

    if (!xdr_string(xdrs, &sp[11], MAXLINELEN)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_string", "resReq");
        return(FALSE);
    }

    sp[12] = jobSpecs->queue;
    sp[13] = jobSpecs->windows;
    sp[14] = jobSpecs->userName;

    if (xdrs->x_op == XDR_DECODE) {
        for (i = 12; i < 15; i++)
            sp[i][0] = '\0';
    }

    if (!(xdr_string(xdrs, &sp[12], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[13], MAXLINELEN) &&
          xdr_string(xdrs, &sp[14], MAX_LSB_NAME_LEN))) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_string", "queue");
        return(FALSE);
    }

    if (!xdr_int(xdrs, &jobSpecs->numToHosts)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_int", "numToHosts");
        return(FALSE);
    }

    if (xdrs->x_op == XDR_DECODE && jobSpecs->numToHosts < 0) {
        /* A negative count must never reach my_calloc(). */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_int", "numToHosts");
        return(FALSE);
    }

    if (xdrs->x_op == XDR_DECODE && jobSpecs->numToHosts) {
        jobSpecs->toHosts = (char **)
            my_calloc(jobSpecs->numToHosts, sizeof (char *), fname);
    }

    for (i = 0; i < jobSpecs->numToHosts; i++) {
        if (!xdr_var_string(xdrs, &jobSpecs->toHosts[i])) {
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                      lsb_jobid2str(tmpJobId), "xdr_var_string", "toHosts");
            return(FALSE);
        }
    }

    if (!xdr_thresholds(xdrs, jobSpecs)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_thresholds");
        /* The stream position is undefined after a failed field. */
        return(FALSE);
    }

    if (!xdr_int(xdrs, &jobSpecs->nxf)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(tmpJobId), "xdr_int", "nxf");
        return(FALSE);
    }

    if (xdrs->x_op == XDR_DECODE && jobSpecs->nxf > 0) {
        jobSpecs->xf = (struct xFile *)
            my_calloc(jobSpecs->nxf, sizeof(struct xFile), fname);
    }

    for (i = 0; i < jobSpecs->nxf; i++) {
        if (!xdr_arrayElement(xdrs, (char *) &(jobSpecs->xf[i]),
                              hdr, xdr_xFile)) {
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                      lsb_jobid2str(tmpJobId), "xdr_arrayElement", "xf");
            return(FALSE);
        }
    }

    sp[0] = jobSpecs->mailUser;
    sp[1] = jobSpecs->clusterName;
    sp[2] = jobSpecs->projectName;
    sp[3] = jobSpecs->preCmd;
    sp[4] = jobSpecs->postCmd;
    sp[5] = jobSpecs->execCwd;
    sp[6] = jobSpecs->execHome;
    sp[7] = jobSpecs->requeueEValues;

    if (xdrs->x_op == XDR_DECODE) {
        for (i = 0; i < 8; i++)
            sp[i][0]= '\0';
    }

    if (!(xdr_string(xdrs, &sp[0], MAXLINELEN) &&
          xdr_string(xdrs, &sp[1], MAX_LSB_NAME_LEN) &&
          xdr_string(xdrs, &sp[2], MAX_LSB_NAME_LEN) &&
          xdr_string(xdrs, &sp[3], MAXLINELEN) &&
          xdr_string(xdrs, &sp[4], MAXLINELEN) &&
          xdr_string(xdrs, &sp[5], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[6], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[7], MAXLINELEN) &&
          xdr_int(xdrs, &jobSpecs->execUid) &&
          xdr_int(xdrs, &jobSpecs->maxNumProcessors) &&
          xdr_int(xdrs, &jobSpecs->numEnv)))
        return(FALSE);

    if (xdrs->x_op == XDR_DECODE && jobSpecs->numEnv < 0) {
        /* A negative count must never reach my_calloc(). */
        return(FALSE);
    }

    if (xdrs->x_op == XDR_DECODE && jobSpecs->numEnv) {
        jobSpecs->env = (char **)
            my_calloc(jobSpecs->numEnv, sizeof (char *), fname);
    }

    for (i = 0; i < jobSpecs->numEnv; i++) {
        if (!xdr_var_string(xdrs, &jobSpecs->env[i]))
            return(FALSE);
    }

    if (!xdr_lenData(xdrs, &jobSpecs->eexec))
        return (FALSE);

    if (!xdr_int(xdrs, &jobSpecs->niosPort))
        return (FALSE);

    sp[0] = jobSpecs->resumeCond;
    sp[1] = jobSpecs->stopCond;
    sp[2] = jobSpecs->suspendActCmd;
    sp[3] = jobSpecs->resumeActCmd;
    sp[4] = jobSpecs->terminateActCmd;

    if (xdrs->x_op == XDR_DECODE) {
        sp[0][0] = '\0';
        sp[1][0] = '\0';
    }

    if (!(xdr_string(xdrs, &sp[0], MAXLINELEN)) ||
        !(xdr_string(xdrs, &sp[1], MAXLINELEN)))
        return (FALSE);

    if (xdrs->x_op == XDR_DECODE) {
        for (i = 2; i < 5; i++)
            sp[i][0] = '\0';
    }

    for (i = 2; i < 5; i++) {
        if (!(xdr_string(xdrs, &sp[i], MAXLINELEN)))
            return(FALSE);
    }

    for (i = 0; i < LSB_SIG_NUM; i++) {
        if (!(xdr_int(xdrs, &jobSpecs->sigMap[i])))
            return(FALSE);
    }

    if (!(xdr_int(xdrs, &jobSpecs->actValue)))
        return (FALSE);

    if (!xdr_var_string(xdrs, &jobSpecs->loginShell))
        return (FALSE);

    if (!xdr_var_string(xdrs, &jobSpecs->schedHostType))
        return (FALSE);

    if (!xdr_var_string(xdrs, &jobSpecs->execHosts))
        return (FALSE);

    if (!xdr_int(xdrs, &jobSpecs->options2)) {
        return(FALSE);
    }

    pTemp = jobSpecs->jobSpoolDir;
    if (!(xdr_string(xdrs, &pTemp, MAXPATHLEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
                  "xdr_string", "jobSpoolDir");
        return(FALSE);
    }

    if (!(xdr_int(xdrs, &jobArrElemId))) {
        return (FALSE);
    }

    /* Both halves of the job id are available now; rebuild the 64-bit id. */
    if (xdrs->x_op == XDR_DECODE) {
        jobId32To64(&jobSpecs->jobId,jobArrId,jobArrElemId);
    }

    sp[0] = jobSpecs->inFileSpool;
    sp[1] = jobSpecs->commandSpool;

    if (!(xdr_string(xdrs, &sp[0], MAXFILENAMELEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
                  "xdr_string", "inFileSpool");
        return (FALSE);
    }
    if (!(xdr_string(xdrs, &sp[1], MAXFILENAMELEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
                  "xdr_string", "commandSpool");
        return (FALSE);
    }

    if (!(xdr_int(xdrs, &jobSpecs->userPriority))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
                  "xdr_int", "userPriority");
        return (FALSE);
    }

    sp[0] = jobSpecs->execUsername;
    if (!(xdr_string(xdrs, &sp[0], MAX_LSB_NAME_LEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
                  "xdr_string", "execUsername");
        return (FALSE);
    }

    sp[0] = jobSpecs->prepostUsername;
    if (!(xdr_string(xdrs, &sp[0], MAX_LSB_NAME_LEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
                  "xdr_string", "prepostUsername");
        return (FALSE);
    }

    return(TRUE);
}
/*
 * Finish a checkpoint action for a job.
 *
 * jobCard   job whose checkpoint action has ended
 * w_status  0 when the action succeeded, non-zero otherwise
 * freed     set to TRUE when the job card is deallocated here
 *
 * For a migrating job (JOB_STAT_MIG) with a successful checkpoint the
 * function may return early several times before the status is reported:
 * first to flag the job as missing, then while notReported is 0, then
 * while a cleanup child started by rmJobBufFilesPid() is running.
 */
static void
chkpntEnd (struct jobCard *jobCard, int w_status, bool_t *freed)
{
    static char fname[] = "chkpntEnd()";
    const int chkpntOk = (w_status == 0);
    int statusBefore;
    int actPidBefore;
    int wasMigrating;

    /* Stop a suspended job again unless it is being migrated. */
    if (IS_SUSP(jobCard->jobSpecs.jStatus)
        && !(jobCard->jobSpecs.jStatus & JOB_STAT_MIG)) {
        jobsig(jobCard, SIGSTOP, TRUE);
    }

    statusBefore = jobCard->jobSpecs.jStatus;
    wasMigrating = statusBefore & JOB_STAT_MIG;

    if (wasMigrating) {
        if (!chkpntOk) {
            /* Checkpoint failed: give up on this migration attempt. */
            jobCard->jobSpecs.jStatus &= ~JOB_STAT_MIG;
        } else {
            if (!jobCard->missing) {
                jobCard->missing = TRUE;
                need_checkfinish = TRUE;
                return;
            }
            if (jobCard->notReported == 0) {
                return;
            }
            if (jobCard->cleanupPid == 0) {
                jobCard->cleanupPid = rmJobBufFilesPid(jobCard);
                if (jobCard->cleanupPid > 0) {
                    return;
                }
                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5709,
                    "%s: Unable to cleanup migrating job <%s>"), /* catgets 5709 */
                    fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
            }
            SBD_SET_STATE(jobCard, JOB_STAT_PEND);
        }
    }

    actPidBefore = jobCard->jobSpecs.actPid;

    if (status_job (BATCH_STATUS_JOB, jobCard, jobCard->jobSpecs.jStatus,
                    chkpntOk ? ERR_NO_ERROR : ERR_SYSACT_FAIL) < 0) {
        /* Reporting failed: put back the saved pid and status. */
        jobCard->jobSpecs.actPid = actPidBefore;
        jobCard->jobSpecs.jStatus = statusBefore;
        return;
    }

    jobCard->lastChkpntTime = now;
    jobCard->jobSpecs.actPid = 0;
    jobCard->actStatus = ACT_NO;
    jobCard->jobSpecs.actValue = SIG_NULL;

    if (chkpntOk) {
        jobCard->migCnt = 1;
    }

    if (!wasMigrating) {
        return;
    }

    if (chkpntOk) {
        cleanupMigJob(jobCard);
        deallocJobCard(jobCard);
        *freed = TRUE;
    } else {
        jobCard->migCnt *= 2;
    }
}
/*
 * Report a job's status to mbatchd on masterHost.
 *
 * reqType    request op code placed in the message header
 * jp         job being reported
 * newStatus  status value to report
 * err        sbatchd reply code carried in the request
 *
 * Returns 0 when nothing needs to be sent or when mbatchd answered
 * LSBE_NO_ERROR, LSBE_LOCK_JOB or LSBE_NO_JOB; returns -1 when there is
 * no master host, on allocation or communication failure, and for every
 * other reply code.
 */
int
status_job (mbdReqType reqType, struct jobCard *jp, int newStatus,
            sbdReplyType err)
{
    static char fname[] = "status_job()";
    static int seq = 1;
    static char lastHost[MAXHOSTNAMELEN];
    struct statusReq statusReq;
    struct LSFHeader hdr;
    struct lsfAuth *auth = NULL;
    XDR xdrs;
    char *reqBuf;
    char *replyBuf = NULL;
    int bufLen;
    int callFlags;
    int rc;
    int replyCode;
    int k;

    if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL)) {
        ls_syslog (LOG_DEBUG, "%s: Entering ... regType %d jobId %s",
                   fname, reqType, lsb_jobid2str (jp->jobSpecs.jobId));
    }

    /* Track whether the user job itself succeeded. */
    if (newStatus == JOB_STAT_EXIT) {
        jp->userJobSucc = FALSE;
    }
    if (MASK_STATUS (newStatus) == JOB_STAT_DONE) {
        jp->userJobSucc = TRUE;
    }

    /* A post-finish status is only sent for a successful user job. */
    if (IS_POST_FINISH (newStatus) && jp->userJobSucc != TRUE) {
        return 0;
    }

    if (masterHost == NULL) {
        return -1;
    }

    if (jp->notReported < 0) {
        jp->notReported = -INFINIT_INT;
        return (0);
    }

    /* Fill in the request from the job card. */
    statusReq.jobId = jp->jobSpecs.jobId;
    statusReq.actPid = jp->jobSpecs.actPid;
    statusReq.jobPid = jp->jobSpecs.jobPid;
    statusReq.jobPGid = jp->jobSpecs.jobPGid;
    statusReq.newStatus = newStatus;
    statusReq.reason = jp->jobSpecs.reasons;
    statusReq.subreasons = jp->jobSpecs.subreasons;
    statusReq.sbdReply = err;
    statusReq.lsfRusage = jp->lsfRusage;
    statusReq.execUid = jp->jobSpecs.execUid;
    statusReq.numExecHosts = 0;
    statusReq.execHosts = NULL;
    statusReq.exitStatus = jp->w_status;
    statusReq.execCwd = jp->jobSpecs.execCwd;
    statusReq.execHome = jp->jobSpecs.execHome;
    statusReq.execUsername = jp->execUsername;
    statusReq.queuePostCmd = "";
    statusReq.queuePreCmd = "";
    statusReq.msgId = jp->delieveredMsgId;

    /* For a finished job report the peak values seen during the run. */
    if (IS_FINISH (newStatus)) {
        if (jp->maxRusage.mem > jp->runRusage.mem) {
            jp->runRusage.mem = jp->maxRusage.mem;
        }
        if (jp->maxRusage.swap > jp->runRusage.swap) {
            jp->runRusage.swap = jp->maxRusage.swap;
        }
        if (jp->maxRusage.stime > jp->runRusage.stime) {
            jp->runRusage.stime = jp->maxRusage.stime;
        }
        if (jp->maxRusage.utime > jp->runRusage.utime) {
            jp->runRusage.utime = jp->maxRusage.utime;
        }
    }

    statusReq.runRusage.mem = jp->runRusage.mem;
    statusReq.runRusage.swap = jp->runRusage.swap;
    statusReq.runRusage.utime = jp->runRusage.utime;
    statusReq.runRusage.stime = jp->runRusage.stime;
    statusReq.runRusage.npids = jp->runRusage.npids;
    statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
    statusReq.runRusage.npgids = jp->runRusage.npgids;
    statusReq.runRusage.pgid = jp->runRusage.pgid;
    statusReq.actStatus = jp->actStatus;
    statusReq.sigValue = jp->jobSpecs.actValue;

    /* Sequence numbers run from 1 up to MAX_SEQ_NUM - 1, then wrap. */
    statusReq.seq = seq;
    seq++;
    if (seq >= MAX_SEQ_NUM) {
        seq = 1;
    }

    /* Size the encode buffer: fixed part, three strings, then the
     * per-pid and per-pgid entries. */
    bufLen = 1024 + ALIGNWORD_ (sizeof (struct statusReq));
    bufLen += ALIGNWORD_ (strlen (statusReq.execHome)) + 4
        + ALIGNWORD_ (strlen (statusReq.execCwd)) + 4
        + ALIGNWORD_ (strlen (statusReq.execUsername)) + 4;
    for (k = 0; k < statusReq.runRusage.npids; k++) {
        bufLen += ALIGNWORD_ (sizeof (struct pidInfo)) + 4;
    }
    for (k = 0; k < statusReq.runRusage.npgids; k++) {
        bufLen += ALIGNWORD_ (sizeof (int)) + 4;
    }

    if (logclass & (LC_TRACE | LC_COMM)) {
        ls_syslog (LOG_DEBUG, "%s: The length of the job message is: <%d>",
                   fname, bufLen);
    }

    reqBuf = malloc (bufLen);
    if (reqBuf == NULL) {
        ls_syslog (LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
        return (-1);
    }

    xdrmem_create (&xdrs, reqBuf, bufLen, XDR_ENCODE);
    initLSFHeader_ (&hdr);
    hdr.opCode = reqType;

    if (!xdr_encodeMsg (&xdrs, (char *) &statusReq, &hdr, xdr_statusReq,
                        0, auth)) {
        ls_syslog (LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                   lsb_jobid2str (jp->jobSpecs.jobId), "xdr_statusReq");
        lsb_merr2 (I18N_FUNC_FAIL, fname, "xdr_statusReq");
        xdr_destroy (&xdrs);
        FREEUP (reqBuf);
        /* NOTE(review): presumably relife() does not return; if it did,
         * the code below would run with a released buffer -- confirm. */
        relife ();
    }

    callFlags = CALL_SERVER_NO_HANDSHAKE;
    if (statusChan >= 0) {
        callFlags |= CALL_SERVER_USE_SOCKET;
    }
    if (reqType == BATCH_RUSAGE_JOB) {
        callFlags |= CALL_SERVER_NO_WAIT_REPLY;
    }

    if (logclass & LC_COMM) {
        ls_syslog (LOG_DEBUG1,
                   "%s: before call_server statusChan=%d flags=%d",
                   fname, statusChan, callFlags);
    }

    rc = call_server (masterHost, mbd_port, reqBuf, XDR_GETPOS (&xdrs),
                      &replyBuf, &hdr, connTimeout, readTimeout,
                      &statusChan, NULL, NULL, callFlags);

    if (rc < 0) {
        statusChan = -1;
        /* Log at most once per unreachable master host. */
        if (!equalHost_ (masterHost, lastHost)) {
            if (errno != EINTR) {
                ls_syslog (LOG_DEBUG, "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s", fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId), lsb_sysmsg ());
            }
            strcpy (lastHost, masterHost);
        }
        xdr_destroy (&xdrs);
        FREEUP (reqBuf);
        failcnt++;
        return (-1);
    }

    failcnt = 0;
    lastHost[0] = '\0';
    xdr_destroy (&xdrs);
    FREEUP (reqBuf);
    if (rc != 0) {
        free (replyBuf);
    }

    if (callFlags & CALL_SERVER_NO_WAIT_REPLY) {
        struct timeval noWait;

        noWait.tv_sec = 0;
        noWait.tv_usec = 0;

        /* No reply is awaited: a zero-timeout select result of 0 on the
         * channel is taken as success. */
        if (rd_select_ (chanSock_ (statusChan), &noWait) == 0) {
            jp->needReportRU = FALSE;
            jp->lastStatusMbdTime = now;
            return 0;
        }

        CLOSECD (statusChan);
        if (logclass & LC_COMM) {
            ls_syslog (LOG_DEBUG1, "%s: Job <%s> rd_select() failed, assume connection broken", fname, lsb_jobid2str (jp->jobSpecs.jobId));
        }
        return (-1);
    }

    replyCode = hdr.opCode;

    switch (replyCode) {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
        jp->needReportRU = FALSE;
        jp->lastStatusMbdTime = now;
        if (replyCode == LSBE_LOCK_JOB) {
            if (IS_SUSP (jp->jobSpecs.jStatus)) {
                jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
            } else {
                ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5204,
                    "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."), /* catgets 5204 */
                    fname, lsb_jobid2str (jp->jobSpecs.jobId),
                    jp->jobSpecs.jStatus);
            }
        }
        return (0);

    case LSBE_NO_JOB:
        if (!IS_POST_FINISH (jp->jobSpecs.jStatus)) {
            ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5205,
                "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."), fname, lsb_jobid2str (jp->jobSpecs.jobId), masterHost); /* catgets 5205 */
        }
        jp->notReported = -INFINIT_INT;
        return (0);

    case LSBE_STOP_JOB:
        if (jobsig (jp, SIGSTOP, TRUE) < 0) {
            SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_EXIT);
        } else {
            SET_STATE (jp->jobSpecs.jStatus, JOB_STAT_USUSP);
            jp->jobSpecs.reasons |= SUSP_USER_STOP;
        }
        return (-1);

    case LSBE_SBATCHD:
        ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5206,
            "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"), /* catgets 5206 */
            fname, masterHost, lsb_jobid2str (jp->jobSpecs.jobId));
        return (-1);

    default:
        ls_syslog (LOG_ERR, _i18n_msg_get (ls_catd, NL_SETN, 5207,
            "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"), /* catgets 5207 */
            fname, replyCode, masterHost,
            lsb_jobid2str (jp->jobSpecs.jobId));
        return (-1);
    }
}
static void sigActEnd (struct jobCard *jobCard) { int w_status; struct stat st; bool_t freed = FALSE; char exitFile[MAXFILENAMELEN]; if (jobCard->jobSpecs.actValue < 0) { sprintf(exitFile, "%s/.%s.%s.%s", LSTMPDIR, jobCard->jobSpecs.jobFile, lsb_jobidinstr(jobCard->jobSpecs.jobId), exitFileSuffix(jobCard->jobSpecs.actValue)); w_status = stat(exitFile, &st); if (w_status == 0) jobCard->actStatus = ACT_DONE; else { jobCard->actStatus = ACT_FAIL; } } jobCard->jobSpecs.jStatus &= ~JOB_STAT_SIGNAL; switch (jobCard->jobSpecs.actValue) { case SIG_CHKPNT: case SIG_CHKPNT_COPY: chkpntEnd (jobCard, w_status, &freed); break; case SIG_SUSP_USER: case SIG_SUSP_LOAD: case SIG_SUSP_WINDOW: case SIG_SUSP_OTHER: suspendActEnd (jobCard, w_status); break; case SIG_RESUME_USER: case SIG_RESUME_LOAD: case SIG_RESUME_WINDOW: case SIG_RESUME_OTHER: resumeActEnd (jobCard, w_status); break; case SIG_TERM_USER: case SIG_KILL_REQUEUE: case SIG_TERM_OTHER: case SIG_TERM_FORCE: if (jobSigLog (jobCard, w_status) == 0) { jobCard->jobSpecs.actPid = 0; jobCard->jobSpecs.actValue = SIG_NULL; } break; case SIG_TERM_LOAD: case SIG_TERM_WINDOW: case SIG_TERM_RUNLIMIT: case SIG_TERM_DEADLINE: case SIG_TERM_PROCESSLIMIT: case SIG_TERM_CPULIMIT: case SIG_TERM_MEMLIMIT: suspendActEnd (jobCard, w_status); break; default: ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5708, "sigActEnd: unknown sigValue <%d> for job <%s> at the job status <%d> with actPid <%d>"), /* catgets 5708 */ jobCard->jobSpecs.actValue, lsb_jobid2str(jobCard->jobSpecs.jobId), jobCard->jobSpecs.jStatus, jobCard->jobSpecs.actPid); jobCard->jobSpecs.actPid = 0; return; } if (!freed) { sbdlog_newstatus (jobCard); } }
static void tryResume (struct hostLoad *myload) { char fname[] = "tryResume"; struct jobCard *jobCard, *next; static int errCount = 0, lastTryResumeTime = 0; if (now - lastTryResumeTime < sbdSleepTime) { return; } lastTryResumeTime = now; for (jobCard = jobQueHead->back; jobCard != jobQueHead; jobCard = next) { next = jobCard->back; if (!(jobCard->jobSpecs.jStatus & JOB_STAT_SSUSP) || jobCard->jobSpecs.actPid) continue; if (jobCard->jobSpecs.numToHosts == 1) { if (shouldResume (myload, jobCard, 1)) { if (jobResumeAction(jobCard, SIG_RESUME_LOAD, LOAD_REASONS) < 0) continue; else return; } } else { int numh; struct hostLoad *load; struct nameList *hostList; numh = jobCard->jobSpecs.numToHosts; hostList = lsb_compressStrList(jobCard->jobSpecs.toHosts, numh); numh = hostList->listSize; load = ls_loadofhosts ("-", &numh, EFFECTIVE, 0, hostList->names, hostList->listSize); if (load == NULL) { if (errCount < 3) ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname, lsb_jobid2str(jobCard->jobSpecs.jobId), "ls_loadofhosts"); errCount++; if (lserrno == LSE_LIM_BADHOST) relife(); if (lserrno == LSE_BAD_XDR) relife(); if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) myStatus |= NO_LIM; continue; } else { myStatus = 0; errCount = 0; } if (!shouldResume (load, jobCard, numh)) continue; if (jobResumeAction(jobCard, SIG_RESUME_LOAD, LOAD_REASONS) < 0) continue; else return; } } return; }
static void tryStop (char *myhostnm, struct hostLoad *myload) { static char fname[] = "tryStop"; struct jobCard *jobCard, *next; int reasons, subreasons, stopmore = FALSE; static int errCount = 0, lastTryStopTime = 0; if (now - lastTryStopTime < sbdSleepTime) { return; } lastTryStopTime = now; for (jobCard = jobQueHead->forw; jobCard != jobQueHead; jobCard = next) { next = jobCard->forw; if (jobCard->jobSpecs.numToHosts == 1) { if ((jobCard->jobSpecs.jStatus & JOB_STAT_RUN) && (now >= jobCard->jobSpecs.startTime + sbdSleepTime) && shouldStop (myload, jobCard, &reasons, &subreasons, 1, &stopmore)) { jobSuspendAction (jobCard, SIG_SUSP_LOAD, reasons, subreasons); if (stopmore) continue; else return; } } else { struct hostLoad *load; int numh; struct nameList *hostList; numh = jobCard->jobSpecs.numToHosts; hostList = lsb_compressStrList(jobCard->jobSpecs.toHosts, numh); numh = hostList->listSize; if (hostList->listSize == 1) { load = myload; } else { load = ls_loadofhosts ("-", &numh, EFFECTIVE, 0, hostList->names, hostList->listSize); } if (load == NULL) { if (errCount < 3) ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_MM, fname, lsb_jobid2str(jobCard->jobSpecs.jobId), "ls_loadofhosts"); errCount++; if (lserrno == LSE_LIM_BADHOST) relife(); if (lserrno == LSE_BAD_XDR) relife(); if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) myStatus |= NO_LIM; continue; } else { errCount = 0; myStatus = 0; } if ((jobCard->jobSpecs.jStatus & JOB_STAT_RUN) && now >= jobCard->jobSpecs.startTime + sbdSleepTime) { if (shouldStop (load, jobCard, &reasons, &subreasons, numh, &stopmore)) { jobSuspendAction (jobCard, SIG_SUSP_LOAD, reasons, subreasons); if (stopmore) break; else return; } } } } return; }
void prtJobStart(struct jobInfoEnt *job, int prtFlag, int jobPid, int tFormat) { char prline[MAXLINELEN], tBuff[20]; time_t startTime; int i = 0; struct nameList *hostList = NULL; if (lsbParams[LSB_SHORT_HOSTLIST].paramValue && job->numExHosts > 1 && strcmp(lsbParams[LSB_SHORT_HOSTLIST].paramValue, "1") == 0) { hostList = lsb_compressStrList(job->exHosts, job->numExHosts); if (!hostList) { exit(99); } } if (tFormat) { sprintf (tBuff, "%s <%s>", I18N_Job, lsb_jobid2str(job->jobId)); } else if (LSB_ARRAY_IDX(job->jobId) > 0 ) sprintf (tBuff, " [%d]", LSB_ARRAY_IDX(job->jobId)); else tBuff[0] = '\0'; if (job->startTime && job->numExHosts) { if (job->startTime < job->submitTime) startTime = job->submitTime; else startTime = job->startTime; if ((job->submit.options & SUB_PRE_EXEC) && (prtFlag != BJOBS_PRINT)) { if (prtFlag == BHIST_PRINT_PRE_EXEC) { if (tBuff[0] == '\0') sprintf(prline, "%s: %s", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime), I18N(604, "The pre-exec command is started on")); /* catgets 604 */ else sprintf(prline, "%s:%s, %s", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime), tBuff, I18N(605, "the pre-exec command is started on")); /* catgets 605 */ } else { if (tBuff[0] == '\0') sprintf(prline, "%s: %s", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime), I18N(606, "The batch job command is started on")); /*catgets 606 */ else sprintf(prline, "%s:%s, %s", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime), tBuff, I18N(607, "the batch job command is started on")); /*catgets 607 */ } } else { if (jobPid > 0) { if (tBuff[0] == '\0') sprintf(prline, "%s: %s", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime), I18N(608, "Started on")); /* catgets 608 */ else sprintf(prline, "%s:%s %s", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime), tBuff, I18N(609, "started on")); /* catgets 609 */ } else { if (tBuff[0] == '\0') sprintf(prline, "%s: %s", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime), I18N(610, "Dispatched to")); 
/* catgets 610 */ else sprintf(prline, "%s: %s %s", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime), tBuff, I18N(611, "dispatched to")); /* catgets 611 */ } } prtLineWUF(prline); if (job->numExHosts > 1) { sprintf(prline, " %d %s", job->numExHosts, I18N(612, "Hosts/Processors")); /* catgets 612 */ prtLineWUF(prline); } if (lsbParams[LSB_SHORT_HOSTLIST].paramValue && job->numExHosts > 1 && strcmp(lsbParams[LSB_SHORT_HOSTLIST].paramValue, "1") == 0) { for (i = 0; i < hostList->listSize; i++) { sprintf(prline, " <%d*%s>", hostList->counter[i], hostList->names[i]); prtLineWUF(prline); } } else { for (i = 0; i < job->numExHosts; i++) { sprintf(prline, " <%s>", job->exHosts[i]); prtLineWUF(prline); } } if (job->execHome && strcmp (job->execHome, "")) { sprintf(prline, ", %s <%s>", I18N(615, "Execution Home"), /* catgets 615 */ job->execHome); prtLineWUF(prline); } if (job->execCwd && strcmp (job->execCwd, "")) { sprintf(prline, ", %s <%s>", I18N(616, "Execution CWD"), /* catgets 616 */ job->execCwd); prtLineWUF(prline); } if (job->execUsername && strcmp(job->execUsername, "") && strcmp(job->user, job->execUsername)) { sprintf(prline, ", %s <%s>", I18N(617, "Execution user name"), /* catgets 617 */ job->execUsername); prtLineWUF(prline); } sprintf(prline, ";\n"); prtLineWUF(prline); } }
static int shouldStop (struct hostLoad *loadV, struct jobCard *jobCard, int *reasons, int *subreasons, int num, int *stopmore) { static char fname[] = "shouldStop"; int i, numLoad = -1, j; struct hostLoad *load = NULL; static struct tclHostData tclHostData; static int first = TRUE; *reasons = 0; *subreasons = 0; if( jobCard->postJobStarted ) { return false; } if (jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP) return false; if (now - jobCard->windWarnTime < sbdSleepTime) return FALSE; if (!JOB_STARTED(jobCard)) return FALSE; if (LS_ISUNAVAIL(loadV->status)) return FALSE; if (num <= 0) return FALSE; for (i = 0; i <jobCard->jobSpecs.numToHosts && (*reasons) == 0; i++) { if (i > 0 && !strcmp (jobCard->jobSpecs.toHosts[i], jobCard->jobSpecs.toHosts[i-1])) continue; numLoad++; load = NULL; for (j = 0; j < num; j ++) { if (equalHost_(jobCard->jobSpecs.toHosts[i], loadV[j].hostName)) { load = &(loadV[j]); break; } } if (load == NULL) { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5705, "%s: Can not find load information for host <%s>"), fname, jobCard->jobSpecs.toHosts[i]); /* catgets 5705 */ return FALSE; } if (LS_ISLOCKEDU(load->status) && !(jobCard->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE)) { *reasons = SUSP_HOST_LOCK; *stopmore = TRUE; } else if (LS_ISLOCKEDM(load->status)) { *reasons = SUSP_HOST_LOCK_MASTER; *stopmore = TRUE; } else if (load->li[IT] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][IT] && load->li[IT] != -INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][IT] != -INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = IT; *stopmore = TRUE; } else if (load->li[LS] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][LS] && load->li[LS] != INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][LS] != INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = LS; *stopmore = TRUE; } else if (load->li[UT] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][UT] && load->li[UT] != INFINIT_LOAD && 
jobCard->jobSpecs.thresholds.loadStop[numLoad][UT] != INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = UT; } else if(load->li[PG] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][PG] && load->li[PG] != INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][PG] != INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = PG; } else if(load->li[IO] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][IO] && load->li[IO] != INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][IO] != INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = IO; } else if(load->li[MEM] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][MEM] && load->li[MEM] != -INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][MEM] != -INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = MEM; } else if(load->li[SWP] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][SWP] && load->li[SWP] != -INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][SWP] != -INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = SWP; } else if(load->li[TMP] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][TMP] && load->li[TMP] != -INFINIT_LOAD && jobCard->jobSpecs.thresholds.loadStop[numLoad][TMP] != -INFINIT_LOAD) { *reasons |= SUSP_LOAD_REASON; *subreasons = TMP; } for (j = R15S; !(*reasons) && j <= R15M; j++) if ((load->li[j] != INFINIT_LOAD) && (jobCard->jobSpecs.thresholds.loadStop[numLoad][j] != INFINIT_LOAD) && (load->li[j] >= jobCard->jobSpecs.thresholds.loadStop[numLoad][j])) { *reasons |= SUSP_LOAD_REASON; *subreasons = j; break; } for (j = MEM + 1; !(*reasons) && j < MIN(allLsInfo->numIndx, jobCard->jobSpecs.thresholds.nIdx); j++) { if (load->li[j] >= INFINIT_LOAD || load->li[j] <= -INFINIT_LOAD || jobCard->jobSpecs.thresholds.loadStop[numLoad][j] >= INFINIT_LOAD || jobCard->jobSpecs.thresholds.loadStop[numLoad][j] <= -INFINIT_LOAD) { continue; } if (allLsInfo->resTable[j].orderType == INCR) { if (load->li[j] >= 
jobCard->jobSpecs.thresholds.loadStop[numLoad][j]) { *reasons |= SUSP_LOAD_REASON; *subreasons = j; break; } } else { if (load->li[j] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][j]) { *reasons |= SUSP_LOAD_REASON; *subreasons = j; break; } } } if (!(*reasons) && jobCard->stopCondVal != NULL) { int returnCode; if (first == TRUE) { initTclHostData (&tclHostData); returnCode = getTclHostData (load, &tclHostData, FALSE); first = FALSE; } else { returnCode = getTclHostData (load, &tclHostData, TRUE); } if (returnCode >= 0 && evalResReq (jobCard->stopCondVal->selectStr, &tclHostData, DFT_FROMTYPE) == 1) { *reasons |= SUSP_QUE_STOP_COND; break; } } } if (! (*reasons)) return FALSE; if (LS_ISLOCKEDU(load->status) || LS_ISLOCKEDM(load->status)) { return TRUE; } else if (shouldStop1 (load)) { if (logclass & (LC_SCHED | LC_EXEC)) ls_syslog (LOG_DEBUG2, "%s: Should stop job %s; reason=%x, subreasons=%d", fname, lsb_jobid2str(jobCard->jobSpecs.jobId), *reasons, *subreasons); return TRUE; } return FALSE; }