/*
 * chkpntEnd - bookkeeping after a checkpoint action on a job has ended.
 *
 *   jobCard   the job whose checkpoint action ended
 *   w_status  exit status of the action; 0 is treated as success, any
 *             other value is reported as ERR_SYSACT_FAIL
 *   freed     set to TRUE when this function deallocates jobCard; it is
 *             left untouched otherwise, so the caller must initialize it
 *
 * For a job flagged JOB_STAT_MIG whose checkpoint succeeded, the work is
 * spread over several calls: the first call only marks the job missing
 * and raises need_checkfinish; later calls return early while
 * notReported is 0, then start rmJobBufFilesPid() and return while that
 * cleanup pid is positive, and finally move the job to JOB_STAT_PEND and
 * report it.  For a failed checkpoint the JOB_STAT_MIG flag is simply
 * cleared before reporting.
 *
 * If status_job() fails, actPid and jStatus are rolled back to the values
 * they had before the report was attempted.
 */
static void
chkpntEnd (struct jobCard *jobCard, int w_status, bool_t *freed)
{
    static char fname[] = "chkpntEnd()";
    const int ckptOk = (w_status == 0);
    int wasMigrating;
    int prevStatus;
    int prevActPid;

    /* A suspended job that is not migrating gets SIGSTOP via jobsig(). */
    if (IS_SUSP(jobCard->jobSpecs.jStatus)
        && !(jobCard->jobSpecs.jStatus & JOB_STAT_MIG)) {
        jobsig(jobCard, SIGSTOP, TRUE);
    }

    /* Snapshot taken after jobsig() so a rollback restores that state. */
    prevStatus = jobCard->jobSpecs.jStatus;
    wasMigrating = (prevStatus & JOB_STAT_MIG) != 0;

    if (wasMigrating && !ckptOk) {
        /* Checkpoint failed: the job is no longer considered migrating. */
        jobCard->jobSpecs.jStatus &= ~JOB_STAT_MIG;
    } else if (wasMigrating) {
        if (!jobCard->missing) {
            jobCard->missing = TRUE;
            need_checkfinish = TRUE;
            return;
        }
        if (jobCard->notReported == 0) {
            return;
        }

        if (jobCard->cleanupPid == 0) {
            jobCard->cleanupPid = rmJobBufFilesPid(jobCard);
            if (jobCard->cleanupPid > 0) {
                return;
            }
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5709,
                "%s: Unable to cleanup migrating job <%s>"), /* catgets 5709 */
                fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
        }
        SBD_SET_STATE(jobCard, JOB_STAT_PEND);
    }

    prevActPid = jobCard->jobSpecs.actPid;
    if (status_job (BATCH_STATUS_JOB, jobCard, jobCard->jobSpecs.jStatus,
                    ckptOk ? ERR_NO_ERROR : ERR_SYSACT_FAIL) < 0) {
        /* Report failed: undo the status change made above. */
        jobCard->jobSpecs.actPid = prevActPid;
        jobCard->jobSpecs.jStatus = prevStatus;
        return;
    }

    /* Report accepted: the action is over, clear its bookkeeping. */
    jobCard->lastChkpntTime = now;
    jobCard->jobSpecs.actPid = 0;
    jobCard->actStatus = ACT_NO;
    jobCard->jobSpecs.actValue = SIG_NULL;

    if (ckptOk) {
        jobCard->migCnt = 1;
        if (wasMigrating) {
            /* jobCard must not be touched after deallocJobCard(). */
            cleanupMigJob(jobCard);
            deallocJobCard(jobCard);
            *freed = TRUE;
        }
    } else if (wasMigrating) {
        /* Failed migration attempt: double the migration counter. */
        jobCard->migCnt *= 2;
    }
}
/*
 * do_newjob - handle a request to start a new job.
 *
 *   xdrs    XDR stream the job specification is decoded from
 *   chfd    channel the jobReply message is written to
 *   reqHdr  header of the incoming request
 *
 * Flow:
 *   1. Decode the jobSpecs; on failure reply ERR_BAD_REQ.
 *   2. If a job with the same jobId is already on the job queue, reply
 *      with that job's current pid/pgid/status and ERR_NO_ERROR.
 *   3. Otherwise allocate a job card, lock hosts for exclusive jobs,
 *      initialize the card and hand it to job_exec().
 *   4. Encode and send the reply; the jobReply body is attached only when
 *      the reply code is ERR_NO_ERROR.
 *   5. For a successful reply, when LSB_BSUBI_OLD is not set and the job
 *      is PURE_INTERACTIVE, push a status report right away and count it
 *      in notReported if that fails.
 *
 * NOTE(review): on the initJobCard() failure path the reply code is
 * whatever initJobCard() stored through its third argument; confirm it
 * always sets a non-success code there.
 */
void
do_newjob(XDR *xdrs, int chfd, struct LSFHeader *reqHdr)
{
    static char fname[] = "do_newjob()";
    char outBuf[MSGSIZE];
    XDR outXdrs;
    struct jobSpecs newSpecs;
    struct jobReply replyBody;
    struct jobCard *card;
    sbdReplyType reply;
    struct LSFHeader outHdr;
    char *payload;
    struct lsfAuth *auth = NULL;

    memset(&replyBody, 0, sizeof(replyBody));

    if (!xdr_jobSpecs(xdrs, &newSpecs, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs");
        goto sendReply;
    }

    /* Already known job: answer with its current state, start nothing. */
    card = jobQueHead->forw;
    while (card != jobQueHead) {
        if (card->jobSpecs.jobId == newSpecs.jobId) {
            replyBody.jobId = card->jobSpecs.jobId;
            replyBody.jobPid = card->jobSpecs.jobPid;
            replyBody.jobPGid = card->jobSpecs.jobPGid;
            replyBody.jStatus = card->jobSpecs.jStatus;
            reply = ERR_NO_ERROR;
            goto sendReply;
        }
        card = card->forw;
    }

    card = calloc(1, sizeof(*card));
    if (card == NULL) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(newSpecs.jobId), "calloc");
        reply = ERR_MEM;
        goto sendReply;
    }

    /* Start from the decoded specs, then reset the per-run fields. */
    memcpy(&card->jobSpecs, &newSpecs, sizeof(newSpecs));
    card->jobSpecs.jStatus &= ~JOB_STAT_MIG;
    card->jobSpecs.startTime = now;
    card->jobSpecs.reasons = 0;
    card->jobSpecs.subreasons = 0;

    /* Initialize the core number */
    card->core_num = -1;

    if (card->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
        if (lockHosts (card) < 0) {
            ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                      lsb_jobid2str(card->jobSpecs.jobId), "lockHosts");
            unlockHosts (card, card->jobSpecs.numToHosts);
            reply = ERR_LOCK_FAIL;
            freeWeek(card->week);
            FREEUP(card);
            goto sendReply;
        }
    }

    card->runTime = 0;
    if (initJobCard(card, &newSpecs, (int *)&reply) < 0) {
        if (card->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
            unlockHosts (card, card->jobSpecs.numToHosts);
        }
        FREEUP(card);
        goto sendReply;
    }

    card->execJobFlag = 0;
    if (card->runTime < 0) {
        card->runTime = 0;
    }
    card->execGid = 0;
    card->execUsername[0] = '\0';
    card->jobSpecs.execUid = -1;
    card->jobSpecs.execUsername[0] = '\0';

    /* Drop the spool directory when getUnixSpoolDir() returns NULL. */
    if (card->jobSpecs.jobSpoolDir[0] != '\0') {
        if (getUnixSpoolDir (card->jobSpecs.jobSpoolDir) == NULL) {
            card->jobSpecs.jobSpoolDir[0] = '\0';
        }
    }

    if ((logclass & LC_TRACE) && card->jobSpecs.jobSpoolDir[0] != 0) {
        ls_syslog(LOG_DEBUG, "%s: the SpoolDir for job <%s> is %s \n",
                  fname, lsb_jobid2str(card->jobSpecs.jobId),
                  card->jobSpecs.jobSpoolDir);
    }

    if (card->jobSpecs.options & SUB_PRE_EXEC) {
        SBD_SET_STATE(card, (JOB_STAT_RUN | JOB_STAT_PRE_EXEC));
    } else {
        SBD_SET_STATE(card, JOB_STAT_RUN);
    }

    reply = job_exec(card, chfd);
    if (reply == ERR_NO_ERROR) {
        replyBody.jobId = card->jobSpecs.jobId;
        replyBody.jobPid = card->jobSpecs.jobPid;
        replyBody.jobPGid = card->jobSpecs.jobPGid;
        replyBody.jStatus = card->jobSpecs.jStatus;
    } else {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                  lsb_jobid2str(card->jobSpecs.jobId), "job_exec");
        if (card->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
            unlockHosts (card, card->jobSpecs.numToHosts);
        }
        /* card is gone after this; the non-success reply keeps it unused. */
        deallocJobCard(card);
    }

sendReply:
    xdr_lsffree(xdr_jobSpecs, (char *)&newSpecs, reqHdr);

    xdrmem_create(&outXdrs, outBuf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&outHdr);
    outHdr.opCode = reply;

    /* Only a successful reply carries the jobReply body. */
    if (reply == ERR_NO_ERROR) {
        payload = (char *) &replyBody;
    } else {
        payload = (char *) NULL;
    }

    if (!xdr_encodeMsg(&outXdrs, payload, &outHdr, xdr_jobReply, 0, auth)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobReply");
        lsb_merr(_i18n_msg_get(ls_catd , NL_SETN, 5804,
            "Fatal error: xdr_jobReply() failed; sbatchd relifing")); /* catgets 5804 */
        relife();
    }

    if (chanWrite_(chfd, outBuf, XDR_GETPOS(&outXdrs)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5805,
            "%s: Sending jobReply (len=%d) to master failed: %m"), /* catgets 5805 */
            fname, XDR_GETPOS(&outXdrs));
    }

    xdr_destroy(&outXdrs);

    /*
     * card is only dereferenced below when reply is ERR_NO_ERROR; every
     * path that left it unset, freed or NULL also set a failure code.
     */
    if (reply != ERR_NO_ERROR) {
        return;
    }
    if (daemonParams[LSB_BSUBI_OLD].paramValue) {
        return;
    }
    if (!PURE_INTERACTIVE(&card->jobSpecs)) {
        return;
    }
    if (status_job (BATCH_STATUS_JOB, card, card->jobSpecs.jStatus,
                    ERR_NO_ERROR) < 0) {
        card->notReported++;
    }
}