示例#1
0
/*
 * jobResumeAction - try to start the resume action for a job.
 *
 *   jp          job card to act on
 *   sigValue    signal value handed to jobSigStart()
 *   suspReason  reason bit(s) that triggered this resume attempt
 *
 * Returns -1 when the job's reasons contain SUSP_MBD_LOCK, when none of
 * the suspReason bits are present in the job's reasons, or when both
 * jobSigStart() and the follow-up jobsig(jp, 0, FALSE) call fail (the job
 * is then put into JOB_STAT_EXIT).  Returns 0 when an action is already
 * recorded in actPid, or after sbdlog_newstatus() has been called.
 */
static int
jobResumeAction (struct jobCard *jp, int sigValue, int suspReason)
{
    static char fname[] = "jobResumeAction";

    /* Jobs flagged with SUSP_MBD_LOCK are never resumed from here. */
    if (jp->jobSpecs.reasons & SUSP_MBD_LOCK) {
        return -1;
    }

    /* An action pid is already recorded for this job: nothing to start. */
    if (jp->jobSpecs.actPid) {
        return 0;
    }

    /* The triggering reason must be one the job currently carries. */
    if ((jp->jobSpecs.reasons & suspReason) == 0) {
        return -1;
    }

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Try to resume job %s with the current reason %d and the triggered reason %d;",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.reasons, suspReason);
    }

    /*
     * jobsig(jp, 0, FALSE) is only attempted when starting the action
     * failed; the job is marked as exited only if that call fails too.
     */
    if (jobSigStart(jp, sigValue, 0, 0, SIGLOG) < 0
        && jobsig(jp, 0, FALSE) < 0) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        return -1;
    }

    sbdlog_newstatus(jp);
    return 0;
}
示例#2
0
/*
 * job_resume - send SIGCONT to a job and mark it as running.
 *
 * Returns 0 without doing anything when an action pid is recorded for
 * the job, -1 when jobsig(jp, SIGCONT, FALSE) fails, and 0 otherwise.
 * On the success path the job state is set to JOB_STAT_RUN, its
 * reasons/subreasons are cleared and the new status is reported through
 * status_job(); a failed report increments jp->notReported, a successful
 * one resets a positive counter to zero.
 */
int
job_resume (struct jobCard *jp)
{
    static char fname[] = "job_resume";
    int reportRc;

    if (jp->jobSpecs.actPid) {
        return 0;
    }

    if (jobsig(jp, SIGCONT, FALSE) < 0) {
        return -1;
    }

    SBD_SET_STATE(jp, JOB_STAT_RUN);

    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;

    reportRc = status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                          ERR_NO_ERROR);
    if (reportRc < 0) {
        jp->notReported++;
    } else if (jp->notReported > 0) {
        jp->notReported = 0;
    }

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Resume job %s",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId));
    }

    return 0;
}
示例#3
0
/*
 * do_sigjob - handle a job signal request and send the reply on chfd.
 *
 *   xdrs    XDR stream the struct jobSig request is decoded from
 *   chfd    channel the encoded reply is written to with chanWrite_()
 *   reqHdr  header of the incoming request, passed to xdr_jobSig()
 *
 * The job is looked up by jobId on the jobQueHead list.  Depending on its
 * state the signal is either started through jobSigStart() or the request
 * is answered without any action.  The reply opCode is one of
 * ERR_BAD_REQ, ERR_NO_JOB, ERR_SIG_RETRY or ERR_NO_ERROR; a struct
 * jobReply payload is attached only for ERR_NO_ERROR.
 *
 * If encoding the reply fails, the failure is logged and relife() is
 * called.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_sigjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSig      jobSig;
    sbdReplyType       reply;
    struct jobReply    jobReply;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct jobCard     *jp = NULL;
    char               found = FALSE;
    int                sigValue;
    int                savedActReasons = 0;
    int                savedActSubReasons = 0;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /*
     * Zero the request too: jobSig.jobId is used in log messages on the
     * reply path even when decoding below fails.
     */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");

        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;

    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    /* Locate the job card whose jobId matches the request. */
    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }
    if (found == FALSE) {
        reply = ERR_NO_JOB;
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        /*
         * Take over the reasons carried by the request and remember the
         * previous action reasons; they are put back on the reply path
         * when the reply is not ERR_NO_ERROR.
         */
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    /* Nothing is signalled once postJobStarted is set or the job finished. */
    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    /* A job whose process group id is -1 is moved to JOB_STAT_EXIT. */
    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /*
         * For a job that is not started yet, only signals for which
         * isSigTerm() is TRUE are acted on; all others get a retry reply.
         */
        if (isSigTerm(sigValue) == TRUE) {
            if (jobSigStart(jp, sigValue, jobSig.actFlags,
                            jobSig.chkPeriod, NO_SIGLOG) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;

            goto Reply;
        }

        reply = ERR_SIG_RETRY;

        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        /* On success the JOB_STAT_MIG bit is cleared from the job status. */
        if (jobSigStart(jp,
                        sigValue,
                        jobSig.actFlags,
                        jobSig.chkPeriod,
                        NO_SIGLOG) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if (jobSigStart(jp,
                    sigValue,
                    jobSig.actFlags,
                    jobSig.chkPeriod,
                    NO_SIGLOG) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    /* Every jump to this label happens with a valid jp. */
    sbdlog_newstatus(jp);

Reply1:

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR) {
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        if (reply != ERR_NO_JOB) {
            /* Put back the action reasons overwritten for SUSP_MBD_LOCK. */
            if ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
                jp->actReasons = savedActReasons;
                jp->actSubReasons = savedActSubReasons;
            }
        }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        /*
         * jp is NULL when decoding failed or no job matched, so the job is
         * identified through jobSig.jobId, which equals
         * jp->jobSpecs.jobId whenever a job was found.
         */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSig.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
                                         "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"), fname, XDR_GETPOS(&xdrs2), lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }
    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);

    return;
}
示例#4
0
/*
 * chkpntEnd - bookkeeping run when a job's checkpoint action has ended
 * (purpose per the function name; TODO confirm against the caller).
 *
 *   jobCard   job whose action ended
 *   w_status  status of the action; 0 is treated as success
 *   freed     set to TRUE when the job card has been deallocated here;
 *             not written otherwise
 *
 * For a job with JOB_STAT_MIG set and w_status == 0 the function may
 * return early several times (marking the job missing, waiting while
 * notReported is 0, waiting for the cleanup child) before the job is set
 * to JOB_STAT_PEND and reported.  When status_job() fails, actPid and
 * jStatus are restored to the values saved beforehand.
 */
static void
chkpntEnd (struct jobCard *jobCard, int w_status, bool_t *freed)
{
    static char fname[] = "chkpntEnd()";
    const int actionOk = (w_status == 0);
    int prevActPid;
    int prevStatus;

    /* Suspended, non-migrating jobs get a SIGSTOP first. */
    if (IS_SUSP(jobCard->jobSpecs.jStatus)
        && (jobCard->jobSpecs.jStatus & JOB_STAT_MIG) == 0) {
        jobsig(jobCard, SIGSTOP, TRUE);
    }

    prevStatus = jobCard->jobSpecs.jStatus;

    if (prevStatus & JOB_STAT_MIG) {
        if (!actionOk) {
            /* Failed action: the job stops being a migrating job. */
            jobCard->jobSpecs.jStatus &= ~JOB_STAT_MIG;
        } else {
            if (!jobCard->missing) {
                jobCard->missing = TRUE;
                need_checkfinish = TRUE;
                return;
            }

            if (jobCard->notReported == 0) {
                return;
            }

            if (jobCard->cleanupPid == 0) {
                jobCard->cleanupPid = rmJobBufFilesPid(jobCard);
                if (jobCard->cleanupPid > 0) {
                    return;
                }

                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5709,
                    "%s: Unable to cleanup migrating job <%s>"), /* catgets 5709 */
                    fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
            }

            SBD_SET_STATE(jobCard, JOB_STAT_PEND);
        }
    }

    prevActPid = jobCard->jobSpecs.actPid;

    if (status_job(BATCH_STATUS_JOB, jobCard, jobCard->jobSpecs.jStatus,
                   actionOk ? ERR_NO_ERROR : ERR_SYSACT_FAIL) < 0) {
        /* Reporting failed: roll back to the values saved above. */
        jobCard->jobSpecs.actPid = prevActPid;
        jobCard->jobSpecs.jStatus = prevStatus;
        return;
    }

    jobCard->lastChkpntTime = now;
    jobCard->jobSpecs.actPid = 0;
    jobCard->actStatus = ACT_NO;
    jobCard->jobSpecs.actValue = SIG_NULL;

    if (actionOk) {
        jobCard->migCnt = 1;
    }

    /* The remaining work only concerns jobs that were migrating on entry. */
    if ((prevStatus & JOB_STAT_MIG) == 0) {
        return;
    }

    if (actionOk) {
        cleanupMigJob(jobCard);
        deallocJobCard(jobCard);
        *freed = TRUE;
    } else {
        jobCard->migCnt *= 2;
    }
}
示例#5
0
/*
 * do_newjob - handle a new-job request and send the reply on chfd.
 *
 *   xdrs    XDR stream the struct jobSpecs request is decoded from
 *   chfd    channel passed to job_exec() and used for the reply
 *   reqHdr  header of the incoming request
 *
 * If a job with the same jobId is already on the jobQueHead list, its
 * ids and status are returned with ERR_NO_ERROR.  Otherwise a job card
 * is allocated, filled from the request, initialised with initJobCard()
 * and handed to job_exec().  Hosts are locked first for jobs with
 * Q_ATTRIB_EXCLUSIVE and unlocked again on every failure path.  A struct
 * jobReply payload is attached to the reply only for ERR_NO_ERROR.
 */
void
do_newjob(XDR *xdrs, int chfd, struct LSFHeader *reqHdr)
{
    static char        fname[] = "do_newjob()";
    char               outBuf[MSGSIZE];
    XDR                outXdr;
    struct jobSpecs    jobSpecs;
    struct jobReply    jobReply;
    struct jobCard     *jp;
    sbdReplyType       reply;
    struct LSFHeader   outHdr;
    char               *replyStruct;
    char               *spoolDir;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));

    if (!xdr_jobSpecs(xdrs, &jobSpecs, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSpecs");
        goto sendReply;
    }

    /* Look for a job card that already carries the requested jobId. */
    jp = jobQueHead->forw;
    while (jp != jobQueHead && jp->jobSpecs.jobId != jobSpecs.jobId) {
        jp = jp->forw;
    }
    if (jp != jobQueHead) {
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        reply = ERR_NO_ERROR;
        goto sendReply;
    }

    jp = calloc(1, sizeof(struct jobCard));
    if (jp == NULL) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSpecs.jobId), "calloc");
        reply = ERR_MEM;
        goto sendReply;
    }

    memcpy((char *) &jp->jobSpecs, (char *) &jobSpecs,
           sizeof(struct jobSpecs));

    jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
    jp->jobSpecs.startTime = now;
    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;
    /* Initialize the core number
     */
    jp->core_num = -1;

    /* Jobs with Q_ATTRIB_EXCLUSIVE need their hosts locked first. */
    if ((jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) && lockHosts(jp) < 0) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "lockHosts");
        unlockHosts(jp, jp->jobSpecs.numToHosts);
        reply = ERR_LOCK_FAIL;
        freeWeek(jp->week);
        FREEUP(jp);
        goto sendReply;
    }

    jp->runTime = 0;
    /* initJobCard() receives a pointer to reply as its third argument. */
    if (initJobCard(jp, &jobSpecs, (int *)&reply) < 0) {
        if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
            unlockHosts(jp, jp->jobSpecs.numToHosts);
        }
        FREEUP(jp);
        goto sendReply;
    }

    jp->execJobFlag = 0;

    if (jp->runTime < 0) {
        jp->runTime = 0;
    }
    jp->execGid = 0;
    jp->execUsername[0] = '\0';
    jp->jobSpecs.execUid = -1;
    jp->jobSpecs.execUsername[0] = '\0';

    /* Drop the spool directory when getUnixSpoolDir() returns NULL for it. */
    if (jp->jobSpecs.jobSpoolDir[0] != '\0') {
        spoolDir = getUnixSpoolDir(jp->jobSpecs.jobSpoolDir);
        if (spoolDir == NULL) {
            jp->jobSpecs.jobSpoolDir[0] = '\0';
        }
    }

    if ((logclass & LC_TRACE) && jp->jobSpecs.jobSpoolDir[0] != 0) {
        ls_syslog(LOG_DEBUG,
                  "%s: the SpoolDir for  job <%s>  is %s \n",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.jobSpoolDir);
    }

    /*
     * NOTE(review): SBD_SET_STATE is written without a trailing semicolon
     * before `else`, so its expansion must already form a complete
     * statement.  The construct is kept exactly as it was.
     */
    if (jp->jobSpecs.options & SUB_PRE_EXEC)
        SBD_SET_STATE(jp, (JOB_STAT_RUN | JOB_STAT_PRE_EXEC))
    else
        SBD_SET_STATE(jp, JOB_STAT_RUN);

    reply = job_exec(jp, chfd);

    if (reply == ERR_NO_ERROR) {
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
    } else {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "job_exec");
        if (jp->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE) {
            unlockHosts(jp, jp->jobSpecs.numToHosts);
        }
        deallocJobCard(jp);
    }

sendReply:
    xdr_lsffree(xdr_jobSpecs, (char *)&jobSpecs, reqHdr);
    xdrmem_create(&outXdr, outBuf, MSGSIZE, XDR_ENCODE);
    initLSFHeader_(&outHdr);
    outHdr.opCode = reply;

    /* Only a successful reply carries the jobReply payload. */
    if (reply == ERR_NO_ERROR) {
        replyStruct = (char *) &jobReply;
    } else {
        replyStruct = (char *) NULL;
    }

    if (!xdr_encodeMsg(&outXdr, replyStruct, &outHdr, xdr_jobReply, 0, auth)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobReply");
        lsb_merr(_i18n_msg_get(ls_catd , NL_SETN, 5804,
                               "Fatal error: xdr_jobReply() failed; sbatchd relifing")); /* catgets 5804 */
        relife();
    }

    if (chanWrite_(chfd, outBuf, XDR_GETPOS(&outXdr)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5805,
                                         "%s: Sending jobReply (len=%d) to master failed: %m"), /* catgets 5805 */
                  fname, XDR_GETPOS(&outXdr));
    }

    xdr_destroy(&outXdr);

    /*
     * After a successful reply, the status of a job matching
     * PURE_INTERACTIVE() is reported right away unless the LSB_BSUBI_OLD
     * parameter has a value; a failed report is counted in notReported.
     */
    if (reply == ERR_NO_ERROR
        && !daemonParams[LSB_BSUBI_OLD].paramValue
        && PURE_INTERACTIVE(&jp->jobSpecs)
        && status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                      ERR_NO_ERROR) < 0) {
        jp->notReported++;
    }
}