Пример #1
0
/*
 * jobResumeAction - issue the resume action sigValue for a job suspended
 * for suspReason.
 *
 * Returns -1 when the job's reasons include SUSP_MBD_LOCK, when suspReason
 * is not among the job's current reasons, or when jobSigStart() fails and
 * the follow-up jobsig(jp, 0, FALSE) also fails (in that last case
 * SBD_SET_STATE(jp, JOB_STAT_EXIT) is applied first).
 * Returns 0 when an action is already in progress (actPid is non-zero) or
 * once the new status has been logged through sbdlog_newstatus().
 */
static int
jobResumeAction (struct jobCard *jp, int sigValue, int suspReason)
{
    static char fname[] = "jobResumeAction";

    /* Jobs carrying SUSP_MBD_LOCK are never resumed from here. */
    if ((jp->jobSpecs.reasons & SUSP_MBD_LOCK) != 0) {
        return -1;
    }

    /* An action process already exists; nothing more to start. */
    if (jp->jobSpecs.actPid != 0) {
        return 0;
    }

    /* Only act when the triggering reason is one the job is suspended for. */
    if ((jp->jobSpecs.reasons & suspReason) == 0) {
        return -1;
    }

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Try to resume job %s with the current reason %d and the triggered reason %d;",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.reasons, suspReason);
    }

    /*
     * jobsig() with signal 0 is only attempted when jobSigStart() failed;
     * the job is marked exited only when that call fails as well.
     */
    if (jobSigStart(jp, sigValue, 0, 0, SIGLOG) < 0
        && jobsig(jp, 0, FALSE) < 0) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        return -1;
    }

    sbdlog_newstatus(jp);
    return 0;
}
Пример #2
0
/*
 * jobSuspendAction - start the suspend action sigValue for a job.
 *
 * suspReasons and suspSubReasons are always stored in jp->actReasons and
 * jp->actSubReasons, even when no action is started afterwards.
 * The signal handed to jobSigStart() is sigValue plus the entry of
 * jp->jobSpecs.sigMap indexed by -sigValue (callers pass the SIG_SUSP_*
 * constants; presumably these are negative -- confirm against their
 * definitions).
 *
 * Nothing is started when:
 *   - the job is not running, or its post-job phase has started;
 *   - the job is already suspended and either shares one of suspReasons
 *     or has a zero sigMap entry for this signal;
 *   - an action process exists whose actValue equals sigValue or the
 *     mapped signal.
 */
void
jobSuspendAction(struct jobCard *jp,
                 int sigValue,
                 int suspReasons,
                 int suspSubReasons)
{
    static char fname[] = "jobSuspendAction";
    int mappedSig;

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Suspend job %s; reasons=%x, subresons=%d, sigValue=%d, status=%x",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.reasons,
                  jp->jobSpecs.subreasons, sigValue, jp->jobSpecs.jStatus);
    }

    /* Remember why the action was requested before any early exit. */
    jp->actReasons = suspReasons;
    jp->actSubReasons = suspSubReasons;

    if (!JOB_RUNNING(jp)) {
        return;
    }

    if (jp->postJobStarted) {
        return;
    }

    /* Already suspended: skip on a shared reason or an unmapped signal. */
    if (IS_SUSP (jp->jobSpecs.jStatus)
        && ((jp->jobSpecs.reasons & suspReasons)
            || jp->jobSpecs.sigMap[-sigValue] == 0)) {
        return;
    }

    /* The same action (raw or mapped) is already being carried out. */
    if (jp->jobSpecs.actPid) {
        if (jp->jobSpecs.actValue == sigValue) {
            return;
        }
        if (jp->jobSpecs.actValue
            == (sigValue + jp->jobSpecs.sigMap[-sigValue])) {
            return;
        }
    }

    mappedSig = sigValue + jp->jobSpecs.sigMap[-(sigValue)];

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Call jobSigStart(sigValue =%d) to suspend job", fname, mappedSig);
    }

    jobSigStart(jp, mappedSig, 0, 0, SIGLOG);

    sbdlog_newstatus(jp);
}
Пример #3
0
/*
 * sigActEnd - finish bookkeeping for a signal action recorded in
 * jobCard->jobSpecs.actValue.
 *
 * For negative action values the presence of the action's exit file
 * (LSTMPDIR/.<jobFile>.<jobid>.<suffix>) decides jobCard->actStatus:
 * ACT_DONE when stat() finds it, ACT_FAIL otherwise.  JOB_STAT_SIGNAL is
 * then cleared and the action-specific end handler is invoked with the
 * stat() result.  Unless chkpntEnd() reports that it freed the job card,
 * the new status is logged through sbdlog_newstatus().
 *
 * An unknown action value is logged, actPid is reset and the function
 * returns without logging a new status.
 */
static void
sigActEnd (struct jobCard *jobCard)
{
    /*
     * Start from "exit file not found" so the handlers below never read an
     * indeterminate value when actValue is not negative.
     */
    int w_status = -1;
    int len;
    struct stat st;
    bool_t freed = FALSE;
    char exitFile[MAXFILENAMELEN];

    if (jobCard->jobSpecs.actValue < 0) {
        /* Bounded formatting: never write past the end of exitFile. */
        len = snprintf(exitFile, sizeof(exitFile), "%s/.%s.%s.%s",
                       LSTMPDIR, jobCard->jobSpecs.jobFile,
                       lsb_jobidinstr(jobCard->jobSpecs.jobId),
                       exitFileSuffix(jobCard->jobSpecs.actValue));

        if (len < 0 || (size_t) len >= sizeof(exitFile)) {
            /* A truncated path cannot name the real exit file. */
            w_status = -1;
        } else {
            w_status = stat(exitFile, &st);
        }

        if (w_status == 0)
            jobCard->actStatus = ACT_DONE;
        else
            jobCard->actStatus = ACT_FAIL;
    }

    jobCard->jobSpecs.jStatus &= ~JOB_STAT_SIGNAL;

    switch (jobCard->jobSpecs.actValue) {
        case SIG_CHKPNT:
        case SIG_CHKPNT_COPY:
            /* chkpntEnd() sets freed when it has released the job card. */
            chkpntEnd (jobCard, w_status, &freed);
            break;

        case SIG_SUSP_USER:
        case SIG_SUSP_LOAD:
        case SIG_SUSP_WINDOW:
        case SIG_SUSP_OTHER:
            suspendActEnd (jobCard, w_status);
            break;

        case SIG_RESUME_USER:
        case SIG_RESUME_LOAD:
        case SIG_RESUME_WINDOW:
        case SIG_RESUME_OTHER:
            resumeActEnd (jobCard, w_status);
            break;

        case SIG_TERM_USER:
        case SIG_KILL_REQUEUE:
        case SIG_TERM_OTHER:
        case SIG_TERM_FORCE:
            /* Forget the action only once it has been logged successfully. */
            if (jobSigLog (jobCard, w_status) == 0) {
                jobCard->jobSpecs.actPid = 0;
                jobCard->jobSpecs.actValue = SIG_NULL;
            }
            break;

        case SIG_TERM_LOAD:
        case SIG_TERM_WINDOW:
        case SIG_TERM_RUNLIMIT:
        case SIG_TERM_DEADLINE:
        case SIG_TERM_PROCESSLIMIT:
        case SIG_TERM_CPULIMIT:
        case SIG_TERM_MEMLIMIT:
            /* These terminate actions share the suspend end handler. */
            suspendActEnd (jobCard, w_status);
            break;

        default:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5708,
                "sigActEnd: unknown sigValue <%d> for job <%s> at the job status <%d> with actPid <%d>"), /* catgets 5708 */
                jobCard->jobSpecs.actValue,
                lsb_jobid2str(jobCard->jobSpecs.jobId),
                jobCard->jobSpecs.jStatus,
                jobCard->jobSpecs.actPid);

            jobCard->jobSpecs.actPid = 0;
            return;
    }

    if (!freed) {
        sbdlog_newstatus (jobCard);
    }
}
Пример #4
0
/*
 * job_checking - periodic pass over the job queue.
 *
 * For every job that is neither finished nor pending this:
 *   - enforces resource limits through ruLimits();
 *   - accumulates run time since the previous pass (last_check);
 *   - handles an exceeded run limit and an exceeded termination time;
 *   - suspends or resumes the job according to window_ok().
 * After the loop the local host load is fetched and handed to tryResume(),
 * tryStop() and tryChkpntMig().
 *
 * NOTE(review): jobId is formatted with %d in two log calls below while
 * other code formats it through lsb_jobid2str(); if jobId is wider than
 * int those format strings are wrong -- confirm the type.
 */
void
job_checking (void)
{
    static char fname[] = "job_checking";
    struct jobCard *jobCard, *nextJob;
    struct hostLoad *myload, savedLoad;
    char *myhostnm;
    /* Time of the previous pass; used to advance each job's runTime. */
    static time_t last_check;
    /* Never modified in this function, so tryStop() below always runs. */
    char preempted = FALSE;
    int i;

    if (last_check == 0)
	last_check = now;
    /* No jobs: just remember the time of this pass. */
    if (jobcnt <= 0) {
        last_check = now;
        return;
    }

    checkFinish ();

    /* nextJob is fetched up front so the walk does not depend on jobCard
     * after the body has run. */
    for (jobCard = jobQueHead->forw; (jobCard != jobQueHead);
         jobCard = nextJob) {

	nextJob = jobCard->forw;
        /* Finished and pending jobs are not checked. */
        if (IS_FINISH(jobCard->jobSpecs.jStatus)
              || (jobCard->jobSpecs.jStatus & JOB_STAT_PEND))
            continue;

	ruLimits(jobCard);

	if (IS_RUN_JOB_CMD(jobCard->jobSpecs.jStatus)) {

	    jobCard->runTime += (int) (now - last_check);
	}
	/* ---- Run limit exceeded ---- */
	if (jobCard->runTime >
	    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {
            /* No terminate action command: warn with SIGUSR2 first, then
             * terminate once WARN_TIME beyond the limit has elapsed. */
            if ((jobCard->jobSpecs.terminateActCmd == NULL)
                || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) {
	        if (jobCard->runTime >
		    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl
		    + WARN_TIME && jobCard->timeExpire) {

                    /* Already suspended for the run limit, or already
                     * marked as being killed: nothing more to do. */
                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
		    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
			continue;
                    else {

                        ls_syslog(LOG_INFO, \
                                  "%s: warning period expired killing the job=%d",
			    fname, jobCard->jobSpecs.jobId);
                        jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
			jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
	        } else if (!jobCard->timeExpire) {
		    /* First time over the limit: send the warning signal
		     * and remember that it was sent. */
		    ls_syslog(LOG_INFO, I18N(5704,
                        "%s: sending warning signal to job=%d"), /* catgets 5704 */
			fname, jobCard->jobSpecs.jobId);
		    jobsig(jobCard, SIGUSR2, FALSE);
		    jobCard->timeExpire = TRUE;
	        }
            } else {
                /* A terminate action command is configured: start the
                 * action right away, without a warning period.  The inner
                 * test repeats the enclosing run-limit condition. */
                if (jobCard->runTime >
                    jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RUN].rlim_curl) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_RUNLIMIT))
                        continue;
                    else {
                        jobSigStart (jobCard, SIG_TERM_RUNLIMIT, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
	    /* A job over its run limit gets no further checks this pass. */
	    continue;
	}

        /* ---- Termination time passed (skipped for JOB_FORCE_KILL jobs) ---- */
        if (jobCard->jobSpecs.termTime && now > jobCard->jobSpecs.termTime

             && !(jobCard->jobSpecs.jAttrib & JOB_FORCE_KILL)) {
            /* Same warn-then-terminate scheme as for the run limit. */
            if ((jobCard->jobSpecs.terminateActCmd == NULL)
                 || (jobCard->jobSpecs.terminateActCmd[0] == '\0')) {
                if (now > jobCard->jobSpecs.termTime + WARN_TIME
                                                   && jobCard->timeExpire) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
		    else if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL)
			continue;
                    else {

                        jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
			jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
                    }
                } else
		    if (!jobCard->timeExpire) {
		        jobsig(jobCard, SIGUSR2, FALSE);
		        jobCard->timeExpire = TRUE;
		    }
            } else {
                /* Terminate action command configured: act immediately.
                 * The inner test repeats part of the enclosing condition. */
                if (now > jobCard->jobSpecs.termTime) {

                    if ((IS_SUSP (jobCard->jobSpecs.jStatus))
                       && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
                       && (jobCard->jobSpecs.subreasons & SUB_REASON_DEADLINE))
                        continue;
                    else {
                        jobSigStart (jobCard, SIG_TERM_DEADLINE, 0, 0, SIGLOG);
                        sbdlog_newstatus(jobCard);
                    }
                }
            }
            continue;
        }


        /* ---- Window handling ----
         * When window_ok() fails and the job is not JOB_URGENT_NOSTOP, the
         * job is suspended if it did not request SUB_WINDOW_SIG, or if
         * WARN_TIME has elapsed since windWarnTime.  In every other case
         * covered by the else branch (window_ok() succeeded, or the job is
         * JOB_URGENT_NOSTOP) a window resume is attempted. */
        if (! window_ok (jobCard)
	    && !(jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)) {
	    if (! (jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                || ((jobCard->jobSpecs.options & SUB_WINDOW_SIG)
                          && now - jobCard->windWarnTime >= WARN_TIME)) {


	        jobSuspendAction(jobCard, SIG_SUSP_WINDOW, SUSP_QUEUE_WINDOW, 0);
		continue;

	    }
	} else {

		jobResumeAction(jobCard, SIG_RESUME_WINDOW, SUSP_QUEUE_WINDOW);
                continue;
	}
    }


    /* ---- Load-based decisions for the local host ---- */
    if ((myhostnm = ls_getmyhostname()) == NULL) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_MM, fname, "ls_getmyhostname");
        die(SLAVE_FATAL);
    }
    myload = ls_loadofhosts (NULL, 0, EXACT|EFFECTIVE, 0, &myhostnm, 1);
    if (myload == NULL) {
        /* Log the failure only while NO_LIM has not been recorded yet. */
        if (myStatus != NO_LIM)

	    ls_syslog(LOG_INFO, I18N_FUNC_FAIL_MM, fname, "ls_loadofhosts");
	if (lserrno == LSE_LIM_BADHOST)
	    relife();
	if (lserrno == LSE_BAD_XDR)
	    relife();
	if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT) {
	    myStatus |= NO_LIM;


            /* Without load information only the checkpoint/migration
             * pass is still carried out. */
            tryChkpntMig();
        }
        last_check = now;
	return;
    } else
	myStatus = 0;



    /* Work on a private copy of the load: the struct is copied first, then
     * its li and status pointers are replaced by freshly allocated arrays
     * filled from myload. */
    memcpy ((char *)&savedLoad, (char *)myload, sizeof (struct hostLoad));
    savedLoad.li = (float *) my_malloc (allLsInfo->numIndx * sizeof (float),
				   "job_checking");
    savedLoad.status = (int *) my_malloc
       ((1 + GET_INTNUM(allLsInfo->numIndx)) * sizeof (int), "job_checking");
    for (i = 0; i < allLsInfo->numIndx; i++)
        savedLoad.li[i] = myload->li[i];
    for (i = 0; i < 1 + GET_INTNUM(allLsInfo->numIndx); i++)
        savedLoad.status[i] = myload->status[i];
    tryResume (&savedLoad);

    if (!preempted)
        tryStop (myhostnm, &savedLoad);

    tryChkpntMig();


    /* Release the private load copy allocated above. */
    FREEUP(savedLoad.li);
    FREEUP(savedLoad.status);
    last_check = now;
    return;

}
Пример #5
0
/*
 * tryChkpntMig - start migration or periodic checkpoint actions.
 *
 * A first scan records whether any job already has JOB_STAT_MIG set.
 * The second scan skips jobs flagged as missing and, per job:
 *   1. starts a SIG_CHKPNT action with LSB_CHKPNT_KILL for a system
 *      suspended, checkpointable or rerunnable job that has been suspended
 *      longer than migThresh, provided no migration is in progress, no
 *      action process exists, enough time has passed since the last
 *      checkpoint and the job is not suspended for the queue window;
 *   2. otherwise starts a plain SIG_CHKPNT action for a running job whose
 *      checkpoint period has elapsed.
 * Step 2 is also tried when step 1 was eligible but jobSigStart() failed.
 */
static void
tryChkpntMig(void)
{
    struct jobCard *cur;
    struct jobCard *following;
    char migInProgress = FALSE;

    /* Pass 1: is some job already migrating? */
    cur = jobQueHead->forw;
    while (cur != jobQueHead) {
        if (cur->jobSpecs.jStatus & JOB_STAT_MIG) {
            migInProgress = TRUE;
            break;
        }
        cur = cur->forw;
    }

    /* Pass 2: the successor is saved before the job is acted upon. */
    for (cur = jobQueHead->forw; cur != jobQueHead; cur = following) {
        following = cur->forw;

        if (cur->missing) {
            continue;
        }

        /* Migration; jobSigStart() is reached only if every test holds. */
        if ((cur->jobSpecs.jStatus & JOB_STAT_SSUSP)
            && !migInProgress
            && !(cur->jobSpecs.jStatus & JOB_STAT_MIG)
            && cur->jobSpecs.actPid == 0
            && (cur->jobSpecs.options & (SUB_CHKPNTABLE | SUB_RERUNNABLE))
            && (now - cur->jobSpecs.lastSSuspTime > cur->jobSpecs.migThresh)
            && (now - cur->lastChkpntTime > cur->migCnt * sbdSleepTime)
            && !(cur->jobSpecs.reasons & SUSP_QUEUE_WINDOW)
            && jobSigStart (cur, SIG_CHKPNT, LSB_CHKPNT_KILL,
                            cur->jobSpecs.chkPeriod, SIGLOG) == 0) {
            cur->jobSpecs.jStatus |= JOB_STAT_MIG;
            migInProgress = TRUE;
            sbdlog_newstatus(cur);
            continue;
        }

        /* Periodic checkpoint of a running job. */
        if (!(cur->jobSpecs.jStatus & JOB_STAT_MIG)
            && (cur->jobSpecs.jStatus & JOB_STAT_RUN)
            && cur->jobSpecs.actPid == 0
            && cur->jobSpecs.chkPeriod
            && now - cur->lastChkpntTime > cur->jobSpecs.chkPeriod
            && jobSigStart (cur, SIG_CHKPNT, 0,
                            cur->jobSpecs.chkPeriod, SIGLOG) == 0) {
            sbdlog_newstatus(cur);
        }
    }
}
Пример #6
0
static void
ruLimits(struct jobCard *jobCard)
{
    struct rlimit rlimit;


    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_CPU],
		  &rlimit, LSF_RLIMIT_CPU);



    if (rlimit.rlim_cur != RLIM_INFINITY && lsbJobCpuLimit != 0) {

	if ((long)rlimit.rlim_cur < ((long)jobCard->runRusage.utime +
			       (long)jobCard->runRusage.stime)) {

            if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) {

	    } else {
		jobSigStart (jobCard, SIG_TERM_CPULIMIT, 0, 0, SIGLOG);
		sbdlog_newstatus(jobCard);

		jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
	    }
	}
    }


    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_SWAP],
                  &rlimit, LSF_RLIMIT_SWAP);
    if (rlimit.rlim_cur != RLIM_INFINITY) {
        if ((long)(rlimit.rlim_cur / 1024) < (long)jobCard->runRusage.swap) {
            jobsig(jobCard, SIGQUIT, FALSE);
            jobsig(jobCard, SIGKILL, TRUE);
        }
    }


    rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_PROCESS],
                  &rlimit, LSF_RLIMIT_PROCESS);

    if (rlimit.rlim_cur != RLIM_INFINITY) {
        if ((int)rlimit.rlim_cur + 2 < jobCard->runRusage.npids) {

            if ((IS_SUSP (jobCard->jobSpecs.jStatus))
               && (jobCard->jobSpecs.reasons & SUSP_RES_LIMIT)
               && (jobCard->jobSpecs.subreasons & SUB_REASON_PROCESSLIMIT))
                return;
            else {
                jobSigStart (jobCard, SIG_TERM_PROCESSLIMIT, 0, 0, SIGLOG);
                sbdlog_newstatus(jobCard);
            }
        }
    }


    if ( (lsbJobMemLimit == 1) ||
	 (lsbJobMemLimit != 0 && lsbMemEnforce == TRUE)) {
        rlimitDecode_(&jobCard->jobSpecs.lsfLimits[LSF_RLIMIT_RSS],
                      &rlimit, LSF_RLIMIT_RSS);
        if (rlimit.rlim_cur != RLIM_INFINITY) {
	    if ((long)(rlimit.rlim_cur / 1024) < (long)jobCard->runRusage.mem) {
                if (jobCard->jobSpecs.jStatus & JOB_STAT_KILL) {

	        } else {
		    jobSigStart (jobCard, SIG_TERM_MEMLIMIT, 0, 0, SIGLOG);
		    sbdlog_newstatus(jobCard);
		    jobCard->jobSpecs.jStatus |= JOB_STAT_KILL;
		}
	    }
        }
    }
}
Пример #7
0
/*
 * do_sigjob - handle a "signal job" request received on channel chfd.
 *
 * xdrs   - XDR stream the jobSig request is decoded from.
 * chfd   - channel the reply is written to with chanWrite_().
 * reqHdr - header of the incoming request, passed to xdr_jobSig().
 *
 * The job is looked up by jobSig.jobId in the job queue and, depending on
 * its state, the signal action is started through jobSigStart().  The
 * reply header's opCode carries the outcome:
 *   ERR_BAD_REQ   - the request could not be decoded;
 *   ERR_NO_JOB    - no job with that id is in the queue;
 *   ERR_SIG_RETRY - the job is not started yet or still pending (and the
 *                   signal is not a terminate signal), or jobSigStart()
 *                   failed;
 *   ERR_NO_ERROR  - otherwise; a jobReply describing the job is attached.
 *
 * The "Reply" label logs the new job status before replying; "Reply1"
 * replies without logging and is the only label reachable with jp == NULL.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_sigjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSig      jobSig;
    sbdReplyType       reply;
    struct jobReply    jobReply;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct jobCard     *jp = NULL;
    char               found = FALSE;
    int                cc;
    int                sigValue;
    int                savedActReasons = 0;
    int                savedActSubReasons = 0;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /*
     * jobSig is read on the error paths below (jobId in log messages) even
     * when decoding failed, so it must never hold indeterminate data.
     */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");

        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;

    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    /* Locate the job card for the requested job id. */
    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }
    if (found == FALSE) {
        reply = ERR_NO_JOB;
        /* jp points at the queue head here; never treat it as a job. */
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        /*
         * Take over the reasons sent with the request; the previous action
         * reasons are saved so they can be restored if the request does
         * not end with ERR_NO_ERROR.
         */
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    /* Post-job phase or finished job: report success without signalling. */
    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    /* No process group recorded for the job: mark it exited. */
    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Only terminate signals are acted upon before the job started. */
        if (isSigTerm(sigValue) == TRUE) {
            if ((cc = jobSigStart (jp, sigValue, jobSig.actFlags, jobSig.chkPeriod, NO_SIGLOG)) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;

            goto Reply;
        }

        reply = ERR_SIG_RETRY;

        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        /* A successful start while an action or migration is pending
         * clears the migration flag. */
        if ((cc = jobSigStart(jp,
                              sigValue,
                              jobSig.actFlags,
                              jobSig.chkPeriod,
                              NO_SIGLOG)) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if ((cc = jobSigStart(jp,
                          sigValue,
                          jobSig.actFlags,
                          jobSig.chkPeriod,
                          NO_SIGLOG)) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    sbdlog_newstatus(jp);

Reply1:

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR) {
        /* Every ERR_NO_ERROR path has a valid jp. */
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        if (reply != ERR_NO_JOB) {
            /* Undo the action reasons taken over for SUSP_MBD_LOCK. */
            if ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
                jp->actReasons = savedActReasons;
                jp->actSubReasons = savedActSubReasons;
            }
        }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        /*
         * jp is NULL on the ERR_BAD_REQ and ERR_NO_JOB paths, so the job id
         * is taken from the request; for a found job it equals
         * jp->jobSpecs.jobId.
         */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSig.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
                                         "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"), fname, XDR_GETPOS(&xdrs2), lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }
    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);

    return;
}