예제 #1
0
static int
shouldStop (struct hostLoad *loadV,
	    struct jobCard *jobCard, int *reasons, int *subreasons, int num, int *stopmore)
{
    static char fname[] = "shouldStop";
    int i, numLoad = -1, j;
    struct hostLoad *load = NULL;
    static struct tclHostData tclHostData;
    static int first = TRUE;

    *reasons = 0;
    *subreasons = 0;


    if( jobCard->postJobStarted ) {
        return false;
    }


    if (jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)
	return false;


    if (now - jobCard->windWarnTime < sbdSleepTime)
        return FALSE;


    if (!JOB_STARTED(jobCard))
        return FALSE;


    if (LS_ISUNAVAIL(loadV->status))
	return FALSE;
    if (num <= 0)
	return FALSE;


    for (i = 0; i <jobCard->jobSpecs.numToHosts && (*reasons) == 0; i++) {
        if (i > 0 && !strcmp (jobCard->jobSpecs.toHosts[i],
					     jobCard->jobSpecs.toHosts[i-1]))
            continue;
        numLoad++;
	load = NULL;
        for (j = 0; j < num; j ++) {
    	    if (equalHost_(jobCard->jobSpecs.toHosts[i], loadV[j].hostName)) {
	        load = &(loadV[j]);
	        break;
            }
        }
        if (load == NULL) {
	    ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5705,
		"%s: Can not find load information for host <%s>"), fname, jobCard->jobSpecs.toHosts[i]); /* catgets 5705 */
            return FALSE;
        }
        if (LS_ISLOCKEDU(load->status)
            && !(jobCard->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE)) {
            *reasons = SUSP_HOST_LOCK;
            *stopmore = TRUE;
        }
	else if (LS_ISLOCKEDM(load->status)) {
            *reasons = SUSP_HOST_LOCK_MASTER;
            *stopmore = TRUE;
        }
        else if (load->li[IT] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][IT]
            && load->li[IT] != -INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][IT] != -INFINIT_LOAD) {
	    *reasons |= SUSP_LOAD_REASON;
            *subreasons = IT;
            *stopmore = TRUE;
        }
        else if (load->li[LS] >=
			  jobCard->jobSpecs.thresholds.loadStop[numLoad][LS]
            && load->li[LS] != INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][LS]
						      != INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = LS;
            *stopmore = TRUE;
        }
        else if (load->li[UT] >=
			 jobCard->jobSpecs.thresholds.loadStop[numLoad][UT]
            && load->li[UT] != INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][UT] !=
							   INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = UT;
        }
        else if(load->li[PG] >=
		      jobCard->jobSpecs.thresholds.loadStop[numLoad][PG]
            && load->li[PG] != INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][PG]
						    != INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = PG;
        }
        else if(load->li[IO] >=
		     jobCard->jobSpecs.thresholds.loadStop[numLoad][IO]
            && load->li[IO] != INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][IO]
						      != INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = IO;
        }
        else if(load->li[MEM]
			 <= jobCard->jobSpecs.thresholds.loadStop[numLoad][MEM]
            && load->li[MEM] != -INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][MEM]
						      != -INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = MEM;
        }

        else if(load->li[SWP]
			 <= jobCard->jobSpecs.thresholds.loadStop[numLoad][SWP]
            && load->li[SWP] != -INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][SWP]
						      != -INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = SWP;
        }
        else if(load->li[TMP]
			 <= jobCard->jobSpecs.thresholds.loadStop[numLoad][TMP]
            && load->li[TMP] != -INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][TMP]
						      != -INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = TMP;
        }

        for (j = R15S; !(*reasons) && j <= R15M; j++)
	    if ((load->li[j] != INFINIT_LOAD)
	        && (jobCard->jobSpecs.thresholds.loadStop[numLoad][j]
							 != INFINIT_LOAD)
	        && (load->li[j]
			>= jobCard->jobSpecs.thresholds.loadStop[numLoad][j])) {
	        *reasons |= SUSP_LOAD_REASON;
                *subreasons = j;
                break;
	    }


        for (j = MEM + 1; !(*reasons) &&
               j < MIN(allLsInfo->numIndx, jobCard->jobSpecs.thresholds.nIdx);
	              j++) {
            if (load->li[j] >= INFINIT_LOAD || load->li[j] <= -INFINIT_LOAD
                || jobCard->jobSpecs.thresholds.loadStop[numLoad][j]
							 >= INFINIT_LOAD
                || jobCard->jobSpecs.thresholds.loadStop[numLoad][j]
							 <= -INFINIT_LOAD) {
                continue;
            }
	    if (allLsInfo->resTable[j].orderType == INCR) {
	        if (load->li[j]
		       >= jobCard->jobSpecs.thresholds.loadStop[numLoad][j]) {
		    *reasons |= SUSP_LOAD_REASON;
                    *subreasons = j;
		    break;
                }
	    } else {
	        if (load->li[j]
		      <= jobCard->jobSpecs.thresholds.loadStop[numLoad][j]) {
		    *reasons |= SUSP_LOAD_REASON;
                    *subreasons = j;
		    break;
                }
	    }
        }

        if (!(*reasons) && jobCard->stopCondVal != NULL) {
            int returnCode;
            if (first == TRUE) {
                initTclHostData (&tclHostData);
                returnCode = getTclHostData (load, &tclHostData, FALSE);
                first = FALSE;
            } else {
                returnCode = getTclHostData (load, &tclHostData, TRUE);
            }
            if (returnCode >= 0
		     && evalResReq (jobCard->stopCondVal->selectStr,
    	       	        &tclHostData, DFT_FROMTYPE) == 1) {
        	*reasons |= SUSP_QUE_STOP_COND;
		break;
            }
        }
    }


    if (! (*reasons))
	return FALSE;


    if (LS_ISLOCKEDU(load->status) || LS_ISLOCKEDM(load->status)) {
	return TRUE;
    } else if (shouldStop1 (load)) {
        if (logclass & (LC_SCHED | LC_EXEC))
            ls_syslog (LOG_DEBUG2,
			"%s: Should stop job %s; reason=%x, subreasons=%d",
                        fname, lsb_jobid2str(jobCard->jobSpecs.jobId),
			*reasons, *subreasons);

        return TRUE;
    }
    return FALSE;

}
예제 #2
0
/*
 * do_sigjob - handle a "signal job" request read from `xdrs` and write the
 * reply on channel `chfd`.
 *
 * xdrs    XDR stream holding the encoded struct jobSig request
 * chfd    channel descriptor the reply is written to
 * reqHdr  header of the incoming request, passed to xdr_jobSig()
 *
 * The request is decoded, the matching job is looked up in the job queue by
 * jobId, and depending on the job's state the signal is either delivered via
 * jobSigStart() or answered with a retry/no-op code.  The reply carries a
 * struct jobReply only when the reply code is ERR_NO_ERROR.
 *
 * Two exit labels are used: `Reply` first records the job status through
 * sbdlog_newstatus() and then falls into `Reply1`, which only builds and
 * sends the reply.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_sigjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSig      jobSig;
    sbdReplyType       reply;
    struct jobReply    jobReply;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct jobCard     *jp = NULL;
    char               found = FALSE;
    int                cc;
    int                sigValue;
    /* Only meaningful when SUSP_MBD_LOCK was requested; zeroed so they are
     * never read indeterminate. */
    int                savedActReasons = 0;
    int                savedActSubReasons = 0;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /* Zero the request so jobSig.jobId is well defined in the error logs
     * below even when decoding fails.
     * NOTE(review): assumes xdr_jobSig() does not rely on prior contents. */
    memset(&jobSig, 0, sizeof(struct jobSig));

    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");

        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;

    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    /* Find the job card whose id matches the request. */
    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }
    if (found == FALSE) {
        reply = ERR_NO_JOB;
        /* jp would otherwise be left pointing at the list head. */
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        /* Adopt the requested reasons, remembering the previous action
         * reasons so they can be restored if the reply is not a success. */
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }

    /* Nothing to signal once post-job processing has started. */
    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    /* Nothing to signal for a finished job either. */
    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    /* A process group id of -1 marks the job as exited. */
    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Not started yet: only terminating signals are acted upon now,
         * everything else is answered with a retry. */
        if (isSigTerm(sigValue) == TRUE) {
            if ((cc = jobSigStart (jp, sigValue, jobSig.actFlags, jobSig.chkPeriod, NO_SIGLOG)) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;

            goto Reply;
        }

        reply = ERR_SIG_RETRY;

        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {
        /* An action is in progress or the job is migrating: on success the
         * migration flag is cleared. */
        if ((cc = jobSigStart(jp,
                              sigValue,
                              jobSig.actFlags,
                              jobSig.chkPeriod,
                              NO_SIGLOG)) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    /* Ordinary case: deliver the signal. */
    if ((cc = jobSigStart(jp,
                          sigValue,
                          jobSig.actFlags,
                          jobSig.chkPeriod,
                          NO_SIGLOG)) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    sbdlog_newstatus(jp);

Reply1:

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR) {
        /* Every ERR_NO_ERROR path above is reached with a valid jp. */
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        /* Undo the action-reason overwrite done for SUSP_MBD_LOCK. */
        if (reply != ERR_NO_JOB)
            if  ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
                jp->actReasons = savedActReasons;
                jp->actSubReasons = savedActSubReasons;
            }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        /* jp is NULL on the bad-request and no-such-job paths, so the job id
         * is taken from the request; it equals jp's id whenever jp is set. */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSig.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
                                         "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"), fname, XDR_GETPOS(&xdrs2), lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }
    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);

    return;
}