Пример #1
0
/*
 * Fork a child that removes the buffer files of a job.
 *
 * In the parent this returns the value of fork(): the child's pid, or a
 * negative value (after logging) when the fork failed.
 *
 * The child never returns: it closes the batch socket, sets LS_EXEC_T to
 * "END" through putEnv(), runs postJobSetup() and rmJobBufFiles(), then
 * exits with 0. If postJobSetup() fails the child logs and exits with -1.
 */
static int
rmJobBufFilesPid(struct jobCard *jp)
{
    static char fname[] = "rmJobBufFilesPid()";
    int childPid = fork();

    if (childPid < 0) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "fork");
        return (childPid);
    }

    if (childPid != 0) {
        /* Parent: hand the child's pid back to the caller. */
        return (childPid);
    }

    /* Child from here on. */
    closeBatchSocket();
    putEnv(LS_EXEC_T, "END");

    if (postJobSetup(jp) >= 0) {
        rmJobBufFiles(jp);
        exit(0);
    }

    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
              lsb_jobid2str(jp->jobSpecs.jobId), "postSetupUser");
    exit(-1);
}
Пример #2
0
/*
 * Print a confirmation line on stdout for a job that has been signaled.
 *
 * signalValue selects the verb ("checkpointed", "stopped", ...); any value
 * not listed below is reported as "signaled". A SIGDEL request with a
 * non-zero global runCount prints the "deleted after running next N times"
 * message instead of the generic one.
 */
static
void prtSignaled (int signalValue, LS_LONG_INT jobId)
{
    char *verb;

    /* The verb is always looked up first, even when it ends up unused. */
    if (signalValue == SIGCHK) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,473, "checkpointed")); /* catgets  473  */
    } else if (signalValue == SIGSTOP) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,474, "stopped")); /* catgets  474  */
    } else if (signalValue == SIGCONT) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,475, "resumed")); /* catgets  475  */
    } else if (signalValue == SIGKILL || signalValue == SIGFORCE) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,476, "terminated")); /* catgets  476  */
    } else if (signalValue == SIGDEL) {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,477, "deleted")); /* catgets  477  */
    } else {
        verb = (_i18n_msg_get(ls_catd,NL_SETN,478, "signaled")); /* catgets  478  */
    }

    if (signalValue != SIGDEL || runCount == 0) {
        printf((_i18n_msg_get(ls_catd,NL_SETN,480, "Job <%s> is being %s\n")), lsb_jobid2str(jobId), verb); /* catgets  480  */
    } else {
        printf((_i18n_msg_get(ls_catd,NL_SETN,479, "Job <%s> will be deleted after running next %d times\n")),  /* catgets  479  */
               lsb_jobid2str(jobId), runCount);
    }
}
Пример #3
0
/*
 * XDR filter for the load-threshold tables of a jobSpecs.
 *
 * The wire layout is: nIdx, nThresholds, then for every threshold row j and
 * every index i the pair loadStop[j][i], loadSched[j][i].
 *
 * XDR_DECODE: the two tables are allocated here with my_calloc().
 * XDR_FREE:   the tables are released; nothing is read from the stream.
 * XDR_ENCODE: the caller supplies fully populated tables.
 *
 * Returns TRUE on success and FALSE on failure. After a failed decode the
 * tables may be partially allocated (or NULL); the XDR_FREE path copes with
 * that.
 */
static int
xdr_thresholds(XDR *xdrs, struct jobSpecs *jobSpecs)
{
    static char fname[] = "xdr_thresholds";
    int i, j;

    if (xdrs->x_op == XDR_DECODE) {
        jobSpecs->thresholds.loadSched = NULL;
        jobSpecs->thresholds.loadStop = NULL;
    }

    if (xdrs->x_op == XDR_FREE) {
        /*
         * Either table can still be NULL when an earlier decode bailed out
         * before allocating it, so never index through a NULL table.
         */
        for (i = 0; i < jobSpecs->thresholds.nThresholds; i++) {
            if (jobSpecs->thresholds.loadSched != NULL) {
                FREEUP(jobSpecs->thresholds.loadSched[i]);
            }
            if (jobSpecs->thresholds.loadStop != NULL) {
                FREEUP(jobSpecs->thresholds.loadStop[i]);
            }
        }
        FREEUP(jobSpecs->thresholds.loadSched);
        FREEUP(jobSpecs->thresholds.loadStop);
        return(TRUE);
    }

    if (!(xdr_int(xdrs, &jobSpecs->thresholds.nIdx) &&
          xdr_int(xdrs, &jobSpecs->thresholds.nThresholds))) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                  lsb_jobid2str(jobSpecs->jobId),
                  "xdr_int", "nIdx/nThresholds");
        return(FALSE);
    }

    if (xdrs->x_op == XDR_DECODE) {
        /* The counts come off the wire: refuse negative sizes. */
        if (jobSpecs->thresholds.nIdx < 0
            || jobSpecs->thresholds.nThresholds < 0) {
            ls_syslog(LOG_ERR, "%s: Job <%s>: invalid nIdx <%d> or nThresholds <%d>",
                      fname, lsb_jobid2str(jobSpecs->jobId),
                      jobSpecs->thresholds.nIdx,
                      jobSpecs->thresholds.nThresholds);
            return(FALSE);
        }

        jobSpecs->thresholds.loadSched = (float **)
            my_calloc (jobSpecs->thresholds.nThresholds, sizeof(float *), fname);
        jobSpecs->thresholds.loadStop = (float **)
            my_calloc (jobSpecs->thresholds.nThresholds, sizeof(float *), fname);

        /* A NULL table is only an error when rows are about to be stored. */
        if (jobSpecs->thresholds.nThresholds > 0
            && (jobSpecs->thresholds.loadSched == NULL
                || jobSpecs->thresholds.loadStop == NULL)) {
            return(FALSE);
        }

        for (i = 0; i < jobSpecs->thresholds.nThresholds; i++) {
            jobSpecs->thresholds.loadSched[i] = (float *)
                my_calloc (jobSpecs->thresholds.nIdx, sizeof(float), fname);
            jobSpecs->thresholds.loadStop[i] = (float *)
                my_calloc (jobSpecs->thresholds.nIdx, sizeof(float), fname);

            if (jobSpecs->thresholds.nIdx > 0
                && (jobSpecs->thresholds.loadSched[i] == NULL
                    || jobSpecs->thresholds.loadStop[i] == NULL)) {
                return(FALSE);
            }
        }
    }

    /* j walks the threshold rows, i the load indices within a row. */
    for (j = 0; j < jobSpecs->thresholds.nThresholds; j++) {
        for (i = 0; i < jobSpecs->thresholds.nIdx; i++) {
            if (!(xdr_float(xdrs, &jobSpecs->thresholds.loadStop[j][i]) &&
                  xdr_float(xdrs, &jobSpecs->thresholds.loadSched[j][i]))) {
                ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname,
                          lsb_jobid2str(jobSpecs->jobId),
                          "xdr_float", "loadStop/loadSched");
                return(FALSE);
            }
        }
    }
    return (TRUE);

}
Пример #4
0
/*
 * Try to resume a job that was suspended for suspReason.
 *
 * Returns -1 when the job carries SUSP_MBD_LOCK, when it is not suspended
 * for suspReason, or when both jobSigStart() and the follow-up jobsig()
 * probe fail (the job is then put into JOB_STAT_EXIT). Returns 0 when an
 * action is already in progress for the job (actPid set) or after the new
 * status has been logged with sbdlog_newstatus().
 */
static int
jobResumeAction (struct jobCard *jp, int sigValue, int suspReason)
{
    static char fname[] = "jobResumeAction";

    if ((jp->jobSpecs.reasons & SUSP_MBD_LOCK) != 0) {
        return -1;
    }

    if (jp->jobSpecs.actPid != 0) {
        return 0;
    }

    if ((jp->jobSpecs.reasons & suspReason) == 0) {
        return -1;
    }

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Try to resume job %s with the current reason %d and the triggered reason %d;",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.reasons, suspReason);
    }

    /* jobsig(jp, 0, ...) is only attempted when starting the action failed. */
    if (jobSigStart(jp, sigValue, 0, 0, SIGLOG) < 0
        && jobsig(jp, 0, FALSE) < 0) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        return -1;
    }

    sbdlog_newstatus(jp);
    return 0;
}
Пример #5
0
/*
 * Print the "submitted from host" part of a job's long listing.
 *
 * job     - job record to describe
 * prt_q   - when non-zero, also print the queue the job was submitted to
 * tFormat - when non-zero, include the job ID in the first line
 *
 * Each fragment is formatted into a local line buffer and handed to
 * prtLine(). All formatting is bounded by the buffer size.
 */
void
prtJobSubmit(struct jobInfoEnt *job, int prt_q, int tFormat)
{
    char prline[MAXLINELEN];
    char *timestr;

    timestr = putstr_(_i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->submitTime));
    /* timestr is released with FREEUP below; guard against a NULL copy. */
    if (tFormat) {
        snprintf(prline, sizeof(prline),
                 (_i18n_msg_get(ls_catd,NL_SETN,569, "%s: Job <%s> submitted from host <%s>")), /* catgets  569  */
                 timestr != NULL ? timestr : "",
                 lsb_jobid2str(job->jobId), job->fromHost);
    } else {
        snprintf(prline, sizeof(prline),
                 (_i18n_msg_get(ls_catd,NL_SETN,570, "%s: Submitted from host <%s>")), /* catgets  570  */
                 timestr != NULL ? timestr : "", job->fromHost);
    }

    FREEUP(timestr);
    prtLine(prline);

    if (job->submit.options2 & SUB2_HOLD) {
        /*
         * NOTE(review): message id 570 is also used above for a text that
         * contains %s conversions. The catalog text is therefore copied as
         * data rather than used as a format string, so a clashing catalog
         * entry cannot make the formatter read missing arguments.
         */
        snprintf(prline, sizeof(prline), "%s",
                 (_i18n_msg_get(ls_catd,NL_SETN,570, " with hold")));
        /* catgets  570  */
        prtLine(prline);
    }

    if (prt_q) {
        snprintf(prline, sizeof(prline),
                 (_i18n_msg_get(ls_catd,NL_SETN,571, ", to Queue <%s>")), job->submit.queue); /* catgets  571  */
        prtLine(prline);
    }

    TIMEIT(2, prtBTTime(job), "prtBTTime");

}
Пример #6
0
/*
 * Print the "submitted from host" part of a job's long listing.
 *
 * job     - job record to describe
 * prt_q   - when non-zero, also print the queue the job was submitted to
 * tFormat - when non-zero, include the job ID in the first line
 *
 * Each fragment is formatted into a local line buffer and handed to
 * prtLineWUF(). All formatting is bounded by the buffer sizes.
 */
void
prtJobSubmit(struct jobInfoEnt *job, int prt_q, int tFormat)
{
    char prline[MAXLINELEN];
    char timeBuf[128];
    const char *when;

    /*
     * timeBuf carries the words that precede "ubmitted" in the format
     * below, including the first letter of that word:
     * " Job <id> s" or " S".
     */
    if (tFormat) {
        snprintf(timeBuf, sizeof(timeBuf), " Job <%s> s", lsb_jobid2str(job->jobId));
    } else {
        snprintf(timeBuf, sizeof(timeBuf), " S");
    }

    /* ctime() may return NULL for a time it cannot represent. */
    when = ctime(&job->submitTime);
    if (when == NULL) {
        when = "";
    }

    /* "%submitted" is "%s" immediately followed by the text "ubmitted". */
    snprintf(prline, sizeof(prline), "%-12.19s:%submitted from host <%s>",
             when, timeBuf, job->fromHost);

    prtLineWUF(prline);

    if (job->submit.options2 & SUB2_HOLD) {
        snprintf(prline, sizeof(prline), " on hold");
        prtLineWUF(prline);
    }

    if (prt_q) {
        snprintf(prline, sizeof(prline), " to Queue <%s>", job->submit.queue);
        prtLineWUF(prline);
    }

    TIMEIT(2, prtBTTime(job), "prtBTTime");
}
Пример #7
0
/*
 * Resume a job by sending it SIGCONT through jobsig().
 *
 * Returns 0 without doing anything when an action is already in progress
 * for the job (actPid set), -1 when jobsig() fails, and 0 otherwise. After a
 * successful signal the job is put into JOB_STAT_RUN, its suspend reasons
 * are cleared and the new status is reported with status_job(); the
 * notReported counter is incremented when that report fails and reset to 0
 * when it succeeds.
 */
int
job_resume (struct jobCard *jp)
{
    static char fname[] = "job_resume";
    int reported;

    if (jp->jobSpecs.actPid != 0) {
        return 0;
    }

    if (jobsig(jp, SIGCONT, FALSE) < 0) {
        return -1;
    }

    SBD_SET_STATE(jp, JOB_STAT_RUN);

    jp->jobSpecs.reasons = 0;
    jp->jobSpecs.subreasons = 0;

    reported = status_job (BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                           ERR_NO_ERROR);
    if (reported >= 0) {
        if (jp->notReported > 0) {
            jp->notReported = 0;
        }
    } else {
        jp->notReported++;
    }

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Resume job %s",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId));
    }

    return 0;
}
Пример #8
0
/*
 * Clean up after a migrating job: unlock its hosts and, when the job has a
 * non-empty postCmd, fork a child that runs it through runQPost().
 *
 * Returns 0 when there is no post command to run; otherwise returns the
 * value of fork() in the parent (child pid, or a negative value after the
 * failure has been logged and reported through lsb_merr2()).
 *
 * The child never returns: it exits with 0 after runQPost(), or with -1
 * when postJobSetup() fails.
 */
static int
cleanupMigJob(struct jobCard *jp)
{
    static char fname[] = "cleanupMigJob()";
    int childPid;

    unlockHosts (jp, jp->jobSpecs.numToHosts);

    if (jp->jobSpecs.postCmd == NULL || jp->jobSpecs.postCmd[0] == '\0') {
        return 0;
    }

    childPid = fork();

    if (childPid < 0) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jp->jobSpecs.jobId), "fork");
        lsb_merr2(_i18n_msg_get(ls_catd , NL_SETN, 700,
            "Unable to fork a child to run the queue's post-exec command for job <%s>.  Please run <%s> manually if necessary.\n"), /* catgets 700 */
                  lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.postCmd);
        return (childPid);
    }

    if (childPid != 0) {
        /* Parent: hand the child's pid back to the caller. */
        return (childPid);
    }

    /* Child from here on. */
    closeBatchSocket();
    putEnv(LS_EXEC_T, "END");

    if (postJobSetup(jp) != -1) {
        runQPost(jp);
        exit(0);
    }

    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname,
              lsb_jobid2str(jp->jobSpecs.jobId), "postJobSetup");
    lsb_merr2(_i18n_msg_get(ls_catd , NL_SETN, 701,
        "Unable to setup the environment for job <%s> to run the queue's post exec.  Please run <%s> manually if necessary.\n"), /* catgets 701 */
              lsb_jobid2str(jp->jobSpecs.jobId),
              jp->jobSpecs.postCmd);
    exit(-1);
}
Пример #9
0
/*
 * Apply the requested action (global sigValue) to each of the given jobs.
 *
 * SIGCHK, SIGDEL and SIGFORCE are routed to their dedicated library calls;
 * every other value goes through lsb_signaljob(). A checkpoint request that
 * fails with LSBE_NOT_STARTED while a new checkpoint period was supplied is
 * still counted as a success and reported as a period change.
 *
 * Returns TRUE only when at least one job was handled and none failed.
 */
static int
signalJobs (LS_LONG_INT *jobIds, int numJobs)
{
    int anyFailed = FALSE;
    int anyDone = FALSE;
    int idx;
    int rc;
    char msg[80];

    for (idx = 0; idx < numJobs; idx++) {
        switch (sigValue) {
        case SIGCHK:
            rc = lsb_chkpntjob(jobIds[idx], chkPeriod, chkOptions);
            break;
        case SIGDEL:
            rc = lsb_deletejob(jobIds[idx], runCount, 0);
            break;
        case SIGFORCE:
            rc = lsb_forcekilljob(jobIds[idx]);
            break;
        default:
            rc = lsb_signaljob(jobIds[idx], sigValue);
            break;
        }

        if (rc >= 0) {
            anyDone = TRUE;
            prtSignaled (sigValue, jobIds[idx]);
            continue;
        }

        if (sigValue == SIGCHK && lsberrno == LSBE_NOT_STARTED &&
            chkPeriod != LSB_CHKPERIOD_NOCHNG) {
            /* Job not started yet, but the period change itself took effect. */
            if (chkPeriod) {
                printf((_i18n_msg_get(ls_catd,NL_SETN,470, "Job <%s>: Checkpoint period is now %d min.\n")), /* catgets  470  */
                       lsb_jobid2str(jobIds[idx]),
                       (int) (chkPeriod / 60));
            } else {
                printf((_i18n_msg_get(ls_catd,NL_SETN,471, "Job <%s>: Periodic checkpointing is disabled\n")), /* catgets  471  */
                       lsb_jobid2str(jobIds[idx]));
            }
            anyDone = TRUE;
        } else {
            anyFailed = TRUE;
            sprintf (msg, "%s <%s>", I18N_Job, lsb_jobid2str(jobIds[idx]));
            lsb_perror (msg);
        }
    }

    return (anyDone && !anyFailed);
}
Пример #10
0
/*
 * bpost: post a message to a job.
 *
 * Options:
 *   -V        print the version string to stderr and return -1
 *   -h        print usage and return -1
 *   -d msg    the message text to post (required)
 *
 * The message is posted to the first job ID produced by getJobIds().
 * Returns 0 on success and -1 on any failure.
 */
int
main(int argc, char **argv)
{
    int cc;
    char *msg = NULL;           /* stays NULL unless -d is given */
    LS_LONG_INT *jobIDs = NULL;

    if (lsb_init(argv[0]) < 0) {
        lsb_perror("lsb_init");
        return -1;
    }

    while ((cc = getopt(argc, argv, "Vhd:")) != EOF) {
        switch (cc) {
            case 'V':
                fputs(_LS_VERSION_, stderr);
                return -1;
            case 'h':
                usage();
                return -1;
            case 'd':
                msg = optarg;
                break;
            default:
                usage();
                return -1;
        }
    }

    /* Without -d there is nothing to post; strlen(NULL) must be avoided. */
    if (msg == NULL) {
        usage();
        return -1;
    }

    if (strlen(msg) > LSB_MAX_MSGSIZE) {
        fprintf(stderr, "bpost: message bigger than %d\n", LSB_MAX_MSGSIZE);
        return -1;
    }

    getJobIds(argc,
              argv,
              NULL,
              NULL,
              NULL,
              NULL,
              &jobIDs,
              0);

    /* Do not index the list if getJobIds() left it unset. */
    if (jobIDs == NULL) {
        fprintf(stderr, "bpost: no job ID specified\n");
        return -1;
    }

    cc = lsb_postjobmsg(jobIDs[0], msg);
    if (cc < 0) {
        lsb_perror("lsb_jobmsg()");
        return -1;
    }

    printf("Message to job %s posted all right.\n", lsb_jobid2str(jobIDs[0]));

    return 0;
}
Пример #11
0
/*
 * Start the suspend action for a job.
 *
 * The requested reasons are always recorded in actReasons/actSubReasons.
 * Nothing else happens when the job is not running, when its post-job
 * processing has started, when it is already suspended for one of the
 * requested reasons (or no signal is mapped for sigValue), or when the same
 * action is already in progress. Otherwise the mapped signal is started
 * through jobSigStart() and the new status is logged.
 *
 * sigMap is indexed with -sigValue, so sigValue is presumably a
 * non-positive action code -- TODO confirm against the callers.
 */
void
jobSuspendAction(struct jobCard *jp,
                 int sigValue,
                 int suspReasons,
                 int suspSubReasons)
{
    static char fname[] = "jobSuspendAction";
    int actionSig;

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Suspend job %s; reasons=%x, subresons=%d, sigValue=%d, status=%x",
                  fname, lsb_jobid2str(jp->jobSpecs.jobId),
                  jp->jobSpecs.reasons,
                  jp->jobSpecs.subreasons, sigValue, jp->jobSpecs.jStatus);
    }

    jp->actReasons = suspReasons;
    jp->actSubReasons = suspSubReasons;

    if (!JOB_RUNNING(jp)) {
        return;
    }

    if (jp->postJobStarted) {
        return;
    }

    if (IS_SUSP (jp->jobSpecs.jStatus)) {
        /* Already suspended for this reason, or nothing mapped to send. */
        if (jp->jobSpecs.reasons & suspReasons) {
            return;
        }
        if (jp->jobSpecs.sigMap[-sigValue] == 0) {
            return;
        }
    }

    if (jp->jobSpecs.actPid) {
        /* The very same action is already under way. */
        if (jp->jobSpecs.actValue == sigValue) {
            return;
        }
        if (jp->jobSpecs.actValue == (sigValue + jp->jobSpecs.sigMap[-sigValue])) {
            return;
        }
    }

    actionSig = sigValue + jp->jobSpecs.sigMap[-(sigValue)];

    if (logclass & (LC_TRACE | LC_SCHED | LC_EXEC)) {
        ls_syslog(LOG_DEBUG1, "%s: Call jobSigStart(sigValue =%d) to suspend job", fname, actionSig);
    }

    jobSigStart(jp, actionSig, 0, 0, SIGLOG);

    sbdlog_newstatus(jp);
}
Пример #12
0
/*
 * bmodify: change the submission parameters of an existing job.
 *
 * The request is built from the command line by fillReq(); the job to
 * modify is named by req.command and resolved with getJobIdList(). Only the
 * first job ID of the resulting list is modified.
 *
 * Returns 0 on success; exits with -1 on any failure.
 */
int
main(int argc, char **argv)
{
    struct submit req;
    struct submitReply  reply;
    char *job;
    LS_LONG_INT jobId = -1, *jobIdList = NULL;
    int numJobIds;
    time_t beginTime, terminTime;

    if (lsb_init(argv[0]) < 0) {
        sub_perror("lsb_init");
        fprintf(stderr, ". Job not modified.\n");
        exit (-1);
    }

    if (fillReq (argc, argv, CMD_BMODIFY, &req) < 0) {
        fprintf(stderr, ". Job not modified.\n");
        exit (-1);
    }

    job = req.command;
    beginTime = req.beginTime;
    terminTime = req.termTime;

    if ((numJobIds = getJobIdList(job, &jobIdList)) < 0) {
        exit(-1);
    }

    /* An empty result must not be indexed below. */
    if (numJobIds == 0 || jobIdList == NULL) {
        fprintf(stderr, "No job ID specified. Job not modified.\n");
        exit(-1);
    }

    jobId = jobIdList[0];

    if ((jobId = lsb_modify(&req, &reply, jobId)) < 0) {
        if (lsberrno == LSBE_JOB_ARRAY) {
            fprintf(stderr, "Options -q and -O cannot be applied on job array");
        } else {
            prtErrMsg (&req, &reply);
        }
        /* Completes the sentence started by the message printed above. */
        fprintf(stderr, ". Job not modified.\n");
        if (req.nxf)
            free(req.xf);
        exit (-1);
    }

    printf("Parameters of job <%s> are being changed\n", lsb_jobid2str(jobId));
    if (beginTime > 0 || terminTime > 0)
        prtBETime_(&req);
    if (req.nxf)
        free(req.xf);

    return 0;
}
Пример #13
0
/*
 * Mail an error report about a job that could not be run.
 *
 * jobPtr - the job the error concerns
 * msg    - error text appended to the report
 *
 * The report goes to the job's mailUser when SUB_MAIL_USER is set in the
 * job options, otherwise to the job's userName. The text is truncated to
 * the size of the local buffer instead of overflowing it when the command
 * line or the error text is long.
 */
void
shout_err (struct jobCard *jobPtr, char *msg)
{
     char buf[MSGSIZE];

     snprintf(buf, sizeof(buf),
              "We are unable to run your job %s:<%s>. The error is:\n%s.",
              lsb_jobid2str(jobPtr->jobSpecs.jobId),
              jobPtr->jobSpecs.command, msg);

     if (jobPtr->jobSpecs.options & SUB_MAIL_USER) {
         merr_user(jobPtr->jobSpecs.mailUser, jobPtr->jobSpecs.fromHost,
                   buf, I18N_error);
     } else {
         merr_user(jobPtr->jobSpecs.userName, jobPtr->jobSpecs.fromHost,
                   buf, I18N_error);
     }
}
Пример #14
0
/*
 * Finish a suspend action for a job.
 *
 * jobCard  - the job whose suspend action ended
 * w_status - status of the action; 0 means it succeeded
 *
 * Merges the pending action reasons into the job's suspend reasons (except
 * SUSP_SBD_STARTUP), moves the job to USUSP for user-initiated actions
 * (SIG_SUSP_USER / SIG_TERM_USER) and to SSUSP otherwise, and records the
 * action outcome in actStatus. When jobSigLog() succeeds the action
 * value and pid are cleared.
 */
void
suspendActEnd (struct jobCard *jobCard, int w_status)
{
    int sbdStartStop = 0;

    /* Log the sub-reasons in the "subresons" field, not the reasons twice. */
    if (logclass & (LC_TRACE | LC_SIGNAL))
        ls_syslog(LOG_DEBUG, "suspendActEnd: Suspend job %s; reasons=%x, subresons=%d",
                  lsb_jobid2str(jobCard->jobSpecs.jobId),
                  jobCard->actReasons,
                  jobCard->actSubReasons);

    sbdStartStop = (jobCard->actReasons & SUSP_SBD_STARTUP);

    jobCard->jobSpecs.lastSSuspTime = now;
    jobCard->jobSpecs.reasons |= jobCard->actReasons & (~SUSP_SBD_STARTUP);
    jobCard->jobSpecs.subreasons = jobCard->actSubReasons;

    if ((jobCard->jobSpecs.actValue == SIG_SUSP_USER) ||
        (jobCard->jobSpecs.actValue == SIG_TERM_USER))
        SET_STATE(jobCard->jobSpecs.jStatus, JOB_STAT_USUSP);
    else
        SET_STATE(jobCard->jobSpecs.jStatus, JOB_STAT_SSUSP);

    if (w_status == 0)
        jobCard->actStatus = ACT_DONE;
    else
        jobCard->actStatus = ACT_FAIL;

    /* A suspend carrying SUSP_SBD_STARTUP is not reported as an action. */
    if (sbdStartStop)
        jobCard->actStatus = ACT_NO;

    if (jobSigLog(jobCard, w_status) == 0) {
        jobCard->jobSpecs.actValue = SIG_NULL;
        jobCard->jobSpecs.actPid = 0;
    }

}
Пример #15
0
/*
 * Print the header of a job's long listing: job ID and name, user, project,
 * user group, mail address, status/queue, interactive mode, priority,
 * command and job group.
 *
 * job     - job record to describe
 * prt_q   - when non-zero, print the job status and queue
 * tFormat - when non-zero, skip the job ID / job name part and start the
 *           line with a comma instead
 *
 * Side effect kept from the original: for an array element whose name
 * contains '[', job->submit.jobName is rewritten in place so that the array
 * specification is replaced by this element's index.
 */
void
prtHeader(struct jobInfoEnt *job, int prt_q, int tFormat)
{
    char prline[MAXLINELEN];

    if (!tFormat) {
        snprintf(prline, sizeof(prline), "\nJob%s <%s>,",
                 uf_format?"":" Id", lsb_jobid2str(job->jobId));

        prtLineWUF(prline);
        if (job->submit.options & SUB_JOB_NAME) {
            char *jobName, *pos;
            jobName = job->submit.jobName;
            if ((pos = strchr(jobName, '[')) && LSB_ARRAY_IDX(job->jobId)) {
                /*
                 * Write "[index]" directly over the array specification.
                 * Formatting jobName into itself with "%s[%d]" would pass
                 * overlapping source and destination to sprintf. The write
                 * is limited to the bytes the name already occupies.
                 */
                size_t room = strlen(pos) + 1;

                snprintf(pos, room, "[%d]", LSB_ARRAY_IDX(job->jobId));
            }
            snprintf(prline, sizeof(prline), " Job Name <%s>,", jobName);
            prtLineWUF(prline);
        }
    } else {
        snprintf(prline, sizeof(prline), ",");
        prtLine(prline);
    }

    snprintf(prline, sizeof(prline), " User <%s>,", job->user);
    prtLineWUF(prline);

    if (lsbMode_ & LSB_MODE_BATCH) {
        snprintf(prline, sizeof(prline), " Project <%s>,", job->submit.projectName);
        prtLineWUF(prline);
    }

    if (job->submit.userGroup && job->submit.userGroup[0] != '\0') {
        snprintf(prline, sizeof(prline), " User Group <%s>,", job->submit.userGroup);
        prtLineWUF(prline);
    }

    if (job->submit.options & SUB_MAIL_USER) {
        snprintf(prline, sizeof(prline), " Mail <%s>,", job->submit.mailUser);
        prtLineWUF(prline);
    }

    if (prt_q) {
        snprintf(prline, sizeof(prline), " Status <%s>, Queue <%s>,",
                 get_status(job),
                 job->submit.queue);
        prtLineWUF(prline);
    }

    /* Interactive job */
    if (job->submit.options & SUB_INTERACTIVE) {
        int usesPty = (job->submit.options & SUB_PTY) != 0;
        int ptyShell = usesPty && (job->submit.options & (SUB_PTY_SHELL)) != 0;

        snprintf(prline, sizeof(prline), " Interactive%s%s mode,",
                 usesPty ? " pseudo-terminal" : "",
                 ptyShell ? " shell" : "");
        prtLineWUF(prline);
    }

    if (job->jobPriority > 0) {
        snprintf(prline, sizeof(prline), " Job Priority <%d>,", job->jobPriority);
        prtLineWUF(prline);
    }

    /* The command text does not depend on tFormat. */
    if (job->submit.options2 & (SUB2_JOB_CMD_SPOOL)) {
        snprintf(prline, sizeof(prline), " Command(Spooled) <%s>", job->submit.command);
    } else {
        snprintf(prline, sizeof(prline), " Command <%s>", job->submit.command);
    }
    prtLineWUF(prline);

    if (job->submit.options2 & SUB2_JOB_GROUP) {
        snprintf(prline, sizeof(prline), ", Job Group <%s>", job->submit.job_group);
        prtLineWUF(prline);
    }

    snprintf(prline, sizeof(prline), "\n");
    prtLineWUF(prline);
}
Пример #16
0
/*
 * Collect every child that wait3(WNOHANG) reports and record how it ended.
 *
 * Presumably installed as the SIGCHLD handler -- TODO confirm at the place
 * where it is registered.
 *
 * For the mbatchd child (pid == mbdPid) the termination signal or exit
 * value is stored in mbdExitVal, consecutive identical values are counted
 * in mbdExitCnt, and the master is restarted when it exited with
 * MASTER_RECONFIG.
 *
 * For any other child the job queue is scanned: a job whose exitPid matches
 * gets its wait status recorded; a job whose jobPid matches additionally
 * gets its resource usage and CPU time recorded, is marked as collected,
 * and need_checkfinish is set.
 */
void
child_handler(int sig)
{
    int             pid;
    LS_WAIT_T       status;
    struct rusage   rusage;
    register float  cpuTime;
    struct lsfRusage lsfRusage;
    struct jobCard *jobCard;
    /* Exit value seen for the previous mbatchd termination. */
    static short lastMbdExitVal = MASTER_NULL;
    /* Lazily read from LSB_SBD_FINISH_SLEEP; negative means "not read yet". */
    static int sbd_finish_sleep = -1;

    cleanRusage (&rusage);
    now = time(0);
    /* Non-blocking: loop until no further child is reported. */
    while ((pid=wait3(&status, WNOHANG, &rusage)) > 0) {
        if (pid == mbdPid) {
            /* NOTE(review): this local shadows the handler's sig parameter. */
            int sig = WTERMSIG(status);
            /* Keep the repeat counter bounded. */
            if (mbdExitCnt > 150)
                mbdExitCnt = 150;
            mbdExitVal = WIFSIGNALED(status);
            if (mbdExitVal) {
                /* mbatchd was killed by a signal. */
                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5600,
                    "mbatchd died with signal <%d> termination"), /* catgets 5600 */
                    sig);
                if (WCOREDUMP(status))
                    ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5601,
                        "mbatchd core dumped")); /* catgets 5601 */
                mbdExitVal = sig;
                /* Count how often the same value repeats in a row. */
                if (mbdExitVal == lastMbdExitVal)
                    mbdExitCnt++;
                else {
                    mbdExitCnt = 0;
                    lastMbdExitVal = mbdExitVal;
                }
                continue;
            } else {
                /* mbatchd exited on its own. */
                mbdExitVal = WEXITSTATUS(status);

                if (mbdExitVal == lastMbdExitVal)
                    mbdExitCnt++;
                else {
                    mbdExitCnt = 0;
                    lastMbdExitVal = mbdExitVal;
                }
                if (mbdExitVal == MASTER_RECONFIG) {
                    ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5602,
                        "mbatchd resigned for reconfiguration")); /* catgets 5602 */
                    start_master();
                } else
                    ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5603,
                        "mbatchd exited with value <%d>"),  /* catgets 5603 */
                        mbdExitVal);
                continue;
            }
        }

        /* Any other child: convert its usage and look for the owning job. */
        ls_ruunix2lsf (&rusage, &lsfRusage);
        cpuTime = lsfRusage.ru_utime + lsfRusage.ru_stime;

        for (jobCard = jobQueHead->forw; (jobCard != jobQueHead);
             jobCard = jobCard->forw) {

            /* An exitPid match records the status but keeps scanning. */
            if (jobCard->exitPid == pid) {
                jobCard->w_status = LS_STATUS(status);
                jobCard->exitPid = -1;
                if (logclass & LC_EXEC) {
                    ls_syslog(LOG_DEBUG, I18N(5604,
                              "child_handler: Job <%s> exitPid <%d> status <%d> exitcode <%d>"),/*catgets 5604*/
                              lsb_jobid2str(jobCard->jobSpecs.jobId),
                              pid, jobCard->w_status,
                              WEXITSTATUS(status));
                }
            }

            /* A jobPid match: the job process itself has ended. */
            if (jobCard->jobSpecs.jobPid == pid) {
                jobCard->collectedChild = TRUE;
                jobCard->cpuTime = cpuTime;
                jobCard->w_status = LS_STATUS(status);
                jobCard->exitPid = -1;
                memcpy ((char *) &jobCard->lsfRusage, (char *) &lsfRusage,
                        sizeof (struct lsfRusage));
                jobCard->notReported++;



                /* Read the optional finish delay once, on first use. */
                if (sbd_finish_sleep < 0) {
                    if (daemonParams[LSB_SBD_FINISH_SLEEP].paramValue) {
                        /*
                         * NOTE(review): atoi() is not specified to report
                         * errors through errno, so this check likely never
                         * fires -- confirm whether strtol() was intended.
                         */
                        errno = 0;
                        sbd_finish_sleep = atoi(daemonParams[LSB_SBD_FINISH_SLEEP].paramValue);
                        if (errno)
                            sbd_finish_sleep = 0;
                    } else {
                        sbd_finish_sleep=0;
                    }
                }
                /* Unit is whatever millisleep_() expects -- presumably ms. */
                if (sbd_finish_sleep > 0) {
                    millisleep_(sbd_finish_sleep);
                }

                if (logclass & LC_EXEC) {
                    ls_syslog(LOG_DEBUG, I18N(5605,
                              "child_handler: Job <%s> Pid <%d> status <%d> exitcode <%d>"), /*catgets 5605*/
                              lsb_jobid2str(jobCard->jobSpecs.jobId), pid,
                              jobCard->w_status, WEXITSTATUS(status));
                }
                need_checkfinish = TRUE;

                /* Stop scanning once the owning job has been found. */
                break;
            }
        }
    }


}
Пример #17
0
/*
 * bpeek: display the output produced so far by a job.
 *
 * Options:
 *   -q queue / -m host / -J jobName   select the job; oneOf() is called when
 *                                     more than one of them is given
 *   -f                                passed on to displayOutput() as fflag
 *   -V                                print the version to stderr, exit 0
 *   -h                                print usage
 *
 * A job ID may be given as a positional argument instead of -q/-m/-J.
 * Without one, the lookup is done with the LAST_JOB option.
 *
 * Exits with 0 after displaying the output and with -1 on any failure.
 */
int
main (int argc, char **argv, char **environ)
{
  char *queue = NULL, *host = NULL, *jobName = NULL, *user = NULL;
  LS_LONG_INT jobId;
  int options;
  struct jobInfoEnt *jInfo;
  char *outFile;
  char fflag = FALSE;
  int cc;
  int rc;

  /* NOTE(review): rc is assigned here but never read in this function. */
  rc = _i18n_init (I18N_CAT_MIN);

  if (lsb_init (argv[0]) < 0)
    {
      lsb_perror ("lsb_init");
      exit (-1);
    }

  while ((cc = getopt (argc, argv, "Vhfq:m:J:")) != EOF)
    {
      switch (cc)
	{
	case 'q':
	  /* -q, -m and -J exclude each other. */
	  if (queue || host || jobName)
	    oneOf (argv[0]);
	  queue = optarg;
	  break;
	case 'm':
	  if (queue || host || jobName)
	    oneOf (argv[0]);
	  host = optarg;
	  break;
	case 'J':
	  if (queue || host || jobName)
	    oneOf (argv[0]);
	  jobName = optarg;
	  break;
	case 'V':
	  fputs (_LS_VERSION_, stderr);
	  exit (0);
	case 'f':
	  fflag = TRUE;
	  break;
	case 'h':
	default:
	  usage (argv[0]);
	}
    }

  /* Default: look up the last job; a positional job ID overrides this. */
  jobId = 0;
  options = LAST_JOB;
  if (argc >= optind + 1)
    {
      /* A job ID cannot be combined with -q/-m/-J. */
      if (queue || host || jobName)
	{
	  oneOf (argv[0]);
	}
      /* Only one positional argument is accepted (plus -f, if given). */
      else if ((argc > 2 && !fflag) || (argc > 3 && fflag))
	usage (argv[0]);

      if (getOneJobId (argv[optind], &jobId, 0))
	{
	  usage (argv[0]);
	}

      options = 0;
    }



  /* First attempt: look the job up with a NULL user argument. */
  if (lsb_openjobinfo (jobId, jobName, NULL, queue, host, options) < 0
      || (jInfo = lsb_readjobinfo (NULL)) == NULL)
    {

      /* Retry across all users when the job was named explicitly. */
      if (jobId != 0 || jobName != NULL)
	{
	  user = ALL_USERS;
	  if (lsb_openjobinfo (jobId, jobName, user, queue, host, options) < 0
	      || (jInfo = lsb_readjobinfo (NULL)) == NULL)
	    {
	      jobInfoErr (jobId, jobName, NULL, queue, host, options);
	      exit (-1);
	    }
	}
      else
	{
	  jobInfoErr (jobId, jobName, NULL, queue, host, options);
	  exit (-1);
	}
    }
  lsb_closejobinfo ();


  /* A different job ID came back than the one asked for: report it as a
     job array error. */
  if (jobId && jInfo->jobId != jobId)
    {
      lsberrno = LSBE_JOB_ARRAY;
      lsb_perror ("bpeek");
      exit (-1);
    }


  /* Interactive jobs without an output or error file cannot be peeked. */
  if ((jInfo->submit.options & SUB_INTERACTIVE) &&
      !(jInfo->submit.options & (SUB_OUT_FILE | SUB_ERR_FILE)))
    {
      fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2456, "Job <%s> : Cannot bpeek an interactive job.\n"),	/* catgets  2456 */
	       lsb_jobid2str (jInfo->jobId));
      exit (-1);
    }

  /* The job must have started ... */
  if (IS_PEND (jInfo->status) || jInfo->execUsername[0] == '\0')
    {
      fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2454, "Job <%s> : Not yet started.\n"),	/* catgets  2454 */
	       lsb_jobid2str (jInfo->jobId));

      exit (-1);
    }
  /* ... and must not have finished yet. */
  if (IS_FINISH (jInfo->status))
    {
      fprintf (stderr, _i18n_msg_get (ls_catd, NL_SETN, 2455, "Job <%s> : Already finished.\n"),	/* catgets  2455  */
	       lsb_jobid2str (jInfo->jobId));
      exit (-1);
    }

  if ((outFile = lsb_peekjob (jInfo->jobId)) == NULL)
    {
      /* NOTE(review): fixed 50-byte buffer filled with an unbounded
         sprintf; confirm the longest I18N_Job text and job ID string fit. */
      char msg[50];
      sprintf (msg, "%s <%s>", I18N_Job, lsb_jobid2str (jInfo->jobId));
      lsb_perror (msg);
      exit (-1);
    }
  displayOutput (outFile, jInfo, fflag, environ);
  _i18n_end (ls_catd);
  exit (0);

}
Пример #18
0
/*
 * Shared implementation of the job-move commands: move a pending job to a
 * position counted from the top or the bottom of its queue.
 *
 * argc, argv - command line: [-h] [-V] jobId [position]
 * opCode     - passed to lsb_movejob(); TO_TOP selects the "from top"
 *              wording of the result message, anything else "from bottom"
 *
 * The position defaults to 1. The function always ends the process: exit 0
 * on success, exit -1 when the library calls fail.
 */
void
bmove (int argc, char **argv, int opCode)
{
    LS_LONG_INT jobId = 0;
    int wantedPos;
    int finalPos;
    int opt;

    if (lsb_init(argv[0]) < 0) {
        lsb_perror("lsb_init");
        exit(-1);
    }

    opterr = 0;
    while ((opt = getopt(argc, argv, "hV")) != EOF) {
        if (opt == 'V') {
            fputs(_LS_VERSION_, stderr);
            exit(0);
        }
        /* -h and every unknown option end up here. */
        usage(argv[0]);
    }

    if (optind == argc) {
        fprintf(stderr, "%s.\n",
                (_i18n_msg_get(ls_catd,NL_SETN,852, "Job ID must be specified"))); /* catgets  852  */
        usage(argv[0]);
    }
    if (argc - optind > 2) {
        fprintf(stderr, "%s.\n",
                (_i18n_msg_get(ls_catd,NL_SETN,853, "Command syntax error: too many arguments"))); /* catgets  853  */
        usage(argv[0]);
    }

    if (getOneJobId (argv[optind], &jobId, 0)) {
        usage(argv[0]);
    }

    wantedPos = 1;
    if (optind == argc - 2) {
        /* The argument after the job ID is the requested position. */
        optind++;
        if (!isint_(argv[optind]) || atoi(argv[optind]) <= 0) {
            fprintf(stderr, "%s: %s.\n", argv[optind],
                    I18N(854, "Position value must be a positive integer")); /* catgets854*/
            usage(argv[0]);
        }
        wantedPos = atoi(argv[optind]);
    }

    /* lsb_movejob() updates the position through the pointer it is given. */
    finalPos = wantedPos;
    if (lsb_movejob(jobId, &finalPos, opCode) <0) {
        lsb_perror(lsb_jobid2str (jobId));
        exit(-1);
    }

    if (finalPos != wantedPos) {
        fprintf(stderr, (_i18n_msg_get(ls_catd,NL_SETN,855, "Warning: position value <%d> is beyond movable range.\n")), /* catgets  855  */
                wantedPos);
    }

    if (opCode != TO_TOP) {
        fprintf(stderr, (_i18n_msg_get(ls_catd,NL_SETN,857, "Job <%s> has been moved to position %d from bottom.\n")), /* catgets  857  */
                lsb_jobid2str (jobId), finalPos);
    } else {
        fprintf(stderr, (_i18n_msg_get(ls_catd,NL_SETN,856, "Job <%s> has been moved to position %d from top.\n")),  /* catgets  856  */
                lsb_jobid2str (jobId), finalPos);
    }

    exit(0);
}
Пример #19
0
/*
 * Handle a "signal job" request received on channel chfd.
 *
 * xdrs   - stream holding the encoded jobSig request
 * chfd   - channel on which the reply is written
 * reqHdr - header of the request, passed to xdr_jobSig()
 *
 * The request is decoded, the job is looked up in the job queue and, if its
 * state allows it, the signal action is started through jobSigStart(). A
 * jobReply describing the job is sent back when the reply code is
 * ERR_NO_ERROR; otherwise only the header carrying the error code is sent.
 *
 * Labels: "Reply" logs the new job status before replying, "Reply1" replies
 * without logging. jp is NULL at "Reply1" when the request could not be
 * decoded or the job was not found, so nothing past that label may
 * dereference jp unconditionally.
 */
void
do_sigjob(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char        fname[] = "do_sigjob()";
    char               reply_buf[MSGSIZE];
    XDR                xdrs2;
    struct jobSig      jobSig;
    sbdReplyType       reply;
    struct jobReply    jobReply;
    struct LSFHeader   replyHdr;
    char               *replyStruct;
    struct jobCard     *jp = NULL;
    char               found = FALSE;
    int                cc;
    int                sigValue;
    int                savedActReasons = 0;
    int                savedActSubReasons = 0;
    struct lsfAuth     *auth = NULL;

    memset(&jobReply, 0, sizeof(struct jobReply));
    /* jobSig is read on the reply path even when decoding fails. */
    memset(&jobSig, 0, sizeof(struct jobSig));
    if (!xdr_jobSig(xdrs, &jobSig, reqHdr)) {
        reply = ERR_BAD_REQ;
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSig");

        goto Reply1;
    }

    jobSig.sigValue = sig_decode(jobSig.sigValue);
    sigValue = jobSig.sigValue;

    if (logclass & LC_SIGNAL)
        ls_syslog(LOG_DEBUG, "do_sigJob: sigValue =%d", sigValue);

    for (jp = jobQueHead->forw; (jp != jobQueHead); jp = jp->forw) {
        if (jp->jobSpecs.jobId != jobSig.jobId)
            continue;
        found = TRUE;
        break;
    }
    if (found == FALSE) {
        reply = ERR_NO_JOB;
        jp = NULL;
        goto Reply1;
    }

    if (jobSig.reasons & SUSP_MBD_LOCK) {
        /* Remember the pending action reasons; restored if the request fails. */
        jp->jobSpecs.reasons = jobSig.reasons;
        jp->jobSpecs.subreasons = jobSig.subReasons;
        savedActReasons = jp->actReasons;
        savedActSubReasons = jp->actSubReasons;
        jp->actReasons = jobSig.reasons;
        jp->actSubReasons = jobSig.subReasons;
    }


    if (jp->postJobStarted) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (IS_FINISH(jp->jobSpecs.jStatus)) {
        reply = ERR_NO_ERROR;
        goto Reply1;
    }

    if (jp->jobSpecs.jobPGid == -1) {
        SBD_SET_STATE(jp, JOB_STAT_EXIT);
        reply = ERR_NO_ERROR;
        goto Reply;
    }

    if (!JOB_STARTED(jp)) {
        /* Only terminating signals are acted on before the job has started. */
        if (isSigTerm(sigValue) == TRUE) {
            if ((cc = jobSigStart (jp, sigValue, jobSig.actFlags, jobSig.chkPeriod, NO_SIGLOG)) < 0)
                reply = ERR_SIG_RETRY;
            else
                reply = ERR_NO_ERROR;

            goto Reply;
        }

        reply = ERR_SIG_RETRY;

        if (logclass & LC_EXEC)
            ls_syslog(LOG_DEBUG, "%s: Retry signal %s for job <%s>",
                      fname, getLsbSigSymbol(sigValue),
                      lsb_jobid2str(jp->jobSpecs.jobId));
        goto Reply1;
    }

    if (IS_PEND(jp->jobSpecs.jStatus)) {
        reply = ERR_SIG_RETRY;
        goto Reply1;
    }

    if (jp->jobSpecs.actPid || (jp->jobSpecs.jStatus & JOB_STAT_MIG)) {

        if ((cc = jobSigStart(jp,
                              sigValue,
                              jobSig.actFlags,
                              jobSig.chkPeriod,
                              NO_SIGLOG)) < 0)
            reply = ERR_SIG_RETRY;
        else {
            jp->jobSpecs.jStatus &= ~JOB_STAT_MIG;
            reply = ERR_NO_ERROR;
        }
        goto Reply;
    }

    if ((cc = jobSigStart(jp,
                          sigValue,
                          jobSig.actFlags,
                          jobSig.chkPeriod,
                          NO_SIGLOG)) < 0)
        reply = ERR_SIG_RETRY;
    else
        reply = ERR_NO_ERROR;

Reply:
    sbdlog_newstatus(jp);

Reply1:

    xdrmem_create(&xdrs2, reply_buf, MSGSIZE, XDR_ENCODE);

    initLSFHeader_(&replyHdr);
    replyHdr.opCode = reply;
    if (reply == ERR_NO_ERROR) {
        /* Every path that sets ERR_NO_ERROR has a valid jp. */
        jobReply.jobPid = jp->jobSpecs.jobPid;
        jobReply.actPid = jp->jobSpecs.actPid;
        jobReply.jobId = jp->jobSpecs.jobId;
        jobReply.jobPGid = jp->jobSpecs.jobPGid;
        jobReply.jStatus = jp->jobSpecs.jStatus;
        jobReply.reasons = jp->jobSpecs.reasons;
        jobReply.actStatus = jp->actStatus;
        replyStruct = (char *) &jobReply;
    } else {
        if (reply != ERR_NO_JOB)
            if  ((jp != NULL) && (jobSig.reasons & SUSP_MBD_LOCK)) {
                /* The request failed: undo the action reasons set above. */
                jp->actReasons = savedActReasons;
                jp->actSubReasons = savedActSubReasons;
            }
        replyStruct = (char *) 0;
    }

    if (!xdr_encodeMsg(&xdrs2, replyStruct, &replyHdr, xdr_jobReply, 0, auth)) {
        /* jp may be NULL here, so the job ID is taken from the request. */
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                  lsb_jobid2str(jobSig.jobId), "xdr_jobReply");
        relife();
    }

    if (chanWrite_(chfd, reply_buf, XDR_GETPOS(&xdrs2)) <= 0) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5821,
                                         "%s: Sending jobReply (len=%d) to master failed for job <%s>: %m"), fname, XDR_GETPOS(&xdrs2), lsb_jobid2str(jobSig.jobId)); /* catgets 5821 */
    }
    if (jp != NULL)
        jp->actStatus = ACT_NO;

    xdr_destroy(&xdrs2);

    return;
}
Пример #20
0
/*
 * do_jobSetup - process a job setup message received on channel chfd.
 *
 * Decodes a struct jobSetup from xdrs, looks the job up in the job queue by
 * job id, copies the execution details carried by the message into the job
 * card and then either reports the job status through status_job() (when
 * the message carries JOB_STAT_RUN) or finishes the job through
 * job_finish().  If that step fails, the job card is restored from a
 * snapshot taken before any field was modified.
 *
 * A reply header is sent with LSBE_NO_JOB when the job is not in the queue
 * and with LSBE_NO_ERROR on success.  Nothing is sent back when decoding
 * fails, when the job card has a non-zero actPid, or when the
 * status/finish step fails.
 */
void
do_jobSetup(XDR * xdrs, int chfd, struct LSFHeader * reqHdr)
{
    static char       fname[] = "do_jobSetup()";
    struct jobSetup   jsetup;
    struct jobCard    *jp;
    struct jobCard    snapshot;
    int               failed;

    if (logclass & LC_EXEC)
        ls_syslog(LOG_DEBUG, "%s: Entering ...", fname);

    if (!xdr_jobSetup(xdrs, &jsetup, reqHdr)) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL, fname, "xdr_jobSetup");
        return;
    }

    /* Walk the circular queue; stopping on the head means "no match". */
    jp = jobQueHead->forw;
    while (jp != jobQueHead && jp->jobSpecs.jobId != jsetup.jobId)
        jp = jp->forw;

    if (jp == jobQueHead) {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5838,
                                         "%s: Job <%s> is not found"), /* catgets 5838 */
                  fname, lsb_jobid2str(jsetup.jobId));
        replyHdrWithRC(LSBE_NO_JOB, chfd, jsetup.jobId);
        return;
    }

    if (jp->jobSpecs.actPid != 0)
        return;

    /* Keep a full copy so every change below can be undone on failure. */
    memcpy(&snapshot, jp, sizeof(snapshot));

    jp->execJobFlag |= JOB_EXEC_QPRE_KNOWN;
    if (jsetup.execJobFlag & JOB_EXEC_QPRE_OK)
        jp->execJobFlag |= JOB_EXEC_QPRE_OK;

    /* Process ids and execution identity reported by the message. */
    jp->jobSpecs.jobPid = jsetup.jobPid;
    jp->jobSpecs.jobPGid = jsetup.jobPGid;
    jp->jobSpecs.execUid = jsetup.execUid;
    jp->execGid = jsetup.execGid;
    strcpy(jp->jobSpecs.execUsername, jsetup.execUsername);
    strcpy(jp->execUsername, jsetup.execUsername);
    strcpy(jp->jobSpecs.execCwd, jsetup.execCwd);
    strcpy(jp->jobSpecs.execHome, jsetup.execHome);

    if (!(jsetup.jStatus & JOB_STAT_RUN)) {
        /* Not running: take over the final state and finish the job. */
        jp->jobSpecs.reasons = jsetup.reason;
        jp->collectedChild = TRUE;
        jp->notReported = 0;
        jp->exitPid = -1;
        jp->needReportRU = FALSE;
        jp->jobSpecs.jStatus = jsetup.jStatus;
        jp->w_status = jsetup.w_status;
        jp->lsfRusage = jsetup.lsfRusage;
        jp->cpuTime = jsetup.cpuTime;
        failed = (job_finish(jp, TRUE) < 0);
    } else {
        if (!(jsetup.jStatus & JOB_STAT_PRE_EXEC))
            jp->jobSpecs.jStatus &= ~JOB_STAT_PRE_EXEC;

        failed = (status_job(BATCH_STATUS_JOB, jp, jp->jobSpecs.jStatus,
                             ERR_NO_ERROR) < 0);
        if (!failed)
            jp->execJobFlag |= JOB_EXEC_STARTED;
    }

    if (failed) {
        memcpy(jp, &snapshot, sizeof(snapshot));
        return;
    }

    if (replyHdrWithRC(LSBE_NO_ERROR, chfd, jsetup.jobId) < 0) {
        ls_syslog(LOG_DEBUG, "%s: Reply header failed for job <%s>",
                  fname, lsb_jobid2str(jsetup.jobId));
    }

    if (logclass & LC_EXEC)
        ls_syslog(LOG_DEBUG1, "%s: JobId %s jstatus %d reason %x  jobPid %d jobPGid %d execUid %d execGid <%d> execUser <%s> execHome <%s> execCwd <%s> execJobFlag %x cpuTime %f w_status %d",
                  fname, lsb_jobid2str(jsetup.jobId), jsetup.jStatus,
                  jsetup.reason, jsetup.jobPid,
                  jsetup.jobPGid, jsetup.execUid, jsetup.execGid,
                  jsetup.execUsername, jsetup.execHome, jsetup.execCwd,
                  jsetup.execJobFlag, jsetup.cpuTime, jsetup.w_status);
}
Пример #21
0
/*
 * Entry point of the forced-run command.
 *
 * Parses the command line, builds a struct runJobRequest for the job id
 * given as the last argument and submits it with lsb_runjob().
 *
 * Options:
 *   -m hosts   host specification handed to countHosts()
 *   -f         request RUNJOB_OPT_NOSTOP instead of RUNJOB_OPT_NORMAL
 *   -b         additionally request RUNJOB_OPT_FROM_BEGIN
 *   -V         print the version string to stderr and return 0
 *   -h         print usage and exit
 *
 * Any option getopt() does not recognise is treated like -h: usage is
 * printed and the command exits with -1 instead of silently carrying on.
 *
 * Returns 0 on success; every failure path terminates with exit(-1).
 */
int
main(int argc, char** argv)
{
    char*                 hosts   = NULL;
    struct runJobRequest  runJobRequest;
    int                   cc;
    int                   c;
    bool_t                fFlag = FALSE;
    bool_t                bFlag = FALSE;

    /* The result of the catalog initialisation is not acted upon. */
    (void) _i18n_init ( I18N_CAT_MIN );

    if (lsb_init(argv[0]) < 0) {
        lsb_perror("lsb_init");
        exit (-1);
    }

    while((c = getopt(argc, argv, "m:fbhV")) != EOF) {
        switch(c) {
        case 'm':
            hosts = putstr_(optarg);
            if (hosts == NULL) {
                perror("putstr_");
                exit(-1);
            }
            break;
        case 'f':
            fFlag = TRUE;
            break;
        case 'b':
            bFlag = TRUE;
            break;
        case 'V':
            fputs(_LS_VERSION_, stderr);
            return (0);
        case 'h':
        default:
            /* 'h' and anything getopt() rejected ('?'). */
            usage(argv[0]);
            exit(-1);
        }
    }

    if (argc <= optind) {
        usage(argv[0]);
        exit(-1);
    }

    memset((struct runJobRequest* )&runJobRequest, 0,
           sizeof(struct runJobRequest));

    /* The job id is always taken from the last command line argument. */
    if (getOneJobId (argv[argc - 1], &(runJobRequest.jobId), 0)) {
        usage(argv[0]);
        exit(-1);
    }
    runJobRequest.numHosts = countHosts(hosts);

    if (runJobRequest.numHosts > 1) {
        int     i;

        runJobRequest.hostname = (char **)calloc(runJobRequest.numHosts,
                                                 sizeof(char *));
        if (runJobRequest.hostname == NULL) {
            perror("calloc");
            exit(-1);
        }

        /*
         * NOTE(review): stepping by strlen()+1 presumes countHosts() left
         * the host names as consecutive NUL-terminated strings inside
         * `hosts` -- confirm against countHosts().
         */
        for (i = 0; i < runJobRequest.numHosts; i++) {
            /* Cast: passing a negative plain char to isspace() is undefined. */
            while (isspace((unsigned char) *hosts))
                hosts++;
            runJobRequest.hostname[i] = hosts;
            hosts += strlen(hosts) + 1;
        }
    } else
        runJobRequest.hostname = &hosts;

    runJobRequest.options = (fFlag == TRUE) ?
        RUNJOB_OPT_NOSTOP : RUNJOB_OPT_NORMAL;

    if (bFlag) {
        runJobRequest.options |= RUNJOB_OPT_FROM_BEGIN;
    }

    cc = lsb_runjob(&runJobRequest);
    if (cc < 0) {
        lsb_perror((_i18n_msg_get(ls_catd,NL_SETN,2755, "Failed to run the job"))); /* catgets  2755  */
        exit(-1);
    }

    printf((_i18n_msg_get(ls_catd,NL_SETN,2756, "Job <%s> is being forced to run.\n")), /* catgets  2756  */
           lsb_jobid2str(runJobRequest.jobId));

    _i18n_end ( ls_catd );
    return (0);
}
Пример #22
0
/*
 * shouldResume - decide whether a system-suspended job may be resumed.
 *
 * loadV  array of `num` load entries to pick the job's hosts from
 * jp     job card being examined
 * num    number of entries in loadV
 *
 * Returns FALSE right away when num <= 0, when the job is not in
 * JOB_STAT_SSUSP, or when its suspend reasons include SUSP_QUEUE_WINDOW,
 * SUSP_USER_STOP or SUSP_MBD_LOCK.  Otherwise the load entry of every
 * host in jobSpecs.toHosts (adjacent duplicates skipped) is collected and
 * handed to checkResumeByLoad(), whose result is returned.
 *
 * NOTE(review): when jobSpecs.numToHosts yields no host at all, an error
 * is logged and the initial value TRUE is returned -- confirm this is
 * intended.
 *
 * The scratch arrays are sized for max(num, numToHosts) entries because
 * they are indexed by the number of distinct job hosts, which is not
 * bounded by `num`; they are released on every path.
 */
static int
shouldResume (struct hostLoad *loadV, struct jobCard *jp, int num)
{
    static char fname[] = "shouldResume";
    int i, j, numHosts = -1;
    int resume = TRUE, found;
    int lastReasons = jp->jobSpecs.reasons;
    int lastSubreasons = jp->jobSpecs.subreasons;
    int numSlots;
    struct hostLoad *loads = NULL;
    struct tclHostData *tclHostData = NULL;

    if (logclass & (LC_SCHED | LC_EXEC))
        ls_syslog(LOG_DEBUG3, "%s: job=%s; jStatus=%d; reasons=%x, subreasons=%d, numHosts=%d", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.jStatus, jp->jobSpecs.reasons, jp->jobSpecs.subreasons, num);

    if (num <= 0)
        return FALSE;

    if (!(jp->jobSpecs.jStatus & JOB_STAT_SSUSP))
        return FALSE;

    if ((jp->jobSpecs.reasons & SUSP_QUEUE_WINDOW)
        || (jp->jobSpecs.reasons & SUSP_USER_STOP)
        || (jp->jobSpecs.reasons & SUSP_MBD_LOCK))
        return FALSE;

    /* One slot per job host may be needed, even if fewer loads came in. */
    numSlots = num;
    if (jp->jobSpecs.numToHosts > numSlots)
        numSlots = jp->jobSpecs.numToHosts;

    loads = (struct hostLoad *)
                my_malloc (numSlots * sizeof (struct hostLoad), fname);
    if (jp->resumeCondVal != NULL) {
        tclHostData = (struct tclHostData *)
                my_malloc (numSlots * sizeof (struct tclHostData), fname);
        for (i = 0; i < numSlots; i++) {
            initTclHostData (&tclHostData[i]);
        }
    }

    for (j = 0; j < jp->jobSpecs.numToHosts; j++) {
        /* Skip a host that merely repeats the previous entry. */
        if (j > 0 && !strcmp (jp->jobSpecs.toHosts[j],
                              jp->jobSpecs.toHosts[j-1]))
            continue;
        numHosts++;
        found = FALSE;
        for (i = 0; i < num; i++) {
            if (equalHost_(jp->jobSpecs.toHosts[j], loadV[i].hostName)) {
                loads[numHosts] = loadV[i];
                if (tclHostData != NULL) {
                    /* A failure here leaves found == FALSE on purpose. */
                    if (getTclHostData (&loadV[i],
                                        &tclHostData[numHosts], FALSE) < 0) {
                        break;
                    }
                }
                found = TRUE;
                break;
            }
        }
        if (found != TRUE) {
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5706,
                "%s: Can not find load information for host <%s> to check resume condiftions for job <%s>"), fname, jp->jobSpecs.toHosts[j], lsb_jobid2str(jp->jobSpecs.jobId)); /* catgets 5706 */
            loads[numHosts].li = NULL;
            continue;
        }
    }

    if (numHosts >= 0) {
        /* numHosts was the last index used; turn it into a count. */
        numHosts++;
        resume = checkResumeByLoad (jp->jobSpecs.jobId, numHosts,
               jp->jobSpecs.thresholds, loads, &jp->jobSpecs.reasons,
               &jp->jobSpecs.subreasons,
               jp->jobSpecs.jAttrib, jp->resumeCondVal, tclHostData);
    } else {
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5707,
            "%s: No valid load information is found for job <%s>"), fname, lsb_jobid2str(jp->jobSpecs.jobId)); /* catgets 5707 */
    }

    /* Release the scratch arrays whether or not any host was processed. */
    if (tclHostData != NULL) {
        for (i = 0; i < numHosts; i++)  {
            FREEUP (tclHostData[i].resBitMaps);
            FREEUP (tclHostData[i].loadIndex);
        }
        FREEUP (tclHostData);
    }
    FREEUP (loads);

    if ((logclass & (LC_SCHED | LC_EXEC)) && !resume)
        ls_syslog(LOG_DEBUG2, "%s: Can't resume job %s; reason=%x, subreasons=%d", fname, lsb_jobid2str(jp->jobSpecs.jobId), jp->jobSpecs.reasons, jp->jobSpecs.subreasons);

    if (!resume) {
        /*
         * Count an unreported change when the reasons or subreasons moved
         * and the last status sent to mbatchd is old enough.
         */
        if ((jp->jobSpecs.reasons != lastReasons ||
             jp->jobSpecs.subreasons != lastSubreasons) &&
            (now - jp->lastStatusMbdTime > rusageUpdateRate * sbdSleepTime))
            jp->notReported++;
    }

    return (resume);

}
Пример #23
0
/*
 * xdr_jobSpecs - XDR routine for struct jobSpecs (encode, decode and free).
 *
 * xdrs      XDR stream; xdrs->x_op selects the operation
 * jobSpecs  structure to serialize from / deserialize into / release
 * hdr       message header, passed through to xdr_lsfLimit() and
 *           xdr_arrayElement()
 *
 * Returns TRUE on success and FALSE as soon as one of the checked
 * sub-routines fails.
 *
 * The order of the xdr_* calls below is the serialized layout of the
 * structure, so statements must not be reordered.
 */
bool_t 
xdr_jobSpecs (XDR *xdrs, struct jobSpecs *jobSpecs, struct LSFHeader *hdr)
{
    static char fname[] = "xdr_jobSpecs";
    /* Scratch pointers: xdr_string() takes a char **, so the address of
     * each string member is staged here before the call. */
    char *sp[15];
    char *pTemp;
    int i, nLimits;
    /* The job id travels as two ints: array id first, element id near
     * the end of the message. */
    int jobArrId, jobArrElemId;
    /* Only used to print a job id in error messages. */
    LS_LONG_INT tmpJobId;

    if (xdrs->x_op == XDR_DECODE) {
        
        /* Reset every counted array and pointer member before decoding. */
        jobSpecs->numToHosts = 0;
        jobSpecs->toHosts = NULL;
        jobSpecs->nxf = 0;
        jobSpecs->xf  = NULL;
        jobSpecs->numEnv = 0;
        jobSpecs->env  = NULL;
        jobSpecs->eexec.len = 0;
        jobSpecs->eexec.data = NULL;
        jobSpecs->loginShell = NULL;
        jobSpecs->schedHostType= NULL;
        jobSpecs->execHosts = NULL;
    }

    if (xdrs->x_op == XDR_FREE) {
        
        /* Release the pointer members directly, then let xdr_thresholds()
         * and xdr_lenData() handle their own parts; nothing else is
         * visited for a free operation. */
        for(i=0; i < jobSpecs->numToHosts; i++) {
            FREEUP(jobSpecs->toHosts[i]);
        }
        FREEUP(jobSpecs->toHosts);
    
        for(i=0; i < jobSpecs->numEnv; i++)
            FREEUP(jobSpecs->env[i]);
        FREEUP(jobSpecs->env);

        FREEUP(jobSpecs->xf);
        FREEUP(jobSpecs->loginShell);
        FREEUP(jobSpecs->schedHostType);
        FREEUP(jobSpecs->execHosts);
        if (!xdr_thresholds(xdrs, jobSpecs) ||
            !xdr_lenData(xdrs, &jobSpecs->eexec))
            return(FALSE);
        return(TRUE);
    }

    /* On encode, split the job id; on decode both halves are filled in by
     * the xdr_int() calls and recombined further down. */
    if (xdrs->x_op == XDR_ENCODE) {
	jobId64To32(jobSpecs->jobId, &jobArrId, &jobArrElemId);
    }
    if (!xdr_int(xdrs, &jobArrId)) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname, "xdr_int", "jobId");
        return(FALSE);
    }

    /* First group of scalar members. */
    if (!(xdr_int(xdrs, &jobSpecs->userId) &&
	  xdr_int(xdrs, &jobSpecs->options) &&
	  xdr_short(xdrs, &jobSpecs->nice) &&
	  xdr_int(xdrs, &jobSpecs->priority) &&
	  xdr_int(xdrs, &jobSpecs->chkSig) &&	  
	  xdr_int(xdrs, &jobSpecs->actPid) &&
	  xdr_time_t(xdrs, &jobSpecs->chkPeriod) &&
	  xdr_time_t(xdrs, &jobSpecs->migThresh) &&
	  xdr_time_t(xdrs, &jobSpecs->lastSSuspTime) &&	  
	  xdr_float(xdrs, &jobSpecs->lastCpuTime))) {
	ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname, 
	    "xdr_int", "userId");
        return(FALSE);
    }

    /* The number of limits is part of the message; it starts out as this
     * build's LSF_RLIM_NLIMITS and may be overwritten by xdr_int(). */
    nLimits = LSF_RLIM_NLIMITS;
    /* Only the array id is available at this point. */
    tmpJobId = jobArrId;
    if (!xdr_int(xdrs, &nLimits)) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, 
		  lsb_jobid2str(tmpJobId), 
		  "xdr_int", "nLimits");
	return(FALSE);
    }

    
    /* Limits that fit into jobSpecs->lsfLimits[]. */
    for (i = 0; i < nLimits && i < LSF_RLIM_NLIMITS; i++) {
	if(!xdr_lsfLimit(xdrs, &jobSpecs->lsfLimits[i], hdr)) { 
	    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname, 
		      lsb_jobid2str(tmpJobId), 
		      "xdr_lsfLimit");
	    return(FALSE);
	}
    }

    if (nLimits > LSF_RLIM_NLIMITS) {
	
	/* Surplus limits are run through a scratch structure and dropped. */
	for (i=LSF_RLIM_NLIMITS; i<nLimits; i++) {
	    struct lsfLimit lsfLimit;
	    if (!xdr_lsfLimit(xdrs, &lsfLimit, hdr)) {
	        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname, 
			  lsb_jobid2str(tmpJobId), 
			  "xdr_lsfLimit");
		return(FALSE);
	    }
	}
    }

    /* Second group of scalar members: status, times and process ids. */
    if (!(xdr_int(xdrs, &jobSpecs->jStatus)      &&
          xdr_int(xdrs, &jobSpecs->reasons)      &&
          xdr_int(xdrs, &jobSpecs->subreasons)   &&	  
          xdr_time_t(xdrs, &jobSpecs->termTime)  &&
          xdr_time_t(xdrs, &jobSpecs->startTime) &&
          xdr_int(xdrs, &jobSpecs->runTime) &&
	  xdr_time_t(xdrs, &jobSpecs->submitTime) &&
          xdr_int(xdrs, &jobSpecs->jobPid)       &&
          xdr_int(xdrs, &jobSpecs->jobPGid)      &&
          xdr_int(xdrs, &jobSpecs->restartPid)   &&
	  xdr_int(xdrs, &jobSpecs->sigValue)     &&
          xdr_int(xdrs, &jobSpecs->umask)         &&
          xdr_int(xdrs, &jobSpecs->jAttrib)))  {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, lsb_jobid2str(tmpJobId), 
	    "xdr_int", "jStatus");
	return(FALSE);
    }

    /* First batch of string members (presumably fixed-size char arrays
     * in struct jobSpecs -- confirm against the declaration). */
    sp[0] = jobSpecs->jobFile;
    sp[1] = jobSpecs->inFile;
    sp[2] = jobSpecs->outFile;
    sp[3] = jobSpecs->errFile;
    sp[4] = jobSpecs->chkpntDir;
    sp[5] = jobSpecs->cwd;
    sp[6] = jobSpecs->subHomeDir;
    sp[7] = jobSpecs->command;
    sp[8] = jobSpecs->jobName;
    sp[9] = jobSpecs->preExecCmd;
    sp[10] = jobSpecs->fromHost;
    sp[11] = jobSpecs->resReq;

    /* Before decoding, make sp[0..10] empty strings; sp[11] (resReq) is
     * cleared separately just before its own xdr_string() call. */
    if (xdrs->x_op == XDR_DECODE)
        for (i = 0; i < 11; i++)
            sp[i][0] = '\0';
    if (!(xdr_string(xdrs, &sp[0], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[1], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[2], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[3], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[4], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[5], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[6], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[7], MAXLINELEN) &&
          xdr_string(xdrs, &sp[8], MAXLINELEN) &&
          xdr_string(xdrs, &sp[9], MAXLINELEN) &&
          xdr_string(xdrs, &sp[10], MAXHOSTNAMELEN))) { 
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, 
		  lsb_jobid2str(tmpJobId), 
		  "xdr_int", "jobFile");
        return(FALSE);
    }
     /* resReq; note the log text below still names "xdr_int"/"jobFile". */
     if (xdrs->x_op == XDR_DECODE)
         sp[11][0] = '\0';
     if (!xdr_string(xdrs, &sp[11], MAXLINELEN)) {
         ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, 
		   lsb_jobid2str(tmpJobId),
                   "xdr_int", "jobFile");
         return(FALSE);
     }

    sp[12] = jobSpecs->queue;
    sp[13] = jobSpecs->windows;
    sp[14] = jobSpecs->userName;

    if (xdrs->x_op == XDR_DECODE)
        for (i = 12; i < 15; i++)
            sp[i][0] = '\0';

    if (!(xdr_string(xdrs, &sp[12], MAXFILENAMELEN) &&
          xdr_string(xdrs, &sp[13], MAXLINELEN) &&
          xdr_string(xdrs, &sp[14], MAX_LSB_NAME_LEN))) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, lsb_jobid2str(tmpJobId),
            "xdr_int", "jobFile");
        return(FALSE);
    }

    
    /* Execution host list: count first, then one string per host. */

    if (!xdr_int(xdrs, &jobSpecs->numToHosts)) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, lsb_jobid2str(tmpJobId), 
	    "xdr_int", "numToHosts");
	return(FALSE);
    }

    if (xdrs->x_op == XDR_DECODE && jobSpecs->numToHosts) {
        jobSpecs->toHosts = (char **) my_calloc(jobSpecs->numToHosts, 
                                      sizeof (char *), fname);
    }

    for (i = 0; i < jobSpecs->numToHosts; i++) {
        if (!xdr_var_string(xdrs, &jobSpecs->toHosts[i])) {
	    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, 
		      lsb_jobid2str(tmpJobId), 
		      "xdr_var_string", "toHosts");
	    return(FALSE);
	}
    }

    
    /* NOTE(review): a failure of xdr_thresholds() is only logged here;
     * unlike every other step, processing continues instead of returning
     * FALSE -- confirm whether that is intentional. */
    if (!xdr_thresholds(xdrs, jobSpecs))
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S, fname, 
		  lsb_jobid2str(tmpJobId), 
		  "xdr_thresholds");

    
    /* File transfer list: count first, then one xFile per entry. */
    if (!xdr_int(xdrs, &jobSpecs->nxf)) {
	ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, 
		  lsb_jobid2str(tmpJobId), 
		  "xdr_int", "nxf");
        return(FALSE);
    }

    if (xdrs->x_op == XDR_DECODE && jobSpecs->nxf > 0) {
	jobSpecs->xf = (struct xFile *) my_calloc(jobSpecs->nxf, 
                                        sizeof(struct xFile), fname);
    }

    for (i = 0; i < jobSpecs->nxf; i++) {
	if (!xdr_arrayElement(xdrs, (char *) &(jobSpecs->xf[i]),
			      hdr, xdr_xFile)) {
	    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_S, fname, 
		      lsb_jobid2str(tmpJobId), 
		      "xdr_arrayElement", "xf");
	    return(FALSE);
	}
    }

    /* Second batch of string members; sp[] slots are reused from here on. */
    sp[0] = jobSpecs->mailUser;
    sp[1] = jobSpecs->clusterName;
    sp[2] = jobSpecs->projectName;
    sp[3] = jobSpecs->preCmd;
    sp[4] = jobSpecs->postCmd;
    sp[5] = jobSpecs->execCwd;
    sp[6] = jobSpecs->execHome;
    sp[7] = jobSpecs->requeueEValues;

    if (xdrs->x_op == XDR_DECODE) {
        for(i=0; i < 8; i++)
            sp[i][0]= '\0';
    }

    /* Strings followed by execUid, maxNumProcessors and the environment
     * count; failures from here on mostly return without logging. */
    if (!(xdr_string(xdrs, &sp[0], MAXLINELEN) &&
          xdr_string(xdrs, &sp[1], MAX_LSB_NAME_LEN) &&
	  xdr_string(xdrs, &sp[2], MAX_LSB_NAME_LEN) &&
	  xdr_string(xdrs, &sp[3], MAXLINELEN) &&
	  xdr_string(xdrs, &sp[4], MAXLINELEN) &&
	  xdr_string(xdrs, &sp[5], MAXFILENAMELEN) &&
	  xdr_string(xdrs, &sp[6], MAXFILENAMELEN) &&
	  xdr_string(xdrs, &sp[7], MAXLINELEN) &&
          xdr_int(xdrs, &jobSpecs->execUid)  &&
          xdr_int(xdrs, &jobSpecs->maxNumProcessors)  &&
          xdr_int(xdrs, &jobSpecs->numEnv)))
            return(FALSE);

    if (xdrs->x_op == XDR_DECODE && jobSpecs->numEnv) {
        jobSpecs->env = (char **) my_calloc(jobSpecs->numEnv, 
					  sizeof (char *), fname);
    }
	
    for (i = 0; i < jobSpecs->numEnv; i++) {
        if (!xdr_var_string(xdrs, &jobSpecs->env[i]))
                return(FALSE);
    }

    
    /* Length-prefixed eexec data block. */
    if (!xdr_lenData(xdrs, &jobSpecs->eexec)) 
        return (FALSE);


    if (!xdr_int(xdrs, &jobSpecs->niosPort))
        return (FALSE);
    /* Resume/stop conditions and the three action commands. */
    sp[0] = jobSpecs->resumeCond;
    sp[1] = jobSpecs->stopCond;
    sp[2] = jobSpecs->suspendActCmd;
    sp[3] = jobSpecs->resumeActCmd;
    sp[4] = jobSpecs->terminateActCmd;

    if (xdrs->x_op == XDR_DECODE) {
        sp[0][0] = '\0';
        sp[1][0] = '\0';
    }
    if (!(xdr_string(xdrs, &sp[0],  MAXLINELEN)) ||
        !(xdr_string(xdrs, &sp[1],  MAXLINELEN)))
        return (FALSE);


    if (xdrs->x_op == XDR_DECODE) {
        for ( i = 2; i < 5; i++)
            sp[i][0] = '\0';
    }

    for ( i = 2; i < 5; i++)
        if (!(xdr_string(xdrs, &sp[i], MAXLINELEN)))
            return(FALSE);
    
    
    /* Signal map: LSB_SIG_NUM ints. */
    for (i = 0; i <LSB_SIG_NUM; i++)
        if (!(xdr_int(xdrs, &jobSpecs->sigMap[i])))
            return(FALSE);

    if (!(xdr_int(xdrs, &jobSpecs->actValue)))
        return (FALSE); 

    /* Variable-length strings; these are the pointer members reset at
     * decode time and released in the XDR_FREE branch above. */
    if (!xdr_var_string(xdrs, &jobSpecs->loginShell)) 
        return (FALSE);

    if (!xdr_var_string(xdrs, &jobSpecs->schedHostType)) 
        return (FALSE);

    if (!xdr_var_string(xdrs, &jobSpecs->execHosts)) 
        return (FALSE);

    
    if (!xdr_int(xdrs, &jobSpecs->options2)) {
        return(FALSE);
    }

    
    pTemp = jobSpecs->jobSpoolDir;
    if(!( xdr_string(xdrs, &pTemp, MAXPATHLEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
                  "xdr_string", "jobSpoolDir");
        return(FALSE);
    }

    /* Second half of the job id. */
    if (!(xdr_int(xdrs, &jobArrElemId))) {
	return (FALSE);
    }

    /* Both halves are known now: rebuild the 64-bit job id on decode. */
    if (xdrs->x_op == XDR_DECODE) {
	jobId32To64(&jobSpecs->jobId,jobArrId,jobArrElemId);
    }    

    sp[0] = jobSpecs->inFileSpool;
    sp[1] = jobSpecs->commandSpool;

    if (!(xdr_string(xdrs, &sp[0], MAXFILENAMELEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname, 
		       "xdr_string", "inFileSpool"); 
        return (FALSE);
    }
    if (!(xdr_string(xdrs, &sp[1], MAXFILENAMELEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
			"xdr_string", "commandSpool");
        return (FALSE);
    }

    if (!(xdr_int(xdrs, &jobSpecs->userPriority))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname, "xdr_int", 
		       "userPriority");
        return (FALSE);
    }

    sp[0] = jobSpecs->execUsername;
    if (!(xdr_string(xdrs, &sp[0], MAX_LSB_NAME_LEN))) {
        ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
			"xdr_string", "execUsername");
        return (FALSE);
    }

    sp[0] = jobSpecs->prepostUsername;
    if (!(xdr_string(xdrs, &sp[0], MAX_LSB_NAME_LEN))) {
	ls_syslog(LOG_ERR, I18N_FUNC_S_FAIL, fname,
			"xdr_string", "prepostUsername");
	return (FALSE);
    }
    
    return(TRUE);

} 
Пример #24
0
/*
 * chkpntEnd - finish a checkpoint action on a job.
 *
 * jobCard   job whose checkpoint action has ended
 * w_status  0 when the action succeeded, anything else on failure
 * freed     set to TRUE when the job card has been deallocated here;
 *           left untouched otherwise
 *
 * For a job in JOB_STAT_MIG whose action succeeded, the function may
 * return early several times (marking the job missing, waiting while
 * notReported is 0, waiting for the cleanup child started through
 * rmJobBufFilesPid()) before it finally moves the job to JOB_STAT_PEND.
 * The outcome is then reported with status_job(); if that fails, actPid
 * and jStatus are put back to their previous values.
 */
static void
chkpntEnd (struct jobCard *jobCard, int w_status, bool_t *freed)
{
    static char fname[] = "chkpntEnd()";
    const int actionOk = (w_status == 0);
    int prevStatus;
    int prevActPid;

    /* A suspended job that is not migrating is stopped again first. */
    if (IS_SUSP(jobCard->jobSpecs.jStatus)
       && !(jobCard->jobSpecs.jStatus & JOB_STAT_MIG))
        jobsig(jobCard, SIGSTOP, TRUE);

    prevStatus = jobCard->jobSpecs.jStatus;

    if (prevStatus & JOB_STAT_MIG) {
        if (!actionOk) {
            jobCard->jobSpecs.jStatus &= ~JOB_STAT_MIG;
        } else {
            if (!jobCard->missing) {
                jobCard->missing = TRUE;
                need_checkfinish = TRUE;
                return;
            }
            if (jobCard->notReported == 0)
                return;

            if (jobCard->cleanupPid == 0) {
                jobCard->cleanupPid = rmJobBufFilesPid(jobCard);
                if (jobCard->cleanupPid > 0)
                    return;

                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5709,
                    "%s: Unable to cleanup migrating job <%s>"), /* catgets 5709 */
                    fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
            }

            SBD_SET_STATE(jobCard, JOB_STAT_PEND);
        }
    }

    prevActPid = jobCard->jobSpecs.actPid;

    if (status_job (BATCH_STATUS_JOB, jobCard, jobCard->jobSpecs.jStatus,
                    actionOk ? ERR_NO_ERROR : ERR_SYSACT_FAIL) < 0) {
        /* Reporting failed: restore the saved values. */
        jobCard->jobSpecs.actPid = prevActPid;
        jobCard->jobSpecs.jStatus = prevStatus;
        return;
    }

    jobCard->lastChkpntTime = now;
    jobCard->jobSpecs.actPid = 0;
    jobCard->actStatus = ACT_NO;
    jobCard->jobSpecs.actValue = SIG_NULL;

    if (actionOk) {
        jobCard->migCnt = 1;
        if (prevStatus & JOB_STAT_MIG) {
            /* The job card is gone after this; tell the caller. */
            cleanupMigJob(jobCard);
            deallocJobCard(jobCard);
            *freed = TRUE;
        }
    } else if (prevStatus & JOB_STAT_MIG) {
        jobCard->migCnt *= 2;
    }

}
Пример #25
0
/*
 * status_job - send a job status (or resource usage) message to mbatchd.
 *
 * reqType    operation code placed in the request header
 * jp         job being reported
 * newStatus  status value to report
 * err        sbatchd reply code copied into the request
 *
 * Returns 0 when:
 *   - newStatus is a post-finish status and jp->userJobSucc is not TRUE
 *     (nothing is sent);
 *   - jp->notReported is negative (nothing is sent);
 *   - the request was sent without waiting for a reply and rd_select_()
 *     on the status channel returned 0;
 *   - mbatchd answered LSBE_NO_ERROR, LSBE_LOCK_JOB or LSBE_NO_JOB.
 * Returns -1 when there is no master host, the request buffer cannot be
 * allocated, call_server() fails, the no-wait channel check fails, or
 * mbatchd answers with any other code (LSBE_STOP_JOB also signals the job
 * with SIGSTOP first).
 */
int
status_job (mbdReqType reqType,
	    struct jobCard *jp, int newStatus, sbdReplyType err)
{
    static char fname[] = "status_job()";
    static int nextSeq = 1;
    static char prevFailedHost[MAXHOSTNAMELEN];
    struct statusReq statusReq;
    struct LSFHeader hdr;
    struct lsfAuth *auth = NULL;
    XDR xdrs;
    char *reqBuf;
    char *replyBuf = NULL;
    int bufLen;
    int callFlags;
    int cc;
    int k;
    int replyCode;

    if ((logclass & LC_TRACE) && (logclass & LC_SIGNAL))
        ls_syslog(LOG_DEBUG, "%s: Entering ... regType %d jobId %s",
                  fname, reqType, lsb_jobid2str(jp->jobSpecs.jobId));

    /* Remember whether the user job itself succeeded. */
    if (newStatus == JOB_STAT_EXIT)
        jp->userJobSucc = FALSE;
    if (MASK_STATUS(newStatus) == JOB_STAT_DONE)
        jp->userJobSucc = TRUE;

    if ((IS_POST_FINISH(newStatus)) && jp->userJobSucc != TRUE)
        return 0;

    if (masterHost == NULL)
        return -1;

    if (jp->notReported < 0) {
        jp->notReported = -INFINIT_INT;
        return (0);
    }

    /* Identification and state of the job. */
    statusReq.jobId = jp->jobSpecs.jobId;
    statusReq.jobPid = jp->jobSpecs.jobPid;
    statusReq.jobPGid = jp->jobSpecs.jobPGid;
    statusReq.actPid = jp->jobSpecs.actPid;
    statusReq.newStatus = newStatus;
    statusReq.reason = jp->jobSpecs.reasons;
    statusReq.subreasons = jp->jobSpecs.subreasons;
    statusReq.sbdReply = err;
    statusReq.exitStatus = jp->w_status;
    statusReq.lsfRusage = jp->lsfRusage;
    statusReq.msgId = jp->delieveredMsgId;

    /* Execution environment; no execution hosts are reported. */
    statusReq.execUid = jp->jobSpecs.execUid;
    statusReq.execUsername = jp->execUsername;
    statusReq.execCwd = jp->jobSpecs.execCwd;
    statusReq.execHome = jp->jobSpecs.execHome;
    statusReq.numExecHosts = 0;
    statusReq.execHosts = NULL;
    statusReq.queuePreCmd = "";
    statusReq.queuePostCmd = "";

    /* A finished job reports the larger of its run and max usage. */
    if (IS_FINISH(newStatus)) {
        if (jp->maxRusage.mem > jp->runRusage.mem)
            jp->runRusage.mem = jp->maxRusage.mem;
        if (jp->maxRusage.swap > jp->runRusage.swap)
            jp->runRusage.swap = jp->maxRusage.swap;
        if (jp->maxRusage.stime > jp->runRusage.stime)
            jp->runRusage.stime = jp->maxRusage.stime;
        if (jp->maxRusage.utime > jp->runRusage.utime)
            jp->runRusage.utime = jp->maxRusage.utime;
    }
    statusReq.runRusage.mem = jp->runRusage.mem;
    statusReq.runRusage.swap = jp->runRusage.swap;
    statusReq.runRusage.utime = jp->runRusage.utime;
    statusReq.runRusage.stime = jp->runRusage.stime;
    statusReq.runRusage.npids = jp->runRusage.npids;
    statusReq.runRusage.pidInfo = jp->runRusage.pidInfo;
    statusReq.runRusage.npgids = jp->runRusage.npgids;
    statusReq.runRusage.pgid = jp->runRusage.pgid;
    statusReq.actStatus = jp->actStatus;
    statusReq.sigValue = jp->jobSpecs.actValue;

    /* Sequence numbers cycle through 1 .. MAX_SEQ_NUM - 1. */
    statusReq.seq = nextSeq;
    if (++nextSeq >= MAX_SEQ_NUM)
        nextSeq = 1;

    /* Size the request buffer: fixed part, strings, pid and pgid lists. */
    bufLen = 1024 + ALIGNWORD_(sizeof(struct statusReq));

    bufLen += ALIGNWORD_(strlen(statusReq.execHome)) + 4 +
        ALIGNWORD_(strlen(statusReq.execCwd)) + 4 +
        ALIGNWORD_(strlen(statusReq.execUsername)) + 4;

    for (k = 0; k < statusReq.runRusage.npids; k++)
        bufLen += ALIGNWORD_(sizeof(struct pidInfo)) + 4;

    for (k = 0; k < statusReq.runRusage.npgids; k++)
        bufLen += ALIGNWORD_(sizeof(int)) + 4;

    if (logclass & (LC_TRACE | LC_COMM))
        ls_syslog(LOG_DEBUG, "%s: The length of the job message is: <%d>", fname,
                  bufLen);

    reqBuf = malloc(bufLen);
    if (reqBuf == NULL) {
        ls_syslog(LOG_ERR, I18N_FUNC_FAIL_M, fname, "malloc");
        return (-1);
    }

    xdrmem_create(&xdrs, reqBuf, bufLen, XDR_ENCODE);
    initLSFHeader_(&hdr);
    hdr.opCode = reqType;

    if (!xdr_encodeMsg(&xdrs, (char *) &statusReq, &hdr, xdr_statusReq, 0,
                       auth)) {
        ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M,
                  fname, lsb_jobid2str(jp->jobSpecs.jobId), "xdr_statusReq");
        lsb_merr2(I18N_FUNC_FAIL, fname, "xdr_statusReq");
        xdr_destroy(&xdrs);
        FREEUP(reqBuf);
        relife();
    }

    callFlags = CALL_SERVER_NO_HANDSHAKE;
    if (statusChan >= 0)
        callFlags |= CALL_SERVER_USE_SOCKET;
    if (reqType == BATCH_RUSAGE_JOB)
        callFlags |= CALL_SERVER_NO_WAIT_REPLY;

    if (logclass & LC_COMM)
        ls_syslog(LOG_DEBUG1, "%s: before call_server statusChan=%d flags=%d",
                  fname, statusChan, callFlags);

    cc = call_server(masterHost,
                     mbd_port,
                     reqBuf,
                     XDR_GETPOS(&xdrs),
                     &replyBuf,
                     &hdr,
                     connTimeout, readTimeout, &statusChan, NULL, NULL,
                     callFlags);
    if (cc < 0) {
        statusChan = -1;
        /* Log the failure once per master host, not on every retry. */
        if (!equalHost_(masterHost, prevFailedHost)) {
            if (errno != EINTR)
                ls_syslog(LOG_DEBUG,
                          "%s: Failed to reach mbatchd on host <%s> for job <%s>: %s",
                          fname, masterHost, lsb_jobid2str(jp->jobSpecs.jobId),
                          lsb_sysmsg());
            strcpy(prevFailedHost, masterHost);
        }
        xdr_destroy(&xdrs);
        FREEUP(reqBuf);
        failcnt++;
        return (-1);
    }

    failcnt = 0;
    prevFailedHost[0] = '\0';
    xdr_destroy(&xdrs);
    FREEUP(reqBuf);

    if (cc != 0)
        free(replyBuf);

    if (callFlags & CALL_SERVER_NO_WAIT_REPLY) {
        struct timeval noWait;

        noWait.tv_sec = 0;
        noWait.tv_usec = 0;

        if (rd_select_(chanSock_(statusChan), &noWait) != 0) {
            CLOSECD(statusChan);

            if (logclass & LC_COMM)
                ls_syslog(LOG_DEBUG1,
                          "%s: Job <%s> rd_select() failed, assume connection broken",
                          fname, lsb_jobid2str(jp->jobSpecs.jobId));
            return (-1);
        }

        jp->needReportRU = FALSE;
        jp->lastStatusMbdTime = now;
        return 0;
    }

    /* The reply code comes back in the header's opCode field. */
    replyCode = hdr.opCode;
    switch (replyCode) {
    case LSBE_NO_ERROR:
    case LSBE_LOCK_JOB:
        jp->needReportRU = FALSE;
        jp->lastStatusMbdTime = now;
        if (replyCode == LSBE_LOCK_JOB) {
            if (IS_SUSP(jp->jobSpecs.jStatus))
                jp->jobSpecs.reasons |= SUSP_MBD_LOCK;
            else
                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5204, "%s: Job <%s> is in status <%x> and mbatchd wants to lock it, ignored."),	/* catgets 5204 */
                          fname,
                          lsb_jobid2str(jp->jobSpecs.jobId),
                          jp->jobSpecs.jStatus);
        }
        return (0);

    case LSBE_NO_JOB:
        if (!IS_POST_FINISH(jp->jobSpecs.jStatus)) {
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5205, "%s: Job <%s> is forgotten by mbatchd on host <%s>, ignored."), fname, lsb_jobid2str(jp->jobSpecs.jobId), masterHost);	/* catgets 5205 */
        }
        jp->notReported = -INFINIT_INT;
        return (0);

    case LSBE_STOP_JOB:
        if (jobsig(jp, SIGSTOP, TRUE) < 0) {
            SET_STATE(jp->jobSpecs.jStatus, JOB_STAT_EXIT);
        } else {
            SET_STATE(jp->jobSpecs.jStatus, JOB_STAT_USUSP);
            jp->jobSpecs.reasons |= SUSP_USER_STOP;
        }
        return (-1);

    case LSBE_SBATCHD:
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5206, "%s: mbatchd on host <%s> doesn't think I'm configured as a batch server when I report the status for job <%s>"),	/* catgets 5206 */
                  fname, masterHost, lsb_jobid2str(jp->jobSpecs.jobId));
        return (-1);

    default:
        ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd, NL_SETN, 5207, "%s: Illegal reply code <%d> from mbatchd on host <%s> for job <%s>"),	/* catgets 5207 */
                  fname,
                  replyCode, masterHost, lsb_jobid2str(jp->jobSpecs.jobId));
        return (-1);
    }
}
Пример #26
0
/*
 * sigActEnd - finish the signal action recorded in jobSpecs.actValue.
 *
 * For a negative actValue the action's exit file
 * "<LSTMPDIR>/.<jobFile>.<jobid>.<suffix>" is looked up with stat(): the
 * action is marked ACT_DONE when the file exists and ACT_FAIL otherwise.
 * JOB_STAT_SIGNAL is then cleared and the matching *ActEnd handler is
 * called with the stat() result.  Unless the job card was freed by
 * chkpntEnd(), or actValue was not recognised, the new status is logged
 * with sbdlog_newstatus().
 *
 * The exit file name is built with a bounded write; a name that does not
 * fit into the buffer is treated as "file not found" rather than being
 * written past the end of the buffer.
 */
static void
sigActEnd (struct jobCard *jobCard)
{
    /* -1 (failure) until stat() says otherwise, so no handler below can
     * ever see an indeterminate value. */
    int w_status = -1;
    struct stat st;
    bool_t freed = FALSE;

    char exitFile[MAXFILENAMELEN];


    if (jobCard->jobSpecs.actValue < 0) {
        int n;

        n = snprintf(exitFile, sizeof(exitFile), "%s/.%s.%s.%s",
                     LSTMPDIR, jobCard->jobSpecs.jobFile,
                     lsb_jobidinstr(jobCard->jobSpecs.jobId),
                     exitFileSuffix(jobCard->jobSpecs.actValue));

        /* Never stat() a truncated path: it could name another file. */
        if (n >= 0 && (size_t) n < sizeof(exitFile))
            w_status = stat(exitFile, &st);

        if (w_status == 0)
            jobCard->actStatus = ACT_DONE;
        else {
            jobCard->actStatus = ACT_FAIL;
        }
    }

    jobCard->jobSpecs.jStatus &= ~JOB_STAT_SIGNAL;

    switch (jobCard->jobSpecs.actValue) {
        case SIG_CHKPNT:
        case SIG_CHKPNT_COPY:
            chkpntEnd (jobCard, w_status, &freed);
            break;
        case SIG_SUSP_USER:
        case SIG_SUSP_LOAD:
        case SIG_SUSP_WINDOW:
        case SIG_SUSP_OTHER:
            suspendActEnd (jobCard, w_status);
            break;

        case SIG_RESUME_USER:
        case SIG_RESUME_LOAD:
        case SIG_RESUME_WINDOW:
        case SIG_RESUME_OTHER:
            resumeActEnd (jobCard, w_status);
            break;

        case SIG_TERM_USER:
        case SIG_KILL_REQUEUE:
        case SIG_TERM_OTHER:
        case SIG_TERM_FORCE:

            /* The action is only cleared once it has been logged. */
            if (jobSigLog (jobCard, w_status) == 0) {
                jobCard->jobSpecs.actPid = 0;
                jobCard->jobSpecs.actValue = SIG_NULL;
            }
            break;
        case SIG_TERM_LOAD:
        case SIG_TERM_WINDOW:
        case SIG_TERM_RUNLIMIT:
        case SIG_TERM_DEADLINE:
        case SIG_TERM_PROCESSLIMIT:
        case SIG_TERM_CPULIMIT:
        case SIG_TERM_MEMLIMIT:
            suspendActEnd (jobCard, w_status);
            break;
        default:
            ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5708,
                "sigActEnd: unknown sigValue <%d> for job <%s> at the job status <%d> with actPid <%d>"), /* catgets 5708 */
                jobCard->jobSpecs.actValue,
                lsb_jobid2str(jobCard->jobSpecs.jobId),
                jobCard->jobSpecs.jStatus,
                jobCard->jobSpecs.actPid);

            jobCard->jobSpecs.actPid = 0;
            return;

    }

    /* chkpntEnd() may have deallocated the job card. */
    if (!freed) {

        sbdlog_newstatus (jobCard);
    }
}
Пример #27
0
/*
 * tryResume - try to resume one system-suspended job.
 *
 * myload  load entry used for jobs whose numToHosts is 1
 *
 * Does nothing when called again less than sbdSleepTime after the previous
 * attempt.  Otherwise the job queue is walked backwards; jobs that are not
 * in JOB_STAT_SSUSP or that have a non-zero actPid are skipped.  For each
 * remaining job the load is taken from `myload` (single host) or fetched
 * with ls_loadofhosts() (several hosts), shouldResume() is consulted and,
 * if it agrees, jobResumeAction() is started.  The walk stops after the
 * first job for which jobResumeAction() does not fail.
 */
static void
tryResume (struct hostLoad *myload)
{
    char fname[] = "tryResume";
    struct jobCard *jc, *prev;

    static int errCount = 0, lastTryResumeTime = 0;

    if (now - lastTryResumeTime < sbdSleepTime)
        return;
    lastTryResumeTime = now;

    for (jc = jobQueHead->back; jc != jobQueHead; jc = prev) {
        struct hostLoad *load;
        int numh;

        /* Fetch the link first so the walk does not depend on jc later. */
        prev = jc->back;

        if (jc->jobSpecs.actPid ||
            !(jc->jobSpecs.jStatus & JOB_STAT_SSUSP))
            continue;

        if (jc->jobSpecs.numToHosts == 1) {
            load = myload;
            numh = 1;
        } else {
            struct nameList *hostList;

            hostList = lsb_compressStrList(jc->jobSpecs.toHosts,
                                           jc->jobSpecs.numToHosts);
            numh = hostList->listSize;
            load = ls_loadofhosts ("-", &numh, EFFECTIVE, 0,
                                   hostList->names, hostList->listSize);

            if (load == NULL) {
                /* Only the first few consecutive failures are logged. */
                if (errCount < 3)
                    ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_M, fname,
                              lsb_jobid2str(jc->jobSpecs.jobId),
                              "ls_loadofhosts");
                errCount++;
                if (lserrno == LSE_LIM_BADHOST)
                    relife();
                if (lserrno == LSE_BAD_XDR)
                    relife();
                if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT)
                    myStatus |= NO_LIM;
                continue;
            }

            myStatus = 0;
            errCount = 0;
        }

        if (!shouldResume (load, jc, numh))
            continue;

        if (jobResumeAction(jc, SIG_RESUME_LOAD, LOAD_REASONS) < 0)
            continue;

        /* One resume action per call is enough. */
        return;
    }
    return;

}
Пример #28
0
/*
 * tryStop - suspend running jobs whose hosts exceed their stop thresholds.
 *
 * myhostnm: not used by this function.
 * myload:   load information handed to shouldStop() for jobs whose host
 *           list collapses to a single entry.
 *
 * The function is rate limited: it returns immediately when less than
 * sbdSleepTime has elapsed (measured with the global `now`) since the
 * previous attempt.  Otherwise it walks the job queue through the ->forw
 * links.  A job is a candidate only if JOB_STAT_RUN is set and at least
 * sbdSleepTime has passed since its startTime.  When shouldStop() agrees,
 * jobSuspendAction() is called with SIG_SUSP_LOAD and the reasons reported
 * by shouldStop().  Whether further jobs are examined afterwards depends on
 * the `stopmore` flag that shouldStop() may set.
 */
static void
tryStop (char *myhostnm, struct hostLoad *myload)
{
    static char fname[] = "tryStop";
    static int errCount = 0, lastTryStopTime = 0;
    struct jobCard *jobCard, *next;
    int reasons, subreasons, stopmore = FALSE;

    if (now - lastTryStopTime < sbdSleepTime) {
        return;
    }
    lastTryStopTime = now;

    for (jobCard = jobQueHead->forw; jobCard != jobQueHead; jobCard = next) {
        struct hostLoad *load;
        struct nameList *hostList;
        int numh;

        /* Remember the successor first; the loop body never touches it. */
        next = jobCard->forw;

        if (jobCard->jobSpecs.numToHosts == 1) {
            if (!(jobCard->jobSpecs.jStatus & JOB_STAT_RUN)
                || now < jobCard->jobSpecs.startTime + sbdSleepTime
                || !shouldStop (myload, jobCard, &reasons, &subreasons, 1,
                                &stopmore)) {
                continue;
            }

            jobSuspendAction (jobCard, SIG_SUSP_LOAD, reasons, subreasons);
            if (!stopmore) {
                return;
            }
            continue;
        }

        /* Job with numToHosts != 1: work on its distinct hosts. */
        numh = jobCard->jobSpecs.numToHosts;
        hostList = lsb_compressStrList(jobCard->jobSpecs.toHosts, numh);
        if (hostList == NULL) {
            /*
             * The compressed list could not be built; without this check
             * the dereferences below would crash the daemon.
             */
            ls_syslog(LOG_ERR, "%s: lsb_compressStrList() failed for job <%s>",
                      fname, lsb_jobid2str(jobCard->jobSpecs.jobId));
            continue;
        }
        numh = hostList->listSize;

        if (hostList->listSize == 1) {
            /* Only one distinct host: reuse the load passed by the caller. */
            load = myload;
        } else {
            load = ls_loadofhosts ("-", &numh, EFFECTIVE, 0,
                                   hostList->names, hostList->listSize);
        }

        if (load == NULL) {
            /* Log only the first few consecutive failures. */
            if (errCount < 3) {
                ls_syslog(LOG_ERR, I18N_JOB_FAIL_S_MM, fname,
                          lsb_jobid2str(jobCard->jobSpecs.jobId),
                          "ls_loadofhosts");
            }
            errCount++;
            if (lserrno == LSE_LIM_BADHOST)
                relife();
            if (lserrno == LSE_BAD_XDR)
                relife();
            if (lserrno == LSE_LIM_DOWN || lserrno == LSE_TIME_OUT)
                myStatus |= NO_LIM;
            continue;
        }

        /* Load is available: clear the error count and the status word. */
        errCount = 0;
        myStatus = 0;

        if ((jobCard->jobSpecs.jStatus & JOB_STAT_RUN)
            && now >= jobCard->jobSpecs.startTime + sbdSleepTime) {
            if (shouldStop (load, jobCard, &reasons, &subreasons, numh,
                            &stopmore)) {
                jobSuspendAction (jobCard, SIG_SUSP_LOAD, reasons, subreasons);
                /*
                 * NOTE(review): the single-host branch above uses `continue`
                 * when stopmore is set, whereas this branch leaves the loop
                 * either way.  Behavior is kept as found; confirm whether
                 * `continue` was intended here.
                 */
                if (stopmore)
                    break;
                else
                    return;
            }
        }
    }
}
Пример #29
0
/*
 * prtJobStart - print the "started/dispatched" line of a job report.
 *
 * job:     job record to describe.
 * prtFlag: selects the wording; BJOBS_PRINT suppresses the pre-exec
 *          wording, BHIST_PRINT_PRE_EXEC selects the "pre-exec command"
 *          message for jobs submitted with SUB_PRE_EXEC.
 * jobPid:  when positive the job is reported as "Started on", otherwise
 *          as "Dispatched to" (non pre-exec wording only).
 * tFormat: when non-zero the line carries a "Job <id>" tag; otherwise
 *          array elements are tagged with " [index]".
 *
 * Nothing is printed unless both job->startTime and job->numExHosts are
 * non-zero.  Output goes through prtLineWUF().  When the
 * LSB_SHORT_HOSTLIST parameter is "1" and the job has more than one
 * execution host, hosts are printed in the compressed "<count*name>"
 * form; if the compressed list cannot be built the process exits with
 * status 99.
 *
 * All formatting is bounded with snprintf() so that long job ids, host
 * names, paths or translated messages are truncated instead of
 * overflowing the local buffers.
 */
void
prtJobStart(struct jobInfoEnt *job, int prtFlag, int jobPid, int tFormat)
{
    char prline[MAXLINELEN], tBuff[64];
    const char *when;
    time_t startTime;
    int i = 0;
    struct nameList *hostList = NULL;

    /* hostList is non-NULL exactly when the short host list is requested. */
    if (lsbParams[LSB_SHORT_HOSTLIST].paramValue && job->numExHosts > 1
        && strcmp(lsbParams[LSB_SHORT_HOSTLIST].paramValue, "1") == 0) {
        hostList = lsb_compressStrList(job->exHosts, job->numExHosts);
        if (!hostList) {
            exit(99);
        }
    }

    if (tFormat) {
        snprintf(tBuff, sizeof(tBuff), "%s <%s>", I18N_Job,
                 lsb_jobid2str(job->jobId));
    } else if (LSB_ARRAY_IDX(job->jobId) > 0) {
        snprintf(tBuff, sizeof(tBuff), " [%d]", LSB_ARRAY_IDX(job->jobId));
    } else {
        tBuff[0] = '\0';
    }

    if (!(job->startTime && job->numExHosts)) {
        return;
    }

    /* Never report a start time that precedes the submission time. */
    if (job->startTime < job->submitTime)
        startTime = job->submitTime;
    else
        startTime = job->startTime;

    when = _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &startTime);

    if ((job->submit.options & SUB_PRE_EXEC)
        && (prtFlag != BJOBS_PRINT)) {
        if (prtFlag == BHIST_PRINT_PRE_EXEC) {
            if (tBuff[0] == '\0')
                snprintf(prline, sizeof(prline), "%s: %s",
                         when,
                         I18N(604, "The pre-exec command is started on")); /* catgets  604  */
            else
                snprintf(prline, sizeof(prline), "%s:%s, %s",
                         when,
                         tBuff,
                         I18N(605, "the pre-exec command is started on")); /* catgets  605  */
        } else {
            if (tBuff[0] == '\0')
                snprintf(prline, sizeof(prline), "%s: %s",
                         when,
                         I18N(606, "The batch job command is started on")); /*catgets 606 */
            else
                snprintf(prline, sizeof(prline), "%s:%s, %s",
                         when,
                         tBuff,
                         I18N(607, "the batch job command is started on")); /*catgets 607 */
        }
    } else {
        if (jobPid > 0) {
            if (tBuff[0] == '\0')
                snprintf(prline, sizeof(prline), "%s: %s",
                         when,
                         I18N(608, "Started on")); /* catgets  608  */
            else
                snprintf(prline, sizeof(prline), "%s:%s %s",
                         when,
                         tBuff,
                         I18N(609, "started on")); /* catgets  609  */
        } else {
            if (tBuff[0] == '\0')
                snprintf(prline, sizeof(prline), "%s: %s",
                         when,
                         I18N(610, "Dispatched to")); /* catgets 610 */
            else
                snprintf(prline, sizeof(prline), "%s: %s %s",
                         when,
                         tBuff,
                         I18N(611, "dispatched to")); /* catgets  611 */
        }
    }
    prtLineWUF(prline);

    if (job->numExHosts > 1) {
        snprintf(prline, sizeof(prline), " %d %s",
                 job->numExHosts,
                 I18N(612, "Hosts/Processors")); /* catgets  612  */
        prtLineWUF(prline);
    }

    if (hostList != NULL) {
        for (i = 0; i < hostList->listSize; i++) {
            snprintf(prline, sizeof(prline), " <%d*%s>",
                     hostList->counter[i], hostList->names[i]);
            prtLineWUF(prline);
        }
    } else {
        for (i = 0; i < job->numExHosts; i++) {
            snprintf(prline, sizeof(prline), " <%s>", job->exHosts[i]);
            prtLineWUF(prline);
        }
    }

    if (job->execHome && strcmp (job->execHome, "")) {
        snprintf(prline, sizeof(prline), ", %s <%s>",
                 I18N(615, "Execution Home"),  /* catgets 615 */
                 job->execHome);
        prtLineWUF(prline);
    }
    if (job->execCwd && strcmp (job->execCwd, "")) {
        snprintf(prline, sizeof(prline), ", %s <%s>",
                 I18N(616, "Execution CWD"), /* catgets 616 */
                 job->execCwd);
        prtLineWUF(prline);
    }
    /* The execution user is shown only when it differs from the submitter. */
    if (job->execUsername && strcmp(job->execUsername, "") &&
        strcmp(job->user, job->execUsername)) {
        snprintf(prline, sizeof(prline), ", %s <%s>",
                 I18N(617, "Execution user name"), /* catgets 617 */
                 job->execUsername);
        prtLineWUF(prline);
    }
    snprintf(prline, sizeof(prline), ";\n");
    prtLineWUF(prline);
}
Пример #30
0
static int
shouldStop (struct hostLoad *loadV,
	    struct jobCard *jobCard, int *reasons, int *subreasons, int num, int *stopmore)
{
    static char fname[] = "shouldStop";
    int i, numLoad = -1, j;
    struct hostLoad *load = NULL;
    static struct tclHostData tclHostData;
    static int first = TRUE;

    *reasons = 0;
    *subreasons = 0;


    if( jobCard->postJobStarted ) {
        return false;
    }


    if (jobCard->jobSpecs.jAttrib & JOB_URGENT_NOSTOP)
	return false;


    if (now - jobCard->windWarnTime < sbdSleepTime)
        return FALSE;


    if (!JOB_STARTED(jobCard))
        return FALSE;


    if (LS_ISUNAVAIL(loadV->status))
	return FALSE;
    if (num <= 0)
	return FALSE;


    for (i = 0; i <jobCard->jobSpecs.numToHosts && (*reasons) == 0; i++) {
        if (i > 0 && !strcmp (jobCard->jobSpecs.toHosts[i],
					     jobCard->jobSpecs.toHosts[i-1]))
            continue;
        numLoad++;
	load = NULL;
        for (j = 0; j < num; j ++) {
    	    if (equalHost_(jobCard->jobSpecs.toHosts[i], loadV[j].hostName)) {
	        load = &(loadV[j]);
	        break;
            }
        }
        if (load == NULL) {
	    ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5705,
		"%s: Can not find load information for host <%s>"), fname, jobCard->jobSpecs.toHosts[i]); /* catgets 5705 */
            return FALSE;
        }
        if (LS_ISLOCKEDU(load->status)
            && !(jobCard->jobSpecs.jAttrib & Q_ATTRIB_EXCLUSIVE)) {
            *reasons = SUSP_HOST_LOCK;
            *stopmore = TRUE;
        }
	else if (LS_ISLOCKEDM(load->status)) {
            *reasons = SUSP_HOST_LOCK_MASTER;
            *stopmore = TRUE;
        }
        else if (load->li[IT] <= jobCard->jobSpecs.thresholds.loadStop[numLoad][IT]
            && load->li[IT] != -INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][IT] != -INFINIT_LOAD) {
	    *reasons |= SUSP_LOAD_REASON;
            *subreasons = IT;
            *stopmore = TRUE;
        }
        else if (load->li[LS] >=
			  jobCard->jobSpecs.thresholds.loadStop[numLoad][LS]
            && load->li[LS] != INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][LS]
						      != INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = LS;
            *stopmore = TRUE;
        }
        else if (load->li[UT] >=
			 jobCard->jobSpecs.thresholds.loadStop[numLoad][UT]
            && load->li[UT] != INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][UT] !=
							   INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = UT;
        }
        else if(load->li[PG] >=
		      jobCard->jobSpecs.thresholds.loadStop[numLoad][PG]
            && load->li[PG] != INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][PG]
						    != INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = PG;
        }
        else if(load->li[IO] >=
		     jobCard->jobSpecs.thresholds.loadStop[numLoad][IO]
            && load->li[IO] != INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][IO]
						      != INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = IO;
        }
        else if(load->li[MEM]
			 <= jobCard->jobSpecs.thresholds.loadStop[numLoad][MEM]
            && load->li[MEM] != -INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][MEM]
						      != -INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = MEM;
        }

        else if(load->li[SWP]
			 <= jobCard->jobSpecs.thresholds.loadStop[numLoad][SWP]
            && load->li[SWP] != -INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][SWP]
						      != -INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = SWP;
        }
        else if(load->li[TMP]
			 <= jobCard->jobSpecs.thresholds.loadStop[numLoad][TMP]
            && load->li[TMP] != -INFINIT_LOAD
            && jobCard->jobSpecs.thresholds.loadStop[numLoad][TMP]
						      != -INFINIT_LOAD) {
            *reasons |= SUSP_LOAD_REASON;
            *subreasons = TMP;
        }

        for (j = R15S; !(*reasons) && j <= R15M; j++)
	    if ((load->li[j] != INFINIT_LOAD)
	        && (jobCard->jobSpecs.thresholds.loadStop[numLoad][j]
							 != INFINIT_LOAD)
	        && (load->li[j]
			>= jobCard->jobSpecs.thresholds.loadStop[numLoad][j])) {
	        *reasons |= SUSP_LOAD_REASON;
                *subreasons = j;
                break;
	    }


        for (j = MEM + 1; !(*reasons) &&
               j < MIN(allLsInfo->numIndx, jobCard->jobSpecs.thresholds.nIdx);
	              j++) {
            if (load->li[j] >= INFINIT_LOAD || load->li[j] <= -INFINIT_LOAD
                || jobCard->jobSpecs.thresholds.loadStop[numLoad][j]
							 >= INFINIT_LOAD
                || jobCard->jobSpecs.thresholds.loadStop[numLoad][j]
							 <= -INFINIT_LOAD) {
                continue;
            }
	    if (allLsInfo->resTable[j].orderType == INCR) {
	        if (load->li[j]
		       >= jobCard->jobSpecs.thresholds.loadStop[numLoad][j]) {
		    *reasons |= SUSP_LOAD_REASON;
                    *subreasons = j;
		    break;
                }
	    } else {
	        if (load->li[j]
		      <= jobCard->jobSpecs.thresholds.loadStop[numLoad][j]) {
		    *reasons |= SUSP_LOAD_REASON;
                    *subreasons = j;
		    break;
                }
	    }
        }

        if (!(*reasons) && jobCard->stopCondVal != NULL) {
            int returnCode;
            if (first == TRUE) {
                initTclHostData (&tclHostData);
                returnCode = getTclHostData (load, &tclHostData, FALSE);
                first = FALSE;
            } else {
                returnCode = getTclHostData (load, &tclHostData, TRUE);
            }
            if (returnCode >= 0
		     && evalResReq (jobCard->stopCondVal->selectStr,
    	       	        &tclHostData, DFT_FROMTYPE) == 1) {
        	*reasons |= SUSP_QUE_STOP_COND;
		break;
            }
        }
    }


    if (! (*reasons))
	return FALSE;


    if (LS_ISLOCKEDU(load->status) || LS_ISLOCKEDM(load->status)) {
	return TRUE;
    } else if (shouldStop1 (load)) {
        if (logclass & (LC_SCHED | LC_EXEC))
            ls_syslog (LOG_DEBUG2,
			"%s: Should stop job %s; reason=%x, subreasons=%d",
                        fname, lsb_jobid2str(jobCard->jobSpecs.jobId),
			*reasons, *subreasons);

        return TRUE;
    }
    return FALSE;

}