Exemple #1
0
static int
readWaitReply(LS_WAIT_T *status, struct rusage *ru)
{
    struct lslibNiosWaitReply reply;

    if (b_read_fix(cli_nios_fd[0], (char *) &reply.r, sizeof(reply.r))
	!= sizeof(reply.r)) {
	lserrno = LSE_MSG_SYS;
	return -1;
    }
    (void) tid_remove(reply.r.pid);
    if (status)
	LS_STATUS(*status) = reply.r.status;
    if (ru)
	*ru = reply.r.ru;

    return (reply.r.pid);
}
Exemple #2
0
void
child_handler(int sig)
{
    int             pid;
    LS_WAIT_T       status;
    struct rusage   rusage;
    register float  cpuTime;
    struct lsfRusage lsfRusage;
    struct jobCard *jobCard;
    static short lastMbdExitVal = MASTER_NULL;
    static int sbd_finish_sleep = -1;

    cleanRusage (&rusage);
    now = time(0);
    while ((pid=wait3(&status, WNOHANG, &rusage)) > 0) {
        if (pid == mbdPid) {
            int sig = WTERMSIG(status);
            if (mbdExitCnt > 150)
                mbdExitCnt = 150;
            mbdExitVal = WIFSIGNALED(status);
            if (mbdExitVal) {
                ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5600,
                    "mbatchd died with signal <%d> termination"), /* catgets 5600 */
                    sig);
                if (WCOREDUMP(status))
                    ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5601,
                        "mbatchd core dumped")); /* catgets 5601 */
                mbdExitVal = sig;
                if (mbdExitVal == lastMbdExitVal)
                    mbdExitCnt++;
                else {
                    mbdExitCnt = 0;
                    lastMbdExitVal = mbdExitVal;
                }
                continue;
            } else {
                mbdExitVal = WEXITSTATUS(status);

                if (mbdExitVal == lastMbdExitVal)
                    mbdExitCnt++;
                else {
                    mbdExitCnt = 0;
                    lastMbdExitVal = mbdExitVal;
                }
                if (mbdExitVal == MASTER_RECONFIG) {
                    ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5602,
                        "mbatchd resigned for reconfiguration")); /* catgets 5602 */
                    start_master();
                } else
                    ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5603,
                        "mbatchd exited with value <%d>"),  /* catgets 5603 */
                        mbdExitVal);
                continue;
            }
        }

        ls_ruunix2lsf (&rusage, &lsfRusage);
        cpuTime = lsfRusage.ru_utime + lsfRusage.ru_stime;

        for (jobCard = jobQueHead->forw; (jobCard != jobQueHead);
             jobCard = jobCard->forw) {

            if (jobCard->exitPid == pid) {
                jobCard->w_status = LS_STATUS(status);
                jobCard->exitPid = -1;
                if (logclass & LC_EXEC) {
                    ls_syslog(LOG_DEBUG, I18N(5604,
                              "child_handler: Job <%s> exitPid <%d> status <%d> exitcode <%d>"),/*catgets 5604*/
                              lsb_jobid2str(jobCard->jobSpecs.jobId),
                              pid, jobCard->w_status,
                              WEXITSTATUS(status));
                }
            }

            if (jobCard->jobSpecs.jobPid == pid) {
                jobCard->collectedChild = TRUE;
                jobCard->cpuTime = cpuTime;
                jobCard->w_status = LS_STATUS(status);
                jobCard->exitPid = -1;
                memcpy ((char *) &jobCard->lsfRusage, (char *) &lsfRusage,
                        sizeof (struct lsfRusage));
                jobCard->notReported++;



                if (sbd_finish_sleep < 0) {
                    if (daemonParams[LSB_SBD_FINISH_SLEEP].paramValue) {
                        errno = 0;
                        sbd_finish_sleep = atoi(daemonParams[LSB_SBD_FINISH_SLEEP].paramValue);
                        if (errno)
                            sbd_finish_sleep = 0;
                    } else {
                        sbd_finish_sleep=0;
                    }
                }
                if (sbd_finish_sleep > 0) {
                    millisleep_(sbd_finish_sleep);
                }

                if (logclass & LC_EXEC) {
                    ls_syslog(LOG_DEBUG, I18N(5605,
                              "child_handler: Job <%s> Pid <%d> status <%d> exitcode <%d>"), /*catgets 5605*/
                              lsb_jobid2str(jobCard->jobSpecs.jobId), pid,
                              jobCard->w_status, WEXITSTATUS(status));
                }
                need_checkfinish = TRUE;

                break;
            }
        }
    }


}
Exemple #3
0
void
prtJobFinish(struct jobInfoEnt *job, struct jobInfoHead *jInfoH)
{
    char prline[MSGSIZE];
    time_t doneTime;
    static struct loadIndexLog *loadIndex = NULL;
    char *pendReasons;

    if (loadIndex == NULL)
	TIMEIT(1, loadIndex = initLoadIndex(), "initLoadIndex");

    doneTime = job->endTime;

    switch (job->status) {
    case JOB_STAT_DONE:
    case (JOB_STAT_DONE | JOB_STAT_PDONE):
    case (JOB_STAT_DONE | JOB_STAT_PERR):

        if ((job->startTime < job->submitTime)
		 && (job->endTime < (job->submitTime
				     + (time_t) MAX(job->cpuTime, MIN_CPU_TIME)))) {
            doneTime = job->submitTime +
		(time_t) MAX(job->cpuTime, 0.0001);
        } else if (job->startTime >= job->submitTime &&
		 job->endTime < (job->startTime +
				 (time_t)MAX(job->cpuTime, 0.0001)) &&
		 job->numExHosts == 1) {

	    doneTime = job->startTime + (time_t) MAX(job->cpuTime, 0.0001);

	    if (job->endTime <= doneTime) {
                doneTime = job->endTime;
	    }


        }

    case (JOB_STAT_EXIT | JOB_STAT_PDONE):
    case (JOB_STAT_EXIT | JOB_STAT_PERR):
    case JOB_STAT_EXIT:
        if (job->reasons & EXIT_ZOMBIE) {
	    sprintf(prline, "%s: ",
		    _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->endTime));
	    prtLineWUF(prline);
	    sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,623, "Termination request issued; the job will be killed once the host is ok;"))); /* catgets  623  */
	    prtLineWUF(prline);
	    break;
        }
        sprintf(prline, "%s: ",
		_i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &doneTime));
        prtLineWUF(prline);
        if (strcmp(get_status(job), "DONE") == 0)
	{
	    sprintf(prline, I18N(624, "Done successfully.")); /* catgets 624 */
        }
        else {
	    LS_WAIT_T wStatus;

	    LS_STATUS(wStatus) = job->exitStatus;

	    if (job->cpuTime >= MIN_CPU_TIME && job->exitStatus) {
		if (WEXITSTATUS(wStatus))
	            sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,626, "Exited with exit code %d.")), /* catgets  626  */
			WEXITSTATUS(wStatus));
                else
		    sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,627, "Exited by signal %d.")), WTERMSIG(wStatus)); /* catgets  627  */
            } else
		sprintf(prline, I18N_Exited);
	}

        prtLineWUF(prline);

	if (job->numExHosts > 0) {
	    if (job->cpuTime < MIN_CPU_TIME)
		sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,629, " The CPU time used is unknown.\n"))); /* catgets  629  */
	    else
		sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,630, " The CPU time used is %1.1f seconds.\n")),  /* catgets  630  */
			job->cpuTime);
	} else {
	    sprintf(prline, "\n");
	}

	prtLineWUF(prline);
        break;
    case JOB_STAT_PSUSP:
    case JOB_STAT_PEND:
        sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,631, " PENDING REASONS:\n"))); /* catgets  631  */
        prtLineWUF(prline);
	pendReasons = lsb_pendreason(job->numReasons, job->reasonTb,
				     jInfoH, loadIndex);
	prtLineWUF(pendReasons);
        break;
    case JOB_STAT_SSUSP:
    case JOB_STAT_USUSP:


        TIMEIT(1, prtJobRusage(job), "prtJobRusage");

        sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,632, " SUSPENDING REASONS:\n"))); /* catgets  632  */
        prtLineWUF(prline);

        if (job->reasons) {
            sprintf(prline, "%s", lsb_suspreason(job->reasons,
						 job->subreasons,
						 loadIndex));
            prtLineWUF(prline);
        }
        break;
    case JOB_STAT_RUN:

        TIMEIT(1, prtJobRusage(job), "prtJobRusage");
        break;
    default:
        break;
    }
}