/*
 * readWaitReply - read one wait-reply record from cli_nios_fd[0].
 *
 * status: if non-NULL, receives the status carried in the reply.
 * ru:     if non-NULL, receives the resource usage carried in the reply.
 *
 * Returns the pid carried in the reply.  If the full fixed-size record
 * cannot be read, sets lserrno to LSE_MSG_SYS and returns -1.
 * The reply's pid is passed to tid_remove() before returning; the result
 * of that call is deliberately ignored.
 */
static int
readWaitReply(LS_WAIT_T *status, struct rusage *ru)
{
    struct lslibNiosWaitReply msg;

    /* A short read means the record is unusable. */
    if (b_read_fix(cli_nios_fd[0], (char *) &msg.r, sizeof(msg.r))
        != sizeof(msg.r)) {
        lserrno = LSE_MSG_SYS;
        return -1;
    }

    (void) tid_remove(msg.r.pid);

    /* Both output parameters are optional. */
    if (status != NULL) {
        LS_STATUS(*status) = msg.r.status;
    }
    if (ru != NULL) {
        *ru = msg.r.ru;
    }

    return msg.r.pid;
}
void child_handler(int sig) { int pid; LS_WAIT_T status; struct rusage rusage; register float cpuTime; struct lsfRusage lsfRusage; struct jobCard *jobCard; static short lastMbdExitVal = MASTER_NULL; static int sbd_finish_sleep = -1; cleanRusage (&rusage); now = time(0); while ((pid=wait3(&status, WNOHANG, &rusage)) > 0) { if (pid == mbdPid) { int sig = WTERMSIG(status); if (mbdExitCnt > 150) mbdExitCnt = 150; mbdExitVal = WIFSIGNALED(status); if (mbdExitVal) { ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5600, "mbatchd died with signal <%d> termination"), /* catgets 5600 */ sig); if (WCOREDUMP(status)) ls_syslog(LOG_ERR, _i18n_msg_get(ls_catd , NL_SETN, 5601, "mbatchd core dumped")); /* catgets 5601 */ mbdExitVal = sig; if (mbdExitVal == lastMbdExitVal) mbdExitCnt++; else { mbdExitCnt = 0; lastMbdExitVal = mbdExitVal; } continue; } else { mbdExitVal = WEXITSTATUS(status); if (mbdExitVal == lastMbdExitVal) mbdExitCnt++; else { mbdExitCnt = 0; lastMbdExitVal = mbdExitVal; } if (mbdExitVal == MASTER_RECONFIG) { ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5602, "mbatchd resigned for reconfiguration")); /* catgets 5602 */ start_master(); } else ls_syslog(LOG_NOTICE, _i18n_msg_get(ls_catd , NL_SETN, 5603, "mbatchd exited with value <%d>"), /* catgets 5603 */ mbdExitVal); continue; } } ls_ruunix2lsf (&rusage, &lsfRusage); cpuTime = lsfRusage.ru_utime + lsfRusage.ru_stime; for (jobCard = jobQueHead->forw; (jobCard != jobQueHead); jobCard = jobCard->forw) { if (jobCard->exitPid == pid) { jobCard->w_status = LS_STATUS(status); jobCard->exitPid = -1; if (logclass & LC_EXEC) { ls_syslog(LOG_DEBUG, I18N(5604, "child_handler: Job <%s> exitPid <%d> status <%d> exitcode <%d>"),/*catgets 5604*/ lsb_jobid2str(jobCard->jobSpecs.jobId), pid, jobCard->w_status, WEXITSTATUS(status)); } } if (jobCard->jobSpecs.jobPid == pid) { jobCard->collectedChild = TRUE; jobCard->cpuTime = cpuTime; jobCard->w_status = LS_STATUS(status); jobCard->exitPid = -1; memcpy ((char *) 
&jobCard->lsfRusage, (char *) &lsfRusage, sizeof (struct lsfRusage)); jobCard->notReported++; if (sbd_finish_sleep < 0) { if (daemonParams[LSB_SBD_FINISH_SLEEP].paramValue) { errno = 0; sbd_finish_sleep = atoi(daemonParams[LSB_SBD_FINISH_SLEEP].paramValue); if (errno) sbd_finish_sleep = 0; } else { sbd_finish_sleep=0; } } if (sbd_finish_sleep > 0) { millisleep_(sbd_finish_sleep); } if (logclass & LC_EXEC) { ls_syslog(LOG_DEBUG, I18N(5605, "child_handler: Job <%s> Pid <%d> status <%d> exitcode <%d>"), /*catgets 5605*/ lsb_jobid2str(jobCard->jobSpecs.jobId), pid, jobCard->w_status, WEXITSTATUS(status)); } need_checkfinish = TRUE; break; } } } }
void prtJobFinish(struct jobInfoEnt *job, struct jobInfoHead *jInfoH) { char prline[MSGSIZE]; time_t doneTime; static struct loadIndexLog *loadIndex = NULL; char *pendReasons; if (loadIndex == NULL) TIMEIT(1, loadIndex = initLoadIndex(), "initLoadIndex"); doneTime = job->endTime; switch (job->status) { case JOB_STAT_DONE: case (JOB_STAT_DONE | JOB_STAT_PDONE): case (JOB_STAT_DONE | JOB_STAT_PERR): if ((job->startTime < job->submitTime) && (job->endTime < (job->submitTime + (time_t) MAX(job->cpuTime, MIN_CPU_TIME)))) { doneTime = job->submitTime + (time_t) MAX(job->cpuTime, 0.0001); } else if (job->startTime >= job->submitTime && job->endTime < (job->startTime + (time_t)MAX(job->cpuTime, 0.0001)) && job->numExHosts == 1) { doneTime = job->startTime + (time_t) MAX(job->cpuTime, 0.0001); if (job->endTime <= doneTime) { doneTime = job->endTime; } } case (JOB_STAT_EXIT | JOB_STAT_PDONE): case (JOB_STAT_EXIT | JOB_STAT_PERR): case JOB_STAT_EXIT: if (job->reasons & EXIT_ZOMBIE) { sprintf(prline, "%s: ", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &job->endTime)); prtLineWUF(prline); sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,623, "Termination request issued; the job will be killed once the host is ok;"))); /* catgets 623 */ prtLineWUF(prline); break; } sprintf(prline, "%s: ", _i18n_ctime(ls_catd, CTIME_FORMAT_a_b_d_T, &doneTime)); prtLineWUF(prline); if (strcmp(get_status(job), "DONE") == 0) { sprintf(prline, I18N(624, "Done successfully.")); /* catgets 624 */ } else { LS_WAIT_T wStatus; LS_STATUS(wStatus) = job->exitStatus; if (job->cpuTime >= MIN_CPU_TIME && job->exitStatus) { if (WEXITSTATUS(wStatus)) sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,626, "Exited with exit code %d.")), /* catgets 626 */ WEXITSTATUS(wStatus)); else sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,627, "Exited by signal %d.")), WTERMSIG(wStatus)); /* catgets 627 */ } else sprintf(prline, I18N_Exited); } prtLineWUF(prline); if (job->numExHosts > 0) { if (job->cpuTime < MIN_CPU_TIME) 
sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,629, " The CPU time used is unknown.\n"))); /* catgets 629 */ else sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,630, " The CPU time used is %1.1f seconds.\n")), /* catgets 630 */ job->cpuTime); } else { sprintf(prline, "\n"); } prtLineWUF(prline); break; case JOB_STAT_PSUSP: case JOB_STAT_PEND: sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,631, " PENDING REASONS:\n"))); /* catgets 631 */ prtLineWUF(prline); pendReasons = lsb_pendreason(job->numReasons, job->reasonTb, jInfoH, loadIndex); prtLineWUF(pendReasons); break; case JOB_STAT_SSUSP: case JOB_STAT_USUSP: TIMEIT(1, prtJobRusage(job), "prtJobRusage"); sprintf(prline, (_i18n_msg_get(ls_catd,NL_SETN,632, " SUSPENDING REASONS:\n"))); /* catgets 632 */ prtLineWUF(prline); if (job->reasons) { sprintf(prline, "%s", lsb_suspreason(job->reasons, job->subreasons, loadIndex)); prtLineWUF(prline); } break; case JOB_STAT_RUN: TIMEIT(1, prtJobRusage(job), "prtJobRusage"); break; default: break; } }