static int post_config(void) { /* char *id = "post_config"; */ /* Set up per-queue primetime enforcement. */ if (schd_BatchQueues) schd_reset_observed_pt(schd_BatchQueues); if (schd_ExternQueues) schd_reset_observed_pt(schd_ExternQueues); /* Post processing complete. */ return (1); }
/* ARGSUSED */ int schd_req(int cmd) { char *id = "schd_req"; Job *this, *jobs = NULL; QueueList *qptr, *next; QueueList *normalQs = NULL, *normalQtail = NULL, *newqlp; Outage *outages; int ran, error, total_ran = 0; int hosts_in_dedtime = 0; struct tm *tm_ptr; char reason[MAX_TXT + 1]; /* Save "last" run time (in global 'schd_TimeNow') for later use. */ schd_TimeLast = schd_TimeNow; /* * Get the number of seconds since the Epoch, and break it down into * the various day, month, year, fields in a struct tm. */ time(&schd_TimeNow); if (tm_ptr = localtime(&schd_TimeNow)) memcpy((void *) & schd_TmNow, (void *)tm_ptr, sizeof(struct tm)); else memset((void *)&schd_TmNow, 0, sizeof(struct tm)); DBPRT(("[time_t %d] %s", schd_TimeNow, ctime(&schd_TimeNow))); /* * If the configuration file has been changed since the last time the * scheduler was run, than note that in the logs. Don't re-read it * automatically, just note the fact. Don't reset the timestamp - it * will be done when someone finally HUP's the scheduler. */ if (schd_CfgFilename && schd_file_has_changed(schd_CfgFilename, 0)) { (void)sprintf(log_buffer, "WARNING!!! Scheduler config file %s has changed!", schd_CfgFilename); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); (void)sprintf(log_buffer, "Run 'kill -HUP %ld' to reconfigure.", (long)getpid()); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } /* * See if the holidays file has changed. If it's re-read successfully, * update the last changed timestamp. Otherwise, keep it around and * keep trying to re-read it until someone fixes the problem. "This * shouldn't happen." */ if (schd_file_has_changed(HOLIDAYS_FILE, 0) > 0) { (void)sprintf(log_buffer, "Attempting to update holidays/primetime from %s.", HOLIDAYS_FILE); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s\n", log_buffer)); if (schd_read_holidays() < 0) { (void)sprintf(log_buffer, "Failed to read holidays file."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s\n", log_buffer)); } else { /* Reset the "last changed time", since it was re-read okay. */ (void)schd_file_has_changed(HOLIDAYS_FILE, 1); } } /* * If this is the first run during non-primetime, set all the execution * queues' observed primetime back to 'on'. If it's primetime now, set * the "last run in primetime" global. */ if (schd_ENFORCE_PRIME_TIME && schd_TimeNow >= schd_ENFORCE_PRIME_TIME) { if (schd_prime_time(0)) { last_run_in_pt = 1; } else if (last_run_in_pt) { DBPRT(("%s: First non-pt run, reset queue observed times.\n", id)); if (schd_BatchQueues) schd_reset_observed_pt(schd_BatchQueues); if (schd_ExternQueues) schd_reset_observed_pt(schd_ExternQueues); /* Last run was not in prime time. */ last_run_in_pt = 0; } } /* Get the current list of all jobs known to our server. * Sort these based on several criteria including recent * past usage, and then populate the schd_AllJobs list with * these sorted jobs */ jobs = schd_get_jobs(NULL, NULL); /* * Check for queued jobs on any of the run queues. This may happen if * there is some glitch and the POSIX jobs are checkpointed. * schedule_restart() will return non-zero if it finds and restarts * any jobs. Recycle if this is the case. */ if (schd_SCHED_RESTART_ACTION != SCHD_RESTART_NONE) { if (schedule_restart(jobs)) { schd_free_jobs(jobs); return (0); } } /* * Reorder the list of jobs returned by the server. Note that the jobs * are reordered "in situ". The sorting routine returns a pointer to * the new head of the list created by relinking the elements of the * linked list, or NULL if an error occurs. Zero the original list * pointer to reduce confusion - the same list, in different order, now * lives on schd_AllJobs. */ schd_AllJobs = schd_sort_jobs(jobs); jobs = NULL; /* * Get the queue limits and utilization for each queue about which the * scheduler knows. Any jobs on schd_AllJobs (set by get_and_sort_jobs() * above) that belong to the queue will be placed on the queue->jobs * list. * * If PBS fails to provide us any information about a queue, treat it * as a fatal error. If a queue has failed the sanity checks, qsane * will be set to */ error = get_all_queue_info(5 /* Number of queue lists */, schd_SubmitQueue, schd_BatchQueues, schd_DedQueues, schd_SpecialQueue, schd_ExternQueues); if (error < 0) { DBPRT(("get_all_queue_info() failed\n")); return (1); /* Bogus queue - don't recycle. */ } else if (error > 0) { DBPRT(("queue failed sanity check - wait and recycle.\n")); sleep(WAIT_FOR_QUEUE_SANITY); return (0); /* Attempt to recycle scheduler. */ } #ifdef NODEMASK /* * Prevent a case where two queues would have overlapping nodemasks. */ if (nodemask_overlaps()) { DBPRT(("nodemask overlap found. bailing.\n")); return(1); /* Don't bother trying to recycle. */ } #endif /* NODEMASK */ /* * Due to queues "claiming" the jobs from schd_AllJobs for which they * are responsible, the special jobs will be left enqueued on the * special queue, not the submit queue. This is correct behavior, but * not exactly what is needed. fixup_special() dequeues the jobs from * the special queue, marks them as special, and places them at the * head of the submit queue's list. * One could argue, successfully, that this is a crock. It is, in * fact, more of a work around for a misfeature. */ if (schd_SpecialQueue && schd_SpecialQueue->queue->queued) { if (fixup_special() < 0) { DBPRT(("%s: fixup_special() failed\n", id)); return (1); } } /* * At this point, schd_AllJobs should hold only orphan jobs (i.e. only * jobs that belong to queues about which the scheduler does not care). * Note it and go on scheduling -- unless nothing is being scheduled, * this is more-or-less meaningless. */ if (schd_AllJobs) { (void)sprintf(log_buffer, "Some jobs not claimed by queues."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n%s: Unclaimed jobs: ", id, log_buffer, id)); #ifdef DEBUG for (this = schd_AllJobs; this != NULL; this = this->next) { DBPRT(("%s%s", this->jobid, this->next ? ", " : "")); } DBPRT(("\n")); #endif /* DEBUG */ } /* Dump the list of jobs being scheduled from submit queue. */ if (schd_JOB_DUMPFILE) { (void)sprintf(log_buffer, "Dumping sorted job information to %s", schd_JOB_DUMPFILE); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); make_job_dump(schd_JOB_DUMPFILE); } /* * Allocation and usage information are updated at [roughly] 2:00 AM * (Eastern time). Since they may have been updated, attempt to fetch * them again in the middle of the night. */ if (schd_NeedToGetDecayInfo) schd_decay_info("r"); /* get users' recent past usage */ if (schd_ENFORCE_ALLOCATION && schd_TimeNow >= schd_ENFORCE_ALLOCATION) { /* * If the allocations file has already been loaded, consult the file * timestamp to determine if it has changed. If so, flag that it * needs to be reloaded. */ if (!schd_NeedToGetAllocInfo && schd_AllocFilename) schd_NeedToGetAllocInfo = schd_file_has_changed(schd_AllocFilename, 1); if (!schd_NeedToGetYTDInfo && schd_CurrentFilename) schd_NeedToGetYTDInfo = schd_file_has_changed(schd_CurrentFilename, 1); /* If either file needs to be [re]loaded, do so. */ if (schd_NeedToGetAllocInfo || schd_NeedToGetYTDInfo) schd_alloc_info(); } /* * We need to save the past usage data periodically, so that a restart * of pbs_sched doesn't lose it ... */ if (schd_save_decay()) /* is it time yet ? */ schd_decay_info("w"); /* yep, so do it */ if (schd_SubmitQueue->queue->jobs && !(schd_SubmitQueue->queue->flags & (QFLAGS_DISABLED | QFLAGS_STOPPED))) { /* * Test each job against the set of execution queues. If it can * never be run in any queue, reject it immediately. This saves * the user having to wait for the scheduler to get around to being * able to run it. */ jobs = reject_unrunnables(schd_SubmitQueue->queue->jobs); /* * Look for queues whose execution hosts are in dedicated time. If * any are found, note that fact and continue. Otherwise, add them * to the normalQs list, which will be scheduled normally. If the * flag is set indicating that one or more hosts is in dedtime, they * will be scheduled after everything else is done. */ for (qptr = schd_BatchQueues; qptr != NULL; qptr = qptr->next) { if (schd_ENFORCE_DEDTIME && schd_TimeNow >= schd_ENFORCE_DEDTIME) outages = schd_host_outage(qptr->queue->exechost, 0); else outages = NULL; /* * Is there a scheduled outage right now for this host? If so, * note that fact and continue to the next queue. All of this * information is cached, so this isn't as expensive as it seems. */ if (outages != NULL) { if ((outages->beg_time <= schd_TimeNow) && (outages->end_time > schd_TimeNow)) { DBPRT(("%s: Host %s is in dedtime (from %s:%s to %s:%s)\n", id, outages->exechost, outages->beg_datestr, outages->beg_timestr, outages->end_datestr, outages->end_timestr)); DBPRT(("%s: Queue %s@%s will not be scheduled.\n", id, qptr->queue->qname, qptr->queue->exechost)); /* This exechost is in dedicated time, ignore the queue. */ hosts_in_dedtime ++; continue; } else if (outages->beg_time > schd_TimeNow) { /* Upcoming dedtime, but not yet. Schedule the queue. */ DBPRT(("%s: Host %s upcoming dedtime (at %s:%s to %s:%s)\n", id, outages->exechost, outages->beg_datestr, outages->beg_timestr, outages->end_datestr, outages->end_timestr)); } } /* * This host is not currently in dedicated time. Add it to the * tail of the list of queues to be scheduled. */ newqlp = (QueueList *)malloc(sizeof(QueueList)); if (newqlp == NULL) { (void)sprintf(log_buffer, "malloc(QueueList) for %s@%s failed", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); if (normalQs) schd_free_qlist(normalQs); return (1); } newqlp->queue = qptr->queue; if (normalQtail) normalQtail->next = newqlp; else normalQs = newqlp; normalQtail = newqlp; newqlp->next = NULL; } DBPRT(("%s: calling schedule_jobs(", id)); if (normalQs) { for (qptr = normalQs; qptr != NULL; qptr = qptr->next) DBPRT(("%s@%s%s", qptr->queue->qname, qptr->queue->exechost, qptr->next ? ", " : "")); } else { DBPRT(("<no batch queues>")); } DBPRT((")\n")); total_ran += ran = schedule_jobs(normalQs, jobs, reason); if (ran < 0) { DBPRT(("Could not run any jobs!\n")); } else { DBPRT(("RAN %d jobs.\n", ran)); } if (normalQs) schd_free_qlist(normalQs); normalQs = normalQtail = NULL; } /* * If there are any externally-routed queues, schedule any jobs * that are enqueued in them. */ for (qptr = schd_ExternQueues; qptr != NULL; qptr = qptr->next) { if (qptr->queue->queued == 0) continue; (void)sprintf(log_buffer, "Scheduling external queue %s@%s ...", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); /* * Keep track of the next pointer. Zero it so that each queue * looks like a single queue to schd_pack_queues(). */ next = qptr->next; qptr->next = NULL; ran = schd_pack_queues(qptr->queue->jobs, qptr, reason); if (ran < 0) { (void)sprintf(log_buffer, "sched_pack_queues(%s@%s) failed!", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } else { DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname, qptr->queue->exechost)); total_ran += ran; } /* Replace the zero'd next pointer to rechain the list. */ qptr->next = next; } /* * Now check the dedtime queues with queued jobs for hosts that are * in dedicated time. If any are found, comment the jobs appropriately * and/or schedule them. */ for (qptr = schd_DedQueues; qptr != NULL; qptr = qptr->next) { if (qptr->queue->queued == 0) continue; DBPRT(("%s: schd_handle_dedicated_time(%s)\n", id, qptr->queue->qname)); /* * Keep track of the next pointer, and zero the queue's next ptr so * it looks like a single queue. */ next = qptr->next; qptr->next = NULL; ran = schd_handle_dedicated_time(qptr->queue); if (ran < 0) { (void)sprintf(log_buffer, "schd_handle_dedicated_time(%s@%s) failed!", qptr->queue->qname, qptr->queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } else { DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname, qptr->queue->exechost)); total_ran += ran; } /* Replace the zero'd next pointer to rechain the list. */ qptr->next = next; } /* * Attempt to revoke any unused HPM counters that are still in user * mode. Returns number of errors encountered. This should be zero * for a healthy system. */ if (schd_MANAGE_HPM) { if (schd_revoke_hpm()) { (void)sprintf(log_buffer, "Failed to revoke unused HPM counters!"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s\n", log_buffer)); } } if (total_ran > 0) { (void)sprintf(log_buffer, "System resources after scheduling:"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); schd_dump_rsrclist(); } (void)sprintf(log_buffer, ">>> End Scheduling Cycle (ran %d jobs) <<<", total_ran); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s\n", log_buffer)); return (1); }