Ejemplo n.º 1
0
/*
 * post_config - finishing step run after the configuration is in place.
 *
 * Sets up per-queue primetime enforcement by calling
 * schd_reset_observed_pt() on the batch queue list and on the external
 * queue list, skipping whichever list is empty (NULL).
 *
 * Returns: always 1.
 */
static int
post_config(void)
  {
  /* Batch queues first ... */
  if (schd_BatchQueues != NULL)
    {
    schd_reset_observed_pt(schd_BatchQueues);
    }

  /* ... then the externally-routed queues. */
  if (schd_ExternQueues != NULL)
    {
    schd_reset_observed_pt(schd_ExternQueues);
    }

  /* Nothing here can fail; report post-processing as complete. */
  return 1;
  }
Ejemplo n.º 2
0
/*
 * schd_req - run one scheduling cycle.
 *
 * In order, this routine:
 *   - saves the previous run time and refreshes schd_TimeNow/schd_TmNow;
 *   - logs a warning if the scheduler config file has changed, and
 *     re-reads the holidays file if that has changed;
 *   - resets the queues' observed primetime on the first non-primetime
 *     run following a primetime run;
 *   - fetches the server's jobs, optionally restarts queued jobs found on
 *     run queues, sorts the jobs and loads information for every queue;
 *   - refreshes decay/allocation/usage information when flagged;
 *   - schedules the submit queue's jobs onto the batch queues whose hosts
 *     are not currently in dedicated time, then the external queues, then
 *     the dedicated-time queues;
 *   - optionally revokes unused HPM counters and logs a cycle summary.
 *
 * Parameters:
 *   cmd - not referenced by this function.
 *
 * Returns:
 *   0 when the cycle is cut short and should be "recycled" (jobs were
 *     restarted by schedule_restart(), or a queue failed its sanity check);
 *   1 in every other case, including error paths that are not retried.
 *   NOTE(review): how the caller acts on 0 vs. 1 is not visible here -
 *   confirm against the caller.
 */
/* ARGSUSED */
int
schd_req(int cmd)
  {
  char   *id = "schd_req";
  Job    *this, *jobs = NULL;
  QueueList *qptr, *next;
  QueueList *normalQs = NULL, *normalQtail = NULL, *newqlp;
  Outage *outages;
  int     ran, error, total_ran = 0;
  int     hosts_in_dedtime = 0;

  struct tm *tm_ptr;
  char    reason[MAX_TXT + 1];

  /* Save "last" run time (in global 'schd_TimeNow') for later use. */
  schd_TimeLast = schd_TimeNow;

  /*
   * Get the number of seconds since the Epoch, and break it down into
   * the various day, month, year, fields in a struct tm.  If localtime()
   * fails, fall back to an all-zero struct tm.
   */
  time(&schd_TimeNow);

  if ((tm_ptr = localtime(&schd_TimeNow)) != NULL)
    memcpy((void *) & schd_TmNow, (void *)tm_ptr, sizeof(struct tm));
  else
    memset((void *)&schd_TmNow, 0, sizeof(struct tm));

  /* time_t is not necessarily an int: cast so the format matches. */
  DBPRT(("[time_t %ld] %s", (long)schd_TimeNow, ctime(&schd_TimeNow)));

  /*
   * If the configuration file has been changed since the last time the
   * scheduler was run, then note that in the logs.  Don't re-read it
   * automatically, just note the fact.  Don't reset the timestamp - it
   * will be done when someone finally HUP's the scheduler.
   */
  if (schd_CfgFilename && schd_file_has_changed(schd_CfgFilename, 0))
    {
    (void)sprintf(log_buffer,
                  "WARNING!!!  Scheduler config file %s has changed!",
                  schd_CfgFilename);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    (void)sprintf(log_buffer, "Run 'kill -HUP %ld' to reconfigure.",
                  (long)getpid());
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  /*
   * See if the holidays file has changed.  If it's re-read successfully,
   * update the last changed timestamp.  Otherwise, keep it around and
   * keep trying to re-read it until someone fixes the problem.  "This
   * shouldn't happen."
   */
  if (schd_file_has_changed(HOLIDAYS_FILE, 0) > 0)
    {
    (void)sprintf(log_buffer,
                  "Attempting to update holidays/primetime from %s.", HOLIDAYS_FILE);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s\n", log_buffer));

    if (schd_read_holidays() < 0)
      {
      (void)sprintf(log_buffer, "Failed to read holidays file.");
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s\n", log_buffer));
      }
    else
      {
      /* Reset the "last changed time", since it was re-read okay. */
      (void)schd_file_has_changed(HOLIDAYS_FILE, 1);
      }
    }

  /*
   * If this is the first run during non-primetime, set all the execution
   * queues' observed primetime back to 'on'.  If it's primetime now, set
   * the "last run in primetime" global.
   */
  if (schd_ENFORCE_PRIME_TIME && schd_TimeNow >= schd_ENFORCE_PRIME_TIME)
    {
    if (schd_prime_time(0))
      {
      last_run_in_pt = 1;

      }
    else if (last_run_in_pt)
      {
      DBPRT(("%s: First non-pt run, reset queue observed times.\n", id));

      if (schd_BatchQueues)
        schd_reset_observed_pt(schd_BatchQueues);

      if (schd_ExternQueues)
        schd_reset_observed_pt(schd_ExternQueues);

      /* Last run was not in prime time. */
      last_run_in_pt = 0;
      }
    }

  /*
   * Get the current list of all jobs known to our server.  The list is
   * sorted further below (after the optional restart check) and then
   * stored on schd_AllJobs.
   */
  jobs = schd_get_jobs(NULL, NULL);

  /*
   * Check for queued jobs on any of the run queues.  This may happen if
   * there is some glitch and the POSIX jobs are checkpointed.
   * schedule_restart() will return non-zero if it finds and restarts
   * any jobs.  Recycle if this is the case.
   */
  if (schd_SCHED_RESTART_ACTION != SCHD_RESTART_NONE)
    {
    if (schedule_restart(jobs))
      {
      schd_free_jobs(jobs);
      return (0);
      }
    }

  /*
   * Reorder the list of jobs returned by the server.  Note that the jobs
   * are reordered "in situ".  The sorting routine returns a pointer to
   * the new head of the list created by relinking the elements of the
   * linked list, or NULL if an error occurs.  Zero the original list
   * pointer to reduce confusion - the same list, in different order, now
   * lives on schd_AllJobs.
   */
  schd_AllJobs = schd_sort_jobs(jobs);

  jobs = NULL;

  /*
   * Get the queue limits and utilization for each queue about which the
   * scheduler knows.  Any jobs on schd_AllJobs (set from schd_sort_jobs()
   * above) that belong to the queue will be placed on the queue->jobs
   * list.
   *
   * A negative return is treated as fatal for this cycle (bogus queue,
   * no recycle).  A positive return means a queue failed its sanity
   * check: wait WAIT_FOR_QUEUE_SANITY seconds and ask to be recycled.
   */

  error = get_all_queue_info(5 /* Number of queue lists */,
                             schd_SubmitQueue,
                             schd_BatchQueues,
                             schd_DedQueues,
                             schd_SpecialQueue,
                             schd_ExternQueues);

  if (error < 0)
    {
    DBPRT(("get_all_queue_info() failed\n"));

    return (1); /* Bogus queue - don't recycle. */

    }
  else if (error > 0)
    {
    DBPRT(("queue failed sanity check - wait and recycle.\n"));

    sleep(WAIT_FOR_QUEUE_SANITY);

    return (0); /* Attempt to recycle scheduler. */
    }

#ifdef NODEMASK
  /*
   * Prevent a case where two queues would have overlapping nodemasks.
   */
  if (nodemask_overlaps())
    {
    DBPRT(("nodemask overlap found.  bailing.\n"));

    return(1); /* Don't bother trying to recycle. */
    }

#endif /* NODEMASK */

  /*
   * Due to queues "claiming" the jobs from schd_AllJobs for which they
   * are responsible, the special jobs will be left enqueued on the
   * special queue, not the submit queue.  This is correct behavior, but
   * not exactly what is needed.  fixup_special() dequeues the jobs from
   * the special queue, marks them as special, and places them at the
   * head of the submit queue's list.
   * One could argue, successfully, that this is a crock.  It is, in
   * fact, more of a work around for a misfeature.
   */
  if (schd_SpecialQueue && schd_SpecialQueue->queue->queued)
    {
    if (fixup_special() < 0)
      {
      DBPRT(("%s: fixup_special() failed\n", id));
      return (1);
      }
    }

  /*
   * At this point, schd_AllJobs should hold only orphan jobs (i.e. only
   * jobs that belong to queues about which the scheduler does not care).
   * Note it and go on scheduling -- unless nothing is being scheduled,
   * this is more-or-less meaningless.
   */
  if (schd_AllJobs)
    {
    (void)sprintf(log_buffer, "Some jobs not claimed by queues.");
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n%s: Unclaimed jobs: ", id, log_buffer, id));
#ifdef DEBUG

    for (this = schd_AllJobs; this != NULL; this = this->next)
      {
      DBPRT(("%s%s", this->jobid, this->next ? ", " : ""));
      }

    DBPRT(("\n"));

#endif /* DEBUG */
    }

  /* Dump the list of jobs being scheduled from submit queue. */
  if (schd_JOB_DUMPFILE)
    {
    (void)sprintf(log_buffer, "Dumping sorted job information to %s",
                  schd_JOB_DUMPFILE);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    make_job_dump(schd_JOB_DUMPFILE);
    }

  /*
   * Allocation and usage information are updated at [roughly] 2:00 AM
   * (Eastern time).  Since they may have been updated, attempt to fetch
   * them again in the middle of the night.
   */

  if (schd_NeedToGetDecayInfo)
    schd_decay_info("r");    /* get users' recent past usage */

  if (schd_ENFORCE_ALLOCATION && schd_TimeNow >= schd_ENFORCE_ALLOCATION)
    {
    /*
     * If the allocations file has already been loaded, consult the file
     * timestamp to determine if it has changed.  If so, flag that it
     * needs to be reloaded.
     */

    if (!schd_NeedToGetAllocInfo && schd_AllocFilename)
      schd_NeedToGetAllocInfo =
        schd_file_has_changed(schd_AllocFilename, 1);

    if (!schd_NeedToGetYTDInfo && schd_CurrentFilename)
      schd_NeedToGetYTDInfo =
        schd_file_has_changed(schd_CurrentFilename, 1);

    /* If either file needs to be [re]loaded, do so. */

    if (schd_NeedToGetAllocInfo || schd_NeedToGetYTDInfo)
      schd_alloc_info();
    }

  /*
   * We need to save the past usage data periodically, so that a restart
   * of pbs_sched doesn't lose it ...
   */
  if (schd_save_decay()) /* is it time yet ? */
    schd_decay_info("w"); /* yep, so do it */

  if (schd_SubmitQueue->queue->jobs &&
      !(schd_SubmitQueue->queue->flags & (QFLAGS_DISABLED | QFLAGS_STOPPED)))
    {
    /*
     * Test each job against the set of execution queues.  If it can
     * never be run in any queue, reject it immediately.  This saves
     * the user having to wait for the scheduler to get around to being
     * able to run it.
     */
    jobs = reject_unrunnables(schd_SubmitQueue->queue->jobs);

    /*
     * Look for queues whose execution hosts are in dedicated time.  If
     * any are found, note that fact and continue.  Otherwise, add them
     * to the normalQs list, which will be scheduled normally.  Queues
     * skipped here are counted in hosts_in_dedtime; the dedicated-time
     * queues themselves are handled after everything else is done.
     */

    for (qptr = schd_BatchQueues; qptr != NULL; qptr = qptr->next)
      {
      if (schd_ENFORCE_DEDTIME && schd_TimeNow >= schd_ENFORCE_DEDTIME)
        outages = schd_host_outage(qptr->queue->exechost, 0);
      else
        outages = NULL;

      /*
       * Is there a scheduled outage right now for this host?  If so,
       * note that fact and continue to the next queue.  All of this
       * information is cached, so this isn't as expensive as it seems.
       */
      if (outages != NULL)
        {
        if ((outages->beg_time <= schd_TimeNow) &&
            (outages->end_time > schd_TimeNow))
          {
          DBPRT(("%s: Host %s is in dedtime (from %s:%s to %s:%s)\n",
                 id, outages->exechost,
                 outages->beg_datestr, outages->beg_timestr,
                 outages->end_datestr, outages->end_timestr));
          DBPRT(("%s: Queue %s@%s will not be scheduled.\n", id,
                 qptr->queue->qname, qptr->queue->exechost));

          /* This exechost is in dedicated time, ignore the queue. */
          hosts_in_dedtime ++;
          continue;

          }
        else if (outages->beg_time > schd_TimeNow)
          {

          /* Upcoming dedtime, but not yet.  Schedule the queue. */
          DBPRT(("%s: Host %s upcoming dedtime (at %s:%s to %s:%s)\n",
                 id, outages->exechost,
                 outages->beg_datestr, outages->beg_timestr,
                 outages->end_datestr, outages->end_timestr));
          }
        }

      /*
       * This host is not currently in dedicated time.  Add it to the
       * tail of the list of queues to be scheduled.  The list node is
       * private to this function; the Queue it points at is shared with
       * schd_BatchQueues.
       */
      newqlp = (QueueList *)malloc(sizeof(QueueList));

      if (newqlp == NULL)
        {
        (void)sprintf(log_buffer, "malloc(QueueList) for %s@%s failed",
                      qptr->queue->qname, qptr->queue->exechost);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        if (normalQs)
          schd_free_qlist(normalQs);

        return (1);
        }

      newqlp->queue = qptr->queue;

      if (normalQtail)
        normalQtail->next = newqlp;
      else
        normalQs = newqlp;

      normalQtail = newqlp;

      newqlp->next = NULL;
      }

    DBPRT(("%s: calling schedule_jobs(", id));

    if (normalQs)
      {
      for (qptr = normalQs; qptr != NULL; qptr = qptr->next)
        DBPRT(("%s@%s%s", qptr->queue->qname, qptr->queue->exechost,
               qptr->next ? ", " : ""));
      }
    else
      {
      DBPRT(("<no batch queues>"));
      }

    DBPRT((")\n"));

    ran = schedule_jobs(normalQs, jobs, reason);

    /*
     * Only a successful (non-negative) count contributes to the total,
     * exactly as for the external and dedicated-time queues below.  A
     * negative error return must not reduce the number of jobs reported
     * as run in this cycle.
     */
    if (ran < 0)
      {
      DBPRT(("Could not run any jobs!\n"));
      }
    else
      {
      DBPRT(("RAN %d jobs.\n", ran));
      total_ran += ran;
      }

    if (normalQs)
      schd_free_qlist(normalQs);

    normalQs = normalQtail = NULL;
    }

  /*
   * If there are any externally-routed queues, schedule any jobs
   * that are enqueued in them.
   */
  for (qptr = schd_ExternQueues; qptr != NULL; qptr = qptr->next)
    {
    if (qptr->queue->queued == 0)
      continue;

    (void)sprintf(log_buffer, "Scheduling external queue %s@%s ...",
                  qptr->queue->qname, qptr->queue->exechost);

    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
               log_buffer);

    DBPRT(("%s: %s\n", id, log_buffer));

    /*
     * Keep track of the next pointer.  Zero it so that each queue
     * looks like a single queue to schd_pack_queues().
     */
    next = qptr->next;

    qptr->next = NULL;

    ran = schd_pack_queues(qptr->queue->jobs, qptr, reason);

    if (ran < 0)
      {
      (void)sprintf(log_buffer, "sched_pack_queues(%s@%s) failed!",
                    qptr->queue->qname, qptr->queue->exechost);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                 log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));
      }
    else
      {
      DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname,
             qptr->queue->exechost));
      total_ran += ran;
      }

    /* Replace the zero'd next pointer to rechain the list. */
    qptr->next = next;
    }

  /*
   * Now check the dedtime queues with queued jobs for hosts that are
   * in dedicated time.  If any are found, comment the jobs appropriately
   * and/or schedule them.
   */

  for (qptr = schd_DedQueues; qptr != NULL; qptr = qptr->next)
    {
    if (qptr->queue->queued == 0)
      continue;

    DBPRT(("%s: schd_handle_dedicated_time(%s)\n", id, qptr->queue->qname));

    /*
     * Keep track of the next pointer, and zero the queue's next ptr so
     * it looks like a single queue.
     */
    next = qptr->next;

    qptr->next = NULL;

    ran = schd_handle_dedicated_time(qptr->queue);

    if (ran < 0)
      {
      (void)sprintf(log_buffer,
                    "schd_handle_dedicated_time(%s@%s) failed!",
                    qptr->queue->qname, qptr->queue->exechost);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                 log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));
      }
    else
      {
      DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname,
             qptr->queue->exechost));
      total_ran += ran;
      }

    /* Replace the zero'd next pointer to rechain the list. */
    qptr->next = next;
    }

  /*
   * Attempt to revoke any unused HPM counters that are still in user
   * mode.  Returns number of errors encountered.  This should be zero
   * for a healthy system.
   */
  if (schd_MANAGE_HPM)
    {
    if (schd_revoke_hpm())
      {
      (void)sprintf(log_buffer, "Failed to revoke unused HPM counters!");
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s\n", log_buffer));
      }
    }

  /* Only dump the resource list when this cycle actually started jobs. */
  if (total_ran > 0)
    {
    (void)sprintf(log_buffer, "System resources after scheduling:");
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    schd_dump_rsrclist();
    }

  (void)sprintf(log_buffer, ">>>  End Scheduling Cycle (ran %d jobs)  <<<",
                total_ran);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
  DBPRT(("%s\n", log_buffer));

  return (1);
  }