Пример #1
0
/*
 * Determine if a job *can* run in this queue.  This is distinct from if
 * it *should* be run in the queue.
 *
 * A job *can* fit in a queue if its requested resources lie within the
 * queue's configured minimums and maximums.
 *
 * A job *should* be run only if its requested resources do not exceed the
 * queue's *available* resources.  Availability is not examined here.
 *
 * Parameters:
 *   job    - the job whose requests are tested.
 *   queue  - the queue whose limits the job is tested against.
 *   reason - optional.  When non-NULL, a description of the first failed
 *            check is written into it with sprintf(); the caller must
 *            supply a buffer large enough for the message, since its size
 *            is not known here.
 *
 * Returns 1 if every check passes, or 0 at the first check that fails.
 * The result does not depend on whether 'reason' was supplied.
 */
int
schd_job_fits_queue(Job *job, Queue *queue, char *reason)
  {
  /*
   * Compare the job's requested resources against the queue's limits.
   * A queue limit of UNSPECIFIED disables the corresponding check.
   */
  if ((queue->wallt_min != UNSPECIFIED) &&
      (job->walltime < queue->wallt_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' walltime minimum (%s).",
                    queue->qname, schd_sec2val(queue->wallt_min));

    return (0);
    }

  if ((queue->wallt_max != UNSPECIFIED) &&
      (job->walltime > queue->wallt_max))
    {
    if (reason)
      (void)sprintf(reason,
                    "Would exceed queue '%s' walltime limit (%s).", queue->qname,
                    schd_sec2val(queue->wallt_max));

    return (0);
    }

  if ((queue->ncpus_min != UNSPECIFIED) && (job->ncpus < queue->ncpus_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' CPU minimum (%d).", queue->qname,
                    queue->ncpus_min);

    return (0);
    }

  if ((queue->ncpus_max != UNSPECIFIED) && (job->ncpus > queue->ncpus_max))
    {
    if (reason)
      (void)sprintf(reason, "Would exceed queue '%s' CPU limit (%d).",
                    queue->qname, queue->ncpus_max);

    return (0);
    }

  if ((queue->mem_min != UNSPECIFIED) && (job->memory < queue->mem_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' memory minimum (%ld).", queue->qname,
                    (long)queue->mem_min);

    return (0);
    }

  if ((queue->mem_max != UNSPECIFIED) && (job->memory > queue->mem_max))
    {
    if (reason)
      (void)sprintf(reason, "Would exceed queue '%s' memory limit (%ld).",
                    queue->qname, (long)queue->mem_max);

    return (0);
    }

  /* Speed is only compared when both sides have specified a value. */
  if ((queue->speed != UNSPECIFIED) && (job->speed != UNSPECIFIED) &&
      (job->speed > queue->speed))
    {
    if (reason)
      (void)sprintf(reason,
                    "Host %s is too slow (%d MHz) to fill request (%d MHz).",
                    queue->qname, queue->speed, job->speed);

    return (0);
    }

  /*
   * String-valued features A-C: when both the queue and the job name a
   * value, the job is rejected if the two values DIFFER.  This matches the
   * "not available" message and the sense of the architecture check below.
   * NOTE(review): the previous code rejected on a match (!strcmp), which
   * contradicted the message; confirm against the site configuration.
   */
  if ((queue->featureA != NULL) && (job->featureA != NULL) &&
      (strcmp(queue->featureA, job->featureA) != 0))
    {
    if (reason)
      (void)sprintf(reason, "Requested %s %s not available on Queue %s.",
                    FEATURE_A, job->featureA, queue->qname);

    return (0);
    }

  if ((queue->featureB != NULL) && (job->featureB != NULL) &&
      (strcmp(queue->featureB, job->featureB) != 0))
    {
    if (reason)
      (void)sprintf(reason, "Requested %s %s not available on Queue %s.",
                    FEATURE_B, job->featureB, queue->qname);

    return (0);
    }

  if ((queue->featureC != NULL) && (job->featureC != NULL) &&
      (strcmp(queue->featureC, job->featureC) != 0))
    {
    if (reason)
      (void)sprintf(reason, "Requested %s %s not available on Queue %s.",
                    FEATURE_C, job->featureC, queue->qname);

    return (0);
    }

  /*
   * Numeric features D-I: when both sides are specified, the job's value
   * must not exceed the queue's value.
   */
  if ((queue->featureD != UNSPECIFIED) && (job->featureD != UNSPECIFIED) &&
      (job->featureD > queue->featureD))
    {
    if (reason)
      (void)sprintf(reason,
                    "Requested %s (%ld) exceeds queue %s limit (%ld).",
                    FEATURE_D, job->featureD, queue->qname, queue->featureD);

    return (0);
    }

  if ((queue->featureE != UNSPECIFIED) && (job->featureE != UNSPECIFIED) &&
      (job->featureE > queue->featureE))
    {
    if (reason)
      (void)sprintf(reason,
                    "Requested %s (%ld) exceeds queue %s limit (%ld).",
                    FEATURE_E, job->featureE, queue->qname, queue->featureE);

    return (0);
    }

  if ((queue->featureF != UNSPECIFIED) && (job->featureF != UNSPECIFIED) &&
      (job->featureF > queue->featureF))
    {
    if (reason)
      (void)sprintf(reason,
                    "Requested %s (%ld) exceeds queue %s limit (%ld).",
                    FEATURE_F, job->featureF, queue->qname, queue->featureF);

    return (0);
    }

  if ((queue->featureG != UNSPECIFIED) && (job->featureG != UNSPECIFIED) &&
      (job->featureG > queue->featureG))
    {
    if (reason)
      (void)sprintf(reason,
                    "Requested %s (%d) exceeds queue %s limit (%d).",
                    FEATURE_G, job->featureG, queue->qname, queue->featureG);

    return (0);
    }

  if ((queue->featureH != UNSPECIFIED) && (job->featureH != UNSPECIFIED) &&
      (job->featureH > queue->featureH))
    {
    if (reason)
      (void)sprintf(reason,
                    "Requested %s (%d) exceeds queue %s limit (%d).",
                    FEATURE_H, job->featureH, queue->qname, queue->featureH);

    return (0);
    }

  if ((queue->featureI != UNSPECIFIED) && (job->featureI != UNSPECIFIED) &&
      (job->featureI > queue->featureI))
    {
    if (reason)
      (void)sprintf(reason,
                    "Requested %s (%d) exceeds queue %s limit (%d).",
                    FEATURE_I, job->featureI, queue->qname, queue->featureI);

    return (0);
    }

  /*
   * Architecture must match when both sides specify one.  The rejection
   * must not depend on 'reason': previously a mismatch was only rejected
   * when the caller supplied a reason buffer.
   */
  if ((job->arch != NULL) && (queue->rsrcs != NULL) &&
      (queue->rsrcs->arch != NULL) &&
      (strcmp(job->arch, queue->rsrcs->arch) != 0))
    {
    if (reason)
      (void)sprintf(reason,
                    "Host %s is wrong architecture (%s) to fill request (%s).",
                    queue->qname, queue->rsrcs->arch, job->arch);

    return (0);
    }

  /*
   * The job _can_ fit in this queue.  This doesn't mean it *will* fit
   * in the queue as it currently exists, but it *would* fit if the queue
   * was completely empty.
   */
  return (1);
  }
Пример #2
0
/*
 * Print a debugging description of 'queue' through DBPRT(): its name and
 * state flags, job and node counts, walltime limits, the empty-by and
 * idle-since timestamps (when set) and the user ACL (when enabled).  If
 * 'dumpjobs' is non-zero, the queue's jobs are listed as well, each as
 * its job id (the part before the first '.') followed by a state letter.
 *
 * Counters whose value is UNSPECIFIED are shown as "???".
 */
void
schd_dump_queue(Queue *queue, int dumpjobs)
  {
  UserAcl *acl;
  Job     *jp;
  char    *dot;
  char     numbuf[32];
  int      width;
#ifdef NODEMASK
  Bitfield all_ones;
#endif /* NODEMASK */

  /* Header: queue name, host, and enabled/started state. */
  DBPRT(("\nQueue '%s@%s': %sabled/%sed",
         queue->qname, queue->exechost,
         !(queue->flags & QFLAGS_DISABLED) ? "En" : "Dis",
         !(queue->flags & QFLAGS_STOPPED) ? "Start" : "Stopp"));

  /* Optional state suffixes. */
  DBPRT(("%s%s%s%s ",
         !(queue->flags & QFLAGS_FULL) ? "" : "/Full",
         !(queue->flags & QFLAGS_MAXRUN) ? "" : "/MaxRun",
         !(queue->flags & QFLAGS_DRAINING) ? "" : "/Drain",
         !(queue->flags & QFLAGS_USER_ACL) ? "" : "/ACL"));

  /* Prime-time observance is only shown once enforcement is in effect. */
  if (schd_ENFORCE_PRIME_TIME && (schd_TimeNow >= schd_ENFORCE_PRIME_TIME))
    {
    DBPRT(("obsv_pt:%s", queue->observe_pt ? "Yes" : "No"));
    }

  DBPRT(("\n"));

#ifdef NODEMASK
  if (queue->flags & QFLAGS_NODEMASK)
    {
    BITFIELD_SETALL(&all_ones);

    DBPRT(("  Nodes: %s\n",
           schd_format_nodemask(&queue->queuemask, &all_ones)));

    DBPRT(("  Avail: %s\n",
           schd_format_nodemask(&queue->queuemask, &queue->availmask)));
    }

#endif /* NODEMASK */

  /*
   * Job counts.  Each value is formatted into the scratch buffer first
   * and then printed, or replaced by "???" if it is UNSPECIFIED.
   */
  sprintf(numbuf, "%d", queue->running);
  DBPRT(("  Job counts: %s running, ",
         (queue->running == UNSPECIFIED) ? "???" : numbuf));

  sprintf(numbuf, "%d", queue->maxrun);
  DBPRT(("%s max ", (queue->maxrun == UNSPECIFIED) ? "???" : numbuf));

  sprintf(numbuf, "%d", queue->userrun);
  DBPRT(("(%s/user), ", (queue->userrun == UNSPECIFIED) ? "???" : numbuf));

  sprintf(numbuf, "%d", queue->queued);
  DBPRT(("%s queued\n", (queue->queued == UNSPECIFIED) ? "???" : numbuf));

  /* Node counts: assigned/max [default, min]. */
  sprintf(numbuf, "%d", queue->nodes_assn);
  DBPRT(("  Nodes:%s/", (queue->nodes_assn == UNSPECIFIED) ? "???" : numbuf));

  sprintf(numbuf, "%d", queue->nodes_max);
  DBPRT(("%s", (queue->nodes_max == UNSPECIFIED) ? "???" : numbuf));

  sprintf(numbuf, "%d", queue->nodes_default);
  DBPRT((" [def %s, ",
         (queue->nodes_default == UNSPECIFIED) ? "???" : numbuf));

  sprintf(numbuf, "%d", queue->nodes_min);
  DBPRT(("min %s], ", (queue->nodes_min == UNSPECIFIED) ? "???" : numbuf));

  /* Walltime limits: max [default min]. */
  DBPRT(("wallt max %s ",
         (queue->wallt_max == UNSPECIFIED) ?
         "???" : schd_sec2val(queue->wallt_max)));

  DBPRT(("[def %s ",
         (queue->wallt_default == UNSPECIFIED) ?
         "???" : schd_sec2val(queue->wallt_default)));

  DBPRT(("min %s]\n",
         (queue->wallt_min == UNSPECIFIED) ?
         "???" : schd_sec2val(queue->wallt_min)));

  /* ctime() output already ends in '\n', so the formats add none. */
  if (queue->empty_by)
    {
    DBPRT(("  Queue will empty by: %s", ctime(&queue->empty_by)));
    }

  if (queue->idle_since)
    {
    DBPRT(("  Queue idle since: %s", ctime(&queue->idle_since)));
    }

  /* User ACL, '/'-separated and wrapped before reaching 72 columns. */
  if (queue->useracl && (queue->flags & QFLAGS_USER_ACL))
    {
    DBPRT(("    User ACL: "));

    width = 9;  /* the 'User ACL: ' label has already been printed */
    acl   = queue->useracl;

    while (acl != NULL)
      {
      width += strlen(acl->user) + 1;

      if (width >= 72)
        {
        DBPRT(("\n    "));
        width = 0;
        }

      /* No separator before the first entry or right after a wrap. */
      DBPRT(("%s%s",
             ((width != 0) && (acl != queue->useracl)) ? "/" : "",
             acl->user));

      acl = acl->next;
      }

    DBPRT(("\n"));
    }

  /* Job list, wrapped the same way as the ACL. */
  if (dumpjobs && queue->jobs)
    {
    DBPRT(("  Jobs: "));

    width = 5;  /* the 'Jobs: ' label has already been printed */
    jp    = queue->jobs;

    while (jp != NULL)
      {
      /*
       * Show only the part of the id before the first '.'.  The id is
       * cut in place, so the '.' must be restored before moving on.
       */
      dot = strchr(jp->jobid, '.');

      if (dot != NULL)
        {
        *dot = '\0';
        }

      width += strlen(jp->jobid) + 3; /* 3 == state letter + '/' + ' ' */

      if (width >= 72)
        {
        DBPRT(("\n   "));
        width = 0;
        }

      DBPRT((" %s/", jp->jobid));

      /* Priority takes precedence over waiting, then the raw state. */
      DBPRT(("%c",
             (jp->flags & JFLAGS_PRIORITY) ? '!' :
             ((jp->flags & JFLAGS_WAITING) ? 'W' : jp->state)));

      if (dot != NULL)
        {
        *dot = '.';
        }

      jp = jp->next;
      }

    DBPRT(("\n"));
    }
  }
Пример #3
0
/*
 * Arrange queue draining for the most important queued jobs, then hand
 * the job list to schd_pack_queues().
 *
 * The job list is taken to be in the order the sorting code wants the
 * jobs run.  The first job in state 'Q' is treated as the top priority;
 * after it, only queued jobs flagged JFLAGS_WAITING are considered.  For
 * each such job schd_find_drain() is asked for a queue (per the original
 * design notes: the smallest, shortest-wait queue in which the job will
 * fit, taking expected completion of running jobs and prime-time limits
 * into account).  If one is returned:
 *
 *   - with no running jobs, its draining flag is cleared so the queue is
 *     available for this job;
 *   - with running jobs, it is marked draining provided its drain_by time
 *     is not later than its empty_by time.
 *
 * Returns whatever schd_pack_queues() returns; a negative value is logged
 * as a failure before being returned.
 */
static int
schedule_jobs(QueueList *queues, Job *jobs, char *reason)
  {
  char  *id = "schedule_jobs";
  Job   *cur;
  Queue *target;
  int    first_pending = 1;
  int    started;

  for (cur = jobs; cur != NULL; cur = cur->next)
    {
    /* Only jobs that are still queued are of interest. */
    if (cur->state != 'Q')
      {
      continue;
      }

    /* Past the first queued job, skip anything not marked as waiting. */
    if (!(first_pending || (cur->flags & JFLAGS_WAITING)))
      {
      continue;
      }

    DBPRT(("%s: job %s is %s (eligible for %s, needs %d nodes)\n", id,
           cur->jobid,
           first_pending ? "FIRSTJOB" :
           ((cur->flags & JFLAGS_PRIORITY) ? "SPECIAL" : "WAITING"),
           schd_sec2val(cur->eligible), cur->nodes));

    /*
     * Ask for the queue that should be drained on behalf of this job.
     * Note that a queue's drain_by time should only ever be shortened;
     * it makes no sense to push it further out.
     */
    target = schd_find_drain(queues, cur);

    if (target != NULL)
      {
      if (target->running != 0)
        {
        /* With running jobs, empty_by is expected to be non-zero. */
        if (target->empty_by >= target->drain_by)
          {
          target->flags |= QFLAGS_DRAINING;

          DBPRT(("%s: shortest queue %s now draining, drain_by %s",
                 id, target->qname, ctime(&target->drain_by)));
          }
        }
      else
        {
        /* Idle queue: make sure it is not left marked as draining. */
        target->flags &= ~QFLAGS_DRAINING;
        }
      }

    /*
     * The first queued job has now been looked at (and possibly given
     * special treatment).  From here on only waiting jobs qualify.
     */
    first_pending = 0;
    }

  started = schd_pack_queues(jobs, queues, reason);

  if (started >= 0)
    {
    return (started);
    }

  /* Packing failed: record it and pass the error value to the caller. */
  (void)sprintf(log_buffer,
                "sched_pack_queues() failed!");
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
  DBPRT(("%s: %s\n", id, log_buffer));

  return (started);
  }
Пример #4
0
/* print_config(): Dump the current config to the log */
static void
print_config(void)
  {
  char   *id = "print_config";
  QueueList *qptr;

  if (schd_TEST_ONLY)
    {
    (void)sprintf(log_buffer, "%-24s = %s", "TEST_ONLY",
                  schd_bool2val(schd_TEST_ONLY));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  if (schd_SubmitQueue)
    {
    (void)sprintf(log_buffer, "%-24s = %s@%s", "SUBMIT_QUEUE",
                  schd_SubmitQueue->queue->qname, schd_SubmitQueue->queue->exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  if (schd_BatchQueues)
    {
    for (qptr = schd_BatchQueues; qptr != NULL; qptr = qptr->next)
      {
      (void)sprintf(log_buffer, "%-24s = %s@%s",
                    (qptr == schd_BatchQueues) ? "BATCH_QUEUES" : "",
                    qptr->queue->qname, qptr->queue->exechost);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      }
    }

  if (schd_ExternQueues)
    {
    for (qptr = schd_ExternQueues; qptr != NULL; qptr = qptr->next)
      {
      (void)sprintf(log_buffer, "%-24s = %s@%s",
                    (qptr == schd_ExternQueues) ? "EXTERN_QUEUES" : "",
                    qptr->queue->qname, qptr->queue->exechost);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      }
    }

  if (schd_SpecialQueue)
    {
    (void)sprintf(log_buffer, "%-24s = %s@%s", "SPECIAL_QUEUE",
                  schd_SpecialQueue->queue->qname, schd_SpecialQueue->queue->exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  if (schd_DedQueues)
    {
    for (qptr = schd_DedQueues; qptr != NULL; qptr = qptr->next)
      {
      (void)sprintf(log_buffer, "%-24s = %s@%s",
                    (qptr == schd_DedQueues) ? "DEDICATED_QUEUES" : "",
                    qptr->queue->qname, qptr->queue->exechost);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      }
    }

  (void)sprintf(log_buffer, "%-24s = %s", "ENFORCE_PRIME_TIME",
                schd_booltime2val(schd_ENFORCE_PRIME_TIME));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "PRIME_TIME_WALLT_LIMIT",
                schd_sec2val(schd_PT_WALLT_LIMIT));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  if (schd_PT_SMALL_NODE_LIMIT)
    {
    (void)sprintf(log_buffer, "%-24s = %d", "PRIME_TIME_SMALL_NODE_LIMIT",
                  schd_PT_SMALL_NODE_LIMIT);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    (void)sprintf(log_buffer, "%-24s = %s", "PRIME_TIME_SMALL_WALLT_LIMIT",
                  schd_sec2val(schd_PT_SMALL_WALLT_LIMIT));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  (void)sprintf(log_buffer, "%-24s = %s", "NONPRIME_DRAIN_SYS",
                schd_bool2val(schd_NONPRIME_DRAIN_SYS));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  if (schd_NP_DRAIN_BACKTIME > 0)
    {
    (void)sprintf(log_buffer, "%-24s = %s", "NP_DRAIN_BACKTIME",
                  schd_sec2val(schd_NP_DRAIN_BACKTIME));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  if (schd_NP_DRAIN_IDLETIME > 0)
    {
    (void)sprintf(log_buffer, "%-24s = %s", "NP_DRAIN_IDLETIME",
                  schd_sec2val(schd_NP_DRAIN_IDLETIME));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  (void)sprintf(log_buffer, "%-24s = %s", "WALLT_LIMIT_LARGE_JOB",
                schd_sec2val(schd_WALLT_LARGE_LIMIT));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  if (schd_SMALL_JOB_MAX)
    {
    (void)sprintf(log_buffer, "%-24s = %d", "SMALL_JOB_MAX",
                  schd_SMALL_JOB_MAX);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    (void)sprintf(log_buffer, "%-24s = %s", "WALLT_LIMIT_SMALL_JOB",
                  schd_sec2val(schd_WALLT_SMALL_LIMIT));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  (void)sprintf(log_buffer, "%-24s = %s", "PRIME_TIME_START",
                schd_sec2val(schd_PRIME_TIME_START));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "PRIME_TIME_END",
                schd_sec2val(schd_PRIME_TIME_END));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %d%%", "TARGET_LOAD_PCT",
                schd_TARGET_LOAD_PCT);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = -%d%%,+%d%%", "TARGET_LOAD_VARIANCE",
                schd_TARGET_LOAD_MINUS, schd_TARGET_LOAD_PLUS);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %d", "HIGH_SYSTIME", schd_HIGH_SYSTIME);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %d", "MAX_JOBS", schd_MAX_JOBS);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %d", "MIN_JOBS", schd_MIN_JOBS);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "MAX_QUEUED_TIME",
                schd_sec2val(schd_MAX_QUEUED_TIME));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "SMALL_QUEUED_TIME",
                schd_sec2val(schd_SMALL_QUEUED_TIME));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %d", "INTERACTIVE_LONG_WAIT",
                schd_INTERACTIVE_LONG_WAIT);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %d", "MAX_DEDICATED_JOBS",
                schd_MAX_DEDICATED_JOBS);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "SORT_BY_PAST_USAGE",
                schd_bool2val(schd_SORT_BY_PAST_USAGE));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "ENFORCE_ALLOCATION",
                schd_booltime2val(schd_ENFORCE_ALLOCATION));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "ENFORCE_DEDICATED_TIME",
                schd_booltime2val(schd_ENFORCE_DEDTIME));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "SCHED_ACCT_DIR",
                schd_SCHED_ACCT_DIR ? schd_SCHED_ACCT_DIR : "[null]");
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "DEDICATED_TIME_COMMAND",
                schd_DEDTIME_COMMAND ? schd_DEDTIME_COMMAND : "[null]");
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "SYSTEM_NAME",
                schd_SYSTEM_NAME ? schd_SYSTEM_NAME : "[null]");
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "SERVER_HOST",
                schd_SERVER_HOST ? schd_SERVER_HOST : "[null]");
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "SCHED_HOST",
                schd_SCHED_HOST ? schd_SCHED_HOST : "[null]");
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, "%-24s = %s", "SCHED_RESTART_ACTION",
                (schd_SCHED_RESTART_ACTION == SCHD_RESTART_NONE ? "NONE" :
                 (schd_SCHED_RESTART_ACTION == SCHD_RESTART_RESUBMIT ? "RESUBMIT" :
                  (schd_SCHED_RESTART_ACTION == SCHD_RESTART_RERUN ? "RERUN" : "?"))));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  if (schd_AVOID_FRAGS)
    {
    (void)sprintf(log_buffer, "%-24s = %s", "AVOID_FRAGMENTATION",
                  schd_bool2val(schd_AVOID_FRAGS));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  if (schd_JOB_DUMPFILE)
    {
    (void)sprintf(log_buffer, "%-24s = %s", "SORTED_JOB_DUMPFILE",
                  schd_JOB_DUMPFILE);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }

  if (schd_MANAGE_HPM)
    {
    (void)sprintf(log_buffer, "%-24s = %s", "MANAGE_HPM_COUNTERS",
                  schd_bool2val(schd_MANAGE_HPM));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    if (schd_REVOKE_HPM)
      {
      (void)sprintf(log_buffer, "%-24s = %s", "REVOKE_HPM_COUNTERS",
                    schd_bool2val(schd_MANAGE_HPM));
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      }
    }

  if (schd_FAKE_MACH_MULT != 1)
    {
    (void)sprintf(log_buffer, "%-24s = %d", "FAKE_MACHINE_MULT",
                  schd_FAKE_MACH_MULT);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }
  }
Пример #5
0
/*
 * Write a table of the jobs on 'joblist' to the open stream 'dump': a
 * two-line header, one line per job (id, state, owner, nodes, walltime,
 * time queued, eligible time, flags) and a total.  The eligible time is
 * suffixed with '*' when it differs from the time queued, and a footnote
 * explaining the '*' is added if any job was so marked.
 *
 * Text fields are copied into fixed-size local buffers with bounded,
 * always-terminated copies, so over-long job ids, owner names or time
 * strings are truncated to the column width rather than overrunning the
 * buffer or being printed unterminated.
 *
 * Returns the number of jobs listed.
 */
static int
dump_sorted_jobs(FILE *dump, Job *joblist)
  {
  Job     *job;
  int      njobs;
  int      elig_mesg = 0;

#define DUMP_JID_LEN  16
#define DUMP_STATE_LEN  1
#define DUMP_OWNER_LEN  8
#define DUMP_NODES_LEN  3
#define DUMP_WALLT_LEN  8
#define DUMP_WAITT_LEN  8
#define DUMP_ELIGI_LEN  9 /* time plus '*' if wait != eligible */
#define DUMP_FLAGS_LEN  18

  char     jid[DUMP_JID_LEN + 1];
  char     owner[DUMP_OWNER_LEN + 1];
  char     wallt[DUMP_WALLT_LEN + 1];
  char     waitt[DUMP_WAITT_LEN + 1];
  char     eligi[DUMP_ELIGI_LEN + 1];
  char     flags[DUMP_FLAGS_LEN + 1];

  /* Negative widths passed through '*' left-justify each column. */
  fprintf(dump, "  %*s %*s %*s %*s %*s %*s %*s %*s\n",
          -DUMP_JID_LEN,  "Job ID",
          -DUMP_STATE_LEN, "S",
          -DUMP_OWNER_LEN, "Owner",
          -DUMP_NODES_LEN, "Nds",
          -DUMP_WALLT_LEN, "Walltime",
          -DUMP_WAITT_LEN, "Q'd for",
          -DUMP_ELIGI_LEN, "Eligible",
          -DUMP_FLAGS_LEN, "Flags");

  fprintf(dump, "  %*s %c %*s %*s %*s %*s %*s %*s\n",
          -DUMP_JID_LEN,  "----------------",
          '-',
          -DUMP_OWNER_LEN, "--------",
          -DUMP_NODES_LEN, "---",
          -DUMP_WALLT_LEN, "--------",
          -DUMP_WAITT_LEN, "--------",
          -DUMP_ELIGI_LEN, "---------",
          -DUMP_FLAGS_LEN, "------------------");

  for (njobs = 0, job = joblist; job != NULL; job = job->next)
    {
    njobs++;

    /*
     * snprintf() always NUL-terminates and never writes past the buffer.
     * strncpy() left 'jid'/'owner' unterminated for long inputs, and
     * strcpy() could overrun the time buffers.
     */
    (void)snprintf(jid, sizeof(jid), "%s", job->jobid);
    (void)snprintf(owner, sizeof(owner), "%s", job->owner);
    (void)snprintf(wallt, sizeof(wallt), "%s", schd_sec2val(job->walltime));
    (void)snprintf(waitt, sizeof(waitt), "%s",
                   schd_sec2val(job->time_queued));

    /*
     * The precision caps the time text one short of the field so that
     * there is always room for the '*' marker.
     */
    if (job->time_queued != job->eligible)
      {
      (void)snprintf(eligi, sizeof(eligi), "%.*s*",
                     DUMP_ELIGI_LEN - 1, schd_sec2val(job->eligible));
      elig_mesg ++;
      }
    else
      {
      (void)snprintf(eligi, sizeof(eligi), "%.*s",
                     DUMP_ELIGI_LEN - 1, schd_sec2val(job->eligible));
      }

    flags[0] = '\0';

    /*
     * Watch length of 'flags[]' array!  The longest combination below is
     * "Int High Ded HPM " (17 characters), which fits in DUMP_FLAGS_LEN.
     */

    if (job->flags & JFLAGS_INTERACTIVE)
      strcat(flags, "Int ");

    /* "Priority" jobs are marked as being waiting, even if they're new. */
    if (job->flags & JFLAGS_PRIORITY)
      strcat(flags, "High ");
    else if (job->flags & JFLAGS_WAITING)
      strcat(flags, "Wait ");

    if (job->flags & JFLAGS_DEDICATED)
      strcat(flags, "Ded ");

    if (job->flags & JFLAGS_NEEDS_HPM)
      strcat(flags, "HPM ");

    /* Trim off the trailing space if any flags were listed. */
    if (flags[0] != '\0')
      flags[strlen(flags) - 1] = '\0';

    /* Print the truncated copies so that the columns stay aligned. */
    fprintf(dump, "  %*s %c %*s %*d %*s %*s %*s %*s\n",
            -DUMP_JID_LEN, jid,
            job->state,
            -DUMP_OWNER_LEN, owner,
            -DUMP_NODES_LEN, job->nodes,
            -DUMP_WALLT_LEN, wallt,
            -DUMP_WAITT_LEN, waitt,
            -DUMP_ELIGI_LEN, eligi,
            -DUMP_FLAGS_LEN, flags);
    }

  fprintf(dump, "    Total: %d job%s\n\n", njobs, (njobs == 1) ? "" : "s");

  if (elig_mesg)
    {
    fprintf(dump, "Jobs marked with a ``*'' have an etime different "
            "from their ctime.\n\n");
    }

  return (njobs);
  }
Пример #6
0
/*
 * Write a human-readable dump of the queued jobs to the file 'dumpfile'.
 *
 * The file is opened for writing (created if absent, truncated if not)
 * and receives a timestamp, the scheduler version string, the scheduler
 * host, the prime-time status and then the job tables for the submit
 * queue, the external queues and the dedicated queues, each produced by
 * dump_sorted_jobs().  Queues without jobs are omitted, and the submit
 * queue section is skipped altogether if no submit queue is configured.
 *
 * Returns 0 on success, or -1 if the file cannot be opened or closing it
 * reports an error; the failure is logged in both cases.
 */
static int
make_job_dump(char *dumpfile)
  {
  char    *id = "make_job_dump";
  FILE    *dump;
  QueueList *qptr;

  /*
   * Attempt to open the dump file, creating it if necessary.  It should
   * be truncated each time this runs, so don't open with append mode.
   */

  if ((dump = fopen(dumpfile, "w")) == NULL)
    {
    (void)sprintf(log_buffer, "Cannot write to %s: %s\n", dumpfile,
                  strerror(errno));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    return (-1);
    }

  /* Head the file with a timestamp. */
  fprintf(dump, "%s\n", ctime(&schd_TimeNow));

  /* Include the version string compiled into the scheduler binary. */
  fprintf(dump, "%s\n", schd_VersionString);

  /* And some more useful information about the state of the world. */
  fprintf(dump, "Scheduler running on '%s'\n", schd_ThisHost);

  fprintf(dump, "Prime-time is ");

  if (schd_ENFORCE_PRIME_TIME && schd_TimeNow >= schd_ENFORCE_PRIME_TIME)
    {
    fprintf(dump, "from %s ", schd_sec2val(schd_PRIME_TIME_START));
    fprintf(dump, "to %s.\n", schd_sec2val(schd_PRIME_TIME_END));
    }
  else
    fprintf(dump, "not enforced.\n");

  fprintf(dump, "\nJOBS LISTED IN ORDER FROM HIGHEST TO LOWEST PRIORITY\n\n");

  /* Now dump the jobs queued on the various queues, in order of priority. */
  qptr = schd_SubmitQueue;

  /*
   * The submit queue pointer may be unset (print_config() tests it before
   * use), so it must not be dereferenced blindly here.
   */
  if ((qptr != NULL) && qptr->queue->jobs)
    {
    fprintf(dump, "Jobs on submit queue '%s':\n", qptr->queue->qname);
    dump_sorted_jobs(dump, qptr->queue->jobs);
    }

  for (qptr = schd_ExternQueues; qptr != NULL; qptr = qptr->next)
    {
    if (qptr->queue->jobs)
      {
      fprintf(dump, "Jobs on external queue '%s':\n", qptr->queue->qname);
      dump_sorted_jobs(dump, qptr->queue->jobs);
      }
    }

  for (qptr = schd_DedQueues; qptr != NULL; qptr = qptr->next)
    {
    if (qptr->queue->jobs)
      {
      fprintf(dump, "Jobs on dedicated queue '%s':\n",
              qptr->queue->qname);
      dump_sorted_jobs(dump, qptr->queue->jobs);
      }
    }

  /* fclose() flushes buffered output, so a failure here means lost data. */
  if (fclose(dump))
    {
    (void)sprintf(log_buffer, "close(%s): %s\n", dumpfile, strerror(errno));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    return (-1);
    }

  return (0);
  }
Пример #7
0
/*
 * Walk the supplied job list and reject every job that can never be run
 * under the current configuration:
 *
 *   - jobs for which schd_job_can_queue() reports failure, and
 *   - non-priority jobs whose walltime exceeds the small-job or large-job
 *     walltime policy (only when schd_SMALL_JOB_MAX is greater than zero).
 *
 * Each such job is passed to schd_reject_job() together with an explanatory
 * message.  Rejected jobs are not unlinked from the list here; the only
 * list adjustment made is to step the head pointer past a rejected head.
 *
 * Returns the head of the list after any rejected jobs at the front have
 * been stepped over (NULL if every job from the front onward was rejected).
 */
static Job *
reject_unrunnables(Job *jobs)
  {
  Job  *this, *nextjob;
  char *why;
  char  tmpstr[300];

  for (this = jobs; this != NULL; this = nextjob)
    {
    /*
     * Capture the successor up front, before 'this' is handed to
     * schd_reject_job().
     */
    nextjob = this->next;
    why     = NULL;

    if (!schd_job_can_queue(this))
      {
      DBPRT(("job %s does not fit on any execution queue - reject\n",
             this->jobid));

      why = "Job will not fit on any execution queue.\n"
            "\n"
            "Use 'qstat -q' to get execution queue limits.\n";
      }

    /*
     * Enforce maximum job limits
     * "Big" jobs are given a maximum walltime limit (WALLT_LARGE_LIMIT)
     * that differs from "small" jobs. (Job size distinction based on
     * the size specified by SMALL_JOB_MAX.) We need to reject any job
     * which violates these limits.
     *
     * Special-priority jobs are not affected.
     */
    else if (!(this->flags & JFLAGS_PRIORITY) && (schd_SMALL_JOB_MAX > 0))
      {
      if (this->nodes <= schd_SMALL_JOB_MAX)
        {
        if (this->walltime > schd_WALLT_SMALL_LIMIT)
          {
          DBPRT(("job %s exceeds Small job walltime limit - reject\n",
                 this->jobid));

          (void)snprintf(tmpstr, sizeof(tmpstr),
                         "Job exceeds maximum walltime limit (%s) policy\n"
                         "\tfor small jobs (1 - %d nodes).\n",
                         schd_sec2val(schd_WALLT_SMALL_LIMIT),
                         schd_SMALL_JOB_MAX);
          why = tmpstr;
          }
        }
      else if (this->walltime > schd_WALLT_LARGE_LIMIT)
        {
        DBPRT(("job %s exceeds Large job walltime limit - reject\n",
               this->jobid));

        (void)snprintf(tmpstr, sizeof(tmpstr),
                       "Job exceeds maximum walltime limit (%s) policy\n"
                       "\tfor large jobs (%d+ nodes).\n",
                       schd_sec2val(schd_WALLT_LARGE_LIMIT),
                       schd_SMALL_JOB_MAX + 1);
        why = tmpstr;
        }
      }

    if (why == NULL)
      continue;   /* Job passed every check; leave it alone. */

    /*
     * If the job being rejected is the head of the list, advance the head
     * so that further scheduling is not done on the now bogus job.  The
     * loop must still resume at 'nextjob' (which is the new head) so that
     * the new head is itself examined rather than skipped.
     */
    if (this == jobs)
      jobs = nextjob;

    schd_reject_job(this, why);
    }

  return (jobs);
  }
Пример #8
0
/*
 * Determine if a job *can* run in this queue.  This is distinct from
 * whether it *should* be run in the queue.
 *
 * A job *can* fit in a queue if the architecture it asks for (when it asks
 * for one) matches the one recorded in the queue's resources, and its
 * requested walltime, CPU count and memory lie within the queue's minimum
 * and maximum limits.  Limits set to UNSPECIFIED are not checked.
 *
 * A job *should* be run only if its requested resources do not exceed the
 * queue's *available* resources; that is not tested here.
 *
 * Returns 1 if the job fits and 0 if it does not.  When a limit check
 * fails and 'reason' is non-NULL, a one-line explanation is written into
 * the caller's buffer.
 */
int
schd_job_fits_queue(Job *job, Queue *queue, char *reason)
  {
  /*
   * Is the system architecture correct for this job?  Note that 'reason'
   * is left untouched when the architecture does not match.
   */
  if (job->arch != NULL)
    {
    if (strcmp(job->arch, queue->rsrcs->arch))
      {
      return (0);
      }
    }

  /*
   * Compare the job's requested resources against the queue's limits.
   */
  if ((queue->wallt_min != UNSPECIFIED) &&
      (job->walltime < queue->wallt_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' walltime minimum (%s).",
                    queue->qname, schd_sec2val(queue->wallt_min));

    return (0);
    }

  if ((queue->wallt_max != UNSPECIFIED) &&
      (job->walltime > queue->wallt_max))
    {
    if (reason)
      (void)sprintf(reason,
                    "Exceeds queue '%s' walltime limit (%s).", queue->qname,
                    schd_sec2val(queue->wallt_max));

    return (0);
    }

  if ((queue->ncpus_min != UNSPECIFIED) && (job->ncpus < queue->ncpus_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' CPU minimum (%d).", queue->qname,
                    queue->ncpus_min);

    return (0);
    }

  if ((queue->ncpus_max != UNSPECIFIED) && (job->ncpus > queue->ncpus_max))
    {
    if (reason)
      (void)sprintf(reason, "Exceeds queue '%s' CPU limit (%d).",
                    queue->qname, queue->ncpus_max);

    return (0);
    }

  /*
   * Memory limits are printed with "%lu" and an explicit cast so that the
   * conversion specifier and the argument type always agree.
   */
  if ((queue->mem_min != UNSPECIFIED) && (job->memory < queue->mem_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' memory minimum (%lu).",
                    queue->qname, (unsigned long)queue->mem_min);

    return (0);
    }

  if ((queue->mem_max != UNSPECIFIED) && (job->memory > queue->mem_max))
    {
    if (reason)
      (void)sprintf(reason, "Exceeds queue '%s' memory limit (%lu).",
                    queue->qname, (unsigned long)queue->mem_max);

    return (0);
    }

  /*
   * The job _can_ fit in this queue.  This doesn't mean it *will* fit
   * in the queue as it currently exists, but it *would* fit if the queue
   * was completely empty.
   */
  return (1);
  }
Пример #9
0
/*
 * Determine if a job *can* run in this queue.  This is distinct from
 * whether it *should* be run in the queue.
 *
 * A job *can* fit in a queue if any execution host it is tied to matches
 * the queue's execution host, its requested walltime, CPU count and memory
 * lie within the queue's minimum and maximum limits, and its memory request
 * does not exceed the total memory recorded in the queue's resources.
 * Limits set to UNSPECIFIED are not checked.
 *
 * A job *should* be run only if its requested resources do not exceed the
 * queue's *available* resources; that is not tested here.
 *
 * Returns 1 if the job fits and 0 if it does not.  When a limit check
 * fails and 'reason' is non-NULL, a one-line explanation is written into
 * the caller's buffer.
 */
int
schd_job_fits_queue(Job *job, Queue *queue, char *reason)
  {
  /*
   * Check if this job has to run on a specific host (e.g. it was
   * previously checkpointed on this host).  Note that 'reason' is left
   * untouched when the hosts do not match.
   */
  if (job->exechost)
    {
    if (strcmp(job->exechost, queue->exechost))
      return (0);
    }

  /*
   * Compare the job's requested resources against the queue's limits.
   */
  if ((queue->wallt_min != UNSPECIFIED) &&
      (job->walltime < queue->wallt_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' walltime minimum (%s).",
                    queue->qname, schd_sec2val(queue->wallt_min));

    return (0);
    }

  if ((queue->wallt_max != UNSPECIFIED) &&
      (job->walltime > queue->wallt_max))
    {
    if (reason)
      (void)sprintf(reason,
                    "Would exceed queue '%s' walltime limit (%s).", queue->qname,
                    schd_sec2val(queue->wallt_max));

    return (0);
    }

  if ((queue->ncpus_min != UNSPECIFIED) && (job->ncpus < queue->ncpus_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' CPU minimum (%d).", queue->qname,
                    queue->ncpus_min);

    return (0);
    }

  if ((queue->ncpus_max != UNSPECIFIED) && (job->ncpus > queue->ncpus_max))
    {
    if (reason)
      (void)sprintf(reason, "Would exceed queue '%s' CPU limit (%d).",
                    queue->qname, queue->ncpus_max);

    return (0);
    }

  /*
   * Memory figures are printed with "%lu" and an explicit cast so that the
   * conversion specifier and the argument type always agree.
   */
  if ((queue->mem_min != UNSPECIFIED) && (job->memory < queue->mem_min))
    {
    if (reason)
      (void)sprintf(reason,
                    "Does not meet queue '%s' memory minimum (%lu).",
                    queue->qname, (unsigned long)queue->mem_min);

    return (0);
    }

  if ((queue->mem_max != UNSPECIFIED) && (job->memory > queue->mem_max))
    {
    if (reason)
      (void)sprintf(reason, "Would exceed queue '%s' memory limit (%lu).",
                    queue->qname, (unsigned long)queue->mem_max);

    return (0);
    }

  /* The job may not ask for more memory than the queue's resources hold. */
  if ((queue->rsrcs->mem_total != UNSPECIFIED) &&
      (job->memory > queue->rsrcs->mem_total))
    {
    if (reason)
      (void)sprintf(reason, "Exceeds host '%s' memory limit (%lu).",
                    queue->exechost, (unsigned long)queue->rsrcs->mem_total);

    return (0);
    }

  /*
   * The job _can_ fit in this queue.  This doesn't mean it *will* fit
   * in the queue as it currently exists, but it *would* fit if the queue
   * was completely empty.
   */
  return (1);
  }