Пример #1
0
/*
 * schd_get_queue_limits - query queue information from the server.
 *
 * Returns 0 on success, -1 for "fatal errors", and 1 for a transient
 * error (i.e., the queue failed the sanity checks imposed by the
 * queue_sanity() function).
 */
int
schd_get_queue_limits(Queue *queue)
  {
  char   *id = "schd_get_queue_limits";
  int     moved = 0, istrue;
  Batch_Status *bs;
  AttrList *attr;
  static AttrList alist[] =
    {
      {&alist[1],  ATTR_start, "", ""},
    {&alist[2],  ATTR_enable, "", ""},
    {&alist[3],  ATTR_count, "", ""},
    {&alist[4],  ATTR_maxuserrun, "", ""},
    {&alist[5],  ATTR_rescavail, "", ""},
    {&alist[6],  ATTR_rescassn, "", ""},
    {&alist[7],  ATTR_rescdflt, "", ""},
    {&alist[8],  ATTR_rescmax, "", ""},
    {&alist[9],  ATTR_rescmin, "", ""},
    {&alist[10], ATTR_acluren, "", ""},
    {&alist[11], ATTR_acluser, "", ""},
    {NULL,       ATTR_maxrun, "", ""}
    };
  size_t  mem_default = UNSPECIFIED;
  size_t  mem_assn    = UNSPECIFIED;
  size_t  mem_max     = UNSPECIFIED;
  size_t  mem_min     = UNSPECIFIED;
  int     cpu_default = UNSPECIFIED;
  int     cpu_assn    = UNSPECIFIED;
  int     cpu_max     = UNSPECIFIED;
  int     cpu_min     = UNSPECIFIED;
  int     nodes_from_cpu, nodes_from_mem;

  queue->running = UNSPECIFIED;
  queue->queued = UNSPECIFIED;
  queue->maxrun = UNSPECIFIED;
  queue->userrun      = UNSPECIFIED;

  queue->nodes_max = UNSPECIFIED;
  queue->nodes_min = UNSPECIFIED;
  queue->nodes_default = UNSPECIFIED;
  queue->nodes_assn = UNSPECIFIED;
  queue->nodes_rsvd = UNSPECIFIED;
  queue->wallt_max = UNSPECIFIED;
  queue->wallt_min = UNSPECIFIED;
  queue->wallt_default = UNSPECIFIED;

  queue->flags = 0;
#ifdef NODEMASK
  BITFIELD_CLRALL(&queue->queuemask);
  BITFIELD_CLRALL(&queue->availmask);
#endif /* NODEMASK */
  queue->rsrcs = NULL;

  if (queue->jobs)
    {
    DBPRT(("%s: found jobs on queue '%s'!  Freeing them...\n", id,
           queue->qname));
    schd_free_jobs(queue->jobs);
    }

  if (queue->useracl)
    {
    DBPRT(("%s: found user ACL list on queue '%s'!  Freeing it...\n", id,
           queue->qname));
    schd_free_useracl(queue->useracl);
    }

  queue->jobs         = NULL;

  queue->useracl = NULL;

  /* Ask the server for information about the specified queue. */

  if ((bs = pbs_statque(connector, queue->qname, alist, NULL)) == NULL)
    {
    sprintf(log_buffer, "pbs_statque failed, \"%s\" %d",
            queue->qname, pbs_errno);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    return (-1);
    }

  /* Process the list of attributes returned by the server. */

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {

    /* Is queue started? */
    if (!strcmp(attr->name, ATTR_start))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue is not stopped. */
          queue->flags &= ~QFLAGS_STOPPED;
        else
          queue->flags |= QFLAGS_STOPPED;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    /* Is queue enabled? */
    if (!strcmp(attr->name, ATTR_enable))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue is not disabled. */
          queue->flags &= ~QFLAGS_DISABLED;
        else
          queue->flags |= QFLAGS_DISABLED;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    /* How many jobs are queued and running? */
    if (!strcmp(attr->name, ATTR_count))
      {
      queue->queued = schd_how_many(attr->value, SC_QUEUED);
      queue->running = schd_how_many(attr->value, SC_RUNNING);
      continue;
      }

    /* Queue-wide maximum number of jobs running. */
    if (!strcmp(attr->name, ATTR_maxrun))
      {
      queue->maxrun = atoi(attr->value);
      continue;
      }

    /* Per-user maximum number of jobs running. */
    if (!strcmp(attr->name, ATTR_maxuserrun))
      {
      queue->userrun = atoi(attr->value);
      continue;
      }

    /* Is there an enabled user access control list on this queue? */
    if (!strcmp(attr->name, ATTR_acluren))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue has an ACL */
          queue->flags |= QFLAGS_USER_ACL;
        else
          queue->flags &= ~QFLAGS_USER_ACL;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_acluser))
      {
      if (queue->useracl)
        {
        DBPRT(("queue %s acluser already set!\n", queue->qname));
        schd_free_useracl(queue->useracl);
        }

      queue->useracl = schd_create_useracl(attr->value);

      continue;
      }

    /* Queue maximum resource usage. */
    if (!strcmp(attr->name, ATTR_rescmax))
      {
      if (!strcmp("mem", attr->resource))
        {
        mem_max = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        cpu_max = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        {
        queue->wallt_max = schd_val2sec(attr->value);
        continue;
        }

#ifdef NODEMASK
      if (!strcmp("nodemask", attr->resource))
        {
        if (schd_str2mask(attr->value, &queue->queuemask))
          {
          (void)sprintf(log_buffer, "couldn't convert nodemask %s",
                        attr->value);
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     log_buffer);
          }
        else
          queue->flags |= QFLAGS_NODEMASK; /* Valid nodemask. */
        }

#endif /* NODEMASK */

      continue;
      }

    /* Queue minimum resource usage. */
    if (!strcmp(attr->name, ATTR_rescmin))
      {
      if (!strcmp("mem", attr->resource))
        {
        mem_min = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        cpu_min = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        {
        queue->wallt_min = schd_val2sec(attr->value);
        continue;
        }

      continue;
      }

    /* Queue assigned (in use) resource usage. */
    if (!strcmp(attr->name, ATTR_rescassn))
      {
      if (!strcmp("mem", attr->resource))
        {
        mem_assn = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        cpu_assn = atoi(attr->value);
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_rescdflt))
      {
      if (!strcmp("mem", attr->resource))
        {
        mem_default = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        cpu_default = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        queue->wallt_default = schd_val2sec(attr->value);
      }

    /* Ignore anything else */
    }

  pbs_statfree(bs);

  /*
   * Calculate values for queue node limits, given memory and cpu values.
   * Note any discrepancies.
   */
  nodes_from_cpu = NODES_FROM_CPU(cpu_default);
  nodes_from_mem = NODES_FROM_MEM(mem_default);

  if (nodes_from_cpu != nodes_from_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' default cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, cpu_default, schd_byte2val(mem_default),
            nodes_from_cpu, nodes_from_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  nodes_from_cpu = NODES_FROM_CPU(cpu_max);

  nodes_from_mem = NODES_FROM_MEM(mem_max);

  if (nodes_from_cpu != nodes_from_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' maximum cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, cpu_max, schd_byte2val(mem_max),
            nodes_from_cpu, nodes_from_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  nodes_from_cpu = NODES_FROM_CPU(cpu_min);

  nodes_from_mem = NODES_FROM_MEM(mem_min);

  if (nodes_from_cpu != nodes_from_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' minimum cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, cpu_min, schd_byte2val(mem_min),
            nodes_from_cpu, nodes_from_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  /*
   * Note: The assigned cpus and memory need not be exactly the same
   * node equivalency.
   */
  if ((cpu_default != UNSPECIFIED) && (mem_default != UNSPECIFIED))
    queue->nodes_default = NODES_REQD(cpu_default, mem_default);

  if ((cpu_max != UNSPECIFIED) && (mem_max != UNSPECIFIED))
    queue->nodes_max     = NODES_REQD(cpu_max, mem_max);

  if ((cpu_min != UNSPECIFIED) && (mem_min != UNSPECIFIED))
    queue->nodes_min     = NODES_REQD(cpu_min, mem_min);

  if ((cpu_assn != UNSPECIFIED) && (mem_assn != UNSPECIFIED))
    queue->nodes_assn    = NODES_REQD(cpu_assn, mem_assn);

  /*
   * Move any jobs on this queue from the global list onto the queue's
   * list.  Keep track of when the longest-running job will end, and set
   * the 'empty_by' field to that value.  Maintain the ordering as it was
   * in "schd_AllJobs".
   */

  if (schd_AllJobs)
    moved = queue_claim_jobs(queue, &schd_AllJobs);

  if (moved < 0)
    {
    sprintf(log_buffer, "%s: WARNING! Queue '%s' failed to claim jobs.",
            id, queue->qname);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  if (queue->nodes_assn == UNSPECIFIED)
    queue->nodes_assn = 0;

  if (queue->running    == UNSPECIFIED)
    queue->running    = 0;

  /*
   * Find out if the queue is idle, and if it was not before, set the idle
   * time to now.  If there are running jobs, the queue is not idle at the
   * start of this iteration - set idle_since to 0.
   */
  if (queue->running)
    {
    queue->idle_since = 0;
    }
  else
    {
    if (queue->idle_since == 0)
      queue->idle_since = schd_TimeNow;
    }

  /*
   * Get the resources for this queue from the resource monitor (if
   * available).  If the resmom is not accessible, disable the queue.
   * If the resources were received okay, compute the available node
   * masks from the resources and jobs.
   * Don't bother with resources for the special or submit queues.
   */
  if ((strcmp(queue->qname, schd_SubmitQueue->queue->qname) != 0) ||
      ((schd_SpecialQueue != NULL) &&
       (!strcmp(queue->qname, schd_SpecialQueue->queue->qname))))
    {

    queue->rsrcs = schd_get_resources(queue->exechost);

    if (queue->rsrcs != NULL)
      {
      /* Account for this queue's resources. */
      queue->rsrcs->nodes_alloc += queue->nodes_assn;
      queue->rsrcs->njobs       += queue->running;

      /*
       * If the HPM counters do not appear to be in use on this host,
       * check for jobs on the queue that are using hpm.  If so, set
       * the 'HPM_IN_USE' flag on the resources.  This will prevent the
       * HPM counters from being released to global mode at the end
       * of the scheduling run (c.f. cleanup.c).
       * The 'HPM_IN_USE' flag will also be asserted if a job is run
       * that uses the HPM counters.
       */

      if (schd_MANAGE_HPM &&
          !(queue->rsrcs->flags & RSRCS_FLAGS_HPM_IN_USE))
        {
        if (schd_hpm_job_count(queue->jobs))
          queue->rsrcs->flags |= RSRCS_FLAGS_HPM_IN_USE;
        }

#ifdef NODEMASK
      /* And find the nodemasks for the queue and resources. */
      find_nodemasks(queue, queue->rsrcs);

#endif /* NODEMASK */

      }
    else
      {
      (void)sprintf(log_buffer,
                    "Can't get resources for %s@%s - marking unavailable.",
                    queue->qname, queue->exechost);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));

      queue->flags |= QFLAGS_DISABLED;
      }
    }

#ifdef DEBUG
  schd_dump_queue(queue, QUEUE_DUMP_JOBS);

#endif /* DEBUG */

  /*
   * It would probably be better to wait for the world to stabilize
   * than to try to impose some artificial order upon it.  Do not do
   * the sanity check if the queue is stopped.
   */
  if ((queue->flags & QFLAGS_STOPPED) == 0)
    {
    if (!queue_sanity(queue))
      {
      sprintf(log_buffer, "WARNING! Queue '%s' failed sanity checks.",
              queue->qname);
      log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));

      return (1);
      }
    }

  return (0);
  }
Пример #2
0
/*
 * This function takes a pointer to a struct batch_status for a job, and
 * fills in the appropriate fields of the supplied job struct.  It returns
 * the number of items that were found.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
  {
  char *id = "schd_get_jobinfo";
  int     changed = 0;
  int     cpu_req = 0;
  size_t  mem_req = 0;
  char     *host;
  char *p, *tmp_p, *var_p;
  AttrList *attr;
  char      canon[PBS_MAXHOSTNAME + 1];
  int     istrue;

  memset((void *)job, 0, sizeof(Job));

  job->jobid = schd_strdup(bs->name);

  if (job->jobid == NULL)
    {
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
               "schd_strdup(bs->name)");
    return (-1);
    }

  changed ++;

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {

    /*
     * If this is the 'owner' field, chop it into 'owner' and 'host'
     * fields, and copy them into the Job struct.
     */
    if (!strcmp(attr->name, ATTR_owner))
      {

      /* Look for the '@' that separates user and hostname. */
      host = strchr(attr->value, '@');

      if (host)
        {
        *host = '\0'; /* Replace '@' with NULL (ends username). */
        host ++; /* Move to first character of hostname. */
        }

      job->owner = schd_strdup(attr->value);

      if (job->owner == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->owner)");
        return (-1);
        }

      changed ++;

      job->host  = schd_strdup(host);

      if (job->host == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->host)");
        return (-1);
        }

      changed ++;

      /*
       * We don't "own" the attribute strings, so put back the '@'
       * character we removed above, in case something else expects
       * it to be there.
       * Note that 'host' points to the first character of the host-
       * name, not the hole one character behind.
       */

      if (host)
        {
        host --; /* Step back one character. */
        *host = '@'; /* Replace the '@' that was deleted above. */
        }

      /* That's all for the owner field. */
      continue;
      }

    /* The group to which to charge the resources for this job. */
    if (!strcmp(attr->name, ATTR_egroup))
      {
      job->group = schd_strdup(attr->value);

      if (job->group == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->group)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The comment currently assigned to this job. */
    if (!strcmp(attr->name, ATTR_comment))
      {
      job->comment = schd_strdup(attr->value);

      if (job->comment == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->comment)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The host on which this job is running. */
    if (!strcmp(attr->name, ATTR_exechost))
      {
      job->exechost = schd_strdup(attr->value);

      if (job->exechost == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->exechost)");
        return (-1);
        }

      changed ++;

      continue;
      }

    if (!strcmp(attr->name, ATTR_inter))
      {
      /* Is this job interactive or not? */
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue)
          job->flags |= JFLAGS_INTERACTIVE;
        else
          job->flags &= ~JFLAGS_INTERACTIVE;

        changed ++;
        }
      else
        {
        DBPRT(("%s: can't parse %s = %s into boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_state))
      {
      /* State is one of 'R', 'Q', 'E', etc. */
      job->state = attr->value[0];
      changed ++;

      continue;
      }

    if (!strcmp(attr->name, ATTR_queue))
      {
      job->qname = schd_strdup(attr->value);

      if (job->qname == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->qname)");
        return (-1);
        }

      job->flags |= JFLAGS_QNAME_LOCAL;

      changed ++;

      continue;
      }

    if (!strcmp(attr->name, ATTR_v))
      {
      var_p = schd_strdup(attr->value);

      if (var_p == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(Variable_List)");
        return (-1);
        }

      p = NULL;

      tmp_p = strstr(var_p, "PBS_O_QUEUE");

      if (tmp_p)
        {
        p = strtok(tmp_p, "=");
        p = strtok(NULL,  ", ");
        }

      if (p != NULL)
        {
        job->oqueue = schd_strdup(p);
        }
      else
        {
        /* if the originating queue is unknown, default
         * to the locally defined "submit" queue.
         */
        job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
        }

      free(var_p);

      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_l))
      {
      if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime = schd_val2sec(attr->value);
        changed ++;

        }
      else if (!strcmp(attr->resource, "ncpus"))
        {
        cpu_req = atoi(attr->value);
        job->nodes = MAX(job->nodes, NODES_FROM_CPU(cpu_req));
        changed ++;

        }
      else if (!strcmp(attr->resource, "mem"))
        {
        mem_req = schd_val2byte(attr->value);
        job->nodes = MAX(job->nodes, NODES_FROM_MEM(mem_req));
        changed ++;

#ifdef NODEMASK
        }
      else if (!strcmp(attr->resource, "nodemask"))
        {
        if (schd_str2mask(attr->value, &job->nodemask))
          {
          (void)sprintf(log_buffer,
                        "bad nodemask %s for job %s", attr->value, job->jobid);
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     log_buffer);
          }
        else
          changed++; /* Job nodemask was valid. */

#endif /* NODEMASK */

        }

      if (!strcmp(attr->resource, HPM_ATTRIBUTE))
        {
        /*
         * If the job requests hpm support, set the flag, otherwise
         * turn it off.
         */
        if (schd_val2bool(attr->value, &istrue) == 0)
          {
          if (istrue)
            job->flags |= JFLAGS_NEEDS_HPM;
          else
            job->flags &= ~JFLAGS_NEEDS_HPM;

          changed ++;
          }
        else
          {
          DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                 attr->name, attr->value));
          }
        }

      /* That's all for requested resources. */
      continue;
      }

    if (!strcmp(attr->name, ATTR_used))
      {
      if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime_used = schd_val2sec(attr->value);

        changed ++;
        }

      /* No other interesting cases. */
      continue;
      }

    /* Creation time attribute. */
    if (!strcmp(attr->name, ATTR_ctime))
      {
      /* How long ago was it put in the queue ? */
      job->time_queued = schd_TimeNow - atoi(attr->value);

      continue;
      }

    /* Modified time attribute. */
    if (!strcmp(attr->name, ATTR_mtime))
      {
      /* When was the job last modified? */
      job->mtime = atoi(attr->value);

      continue;
      }

#ifdef ATTR_etime
    /*
     * When was the job last eligible to run?  When a user-hold is
     * released, this value is updated to the current time.  This
     * prevents users from gaining higher priority from holding their
     * jobs.
     */
    if (!strcmp(attr->name, ATTR_etime))
      {
      job->eligible = schd_TimeNow - atoi(attr->value);

      continue;
      }

#endif /* ATTR_etime */
    }

  /*
   * If this job is in the "Running" state, compute how many seconds
   * remain until it is completed.
   */
  if (job->state == 'R')
    {
    job->time_left = job->walltime - job->walltime_used;
    }

  /*
   * If this job was enqueued since the last time we ran, set the job
   * flag to indicate that we have not yet seen this job.  This makes it
   * a candidate for additional processing.  There may be some inaccuracy,
   * since the time_t has resolution of 1 second.  Attempt to err on the
   * side of caution.
   */
  if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED))
    {
    if (job->time_queued <= (schd_TimeNow - schd_TimeLast))
      {
      job->flags |= JFLAGS_FIRST_SEEN;
      }
    }

  /*
   * If the 'etime' attribute wasn't found, set it to the time the job has
   * been queued.  Most jobs will be eligible to run their entire lifetime.
   * The exception is a job that has been held - if it was a user hold,
   * the release will reset the etime to the latest value.
   * If not eligible time was given, use the job's creation time.
   */
  if (!job->eligible)
    job->eligible = job->time_queued;

  /*
   * If the job provided a memory or CPU resource that does not match
   * the resources that will be allocated by the assigned nodes (i.e.
   * a request for 100mb of memory and 16 CPUs - the job will "get" all
   * 4GB of memory anyway), alter the job attributes such that they
   * will align with the assigned nodes later.
   */
  bump_rsrc_requests(job, cpu_req, mem_req);

  return (changed);
  }