Example #1
0
/*
 * schd_get_queue_limits - refresh the scheduler's picture of one queue.
 *
 * Every per-iteration field of 'queue' is reset, the server is asked (via
 * pbs_statque()) for the queue's attributes, and the answers are folded
 * back into the Queue.  Cpu and memory limits are then converted into node
 * counts, jobs belonging to the queue are claimed from schd_AllJobs, and
 * the resources of the queue's execution host are looked up.
 *
 * Returns:
 *    0   the queue was updated,
 *   -1   pbs_statque() failed,
 *    1   the queue is not stopped and queue_sanity() rejected it.
 */
int
schd_get_queue_limits(Queue *queue)
  {
  char   *id = "schd_get_queue_limits";

  /* Attributes requested from the server, chained through 'next'. */
  static AttrList alist[] =
    {
    {&alist[1],  ATTR_start, "", ""},
    {&alist[2],  ATTR_enable, "", ""},
    {&alist[3],  ATTR_count, "", ""},
    {&alist[4],  ATTR_maxuserrun, "", ""},
    {&alist[5],  ATTR_rescavail, "", ""},
    {&alist[6],  ATTR_rescassn, "", ""},
    {&alist[7],  ATTR_rescdflt, "", ""},
    {&alist[8],  ATTR_rescmax, "", ""},
    {&alist[9],  ATTR_rescmin, "", ""},
    {&alist[10], ATTR_acluren, "", ""},
    {&alist[11], ATTR_acluser, "", ""},
    {NULL,       ATTR_maxrun, "", ""}
    };

  Batch_Status *reply;
  AttrList     *cur;
  int     on;                 /* Result of parsing a boolean attribute. */
  int     claimed = 0;        /* Result of queue_claim_jobs(), if called. */
  int     want_rsrcs;         /* Should host resources be looked up? */
  int     by_cpu, by_mem;     /* Node counts derived from cpus / memory. */

  size_t  dflt_mem = UNSPECIFIED;
  size_t  assn_mem = UNSPECIFIED;
  size_t  max_mem  = UNSPECIFIED;
  size_t  min_mem  = UNSPECIFIED;
  int     dflt_cpu = UNSPECIFIED;
  int     assn_cpu = UNSPECIFIED;
  int     max_cpu  = UNSPECIFIED;
  int     min_cpu  = UNSPECIFIED;

  /* Forget everything learned about this queue on the previous pass. */
  queue->running       = UNSPECIFIED;
  queue->queued        = UNSPECIFIED;
  queue->maxrun        = UNSPECIFIED;
  queue->userrun       = UNSPECIFIED;

  queue->nodes_max     = UNSPECIFIED;
  queue->nodes_min     = UNSPECIFIED;
  queue->nodes_default = UNSPECIFIED;
  queue->nodes_assn    = UNSPECIFIED;
  queue->nodes_rsvd    = UNSPECIFIED;
  queue->wallt_max     = UNSPECIFIED;
  queue->wallt_min     = UNSPECIFIED;
  queue->wallt_default = UNSPECIFIED;

  queue->flags = 0;
#ifdef NODEMASK
  BITFIELD_CLRALL(&queue->queuemask);
  BITFIELD_CLRALL(&queue->availmask);
#endif /* NODEMASK */
  queue->rsrcs = NULL;

  /* Leftover job and ACL lists are released before being rebuilt. */
  if (queue->jobs)
    {
    DBPRT(("%s: found jobs on queue '%s'!  Freeing them...\n", id,
           queue->qname));
    schd_free_jobs(queue->jobs);
    }

  queue->jobs = NULL;

  if (queue->useracl)
    {
    DBPRT(("%s: found user ACL list on queue '%s'!  Freeing it...\n", id,
           queue->qname));
    schd_free_useracl(queue->useracl);
    }

  queue->useracl = NULL;

  /* Query the server; without an answer there is nothing more to do. */
  reply = pbs_statque(connector, queue->qname, alist, NULL);

  if (reply == NULL)
    {
    sprintf(log_buffer, "pbs_statque failed, \"%s\" %d",
            queue->qname, pbs_errno);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    return (-1);
    }

  /* Walk the returned attributes; names not listed below are ignored. */
  for (cur = reply->attribs; cur != NULL; cur = cur->next)
    {
    if (strcmp(cur->name, ATTR_start) == 0)
      {
      /* A started queue is one that is not stopped. */
      if (schd_val2bool(cur->value, &on) != 0)
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               cur->name, cur->value));
        }
      else if (on)
        queue->flags &= ~QFLAGS_STOPPED;
      else
        queue->flags |= QFLAGS_STOPPED;
      }
    else if (strcmp(cur->name, ATTR_enable) == 0)
      {
      /* An enabled queue is one that is not disabled. */
      if (schd_val2bool(cur->value, &on) != 0)
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               cur->name, cur->value));
        }
      else if (on)
        queue->flags &= ~QFLAGS_DISABLED;
      else
        queue->flags |= QFLAGS_DISABLED;
      }
    else if (strcmp(cur->name, ATTR_count) == 0)
      {
      /* One attribute carries both the queued and running counts. */
      queue->queued  = schd_how_many(cur->value, SC_QUEUED);
      queue->running = schd_how_many(cur->value, SC_RUNNING);
      }
    else if (strcmp(cur->name, ATTR_maxrun) == 0)
      {
      /* Limit on running jobs for the whole queue. */
      queue->maxrun = atoi(cur->value);
      }
    else if (strcmp(cur->name, ATTR_maxuserrun) == 0)
      {
      /* Limit on running jobs for any single user. */
      queue->userrun = atoi(cur->value);
      }
    else if (strcmp(cur->name, ATTR_acluren) == 0)
      {
      /* Is the user access control list switched on? */
      if (schd_val2bool(cur->value, &on) != 0)
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               cur->name, cur->value));
        }
      else if (on)
        queue->flags |= QFLAGS_USER_ACL;
      else
        queue->flags &= ~QFLAGS_USER_ACL;
      }
    else if (strcmp(cur->name, ATTR_acluser) == 0)
      {
      /* The ACL itself; a second copy replaces the first. */
      if (queue->useracl)
        {
        DBPRT(("queue %s acluser already set!\n", queue->qname));
        schd_free_useracl(queue->useracl);
        }

      queue->useracl = schd_create_useracl(cur->value);
      }
    else if (strcmp(cur->name, ATTR_rescmax) == 0)
      {
      /* Upper resource limits. */
      if (strcmp("mem", cur->resource) == 0)
        max_mem = schd_val2byte(cur->value);
      else if (strcmp("ncpus", cur->resource) == 0)
        max_cpu = atoi(cur->value);
      else if (strcmp("walltime", cur->resource) == 0)
        queue->wallt_max = schd_val2sec(cur->value);

#ifdef NODEMASK
      else if (strcmp("nodemask", cur->resource) == 0)
        {
        if (schd_str2mask(cur->value, &queue->queuemask))
          {
          (void)sprintf(log_buffer, "couldn't convert nodemask %s",
                        cur->value);
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     log_buffer);
          }
        else
          queue->flags |= QFLAGS_NODEMASK; /* Mask parsed cleanly. */
        }

#endif /* NODEMASK */
      }
    else if (strcmp(cur->name, ATTR_rescmin) == 0)
      {
      /* Lower resource limits. */
      if (strcmp("mem", cur->resource) == 0)
        min_mem = schd_val2byte(cur->value);
      else if (strcmp("ncpus", cur->resource) == 0)
        min_cpu = atoi(cur->value);
      else if (strcmp("walltime", cur->resource) == 0)
        queue->wallt_min = schd_val2sec(cur->value);
      }
    else if (strcmp(cur->name, ATTR_rescassn) == 0)
      {
      /* Resources currently assigned (in use). */
      if (strcmp("mem", cur->resource) == 0)
        assn_mem = schd_val2byte(cur->value);
      else if (strcmp("ncpus", cur->resource) == 0)
        assn_cpu = atoi(cur->value);
      }
    else if (strcmp(cur->name, ATTR_rescdflt) == 0)
      {
      /* Defaults applied to jobs. */
      if (strcmp("mem", cur->resource) == 0)
        dflt_mem = schd_val2byte(cur->value);
      else if (strcmp("ncpus", cur->resource) == 0)
        dflt_cpu = atoi(cur->value);
      else if (strcmp("walltime", cur->resource) == 0)
        queue->wallt_default = schd_val2sec(cur->value);
      }
    }

  pbs_statfree(reply);

  /*
   * The cpu and memory figures of each limit are expected to describe the
   * same number of nodes.  A mismatch is logged but is not fatal.
   */
  by_cpu = NODES_FROM_CPU(dflt_cpu);
  by_mem = NODES_FROM_MEM(dflt_mem);

  if (by_cpu != by_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' default cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, dflt_cpu, schd_byte2val(dflt_mem),
            by_cpu, by_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  by_cpu = NODES_FROM_CPU(max_cpu);
  by_mem = NODES_FROM_MEM(max_mem);

  if (by_cpu != by_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' maximum cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, max_cpu, schd_byte2val(max_mem),
            by_cpu, by_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  by_cpu = NODES_FROM_CPU(min_cpu);
  by_mem = NODES_FROM_MEM(min_mem);

  if (by_cpu != by_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' minimum cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, min_cpu, schd_byte2val(min_mem),
            by_cpu, by_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  /*
   * A node count is only derived when both halves of a limit were given.
   * Assigned cpus and memory are not checked for equivalence above.
   */
  if ((dflt_cpu != UNSPECIFIED) && (dflt_mem != UNSPECIFIED))
    queue->nodes_default = NODES_REQD(dflt_cpu, dflt_mem);

  if ((max_cpu != UNSPECIFIED) && (max_mem != UNSPECIFIED))
    queue->nodes_max = NODES_REQD(max_cpu, max_mem);

  if ((min_cpu != UNSPECIFIED) && (min_mem != UNSPECIFIED))
    queue->nodes_min = NODES_REQD(min_cpu, min_mem);

  if ((assn_cpu != UNSPECIFIED) && (assn_mem != UNSPECIFIED))
    queue->nodes_assn = NODES_REQD(assn_cpu, assn_mem);

  /*
   * Pull this queue's jobs out of the global schd_AllJobs list and onto
   * the queue's own list.  A negative result is reported as a warning.
   */
  if (schd_AllJobs)
    claimed = queue_claim_jobs(queue, &schd_AllJobs);

  if (claimed < 0)
    {
    sprintf(log_buffer, "%s: WARNING! Queue '%s' failed to claim jobs.",
            id, queue->qname);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  /* Values the server did not report count as "none". */
  if (queue->nodes_assn == UNSPECIFIED)
    queue->nodes_assn = 0;

  if (queue->running == UNSPECIFIED)
    queue->running = 0;

  /*
   * idle_since is 0 while jobs are running.  When the queue has nothing
   * running and no idle time recorded yet, the idle period starts now.
   */
  if (queue->running)
    queue->idle_since = 0;
  else if (queue->idle_since == 0)
    queue->idle_since = schd_TimeNow;

  /*
   * Host resources are looked up for every queue that is not the submit
   * queue, and also for a queue that is both the submit and the special
   * queue.
   * NOTE(review): the comment this replaces said resources are skipped for
   * both the special and the submit queues, which is not what the test
   * does -- confirm which behaviour is intended.
   */
  want_rsrcs = (strcmp(queue->qname, schd_SubmitQueue->queue->qname) != 0);

  if (!want_rsrcs && (schd_SpecialQueue != NULL))
    want_rsrcs = (strcmp(queue->qname, schd_SpecialQueue->queue->qname) == 0);

  if (want_rsrcs)
    {
    queue->rsrcs = schd_get_resources(queue->exechost);

    if (queue->rsrcs == NULL)
      {
      /* No resource information: take the queue out of service. */
      (void)sprintf(log_buffer,
                    "Can't get resources for %s@%s - marking unavailable.",
                    queue->qname, queue->exechost);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));

      queue->flags |= QFLAGS_DISABLED;
      }
    else
      {
      /* Charge this queue's usage against the host's resources. */
      queue->rsrcs->nodes_alloc += queue->nodes_assn;
      queue->rsrcs->njobs       += queue->running;

      /*
       * When HPM is managed and the host is not yet marked as having
       * the counters in use, mark it if any job on this queue counts
       * as an HPM job.
       */
      if (schd_MANAGE_HPM &&
          !(queue->rsrcs->flags & RSRCS_FLAGS_HPM_IN_USE) &&
          schd_hpm_job_count(queue->jobs))
        {
        queue->rsrcs->flags |= RSRCS_FLAGS_HPM_IN_USE;
        }

#ifdef NODEMASK
      /* Work out the node masks for the queue and its resources. */
      find_nodemasks(queue, queue->rsrcs);

#endif /* NODEMASK */
      }
    }

#ifdef DEBUG
  schd_dump_queue(queue, QUEUE_DUMP_JOBS);

#endif /* DEBUG */

  /* A stopped queue is exempt from the sanity check. */
  if (((queue->flags & QFLAGS_STOPPED) == 0) && !queue_sanity(queue))
    {
    sprintf(log_buffer, "WARNING! Queue '%s' failed sanity checks.",
            queue->qname);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    return (1);
    }

  return (0);
  }
Example #2
0
/*
 * schd_get_jobinfo - fill in a Job from a server batch_status record.
 *
 * 'job' is zeroed, the job id is copied from bs->name, and each attribute
 * in bs->attribs that this scheduler understands is converted and stored
 * in the matching Job field.  Unrecognised attributes are ignored.
 *
 * Returns the number of items that were found (the job id counts as one;
 * the ctime, mtime and etime attributes are stored but not counted), or
 * -1 if a string could not be copied.  On a -1 return, strings already
 * copied into 'job' are left in place.
 * NOTE(review): presumably the caller releases them -- confirm.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
  {
  int       changed = 0;
  int       istrue;
  char     *id = "schd_get_jobinfo";
  char     *at;
  char     *p, *tmp_p, *var_p;
  AttrList *attr;

  memset((void *)job, 0, sizeof(Job));

  job->jobid = schd_strdup(bs->name);

  if (job->jobid == NULL)
    {
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
               "schd_strdup(bs->name)");
    return (-1);
    }

  changed ++;

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {

    /*
     * The 'owner' attribute has the form user@host.  Only the user part
     * is kept.  The value is duplicated first and then cut at the '@',
     * so an owner string of any length is handled without a fixed-size
     * scratch buffer.
     */
    if (!strcmp(attr->name, ATTR_owner))
      {
      job->owner = schd_strdup(attr->value);

      if (job->owner == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->owner)");
        return (-1);
        }

      at = strchr(job->owner, '@');

      if (at != NULL)
        *at = '\0'; /* End the user name where the host name began. */

      changed ++;

      continue;
      }

    /* The group to which to charge the resources for this job. */
    if (!strcmp(attr->name, ATTR_egroup))
      {
      job->group = schd_strdup(attr->value);

      if (job->group == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->group)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The comment currently assigned to this job. */
    if (!strcmp(attr->name, ATTR_comment))
      {
      job->comment = schd_strdup(attr->value);

      if (job->comment == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->comment)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The host on which this job is running. */
    if (!strcmp(attr->name, ATTR_exechost))
      {
      job->exechost = schd_strdup(attr->value);

      if (job->exechost == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->exechost)");
        return (-1);
        }

      changed ++;

      continue;
      }

    if (!strcmp(attr->name, ATTR_inter))
      {
      /* Is this job interactive or not? */
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue)
          job->flags |= JFLAGS_INTERACTIVE;
        else
          job->flags &= ~JFLAGS_INTERACTIVE;

        changed ++;
        }
      else
        {
        DBPRT(("%s: can't parse %s = %s into boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_state))
      {
      /* State is one of 'R', 'Q', 'E', etc. */
      job->state = attr->value[0];
      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_queue))
      {
      job->qname = schd_strdup(attr->value);

      if (job->qname == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->qname)");
        return (-1);
        }

      job->flags |= JFLAGS_QNAME_LOCAL;

      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_v))
      {
      /*
       * The variable list is searched for PBS_O_QUEUE to learn the queue
       * the job was originally submitted to.  strtok() writes into its
       * argument, so it is run on a private copy of the value.
       */
      var_p = schd_strdup(attr->value);

      if (var_p == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(Variable_List)");
        return (-1);
        }

      p = NULL;

      tmp_p = strstr(var_p, "PBS_O_QUEUE");

      if (tmp_p)
        {
        p = strtok(tmp_p, "=");   /* Skip over the variable name ...   */
        p = strtok(NULL,  ", ");  /* ... and keep the value after '='. */
        }

      if (p != NULL)
        {
        job->oqueue = schd_strdup(p);
        }
      else
        {
        /* if the originating queue is unknown, default
         * to the locally defined "submit" queue.
         */
        job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
        }

      free(var_p);

      if (job->oqueue == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->oqueue)");
        return (-1);
        }

      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_l))
      {
      /* Requested resources, selected by the resource name. */
      if (!strcmp(attr->resource, "arch"))
        {
        job->arch = schd_strdup(attr->value);

        if (job->arch == NULL)
          {
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     "schd_strdup(job->arch)");
          return (-1);
          }

        changed ++;
        }
      else if (!strcmp(attr->resource, "mem"))
        {
        job->memory = schd_val2byte(attr->value);
        changed ++;
        }
      else if (!strcmp(attr->resource, "ncpus"))
        {
        job->ncpus = atoi(attr->value);
        changed ++;
        }
      else if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime = schd_val2sec(attr->value);
        changed ++;
        }
      else if (!strcmp(attr->resource, "speed"))
        {
        job->speed = atoi(attr->value);
        changed ++;
        }
      else if (!strcmp(attr->resource, "tmpdir"))
        {
        job->tmpdir = schd_val2byte(attr->value);
        changed ++;
        }
      else if (!strcmp(attr->resource, FEATURE_A))
        {
        job->featureA = schd_strdup(attr->value);

        if (job->featureA == NULL)
          {
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     "schd_strdup(job->featureA)");
          return (-1);
          }

        changed ++;
        }
      else if (!strcmp(attr->resource, FEATURE_B))
        {
        job->featureB = schd_strdup(attr->value);

        if (job->featureB == NULL)
          {
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     "schd_strdup(job->featureB)");
          return (-1);
          }

        changed ++;
        }
      else if (!strcmp(attr->resource, FEATURE_C))
        {
        job->featureC = schd_strdup(attr->value);

        if (job->featureC == NULL)
          {
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     "schd_strdup(job->featureC)");
          return (-1);
          }

        changed ++;
        }
      else if (!strcmp(attr->resource, FEATURE_D))
        {
        job->featureD = atol(attr->value);
        changed ++;
        }
      else if (!strcmp(attr->resource, FEATURE_E))
        {
        job->featureE = atol(attr->value);
        changed ++;
        }
      else if (!strcmp(attr->resource, FEATURE_F))
        {
        job->featureF = atol(attr->value);
        changed ++;
        }

      /*
       * Boolean features are only stored when the value parses, as is
       * done for ATTR_inter above.  Otherwise the field keeps the zero
       * it was given by the memset() at the top of the function.
       */
      else if (!strcmp(attr->resource, FEATURE_G))
        {
        if (schd_val2bool(attr->value, &istrue) == 0)
          {
          job->featureG = istrue;
          changed ++;
          }
        else
          {
          DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                 attr->resource, attr->value));
          }
        }
      else if (!strcmp(attr->resource, FEATURE_H))
        {
        if (schd_val2bool(attr->value, &istrue) == 0)
          {
          job->featureH = istrue;
          changed ++;
          }
        else
          {
          DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                 attr->resource, attr->value));
          }
        }
      else if (!strcmp(attr->resource, FEATURE_I))
        {
        if (schd_val2bool(attr->value, &istrue) == 0)
          {
          job->featureI = istrue;
          changed ++;
          }
        else
          {
          DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                 attr->resource, attr->value));
          }
        }

      /* That's all for requested resources. */
      continue;
      }

    if (!strcmp(attr->name, ATTR_used))
      {
      if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime_used = schd_val2sec(attr->value);
        changed ++;
        }

      /* No other interesting cases. */
      continue;
      }

    /* Creation time attribute. */
    if (!strcmp(attr->name, ATTR_ctime))
      {
      /* How long ago was it put in the queue ? */
      job->time_queued = schd_TimeNow - atoi(attr->value);
      continue;
      }

    /* Modified time attribute. */
    if (!strcmp(attr->name, ATTR_mtime))
      {
      /* When was the job last modified? */
      job->mtime = atoi(attr->value);
      continue;
      }

    /*
     * When was the job last eligible to run?  When a user-hold is
     * released, this value is updated to the current time.  This
     * prevents users from gaining higher priority from holding their
     * jobs.
     */
    if (!strcmp(attr->name, ATTR_etime))
      {
      job->eligible = schd_TimeNow - atoi(attr->value);

      continue;
      }
    }

  /*
   * If this job is in the "Running" state, compute how many seconds
   * remain until it is completed.
   */
  if (job->state == 'R')
    {
    job->time_left = job->walltime - job->walltime_used;
    }

  /*
   * If this job was enqueued since the last time we ran, set the job
   * flag to indicate that we have not yet seen this job.  This makes it
   * a candidate for additional processing.  There may be some inaccuracy,
   * since the time_t has resolution of 1 second.  Attempt to err on the
   * side of caution.
   * NOTE(review): the struct is zero-filled above, so time_queued is 0
   * rather than UNSPECIFIED when no ctime attribute was seen -- confirm
   * that this test does what is intended in that case.
   */
  if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED))
    {
    if (job->time_queued <= (schd_TimeNow - schd_TimeLast))
      {
      job->flags |= JFLAGS_FIRST_SEEN;
      }
    }

  /*
   * If the 'etime' attribute wasn't found, set it to the time the job has
   * been queued.  Most jobs will be eligible to run their entire lifetime.
   * The exception is a job that has been held - if it was a user hold,
   * the release will reset the etime to the latest value.
   * If no eligible time was given, use the job's creation time.
   */
  if (!job->eligible)
    job->eligible = job->time_queued;

  return (changed);
  }
Example #3
0
/*
 * schd_get_resources - return the Resources record for an execution host.
 *
 * If schd_RsrcsList already holds an entry whose exechost matches, that
 * entry is returned.  Otherwise the host's resource monitor is queried
 * (load average, available and physical memory, cpu count, plus the
 * available node mask when NODEMASK is defined and the HPM mode when
 * schd_MANAGE_HPM is set), and a new entry is appended to schd_RsrcsList
 * and returned.
 *
 * The query runs under a SIGALRM handler with a GETRSRCS_CONNECT_TIME
 * second alarm.  The previous handler is restored afterwards, and any
 * alarm that was pending is re-armed with the time that is left on it.
 *
 * Returns NULL if memory cannot be allocated, the resource monitor cannot
 * be contacted, or any reply is missing or cannot be parsed.  Entries in
 * schd_RsrcsList are not freed by this function.
 */
Resources *
schd_get_resources(char *exechost)
{
    char   *id = "schd_get_resources";
    Resources *rptr, *new_rsrcs;
    int     rm;

    char   *response = NULL;
    int     badreply   = 0;
    int     cpus_avail = 0;
    size_t  pmem_avail = 0;

    char    hpm_ctl[64];

    struct sigaction act, oact;
    int     handler_set = 0; /* Nonzero once oact holds the old handler. */

    unsigned int remain; /* Time remaining in any old alarm(). */
    time_t then;  /* When this alarm() was started. */
    time_t elapsed; /* Seconds spent talking to the resource monitor. */

#ifdef NODEMASK
    Bitfield cpy;
    int     i, j;
#endif /* NODEMASK */

    /*
     * Check for a local copy of the resources being available already.
     * If so, just return a reference to that Resources structure.
     */
    for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next)
    {
        if (strcmp(rptr->exechost, exechost) == 0)
            return (rptr);
    }

    schd_timestamp("get_rsrcs");

    /*
     * No cached resource information for 'exechost'.  Need to query the
     * host for its information.  calloc() hands back a zero-filled
     * record, so every field starts out cleared.
     */
    if ((new_rsrcs = calloc(1, sizeof(*new_rsrcs))) == NULL)
    {
        (void)sprintf(log_buffer, "Unable to alloc space for Resources.");
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        return (NULL); /* Can't get the information - nowhere to store it. */
    }

    act.sa_flags = 0;
    act.sa_handler = connect_interrupt;
    sigemptyset(&act.sa_mask);
    remain = 0;
    then = 0;

    /*
     * Set the alarm, and maintain some idea of how long was left on any
     * previously set alarm.  If the handler cannot be installed, the
     * existing alarm and handler are left completely alone.
     */
    if (sigaction(SIGALRM, &act, &oact) == 0)
    {
        handler_set = 1;
        remain = alarm(GETRSRCS_CONNECT_TIME);
        then = time(NULL);
    }

    if ((rm = openrm(exechost, 0)) == -1)
    {
        (void)sprintf(log_buffer,
                      "Unable to contact resmom@%s (%d)", exechost, pbs_errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

        badreply = 1;
        goto bail;
    }

    /*
     * Turn off full response.  Responses will be received in the order in
     * which they are sent.
     */
    fullresp(0);

    /* Build a list of all the resources about which we want information. */
    addreq(rm, "loadave");
    addreq(rm, "availmem");
    addreq(rm, "physmem");
    addreq(rm, "ncpus");

#ifdef NODEMASK
    addreq(rm, "availmask");

#endif /* NODEMASK */

    if (schd_MANAGE_HPM)
    {
        (void)sprintf(hpm_ctl, HPM_CTL_FORMAT_STR, HPM_CTL_QUERY_STR);
        addreq(rm, hpm_ctl);
    }

    /*
     * Collect the replies in the order the requests were made.  Each reply
     * is freed as soon as it has been used and 'response' is cleared, so
     * that the cleanup at 'bail' only ever frees a reply that is still
     * owned here.
     */

    /* Receive LOADAVE response from resource monitor. */
    response = getreq(rm);

    if (response == NULL)
    {
        (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    new_rsrcs->loadave = atof(response) * schd_FAKE_MACH_MULT;
    free(response);
    response = NULL;

    /* Receive AVAILMEM response from resource monitor. */
    response = getreq(rm);

    if (response == NULL)
    {
        (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    new_rsrcs->freemem = schd_val2byte(response);
    new_rsrcs->freemem *= schd_FAKE_MACH_MULT;
    free(response);
    response = NULL;

    /* Receive PHYSMEM response from resource monitor. */
    response = getreq(rm);

    if (response == NULL)
    {
        (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    pmem_avail = schd_val2byte(response);
    pmem_avail *= schd_FAKE_MACH_MULT;
    free(response);
    response = NULL;

    /* Receive NCPUS response from resource monitor. */
    response = getreq(rm);

    if (response == NULL)
    {
        (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    cpus_avail = atoi(response) * schd_FAKE_MACH_MULT;
    free(response);
    response = NULL;

#ifdef NODEMASK
    /* Receive available nodes from resource monitor. */
    response = getreq(rm);

    if (response == NULL)
    {
        (void)sprintf(log_buffer, "bad return from getreq(availmask), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    /* Try the reply as a bit string first, then as a mask string. */
    if (schd_bits2mask(response, &new_rsrcs->availmask) != 0)
    {
        if (schd_str2mask(response, &new_rsrcs->availmask) != 0)
        {
            (void)sprintf(log_buffer, "can't parse availmask '%s'", response);
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
            badreply = 1;
            goto bail; /* 'response' is released at bail. */
        }
    }

    free(response);
    response = NULL;

#endif /* NODEMASK */

    if (schd_MANAGE_HPM)
    {
        /* Receive HPM_CTL response from resource monitor. */
        response = getreq(rm);

        if (response == NULL)
        {
            (void)sprintf(log_buffer, "bad return from getreq(%s), %d, %d",
                          hpm_ctl, pbs_errno, errno);
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
            badreply = 1;
            goto bail;
        }

        if (strcmp(response, HPM_CTL_USERMODE_STR) == 0)
            new_rsrcs->flags |= RSRCS_FLAGS_HPM_USER;
        else if (strcmp(response, HPM_CTL_GLOBALMODE_STR) == 0)
            new_rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER;
        else
        {
            (void)sprintf(log_buffer, "bad response '%s' for '%s@%s'",
                          response, hpm_ctl, exechost);
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                       log_buffer);
            badreply = 1;
            goto bail; /* 'response' is released at bail. */
        }

        /* The HPM reply is still held here; bail releases it. */
    }

bail:
    /* Release a reply that is still held (NULL when none is). */
    free(response);
    response = NULL;

    /* Disconnect from the resource monitor. */
    if (rm >= 0)  /* resmom handle "0" is valid in RPP. */
        closerm(rm);

    /*
     * Unset the alarm and put the previous handler back, but only if they
     * were replaced above; otherwise oact was never filled in.
     */
    if (handler_set)
    {
        alarm(0);
        sigaction(SIGALRM, &oact, NULL);
    }

    /* Reset the old alarm, taking into account how much time has passed. */
    if (remain)
    {
        elapsed = time(NULL) - then;

        DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id,
               remain, (int)elapsed));

        /*
         * 'remain' is unsigned, so the subtraction is only done when it
         * cannot wrap.  If the previous alarm would already have expired,
         * schedule an alarm call in 1 second (close enough, hopefully).
         */
        if ((time_t)remain > elapsed)
            remain -= (unsigned int)elapsed;
        else
            remain = 1;

        DBPRT(("reset to %d secs\n", remain));

        alarm(remain);
    }

    /*
     * Verify all the data came back as expected; if not, abort this
     * iteration of the scheduler.
     */
    if (badreply)
    {
        (void)sprintf(log_buffer,
                      "Got bad info from mom@%s - aborting sched run", exechost);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        free(new_rsrcs);
        return (NULL);
    }

    /* Make a copy of the hostname for the resources struct. */
    new_rsrcs->exechost = schd_strdup(exechost);

    if (new_rsrcs->exechost == NULL)
    {
        (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs",
                      exechost);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        free(new_rsrcs);
        return (NULL);
    }

    new_rsrcs->nodes_total = NODES_REQD(cpus_avail, pmem_avail);

#ifdef NODEMASK
    /* Copy the availmask schd_FAKE_MACH_MULT times to match avail cpus. */
    BITFIELD_CPY(&cpy, &(new_rsrcs->availmask));

    for (i = 2; i <= schd_FAKE_MACH_MULT; i++)
    {
        for (j = 0; j < (cpus_avail / schd_FAKE_MACH_MULT / 2); j++)
            BITFIELD_SHIFTL(&cpy);

        BITFIELD_SETM(&(new_rsrcs->availmask), &cpy);
    }

#endif /* NODEMASK */

    /* Append the new record to the end of the cache list. */
    if (schd_RsrcsList == NULL)
    {
        schd_RsrcsList  = new_rsrcs; /* Start the list. */
    }
    else
    {
        for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next)
            /* Find the last element in the list. */ ;

        rptr->next = new_rsrcs;
    }

    /* Next pointer for the tail of the list points to nothing. */
    new_rsrcs->next = NULL;

    return (new_rsrcs);
}
Example #4
0
/*
 * schd_get_jobinfo - fill in a Job struct from a server status reply.
 *
 * This function takes a pointer to a struct batch_status for a job, and
 * fills in the appropriate fields of the supplied job struct.  The job
 * struct is zeroed first, so its previous contents are discarded without
 * being freed.
 *
 * Returns the number of items that were found, or -1 if a string could
 * not be duplicated.  On a -1 return, the strings already copied into the
 * job struct are left in place.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
  {
  int       changed = 0;
  int       istrue;
  char      tmp_str[120];
  char     *id = "schd_get_jobinfo";
  char     *host;
  char     *p, *tmp_p, *var_p;
  AttrList *attr;

  memset((void *)job, 0, sizeof(Job));

  job->jobid = schd_strdup(bs->name);

  if (job->jobid == NULL)
    {
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
               "schd_strdup(bs->name)");
    return (-1);
    }

  changed ++;

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {

    /*
     * If this is the 'owner' field, keep only the user name (the part
     * before the '@') and copy it into the Job struct.
     */
    if (!strcmp(attr->name, ATTR_owner))
      {

      /*
       * Work on a private, bounded copy so the attribute string is left
       * untouched.  A value longer than the buffer is truncated instead
       * of overrunning tmp_str.
       */
      (void)snprintf(tmp_str, sizeof(tmp_str), "%s", attr->value);

      /* Look for the '@' that separates user and hostname. */
      host = strchr(tmp_str, '@');

      if (host)
        {
        *host = '\0'; /* Replace '@' with NUL (ends username). */
        host ++; /* Move to first character of hostname. */
        }

      job->owner = schd_strdup(tmp_str);

      if (job->owner == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->owner)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The group to which to charge the resources for this job. */
    if (!strcmp(attr->name, ATTR_egroup))
      {
      job->group = schd_strdup(attr->value);

      if (job->group == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->group)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The comment currently assigned to this job. */
    if (!strcmp(attr->name, ATTR_comment))
      {
      job->comment = schd_strdup(attr->value);

      if (job->comment == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->comment)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /*
     * The host on which this job is running (or was running, for
     * suspended or checkpointed jobs).
     */
    if (!strcmp(attr->name, ATTR_exechost))
      {
      job->exechost = schd_strdup(attr->value);

      if (job->exechost == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->exechost)");
        return (-1);
        }

      changed ++;

      continue;
      }

    if (!strcmp(attr->name, ATTR_inter))
      {
      /* Is this job interactive or not? */
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue)
          job->flags |= JFLAGS_INTERACTIVE;
        else
          job->flags &= ~JFLAGS_INTERACTIVE;

        changed ++;
        }
      else
        {
        DBPRT(("%s: can't parse %s = %s into boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_state))
      {
      /* State is one of 'R', 'Q', 'E', etc. */
      job->state = attr->value[0];
      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_queue))
      {
      job->qname = schd_strdup(attr->value);

      if (job->qname == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->qname)");
        return (-1);
        }

      job->flags |= JFLAGS_QNAME_LOCAL;

      changed ++;
      continue;
      }

    /* Variable list: pull the originating queue out of PBS_O_QUEUE. */
    if (!strcmp(attr->name, ATTR_v))
      {
      /* strtok() writes into its argument, so tokenize a scratch copy. */
      var_p = schd_strdup(attr->value);

      if (var_p == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(Variable_List)");
        return (-1);
        }

      p = NULL;

      tmp_p = strstr(var_p, "PBS_O_QUEUE");

      if (tmp_p)
        {
        /* First token is the variable name, second one is its value. */
        p = strtok(tmp_p, "=");
        p = strtok(NULL,  ", ");
        }

      if (p != NULL)
        {
        job->oqueue = schd_strdup(p);
        }
      else
        {
        /* if the originating queue is unknown, default
         * to the locally defined "submit" queue.
         */
        job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
        }

      free(var_p);

      if (job->oqueue == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->oqueue)");
        return (-1);
        }

      changed ++;
      continue;
      }

    /* Requested resources. */
    if (!strcmp(attr->name, ATTR_l))
      {
      if (!strcmp(attr->resource, "arch"))
        {
        job->arch = schd_strdup(attr->value);
        changed ++;

        }
      else if (!strcmp(attr->resource, "mem"))
        {
        job->memory = schd_val2byte(attr->value);
        changed ++;

        }
      else if (!strcmp(attr->resource, "ncpus"))
        {
        job->ncpus = atoi(attr->value);
        changed ++;

        }
      else if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime = schd_val2sec(attr->value);
        changed ++;

        }

      /* That's all for requested resources. */
      continue;
      }

    /* Resources consumed so far. */
    if (!strcmp(attr->name, ATTR_used))
      {
      if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime_used = schd_val2sec(attr->value);
        changed ++;
        }

      /* No other interesting cases. */
      continue;
      }

    /* Creation time attribute. */
    if (!strcmp(attr->name, ATTR_ctime))
      {
      /* How long ago was it put in the queue ? */
      job->time_queued = schd_TimeNow - atoi(attr->value);
      continue;
      }

    /* Modified time attribute. */
    if (!strcmp(attr->name, ATTR_mtime))
      {
      /* When was the job last modified? */
      job->mtime = atoi(attr->value);
      continue;
      }

    /* Job Substate attribute. */
    if (!strcmp(attr->name, ATTR_substate))
      {
      if (atoi(attr->value) == 43 /* JOB_SUBSTATE_SUSPEND */)
        job->flags |= JFLAGS_SUSPENDED;

      continue;
      }

    /*
     * When was the job last eligible to run?  When a user-hold is
     * released, this value is updated to the current time.  This
     * prevents users from gaining higher priority from holding their
     * jobs.
     */
    if (!strcmp(attr->name, ATTR_etime))
      {
      job->eligible = schd_TimeNow - atoi(attr->value);

      continue;
      }
    }

  /*
   * The originating queue is dereferenced below.  If the reply carried
   * no variable list at all, fall back to the submit queue -- the same
   * default used when the list has no PBS_O_QUEUE entry.
   */
  if (job->oqueue == NULL)
    {
    job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);

    if (job->oqueue == NULL)
      {
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                 "schd_strdup(job->oqueue)");
      return (-1);
      }
    }

  /*
   * No usable memory request: take the default for the originating queue
   * and push that value back to the job through schd_alterjob().
   */
  if (job->memory < 1)
    {
    job->memory = get_default_mem(job->oqueue);
    schd_alterjob(connector, job, ATTR_l, schd_byte2val(job->memory), "mem");
    changed++;
    }

  /*
   * If this job is in the "Running" or "Suspended" state, compute how
   * many seconds remain until it is completed.
   */
  if (job->state == 'R' || job->state == 'S')
    {
    job->time_left = job->walltime - job->walltime_used;
    }

  /*
   * If this job was enqueued since the last time we ran, set the job
   * flag to indicate that we have not yet seen this job.  This makes it
   * a candidate for additional processing.  There may be some inaccuracy,
   * since the time_t has resolution of 1 second.  Attempt to err on the
   * side of caution.
   */
  if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED))
    {
    if (job->time_queued <= (schd_TimeNow - schd_TimeLast))
      {
      job->flags |= JFLAGS_FIRST_SEEN;
      }
    }

  /*
   * If this job was previously running and is now queued, then we
   * need to (a) flag it as having been checkpointed, and (b) move
   * it back to the submit queue, if it's not already there.
   */
  if (job->exechost && job->state == 'Q')
    {
    job->flags |= JFLAGS_CHKPTD;

    /* Without a reported queue name there is nothing to compare. */
    if (job->qname != NULL &&
        strcmp(job->qname, schd_SubmitQueue->queue->qname))
      {
      sprintf(log_buffer, "moving Q'd job %s back to SUBMIT Q",
              job->jobid);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      pbs_movejob(connector, job->jobid, schd_SubmitQueue->queue->qname,
                  NULL);
      }
    }

  /*
   * If this job is currently Suspended (a substate of 'R'unning), then
   * pretend it's queued, so that the scheduling logic will work.
   */
  if (job->state == 'S')
    {
    job->state = 'Q';
    job->flags |= JFLAGS_SUSPENDED;
    }

  /*
   * If this job is suspended, checkpointed, or otherwise "queued"
   * on an execution queue, update the internal representation to
   * pretend it is really on the submit queue.
   */
  if ((job->flags & JFLAGS_SUSPENDED) || (job->flags & JFLAGS_CHKPTD))
    {
    free(job->qname);
    job->qname = schd_strdup(schd_SubmitQueue->queue->qname);

    if (job->qname == NULL)
      {
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                 "schd_strdup(job->qname)");
      return (-1);
      }
    }

  /*
   * If this job came from the EXPRESS queue, set the flag so that it
   * will be treated with the highest of priority.
   */
  if (!strcmp(job->oqueue, schd_EXPRESS_Q_NAME))
    job->flags |= JFLAGS_PRIORITY;

  /*
   * If the 'etime' attribute wasn't found, set it to the time the job has
   * been queued.  Most jobs will be eligible to run their entire lifetime.
   * The exception is a job that has been held - if it was a user hold,
   * the release will reset the etime to the latest value.
   * If no eligible time was given, use the job's creation time.
   */
  if (!job->eligible)
    job->eligible = job->time_queued;

  /*
   * If this job has waited too long, and its queue is NOT over its
   * shares, then bump it up in priority.
   */
  if (job->eligible > schd_MAX_WAIT_TIME && job->sort_order <= 100)
    job->flags |= JFLAGS_WAITING;

  return (changed);
  }
Example #5
0
/*
 * schd_get_jobinfo - fill in a Job struct from a server status reply.
 *
 * This function takes a pointer to a struct batch_status for a job, and
 * fills in the appropriate fields of the supplied job struct.  The job
 * struct is zeroed first, so its previous contents are discarded without
 * being freed.
 *
 * Returns the number of items that were found, or -1 if a string could
 * not be duplicated.  The attribute strings in 'bs' are left unmodified
 * on every return path.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
  {
  char *id = "schd_get_jobinfo";
  int     changed = 0;
  int     cpu_req = 0;
  size_t  mem_req = 0;
  char     *host;
  char *p, *tmp_p, *var_p;
  AttrList *attr;
  int     istrue;

  memset((void *)job, 0, sizeof(Job));

  job->jobid = schd_strdup(bs->name);

  if (job->jobid == NULL)
    {
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
               "schd_strdup(bs->name)");
    return (-1);
    }

  changed ++;

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {

    /*
     * If this is the 'owner' field, chop it into 'owner' and 'host'
     * fields, and copy them into the Job struct.
     */
    if (!strcmp(attr->name, ATTR_owner))
      {

      /* Look for the '@' that separates user and hostname. */
      host = strchr(attr->value, '@');

      if (host)
        {
        *host = '\0'; /* Replace '@' with NUL (ends username). */
        host ++; /* Move to first character of hostname. */
        }

      job->owner = schd_strdup(attr->value);

      if (job->owner == NULL)
        {
        /*
         * We don't "own" the attribute strings, so put the '@' back
         * before bailing out.  'host' points one past the hole.
         */
        if (host)
          *(host - 1) = '@';

        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->owner)");
        return (-1);
        }

      changed ++;

      /*
       * When the owner has no '@', 'host' is NULL here and is handed to
       * schd_strdup() as-is; a NULL result is treated as an error below.
       */
      job->host  = schd_strdup(host);

      /*
       * Both pieces have been copied (or the copy failed), so restore
       * the '@' removed above in case something else expects it to be
       * there.  This is done before the error check so the failure path
       * does not leave the attribute string truncated.
       */
      if (host)
        *(host - 1) = '@';

      if (job->host == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->host)");
        return (-1);
        }

      changed ++;

      /* That's all for the owner field. */
      continue;
      }

    /* The group to which to charge the resources for this job. */
    if (!strcmp(attr->name, ATTR_egroup))
      {
      job->group = schd_strdup(attr->value);

      if (job->group == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->group)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The comment currently assigned to this job. */
    if (!strcmp(attr->name, ATTR_comment))
      {
      job->comment = schd_strdup(attr->value);

      if (job->comment == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->comment)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The host on which this job is running. */
    if (!strcmp(attr->name, ATTR_exechost))
      {
      job->exechost = schd_strdup(attr->value);

      if (job->exechost == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->exechost)");
        return (-1);
        }

      changed ++;

      continue;
      }

    if (!strcmp(attr->name, ATTR_inter))
      {
      /* Is this job interactive or not? */
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue)
          job->flags |= JFLAGS_INTERACTIVE;
        else
          job->flags &= ~JFLAGS_INTERACTIVE;

        changed ++;
        }
      else
        {
        DBPRT(("%s: can't parse %s = %s into boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_state))
      {
      /* State is one of 'R', 'Q', 'E', etc. */
      job->state = attr->value[0];
      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_queue))
      {
      job->qname = schd_strdup(attr->value);

      if (job->qname == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->qname)");
        return (-1);
        }

      job->flags |= JFLAGS_QNAME_LOCAL;

      changed ++;
      continue;
      }

    /* Variable list: pull the originating queue out of PBS_O_QUEUE. */
    if (!strcmp(attr->name, ATTR_v))
      {
      /* strtok() writes into its argument, so tokenize a scratch copy. */
      var_p = schd_strdup(attr->value);

      if (var_p == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(Variable_List)");
        return (-1);
        }

      p = NULL;

      tmp_p = strstr(var_p, "PBS_O_QUEUE");

      if (tmp_p)
        {
        /* First token is the variable name, second one is its value. */
        p = strtok(tmp_p, "=");
        p = strtok(NULL,  ", ");
        }

      if (p != NULL)
        {
        job->oqueue = schd_strdup(p);
        }
      else
        {
        /* if the originating queue is unknown, default
         * to the locally defined "submit" queue.
         */
        job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
        }

      free(var_p);

      changed ++;
      continue;
      }

    /*
     * Requested resources.  job->nodes ends up as the largest node count
     * implied by any of the ncpus, mppe and mem requests.
     */
    if (!strcmp(attr->name, ATTR_l))
      {
      if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime = schd_val2sec(attr->value);
        changed ++;
        }
      else if (!strcmp(attr->resource, "ncpus"))
        {
        cpu_req = atoi(attr->value);
        job->nodes = MAX(job->nodes, cpu_req);
        changed ++;
        }
      else if (!strcmp(attr->resource, "mppe"))
        {
        cpu_req = atoi(attr->value);
        job->nodes = MAX(job->nodes, cpu_req);
        changed ++;
        }
      else if (!strcmp(attr->resource, "mem"))
        {
        mem_req = schd_val2byte(attr->value);
        job->nodes = MAX(job->nodes, NODES_FROM_MEM(mem_req));
        changed ++;
        }

#if PE_MASK != 0
      else if (!strcmp(attr->resource, "pe_mask"))
        {
        if (schd_str2mask(attr->value, &job->nodemask))
          {
          (void)sprintf(log_buffer,
                        "bad pe_mask %s for job %s", attr->value, job->jobid);
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     log_buffer);
          }
        else
          changed++; /* Job pe_mask was valid. */
        }

#endif /* PE_MASK */

      /* That's all for requested resources. */
      continue;
      }

    /* Resources consumed so far. */
    if (!strcmp(attr->name, ATTR_used))
      {
      if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime_used = schd_val2sec(attr->value);

        changed ++;
        }

      /* No other interesting cases. */
      continue;
      }

    /* Session ID for running jobs (used to correlate GRM info). */
    if (!strcmp(attr->name, ATTR_session))
      {
      job->session = atoi(attr->value);
      continue;
      }

    /* Job Priority attribute (inherited from queue) */
    if (!strcmp(attr->name, ATTR_p))
      {
      job->priority = atoi(attr->value);
      continue;
      }

    /* Creation time attribute. */
    if (!strcmp(attr->name, ATTR_ctime))
      {
      /* How long ago was it put in the queue ? */
      job->time_queued = schd_TimeNow - atoi(attr->value);

      continue;
      }

    /* Modified time attribute. */
    if (!strcmp(attr->name, ATTR_mtime))
      {
      /* When was the job last modified? */
      job->mtime = atoi(attr->value);

      continue;
      }

#ifdef ATTR_etime
    /*
     * When was the job last eligible to run?  When a user-hold is
     * released, this value is updated to the current time.  This
     * prevents users from gaining higher priority from holding their
     * jobs.
     */
    if (!strcmp(attr->name, ATTR_etime))
      {
      job->eligible = schd_TimeNow - atoi(attr->value);

      continue;
      }

#endif /* ATTR_etime */
    }

  /*
   * If this job is in the "Running" state, compute how many seconds
   * remain until it is completed.
   */
  if (job->state == 'R')
    {
    job->time_left = job->walltime - job->walltime_used;
    }

  /*
   * If this job was enqueued since the last time we ran, set the job
   * flag to indicate that we have not yet seen this job.  This makes it
   * a candidate for additional processing.  There may be some inaccuracy,
   * since the time_t has resolution of 1 second.  Attempt to err on the
   * side of caution.
   */
  if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED))
    {
    if (job->time_queued <= (schd_TimeNow - schd_TimeLast))
      {
      job->flags |= JFLAGS_FIRST_SEEN;
      }
    }

  /*
   * If the 'etime' attribute wasn't found, set it to the time the job has
   * been queued.  Most jobs will be eligible to run their entire lifetime.
   * The exception is a job that has been held - if it was a user hold,
   * the release will reset the etime to the latest value.
   * If no eligible time was given, use the job's creation time.
   */
  if (!job->eligible)
    job->eligible = job->time_queued;

#if defined(sgi)
  /*
   * If the job provided a memory or CPU resource that does not match
   * the resources that will be allocated by the assigned nodes (i.e.
   * a request for 100mb of memory and 16 CPUs - the job will "get" all
   * 4GB of memory anyway), alter the job attributes such that they
   * will align with the assigned nodes later.
   */
  bump_rsrc_requests(job, cpu_req, mem_req);

#endif /* defined(sgi) */

  /*
   * Need to update the  time_until_eligible  and  total_delay fields,
   * probably from a global array of information saved from previous
   * scheduler iteration.
   */

  /*
   * Calculate the job priority weight sort key to be used later in
   * job sorting. This is the "priority" the job should have during
   * sorting based on the size of the job, the length of time queued,
   * and the job type.
   */
  calc_job_weight(job);

  return (changed);
  }
Example #6
0
/*
 * schd_get_queue_limits - query queue information from the server.
 *
 * Resets the limit, count and feature fields of 'queue', releases any job
 * list and user ACL still attached to it, then asks the server for the
 * attributes named in 'alist' and stores the ones it recognizes.
 *
 * Returns 0 on success, or -1 if the status request to the server fails.
 */
int
schd_get_queue_limits(Queue *queue)
  {
  char   *id = "schd_get_queue_limits";
  int     istrue;
  int     local_errno = 0;
  Batch_Status *bs;
  AttrList *attr;

  /* Attributes to request; each entry links to the next one in the array. */
  static AttrList alist[] =
    {
    {&alist[1],  ATTR_start, "", ""},
    {&alist[2],  ATTR_enable, "", ""},
    {&alist[3],  ATTR_count, "", ""},
    {&alist[4],  ATTR_maxuserrun, "", ""},
    {&alist[5],  ATTR_rescavail, "", ""},
    {&alist[6],  ATTR_rescassn, "", ""},
    {&alist[7],  ATTR_rescdflt, "", ""},
    {&alist[8],  ATTR_rescmax, "", ""},
    {&alist[9],  ATTR_rescmin, "", ""},
    {&alist[10], ATTR_acluren, "", ""},
    {&alist[11], ATTR_acluser, "", ""},
    {&alist[12], ATTR_p,  "", ""},
    {NULL,       ATTR_maxrun, "", ""}
    };

  /* Forget everything learned about this queue on a previous pass. */
  queue->running = UNSPECIFIED;
  queue->queued = UNSPECIFIED;
  queue->maxrun = UNSPECIFIED;
  queue->userrun      = UNSPECIFIED;
  queue->ncpus_max = UNSPECIFIED;
  queue->ncpus_min = UNSPECIFIED;
  queue->ncpus_default = UNSPECIFIED;
  queue->ncpus_assn = UNSPECIFIED;
  queue->mem_max = UNSPECIFIED;
  queue->mem_min = UNSPECIFIED;
  queue->mem_default  = UNSPECIFIED;
  queue->wallt_max = UNSPECIFIED;
  queue->wallt_min = UNSPECIFIED;
  queue->wallt_default = UNSPECIFIED;
  queue->rsrcs = NULL;
  queue->flags = 0;
  queue->priority     = UNSPECIFIED;
  queue->speed   = UNSPECIFIED;

  /* The string-valued features are heap copies; release the old ones. */
  if (queue->featureA)
    {
    free(queue->featureA);
    queue->featureA = NULL;
    }

  if (queue->featureB)
    {
    free(queue->featureB);
    queue->featureB = NULL;
    }

  if (queue->featureC)
    {
    free(queue->featureC);
    queue->featureC = NULL;
    }

  queue->featureD = UNSPECIFIED;

  queue->featureE = UNSPECIFIED;
  queue->featureF = UNSPECIFIED;
  queue->featureG = UNSPECIFIED;
  queue->featureH = UNSPECIFIED;
  queue->featureI = UNSPECIFIED;

  /*
   * NOTE(review): queue->rsrcs was set to NULL above, so this branch can
   * never run.  Left as-is because it is not clear from here whether the
   * queue owns that list -- confirm before re-enabling the cleanup.
   */
  if (queue->rsrcs)
    {
    DBPRT(("%s: found resource list on queue '%s'!  Freeing them...\n", id,
           queue->qname));
    cleanup_rsrcs(queue->rsrcs);
    queue->rsrcs        = NULL;
    }

  if (queue->jobs)
    {
    DBPRT(("%s: found jobs on queue '%s'!  Freeing them...\n", id,
           queue->qname));
    schd_free_jobs(queue->jobs);
    queue->jobs         = NULL;
    }

  if (queue->useracl)
    {
    DBPRT(("%s: found user ACL list on queue '%s'!  Freeing it...\n", id,
           queue->qname));
    schd_free_useracl(queue->useracl);
    queue->useracl = NULL;
    }


  /* Ask the server for information about the specified queue. */

  if ((bs = pbs_statque_err(connector, queue->qname, alist, NULL, &local_errno)) == NULL)
    {
    sprintf(log_buffer, "pbs_statque failed, \"%s\" %d",
            queue->qname, local_errno);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    return (-1);
    }

  /* Process the list of attributes returned by the server. */

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {

    /* Is queue started? */
    if (!strcmp(attr->name, ATTR_start))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue is not stopped. */
          queue->flags &= ~QFLAGS_STOPPED;
        else
          queue->flags |= QFLAGS_STOPPED;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    /* Is queue enabled? */
    if (!strcmp(attr->name, ATTR_enable))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue is not disabled. */
          queue->flags &= ~QFLAGS_DISABLED;
        else
          queue->flags |= QFLAGS_DISABLED;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    /* How many jobs are queued and running? */
    if (!strcmp(attr->name, ATTR_count))
      {
      queue->queued = schd_how_many(attr->value, SC_QUEUED);
      queue->running = schd_how_many(attr->value, SC_RUNNING);
      continue;
      }

    /* Queue-wide maximum number of jobs running. */
    if (!strcmp(attr->name, ATTR_maxrun))
      {
      queue->maxrun = atoi(attr->value);
      continue;
      }

    /* Per-user maximum number of jobs running. */
    if (!strcmp(attr->name, ATTR_maxuserrun))
      {
      queue->userrun = atoi(attr->value);
      continue;
      }

    /* Queue Priority Value */
    if (!strcmp(attr->name, ATTR_p))
      {
      queue->priority = atoi(attr->value);
      continue;
      }

    /* Is there an enabled user access control list on this queue? */
    if (!strcmp(attr->name, ATTR_acluren))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue has an ACL */
          queue->flags |= QFLAGS_USER_ACL;
        else
          queue->flags &= ~QFLAGS_USER_ACL;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    /* The user ACL itself; replaces any list seen earlier in this reply. */
    if (!strcmp(attr->name, ATTR_acluser))
      {
      if (queue->useracl)
        {
        DBPRT(("queue %s acluser already set!\n", queue->qname));
        schd_free_useracl(queue->useracl);
        }

      queue->useracl = schd_create_useracl(attr->value);

      continue;
      }

    /* Queue maximum resource usage. */
    if (!strcmp(attr->name, ATTR_rescmax))
      {
      if (!strcmp("mem", attr->resource))
        {
        queue->mem_max = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        queue->ncpus_max = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        {
        queue->wallt_max = schd_val2sec(attr->value);
        continue;
        }

      if (!strcmp("speed", attr->resource))
        {
        queue->speed = atoi(attr->value);
        continue;
        }

      if (!strcmp(FEATURE_A, attr->resource))
        {
        queue->featureA = schd_strdup(attr->value);
        continue;
        }

      if (!strcmp(FEATURE_B, attr->resource))
        {
        queue->featureB = schd_strdup(attr->value);
        continue;
        }

      if (!strcmp(FEATURE_C, attr->resource))
        {
        queue->featureC = schd_strdup(attr->value);
        continue;
        }

      if (!strcmp(FEATURE_D, attr->resource))
        {
        queue->featureD = atol(attr->value);
        continue;
        }

      if (!strcmp(FEATURE_E, attr->resource))
        {
        queue->featureE = atol(attr->value);
        continue;
        }

      if (!strcmp(FEATURE_F, attr->resource))
        {
        queue->featureF = atol(attr->value);
        continue;
        }

      /*
       * Boolean features: only store the value when it parsed cleanly.
       * Otherwise 'istrue' may be stale or never written, so the field
       * keeps the UNSPECIFIED it was reset to above.
       */
      if (!strcmp(FEATURE_G, attr->resource))
        {
        if (schd_val2bool(attr->value, &istrue) == 0)
          {
          queue->featureG = istrue;
          }
        else
          {
          DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
                 attr->name, attr->value));
          }

        continue;
        }

      if (!strcmp(FEATURE_H, attr->resource))
        {
        if (schd_val2bool(attr->value, &istrue) == 0)
          {
          queue->featureH = istrue;
          }
        else
          {
          DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
                 attr->name, attr->value));
          }

        continue;
        }

      if (!strcmp(FEATURE_I, attr->resource))
        {
        if (schd_val2bool(attr->value, &istrue) == 0)
          {
          queue->featureI = istrue;
          }
        else
          {
          DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
                 attr->name, attr->value));
          }

        continue;
        }

      continue;
      }

    /* Queue minimum resource usage. */
    if (!strcmp(attr->name, ATTR_rescmin))
      {
      if (!strcmp("mem", attr->resource))
        {
        queue->mem_min = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        queue->ncpus_min = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        {
        queue->wallt_min = schd_val2sec(attr->value);
        continue;
        }

      continue;
      }

    /*
     * Queue assigned (in use) resource usage.  Note that mem_assn is not
     * among the fields reset at the top of this function.
     */
    if (!strcmp(attr->name, ATTR_rescassn))
      {
      if (!strcmp("mem", attr->resource))
        {
        queue->mem_assn = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        queue->ncpus_assn = atoi(attr->value);
        }

      continue;
      }

    /* Queue default resource values. */
    if (!strcmp(attr->name, ATTR_rescdflt))
      {
      if (!strcmp("mem", attr->resource))
        {
        queue->mem_default = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        queue->ncpus_default = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        queue->wallt_default = schd_val2sec(attr->value);
      }

    /* Ignore anything else */
    }

  pbs_statfree(bs);

  return (0);
  }
Example #7
0
/*
 * schd_get_queue_memory - query queue memory limit from the server.
 *
 * Asks the server for the default and maximum resource settings of the
 * queue named 'qName' and picks out the "mem" values.
 *
 * Returns the default memory value if it is non-zero, otherwise the
 * maximum memory value if that is non-zero.  Returns UNSPECIFIED when
 * neither is set or when the status request to the server fails.
 */
size_t schd_get_queue_memory(char *qName)
  {
  /* Identifier used in log records; must name this function. */
  char   *id = "schd_get_queue_memory";
  size_t  mem_max, mem_default;
  Batch_Status *bs;
  AttrList *attr;
  int local_errno = 0;

  /* Attributes to request; the first entry links to the second. */
  static AttrList alist[] =
    {
    {&alist[1],  ATTR_rescdflt, "", ""},
    {NULL,       ATTR_rescmax, "", ""}
    };

  /* Zero means "not reported by the server". */
  mem_default  = (size_t)0;
  mem_max  = (size_t)0;

  /* Ask the server for information about the specified queue. */

  if ((bs = pbs_statque_err(connector, qName, alist, NULL, &local_errno)) == NULL)
    {
    sprintf(log_buffer, "pbs_statque failed, \"%s\" %d",
            qName, local_errno);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    return (UNSPECIFIED);
    }

  /* Process the list of attributes returned by the server. */

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {
    /* Queue maximum resource usage. */
    if (!strcmp(attr->name, ATTR_rescmax))
      {
      if (!strcmp("mem", attr->resource))
        mem_max = schd_val2byte(attr->value);

      continue;
      }

    /* Queue default resource value. */
    if (!strcmp(attr->name, ATTR_rescdflt))
      {
      if (!strcmp("mem", attr->resource))
        mem_default = schd_val2byte(attr->value);

      continue;
      }

    /* Ignore anything else */
    }

  pbs_statfree(bs);

  /* The default takes precedence over the maximum. */
  if (mem_default != (size_t)0)
    return(mem_default);

  if (mem_max != (size_t)0)
    return(mem_max);

  return (UNSPECIFIED);
  }
Example #8
0
/*
 * Find an entry for the resources for the requested host in the list of
 * existing resources, or create a new one for that host and return it.
 *
 * If 'exechost' is already present on schd_RsrcsList, the cached entry is
 * returned.  Otherwise the resource monitor on that host is queried for
 * "loadave", "physmem", "ncpus" and "arch", a new Resources entry is filled
 * in from the replies, appended to the tail of schd_RsrcsList, and returned.
 *
 * The query is guarded by a SIGALRM of GETRSRCS_CONNECT_TIME seconds.  Any
 * alarm that was pending on entry is re-armed on exit with the time spent
 * in this function deducted (never less than one second).
 *
 * Returns NULL if memory cannot be allocated, if the resource monitor
 * cannot be contacted, or if any of the replies is missing.
 */
Resources *
schd_get_resources(char *exechost)
  {
  char   *id = "schd_get_resources";
  Resources *rptr, *new_rsrcs;
  int     rm;

  char   *response = NULL;
  int     badreply   = 0;
  int     local_errno = 0;

  struct sigaction act, oact;
  int     handler_set = 0; /* Nonzero once our SIGALRM handler is installed. */
  unsigned int remain; /* Time remaining in any old alarm(). */
  time_t then;  /* When this alarm() was started. */
  long   elapsed; /* Seconds spent talking to the resource monitor. */

  /*
   * Check for a local copy of the resources being available already.
   * If so, just return a reference to that Resources structure.
   */

  for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next)
    {
    if (strcmp(rptr->exechost, exechost) == 0)
      return (rptr);
    }

  schd_timestamp("get_rsrcs");

  /*
   * No cached resource information for 'exechost'.  Need to query the
   * host for its information.
   */

  if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL)
    {
    (void)sprintf(log_buffer, "Unable to alloc space for Resources.");
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    return (NULL); /* Can't get the information - nowhere to store it. */
    }

  memset((void *)new_rsrcs, 0, sizeof(Resources));

  act.sa_flags = 0;
  act.sa_handler = connect_interrupt;
  sigemptyset(&act.sa_mask);
  remain = 0;
  then = 0;

  /*
   * Set the alarm, and maintain some idea of how long was left on any
   * previously set alarm.  If the handler cannot be installed, leave any
   * existing alarm alone: 'oact' is not filled in and must not be restored.
   */

  if (sigaction(SIGALRM, &act, &oact) == 0)
    {
    handler_set = 1;
    remain = alarm(GETRSRCS_CONNECT_TIME);
    then = time(NULL);
    }

  if ((rm = openrm(exechost, pbs_rm_port)) == -1)
    {
    (void)sprintf(log_buffer,
                  "Unable to contact resmom@%s", exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    badreply = 1;
    goto bail;
    }

  /*
   * Turn off full response.  Responses will be received in the order in
   * which they are sent.
   */
  fullresp(0);

  /* Build a list of all the resources about which we want information. */

  addreq(rm, "loadave");

  addreq(rm, "physmem");

  addreq(rm, "ncpus");

  addreq(rm, "arch");

  /* Get the values back from the resource monitor, and round up. */

  /* Receive LOADAVE response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->loadave = atof(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive PHYSMEM response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->mem_total = schd_val2byte(response);
    (void)free(response);
    }
  else
    {
    /* Name the resource that was actually requested ("physmem"). */
    (void)sprintf(log_buffer, "bad return from getreq(physmem), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive NCPUS response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->ncpus_total = atoi(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive ARCH response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->arch = schd_strdup(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(arch), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

bail:

  /* Disconnect from the resource monitor. */

  if (rm >= 0)  /* resmom handle "0" is valid in RPP. */
    closerm(rm);

  /*
   * Unset the alarm and handler, but only if they were installed above.
   * Otherwise alarm(0) would silently cancel the caller's pending alarm
   * and 'oact' would hold indeterminate data.
   */
  if (handler_set)
    {
    alarm(0);

    sigaction(SIGALRM, &oact, &act);
    }

  /* Reset the old alarm, taking into account how much time has passed. */
  if (remain)
    {
    elapsed = (long)(time(NULL) - then);

    if (elapsed < 0)
      elapsed = 0; /* Clock stepped backwards; charge nothing. */

    DBPRT(("%s: old alarm had %u secs remaining, %ld elapsed, ", id,
           remain, elapsed));

    /*
     * How much time remains even after the time spent above?  'remain'
     * is unsigned, so compare before subtracting to avoid wrapping around
     * to a huge value.  If the previous alarm would already have expired,
     * schedule an alarm call in 1 second (close enough, hopefully).
     */

    if ((unsigned long)elapsed < (unsigned long)remain)
      remain -= (unsigned int)elapsed;
    else
      remain = 1;

    DBPRT(("reset to %u secs\n", remain));

    alarm(remain);
    }

  /*
   * Verify all the data came back as expected; if not, abort this
   * iteration of the scheduler.
   */

  if (badreply)
    {
    (void)sprintf(log_buffer,
                  "Got bad info from mom@%s - skipping this node", exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    free(new_rsrcs);
    return (NULL);
    }

  /* Make a copy of the hostname for the resources struct. */
  new_rsrcs->exechost = schd_strdup(exechost);

  if (new_rsrcs->exechost == NULL)
    {
    (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs",
                  exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    /*
     * Release the arch string copied above so it is not leaked along
     * with the struct.  NOTE(review): assumes schd_strdup() returns
     * malloc()'d storage - confirm against its definition.
     */
    free(new_rsrcs->arch);
    free(new_rsrcs);
    return (NULL);
    }

  if (schd_RsrcsList == NULL)
    {
    schd_RsrcsList  = new_rsrcs; /* Start the list. */
    }
  else
    {
    for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next)
      /* Find the last element in the list. */ ;

    rptr->next = new_rsrcs;
    }

  /* Next pointer for the tail of the list points to nothing. */
  new_rsrcs->next = NULL;

  return (new_rsrcs);
  }