Пример #1
0
static void
dump_resources(Resources *rsrcs)
  {
  char   *id = "dump_res";
  char  tmpstr[40];
  long long tmpval;

  /* Log the system's status */

  (void)sprintf(log_buffer,
                "Resources for host %s", rsrcs->exechost);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %s", "SYSTEM Architecture:",
                rsrcs->arch);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %0.2f", "CPU Load average:",
                rsrcs->loadave);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %d / %d (%.2f%%)",
                "CPUs allocated:", rsrcs->ncpus_alloc, rsrcs->ncpus_total,
                (rsrcs->ncpus_alloc * 100.0) / rsrcs->ncpus_total);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  strcpy(tmpstr, schd_byte2val(rsrcs->mem_total));
  tmpval = rsrcs->mem_total - rsrcs->mem_alloc;

  if (tmpval < 0L)
    tmpval = 0L;

  (void)sprintf(log_buffer, " :: %-20s = %s / %s", "Memory (free):",
                schd_byte2val((size_t)tmpval), tmpstr);

  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %d", "Running jobs:",
                rsrcs->njobs);

  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  }
Пример #2
0
static void
dump_resources(Resources *rsrcs)
{
    char   *id = "dump_resources";

    /* Log the system's status */

#if 0
    (void)sprintf(log_buffer, " %d%% usr, %d%% sys, %d%% idl",
                  rsrcs->usrtime, rsrcs->systime, rsrcs->idltime);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
#endif /* 0 */

    (void)sprintf(log_buffer,
                  "Resources for host %s", rsrcs->exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    (void)sprintf(log_buffer, " :: %-24s = %s", "Memory (free):",
                  schd_byte2val(rsrcs->freemem));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    (void)sprintf(log_buffer, " :: %-24s = %d / %d (%.2f%% utilization)",
                  "Nodes allocated:", rsrcs->nodes_alloc, rsrcs->nodes_total,
                  (rsrcs->nodes_alloc * 100.0) / rsrcs->nodes_total);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

#ifdef NODEMASK
    (void)sprintf(log_buffer, " :: %-24s = %s", "Nodes configured:",
                  schd_format_nodemask(&(rsrcs->availmask), &(rsrcs->availmask)));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    (void)sprintf(log_buffer, " :: %-24s = %s", "Nodes in use:",
                  schd_format_nodemask(&(rsrcs->availmask), &(rsrcs->nodes_used)));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
#endif /* NODEMASK */

    (void)sprintf(log_buffer, " :: %-24s = %0.2f", "CPU Load average:",
                  rsrcs->loadave);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    (void)sprintf(log_buffer, " :: %-24s = %d", "Running jobs:",
                  rsrcs->njobs);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    if (schd_MANAGE_HPM)
    {
        (void)sprintf(log_buffer, " :: %-24s = %s mode, %sin use",
                      "HPM Counters:",
                      rsrcs->flags & RSRCS_FLAGS_HPM_USER ? "user" : "global",
                      rsrcs->flags & RSRCS_FLAGS_HPM_IN_USE ? "" : "not ");
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    }
}
Пример #3
0
static void
dump_resources(Resources *rsrcs)
  {
  char   *id = "dump_res";

  /* Log the system's status */

  (void)sprintf(log_buffer,
                "Resources for host %s", rsrcs->exechost);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %s", "SYSTEM Architecture:",
                rsrcs->arch);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %d / %d (%.2f%%)",
                "CPUs allocated:", rsrcs->ncpus_alloc, rsrcs->ncpus_total,
                (rsrcs->ncpus_alloc * 100.0) / rsrcs->ncpus_total);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %0.2f", "CPU Load average:",
                rsrcs->loadave);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %s", "Memory (free):",
                schd_byte2val(rsrcs->freemem));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %s", "/tmp (free):",
                schd_byte2val(rsrcs->tmpdir));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-20s = %d", "Running jobs:",
                rsrcs->njobs);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  }
Пример #4
0
void
schd_print_schedule(void)
  {
  int i;

  /* print Schedule for sanity check */
#if 0
  printf("DEBUG: (job schedule)\n");

  for (i = 0; i < nJobSchedule; i++)
    {
    printf("%c %ld %s %d cpus %ld(b)memory %ld\n", JobSchedule[i].event,
           JobSchedule[i].time, JobSchedule[i].qname, JobSchedule[i].cpu,
           schd_byte2val(JobSchedule[i].mem), JobSchedule[i].walltime);
    }

#endif
  }
Пример #5
0
static void
dump_resources(Resources *rsrcs)
  {
  char   *id = "dump_resources";

  /* Log the system's status */

  (void)sprintf(log_buffer,
                "Resources for host %s", rsrcs->exechost);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-24s = %s", "Memory (free):",
                schd_byte2val(rsrcs->freemem));
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-24s = %d / %d (%.2f%% utilization)",
                "Nodes allocated:", rsrcs->nodes_alloc, rsrcs->nodes_total,
                (rsrcs->nodes_alloc * 100.0) / rsrcs->nodes_total);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  (void)sprintf(log_buffer, " :: %-24s = %d", "Running jobs:",
                rsrcs->njobs);
  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
  }
Пример #6
0
/*
 * schd_get_queue_limits - query queue information from the server.
 *
 * Returns 0 on success, -1 for "fatal errors", and 1 for a transient
 * error (i.e., the queue failed the sanity checks imposed by the
 * queue_sanity() function).
 */
int
schd_get_queue_limits(Queue *queue)
  {
  char   *id = "schd_get_queue_limits";
  int     moved = 0, istrue;
  Batch_Status *bs;
  AttrList *attr;
  static AttrList alist[] =
    {
      {&alist[1],  ATTR_start, "", ""},
    {&alist[2],  ATTR_enable, "", ""},
    {&alist[3],  ATTR_count, "", ""},
    {&alist[4],  ATTR_maxuserrun, "", ""},
    {&alist[5],  ATTR_rescavail, "", ""},
    {&alist[6],  ATTR_rescassn, "", ""},
    {&alist[7],  ATTR_rescdflt, "", ""},
    {&alist[8],  ATTR_rescmax, "", ""},
    {&alist[9],  ATTR_rescmin, "", ""},
    {&alist[10], ATTR_acluren, "", ""},
    {&alist[11], ATTR_acluser, "", ""},
    {NULL,       ATTR_maxrun, "", ""}
    };
  size_t  mem_default = UNSPECIFIED;
  size_t  mem_assn    = UNSPECIFIED;
  size_t  mem_max     = UNSPECIFIED;
  size_t  mem_min     = UNSPECIFIED;
  int     cpu_default = UNSPECIFIED;
  int     cpu_assn    = UNSPECIFIED;
  int     cpu_max     = UNSPECIFIED;
  int     cpu_min     = UNSPECIFIED;
  int     nodes_from_cpu, nodes_from_mem;

  queue->running = UNSPECIFIED;
  queue->queued = UNSPECIFIED;
  queue->maxrun = UNSPECIFIED;
  queue->userrun      = UNSPECIFIED;

  queue->nodes_max = UNSPECIFIED;
  queue->nodes_min = UNSPECIFIED;
  queue->nodes_default = UNSPECIFIED;
  queue->nodes_assn = UNSPECIFIED;
  queue->nodes_rsvd = UNSPECIFIED;
  queue->wallt_max = UNSPECIFIED;
  queue->wallt_min = UNSPECIFIED;
  queue->wallt_default = UNSPECIFIED;

  queue->flags = 0;
#ifdef NODEMASK
  BITFIELD_CLRALL(&queue->queuemask);
  BITFIELD_CLRALL(&queue->availmask);
#endif /* NODEMASK */
  queue->rsrcs = NULL;

  if (queue->jobs)
    {
    DBPRT(("%s: found jobs on queue '%s'!  Freeing them...\n", id,
           queue->qname));
    schd_free_jobs(queue->jobs);
    }

  if (queue->useracl)
    {
    DBPRT(("%s: found user ACL list on queue '%s'!  Freeing it...\n", id,
           queue->qname));
    schd_free_useracl(queue->useracl);
    }

  queue->jobs         = NULL;

  queue->useracl = NULL;

  /* Ask the server for information about the specified queue. */

  if ((bs = pbs_statque(connector, queue->qname, alist, NULL)) == NULL)
    {
    sprintf(log_buffer, "pbs_statque failed, \"%s\" %d",
            queue->qname, pbs_errno);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    return (-1);
    }

  /* Process the list of attributes returned by the server. */

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {

    /* Is queue started? */
    if (!strcmp(attr->name, ATTR_start))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue is not stopped. */
          queue->flags &= ~QFLAGS_STOPPED;
        else
          queue->flags |= QFLAGS_STOPPED;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    /* Is queue enabled? */
    if (!strcmp(attr->name, ATTR_enable))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue is not disabled. */
          queue->flags &= ~QFLAGS_DISABLED;
        else
          queue->flags |= QFLAGS_DISABLED;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    /* How many jobs are queued and running? */
    if (!strcmp(attr->name, ATTR_count))
      {
      queue->queued = schd_how_many(attr->value, SC_QUEUED);
      queue->running = schd_how_many(attr->value, SC_RUNNING);
      continue;
      }

    /* Queue-wide maximum number of jobs running. */
    if (!strcmp(attr->name, ATTR_maxrun))
      {
      queue->maxrun = atoi(attr->value);
      continue;
      }

    /* Per-user maximum number of jobs running. */
    if (!strcmp(attr->name, ATTR_maxuserrun))
      {
      queue->userrun = atoi(attr->value);
      continue;
      }

    /* Is there an enabled user access control list on this queue? */
    if (!strcmp(attr->name, ATTR_acluren))
      {
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue) /* if true, queue has an ACL */
          queue->flags |= QFLAGS_USER_ACL;
        else
          queue->flags &= ~QFLAGS_USER_ACL;
        }
      else
        {
        DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_acluser))
      {
      if (queue->useracl)
        {
        DBPRT(("queue %s acluser already set!\n", queue->qname));
        schd_free_useracl(queue->useracl);
        }

      queue->useracl = schd_create_useracl(attr->value);

      continue;
      }

    /* Queue maximum resource usage. */
    if (!strcmp(attr->name, ATTR_rescmax))
      {
      if (!strcmp("mem", attr->resource))
        {
        mem_max = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        cpu_max = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        {
        queue->wallt_max = schd_val2sec(attr->value);
        continue;
        }

#ifdef NODEMASK
      if (!strcmp("nodemask", attr->resource))
        {
        if (schd_str2mask(attr->value, &queue->queuemask))
          {
          (void)sprintf(log_buffer, "couldn't convert nodemask %s",
                        attr->value);
          log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                     log_buffer);
          }
        else
          queue->flags |= QFLAGS_NODEMASK; /* Valid nodemask. */
        }

#endif /* NODEMASK */

      continue;
      }

    /* Queue minimum resource usage. */
    if (!strcmp(attr->name, ATTR_rescmin))
      {
      if (!strcmp("mem", attr->resource))
        {
        mem_min = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        cpu_min = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        {
        queue->wallt_min = schd_val2sec(attr->value);
        continue;
        }

      continue;
      }

    /* Queue assigned (in use) resource usage. */
    if (!strcmp(attr->name, ATTR_rescassn))
      {
      if (!strcmp("mem", attr->resource))
        {
        mem_assn = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        cpu_assn = atoi(attr->value);
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_rescdflt))
      {
      if (!strcmp("mem", attr->resource))
        {
        mem_default = schd_val2byte(attr->value);
        continue;
        }

      if (!strcmp("ncpus", attr->resource))
        {
        cpu_default = atoi(attr->value);
        continue;
        }

      if (!strcmp("walltime", attr->resource))
        queue->wallt_default = schd_val2sec(attr->value);
      }

    /* Ignore anything else */
    }

  pbs_statfree(bs);

  /*
   * Calculate values for queue node limits, given memory and cpu values.
   * Note any discrepancies.
   */
  nodes_from_cpu = NODES_FROM_CPU(cpu_default);
  nodes_from_mem = NODES_FROM_MEM(mem_default);

  if (nodes_from_cpu != nodes_from_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' default cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, cpu_default, schd_byte2val(mem_default),
            nodes_from_cpu, nodes_from_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  nodes_from_cpu = NODES_FROM_CPU(cpu_max);

  nodes_from_mem = NODES_FROM_MEM(mem_max);

  if (nodes_from_cpu != nodes_from_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' maximum cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, cpu_max, schd_byte2val(mem_max),
            nodes_from_cpu, nodes_from_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  nodes_from_cpu = NODES_FROM_CPU(cpu_min);

  nodes_from_mem = NODES_FROM_MEM(mem_min);

  if (nodes_from_cpu != nodes_from_mem)
    {
    sprintf(log_buffer,
            "%s: Queue '%s' minimum cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, cpu_min, schd_byte2val(mem_min),
            nodes_from_cpu, nodes_from_mem);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  /*
   * Note: The assigned cpus and memory need not be exactly the same
   * node equivalency.
   */
  if ((cpu_default != UNSPECIFIED) && (mem_default != UNSPECIFIED))
    queue->nodes_default = NODES_REQD(cpu_default, mem_default);

  if ((cpu_max != UNSPECIFIED) && (mem_max != UNSPECIFIED))
    queue->nodes_max     = NODES_REQD(cpu_max, mem_max);

  if ((cpu_min != UNSPECIFIED) && (mem_min != UNSPECIFIED))
    queue->nodes_min     = NODES_REQD(cpu_min, mem_min);

  if ((cpu_assn != UNSPECIFIED) && (mem_assn != UNSPECIFIED))
    queue->nodes_assn    = NODES_REQD(cpu_assn, mem_assn);

  /*
   * Move any jobs on this queue from the global list onto the queue's
   * list.  Keep track of when the longest-running job will end, and set
   * the 'empty_by' field to that value.  Maintain the ordering as it was
   * in "schd_AllJobs".
   */

  if (schd_AllJobs)
    moved = queue_claim_jobs(queue, &schd_AllJobs);

  if (moved < 0)
    {
    sprintf(log_buffer, "%s: WARNING! Queue '%s' failed to claim jobs.",
            id, queue->qname);
    log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  if (queue->nodes_assn == UNSPECIFIED)
    queue->nodes_assn = 0;

  if (queue->running    == UNSPECIFIED)
    queue->running    = 0;

  /*
   * Find out if the queue is idle, and if it was not before, set the idle
   * time to now.  If there are running jobs, the queue is not idle at the
   * start of this iteration - set idle_since to 0.
   */
  if (queue->running)
    {
    queue->idle_since = 0;
    }
  else
    {
    if (queue->idle_since == 0)
      queue->idle_since = schd_TimeNow;
    }

  /*
   * Get the resources for this queue from the resource monitor (if
   * available).  If the resmom is not accessible, disable the queue.
   * If the resources were received okay, compute the available node
   * masks from the resources and jobs.
   * Don't bother with resources for the special or submit queues.
   */
  if ((strcmp(queue->qname, schd_SubmitQueue->queue->qname) != 0) ||
      ((schd_SpecialQueue != NULL) &&
       (!strcmp(queue->qname, schd_SpecialQueue->queue->qname))))
    {

    queue->rsrcs = schd_get_resources(queue->exechost);

    if (queue->rsrcs != NULL)
      {
      /* Account for this queue's resources. */
      queue->rsrcs->nodes_alloc += queue->nodes_assn;
      queue->rsrcs->njobs       += queue->running;

      /*
       * If the HPM counters do not appear to be in use on this host,
       * check for jobs on the queue that are using hpm.  If so, set
       * the 'HPM_IN_USE' flag on the resources.  This will prevent the
       * HPM counters from being released to global mode at the end
       * of the scheduling run (c.f. cleanup.c).
       * The 'HPM_IN_USE' flag will also be asserted if a job is run
       * that uses the HPM counters.
       */

      if (schd_MANAGE_HPM &&
          !(queue->rsrcs->flags & RSRCS_FLAGS_HPM_IN_USE))
        {
        if (schd_hpm_job_count(queue->jobs))
          queue->rsrcs->flags |= RSRCS_FLAGS_HPM_IN_USE;
        }

#ifdef NODEMASK
      /* And find the nodemasks for the queue and resources. */
      find_nodemasks(queue, queue->rsrcs);

#endif /* NODEMASK */

      }
    else
      {
      (void)sprintf(log_buffer,
                    "Can't get resources for %s@%s - marking unavailable.",
                    queue->qname, queue->exechost);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));

      queue->flags |= QFLAGS_DISABLED;
      }
    }

#ifdef DEBUG
  schd_dump_queue(queue, QUEUE_DUMP_JOBS);

#endif /* DEBUG */

  /*
   * It would probably be better to wait for the world to stabilize
   * than to try to impose some artificial order upon it.  Do not do
   * the sanity check if the queue is stopped.
   */
  if ((queue->flags & QFLAGS_STOPPED) == 0)
    {
    if (!queue_sanity(queue))
      {
      sprintf(log_buffer, "WARNING! Queue '%s' failed sanity checks.",
              queue->qname);
      log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
      DBPRT(("%s: %s\n", id, log_buffer));

      return (1);
      }
    }

  return (0);
  }
Пример #7
0
/*
 * This function takes a pointer to a struct batch_status for a job, and
 * fills in the appropriate fields of the supplied job struct.  It returns
 * the number of items that were found.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
  {
  int       changed = 0;
  int       istrue;
  char      tmp_str[120];
  char     *id = "schd_get_jobinfo";
  char     *host;
  char     *p, *tmp_p, *var_p;
  AttrList *attr;

  memset((void *)job, 0, sizeof(Job));

  job->jobid = schd_strdup(bs->name);

  if (job->jobid == NULL)
    {
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
               "schd_strdup(bs->name)");
    return (-1);
    }

  changed ++;

  for (attr = bs->attribs; attr != NULL; attr = attr->next)
    {

    /*
     * If this is the 'owner' field, chop it into 'owner' and 'host'
     * fields, and copy them into the Job struct.
     */
    if (!strcmp(attr->name, ATTR_owner))
      {

      /* Look for the '@' that separates user and hostname. */
      strcpy(tmp_str, attr->value);
      host = strchr(tmp_str, '@');

      if (host)
        {
        *host = '\0'; /* Replace '@' with NULL (ends username). */
        host ++; /* Move to first character of hostname. */
        }

      job->owner = schd_strdup(tmp_str);

      if (job->owner == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->owner)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The group to which to charge the resources for this job. */
    if (!strcmp(attr->name, ATTR_egroup))
      {
      job->group = schd_strdup(attr->value);

      if (job->group == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->group)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The comment currently assigned to this job. */
    if (!strcmp(attr->name, ATTR_comment))
      {
      job->comment = schd_strdup(attr->value);

      if (job->comment == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->comment)");
        return (-1);
        }

      changed ++;

      continue;
      }

    /* The host on which this job is running (or was running for
     * suspended or checkpointed jobs. */

    if (!strcmp(attr->name, ATTR_exechost))
      {
      job->exechost = schd_strdup(attr->value);

      if (job->exechost == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->exechost)");
        return (-1);
        }

      changed ++;

      continue;
      }

    if (!strcmp(attr->name, ATTR_inter))
      {
      /* Is this job interactive or not? */
      if (schd_val2bool(attr->value, &istrue) == 0)
        {
        if (istrue)
          job->flags |= JFLAGS_INTERACTIVE;
        else
          job->flags &= ~JFLAGS_INTERACTIVE;

        changed ++;
        }
      else
        {
        DBPRT(("%s: can't parse %s = %s into boolean\n", id,
               attr->name, attr->value));
        }

      continue;
      }

    if (!strcmp(attr->name, ATTR_state))
      {
      /* State is one of 'R', 'Q', 'E', etc. */
      job->state = attr->value[0];
      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_queue))
      {
      job->qname = schd_strdup(attr->value);

      if (job->qname == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->qname)");
        return (-1);
        }

      job->flags |= JFLAGS_QNAME_LOCAL;

      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_v))
      {
      var_p = schd_strdup(attr->value);

      if (var_p == NULL)
        {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(Variable_List)");
        return (-1);
        }

      p = NULL;

      tmp_p = strstr(var_p, "PBS_O_QUEUE");

      if (tmp_p)
        {
        p = strtok(tmp_p, "=");
        p = strtok(NULL,  ", ");
        }

      if (p != NULL)
        {
        job->oqueue = schd_strdup(p);
        }
      else
        {
        /* if the originating queue is unknown, default
         * to the locally defined "submit" queue.
         */
        job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
        }

      free(var_p);

      changed ++;
      continue;
      }

    if (!strcmp(attr->name, ATTR_l))
      {
      if (!strcmp(attr->resource, "arch"))
        {
        job->arch = schd_strdup(attr->value);
        changed ++;

        }
      else if (!strcmp(attr->resource, "mem"))
        {
        job->memory = schd_val2byte(attr->value);
        changed ++;

        }
      else if (!strcmp(attr->resource, "ncpus"))
        {
        job->ncpus = atoi(attr->value);
        changed ++;

        }
      else if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime = schd_val2sec(attr->value);
        changed ++;

        }

      /* That's all for requested resources. */
      continue;
      }

    if (!strcmp(attr->name, ATTR_used))
      {
      if (!strcmp(attr->resource, "walltime"))
        {
        job->walltime_used = schd_val2sec(attr->value);
        changed ++;
        }

      /* No other interesting cases. */
      continue;
      }

    /* Creation time attribute. */
    if (!strcmp(attr->name, ATTR_ctime))
      {
      /* How long ago was it put in the queue ? */
      job->time_queued = schd_TimeNow - atoi(attr->value);
      continue;
      }

    /* Modified time attribute. */
    if (!strcmp(attr->name, ATTR_mtime))
      {
      /* When was the job last modified? */
      job->mtime = atoi(attr->value);
      continue;
      }

    /* Job Substate attribute. */
    if (!strcmp(attr->name, ATTR_substate))
      {
      if (atoi(attr->value) == 43 /* JOB_SUBSTATE_SUSPEND */)
        job->flags |= JFLAGS_SUSPENDED;

      continue;
      }

    /*
     * When was the job last eligible to run?  When a user-hold is
     * released, this value is updated to the current time.  This
     * prevents users from gaining higher priority from holding their
     * jobs.
     */
    if (!strcmp(attr->name, ATTR_etime))
      {
      job->eligible = schd_TimeNow - atoi(attr->value);

      continue;
      }
    }

  if (job->memory < 1)
    {
    job->memory = get_default_mem(job->oqueue);
    schd_alterjob(connector, job, ATTR_l, schd_byte2val(job->memory), "mem");
    changed++;
    }

  /*
   * If this job is in the "Running" or "Suspended" state, compute how
   * many seconds remain until it is completed.
   */
  if (job->state == 'R' || job->state == 'S')
    {
    job->time_left = job->walltime - job->walltime_used;
    }

  /*
   * If this job was enqueued since the last time we ran, set the job
   * flag to indicate that we have not yet seen this job.  This makes it
   * a candidate for additional processing.  There may be some inaccuracy,
   * since the time_t has resolution of 1 second.  Attempt to err on the
   * side of caution.
   */
  if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED))
    {
    if (job->time_queued <= (schd_TimeNow - schd_TimeLast))
      {
      job->flags |= JFLAGS_FIRST_SEEN;
      }
    }

  /*
   * If this job was previously running and is now queued, then we
   * need to (a) flag it as having been checkpointed, and (b) move
   * it back to the submit queue, if its not already there.
   */
  if (job->exechost && job->state == 'Q')
    {
    job->flags |= JFLAGS_CHKPTD;

    if (strcmp(job->qname, schd_SubmitQueue->queue->qname))
      {
      sprintf(log_buffer, "moving Q'd job %s back to SUBMIT Q",
              job->jobid);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
      pbs_movejob(connector, job->jobid, schd_SubmitQueue->queue->qname,
                  NULL);
      }
    }

  /*
   * if this job is currently Suspended (a substate of 'R'unning), then
   * pretend its queued, so that the scheduling logic will work.
   */
  if (job->state == 'S')
    {
    job->state = 'Q';
    job->flags |= JFLAGS_SUSPENDED;
    }

  /* if this job is suspended, checkpointed, or otherwise "queued"
   * on an exection queue, update the internal representation of
   * to pretend it is really on the submit queue.
   */

  if ((job->flags & JFLAGS_SUSPENDED) || (job->flags & JFLAGS_CHKPTD))
    {
    free(job->qname);
    job->qname = schd_strdup(schd_SubmitQueue->queue->qname);
    }

  /*
   * If this job came from the EXPRESS queue, set the flag so that it
   * will be treated with the highest of priority.
   */
  if (!strcmp(job->oqueue, schd_EXPRESS_Q_NAME))
    job->flags |= JFLAGS_PRIORITY;

  /*
   * If the 'etime' attribute wasn't found, set it to the time the job has
   * been queued.  Most jobs will be eligible to run their entire lifetime.
   * The exception is a job that has been held - if it was a user hold,
   * the release will reset the etime to the latest value.
   * If not eligible time was given, use the job's creation time.
   */
  if (!job->eligible)
    job->eligible = job->time_queued;

  /* if this job has waited too long, and its queue is NOT over its
   * shares, then bump it up in priority.
   */
  if (job->eligible > schd_MAX_WAIT_TIME && job->sort_order <= 100)
    job->flags |= JFLAGS_WAITING;

  return (changed);
  }
Пример #8
0
static int
bump_rsrc_requests(Job *job, int cpu_req, size_t mem_req)
  {
  char   *id = "bump_rsrc_requests";
  char   *val;
  char    buf[64];
  int     bumped = 0;

  /*
   * If a job gives the "wrong" value for the memory request (for the
   * number of nodes required to fulfill the request), then bump the
   * memory request to the amount of memory the assigned nodes would
   * consume.
   */

  if ((mem_req == 0) || (mem_req != (job->nodes * MB_PER_NODE)))
    {

    /* Make a printable version of the requested memory. */
    strcpy(buf, schd_byte2val(mem_req));

    /* Compute the "right" memory request, based on the nodes. */
    val = schd_byte2val(job->nodes * MB_PER_NODE);

    if (val == NULL)
      return (1);

    if (schd_alterjob(connector, job, ATTR_l, val, "mem"))
      {
      DBPRT(("%s: Failed to set job %s \"mem\" attr to %s\n", id,
             job->jobid, val));
      return (1);
      }

    bumped++;
    }

  /*
   * If a job gives the "wrong" value for the CPU request (for the
   * number of nodes required to fulfill the request), then bump the
   * CPU request to the number of CPUs the assigned nodes would
   * consume.
   */
  if ((cpu_req == 0) || (cpu_req != (job->nodes * PE_PER_NODE)))
    {

    /* Compute the "right" memory request, based on the nodes. */
    sprintf(buf, "%d", (job->nodes * PE_PER_NODE));

    if (schd_alterjob(connector, job, ATTR_l, buf, "ncpus"))
      {
      DBPRT(("%s: Failed to set job %s \"ncpus\" attr to %s\n", id,
             job->jobid, buf));
      return (1);
      }

    bumped++;
    }

  if (bumped)
    {
    strncpy(buf, schd_byte2val(job->nodes * MB_PER_NODE), sizeof(buf) - 1);
    sprintf(log_buffer, "%s cpu/mem (%d/%s) bumped to %d/%s (%d nodes)",
            job->jobid, cpu_req, schd_byte2val(mem_req),
            job->nodes * PE_PER_NODE, buf, job->nodes);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    }

  return (0);
  }