/*
 * schd_get_queue_limits - refresh a Queue structure from the server.
 *
 * The per-iteration fields of 'queue' are reset, the server is asked (with
 * pbs_statque()) for the attributes of the queue named queue->qname, and the
 * reply is folded back into the structure.  Memory and cpu limits are then
 * converted into node counts, the queue's jobs are claimed from the global
 * job list, and (for most queues) the execution host's resources are
 * attached.
 *
 * Returns:
 *    0  on success;
 *   -1  when pbs_statque() fails (treated as a fatal error);
 *    1  when the queue is not stopped and queue_sanity() rejects it
 *       (treated as a transient error).
 */
int
schd_get_queue_limits(Queue *queue)
{
    char           *id = "schd_get_queue_limits";
    int             claimed = 0;
    int             flag_on;
    int             want_rsrcs;
    int             by_cpu, by_mem;
    Batch_Status   *status;
    AttrList       *ap;

    /* The attributes requested from the server, chained as a linked list. */
    static AttrList query[] = {
        {&query[1],  ATTR_start,      "", ""},
        {&query[2],  ATTR_enable,     "", ""},
        {&query[3],  ATTR_count,      "", ""},
        {&query[4],  ATTR_maxuserrun, "", ""},
        {&query[5],  ATTR_rescavail,  "", ""},
        {&query[6],  ATTR_rescassn,   "", ""},
        {&query[7],  ATTR_rescdflt,   "", ""},
        {&query[8],  ATTR_rescmax,    "", ""},
        {&query[9],  ATTR_rescmin,    "", ""},
        {&query[10], ATTR_acluren,    "", ""},
        {&query[11], ATTR_acluser,    "", ""},
        {NULL,       ATTR_maxrun,     "", ""}
    };

    /* Raw cpu/memory figures; turned into node counts once all are known. */
    int             dflt_cpus = UNSPECIFIED;
    int             assn_cpus = UNSPECIFIED;
    int             max_cpus = UNSPECIFIED;
    int             min_cpus = UNSPECIFIED;
    size_t          dflt_mem = UNSPECIFIED;
    size_t          assn_mem = UNSPECIFIED;
    size_t          max_mem = UNSPECIFIED;
    size_t          min_mem = UNSPECIFIED;

    /* Start from a clean slate: nothing is known about the queue yet. */
    queue->flags = 0;
    queue->rsrcs = NULL;

    queue->queued = UNSPECIFIED;
    queue->running = UNSPECIFIED;
    queue->maxrun = UNSPECIFIED;
    queue->userrun = UNSPECIFIED;

    queue->nodes_default = UNSPECIFIED;
    queue->nodes_min = UNSPECIFIED;
    queue->nodes_max = UNSPECIFIED;
    queue->nodes_assn = UNSPECIFIED;
    queue->nodes_rsvd = UNSPECIFIED;

    queue->wallt_default = UNSPECIFIED;
    queue->wallt_min = UNSPECIFIED;
    queue->wallt_max = UNSPECIFIED;

#ifdef NODEMASK
    BITFIELD_CLRALL(&queue->queuemask);
    BITFIELD_CLRALL(&queue->availmask);
#endif /* NODEMASK */

    /* Leftover job and ACL lists from an earlier pass are released. */
    if (queue->jobs != NULL) {
        DBPRT(("%s: found jobs on queue '%s'! Freeing them...\n", id,
            queue->qname));
        schd_free_jobs(queue->jobs);
    }
    queue->jobs = NULL;

    if (queue->useracl != NULL) {
        DBPRT(("%s: found user ACL list on queue '%s'! Freeing it...\n", id,
            queue->qname));
        schd_free_useracl(queue->useracl);
    }
    queue->useracl = NULL;

    /* Query the server about this queue; failure here is fatal. */
    status = pbs_statque(connector, queue->qname, query, NULL);
    if (status == NULL) {
        sprintf(log_buffer, "pbs_statque failed, \"%s\" %d",
            queue->qname, pbs_errno);
        log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));
        return (-1);
    }

    /*
     * Walk the returned attributes.  Each attribute is handled by the first
     * branch whose name matches; anything unrecognized is ignored.
     */
    for (ap = status->attribs; ap != NULL; ap = ap->next) {

        if (strcmp(ap->name, ATTR_start) == 0) {
            /* "started" true means the queue is not stopped. */
            if (schd_val2bool(ap->value, &flag_on) != 0) {
                DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
                    ap->name, ap->value));
            } else if (flag_on) {
                queue->flags &= ~QFLAGS_STOPPED;
            } else {
                queue->flags |= QFLAGS_STOPPED;
            }

        } else if (strcmp(ap->name, ATTR_enable) == 0) {
            /* "enabled" true means the queue is not disabled. */
            if (schd_val2bool(ap->value, &flag_on) != 0) {
                DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
                    ap->name, ap->value));
            } else if (flag_on) {
                queue->flags &= ~QFLAGS_DISABLED;
            } else {
                queue->flags |= QFLAGS_DISABLED;
            }

        } else if (strcmp(ap->name, ATTR_count) == 0) {
            /* Job counts: both figures come out of the same value string. */
            queue->queued = schd_how_many(ap->value, SC_QUEUED);
            queue->running = schd_how_many(ap->value, SC_RUNNING);

        } else if (strcmp(ap->name, ATTR_maxrun) == 0) {
            /* Queue-wide cap on running jobs. */
            queue->maxrun = atoi(ap->value);

        } else if (strcmp(ap->name, ATTR_maxuserrun) == 0) {
            /* Per-user cap on running jobs. */
            queue->userrun = atoi(ap->value);

        } else if (strcmp(ap->name, ATTR_acluren) == 0) {
            /* Whether a user access control list is enabled on the queue. */
            if (schd_val2bool(ap->value, &flag_on) != 0) {
                DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id,
                    ap->name, ap->value));
            } else if (flag_on) {
                queue->flags |= QFLAGS_USER_ACL;
            } else {
                queue->flags &= ~QFLAGS_USER_ACL;
            }

        } else if (strcmp(ap->name, ATTR_acluser) == 0) {
            /* The ACL itself; a second occurrence replaces the first. */
            if (queue->useracl != NULL) {
                DBPRT(("queue %s acluser already set!\n", queue->qname));
                schd_free_useracl(queue->useracl);
            }
            queue->useracl = schd_create_useracl(ap->value);

        } else if (strcmp(ap->name, ATTR_rescmax) == 0) {
            /* Upper resource limits. */
            if (strcmp(ap->resource, "mem") == 0) {
                max_mem = schd_val2byte(ap->value);
            } else if (strcmp(ap->resource, "ncpus") == 0) {
                max_cpus = atoi(ap->value);
            } else if (strcmp(ap->resource, "walltime") == 0) {
                queue->wallt_max = schd_val2sec(ap->value);
            }
#ifdef NODEMASK
            else if (strcmp(ap->resource, "nodemask") == 0) {
                if (schd_str2mask(ap->value, &queue->queuemask)) {
                    sprintf(log_buffer, "couldn't convert nodemask %s",
                        ap->value);
                    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                        log_buffer);
                } else {
                    /* The mask parsed cleanly. */
                    queue->flags |= QFLAGS_NODEMASK;
                }
            }
#endif /* NODEMASK */

        } else if (strcmp(ap->name, ATTR_rescmin) == 0) {
            /* Lower resource limits. */
            if (strcmp(ap->resource, "mem") == 0) {
                min_mem = schd_val2byte(ap->value);
            } else if (strcmp(ap->resource, "ncpus") == 0) {
                min_cpus = atoi(ap->value);
            } else if (strcmp(ap->resource, "walltime") == 0) {
                queue->wallt_min = schd_val2sec(ap->value);
            }

        } else if (strcmp(ap->name, ATTR_rescassn) == 0) {
            /* Resources currently assigned (in use). */
            if (strcmp(ap->resource, "mem") == 0) {
                assn_mem = schd_val2byte(ap->value);
            } else if (strcmp(ap->resource, "ncpus") == 0) {
                assn_cpus = atoi(ap->value);
            }

        } else if (strcmp(ap->name, ATTR_rescdflt) == 0) {
            /* Default resources. */
            if (strcmp(ap->resource, "mem") == 0) {
                dflt_mem = schd_val2byte(ap->value);
            } else if (strcmp(ap->resource, "ncpus") == 0) {
                dflt_cpus = atoi(ap->value);
            } else if (strcmp(ap->resource, "walltime") == 0) {
                queue->wallt_default = schd_val2sec(ap->value);
            }
        }
    }

    pbs_statfree(status);

    /*
     * Convert each cpu/memory pair to nodes both ways and report (but do
     * not otherwise act on) any pair whose two conversions disagree.
     */
    by_cpu = NODES_FROM_CPU(dflt_cpus);
    by_mem = NODES_FROM_MEM(dflt_mem);
    if (by_cpu != by_mem) {
        sprintf(log_buffer,
            "%s: Queue '%s' default cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, dflt_cpus, schd_byte2val(dflt_mem),
            by_cpu, by_mem);
        log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));
    }

    by_cpu = NODES_FROM_CPU(max_cpus);
    by_mem = NODES_FROM_MEM(max_mem);
    if (by_cpu != by_mem) {
        sprintf(log_buffer,
            "%s: Queue '%s' maximum cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, max_cpus, schd_byte2val(max_mem),
            by_cpu, by_mem);
        log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));
    }

    by_cpu = NODES_FROM_CPU(min_cpus);
    by_mem = NODES_FROM_MEM(min_mem);
    if (by_cpu != by_mem) {
        sprintf(log_buffer,
            "%s: Queue '%s' minimum cpu/mem (%d/%s) convert to %d != %d nodes",
            id, queue->qname, min_cpus, schd_byte2val(min_mem),
            by_cpu, by_mem);
        log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));
    }

    /*
     * A node count is only derived when both halves of the pair were
     * supplied.  The assigned pair is not consistency-checked above.
     */
    if ((dflt_cpus != UNSPECIFIED) && (dflt_mem != UNSPECIFIED))
        queue->nodes_default = NODES_REQD(dflt_cpus, dflt_mem);

    if ((max_cpus != UNSPECIFIED) && (max_mem != UNSPECIFIED))
        queue->nodes_max = NODES_REQD(max_cpus, max_mem);

    if ((min_cpus != UNSPECIFIED) && (min_mem != UNSPECIFIED))
        queue->nodes_min = NODES_REQD(min_cpus, min_mem);

    if ((assn_cpus != UNSPECIFIED) && (assn_mem != UNSPECIFIED))
        queue->nodes_assn = NODES_REQD(assn_cpus, assn_mem);

    /*
     * Hand this queue's jobs over from the global list (when there is one)
     * via queue_claim_jobs().  A negative result is logged, not fatal.
     */
    if (schd_AllJobs)
        claimed = queue_claim_jobs(queue, &schd_AllJobs);

    if (claimed < 0) {
        sprintf(log_buffer, "%s: WARNING! Queue '%s' failed to claim jobs.",
            id, queue->qname);
        log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));
    }

    /* Counts the server did not report are taken to be zero. */
    if (queue->running == UNSPECIFIED)
        queue->running = 0;

    if (queue->nodes_assn == UNSPECIFIED)
        queue->nodes_assn = 0;

    /*
     * Idle bookkeeping: a queue with running jobs is not idle (idle_since
     * is zeroed); a queue that has just become idle is stamped with the
     * current scheduler time; one that was already idle keeps its stamp.
     */
    if (queue->running != 0)
        queue->idle_since = 0;
    else if (queue->idle_since == 0)
        queue->idle_since = schd_TimeNow;

    /*
     * Decide whether to fetch the execution host's resources.  The test is:
     * the queue is not the submit queue, OR a special queue exists and this
     * is it.  The second comparison is only evaluated when the first fails.
     *
     * NOTE(review): the original comment here read "Don't bother with
     * resources for the special or submit queues", yet as written the
     * special queue DOES get its resources fetched -- confirm which of the
     * two is intended before changing the condition.
     */
    want_rsrcs = (strcmp(queue->qname, schd_SubmitQueue->queue->qname) != 0);
    if (!want_rsrcs && (schd_SpecialQueue != NULL))
        want_rsrcs =
            (strcmp(queue->qname, schd_SpecialQueue->queue->qname) == 0);

    if (want_rsrcs) {
        queue->rsrcs = schd_get_resources(queue->exechost);

        if (queue->rsrcs == NULL) {
            /* No resource information: take the queue out of service. */
            sprintf(log_buffer,
                "Can't get resources for %s@%s - marking unavailable.",
                queue->qname, queue->exechost);
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
            DBPRT(("%s: %s\n", id, log_buffer));

            queue->flags |= QFLAGS_DISABLED;
        } else {
            /* Charge this queue's usage against the host's resources. */
            queue->rsrcs->nodes_alloc += queue->nodes_assn;
            queue->rsrcs->njobs += queue->running;

            /*
             * When HPM management is on and the host is not yet marked as
             * having HPM in use, mark it if schd_hpm_job_count() reports
             * any such job on this queue.
             */
            if (schd_MANAGE_HPM &&
                !(queue->rsrcs->flags & RSRCS_FLAGS_HPM_IN_USE) &&
                schd_hpm_job_count(queue->jobs)) {
                queue->rsrcs->flags |= RSRCS_FLAGS_HPM_IN_USE;
            }

#ifdef NODEMASK
            /* Work out the node masks for the queue and its resources. */
            find_nodemasks(queue, queue->rsrcs);
#endif /* NODEMASK */
        }
    }

#ifdef DEBUG
    schd_dump_queue(queue, QUEUE_DUMP_JOBS);
#endif /* DEBUG */

    /*
     * A stopped queue is exempt from the sanity check.  Any other queue
     * that fails it is reported as a transient error.
     */
    if (((queue->flags & QFLAGS_STOPPED) == 0) && !queue_sanity(queue)) {
        sprintf(log_buffer, "WARNING! Queue '%s' failed sanity checks.",
            queue->qname);
        log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        return (1);
    }

    return (0);
}
/*
 * schd_get_jobinfo - fill in a Job from the server's status record for it.
 *
 * 'job' is zeroed, then populated from bs->name (the job id) and from each
 * recognized attribute on bs->attribs.  String fields are copied with
 * schd_strdup(); the attribute strings themselves belong to the caller and
 * are left exactly as they were found, on every return path.
 *
 * Returns the number of items that were found (the job id counts as one;
 * the ctime, mtime and etime attributes are recorded but not counted), or
 * -1 if a string copy failed.  On -1 the strings copied so far remain
 * attached to 'job'.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
{
    char     *id = "schd_get_jobinfo";
    int       changed = 0;
    int       cpu_req = 0;
    size_t    mem_req = 0;
    char     *host;
    char     *p, *tmp_p, *var_p;
    AttrList *attr;
    int       istrue;

    memset((void *)job, 0, sizeof(Job));

    job->jobid = schd_strdup(bs->name);
    if (job->jobid == NULL) {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
            "schd_strdup(bs->name)");
        return (-1);
    }
    changed ++;

    for (attr = bs->attribs; attr != NULL; attr = attr->next) {

        /*
         * If this is the 'owner' field, chop it into 'owner' and 'host'
         * fields, and copy them into the Job struct.
         */
        if (!strcmp(attr->name, ATTR_owner)) {

            /* Look for the '@' that separates user and hostname. */
            host = strchr(attr->value, '@');
            if (host) {
                *host = '\0';  /* Replace '@' with NUL (ends username). */
                host ++;       /* Move to first character of hostname. */
            }

            job->owner = schd_strdup(attr->value);

            /*
             * We don't "own" the attribute strings, so put back the '@'
             * removed above as soon as the username has been copied, and
             * before any error return below.  'host' points at the first
             * character of the hostname, so the hole is one character
             * behind it; the hostname itself is unaffected.
             */
            if (host)
                *(host - 1) = '@';

            if (job->owner == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->owner)");
                return (-1);
            }
            changed ++;

            /*
             * NOTE(review): when the owner has no '@', 'host' is NULL and
             * NULL is what schd_strdup() receives here.  How it treats a
             * NULL argument is not visible from this function -- confirm.
             */
            job->host = schd_strdup(host);
            if (job->host == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->host)");
                return (-1);
            }
            changed ++;

            /* That's all for the owner field. */
            continue;
        }

        /* The group to which to charge the resources for this job. */
        if (!strcmp(attr->name, ATTR_egroup)) {
            job->group = schd_strdup(attr->value);
            if (job->group == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->group)");
                return (-1);
            }
            changed ++;

            continue;
        }

        /* The comment currently assigned to this job. */
        if (!strcmp(attr->name, ATTR_comment)) {
            job->comment = schd_strdup(attr->value);
            if (job->comment == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->comment)");
                return (-1);
            }
            changed ++;

            continue;
        }

        /* The host on which this job is running. */
        if (!strcmp(attr->name, ATTR_exechost)) {
            job->exechost = schd_strdup(attr->value);
            if (job->exechost == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->exechost)");
                return (-1);
            }
            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_inter)) {
            /* Is this job interactive or not? */
            if (schd_val2bool(attr->value, &istrue) == 0) {
                if (istrue)
                    job->flags |= JFLAGS_INTERACTIVE;
                else
                    job->flags &= ~JFLAGS_INTERACTIVE;

                changed ++;
            } else {
                DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                    attr->name, attr->value));
            }

            continue;
        }

        if (!strcmp(attr->name, ATTR_state)) {
            /* State is one of 'R', 'Q', 'E', etc. */
            job->state = attr->value[0];
            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_queue)) {
            job->qname = schd_strdup(attr->value);
            if (job->qname == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->qname)");
                return (-1);
            }
            job->flags |= JFLAGS_QNAME_LOCAL;
            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_v)) {
            /*
             * strtok() writes into its argument, so the variable list is
             * searched in a private copy rather than in the attribute.
             */
            var_p = schd_strdup(attr->value);
            if (var_p == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(Variable_List)");
                return (-1);
            }

            /*
             * Find "PBS_O_QUEUE", skip to the '=', and take the following
             * token (ended by ',' or ' ') as the originating queue.
             */
            p = NULL;
            tmp_p = strstr(var_p, "PBS_O_QUEUE");
            if (tmp_p) {
                p = strtok(tmp_p, "=");
                p = strtok(NULL, ", ");
            }

            if (p != NULL) {
                job->oqueue = schd_strdup(p);
            } else {
                /* if the originating queue is unknown, default
                 * to the locally defined "submit" queue.
                 */
                job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
            }

            /* 'p' pointed into var_p; it is not used past this point. */
            free(var_p);

            /* Treat a failed copy like every other string copy here. */
            if (job->oqueue == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->oqueue)");
                return (-1);
            }
            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_l)) {
            /*
             * Requested resources.  The node count is the larger of what
             * the cpu request and the memory request each convert to.
             */
            if (!strcmp(attr->resource, "walltime")) {
                job->walltime = schd_val2sec(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "ncpus")) {
                cpu_req = atoi(attr->value);
                job->nodes = MAX(job->nodes, NODES_FROM_CPU(cpu_req));
                changed ++;

            } else if (!strcmp(attr->resource, "mem")) {
                mem_req = schd_val2byte(attr->value);
                job->nodes = MAX(job->nodes, NODES_FROM_MEM(mem_req));
                changed ++;

#ifdef NODEMASK
            } else if (!strcmp(attr->resource, "nodemask")) {
                if (schd_str2mask(attr->value, &job->nodemask)) {
                    (void)sprintf(log_buffer,
                        "bad nodemask %s for job %s", attr->value,
                        job->jobid);
                    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                        log_buffer);
                } else
                    changed++;  /* Job nodemask was valid. */
#endif /* NODEMASK */
            }

            if (!strcmp(attr->resource, HPM_ATTRIBUTE)) {
                /*
                 * If the job requests hpm support, set the flag, otherwise
                 * turn it off.
                 */
                if (schd_val2bool(attr->value, &istrue) == 0) {
                    if (istrue)
                        job->flags |= JFLAGS_NEEDS_HPM;
                    else
                        job->flags &= ~JFLAGS_NEEDS_HPM;

                    changed ++;
                } else {
                    DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                        attr->name, attr->value));
                }
            }

            /* That's all for requested resources. */
            continue;
        }

        if (!strcmp(attr->name, ATTR_used)) {
            if (!strcmp(attr->resource, "walltime")) {
                job->walltime_used = schd_val2sec(attr->value);
                changed ++;
            }

            /* No other interesting cases. */
            continue;
        }

        /* Creation time attribute. */
        if (!strcmp(attr->name, ATTR_ctime)) {
            /* How long ago was it put in the queue ? */
            job->time_queued = schd_TimeNow - atoi(attr->value);

            continue;
        }

        /* Modified time attribute. */
        if (!strcmp(attr->name, ATTR_mtime)) {
            /* When was the job last modified? */
            job->mtime = atoi(attr->value);

            continue;
        }

#ifdef ATTR_etime
        /*
         * When was the job last eligible to run?  When a user-hold is
         * released, this value is updated to the current time.  This
         * prevents users from gaining higher priority from holding their
         * jobs.
         */
        if (!strcmp(attr->name, ATTR_etime)) {
            job->eligible = schd_TimeNow - atoi(attr->value);

            continue;
        }
#endif /* ATTR_etime */
    }

    /*
     * If this job is in the "Running" state, compute how many seconds
     * remain until it is completed.
     */
    if (job->state == 'R') {
        job->time_left = job->walltime - job->walltime_used;
    }

    /*
     * If this job was enqueued since the last time we ran, set the job
     * flag to indicate that we have not yet seen this job.  This makes it
     * a candidate for additional processing.  There may be some inaccuracy,
     * since the time_t has resolution of 1 second.  Attempt to err on the
     * side of caution.
     */
    if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED)) {
        if (job->time_queued <= (schd_TimeNow - schd_TimeLast)) {
            job->flags |= JFLAGS_FIRST_SEEN;
        }
    }

    /*
     * If the 'etime' attribute wasn't found, set it to the time the job has
     * been queued.  Most jobs will be eligible to run their entire lifetime.
     * The exception is a job that has been held - if it was a user hold,
     * the release will reset the etime to the latest value.
     * If no eligible time was given, use the job's creation time.
     */
    if (!job->eligible)
        job->eligible = job->time_queued;

    /*
     * If the job provided a memory or CPU resource that does not match
     * the resources that will be allocated by the assigned nodes (i.e.
     * a request for 100mb of memory and 16 CPUs - the job will "get" all
     * 4GB of memory anyway), alter the job attributes such that they
     * will align with the assigned nodes later.
     */
    bump_rsrc_requests(job, cpu_req, mem_req);

    return (changed);
}