/* * schd_get_queue_limits - query queue information from the server. * * Returns 0 on success, -1 for "fatal errors", and 1 for a transient * error (i.e., the queue failed the sanity checks imposed by the * queue_sanity() function). */ int schd_get_queue_limits(Queue *queue) { char *id = "schd_get_queue_limits"; int moved = 0, istrue; Batch_Status *bs; AttrList *attr; static AttrList alist[] = { {&alist[1], ATTR_start, "", ""}, {&alist[2], ATTR_enable, "", ""}, {&alist[3], ATTR_count, "", ""}, {&alist[4], ATTR_maxuserrun, "", ""}, {&alist[5], ATTR_rescavail, "", ""}, {&alist[6], ATTR_rescassn, "", ""}, {&alist[7], ATTR_rescdflt, "", ""}, {&alist[8], ATTR_rescmax, "", ""}, {&alist[9], ATTR_rescmin, "", ""}, {&alist[10], ATTR_acluren, "", ""}, {&alist[11], ATTR_acluser, "", ""}, {NULL, ATTR_maxrun, "", ""} }; size_t mem_default = UNSPECIFIED; size_t mem_assn = UNSPECIFIED; size_t mem_max = UNSPECIFIED; size_t mem_min = UNSPECIFIED; int cpu_default = UNSPECIFIED; int cpu_assn = UNSPECIFIED; int cpu_max = UNSPECIFIED; int cpu_min = UNSPECIFIED; int nodes_from_cpu, nodes_from_mem; queue->running = UNSPECIFIED; queue->queued = UNSPECIFIED; queue->maxrun = UNSPECIFIED; queue->userrun = UNSPECIFIED; queue->nodes_max = UNSPECIFIED; queue->nodes_min = UNSPECIFIED; queue->nodes_default = UNSPECIFIED; queue->nodes_assn = UNSPECIFIED; queue->nodes_rsvd = UNSPECIFIED; queue->wallt_max = UNSPECIFIED; queue->wallt_min = UNSPECIFIED; queue->wallt_default = UNSPECIFIED; queue->flags = 0; #ifdef NODEMASK BITFIELD_CLRALL(&queue->queuemask); BITFIELD_CLRALL(&queue->availmask); #endif /* NODEMASK */ queue->rsrcs = NULL; if (queue->jobs) { DBPRT(("%s: found jobs on queue '%s'! Freeing them...\n", id, queue->qname)); schd_free_jobs(queue->jobs); } if (queue->useracl) { DBPRT(("%s: found user ACL list on queue '%s'! Freeing it...\n", id, queue->qname)); schd_free_useracl(queue->useracl); } queue->jobs = NULL; queue->useracl = NULL; /* Ask the server for information about the specified queue. */ if ((bs = pbs_statque(connector, queue->qname, alist, NULL)) == NULL) { sprintf(log_buffer, "pbs_statque failed, \"%s\" %d", queue->qname, pbs_errno); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (-1); } /* Process the list of attributes returned by the server. */ for (attr = bs->attribs; attr != NULL; attr = attr->next) { /* Is queue started? */ if (!strcmp(attr->name, ATTR_start)) { if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) /* if true, queue is not stopped. */ queue->flags &= ~QFLAGS_STOPPED; else queue->flags |= QFLAGS_STOPPED; } else { DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id, attr->name, attr->value)); } continue; } /* Is queue enabled? */ if (!strcmp(attr->name, ATTR_enable)) { if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) /* if true, queue is not disabled. */ queue->flags &= ~QFLAGS_DISABLED; else queue->flags |= QFLAGS_DISABLED; } else { DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id, attr->name, attr->value)); } continue; } /* How many jobs are queued and running? */ if (!strcmp(attr->name, ATTR_count)) { queue->queued = schd_how_many(attr->value, SC_QUEUED); queue->running = schd_how_many(attr->value, SC_RUNNING); continue; } /* Queue-wide maximum number of jobs running. */ if (!strcmp(attr->name, ATTR_maxrun)) { queue->maxrun = atoi(attr->value); continue; } /* Per-user maximum number of jobs running. */ if (!strcmp(attr->name, ATTR_maxuserrun)) { queue->userrun = atoi(attr->value); continue; } /* Is there an enabled user access control list on this queue? */ if (!strcmp(attr->name, ATTR_acluren)) { if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) /* if true, queue has an ACL */ queue->flags |= QFLAGS_USER_ACL; else queue->flags &= ~QFLAGS_USER_ACL; } else { DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id, attr->name, attr->value)); } continue; } if (!strcmp(attr->name, ATTR_acluser)) { if (queue->useracl) { DBPRT(("queue %s acluser already set!\n", queue->qname)); schd_free_useracl(queue->useracl); } queue->useracl = schd_create_useracl(attr->value); continue; } /* Queue maximum resource usage. */ if (!strcmp(attr->name, ATTR_rescmax)) { if (!strcmp("mem", attr->resource)) { mem_max = schd_val2byte(attr->value); continue; } if (!strcmp("ncpus", attr->resource)) { cpu_max = atoi(attr->value); continue; } if (!strcmp("walltime", attr->resource)) { queue->wallt_max = schd_val2sec(attr->value); continue; } #ifdef NODEMASK if (!strcmp("nodemask", attr->resource)) { if (schd_str2mask(attr->value, &queue->queuemask)) { (void)sprintf(log_buffer, "couldn't convert nodemask %s", attr->value); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); } else queue->flags |= QFLAGS_NODEMASK; /* Valid nodemask. */ } #endif /* NODEMASK */ continue; } /* Queue minimum resource usage. */ if (!strcmp(attr->name, ATTR_rescmin)) { if (!strcmp("mem", attr->resource)) { mem_min = schd_val2byte(attr->value); continue; } if (!strcmp("ncpus", attr->resource)) { cpu_min = atoi(attr->value); continue; } if (!strcmp("walltime", attr->resource)) { queue->wallt_min = schd_val2sec(attr->value); continue; } continue; } /* Queue assigned (in use) resource usage. */ if (!strcmp(attr->name, ATTR_rescassn)) { if (!strcmp("mem", attr->resource)) { mem_assn = schd_val2byte(attr->value); continue; } if (!strcmp("ncpus", attr->resource)) { cpu_assn = atoi(attr->value); } continue; } if (!strcmp(attr->name, ATTR_rescdflt)) { if (!strcmp("mem", attr->resource)) { mem_default = schd_val2byte(attr->value); continue; } if (!strcmp("ncpus", attr->resource)) { cpu_default = atoi(attr->value); continue; } if (!strcmp("walltime", attr->resource)) queue->wallt_default = schd_val2sec(attr->value); } /* Ignore anything else */ } pbs_statfree(bs); /* * Calculate values for queue node limits, given memory and cpu values. * Note any discrepancies. */ nodes_from_cpu = NODES_FROM_CPU(cpu_default); nodes_from_mem = NODES_FROM_MEM(mem_default); if (nodes_from_cpu != nodes_from_mem) { sprintf(log_buffer, "%s: Queue '%s' default cpu/mem (%d/%s) convert to %d != %d nodes", id, queue->qname, cpu_default, schd_byte2val(mem_default), nodes_from_cpu, nodes_from_mem); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } nodes_from_cpu = NODES_FROM_CPU(cpu_max); nodes_from_mem = NODES_FROM_MEM(mem_max); if (nodes_from_cpu != nodes_from_mem) { sprintf(log_buffer, "%s: Queue '%s' maximum cpu/mem (%d/%s) convert to %d != %d nodes", id, queue->qname, cpu_max, schd_byte2val(mem_max), nodes_from_cpu, nodes_from_mem); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } nodes_from_cpu = NODES_FROM_CPU(cpu_min); nodes_from_mem = NODES_FROM_MEM(mem_min); if (nodes_from_cpu != nodes_from_mem) { sprintf(log_buffer, "%s: Queue '%s' minimum cpu/mem (%d/%s) convert to %d != %d nodes", id, queue->qname, cpu_min, schd_byte2val(mem_min), nodes_from_cpu, nodes_from_mem); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } /* * Note: The assigned cpus and memory need not be exactly the same * node equivalency. */ if ((cpu_default != UNSPECIFIED) && (mem_default != UNSPECIFIED)) queue->nodes_default = NODES_REQD(cpu_default, mem_default); if ((cpu_max != UNSPECIFIED) && (mem_max != UNSPECIFIED)) queue->nodes_max = NODES_REQD(cpu_max, mem_max); if ((cpu_min != UNSPECIFIED) && (mem_min != UNSPECIFIED)) queue->nodes_min = NODES_REQD(cpu_min, mem_min); if ((cpu_assn != UNSPECIFIED) && (mem_assn != UNSPECIFIED)) queue->nodes_assn = NODES_REQD(cpu_assn, mem_assn); /* * Move any jobs on this queue from the global list onto the queue's * list. Keep track of when the longest-running job will end, and set * the 'empty_by' field to that value. Maintain the ordering as it was * in "schd_AllJobs". */ if (schd_AllJobs) moved = queue_claim_jobs(queue, &schd_AllJobs); if (moved < 0) { sprintf(log_buffer, "%s: WARNING! Queue '%s' failed to claim jobs.", id, queue->qname); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } if (queue->nodes_assn == UNSPECIFIED) queue->nodes_assn = 0; if (queue->running == UNSPECIFIED) queue->running = 0; /* * Find out if the queue is idle, and if it was not before, set the idle * time to now. If there are running jobs, the queue is not idle at the * start of this iteration - set idle_since to 0. */ if (queue->running) { queue->idle_since = 0; } else { if (queue->idle_since == 0) queue->idle_since = schd_TimeNow; } /* * Get the resources for this queue from the resource monitor (if * available). If the resmom is not accessible, disable the queue. * If the resources were received okay, compute the available node * masks from the resources and jobs. * Don't bother with resources for the special or submit queues. */ if ((strcmp(queue->qname, schd_SubmitQueue->queue->qname) != 0) || ((schd_SpecialQueue != NULL) && (!strcmp(queue->qname, schd_SpecialQueue->queue->qname)))) { queue->rsrcs = schd_get_resources(queue->exechost); if (queue->rsrcs != NULL) { /* Account for this queue's resources. */ queue->rsrcs->nodes_alloc += queue->nodes_assn; queue->rsrcs->njobs += queue->running; /* * If the HPM counters do not appear to be in use on this host, * check for jobs on the queue that are using hpm. If so, set * the 'HPM_IN_USE' flag on the resources. This will prevent the * HPM counters from being released to global mode at the end * of the scheduling run (c.f. cleanup.c). * The 'HPM_IN_USE' flag will also be asserted if a job is run * that uses the HPM counters. */ if (schd_MANAGE_HPM && !(queue->rsrcs->flags & RSRCS_FLAGS_HPM_IN_USE)) { if (schd_hpm_job_count(queue->jobs)) queue->rsrcs->flags |= RSRCS_FLAGS_HPM_IN_USE; } #ifdef NODEMASK /* And find the nodemasks for the queue and resources. */ find_nodemasks(queue, queue->rsrcs); #endif /* NODEMASK */ } else { (void)sprintf(log_buffer, "Can't get resources for %s@%s - marking unavailable.", queue->qname, queue->exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); queue->flags |= QFLAGS_DISABLED; } } #ifdef DEBUG schd_dump_queue(queue, QUEUE_DUMP_JOBS); #endif /* DEBUG */ /* * It would probably be better to wait for the world to stabilize * than to try to impose some artificial order upon it. Do not do * the sanity check if the queue is stopped. */ if ((queue->flags & QFLAGS_STOPPED) == 0) { if (!queue_sanity(queue)) { sprintf(log_buffer, "WARNING! Queue '%s' failed sanity checks.", queue->qname); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (1); } } return (0); }
/* * This function takes a pointer to a struct batch_status for a job, and * fills in the appropriate fields of the supplied job struct. It returns * the number of items that were found. */ int schd_get_jobinfo(Batch_Status *bs, Job *job) { int changed = 0; int istrue; char tmp_str[120]; char *id = "schd_get_jobinfo"; char *host; char *p, *tmp_p, *var_p; AttrList *attr; memset((void *)job, 0, sizeof(Job)); job->jobid = schd_strdup(bs->name); if (job->jobid == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(bs->name)"); return (-1); } changed ++; for (attr = bs->attribs; attr != NULL; attr = attr->next) { /* * If this is the 'owner' field, chop it into 'owner' and 'host' * fields, and copy them into the Job struct. */ if (!strcmp(attr->name, ATTR_owner)) { /* Look for the '@' that separates user and hostname. */ strcpy(tmp_str, attr->value); host = strchr(tmp_str, '@'); if (host) { *host = '\0'; /* Replace '@' with NULL (ends username). */ host ++; /* Move to first character of hostname. */ } job->owner = schd_strdup(tmp_str); if (job->owner == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->owner)"); return (-1); } changed ++; continue; } /* The group to which to charge the resources for this job. */ if (!strcmp(attr->name, ATTR_egroup)) { job->group = schd_strdup(attr->value); if (job->group == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->group)"); return (-1); } changed ++; continue; } /* The comment currently assigned to this job. */ if (!strcmp(attr->name, ATTR_comment)) { job->comment = schd_strdup(attr->value); if (job->comment == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->comment)"); return (-1); } changed ++; continue; } /* The host on which this job is running. */ if (!strcmp(attr->name, ATTR_exechost)) { job->exechost = schd_strdup(attr->value); if (job->exechost == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->exechost)"); return (-1); } changed ++; continue; } if (!strcmp(attr->name, ATTR_inter)) { /* Is this job interactive or not? */ if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) job->flags |= JFLAGS_INTERACTIVE; else job->flags &= ~JFLAGS_INTERACTIVE; changed ++; } else { DBPRT(("%s: can't parse %s = %s into boolean\n", id, attr->name, attr->value)); } continue; } if (!strcmp(attr->name, ATTR_state)) { /* State is one of 'R', 'Q', 'E', etc. */ job->state = attr->value[0]; changed ++; continue; } if (!strcmp(attr->name, ATTR_queue)) { job->qname = schd_strdup(attr->value); if (job->qname == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->qname)"); return (-1); } job->flags |= JFLAGS_QNAME_LOCAL; changed ++; continue; } if (!strcmp(attr->name, ATTR_v)) { var_p = schd_strdup(attr->value); if (var_p == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(Variable_List)"); return (-1); } p = NULL; tmp_p = strstr(var_p, "PBS_O_QUEUE"); if (tmp_p) { p = strtok(tmp_p, "="); p = strtok(NULL, ", "); } if (p != NULL) { job->oqueue = schd_strdup(p); } else { /* if the originating queue is unknown, default * to the locally defined "submit" queue. */ job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname); } free(var_p); changed ++; continue; } if (!strcmp(attr->name, ATTR_l)) { if (!strcmp(attr->resource, "arch")) { job->arch = schd_strdup(attr->value); changed ++; } else if (!strcmp(attr->resource, "mem")) { job->memory = schd_val2byte(attr->value); changed ++; } else if (!strcmp(attr->resource, "ncpus")) { job->ncpus = atoi(attr->value); changed ++; } else if (!strcmp(attr->resource, "walltime")) { job->walltime = schd_val2sec(attr->value); changed ++; } else if (!strcmp(attr->resource, "speed")) { job->speed = atoi(attr->value); changed ++; } else if (!strcmp(attr->resource, "tmpdir")) { job->tmpdir = schd_val2byte(attr->value); changed ++; } else if (!strcmp(attr->resource, FEATURE_A)) { job->featureA = schd_strdup(attr->value); changed ++; } else if (!strcmp(attr->resource, FEATURE_B)) { job->featureB = schd_strdup(attr->value); changed ++; } else if (!strcmp(attr->resource, FEATURE_C)) { job->featureC = schd_strdup(attr->value); changed ++; } else if (!strcmp(attr->resource, FEATURE_D)) { job->featureD = atol(attr->value); changed ++; } else if (!strcmp(attr->resource, FEATURE_E)) { job->featureE = atol(attr->value); changed ++; } else if (!strcmp(attr->resource, FEATURE_F)) { job->featureF = atol(attr->value); changed ++; } else if (!strcmp(attr->resource, FEATURE_G)) { schd_val2bool(attr->value, &istrue); job->featureG = istrue; changed ++; } else if (!strcmp(attr->resource, FEATURE_H)) { schd_val2bool(attr->value, &istrue); job->featureH = istrue; changed ++; } else if (!strcmp(attr->resource, FEATURE_I)) { schd_val2bool(attr->value, &istrue); job->featureI = istrue; changed ++; } /* That's all for requested resources. */ continue; } if (!strcmp(attr->name, ATTR_used)) { if (!strcmp(attr->resource, "walltime")) { job->walltime_used = schd_val2sec(attr->value); changed ++; } /* No other interesting cases. */ continue; } /* Creation time attribute. */ if (!strcmp(attr->name, ATTR_ctime)) { /* How long ago was it put in the queue ? */ job->time_queued = schd_TimeNow - atoi(attr->value); continue; } /* Modified time attribute. */ if (!strcmp(attr->name, ATTR_mtime)) { /* When was the job last modified? */ job->mtime = atoi(attr->value); continue; } /* * When was the job last eligible to run? When a user-hold is * released, this value is updated to the current time. This * prevents users from gaining higher priority from holding their * jobs. */ if (!strcmp(attr->name, ATTR_etime)) { job->eligible = schd_TimeNow - atoi(attr->value); continue; } } /* * If this job is in the "Running" state, compute how many seconds * remain until it is completed. */ if (job->state == 'R') { job->time_left = job->walltime - job->walltime_used; } /* * If this job was enqueued since the last time we ran, set the job * flag to indicate that we have not yet seen this job. This makes it * a candidate for additional processing. There may be some inaccuracy, * since the time_t has resolution of 1 second. Attempt to err on the * side of caution. */ if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED)) { if (job->time_queued <= (schd_TimeNow - schd_TimeLast)) { job->flags |= JFLAGS_FIRST_SEEN; } } /* * If the 'etime' attribute wasn't found, set it to the time the job has * been queued. Most jobs will be eligible to run their entire lifetime. * The exception is a job that has been held - if it was a user hold, * the release will reset the etime to the latest value. * If not eligible time was given, use the job's creation time. */ if (!job->eligible) job->eligible = job->time_queued; return (changed); }
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response = NULL; int badreply = 0; int cpus_avail = 0; size_t pmem_avail = 0; char hpm_ctl[64]; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ #ifdef NODEMASK Bitfield cpy; int i, j; #endif /* NODEMASK */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s (%d)", exechost, pbs_errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. */ fullresp(0); /* Build a list of all the resources about which we want information. */ addreq(rm, "loadave"); addreq(rm, "availmem"); addreq(rm, "physmem"); addreq(rm, "ncpus"); #ifdef NODEMASK addreq(rm, "availmask"); #endif /* NODEMASK */ if (schd_MANAGE_HPM) { (void)sprintf(hpm_ctl, HPM_CTL_FORMAT_STR, HPM_CTL_QUERY_STR); addreq(rm, hpm_ctl); } /* Get the values back from the resource monitor, and round up. */ /* Receive LOADAVE response from resource monitor. */ response = getreq(rm); if (response != NULL) { new_rsrcs->loadave = atof(response) * schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive AVAILMEM response from resource monitor. */ response = getreq(rm); if (response != NULL) { new_rsrcs->freemem = schd_val2byte(response); new_rsrcs->freemem *= schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive PHYSMEM response from resource monitor. */ response = getreq(rm); if (response != NULL) { pmem_avail = schd_val2byte(response); pmem_avail *= schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive NCPUS response from resource monitor. */ response = getreq(rm); if (response != NULL) { cpus_avail = atoi(response) * schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } #ifdef NODEMASK /* Receive available nodes from resource monitor. */ response = getreq(rm); if (response == NULL) { (void)sprintf(log_buffer, "bad return from getreq(availmask), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } else { if (schd_bits2mask(response, &new_rsrcs->availmask) != 0) { if (schd_str2mask(response, &new_rsrcs->availmask) != 0) { (void)sprintf(log_buffer, "can't parse availmask '%s'", response); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } (void)free(response); } #endif /* NODEMASK */ if (schd_MANAGE_HPM) { /* Receive HPM_CTL response from resource monitor. */ response = getreq(rm); if (response != NULL) { if (strcmp(response, HPM_CTL_USERMODE_STR) == 0) new_rsrcs->flags |= RSRCS_FLAGS_HPM_USER; else if (strcmp(response, HPM_CTL_GLOBALMODE_STR) == 0) new_rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER; else { (void)sprintf(log_buffer, "bad response '%s' for '%s@%s'", response, hpm_ctl, exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } else { (void)sprintf(log_buffer, "bad return from getreq(%s), %d, %d", hpm_ctl, pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } /* * NOTE: response will be free()'d in bail. Be sure to explicitly free() * response if more getreq() calls are added before the code below. */ bail: if (response != NULL) (void)free(response); /* Disconnect from the resource monitor. */ if (rm >= 0) /* resmom handle "0" is valid in RPP. */ closerm(rm); /* And unset the alarm and handler. */ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. */ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - aborting sched run", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. */ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } new_rsrcs->nodes_total = NODES_REQD(cpus_avail, pmem_avail); #ifdef NODEMASK /* Copy the availmask schd_FAKE_MACH_MULT times to match avail cpus. */ BITFIELD_CPY(&cpy, &(new_rsrcs->availmask)); for (i = 2; i <= schd_FAKE_MACH_MULT; i++) { for (j = 0; j < (cpus_avail / schd_FAKE_MACH_MULT / 2); j++) BITFIELD_SHIFTL(&cpy); BITFIELD_SETM(&(new_rsrcs->availmask), &cpy); } #endif /* NODEMASK */ if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }
/* * This function takes a pointer to a struct batch_status for a job, and * fills in the appropriate fields of the supplied job struct. It returns * the number of items that were found. */ int schd_get_jobinfo(Batch_Status *bs, Job *job) { int changed = 0; int istrue; char tmp_str[120]; char *id = "schd_get_jobinfo"; char *host; char *p, *tmp_p, *var_p; AttrList *attr; memset((void *)job, 0, sizeof(Job)); job->jobid = schd_strdup(bs->name); if (job->jobid == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(bs->name)"); return (-1); } changed ++; for (attr = bs->attribs; attr != NULL; attr = attr->next) { /* * If this is the 'owner' field, chop it into 'owner' and 'host' * fields, and copy them into the Job struct. */ if (!strcmp(attr->name, ATTR_owner)) { /* Look for the '@' that separates user and hostname. */ strcpy(tmp_str, attr->value); host = strchr(tmp_str, '@'); if (host) { *host = '\0'; /* Replace '@' with NULL (ends username). */ host ++; /* Move to first character of hostname. */ } job->owner = schd_strdup(tmp_str); if (job->owner == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->owner)"); return (-1); } changed ++; continue; } /* The group to which to charge the resources for this job. */ if (!strcmp(attr->name, ATTR_egroup)) { job->group = schd_strdup(attr->value); if (job->group == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->group)"); return (-1); } changed ++; continue; } /* The comment currently assigned to this job. */ if (!strcmp(attr->name, ATTR_comment)) { job->comment = schd_strdup(attr->value); if (job->comment == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->comment)"); return (-1); } changed ++; continue; } /* The host on which this job is running (or was running for * suspended or checkpointed jobs. */ if (!strcmp(attr->name, ATTR_exechost)) { job->exechost = schd_strdup(attr->value); if (job->exechost == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->exechost)"); return (-1); } changed ++; continue; } if (!strcmp(attr->name, ATTR_inter)) { /* Is this job interactive or not? */ if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) job->flags |= JFLAGS_INTERACTIVE; else job->flags &= ~JFLAGS_INTERACTIVE; changed ++; } else { DBPRT(("%s: can't parse %s = %s into boolean\n", id, attr->name, attr->value)); } continue; } if (!strcmp(attr->name, ATTR_state)) { /* State is one of 'R', 'Q', 'E', etc. */ job->state = attr->value[0]; changed ++; continue; } if (!strcmp(attr->name, ATTR_queue)) { job->qname = schd_strdup(attr->value); if (job->qname == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->qname)"); return (-1); } job->flags |= JFLAGS_QNAME_LOCAL; changed ++; continue; } if (!strcmp(attr->name, ATTR_v)) { var_p = schd_strdup(attr->value); if (var_p == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(Variable_List)"); return (-1); } p = NULL; tmp_p = strstr(var_p, "PBS_O_QUEUE"); if (tmp_p) { p = strtok(tmp_p, "="); p = strtok(NULL, ", "); } if (p != NULL) { job->oqueue = schd_strdup(p); } else { /* if the originating queue is unknown, default * to the locally defined "submit" queue. */ job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname); } free(var_p); changed ++; continue; } if (!strcmp(attr->name, ATTR_l)) { if (!strcmp(attr->resource, "arch")) { job->arch = schd_strdup(attr->value); changed ++; } else if (!strcmp(attr->resource, "mem")) { job->memory = schd_val2byte(attr->value); changed ++; } else if (!strcmp(attr->resource, "ncpus")) { job->ncpus = atoi(attr->value); changed ++; } else if (!strcmp(attr->resource, "walltime")) { job->walltime = schd_val2sec(attr->value); changed ++; } /* That's all for requested resources. */ continue; } if (!strcmp(attr->name, ATTR_used)) { if (!strcmp(attr->resource, "walltime")) { job->walltime_used = schd_val2sec(attr->value); changed ++; } /* No other interesting cases. */ continue; } /* Creation time attribute. */ if (!strcmp(attr->name, ATTR_ctime)) { /* How long ago was it put in the queue ? */ job->time_queued = schd_TimeNow - atoi(attr->value); continue; } /* Modified time attribute. */ if (!strcmp(attr->name, ATTR_mtime)) { /* When was the job last modified? */ job->mtime = atoi(attr->value); continue; } /* Job Substate attribute. */ if (!strcmp(attr->name, ATTR_substate)) { if (atoi(attr->value) == 43 /* JOB_SUBSTATE_SUSPEND */) job->flags |= JFLAGS_SUSPENDED; continue; } /* * When was the job last eligible to run? When a user-hold is * released, this value is updated to the current time. This * prevents users from gaining higher priority from holding their * jobs. */ if (!strcmp(attr->name, ATTR_etime)) { job->eligible = schd_TimeNow - atoi(attr->value); continue; } } if (job->memory < 1) { job->memory = get_default_mem(job->oqueue); schd_alterjob(connector, job, ATTR_l, schd_byte2val(job->memory), "mem"); changed++; } /* * If this job is in the "Running" or "Suspended" state, compute how * many seconds remain until it is completed. */ if (job->state == 'R' || job->state == 'S') { job->time_left = job->walltime - job->walltime_used; } /* * If this job was enqueued since the last time we ran, set the job * flag to indicate that we have not yet seen this job. This makes it * a candidate for additional processing. There may be some inaccuracy, * since the time_t has resolution of 1 second. Attempt to err on the * side of caution. */ if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED)) { if (job->time_queued <= (schd_TimeNow - schd_TimeLast)) { job->flags |= JFLAGS_FIRST_SEEN; } } /* * If this job was previously running and is now queued, then we * need to (a) flag it as having been checkpointed, and (b) move * it back to the submit queue, if its not already there. */ if (job->exechost && job->state == 'Q') { job->flags |= JFLAGS_CHKPTD; if (strcmp(job->qname, schd_SubmitQueue->queue->qname)) { sprintf(log_buffer, "moving Q'd job %s back to SUBMIT Q", job->jobid); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); pbs_movejob(connector, job->jobid, schd_SubmitQueue->queue->qname, NULL); } } /* * if this job is currently Suspended (a substate of 'R'unning), then * pretend its queued, so that the scheduling logic will work. */ if (job->state == 'S') { job->state = 'Q'; job->flags |= JFLAGS_SUSPENDED; } /* if this job is suspended, checkpointed, or otherwise "queued" * on an exection queue, update the internal representation of * to pretend it is really on the submit queue. */ if ((job->flags & JFLAGS_SUSPENDED) || (job->flags & JFLAGS_CHKPTD)) { free(job->qname); job->qname = schd_strdup(schd_SubmitQueue->queue->qname); } /* * If this job came from the EXPRESS queue, set the flag so that it * will be treated with the highest of priority. */ if (!strcmp(job->oqueue, schd_EXPRESS_Q_NAME)) job->flags |= JFLAGS_PRIORITY; /* * If the 'etime' attribute wasn't found, set it to the time the job has * been queued. Most jobs will be eligible to run their entire lifetime. * The exception is a job that has been held - if it was a user hold, * the release will reset the etime to the latest value. * If not eligible time was given, use the job's creation time. */ if (!job->eligible) job->eligible = job->time_queued; /* if this job has waited too long, and its queue is NOT over its * shares, then bump it up in priority. */ if (job->eligible > schd_MAX_WAIT_TIME && job->sort_order <= 100) job->flags |= JFLAGS_WAITING; return (changed); }
/* * This function takes a pointer to a struct batch_status for a job, and * fills in the appropriate fields of the supplied job struct. It returns * the number of items that were found. */ int schd_get_jobinfo(Batch_Status *bs, Job *job) { char *id = "schd_get_jobinfo"; int changed = 0; int cpu_req = 0; size_t mem_req = 0; char *host; char *p, *tmp_p, *var_p; AttrList *attr; char canon[PBS_MAXHOSTNAME + 1]; int istrue; memset((void *)job, 0, sizeof(Job)); job->jobid = schd_strdup(bs->name); if (job->jobid == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(bs->name)"); return (-1); } changed ++; for (attr = bs->attribs; attr != NULL; attr = attr->next) { /* * If this is the 'owner' field, chop it into 'owner' and 'host' * fields, and copy them into the Job struct. */ if (!strcmp(attr->name, ATTR_owner)) { /* Look for the '@' that separates user and hostname. */ host = strchr(attr->value, '@'); if (host) { *host = '\0'; /* Replace '@' with NULL (ends username). */ host ++; /* Move to first character of hostname. */ } job->owner = schd_strdup(attr->value); if (job->owner == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->owner)"); return (-1); } changed ++; job->host = schd_strdup(host); if (job->host == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->host)"); return (-1); } changed ++; /* * We don't "own" the attribute strings, so put back the '@' * character we removed above, in case something else expects * it to be there. * Note that 'host' points to the first character of the host- * name, not the hole one character behind. */ if (host) { host --; /* Step back one character. */ *host = '@'; /* Replace the '@' that was deleted above. */ } /* That's all for the owner field. */ continue; } /* The group to which to charge the resources for this job. */ if (!strcmp(attr->name, ATTR_egroup)) { job->group = schd_strdup(attr->value); if (job->group == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->group)"); return (-1); } changed ++; continue; } /* The comment currently assigned to this job. */ if (!strcmp(attr->name, ATTR_comment)) { job->comment = schd_strdup(attr->value); if (job->comment == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->comment)"); return (-1); } changed ++; continue; } /* The host on which this job is running. */ if (!strcmp(attr->name, ATTR_exechost)) { job->exechost = schd_strdup(attr->value); if (job->exechost == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->exechost)"); return (-1); } changed ++; continue; } if (!strcmp(attr->name, ATTR_inter)) { /* Is this job interactive or not? */ if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) job->flags |= JFLAGS_INTERACTIVE; else job->flags &= ~JFLAGS_INTERACTIVE; changed ++; } else { DBPRT(("%s: can't parse %s = %s into boolean\n", id, attr->name, attr->value)); } continue; } if (!strcmp(attr->name, ATTR_state)) { /* State is one of 'R', 'Q', 'E', etc. */ job->state = attr->value[0]; changed ++; continue; } if (!strcmp(attr->name, ATTR_queue)) { job->qname = schd_strdup(attr->value); if (job->qname == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(job->qname)"); return (-1); } job->flags |= JFLAGS_QNAME_LOCAL; changed ++; continue; } if (!strcmp(attr->name, ATTR_v)) { var_p = schd_strdup(attr->value); if (var_p == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(Variable_List)"); return (-1); } p = NULL; tmp_p = strstr(var_p, "PBS_O_QUEUE"); if (tmp_p) { p = strtok(tmp_p, "="); p = strtok(NULL, ", "); } if (p != NULL) { job->oqueue = schd_strdup(p); } else { /* if the originating queue is unknown, default * to the locally defined "submit" queue. */ job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname); } free(var_p); changed ++; continue; } if (!strcmp(attr->name, ATTR_l)) { if (!strcmp(attr->resource, "walltime")) { job->walltime = schd_val2sec(attr->value); changed ++; } else if (!strcmp(attr->resource, "ncpus")) { cpu_req = atoi(attr->value); job->nodes = MAX(job->nodes, cpu_req); changed ++; } else if (!strcmp(attr->resource, "mppe")) { cpu_req = atoi(attr->value); job->nodes = MAX(job->nodes, cpu_req); changed ++; } else if (!strcmp(attr->resource, "mem")) { mem_req = schd_val2byte(attr->value); job->nodes = MAX(job->nodes, NODES_FROM_MEM(mem_req)); changed ++; #if PE_MASK != 0 } else if (!strcmp(attr->resource, "pe_mask")) { if (schd_str2mask(attr->value, &job->nodemask)) { (void)sprintf(log_buffer, "bad pe_mask %s for job %s", attr->value, job->jobid); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); } else changed++; /* Job pe_mask was valid. */ #endif /* PE_MASK */ } /* That's all for requested resources. */ continue; } if (!strcmp(attr->name, ATTR_used)) { if (!strcmp(attr->resource, "walltime")) { job->walltime_used = schd_val2sec(attr->value); changed ++; } /* No other interesting cases. */ continue; } /* Session ID for running jobs (used to correlate GRM info */ if (!strcmp(attr->name, ATTR_session)) { job->session = atoi(attr->value); continue; } /* Job Priority attribute (inherited from queue) */ if (!strcmp(attr->name, ATTR_p)) { job->priority = atoi(attr->value); continue; } /* Creation time attribute. */ if (!strcmp(attr->name, ATTR_ctime)) { /* How long ago was it put in the queue ? */ job->time_queued = schd_TimeNow - atoi(attr->value); continue; } /* Modified time attribute. */ if (!strcmp(attr->name, ATTR_mtime)) { /* When was the job last modified? */ job->mtime = atoi(attr->value); continue; } #ifdef ATTR_etime /* * When was the job last eligible to run? When a user-hold is * released, this value is updated to the current time. This * prevents users from gaining higher priority from holding their * jobs. */ if (!strcmp(attr->name, ATTR_etime)) { job->eligible = schd_TimeNow - atoi(attr->value); continue; } #endif /* ATTR_etime */ } /* * If this job is in the "Running" state, compute how many seconds * remain until it is completed. */ if (job->state == 'R') { job->time_left = job->walltime - job->walltime_used; } /* * If this job was enqueued since the last time we ran, set the job * flag to indicate that we have not yet seen this job. This makes it * a candidate for additional processing. There may be some inaccuracy, * since the time_t has resolution of 1 second. Attempt to err on the * side of caution. */ if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED)) { if (job->time_queued <= (schd_TimeNow - schd_TimeLast)) { job->flags |= JFLAGS_FIRST_SEEN; } } /* * If the 'etime' attribute wasn't found, set it to the time the job has * been queued. Most jobs will be eligible to run their entire lifetime. * The exception is a job that has been held - if it was a user hold, * the release will reset the etime to the latest value. * If not eligible time was given, use the job's creation time. */ if (!job->eligible) job->eligible = job->time_queued; #if defined(sgi) /* * If the job provided a memory or CPU resource that does not match * the resources that will be allocated by the assigned nodes (i.e. * a request for 100mb of memory and 16 CPUs - the job will "get" all * 4GB of memory anyway), alter the job attributes such that they * will align with the assigned nodes later. */ bump_rsrc_requests(job, cpu_req, mem_req); #endif /* defined(sgi) */ /* * Need to update the time_until_eligible and total_delay fields, * probably from a global array of information saved from previous * scheduler iteration. */ /* * Calculate the job priority weight sort key to be used later in * job sorting. This is the "priority" the job should have during * sorting based on the size of the job, the length of time queued, * and the job type. */ calc_job_weight(job); return (changed); }
/* * schd_get_queue_limits - query queue information from the server. * * Returns 0 on success, -1 for "fatal errors", and 1 for a transient * error (i.e., the queue failed the sanity checks imposed by the * queue_sanity() function). */ int schd_get_queue_limits(Queue *queue) { char *id = "schd_get_queue_limits"; int istrue; int local_errno = 0; Batch_Status *bs; AttrList *attr; static AttrList alist[] = { {&alist[1], ATTR_start, "", ""}, {&alist[2], ATTR_enable, "", ""}, {&alist[3], ATTR_count, "", ""}, {&alist[4], ATTR_maxuserrun, "", ""}, {&alist[5], ATTR_rescavail, "", ""}, {&alist[6], ATTR_rescassn, "", ""}, {&alist[7], ATTR_rescdflt, "", ""}, {&alist[8], ATTR_rescmax, "", ""}, {&alist[9], ATTR_rescmin, "", ""}, {&alist[10], ATTR_acluren, "", ""}, {&alist[11], ATTR_acluser, "", ""}, {&alist[12], ATTR_p, "", ""}, {NULL, ATTR_maxrun, "", ""} }; queue->running = UNSPECIFIED; queue->queued = UNSPECIFIED; queue->maxrun = UNSPECIFIED; queue->userrun = UNSPECIFIED; queue->ncpus_max = UNSPECIFIED; queue->ncpus_min = UNSPECIFIED; queue->ncpus_default = UNSPECIFIED; queue->ncpus_assn = UNSPECIFIED; queue->mem_max = UNSPECIFIED; queue->mem_min = UNSPECIFIED; queue->mem_default = UNSPECIFIED; queue->wallt_max = UNSPECIFIED; queue->wallt_min = UNSPECIFIED; queue->wallt_default = UNSPECIFIED; queue->rsrcs = NULL; queue->flags = 0; queue->priority = UNSPECIFIED; queue->speed = UNSPECIFIED; if (queue->featureA) { free(queue->featureA); queue->featureA = NULL; } if (queue->featureB) { free(queue->featureB); queue->featureB = NULL; } if (queue->featureC) { free(queue->featureC); queue->featureC = NULL; } queue->featureD = UNSPECIFIED; queue->featureE = UNSPECIFIED; queue->featureF = UNSPECIFIED; queue->featureG = UNSPECIFIED; queue->featureH = UNSPECIFIED; queue->featureI = UNSPECIFIED; if (queue->rsrcs) { DBPRT(("%s: found resource list on queue '%s'! Freeing them...\n", id, queue->qname)); cleanup_rsrcs(queue->rsrcs); queue->rsrcs = NULL; } if (queue->jobs) { DBPRT(("%s: found jobs on queue '%s'! Freeing them...\n", id, queue->qname)); schd_free_jobs(queue->jobs); queue->jobs = NULL; } if (queue->useracl) { DBPRT(("%s: found user ACL list on queue '%s'! Freeing it...\n", id, queue->qname)); schd_free_useracl(queue->useracl); queue->useracl = NULL; } /* Ask the server for information about the specified queue. */ if ((bs = pbs_statque_err(connector, queue->qname, alist, NULL, &local_errno)) == NULL) { sprintf(log_buffer, "pbs_statque failed, \"%s\" %d", queue->qname, local_errno); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (-1); } /* Process the list of attributes returned by the server. */ for (attr = bs->attribs; attr != NULL; attr = attr->next) { /* Is queue started? */ if (!strcmp(attr->name, ATTR_start)) { if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) /* if true, queue is not stopped. */ queue->flags &= ~QFLAGS_STOPPED; else queue->flags |= QFLAGS_STOPPED; } else { DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id, attr->name, attr->value)); } continue; } /* Is queue enabled? */ if (!strcmp(attr->name, ATTR_enable)) { if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) /* if true, queue is not disabled. */ queue->flags &= ~QFLAGS_DISABLED; else queue->flags |= QFLAGS_DISABLED; } else { DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id, attr->name, attr->value)); } continue; } /* How many jobs are queued and running? */ if (!strcmp(attr->name, ATTR_count)) { queue->queued = schd_how_many(attr->value, SC_QUEUED); queue->running = schd_how_many(attr->value, SC_RUNNING); continue; } /* Queue-wide maximum number of jobs running. */ if (!strcmp(attr->name, ATTR_maxrun)) { queue->maxrun = atoi(attr->value); continue; } /* Per-user maximum number of jobs running. */ if (!strcmp(attr->name, ATTR_maxuserrun)) { queue->userrun = atoi(attr->value); continue; } /* Queue Priority Value */ if (!strcmp(attr->name, ATTR_p)) { queue->priority = atoi(attr->value); continue; } /* Is there an enabled user access control list on this queue? */ if (!strcmp(attr->name, ATTR_acluren)) { if (schd_val2bool(attr->value, &istrue) == 0) { if (istrue) /* if true, queue has an ACL */ queue->flags |= QFLAGS_USER_ACL; else queue->flags &= ~QFLAGS_USER_ACL; } else { DBPRT(("%s: couldn't parse attr %s value %s to boolean\n", id, attr->name, attr->value)); } continue; } if (!strcmp(attr->name, ATTR_acluser)) { if (queue->useracl) { DBPRT(("queue %s acluser already set!\n", queue->qname)); schd_free_useracl(queue->useracl); } queue->useracl = schd_create_useracl(attr->value); continue; } /* Queue maximum resource usage. */ if (!strcmp(attr->name, ATTR_rescmax)) { if (!strcmp("mem", attr->resource)) { queue->mem_max = schd_val2byte(attr->value); continue; } if (!strcmp("ncpus", attr->resource)) { queue->ncpus_max = atoi(attr->value); continue; } if (!strcmp("walltime", attr->resource)) { queue->wallt_max = schd_val2sec(attr->value); continue; } if (!strcmp("speed", attr->resource)) { queue->speed = atoi(attr->value); continue; } if (!strcmp(FEATURE_A, attr->resource)) { queue->featureA = schd_strdup(attr->value); continue; } if (!strcmp(FEATURE_B, attr->resource)) { queue->featureB = schd_strdup(attr->value); continue; } if (!strcmp(FEATURE_C, attr->resource)) { queue->featureC = schd_strdup(attr->value); continue; } if (!strcmp(FEATURE_D, attr->resource)) { queue->featureD = atol(attr->value); continue; } if (!strcmp(FEATURE_E, attr->resource)) { queue->featureE = atol(attr->value); continue; } if (!strcmp(FEATURE_F, attr->resource)) { queue->featureF = atol(attr->value); continue; } if (!strcmp(FEATURE_G, attr->resource)) { schd_val2bool(attr->value, &istrue); queue->featureG = istrue; continue; } if (!strcmp(FEATURE_H, attr->resource)) { schd_val2bool(attr->value, &istrue); queue->featureH = istrue; continue; } if (!strcmp(FEATURE_I, attr->resource)) { schd_val2bool(attr->value, &istrue); queue->featureI = istrue; continue; } continue; } /* Queue minimum resource usage. */ if (!strcmp(attr->name, ATTR_rescmin)) { if (!strcmp("mem", attr->resource)) { queue->mem_min = schd_val2byte(attr->value); continue; } if (!strcmp("ncpus", attr->resource)) { queue->ncpus_min = atoi(attr->value); continue; } if (!strcmp("walltime", attr->resource)) { queue->wallt_min = schd_val2sec(attr->value); continue; } continue; } /* Queue assigned (in use) resource usage. */ if (!strcmp(attr->name, ATTR_rescassn)) { if (!strcmp("mem", attr->resource)) { queue->mem_assn = schd_val2byte(attr->value); continue; } if (!strcmp("ncpus", attr->resource)) { queue->ncpus_assn = atoi(attr->value); } continue; } if (!strcmp(attr->name, ATTR_rescdflt)) { if (!strcmp("mem", attr->resource)) { queue->mem_default = schd_val2byte(attr->value); continue; } if (!strcmp("ncpus", attr->resource)) { queue->ncpus_default = atoi(attr->value); continue; } if (!strcmp("walltime", attr->resource)) queue->wallt_default = schd_val2sec(attr->value); } /* Ignore anything else */ } pbs_statfree(bs); return (0); }
/* schd_get_queue_memory - query queue memory limit from the server. */ size_t schd_get_queue_memory(char *qName) { char *id = "schd_get_queue_limits"; size_t mem_max, mem_default; Batch_Status *bs; AttrList *attr; int local_errno = 0; static AttrList alist[] = { {&alist[1], ATTR_rescdflt, "", ""}, {NULL, ATTR_rescmax, "", ""} }; mem_default = (size_t)0; mem_max = (size_t)0; /* Ask the server for information about the specified queue. */ if ((bs = pbs_statque_err(connector, qName, alist, NULL, &local_errno)) == NULL) { sprintf(log_buffer, "pbs_statque failed, \"%s\" %d", qName, local_errno); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (UNSPECIFIED); } /* Process the list of attributes returned by the server. */ for (attr = bs->attribs; attr != NULL; attr = attr->next) { /* Queue maximum resource usage. */ if (!strcmp(attr->name, ATTR_rescmax)) { if (!strcmp("mem", attr->resource)) { mem_max = schd_val2byte(attr->value); continue; } continue; } if (!strcmp(attr->name, ATTR_rescdflt)) { if (!strcmp("mem", attr->resource)) { mem_default = schd_val2byte(attr->value); continue; } } /* Ignore anything else */ } pbs_statfree(bs); if (mem_default != (size_t)0) return(mem_default); if (mem_max != (size_t)0) return(mem_max); return (UNSPECIFIED); }
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response = NULL; int badreply = 0; int local_errno = 0; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, pbs_rm_port)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. */ fullresp(0); /* Build a list of all the resources about which we want information. */ addreq(rm, "loadave"); addreq(rm, "physmem"); addreq(rm, "ncpus"); addreq(rm, "arch"); /* Get the values back from the resource monitor, and round up. */ /* Receive LOADAVE response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->loadave = atof(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive PHYSMEM response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->mem_total = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive NCPUS response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->ncpus_total = atoi(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive ARCH response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->arch = schd_strdup(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(arch), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } bail: /* Disconnect from the resource monitor. */ if (rm >= 0) /* resmom handle "0" is valid in RPP. */ closerm(rm); /* And unset the alarm and handler. */ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. */ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - skipping this node", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. */ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }