/*
 * Record the reason that the current candidate job cannot currently run.
 * When it is decided that the job will remain queued, place the reason
 * string in the comment field of the job structure.
 *
 *   job      - job whose comment is to be set; job->comment is replaced
 *              with a freshly allocated copy of 'reason'.
 *   reason   - new comment text.  NULL is treated as the empty string.
 *   optional - if non-zero, the update may be skipped (see below).
 *
 * If the copy of 'reason' cannot be allocated, the failure is logged and
 * the job (and the server's view of it) is left unchanged.
 */
void
schd_comment_job(Job *job, char *reason, int optional)
{
    char   *id = "schd_comment_job";
    char   *msg_ptr;

    /*
     * If the 'optional' argument is true, then this comment is optional.
     * Do not bother commenting this job if this is not the first time it
     * has been seen, and it has been recently modified (hopefully it was
     * a comment change).  If there is no comment for the job, comment it
     * this time.
     */
    if (optional && (!schd_FirstRun) && (job->comment != NULL) &&
        !(job->flags & JFLAGS_FIRST_SEEN) &&
        (MIN_COMMENT_AGE && ((schd_TimeNow - job->mtime) < MIN_COMMENT_AGE))) {
        return;
    }

    if (reason == NULL)
        reason = "";

    /* Nothing to do when the job already carries exactly this comment. */
    if ((job->comment != NULL) && (strcmp(reason, job->comment) == 0))
        return;

    /*
     * Duplicate the text first, and verify the copy before it is handed
     * to the server or installed in the job.  A failed allocation must
     * not result in a NULL value being sent, nor in the loss of the old
     * comment.
     */
    msg_ptr = schd_strdup(reason);
    if (msg_ptr == NULL) {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                   "schd_strdup(job->comment)");
        return;
    }

    /* Alter PBS' view of the job. */
    schd_alterjob(connector, job, ATTR_comment, msg_ptr, NULL);

    /* Install the new comment in the job, releasing the previous one. */
    free(job->comment);
    job->comment = msg_ptr;
}
/*
 * Return the count recorded for 'state' in a string made of space-separated
 * "name:count" tokens (for example "Transit:0 Queued:3 Running:1").
 *
 *   str   - the token string.  It is only read; it is never modified.
 *   state - the name to look for.  The first token that *begins with*
 *           'state' is used, so this is a prefix match.
 *
 * Returns the integer that follows the ':' in the matching token, or 0 if
 * either argument is NULL, no token matches, the matching token contains
 * no ':', or no number follows the ':' inside that token.
 *
 * The string is scanned in place: no copy is allocated and strtok() is not
 * used, so a tokenization in progress in a caller is not disturbed.
 */
int
schd_how_many(char *str, char *state)
{
    const char *tok, *tok_end, *colon;
    char       *num_end;
    size_t      state_len, tok_len;
    long        value;

    if (str == NULL || state == NULL)
        return (0);

    state_len = strlen(state);

    /* Search for a token that matches the requested state. */
    for (tok = str; ; tok = tok_end) {
        tok += strspn(tok, " ");        /* Skip the separating spaces.  */
        if (*tok == '\0')
            return (0);                 /* Ran out of tokens - no match. */

        tok_len = strcspn(tok, " ");
        tok_end = tok + tok_len;

        if (tok_len >= state_len && strncmp(state, tok, state_len) == 0)
            break;
    }

    /* Locate the number after the colon, looking only inside this token. */
    colon = memchr(tok, ':', tok_len);
    if (colon == NULL)
        return (0);

    value = strtol(colon + 1, &num_end, 10);

    /*
     * Reject "no digits at all", and a number that strtol() only reached
     * by skipping white space past the end of this token (it would belong
     * to the next token, not to this one).
     */
    if (num_end == colon + 1 || num_end > tok_end)
        return (0);

    return ((int)value);
}
/*
 * Extract and store the Fair Access Directives from a line just read from
 * the scheduler's configuration file.
 *
 * The line must consist of exactly four 'sep'-separated fields:
 *
 *     QUEUE  <queue name>  <max shares (%)>  <max running jobs (% CPUs)>
 *
 *   arg         - the directive text.  It is tokenized in place by strtok()
 *                 and is therefore modified.
 *   sep         - the delimiter set handed to strtok().
 *   fairacl_ptr - address of the FairAccessList pointer.  If it holds NULL
 *                 a list head is allocated and stored through it.
 *
 * If an entry for the named queue already exists, its limits are replaced;
 * otherwise a new entry is appended to the list.
 *
 * Returns 0 on success, or -1 (after logging) on a malformed directive or
 * an allocation failure.
 */
int
arg_to_fairshare(char *arg, char *sep, FairAccessList **fairacl_ptr)
{
    int     num = 0, max_A = 0, max_B = 0, fieldcnt, found;
    char   *id = "arg_to_fairshare";
    char   *field, aclname[30];
    AccessEntry *new_ae, *FAptr = NULL, *fptr = NULL;

    /*
     * Multiple lines may be used to add entries to the FairACL list.  Make
     * sure the list head exists, and remember the first entry (if any) so
     * the list can be searched below.
     */
    if (*fairacl_ptr == NULL) {
        *fairacl_ptr = (FairAccessList *)malloc(sizeof(FairAccessList));
        if (*fairacl_ptr == NULL) {
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                       "malloc(FairAccessList)");
            return (-1);
        }
        (*fairacl_ptr)->next  = NULL;
        (*fairacl_ptr)->entry = NULL;
    }
    FAptr = (*fairacl_ptr)->entry;

    /*
     * First process the configuration line passed in, saving the important
     * bits for later; at this point it is not known whether this is a new
     * queue entry or an update of an existing one.
     */
    fieldcnt = 0;
    for (field = strtok(arg, sep); field != NULL; field = strtok(NULL, sep)) {
        fieldcnt++;

        if (fieldcnt == 1 &&            /* first field on FAIR_SHARE line */
            (strcmp(field, "QUEUE"))) {
            sprintf(log_buffer, "Unrecognized FAIR_SHARE directive: %s",
                    field);
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                       log_buffer);
            return (-1);

        } else if (fieldcnt == 2) {     /* Queue name */
            /* Refuse a name that would overflow the local buffer. */
            if (strlen(field) >= sizeof(aclname)) {
                sprintf(log_buffer,
                        "FAIR_SHARE queue name too long: %.64s", field);
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                           log_buffer);
                return (-1);
            }
            strcpy(aclname, field);

        } else if (fieldcnt == 3) {     /* Queue max shares (%) */
            max_A = atoi(field);

        } else if (fieldcnt == 4) {     /* Queue max running jobs (% CPUs) */
            max_B = atoi(field);
        }
    }

    if (fieldcnt != 4) {
        sprintf(log_buffer, "Incomplete FAIR_SHARE directive: %s", arg);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        return (-1);
    }

    if (max_A < 0 || max_A > 100) {
        sprintf(log_buffer,
            "FAIR_SHARE share percentage (%d) should be between 1 and 100",
            max_A);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        return (-1);
    }

    if (max_B < 0 || max_B > 100) {
        sprintf(log_buffer,
            "FAIR_SHARE running job percentage (%d) should be between 1 and 100",
            max_B);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        return (-1);
    }

    /*
     * Search for an existing entry for this queue.  On a miss, leave fptr
     * on the last element so the new entry can be hung off of it.
     */
    found = 0;
    if (FAptr) {
        for (fptr = FAptr; fptr != NULL; fptr = fptr->next) {
            if (!strcmp(fptr->name, aclname)) {
                found = 1;
                break;
            }
        }

        if (!found) {
            for (fptr = FAptr; fptr->next != NULL; fptr = fptr->next)
                /* Walk the list, looking for last element. */ ;
        }
    }

    /*
     * If no entry was found, create one and link it in.  The entry is only
     * linked once it is complete, so a failure here cannot leave a
     * half-built element (with a NULL name) on the list.
     */
    if (!found) {
        new_ae = (AccessEntry *)malloc(sizeof(AccessEntry));
        if (new_ae == NULL) {
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                       "malloc(newAccessEntry)");
            return (-1);
        }
        memset(new_ae, 0, sizeof(AccessEntry));

        new_ae->name = schd_strdup(aclname);
        if (new_ae->name == NULL) {
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                       "schd_strdup(aclname)");
            free(new_ae);
            return (-1);
        }

        if (FAptr == NULL)
            (*fairacl_ptr)->entry = new_ae;
        else
            fptr->next = new_ae;

        fptr = new_ae;
    }

    /*
     * Finally, fill in the entry with the info saved earlier.  fptr is
     * either the existing entry that matched, or the one just created.
     * (new_ae is only valid in the latter case and must not be used here.)
     */
    fptr->past_ndays   = 0;
    fptr->past_percent = 0.0;
    fptr->max_percent  = (double)max_A;
    fptr->max_running  = max_B;
    fptr->today_max    = 0.0;
    fptr->today_usage  = 0.0;
    fptr->default_mem  = (size_t)0;

    return (num);
}
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response = NULL; int badreply = 0; int cpus_avail = 0; size_t pmem_avail = 0; char hpm_ctl[64]; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ #ifdef NODEMASK Bitfield cpy; int i, j; #endif /* NODEMASK */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s (%d)", exechost, pbs_errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. 
*/ fullresp(0); /* Build a list of all the resources about which we want information. */ addreq(rm, "loadave"); addreq(rm, "availmem"); addreq(rm, "physmem"); addreq(rm, "ncpus"); #ifdef NODEMASK addreq(rm, "availmask"); #endif /* NODEMASK */ if (schd_MANAGE_HPM) { (void)sprintf(hpm_ctl, HPM_CTL_FORMAT_STR, HPM_CTL_QUERY_STR); addreq(rm, hpm_ctl); } /* Get the values back from the resource monitor, and round up. */ /* Receive LOADAVE response from resource monitor. */ response = getreq(rm); if (response != NULL) { new_rsrcs->loadave = atof(response) * schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive AVAILMEM response from resource monitor. */ response = getreq(rm); if (response != NULL) { new_rsrcs->freemem = schd_val2byte(response); new_rsrcs->freemem *= schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive PHYSMEM response from resource monitor. */ response = getreq(rm); if (response != NULL) { pmem_avail = schd_val2byte(response); pmem_avail *= schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive NCPUS response from resource monitor. 
*/ response = getreq(rm); if (response != NULL) { cpus_avail = atoi(response) * schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } #ifdef NODEMASK /* Receive available nodes from resource monitor. */ response = getreq(rm); if (response == NULL) { (void)sprintf(log_buffer, "bad return from getreq(availmask), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } else { if (schd_bits2mask(response, &new_rsrcs->availmask) != 0) { if (schd_str2mask(response, &new_rsrcs->availmask) != 0) { (void)sprintf(log_buffer, "can't parse availmask '%s'", response); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } (void)free(response); } #endif /* NODEMASK */ if (schd_MANAGE_HPM) { /* Receive HPM_CTL response from resource monitor. */ response = getreq(rm); if (response != NULL) { if (strcmp(response, HPM_CTL_USERMODE_STR) == 0) new_rsrcs->flags |= RSRCS_FLAGS_HPM_USER; else if (strcmp(response, HPM_CTL_GLOBALMODE_STR) == 0) new_rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER; else { (void)sprintf(log_buffer, "bad response '%s' for '%s@%s'", response, hpm_ctl, exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } else { (void)sprintf(log_buffer, "bad return from getreq(%s), %d, %d", hpm_ctl, pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } /* * NOTE: response will be free()'d in bail. Be sure to explicitly free() * response if more getreq() calls are added before the code below. */ bail: if (response != NULL) (void)free(response); /* Disconnect from the resource monitor. */ if (rm >= 0) /* resmom handle "0" is valid in RPP. */ closerm(rm); /* And unset the alarm and handler. 
*/ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. */ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - aborting sched run", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. */ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } new_rsrcs->nodes_total = NODES_REQD(cpus_avail, pmem_avail); #ifdef NODEMASK /* Copy the availmask schd_FAKE_MACH_MULT times to match avail cpus. */ BITFIELD_CPY(&cpy, &(new_rsrcs->availmask)); for (i = 2; i <= schd_FAKE_MACH_MULT; i++) { for (j = 0; j < (cpus_avail / schd_FAKE_MACH_MULT / 2); j++) BITFIELD_SHIFTL(&cpy); BITFIELD_SETM(&(new_rsrcs->availmask), &cpy); } #endif /* NODEMASK */ if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }
/*
 * This function takes a pointer to a struct batch_status for a job, and
 * fills in the appropriate fields of the supplied job struct.
 *
 *   bs  - status record for one job (name plus attribute list).
 *   job - structure to fill in.  It is zeroed first, so anything it held
 *         on entry is discarded.
 *
 * Returns the number of items that were found, or -1 (after logging) if a
 * required string could not be duplicated.  On a -1 return the job may be
 * partially filled in.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
{
    int     changed = 0;
    int     istrue;
    char   *id = "schd_get_jobinfo";
    char   *host;
    char   *p, *tmp_p, *var_p;
    AttrList *attr;

    memset((void *)job, 0, sizeof(Job));

    job->jobid = schd_strdup(bs->name);

    if (job->jobid == NULL) {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
            "schd_strdup(bs->name)");
        return (-1);
    }

    changed ++;

    for (attr = bs->attribs; attr != NULL; attr = attr->next) {

        /*
         * The 'owner' attribute is "user@host".  Only the user part is
         * kept.  The whole value is duplicated and the copy is cut at the
         * '@', so no fixed-size scratch buffer is needed and a long value
         * cannot overflow anything.
         */
        if (!strcmp(attr->name, ATTR_owner)) {

            job->owner = schd_strdup(attr->value);

            if (job->owner == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->owner)");
                return (-1);
            }

            host = strchr(job->owner, '@');
            if (host)
                *host = '\0';   /* Replace '@' with NUL (ends username). */

            changed ++;

            continue;
        }

        /* The group to which to charge the resources for this job. */
        if (!strcmp(attr->name, ATTR_egroup)) {
            job->group = schd_strdup(attr->value);

            if (job->group == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->group)");
                return (-1);
            }

            changed ++;

            continue;
        }

        /* The comment currently assigned to this job. */
        if (!strcmp(attr->name, ATTR_comment)) {
            job->comment = schd_strdup(attr->value);

            if (job->comment == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->comment)");
                return (-1);
            }

            changed ++;

            continue;
        }

        /* The host on which this job is running. */
        if (!strcmp(attr->name, ATTR_exechost)) {
            job->exechost = schd_strdup(attr->value);

            if (job->exechost == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->exechost)");
                return (-1);
            }

            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_inter)) {
            /* Is this job interactive or not? */
            if (schd_val2bool(attr->value, &istrue) == 0) {
                if (istrue)
                    job->flags |= JFLAGS_INTERACTIVE;
                else
                    job->flags &= ~JFLAGS_INTERACTIVE;

                changed ++;
            } else {
                DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                    attr->name, attr->value));
            }

            continue;
        }

        if (!strcmp(attr->name, ATTR_state)) {
            /* State is one of 'R', 'Q', 'E', etc. */
            job->state = attr->value[0];

            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_queue)) {
            job->qname = schd_strdup(attr->value);

            if (job->qname == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->qname)");
                return (-1);
            }

            job->flags |= JFLAGS_QNAME_LOCAL;

            changed ++;

            continue;
        }

        /*
         * The variable list: pull the value of PBS_O_QUEUE out of a
         * private copy (strtok() writes into the string it scans).
         */
        if (!strcmp(attr->name, ATTR_v)) {
            var_p = schd_strdup(attr->value);

            if (var_p == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(Variable_List)");
                return (-1);
            }

            p = NULL;
            tmp_p = strstr(var_p, "PBS_O_QUEUE");

            if (tmp_p) {
                p = strtok(tmp_p, "=");
                p = strtok(NULL,  ", ");
            }

            if (p != NULL) {
                job->oqueue = schd_strdup(p);
            } else {
                /* if the originating queue is unknown, default
                 * to the locally defined "submit" queue.
                 */
                job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
            }

            free(var_p);

            if (job->oqueue == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->oqueue)");
                return (-1);
            }

            changed ++;

            continue;
        }

        /*
         * Requested resources.
         * NOTE(review): the string-valued resources below (arch and
         * features A-C) are duplicated without checking the result, as in
         * the original code - confirm that consumers tolerate NULL.
         */
        if (!strcmp(attr->name, ATTR_l)) {
            if (!strcmp(attr->resource, "arch")) {
                job->arch = schd_strdup(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "mem")) {
                job->memory = schd_val2byte(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "ncpus")) {
                job->ncpus = atoi(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "walltime")) {
                job->walltime = schd_val2sec(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "speed")) {
                job->speed = atoi(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "tmpdir")) {
                job->tmpdir = schd_val2byte(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, FEATURE_A)) {
                job->featureA = schd_strdup(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, FEATURE_B)) {
                job->featureB = schd_strdup(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, FEATURE_C)) {
                job->featureC = schd_strdup(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, FEATURE_D)) {
                job->featureD = atol(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, FEATURE_E)) {
                job->featureE = atol(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, FEATURE_F)) {
                job->featureF = atol(attr->value);
                changed ++;

            /*
             * Boolean features: a value that cannot be parsed is taken as
             * false, rather than storing whatever 'istrue' happened to hold.
             */
            } else if (!strcmp(attr->resource, FEATURE_G)) {
                if (schd_val2bool(attr->value, &istrue) != 0)
                    istrue = 0;
                job->featureG = istrue;
                changed ++;

            } else if (!strcmp(attr->resource, FEATURE_H)) {
                if (schd_val2bool(attr->value, &istrue) != 0)
                    istrue = 0;
                job->featureH = istrue;
                changed ++;

            } else if (!strcmp(attr->resource, FEATURE_I)) {
                if (schd_val2bool(attr->value, &istrue) != 0)
                    istrue = 0;
                job->featureI = istrue;
                changed ++;
            }

            /* That's all for requested resources. */
            continue;
        }

        if (!strcmp(attr->name, ATTR_used)) {
            if (!strcmp(attr->resource, "walltime")) {
                job->walltime_used = schd_val2sec(attr->value);

                changed ++;
            }

            /* No other interesting cases. */
            continue;
        }

        /* Creation time attribute. */
        if (!strcmp(attr->name, ATTR_ctime)) {
            /* How long ago was it put in the queue ? */
            job->time_queued = schd_TimeNow - atoi(attr->value);

            continue;
        }

        /* Modified time attribute. */
        if (!strcmp(attr->name, ATTR_mtime)) {
            /* When was the job last modified? */
            job->mtime = atoi(attr->value);

            continue;
        }

        /*
         * When was the job last eligible to run?  When a user-hold is
         * released, this value is updated to the current time.  This
         * prevents users from gaining higher priority from holding their
         * jobs.
         */
        if (!strcmp(attr->name, ATTR_etime)) {
            job->eligible = schd_TimeNow - atoi(attr->value);

            continue;
        }
    }

    /*
     * If this job is in the "Running" state, compute how many seconds
     * remain until it is completed.
     */
    if (job->state == 'R') {
        job->time_left = job->walltime - job->walltime_used;
    }

    /*
     * If this job was enqueued since the last time we ran, set the job
     * flag to indicate that we have not yet seen this job.  This makes it
     * a candidate for additional processing.  There may be some inaccuracy,
     * since the time_t has resolution of 1 second.  Attempt to err on the
     * side of caution.
     */
    if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED)) {
        if (job->time_queued <= (schd_TimeNow - schd_TimeLast)) {
            job->flags |= JFLAGS_FIRST_SEEN;
        }
    }

    /*
     * If the 'etime' attribute wasn't found, set it to the time the job has
     * been queued.  Most jobs will be eligible to run their entire lifetime.
     * The exception is a job that has been held - if it was a user hold,
     * the release will reset the etime to the latest value.
     * If no eligible time was given, use the job's creation time.
     */
    if (!job->eligible)
        job->eligible = job->time_queued;

    return (changed);
}
/*
 * This function takes a pointer to a struct batch_status for a job, and
 * fills in the appropriate fields of the supplied job struct.
 *
 *   bs  - status record for one job (name plus attribute list).
 *   job - structure to fill in.  It is zeroed first, so anything it held
 *         on entry is discarded.
 *
 * Besides copying attributes, this variant also talks to the server: it
 * sets a default "mem" request on a job that has none, and moves a queued
 * job that has an exec host back to the submit queue.
 *
 * Returns the number of items that were found, or -1 (after logging) if a
 * required string could not be duplicated.  On a -1 return the job may be
 * partially filled in.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
{
    int     changed = 0;
    int     istrue;
    char   *id = "schd_get_jobinfo";
    char   *host;
    char   *p, *tmp_p, *var_p;
    AttrList *attr;

    memset((void *)job, 0, sizeof(Job));

    job->jobid = schd_strdup(bs->name);

    if (job->jobid == NULL) {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
            "schd_strdup(bs->name)");
        return (-1);
    }

    changed ++;

    for (attr = bs->attribs; attr != NULL; attr = attr->next) {

        /*
         * The 'owner' attribute is "user@host".  Only the user part is
         * kept.  The whole value is duplicated and the copy is cut at the
         * '@', so no fixed-size scratch buffer is needed and a long value
         * cannot overflow anything.
         */
        if (!strcmp(attr->name, ATTR_owner)) {

            job->owner = schd_strdup(attr->value);

            if (job->owner == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->owner)");
                return (-1);
            }

            host = strchr(job->owner, '@');
            if (host)
                *host = '\0';   /* Replace '@' with NUL (ends username). */

            changed ++;

            continue;
        }

        /* The group to which to charge the resources for this job. */
        if (!strcmp(attr->name, ATTR_egroup)) {
            job->group = schd_strdup(attr->value);

            if (job->group == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->group)");
                return (-1);
            }

            changed ++;

            continue;
        }

        /* The comment currently assigned to this job. */
        if (!strcmp(attr->name, ATTR_comment)) {
            job->comment = schd_strdup(attr->value);

            if (job->comment == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->comment)");
                return (-1);
            }

            changed ++;

            continue;
        }

        /*
         * The host on which this job is running (or was running, for
         * suspended or checkpointed jobs).
         */
        if (!strcmp(attr->name, ATTR_exechost)) {
            job->exechost = schd_strdup(attr->value);

            if (job->exechost == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->exechost)");
                return (-1);
            }

            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_inter)) {
            /* Is this job interactive or not? */
            if (schd_val2bool(attr->value, &istrue) == 0) {
                if (istrue)
                    job->flags |= JFLAGS_INTERACTIVE;
                else
                    job->flags &= ~JFLAGS_INTERACTIVE;

                changed ++;
            } else {
                DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                    attr->name, attr->value));
            }

            continue;
        }

        if (!strcmp(attr->name, ATTR_state)) {
            /* State is one of 'R', 'Q', 'E', etc. */
            job->state = attr->value[0];

            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_queue)) {
            job->qname = schd_strdup(attr->value);

            if (job->qname == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->qname)");
                return (-1);
            }

            job->flags |= JFLAGS_QNAME_LOCAL;

            changed ++;

            continue;
        }

        /*
         * The variable list: pull the value of PBS_O_QUEUE out of a
         * private copy (strtok() writes into the string it scans).
         */
        if (!strcmp(attr->name, ATTR_v)) {
            var_p = schd_strdup(attr->value);

            if (var_p == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(Variable_List)");
                return (-1);
            }

            p = NULL;
            tmp_p = strstr(var_p, "PBS_O_QUEUE");

            if (tmp_p) {
                p = strtok(tmp_p, "=");
                p = strtok(NULL,  ", ");
            }

            if (p != NULL) {
                job->oqueue = schd_strdup(p);
            } else {
                /* if the originating queue is unknown, default
                 * to the locally defined "submit" queue.
                 */
                job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
            }

            free(var_p);

            if (job->oqueue == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->oqueue)");
                return (-1);
            }

            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_l)) {
            if (!strcmp(attr->resource, "arch")) {
                /* NOTE(review): result not checked, as in the original. */
                job->arch = schd_strdup(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "mem")) {
                job->memory = schd_val2byte(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "ncpus")) {
                job->ncpus = atoi(attr->value);
                changed ++;

            } else if (!strcmp(attr->resource, "walltime")) {
                job->walltime = schd_val2sec(attr->value);
                changed ++;
            }

            /* That's all for requested resources. */
            continue;
        }

        if (!strcmp(attr->name, ATTR_used)) {
            if (!strcmp(attr->resource, "walltime")) {
                job->walltime_used = schd_val2sec(attr->value);

                changed ++;
            }

            /* No other interesting cases. */
            continue;
        }

        /* Creation time attribute. */
        if (!strcmp(attr->name, ATTR_ctime)) {
            /* How long ago was it put in the queue ? */
            job->time_queued = schd_TimeNow - atoi(attr->value);

            continue;
        }

        /* Modified time attribute. */
        if (!strcmp(attr->name, ATTR_mtime)) {
            /* When was the job last modified? */
            job->mtime = atoi(attr->value);

            continue;
        }

        /* Job Substate attribute. */
        if (!strcmp(attr->name, ATTR_substate)) {
            if (atoi(attr->value) == 43 /* JOB_SUBSTATE_SUSPEND */)
                job->flags |= JFLAGS_SUSPENDED;

            continue;
        }

        /*
         * When was the job last eligible to run?  When a user-hold is
         * released, this value is updated to the current time.  This
         * prevents users from gaining higher priority from holding their
         * jobs.
         */
        if (!strcmp(attr->name, ATTR_etime)) {
            job->eligible = schd_TimeNow - atoi(attr->value);

            continue;
        }
    }

    /*
     * A job with no memory request gets the default for its originating
     * queue, and the server is told about it.
     * NOTE(review): job->oqueue is NULL here if the job had no variable
     * list attribute - confirm get_default_mem() accepts that.
     */
    if (job->memory < 1) {
        job->memory = get_default_mem(job->oqueue);
        schd_alterjob(connector, job, ATTR_l, schd_byte2val(job->memory),
            "mem");
        changed++;
    }

    /*
     * If this job is in the "Running" or "Suspended" state, compute how
     * many seconds remain until it is completed.
     */
    if (job->state == 'R' || job->state == 'S') {
        job->time_left = job->walltime - job->walltime_used;
    }

    /*
     * If this job was enqueued since the last time we ran, set the job
     * flag to indicate that we have not yet seen this job.  This makes it
     * a candidate for additional processing.  There may be some inaccuracy,
     * since the time_t has resolution of 1 second.  Attempt to err on the
     * side of caution.
     */
    if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED)) {
        if (job->time_queued <= (schd_TimeNow - schd_TimeLast)) {
            job->flags |= JFLAGS_FIRST_SEEN;
        }
    }

    /*
     * If this job was previously running and is now queued, then we
     * need to (a) flag it as having been checkpointed, and (b) move
     * it back to the submit queue, if it's not already there.  The move
     * is skipped when the job's queue is unknown (no queue attribute was
     * supplied), since there is then nothing to compare against.
     */
    if (job->exechost && job->state == 'Q') {
        job->flags |= JFLAGS_CHKPTD;

        if (job->qname != NULL &&
            strcmp(job->qname, schd_SubmitQueue->queue->qname)) {
            sprintf(log_buffer, "moving Q'd job %s back to SUBMIT Q",
                job->jobid);
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                log_buffer);
            pbs_movejob(connector, job->jobid,
                schd_SubmitQueue->queue->qname, NULL);
        }
    }

    /*
     * If this job is currently Suspended (a substate of 'R'unning), then
     * pretend it's queued, so that the scheduling logic will work.
     */
    if (job->state == 'S') {
        job->state = 'Q';
        job->flags |= JFLAGS_SUSPENDED;
    }

    /*
     * If this job is suspended, checkpointed, or otherwise "queued"
     * on an execution queue, update the internal representation to
     * pretend it is really on the submit queue.
     */
    if ((job->flags & JFLAGS_SUSPENDED) || (job->flags & JFLAGS_CHKPTD)) {
        free(job->qname);
        job->qname = schd_strdup(schd_SubmitQueue->queue->qname);

        if (job->qname == NULL) {
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                "schd_strdup(job->qname)");
            return (-1);
        }
    }

    /*
     * If this job came from the EXPRESS queue, set the flag so that it
     * will be treated with the highest of priority.  A job whose
     * originating queue is unknown is never treated as express.
     */
    if (job->oqueue != NULL && !strcmp(job->oqueue, schd_EXPRESS_Q_NAME))
        job->flags |= JFLAGS_PRIORITY;

    /*
     * If the 'etime' attribute wasn't found, set it to the time the job has
     * been queued.  Most jobs will be eligible to run their entire lifetime.
     * The exception is a job that has been held - if it was a user hold,
     * the release will reset the etime to the latest value.
     * If no eligible time was given, use the job's creation time.
     */
    if (!job->eligible)
        job->eligible = job->time_queued;

    /*
     * If this job has waited too long, and its queue is NOT over its
     * shares, then bump it up in priority.
     */
    if (job->eligible > schd_MAX_WAIT_TIME && job->sort_order <= 100)
        job->flags |= JFLAGS_WAITING;

    return (changed);
}
/*
 * This function takes a pointer to a struct batch_status for a job, and
 * fills in the appropriate fields of the supplied job struct.
 *
 *   bs  - status record for one job (name plus attribute list).  The
 *         attribute strings are not owned by this function; the owner
 *         value is briefly cut at its '@' while it is copied and is put
 *         back before anything else happens.
 *   job - structure to fill in.  It is zeroed first, so anything it held
 *         on entry is discarded.
 *
 * Returns the number of items that were found, or -1 (after logging) if a
 * required string could not be duplicated or the owner attribute has no
 * "@host" part.  On a -1 return the job may be partially filled in.
 */
int
schd_get_jobinfo(Batch_Status *bs, Job *job)
{
    char   *id = "schd_get_jobinfo";
    int     changed = 0;
    int     cpu_req = 0;
    size_t  mem_req = 0;
    char   *host;
    char   *p, *tmp_p, *var_p;
    AttrList *attr;
    int     istrue;

    memset((void *)job, 0, sizeof(Job));

    job->jobid = schd_strdup(bs->name);

    if (job->jobid == NULL) {
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
            "schd_strdup(bs->name)");
        return (-1);
    }

    changed ++;

    for (attr = bs->attribs; attr != NULL; attr = attr->next) {

        /*
         * If this is the 'owner' field, chop it into 'owner' and 'host'
         * fields, and copy them into the Job struct.
         */
        if (!strcmp(attr->name, ATTR_owner)) {

            /* Look for the '@' that separates user and hostname. */
            host = strchr(attr->value, '@');

            if (host == NULL) {
                /* Without a host part there is nothing to duplicate. */
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "job owner has no '@host' part");
                return (-1);
            }

            /*
             * We don't "own" the attribute strings, so the '@' is removed
             * only for the duration of the copy and is put back at once -
             * before any error return - in case something else expects
             * it to be there.
             */
            *host = '\0';
            job->owner = schd_strdup(attr->value);
            *host = '@';

            if (job->owner == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->owner)");
                return (-1);
            }

            changed ++;

            /* The hostname starts one character past the '@'. */
            job->host = schd_strdup(host + 1);

            if (job->host == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->host)");
                return (-1);
            }

            changed ++;

            /* That's all for the owner field. */
            continue;
        }

        /* The group to which to charge the resources for this job. */
        if (!strcmp(attr->name, ATTR_egroup)) {
            job->group = schd_strdup(attr->value);

            if (job->group == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->group)");
                return (-1);
            }

            changed ++;

            continue;
        }

        /* The comment currently assigned to this job. */
        if (!strcmp(attr->name, ATTR_comment)) {
            job->comment = schd_strdup(attr->value);

            if (job->comment == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->comment)");
                return (-1);
            }

            changed ++;

            continue;
        }

        /* The host on which this job is running. */
        if (!strcmp(attr->name, ATTR_exechost)) {
            job->exechost = schd_strdup(attr->value);

            if (job->exechost == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->exechost)");
                return (-1);
            }

            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_inter)) {
            /* Is this job interactive or not? */
            if (schd_val2bool(attr->value, &istrue) == 0) {
                if (istrue)
                    job->flags |= JFLAGS_INTERACTIVE;
                else
                    job->flags &= ~JFLAGS_INTERACTIVE;

                changed ++;
            } else {
                DBPRT(("%s: can't parse %s = %s into boolean\n", id,
                    attr->name, attr->value));
            }

            continue;
        }

        if (!strcmp(attr->name, ATTR_state)) {
            /* State is one of 'R', 'Q', 'E', etc. */
            job->state = attr->value[0];

            changed ++;

            continue;
        }

        if (!strcmp(attr->name, ATTR_queue)) {
            job->qname = schd_strdup(attr->value);

            if (job->qname == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->qname)");
                return (-1);
            }

            job->flags |= JFLAGS_QNAME_LOCAL;

            changed ++;

            continue;
        }

        /*
         * The variable list: pull the value of PBS_O_QUEUE out of a
         * private copy (strtok() writes into the string it scans).
         */
        if (!strcmp(attr->name, ATTR_v)) {
            var_p = schd_strdup(attr->value);

            if (var_p == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(Variable_List)");
                return (-1);
            }

            p = NULL;
            tmp_p = strstr(var_p, "PBS_O_QUEUE");

            if (tmp_p) {
                p = strtok(tmp_p, "=");
                p = strtok(NULL,  ", ");
            }

            if (p != NULL) {
                job->oqueue = schd_strdup(p);
            } else {
                /* if the originating queue is unknown, default
                 * to the locally defined "submit" queue.
                 */
                job->oqueue = schd_strdup(schd_SubmitQueue->queue->qname);
            }

            free(var_p);

            if (job->oqueue == NULL) {
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    "schd_strdup(job->oqueue)");
                return (-1);
            }

            changed ++;

            continue;
        }

        /*
         * Requested resources.  CPU and memory requests are both folded
         * into job->nodes, keeping the largest node count seen so far.
         */
        if (!strcmp(attr->name, ATTR_l)) {
            if (!strcmp(attr->resource, "walltime")) {
                job->walltime = schd_val2sec(attr->value);

                changed ++;

            } else if (!strcmp(attr->resource, "ncpus")) {
                cpu_req = atoi(attr->value);
                job->nodes = MAX(job->nodes, cpu_req);

                changed ++;

            } else if (!strcmp(attr->resource, "mppe")) {
                cpu_req = atoi(attr->value);
                job->nodes = MAX(job->nodes, cpu_req);

                changed ++;

            } else if (!strcmp(attr->resource, "mem")) {
                mem_req = schd_val2byte(attr->value);
                job->nodes = MAX(job->nodes, NODES_FROM_MEM(mem_req));

                changed ++;

#if PE_MASK != 0
            } else if (!strcmp(attr->resource, "pe_mask")) {
                if (schd_str2mask(attr->value, &job->nodemask)) {
                    (void)sprintf(log_buffer,
                        "bad pe_mask %s for job %s", attr->value, job->jobid);
                    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                        log_buffer);
                } else
                    changed++;  /* Job pe_mask was valid. */
#endif  /* PE_MASK */
            }

            /* That's all for requested resources. */
            continue;
        }

        if (!strcmp(attr->name, ATTR_used)) {
            if (!strcmp(attr->resource, "walltime")) {
                job->walltime_used = schd_val2sec(attr->value);

                changed ++;
            }

            /* No other interesting cases. */
            continue;
        }

        /* Session ID for running jobs (used to correlate GRM info). */
        if (!strcmp(attr->name, ATTR_session)) {
            job->session = atoi(attr->value);

            continue;
        }

        /* Job Priority attribute (inherited from queue). */
        if (!strcmp(attr->name, ATTR_p)) {
            job->priority = atoi(attr->value);

            continue;
        }

        /* Creation time attribute. */
        if (!strcmp(attr->name, ATTR_ctime)) {
            /* How long ago was it put in the queue ? */
            job->time_queued = schd_TimeNow - atoi(attr->value);

            continue;
        }

        /* Modified time attribute. */
        if (!strcmp(attr->name, ATTR_mtime)) {
            /* When was the job last modified? */
            job->mtime = atoi(attr->value);

            continue;
        }

#ifdef ATTR_etime
        /*
         * When was the job last eligible to run?  When a user-hold is
         * released, this value is updated to the current time.  This
         * prevents users from gaining higher priority from holding their
         * jobs.
         */
        if (!strcmp(attr->name, ATTR_etime)) {
            job->eligible = schd_TimeNow - atoi(attr->value);

            continue;
        }
#endif  /* ATTR_etime */
    }

    /*
     * If this job is in the "Running" state, compute how many seconds
     * remain until it is completed.
     */
    if (job->state == 'R') {
        job->time_left = job->walltime - job->walltime_used;
    }

    /*
     * If this job was enqueued since the last time we ran, set the job
     * flag to indicate that we have not yet seen this job.  This makes it
     * a candidate for additional processing.  There may be some inaccuracy,
     * since the time_t has resolution of 1 second.  Attempt to err on the
     * side of caution.
     */
    if ((job->state == 'Q') && (job->time_queued != UNSPECIFIED)) {
        if (job->time_queued <= (schd_TimeNow - schd_TimeLast)) {
            job->flags |= JFLAGS_FIRST_SEEN;
        }
    }

    /*
     * If the 'etime' attribute wasn't found, set it to the time the job has
     * been queued.  Most jobs will be eligible to run their entire lifetime.
     * The exception is a job that has been held - if it was a user hold,
     * the release will reset the etime to the latest value.
     * If no eligible time was given, use the job's creation time.
     */
    if (!job->eligible)
        job->eligible = job->time_queued;

#if defined(sgi)
    /*
     * If the job provided a memory or CPU resource that does not match
     * the resources that will be allocated by the assigned nodes (i.e.
     * a request for 100mb of memory and 16 CPUs - the job will "get" all
     * 4GB of memory anyway), alter the job attributes such that they
     * will align with the assigned nodes later.
     */
    bump_rsrc_requests(job, cpu_req, mem_req);
#endif  /* defined(sgi) */

    /*
     * Need to update the time_until_eligible and total_delay fields,
     * probably from a global array of information saved from previous
     * scheduler iteration.
     */

    /*
     * Calculate the job priority weight sort key to be used later in
     * job sorting.  This is the "priority" the job should have during
     * sorting based on the size of the job, the length of time queued,
     * and the job type.
     */
    calc_job_weight(job);

    return (changed);
}
int schd_register_file(char *filename) { char *id = "schd_register_file"; FileStatus *stats, *tail, *new_fs = NULL; /* * Look for the tail of the list. While walking the list, check to see * that the filename is not already registered. */ tail = NULL; for (stats = filestats; stats != NULL; stats = stats->next) { if (strcmp(filename, stats->filename) == 0) { sprintf(log_buffer, "%s: file %s already registered.", id, filename); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (-1); } tail = stats; } /* Create space for the new record. */ new_fs = (FileStatus *) malloc(sizeof(FileStatus)); if (new_fs == NULL) { sprintf(log_buffer, "%s: out of memory allocating FileStatus for file %s", id, filename); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); return (-1); } /* Clear the record out -- this clears the ctime and next pointer. */ memset(new_fs, 0, sizeof(FileStatus)); /* Keep a copy of the filename around. */ new_fs->filename = schd_strdup(filename); if (new_fs->filename == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(filename)"); return (-1); } /* * If this is not the first element, tack it on the end of the list. * Otherwise, start the list with it. */ if (tail) tail->next = new_fs; else filestats = new_fs; (void)sprintf(log_buffer, "%s: file %s registered.", id, filename); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); /* * Load the new element with the initial values for the file. Ignore * the return value - only setting up the timestamp and file existence * status are important. */ (void)schd_file_has_changed(filename, 1); return (0); }
/*
 * schd_get_queue_limits - query queue information from the server.
 *
 * Resets the limit/usage fields of 'queue', releases any feature strings,
 * jobs and user ACL still attached to it, asks the server (through the
 * global 'connector') for the queue's attributes, and stores each
 * recognized attribute into the matching Queue field.  Unrecognized
 * attributes and resources are ignored.
 *
 * Returns 0 on success and -1 if the server query fails.
 *
 * NOTE(review): earlier documentation for this routine also mentioned a
 * return value of 1 for a transient (queue sanity) error; nothing in this
 * function body produces that value.
 */
int
schd_get_queue_limits(Queue *queue)
{
    char   *id = "schd_get_queue_limits";
    int     istrue = 0;
    int     local_errno = 0;
    Batch_Status *bs;
    AttrList *attr;
    static AttrList alist[] = {
        {&alist[1], ATTR_start, "", ""},
        {&alist[2], ATTR_enable, "", ""},
        {&alist[3], ATTR_count, "", ""},
        {&alist[4], ATTR_maxuserrun, "", ""},
        {&alist[5], ATTR_rescavail, "", ""},
        {&alist[6], ATTR_rescassn, "", ""},
        {&alist[7], ATTR_rescdflt, "", ""},
        {&alist[8], ATTR_rescmax, "", ""},
        {&alist[9], ATTR_rescmin, "", ""},
        {&alist[10], ATTR_acluren, "", ""},
        {&alist[11], ATTR_acluser, "", ""},
        {&alist[12], ATTR_p, "", ""},
        {NULL, ATTR_maxrun, "", ""}
    };

    /*
     * Forget everything learned about this queue on a previous pass.
     *
     * NOTE(review): mem_assn is assigned from ATTR_rescassn below but is
     * not reset here - confirm whether that is intentional.
     */
    queue->running = UNSPECIFIED;
    queue->queued = UNSPECIFIED;
    queue->maxrun = UNSPECIFIED;
    queue->userrun = UNSPECIFIED;
    queue->ncpus_max = UNSPECIFIED;
    queue->ncpus_min = UNSPECIFIED;
    queue->ncpus_default = UNSPECIFIED;
    queue->ncpus_assn = UNSPECIFIED;
    queue->mem_max = UNSPECIFIED;
    queue->mem_min = UNSPECIFIED;
    queue->mem_default = UNSPECIFIED;
    queue->wallt_max = UNSPECIFIED;
    queue->wallt_min = UNSPECIFIED;
    queue->wallt_default = UNSPECIFIED;
    queue->flags = 0;
    queue->priority = UNSPECIFIED;
    queue->speed = UNSPECIFIED;

    /*
     * The resource pointer is dropped without being freed.  The original
     * code also carried an "if (queue->rsrcs) cleanup_rsrcs(...)" branch
     * after this assignment, which could never execute.
     *
     * NOTE(review): presumably the Resources entry is owned by a list
     * managed elsewhere, so freeing it here would be wrong - confirm
     * before reinstating any cleanup.
     */
    queue->rsrcs = NULL;

    if (queue->featureA) {
        free(queue->featureA);
        queue->featureA = NULL;
    }

    if (queue->featureB) {
        free(queue->featureB);
        queue->featureB = NULL;
    }

    if (queue->featureC) {
        free(queue->featureC);
        queue->featureC = NULL;
    }

    queue->featureD = UNSPECIFIED;
    queue->featureE = UNSPECIFIED;
    queue->featureF = UNSPECIFIED;
    queue->featureG = UNSPECIFIED;
    queue->featureH = UNSPECIFIED;
    queue->featureI = UNSPECIFIED;

    if (queue->jobs) {
        DBPRT(("%s: found jobs on queue '%s'! Freeing them...\n", id,
            queue->qname));
        schd_free_jobs(queue->jobs);
        queue->jobs = NULL;
    }

    if (queue->useracl) {
        DBPRT(("%s: found user ACL list on queue '%s'! Freeing it...\n", id,
            queue->qname));
        schd_free_useracl(queue->useracl);
        queue->useracl = NULL;
    }

    /* Ask the server for information about the specified queue. */
    if ((bs = pbs_statque_err(connector, queue->qname, alist, NULL,
        &local_errno)) == NULL) {
        sprintf(log_buffer, "pbs_statque failed, \"%s\" %d",
            queue->qname, local_errno);
        log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));
        return (-1);
    }

    /* Process the list of attributes returned by the server. */
    for (attr = bs->attribs; attr != NULL; attr = attr->next) {

        /* Is queue started? */
        if (!strcmp(attr->name, ATTR_start)) {
            if (schd_val2bool(attr->value, &istrue) == 0) {
                if (istrue)     /* if true, queue is not stopped. */
                    queue->flags &= ~QFLAGS_STOPPED;
                else
                    queue->flags |= QFLAGS_STOPPED;
            } else {
                DBPRT(("%s: couldn't parse attr %s value %s to boolean\n",
                    id, attr->name, attr->value));
            }

            continue;
        }

        /* Is queue enabled? */
        if (!strcmp(attr->name, ATTR_enable)) {
            if (schd_val2bool(attr->value, &istrue) == 0) {
                if (istrue)     /* if true, queue is not disabled. */
                    queue->flags &= ~QFLAGS_DISABLED;
                else
                    queue->flags |= QFLAGS_DISABLED;
            } else {
                DBPRT(("%s: couldn't parse attr %s value %s to boolean\n",
                    id, attr->name, attr->value));
            }

            continue;
        }

        /* How many jobs are queued and running? */
        if (!strcmp(attr->name, ATTR_count)) {
            queue->queued = schd_how_many(attr->value, SC_QUEUED);
            queue->running = schd_how_many(attr->value, SC_RUNNING);
            continue;
        }

        /* Queue-wide maximum number of jobs running. */
        if (!strcmp(attr->name, ATTR_maxrun)) {
            queue->maxrun = atoi(attr->value);
            continue;
        }

        /* Per-user maximum number of jobs running. */
        if (!strcmp(attr->name, ATTR_maxuserrun)) {
            queue->userrun = atoi(attr->value);
            continue;
        }

        /* Queue Priority Value */
        if (!strcmp(attr->name, ATTR_p)) {
            queue->priority = atoi(attr->value);
            continue;
        }

        /* Is there an enabled user access control list on this queue? */
        if (!strcmp(attr->name, ATTR_acluren)) {
            if (schd_val2bool(attr->value, &istrue) == 0) {
                if (istrue)     /* if true, queue has an ACL */
                    queue->flags |= QFLAGS_USER_ACL;
                else
                    queue->flags &= ~QFLAGS_USER_ACL;
            } else {
                DBPRT(("%s: couldn't parse attr %s value %s to boolean\n",
                    id, attr->name, attr->value));
            }

            continue;
        }

        if (!strcmp(attr->name, ATTR_acluser)) {
            if (queue->useracl) {
                DBPRT(("queue %s acluser already set!\n", queue->qname));
                schd_free_useracl(queue->useracl);
            }

            queue->useracl = schd_create_useracl(attr->value);
            continue;
        }

        /* Queue maximum resource usage. */
        if (!strcmp(attr->name, ATTR_rescmax)) {
            /* A resource attribute without a resource name is unusable. */
            if (attr->resource == NULL)
                continue;

            if (!strcmp("mem", attr->resource)) {
                queue->mem_max = schd_val2byte(attr->value);
                continue;
            }

            if (!strcmp("ncpus", attr->resource)) {
                queue->ncpus_max = atoi(attr->value);
                continue;
            }

            if (!strcmp("walltime", attr->resource)) {
                queue->wallt_max = schd_val2sec(attr->value);
                continue;
            }

            if (!strcmp("speed", attr->resource)) {
                queue->speed = atoi(attr->value);
                continue;
            }

            if (!strcmp(FEATURE_A, attr->resource)) {
                queue->featureA = schd_strdup(attr->value);
                continue;
            }

            if (!strcmp(FEATURE_B, attr->resource)) {
                queue->featureB = schd_strdup(attr->value);
                continue;
            }

            if (!strcmp(FEATURE_C, attr->resource)) {
                queue->featureC = schd_strdup(attr->value);
                continue;
            }

            if (!strcmp(FEATURE_D, attr->resource)) {
                queue->featureD = atol(attr->value);
                continue;
            }

            if (!strcmp(FEATURE_E, attr->resource)) {
                queue->featureE = atol(attr->value);
                continue;
            }

            if (!strcmp(FEATURE_F, attr->resource)) {
                queue->featureF = atol(attr->value);
                continue;
            }

            /*
             * Boolean features: only store the value when it parsed, so a
             * malformed value leaves the field UNSPECIFIED instead of
             * copying a stale or uninitialized 'istrue'.
             */
            if (!strcmp(FEATURE_G, attr->resource)) {
                if (schd_val2bool(attr->value, &istrue) == 0) {
                    queue->featureG = istrue;
                } else {
                    DBPRT(("%s: couldn't parse attr %s value %s to boolean\n",
                        id, attr->name, attr->value));
                }

                continue;
            }

            if (!strcmp(FEATURE_H, attr->resource)) {
                if (schd_val2bool(attr->value, &istrue) == 0) {
                    queue->featureH = istrue;
                } else {
                    DBPRT(("%s: couldn't parse attr %s value %s to boolean\n",
                        id, attr->name, attr->value));
                }

                continue;
            }

            if (!strcmp(FEATURE_I, attr->resource)) {
                if (schd_val2bool(attr->value, &istrue) == 0) {
                    queue->featureI = istrue;
                } else {
                    DBPRT(("%s: couldn't parse attr %s value %s to boolean\n",
                        id, attr->name, attr->value));
                }

                continue;
            }

            continue;
        }

        /* Queue minimum resource usage. */
        if (!strcmp(attr->name, ATTR_rescmin)) {
            if (attr->resource == NULL)
                continue;

            if (!strcmp("mem", attr->resource)) {
                queue->mem_min = schd_val2byte(attr->value);
                continue;
            }

            if (!strcmp("ncpus", attr->resource)) {
                queue->ncpus_min = atoi(attr->value);
                continue;
            }

            if (!strcmp("walltime", attr->resource)) {
                queue->wallt_min = schd_val2sec(attr->value);
                continue;
            }

            continue;
        }

        /* Queue assigned (in use) resource usage. */
        if (!strcmp(attr->name, ATTR_rescassn)) {
            if (attr->resource == NULL)
                continue;

            if (!strcmp("mem", attr->resource)) {
                queue->mem_assn = schd_val2byte(attr->value);
                continue;
            }

            if (!strcmp("ncpus", attr->resource)) {
                queue->ncpus_assn = atoi(attr->value);
            }

            continue;
        }

        /* Queue default resource usage. */
        if (!strcmp(attr->name, ATTR_rescdflt)) {
            if (attr->resource == NULL)
                continue;

            if (!strcmp("mem", attr->resource)) {
                queue->mem_default = schd_val2byte(attr->value);
                continue;
            }

            if (!strcmp("ncpus", attr->resource)) {
                queue->ncpus_default = atoi(attr->value);
                continue;
            }

            if (!strcmp("walltime", attr->resource))
                queue->wallt_default = schd_val2sec(attr->value);
        }

        /* Ignore anything else */
    }

    pbs_statfree(bs);

    return (0);
}
/* * Now that an option and its argument have been read, validate them and * set the appropriate global configuration variables. */ static int set_cfg_opt(char *cfg_option, char *cfg_arg) { char *id = "set_cfg_opt"; /* XXX Should smash case on these before doing string compares? */ if (!strcmp(cfg_option, "TARGET_LOAD_PCT")) { schd_TARGET_LOAD_PCT = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "TARGET_LOAD_VARIANCE")) { return (get_variance(cfg_arg, &schd_TARGET_LOAD_MINUS, &schd_TARGET_LOAD_PLUS)); } if (!strcmp(cfg_option, "HIGH_SYSTIME")) { schd_HIGH_SYSTIME = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "MAX_JOBS")) { schd_MAX_JOBS = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "MIN_JOBS")) { schd_MIN_JOBS = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "MAX_DEDICATED_JOBS")) { schd_MAX_DEDICATED_JOBS = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "MAX_USER_RUN_JOBS")) { schd_MAX_USER_RUN_JOBS = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "ENFORCE_ALLOCATION")) { return schd_val2booltime(cfg_arg, &schd_ENFORCE_ALLOCATION); } if (!strcmp(cfg_option, "TEST_ONLY")) { return schd_val2bool(cfg_arg, &schd_TEST_ONLY); } if (!strcmp(cfg_option, "ENFORCE_DEDICATED_TIME")) { return schd_val2booltime(cfg_arg, &schd_ENFORCE_DEDTIME); } if (!strcmp(cfg_option, "DEDICATED_TIME_COMMAND")) { if (schd_DEDTIME_COMMAND) free(schd_DEDTIME_COMMAND); schd_DEDTIME_COMMAND = schd_strdup(cfg_arg); if (schd_DEDTIME_COMMAND == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(schd_DEDTIME_COMMAND)"); return (-1); } return (0); } if (!strcmp(cfg_option, "DEDICATED_TIME_CACHE_SECS")) { schd_DEDTIME_CACHE_SECS = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "DECAY_FACTOR")) { schd_DECAY_FACTOR = atof(cfg_arg); return (0); } if (!strcmp(cfg_option, "OA_DECAY_FACTOR")) { schd_OA_DECAY_FACTOR = atof(cfg_arg); return (0); } if (!strcmp(cfg_option, "SCHED_ACCT_DIR")) { if (schd_SCHED_ACCT_DIR) 
free(schd_SCHED_ACCT_DIR); schd_SCHED_ACCT_DIR = schd_strdup(cfg_arg); if (schd_SCHED_ACCT_DIR == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(schd_SCHED_ACCT_DIR)"); return (-1); } return (0); } if (!strcmp(cfg_option, "SYSTEM_NAME")) { if (schd_SYSTEM_NAME) free(schd_SYSTEM_NAME); schd_SYSTEM_NAME = schd_strdup(cfg_arg); schd_lowercase(schd_SYSTEM_NAME); return (0); } if (!strcmp(cfg_option, "SERVER_HOST")) { if (schd_SERVER_HOST) free(schd_SERVER_HOST); schd_SERVER_HOST = schd_strdup(cfg_arg); if (schd_SERVER_HOST == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(schd_SERVER_HOST)"); return (-1); } schd_lowercase(schd_SERVER_HOST); return (0); } if (!strcmp(cfg_option, "SCHED_HOST")) { if (schd_SCHED_HOST) free(schd_SCHED_HOST); schd_SCHED_HOST = schd_strdup(cfg_arg); if (schd_SCHED_HOST == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(schd_SCHED_HOST)"); return (-1); } schd_lowercase(schd_SCHED_HOST); return (0); } if (!strcmp(cfg_option, "SORTED_JOB_DUMPFILE")) { if (schd_JOB_DUMPFILE) free(schd_JOB_DUMPFILE); schd_JOB_DUMPFILE = schd_strdup(cfg_arg); if (schd_JOB_DUMPFILE == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(schd_JOB_DUMPFILE)"); return (-1); } return (0); } if (!strcmp(cfg_option, "SCHED_RESTART_ACTION")) { if (strcmp(cfg_arg, "NONE") == 0) { schd_SCHED_RESTART_ACTION = SCHD_RESTART_NONE; return (0); } if (strcmp(cfg_arg, "RESUBMIT") == 0) { schd_SCHED_RESTART_ACTION = SCHD_RESTART_RESUBMIT; return (0); } if (strcmp(cfg_arg, "RERUN") == 0) { schd_SCHED_RESTART_ACTION = SCHD_RESTART_RERUN; return (0); } return (-1); /* Bad argument */ } if (!strcmp(cfg_option, "SORT_BY_PAST_USAGE")) { return schd_val2bool(cfg_arg, &schd_SORT_BY_PAST_USAGE); } if (!strcmp(cfg_option, "ENFORCE_PRIME_TIME")) { return schd_val2booltime(cfg_arg, &schd_ENFORCE_PRIME_TIME); } if (!strcmp(cfg_option, "PRIME_TIME_START")) { schd_PRIME_TIME_START = 
schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "PRIME_TIME_END")) { schd_PRIME_TIME_END = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "PRIME_TIME_SMALL_NODE_LIMIT")) { schd_PT_SMALL_NODE_LIMIT = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "PRIME_TIME_SMALL_WALLT_LIMIT")) { schd_PT_SMALL_WALLT_LIMIT = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "PRIME_TIME_WALLT_LIMIT")) { schd_PT_WALLT_LIMIT = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "NONPRIME_DRAIN_SYS")) { return schd_val2bool(cfg_arg, &schd_NONPRIME_DRAIN_SYS); } if (!strcmp(cfg_option, "NP_DRAIN_BACKTIME")) { schd_NP_DRAIN_BACKTIME = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "NP_DRAIN_IDLETIME")) { schd_NP_DRAIN_IDLETIME = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "SMALL_JOB_MAX")) { schd_SMALL_JOB_MAX = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "WALLT_LIMIT_LARGE_JOB")) { schd_WALLT_LARGE_LIMIT = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "WALLT_LIMIT_SMALL_JOB")) { schd_WALLT_SMALL_LIMIT = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "MAX_QUEUED_TIME")) { schd_MAX_QUEUED_TIME = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "SMALL_QUEUED_TIME")) { schd_SMALL_QUEUED_TIME = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "INTERACTIVE_LONG_WAIT")) { schd_INTERACTIVE_LONG_WAIT = schd_val2sec(cfg_arg); return (0); } if (!strcmp(cfg_option, "SUBMIT_QUEUE")) { arg_to_qlist(cfg_arg, ",", &schd_SubmitQueue); return (0); } if (!strcmp(cfg_option, "BATCH_QUEUES")) { arg_to_qlist(cfg_arg, ",", &schd_BatchQueues); return (0); } if (!strcmp(cfg_option, "SPECIAL_QUEUE")) { arg_to_qlist(cfg_arg, ",", &schd_SpecialQueue); return (0); } if (!strcmp(cfg_option, "DEDICATED_QUEUES")) { arg_to_qlist(cfg_arg, ",", &schd_DedQueues); return (0); } if (!strcmp(cfg_option, "EXTERNAL_QUEUES")) { arg_to_qlist(cfg_arg, ",", &schd_ExternQueues); return 
(0); } if (!strcmp(cfg_option, "FAKE_MACHINE_MULT")) { schd_FAKE_MACH_MULT = atoi(cfg_arg); return (0); } if (!strcmp(cfg_option, "AVOID_FRAGMENTATION")) { return schd_val2bool(cfg_arg, &schd_AVOID_FRAGS); } if (!strcmp(cfg_option, "MANAGE_HPM_COUNTERS")) { return schd_val2bool(cfg_arg, &schd_MANAGE_HPM); } if (!strcmp(cfg_option, "REVOKE_HPM_COUNTERS")) { return schd_val2bool(cfg_arg, &schd_REVOKE_HPM); } /* Unknown option -- return an error. */ return (-1); }
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response; int badreply = 0; int cpus_avail = 0; int cpus_tot = 0; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s (%d)", exechost, pbs_errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. */ fullresp(0); /* Build a list of all the resources about which we want information. 
*/ addreq(rm, "mppe_app"); addreq(rm, "mppe_avail"); /* Get the values back from the resource monitor, and round up. */ /* Receive MPPE_APP response from resource monitor. */ /* returns the total number of Application PEs configured */ response = getreq(rm); if (response != NULL) { cpus_tot = atoi(response) * schd_FAKE_MACH_MULT; } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive MPPE_AVAIL response from resource monitor. */ /* returns the largest contiguous block of APP PEs */ response = getreq(rm); if (response != NULL) { cpus_avail = atoi(response) * schd_FAKE_MACH_MULT; } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } new_rsrcs->freemem = MB_PER_NODE * schd_FAKE_MACH_MULT; bail: /* Disconnect from the resource monitor. */ if (rm) closerm(rm); /* And unset the alarm and handler. */ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. 
*/ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - aborting sched run", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. */ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } new_rsrcs->nodes_total = cpus_tot; new_rsrcs->nodes_alloc = cpus_tot - cpus_avail; if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }
/*
 * get_variance - parse a load variance specification.
 *
 * 'string' is a comma-separated list of terms, each of the form
 * '{+-}[0-9][0-9]*%'.  A '-' term supplies the low value and a '+' term
 * the high value; each sign may appear at most once.
 *
 * On success 0 is returned and '*lowp' / '*highp' receive the parsed
 * values (0 for a sign that was not given).  On a NULL string, allocation
 * failure or malformed term, -1 is returned and the outputs are untouched.
 */
static int
get_variance(char *string, int *lowp, int *highp)
{
    char   *copy, *term, *stop;
    char    direction;
    long    parsed;
    int     minus = -1;         /* -1 means "not seen yet". */
    int     plus = -1;
    int     status = -1;

    if (string == NULL)
        return (-1);

    /* strtok() is destructive, so work on a private copy. */
    copy = schd_strdup(string);

    if (copy == NULL)
        return (-1);

    for (term = strtok(copy, ","); term != NULL; term = strtok(NULL, ",")) {

        /* A term starts with a sign followed by at least one digit. */
        direction = term[0];

        if ((direction != '+') && (direction != '-'))
            goto done;

        if ((term[1] < '0') || (term[1] > '9'))
            goto done;

        /* The number must fit in an int and be followed by a '%'. */
        parsed = strtol(term + 1, &stop, 10);

        if ((parsed > INT_MAX) || (*stop != '%'))
            goto done;

        /* Each direction may only be specified once. */
        if (direction == '-') {
            if (minus >= 0)
                goto done;

            minus = (int)parsed;
        } else {
            if (plus >= 0)
                goto done;

            plus = (int)parsed;
        }
    }

    *lowp = (minus < 0) ? 0 : minus;
    *highp = (plus < 0) ? 0 : plus;
    status = 0;

done:
    free(copy);
    return (status);
}
/*
 * arg_to_qlist - append the queues named in 'arg' to a queue list.
 *
 * 'arg' is split in place on the characters in 'sep'.  Each token is either
 * 'queue' or 'queue@exechost'; a bare queue name is assigned to
 * schd_ThisHost, and a given host is replaced by its canonical name when
 * get_fullhostname() can supply one.  A new QueueList element holding a
 * zeroed Queue with copies of the queue name and host is hung on the tail
 * of '*qlist_ptr' for every token, so the function may be called repeatedly
 * to extend the same list.
 *
 * Returns the number of queues added.  If an allocation fails, the whole
 * list at '*qlist_ptr' is destroyed, '*qlist_ptr' is set to NULL and -1 is
 * returned.
 */
static int
arg_to_qlist(char *arg, char *sep, QueueList **qlist_ptr)
{
    char   *id = "arg_to_qlist";
    QueueList *tail, *node;
    Queue  *q;
    int     added = 0;
    char   *token, *at, *host;
    char    canon[PBS_MAXHOSTNAME + 1];

    /* Start from the last element of any list that was passed in. */
    tail = *qlist_ptr;

    if (tail != NULL) {
        while (tail->next != NULL)
            tail = tail->next;
    }

    token = strtok(arg, sep);

    while (token != NULL) {

        /* Allocate the list element and link it in straight away. */
        node = (QueueList *)malloc(sizeof(QueueList));

        if (node == NULL) {
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                "malloc(newQueue)");
            goto error_in_list;
        }

        memset(node, 0, sizeof(QueueList));

        if (tail == NULL)
            *qlist_ptr = node;  /* First element starts the list. */
        else
            tail->next = node;

        tail = node;

        /* Now the queue structure itself. */
        q = (Queue *)malloc(sizeof(Queue));
        node->queue = q;

        if (q == NULL) {
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                "malloc(newQueue->queue)");
            goto error_in_list;
        }

        memset(q, 0, sizeof(Queue));

        /*
         * Split 'queue@host' at the '@'.  Without one, the queue lives on
         * the local host.
         */
        at = strchr(token, '@');

        if (at == NULL) {
            host = schd_ThisHost;
        } else {
            *at = '\0';         /* Terminates the queue name. */
            host = at + 1;

            if (get_fullhostname(host, canon, PBS_MAXHOSTNAME) == 0) {
                host = canon;   /* Prefer the canonical name. */
            } else {
                sprintf(log_buffer,
                    "Warning: Cannot canonicalize queue %s@%s",
                    token, host);
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                    log_buffer);
                DBPRT(("%s: %s\n", id, log_buffer));
            }
        }

        q->qname = schd_strdup(token);

        if (q->qname == NULL) {
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                "schd_strdup(qname)");
            goto error_in_list;
        }

        q->exechost = schd_strdup(host);

        if (q->exechost == NULL) {
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                "schd_strdup(exechost)");
            goto error_in_list;
        }

        added++;
        token = strtok(NULL, sep);
    }

    return (added);

error_in_list:

    /* Something went wrong - delete the list and return a fatal error. */
    if (*qlist_ptr) {
        schd_destroy_qlist(*qlist_ptr);
        *qlist_ptr = NULL;
    }

    return (-1);
}
/*ARGSUSED*/ int schedinit(int argc, char *argv[]) { char *id = "schedinit"; struct utsname name; struct sigaction act, oact; char *ptr, canon[PBS_MAXHOSTNAME + 1]; DBPRT(("\n%s\n", schd_VersionString)); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, schd_VersionString); /* * If this is the initial startup configuration, then schd_TimeNow will * be 0. Initialize it to the current time, so it can be tested against * various times when initializing. */ if (schd_TimeNow == 0) { schd_TimeNow = time(NULL); DBPRT(("%s: initialize/startup at %s", id, ctime(&schd_TimeNow))); } /* * Determine location of configuration file. Check for the presence of * the PBSSCHED_CONFIG environment variable. If not defined, fall back * to the compiled default CONFIGFILE. * * Since neither the environment variables nor the compiled-in default * can be changed (with the exception of someone wreaking havoc with * a debugger or something), this only needs to be done once. */ if (schd_CfgFilename == NULL) { ptr = getenv("PBSSCHED_CONFIG"); if (ptr == NULL) ptr = CONFIGFILE; schd_CfgFilename = schd_strdup(ptr); if (schd_CfgFilename == NULL) { (void)sprintf(log_buffer, "schd_strdup() failed for configfile"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (-1); } } /* * From this point on, goto cleanup_and_error: to clean up any allocated * storage for filenames. */ DBPRT(("SCHEDINIT: Reading configuration from '%s'\n", schd_CfgFilename)); /* Reset the configuration to a "known" state. */ reset_config(); /* Determine on what host this scheduler running. 
*/ uname(&name); if (get_fullhostname(name.nodename, canon, PBS_MAXHOSTNAME) == 0) { strncpy(schd_ThisHost, canon, PBS_MAXHOSTNAME); } else { (void)sprintf(log_buffer, "Failed to canonicalize uname %s (using it anyway)", name.nodename); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); strncpy(schd_ThisHost, name.nodename, PBS_MAXHOSTNAME); } schd_lowercase(schd_ThisHost); DBPRT(("%s: This host is '%s'\n", id, schd_ThisHost)); /* * Register the state of the config file. The call to reset_config() * above will have cleared all file watches. */ if (schd_register_file(schd_CfgFilename)) { (void)sprintf(log_buffer, "cannot watch %s", schd_CfgFilename); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); goto cleanup_and_error; } /* Read the configuration file. */ if (schd_get_config(schd_CfgFilename)) return (-1); /* if we are NOT making a distinction between Prime and Non-Prime Time, * then simply empty the holidays array; otherwise, try to read and load * the holidays file. */ if (!schd_ENFORCE_PRIME_TIME) init_holidays(); else { /* * Register the state of the holidays file. This allows schd_req() to * reload it if it is changed. */ if (schd_register_file(HOLIDAYS_FILE)) { (void)sprintf(log_buffer, "cannot watch %s", HOLIDAYS_FILE); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); goto cleanup_and_error; } /* Get a list of prime/non-prime times from the holidays file */ if (schd_read_holidays() < 0) return (-1); } /* * Set up a signal handler for SIGHUP. catch_HUP() will re-read the * configuration file. 
*/ act.sa_flags = 0; act.sa_handler = catch_HUP; sigemptyset(&act.sa_mask); if (sigaction(SIGHUP, &act, &oact)) { (void)sprintf(log_buffer, "Failed to setup SIGHUP handler."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); } DBPRT(("SCHEDINIT: configuration complete.\n")); return (0); cleanup_and_error: /* * Some error occurred. Remove watches and free the storage allocated * for the filenames. */ if (schd_CfgFilename) { schd_forget_file(schd_CfgFilename); free(schd_CfgFilename); } schd_CfgFilename = NULL; return (-1); }
/* * Parse a string like "foo@*.bar.com,[email protected],[email protected]" into a * linked list of UserAcl's. Each element's user [and possibly host] field * points to an individually schd_strdup()'d string. */ UserAcl * schd_create_useracl(char *useracl) { char *id = "schd_create_useracl"; char *useracl_copy, *user, *atsign; UserAcl *acl, *new_acl, *acltail; /* * Copy the string. This copy will be chopped up with '\0's to create * the strings pointed to by the array of UserAcl's pointed to by acl. */ if ((useracl_copy = schd_strdup(useracl)) == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(useracl) failed"); DBPRT(("schd_strdup(useracl) failed\n")); return (NULL); } acl = NULL; acltail = NULL; user = strtok(useracl_copy, ","); while (user != NULL) { new_acl = (UserAcl *)malloc(sizeof(UserAcl)); if (new_acl == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "malloc(UserAcl) failed"); DBPRT(("malloc(UserAcl) failed\n")); if (acl) schd_free_useracl(acl); return (NULL); } /* * If a host string is given, change the '@' into a '\0' to terminate * the user string (for the strncpy() below) and place a reference to * a copy of the host string into the host pointer. */ new_acl->host = NULL; if ((atsign = strchr(user, '@')) != NULL) { *atsign = '\0'; /* Skip forward to the start of the remaining host string. */ atsign ++; new_acl->host = schd_strdup(atsign); if (new_acl->host == NULL) { log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, "schd_strdup(host) failed"); DBPRT(("schd_strdup(host) failed\n")); if (acl) schd_free_useracl(acl); free(new_acl); return (NULL); } } /* * Copy the username into the static array in the UserAcl struct. */ strncpy(new_acl->user, user, PBS_MAXUSER); /* * Place the new ACL element on the tail of the list, or create it * if this is the first element. 
*/ if (acltail) acltail->next = new_acl; else acl = new_acl; acltail = new_acl; acltail->next = NULL; /* Move on to the next user entry in the list. */ user = strtok(NULL, ","); } /* * Free the storage used by the copy of the string that was strtok()'d. */ free(useracl_copy); return (acl); }
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response = NULL; int badreply = 0; int local_errno = 0; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s ", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. */ fullresp(0); /* Build a list of all the resources about which we want information. 
*/ addreq(rm, "loadave"); addreq(rm, "availmem"); addreq(rm, "physmem"); addreq(rm, "ncpus"); addreq(rm, "tmpdir"); addreq(rm, "arch"); /* Get the values back from the resource monitor, and round up. */ /* Receive LOADAVE response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->loadave = atof(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive AVAILMEM response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->freemem = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive PHYSMEM response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->mem_total = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive NCPUS response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->ncpus_total = atoi(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive TMPDIR response from resource monitor. 
*/ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->tmpdir = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(tmpdir), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive ARCH response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->arch = schd_strdup(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(arch), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } bail: /* Disconnect from the resource monitor. */ if (rm >= 0) /* resmom handle "0" is valid in RPP. */ closerm(rm); /* And unset the alarm and handler. */ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. */ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - skipping this node", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. 
*/ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }