/**
 * @brief
 *	pbsTcl_Init - Prepare a Tcl interpreter for use by this application.
 *
 *	Runs Tcl_Init() (and Tclx_Init() when built with TCLX), turns off
 *	full-response mode with fullresp(0), calls add_cmds() on the
 *	interpreter, and sets the global Tcl variable "tcl_rcFileName".
 *
 * @param[in,out]	interp - Interpreter for application.
 *
 * @return	int
 * @retval	TCL_OK    : every initialization step succeeded.
 * @retval	TCL_ERROR : Tcl_Init() or Tclx_Init() reported an error.
 */
int
pbsTcl_Init(Tcl_Interp *interp)
{
	int status;

	status = Tcl_Init(interp);
	if (status == TCL_ERROR)
		return TCL_ERROR;

#if TCLX
	status = Tclx_Init(interp);
	if (status == TCL_ERROR)
		return TCL_ERROR;
#endif

	fullresp(0);
	add_cmds(interp);

	/* Name of the per-user startup script. */
	Tcl_SetVar(interp, "tcl_rcFileName", "~/.tclshrc", TCL_GLOBAL_ONLY);

	return TCL_OK;
}
int pbsTcl_Init( Tcl_Interp *interp) /* Interpreter for application. */ { if (Tcl_Init(interp) == TCL_ERROR) return TCL_ERROR; #if TCLX #if TCL_MINOR_VERSION < 5 && TCL_MAJOR_VERSION < 8 if (TclX_Init(interp) == TCL_ERROR) { #else if (Tclx_Init(interp) == TCL_ERROR) { #endif return TCL_ERROR; } #endif /* TCLX */ #ifndef __cplusplus fullresp(0); #endif add_cmds(interp); Tcl_SetVar(interp, "tcl_rcFileName", "~/.tclshrc", TCL_GLOBAL_ONLY); return TCL_OK; } int main(int argc, char *argv[]) { chk_file_sec_stderr = 1; Tcl_Main(argc, argv, pbsTcl_Init); return 0; } /* END main() */
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response = NULL; int badreply = 0; int cpus_avail = 0; size_t pmem_avail = 0; char hpm_ctl[64]; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ #ifdef NODEMASK Bitfield cpy; int i, j; #endif /* NODEMASK */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s (%d)", exechost, pbs_errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. 
*/ fullresp(0); /* Build a list of all the resources about which we want information. */ addreq(rm, "loadave"); addreq(rm, "availmem"); addreq(rm, "physmem"); addreq(rm, "ncpus"); #ifdef NODEMASK addreq(rm, "availmask"); #endif /* NODEMASK */ if (schd_MANAGE_HPM) { (void)sprintf(hpm_ctl, HPM_CTL_FORMAT_STR, HPM_CTL_QUERY_STR); addreq(rm, hpm_ctl); } /* Get the values back from the resource monitor, and round up. */ /* Receive LOADAVE response from resource monitor. */ response = getreq(rm); if (response != NULL) { new_rsrcs->loadave = atof(response) * schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive AVAILMEM response from resource monitor. */ response = getreq(rm); if (response != NULL) { new_rsrcs->freemem = schd_val2byte(response); new_rsrcs->freemem *= schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive PHYSMEM response from resource monitor. */ response = getreq(rm); if (response != NULL) { pmem_avail = schd_val2byte(response); pmem_avail *= schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive NCPUS response from resource monitor. 
*/ response = getreq(rm); if (response != NULL) { cpus_avail = atoi(response) * schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } #ifdef NODEMASK /* Receive available nodes from resource monitor. */ response = getreq(rm); if (response == NULL) { (void)sprintf(log_buffer, "bad return from getreq(availmask), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } else { if (schd_bits2mask(response, &new_rsrcs->availmask) != 0) { if (schd_str2mask(response, &new_rsrcs->availmask) != 0) { (void)sprintf(log_buffer, "can't parse availmask '%s'", response); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } (void)free(response); } #endif /* NODEMASK */ if (schd_MANAGE_HPM) { /* Receive HPM_CTL response from resource monitor. */ response = getreq(rm); if (response != NULL) { if (strcmp(response, HPM_CTL_USERMODE_STR) == 0) new_rsrcs->flags |= RSRCS_FLAGS_HPM_USER; else if (strcmp(response, HPM_CTL_GLOBALMODE_STR) == 0) new_rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER; else { (void)sprintf(log_buffer, "bad response '%s' for '%s@%s'", response, hpm_ctl, exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } else { (void)sprintf(log_buffer, "bad return from getreq(%s), %d, %d", hpm_ctl, pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } /* * NOTE: response will be free()'d in bail. Be sure to explicitly free() * response if more getreq() calls are added before the code below. */ bail: if (response != NULL) (void)free(response); /* Disconnect from the resource monitor. */ if (rm >= 0) /* resmom handle "0" is valid in RPP. */ closerm(rm); /* And unset the alarm and handler. 
*/ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. */ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - aborting sched run", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. */ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } new_rsrcs->nodes_total = NODES_REQD(cpus_avail, pmem_avail); #ifdef NODEMASK /* Copy the availmask schd_FAKE_MACH_MULT times to match avail cpus. */ BITFIELD_CPY(&cpy, &(new_rsrcs->availmask)); for (i = 2; i <= schd_FAKE_MACH_MULT; i++) { for (j = 0; j < (cpus_avail / schd_FAKE_MACH_MULT / 2); j++) BITFIELD_SHIFTL(&cpy); BITFIELD_SETM(&(new_rsrcs->availmask), &cpy); } #endif /* NODEMASK */ if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response; int badreply = 0; int cpus_avail = 0; int cpus_tot = 0; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s (%d)", exechost, pbs_errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. */ fullresp(0); /* Build a list of all the resources about which we want information. 
*/ addreq(rm, "mppe_app"); addreq(rm, "mppe_avail"); /* Get the values back from the resource monitor, and round up. */ /* Receive MPPE_APP response from resource monitor. */ /* returns the total number of Application PEs configured */ response = getreq(rm); if (response != NULL) { cpus_tot = atoi(response) * schd_FAKE_MACH_MULT; } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive MPPE_AVAIL response from resource monitor. */ /* returns the largest contiguous block of APP PEs */ response = getreq(rm); if (response != NULL) { cpus_avail = atoi(response) * schd_FAKE_MACH_MULT; } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } new_rsrcs->freemem = MB_PER_NODE * schd_FAKE_MACH_MULT; bail: /* Disconnect from the resource monitor. */ if (rm) closerm(rm); /* And unset the alarm and handler. */ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. 
*/ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - aborting sched run", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. */ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } new_rsrcs->nodes_total = cpus_tot; new_rsrcs->nodes_alloc = cpus_tot - cpus_avail; if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }
void start_tcl(void) { char *id = "start_tcl"; char buf[BUFSIZ]; int fd; int tot, len; interp = Tcl_CreateInterp(); if (Tcl_Init(interp) == TCL_ERROR) { sprintf(log_buffer, "Tcl_Init error: %s", Tcl_GetStringResult(interp)); log_err(-1, id, log_buffer); die(0); } #if TCLX #if TCL_MINOR_VERSION < 5 && TCL_MAJOR_VERSION < 8 if (TclX_Init(interp) == TCL_ERROR) { #else if (Tclx_Init(interp) == TCL_ERROR) { #endif sprintf(log_buffer, "Tclx_Init error: %s", Tcl_GetStringResult(interp)); log_err(-1, id, log_buffer); die(0); } #endif add_cmds(interp); if (initfil) { int code; code = Tcl_EvalFile(interp, initfil); if (code != TCL_OK) { char *trace; trace = (char *)Tcl_GetVar(interp, "errorInfo", 0); if (trace == NULL) trace = (char *)Tcl_GetStringResult(interp); fprintf(stderr, "%s: TCL error @ line %d: %s\n", initfil, interp->errorLine, trace); sprintf(log_buffer, "%s: TCL error @ line %d: %s", initfil, interp->errorLine, Tcl_GetStringResult(interp)); log_err(-1, id, log_buffer); die(0); } sprintf(log_buffer, "init file %s", initfil); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); } if ((fd = open(bodyfil, O_RDONLY)) == -1) { log_err(errno, id, bodyfil); die(0); } sprintf(log_buffer, "body file: %s", bodyfil); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); if (body) free(body); if ((body = malloc(BUFSIZ)) == NULL) { log_err(errno, id, "malloc"); die(0); } for (tot = 0; (len = read(fd, buf, sizeof(buf))) > 0; tot += len) { if ((body = realloc(body, tot + len + 1)) == NULL) { log_err(errno, id, "realloc"); die(0); } memcpy(&body[tot], buf, len); } if (len == -1) { log_err(errno, id, bodyfil); die(0); } body[tot] = '\0'; close(fd); #if TCL_MAJOR_VERSION >= 8 if (body_obj == NULL) { body_obj = Tcl_NewStringObj(body, tot); Tcl_IncrRefCount(body_obj); } else { Tcl_SetStringObj(body_obj, body, tot); } #endif } int addclient(name) char *name; { static char id[] = "addclient"; struct hostent *host, *gethostbyname(); struct in_addr saddr; if 
((host = gethostbyname(name)) == NULL) { sprintf(log_buffer, "host %s not found", name); log_err(-1, id, log_buffer); return -1; } if (numclients >= START_CLIENTS) { pbs_net_t *newclients; newclients = realloc(okclients, sizeof(pbs_net_t) * (numclients + 1)); if (newclients == NULL) return -1; okclients = newclients; } memcpy((char *)&saddr, host->h_addr, host->h_length); okclients[numclients++] = saddr.s_addr; return 0; } /* * read_config - read and process the configuration file (see -c option) * * Currently, the only statement is $clienthost to specify which systems * can contact the scheduler. */ #define CONF_LINE_LEN 120 static int read_config(file) char *file; { static char *id = "read_config"; FILE *conf; int i; char line[CONF_LINE_LEN]; char *token; struct specialconfig { char *name; int (*handler)(); } special[] = { {"clienthost", addclient }, { NULL, NULL } }; #if !defined(DEBUG) && !defined(NO_SECURITY_CHECK) if (chk_file_sec(file, 0, 0, S_IWGRP | S_IWOTH, 1, 0)) return (-1); #endif if ((conf = fopen(file, "r")) == NULL) { log_err(errno, id, "cannot open config file"); return (-1); } while (fgets(line, CONF_LINE_LEN, conf)) { if ((line[0] == '#') || (line[0] == '\n')) continue; /* ignore comment & null line */ else if (line[0] == '$') /* special */ { if ((token = strtok(line, " \t")) == NULL) token = ""; for (i = 0; special[i].name; i++) { if (strcmp(token + 1, special[i].name) == 0) break; } if (special[i].name == NULL) { sprintf(log_buffer, "config name %s not known", token); log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer); return (-1); } token = strtok(NULL, " \t"); if (*(token + strlen(token) - 1) == '\n') *(token + strlen(token) - 1) = '\0'; if (special[i].handler(token)) { fclose(conf); return (-1); } } else { log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, msg_daemonname, "invalid line in config file"); fclose(conf); return (-1); } } fclose(conf); return (0); } void restart(sig) int sig; { char *id = "restart"; if 
(sig) { sprintf(log_buffer, "restart on signal %d", sig); log_close(1); log_open(logfile, path_log); } else { sprintf(log_buffer, "restart command"); } log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); Tcl_DeleteInterp(interp); if (configfile) { if (read_config(configfile) != 0) die(0); } start_tcl(); } void badconn(msg) char *msg; { static char id[] = "badconn"; struct in_addr addr; char buf[5*sizeof(addr) + 100]; struct hostent *phe; addr = saddr.sin_addr; phe = gethostbyaddr((void *) & addr, sizeof(addr), AF_INET); if (phe == NULL) { char hold[6]; int i; union { struct in_addr aa; u_char bb[sizeof(addr)]; } uu; uu.aa = addr; sprintf(buf, "%u", uu.bb[0]); for (i = 1; i < (int)sizeof(addr); i++) { sprintf(hold, ".%u", uu.bb[i]); strcat(buf, hold); } } else { strncpy(buf, phe->h_name, sizeof(buf)); buf[sizeof(buf)-1] = '\0'; } sprintf(log_buffer, "%s on port %u %s", buf, ntohs(saddr.sin_port), msg); log_err(-1, id, log_buffer); return; } unsigned int server_command() { static char id[] = "server_command"; int new_socket; int i; torque_socklen_t slen; unsigned int cmd; pbs_net_t addr; slen = sizeof(saddr); new_socket = accept(server_sock, (struct sockaddr *) & saddr, &slen); if (new_socket == -1) { log_err(errno, id, "accept"); return SCH_ERROR; } if (ntohs(saddr.sin_port) >= IPPORT_RESERVED) { badconn("non-reserved port"); close(new_socket); return SCH_ERROR; } addr = (pbs_net_t)saddr.sin_addr.s_addr; for (i = 0; i < numclients; i++) { if (addr == okclients[i]) break; } if (i == numclients) { badconn("unauthorized host"); close(new_socket); return SCH_ERROR; } if ((connector = socket_to_conn(new_socket)) < 0) { log_err(errno, id, "socket_to_conn"); return SCH_ERROR; } if (get_4byte(new_socket, &cmd) != 1) { log_err(errno, id, "get4bytes"); return SCH_ERROR; } return cmd; } /* * lock_out - lock out other daemons from this directory. 
*/ static void lock_out(fds, op) int fds; int op; /* F_WRLCK or F_UNLCK */ { struct flock flock; flock.l_type = op; flock.l_whence = SEEK_SET; flock.l_start = 0; flock.l_len = 0; /* whole file */ if (fcntl(fds, F_SETLK, &flock) < 0) { (void)strcpy(log_buffer, "pbs_sched: another scheduler running\n"); log_err(errno, msg_daemonname, log_buffer); fprintf(stderr, log_buffer); exit(1); } } int main(argc, argv) int argc; char *argv[]; { char *id = "main"; int code; struct hostent *hp; int go, c, errflg = 0; int lockfds; int t = 1; char *ptr; pid_t pid; char *cp, host[100]; char *homedir = PBS_SERVER_HOME; unsigned int port; char path_priv[_POSIX_PATH_MAX]; char *dbfile = "sched_out"; int alarm_time = 180; struct sigaction act; caddr_t curr_brk = 0, next_brk; extern char *optarg; extern int optind, opterr; fd_set fdset; #ifndef DEBUG if (IamRoot() == 0) { return (1); } #endif /* DEBUG */ glob_argv = argv; if ((cp = strrchr(argv[0], '/')) == NULL) cp = argv[0]; else cp++; msg_daemonname = strdup(cp); port = get_svrport(PBS_SCHEDULER_SERVICE_NAME, "tcp", PBS_SCHEDULER_SERVICE_PORT); while ((c = getopt(argc, argv, "L:S:d:i:b:t:p:a:vc:")) != EOF) { switch (c) { case 'L': logfile = optarg; break; case 'S': port = (unsigned int)atoi(optarg); if (port == 0) { fprintf(stderr, "%s: illegal port\n", optarg); errflg = 1; } break; case 'd': homedir = optarg; break; case 'i': /* initialize */ initfil = optarg; break; case 'b': bodyfil = optarg; break; case 't': termfil = optarg; break; case 'p': dbfile = optarg; break; case 'a': alarm_time = strtol(optarg, &ptr, 10); if (alarm_time <= 0 || *ptr != '\0') { fprintf(stderr, "%s: bad alarm time\n", optarg); errflg = 1; } break; case 'c': configfile = optarg; break; case 'v': verbose = 1; break; case '?': errflg = 1; break; } } if (errflg || optind != argc) { static char *options[] = { "[-L logfile]", "[-S port]", "[-d home]", "[-i init]", "[-b body]", "[-t term]", "[-p output]", "[-a alarm]", "[-c configfile]", "[-v]", NULL }; int i; 
fprintf(stderr, "usage: %s\n", argv[0]); for (i = 0; options[i]; i++) fprintf(stderr, "\t%s\n", options[i]); exit(1); } /* Save the original working directory for "restart" */ if ((oldpath = getcwd((char *)NULL, MAXPATHLEN)) == NULL) { fprintf(stderr, "cannot get current working directory\n"); exit(1); } (void)sprintf(path_priv, "%s/sched_priv", homedir); #if !defined(DEBUG) && !defined(NO_SECURITY_CHECK) c = chk_file_sec(path_priv, 1, 0, S_IWGRP | S_IWOTH, 1, 0); c |= chk_file_sec(PBS_ENVIRON, 0, 0, S_IWGRP | S_IWOTH, 0, 0); if (c != 0) exit(1); #endif /* not DEBUG and not NO_SECURITY_CHECK */ if (chdir(path_priv) == -1) { perror(path_priv); exit(1); } (void)sprintf(path_log, "%s/sched_logs", homedir); (void)strcpy(pbs_current_user, "Scheduler"); /* The following is code to reduce security risks */ /* start out with standard umask, system resource limit infinite */ umask(022); if (setup_env(PBS_ENVIRON) == -1) exit(1); c = getgid(); (void)setgroups(1, (gid_t *)&c); /* secure suppl. group ids */ c = sysconf(_SC_OPEN_MAX); while (--c > 2) (void)close(c); /* close any file desc left open by parent */ #ifndef DEBUG #ifdef _CRAY (void)limit(C_JOB, 0, L_CPROC, 0); (void)limit(C_JOB, 0, L_CPU, 0); (void)limit(C_JOBPROCS, 0, L_CPU, 0); (void)limit(C_PROC, 0, L_FD, 255); (void)limit(C_JOB, 0, L_FSBLK, 0); (void)limit(C_JOBPROCS, 0, L_FSBLK, 0); (void)limit(C_JOB, 0, L_MEM , 0); (void)limit(C_JOBPROCS, 0, L_MEM , 0); #else /* not _CRAY */ { struct rlimit rlimit; rlimit.rlim_cur = RLIM_INFINITY; rlimit.rlim_max = RLIM_INFINITY; (void)setrlimit(RLIMIT_CPU, &rlimit); (void)setrlimit(RLIMIT_FSIZE, &rlimit); (void)setrlimit(RLIMIT_DATA, &rlimit); (void)setrlimit(RLIMIT_STACK, &rlimit); #ifdef RLIMIT_RSS (void)setrlimit(RLIMIT_RSS , &rlimit); #endif /* RLIMIT_RSS */ #ifdef RLIMIT_VMEM (void)setrlimit(RLIMIT_VMEM , &rlimit); #endif /* RLIMIT_VMEM */ } #endif /* not _CRAY */ #if !defined(NO_SECURITY_CHECK) c = 0; if (initfil) { if (*initfil != '/') { (void)sprintf(log_buffer, 
"%s/%s", path_priv, initfil); c |= chk_file_sec(log_buffer, 0, 0, S_IWGRP | S_IWOTH, 1, 0); } else { c |= chk_file_sec(initfil, 0, 0, S_IWGRP | S_IWOTH, 1, 0); } } if (bodyfil) { if (*bodyfil != '/') { (void)sprintf(log_buffer, "%s/%s", path_priv, bodyfil); c |= chk_file_sec(log_buffer, 0, 0, S_IWGRP | S_IWOTH, 1, 0); } else { c |= chk_file_sec(bodyfil, 0, 0, S_IWGRP | S_IWOTH, 1, 0); } } if (termfil) { if (*termfil != '/') { (void)sprintf(log_buffer, "%s/%s", path_priv, termfil); c |= chk_file_sec(log_buffer, 0, 0, S_IWGRP | S_IWOTH, 1, 0); } else { c |= chk_file_sec(termfil, 0, 0, S_IWGRP | S_IWOTH, 1, 0); } } if (c) exit(1); #endif /* not NO_SECURITY_CHECK */ #endif /* not DEBUG */ if (log_open(logfile, path_log) == -1) { fprintf(stderr, "%s: logfile could not be opened\n", argv[0]); exit(1); } if (gethostname(host, sizeof(host)) == -1) { char *prob = "gethostname"; log_err(errno, id, prob); perror(prob); die(0); } if ((hp = gethostbyname(host)) == NULL) { char *prob = "gethostbyname"; log_err(errno, id, prob); perror(prob); die(0); } if ((server_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { char *prob = "socket"; log_err(errno, id, prob); perror(prob); die(0); } if (setsockopt(server_sock, SOL_SOCKET, SO_REUSEADDR, (char *)&t, sizeof(t)) == -1) { char *prob = "setsockopt"; log_err(errno, id, prob); perror(prob); die(0); } saddr.sin_family = AF_INET; saddr.sin_port = htons((unsigned short)port); memcpy(&saddr.sin_addr, hp->h_addr, hp->h_length); if (bind(server_sock, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) { char *prob = "bind"; log_err(errno, id, prob); perror(prob); die(0); } if (listen(server_sock, 5) < 0) { char *prob = "listen"; log_err(errno, id, prob); perror(prob); die(0); } okclients = (pbs_net_t *)calloc(START_CLIENTS, sizeof(pbs_net_t)); addclient("localhost"); /* who has permission to call MOM */ addclient(host); if (configfile) { if (read_config(configfile) != 0) die(0); } lockfds = open("sched.lock", O_CREAT | O_TRUNC | O_WRONLY, 0644); if 
(lockfds < 0) { char *prob = "lock file"; log_err(errno, id, prob); perror(prob); die(0); } lock_out(lockfds, F_WRLCK); #ifndef DEBUG lock_out(lockfds, F_UNLCK); if ((pid = fork()) == -1) /* error on fork */ { char *prob = "fork"; log_err(errno, id, prob); perror(prob); die(0); } else if (pid > 0) /* parent exits */ exit(0); if ((pid = setsid()) == -1) { log_err(errno, id, "setsid"); die(0); } lock_out(lockfds, F_WRLCK); freopen(dbfile, "a", stdout); setvbuf(stdout, NULL, _IOLBF, 0); dup2(fileno(stdout), fileno(stderr)); #else pid = getpid(); setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); #endif freopen("/dev/null", "r", stdin); /* write schedulers pid into lockfile */ (void)sprintf(log_buffer, "%d\n", pid); (void)write(lockfds, log_buffer, strlen(log_buffer) + 1); #if (PLOCK_DAEMONS & 2) (void)plock(PROCLOCK); /* lock daemon into memory */ #endif sprintf(log_buffer, "%s startup pid %d", argv[0], pid); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); sprintf(log_buffer, "%s using TCL %s (%s)", argv[0], TCL_VERSION, TCL_PATCH_LEVEL); fprintf(stderr, "%s\n", log_buffer); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); fullresp(0); sigemptyset(&allsigs); act.sa_flags = 0; sigaddset(&allsigs, SIGHUP); /* remember to block these */ sigaddset(&allsigs, SIGINT); /* during critical sections */ sigaddset(&allsigs, SIGTERM); /* so we don't get confused */ act.sa_mask = allsigs; act.sa_handler = restart; /* do a restart on SIGHUP */ sigaction(SIGHUP, &act, NULL); act.sa_handler = toolong; /* handle an alarm call */ sigaction(SIGALRM, &act, NULL); act.sa_handler = die; /* bite the biscuit for all following */ sigaction(SIGINT, &act, NULL); sigaction(SIGTERM, &act, NULL); start_tcl(); FD_ZERO(&fdset); for (go = 1; go;) { unsigned int cmd; FD_SET(server_sock, &fdset); if (select(FD_SETSIZE, &fdset, NULL, NULL, NULL) == -1) { if (errno != EINTR) log_err(errno, id, "select"); continue; } if (!FD_ISSET(server_sock, 
&fdset)) continue; cmd = server_command(); if (cmd == (unsigned)SCH_ERROR || cmd == (unsigned)SCH_SCHEDULE_NULL) continue; if (sigprocmask(SIG_BLOCK, &allsigs, &oldsigs) == -1) log_err(errno, id, "sigprocmaskSIG_BLOCK)"); if (verbose) { sprintf(log_buffer, "command %d", cmd); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); } switch (cmd) { case SCH_SCHEDULE_NEW: case SCH_SCHEDULE_TERM: case SCH_SCHEDULE_TIME: case SCH_SCHEDULE_RECYC: case SCH_SCHEDULE_CMD: case SCH_SCHEDULE_FIRST: alarm(alarm_time); #if TCL_MAJOR_VERSION >= 8 /* execute compiled body code for TCL-8 */ code = Tcl_EvalObj(interp, body_obj); #else code = Tcl_Eval(interp, body); #endif alarm(0); switch (code) { case TCL_OK: case TCL_RETURN: break; default: { char *trace; char codename[20]; switch (code) { case TCL_BREAK: strcpy(codename, "break"); break; case TCL_CONTINUE: strcpy(codename, "continue"); break; default: strcpy(codename, "<unknown>"); break; } trace = (char *)Tcl_GetVar(interp, "errorInfo", 0); if (trace == NULL) trace = (char *)Tcl_GetStringResult(interp); fprintf(stderr, "%s: TCL interpreter return code %d (%s) @ line %d: %s\n", bodyfil, code, codename, interp->errorLine, trace); sprintf(log_buffer, "%s: TCL error @ line %d: %s", bodyfil, interp->errorLine, Tcl_GetStringResult(interp)); log_err(-1, id, log_buffer); die(0); } } break; case SCH_CONFIGURE: case SCH_RULESET: restart(0); break; case SCH_QUIT: go = 0; break; default: log_err(-1, id, "unknown command"); break; } if (connector >= 0 && server_disconnect(connector)) { log_err(errno, id, "server_disconnect"); die(0); } connector = -1; if (verbose) { next_brk = (caddr_t)sbrk(0); if (next_brk > curr_brk) { sprintf(log_buffer, "brk point %p", next_brk); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); curr_brk = next_brk; } } if (sigprocmask(SIG_SETMASK, &oldsigs, NULL) == -1) log_err(errno, id, "sigprocmask(SIG_SETMASK)"); } if (termfil) { code = Tcl_EvalFile(interp, termfil); if (code != 
TCL_OK) { char *trace; trace = (char *)Tcl_GetVar(interp, "errorInfo", 0); if (trace == NULL) trace = (char *)Tcl_GetStringResult(interp); fprintf(stderr, "%s: TCL error @ line %d: %s\n", termfil, interp->errorLine, trace); sprintf(log_buffer, "%s: TCL error @ line %d: %s", termfil, interp->errorLine, Tcl_GetStringResult(interp)); log_err(-1, id, log_buffer); die(0); } sprintf(log_buffer, "term file: %s", termfil); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); } sprintf(log_buffer, "%s normal finish pid %d", argv[0], pid); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); (void)close(server_sock); exit(0); }
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response = NULL; int badreply = 0; int local_errno = 0; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s ", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. */ fullresp(0); /* Build a list of all the resources about which we want information. 
*/ addreq(rm, "loadave"); addreq(rm, "availmem"); addreq(rm, "physmem"); addreq(rm, "ncpus"); addreq(rm, "tmpdir"); addreq(rm, "arch"); /* Get the values back from the resource monitor, and round up. */ /* Receive LOADAVE response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->loadave = atof(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive AVAILMEM response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->freemem = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive PHYSMEM response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->mem_total = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive NCPUS response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->ncpus_total = atoi(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive TMPDIR response from resource monitor. 
*/ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->tmpdir = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(tmpdir), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive ARCH response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->arch = schd_strdup(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(arch), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } bail: /* Disconnect from the resource monitor. */ if (rm >= 0) /* resmom handle "0" is valid in RPP. */ closerm(rm); /* And unset the alarm and handler. */ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. */ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - skipping this node", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. 
*/ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }
int main( int argc, char *argv[]) { char *id = "main"; struct hostent *hp; int go, c, errflg = 0; int lockfds; int t = 1; pid_t pid; char host[100]; char *homedir = PBS_SERVER_HOME; unsigned int port; char *dbfile = "sched_out"; struct sigaction act; sigset_t oldsigs; caddr_t curr_brk = 0; caddr_t next_brk; extern char *optarg; extern int optind, opterr; extern int rpp_fd; fd_set fdset; int schedinit(int argc, char **argv); int schedule(int com, int connector); glob_argv = argv; alarm_time = 180; /* The following is code to reduce security risks */ /* move this to a place where nss_ldap doesn't hold a socket yet */ c = sysconf(_SC_OPEN_MAX); while (--c > 2) (void)close(c); /* close any file desc left open by parent */ port = get_svrport(PBS_SCHEDULER_SERVICE_NAME, "tcp", PBS_SCHEDULER_SERVICE_PORT); pbs_rm_port = get_svrport(PBS_MANAGER_SERVICE_NAME, "tcp", PBS_MANAGER_SERVICE_PORT); strcpy(pbs_current_user, "Scheduler"); msg_daemonname = strdup("pbs_sched"); opterr = 0; while ((c = getopt(argc, argv, "L:S:R:d:p:c:a:-:")) != EOF) { switch (c) { case '-': if ((optarg == NULL) || (optarg[0] == '\0')) { errflg = 1; } if (!strcmp(optarg, "version")) { fprintf(stderr, "version: %s\n", PACKAGE_VERSION); exit(0); } else { errflg = 1; } break; case 'L': logfile = optarg; break; case 'S': port = atoi(optarg); if (port == 0) { fprintf(stderr, "%s: illegal port\n", optarg); errflg = 1; } break; case 'R': if ((pbs_rm_port = atoi(optarg)) == 0) { (void)fprintf(stderr, "%s: bad -R %s\n", argv[0], optarg); return 1; } break; case 'd': homedir = optarg; break; case 'p': dbfile = optarg; break; case 'c': configfile = optarg; break; case 'a': alarm_time = atoi(optarg); if (alarm_time == 0) { fprintf(stderr, "%s: bad alarm time\n", optarg); errflg = 1; } break; case '?': errflg = 1; break; } } if (errflg) { fprintf(stderr, "usage: %s %s\n", argv[0], usage); exit(1); } #ifndef DEBUG if (IamRoot() == 0) { return (1); } #endif /* DEBUG */ /* Save the original working directory for 
"restart" */ if ((oldpath = getcwd((char *)NULL, MAXPATHLEN)) == NULL) { fprintf(stderr, "cannot get current working directory\n"); exit(1); } (void)sprintf(log_buffer, "%s/sched_priv", homedir); #if !defined(DEBUG) && !defined(NO_SECURITY_CHECK) c = chk_file_sec(log_buffer, 1, 0, S_IWGRP | S_IWOTH, 1, NULL); c |= chk_file_sec(PBS_ENVIRON, 0, 0, S_IWGRP | S_IWOTH, 0, NULL); if (c != 0) exit(1); #endif /* not DEBUG and not NO_SECURITY_CHECK */ if (chdir(log_buffer) == -1) { perror("chdir"); exit(1); } (void)sprintf(path_log, "%s/sched_logs", homedir); (void)sprintf(path_acct, "%s/%s", log_buffer, PBS_ACCT); /* The following is code to reduce security risks */ /* start out with standard umask, system resource limit infinite */ umask(022); if (setup_env(PBS_ENVIRON) == -1) exit(1); c = getgid(); (void)setgroups(1, (gid_t *)&c); /* secure suppl. groups */ #ifndef DEBUG #ifdef _CRAY (void)limit(C_JOB, 0, L_CPROC, 0); (void)limit(C_JOB, 0, L_CPU, 0); (void)limit(C_JOBPROCS, 0, L_CPU, 0); (void)limit(C_PROC, 0, L_FD, 255); (void)limit(C_JOB, 0, L_FSBLK, 0); (void)limit(C_JOBPROCS, 0, L_FSBLK, 0); (void)limit(C_JOB, 0, L_MEM , 0); (void)limit(C_JOBPROCS, 0, L_MEM , 0); #else /* not _CRAY */ { struct rlimit rlimit; rlimit.rlim_cur = RLIM_INFINITY; rlimit.rlim_max = RLIM_INFINITY; (void)setrlimit(RLIMIT_CPU, &rlimit); (void)setrlimit(RLIMIT_FSIZE, &rlimit); (void)setrlimit(RLIMIT_DATA, &rlimit); (void)setrlimit(RLIMIT_STACK, &rlimit); #ifdef RLIMIT_RSS (void)setrlimit(RLIMIT_RSS , &rlimit); #endif /* RLIMIT_RSS */ #ifdef RLIMIT_VMEM (void)setrlimit(RLIMIT_VMEM , &rlimit); #endif /* RLIMIT_VMEM */ } #endif /* not _CRAY */ #endif /* DEBUG */ if (log_open(logfile, path_log) == -1) { fprintf(stderr, "%s: logfile could not be opened\n", argv[0]); exit(1); } if (gethostname(host, sizeof(host)) == -1) { log_err(errno, id, "gethostname"); die(0); } if ((hp = gethostbyname(host)) == NULL) { log_err(errno, id, "gethostbyname"); die(0); } if ((server_sock = socket(AF_INET, SOCK_STREAM, 
0)) < 0) { log_err(errno, id, "socket"); die(0); } if (setsockopt(server_sock, SOL_SOCKET, SO_REUSEADDR, (char *)&t, sizeof(t)) == -1) { log_err(errno, id, "setsockopt"); die(0); } saddr.sin_family = AF_INET; saddr.sin_port = htons(port); memcpy(&saddr.sin_addr, hp->h_addr, hp->h_length); if (bind(server_sock, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) { log_err(errno, id, "bind"); die(0); } if (listen(server_sock, 5) < 0) { log_err(errno, id, "listen"); die(0); } okclients = (pbs_net_t *)calloc(START_CLIENTS, sizeof(pbs_net_t)); addclient("localhost"); /* who has permission to call MOM */ addclient(host); if (configfile) { if (read_config(configfile) != 0) die(0); } lockfds = open("sched.lock", O_CREAT | O_TRUNC | O_WRONLY, 0644); if (lockfds < 0) { log_err(errno, id, "open lock file"); exit(1); } lock_out(lockfds, F_WRLCK); fullresp(0); if (sigemptyset(&allsigs) == -1) { perror("sigemptyset"); exit(1); } if (sigprocmask(SIG_SETMASK, &allsigs, NULL) == -1) /* unblock */ { perror("sigprocmask"); exit(1); } act.sa_flags = 0; sigaddset(&allsigs, SIGHUP); /* remember to block these */ sigaddset(&allsigs, SIGINT); /* during critical sections */ sigaddset(&allsigs, SIGTERM); /* so we don't get confused */ act.sa_mask = allsigs; act.sa_handler = restart; /* do a restart on SIGHUP */ sigaction(SIGHUP, &act, NULL); act.sa_handler = toolong; /* handle an alarm call */ sigaction(SIGALRM, &act, NULL); act.sa_handler = die; /* bite the biscuit for all following */ sigaction(SIGINT, &act, NULL); sigaction(SIGTERM, &act, NULL); /* * Catch these signals to ensure we core dump even if * our rlimit for core dumps is set to 0 initially. 
* * Chris Samuel - VPAC * [email protected] - 29th July 2003 * * Now conditional on the PBSCOREDUMP environment variable */ if (getenv("PBSCOREDUMP")) { act.sa_handler = catch_abort; /* make sure we core dump */ sigaction(SIGSEGV, &act, NULL); sigaction(SIGBUS, &act, NULL); sigaction(SIGFPE, &act, NULL); sigaction(SIGILL, &act, NULL); sigaction(SIGTRAP, &act, NULL); sigaction(SIGSYS, &act, NULL); } /* * Local initialization stuff */ if (schedinit(argc, argv)) { (void) sprintf(log_buffer, "local initialization failed, terminating"); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); exit(1); } if (getenv("PBSDEBUG") == NULL) { lock_out(lockfds, F_UNLCK); #ifdef DISABLE_DAEMONS pid = getpid(); #else if ((pid = fork()) == -1) { /* error on fork */ perror("fork"); exit(1); } else if (pid > 0) /* parent exits */ { exit(0); } if ((pid = setsid()) == -1) { perror("setsid"); exit(1); } #endif /* DISABLE_DAEMONS */ lock_out(lockfds, F_WRLCK); if (freopen(dbfile, "a", stdout) == NULL) { perror("opening lockfile"); exit(1); } setvbuf(stdout, NULL, _IOLBF, 0); dup2(fileno(stdout), fileno(stderr)); } else { setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); pid = getpid(); } if (freopen("/dev/null", "r", stdin) == NULL) { perror("opening /dev/null"); exit(1); } /* write scheduler's pid into lockfile */ (void)sprintf(log_buffer, "%ld\n", (long)pid); if (write(lockfds, log_buffer, strlen(log_buffer) + 1) != (ssize_t)(strlen(log_buffer) + 1)) { perror("writing to lockfile"); exit(1); } #if (PLOCK_DAEMONS & 2) (void)plock(PROCLOCK); /* lock daemon into memory */ #endif sprintf(log_buffer, "%s startup pid %ld", argv[0], (long)pid); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); FD_ZERO(&fdset); for (go = 1;go;) { int cmd; if (rpp_fd != -1) FD_SET(rpp_fd, &fdset); FD_SET(server_sock, &fdset); if (select(FD_SETSIZE, &fdset, NULL, NULL, NULL) == -1) { if (errno != EINTR) { log_err(errno, id, "select"); die(0); } continue; } if 
(rpp_fd != -1 && FD_ISSET(rpp_fd, &fdset)) { if (rpp_io() == -1) log_err(errno, id, "rpp_io"); } if (!FD_ISSET(server_sock, &fdset)) continue; cmd = server_command(); if (sigprocmask(SIG_BLOCK, &allsigs, &oldsigs) == -1) log_err(errno, id, "sigprocmaskSIG_BLOCK)"); alarm(alarm_time); if (schedule(cmd, connector)) /* magic happens here */ go = 0; alarm(0); if (connector >= 0 && server_disconnect(connector)) { log_err(errno, id, "server_disconnect"); die(0); } next_brk = (caddr_t)sbrk(0); if (next_brk > curr_brk) { sprintf(log_buffer, "brk point %ld", (long)next_brk); log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, id, log_buffer); curr_brk = next_brk; } if (sigprocmask(SIG_SETMASK, &oldsigs, NULL) == -1) log_err(errno, id, "sigprocmask(SIG_SETMASK)"); } sprintf(log_buffer, "%s normal finish pid %ld", argv[0], (long)pid); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); close(server_sock); exit(0); } /* END main() */