/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response = NULL; int badreply = 0; int cpus_avail = 0; size_t pmem_avail = 0; char hpm_ctl[64]; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ #ifdef NODEMASK Bitfield cpy; int i, j; #endif /* NODEMASK */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s (%d)", exechost, pbs_errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. 
*/ fullresp(0); /* Build a list of all the resources about which we want information. */ addreq(rm, "loadave"); addreq(rm, "availmem"); addreq(rm, "physmem"); addreq(rm, "ncpus"); #ifdef NODEMASK addreq(rm, "availmask"); #endif /* NODEMASK */ if (schd_MANAGE_HPM) { (void)sprintf(hpm_ctl, HPM_CTL_FORMAT_STR, HPM_CTL_QUERY_STR); addreq(rm, hpm_ctl); } /* Get the values back from the resource monitor, and round up. */ /* Receive LOADAVE response from resource monitor. */ response = getreq(rm); if (response != NULL) { new_rsrcs->loadave = atof(response) * schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive AVAILMEM response from resource monitor. */ response = getreq(rm); if (response != NULL) { new_rsrcs->freemem = schd_val2byte(response); new_rsrcs->freemem *= schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive PHYSMEM response from resource monitor. */ response = getreq(rm); if (response != NULL) { pmem_avail = schd_val2byte(response); pmem_avail *= schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive NCPUS response from resource monitor. 
*/ response = getreq(rm); if (response != NULL) { cpus_avail = atoi(response) * schd_FAKE_MACH_MULT; (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } #ifdef NODEMASK /* Receive available nodes from resource monitor. */ response = getreq(rm); if (response == NULL) { (void)sprintf(log_buffer, "bad return from getreq(availmask), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } else { if (schd_bits2mask(response, &new_rsrcs->availmask) != 0) { if (schd_str2mask(response, &new_rsrcs->availmask) != 0) { (void)sprintf(log_buffer, "can't parse availmask '%s'", response); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } (void)free(response); } #endif /* NODEMASK */ if (schd_MANAGE_HPM) { /* Receive HPM_CTL response from resource monitor. */ response = getreq(rm); if (response != NULL) { if (strcmp(response, HPM_CTL_USERMODE_STR) == 0) new_rsrcs->flags |= RSRCS_FLAGS_HPM_USER; else if (strcmp(response, HPM_CTL_GLOBALMODE_STR) == 0) new_rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER; else { (void)sprintf(log_buffer, "bad response '%s' for '%s@%s'", response, hpm_ctl, exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } else { (void)sprintf(log_buffer, "bad return from getreq(%s), %d, %d", hpm_ctl, pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } } /* * NOTE: response will be free()'d in bail. Be sure to explicitly free() * response if more getreq() calls are added before the code below. */ bail: if (response != NULL) (void)free(response); /* Disconnect from the resource monitor. */ if (rm >= 0) /* resmom handle "0" is valid in RPP. */ closerm(rm); /* And unset the alarm and handler. 
*/ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. */ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - aborting sched run", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. */ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } new_rsrcs->nodes_total = NODES_REQD(cpus_avail, pmem_avail); #ifdef NODEMASK /* Copy the availmask schd_FAKE_MACH_MULT times to match avail cpus. */ BITFIELD_CPY(&cpy, &(new_rsrcs->availmask)); for (i = 2; i <= schd_FAKE_MACH_MULT; i++) { for (j = 0; j < (cpus_avail / schd_FAKE_MACH_MULT / 2); j++) BITFIELD_SHIFTL(&cpy); BITFIELD_SETM(&(new_rsrcs->availmask), &cpy); } #endif /* NODEMASK */ if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }
/*
 * Attempt to set the state of the hpm counters on the host associated
 * with the given Resources.  Mode must be one of HPM_SETUP_USERMODE,
 * HPM_SETUP_GLOBALMODE or HPM_SETUP_REVOKE.
 *
 * If the host's RSRCS_FLAGS_HPM_USER flag shows it is already in the
 * requested state, nothing is sent and 0 is returned.  Otherwise the
 * request is sent to the resource monitor on rsrcs->exechost and, when the
 * monitor answers HPM_CTL_OKAY_STR, the flag in 'rsrcs' is updated.
 *
 * Return 0 on success, non-zero otherwise.
 */
static int
setup_hpm(Resources *rsrcs, int mode)
{
    char *id = "setup_hpm";
    char *reply;        /* Heap buffer from getreq_err(); freed on exit. */
    char *response;     /* Cursor into 'reply'; never freed directly. */
    char *value;
    char hpm_ctl[64];
    int rm;
    int local_errno = 0;
    int rc = 1;

    switch (mode) {
    case HPM_SETUP_USERMODE:
        /* Sanity check - is the host already in the requested mode? */
        if (rsrcs->flags & RSRCS_FLAGS_HPM_USER) {
            DBPRT(("%s: hpm user mode requested for %s, but already set!\n",
                id, rsrcs->exechost));
            return (0);
        }

        (void)snprintf(hpm_ctl, sizeof(hpm_ctl),
            HPM_CTL_FORMAT_STR, HPM_CTL_USERMODE_STR);
        break;

    case HPM_SETUP_GLOBALMODE:
        /* Sanity check - is the host already in the requested mode? */
        if (!(rsrcs->flags & RSRCS_FLAGS_HPM_USER)) {
            DBPRT(("%s: hpm global mode requested for %s, but already set!\n",
                id, rsrcs->exechost));
            return (0);
        }

        (void)snprintf(hpm_ctl, sizeof(hpm_ctl),
            HPM_CTL_FORMAT_STR, HPM_CTL_GLOBALMODE_STR);
        break;

    case HPM_SETUP_REVOKE:
        /* Sanity check - is the host already in the requested mode? */
        if (!(rsrcs->flags & RSRCS_FLAGS_HPM_USER)) {
            DBPRT(("%s: hpm revocation requested for %s, but already global!\n",
                id, rsrcs->exechost));
            return (0);
        }

        (void)snprintf(hpm_ctl, sizeof(hpm_ctl),
            HPM_CTL_FORMAT_STR, HPM_CTL_REVOKE_STR);
        break;

    default:
        DBPRT(("%s: Bogus mode %d - bailing.\n", id, mode));
        return (1);
    }

    DBPRT(("%s: '%s' @ %s\n", id, hpm_ctl, rsrcs->exechost));

    if ((rm = openrm(rsrcs->exechost, 0)) < 0) {
        (void)sprintf(log_buffer,
            "Unable to contact resmom@%s", rsrcs->exechost);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        return (1);
    }

    /* Ask the resource monitor on the remote host to set the mode for us. */
    reply = NULL;
    if (addreq_err(rm, &local_errno, hpm_ctl) == 0)
        reply = getreq_err(&local_errno, rm);

    closerm(rm);

    if (reply == NULL) {
        (void)sprintf(log_buffer,
            "bad return from getreq(%s) @%s, %d", hpm_ctl, rsrcs->exechost,
            local_errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));
        return (1);
    }

    /*
     * If a full response was received, move forward to the first character
     * of the value (following the '=' in the attribute-value pair).  Only
     * the cursor moves; 'reply' keeps the address that must be freed.
     */
    response = reply;
    if ((value = strchr(response, '=')) != NULL)
        response = value + 1;

    /*
     * If the hpm_ctl request succeeded, log the fact, and set the flag in
     * the resources for this host to indicate that it is now in the other
     * state.
     */
    if (strcmp(response, HPM_CTL_OKAY_STR) == 0) {
        if (mode == HPM_SETUP_USERMODE)
            rsrcs->flags |= RSRCS_FLAGS_HPM_USER;
        else
            rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER;

        (void)sprintf(log_buffer, "%s on %s succeeded", hpm_ctl,
            rsrcs->exechost);
        rc = 0;
    } else if (strncmp(response, HPM_CTL_ERROR_STR,
        strlen(HPM_CTL_ERROR_STR)) == 0) {
        /* It's an error string: just report the message returned. */
        response += strlen(HPM_CTL_ERROR_STR);  /* Skip the error string. */
        while (*response == ' ')                /* Skip leading whitespace. */
            ++response;

        /* Generate the log message from the request and the response. */
        (void)sprintf(log_buffer, "%s: %s (%s)", hpm_ctl, response,
            rsrcs->exechost);
    } else {
        (void)sprintf(log_buffer, "cannot parse response %s to request %s@%s",
            response, hpm_ctl, rsrcs->exechost);
    }

    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    /* The message has been formatted; the reply buffer can go now. */
    free(reply);

    return (rc);
}
int do_mom( char *HPtr, int MOMPort, int CmdIndex) { int sd; if ((sd = openrm(HPtr, MOMPort)) < 0) { /* FAILURE */ extern char TRMEMsg[]; fprintf(stderr, "cannot connect to MOM on node '%s', errno=%d (%s)\n", HPtr, pbs_errno, strerror(pbs_errno)); if (TRMEMsg[0] != '\0') { fprintf(stderr, " %s\n", TRMEMsg); } return(sd); } if (IsVerbose == TRUE) { fprintf(stderr, "INFO: successfully connected to %s\n", HPtr); } switch (CmdIndex) { case momClear: { char tmpLine[1024]; char *Value; snprintf(tmpLine, 1024, "clearjob=%s", (JPtr != NULL) ? JPtr : "all"); if (addreq(sd, tmpLine) != 0) { /* FAILURE */ fprintf(stderr,"ERROR: cannot request job clear on %s (errno=%d-%s: %d-%s)\n", HPtr, errno, pbs_strerror(errno), pbs_errno, pbs_strerror(pbs_errno)); closerm(sd); return(FAILURE); } if ((Value = (char *)getreq(sd)) == NULL) { /* FAILURE */ fprintf(stderr,"ERROR: job clear failed on %s (errno=%d-%s: %d-%s)\n", HPtr, errno, pbs_strerror(errno), pbs_errno, pbs_strerror(pbs_errno)); closerm(sd); return(FAILURE); } /* job cleared */ fprintf(stdout,"job clear request successful on %s\n", HPtr); } /* END BLOCK (case momClear) */ break; case momShutdown: { int rc; rc = downrm(sd); if (rc != 0) { /* FAILURE */ fprintf(stderr,"ERROR: cannot shutdown mom daemon on %s (errno=%d-%s: %d-%s)\n", HPtr, errno, pbs_strerror(errno), pbs_errno, pbs_strerror(pbs_errno)); closerm(sd); exit(EXIT_FAILURE); } fprintf(stdout, "shutdown request successful on %s\n", HPtr); } /* END BLOCK */ break; case momReconfig: { int rc; rc = configrm(sd, ConfigBuf); if (rc != 0) { /* FAILURE */ fprintf(stderr,"ERROR: cannot reconfigure mom on %s (errno=%d-%s: %d-%s)\n", HPtr, errno, pbs_strerror(errno), pbs_errno, pbs_strerror(pbs_errno)); closerm(sd); return(FAILURE); } fprintf(stdout, "reconfig successful on %s\n", HPtr); } /* END BLOCK (case momReconfig) */ break; case momQuery: default: { char *ptr; int rindex; char *Value; int was_error = 0; for (rindex = 0; rindex < QueryI; rindex++) { if (addreq(sd, 
Query[rindex]) != 0) { fprintf(stderr,"ERROR: cannot add query for '%s' on %s (errno=%d-%s: %d-%s)\n", Query[rindex], HPtr, errno, pbs_strerror(errno), pbs_errno, pbs_strerror(pbs_errno)); was_error = 1; } } for (rindex = 0; rindex < QueryI; rindex++) { if ((ptr = strchr(Query[rindex],'=')) != NULL) { *ptr = '\0'; } if ((Value = (char *)getreq(sd)) == NULL) { fprintf(stderr, "ERROR: query[%d] '%s' failed on %s (errno=%d-%s: %d-%s)\n", rindex, Query[rindex], HPtr, errno, pbs_strerror(errno), pbs_errno, pbs_strerror(pbs_errno)); was_error = 1; } else { if (!strncmp(Query[rindex], "diag", strlen("diag"))) { fprintf(stdout, "%s\n", Value); } else if (!strncmp(Query[rindex], "cycle", strlen("cycle"))) { fprintf(stdout, "mom %s successfully cycled %s\n", HPtr, Value); } else { fprintf(stdout, "%12s: %12s = '%s'\n", HPtr, Query[rindex], Value); } } if (ptr != NULL) { *ptr = '='; } } /* END for (rindex) */ return (was_error); } /* END BLOCK (case momQuery) */ break; } /* END switch(CmdIndex) */ closerm(sd); return(0); } /* END do_mom() */
int main(int argc, char *argv[]) { int i; char mom_name[PBS_MAXHOSTNAME+1]; int mom_port = 0; int c, rc; int mom_sd; char *req; #ifdef WIN32 if (winsock_init()) { return 1; } #endif if (gethostname(mom_name, (sizeof(mom_name) - 1)) < 0 ) mom_name[0] = '\0'; while ((c = getopt(argc, argv, "m:p:")) != EOF) { switch (c) { case 'm': strcpy(mom_name, optarg); break; case 'p': mom_port = atoi(optarg); break; default: fprintf(stderr, "Bad option: %c\n", c); } } if (mom_name[0] == '\0' || optind == argc) { fprintf(stderr, "Error in usage: pbs_rmget [-m mom name] [-p mom port] <req1>...[reqN]\n"); return 1; } if(set_msgdaemonname("pbs_rmget")) { fprintf(stderr, "Out of memory\n"); return 1; } /* load the pbs conf file */ if (pbs_loadconf(0) == 0) { fprintf(stderr, "%s: Configuration error\n", argv[0]); return (1); } if (pbs_conf.pbs_use_tcp == 1) { struct tpp_config tpp_conf; fd_set selset; struct timeval tv; if (!pbs_conf.pbs_leaf_name) { char my_hostname[PBS_MAXHOSTNAME+1]; if (gethostname(my_hostname, (sizeof(my_hostname) - 1)) < 0) { fprintf(stderr, "Failed to get hostname\n"); return -1; } pbs_conf.pbs_leaf_name = get_all_ips(my_hostname, log_buffer, sizeof(log_buffer) - 1); if (!pbs_conf.pbs_leaf_name) { fprintf(stderr, "%s\n", log_buffer); fprintf(stderr, "%s\n", "Unable to determine TPP node name"); return -1; } } /* We don't want to show logs related to connecting pbs_comm on console * this set this flag to ignore it */ log_mask = SHOW_NONE; /* set tpp function pointers */ set_tpp_funcs(log_tppmsg); /* call tpp_init */ rc = 0; #ifndef WIN32 if (pbs_conf.auth_method == AUTH_MUNGE) rc = set_tpp_config(&pbs_conf, &tpp_conf, pbs_conf.pbs_leaf_name, -1, pbs_conf.pbs_leaf_routers, pbs_conf.pbs_use_compression, TPP_AUTH_EXTERNAL, get_ext_auth_data, validate_ext_auth_data); else #endif rc = set_tpp_config(&pbs_conf, &tpp_conf, pbs_conf.pbs_leaf_name, -1, pbs_conf.pbs_leaf_routers, pbs_conf.pbs_use_compression, TPP_AUTH_RESV_PORT, NULL, NULL); if (rc == -1) { 
fprintf(stderr, "Error setting TPP config\n"); return -1; } if ((rpp_fd = tpp_init(&tpp_conf)) == -1) { fprintf(stderr, "rpp_init failed\n"); return -1; } /* * Wait for net to get restored, ie, app to connect to routers */ FD_ZERO(&selset); FD_SET(rpp_fd, &selset); tv.tv_sec = 5; tv.tv_usec = 0; select(FD_SETSIZE, &selset, NULL, NULL, &tv); rpp_poll(); /* to clear off the read notification */ /* Once the connection is established we can unset log_mask */ log_mask &= ~SHOW_NONE; } else { /* set rpp function pointers */ set_rpp_funcs(log_rppfail); } /* get the FQDN of the mom */ c = get_fullhostname(mom_name, mom_name, (sizeof(mom_name) - 1)); if (c == -1) { fprintf(stderr, "Unable to get full hostname for mom %s\n", mom_name); return -1; } if ((mom_sd = openrm(mom_name, mom_port)) < 0) { fprintf(stderr, "Unable to open connection to mom: %s:%d\n", mom_name, mom_port); return 1; } for (i = optind; i < argc; i++) addreq(mom_sd, argv[i]); for (i = optind; i < argc; i++) { req = getreq(mom_sd); if (req == NULL) { fprintf(stderr, "Error getting response %d from mom.\n", i - optind); return 1; } printf("[%d] %s\n", i - optind, req); free(req); } closerm(mom_sd); return 0; }
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response; int badreply = 0; int cpus_avail = 0; int cpus_tot = 0; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s (%d)", exechost, pbs_errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. */ fullresp(0); /* Build a list of all the resources about which we want information. 
*/ addreq(rm, "mppe_app"); addreq(rm, "mppe_avail"); /* Get the values back from the resource monitor, and round up. */ /* Receive MPPE_APP response from resource monitor. */ /* returns the total number of Application PEs configured */ response = getreq(rm); if (response != NULL) { cpus_tot = atoi(response) * schd_FAKE_MACH_MULT; } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive MPPE_AVAIL response from resource monitor. */ /* returns the largest contiguous block of APP PEs */ response = getreq(rm); if (response != NULL) { cpus_avail = atoi(response) * schd_FAKE_MACH_MULT; } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", pbs_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } new_rsrcs->freemem = MB_PER_NODE * schd_FAKE_MACH_MULT; bail: /* Disconnect from the resource monitor. */ if (rm) closerm(rm); /* And unset the alarm and handler. */ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. 
*/ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - aborting sched run", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. */ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } new_rsrcs->nodes_total = cpus_tot; new_rsrcs->nodes_alloc = cpus_tot - cpus_avail; if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }
/* * Find an entry for the resources for the requested host in the list of * existing resources, or create a new one for that host and return it. */ Resources * schd_get_resources(char *exechost) { char *id = "schd_get_resources"; Resources *rptr, *new_rsrcs; int rm; char *response = NULL; int badreply = 0; int local_errno = 0; struct sigaction act, oact; unsigned int remain; /* Time remaining in any old alarm(). */ time_t then; /* When this alarm() was started. */ /* * Check for a local copy of the resources being available already. * If so, just return a reference to that Resources structure. */ if (schd_RsrcsList != NULL) { for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next) if (strcmp(rptr->exechost, exechost) == 0) return (rptr); } schd_timestamp("get_rsrcs"); /* * No cached resource information for 'exechost'. Need to query the * host for its information. */ if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL) { (void)sprintf(log_buffer, "Unable to alloc space for Resources."); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); return (NULL); /* Can't get the information - nowhere to store it. */ } memset((void *)new_rsrcs, 0, sizeof(Resources)); act.sa_flags = 0; act.sa_handler = connect_interrupt; sigemptyset(&act.sa_mask); remain = 0; then = 0; /* * Set the alarm, and maintain some idea of how long was left on any * previously set alarm. */ if (sigaction(SIGALRM, &act, &oact) == 0) { remain = alarm(GETRSRCS_CONNECT_TIME); then = time(NULL); } if ((rm = openrm(exechost, 0)) == -1) { (void)sprintf(log_buffer, "Unable to contact resmom@%s ", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* * Turn off full response. Responses will be received in the order in * which they are sent. */ fullresp(0); /* Build a list of all the resources about which we want information. 
*/ addreq(rm, "loadave"); addreq(rm, "availmem"); addreq(rm, "physmem"); addreq(rm, "ncpus"); addreq(rm, "tmpdir"); addreq(rm, "arch"); /* Get the values back from the resource monitor, and round up. */ /* Receive LOADAVE response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->loadave = atof(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive AVAILMEM response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->freemem = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive PHYSMEM response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->mem_total = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive NCPUS response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->ncpus_total = atoi(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive TMPDIR response from resource monitor. 
*/ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->tmpdir = schd_val2byte(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(tmpdir), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } /* Receive ARCH response from resource monitor. */ response = getreq_err(&local_errno, rm); if (response != NULL) { new_rsrcs->arch = schd_strdup(response); (void)free(response); } else { (void)sprintf(log_buffer, "bad return from getreq(arch), %d, %d", local_errno, errno); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); badreply = 1; goto bail; } bail: /* Disconnect from the resource monitor. */ if (rm >= 0) /* resmom handle "0" is valid in RPP. */ closerm(rm); /* And unset the alarm and handler. */ alarm(0); sigaction(SIGALRM, &oact, &act); /* Reset the old alarm, taking into account how much time has passed. */ if (remain) { DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id, remain, (time(NULL) - then))); /* How much time remains even after the time spent above? */ remain -= (time(NULL) - then); /* * Would the previous time have already expired? If so, schedule * an alarm call in 1 second (close enough, hopefully). */ if (remain < 1) remain = 1; DBPRT(("reset to %d secs\n", remain)); alarm(remain); } /* * Verify all the data came back as expected; if not, abort this * iteration of the scheduler. */ if (badreply) { (void)sprintf(log_buffer, "Got bad info from mom@%s - skipping this node", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } /* Make a copy of the hostname for the resources struct. 
*/ new_rsrcs->exechost = schd_strdup(exechost); if (new_rsrcs->exechost == NULL) { (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs", exechost); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer); DBPRT(("%s: %s\n", id, log_buffer)); free(new_rsrcs); return (NULL); } if (schd_RsrcsList == NULL) { schd_RsrcsList = new_rsrcs; /* Start the list. */ } else { for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next) /* Find the last element in the list. */ ; rptr->next = new_rsrcs; } /* Next pointer for the tail of the list points to nothing. */ new_rsrcs->next = NULL; return (new_rsrcs); }