/*
 * perform_communications_with_retry()
 *
 * Runs do_mom() against hostname:MOMPort using the file-level CmdIndex and
 * retries when the call fails (negative return code).
 *
 * At most five attempts are made, with a one second pause between attempts.
 * The negated return code is compared against a set of errno values
 * (EACCES, EPERM, ECONNREFUSED, ENETUNREACH, EFAULT, EAFNOSUPPORT); when it
 * matches one of them the function gives up immediately.
 *
 * @param hostname   (I) host handed to do_mom()
 * @param MOMPort    (I) port handed to do_mom()
 * @param fail_count (I/O) incremented once for every failed attempt
 *                   (not once per host); ignored when NULL
 *
 * @return the return code of the last do_mom() call: >= 0 on success,
 *         negative on failure.
 */
int perform_communications_with_retry(

  char *hostname,
  int   MOMPort,
  int  *fail_count)

{
  const int max_attempts = 5;
  int       attempt;
  int       rc = PBSE_NONE;

  for (attempt = 0; attempt < max_attempts; attempt++)
    {
    rc = do_mom(hostname, MOMPort, CmdIndex);

    if (rc >= 0)
      break;

    if (fail_count != NULL)
      *fail_count = *fail_count + 1;

    /* these failures are treated as permanent - retrying is pointless */
    switch (-1 * rc)
      {
      case EACCES:
      case EPERM:
      case ECONNREFUSED:
      case ENETUNREACH:
      case EFAULT:
      case EAFNOSUPPORT:

        return(rc);

      default:

        break;
      }

    /* only announce (and wait for) a retry when one will actually happen */
    if (attempt + 1 >= max_attempts)
      break;

    fprintf(stdout, "attempting command again\n");

    sleep(1);
    }

  return(rc);
} /* END perform_communications_with_retry() */
int main( int ArgC, /* I */ char **ArgV) /* I */ { const char *OptString = "c:Cd:f:h:p:q:r:sv"; char HostList[65536]; char *HPtr; int c; int HostCount; int FailCount; /* initialize */ HostList[0] = '\0'; ConfigBuf[0] = '\0'; if (getuid() != 0) { fprintf(stderr, "ERROR: must be root to run this command\n"); exit(EXIT_FAILURE); } while ((c = getopt(ArgC, ArgV, OptString)) != EOF) { switch (c) { case 'c': /* clear stale job */ JPtr = optarg; CmdIndex = momClear; break; case 'C': /* force cycle */ CmdIndex = momQuery; Query[QueryI] = strdup("cycle"); QueryI++; break; case 'd': /* diagnose */ /* FORMAT: momctl -d<X> */ CmdIndex = momQuery; if ((Query[QueryI] = calloc(strlen(DiagPtr) + 3, sizeof(char))) == NULL) { fprintf(stderr,"ERROR: could not calloc %d bytes!\n", (int)strlen(DiagPtr) + 3); exit(EXIT_FAILURE); } if (optarg == NULL) { strncpy(Query[QueryI],DiagPtr,strlen(DiagPtr)); } else { snprintf(Query[QueryI],strlen(DiagPtr) + 2,"%s%s", DiagPtr, optarg); } QueryI++; break; case 'f': { int rc; FILE *fp; long size; if ((fp = fopen(optarg, "r")) == NULL) { fprintf(stderr, "ERROR: cannot open file '%s', errno: %d (%s)\n", optarg, errno, strerror(errno)); exit(EXIT_FAILURE); } rc = fread(HostList, sizeof(HostList), 1, fp); if ((rc == 0) && (!feof(fp))) { fprintf(stderr, "ERROR: cannot read file '%s', errno: %d (%s)\n", optarg, errno, strerror(errno)); exit(EXIT_FAILURE); } size = ftell(fp); HostList[MIN(size,(long)sizeof(HostList) - 1)] = '\0'; fclose(fp); } /* END BLOCK */ break; case 'h': /* connect to specified host */ strncpy(HostList,optarg,sizeof(HostList)); break; case 'p': /* port */ if (optarg == NULL) MCShowUsage("port not specified"); MOMPort = (int)strtol(optarg, NULL, 10); if (MOMPort == 0) MCShowUsage("invalid port specified"); break; case 'q': /* query resources */ if (optarg == NULL) { MCShowUsage("query not specified"); Query[QueryI] = strdup(DiagPtr); } else { Query[QueryI] = strdup(optarg); } QueryI++; CmdIndex = momQuery; break; case 'r': /* 
reconfigure */ { CmdIndex = momReconfig; /* NOTE: specify remote file to load -> 'fname' */ /* specify local file to stage -> 'LOCAL:fname' */ if (optarg == NULL) MCShowUsage("file not specified"); if (!strncmp(optarg, "LOCAL:", strlen("LOCAL:"))) { FILE *fp; int size; int rc; char *ptr; char *cptr; strcpy(ConfigBuf, "CONFIG:"); cptr = ConfigBuf + strlen(ConfigBuf); ptr = optarg + strlen("LOCAL:"); if ((fp = fopen(ptr, "r")) == NULL) { fprintf(stderr, "ERROR: cannot open file '%s', errno: %d (%s)\n", optarg, errno, strerror(errno)); exit(EXIT_FAILURE); } rc = fread(cptr, sizeof(ConfigBuf) - strlen(ConfigBuf), 1, fp); if ((rc == 0) && (!feof(fp))) { fprintf(stderr, "ERROR: cannot read file '%s', errno: %d (%s)\n", optarg, errno, strerror(errno)); exit(EXIT_FAILURE); } size = ftell(fp); ConfigBuf[MIN(size + strlen("CONFIG:"),sizeof(ConfigBuf) - 1)] = '\0'; fclose(fp); } else { strncpy(ConfigBuf, optarg, sizeof(ConfigBuf)); } } /* END (case 'r') */ break; case 's': /* shutdown */ CmdIndex = momShutdown; break; case 'v': /* report verbose logging */ IsVerbose = TRUE; break; } /* END switch (c) */ } /* END while (c = getopt()) */ if (CmdIndex == momNONE) { MCShowUsage("no command specified"); } if (HostList[0] == '\0') strcpy(HostList, LocalHost); HPtr = strtok(HostList, ", \t\n"); HostCount = 0; FailCount = 0; /* at this point, all args processing and setup is completed ... * ... 
now we run through each comma-delimited word in HPtr */ while (HPtr != NULL) { if ((*HPtr == ':') && (*(HPtr + 1) != '\0')) { /* finds nodes with this property */ int con; char *def_server, *pserver, *servername; struct batch_status *bstatus, *pbstat; struct attrl *nodeattrs; def_server = pbs_default(); if ((pserver = strchr(HPtr,'@')) != NULL) { *pserver = '\0'; servername = pserver + 1; } else { servername = def_server; } con = pbs_connect(servername); if (con < 0) { fprintf(stderr,"failed to connect to pbs_server:%s\n", servername); exit(EXIT_FAILURE); } /* get a batch_status entry for each node in ":property" */ bstatus = pbs_statnode(con,HPtr,NULL,NULL); if (bstatus != NULL) { for (pbstat = bstatus; pbstat != NULL; pbstat = pbstat->next) { /* check state first, only do_mom() if not down */ for (nodeattrs = pbstat->attribs; nodeattrs != NULL; nodeattrs = nodeattrs->next) { if (!strcmp(nodeattrs->name, ATTR_NODE_state)) { if (!strstr(nodeattrs->value, ND_down)) { do_mom(pbstat->name, MOMPort, CmdIndex) >= 0 ? HostCount++ : FailCount++; } else { fprintf(stderr,"%12s: skipping down node\n", pbstat->name); } break; } /* END if (attrib name eq state) */ } /* END for (nodeattrs) */ } /* END for (pbstat) */ pbs_statfree(bstatus); } /* END if (bstatus != NULL) */ else { fprintf(stderr,"no nodes found in %s on %s\n", HPtr, servername); } pbs_disconnect(con); if (pserver != NULL) *pserver = '@'; } else { do_mom(HPtr, MOMPort, CmdIndex) >= 0 ? HostCount++ : FailCount++; } /* END if (*HPtr == ':') */ HPtr = strtok(NULL, ", \t\n"); } /* END while (HPtr != NULL) */ if (IsVerbose == TRUE) { fprintf(stdout, "Node Summary: %d Successful %d Failed\n", HostCount, FailCount); } /* SUCCESS */ exit(EXIT_SUCCESS); } /* END main() */