struct rudp *rudpOpenBound(struct sockaddr_in *sai) /* Open up a rudp socket bound to a particular port and address. * Use this rather than rudpOpen if you want to wait for * messages at a specific address in a server or the like. */ { struct rudp *ru = rudpOpen(); if (ru != NULL) { if (bind(ru->socket, (struct sockaddr *)sai, sizeof(*sai)) < 0) { warn("Couldn't bind rudp socket: %s", strerror(errno)); rudpClose(&ru); } } return ru; }
void paraNodeStop(char *machineList) /* Stop node server on all machines in list. */ { struct lineFile *lf = lineFileOpen(machineList, FALSE); char *row[1]; while (lineFileRow(lf, row)) { struct rudp *ru = rudpMustOpen(); char *name = row[0]; struct paraMessage pm; ru->maxRetries = 6; printf("Telling %s to quit \n", name); pmInitFromName(&pm, name, paraNodePort); pmPrintf(&pm, "%s", "quit"); pmSend(&pm, ru); rudpClose(&ru); } }
void paraNode() /* paraNode - a net server. */ { char *line; char *command; struct sockaddr_in sai; /* We have to know who we are... */ hostName = getMachine(); initRandom(); getTicksToHundreths(); /* log init */ if (optionExists("log")) logOpenFile("paraNode", optionVal("log", NULL)); else logOpenSyslog("paraNode", optionVal("logFacility", NULL)); logSetMinPriority(optionVal("logMinPriority", "info")); logInfo("starting paraNode on %s", hostName); /* Make job lists. */ jobsRunning = newDlList(); jobsFinished = newDlList(); /* Set up socket and self to listen to it. */ ZeroVar(&sai); sai.sin_family = AF_INET; sai.sin_port = htons(paraNodePort); sai.sin_addr.s_addr = INADDR_ANY; mainRudp = rudpMustOpenBound(&sai); mainRudp->maxRetries = 12; /* Event loop. */ findNow(); for (;;) { /* Get next incoming message and optionally check to make * sure that it's from a host we trust, and check signature * on first bit of incoming data. */ if (pmReceive(&pmIn, mainRudp)) { findNow(); if (hubName == NULL || ntohl(pmIn.ipAddress.sin_addr.s_addr) == hubIp || ntohl(pmIn.ipAddress.sin_addr.s_addr) == localIp) { /* Host and signature look ok, read a string and * parse out first word as command. */ line = pmIn.data; logDebug("message from %s: \"%s\"", paraFormatIp(ntohl(pmIn.ipAddress.sin_addr.s_addr)), line); command = nextWord(&line); if (command != NULL) { if (sameString("quit", command)) break; else if (sameString("run", command)) doRun(line, &pmIn.ipAddress); else if (sameString("jobDone", command)) jobDone(line); else if (sameString("status", command)) doStatus(); else if (sameString("kill", command)) doKill(line); else if (sameString("check", command)) doCheck(line, &pmIn.ipAddress); else if (sameString("resurrect", command)) doResurrect(line, &pmIn.ipAddress); else if (sameString("listJobs", command)) listJobs(); else if (sameString("fetch", command)) doFetch(line); else logWarn("invalid command: \"%s\"", command); } logDebug("done command"); } else { logWarn("command from unauthorized host %s", paraFormatIp(ntohl(pmIn.ipAddress.sin_addr.s_addr))); } } } rudpClose(&mainRudp); }
void execProc(char *managingHost, char *jobIdString, char *reserved, char *user, char *dir, char *in, char *out, char *err, long long memLimit, char *exe, char **params) /* This routine is the child process of doExec. * It spawns a grandchild that actually does the * work and waits on it. It sends message to the * main message loop here when done. */ { if ((grandChildId = forkOrDie()) == 0) { char *homeDir = ""; /* Change to given user (if root) */ changeUid(user, &homeDir); /* create output files just after becoming user so that errors in the rest * of this proc will go to the err file and be available via para * problems */ setupProcStdio(in, out, err); if (chdir(dir) < 0) errnoAbort("can't chdir to %s", dir); setsid(); // setpgid(0,0); umask(umaskVal); struct rlimit rlim; rlim.rlim_cur = rlim.rlim_max = memLimit; if(setrlimit(RLIMIT_CORE, &rlim) < 0) perror("setrlimit"); /* Update environment. */ { struct hash *hash = environToHash(environ); hashUpdate(hash, "JOB_ID", jobIdString); hashUpdate(hash, "USER", user); hashUpdate(hash, "HOME", homeDir); hashUpdate(hash, "HOST", hostName); hashUpdate(hash, "PARASOL", "7"); updatePath(hash, userPath, homeDir, sysPath); addEnvExtras(hash); environ = hashToEnviron(hash); freeHashAndVals(&hash); } randomSleep(); /* Sleep a random bit before executing this thing * to help spread out i/o when a big batch of jobs * hit idle cluster */ execvp(exe, params); errnoAbort("execvp'ing %s", exe); } else { /* Wait on executed job and send jobID and status back to * main process. */ int status = -1; int cid; struct paraMessage pm; struct rudp *ru = NULL; struct tms tms; unsigned long uTime = 0; unsigned long sTime = 0; if (grandChildId >= 0) { signal(SIGTERM, termHandler); cid = waitpid(grandChildId, &status, 0); if (cid < 0) errnoAbort("wait on grandchild failed"); times(&tms); uTime = ticksToHundreths*tms.tms_cutime; sTime = ticksToHundreths*tms.tms_cstime; } ru = rudpOpen(); if (ru != NULL) { ru->maxRetries = 20; pmInit(&pm, localIp, paraNodePort); pmPrintf(&pm, "jobDone %s %s %d %lu %lu", managingHost, jobIdString, status, uTime, sTime); pmSend(&pm, ru); rudpClose(&ru); } } }