void listJobs() /* Report jobs running and recently finished. */ { struct job *job; struct dlNode *node; pmClear(&pmIn); pmPrintf(&pmIn, "%d running", dlCount(jobsRunning)); if (!pmSend(&pmIn, mainRudp)) return; for (node = jobsRunning->head; !dlEnd(node); node=node->next) { job = node->val; pmClear(&pmIn); pmPrintf(&pmIn, "%s", job->startMessage); if (!pmSend(&pmIn, mainRudp)) return; } pmClear(&pmIn); pmPrintf(&pmIn, "%d recent", dlCount(jobsFinished)); if (!pmSend(&pmIn, mainRudp)) return; for (node = jobsFinished->head; !dlEnd(node); node=node->next) { job = node->val; pmClear(&pmIn); pmPrintf(&pmIn, "%s", job->startMessage); if (!pmSend(&pmIn, mainRudp)) return; pmClear(&pmIn); pmPrintf(&pmIn, "%s", job->doneMessage); if (!pmSend(&pmIn, mainRudp)) return; } }
void doFetch(char *line) /* Fetch first part of file. Protocol is to send the * file one UDP packet at a time. A zero length packet * indicates end of file. */ { char *user = cloneString(nextWord(&line)); char *fileName = cloneString(nextWord(&line)); if ((user == NULL) || (fileName != NULL)) { FILE *f = fopen(fileName, "r"); pmClear(&pmIn); if (f == NULL) { if (user == NULL) user = "******"; pmPrintf(&pmIn, "Couldn't open fetch file: \"%s\" %s for user %s", fileName, strerror(errno), user); warn("Couldn't open fetch file: \"%s\" %s for user %s", fileName, strerror(errno), user); pmSend(&pmIn, mainRudp); pmClear(&pmIn); pmSend(&pmIn, mainRudp); } else { int size; for (;;) { size = fread(pmIn.data,1, sizeof(pmIn.data)-1, f); if (size < 0) { size = 0; warn("Couldn't read fetch file: \"%s\" %s", fileName, strerror(errno)); } pmIn.size = size; pmSend(&pmIn, mainRudp); if (size == 0) break; } fclose(f); } } freez(&user); freez(&fileName); }
void tellManagerJobIsDone(char *managingHost, char *jobIdString, char *line) /* Try and send message to host saying job is done. */ { struct paraMessage pm; bits32 ip; if (!internetDottedQuadToIp(managingHost, &ip)) { warn("%s doesn't seem to be in dotted quad form\n", managingHost); return; } pmInit(&pm, ip, paraHubPort); pmPrintf(&pm, "jobDone %s %s", jobIdString, line); if (!pmSend(&pm, mainRudp)) warn("Couldn't send message to %s to say %s is done\n", managingHost, jobIdString); }
void doStatus()
/* Report status in a single message: "<busyProcs> of <maxProcs> CPUs busy",
 * and, when at least one processor is busy, ". Jobs:" followed by the id of
 * every job in jobsRunning. */
{
    pmClear(&pmIn);
    pmPrintf(&pmIn, "%d of %d CPUs busy", busyProcs, maxProcs);
    if (busyProcs > 0) {
        struct dlNode *cur = jobsRunning->head;

        pmPrintf(&pmIn, ". Jobs:");
        while (!dlEnd(cur)) {
            struct job *running = cur->val;

            pmPrintf(&pmIn, " %d", running->jobId);
            cur = cur->next;
        }
    }
    pmSend(&pmIn, mainRudp);
}
void paraNodeStop(char *machineList) /* Stop node server on all machines in list. */ { struct lineFile *lf = lineFileOpen(machineList, FALSE); char *row[1]; while (lineFileRow(lf, row)) { struct rudp *ru = rudpMustOpen(); char *name = row[0]; struct paraMessage pm; ru->maxRetries = 6; printf("Telling %s to quit \n", name); pmInitFromName(&pm, name, paraNodePort); pmPrintf(&pm, "%s", "quit"); pmSend(&pm, ru); rudpClose(&ru); } }
void doResurrect(char *line, struct sockaddr_in *hubIp) /* Send back I'm alive message */ { struct paraMessage pm; struct dlNode *node; int jobsReported = 0; pmInit(&pm, ntohl(hubIp->sin_addr.s_addr), paraHubPort); pmPrintf(&pm, "alive %s", hostName); for (node = jobsRunning->head; !dlEnd(node); node = node->next) { struct job *job = node->val; pmPrintf(&pm, " %d", job->jobId); ++jobsReported; } for (node = jobsFinished->head; !dlEnd(node); node = node->next) { struct job *job = node->val; if (jobsReported >= maxProcs) break; pmPrintf(&pm, " %d", job->jobId); ++jobsReported; } pmSend(&pm, mainRudp); }
void doCheck(char *line, struct sockaddr_in *hubIp) /* Send back check result - either a check in message or * jobDone. */ { char *jobIdString = nextWord(&line); if (jobIdString != NULL) { int jobId = atoi(jobIdString); struct job *job = findRunningJob(jobId); struct paraMessage pm; pmInit(&pm, ntohl(hubIp->sin_addr.s_addr), paraHubPort); if (job != NULL) pmPrintf(&pm, "checkIn %s %s running", hostName, jobIdString); else { struct job *job = findFinishedJob(jobId); if (job == NULL) pmPrintf(&pm, "checkIn %s %s free", hostName, jobIdString); else pmPrintf(&pm, "jobDone %s %s", jobIdString, job->doneMessage); } pmSend(&pm, mainRudp); } }
void execProc(char *managingHost, char *jobIdString, char *reserved, char *user, char *dir, char *in, char *out, char *err, long long memLimit, char *exe, char **params) /* This routine is the child process of doExec. * It spawns a grandchild that actually does the * work and waits on it. It sends message to the * main message loop here when done. */ { if ((grandChildId = forkOrDie()) == 0) { char *homeDir = ""; /* Change to given user (if root) */ changeUid(user, &homeDir); /* create output files just after becoming user so that errors in the rest * of this proc will go to the err file and be available via para * problems */ setupProcStdio(in, out, err); if (chdir(dir) < 0) errnoAbort("can't chdir to %s", dir); setsid(); // setpgid(0,0); umask(umaskVal); struct rlimit rlim; rlim.rlim_cur = rlim.rlim_max = memLimit; if(setrlimit(RLIMIT_CORE, &rlim) < 0) perror("setrlimit"); /* Update environment. */ { struct hash *hash = environToHash(environ); hashUpdate(hash, "JOB_ID", jobIdString); hashUpdate(hash, "USER", user); hashUpdate(hash, "HOME", homeDir); hashUpdate(hash, "HOST", hostName); hashUpdate(hash, "PARASOL", "7"); updatePath(hash, userPath, homeDir, sysPath); addEnvExtras(hash); environ = hashToEnviron(hash); freeHashAndVals(&hash); } randomSleep(); /* Sleep a random bit before executing this thing * to help spread out i/o when a big batch of jobs * hit idle cluster */ execvp(exe, params); errnoAbort("execvp'ing %s", exe); } else { /* Wait on executed job and send jobID and status back to * main process. 
*/ int status = -1; int cid; struct paraMessage pm; struct rudp *ru = NULL; struct tms tms; unsigned long uTime = 0; unsigned long sTime = 0; if (grandChildId >= 0) { signal(SIGTERM, termHandler); cid = waitpid(grandChildId, &status, 0); if (cid < 0) errnoAbort("wait on grandchild failed"); times(&tms); uTime = ticksToHundreths*tms.tms_cutime; sTime = ticksToHundreths*tms.tms_cstime; } ru = rudpOpen(); if (ru != NULL) { ru->maxRetries = 20; pmInit(&pm, localIp, paraNodePort); pmPrintf(&pm, "jobDone %s %s %d %lu %lu", managingHost, jobIdString, status, uTime, sTime); pmSend(&pm, ru); rudpClose(&ru); } } }