void listJobs()
/* Report jobs running and recently finished. */
{
struct job *job;
struct dlNode *node;

pmClear(&pmIn);
pmPrintf(&pmIn, "%d running", dlCount(jobsRunning));
if (!pmSend(&pmIn, mainRudp))
    return;
for (node = jobsRunning->head; !dlEnd(node); node=node->next)
    {
    job = node->val;
    pmClear(&pmIn);
    pmPrintf(&pmIn, "%s", job->startMessage);
    if (!pmSend(&pmIn, mainRudp))
        return;
    }
pmClear(&pmIn);
pmPrintf(&pmIn, "%d recent", dlCount(jobsFinished));
if (!pmSend(&pmIn, mainRudp))
    return;
for (node = jobsFinished->head; !dlEnd(node); node=node->next)
    {
    job = node->val;
    pmClear(&pmIn);
    pmPrintf(&pmIn, "%s", job->startMessage);
    if (!pmSend(&pmIn, mainRudp))
        return;
    pmClear(&pmIn);
    pmPrintf(&pmIn, "%s", job->doneMessage);
    if (!pmSend(&pmIn, mainRudp))
        return;
    }
}
void doFetch(char *line)
/* Fetch first part of file.  Protocol is to send the
 * file one UDP packet at a time.  A zero length packet
 * indicates end of file. */
{
char *user = cloneString(nextWord(&line));
char *fileName = cloneString(nextWord(&line));
if ((user == NULL) || (fileName != NULL))
    {
    FILE *f = fopen(fileName, "r");
    pmClear(&pmIn);
    if (f == NULL)
	{
        if (user == NULL)
            user = "******";
	pmPrintf(&pmIn, "Couldn't open fetch file: \"%s\" %s for user %s",
                 fileName, strerror(errno), user);
	warn("Couldn't open fetch file: \"%s\" %s for user %s",
             fileName, strerror(errno), user);
	pmSend(&pmIn, mainRudp);
	pmClear(&pmIn);
	pmSend(&pmIn, mainRudp);
	}
    else
	{
	int size;
	for (;;)
	    {
	    size = fread(pmIn.data,1,  sizeof(pmIn.data)-1, f);
	    if (size < 0)
		{
		size = 0;
		warn("Couldn't read fetch file: \"%s\" %s",
                     fileName, strerror(errno));
		}
	    pmIn.size = size;
	    pmSend(&pmIn, mainRudp);
	    if (size == 0)
		break;
	    }
	fclose(f);
	}
    }
freez(&user);
freez(&fileName);
}
void tellManagerJobIsDone(char *managingHost, char *jobIdString, char *line)
/* Try and send message to host saying job is done. */
{
struct paraMessage pm;
bits32 ip;
if (!internetDottedQuadToIp(managingHost, &ip))
    {
    warn("%s doesn't seem to be in dotted quad form\n", managingHost);
    return;
    }
pmInit(&pm, ip, paraHubPort);
pmPrintf(&pm, "jobDone %s %s", jobIdString, line);
if (!pmSend(&pm, mainRudp))
    warn("Couldn't send message to %s to say %s is done\n", managingHost, jobIdString);
}
void doStatus()
/* Report status. */
{
pmClear(&pmIn);
pmPrintf(&pmIn, "%d of %d CPUs busy", busyProcs, maxProcs);
if (busyProcs > 0)
    {
    struct dlNode *node;
    pmPrintf(&pmIn, ". Jobs:");
    for (node = jobsRunning->head; !dlEnd(node); node = node->next)
        {
	struct job *job = node->val;
        pmPrintf(&pmIn, " %d", job->jobId);
	}
    }
pmSend(&pmIn, mainRudp);
}
Beispiel #5
0
void paraNodeStop(char *machineList)
/* Stop node server on all machines in list. */
{
struct lineFile *lf = lineFileOpen(machineList, FALSE);
char *row[1];

while (lineFileRow(lf, row))
    {
    struct rudp *ru = rudpMustOpen();
    char *name = row[0];
    struct paraMessage pm;
    ru->maxRetries = 6;
    printf("Telling %s to quit \n", name);
    pmInitFromName(&pm, name, paraNodePort);
    pmPrintf(&pm, "%s", "quit");
    pmSend(&pm, ru);
    rudpClose(&ru);
    }
}
void doResurrect(char *line, struct sockaddr_in *hubIp)
/* Send back I'm alive message */
{
struct paraMessage pm;
struct dlNode *node;
int jobsReported = 0;
pmInit(&pm, ntohl(hubIp->sin_addr.s_addr), paraHubPort);
pmPrintf(&pm, "alive %s", hostName);
for (node = jobsRunning->head; !dlEnd(node); node = node->next)
    {
    struct job *job = node->val;
    pmPrintf(&pm, " %d", job->jobId);
    ++jobsReported;
    }
for (node = jobsFinished->head; !dlEnd(node); node = node->next)
    {
    struct job *job = node->val;
    if (jobsReported >= maxProcs)
	break;
    pmPrintf(&pm, " %d", job->jobId);
    ++jobsReported;
    }
pmSend(&pm, mainRudp);
}
void doCheck(char *line, struct sockaddr_in *hubIp)
/* Send back check result - either a check in message or
 * jobDone. */
{
char *jobIdString = nextWord(&line);
if (jobIdString != NULL)
    {
    int jobId = atoi(jobIdString);
    struct job *job = findRunningJob(jobId);
    struct paraMessage pm;
    pmInit(&pm, ntohl(hubIp->sin_addr.s_addr), paraHubPort);
    if (job != NULL)
	pmPrintf(&pm, "checkIn %s %s running", hostName, jobIdString);
    else
	{
	struct job *job = findFinishedJob(jobId);
	if (job == NULL)
	    pmPrintf(&pm, "checkIn %s %s free", hostName, jobIdString);
	else
	    pmPrintf(&pm, "jobDone %s %s", jobIdString, job->doneMessage);
	}
    pmSend(&pm, mainRudp);
    }
}
void execProc(char *managingHost, char *jobIdString, char *reserved,
	char *user, char *dir, char *in, char *out, char *err, long long memLimit,
	char *exe, char **params)
/* This routine is the child process of doExec.
 * It spawns a grandchild that actually does the
 * work and waits on it.  It sends message to the
 * main message loop here when done. */
{
if ((grandChildId = forkOrDie()) == 0)
    {
    char *homeDir = "";

    /* Change to given user (if root) */
    changeUid(user, &homeDir);

    /* create output files just after becoming user so that errors in the rest
     * of this proc will go to the err file and be available via para
     * problems */
    setupProcStdio(in, out, err);

    if (chdir(dir) < 0)
        errnoAbort("can't chdir to %s", dir);
    setsid();
    // setpgid(0,0);
    umask(umaskVal); 

    struct rlimit rlim;
    rlim.rlim_cur = rlim.rlim_max = memLimit;
    if(setrlimit(RLIMIT_CORE, &rlim) < 0)
    perror("setrlimit"); 


    /* Update environment. */
        {
	struct hash *hash = environToHash(environ);
	hashUpdate(hash, "JOB_ID", jobIdString);
	hashUpdate(hash, "USER", user);
	hashUpdate(hash, "HOME", homeDir);
	hashUpdate(hash, "HOST", hostName);
	hashUpdate(hash, "PARASOL", "7");
	updatePath(hash, userPath, homeDir, sysPath);
        addEnvExtras(hash);
	environ = hashToEnviron(hash);
	freeHashAndVals(&hash);
	}

    randomSleep();	/* Sleep a random bit before executing this thing
                         * to help spread out i/o when a big batch of jobs
			 * hit idle cluster */
    execvp(exe, params);
    errnoAbort("execvp'ing %s", exe);
    }
else
    {
    /* Wait on executed job and send jobID and status back to 
     * main process. */
    int status = -1;
    int cid;
    struct paraMessage pm;
    struct rudp *ru = NULL;
    struct tms tms;
    unsigned long uTime = 0;
    unsigned long sTime = 0;

    if (grandChildId >= 0)
	{
	signal(SIGTERM, termHandler);
	cid = waitpid(grandChildId, &status, 0);
        if (cid < 0)
            errnoAbort("wait on grandchild failed");
	times(&tms);
	uTime = ticksToHundreths*tms.tms_cutime;
	sTime = ticksToHundreths*tms.tms_cstime;
	}
    ru = rudpOpen();
    if (ru != NULL)
	{
	ru->maxRetries = 20;
	pmInit(&pm, localIp, paraNodePort);
	pmPrintf(&pm, "jobDone %s %s %d %lu %lu", managingHost, 
	    jobIdString, status, uTime, sTime);
	pmSend(&pm, ru);
	rudpClose(&ru);
	}
    }
}