/*
 * call_ftw_for_dev() resets the device count and walks /dev with ftw(),
 * which hands every pathname it finds (together with a status buffer)
 * to gdev().
 *
 * Returns normally only when the walk succeeds, in which case devl has
 * been populated.  Any other ftw() result is logged and the process
 * exits with status 1.
 */
static void
call_ftw_for_dev(void)
{
	int gdev();
	int walk_status;

	ndev = 0;
	walk_status = ftw("/dev", gdev, 17);

	/* Success: nothing more to do. */
	if (walk_status == 0) {
		return;
	}

	/* Every remaining outcome is fatal; only the log message differs. */
	if (walk_status == 1) {
		SYSLOG0(" ftw() encountered problem\n");
	} else if (walk_status == -1) {
		SYSLOG1(" ftw() failed, %s\n", strerror(errno));
	} else {
		SYSLOG1(" ftw() unexpected return, rcode=%d\n", walk_status);
	}

	exit(1);
}
/*
 * readata() tries to load the saved /dev table from psfile.
 *
 * The saved data is treated as out of date when the file cannot be
 * fstat()ed, is empty, /dev cannot be stat()ed, or the file is not
 * strictly newer than both the modification and the change time of /dev.
 * Out-of-date data is rejected unless rd_only is set, in which case the
 * old data is believed anyway.
 *
 * The file holds the entry count (ndev) followed by that many devl
 * entries.  The count comes from the file, so it is validated before it
 * is used to size the table: a non-positive count, or one whose table
 * size would overflow, is treated as unusable data rather than being
 * handed to malloc().
 *
 * Returns 1 when ndev and devl were loaded from the file, 0 when the
 * caller has to obtain the data some other way.  Whenever reading was
 * started but did not complete, ndev is left at 0 so that a partially
 * filled table is never mistaken for valid data.  Exits with status 1 if
 * the table cannot be allocated.
 */
int
readata()
{
	struct stat sbuf1, sbuf2;
	int fd;

	if ((fd = open(psfile, O_RDONLY)) == -1)
		return (0);

	if (fstat(fd, &sbuf1) < 0 || sbuf1.st_size == 0 ||
	    stat("/dev", &sbuf2) == -1 ||
	    sbuf1.st_mtime <= sbuf2.st_mtime ||
	    sbuf1.st_mtime <= sbuf2.st_ctime) {
		if (!rd_only) {		/* if read-only, believe old data */
			(void) close(fd);
			return (0);
		}
	}

	/* Read /dev data from psfile. */
	if (read_tmp_file(fd, (char *) &ndev, sizeof (ndev)) == 0)
		goto bad_data;

	/*
	 * Do not trust the count blindly: a damaged file must lead to a
	 * "no data" return, not to a bogus allocation request.
	 */
	if (ndev <= 0 || (size_t)ndev > (size_t)-1 / sizeof (*devl))
		goto bad_data;

	/* free(NULL) is a no-op, so no guard is needed. */
	free(devl);
	if ((devl = (struct devl *)malloc(ndev * sizeof (*devl))) == NULL) {
		SYSLOG1("malloc() for device table failed, %s\n",
		    strerror(errno));
		exit(1);
	}

	if (read_tmp_file(fd, (char *)devl, ndev * sizeof (*devl)) == 0)
		goto bad_data;

	(void) close(fd);
	return (1);

bad_data:
	/* Whatever ndev holds now does not describe a valid table. */
	ndev = 0;
	(void) close(fd);
	return (0);
}
void Migd_GatherLoad() { int oldAllow; int oldInput; int oldForeign; static int iteration = 0; int numWritten; int error; int status; oldAllow = curVecPtr->allowMigration; oldInput = curVecPtr->noInput; oldForeign = curVecPtr->foreignProcs; if (migd_Debug > 2) { fprintf(stderr, "Migd_GatherLoad - time %d, oldAllow %d, oldInput %d\n", time((int *) NULL), oldAllow, oldInput); } GetStats(curVecPtr->lengths, &curVecPtr->noInput, &curVecPtr->foreignProcs); curVecPtr->timestamp = time((time_t *)0); ExamineLoads(curVecPtr); if ((oldInput > migd_Parms.noInput) && (curVecPtr->noInput < migd_Parms.noInput) && !ignoreInput && !migd_NeverEvict && !refuseMigration) { Migd_Evict(TRUE); } /* * Send the new load vector to the global daemon periodically, * or if our migration status changes, or if the number of * foreign processes goes from zero to non-zero or vice-versa. * This way the global daemon can track things like the last use * of a machine by a process that won't release the host when it * finishes. */ if (iteration == 0 || (oldAllow != curVecPtr->allowMigration) || (oldForeign > 0 && curVecPtr->foreignProcs == 0) || (oldForeign == 0 && curVecPtr->foreignProcs > 0)) { if (migd_Debug > 2) { fprintf(stderr, "Notifying global server, iteration %d, oldAllow %d, newAllow %d, oldForeign %d, newForeign %d.\n", iteration, oldAllow, curVecPtr->allowMigration, oldForeign, curVecPtr->foreignProcs); } iteration = 0; /* * Get the kernel's variable determining whether to refuse * migrations. We keep rechecking periodically in case it changes. */ status = Sys_Stats(SYS_PROC_MIGRATION, SYS_PROC_MIG_GET_STATE, (Address) &migd_Parms.criteria); if (status != SUCCESS) { SYSLOG1(LOG_ERR, "Error in Sys_Stats getting migration state: %s.\n", Stat_GetMsg(status)); exit(Compat_MapCode(status)); } ParseMigStatus(); if (curVecPtr->lengths[1] >= 1.0) { struct timeval tv; struct timeval curTime; /* * The 5-minute load average is over 1. 
This could * happen if there is a long-running process but it * also seems to happen without anything running. * Sleep a short period of time to try to * keep from being in lock-step with someone else. There's * nothing too magical about the number except that it's * intended to be something that other processes are unlikely * to sleep for. */ tv.tv_sec = 0; tv.tv_usec = ((random() % 999) + 1) * 1000; ; if (migd_Debug > 2) { if (gettimeofday(&curTime, (struct timezone *) NULL) < 0) { perror("Error in gettimeofday"); exit(1); } fprintf(stderr, "Sleeping %d usec to avoid lock step, time %d.%d.\n", tv.tv_usec, curTime.tv_sec, curTime.tv_usec); } if (select(0, (int *) NULL, (int *) NULL, (int *) NULL, &tv) < 0) { if (migd_Debug > 2) { perror("select"); } } if (migd_Debug > 2) { if (gettimeofday(&curTime, (struct timezone *) NULL) < 0) { perror("Error in gettimeofday"); exit(1); } fprintf(stderr, "Time is now %d.%d.\n", curTime.tv_sec, curTime.tv_usec); } } if (migd_Debug > 3) { fprintf(stderr, "Writing vector to global daemon.\n"); } /* * OK, here's the tricky part. We don't want our write to wait * indefinitely, so we set an alarm. But just waking up won't * cause Fs_Write to return an error, so we have to longjmp. * So we set the signal handler, set the timer, and setjmp, then * after the write we reverse the process. 
*/ if (setjmp(writejmp)) { numWritten = -1; errno = EIO; } else { if ((int) signal(SIGALRM, WriteAlarm) < 0) { syslog(LOG_ERR, "Error setting signal handler: %s.\n", strerror(errno)); exit(1); } if (setitimer(ITIMER_REAL, &timeOutTimer, (struct itimerval *) NULL) == -1) { syslog(LOG_ERR, "Error setting interval timer: %s.\n", strerror(errno)); exit(1); } numWritten = write(migdGlobalDesc, (char *) curVecPtr, sizeof(Mig_LoadVector)); } error = errno; if (setitimer(ITIMER_REAL, &noTimer, (struct itimerval *) NULL) == -1) { syslog(LOG_ERR, "Error disabling interval timer: %s.\n", strerror(errno)); exit(1); } (void) signal(SIGALRM, SIG_IGN); errno = error; /* * Now we're back to where we would be if all we'd done was * write(), with errno and numWritten set to appropriate values. */ if (migd_Debug > 3) { fprintf(stderr, "Write returned value %d.\n", numWritten); } if (numWritten < 0) { if (migd_Debug > 0) { fprintf(stderr, "Error %d writing to global daemon: %s.\n", error, strerror(error)); } close(migdGlobalDesc); if (migd_Quit || ContactGlobal() < 0) { fprintf(stderr, "Exiting.\n"); exit(1); } } else if (numWritten != sizeof(Mig_LoadVector)) { SYSLOG2(LOG_WARNING, "short write to global daemon of %d/%d bytes.\n", numWritten, sizeof(Mig_LoadVector)); } iteration = 0; /* * Check on currentInfo.state in case we have to reconnect to the * global daemon or a user process reads the Mig_Info struct from * us. */ if (curVecPtr->allowMigration && currentInfo.state == MIG_HOST_ACTIVE) { currentInfo.state = MIG_HOST_IDLE; } else if (!curVecPtr->allowMigration && currentInfo.state == MIG_HOST_IDLE) { currentInfo.state = refuseMigration ? MIG_HOST_REFUSES : MIG_HOST_ACTIVE; } if (CheckMessages() >= 0) { if (migd_Debug > 0) { fprintf(stderr, "This host is being reclaimed by order of global migration daemon.\n"); } Migd_Evict(FALSE); } } iteration = (iteration + 1) % writeRate; }
static int ContactGlobal() { int sleepTime; int status; int retries; int ioctlRetries; int realErrno; int success = 0; static int firstContact = 1; /* First time we are trying to reach the global daemon? */ int t; t = time(0); if (migd_Debug > 1) { fprintf(stderr, "ContactGlobal - %s\n", ctime(&t)); } /* * Set a temporary variable to track firstContact, and reset it so * any subsequent calls have the updated value. This avoids the * need to reset it before every return statement. */ if (firstContact) { /* * First time we've been called. Set up seed for random * numbers. */ srandom(getpid()); firstContact = 0; } /* * Clean up any old descriptor. */ if (migdGlobalDesc >= 0) { Fs_EventHandlerDestroy(migdGlobalDesc); (void) close(migdGlobalDesc); } sleepTime = (random() & 07) + 1; for (retries = 1; retries <= MAX_GLOBAL_CONTACTS && !migd_Quit && !success; retries++) { migdGlobalDesc = open(migd_GlobalPdevName, O_RDWR, 0); if (migdGlobalDesc < 0) { if (migd_Debug > 2) { fprintf(stderr, "ContactGlobal - sleeping %d seconds\n", sleepTime); } sleep(sleepTime); sleepTime *= 2; migdGlobalDesc = open(migd_GlobalPdevName, O_RDWR, 0); } if (migdGlobalDesc < 0) { if (migd_Debug > 0) { fprintf(stderr, "ContactGlobal: couldn't open %s: %s\n", migd_GlobalPdevName, strerror(errno)); } /* * If errno is ENOENT, there is not currently a master, anywhere. * (When the master exits it removes the pdev.) EIO * may mean the daemon crashed. EINVAL may mean the daemon's host * crashed. We special case EIO due to a race condition * between recovery and starting daemons. */ if (errno == ENOENT || errno == EIO || errno == EINVAL) { if (retries == MAX_GLOBAL_CONTACTS - 1 && errno != ENOENT) { /* * We're getting desperate here. We can't open * the file, but we should be able to. Remove * the pdev and try one last time to create the * master, since it may be that the host running * the master has crashed and the name server * is continually returning a bad status to us. 
* We risk clobbering someone else who has successfully * opened the pdev just before us, but there's a small * window of vulnerability and by this time we're sleeping * a long time. */ (void) unlink(migd_GlobalPdevName); } if (!migd_NeverRunGlobal) { if (CreateGlobal() < 0) { return(-1); } } /* * Go to start of for loop, trying to open pdev. */ continue; } else { realErrno = errno; fprintf(stderr, "Migd_Init - Unable to contact master of global pdev: %s\n", strerror(errno)); errno = realErrno; return(-1); } } else { /* * We've successfully opened the pdev. * Try to tell the global master that we're a daemon. It may say * DEV_BUSY, which means that there is already a daemon. In that * case, it tells the other daemon to go away, and we will keep * trying. In some cases we may get an error doing the ioctl, * such as a stale handle, in which case we close the file and * go to the top again. */ for (ioctlRetries = 1; ioctlRetries <= MAX_GLOBAL_CONTACTS; ioctlRetries++) { status = Fs_IOControl(migdGlobalDesc, IOC_MIG_DAEMON, sizeof(Mig_Info), (char *) ¤tInfo, 0, (char *) NULL); if (status == DEV_BUSY) { if (migd_Debug > 0) { fprintf(stderr, "ContactGlobal - ioctl returned busy.\n"); } sleepTime = ((random() & 07) + 1) * ioctlRetries; if (migd_Debug > 2) { fprintf(stderr, "ContactGlobal - sleeping %d seconds\n", sleepTime); } sleep(sleepTime); } else { /* * An error we can't deal with, or SUCCESS. */ break; } } if (status != SUCCESS) { SYSLOG1(LOG_ERR, "ContactGlobal: warning: error during ioctl to global master: %s\n", Stat_GetMsg(status)); errno = Compat_MapCode(status); close(migdGlobalDesc); } else { /* * We did it! Break out of the inner for loop, and the * success flag will break us out of the outer loop. 
*/ success = 1; break; } } } if (!success) { realErrno = errno; SYSLOG0(LOG_ERR, "unable to contact master; giving up.\n"); errno = realErrno; return(-1); } #ifdef FAST_SELECT Fs_EventHandlerCreate(migdGlobalDesc, FS_READ|FS_EXCEPTION, HandleException, (ClientData) NULL); #endif /* FAST_SELECT */ if (migd_Debug > 1) { fprintf(stderr, "ContactGlobal - completed successfully\n"); } return (0); }