/**
 * Send a command to a sync server and wait for its response, retrying
 * after communication failures.
 *
 * @param[in]  state  pointer to sync client handle
 * @param[in]  com    command object
 * @param[out] res    response object
 *
 * @return operation status
 *    @retval SYNC_OK           success
 *    @retval SYNC_COM_ERROR    communications error
 *    @retval SYNC_BAD_COMMAND  server did not recognize command code
 *
 * @note Any other code produced by SYNC_ask_internal() is handed back to
 *       the caller unchanged.  When the final result is SYNC_COM_ERROR the
 *       handle is flagged with fatal_error and all later calls fail
 *       immediately with SYNC_COM_ERROR.
 *
 * @note This routine only does error processing and retry; the low-level
 *       exchange with the server is done by SYNC_ask_internal().
 *
 * @see SYNC_ask_internal
 */
afs_int32
SYNC_ask(SYNC_client_state * state, SYNC_command * com, SYNC_response * res)
{
    int attempt;
    afs_uint32 current, deadline, rc = SYNC_OK;

    /* a previous fatal error disables the circuit */
    if (state->fatal_error) {
        return SYNC_COM_ERROR;
    }

    /* connect lazily; failure to connect is fatal for this handle */
    if (state->fd == -1) {
        SYNC_connect(state);
        if (state->fd == -1) {
            state->fatal_error = 1;
            return SYNC_COM_ERROR;
        }
    }

#ifdef AFS_DEMAND_ATTACH_FS
    com->hdr.flags |= SYNC_FLAG_DAFS_EXTENSIONS;
#endif

    current = FT_ApproxTime();
    deadline = current + state->hard_timeout;

    /* bounded both by the retry count and by the hard timeout */
    attempt = 0;
    while ((attempt <= state->retry_limit) && (current <= deadline)) {
        rc = SYNC_ask_internal(state, com, res);

        if (rc == SYNC_OK) {
            break;
        }

        if (rc == SYNC_BAD_COMMAND) {
            Log("SYNC_ask: protocol mismatch on circuit '%s'; make sure "
                "fileserver, volserver, salvageserver and salvager are same "
                "version\n", state->proto_name);
            break;
        }

        if (!((rc == SYNC_COM_ERROR) && (attempt < state->retry_limit))) {
            /*
             * either retries are exhausted, or this is an unknown
             * (probably protocol-specific) response code; pass it up to
             * the caller, and let them deal with it
             */
            break;
        }

        Log("SYNC_ask: protocol communications failure on circuit '%s'; "
            "attempting reconnect to server\n", state->proto_name);
        SYNC_reconnect(state);

        /* go around again */
        attempt++;
        current = FT_ApproxTime();
    }

    if (rc == SYNC_COM_ERROR) {
        Log("SYNC_ask: fatal protocol error on circuit '%s'; disabling sync "
            "protocol until next server restart\n", state->proto_name);
        state->fatal_error = 1;
    }

    return rc;
}
/*
 * Run at creation or after process exit.  Figures out if we're all done
 * (for a one-shot run) or when we should run again, and arms the bnode
 * timeout accordingly.  The computation of *when* to run next is made in
 * the procexit and/or create procs; this routine only schedules the wait.
 *
 * Returns 0 on success, or the error code from bnode_NewProc() when a
 * one-shot job could not be started.
 *
 * The generic bnode routines take a struct bnode *, so the cron bnode is
 * handed to them through an explicit cast rather than as a
 * struct cronbnode *, which is an incompatible pointer type.
 */
int
ScheduleCronBnode(register struct cronbnode *abnode)
{
    struct bnode *bn = (struct bnode *)abnode;
    struct bnode_proc *tp;
    afs_int32 code;
    afs_int32 delay;

    /* If this proc is shutdown, tell bproc() to no longer run this job */
    if (abnode->b.goal == BSTAT_SHUTDOWN) {
        bnode_SetTimeout(bn, 0);
        return 0;
    }

    /* otherwise we're supposed to be running, figure out when */
    if (abnode->when == 0) {
        /* one shot */
        if (abnode->everRun) {
            /* once is enough */
            bnode_Delete(bn);
            return 0;
        }
        /* otherwise start it */
        if (!abnode->running) {
            /* start up */
            abnode->lastStart = FT_ApproxTime();
            code = bnode_NewProc(bn, abnode->command, NULL, &tp);
            if (code) {
                bozo_Log("cron bnode %s failed to start (code %d)\n",
                         abnode->b.name, code);
                return code;
            }
            abnode->everRun = 1;
            abnode->running = 1;
            abnode->proc = tp;
        }
        return 0;
    }

    /* run periodically: nothing to schedule while a run is in progress */
    if (abnode->running)
        return 0;

    /* seconds until the next start; never arm a timeout shorter than 1s */
    delay = abnode->when - FT_ApproxTime();
    if (delay < 1)
        delay = 1;
    bnode_SetTimeout(bn, delay);
    return 0;
}
static void * SlowCall(void * rock) { struct rx_connection *conn = rock; u_long ntime; u_long now; long temp_rc; #ifdef AFS_PTHREAD_ENV pthread_mutex_lock(&slowCallLock); #endif slowCallCode = RXKST_PROCESSRUNNING; #ifdef AFS_PTHREAD_ENV pthread_cond_signal(&slowCallCV); #else LWP_NoYieldSignal(&slowCallCode); #endif slowCallCode = RXKST_Slow(conn, 1, &ntime); if (!slowCallCode) { now = FT_ApproxTime(); if ((ntime < now - maxSkew) || (ntime > now + maxSkew)) slowCallCode = RXKST_TIMESKEW; } temp_rc = slowCallCode; #ifdef AFS_PTHREAD_ENV pthread_cond_signal(&slowCallCV); pthread_mutex_unlock(&slowCallLock); #else LWP_NoYieldSignal(&slowCallCode); #endif return (void *)(intptr_t)temp_rc; }
/*
 * Create a new transaction for volume avol on partition apart and link it
 * at the head of allTrans.  The transaction is returned with refCount 1.
 *
 * Returns a null pointer if a transaction for the same volume/partition
 * already exists (volume busy), or if memory for the new transaction could
 * not be allocated.
 */
struct volser_trans *
NewTrans(afs_uint32 avol, afs_int32 apart)
{
    /* set volid, next, partition */
    struct volser_trans *tt, *newtt;
    struct timeval tp;
    struct timezone tzp;

    /* allocate before taking the lock */
    newtt = (struct volser_trans *)malloc(sizeof(struct volser_trans));
    if (newtt == NULL) {
        /* out of memory: never touch the list or dereference the pointer */
        return (struct volser_trans *)0;
    }

    VTRANS_LOCK;
    /* don't allow the same volume to be attached twice */
    for (tt = allTrans; tt; tt = tt->next) {
        if ((tt->volid == avol) && (tt->partition == apart)) {
            VTRANS_UNLOCK;
            free(newtt);
            return (struct volser_trans *)0;    /* volume busy */
        }
    }

    tt = newtt;
    memset(tt, 0, sizeof(struct volser_trans));
    tt->volid = avol;
    tt->partition = apart;
    tt->refCount = 1;
    tt->rxCallPtr = (struct rx_call *)0;
    strcpy(tt->lastProcName, "");
    gettimeofday(&tp, &tzp);
    tt->creationTime = tp.tv_sec;
    tt->time = FT_ApproxTime();
    tt->tid = transCounter++;
    tt->next = allTrans;
    VTRANS_OBJ_LOCK_INIT(tt);
    allTrans = tt;
    VTRANS_UNLOCK;
    return tt;
}
afs_int32 GCTrans(void) { register struct volser_trans *tt, *nt; afs_int32 now; now = FT_ApproxTime(); VTRANS_LOCK; for (tt = allTrans; tt; tt = nt) { nt = tt->next; /* remember in case we zap it */ if (tt->time + OLDTRANSWARN < now) { Log("trans %u on volume %u %s than %d seconds\n", tt->tid, tt->volid, ((tt->refCount > 0) ? "is older" : "has been idle for more"), (((now - tt->time) / GCWAKEUP) * GCWAKEUP)); } if (tt->refCount > 0) continue; if (tt->time + OLDTRANSTIME < now) { Log("trans %u on volume %u has timed out\n", tt->tid, tt->volid); tt->refCount++; /* we're using it now */ DeleteTrans(tt, 0); /* drops refCount or deletes it */ GCDeletes++; } } VTRANS_UNLOCK; return 0; }
/*
 * Move an ez bnode towards the requested status.
 *
 * BSTAT_NORMAL on a stopped bnode starts its command; BSTAT_SHUTDOWN on a
 * running bnode sends SIGTERM and arms an SDTIME timeout.  Any other
 * combination is a no-op.
 *
 * Returns BZBUSY while a shutdown is still pending, the bnode_NewProc()
 * error if the process could not be started, and 0 otherwise.
 */
static int
ez_setstat(struct bnode *bn, afs_int32 astatus)
{
    struct ezbnode *ez = (struct ezbnode *)bn;
    struct bnode_proc *started;
    afs_int32 rc;

    if (ez->waitingForShutdown)
        return BZBUSY;

    if (astatus == BSTAT_NORMAL && !ez->running) {
        /* start up */
        ez->lastStart = FT_ApproxTime();
        rc = bnode_NewProc(bn, ez->command, NULL, &started);
        if (rc)
            return rc;
        ez->running = 1;
        ez->proc = started;
    } else if (astatus == BSTAT_SHUTDOWN && ez->running) {
        /* start shutdown */
        bnode_StopProc(ez->proc, SIGTERM);
        ez->waitingForShutdown = 1;
        bnode_SetTimeout(bn, SDTIME);
    }

    return 0;
}
/*!
 * \brief Initialize the vote module; called once per run.
 *
 * Stamps ubik_lastYesTime with the current approximate time, so that the
 * module behaves as if a vote for another host had just been cast at
 * startup.
 *
 * \return always 0
 */
int
uvote_Init(void)
{
    /* we just restarted: act as though we voted for someone else now */
    ubik_lastYesTime = FT_ApproxTime();

    return 0;
}
static int fs_stateFillHeader(struct fs_state_header * hdr) { hdr->stamp.magic = FS_STATE_MAGIC; hdr->stamp.version = FS_STATE_VERSION; #ifdef SYS_NAME_ID hdr->sys_name = SYS_NAME_ID; #else hdr->sys_name = 0xFFFFFFFF; #endif hdr->timestamp = FT_ApproxTime(); hdr->server_uuid = FS_HostUUID; hdr->valid = 1; #ifdef WORDS_BIGENDIAN hdr->endianness = 1; #else hdr->endianness = 0; #endif #ifdef FS_STATS_DETAILED hdr->stats_detailed = 1; #else hdr->stats_detailed = 0; #endif if (strlcpy(hdr->server_version_string, cml_version_number, sizeof(hdr->server_version_string)) >= sizeof(hdr->server_version_string)) { ViceLog(0, ("fs_stateFillHeader: WARNING -- cml_version_number field truncated\n")); } return 0; }
/*! * \brief Handle basic network debug command. This is the global state dumper. */ afs_int32 SVOTE_Debug(struct rx_call * rxcall, struct ubik_debug * aparm) { int i; /* fill in the basic debug structure. Note the the RPC protocol transfers, * integers in host order. */ aparm->now = FT_ApproxTime(); aparm->lastYesTime = vote_globals.ubik_lastYesTime; aparm->lastYesHost = ntohl(vote_globals.lastYesHost); aparm->lastYesState = vote_globals.lastYesState; aparm->lastYesClaim = vote_globals.lastYesClaim; aparm->lowestHost = ntohl(vote_globals.lowestHost); aparm->lowestTime = vote_globals.lowestTime; aparm->syncHost = ntohl(vote_globals.syncHost); aparm->syncTime = vote_globals.syncTime; memcpy(&aparm->syncVersion, &vote_globals.ubik_dbVersion, sizeof(struct ubik_version)); memcpy(&aparm->syncTid, &vote_globals.ubik_dbTid, sizeof(struct ubik_tid)); /* fill in all interface addresses of myself in hostbyte order */ for (i = 0; i < UBIK_MAX_INTERFACE_ADDR; i++) aparm->interfaceAddr[i] = ntohl(ubik_host[i]); aparm->amSyncSite = beacon_globals.ubik_amSyncSite; ubeacon_Debug(aparm); udisk_Debug(aparm); ulock_Debug(aparm); /* Get the recovery state. The label of the database may not have * been written yet but set the flag so udebug behavior remains. * Defect 9477. */ aparm->recoveryState = urecovery_state; if ((urecovery_state & UBIK_RECSYNCSITE) && (urecovery_state & UBIK_RECFOUNDDB) && (urecovery_state & UBIK_RECHAVEDB)) { aparm->recoveryState |= UBIK_RECLABELDB; } aparm->activeWrite = (ubik_dbase->flags & DBWRITING); aparm->tidCounter = ubik_dbase->tidCounter; if (ubik_currentTrans) { aparm->currentTrans = 1; if (ubik_currentTrans->type == UBIK_WRITETRANS) aparm->writeTrans = 1; else aparm->writeTrans = 0; } else { aparm->currentTrans = 0; } aparm->epochTime = version_globals.ubik_epochTime; return 0; }
/*! * \brief Handle basic network debug command. This is the global state dumper. */ afs_int32 SVOTE_DebugOld(struct rx_call * rxcall, struct ubik_debug_old * aparm) { /* fill in the basic debug structure. Note the the RPC protocol transfers, * integers in host order. */ aparm->now = FT_ApproxTime(); aparm->lastYesTime = ubik_lastYesTime; aparm->lastYesHost = ntohl(lastYesHost); aparm->lastYesState = lastYesState; aparm->lastYesClaim = lastYesClaim; aparm->lowestHost = ntohl(lowestHost); aparm->lowestTime = lowestTime; aparm->syncHost = ntohl(syncHost); aparm->syncTime = syncTime; aparm->amSyncSite = ubik_amSyncSite; ubeacon_Debug((ubik_debug *)aparm); udisk_Debug((ubik_debug *)aparm); ulock_Debug((ubik_debug *)aparm); /* Get the recovery state. The label of the database may not have * been written yet but set the flag so udebug behavior remains. * Defect 9477. */ aparm->recoveryState = urecovery_state; if ((urecovery_state & UBIK_RECSYNCSITE) && (urecovery_state & UBIK_RECFOUNDDB) && (urecovery_state & UBIK_RECHAVEDB)) { aparm->recoveryState |= UBIK_RECLABELDB; } memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version)); memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid)); aparm->activeWrite = (ubik_dbase->flags & DBWRITING); aparm->tidCounter = ubik_dbase->tidCounter; if (ubik_currentTrans) { aparm->currentTrans = 1; if (ubik_currentTrans->type == UBIK_WRITETRANS) aparm->writeTrans = 1; else aparm->writeTrans = 0; } else { aparm->currentTrans = 0; } aparm->epochTime = ubik_epochTime; return 0; }
static int fs_stateLoadDump(struct fs_dump_state * state) { afs_uint64 z; int fd, ret = 0; struct afs_stat status; afs_int32 now = FT_ApproxTime(); ZeroInt64(z); if ((fd = afs_open(state->fn, O_RDWR)) == -1 || (afs_fstat(fd, &status) == -1)) { ViceLog(0, ("fs_stateLoadDump: failed to load state dump file '%s'\n", state->fn)); ret = 1; goto done; } state->fd = fd; state->mode = FS_STATE_LOAD_MODE; state->file_len = status.st_size; #ifdef FS_STATE_USE_MMAP if (fs_stateMapFile(state)) { ViceLog(0, ("fs_stateLoadDump: failed to memory map state dump file '%s'\n", state->fn)); ret = 1; goto done; } #endif if (fs_stateReadHeader(state, &z, state->hdr, sizeof(struct fs_state_header))) { ViceLog(0, ("fs_stateLoadDump: failed to read header from dump file '%s'\n", state->fn)); ret = 1; goto done; } /* check the validity of the header */ if (fs_stateCheckHeader(state->hdr)) { ViceLog(1, ("fs_stateLoadDump: header failed validity checks; not restoring '%s'\n", state->fn)); ret = 1; goto done; } if ((state->hdr->timestamp + HOST_STATE_VALID_WINDOW) >= now) { state->flags.do_host_restore = 1; } else { ViceLog(0, ("fs_stateLoadDump: warning: dump is too old for host and callback restore; skipping those steps\n")); } done: return ret; }
/*
 * Timeout handler for cron bnodes.
 *
 * While the job is running, a timeout only matters when a shutdown is
 * pending: the process is sent SIGKILL and the timer is turned off.
 * While it is not running, the job is started if its scheduled time has
 * arrived; otherwise the timeout is re-armed for the remaining interval.
 *
 * Returns 0, or the bnode_NewProc() error if the job failed to start.
 */
static int
cron_timeout(struct bnode *bn)
{
    struct cronbnode *cron = (struct cronbnode *)bn;
    struct bnode_proc *started;
    afs_int32 remaining;
    afs_int32 rc;

    if (cron->running) {
        /* without a pending shutdown this activation is spurious */
        if (cron->waitingForShutdown) {
            /* send kill and turn off timer */
            bnode_StopProc(cron->proc, SIGKILL);
            cron->killSent = 1;
            bnode_SetTimeout(bn, 0);
        }
        return 0;
    }

    if (cron->when == 0)
        return 0;               /* spurious timeout activation */

    if (FT_ApproxTime() < cron->when) {
        /* woke up too early, try again */
        remaining = cron->when - FT_ApproxTime();
        if (remaining < 1)
            remaining = 1;
        bnode_SetTimeout(bn, remaining);
        return 0;
    }

    /* not running and it is time: start it */
    cron->lastStart = FT_ApproxTime();
    bnode_SetTimeout(bn, 0);
    rc = bnode_NewProc(bn, cron->command, NULL, &started);
    if (rc) {
        bozo_Log("cron failed to start bnode %s (code %d)\n",
                 cron->b.name, rc);
        return rc;
    }
    cron->everRun = 1;
    cron->running = 1;
    cron->proc = started;
    return 0;
}
/*
 * Set or clear a bnode's periodic wakeup.
 *
 * A non-zero atimeout records the next wakeup time (now + atimeout) and
 * the period, marks the bnode as needing a timeout and calls
 * IOMGR_Cancel() on bproc_pid.  A zero atimeout just clears the
 * needs-timeout flag.  Always returns 0.
 */
int
bnode_SetTimeout(struct bnode *abnode, afs_int32 atimeout)
{
    if (atimeout == 0) {
        abnode->flags &= ~BNODE_NEEDTIMEOUT;
        return 0;
    }

    abnode->nextTimeout = FT_ApproxTime() + atimeout;
    abnode->flags |= BNODE_NEEDTIMEOUT;
    abnode->period = atimeout;
    IOMGR_Cancel(bproc_pid);

    return 0;
}
/*! * \brief Decide if we should try to become sync site. * * The basic rule is that we * don't run if there is a valid sync site and it ain't us (we have to run if * it is us, in order to keep our votes). If there is no sync site, then we * want to run if we're the lowest numbered host running, otherwise we defer to * the lowest host. However, if the lowest host hasn't been heard from for a * while, then we start running again, in case he crashed. * * \return true if we should run, and false otherwise. */ int uvote_ShouldIRun(void) { afs_int32 now; now = FT_ApproxTime(); if (BIGTIME + ubik_lastYesTime < now) return 1; /* no valid guy even trying */ if (lastYesState && lastYesHost != ubik_host[0]) return 0; /* other guy is sync site, leave him alone */ if (ntohl((afs_uint32) lastYesHost) < ntohl((afs_uint32) ubik_host[0])) return 0; /* if someone is valid and better than us, don't run */ /* otherwise we should run */ return 1; }
int bnode_NewProc(struct bnode *abnode, char *aexecString, char *coreName, struct bnode_proc **aproc) { struct bnode_token *tlist, *tt; afs_int32 code; struct bnode_proc *tp; pid_t cpid; char *argv[MAXVARGS]; int i; code = bnode_ParseLine(aexecString, &tlist); /* try parsing first */ if (code) return code; tp = (struct bnode_proc *)malloc(sizeof(struct bnode_proc)); memset(tp, 0, sizeof(struct bnode_proc)); tp->next = allProcs; tp->bnode = abnode; tp->comLine = aexecString; tp->coreName = coreName; /* may be null */ abnode->procStartTime = FT_ApproxTime(); abnode->procStarts++; /* convert linked list of tokens into argv structure */ for (tt = tlist, i = 0; i < (MAXVARGS - 1) && tt; tt = tt->next, i++) { argv[i] = tt->key; } argv[i] = NULL; /* null-terminated */ cpid = spawnprocve(argv[0], argv, environ, -1); osi_audit(BOSSpawnProcEvent, 0, AUD_STR, aexecString, AUD_END); if (cpid == (pid_t) - 1) { bozo_Log("Failed to spawn process for bnode '%s'\n", abnode->name); bnode_FreeTokens(tlist); free(tp); return errno; } bnode_FreeTokens(tlist); allProcs = tp; *aproc = tp; tp->pid = cpid; tp->flags = BPROC_STARTED; tp->flags &= ~BPROC_EXITED; bnode_Check(abnode); return 0; }
/*
 * Look up the transaction whose tid equals atrans.  On a match the
 * transaction's activity time is refreshed, its refCount is bumped and it
 * is returned; otherwise a null pointer is returned.  The list is searched
 * under VTRANS_LOCK.
 */
struct volser_trans *
FindTrans(register afs_int32 atrans)
{
    struct volser_trans *match;

    VTRANS_LOCK;

    /* advance until we hit the requested tid or run off the list */
    match = allTrans;
    while (match && !(match->tid == atrans))
        match = match->next;

    if (match) {
        match->time = FT_ApproxTime();
        match->refCount++;
    }

    VTRANS_UNLOCK;
    return match;
}
/*! * \brief Return the current synchronization site, if any. * * Simple approach: if the * last guy we voted yes for claims to be the sync site, then we we're happy to * use that guy for a sync site until the time his mandate expires. If the guy * does not claim to be sync site, then, of course, there's none. * * In addition, if we lost the sync, we set #urecovery_syncSite to an invalid * value, indicating that we no longer know which version of the dbase is the * one we should have. We'll get a new one when we next hear from the sync * site. * * \return 0 or currently valid sync site. It can return our own * address, if we're the sync site. */ afs_int32 uvote_GetSyncSite(void) { afs_int32 now; afs_int32 code; if (!lastYesState) code = 0; else { now = FT_ApproxTime(); if (SMALLTIME + lastYesClaim < now) code = 0; /* last guy timed out */ else code = lastYesHost; } return code; }
/*!
 * \brief Check for an unexpired sync site whose database version matches.
 *
 * \param version  database version to compare against the recorded one
 *
 * \return 1 if the last host voted for claims to be sync site, that claim
 *         is no more than SMALLTIME seconds old, and the recorded database
 *         version equals \p version in both epoch and counter; 0 otherwise.
 */
int
uvote_HaveSyncAndVersion(struct ubik_version version)
{
    afs_int32 now;
    int matches = 1;

    UBIK_VOTE_LOCK;
    now = FT_ApproxTime();

    /* any single failed condition is enough to answer "no" */
    if (!vote_globals.lastYesState
        || (SMALLTIME + vote_globals.lastYesClaim < now)
        || vote_globals.ubik_dbVersion.epoch != version.epoch
        || vote_globals.ubik_dbVersion.counter != version.counter) {
        matches = 0;
    }

    UBIK_VOTE_UNLOCK;
    return matches;
}
/*! * \brief Return the current synchronization site, if any. * * Simple approach: if the * last guy we voted yes for claims to be the sync site, then we we're happy to * use that guy for a sync site until the time his mandate expires. If the guy * does not claim to be sync site, then, of course, there's none. * * In addition, if we lost the sync, we set #urecovery_syncSite to an invalid * value, indicating that we no longer know which version of the dbase is the * one we should have. We'll get a new one when we next hear from the sync * site. * * \return 0 or currently valid sync site. It can return our own * address, if we're the sync site. */ afs_int32 uvote_GetSyncSite(void) { afs_int32 now; afs_int32 code; UBIK_VOTE_LOCK; if (!vote_globals.lastYesState) code = 0; else { now = FT_ApproxTime(); if (SMALLTIME + vote_globals.lastYesClaim < now) code = 0; /* last guy timed out */ else code = vote_globals.lastYesHost; } UBIK_VOTE_UNLOCK; return code; }
/* background daemon for timing out transactions */ static void* BKGLoop(void *unused) { struct timeval tv; time_t now; afs_int32 sleepseconds; int loop = 0; while (1) { now = FT_ApproxTime(); sleepseconds = GCWAKEUP - (now % GCWAKEUP); /* synchronize with wall clock */ tv.tv_sec = sleepseconds; tv.tv_usec = 0; #ifdef AFS_PTHREAD_ENV #ifdef AFS_NT40_ENV Sleep(GCWAKEUP * 1000); #else select(0, 0, 0, 0, &tv); #endif #else (void)IOMGR_Select(0, 0, 0, 0, &tv); #endif TransferRate(); GCTrans(); TryUnlock(); loop++; if (loop == 10) { /* reopen log every 5 minutes */ loop = 0; ReOpenLog(AFSDIR_SERVER_VOLSERLOG_FILEPATH); if (osdvol) (osdvol->op_osd_5min_check)(); #ifdef AFS_DEMAND_ATTACH_FS if (VInit >= 2) { /* look for newly mounted partitions */ VAttachPartitions(); VInitAttachVolumes(fileServer); } #endif } } return NULL; }
/*
 * Run one client's share of test calls for slot `index`:
 *   - fastCalls[index] RXKST_Fast calls, each of which must hand back its
 *     argument plus one;
 *   - slowCalls[index] RXKST_Slow calls, each of which must report a time
 *     within maxSkew of the local approximate time;
 *   - copiousCalls[index] Copious() calls sharing one 10000-byte buffer.
 *
 * Returns 0 if everything succeeded, otherwise the first failing call's
 * code, RXKST_INCFAILED or RXKST_TIMESKEW.
 */
static long
DoClient(int index, opaque rock)
{
    struct client *cl = (struct client *)rock;
    long rc = 0;
    int iter;
    u_long seq, reply;

    /* fast calls: the value handed back must be the argument plus one */
    for (iter = 0, seq = 95678; iter < cl->fastCalls[index]; iter++, seq++) {
        rc = RXKST_Fast(cl->conn, seq, &reply);
        if (rc)
            return (rc);
        if (seq + 1 != reply)
            return RXKST_INCFAILED;
    }

    /* slow calls: the reported time must be within maxSkew of ours */
    for (iter = 0; iter < cl->slowCalls[index]; iter++) {
        u_long remoteTime;
        u_long localTime;

        rc = RXKST_Slow(cl->conn, 1, &remoteTime);
        if (rc)
            return (rc);
        localTime = FT_ApproxTime();
        if ((remoteTime < localTime - maxSkew)
            || (remoteTime > localTime + maxSkew))
            return RXKST_TIMESKEW;
    }

    /* copious calls: stop at the first failure, but always free the buffer */
    if (cl->copiousCalls[index] > 0) {
        u_long scratchLen = 10000;
        char *scratch = osi_Alloc(scratchLen);

        iter = 0;
        while (iter < cl->copiousCalls[index]) {
            rc = Copious(cl, scratch, scratchLen);
            if (rc)
                break;
            iter++;
        }
        osi_Free(scratch, scratchLen);
        if (rc)
            return rc;
    }

    return 0;
}
/*!
 * \brief Called once/run to init the vote module
 *
 * Under UBIK_VOTE_LOCK, stamps ubik_lastYesTime with the current
 * approximate time and resets the remaining vote globals to their
 * "nothing known yet" values.
 *
 * \return always 0
 */
int
uvote_Init(void)
{
    UBIK_VOTE_LOCK;
    /*
     * Pretend we just voted for someone else, since we just restarted.
     * This field must NOT be zeroed with the other globals below: doing
     * so throws away the timestamp set here and makes the last yes vote
     * look arbitrarily old.
     */
    vote_globals.ubik_lastYesTime = FT_ApproxTime();

    /* Initialize the remaining globals */
    vote_globals.lastYesHost = 0xffffffff;
    vote_globals.lastYesClaim = 0;
    vote_globals.lastYesState = 0;
    vote_globals.lowestTime = 0;
    vote_globals.lowestHost = 0xffffffff;
    vote_globals.syncTime = 0;
    vote_globals.syncHost = 0;
    UBIK_VOTE_UNLOCK;

    return 0;
}
/*
 * Put a transaction back (release one reference).
 *
 * A transaction with refCount 0 is a caller error: it is logged and
 * VOLSERTRELE_ERROR is returned.  Otherwise the activity time is
 * refreshed; if this is the last reference and the transaction carries
 * TTDeleted it is handed to DeleteTrans(), else the refCount is simply
 * decremented.  Returns 0 on success.
 */
afs_int32
TRELE(register struct volser_trans *at)
{
    afs_int32 rc = 0;

    VTRANS_LOCK;
    if (at->refCount == 0) {
        Log("TRELE: bad refcount\n");
        rc = VOLSERTRELE_ERROR;
    } else {
        at->time = FT_ApproxTime();     /* we're still using it */
        if (at->refCount == 1 && (at->tflags & TTDeleted)) {
            /* last reference to a transaction marked deleted */
            DeleteTrans(at, 0);
        } else {
            /* otherwise simply drop refcount */
            at->refCount--;
        }
    }
    VTRANS_UNLOCK;

    return rc;
}
int DirAccessOK(void) { #ifdef AFS_NT40_ENV /* underlying filesystem may not support directory protection */ return 1; #else static afs_uint32 lastTime = 0; afs_uint32 now = FT_ApproxTime(); static int lastResult = -1; int result; int i; if ((now - lastTime) < 5) return lastResult; lastTime = now; result = 1; for (i = 0; i < bozo_nbosEntryStats; i++) { struct bozo_bosEntryStats *e = &bozo_bosEntryStats[i]; if (!StatEachEntry(e)) { bozo_Log("unhappy with %s which is a %s that should " "have at least rights %o, at most rights %o %s\n", e->path, e->dir ? "dir" : "file", e->reqPerm, (~e->proPerm & 0777), e->rootOwner ? ", owned by root" : ""); result = 0; break; } } if (result != lastResult) { /* log changes */ bozo_Log("Server directory access is %sokay\n", (result ? "" : "not ")); } lastResult = result; return lastResult; #endif /* AFS_NT40_ENV */ }
/* now rename .OLD to .BAK */ if (stat(fpOld, &tstat) == 0) code = rk_rename(fpOld, fpBak); } if (code) code = errno; osi_auditU(acall, BOS_UnInstallEvent, code, AUD_STR, filepath, AUD_END); free(filepath); return code; } #define BOZO_OLDTIME (7*24*3600) /* 7 days old */ static void SaveOldFiles(char *aname) { afs_int32 code; char bbuffer[AFSDIR_PATH_MAX], obuffer[AFSDIR_PATH_MAX]; struct stat tstat; afs_int32 now; afs_int32 oldTime, bakTime; strcpy(bbuffer, aname); strcat(bbuffer, ".BAK"); strcpy(obuffer, aname); strcat(obuffer, ".OLD"); now = FT_ApproxTime(); code = stat(aname, &tstat); if (code < 0) return; /* can't stat file */ code = stat(obuffer, &tstat); /* discover old file's time */ if (code) oldTime = 0; else oldTime = tstat.st_mtime; code = stat(bbuffer, &tstat); /* discover back file's time */ if (code) bakTime = 0; else bakTime = tstat.st_mtime; if (bakTime && (oldTime == 0 || bakTime < now - BOZO_OLDTIME)) { /* no .OLD file, or .BAK is at least a week old */ rk_rename(bbuffer, obuffer); } /* finally rename to .BAK extension */ rk_rename(aname, bbuffer); }
/*! * \brief Decide if we should try to become sync site. * * The basic rule is that we * don't run if there is a valid sync site and it ain't us (we have to run if * it is us, in order to keep our votes). If there is no sync site, then we * want to run if we're the lowest numbered host running, otherwise we defer to * the lowest host. However, if the lowest host hasn't been heard from for a * while, then we start running again, in case he crashed. * * \return true if we should run, and false otherwise. */ int uvote_ShouldIRun(void) { afs_int32 now; int code = 1; /* default to yes */ UBIK_VOTE_LOCK; now = FT_ApproxTime(); if (BIGTIME + vote_globals.ubik_lastYesTime < now) goto done; if (vote_globals.lastYesState && vote_globals.lastYesHost != ubik_host[0]) { code = 0; /* other guy is sync site, leave him alone */ goto done; } if (ntohl((afs_uint32)vote_globals.lastYesHost) < ntohl((afs_uint32)ubik_host[0])) { code = 0; /* if someone is valid and better than us, don't run */ goto done; } done: UBIK_VOTE_UNLOCK; return code; }
/* lwp to handle system restarts */ static void * BozoDaemon(void *unused) { afs_int32 now; /* now initialize the values */ bozo_newKTs = 1; while (1) { IOMGR_Sleep(60); now = FT_ApproxTime(); if (bozo_restdisable) { bozo_Log("Restricted mode disabled by signal\n"); bozo_restdisable = 0; } if (bozo_newKTs) { /* need to recompute restart times */ bozo_newKTs = 0; /* done for a while */ nextRestart = ktime_next(&bozo_nextRestartKT, BOZO_MINSKIP); nextDay = ktime_next(&bozo_nextDayKT, BOZO_MINSKIP); } /* see if we should do a restart */ if (now > nextRestart) { SBOZO_ReBozo(0); /* doesn't come back */ } /* see if we should restart a server */ if (now > nextDay) { nextDay = ktime_next(&bozo_nextDayKT, BOZO_MINSKIP); /* call the bnode restartp function, and restart all that require it */ bnode_ApplyInstance(bdrestart, 0); } } return NULL; }
/*! * \brief Main interaction loop for the recovery manager * * The recovery light-weight process only runs when you're the * synchronization site. It performs the following tasks, if and only * if the prerequisite tasks have been performed successfully (it * keeps track of which ones have been performed in its bit map, * \p urecovery_state). * * First, it is responsible for probing that all servers are up. This * is the only operation that must be performed even if this is not * yet the sync site, since otherwise this site may not notice that * enough other machines are running to even elect this guy to be the * sync site. * * After that, the recovery process does nothing until the beacon and * voting modules manage to get this site elected sync site. * * After becoming sync site, recovery first attempts to find the best * database available in the network (it must do this in order to * ensure finding the latest committed data). After finding the right * database, it must fetch this dbase to the sync site. * * After fetching the dbase, it relabels it with a new version number, * to ensure that everyone recognizes this dbase as the most recent * dbase. * * Once the dbase has been relabelled, this machine can start handling * requests. However, the recovery module still has one more task: * propagating the dbase out to everyone who is up in the network. 
*/ void * urecovery_Interact(void *dummy) { afs_int32 code, tcode; struct ubik_server *bestServer = NULL; struct ubik_server *ts; int dbok, doingRPC, now; afs_int32 lastProbeTime; /* if we're the sync site, the best db version we've found yet */ static struct ubik_version bestDBVersion; struct ubik_version tversion; struct timeval tv; int length, tlen, offset, file, nbytes; struct rx_call *rxcall; char tbuffer[1024]; struct ubik_stat ubikstat; struct in_addr inAddr; char hoststr[16]; char pbuffer[1028]; int fd = -1; afs_int32 pass; afs_pthread_setname_self("recovery"); /* otherwise, begin interaction */ urecovery_state = 0; lastProbeTime = 0; while (1) { /* Run through this loop every 4 seconds */ tv.tv_sec = 4; tv.tv_usec = 0; #ifdef AFS_PTHREAD_ENV select(0, 0, 0, 0, &tv); #else IOMGR_Select(0, 0, 0, 0, &tv); #endif ubik_dprint("recovery running in state %x\n", urecovery_state); /* Every 30 seconds, check all the down servers and mark them * as up if they respond. When a server comes up or found to * not be current, then re-find the the best database and * propogate it. */ if ((now = FT_ApproxTime()) > 30 + lastProbeTime) { for (ts = ubik_servers, doingRPC = 0; ts; ts = ts->next) { UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; doingRPC = 1; code = DoProbe(ts); if (code == 0) { UBIK_BEACON_LOCK; ts->up = 1; UBIK_BEACON_UNLOCK; DBHOLD(ubik_dbase); urecovery_state &= ~UBIK_RECFOUNDDB; DBRELE(ubik_dbase); } } else { UBIK_BEACON_UNLOCK; DBHOLD(ubik_dbase); if (!ts->currentDB) urecovery_state &= ~UBIK_RECFOUNDDB; DBRELE(ubik_dbase); } } if (doingRPC) now = FT_ApproxTime(); lastProbeTime = now; } /* Mark whether we are the sync site */ DBHOLD(ubik_dbase); if (!ubeacon_AmSyncSite()) { urecovery_state &= ~UBIK_RECSYNCSITE; DBRELE(ubik_dbase); continue; /* nothing to do */ } urecovery_state |= UBIK_RECSYNCSITE; /* If a server has just come up or if we have not found the * most current database, then go find the most current db. 
*/ if (!(urecovery_state & UBIK_RECFOUNDDB)) { DBRELE(ubik_dbase); bestServer = (struct ubik_server *)0; bestDBVersion.epoch = 0; bestDBVersion.counter = 0; for (ts = ubik_servers; ts; ts = ts->next) { UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; continue; /* don't bother with these guys */ } UBIK_BEACON_UNLOCK; if (ts->isClone) continue; UBIK_ADDR_LOCK; code = DISK_GetVersion(ts->disk_rxcid, &ts->version); UBIK_ADDR_UNLOCK; if (code == 0) { /* perhaps this is the best version */ if (vcmp(ts->version, bestDBVersion) > 0) { /* new best version */ bestDBVersion = ts->version; bestServer = ts; } } } /* take into consideration our version. Remember if we, * the sync site, have the best version. Also note that * we may need to send the best version out. */ DBHOLD(ubik_dbase); if (vcmp(ubik_dbase->version, bestDBVersion) >= 0) { bestDBVersion = ubik_dbase->version; bestServer = (struct ubik_server *)0; urecovery_state |= UBIK_RECHAVEDB; } else { /* Clear the flag only when we know we have to retrieve * the db. Because urecovery_AllBetter() looks at it. */ urecovery_state &= ~UBIK_RECHAVEDB; } urecovery_state |= UBIK_RECFOUNDDB; urecovery_state &= ~UBIK_RECSENTDB; } if (!(urecovery_state & UBIK_RECFOUNDDB)) { DBRELE(ubik_dbase); continue; /* not ready */ } /* If we, the sync site, do not have the best db version, then * go and get it from the server that does. */ if ((urecovery_state & UBIK_RECHAVEDB) || !bestServer) { urecovery_state |= UBIK_RECHAVEDB; } else { /* we don't have the best version; we should fetch it. 
*/ urecovery_AbortAll(ubik_dbase); /* Rx code to do the Bulk fetch */ file = 0; offset = 0; UBIK_ADDR_LOCK; rxcall = rx_NewCall(bestServer->disk_rxcid); ubik_print("Ubik: Synchronize database with server %s\n", afs_inet_ntoa_r(bestServer->addr[0], hoststr)); UBIK_ADDR_UNLOCK; code = StartDISK_GetFile(rxcall, file); if (code) { ubik_dprint("StartDiskGetFile failed=%d\n", code); goto FetchEndCall; } nbytes = rx_Read(rxcall, (char *)&length, sizeof(afs_int32)); length = ntohl(length); if (nbytes != sizeof(afs_int32)) { ubik_dprint("Rx-read length error=%d\n", code = BULK_ERROR); code = EIO; goto FetchEndCall; } /* give invalid label during file transit */ UBIK_VERSION_LOCK; tversion.epoch = 0; code = (*ubik_dbase->setlabel) (ubik_dbase, file, &tversion); UBIK_VERSION_UNLOCK; if (code) { ubik_dprint("setlabel io error=%d\n", code); goto FetchEndCall; } snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600); if (fd < 0) { code = errno; goto FetchEndCall; } code = lseek(fd, HDRSIZE, 0); if (code != HDRSIZE) { close(fd); goto FetchEndCall; } pass = 0; while (length > 0) { tlen = (length > sizeof(tbuffer) ? 
sizeof(tbuffer) : length); #ifndef AFS_PTHREAD_ENV if (pass % 4 == 0) IOMGR_Poll(); #endif nbytes = rx_Read(rxcall, tbuffer, tlen); if (nbytes != tlen) { ubik_dprint("Rx-read bulk error=%d\n", code = BULK_ERROR); code = EIO; close(fd); goto FetchEndCall; } nbytes = write(fd, tbuffer, tlen); pass++; if (nbytes != tlen) { code = UIOERROR; close(fd); goto FetchEndCall; } offset += tlen; length -= tlen; } code = close(fd); if (code) goto FetchEndCall; code = EndDISK_GetFile(rxcall, &tversion); FetchEndCall: tcode = rx_EndCall(rxcall, code); if (!code) code = tcode; if (!code) { /* we got a new file, set up its header */ urecovery_state |= UBIK_RECHAVEDB; UBIK_VERSION_LOCK; memcpy(&ubik_dbase->version, &tversion, sizeof(struct ubik_version)); snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); #ifdef AFS_NT40_ENV snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); code = unlink(pbuffer); if (!code) code = rename(tbuffer, pbuffer); snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); #endif if (!code) code = rename(pbuffer, tbuffer); if (!code) { (*ubik_dbase->open) (ubik_dbase, file); /* after data is good, sync disk with correct label */ code = (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version); } UBIK_VERSION_UNLOCK; #ifdef AFS_NT40_ENV snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); unlink(pbuffer); #endif } if (code) { unlink(pbuffer); /* * We will effectively invalidate the old data forever now. * Unclear if we *should* but we do. 
*/ UBIK_VERSION_LOCK; ubik_dbase->version.epoch = 0; ubik_dbase->version.counter = 0; UBIK_VERSION_UNLOCK; ubik_print("Ubik: Synchronize database failed (error = %d)\n", code); } else { ubik_print("Ubik: Synchronize database completed\n"); urecovery_state |= UBIK_RECHAVEDB; } udisk_Invalidate(ubik_dbase, 0); /* data has changed */ #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&ubik_dbase->version_cond); #else LWP_NoYieldSignal(&ubik_dbase->version); #endif } if (!(urecovery_state & UBIK_RECHAVEDB)) { DBRELE(ubik_dbase); continue; /* not ready */ } /* If the database was newly initialized, then when we establish quorum, write * a new label. This allows urecovery_AllBetter() to allow access for reads. * Setting it to 2 also allows another site to come along with a newer * database and overwrite this one. */ if (ubik_dbase->version.epoch == 1) { urecovery_AbortAll(ubik_dbase); UBIK_VERSION_LOCK; version_globals.ubik_epochTime = 2; ubik_dbase->version.epoch = version_globals.ubik_epochTime; ubik_dbase->version.counter = 1; code = (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version); UBIK_VERSION_UNLOCK; udisk_Invalidate(ubik_dbase, 0); /* data may have changed */ #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&ubik_dbase->version_cond); #else LWP_NoYieldSignal(&ubik_dbase->version); #endif } /* Check the other sites and send the database to them if they * do not have the current db. */ if (!(urecovery_state & UBIK_RECSENTDB)) { /* now propagate out new version to everyone else */ dbok = 1; /* start off assuming they all worked */ /* * Check if a write transaction is in progress. We can't send the * db when a write is in progress here because the db would be * obsolete as soon as it goes there. Also, ops after the begin * trans would reach the recepient and wouldn't find a transaction * pending there. Frankly, I don't think it's possible to get past * the write-lock above if there is a write transaction in progress, * but then, it won't hurt to check, will it? 
*/ if (ubik_dbase->flags & DBWRITING) { struct timeval tv; int safety = 0; long cur_usec = 50000; while ((ubik_dbase->flags & DBWRITING) && (safety < 500)) { DBRELE(ubik_dbase); /* sleep for a little while */ tv.tv_sec = 0; tv.tv_usec = cur_usec; #ifdef AFS_PTHREAD_ENV select(0, 0, 0, 0, &tv); #else IOMGR_Select(0, 0, 0, 0, &tv); #endif cur_usec += 10000; safety++; DBHOLD(ubik_dbase); } } for (ts = ubik_servers; ts; ts = ts->next) { UBIK_ADDR_LOCK; inAddr.s_addr = ts->addr[0]; UBIK_ADDR_UNLOCK; UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; ubik_dprint("recovery cannot send version to %s\n", afs_inet_ntoa_r(inAddr.s_addr, hoststr)); dbok = 0; continue; } UBIK_BEACON_UNLOCK; ubik_dprint("recovery sending version to %s\n", afs_inet_ntoa_r(inAddr.s_addr, hoststr)); if (vcmp(ts->version, ubik_dbase->version) != 0) { ubik_dprint("recovery stating local database\n"); /* Rx code to do the Bulk Store */ code = (*ubik_dbase->stat) (ubik_dbase, 0, &ubikstat); if (!code) { length = ubikstat.size; file = offset = 0; UBIK_ADDR_LOCK; rxcall = rx_NewCall(ts->disk_rxcid); UBIK_ADDR_UNLOCK; code = StartDISK_SendFile(rxcall, file, length, &ubik_dbase->version); if (code) { ubik_dprint("StartDiskSendFile failed=%d\n", code); goto StoreEndCall; } while (length > 0) { tlen = (length > sizeof(tbuffer) ? 
sizeof(tbuffer) : length); nbytes = (*ubik_dbase->read) (ubik_dbase, file, tbuffer, offset, tlen); if (nbytes != tlen) { ubik_dprint("Local disk read error=%d\n", code = UIOERROR); goto StoreEndCall; } nbytes = rx_Write(rxcall, tbuffer, tlen); if (nbytes != tlen) { ubik_dprint("Rx-write bulk error=%d\n", code = BULK_ERROR); goto StoreEndCall; } offset += tlen; length -= tlen; } code = EndDISK_SendFile(rxcall); StoreEndCall: code = rx_EndCall(rxcall, code); } if (code == 0) { /* we set a new file, process its header */ ts->version = ubik_dbase->version; ts->currentDB = 1; } else dbok = 0; } else { /* mark file up to date */ ts->currentDB = 1; } } if (dbok) urecovery_state |= UBIK_RECSENTDB; } DBRELE(ubik_dbase); } return NULL; }
/* make sure user authenticated on rx call acall is in list of valid users. Copy the "real name" of the authenticated user into namep if a pointer is passed. */ afs_int32 afsconf_SuperUser(struct afsconf_dir *adir, struct rx_call *acall, char *namep) { register struct rx_connection *tconn; register afs_int32 code; int flag; LOCK_GLOBAL_MUTEX; if (!adir) { UNLOCK_GLOBAL_MUTEX; return 0; } if (afsconf_GetNoAuthFlag(adir)) { if (namep) strcpy(namep, "<NoAuth>"); UNLOCK_GLOBAL_MUTEX; return 1; } tconn = rx_ConnectionOf(acall); code = rx_SecurityClassOf(tconn); if (code == 0) { UNLOCK_GLOBAL_MUTEX; return 0; /* not authenticated at all, answer is no */ } else if (code == 1) { /* bcrypt tokens */ UNLOCK_GLOBAL_MUTEX; return 0; /* not supported any longer */ } else if (code == 2) { char tname[MAXKTCNAMELEN]; /* authentication from ticket */ char tinst[MAXKTCNAMELEN]; char tcell[MAXKTCREALMLEN]; char tcell_l[MAXKTCREALMLEN]; char *tmp; /* keep track of which one actually authorized request */ char uname[MAXKTCNAMELEN + MAXKTCNAMELEN + MAXKTCREALMLEN + 3]; afs_uint32 exp; static char lcell[MAXCELLCHARS] = ""; static char lrealms[AFS_NUM_LREALMS][AFS_REALM_SZ]; static int num_lrealms = -1; int lrealm_match = 0, i; /* get auth details from server connection */ code = rxkad_GetServerInfo(acall->conn, NULL, &exp, tname, tinst, tcell, NULL); if (code) { UNLOCK_GLOBAL_MUTEX; return 0; /* bogus connection/other error */ } /* don't bother checking anything else if tix have expired */ #ifdef AFS_PTHREAD_ENV if (exp < clock_Sec()) { #else if (exp < FT_ApproxTime()) { #endif UNLOCK_GLOBAL_MUTEX; return 0; /* expired tix */ } /* generate lowercased version of cell name */ strcpy(tcell_l, tcell); tmp = tcell_l; while (*tmp) { *tmp = tolower(*tmp); tmp++; } /* determine local cell name. 
It's static, so will only get * calculated the first time through */ if (!lcell[0]) afsconf_GetLocalCell(adir, lcell, sizeof(lcell)); /* if running a krb environment, also get the local realm */ /* note - this assumes AFS_REALM_SZ <= MAXCELLCHARS */ /* just set it to lcell if it fails */ if (num_lrealms == -1) { for (i=0; i<AFS_NUM_LREALMS; i++) { if (afs_krb_get_lrealm(lrealms[i], i) != 0 /*KSUCCESS*/) break; } if (i == 0) { strncpy(lrealms[0], lcell, AFS_REALM_SZ); num_lrealms = 1; } else { num_lrealms = i; } } /* See if the ticket cell matches one of the local realms */ lrealm_match = 0; for ( i=0;i<num_lrealms;i++ ) { if (!strcasecmp(lrealms[i], tcell)) { lrealm_match = 1; break; } } /* If yes, then make sure that the name is not present in * an exclusion list */ if (lrealm_match) { if (tinst[0]) snprintf(uname,sizeof(uname),"%s.%s@%s",tname,tinst,tcell); else snprintf(uname,sizeof(uname),"%s@%s",tname,tcell); if (afs_krb_exclusion(uname)) lrealm_match = 0; } /* start with no uname and no authorization */ strcpy(uname, ""); flag = 0; /* localauth special case */ if (strlen(tinst) == 0 && strlen(tcell) == 0 && !strcmp(tname, AUTH_SUPERUSER)) { strcpy(uname, "<LocalAuth>"); flag = 1; /* cell of connection matches local cell or one of the realms */ } else if (!strcasecmp(tcell, lcell) || lrealm_match) { if ((tmp = CompFindUser(adir, tname, ".", tinst, NULL))) { strcpy(uname, tmp); flag = 1; #ifdef notyet } else if ((tmp = CompFindUser(adir, tname, "/", tinst, NULL))) { strcpy(uname, tmp); flag = 1; #endif } /* cell of conn doesn't match local cell or realm */ } else { if ((tmp = CompFindUser(adir, tname, ".", tinst, tcell))) { strcpy(uname, tmp); flag = 1; #ifdef notyet } else if ((tmp = CompFindUser(adir, tname, "/", tinst, tcell))) { strcpy(uname, tmp); flag = 1; #endif } else if ((tmp = CompFindUser(adir, tname, ".", tinst, tcell_l))) { strcpy(uname, tmp); flag = 1; #ifdef notyet } else if ((tmp = CompFindUser(adir, tname, "/", tinst, tcell_l))) { strcpy(uname, 
tmp); flag = 1; #endif } } if (namep) strcpy(namep, uname); UNLOCK_GLOBAL_MUTEX; return flag; } else { /* some other auth type */ UNLOCK_GLOBAL_MUTEX; return 0; /* mysterious, just say no */ } }
/*!
 * \brief Process a vote beacon sent by a site that is, or wants to become,
 * the sync site.  A NULL rxcall means the beacon is a local call from this
 * host itself.
 *
 * \param rxcall  rx call the beacon arrived on, or NULL for a local call
 * \param astate  non-zero when the sender claims to be a sync site
 * \param astart  sender's claim, remembered for computing when the sync
 *                site expires
 * \param avers   sender's database version, remembered as the resync value
 * \param atid    id of the sender's active transaction, if any
 *
 * \returns 0 if we are not voting for the sender, otherwise the time at
 * which we voted yes.
 */
afs_int32
SVOTE_Beacon(struct rx_call * rxcall, afs_int32 astate,
             afs_int32 astart, struct ubik_version * avers,
             struct ubik_tid * atid)
{
    struct ubik_server *srv;
    struct rx_peer *peer;
    afs_int32 sender;
    afs_int32 curTime;
    int senderIsClone = 0;
    int promiseExpired;
    int otherVoters;
    char addrbuf[16];

    if (!rxcall) {
        /* local call: the beacon comes from this host */
        sender = ubik_host[0];
        senderIsClone = amIClone;
    } else {
        /* Map the caller's address to its primary interface address,
         * which is the identifier ubik uses for a host. */
        peer = rx_PeerOf(rx_ConnectionOf(rxcall));
        sender = ubikGetPrimaryInterfaceAddr(rx_HostOf(peer));
        if (!sender) {
            ubik_dprint("Received beacon from unknown host %s\n",
                        afs_inet_ntoa_r(rx_HostOf(peer), addrbuf));
            return 0;           /* unknown to us: vote no */
        }

        srv = ubik_servers;
        while (srv && srv->addr[0] != sender)
            srv = srv->next;

        if (!srv)
            ubik_dprint("Unknown host %x has sent a beacon\n", sender);
        else if (srv->isClone)
            senderIsClone = 1;
    }

    ubik_dprint("Received beacon type %d from host %s\n", astate,
                afs_inet_ntoa_r(sender, addrbuf));

    UBIK_VOTE_LOCK;
    curTime = FT_ApproxTime();  /* close to current time */

    /*
     * Track the lowest-addressed host heard from recently.  The comparison
     * is <= (not <) so that lowestTime keeps being refreshed by the current
     * lowest host; an entry not refreshed for BIGTIME seconds is replaced.
     * Clones are never recorded, since they cannot become sync site.
     */
    if (!senderIsClone) {
        if (ntohl((afs_uint32)sender) <= ntohl((afs_uint32)vote_globals.lowestHost)
            || vote_globals.lowestTime + BIGTIME < curTime) {
            vote_globals.lowestTime = curTime;
            vote_globals.lowestHost = sender;
        }
    }

    /*
     * Apply the same rule to ourselves, so that nobody with a higher
     * address than ours stays recorded as lowest while we are up.
     */
    if (!amIClone) {
        if (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32)vote_globals.lowestHost)
            || vote_globals.lowestTime + BIGTIME < curTime) {
            vote_globals.lowestTime = curTime;
            vote_globals.lowestHost = ubik_host[0];
        }
    }

    /* Remember the most recent sync-site claim; forget it after BIGTIME. */
    if (astate) {
        vote_globals.syncHost = sender;
        vote_globals.syncTime = curTime;
    } else if (vote_globals.syncTime + BIGTIME < curTime) {
        if (vote_globals.syncHost) {
            ubik_dprint
                ("Ubik: Lost contact with sync-site %s (NOT in quorum)\n",
                 afs_inet_ntoa_r(vote_globals.syncHost, addrbuf));
        }
        vote_globals.syncHost = 0;
    }

    if (astate == 0) {
        /*
         * The sender does not claim to be sync site, so we are free to
         * refuse: say no unless it is the lowest host, and say no if some
         * other host currently is a sync site.
         */
        if (ntohl(vote_globals.lowestHost) != ntohl(sender)
            || (vote_globals.syncHost && vote_globals.syncHost != sender))
            goto done_zero;
    } else if (vote_globals.lastYesHost == 0xffffffff
               && sender == ubik_host[0]) {
        /* fast startup if this is the only non-clone */
        otherVoters = 0;
        for (srv = ubik_servers; srv; srv = srv->next) {
            if (srv->addr[0] != sender && !srv->isClone)
                otherVoters++;
        }
        if (otherVoters == 0)
            vote_globals.lastYesHost = sender;
    }

    /* a clone can never become sync site */
    if (senderIsClone)
        goto done_zero;

    /*
     * Never promise sync-site support to more than one host per BIGTIME
     * seconds: only the host we last said yes to may get another yes
     * before that promise has expired.
     */
    promiseExpired = (vote_globals.ubik_lastYesTime + BIGTIME < curTime);
    if (!promiseExpired && sender != vote_globals.lastYesHost)
        goto done_zero;

    if (promiseExpired
        || sender != vote_globals.lastYesHost
        || vote_globals.lastYesState != astate) {
        /* A new vote or a change in the vote or changed quorum */
        ubik_dprint("Ubik: vote 'yes' for %s %s\n",
                    afs_inet_ntoa_r(sender, addrbuf),
                    (astate ? "(in quorum)" : "(NOT in quorum)"));
    }

    vote_globals.ubik_lastYesTime = curTime;    /* when we voted yes */
    vote_globals.lastYesClaim = astart; /* for computing sync site expiry */
    vote_globals.lastYesHost = sender;  /* who we voted for */
    vote_globals.lastYesState = astate; /* whether it is a sync site */
    vote_globals.ubik_dbVersion = *avers;       /* resync value */
    vote_globals.ubik_dbTid = *atid;    /* active transaction id, if any */
    UBIK_VOTE_UNLOCK;

    /* check whether the current write transaction needs to be aborted */
    DBHOLD(ubik_dbase);
    urecovery_CheckTid(atid, 0);
    DBRELE(ubik_dbase);

    return curTime;             /* the yes vote is the time it was cast */

 done_zero:
    UBIK_VOTE_UNLOCK;
    return 0;
}