/*!
 * \brief Receive a complete database file from a peer and install it locally.
 *
 * The bytes read from \p rxcall are written to a temporary file
 * ("<pathName>.DB[SYS]<n>.TMP"), which is then renamed over the live
 * database file and labelled with \p avers.  While the transfer is in
 * progress the database carries an epoch-0 label so that a partially
 * received database is never mistaken for a good one.
 *
 * \param rxcall  Rx call the database contents are read from.
 * \param file    Database file number; a negative number selects the
 *                "SYS" file naming variant.
 * \param length  Number of bytes the peer is going to send.
 * \param avers   Version label to put on the database once it is in place.
 *
 * \returns 0 on success.  On failure: the ubik_CheckAuth() error, USYNC if
 *          the sender is not the sync site we know of, errno from open(),
 *          BULK_ERROR on a short Rx read, UIOERROR on a local seek/write/close
 *          failure, or the result of rename()/setlabel.
 */
afs_int32
SDISK_SendFile(struct rx_call *rxcall, afs_int32 file,
               afs_int32 length, struct ubik_version *avers)
{
    afs_int32 code;
    struct ubik_dbase *dbase = NULL;
    char tbuffer[1024];
    /* Fully zeroed so that no indeterminate field is ever written as a label. */
    struct ubik_version tversion = { 0 };
    int tlen;
    struct rx_peer *tpeer;
    struct rx_connection *tconn;
    afs_uint32 syncHost;
    afs_uint32 otherHost = 0;
    char hoststr[16];
    char pbuffer[1028];
    int fd = -1;
    afs_int32 epoch = 0;
    afs_int32 pass;

    dbase = ubik_dbase;

    /*
     * Mark the temporary path as "not built yet": the failure handler must
     * not unlink() whatever happens to be on the stack if we bail out early.
     */
    pbuffer[0] = '\0';

    if ((code = ubik_CheckAuth(rxcall))) {
        DBHOLD(dbase);
        goto failed;
    }

    /*
     * Sanity check: is the host sending us the database the one we believe
     * to be the sync site?  We may not have decided on a sync site yet while
     * the sender already has enough votes from others, and may therefore
     * send us the database before getting our vote.  That is fine.  What we
     * are guarding against is an authenticated but misconfigured server
     * pushing a random database into another configuration.  So we object
     * only when we are sure who the sync site is and it is not the sender.
     */
    syncHost = uvote_GetSyncSite();
    tconn = rx_ConnectionOf(rxcall);
    tpeer = rx_PeerOf(tconn);
    otherHost = ubikGetPrimaryInterfaceAddr(rx_HostOf(tpeer));
    if (syncHost && syncHost != otherHost) {
        /* we *know* this is the wrong sender */
        code = USYNC;
        DBHOLD(dbase);
        goto failed;
    }

    DBHOLD(dbase);

    /* abort any active transaction that may scribble over the database */
    urecovery_AbortAll(dbase);

    ubik_print("Ubik: Synchronize database with server %s\n",
               afs_inet_ntoa_r(otherHost, hoststr));

    UBIK_VERSION_LOCK;
    /* start off by labelling the in-transit database as invalid */
    epoch = tversion.epoch = 0;
    (*dbase->setlabel) (dbase, file, &tversion);        /* setlabel does sync */

    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600);
    if (fd < 0) {
        code = errno;
        goto failed_locked;
    }

    /* leave room for the header; the payload starts at HDRSIZE */
    code = lseek(fd, HDRSIZE, SEEK_SET);
    if (code != HDRSIZE) {
        close(fd);
        /* report a proper ubik error rather than lseek()'s raw result */
        code = UIOERROR;
        goto failed_locked;
    }

    pass = 0;
    memcpy(&ubik_dbase->version, &tversion, sizeof(struct ubik_version));
    UBIK_VERSION_UNLOCK;

    while (length > 0) {
        tlen = ((size_t)length > sizeof(tbuffer) ? sizeof(tbuffer) : length);
#if !defined(AFS_PTHREAD_ENV)
        if (pass % 4 == 0)
            IOMGR_Poll();
#endif
        code = rx_Read(rxcall, tbuffer, tlen);
        if (code != tlen) {
            ubik_dprint("Rx-read length error=%d\n", code);
            code = BULK_ERROR;
            close(fd);
            goto failed;
        }
        code = write(fd, tbuffer, tlen);
        pass++;
        if (code != tlen) {
            ubik_dprint("write failed error=%d\n", code);
            code = UIOERROR;
            close(fd);
            goto failed;
        }
        length -= tlen;
    }

    code = close(fd);
    if (code) {
        /* report a proper ubik error rather than close()'s raw -1 */
        code = UIOERROR;
        goto failed;
    }

    /*
     * Data is on disk first; only then is the label written (setlabel
     * resyncs).  This way a good label only ever sits on a good database.
     */
    snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
#ifdef AFS_NT40_ENV
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    code = unlink(pbuffer);
    if (!code)
        code = rename(tbuffer, pbuffer);
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
#endif
    if (!code)
        code = rename(pbuffer, tbuffer);

    UBIK_VERSION_LOCK;
    if (!code) {
        (*ubik_dbase->open) (ubik_dbase, file);
        code = (*ubik_dbase->setlabel) (dbase, file, avers);
    }
#ifdef AFS_NT40_ENV
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    unlink(pbuffer);
#endif
    memcpy(&ubik_dbase->version, avers, sizeof(struct ubik_version));
    udisk_Invalidate(dbase, file);      /* new dbase, flush disk buffers */
#ifdef AFS_PTHREAD_ENV
    assert(pthread_cond_broadcast(&dbase->version_cond) == 0);
#else
    LWP_NoYieldSignal(&dbase->version);
#endif

  failed_locked:
    UBIK_VERSION_UNLOCK;

  failed:
    if (code) {
        /* Only remove a temporary file whose name was actually built. */
        if (pbuffer[0] != '\0')
            unlink(pbuffer);

        /*
         * Failed to sync. Allow reads again for now.
         * NOTE(review): this relabel also runs when we failed before the
         * label was touched (auth failure, wrong sync site) -- confirm that
         * rewriting the label in those cases is intended.
         */
        if (dbase != NULL) {
            UBIK_VERSION_LOCK;
            tversion.epoch = epoch;
            (*dbase->setlabel) (dbase, file, &tversion);
            UBIK_VERSION_UNLOCK;
        }
        ubik_print
            ("Ubik: Synchronize database with server %s failed (error = %d)\n",
             afs_inet_ntoa_r(otherHost, hoststr), code);
    } else {
        ubik_print("Ubik: Synchronize database completed\n");
    }
    DBRELE(dbase);
    return code;
}
/*!
 * \brief Handle a vote beacon from a server that wants to be (or stay)
 *        the sync site.
 *
 * When \p rxcall is NULL the beacon is a local call and is treated as
 * coming from this host (ubik_host[0]).
 *
 * \param rxcall  Rx call the beacon arrived on, or NULL for a local call.
 * \param astate  Non-zero if the sender claims to be the sync site.
 * \param astart  Remembered as lastYesClaim when we vote yes.
 * \param avers   Sender's database version, remembered when we vote yes.
 * \param atid    Sender's active transaction id, remembered when we vote
 *                yes and passed to urecovery_CheckTid().
 *
 * \returns 0 if we are not voting for the sender, otherwise the time at
 *          which we voted yes.
 */
afs_int32
SVOTE_Beacon(struct rx_call * rxcall, afs_int32 astate,
             afs_int32 astart, struct ubik_version * avers,
             struct ubik_tid * atid)
{
    afs_int32 sender;
    afs_int32 curTime;
    struct rx_peer *peer;
    struct ubik_server *srv;
    int senderIsClone = 0;
    char addrText[16];

    if (rxcall == NULL) {
        /* local call: the beacon is from this very host */
        sender = ubik_host[0];
        senderIsClone = amIClone;
    } else {
        /*
         * Map the caller's address to its primary interface address,
         * which is the identifier ubik uses for a server.
         */
        peer = rx_PeerOf(rx_ConnectionOf(rxcall));
        sender = ubikGetPrimaryInterfaceAddr(rx_HostOf(peer));
        if (sender == 0) {
            ubik_dprint("Received beacon from unknown host %s\n",
                        afs_inet_ntoa_r(rx_HostOf(peer), addrText));
            return 0;           /* unknown server: vote no */
        }

        /* look the sender up in the server list to learn whether it is a clone */
        srv = ubik_servers;
        while (srv != NULL && srv->addr[0] != sender)
            srv = srv->next;
        if (srv == NULL)
            ubik_dprint("Unknown host %x has sent a beacon\n", sender);
        else if (srv->isClone)
            senderIsClone = 1;
    }

    ubik_dprint("Received beacon type %d from host %s\n", astate,
                afs_inet_ntoa_r(sender, addrText));

    /*
     * Track the lowest-addressed server we have heard from.  We try to vote
     * only for that server unless a sync site already exists.  An entry
     * older than BIGTIME seconds is replaced, so a crashed lowestHost does
     * not confuse things forever.
     *
     * The comparison has to be <= rather than <, so that lowestTime keeps
     * being refreshed; otherwise it would look as if lowestHost had gone
     * quiet and another host could slip in.
     *
     * Clones are not considered for lowestHost, since they may never
     * become sync site.
     */
    UBIK_VOTE_LOCK;
    curTime = FT_ApproxTime();  /* close to current time */

    if (!senderIsClone) {
        if (ntohl((afs_uint32)sender) <= ntohl((afs_uint32)vote_globals.lowestHost)
            || vote_globals.lowestTime + BIGTIME < curTime) {
            vote_globals.lowestTime = curTime;
            vote_globals.lowestHost = sender;
        }
    }

    /*
     * We count ourselves in as well: nobody higher than us may sit in
     * lowestHost, because we know that we are up.  Without this, two
     * servers could each decide that the other one is lowest and both stop
     * sending beacons.  If the server that really is lowest knows it, it
     * will send out beacons asking the others to vote for it.
     */
    if (!amIClone) {
        if (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32)vote_globals.lowestHost)
            || vote_globals.lowestTime + BIGTIME < curTime) {
            vote_globals.lowestTime = curTime;
            vote_globals.lowestHost = ubik_host[0];
        }
    }

    /*
     * Remember whether we have heard from a sync site recently, even if we
     * are not voting for it yet, and time it out after BIGTIME seconds.
     */
    if (astate) {
        /* the sender is a sync site */
        vote_globals.syncHost = sender;
        vote_globals.syncTime = curTime;
    } else if (vote_globals.syncTime + BIGTIME < curTime) {
        if (vote_globals.syncHost) {
            ubik_dprint
                ("Ubik: Lost contact with sync-site %s (NOT in quorum)\n",
                 afs_inet_ntoa_r(vote_globals.syncHost, addrText));
        }
        vote_globals.syncHost = 0;
    }

    /*
     * Decide how to vote.  A sender that is not a sync site does not have
     * to get our vote, so a few heuristics are applied to avoid odd
     * oscillation states in the voting procedure.
     */
    if (astate == 0) {
        /* the sender does not claim to be a sync site */
        if (ntohl(vote_globals.lowestHost) != ntohl(sender)
            || (vote_globals.syncHost && vote_globals.syncHost != sender)) {
            /* either lowestHost is someone else, or someone else is sync site */
            goto vote_no;
        }
    } else if (vote_globals.lastYesHost == 0xffffffff
               && sender == ubik_host[0]) {
        /* fast startup if this is the only non-clone */
        int otherVoters = 0;

        for (srv = ubik_servers; srv != NULL; srv = srv->next) {
            if (srv->addr[0] != sender && !srv->isClone)
                otherVoters++;
        }
        if (otherVoters == 0)
            vote_globals.lastYesHost = sender;
    }

    /* a clone can never become sync site */
    if (senderIsClone)
        goto vote_no;

    /*
     * Never promise sync-site support to more than one host within BIGTIME
     * seconds.  This is the heart of the invariants of this system.
     */
    if (!(vote_globals.ubik_lastYesTime + BIGTIME < curTime)
        && sender != vote_globals.lastYesHost)
        goto vote_no;

    if ((vote_globals.ubik_lastYesTime + BIGTIME < curTime)
        || (sender != vote_globals.lastYesHost)
        || (vote_globals.lastYesState != astate)) {
        /* a new vote, a change in the vote, or a changed quorum */
        ubik_dprint("Ubik: vote 'yes' for %s %s\n",
                    afs_inet_ntoa_r(sender, addrText),
                    (astate ? "(in quorum)" : "(NOT in quorum)"));
    }

    /* vote yes, and record when, for whom and in which state */
    vote_globals.ubik_lastYesTime = curTime;
    vote_globals.lastYesClaim = astart; /* for computing when sync site expires */
    vote_globals.lastYesHost = sender;
    vote_globals.lastYesState = astate;
    vote_globals.ubik_dbVersion = *avers;       /* resync value */
    vote_globals.ubik_dbTid = *atid;    /* transaction id, if any, of active trans */
    UBIK_VOTE_UNLOCK;

    /* check whether the current write transaction has to be aborted */
    DBHOLD(ubik_dbase);
    urecovery_CheckTid(atid, 0);
    DBRELE(ubik_dbase);

    return curTime;

  vote_no:
    UBIK_VOTE_UNLOCK;
    return 0;
}