/*
 * Remote side of a truncate request for the database slot `index`.
 *
 * The call is rejected before taking any lock when the caller fails
 * ubik_CheckAuth(), when the slot has no current transaction (USYNC),
 * when that transaction is not a write transaction (UBADTYPE), or when
 * the slot has no database (ENOENT).  Otherwise the database lock is
 * held while urecovery_CheckTid() runs; if that leaves the slot without
 * a current transaction USYNC is returned, else the result of
 * udisk_truncate(afile, alen) is returned.
 */
afs_int32
SDISK_Truncate(struct rx_call *rxcall, struct ubik_tid *atid,
               afs_int32 index, afs_int32 afile, afs_int32 alen)
{
    struct ubik_dbase *db;
    afs_int32 rc;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }

    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    /* only write transactions are expected on this path */
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        return UBADTYPE;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    db = ubik_currentTrans[index]->dbase;
    DBHOLD(db);

    /* the tid check may drop the current transaction, so look again */
    urecovery_CheckTid(atid, index);
    if (ubik_currentTrans[index]) {
        rc = udisk_truncate(ubik_currentTrans[index], afile, alen);
    } else {
        rc = USYNC;
    }

    DBRELE(db);
    return rc;
}
/*
 * Relabel the database in slot `index` with *newversionp, provided the
 * caller's idea of the old version (*oldversionp) equals
 * ubik_dbVersion[index].
 *
 * Returns the ubik_CheckAuth() error, USYNC (no current transaction, or
 * version mismatch), UBADTYPE (current transaction is not a write
 * transaction), UDEADLOCK (this host is the sync site), ENOENT (no
 * database in the slot), or the result of the database's setlabel
 * operation.  On a successful relabel both dbase->version and
 * ubik_dbVersion[index] are updated.
 */
afs_int32
SDISK_SetVersion(struct rx_call *rxcall, struct ubik_tid *atid,
                 afs_int32 index, struct ubik_version *oldversionp,
                 struct ubik_version *newversionp)
{
    struct ubik_dbase *db;
    afs_int32 rc = 0;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }
    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    /* only write transactions are expected on this path */
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        return UBADTYPE;
    }
    /* the sync site itself should never receive this request */
    if (ubeacon_AmSyncSite()) {
        return UDEADLOCK;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    db = ubik_currentTrans[index]->dbase;
    DBHOLD(db);
    urecovery_CheckTid(atid, index);

    if (!ubik_currentTrans[index]) {
        /* the tid check dropped the transaction */
        rc = USYNC;
    } else if (oldversionp->epoch != ubik_dbVersion[index].epoch
               || oldversionp->counter != ubik_dbVersion[index].counter) {
        /* caller's old version does not match the recorded one */
        rc = USYNC;
    } else {
        rc = (*db->setlabel) (db, 0, newversionp);
        if (rc == 0) {
            db->version = *newversionp;
            ubik_dbVersion[index] = *newversionp;
        }
    }

    DBRELE(db);
    return rc;
}
/*
 * Stream database file `file` of ubik_dbase to the caller over rxcall.
 *
 * The wire format produced here is a 32-bit length in network byte
 * order followed by that many bytes of file data, sent in pieces of at
 * most 256 bytes.  After the data has been sent the file's label is
 * fetched into *version and the getlabel result is returned.
 *
 * Returns the ubik_CheckAuth() error, a negative stat result,
 * BULK_ERROR when an rx_Write() comes up short, or UIOERROR when a
 * database read comes up short.  The database lock is held for the
 * whole transfer.
 */
afs_int32
SDISK_GetFile(struct rx_call *rxcall, afs_int32 file,
              struct ubik_version *version)
{
    struct ubik_dbase *db;
    struct ubik_stat st;
    char chunk[256];
    afs_int32 rc;
    afs_int32 netLength;
    afs_int32 remaining;
    afs_int32 pos;
    afs_int32 want;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }

    db = ubik_dbase;
    DBHOLD(db);

    rc = (*db->stat) (db, file, &st);
    if (rc < 0) {
        DBRELE(db);
        return rc;
    }

    /* announce the size first, in network byte order */
    remaining = st.size;
    netLength = htonl(remaining);
    rc = rx_Write(rxcall, (char *)&netLength, sizeof(afs_int32));
    if (rc != sizeof(afs_int32)) {
        DBRELE(db);
        ubik_dprint("Rx-write length error=%d\n", rc);
        return BULK_ERROR;
    }

    /* then copy the file out one buffer at a time */
    for (pos = 0; remaining > 0; remaining -= want, pos += want) {
        if (remaining > sizeof(chunk)) {
            want = sizeof(chunk);
        } else {
            want = remaining;
        }

        rc = (*db->read) (db, file, chunk, pos, want);
        if (rc != want) {
            DBRELE(db);
            ubik_dprint("read failed error=%d\n", rc);
            return UIOERROR;
        }

        rc = rx_Write(rxcall, chunk, want);
        if (rc != want) {
            DBRELE(db);
            ubik_dprint("Rx-write length error=%d\n", rc);
            return BULK_ERROR;
        }
    }

    /* hand back the database label as well */
    rc = (*db->getlabel) (db, file, version);
    DBRELE(db);
    return rc;
}
/*!
 * \brief Write a vector of data
 *
 * Applies every entry of \p io_vector to the current write transaction
 * of database slot \p index.  The data for all entries is packed back
 * to back in \p io_buffer; entry i consumes iovec[i].length bytes
 * starting where entry i-1 ended.
 *
 * \returns the ubik_CheckAuth() error; USYNC if the slot has no current
 * transaction (before or after urecovery_CheckTid()); UBADTYPE if the
 * transaction is not a write transaction; ENOENT if the slot has no
 * database; UINTERNAL if an entry has a negative length or would read
 * past the end of \p io_buffer; otherwise the first non-zero
 * udisk_write() result, or 0 when every entry was written.
 */
afs_int32
SDISK_WriteV(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index,
             iovec_wrt *io_vector, iovec_buf *io_buffer)
{
    afs_int32 code, i, offset;
    struct ubik_dbase *dbase;
    struct ubik_iovec *iovec;
    char *iobuf;

    if ((code = ubik_CheckAuth(rxcall))) {
        return code;
    }
    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    /* sanity check to make sure only write trans appear here */
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        return UBADTYPE;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    dbase = ubik_currentTrans[index]->dbase;
    DBHOLD(dbase);
    urecovery_CheckTid(atid, index);
    if (!ubik_currentTrans[index]) {
        DBRELE(dbase);
        return USYNC;
    }

    iovec = (struct ubik_iovec *)io_vector->iovec_wrt_val;
    iobuf = (char *)io_buffer->iovec_buf_val;
    for (i = 0, offset = 0; i < io_vector->iovec_wrt_len; i++) {
        afs_int32 len = iovec[i].length;

        /*
         * Sanity check for going off end of buffer.  The lengths come
         * off the wire, so a negative one must be refused outright
         * (adding it to offset would make the sum look in-bounds), and
         * the comparison is done by subtraction so that offset + len
         * cannot overflow.  offset never exceeds iovec_buf_len here
         * because it only grows by lengths that passed this check.
         */
        if (len < 0
            || (unsigned int)len >
               (unsigned int)io_buffer->iovec_buf_len - (unsigned int)offset) {
            code = UINTERNAL;
        } else {
            code = udisk_write(ubik_currentTrans[index], iovec[i].file,
                               &iobuf[offset], iovec[i].position, len);
        }
        if (code)
            break;
        offset += len;
    }

    DBRELE(dbase);
    return code;
}
/*
 * Take a transaction lock of type `atype` on behalf of the current
 * write transaction of database slot `index`.
 *
 * afile and apos are not examined; alen must be exactly 1 or the call
 * fails with UBADLOCK.
 *
 * Returns the ubik_CheckAuth() error, USYNC (no current transaction, or
 * the transaction was replaced while waiting for the lock), ENOENT (no
 * database in the slot), UBADTYPE (not a write transaction), UBADLOCK,
 * or the ulock_getLock() result.
 */
afs_int32
SDISK_Lock(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index,
           afs_int32 afile, afs_int32 apos, afs_int32 alen, afs_int32 atype)
{
    struct ubik_trans *lockedTrans;
    struct ubik_dbase *db;
    afs_int32 rc;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }
    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }
    /* only write transactions are expected on this path */
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        return UBADTYPE;
    }
    if (alen != 1) {
        return UBADLOCK;
    }

    db = ubik_currentTrans[index]->dbase;
    DBHOLD(db);
    urecovery_CheckTid(atid, index);

    lockedTrans = ubik_currentTrans[index];
    if (!lockedTrans) {
        DBRELE(db);
        return USYNC;
    }

    rc = ulock_getLock(lockedTrans, atype, 1);

    /*
     * While we were waiting for the lock the transaction may have been
     * ended or aborted underneath us (urecovery_CheckTid).  If the slot
     * no longer points at the transaction we locked, finish it off here.
     */
    if (rc == 0 && ubik_currentTrans[index] != lockedTrans) {
        udisk_end(lockedTrans);
        rc = USYNC;
    }

    DBRELE(db);
    return rc;
}
afs_int32 SDISK_Abort(struct rx_call *rxcall, struct ubik_tid *atid) { afs_int32 code; if ((code = ubik_CheckAuth(rxcall))) { return code; } DBHOLD(ubik_dbase); if (!ubik_currentTrans) { code = USYNC; goto done; } /* sanity check to make sure only write trans appear here */ if (ubik_currentTrans->type != UBIK_WRITETRANS) { code = UBADTYPE; goto done; } urecovery_CheckTid(atid, 0); if (!ubik_currentTrans) { code = USYNC; goto done; } code = udisk_abort(ubik_currentTrans); /* If the thread is not waiting for lock - ok to end it */ if (ubik_currentTrans->locktype != LOCKWAIT) { udisk_end(ubik_currentTrans); } ubik_currentTrans = (struct ubik_trans *)0; done: DBRELE(ubik_dbase); return code; }
/*
 * Remote side of a truncate request against the current write
 * transaction.
 *
 * With the database lock held: returns USYNC when there is no current
 * transaction (before or after urecovery_CheckTid()), UBADTYPE when it
 * is not a write transaction, otherwise the result of
 * udisk_truncate(afile, alen).
 */
afs_int32
SDISK_Truncate(struct rx_call *rxcall, struct ubik_tid *atid,
               afs_int32 afile, afs_int32 alen)
{
    afs_int32 rc;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }

    DBHOLD(ubik_dbase);

    if (!ubik_currentTrans) {
        rc = USYNC;
    } else if (ubik_currentTrans->type != UBIK_WRITETRANS) {
        /* only write transactions are expected on this path */
        rc = UBADTYPE;
    } else {
        urecovery_CheckTid(atid, 0);
        if (ubik_currentTrans) {
            rc = udisk_truncate(ubik_currentTrans, afile, alen);
        } else {
            rc = USYNC;
        }
    }

    DBRELE(ubik_dbase);
    return rc;
}
/*! * \brief Set a transaction lock. * \param atype is #LOCKREAD or #LOCKWRITE. * \param await is TRUE if you want to wait for the lock instead of returning * #EWOULDBLOCK. * * \note The #DBHOLD lock must be held. */ extern int ulock_getLock(struct ubik_trans *atrans, int atype, int await) { struct ubik_dbase *dbase = atrans->dbase; if ((atype != LOCKREAD) && (atype != LOCKWRITE)) return EINVAL; if (atrans->flags & TRDONE) return UDONE; if (atype != LOCKREAD && (atrans->flags & TRREADWRITE)) { return EINVAL; } if (atrans->locktype != 0) { ubik_print("Ubik: Internal Error: attempted to take lock twice\n"); abort(); } /* *ubik_print("Ubik: DEBUG: Thread 0x%x request %s lock\n", lwp_cpptr, * ((atype == LOCKREAD) ? "READ" : "WRITE")); */ /* Check if the lock would would block */ if (!await && !(atrans->flags & TRREADWRITE)) { if (atype == LOCKREAD) { if (WouldReadBlock(&rwlock)) return EAGAIN; } else { if (WouldWriteBlock(&rwlock)) return EAGAIN; } } /* Create new lock record and add to spec'd transaction: * locktype. This field also tells us if the thread is * waiting for a lock: It will be equal to LOCKWAIT. */ atrans->locktype = LOCKWAIT; DBRELE(dbase); if (atrans->flags & TRREADWRITE) { /* noop; don't actually lock anything for TRREADWRITE */ } else if (atype == LOCKREAD) { ObtainReadLock(&rwlock); } else { ObtainWriteLock(&rwlock); } DBHOLD(dbase); atrans->locktype = atype; /* *ubik_print("Ubik: DEBUG: Thread 0x%x took %s lock\n", lwp_cpptr, * ((atype == LOCKREAD) ? "READ" : "WRITE")); */ return 0; }
/*
 * Commit the current remote write transaction of database slot `index`.
 *
 * Rejected up front with the ubik_CheckAuth() error, ENOENT (no
 * database in the slot), USYNC (no current transaction) or UBADTYPE
 * (not a write transaction).  Otherwise the database's cache_lock is
 * write-locked and the database lock held, urecovery_CheckTid() is run,
 * and either USYNC (transaction gone) or the udisk_commit() result is
 * returned.  After a successful commit ubik_dbVersion[index] is set to
 * the database's version.
 */
afs_int32
SDISK_Commit(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index)
{
    struct ubik_dbase *db;
    afs_int32 rc;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }
    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    /* only write transactions are expected on this path */
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        return UBADTYPE;
    }

    db = ubik_currentTrans[index]->dbase;

    ObtainWriteLock(&db->cache_lock);
    DBHOLD(db);

    urecovery_CheckTid(atid, index);
    if (ubik_currentTrans[index]) {
        rc = udisk_commit(ubik_currentTrans[index]);
        if (rc == 0) {
            /* sync site should now match */
            ubik_dbVersion[index] = ubik_dbase[index]->version;
        }
    } else {
        rc = USYNC;
    }

    DBRELE(db);
    ReleaseWriteLock(&db->cache_lock);
    return rc;
}
/*
 * Abort the current remote write transaction of database slot `index`.
 *
 * Rejected up front with the ubik_CheckAuth() error, USYNC (no current
 * transaction), UBADTYPE (not a write transaction) or ENOENT (no
 * database in the slot).  With the database lock held,
 * urecovery_CheckTid() is run; USYNC is returned if that removed the
 * transaction, otherwise the udisk_abort() result.  The slot's current
 * transaction pointer is cleared after the abort.
 *
 * Without UBIK_PAUSE the transaction is ended only when its locktype is
 * not LOCKWAIT; with UBIK_PAUSE it is always ended.
 */
afs_int32
SDISK_Abort(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index)
{
    struct ubik_dbase *db;
    afs_int32 rc;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }
    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    /* only write transactions are expected on this path */
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        return UBADTYPE;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    db = ubik_currentTrans[index]->dbase;
    DBHOLD(db);

    urecovery_CheckTid(atid, index);
    if (!ubik_currentTrans[index]) {
        DBRELE(db);
        return USYNC;
    }

    rc = udisk_abort(ubik_currentTrans[index]);

#if defined(UBIK_PAUSE)
    udisk_end(ubik_currentTrans[index]);
#else
    /* a transaction whose thread is waiting for a lock is not ended here */
    if (ubik_currentTrans[index]->locktype != LOCKWAIT) {
        udisk_end(ubik_currentTrans[index]);
    }
#endif
    ubik_currentTrans[index] = (struct ubik_trans *)0;

    DBRELE(db);
    return rc;
}
/*!
 * \brief Initialize the local ubik_dbase.
 *
 * With the database lock held, the log is replayed (ReplayLog) and, if
 * that succeeds, InitializeDB is run on the result.
 *
 * \returns the first non-zero code from ReplayLog() or InitializeDB(),
 * or 0 when both succeed.
 */
int
urecovery_Initialize(struct ubik_dbase *adbase)
{
    afs_int32 rc;

    DBHOLD(adbase);

    rc = ReplayLog(adbase);
    if (rc == 0) {
        rc = InitializeDB(adbase);
    }

    DBRELE(adbase);
    return rc;
}
afs_int32 SDISK_GetVersion(struct rx_call *rxcall, struct ubik_version *aversion) { afs_int32 code; if ((code = ubik_CheckAuth(rxcall))) { return code; } /* * If we are the sync site, recovery shouldn't be running on any * other site. We shouldn't be getting this RPC as long as we are * the sync site. To prevent any unforseen activity, we should * reject this RPC until we have recognized that we are not the * sync site anymore, and/or if we have any pending WRITE * transactions that have to complete. This way we can be assured * that this RPC would not block any pending transactions that * should either fail or pass. If we have recognized the fact that * we are not the sync site any more, all write transactions would * fail with UNOQUORUM anyway. */ DBHOLD(ubik_dbase); if (ubeacon_AmSyncSite()) { DBRELE(ubik_dbase); return UDEADLOCK; } code = (*ubik_dbase->getlabel) (ubik_dbase, 0, aversion); DBRELE(ubik_dbase); if (code) { /* tell other side there's no dbase */ aversion->epoch = 0; aversion->counter = 0; } return 0; }
afs_int32 SDISK_SetVersion(struct rx_call *rxcall, struct ubik_tid *atid, struct ubik_version *oldversionp, struct ubik_version *newversionp) { afs_int32 code = 0; if ((code = ubik_CheckAuth(rxcall))) { return (code); } DBHOLD(ubik_dbase); if (!ubik_currentTrans) { code = USYNC; goto done; } /* sanity check to make sure only write trans appear here */ if (ubik_currentTrans->type != UBIK_WRITETRANS) { code = UBADTYPE; goto done; } /* Should not get this for the sync site */ if (ubeacon_AmSyncSite()) { code = UDEADLOCK; goto done; } urecovery_CheckTid(atid, 0); if (!ubik_currentTrans) { code = USYNC; goto done; } /* Set the label if its version matches the sync-site's */ if (uvote_eq_dbVersion(*oldversionp)) { UBIK_VERSION_LOCK; code = (*ubik_dbase->setlabel) (ubik_dbase, 0, newversionp); if (!code) { ubik_dbase->version = *newversionp; uvote_set_dbVersion(*newversionp); } UBIK_VERSION_UNLOCK; } else { code = USYNC; } done: DBRELE(ubik_dbase); return code; }
/* the rest of these guys handle remote execution of write * transactions: this is the code executed on the other servers when a * sync site is executing a write transaction. */ afs_int32 SDISK_Begin(struct rx_call *rxcall, struct ubik_tid *atid) { afs_int32 code; if ((code = ubik_CheckAuth(rxcall))) { return code; } DBHOLD(ubik_dbase); urecovery_CheckTid(atid, 1); code = udisk_begin(ubik_dbase, UBIK_WRITETRANS, &ubik_currentTrans); if (!code && ubik_currentTrans) { /* label this trans with the right trans id */ ubik_currentTrans->tid.epoch = atid->epoch; ubik_currentTrans->tid.counter = atid->counter; } DBRELE(ubik_dbase); return code; }
afs_int32 SDISK_Commit(struct rx_call *rxcall, struct ubik_tid *atid) { afs_int32 code; if ((code = ubik_CheckAuth(rxcall))) { return code; } ObtainWriteLock(&ubik_dbase->cache_lock); DBHOLD(ubik_dbase); if (!ubik_currentTrans) { code = USYNC; goto done; } /* * sanity check to make sure only write trans appear here */ if (ubik_currentTrans->type != UBIK_WRITETRANS) { code = UBADTYPE; goto done; } urecovery_CheckTid(atid, 0); if (!ubik_currentTrans) { code = USYNC; goto done; } code = udisk_commit(ubik_currentTrans); if (code == 0) { /* sync site should now match */ uvote_set_dbVersion(ubik_dbase->version); } done: DBRELE(ubik_dbase); ReleaseWriteLock(&ubik_dbase->cache_lock); return code; }
/*
 * This and the following SDISK_ routines are the remote half of a write
 * transaction: they run on the other servers while the sync site is
 * executing the transaction.
 *
 * SDISK_Begin starts a new write transaction on database slot `index`
 * and stamps it with the caller's transaction id.
 *
 * Returns the ubik_CheckAuth() error, ENOENT (no database in the slot),
 * UNOQUORUM (urecovery_AllBetter() reported 0), or the udisk_begin()
 * result.  A transaction still present in the slot after
 * urecovery_CheckTid() is discarded first; without UBIK_PAUSE it is
 * ended only when its locktype is not LOCKWAIT, with UBIK_PAUSE it is
 * always ended.
 */
afs_int32
SDISK_Begin(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index)
{
    afs_int32 rc;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    DBHOLD(ubik_dbase[index]);

    if (urecovery_AllBetter(ubik_dbase[index], 0) == 0) {
        rc = UNOQUORUM;
    } else {
        urecovery_CheckTid(atid, index);

        if (ubik_currentTrans[index]) {
#if defined(UBIK_PAUSE)
            udisk_end(ubik_currentTrans[index]);
#else
            /* a transaction whose thread is waiting for a lock is not ended here */
            if (ubik_currentTrans[index]->locktype != LOCKWAIT) {
                udisk_end(ubik_currentTrans[index]);
            }
#endif
            ubik_currentTrans[index] = (struct ubik_trans *)0;
        }

        rc = udisk_begin(ubik_dbase[index], UBIK_WRITETRANS,
                         &ubik_currentTrans[index]);
        if (rc == 0 && ubik_currentTrans[index]) {
            /* label this trans with the right trans id */
            ubik_currentTrans[index]->tid.epoch = atid->epoch;
            ubik_currentTrans[index]->tid.counter = atid->counter;
        }
    }

    DBRELE(ubik_dbase[index]);
    return rc;
}
/*!
 * \brief Handle a vote beacon; a null \p rxcall means the beacon is a
 * local call from this host.
 *
 * \param rxcall call the beacon arrived on, or null for a local call.
 * \param astate non-zero when the sender claims to be the sync site.
 * \param astart recorded as lastYesClaim when we vote yes.
 * \param avers  database version recorded when we vote yes.
 * \param atid   transaction id recorded when we vote yes and passed to
 *               urecovery_CheckTid().
 *
 * \returns 0 if we are not voting for the sender, otherwise the time at
 * which we voted yes.
 */
afs_int32
SVOTE_Beacon(struct rx_call * rxcall, afs_int32 astate,
             afs_int32 astart, struct ubik_version * avers,
             struct ubik_tid * atid)
{
    struct rx_connection *conn;
    struct rx_peer *peer;
    struct ubik_server *srv;
    afs_int32 sender;
    afs_int32 stamp;
    afs_int32 ballot;
    int senderIsClone = 0;
    int refuse = 0;
    int promiseExpired;
    char hoststr[16];

    if (!rxcall) {
        /* local call: the beacon comes from this host */
        sender = ubik_host[0];
        senderIsClone = amIClone;
    } else {
        /* work out who is calling */
        conn = rx_ConnectionOf(rxcall);
        peer = rx_PeerOf(conn);
        sender = rx_HostOf(peer);

        /* ubik identifies a host by its primary interface address */
        sender = ubikGetPrimaryInterfaceAddr(sender);
        if (!sender) {
            ubik_dprint("Received beacon from unknown host %s\n",
                        afs_inet_ntoa_r(rx_HostOf(peer), hoststr));
            return 0;           /* unknown caller: vote no */
        }

        srv = ubik_servers;
        while (srv && srv->addr[0] != sender) {
            srv = srv->next;
        }
        if (!srv) {
            ubik_dprint("Unknown host %x has sent a beacon\n", sender);
        } else if (srv->isClone) {
            senderIsClone = 1;
        }
    }

    ubik_dprint("Received beacon type %d from host %s\n", astate,
                afs_inet_ntoa_r(sender, hoststr));

    /*
     * Maintain lowestHost, the lowest-addressed server we have heard
     * from.  We prefer to vote for it when no sync site exists yet.  An
     * entry older than BIGTIME seconds is replaced, because that host
     * may have crashed.  The address test is <= rather than < so that
     * repeated beacons from lowestHost keep refreshing lowestTime;
     * otherwise it would look stale and another host could slip in.
     * Clones are never recorded, since they can never become sync site.
     */
    UBIK_VOTE_LOCK;
    stamp = FT_ApproxTime();    /* close to current time */

    if (!senderIsClone
        && (ntohl((afs_uint32)sender) <= ntohl((afs_uint32)vote_globals.lowestHost)
            || vote_globals.lowestTime + BIGTIME < stamp)) {
        vote_globals.lowestTime = stamp;
        vote_globals.lowestHost = sender;
    }

    /*
     * Apply the same rule to ourselves: we know we are up, so nobody
     * with a higher address than ours may stay in lowestHost.  This
     * also keeps two servers from each deciding the other is lowest and
     * both falling silent.
     */
    if (!amIClone
        && (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32)vote_globals.lowestHost)
            || vote_globals.lowestTime + BIGTIME < stamp)) {
        vote_globals.lowestTime = stamp;
        vote_globals.lowestHost = ubik_host[0];
    }

    /*
     * Track whether we have heard from a sync site recently, whether or
     * not we are voting for it, and time it out after BIGTIME seconds.
     */
    if (astate) {
        vote_globals.syncHost = sender;
        vote_globals.syncTime = stamp;
    } else if (vote_globals.syncTime + BIGTIME < stamp) {
        if (vote_globals.syncHost) {
            ubik_dprint
                ("Ubik: Lost contact with sync-site %s (NOT in quorum)\n",
                 afs_inet_ntoa_r(vote_globals.syncHost, hoststr));
        }
        vote_globals.syncHost = 0;
    }

    /*
     * Decide how to vote.  A sender that does not claim to be sync site
     * is not owed a vote, so some heuristics are applied to avoid
     * oscillation in the voting procedure.
     */
    if (astate == 0) {
        if (ntohl(vote_globals.lowestHost) != ntohl(sender)) {
            /* lowestHost is also after our vote: say no */
            refuse = 1;
        } else if (vote_globals.syncHost && vote_globals.syncHost != sender) {
            /* someone else *is* a sync site: say no */
            refuse = 1;
        }
    } else if (vote_globals.lastYesHost == 0xffffffff
               && sender == ubik_host[0]) {
        /* fast startup if this is the only non-clone */
        int others = 0;

        for (srv = ubik_servers; srv; srv = srv->next) {
            if (srv->addr[0] != sender && !srv->isClone) {
                others++;
            }
        }
        if (others == 0) {
            vote_globals.lastYesHost = sender;
        }
    }

    /* a clone can never become sync site */
    if (senderIsClone) {
        refuse = 1;
    }

    if (refuse) {
        UBIK_VOTE_UNLOCK;
        return 0;
    }

    /*
     * Never promise sync-site support to more than one host within
     * BIGTIME seconds.  This is the heart of the system's invariants.
     */
    promiseExpired = (vote_globals.ubik_lastYesTime + BIGTIME < stamp);
    if (!promiseExpired && sender != vote_globals.lastYesHost) {
        UBIK_VOTE_UNLOCK;
        return 0;
    }

    if (promiseExpired
        || (sender != vote_globals.lastYesHost)
        || (vote_globals.lastYesState != astate)) {
        /* a new vote, a change in the vote, or changed quorum */
        ubik_dprint("Ubik: vote 'yes' for %s %s\n",
                    afs_inet_ntoa_r(sender, hoststr),
                    (astate ? "(in quorum)" : "(NOT in quorum)"));
    }

    ballot = stamp;                             /* vote yes */
    vote_globals.ubik_lastYesTime = stamp;      /* when we voted yes */
    vote_globals.lastYesClaim = astart;         /* for computing when sync site expires */
    vote_globals.lastYesHost = sender;          /* and who for */
    vote_globals.lastYesState = astate;         /* whether the site is a sync site */
    vote_globals.ubik_dbVersion = *avers;       /* resync value */
    vote_globals.ubik_dbTid = *atid;            /* transaction id, if any, of active trans */
    UBIK_VOTE_UNLOCK;

    /* check whether the current write transaction needs to be aborted */
    DBHOLD(ubik_dbase);
    urecovery_CheckTid(atid, 0);
    DBRELE(ubik_dbase);

    return ballot;
}
/*
 * Receive a complete copy of database file `file` from the caller.
 *
 * `length` bytes are read from rxcall into a temporary "<path>.DB*.TMP"
 * file, which is then renamed over the real database file and labelled
 * with *avers.  While the transfer is in progress the database carries
 * an invalid (epoch 0) label.
 *
 * Returns 0 on success.  On failure returns the ubik_CheckAuth() error,
 * USYNC when we know the sync site and the caller is not it, errno from
 * a failed open(), UIOERROR when positioning or writing the temporary
 * file fails, BULK_ERROR when rx_Read() comes up short, or the result
 * of a failed close()/unlink()/rename()/setlabel.
 *
 * The database lock is held from the point the caller has been checked
 * until return (it is also taken briefly on the early failure paths so
 * that the common exit code can release it).
 */
afs_int32
SDISK_SendFile(struct rx_call *rxcall, afs_int32 file,
               afs_int32 length, struct ubik_version *avers)
{
    afs_int32 code;
    struct ubik_dbase *dbase = NULL;
    char tbuffer[1024];
    afs_int32 offset;
    struct ubik_version tversion;
    int tlen;
    struct rx_peer *tpeer;
    struct rx_connection *tconn;
    afs_uint32 otherHost = 0;
    char hoststr[16];
    char pbuffer[1028];
    int fd = -1;
    afs_int32 epoch = 0;
    afs_int32 pass;

    dbase = ubik_dbase;

    /*
     * The common failure path unlinks pbuffer and writes tversion as a
     * label, and it can be reached before either has been filled in.
     * Start both in a defined state: an empty path means "no temporary
     * file to remove", and tversion is zeroed so no indeterminate
     * counter is ever written to disk.
     */
    pbuffer[0] = '\0';
    memset(&tversion, 0, sizeof(tversion));

    if ((code = ubik_CheckAuth(rxcall))) {
        DBHOLD(dbase);
        goto failed;
    }

    /*
     * Sanity check that the host sending us the database is the host we
     * think is the sync site.  We may not have decided on a sync site
     * yet while the sender already has enough votes from others, and it
     * may send us the database ahead of getting our vote; that is fine.
     * What we want to catch is an authenticated host from a
     * misconfigured cell pushing an unrelated database at us, so we
     * only object when we are sure who the sync site is and the caller
     * is somebody else.
     */
    offset = uvote_GetSyncSite();
    tconn = rx_ConnectionOf(rxcall);
    tpeer = rx_PeerOf(tconn);
    otherHost = ubikGetPrimaryInterfaceAddr(rx_HostOf(tpeer));
    if (offset && offset != otherHost) {
        /* we *know* this is the wrong guy */
        code = USYNC;
        DBHOLD(dbase);
        goto failed;
    }

    DBHOLD(dbase);

    /* abort any active trans that may scribble over the database */
    urecovery_AbortAll(dbase);

    ubik_print("Ubik: Synchronize database with server %s\n",
               afs_inet_ntoa_r(otherHost, hoststr));

    offset = 0;
    UBIK_VERSION_LOCK;
    epoch = tversion.epoch = 0;         /* start off by labelling in-transit db as invalid */
    (*dbase->setlabel) (dbase, file, &tversion);        /* setlabel does sync */
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600);
    if (fd < 0) {
        code = errno;
        goto failed_locked;
    }
    code = lseek(fd, HDRSIZE, 0);
    if (code != HDRSIZE) {
        close(fd);
        /*
         * Report a definite error: the raw lseek() result could be 0,
         * which the exit path would treat as success.
         */
        code = UIOERROR;
        goto failed_locked;
    }
    pass = 0;
    memcpy(&ubik_dbase->version, &tversion, sizeof(struct ubik_version));
    UBIK_VERSION_UNLOCK;

    /* copy the incoming stream into the temporary file */
    while (length > 0) {
        tlen = (length > sizeof(tbuffer) ? sizeof(tbuffer) : length);
#if !defined(AFS_PTHREAD_ENV)
        if (pass % 4 == 0)
            IOMGR_Poll();
#endif
        code = rx_Read(rxcall, tbuffer, tlen);
        if (code != tlen) {
            ubik_dprint("Rx-read length error=%d\n", code);
            code = BULK_ERROR;
            close(fd);
            goto failed;
        }
        code = write(fd, tbuffer, tlen);
        pass++;
        if (code != tlen) {
            ubik_dprint("write failed error=%d\n", code);
            code = UIOERROR;
            close(fd);
            goto failed;
        }
        offset += tlen;
        length -= tlen;
    }
    code = close(fd);
    if (code)
        goto failed;

    /*
     * Sync data first, then write label and resync (resync done by
     * setlabel call).  This way, good label is only on good database.
     */
    snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
#ifdef AFS_NT40_ENV
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    code = unlink(pbuffer);
    if (!code)
        code = rename(tbuffer, pbuffer);
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
#endif
    if (!code)
        code = rename(pbuffer, tbuffer);
    UBIK_VERSION_LOCK;
    if (!code) {
        (*ubik_dbase->open) (ubik_dbase, file);
        code = (*ubik_dbase->setlabel) (dbase, file, avers);
    }
#ifdef AFS_NT40_ENV
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    unlink(pbuffer);
#endif
    memcpy(&ubik_dbase->version, avers, sizeof(struct ubik_version));
    udisk_Invalidate(dbase, file);      /* new dbase, flush disk buffers */
#ifdef AFS_PTHREAD_ENV
    {
        /*
         * Keep the broadcast outside assert(): an assert() argument is
         * not evaluated when NDEBUG is defined, which would silently
         * drop the wakeup.
         */
        int bcode = pthread_cond_broadcast(&dbase->version_cond);

        assert(bcode == 0);
        (void)bcode;
    }
#else
    LWP_NoYieldSignal(&dbase->version);
#endif

  failed_locked:
    UBIK_VERSION_UNLOCK;

  failed:
    if (code) {
        /* only remove a temporary file whose name was actually built */
        if (pbuffer[0] != '\0')
            unlink(pbuffer);

        /*
         * Failed to sync. Allow reads again for now.
         * NOTE(review): this relabel also runs on the early-exit paths
         * (auth failure, wrong sync site), where the label was never
         * invalidated by this call -- confirm that is intended.
         */
        if (dbase != NULL) {
            UBIK_VERSION_LOCK;
            tversion.epoch = epoch;
            (*dbase->setlabel) (dbase, file, &tversion);
            UBIK_VERSION_UNLOCK;
        }
        ubik_print
            ("Ubik: Synchronize database with server %s failed (error = %d)\n",
             afs_inet_ntoa_r(otherHost, hoststr), code);
    } else {
        ubik_print("Ubik: Synchronize database completed\n");
    }
    DBRELE(dbase);
    return code;
}
/*
 * Stream file `file` of the database in slot `index` to the caller over
 * rxcall.
 *
 * The wire format produced here is a 32-bit length in network byte
 * order followed by that many bytes of file data, sent in pieces of at
 * most 256 bytes.  After the data has been sent the file's label is
 * fetched into *version and the getlabel result is returned.
 *
 * Returns the ubik_CheckAuth() error, ENOENT when the slot has no
 * database, a negative stat result, BULK_ERROR when an rx_Write() comes
 * up short, or UIOERROR when a database read comes up short.  The
 * database lock is held for the whole transfer.
 */
afs_int32
SDISK_GetFile(struct rx_call *rxcall, afs_int32 index, afs_int32 file,
              struct ubik_version *version)
{
    struct ubik_dbase *db;
    struct ubik_stat st;
    char chunk[256];
    afs_int32 rc;
    afs_int32 netLength;
    afs_int32 remaining;
    afs_int32 pos;
    afs_int32 want;

    rc = ubik_CheckAuth(rxcall);
    if (rc) {
        return rc;
    }

    /*
     * A "refuse with UDEADLOCK when we are the sync site" sanity check
     * used to live here; it is disabled because it caused problems for
     * the migration tool.
     */

    if (!ubik_dbase[index]) {
        return ENOENT;
    }
    db = ubik_dbase[index];
    DBHOLD(db);

    rc = (*db->stat) (db, file, &st);
    if (rc < 0) {
        DBRELE(db);
        return rc;
    }

    /* announce the size first, in network byte order */
    remaining = st.size;
    netLength = htonl(remaining);
    rc = rx_Write(rxcall, (char *)&netLength, sizeof(afs_int32));
    if (rc != sizeof(afs_int32)) {
        DBRELE(db);
        ubik_dprint("Rx-write length error=%d\n", rc);
        return BULK_ERROR;
    }

    /* then copy the file out one buffer at a time */
    for (pos = 0; remaining > 0; remaining -= want, pos += want) {
        if (remaining > sizeof(chunk)) {
            want = sizeof(chunk);
        } else {
            want = remaining;
        }

        rc = (*db->read) (db, file, chunk, pos, want);
        if (rc != want) {
            DBRELE(db);
            ubik_dprint("read failed error=%d\n", rc);
            return UIOERROR;
        }

        rc = rx_Write(rxcall, chunk, want);
        if (rc != want) {
            DBRELE(db);
            ubik_dprint("Rx-write length error=%d\n", rc);
            return BULK_ERROR;
        }
    }

    /* hand back the database label as well */
    rc = (*db->getlabel) (db, file, version);
    DBRELE(db);
    return rc;
}
/*! * \brief Main interaction loop for the recovery manager * * The recovery light-weight process only runs when you're the * synchronization site. It performs the following tasks, if and only * if the prerequisite tasks have been performed successfully (it * keeps track of which ones have been performed in its bit map, * \p urecovery_state). * * First, it is responsible for probing that all servers are up. This * is the only operation that must be performed even if this is not * yet the sync site, since otherwise this site may not notice that * enough other machines are running to even elect this guy to be the * sync site. * * After that, the recovery process does nothing until the beacon and * voting modules manage to get this site elected sync site. * * After becoming sync site, recovery first attempts to find the best * database available in the network (it must do this in order to * ensure finding the latest committed data). After finding the right * database, it must fetch this dbase to the sync site. * * After fetching the dbase, it relabels it with a new version number, * to ensure that everyone recognizes this dbase as the most recent * dbase. * * Once the dbase has been relabelled, this machine can start handling * requests. However, the recovery module still has one more task: * propagating the dbase out to everyone who is up in the network. 
*/ void * urecovery_Interact(void *dummy) { afs_int32 code, tcode; struct ubik_server *bestServer = NULL; struct ubik_server *ts; int dbok, doingRPC, now; afs_int32 lastProbeTime; /* if we're the sync site, the best db version we've found yet */ static struct ubik_version bestDBVersion; struct ubik_version tversion; struct timeval tv; int length, tlen, offset, file, nbytes; struct rx_call *rxcall; char tbuffer[1024]; struct ubik_stat ubikstat; struct in_addr inAddr; char hoststr[16]; char pbuffer[1028]; int fd = -1; afs_int32 pass; afs_pthread_setname_self("recovery"); /* otherwise, begin interaction */ urecovery_state = 0; lastProbeTime = 0; while (1) { /* Run through this loop every 4 seconds */ tv.tv_sec = 4; tv.tv_usec = 0; #ifdef AFS_PTHREAD_ENV select(0, 0, 0, 0, &tv); #else IOMGR_Select(0, 0, 0, 0, &tv); #endif ubik_dprint("recovery running in state %x\n", urecovery_state); /* Every 30 seconds, check all the down servers and mark them * as up if they respond. When a server comes up or found to * not be current, then re-find the the best database and * propogate it. */ if ((now = FT_ApproxTime()) > 30 + lastProbeTime) { for (ts = ubik_servers, doingRPC = 0; ts; ts = ts->next) { UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; doingRPC = 1; code = DoProbe(ts); if (code == 0) { UBIK_BEACON_LOCK; ts->up = 1; UBIK_BEACON_UNLOCK; DBHOLD(ubik_dbase); urecovery_state &= ~UBIK_RECFOUNDDB; DBRELE(ubik_dbase); } } else { UBIK_BEACON_UNLOCK; DBHOLD(ubik_dbase); if (!ts->currentDB) urecovery_state &= ~UBIK_RECFOUNDDB; DBRELE(ubik_dbase); } } if (doingRPC) now = FT_ApproxTime(); lastProbeTime = now; } /* Mark whether we are the sync site */ DBHOLD(ubik_dbase); if (!ubeacon_AmSyncSite()) { urecovery_state &= ~UBIK_RECSYNCSITE; DBRELE(ubik_dbase); continue; /* nothing to do */ } urecovery_state |= UBIK_RECSYNCSITE; /* If a server has just come up or if we have not found the * most current database, then go find the most current db. 
*/ if (!(urecovery_state & UBIK_RECFOUNDDB)) { DBRELE(ubik_dbase); bestServer = (struct ubik_server *)0; bestDBVersion.epoch = 0; bestDBVersion.counter = 0; for (ts = ubik_servers; ts; ts = ts->next) { UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; continue; /* don't bother with these guys */ } UBIK_BEACON_UNLOCK; if (ts->isClone) continue; UBIK_ADDR_LOCK; code = DISK_GetVersion(ts->disk_rxcid, &ts->version); UBIK_ADDR_UNLOCK; if (code == 0) { /* perhaps this is the best version */ if (vcmp(ts->version, bestDBVersion) > 0) { /* new best version */ bestDBVersion = ts->version; bestServer = ts; } } } /* take into consideration our version. Remember if we, * the sync site, have the best version. Also note that * we may need to send the best version out. */ DBHOLD(ubik_dbase); if (vcmp(ubik_dbase->version, bestDBVersion) >= 0) { bestDBVersion = ubik_dbase->version; bestServer = (struct ubik_server *)0; urecovery_state |= UBIK_RECHAVEDB; } else { /* Clear the flag only when we know we have to retrieve * the db. Because urecovery_AllBetter() looks at it. */ urecovery_state &= ~UBIK_RECHAVEDB; } urecovery_state |= UBIK_RECFOUNDDB; urecovery_state &= ~UBIK_RECSENTDB; } if (!(urecovery_state & UBIK_RECFOUNDDB)) { DBRELE(ubik_dbase); continue; /* not ready */ } /* If we, the sync site, do not have the best db version, then * go and get it from the server that does. */ if ((urecovery_state & UBIK_RECHAVEDB) || !bestServer) { urecovery_state |= UBIK_RECHAVEDB; } else { /* we don't have the best version; we should fetch it. 
*/ urecovery_AbortAll(ubik_dbase); /* Rx code to do the Bulk fetch */ file = 0; offset = 0; UBIK_ADDR_LOCK; rxcall = rx_NewCall(bestServer->disk_rxcid); ubik_print("Ubik: Synchronize database with server %s\n", afs_inet_ntoa_r(bestServer->addr[0], hoststr)); UBIK_ADDR_UNLOCK; code = StartDISK_GetFile(rxcall, file); if (code) { ubik_dprint("StartDiskGetFile failed=%d\n", code); goto FetchEndCall; } nbytes = rx_Read(rxcall, (char *)&length, sizeof(afs_int32)); length = ntohl(length); if (nbytes != sizeof(afs_int32)) { ubik_dprint("Rx-read length error=%d\n", code = BULK_ERROR); code = EIO; goto FetchEndCall; } /* give invalid label during file transit */ UBIK_VERSION_LOCK; tversion.epoch = 0; code = (*ubik_dbase->setlabel) (ubik_dbase, file, &tversion); UBIK_VERSION_UNLOCK; if (code) { ubik_dprint("setlabel io error=%d\n", code); goto FetchEndCall; } snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600); if (fd < 0) { code = errno; goto FetchEndCall; } code = lseek(fd, HDRSIZE, 0); if (code != HDRSIZE) { close(fd); goto FetchEndCall; } pass = 0; while (length > 0) { tlen = (length > sizeof(tbuffer) ? 
sizeof(tbuffer) : length); #ifndef AFS_PTHREAD_ENV if (pass % 4 == 0) IOMGR_Poll(); #endif nbytes = rx_Read(rxcall, tbuffer, tlen); if (nbytes != tlen) { ubik_dprint("Rx-read bulk error=%d\n", code = BULK_ERROR); code = EIO; close(fd); goto FetchEndCall; } nbytes = write(fd, tbuffer, tlen); pass++; if (nbytes != tlen) { code = UIOERROR; close(fd); goto FetchEndCall; } offset += tlen; length -= tlen; } code = close(fd); if (code) goto FetchEndCall; code = EndDISK_GetFile(rxcall, &tversion); FetchEndCall: tcode = rx_EndCall(rxcall, code); if (!code) code = tcode; if (!code) { /* we got a new file, set up its header */ urecovery_state |= UBIK_RECHAVEDB; UBIK_VERSION_LOCK; memcpy(&ubik_dbase->version, &tversion, sizeof(struct ubik_version)); snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); #ifdef AFS_NT40_ENV snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); code = unlink(pbuffer); if (!code) code = rename(tbuffer, pbuffer); snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); #endif if (!code) code = rename(pbuffer, tbuffer); if (!code) { (*ubik_dbase->open) (ubik_dbase, file); /* after data is good, sync disk with correct label */ code = (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version); } UBIK_VERSION_UNLOCK; #ifdef AFS_NT40_ENV snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); unlink(pbuffer); #endif } if (code) { unlink(pbuffer); /* * We will effectively invalidate the old data forever now. * Unclear if we *should* but we do. 
*/ UBIK_VERSION_LOCK; ubik_dbase->version.epoch = 0; ubik_dbase->version.counter = 0; UBIK_VERSION_UNLOCK; ubik_print("Ubik: Synchronize database failed (error = %d)\n", code); } else { ubik_print("Ubik: Synchronize database completed\n"); urecovery_state |= UBIK_RECHAVEDB; } udisk_Invalidate(ubik_dbase, 0); /* data has changed */ #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&ubik_dbase->version_cond); #else LWP_NoYieldSignal(&ubik_dbase->version); #endif } if (!(urecovery_state & UBIK_RECHAVEDB)) { DBRELE(ubik_dbase); continue; /* not ready */ } /* If the database was newly initialized, then when we establish quorum, write * a new label. This allows urecovery_AllBetter() to allow access for reads. * Setting it to 2 also allows another site to come along with a newer * database and overwrite this one. */ if (ubik_dbase->version.epoch == 1) { urecovery_AbortAll(ubik_dbase); UBIK_VERSION_LOCK; version_globals.ubik_epochTime = 2; ubik_dbase->version.epoch = version_globals.ubik_epochTime; ubik_dbase->version.counter = 1; code = (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version); UBIK_VERSION_UNLOCK; udisk_Invalidate(ubik_dbase, 0); /* data may have changed */ #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&ubik_dbase->version_cond); #else LWP_NoYieldSignal(&ubik_dbase->version); #endif } /* Check the other sites and send the database to them if they * do not have the current db. */ if (!(urecovery_state & UBIK_RECSENTDB)) { /* now propagate out new version to everyone else */ dbok = 1; /* start off assuming they all worked */ /* * Check if a write transaction is in progress. We can't send the * db when a write is in progress here because the db would be * obsolete as soon as it goes there. Also, ops after the begin * trans would reach the recepient and wouldn't find a transaction * pending there. Frankly, I don't think it's possible to get past * the write-lock above if there is a write transaction in progress, * but then, it won't hurt to check, will it? 
*/ if (ubik_dbase->flags & DBWRITING) { struct timeval tv; int safety = 0; long cur_usec = 50000; while ((ubik_dbase->flags & DBWRITING) && (safety < 500)) { DBRELE(ubik_dbase); /* sleep for a little while */ tv.tv_sec = 0; tv.tv_usec = cur_usec; #ifdef AFS_PTHREAD_ENV select(0, 0, 0, 0, &tv); #else IOMGR_Select(0, 0, 0, 0, &tv); #endif cur_usec += 10000; safety++; DBHOLD(ubik_dbase); } } for (ts = ubik_servers; ts; ts = ts->next) { UBIK_ADDR_LOCK; inAddr.s_addr = ts->addr[0]; UBIK_ADDR_UNLOCK; UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; ubik_dprint("recovery cannot send version to %s\n", afs_inet_ntoa_r(inAddr.s_addr, hoststr)); dbok = 0; continue; } UBIK_BEACON_UNLOCK; ubik_dprint("recovery sending version to %s\n", afs_inet_ntoa_r(inAddr.s_addr, hoststr)); if (vcmp(ts->version, ubik_dbase->version) != 0) { ubik_dprint("recovery stating local database\n"); /* Rx code to do the Bulk Store */ code = (*ubik_dbase->stat) (ubik_dbase, 0, &ubikstat); if (!code) { length = ubikstat.size; file = offset = 0; UBIK_ADDR_LOCK; rxcall = rx_NewCall(ts->disk_rxcid); UBIK_ADDR_UNLOCK; code = StartDISK_SendFile(rxcall, file, length, &ubik_dbase->version); if (code) { ubik_dprint("StartDiskSendFile failed=%d\n", code); goto StoreEndCall; } while (length > 0) { tlen = (length > sizeof(tbuffer) ? 
sizeof(tbuffer) : length); nbytes = (*ubik_dbase->read) (ubik_dbase, file, tbuffer, offset, tlen); if (nbytes != tlen) { ubik_dprint("Local disk read error=%d\n", code = UIOERROR); goto StoreEndCall; } nbytes = rx_Write(rxcall, tbuffer, tlen); if (nbytes != tlen) { ubik_dprint("Rx-write bulk error=%d\n", code = BULK_ERROR); goto StoreEndCall; } offset += tlen; length -= tlen; } code = EndDISK_SendFile(rxcall); StoreEndCall: code = rx_EndCall(rxcall, code); } if (code == 0) { /* we set a new file, process its header */ ts->version = ubik_dbase->version; ts->currentDB = 1; } else dbok = 0; } else { /* mark file up to date */ ts->currentDB = 1; } } if (dbok) urecovery_state |= UBIK_RECSENTDB; } DBRELE(ubik_dbase); } return NULL; }
/*! * \brief Set a transaction lock. * \param atype is #LOCKREAD or #LOCKWRITE. * \param await is TRUE if you want to wait for the lock instead of returning * #EWOULDBLOCK. * * \note The #DBHOLD lock must be held. */ extern int ulock_getLock(struct ubik_trans *atrans, int atype, int await) { struct ubik_dbase *dbase = atrans->dbase; /* On first pass, initialize the lock */ if (rwlockinit) { Lock_Init(&rwlock); rwlockinit = 0; } if ((atype != LOCKREAD) && (atype != LOCKWRITE)) return EINVAL; if (atrans->flags & TRDONE) return UDONE; if (atype != LOCKREAD && (atrans->flags & TRREADWRITE)) { return EINVAL; } if (atrans->locktype != 0) { ubik_print("Ubik: Internal Error: attempted to take lock twice\n"); abort(); } /* *ubik_print("Ubik: DEBUG: Thread 0x%x request %s lock\n", lwp_cpptr, * ((atype == LOCKREAD) ? "READ" : "WRITE")); */ /* Check if the lock would would block */ if (!await && !(atrans->flags & TRREADWRITE)) { if (atype == LOCKREAD) { if (WouldReadBlock(&rwlock)) return EAGAIN; } else { if (WouldWriteBlock(&rwlock)) return EAGAIN; } } /* Create new lock record and add to spec'd transaction: * #if defined(UBIK_PAUSE) * * locktype. Before doing that, set TRSETLOCK, * * to tell udisk_end that another thread (us) is waiting. * #else * * locktype. This field also tells us if the thread is * * waiting for a lock: It will be equal to LOCKWAIT. 
* #endif */ #if defined(UBIK_PAUSE) if (atrans->flags & TRSETLOCK) { printf("Ubik: Internal Error: TRSETLOCK already set?\n"); return EBUSY; } atrans->flags |= TRSETLOCK; #else atrans->locktype = LOCKWAIT; #endif /* UBIK_PAUSE */ DBRELE(dbase); if (atrans->flags & TRREADWRITE) { /* noop; don't actually lock anything for TRREADWRITE */ } else if (atype == LOCKREAD) { ObtainReadLock(&rwlock); } else { ObtainWriteLock(&rwlock); } DBHOLD(dbase); atrans->locktype = atype; #if defined(UBIK_PAUSE) atrans->flags &= ~TRSETLOCK; #if 0 /* We don't do this here, because this can only happen in SDISK_Lock, * and there's already code there to catch this condition. */ if (atrans->flags & TRSTALE) { udisk_end(atrans); return UINTERNAL; } #endif #endif /* UBIK_PAUSE */ /* *ubik_print("Ubik: DEBUG: Thread 0x%x took %s lock\n", lwp_cpptr, * ((atype == LOCKREAD) ? "READ" : "WRITE")); */ return 0; }