/*!
 * \brief Truncate a file within the current remote write transaction.
 *
 * \param rxcall  incoming call, passed to ubik_CheckAuth
 * \param atid    transaction id handed to urecovery_CheckTid
 * \param afile   file number passed to udisk_truncate
 * \param alen    length passed to udisk_truncate
 *
 * \returns the ubik_CheckAuth error if non-zero, USYNC if there is no
 *          current transaction (before or after urecovery_CheckTid),
 *          UBADTYPE if it is not a write transaction, otherwise the
 *          result of udisk_truncate.
 */
afs_int32
SDISK_Truncate(struct rx_call *rxcall, struct ubik_tid *atid,
               afs_int32 afile, afs_int32 alen)
{
    afs_int32 code;

    code = ubik_CheckAuth(rxcall);
    if (code) {
        return code;
    }

    DBHOLD(ubik_dbase);
    if (!ubik_currentTrans) {
        code = USYNC;
    } else if (ubik_currentTrans->type != UBIK_WRITETRANS) {
        /* only write transactions are expected here */
        code = UBADTYPE;
    } else {
        urecovery_CheckTid(atid, 0);
        /* the current transaction is re-tested after the tid check */
        if (ubik_currentTrans) {
            code = udisk_truncate(ubik_currentTrans, afile, alen);
        } else {
            code = USYNC;
        }
    }
    DBRELE(ubik_dbase);

    return code;
}
/*!
 * \brief Truncate a file within the current remote write transaction of
 *        the database selected by index.
 *
 * \param rxcall  incoming call, passed to ubik_CheckAuth
 * \param atid    transaction id handed to urecovery_CheckTid
 * \param index   slot used to index ubik_currentTrans and ubik_dbase
 * \param afile   file number passed to udisk_truncate
 * \param alen    length passed to udisk_truncate
 *
 * \returns the ubik_CheckAuth error if non-zero, USYNC if there is no
 *          current transaction for this slot, UBADTYPE if it is not a
 *          write transaction, ENOENT if ubik_dbase[index] is unset,
 *          otherwise the result of udisk_truncate.
 */
afs_int32
SDISK_Truncate(struct rx_call *rxcall, struct ubik_tid *atid,
               afs_int32 index, afs_int32 afile, afs_int32 alen)
{
    afs_int32 code;
    struct ubik_dbase *dbase;

    code = ubik_CheckAuth(rxcall);
    if (code) {
        return code;
    }

    /* Pre-flight checks, performed before the database is held. */
    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        /* only write transactions are expected here */
        return UBADTYPE;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    dbase = ubik_currentTrans[index]->dbase;
    DBHOLD(dbase);

    urecovery_CheckTid(atid, index);
    /* the current transaction is re-tested after the tid check */
    if (ubik_currentTrans[index]) {
        code = udisk_truncate(ubik_currentTrans[index], afile, alen);
    } else {
        code = USYNC;
    }

    DBRELE(dbase);
    return code;
}
afs_int32 SDISK_Abort(struct rx_call *rxcall, struct ubik_tid *atid) { afs_int32 code; if ((code = ubik_CheckAuth(rxcall))) { return code; } DBHOLD(ubik_dbase); if (!ubik_currentTrans) { code = USYNC; goto done; } /* sanity check to make sure only write trans appear here */ if (ubik_currentTrans->type != UBIK_WRITETRANS) { code = UBADTYPE; goto done; } urecovery_CheckTid(atid, 0); if (!ubik_currentTrans) { code = USYNC; goto done; } code = udisk_abort(ubik_currentTrans); /* If the thread is not waiting for lock - ok to end it */ if (ubik_currentTrans->locktype != LOCKWAIT) { udisk_end(ubik_currentTrans); } ubik_currentTrans = (struct ubik_trans *)0; done: DBRELE(ubik_dbase); return code; }
/*!
 * \brief Relabel the database selected by index with a new version,
 *        provided the caller's old version matches ubik_dbVersion[index].
 *
 * \param rxcall       incoming call, passed to ubik_CheckAuth
 * \param atid         transaction id handed to urecovery_CheckTid
 * \param index        slot used to index ubik_currentTrans, ubik_dbase
 *                     and ubik_dbVersion
 * \param oldversionp  version the caller expects us to have
 * \param newversionp  version to install
 *
 * \returns the ubik_CheckAuth error if non-zero, USYNC if there is no
 *          current transaction or the versions do not match, UBADTYPE
 *          if the transaction is not a write transaction, UDEADLOCK if
 *          this server is the sync site, ENOENT if ubik_dbase[index] is
 *          unset, otherwise the result of the database's setlabel
 *          operation.
 */
afs_int32
SDISK_SetVersion(struct rx_call *rxcall, struct ubik_tid *atid,
                 afs_int32 index, struct ubik_version *oldversionp,
                 struct ubik_version *newversionp)
{
    afs_int32 code;
    struct ubik_dbase *dbase;

    code = ubik_CheckAuth(rxcall);
    if (code) {
        return code;
    }

    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        /* only write transactions are expected here */
        return UBADTYPE;
    }
    if (ubeacon_AmSyncSite()) {
        /* the sync site should never receive this call */
        return UDEADLOCK;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    dbase = ubik_currentTrans[index]->dbase;
    DBHOLD(dbase);

    urecovery_CheckTid(atid, index);
    if (!ubik_currentTrans[index]) {
        code = USYNC;
    } else if (oldversionp->epoch != ubik_dbVersion[index].epoch
               || oldversionp->counter != ubik_dbVersion[index].counter) {
        /* caller's idea of our version does not match */
        code = USYNC;
    } else {
        code = (*dbase->setlabel) (dbase, 0, newversionp);
        if (code == 0) {
            /* record the new label in both in-memory copies */
            dbase->version = *newversionp;
            ubik_dbVersion[index] = *newversionp;
        }
    }

    DBRELE(dbase);
    return code;
}
/*!
 * \brief Write a vector of data
 *
 * Each element of io_vector names a file, a position and a length; the
 * bytes for the elements are laid out back to back in io_buffer and are
 * consumed in order.
 *
 * \param rxcall     incoming call, passed to ubik_CheckAuth
 * \param atid       transaction id handed to urecovery_CheckTid
 * \param index      slot used to index ubik_currentTrans and ubik_dbase
 * \param io_vector  array of (file, position, length) write descriptors
 * \param io_buffer  concatenated data for all descriptors
 *
 * \returns the ubik_CheckAuth error if non-zero, USYNC if there is no
 *          current transaction for this slot, UBADTYPE if it is not a
 *          write transaction, ENOENT if ubik_dbase[index] is unset,
 *          UINTERNAL if a descriptor has a negative length or would read
 *          past the end of io_buffer, otherwise the first non-zero
 *          udisk_write result (0 if every write succeeded).
 */
afs_int32
SDISK_WriteV(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index,
             iovec_wrt *io_vector, iovec_buf *io_buffer)
{
    afs_int32 code, i, offset;
    struct ubik_dbase *dbase;
    struct ubik_iovec *iovec;
    char *iobuf;

    if ((code = ubik_CheckAuth(rxcall))) {
        return code;
    }
    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    /* sanity check to make sure only write trans appear here */
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        return UBADTYPE;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    dbase = ubik_currentTrans[index]->dbase;
    DBHOLD(dbase);

    urecovery_CheckTid(atid, index);
    if (!ubik_currentTrans[index]) {
        DBRELE(dbase);
        return USYNC;
    }

    iovec = (struct ubik_iovec *)io_vector->iovec_wrt_val;
    iobuf = (char *)io_buffer->iovec_buf_val;
    for (i = 0, offset = 0; i < io_vector->iovec_wrt_len; i++) {
        /*
         * Sanity check for going off the end of the buffer.  The lengths
         * come from the caller, so reject negative values outright (they
         * would otherwise move offset backwards and hand udisk_write a
         * negative length), and compare against the space remaining
         * instead of computing offset + length, which can overflow.
         * offset never exceeds iovec_buf_len because it only grows by
         * lengths that passed this check.
         */
        if (iovec[i].length < 0
            || iovec[i].length > io_buffer->iovec_buf_len - offset) {
            code = UINTERNAL;
            break;
        }

        code = udisk_write(ubik_currentTrans[index], iovec[i].file,
                           &iobuf[offset], iovec[i].position,
                           iovec[i].length);
        if (code) {
            break;
        }

        offset += iovec[i].length;
    }

    DBRELE(dbase);
    return code;
}
afs_int32 SDISK_SetVersion(struct rx_call *rxcall, struct ubik_tid *atid, struct ubik_version *oldversionp, struct ubik_version *newversionp) { afs_int32 code = 0; if ((code = ubik_CheckAuth(rxcall))) { return (code); } DBHOLD(ubik_dbase); if (!ubik_currentTrans) { code = USYNC; goto done; } /* sanity check to make sure only write trans appear here */ if (ubik_currentTrans->type != UBIK_WRITETRANS) { code = UBADTYPE; goto done; } /* Should not get this for the sync site */ if (ubeacon_AmSyncSite()) { code = UDEADLOCK; goto done; } urecovery_CheckTid(atid, 0); if (!ubik_currentTrans) { code = USYNC; goto done; } /* Set the label if its version matches the sync-site's */ if (uvote_eq_dbVersion(*oldversionp)) { UBIK_VERSION_LOCK; code = (*ubik_dbase->setlabel) (ubik_dbase, 0, newversionp); if (!code) { ubik_dbase->version = *newversionp; uvote_set_dbVersion(*newversionp); } UBIK_VERSION_UNLOCK; } else { code = USYNC; } done: DBRELE(ubik_dbase); return code; }
/*!
 * \brief Obtain a lock on behalf of the current remote write transaction
 *        of the database selected by index.
 *
 * \param rxcall  incoming call, passed to ubik_CheckAuth
 * \param atid    transaction id handed to urecovery_CheckTid
 * \param index   slot used to index ubik_currentTrans and ubik_dbase
 * \param afile   not used by this function
 * \param apos    not used by this function
 * \param alen    must be 1, otherwise UBADLOCK is returned; not otherwise
 *                used
 * \param atype   lock type passed to ulock_getLock
 *
 * \returns the ubik_CheckAuth error if non-zero, USYNC if there is no
 *          current transaction for this slot or it changed while the
 *          lock was being obtained, ENOENT if ubik_dbase[index] is
 *          unset, UBADTYPE if the transaction is not a write
 *          transaction, UBADLOCK if alen is not 1, otherwise the result
 *          of ulock_getLock.
 */
afs_int32
SDISK_Lock(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index,
           afs_int32 afile, afs_int32 apos, afs_int32 alen, afs_int32 atype)
{
    afs_int32 code;
    struct ubik_dbase *dbase;
    struct ubik_trans *lockingTrans;

    code = ubik_CheckAuth(rxcall);
    if (code) {
        return code;
    }

    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        /* only write transactions are expected here */
        return UBADTYPE;
    }
    if (alen != 1) {
        return UBADLOCK;
    }

    dbase = ubik_currentTrans[index]->dbase;
    DBHOLD(dbase);

    urecovery_CheckTid(atid, index);
    if (!ubik_currentTrans[index]) {
        code = USYNC;
    } else {
        /* remember which transaction we are locking for */
        lockingTrans = ubik_currentTrans[index];
        code = ulock_getLock(ubik_currentTrans[index], atype, 1);

        /*
         * While waiting, the transaction may have been ended or aborted
         * from under us (urecovery_CheckTid).  If the slot no longer
         * holds the transaction we started with, end that transaction
         * here and report USYNC.
         */
        if (code == 0 && ubik_currentTrans[index] != lockingTrans) {
            udisk_end(lockingTrans);
            code = USYNC;
        }
    }

    DBRELE(dbase);
    return code;
}
/*!
 * \brief Commit the current remote write transaction of the database
 *        selected by index.
 *
 * \param rxcall  incoming call, passed to ubik_CheckAuth
 * \param atid    transaction id handed to urecovery_CheckTid
 * \param index   slot used to index ubik_currentTrans, ubik_dbase and
 *                ubik_dbVersion
 *
 * \returns the ubik_CheckAuth error if non-zero, ENOENT if
 *          ubik_dbase[index] is unset, USYNC if there is no current
 *          transaction for this slot, UBADTYPE if it is not a write
 *          transaction, otherwise the result of udisk_commit.
 */
afs_int32
SDISK_Commit(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index)
{
    afs_int32 code;
    struct ubik_dbase *dbase;

    code = ubik_CheckAuth(rxcall);
    if (code) {
        return code;
    }

    if (!ubik_dbase[index]) {
        return ENOENT;
    }
    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        /* only write transactions are expected here */
        return UBADTYPE;
    }

    dbase = ubik_currentTrans[index]->dbase;

    /* cache_lock is taken before the database hold and released after it */
    ObtainWriteLock(&dbase->cache_lock);
    DBHOLD(dbase);

    urecovery_CheckTid(atid, index);
    if (!ubik_currentTrans[index]) {
        code = USYNC;
    } else {
        code = udisk_commit(ubik_currentTrans[index]);
        if (code == 0) {
            /* sync site should now match */
            ubik_dbVersion[index] = ubik_dbase[index]->version;
        }
    }

    DBRELE(dbase);
    ReleaseWriteLock(&dbase->cache_lock);
    return code;
}
/* the rest of these guys handle remote execution of write * transactions: this is the code executed on the other servers when a * sync site is executing a write transaction. */ afs_int32 SDISK_Begin(struct rx_call *rxcall, struct ubik_tid *atid) { afs_int32 code; if ((code = ubik_CheckAuth(rxcall))) { return code; } DBHOLD(ubik_dbase); urecovery_CheckTid(atid, 1); code = udisk_begin(ubik_dbase, UBIK_WRITETRANS, &ubik_currentTrans); if (!code && ubik_currentTrans) { /* label this trans with the right trans id */ ubik_currentTrans->tid.epoch = atid->epoch; ubik_currentTrans->tid.counter = atid->counter; } DBRELE(ubik_dbase); return code; }
/*!
 * \brief Abort the current remote write transaction of the database
 *        selected by index.
 *
 * \param rxcall  incoming call, passed to ubik_CheckAuth
 * \param atid    transaction id handed to urecovery_CheckTid
 * \param index   slot used to index ubik_currentTrans and ubik_dbase
 *
 * \returns the ubik_CheckAuth error if non-zero, USYNC if there is no
 *          current transaction for this slot, UBADTYPE if it is not a
 *          write transaction, ENOENT if ubik_dbase[index] is unset,
 *          otherwise the result of udisk_abort.
 */
afs_int32
SDISK_Abort(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index)
{
    afs_int32 code;
    struct ubik_dbase *dbase;

    code = ubik_CheckAuth(rxcall);
    if (code) {
        return code;
    }

    if (!ubik_currentTrans[index]) {
        return USYNC;
    }
    if (ubik_currentTrans[index]->type != UBIK_WRITETRANS) {
        /* only write transactions are expected here */
        return UBADTYPE;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    dbase = ubik_currentTrans[index]->dbase;
    DBHOLD(dbase);

    urecovery_CheckTid(atid, index);
    if (!ubik_currentTrans[index]) {
        code = USYNC;
    } else {
        code = udisk_abort(ubik_currentTrans[index]);

#if defined(UBIK_PAUSE)
        /* With UBIK_PAUSE the transaction is always ended here. */
        udisk_end(ubik_currentTrans[index]);
#else
        /* If the thread is not waiting for lock - ok to end it */
        if (ubik_currentTrans[index]->locktype != LOCKWAIT) {
            udisk_end(ubik_currentTrans[index]);
        }
#endif /* UBIK_PAUSE */

        /* Either way, the slot no longer has a current transaction. */
        ubik_currentTrans[index] = (struct ubik_trans *)0;
    }

    DBRELE(dbase);
    return code;
}
afs_int32 SDISK_Commit(struct rx_call *rxcall, struct ubik_tid *atid) { afs_int32 code; if ((code = ubik_CheckAuth(rxcall))) { return code; } ObtainWriteLock(&ubik_dbase->cache_lock); DBHOLD(ubik_dbase); if (!ubik_currentTrans) { code = USYNC; goto done; } /* * sanity check to make sure only write trans appear here */ if (ubik_currentTrans->type != UBIK_WRITETRANS) { code = UBADTYPE; goto done; } urecovery_CheckTid(atid, 0); if (!ubik_currentTrans) { code = USYNC; goto done; } code = udisk_commit(ubik_currentTrans); if (code == 0) { /* sync site should now match */ uvote_set_dbVersion(ubik_dbase->version); } done: DBRELE(ubik_dbase); ReleaseWriteLock(&ubik_dbase->cache_lock); return code; }
/*
 * The rest of these routines handle remote execution of write
 * transactions: this is the code executed on the other servers when a
 * sync site is executing a write transaction.
 */

/*!
 * \brief Begin a remote write transaction on the database selected by
 *        index and label it with the given transaction id.
 *
 * Any transaction still recorded for the slot after urecovery_CheckTid
 * is dropped (and ended, subject to the LOCKWAIT rule below) before the
 * new one is started.
 *
 * \param rxcall  incoming call, passed to ubik_CheckAuth
 * \param atid    transaction id; handed to urecovery_CheckTid and copied
 *                into the new transaction
 * \param index   slot used to index ubik_currentTrans and ubik_dbase
 *
 * \returns the ubik_CheckAuth error if non-zero, ENOENT if
 *          ubik_dbase[index] is unset, UNOQUORUM if urecovery_AllBetter
 *          returns 0, otherwise the result of udisk_begin.
 */
afs_int32
SDISK_Begin(struct rx_call *rxcall, struct ubik_tid *atid, afs_int32 index)
{
    afs_int32 code;

    code = ubik_CheckAuth(rxcall);
    if (code) {
        return code;
    }
    if (!ubik_dbase[index]) {
        return ENOENT;
    }

    DBHOLD(ubik_dbase[index]);

    if (urecovery_AllBetter(ubik_dbase[index], 0) == 0) {
        code = UNOQUORUM;
    } else {
        urecovery_CheckTid(atid, index);

        if (ubik_currentTrans[index]) {
#if defined(UBIK_PAUSE)
            /* With UBIK_PAUSE the old transaction is always ended here. */
            udisk_end(ubik_currentTrans[index]);
#else
            /* If the thread is not waiting for lock - ok to end it */
            if (ubik_currentTrans[index]->locktype != LOCKWAIT) {
                udisk_end(ubik_currentTrans[index]);
            }
#endif /* UBIK_PAUSE */
            ubik_currentTrans[index] = (struct ubik_trans *)0;
        }

        code = udisk_begin(ubik_dbase[index], UBIK_WRITETRANS,
                           &ubik_currentTrans[index]);
        if (code == 0 && ubik_currentTrans[index]) {
            /* label this trans with the right trans id */
            ubik_currentTrans[index]->tid.epoch = atid->epoch;
            ubik_currentTrans[index]->tid.counter = atid->counter;
        }
    }

    DBRELE(ubik_dbase[index]);
    return code;
}
/*!
 * \brief called by the sync site to handle vote beacons; if rxcall is
 * null, this is a local call and the sender is taken to be this host
 *
 * \param rxcall  incoming call, or null for a local call
 * \param astate  non-zero if the sender claims to be a sync site
 * \param astart  remembered as lastYesClaim when we vote yes
 * \param avers   database version remembered when we vote yes
 * \param atid    transaction id remembered when we vote yes, and handed
 *                to urecovery_CheckTid
 *
 * \returns 0 or time when the vote was sent.  It returns 0 if we are
 * not voting for this sync site, or the time we actually voted yes, if
 * non-zero.
 */
afs_int32 SVOTE_Beacon(struct rx_call * rxcall, afs_int32 astate, afs_int32 astart, struct ubik_version * avers, struct ubik_tid * atid)
{
    afs_int32 otherHost;
    afs_int32 now;
    afs_int32 vote;
    struct rx_connection *aconn;
    struct rx_peer *rxp;
    struct ubik_server *ts;
    int isClone = 0;
    char hoststr[16];

    /* Identify the sender; this part runs before the vote lock is taken. */
    if (rxcall) { /* caller's host */
        aconn = rx_ConnectionOf(rxcall);
        rxp = rx_PeerOf(aconn);
        otherHost = rx_HostOf(rxp);

        /* get the primary interface address for this host. */
        /* This is the identifier that ubik uses. */
        otherHost = ubikGetPrimaryInterfaceAddr(otherHost);
        if (!otherHost) {
            ubik_dprint("Received beacon from unknown host %s\n", afs_inet_ntoa_r(rx_HostOf(rxp), hoststr));
            return 0; /* I don't know about you: vote no */
        }
        /* Look the sender up in the server list to learn whether it is a clone. */
        for (ts = ubik_servers; ts; ts = ts->next) {
            if (ts->addr[0] == otherHost) break;
        }
        if (!ts) ubik_dprint("Unknown host %x has sent a beacon\n", otherHost);
        if (ts && ts->isClone) isClone = 1;
    } else {
        otherHost = ubik_host[0]; /* this host */
        isClone = amIClone;
    }

    ubik_dprint("Received beacon type %d from host %s\n", astate, afs_inet_ntoa_r(otherHost, hoststr));

    /* compute the lowest server we've heard from.  We'll try to only vote for
     * this dude if we don't already have a synchronization site.  Also, don't
     * let a very old lowestHost confuse things forever.  We pick a new
     * lowestHost after BIGTIME seconds to limit the damage if this host
     * actually crashes.  Finally, we also count in this computation: don't
     * pick someone else if we're even better!
     *
     * Note that the test below must be <=, not <, so that we keep refreshing
     * lowestTime.  Otherwise it will look like we haven't heard from
     * lowestHost in a while and another host could slip in.
     */

    /* First compute the lowest host we've heard from, whether we want them
     * for a sync site or not.  If we haven't heard from a site in BIGTIME
     * seconds, we ignore its presence in lowestHost: it may have crashed.
     * Note that we don't ever let anyone appear in our lowestHost if we're
     * lower than them, 'cause we know we're up.
     */

    /* But do not consider clones for lowestHost since they may never become
     * sync site */
    /* From here on vote_globals is accessed under the vote lock; every exit
     * path below releases it. */
    UBIK_VOTE_LOCK;
    now = FT_ApproxTime(); /* close to current time */
    if (!isClone && (ntohl((afs_uint32)otherHost) <= ntohl((afs_uint32)vote_globals.lowestHost) || vote_globals.lowestTime + BIGTIME < now)) {
        vote_globals.lowestTime = now;
        vote_globals.lowestHost = otherHost;
    }

    /* why do we need this next check?  Consider the case where each of two
     * servers decides the other is lowestHost.  Each stops sending beacons
     * 'cause the other is there.  Not obvious that this process terminates:
     * i.e. each guy could restart procedure and again think other side is
     * lowest.  Need to prove: if one guy in the system is lowest and knows
     * he's lowest, these loops don't occur, because if someone knows he's
     * lowest, he will send out beacons telling others to vote for him.
     */
    if (!amIClone && (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32)vote_globals.lowestHost) || vote_globals.lowestTime + BIGTIME < now)) {
        vote_globals.lowestTime = now;
        vote_globals.lowestHost = ubik_host[0];
    }

    /* tell if we've heard from a sync site recently (even if we're not voting
     * for this dude yet).  After a while, time the guy out.
     */
    if (astate) { /* this guy is a sync site */
        vote_globals.syncHost = otherHost;
        vote_globals.syncTime = now;
    } else if (vote_globals.syncTime + BIGTIME < now) {
        if (vote_globals.syncHost) {
            ubik_dprint ("Ubik: Lost contact with sync-site %s (NOT in quorum)\n", afs_inet_ntoa_r(vote_globals.syncHost, hoststr));
        }
        vote_globals.syncHost = 0;
    }

    /* decide how to vote */
    vote = 0; /* start off voting no */

    /* if this guy isn't a sync site, we don't really have to vote for him.
     * We get to apply some heuristics to try to avoid weird oscillation states
     * in the voting procedure.
     */
    if (astate == 0) {
        /* in here only if this guy doesn't claim to be a sync site */

        /* if the lowest host we know of is not the sender, then just say no. */
        if (ntohl(vote_globals.lowestHost) != ntohl(otherHost)) {
            goto done_zero;
        }

        /* someone else *is* a sync site, just say no */
        if (vote_globals.syncHost && vote_globals.syncHost != otherHost) goto done_zero;
    } else if (vote_globals.lastYesHost == 0xffffffff && otherHost == ubik_host[0]) {
        /* fast startup if this is the only non-clone */
        int i = 0; /* counts non-clone servers other than the sender */
        for (ts = ubik_servers; ts; ts = ts->next) {
            if (ts->addr[0] == otherHost) continue;
            if (!ts->isClone) i++;
        }
        if (!i) vote_globals.lastYesHost = otherHost;
    }

    if (isClone) goto done_zero; /* clone never can become sync site */

    /* Don't promise sync site support to more than one host every BIGTIME
     * seconds.  This is the heart of our invariants in this system.
     */
    if (vote_globals.ubik_lastYesTime + BIGTIME < now || otherHost == vote_globals.lastYesHost) {
        /* Only log when something about the vote actually changed. */
        if ((vote_globals.ubik_lastYesTime + BIGTIME < now) || (otherHost != vote_globals.lastYesHost) || (vote_globals.lastYesState != astate)) {
            /* A new vote or a change in the vote or changed quorum */
            ubik_dprint("Ubik: vote 'yes' for %s %s\n", afs_inet_ntoa_r(otherHost, hoststr), (astate ? "(in quorum)" : "(NOT in quorum)"));
        }

        vote = now; /* vote yes */
        vote_globals.ubik_lastYesTime = now; /* remember when we voted yes */
        vote_globals.lastYesClaim = astart; /* remember for computing when sync site expires */
        vote_globals.lastYesHost = otherHost; /* and who for */
        vote_globals.lastYesState = astate; /* remember if site is a sync site */
        vote_globals.ubik_dbVersion = *avers; /* resync value */
        vote_globals.ubik_dbTid = *atid; /* transaction id, if any, of active trans */
        /* The vote lock is released before the database is held. */
        UBIK_VOTE_UNLOCK;
        DBHOLD(ubik_dbase);
        urecovery_CheckTid(atid, 0); /* check if current write trans needs aborted */
        DBRELE(ubik_dbase);
    } else {
        UBIK_VOTE_UNLOCK;
    }
    return vote;

    /* Common "vote no" exit for paths that still hold the vote lock. */
  done_zero:
    UBIK_VOTE_UNLOCK;
    return 0;
}