static void * SlowCall(void * rock) { struct rx_connection *conn = rock; u_long ntime; u_long now; long temp_rc; #ifdef AFS_PTHREAD_ENV pthread_mutex_lock(&slowCallLock); #endif slowCallCode = RXKST_PROCESSRUNNING; #ifdef AFS_PTHREAD_ENV pthread_cond_signal(&slowCallCV); #else LWP_NoYieldSignal(&slowCallCode); #endif slowCallCode = RXKST_Slow(conn, 1, &ntime); if (!slowCallCode) { now = FT_ApproxTime(); if ((ntime < now - maxSkew) || (ntime > now + maxSkew)) slowCallCode = RXKST_TIMESKEW; } temp_rc = slowCallCode; #ifdef AFS_PTHREAD_ENV pthread_cond_signal(&slowCallCV); pthread_mutex_unlock(&slowCallLock); #else LWP_NoYieldSignal(&slowCallCode); #endif return (void *)(intptr_t)temp_rc; }
/*
 * (Re)schedule the next probe of connection cache entry `e`.
 *
 * The next probe time is "now + 2^ntries" seconds; if the entry is already
 * on connprobelist it is unlinked first and its probe time is never moved
 * later than the one it already had.  ntries is bumped while it is still
 * <= MAX_RETRIES.  The entry is then inserted in front of the first queued
 * entry whose probe_next is later (or at the tail if there is none) and
 * connprobelist is signalled.
 */
static void
re_probe (ConnCacheEntry *e)
{
    Listitem *pos;
    struct timeval now;

    assert (e->probe != NULL);

    gettimeofday (&now, NULL);
    if (e->probe_le) {
        /* already queued: take it off and keep the earlier deadline */
        listdel (connprobelist, e->probe_le);
        e->probe_next = min(now.tv_sec + (1 << e->ntries), e->probe_next);
    } else {
        e->probe_next = now.tv_sec + (1 << e->ntries);
    }

    if (e->ntries <= MAX_RETRIES)
        ++e->ntries;

    /* stop at the first queued entry that is due after `e` */
    for (pos = listhead (connprobelist);
         pos != NULL;
         pos = listnext (connprobelist, pos)) {
        ConnCacheEntry *queued = (ConnCacheEntry *)listdata (pos);

        if (e->probe_next < queued->probe_next)
            break;
    }

    if (pos != NULL)
        e->probe_le = listaddbefore (connprobelist, pos, e);
    else
        e->probe_le = listaddtail (connprobelist, e);

    LWP_NoYieldSignal (connprobelist);
}
/*
 * Consumer side of the producer/consumer LWP test.
 *
 * Signals the Producer event once, then loops forever: wait on the Consumer
 * event, print the current value of pcfoo, and signal the Producer event
 * again.  `foo` is unused.
 */
static void
Consumer(void *foo)
{
    LWP_NoYieldSignal ((char *)Producer);

    for (;;) {
        LWP_WaitProcess((char *)Consumer);
        printf("[consumer] eating %c\n", pcfoo);
        LWP_NoYieldSignal ((char *)Producer);
    }
}
/*
 * Release a lock, giving preference to new writers.
 *
 * If anyone is recorded in wait_states as waiting for an exclusive lock,
 * those bits are cleared and the excl_locked event is signalled; only
 * otherwise is the READ_LOCK bit cleared and readers_reading signalled.
 */
void
Lock_ReleaseW(struct Lock *lock)
{
    if (!(lock->wait_states & EXCL_LOCKS)) {
        /* no exclusive waiters: let the readers go */
        lock->wait_states &= ~READ_LOCK;
        LWP_NoYieldSignal(&lock->readers_reading);
        return;
    }

    lock->wait_states &= ~EXCL_LOCKS;
    LWP_NoYieldSignal(&lock->excl_locked);
}
/*! \brief
 * Called at initialization to figure out version of the dbase we really have.
 *
 * This routine is called after replaying the log; it reads the restored labels.
 *
 * If the label cannot be read, the in-memory version is set to epoch 1 /
 * counter 1 (the value for a newly-initialized db) and written out; if that
 * write fails as well, the version is reset to 0/0 and one more write is
 * attempted.  Waiters on the version are woken in either case.
 *
 * \return always 0; label I/O errors are not reported to the caller.
 */
static int
InitializeDB(struct ubik_dbase *adbase)
{
    afs_int32 code;

    code = (*adbase->getlabel) (adbase, 0, &adbase->version);
    if (code) {
        /* try setting the label to a new value */
        adbase->version.epoch = 1;      /* value for newly-initialized db */
        adbase->version.counter = 1;
        code = (*adbase->setlabel) (adbase, 0, &adbase->version);
        if (code) {
            /* failed, try to set it back */
            adbase->version.epoch = 0;
            adbase->version.counter = 0;
            (*adbase->setlabel) (adbase, 0, &adbase->version);
        }
#ifdef AFS_PTHREAD_ENV
        {
            /*
             * Do the broadcast outside assert(): with NDEBUG defined the
             * assert() argument is not evaluated at all, which would
             * silently drop the wakeup.
             */
            int rc = pthread_cond_broadcast(&adbase->version_cond);

            assert(rc == 0);
            (void)rc;           /* quiet "unused" warnings under NDEBUG */
        }
#else
        LWP_NoYieldSignal(&adbase->version);
#endif
    }
    return 0;
}
/*! \brief * Called at initialization to figure out version of the dbase we really have. * * This routine is called after replaying the log; it reads the restored labels. */ static int InitializeDB(struct ubik_dbase *adbase) { afs_int32 code; code = (*adbase->getlabel) (adbase, 0, &adbase->version); if (code) { /* try setting the label to a new value */ UBIK_VERSION_LOCK; adbase->version.epoch = 1; /* value for newly-initialized db */ adbase->version.counter = 1; code = (*adbase->setlabel) (adbase, 0, &adbase->version); if (code) { /* failed, try to set it back */ adbase->version.epoch = 0; adbase->version.counter = 0; (*adbase->setlabel) (adbase, 0, &adbase->version); } #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&adbase->version_cond); #else LWP_NoYieldSignal(&adbase->version); #endif UBIK_VERSION_UNLOCK; } return 0; }
/*
 * Client side: a reply to the current request has arrived; hand it to the
 * SL entry recorded in ce->MySl and wake whoever waits on that entry.
 *
 * Packets rejected by BogusSl() are only counted in rpc2_Recvd.Replies.
 */
static void HandleCurrentReply(RPC2_PacketBuffer *pb, struct CEntry *ce)
{
    struct SL_Entry *waiter;

    say(1, RPC2_DebugLevel, "HandleCurrentReply()\n");
    rpc2_Recvd.Replies++;

    /* should this assert ?? XXXX */
    if (BogusSl(ce, pb))
        return;

    pb = ShrinkPacket(pb);

    /* convert rpc2 error value to system errno */
    pb->Header.ReturnCode = RPC2_R2SError(pb->Header.ReturnCode);

    rpc2_UpdateRTT(pb, ce);
    rpc2_Recvd.GoodReplies++;

    /* attach the packet to the waiting entry before waking it */
    waiter       = ce->MySl;
    waiter->data = pb;

    SetState(ce, C_THINK);
    rpc2_IncrementSeqNumber(ce);
    rpc2_DeactivateSle(waiter, ARRIVED);
    LWP_NoYieldSignal((char *)waiter);
}
/*
 * Special packet from socketlistener: never encrypted.
 *
 * Only RPC2_NAKED packets for a client connection in C_AWAITREPLY or
 * C_AWAITINIT2 are accepted; anything else goes through BOGUS().  For an
 * accepted NAK the connection is put in the error state, ce->MySl is
 * deactivated with NAKED and signalled, and the packet buffer is freed.
 */
static void HandleSLPacket(RPC2_PacketBuffer *pb, struct CEntry *ce)
{
    rpc2_ntohp(pb);

    if (pb->Header.Opcode != RPC2_NAKED) {
        BOGUS(pb, "HandleSLPacket: bogus opcode\n");
        return;
    }

    if (!TestState(ce, CLIENT, (C_AWAITREPLY | C_AWAITINIT2))) {
        BOGUS(pb, "HandleSLPacket: state != AWAIT\n");
        return;
    }

    say(1, RPC2_DebugLevel, "HandleNak()\n");
    rpc2_Recvd.Naks++;

    if (BogusSl(ce, pb))
        return;

    rpc2_SetConnError(ce);
    rpc2_DeactivateSle(ce->MySl, NAKED);
    LWP_NoYieldSignal((char *)ce->MySl);

    RPC2_FreeBuffer(&pb);
}
void rpc2_ExpireEvents() { int i; struct SL_Entry *sl; struct TM_Elem *t; for (i = TM_Rescan(rpc2_TimerQueue); i > 0; i--) { t = TM_GetExpired(rpc2_TimerQueue); if (!t) continue; sl = (struct SL_Entry *)t->BackPointer; rpc2_DeactivateSle(sl, TIMEOUT); if (sl->Type == REPLY) FreeHeld(sl); else if (sl->Type == DELACK) DelayedAck(sl); else if (sl->Type == DELAYED_SEND) rpc2_SendDelayedPacket(sl); else if (sl->Type == DELAYED_RECV) { RPC2_PacketBuffer *pb = rpc2_RecvDelayedPacket(sl); DispatchPacket(pb); } else LWP_NoYieldSignal((char *)sl); } }
/*
 * Handler for the termination signal: log it, set the `lobotomy` flag,
 * turn on lwp_debug and signal the DoLobotomy event (the original comment
 * calls its waiter "the BrainSurgeon").
 */
static void TermSignal()
{
    LogMsg(0,LogLevel,LogFile, "Term signal caught");

    lobotomy  = mtrue;
    lwp_debug = 1;

    /* wake up the BrainSurgeon */
    LWP_NoYieldSignal(DoLobotomy);
}
/*! * \brief sync site * * routine called when a non-sync site server goes down; restarts recovery * process to send missing server the new db when it comes back up. * * \note This routine should not do anything with variables used by non-sync site servers. */ int urecovery_LostServer(void) { #if !defined(AFS_PTHREAD_ENV) /* No corresponding LWP_WaitProcess found anywhere for this -- klm */ LWP_NoYieldSignal(&urecovery_state); #endif return 0; }
/*
 * If `abnode` is flagged BNODE_WAIT, clear the flag and signal the event
 * identified by the bnode's address.  Always returns 0.
 */
static int
bnode_Check(struct bnode *abnode)
{
    if (!(abnode->flags & BNODE_WAIT))
        return 0;               /* nobody is waiting on this bnode */

    abnode->flags &= ~BNODE_WAIT;
    LWP_NoYieldSignal(abnode);
    return 0;
}
/* release a lock, giving preference to new writers */ void Afs_Lock_ReleaseW(struct Lock *lock) { if (lock->wait_states & EXCL_LOCKS) { lock->wait_states &= ~EXCL_LOCKS; #ifdef AFS_PTHREAD_ENV opr_Verify(pthread_cond_broadcast(&lock->write_cv) == 0); #else /* AFS_PTHREAD_ENV */ LWP_NoYieldSignal(&lock->excl_locked); #endif /* AFS_PTHREAD_ENV */ } else { lock->wait_states &= ~READ_LOCK; #ifdef AFS_PTHREAD_ENV opr_Verify(pthread_cond_broadcast(&lock->read_cv) == 0); #else /* AFS_PTHREAD_ENV */ LWP_NoYieldSignal(&lock->readers_reading); #endif /* AFS_PTHREAD_ENV */ } }
/*! * \brief sync site * * routine called when a non-sync site server goes down; restarts recovery * process to send missing server the new db when it comes back up for * non-sync site servers. * * \note This routine should not do anything with variables used by non-sync site servers. */ int urecovery_LostServer(struct ubik_server *ts) { ubeacon_ReinitServer(ts); #if !defined(AFS_PTHREAD_ENV) /* No corresponding LWP_WaitProcess found anywhere for this -- klm */ LWP_NoYieldSignal(&urecovery_state); #endif return 0; }
/*
 * Producer side of the producer/consumer LWP test.
 *
 * Loops forever: wait on the Producer event, advance pcfoo to the next
 * character (wrapping to 'a' once it passes 'z'), print it, and signal the
 * Consumer event.  `foo` is unused.
 */
static void
Producer(void *foo)
{
    for (;;) {
        LWP_WaitProcess((char *)Producer);

        pcfoo++;
        if (pcfoo > 'z')
            pcfoo = 'a';

        printf("[producer] creating %c\n", pcfoo);
        LWP_NoYieldSignal ((char *)Consumer);
    }
}
/*
 * Wake up readers waiting for this lock.
 *
 * Does nothing unless the READ_LOCK bit is set in wait_states; otherwise
 * clears the bit and wakes the readers (condition-variable broadcast in
 * pthread builds, LWP_NoYieldSignal() in LWP builds).
 */
void
Afs_Lock_WakeupR(struct Lock *lock)
{
    if (!(lock->wait_states & READ_LOCK))
        return;

    lock->wait_states &= ~READ_LOCK;
#ifdef AFS_PTHREAD_ENV
    opr_Verify(pthread_cond_broadcast(&lock->read_cv) == 0);
#else /* AFS_PTHREAD_ENV */
    LWP_NoYieldSignal(&lock->readers_reading);
#endif /* AFS_PTHREAD_ENV */
}
static struct ropa_ccpair * add_client (struct ropa_cb *cb, struct ropa_client *c) { struct timeval tv; struct ropa_ccpair cckey, *cc; assert (cb && c); cckey.client = c; cckey.cb = cb; cc = hashtabsearch (ht_ccpairs, &cckey); if (cc) { listdel (lru_ccpair, cc->li); cc->li = listaddhead (lru_ccpair, cc); return cc; } /* The reverse of these are in break_ccpair */ callback_ref (cb); client_ref (c); cc = listdeltail (lru_ccpair); DIAGNOSTIC_CHECK_CCPAIR(cc); cc->li = NULL; if (ccpairs_inuse_p (cc)) break_ccpair (cc, TRUE); /* XXX do it for real */ gettimeofday(&tv, NULL); cc->expire = tv.tv_sec + 3600; heap_insert (heap_ccpairs, cc, &cc->heap); LWP_NoYieldSignal (heap_ccpairs); cc->cb_li = listaddtail (cb->ccpairs, cc); cc->client = c; cc->cb = cb; cc->li = listaddhead (lru_ccpair, cc); hashtabadd (ht_ccpairs, cc); mlog_log (MDEBROPA, "add_client: added %x to callback %x.%x.%x", c->addr[0].addr_in, cb->fid.Volume, cb->fid.Vnode, cb->fid.Unique); return cc; }
/*
 * Looks like server code: a new request packet arrived on connection `ce`.
 *
 * Records the packet's timestamps on the connection, releases any SL entry
 * and held packet left on the connection, advances the sequence number and
 * arms a DELACK timer of RPC2_DELACK_DELAY microseconds.  The packet is
 * then handed to a waiting recipient if FindRecipient() finds one
 * (state S_PROCESS), or held for a future RPC2_GetRequest()
 * (state S_REQINQUEUE).
 */
static void HandleNewRequest(RPC2_PacketBuffer *pb, struct CEntry *ce)
{
    struct SL_Entry *sle;
    struct timeval ackDelay;

    say(1, RPC2_DebugLevel, "HandleNewRequest()\n");

    pb = ShrinkPacket(pb);
    ce->TimeStampEcho = pb->Header.TimeStamp;
    TVTOTS(&pb->Prefix.RecvStamp, ce->RequestTime);

    say(15, RPC2_DebugLevel, "handlenewrequest TS %u RQ %u\n",
        ce->TimeStampEcho, ce->RequestTime);

    rpc2_Recvd.Requests++;
    rpc2_Recvd.GoodRequests++;

    /* Free held packet and SL entry */
    sle = ce->MySl;
    if (sle != NULL) {
        rpc2_DeactivateSle(sle, 0);
        FreeHeld(sle);
    }

    rpc2_IncrementSeqNumber(ce);

    /* set up a timer to send an unsolicited ack response (actually an
     * RPC2_BUSY) once the delay expires */
    ackDelay.tv_sec  = 0;
    ackDelay.tv_usec = RPC2_DELACK_DELAY;
    sle = rpc2_AllocSle(DELACK, ce);
    rpc2_ActivateSle(sle, &ackDelay);

    /* Look for a waiting recipient */
    sle = FindRecipient(pb);
    if (sle == NULL) {
        /* hold for a future RPC2_GetRequest() */
        rpc2_HoldPacket(pb);
        SetState(ce, S_REQINQUEUE);
        return;
    }

    SetState(ce, S_PROCESS);
    rpc2_DeactivateSle(sle, ARRIVED);
    sle->data = pb;
    LWP_NoYieldSignal((char *)sle);
}
/*
 * Keep-alive (BUSY) for the current request.
 *
 * After the BogusSl() check the RTT estimate is updated, ce->MySl is
 * deactivated with KEPTALIVE and signalled, and the packet is freed.
 */
static void HandleBusy(RPC2_PacketBuffer *pb, struct CEntry *ce)
{
    struct SL_Entry *waiter;

    say(1, RPC2_DebugLevel, "HandleBusy(%x)\n", ce->UniqueCID);
    rpc2_Recvd.Busies++;

    if (BogusSl(ce, pb))
        return;

    /* update rtt/bw measurements */
    rpc2_UpdateRTT(pb, ce);
    rpc2_Recvd.GoodBusies++;

    waiter = ce->MySl;
    rpc2_DeactivateSle(waiter, KEPTALIVE);
    LWP_NoYieldSignal((char *)waiter);

    RPC2_FreeBuffer(&pb);
}
static void * DoWorker(void * rock) { struct worker *w = rock; long code; code = (*w->proc) (w->index, w->rock); #ifdef AFS_PTHREAD_ENV pthread_mutex_lock(&workerLock); #endif w->exitCode = code; #ifdef AFS_PTHREAD_ENV pthread_mutex_unlock(&workerLock); #endif #ifdef AFS_PTHREAD_ENV pthread_cond_signal(&workerCV); #else LWP_NoYieldSignal(&workers); #endif return (void *)(intptr_t)code; }
/*
 * An Init2 packet arrived on connection `ce`: after the BogusSl() check,
 * attach the shrunk packet to ce->MySl, move the connection to
 * C_AWAITINIT4 and wake the waiter with ARRIVED.
 */
static void HandleInit2(RPC2_PacketBuffer *pb, struct CEntry *ce)
{
    struct SL_Entry *waiter;

    say(1, RPC2_DebugLevel, "HandleInit2()\n");
    rpc2_Recvd.Requests++;

    if (BogusSl(ce, pb))
        return;

    pb = ShrinkPacket(pb);
    rpc2_UpdateRTT(pb, ce);

    waiter       = ce->MySl;
    waiter->data = pb;

    SetState(ce, C_AWAITINIT4);
    rpc2_DeactivateSle(waiter, ARRIVED);
    LWP_NoYieldSignal((char *)waiter);
}
void doneWriting(afs_int32 error) { #ifndef AFS_PTHREAD_ENV afs_int32 code = 0; #endif /* wait for the reader */ ObtainWriteLock(&dumpSyncPtr->ds_lock); while (dumpSyncPtr->ds_readerStatus != DS_WAITING) { LogDebug(4, "doneWriting: waiting for Reader\n"); dumpSyncPtr->ds_writerStatus = DS_WAITING; ReleaseWriteLock(&dumpSyncPtr->ds_lock); #ifdef AFS_PTHREAD_ENV MUTEX_ENTER(&dumpSyncPtr->ds_writerStatus_mutex); CV_WAIT(&dumpSyncPtr->ds_writerStatus_cond, &dumpSyncPtr->ds_writerStatus_mutex); MUTEX_EXIT(&dumpSyncPtr->ds_writerStatus_mutex); #else LWP_WaitProcess(&dumpSyncPtr->ds_writerStatus); #endif ObtainWriteLock(&dumpSyncPtr->ds_lock); } LogDebug(4, "doneWriting: setting done\n"); /* signal that we are done */ if (error) dumpSyncPtr->ds_writerStatus = DS_DONE_ERROR; else dumpSyncPtr->ds_writerStatus = DS_DONE; dumpSyncPtr->ds_readerStatus = 0; #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&dumpSyncPtr->ds_readerStatus_cond); #else code = LWP_NoYieldSignal(&dumpSyncPtr->ds_readerStatus); if (code) LogError(code, "doneWriting: Signal delivery failed\n"); #endif ReleaseWriteLock(&dumpSyncPtr->ds_lock); }
/* lineTest() : wait until a whole line has been entered before processing.
 *
 * Sets waitingForAnswer and signals it to start the dot writer, reads one
 * line with LWP_GetLine(), clears waitingForAnswer again and echoes the
 * line.  A non-zero return from LWP_GetLine() is reported as
 * "linebuf was too small".
 */
void
lineTest()
{
    int rc;
    /*
     * Start out as an empty string so the "%s" below never reads
     * indeterminate bytes should LWP_GetLine() return without storing
     * anything (its behavior on EOF/error is not visible here).
     */
    char line[256] = "";

    printf("Will print dots until enter a line\n");

    /* start the dotwriter thread running */
    waitingForAnswer = 1;
    LWP_NoYieldSignal(&waitingForAnswer);

    /* size taken from the array itself so the two cannot drift apart */
    rc = LWP_GetLine(line, (int)sizeof(line));
    waitingForAnswer = 0;

    printf("You entered : %s\n", line);
    if (rc)
        printf("linebuf was too small\n");
}
/*
 * An Init3 packet arrived on connection `ce`.
 *
 * If the connection is not a server connection in S_AWAITINIT3 the packet
 * is unexpected: a held packet (our Init4, per the original comment) is
 * retransmitted with the new timestamp if there is one, and the incoming
 * packet is freed.  An expected Init3 is handed to ce->MySl, the
 * connection moves to S_FINISHBIND and the waiter is woken with ARRIVED.
 */
static void HandleInit3(RPC2_PacketBuffer *pb, struct CEntry *ce)
{
    struct SL_Entry *waiter;

    say(1, RPC2_DebugLevel, "HandleInit3()\n");
    rpc2_Recvd.Requests++;

    /* Am I expecting this packet? */
    if (!TestState(ce, SERVER, S_AWAITINIT3)) {
        if (ce->HeldPacket) {
            /* My Init4 must have got lost; resend it */
            ce->HeldPacket->Header.TimeStamp = htonl(pb->Header.TimeStamp);
            rpc2_XmitPacket(ce->HeldPacket, ce->HostInfo->Addr, 1);
        } else {
            say(1, RPC2_DebugLevel, "Bogus Init3\n");
        }

        /* Throw packet away anyway */
        RPC2_FreeBuffer(&pb);
        return;
    }

    /* Expected Init3 */
    if (BogusSl(ce, pb))
        return;

    pb = ShrinkPacket(pb);
    ce->TimeStampEcho = pb->Header.TimeStamp;
    TVTOTS(&pb->Prefix.RecvStamp, ce->RequestTime);

    say(15, RPC2_DebugLevel, "handleinit3 TS %u RQ %u\n",
        ce->TimeStampEcho, ce->RequestTime);

    waiter       = ce->MySl;
    waiter->data = pb;

    SetState(ce, S_FINISHBIND);
    rpc2_DeactivateSle(waiter, ARRIVED);
    LWP_NoYieldSignal((char *)waiter);
}
/* interTest() : wait for a key press, beeping to remind the user each time
 * LWP_GetResponseKey(5, ...) returns 0 without a key.
 *
 * Sets waitingForAnswer and signals it to start the dot writer, clears it
 * again once a key has arrived, and prints the key.
 */
void
interTest()
{
    char ch;
    int rc;

    printf("Will print dots until you hit a key!\n");

    /* start the dotwriter thread running */
    waitingForAnswer = 1;
    LWP_NoYieldSignal(&waitingForAnswer);

    for (;;) {
        rc = LWP_GetResponseKey(5, &ch);
        if (rc != 0)
            break;              /* got a response */
        printf("\a");           /* nothing yet: beep and try again */
    }

    waitingForAnswer = 0;       /* turn off dotwriter lwp */

    printf("\nYou typed %c\n", ch);
}
/*
 * An Init4 packet arrived on connection `ce`: after the BogusSl() check,
 * attach the shrunk packet to ce->MySl, set the connection state and wake
 * the waiter with ARRIVED.
 */
static void HandleInit4(RPC2_PacketBuffer *pb, struct CEntry *ce)
{
    struct SL_Entry *waiter;

    say(1, RPC2_DebugLevel, "HandleInit4()\n");
    rpc2_Recvd.Requests++;

    if (BogusSl(ce, pb))
        return;

    pb = ShrinkPacket(pb);
    rpc2_UpdateRTT(pb, ce);

    waiter       = ce->MySl;
    waiter->data = pb;

    /* really C_THINK, but we don't want this connection to get used
     * until we're really sure all setup is complete. */
    SetState(ce, C_AWAITREPLY);

    rpc2_DeactivateSle(waiter, ARRIVED);
    LWP_NoYieldSignal((char *)waiter);
}
/*
 * Test driver entry point: run the setup steps in order, yield until every
 * client has reported ready, signal the ClientsReady event, then block
 * forever on an event keyed by main's own address.
 */
int main(void)
{
    long go;

    /* environment and parameters */
    GetRoot();          /* Also creates a child process for transcribing stdout */
    GetParms();
    MakeFiles();        /* in test directory */

    /* RPC machinery */
    InitRPC();
    MakeWorkers();
    GetConns();

    GetVar(&go, "Say when: ");

    DoBindings();
    MakeClients();

    /* wait for all clients to get ready */
    while (ClientsReady < Clients)
        LWP_DispatchProcess();

    LWP_NoYieldSignal((char *)&ClientsReady);

    LWP_WaitProcess((char *)main);      /* infinite wait */

    return 0;           /* make compiler happy */
}
void main(int ac, char **av) { int delay = 0; int iters = 0; int inter = 0; int line = 0; int i; PROCESS dotpid; int rc; for (i = 1; i < ac; i++) { if (!strcmp("-delay", av[i])) { if (++i >= ac) { printf("Missing delay time for -delay option.\n"); } delay = atoi(av[i]); if (delay < 0) { printf("Delay must be at least 0 seconds.\n"); Usage(); } } else if (!strcmp("-iters", av[i])) { if (++i >= ac) { printf("Missing iteration count for -iters option.\n"); } iters = atoi(av[i]); if (iters < 0) { printf("Number of iterations must be at least 0.\n"); Usage(); } } else if (!strcmp("-nobuf", av[i])) { rc = setvbuf(stdin, NULL, _IONBF, 0); if (rc < 0) { perror("Setting -nobuf for stdin"); } } else if (!strcmp("-inter", av[i])) { inter = 1; } else if (!strcmp("-line", av[i])) { line = 1; } else Usage(); } IOMGR_Initialize(); LWP_CreateProcess(DotWriter, 32000, LWP_NORMAL_PRIORITY, (char *)0, "DotWriter", &dotpid); if (inter) { interTest(); exit(1); } if (line) { lineTest(); exit(1); } if (delay == 0) { delay = -1; /* Means wait indefinitely. */ } for (; iters >= 0; iters--) { waitingForAnswer = 1; LWP_NoYieldSignal(&waitingForAnswer); rc = LWP_WaitForKeystroke(delay); waitingForAnswer = 0; if (rc) { printf("\n'%c'\n", getchar()); printf("Flushing remaining input.\n"); while (LWP_WaitForKeystroke(0)) { printf("'%c'\n", getchar()); } } else { printf("\nNo data available on this iteration.\n"); } } }
/*! * \brief Main interaction loop for the recovery manager * * The recovery light-weight process only runs when you're the * synchronization site. It performs the following tasks, if and only * if the prerequisite tasks have been performed successfully (it * keeps track of which ones have been performed in its bit map, * \p urecovery_state). * * First, it is responsible for probing that all servers are up. This * is the only operation that must be performed even if this is not * yet the sync site, since otherwise this site may not notice that * enough other machines are running to even elect this guy to be the * sync site. * * After that, the recovery process does nothing until the beacon and * voting modules manage to get this site elected sync site. * * After becoming sync site, recovery first attempts to find the best * database available in the network (it must do this in order to * ensure finding the latest committed data). After finding the right * database, it must fetch this dbase to the sync site. * * After fetching the dbase, it relabels it with a new version number, * to ensure that everyone recognizes this dbase as the most recent * dbase. * * One the dbase has been relabelled, this machine can start handling * requests. However, the recovery module still has one more task: * propagating the dbase out to everyone who is up in the network. 
*/ void * urecovery_Interact(void *dummy) { afs_int32 code, tcode; struct ubik_server *bestServer = NULL; struct ubik_server *ts; int dbok, doingRPC, now; afs_int32 lastProbeTime; /* if we're the sync site, the best db version we've found yet */ static struct ubik_version bestDBVersion; struct ubik_version tversion; struct timeval tv; int length, tlen, offset, file, nbytes; struct rx_call *rxcall; char tbuffer[1024]; struct ubik_stat ubikstat; struct in_addr inAddr; char hoststr[16]; char pbuffer[1028]; int fd = -1; afs_int32 pass; afs_pthread_setname_self("recovery"); /* otherwise, begin interaction */ urecovery_state = 0; lastProbeTime = 0; while (1) { /* Run through this loop every 4 seconds */ tv.tv_sec = 4; tv.tv_usec = 0; #ifdef AFS_PTHREAD_ENV select(0, 0, 0, 0, &tv); #else IOMGR_Select(0, 0, 0, 0, &tv); #endif ubik_dprint("recovery running in state %x\n", urecovery_state); /* Every 30 seconds, check all the down servers and mark them * as up if they respond. When a server comes up or found to * not be current, then re-find the the best database and * propogate it. */ if ((now = FT_ApproxTime()) > 30 + lastProbeTime) { for (ts = ubik_servers, doingRPC = 0; ts; ts = ts->next) { UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; doingRPC = 1; code = DoProbe(ts); if (code == 0) { UBIK_BEACON_LOCK; ts->up = 1; UBIK_BEACON_UNLOCK; DBHOLD(ubik_dbase); urecovery_state &= ~UBIK_RECFOUNDDB; DBRELE(ubik_dbase); } } else { UBIK_BEACON_UNLOCK; DBHOLD(ubik_dbase); if (!ts->currentDB) urecovery_state &= ~UBIK_RECFOUNDDB; DBRELE(ubik_dbase); } } if (doingRPC) now = FT_ApproxTime(); lastProbeTime = now; } /* Mark whether we are the sync site */ DBHOLD(ubik_dbase); if (!ubeacon_AmSyncSite()) { urecovery_state &= ~UBIK_RECSYNCSITE; DBRELE(ubik_dbase); continue; /* nothing to do */ } urecovery_state |= UBIK_RECSYNCSITE; /* If a server has just come up or if we have not found the * most current database, then go find the most current db. 
*/ if (!(urecovery_state & UBIK_RECFOUNDDB)) { DBRELE(ubik_dbase); bestServer = (struct ubik_server *)0; bestDBVersion.epoch = 0; bestDBVersion.counter = 0; for (ts = ubik_servers; ts; ts = ts->next) { UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; continue; /* don't bother with these guys */ } UBIK_BEACON_UNLOCK; if (ts->isClone) continue; UBIK_ADDR_LOCK; code = DISK_GetVersion(ts->disk_rxcid, &ts->version); UBIK_ADDR_UNLOCK; if (code == 0) { /* perhaps this is the best version */ if (vcmp(ts->version, bestDBVersion) > 0) { /* new best version */ bestDBVersion = ts->version; bestServer = ts; } } } /* take into consideration our version. Remember if we, * the sync site, have the best version. Also note that * we may need to send the best version out. */ DBHOLD(ubik_dbase); if (vcmp(ubik_dbase->version, bestDBVersion) >= 0) { bestDBVersion = ubik_dbase->version; bestServer = (struct ubik_server *)0; urecovery_state |= UBIK_RECHAVEDB; } else { /* Clear the flag only when we know we have to retrieve * the db. Because urecovery_AllBetter() looks at it. */ urecovery_state &= ~UBIK_RECHAVEDB; } urecovery_state |= UBIK_RECFOUNDDB; urecovery_state &= ~UBIK_RECSENTDB; } if (!(urecovery_state & UBIK_RECFOUNDDB)) { DBRELE(ubik_dbase); continue; /* not ready */ } /* If we, the sync site, do not have the best db version, then * go and get it from the server that does. */ if ((urecovery_state & UBIK_RECHAVEDB) || !bestServer) { urecovery_state |= UBIK_RECHAVEDB; } else { /* we don't have the best version; we should fetch it. 
*/ urecovery_AbortAll(ubik_dbase); /* Rx code to do the Bulk fetch */ file = 0; offset = 0; UBIK_ADDR_LOCK; rxcall = rx_NewCall(bestServer->disk_rxcid); ubik_print("Ubik: Synchronize database with server %s\n", afs_inet_ntoa_r(bestServer->addr[0], hoststr)); UBIK_ADDR_UNLOCK; code = StartDISK_GetFile(rxcall, file); if (code) { ubik_dprint("StartDiskGetFile failed=%d\n", code); goto FetchEndCall; } nbytes = rx_Read(rxcall, (char *)&length, sizeof(afs_int32)); length = ntohl(length); if (nbytes != sizeof(afs_int32)) { ubik_dprint("Rx-read length error=%d\n", code = BULK_ERROR); code = EIO; goto FetchEndCall; } /* give invalid label during file transit */ UBIK_VERSION_LOCK; tversion.epoch = 0; code = (*ubik_dbase->setlabel) (ubik_dbase, file, &tversion); UBIK_VERSION_UNLOCK; if (code) { ubik_dprint("setlabel io error=%d\n", code); goto FetchEndCall; } snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600); if (fd < 0) { code = errno; goto FetchEndCall; } code = lseek(fd, HDRSIZE, 0); if (code != HDRSIZE) { close(fd); goto FetchEndCall; } pass = 0; while (length > 0) { tlen = (length > sizeof(tbuffer) ? 
sizeof(tbuffer) : length); #ifndef AFS_PTHREAD_ENV if (pass % 4 == 0) IOMGR_Poll(); #endif nbytes = rx_Read(rxcall, tbuffer, tlen); if (nbytes != tlen) { ubik_dprint("Rx-read bulk error=%d\n", code = BULK_ERROR); code = EIO; close(fd); goto FetchEndCall; } nbytes = write(fd, tbuffer, tlen); pass++; if (nbytes != tlen) { code = UIOERROR; close(fd); goto FetchEndCall; } offset += tlen; length -= tlen; } code = close(fd); if (code) goto FetchEndCall; code = EndDISK_GetFile(rxcall, &tversion); FetchEndCall: tcode = rx_EndCall(rxcall, code); if (!code) code = tcode; if (!code) { /* we got a new file, set up its header */ urecovery_state |= UBIK_RECHAVEDB; UBIK_VERSION_LOCK; memcpy(&ubik_dbase->version, &tversion, sizeof(struct ubik_version)); snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); #ifdef AFS_NT40_ENV snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); code = unlink(pbuffer); if (!code) code = rename(tbuffer, pbuffer); snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); #endif if (!code) code = rename(pbuffer, tbuffer); if (!code) { (*ubik_dbase->open) (ubik_dbase, file); /* after data is good, sync disk with correct label */ code = (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version); } UBIK_VERSION_UNLOCK; #ifdef AFS_NT40_ENV snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD", ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file); unlink(pbuffer); #endif } if (code) { unlink(pbuffer); /* * We will effectively invalidate the old data forever now. * Unclear if we *should* but we do. 
*/ UBIK_VERSION_LOCK; ubik_dbase->version.epoch = 0; ubik_dbase->version.counter = 0; UBIK_VERSION_UNLOCK; ubik_print("Ubik: Synchronize database failed (error = %d)\n", code); } else { ubik_print("Ubik: Synchronize database completed\n"); urecovery_state |= UBIK_RECHAVEDB; } udisk_Invalidate(ubik_dbase, 0); /* data has changed */ #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&ubik_dbase->version_cond); #else LWP_NoYieldSignal(&ubik_dbase->version); #endif } if (!(urecovery_state & UBIK_RECHAVEDB)) { DBRELE(ubik_dbase); continue; /* not ready */ } /* If the database was newly initialized, then when we establish quorum, write * a new label. This allows urecovery_AllBetter() to allow access for reads. * Setting it to 2 also allows another site to come along with a newer * database and overwrite this one. */ if (ubik_dbase->version.epoch == 1) { urecovery_AbortAll(ubik_dbase); UBIK_VERSION_LOCK; version_globals.ubik_epochTime = 2; ubik_dbase->version.epoch = version_globals.ubik_epochTime; ubik_dbase->version.counter = 1; code = (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version); UBIK_VERSION_UNLOCK; udisk_Invalidate(ubik_dbase, 0); /* data may have changed */ #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&ubik_dbase->version_cond); #else LWP_NoYieldSignal(&ubik_dbase->version); #endif } /* Check the other sites and send the database to them if they * do not have the current db. */ if (!(urecovery_state & UBIK_RECSENTDB)) { /* now propagate out new version to everyone else */ dbok = 1; /* start off assuming they all worked */ /* * Check if a write transaction is in progress. We can't send the * db when a write is in progress here because the db would be * obsolete as soon as it goes there. Also, ops after the begin * trans would reach the recepient and wouldn't find a transaction * pending there. Frankly, I don't think it's possible to get past * the write-lock above if there is a write transaction in progress, * but then, it won't hurt to check, will it? 
*/ if (ubik_dbase->flags & DBWRITING) { struct timeval tv; int safety = 0; long cur_usec = 50000; while ((ubik_dbase->flags & DBWRITING) && (safety < 500)) { DBRELE(ubik_dbase); /* sleep for a little while */ tv.tv_sec = 0; tv.tv_usec = cur_usec; #ifdef AFS_PTHREAD_ENV select(0, 0, 0, 0, &tv); #else IOMGR_Select(0, 0, 0, 0, &tv); #endif cur_usec += 10000; safety++; DBHOLD(ubik_dbase); } } for (ts = ubik_servers; ts; ts = ts->next) { UBIK_ADDR_LOCK; inAddr.s_addr = ts->addr[0]; UBIK_ADDR_UNLOCK; UBIK_BEACON_LOCK; if (!ts->up) { UBIK_BEACON_UNLOCK; ubik_dprint("recovery cannot send version to %s\n", afs_inet_ntoa_r(inAddr.s_addr, hoststr)); dbok = 0; continue; } UBIK_BEACON_UNLOCK; ubik_dprint("recovery sending version to %s\n", afs_inet_ntoa_r(inAddr.s_addr, hoststr)); if (vcmp(ts->version, ubik_dbase->version) != 0) { ubik_dprint("recovery stating local database\n"); /* Rx code to do the Bulk Store */ code = (*ubik_dbase->stat) (ubik_dbase, 0, &ubikstat); if (!code) { length = ubikstat.size; file = offset = 0; UBIK_ADDR_LOCK; rxcall = rx_NewCall(ts->disk_rxcid); UBIK_ADDR_UNLOCK; code = StartDISK_SendFile(rxcall, file, length, &ubik_dbase->version); if (code) { ubik_dprint("StartDiskSendFile failed=%d\n", code); goto StoreEndCall; } while (length > 0) { tlen = (length > sizeof(tbuffer) ? 
sizeof(tbuffer) : length); nbytes = (*ubik_dbase->read) (ubik_dbase, file, tbuffer, offset, tlen); if (nbytes != tlen) { ubik_dprint("Local disk read error=%d\n", code = UIOERROR); goto StoreEndCall; } nbytes = rx_Write(rxcall, tbuffer, tlen); if (nbytes != tlen) { ubik_dprint("Rx-write bulk error=%d\n", code = BULK_ERROR); goto StoreEndCall; } offset += tlen; length -= tlen; } code = EndDISK_SendFile(rxcall); StoreEndCall: code = rx_EndCall(rxcall, code); } if (code == 0) { /* we set a new file, process its header */ ts->version = ubik_dbase->version; ts->currentDB = 1; } else dbok = 0; } else { /* mark file up to date */ ts->currentDB = 1; } } if (dbok) urecovery_state |= UBIK_RECSENTDB; } DBRELE(ubik_dbase); } return NULL; }
/*
 * RPC handler: receive a complete database file from the caller (normally
 * the sync site) and install it locally.
 *
 * The incoming data is written to a "<path>.DB<file>.TMP" file while the
 * on-disk label is marked invalid (epoch 0); once all `length` bytes have
 * arrived the temp file is renamed over the real database, the label is
 * set to `avers`, disk buffers are invalidated and version waiters are
 * woken.
 *
 * \param rxcall  the Rx call the data is read from
 * \param file    database file number
 * \param length  number of bytes the caller will send
 * \param avers   version to label the new database with
 * \return 0 on success, otherwise an error code (authentication failure,
 *         USYNC, errno from open(), BULK_ERROR, UIOERROR, ...)
 */
afs_int32
SDISK_SendFile(struct rx_call *rxcall, afs_int32 file,
               afs_int32 length, struct ubik_version *avers)
{
    afs_int32 code;
    struct ubik_dbase *dbase = NULL;
    char tbuffer[1024];
    afs_int32 offset;
    struct ubik_version tversion;
    int tlen;
    struct rx_peer *tpeer;
    struct rx_connection *tconn;
    afs_uint32 otherHost = 0;
    char hoststr[16];
    char pbuffer[1028];
    int fd = -1;
    afs_int32 epoch = 0;
    afs_int32 pass;

    /*
     * The failure path below uses both of these, and it can be reached
     * before either has been filled in (e.g. on an authentication
     * failure), so give them defined contents up front.
     */
    pbuffer[0] = '\0';
    tversion.epoch = 0;
    tversion.counter = 0;

    /* send the file back to the requester */

    dbase = ubik_dbase;

    if ((code = ubik_CheckAuth(rxcall))) {
        DBHOLD(dbase);
        goto failed;
    }

    /* next, we do a sanity check to see if the guy sending us the database is
     * the guy we think is the sync site.  It turns out that we might not have
     * decided yet that someone's the sync site, but they could have enough
     * votes from others to be sync site anyway, and could send us the database
     * in advance of getting our votes.  This is fine, what we're really trying
     * to check is that some authenticated bogon isn't sending a random database
     * into another configuration.  This could happen on a bad configuration
     * screwup.  Thus, we only object if we're sure we know who the sync site
     * is, and it ain't the guy talking to us.
     */
    offset = uvote_GetSyncSite();
    tconn = rx_ConnectionOf(rxcall);
    tpeer = rx_PeerOf(tconn);
    otherHost = ubikGetPrimaryInterfaceAddr(rx_HostOf(tpeer));
    if (offset && offset != otherHost) {
        /* we *know* this is the wrong guy */
        code = USYNC;
        DBHOLD(dbase);
        goto failed;
    }

    DBHOLD(dbase);

    /* abort any active trans that may scribble over the database */
    urecovery_AbortAll(dbase);

    ubik_print("Ubik: Synchronize database with server %s\n",
               afs_inet_ntoa_r(otherHost, hoststr));

    offset = 0;
    UBIK_VERSION_LOCK;
    epoch = tversion.epoch = 0; /* start off by labelling in-transit db as invalid */
    (*dbase->setlabel) (dbase, file, &tversion);        /* setlabel does sync */
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600);
    if (fd < 0) {
        code = errno;
        goto failed_locked;
    }
    code = lseek(fd, HDRSIZE, 0);
    if (code != HDRSIZE) {
        close(fd);
        goto failed_locked;
    }
    pass = 0;
    memcpy(&ubik_dbase->version, &tversion, sizeof(struct ubik_version));
    UBIK_VERSION_UNLOCK;

    /* copy the database body from the Rx call into the temp file */
    while (length > 0) {
        tlen = (length > sizeof(tbuffer) ? sizeof(tbuffer) : length);
#if !defined(AFS_PTHREAD_ENV)
        if (pass % 4 == 0)
            IOMGR_Poll();
#endif
        code = rx_Read(rxcall, tbuffer, tlen);
        if (code != tlen) {
            ubik_dprint("Rx-read length error=%d\n", code);
            code = BULK_ERROR;
            close(fd);
            goto failed;
        }
        code = write(fd, tbuffer, tlen);
        pass++;
        if (code != tlen) {
            ubik_dprint("write failed error=%d\n", code);
            code = UIOERROR;
            close(fd);
            goto failed;
        }
        offset += tlen;
        length -= tlen;
    }
    code = close(fd);
    if (code)
        goto failed;

    /* sync data first, then write label and resync (resync done by setlabel call).
     * This way, good label is only on good database. */
    snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
#ifdef AFS_NT40_ENV
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    code = unlink(pbuffer);
    if (!code)
        code = rename(tbuffer, pbuffer);
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
#endif
    if (!code)
        code = rename(pbuffer, tbuffer);
    UBIK_VERSION_LOCK;
    if (!code) {
        (*ubik_dbase->open) (ubik_dbase, file);
        code = (*ubik_dbase->setlabel) (dbase, file, avers);
    }
#ifdef AFS_NT40_ENV
    snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
             ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
    unlink(pbuffer);
#endif
    memcpy(&ubik_dbase->version, avers, sizeof(struct ubik_version));
    udisk_Invalidate(dbase, file);      /* new dbase, flush disk buffers */
#ifdef AFS_PTHREAD_ENV
    {
        /*
         * Keep the broadcast out of assert(): with NDEBUG defined the
         * assert() argument is not evaluated and the wakeup would vanish.
         */
        int rc = pthread_cond_broadcast(&dbase->version_cond);

        assert(rc == 0);
        (void)rc;               /* quiet "unused" warnings under NDEBUG */
    }
#else
    LWP_NoYieldSignal(&dbase->version);
#endif

failed_locked:
    UBIK_VERSION_UNLOCK;

failed:
    if (code) {
        /* only remove a file whose name we actually built */
        if (pbuffer[0] != '\0')
            unlink(pbuffer);
        /* Failed to sync.  Allow reads again for now. */
        /*
         * NOTE(review): this is also reached when ubik_CheckAuth() or the
         * sync-site check fails, i.e. before this call touched the label;
         * the label is still rewritten with epoch 0 in that case -- confirm
         * that is intended.
         */
        if (dbase != NULL) {
            UBIK_VERSION_LOCK;
            tversion.epoch = epoch;
            (*dbase->setlabel) (dbase, file, &tversion);
            UBIK_VERSION_UNLOCK;
        }
        ubik_print
            ("Ubik: Synchronize database with server %s failed (error = %d)\n",
             afs_inet_ntoa_r(otherHost, hoststr), code);
    } else {
        ubik_print("Ubik: Synchronize database completed\n");
    }
    DBRELE(dbase);
    return code;
}