/**
 * Write an op to the oplog that is already built.
 * TODO: make _logOpRS() call this so we don't repeat ourselves?
 */
OpTime _logOpObjRS(OperationContext* txn, const BSONObj& op) {
    Lock::DBLock lk(txn->lockState(), "local", newlm::MODE_X);
    // XXX soon this needs to be part of an outer WUOW, not its own.
    // We can't do this yet due to locking limitations.
    WriteUnitOfWork wunit(txn);

    const OpTime ts = op["ts"]._opTime();
    long long hash = op["h"].numberLong();

    {
        if ( localOplogRSCollection == 0 ) {
            Client::Context ctx(txn, rsoplog);
            localDB = ctx.db();
            verify( localDB );
            localOplogRSCollection = localDB->getCollection(txn, rsoplog);
            massert(13389,
                    "local.oplog.rs missing. did you drop it? if so restart server",
                    localOplogRSCollection);
        }

        Client::Context ctx(txn, rsoplog, localDB);
        checkOplogInsert(localOplogRSCollection->insertDocument(txn, op, false));

        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
        OpTime myLastOptime = replCoord->getMyLastOptime();
        if (!(myLastOptime < ts)) {
            severe() << "replication oplog stream went back in time. previous timestamp: "
                     << myLastOptime << " newest timestamp: " << ts;
            fassertFailedNoTrace(18905);
        }

        BackgroundSync* bgsync = BackgroundSync::get();
        // Keep this up-to-date, in case we step up to primary.
        bgsync->setLastAppliedHash(hash);

        ctx.getClient()->setLastOp( ts );

        replCoord->setMyLastOptime(txn, ts);
        bgsync->notify();
    }

    setNewOptime(ts);

    wunit.commit();
    return ts;
}
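// Illustrative only (not part of the original source): a sketch of the kind of already-built
// entry _logOpObjRS() expects. It reads "ts" and "h" directly from the document, so both must
// be present; the remaining fields mirror what _logOpRS() below writes. The namespace, values,
// and the caller-supplied txn/ts are hypothetical.
//
//     BSONObjBuilder b;
//     b.appendTimestamp("ts", ts.asDate());   // required: read back via op["ts"]._opTime()
//     b.append("h", 1234567LL);               // required: read back via op["h"].numberLong()
//     b.append("v", OPLOG_VERSION);
//     b.append("op", "i");                    // e.g. an insert
//     b.append("ns", "test.coll");            // hypothetical namespace
//     b.append("o", BSON("_id" << 1));        // the inserted document
//     OpTime written = _logOpObjRS(txn, b.obj());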
/* tail an oplog. ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    while (!inShutdown()) {
        OpQueue ops;
        OperationContextImpl txn;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    Lock::DBLock lk(txn.lockState(), "local", MODE_X);
                    WriteUnitOfWork wunit(&txn);
                    Client::Context ctx(&txn, "local");
                    ctx.db()->dropCollection(&txn, "local.oplog.rs");

                    // Note: the following order is important.
                    // The bgsync thread uses an empty optime as a sentinel to know to wait
                    // for initial sync (done in this thread after we return); thus, we must
                    // ensure the lastAppliedOptime is empty before pausing the bgsync thread
                    // via stop().
                    // We must clear the sync source blacklist after calling stop()
                    // because the bgsync thread, while running, may update the blacklist.
                    replCoord->setMyLastOptime(&txn, OpTime());
                    bgsync->stop();
                    replCoord->clearSyncSourceBlacklist();

                    wunit.commit();

                    return;
                }
                lastTimeChecked = now;

                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);

                // TODO(emilkie): This can be removed once we switch over from legacy;
                // this code is what moves 1-node sets to PRIMARY state.
                // normally msgCheckNewState gets called periodically, but in a single node
                // replset there are no heartbeat threads, so we do it here to be sure. this
                // is relevant if the singleton member has done a stepDown() and needs to
                // come back up.
                if (theReplSet &&
                        theReplSet->config().members.size() == 1 &&
                        theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null? During replsettest'ing, in which case we should
                    // fall through and actually apply ops as if we were a real secondary.
                    if (mgr) {
                        mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                        sleepsecs(1);
                        // There should never be ops to sync in a 1-member set, anyway
                        return;
                    }
                }
            }

            const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&ops, replCoord) &&  // tryPopAndWaitForMore returns true
                                                            // when we need to end a batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj& lastOp = ops.getDeque().back();
        handleSlaveDelay(lastOp);

        if (replCoord->getCurrentMemberState().primary() &&
                !replCoord->isWaitingForApplierToDrain()) {
            severe() << "attempting to replicate ops while primary";
            fassertFailed(28527);
        }

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        OpTime minValid = lastOp["ts"]._opTime();
        setMinValid(&txn, minValid);

        multiApply(ops.getDeque());

        applyOpsToOplog(&ops.getDeque());

        // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
        // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication
        // coordinator
        if (theReplSet && !theReplSet->mgr) {
            BSONObj op;
            if (!peek(&op)) {
                return;
            }
        }
    }
}
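// Illustrative only: a condensed restatement of the batch-cutoff rules applied in the do-while
// loop above. The helper function is hypothetical; the replBatchLimit* names and the slave-delay
// arithmetic mirror the code above.
//
//     static bool shouldEndBatch(int elapsedSecs, size_t numOps, size_t numBytes,
//                                unsigned int lastOpTimestampSecs, int slaveDelaySecs) {
//         if (elapsedSecs > replBatchLimitSeconds) return true;    // time limit
//         if (numOps > replBatchLimitOperations) return true;      // op-count limit
//         if (numBytes >= replBatchLimitBytes) return true;        // size limit
//         // slave delay: end the batch if the newest op is still inside the delay window
//         if (slaveDelaySecs > 0 &&
//             lastOpTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs))
//             return true;
//         return false;
//     }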
static void _logOpOld(OperationContext* txn,
                      const char *opstr,
                      const char *ns,
                      const char *logNS,
                      const BSONObj& obj,
                      BSONObj *o2,
                      bool *bb,
                      bool fromMigrate ) {
    Lock::DBLock lk(txn->lockState(), "local", newlm::MODE_X);
    WriteUnitOfWork wunit(txn);
    static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor

    if ( strncmp(ns, "local.", 6) == 0 ) {
        if ( strncmp(ns, "local.slaves", 12) == 0 ) {
            resetSlaveCache();
        }
        return;
    }

    mutex::scoped_lock lk2(newOpMutex);

    OpTime ts(getNextGlobalOptime());
    newOptimeNotifier.notify_all();

    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */

    bufbuilder.reset();
    BSONObjBuilder b(bufbuilder);
    b.appendTimestamp("ts", ts.asDate());
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 )
        b.append("o2", *o2);
    BSONObj partial = b.done(); // partial is everything except the o:... part.

    if( logNS == 0 ) {
        logNS = "local.oplog.$main";
    }

    if ( localOplogMainCollection == 0 ) {
        Client::Context ctx(txn, logNS);
        localDB = ctx.db();
        verify( localDB );
        localOplogMainCollection = localDB->getCollection(txn, logNS);
        verify( localOplogMainCollection );
    }

    Client::Context ctx(txn, logNS, localDB);
    OplogDocWriter writer( partial, obj );
    checkOplogInsert( localOplogMainCollection->insertDocument( txn, &writer, false ) );

    ctx.getClient()->setLastOp( ts );

    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    replCoord->setMyLastOptime(txn, ts);

    wunit.commit();
}
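// Conceptual sketch (not from the original source): OplogDocWriter(partial, obj) stands in for
// the full oplog document so the "o" payload is copied only once, directly into its destination.
// Logically, the written entry is equivalent to the following; the real writer avoids
// materializing this intermediate object, and this is illustration only.
//
//     BSONObjBuilder full;
//     full.appendElements(partial);   // ts, op, ns, and optional fromMigrate/b/o2 fields
//     full.append("o", obj);          // the object payload, appended here without the extra copy
//     BSONObj logicalEntry = full.obj();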
static void _logOpRS(OperationContext* txn,
                     const char *opstr,
                     const char *ns,
                     const char *logNS,
                     const BSONObj& obj,
                     BSONObj *o2,
                     bool *bb,
                     bool fromMigrate ) {
    Lock::DBLock lk1(txn->lockState(), "local", newlm::MODE_X);
    WriteUnitOfWork wunit(txn);

    if ( strncmp(ns, "local.", 6) == 0 ) {
        if ( strncmp(ns, "local.slaves", 12) == 0 )
            resetSlaveCache();
        return;
    }

    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    mutex::scoped_lock lk2(newOpMutex);

    OpTime ts(getNextGlobalOptime());
    newOptimeNotifier.notify_all();

    long long hashNew = BackgroundSync::get()->getLastAppliedHash();

    // Check to make sure logOp() is legal at this point.
    if (*opstr == 'n') {
        // 'n' operations are always logged
        invariant(*ns == '\0');

        // 'n' operations do not advance the hash, since they are not rolled back
    }
    else {
        if (!replCoord->canAcceptWritesForDatabase(nsToDatabaseSubstring(ns))) {
            severe() << "replSet error : logOp() but can't accept write to collection " << ns;
            fassertFailed(17405);
        }

        // Advance the hash
        hashNew = (hashNew * 131 + ts.asLL()) * 17 + replCoord->getMyId();
    }

    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */

    logopbufbuilder.reset();
    BSONObjBuilder b(logopbufbuilder);
    b.appendTimestamp("ts", ts.asDate());
    b.append("h", hashNew);
    b.append("v", OPLOG_VERSION);
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 )
        b.append("o2", *o2);
    BSONObj partial = b.done();

    DEV verify( logNS == 0 ); // check this was never a master/slave master

    if ( localOplogRSCollection == 0 ) {
        Client::Context ctx(txn, rsoplog);
        localDB = ctx.db();
        verify( localDB );
        localOplogRSCollection = localDB->getCollection( txn, rsoplog );
        massert(13347,
                "local.oplog.rs missing. did you drop it? if so restart server",
                localOplogRSCollection);
    }

    Client::Context ctx(txn, rsoplog, localDB);
    OplogDocWriter writer( partial, obj );
    checkOplogInsert( localOplogRSCollection->insertDocument( txn, &writer, false ) );

    BackgroundSync::get()->setLastAppliedHash(hashNew);
    ctx.getClient()->setLastOp( ts );
    replCoord->setMyLastOptime(txn, ts);

    wunit.commit();
}
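// Illustrative only: the hash-chaining rule applied above, restated as a standalone function.
// Each non-noop entry folds the new optime and this member's id into the previous hash, so two
// members that applied different ops diverge in "h" even when their optimes happen to match.
// The helper name is hypothetical; the arithmetic mirrors the code above.
//
//     static long long advanceOplogHash(long long prevHash, long long tsAsLL, int myId) {
//         return (prevHash * 131 + tsAsLL) * 17 + myId;
//     }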