bool SyncSourceFeedback::updateUpstream(OperationContext* txn) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getCurrentMemberState().primary()) {
        // primary has no one to update to
        return true;
    }
    BSONObjBuilder cmd;
    {
        boost::unique_lock<boost::mutex> lock(_mtx);
        if (_handshakeNeeded) {
            // Don't send updates if there are nodes that haven't yet been handshaked
            return false;
        }
        replCoord->prepareReplSetUpdatePositionCommand(txn, &cmd);
    }
    BSONObj res;

    LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done();
    bool ok;
    try {
        ok = _connection->runCommand("admin", cmd.obj(), res);
    }
    catch (const DBException& e) {
        log() << "SyncSourceFeedback error sending update: " << e.what() << endl;
        _resetConnection();
        return false;
    }
    if (!ok) {
        log() << "SyncSourceFeedback error sending update, response: " << res.toString() << endl;
        _resetConnection();
        return false;
    }
    return true;
}
Status SyncSourceFeedback::updateUpstream(OperationContext* txn) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getCurrentMemberState().primary()) {
        // primary has no one to update to
        return Status::OK();
    }
    BSONObjBuilder cmd;
    {
        boost::unique_lock<boost::mutex> lock(_mtx);
        if (_handshakeNeeded) {
            // Don't send updates if there are nodes that haven't yet been handshaked
            return Status(ErrorCodes::NodeNotFound,
                          "Need to send handshake before updating position upstream");
        }
        replCoord->prepareReplSetUpdatePositionCommand(txn, &cmd);
    }
    BSONObj res;

    LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done();
    try {
        _connection->runCommand("admin", cmd.obj(), res);
    }
    catch (const DBException& e) {
        log() << "SyncSourceFeedback error sending update: " << e.what() << endl;
        _resetConnection();
        return e.toStatus();
    }

    Status status = Command::getStatusFromCommandResult(res);
    if (!status.isOK()) {
        log() << "SyncSourceFeedback error sending update, response: " << res.toString() << endl;
        _resetConnection();
    }
    return status;
}
void SyncSourceFeedback::run() {
    Client::initThread("SyncSourceFeedbackThread");
    OperationContextImpl txn;

    bool positionChanged = false;
    bool handshakeNeeded = false;
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    while (!inShutdown()) {  // TODO(spencer): Remove once legacy repl coordinator is gone.
        {
            boost::unique_lock<boost::mutex> lock(_mtx);
            while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) {
                _cond.wait(lock);
            }

            if (_shutdownSignaled) {
                break;
            }

            positionChanged = _positionChanged;
            handshakeNeeded = _handshakeNeeded;
            _positionChanged = false;
            _handshakeNeeded = false;
        }

        MemberState state = replCoord->getCurrentMemberState();
        if (state.primary() || state.fatal() || state.startup()) {
            continue;
        }
        const Member* target = BackgroundSync::get()->getSyncTarget();
        if (_syncTarget != target) {
            _resetConnection();
            _syncTarget = target;
        }
        if (!hasConnection()) {
            // fix connection if need be
            if (!target) {
                sleepmillis(500);
                continue;
            }
            if (!_connect(&txn, target->fullName())) {
                sleepmillis(500);
                continue;
            }
        }
        if (handshakeNeeded) {
            if (!replHandshake(&txn)) {
                boost::unique_lock<boost::mutex> lock(_mtx);
                _handshakeNeeded = true;
                continue;
            }
        }
        if (positionChanged) {
            if (!updateUpstream(&txn)) {
                boost::unique_lock<boost::mutex> lock(_mtx);
                _positionChanged = true;
            }
        }
    }
    cc().shutdown();
}
void SyncTail::handleSlaveDelay(const BSONObj& lastOp) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());

    // ignore slaveDelay if the box is still initializing. once
    // it becomes secondary we can worry about it.
    if (slaveDelaySecs > 0 && replCoord->getMemberState().secondary()) {
        const Timestamp ts = lastOp["ts"].timestamp();
        long long a = ts.getSecs();
        long long b = time(0);
        long long lag = b - a;
        long long sleeptime = slaveDelaySecs - lag;
        if (sleeptime > 0) {
            uassert(12000,
                    "rs slaveDelay differential too big check clocks and systems",
                    sleeptime < 0x40000000);
            if (sleeptime < 60) {
                sleepsecs((int)sleeptime);
            } else {
                warning() << "slavedelay causing a long sleep of " << sleeptime << " seconds";
                // sleep(hours) would prevent reconfigs from taking effect & such!
                long long waitUntil = b + sleeptime;
                while (time(0) < waitUntil) {
                    sleepsecs(6);

                    // Handle reconfigs that changed the slave delay
                    if (durationCount<Seconds>(replCoord->getSlaveDelaySecs()) != slaveDelaySecs)
                        break;
                }
            }
        }
    }  // endif slaveDelay
}
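// Illustrative only (not part of the function above): a worked example of the slaveDelay
// arithmetic, using hypothetical numbers.
//   Suppose slaveDelaySecs = 3600 and the newest op in the batch is 120 seconds old:
//     lag       = time(0) - ts.getSecs()   = 120
//     sleeptime = slaveDelaySecs - lag     = 3480
//   Since sleeptime >= 60, the thread sleeps in 6-second slices and re-reads
//   getSlaveDelaySecs() between slices, so a reconfig that shortens or removes the delay
//   takes effect without waiting out the full 3480 seconds.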
void SyncSourceFeedback::run() {
    Client::initThread("SyncSourceFeedback");

    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    while (true) {  // breaks once _shutdownSignaled is true
        {
            stdx::unique_lock<stdx::mutex> lock(_mtx);
            while (!_positionChanged && !_shutdownSignaled) {
                if (_cond.wait_for(lock, _keepAliveInterval) == stdx::cv_status::timeout) {
                    break;
                }
            }

            if (_shutdownSignaled) {
                break;
            }

            _positionChanged = false;
        }

        auto txn = cc().makeOperationContext();
        MemberState state = replCoord->getMemberState();
        if (state.primary() || state.startup()) {
            _resetConnection();
            continue;
        }
        const HostAndPort target = BackgroundSync::get()->getSyncTarget();
        if (_syncTarget != target) {
            _resetConnection();
            _syncTarget = target;
        }
        if (!hasConnection()) {
            // fix connection if need be
            if (target.empty()) {
                sleepmillis(500);
                stdx::unique_lock<stdx::mutex> lock(_mtx);
                _positionChanged = true;
                continue;
            }
            if (!_connect(txn.get(), target)) {
                sleepmillis(500);
                stdx::unique_lock<stdx::mutex> lock(_mtx);
                _positionChanged = true;
                continue;
            }
        }
        Status status = updateUpstream(txn.get());
        if (!status.isOK()) {
            sleepmillis(500);
            stdx::unique_lock<stdx::mutex> lock(_mtx);
            _positionChanged = true;
        }
    }
}
bool SyncSourceFeedback::replHandshake(OperationContext* txn) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getCurrentMemberState().primary()) {
        // primary has no one to handshake to
        return true;
    }
    // construct a vector of handshake obj for us as well as all chained members
    std::vector<BSONObj> handshakeObjs;
    replCoord->prepareReplSetUpdatePositionCommandHandshakes(txn, &handshakeObjs);
    LOG(1) << "handshaking upstream updater";
    for (std::vector<BSONObj>::iterator it = handshakeObjs.begin();
            it != handshakeObjs.end();
            ++it) {
        BSONObj res;
        try {
            LOG(2) << "Sending to " << _connection.get()->toString() << " the replication "
                    "handshake: " << *it;
            if (!_connection->runCommand("admin", *it, res)) {
                std::string errMsg = res["errmsg"].valuestrsafe();
                massert(17447, "upstream updater is not supported by the member from which we"
                        " are syncing, please update all nodes to 2.6 or later.",
                        errMsg.find("no such cmd") == std::string::npos);

                log() << "replSet error while handshaking the upstream updater: " << errMsg;

                // sleep half a second if we are not in our sync source's config
                // TODO(dannenberg) after 2.8, remove the string comparison
                if (res["code"].numberInt() == ErrorCodes::NodeNotFound ||
                        errMsg.find("could not be found in replica set config while attempting "
                                    "to associate it with") != std::string::npos) {

                    // black list sync target for 10 seconds and find a new one
                    replCoord->blacklistSyncSource(_syncTarget,
                                                   Date_t(curTimeMillis64() + 10*1000));
                    BackgroundSync::get()->clearSyncTarget();
                }

                _resetConnection();
                return false;
            }
        }
        catch (const DBException& e) {
            log() << "SyncSourceFeedback error sending handshake: " << e.what() << endl;
            _resetConnection();
            return false;
        }
    }
    return true;
}
Status SyncSourceFeedback::updateUpstream(OperationContext* txn) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getMemberState().primary()) {
        // primary has no one to update to
        return Status::OK();
    }
    BSONObjBuilder cmd;
    {
        boost::unique_lock<boost::mutex> lock(_mtx);
        if (_handshakeNeeded) {
            // Don't send updates if there are nodes that haven't yet been handshaked
            return Status(ErrorCodes::NodeNotFound,
                          "Need to send handshake before updating position upstream");
        }
        // the command could not be created, likely because the node was removed from the set
        if (!replCoord->prepareReplSetUpdatePositionCommand(&cmd)) {
            return Status::OK();
        }
    }
    BSONObj res;

    LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done();
    try {
        _connection->runCommand("admin", cmd.obj(), res);
    }
    catch (const DBException& e) {
        log() << "SyncSourceFeedback error sending update: " << e.what() << endl;
        // blacklist sync target for .5 seconds and find a new one
        replCoord->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 500));
        BackgroundSync::get()->clearSyncTarget();
        _resetConnection();
        return e.toStatus();
    }

    Status status = Command::getStatusFromCommandResult(res);
    if (!status.isOK()) {
        log() << "SyncSourceFeedback error sending update, response: " << res.toString() << endl;
        // blacklist sync target for .5 seconds and find a new one
        replCoord->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 500));
        BackgroundSync::get()->clearSyncTarget();
        _resetConnection();
    }
    return status;
}
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool fromRepl) {
    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite globalWriteLock(txn->lockState());
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (getGlobalReplicationCoordinator()->getSettings().usingReplSets()) {
        const MemberState memberState = replCoord->getMemberState();
        if (memberState.startup()) {
            return appendCommandStatus(
                result, Status(ErrorCodes::NotYetInitialized, "no replication yet active"));
        }
        if (memberState.primary() || !replCoord->setFollowerMode(MemberState::RS_STARTUP2)) {
            return appendCommandStatus(
                result, Status(ErrorCodes::NotSecondary, "primaries cannot resync"));
        }
        BackgroundSync::get()->setInitialSyncRequestedFlag(true);
        return true;
    }

    // below this comment pertains only to master/slave replication
    if ( cmdObj.getBoolField( "force" ) ) {
        if ( !waitForSyncToFinish(txn, errmsg ) )
            return false;
        replAllDead = "resync forced";
    }
    // TODO(dannenberg) replAllDead is bad and should be removed when masterslave is removed
    if (!replAllDead) {
        errmsg = "not dead, no need to resync";
        return false;
    }
    if ( !waitForSyncToFinish(txn, errmsg ) )
        return false;

    ReplSource::forceResyncDead( txn, "client" );
    result.append( "info", "triggered resync for all sources" );

    return true;
}
BSONObj generateSection(OperationContext* opCtx, const BSONElement& configElement) const {
    ReplicationCoordinator* replCoord = ReplicationCoordinator::get(opCtx);
    if (!replCoord->isReplEnabled()) {
        return BSONObj();
    }

    BSONObjBuilder result;
    // TODO(siyuan) Output term of OpTime
    result.append("latestOptime", replCoord->getMyLastAppliedOpTime().getTimestamp());

    BSONObj o;
    uassert(17347,
            "Problem reading earliest entry from oplog",
            Helpers::getSingleton(opCtx, NamespaceString::kRsOplogNamespace.ns().c_str(), o));
    result.append("earliestOptime", o["ts"].timestamp());
    return result.obj();
}
/** write an op to the oplog that is already built.
    todo : make _logOpRS() call this so we don't repeat ourself?
*/
OpTime _logOpObjRS(OperationContext* txn, const BSONObj& op) {
    Lock::DBLock lk(txn->lockState(), "local", newlm::MODE_X);

    // XXX soon this needs to be part of an outer WUOW not its own.
    // We can't do this yet due to locking limitations.
    WriteUnitOfWork wunit(txn);

    const OpTime ts = op["ts"]._opTime();
    long long hash = op["h"].numberLong();

    {
        if ( localOplogRSCollection == 0 ) {
            Client::Context ctx(txn, rsoplog);
            localDB = ctx.db();
            verify( localDB );
            localOplogRSCollection = localDB->getCollection(txn, rsoplog);
            massert(13389,
                    "local.oplog.rs missing. did you drop it? if so restart server",
                    localOplogRSCollection);
        }
        Client::Context ctx(txn, rsoplog, localDB);
        checkOplogInsert(localOplogRSCollection->insertDocument(txn, op, false));

        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
        OpTime myLastOptime = replCoord->getMyLastOptime();
        if (!(myLastOptime < ts)) {
            severe() << "replication oplog stream went back in time. previous timestamp: "
                     << myLastOptime << " newest timestamp: " << ts;
            fassertFailedNoTrace(18905);
        }

        BackgroundSync* bgsync = BackgroundSync::get();
        // Keep this up-to-date, in case we step up to primary.
        bgsync->setLastAppliedHash(hash);

        ctx.getClient()->setLastOp( ts );

        replCoord->setMyLastOptime(txn, ts);
        bgsync->notify();
    }

    setNewOptime(ts);
    wunit.commit();
    return ts;
}
// Applies a batch of oplog entries, by using a set of threads to apply the operations and then
// writes the oplog entries to the local oplog.
OpTime SyncTail::multiApply(OperationContext* txn, const OpQueue& ops) {
    invariant(_applyFunc);

    if (getGlobalServiceContext()->getGlobalStorageEngine()->isMmapV1()) {
        // Use a ThreadPool to prefetch all the operations in a batch.
        prefetchOps(ops.getDeque(), &_prefetcherPool);
    }

    std::vector<std::vector<BSONObj>> writerVectors(replWriterThreadCount);

    fillWriterVectors(txn, ops.getDeque(), &writerVectors);
    LOG(2) << "replication batch size is " << ops.getDeque().size() << endl;
    // We must grab this because we're going to grab write locks later.
    // We hold this mutex the entire time we're writing; it doesn't matter
    // because all readers are blocked anyway.
    stdx::lock_guard<SimpleMutex> fsynclk(filesLockedFsync);

    // stop all readers until we're done
    Lock::ParallelBatchWriterMode pbwm(txn->lockState());

    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getMemberState().primary() && !replCoord->isWaitingForApplierToDrain()) {
        severe() << "attempting to replicate ops while primary";
        fassertFailed(28527);
    }

    applyOps(writerVectors, &_writerPool, _applyFunc, this);

    OpTime lastOpTime;
    {
        ON_BLOCK_EXIT([&] { _writerPool.join(); });
        std::vector<BSONObj> raws;
        raws.reserve(ops.getDeque().size());
        for (auto&& op : ops.getDeque()) {
            raws.emplace_back(op.raw);
        }
        lastOpTime = writeOpsToOplog(txn, raws);
        if (inShutdown()) {
            return OpTime();
        }
    }
    // We have now written all database writes and updated the oplog to match.
    return lastOpTime;
}
BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (!replCoord->isReplEnabled()) {
        return BSONObj();
    }

    BSONObjBuilder result;
    // TODO(siyuan) Output term of OpTime
    result.append("latestOptime", replCoord->getMyLastOptime().getTimestamp());

    const std::string& oplogNS =
        replCoord->getReplicationMode() == ReplicationCoordinator::modeReplSet
            ? rsOplogName
            : masterSlaveOplogName;
    BSONObj o;
    uassert(17347,
            "Problem reading earliest entry from oplog",
            Helpers::getSingleton(txn, oplogNS.c_str(), o));
    result.append("earliestOptime", o["ts"].timestamp());
    return result.obj();
}
bool SyncSourceFeedback::replHandshake(OperationContext* txn) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getCurrentMemberState().primary()) {
        // primary has no one to handshake to
        return true;
    }
    // construct a vector of handshake obj for us as well as all chained members
    std::vector<BSONObj> handshakeObjs;
    replCoord->prepareReplSetUpdatePositionCommandHandshakes(txn, &handshakeObjs);
    LOG(1) << "handshaking upstream updater";
    for (std::vector<BSONObj>::iterator it = handshakeObjs.begin();
            it != handshakeObjs.end();
            ++it) {
        BSONObj res;
        try {
            LOG(2) << "Sending to " << _connection.get()->toString() << " the replication "
                    "handshake: " << *it;
            if (!_connection->runCommand("admin", *it, res)) {
                massert(17447, "upstream updater is not supported by the member from which we"
                        " are syncing, please update all nodes to 2.6 or later.",
                        res["errmsg"].str().find("no such cmd") == std::string::npos);
                log() << "replSet error while handshaking the upstream updater: "
                      << res["errmsg"].valuestrsafe();
                _resetConnection();
                return false;
            }
        }
        catch (const DBException& e) {
            log() << "SyncSourceFeedback error sending handshake: " << e.what() << endl;
            _resetConnection();
            return false;
        }
    }
    return true;
}
Status SyncSourceFeedback::updateUpstream(OperationContext* txn) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getMemberState().primary()) {
        // primary has no one to update to
        return Status::OK();
    }
    BSONObjBuilder cmd;
    {
        stdx::unique_lock<stdx::mutex> lock(_mtx);
        // the command could not be created, likely because the node was removed from the set
        if (!replCoord->prepareReplSetUpdatePositionCommand(&cmd)) {
            return Status::OK();
        }
    }
    BSONObj res;

    LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done();
    try {
        _connection->runCommand("admin", cmd.obj(), res);
    } catch (const DBException& e) {
        log() << "SyncSourceFeedback error sending update: " << e.what() << endl;
        // blacklist sync target for .5 seconds and find a new one
        replCoord->blacklistSyncSource(_syncTarget, Date_t::now() + Milliseconds(500));
        BackgroundSync::get()->clearSyncTarget();
        _resetConnection();
        return e.toStatus();
    }

    Status status = Command::getStatusFromCommandResult(res);
    if (!status.isOK()) {
        log() << "SyncSourceFeedback error sending update, response: " << res.toString() << endl;
        // blacklist sync target for .5 seconds and find a new one, unless we were rejected due
        // to the syncsource having a newer config
        if (status != ErrorCodes::InvalidReplicaSetConfig || res["cfgver"].eoo() ||
            res["cfgver"].numberLong() < replCoord->getConfig().getConfigVersion()) {
            replCoord->blacklistSyncSource(_syncTarget, Date_t::now() + Milliseconds(500));
            BackgroundSync::get()->clearSyncTarget();
            _resetConnection();
        }
    }
    return status;
}
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    ApplyBatchFinalizer finalizer(replCoord);

    OperationContextImpl txn;
    OpTime originalEndOpTime(getMinValid(&txn).end);
    while (!inShutdown()) {
        OpQueue ops;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }
            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back();
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) &&  // tryPopAndWaitForMore returns
                                                                  // true when we need to end a
                                                                  // batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
            if (inShutdown())
                return;
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj lastOp = ops.back();
        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
void appendReplicationInfo(OperationContext* opCtx, BSONObjBuilder& result, int level) {
    ReplicationCoordinator* replCoord = ReplicationCoordinator::get(opCtx);
    if (replCoord->getSettings().usingReplSets()) {
        IsMasterResponse isMasterResponse;
        replCoord->fillIsMasterForReplSet(&isMasterResponse);
        result.appendElements(isMasterResponse.toBSON());
        if (level) {
            replCoord->appendSlaveInfoData(&result);
        }
        return;
    }

    result.appendBool("ismaster",
                      ReplicationCoordinator::get(opCtx)->isMasterForReportingPurposes());

    if (level) {
        BSONObjBuilder sources(result.subarrayStart("sources"));

        int n = 0;
        list<BSONObj> src;
        {
            const NamespaceString localSources{"local.sources"};
            AutoGetCollectionForReadCommand ctx(opCtx, localSources);
            auto exec = InternalPlanner::collectionScan(
                opCtx, localSources.ns(), ctx.getCollection(), PlanExecutor::NO_YIELD);
            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                src.push_back(obj.getOwned());
            }

            // Non-yielding collection scans from InternalPlanner will never error.
            invariant(PlanExecutor::IS_EOF == state);
        }

        for (list<BSONObj>::const_iterator i = src.begin(); i != src.end(); i++) {
            BSONObj s = *i;
            BSONObjBuilder bb;
            bb.append(s["host"]);
            string sourcename = s["source"].valuestr();
            if (sourcename != "main")
                bb.append(s["source"]);
            {
                BSONElement e = s["syncedTo"];
                BSONObjBuilder t(bb.subobjStart("syncedTo"));
                t.appendDate("time", e.timestampTime());
                t.append("inc", e.timestampInc());
                t.done();
            }

            if (level > 1) {
                invariant(!opCtx->lockState()->isLocked());
                // note: there is no so-style timeout on this connection; perhaps we should have
                // one.
                ScopedDbConnection conn(s["host"].valuestr());

                DBClientConnection* cliConn = dynamic_cast<DBClientConnection*>(&conn.conn());
                if (cliConn && replAuthenticate(cliConn)) {
                    BSONObj first = conn->findOne((string) "local.oplog.$" + sourcename,
                                                  Query().sort(BSON("$natural" << 1)));
                    BSONObj last = conn->findOne((string) "local.oplog.$" + sourcename,
                                                 Query().sort(BSON("$natural" << -1)));
                    bb.appendDate("masterFirst", first["ts"].timestampTime());
                    bb.appendDate("masterLast", last["ts"].timestampTime());
                    const auto lag = (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
                    bb.append("lagSeconds", durationCount<Milliseconds>(lag) / 1000.0);
                }
                conn.done();
            }

            sources.append(BSONObjBuilder::numStr(n++), bb.obj());
        }

        sources.done();

        replCoord->appendSlaveInfoData(&result);
    }
}
static void _logOpRS(OperationContext* txn,
                     const char *opstr,
                     const char *ns,
                     const char *logNS,
                     const BSONObj& obj,
                     BSONObj *o2,
                     bool *bb,
                     bool fromMigrate ) {
    Lock::DBLock lk1(txn->lockState(), "local", newlm::MODE_X);
    WriteUnitOfWork wunit(txn);

    if ( strncmp(ns, "local.", 6) == 0 ) {
        if ( strncmp(ns, "local.slaves", 12) == 0 )
            resetSlaveCache();
        return;
    }

    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    mutex::scoped_lock lk2(newOpMutex);
    OpTime ts(getNextGlobalOptime());
    newOptimeNotifier.notify_all();

    long long hashNew = BackgroundSync::get()->getLastAppliedHash();

    // Check to make sure logOp() is legal at this point.
    if (*opstr == 'n') {
        // 'n' operations are always logged
        invariant(*ns == '\0');

        // 'n' operations do not advance the hash, since they are not rolled back
    }
    else {
        if (!replCoord->canAcceptWritesForDatabase(nsToDatabaseSubstring(ns))) {
            severe() << "replSet error : logOp() but can't accept write to collection " << ns;
            fassertFailed(17405);
        }

        // Advance the hash
        hashNew = (hashNew * 131 + ts.asLL()) * 17 + replCoord->getMyId();
    }

    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */

    logopbufbuilder.reset();
    BSONObjBuilder b(logopbufbuilder);
    b.appendTimestamp("ts", ts.asDate());
    b.append("h", hashNew);
    b.append("v", OPLOG_VERSION);
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 )
        b.append("o2", *o2);
    BSONObj partial = b.done();

    DEV verify( logNS == 0 ); // check this was never a master/slave master

    if ( localOplogRSCollection == 0 ) {
        Client::Context ctx(txn, rsoplog);
        localDB = ctx.db();
        verify( localDB );
        localOplogRSCollection = localDB->getCollection( txn, rsoplog );
        massert(13347,
                "local.oplog.rs missing. did you drop it? if so restart server",
                localOplogRSCollection);
    }

    Client::Context ctx(txn, rsoplog, localDB);
    OplogDocWriter writer( partial, obj );
    checkOplogInsert( localOplogRSCollection->insertDocument( txn, &writer, false ) );

    BackgroundSync::get()->setLastAppliedHash(hashNew);
    ctx.getClient()->setLastOp( ts );
    replCoord->setMyLastOptime(txn, ts);

    wunit.commit();
}
static void _logOpOld(OperationContext* txn,
                      const char *opstr,
                      const char *ns,
                      const char *logNS,
                      const BSONObj& obj,
                      BSONObj *o2,
                      bool *bb,
                      bool fromMigrate ) {
    Lock::DBLock lk(txn->lockState(), "local", newlm::MODE_X);
    WriteUnitOfWork wunit(txn);
    static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor

    if ( strncmp(ns, "local.", 6) == 0 ) {
        if ( strncmp(ns, "local.slaves", 12) == 0 ) {
            resetSlaveCache();
        }
        return;
    }

    mutex::scoped_lock lk2(newOpMutex);

    OpTime ts(getNextGlobalOptime());
    newOptimeNotifier.notify_all();

    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */

    bufbuilder.reset();
    BSONObjBuilder b(bufbuilder);
    b.appendTimestamp("ts", ts.asDate());
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 )
        b.append("o2", *o2);
    BSONObj partial = b.done(); // partial is everything except the o:... part.

    if( logNS == 0 ) {
        logNS = "local.oplog.$main";
    }

    if ( localOplogMainCollection == 0 ) {
        Client::Context ctx(txn, logNS);
        localDB = ctx.db();
        verify( localDB );
        localOplogMainCollection = localDB->getCollection(txn, logNS);
        verify( localOplogMainCollection );
    }

    Client::Context ctx(txn, logNS , localDB);
    OplogDocWriter writer( partial, obj );
    checkOplogInsert( localOplogMainCollection->insertDocument( txn, &writer, false ) );

    ctx.getClient()->setLastOp( ts );

    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    replCoord->setMyLastOptime(txn, ts);

    wunit.commit();
}
void runSyncThread() {
    Client::initThread("rsSync");
    AuthorizationSession::get(cc())->grantInternalAuthorization();
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    // Overwrite prefetch index mode in BackgroundSync if ReplSettings has a mode set.
    ReplSettings replSettings = replCoord->getSettings();
    if (replSettings.isPrefetchIndexModeSet())
        BackgroundSync::get()->setIndexPrefetchConfig(replSettings.getPrefetchIndexMode());

    while (!inShutdown()) {
        // After a reconfig, we may not be in the replica set anymore, so
        // check that we are in the set (and not an arbiter) before
        // trying to sync with other replicas.
        // TODO(spencer): Use a condition variable to await loading a config
        if (replCoord->getMemberState().startup()) {
            warning() << "did not receive a valid config yet";
            sleepsecs(1);
            continue;
        }

        const MemberState memberState = replCoord->getMemberState();

        // An arbiter can never transition to any other state, and doesn't replicate, ever
        if (memberState.arbiter()) {
            break;
        }

        // If we are removed then we don't belong to the set anymore
        if (memberState.removed()) {
            sleepsecs(5);
            continue;
        }

        try {
            if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) {
                sleepsecs(1);
                continue;
            }

            bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag();
            // Check criteria for doing an initial sync:
            // 1. If the oplog is empty, do an initial sync
            // 2. If minValid has _initialSyncFlag set, do an initial sync
            // 3. If initialSyncRequested is true
            if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() ||
                getInitialSyncFlag() || initialSyncRequested) {
                syncDoInitialSync();
                continue;  // start from top again in case sync failed.
            }

            if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
                continue;
            }

            /* we have some data.  continue tailing. */
            SyncTail tail(BackgroundSync::get(), multiSyncApply);
            tail.oplogApplication();
        } catch (...) {
            std::terminate();
        }
    }
}
void runSyncThread() {
    Client::initThread("rsSync");
    replLocalAuth();
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    // Set initial indexPrefetch setting
    std::string& prefetch = replCoord->getSettings().rsIndexPrefetch;
    if (!prefetch.empty()) {
        BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL;
        if (prefetch == "none")
            prefetchConfig = BackgroundSync::PREFETCH_NONE;
        else if (prefetch == "_id_only")
            prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY;
        else if (prefetch == "all")
            prefetchConfig = BackgroundSync::PREFETCH_ALL;
        else {
            warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting "
                      << "to \"all\"";
        }
        BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig);
    }

    while (!inShutdown()) {
        // After a reconfig, we may not be in the replica set anymore, so
        // check that we are in the set (and not an arbiter) before
        // trying to sync with other replicas.
        // TODO(spencer): Use a condition variable to await loading a config
        if (replCoord->getReplicationMode() != ReplicationCoordinator::modeReplSet) {
            log() << "replSet warning did not receive a valid config yet, sleeping 5 seconds "
                  << rsLog;
            sleepsecs(5);
            continue;
        }

        const MemberState memberState = replCoord->getCurrentMemberState();
        if (replCoord->getCurrentMemberState().arbiter()) {
            break;
        }

        try {
            if (memberState.primary()) {
                sleepsecs(1);
                continue;
            }

            bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag();
            // Check criteria for doing an initial sync:
            // 1. If the oplog is empty, do an initial sync
            // 2. If minValid has _initialSyncFlag set, do an initial sync
            // 3. If initialSyncRequested is true
            if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() ||
                getInitialSyncFlag() || initialSyncRequested) {
                syncDoInitialSync();
                continue; // start from top again in case sync failed.
            }

            replCoord->setFollowerMode(MemberState::RS_RECOVERING);

            /* we have some data.  continue tailing. */
            SyncTail tail(BackgroundSync::get(), multiSyncApply);
            tail.oplogApplication();
        }
        catch(const DBException& e) {
            log() << "Received exception while syncing: " << e.toString();
            sleepsecs(10);
        }
        catch(...) {
            sethbmsg("unexpected exception in syncThread()");
            // TODO : SET NOT SECONDARY here?
            sleepsecs(60);
        }
    }
    cc().shutdown();
}
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    while(!inShutdown()) {
        OpQueue ops;
        OperationContextImpl txn;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    Lock::DBLock lk(txn.lockState(), "local", MODE_X);
                    WriteUnitOfWork wunit(&txn);
                    Client::Context ctx(&txn, "local");

                    ctx.db()->dropCollection(&txn, "local.oplog.rs");

                    // Note: the following order is important.
                    // The bgsync thread uses an empty optime as a sentinel to know to wait
                    // for initial sync (done in this thread after we return); thus, we must
                    // ensure the lastAppliedOptime is empty before pausing the bgsync thread
                    // via stop().
                    // We must clear the sync source blacklist after calling stop()
                    // because the bgsync thread, while running, may update the blacklist.
                    replCoord->setMyLastOptime(&txn, OpTime());
                    bgsync->stop();
                    replCoord->clearSyncSourceBlacklist();

                    wunit.commit();

                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);

                // TODO(emilkie): This can be removed once we switch over from legacy;
                // this code is what moves 1-node sets to PRIMARY state.
                // normally msgCheckNewState gets called periodically, but in a single node
                // replset there are no heartbeat threads, so we do it here to be sure.  this is
                // relevant if the singleton member has done a stepDown() and needs to come back
                // up.
                if (theReplSet &&
                        theReplSet->config().members.size() == 1 &&
                        theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null?  During replsettest'ing, in which case we should
                    // fall through and actually apply ops as if we were a real secondary.
                    if (mgr) {
                        mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                        sleepsecs(1);
                        // There should never be ops to sync in a 1-member set, anyway
                        return;
                    }
                }
            }

            const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&ops, replCoord) &&  // tryPopAndWaitForMore returns true
                                                            // when we need to end a batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj& lastOp = ops.getDeque().back();
        handleSlaveDelay(lastOp);

        if (replCoord->getCurrentMemberState().primary() &&
            !replCoord->isWaitingForApplierToDrain()) {
            severe() << "attempting to replicate ops while primary";
            fassertFailed(28527);
        }

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        OpTime minValid = lastOp["ts"]._opTime();
        setMinValid(&txn, minValid);

        multiApply(ops.getDeque());

        applyOpsToOplog(&ops.getDeque());

        // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
        // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication
        // coordinator
        if (theReplSet && !theReplSet->mgr) {
            BSONObj op;
            if (!peek(&op)) {
                return;
            }
        }
    }
}
void appendReplicationInfo(OperationContext* txn, BSONObjBuilder& result, int level) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getSettings().usingReplSets()) {
        if (replCoord->getReplicationMode() != ReplicationCoordinator::modeReplSet
                || replCoord->getCurrentMemberState().shunned()) {
            result.append("ismaster", false);
            result.append("secondary", false);
            result.append("info", ReplSet::startupStatusMsg.get());
            result.append( "isreplicaset" , true );
        }
        else {
            theReplSet->fillIsMaster(result);
        }
        return;
    }

    if ( replAllDead ) {
        result.append("ismaster", 0);
        string s = string("dead: ") + replAllDead;
        result.append("info", s);
    }
    else {
        result.appendBool("ismaster",
                          getGlobalReplicationCoordinator()->isMasterForReportingPurposes());
    }

    if (level && replCoord->getSettings().usingReplSets()) {
        result.append( "info" , "is replica set" );
    }
    else if ( level ) {
        BSONObjBuilder sources( result.subarrayStart( "sources" ) );

        int n = 0;
        list<BSONObj> src;
        {
            const char* localSources = "local.sources";
            Client::ReadContext ctx(txn, localSources);
            auto_ptr<PlanExecutor> exec(
                InternalPlanner::collectionScan(txn,
                                                localSources,
                                                ctx.ctx().db()->getCollection(txn,
                                                                              localSources)));
            BSONObj obj;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = exec->getNext(&obj, NULL))) {
                src.push_back(obj);
            }
        }

        for( list<BSONObj>::const_iterator i = src.begin(); i != src.end(); i++ ) {
            BSONObj s = *i;
            BSONObjBuilder bb;
            bb.append( s["host"] );
            string sourcename = s["source"].valuestr();
            if ( sourcename != "main" )
                bb.append( s["source"] );
            {
                BSONElement e = s["syncedTo"];
                BSONObjBuilder t( bb.subobjStart( "syncedTo" ) );
                t.appendDate( "time" , e.timestampTime() );
                t.append( "inc" , e.timestampInc() );
                t.done();
            }

            if ( level > 1 ) {
                wassert(txn->lockState()->threadState() == 0);
                // note: there is no so-style timeout on this connection; perhaps we should
                // have one.
                ScopedDbConnection conn(s["host"].valuestr());

                DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() );
                if ( cliConn && replAuthenticate(cliConn) ) {
                    BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename,
                                                   Query().sort( BSON( "$natural" << 1 ) ) );
                    BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename,
                                                  Query().sort( BSON( "$natural" << -1 ) ) );
                    bb.appendDate( "masterFirst" , first["ts"].timestampTime() );
                    bb.appendDate( "masterLast" , last["ts"].timestampTime() );
                    double lag = (double) (last["ts"].timestampTime() -
                                           s["syncedTo"].timestampTime());
                    bb.append( "lagSeconds" , lag / 1000 );
                }
                conn.done();
            }

            sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() );
        }

        sources.done();
    }
}
void appendReplicationInfo(OperationContext* txn, BSONObjBuilder& result, int level) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getSettings().usingReplSets()) {
        IsMasterResponse isMasterResponse;
        replCoord->fillIsMasterForReplSet(&isMasterResponse);
        result.appendElements(isMasterResponse.toBSON());
        if (level) {
            replCoord->appendSlaveInfoData(&result);
        }
        return;
    }

    // TODO(dannenberg) replAllDead is bad and should be removed when master slave is removed
    if (replAllDead) {
        result.append("ismaster", 0);
        string s = string("dead: ") + replAllDead;
        result.append("info", s);
    } else {
        result.appendBool("ismaster",
                          getGlobalReplicationCoordinator()->isMasterForReportingPurposes());
    }

    if (level) {
        BSONObjBuilder sources(result.subarrayStart("sources"));

        int n = 0;
        list<BSONObj> src;
        {
            const char* localSources = "local.sources";
            AutoGetCollectionForRead ctx(txn, localSources);
            unique_ptr<PlanExecutor> exec(
                InternalPlanner::collectionScan(txn, localSources, ctx.getCollection()));
            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                src.push_back(obj);
            }
        }

        for (list<BSONObj>::const_iterator i = src.begin(); i != src.end(); i++) {
            BSONObj s = *i;
            BSONObjBuilder bb;
            bb.append(s["host"]);
            string sourcename = s["source"].valuestr();
            if (sourcename != "main")
                bb.append(s["source"]);
            {
                BSONElement e = s["syncedTo"];
                BSONObjBuilder t(bb.subobjStart("syncedTo"));
                t.appendDate("time", e.timestampTime());
                t.append("inc", e.timestampInc());
                t.done();
            }

            if (level > 1) {
                wassert(!txn->lockState()->isLocked());
                // note: there is no so-style timeout on this connection; perhaps we should have
                // one.
                ScopedDbConnection conn(s["host"].valuestr());

                DBClientConnection* cliConn = dynamic_cast<DBClientConnection*>(&conn.conn());
                if (cliConn && replAuthenticate(cliConn)) {
                    BSONObj first = conn->findOne((string) "local.oplog.$" + sourcename,
                                                  Query().sort(BSON("$natural" << 1)));
                    BSONObj last = conn->findOne((string) "local.oplog.$" + sourcename,
                                                 Query().sort(BSON("$natural" << -1)));
                    bb.appendDate("masterFirst", first["ts"].timestampTime());
                    bb.appendDate("masterLast", last["ts"].timestampTime());
                    const auto lag = (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
                    bb.append("lagSeconds", durationCount<Milliseconds>(lag) / 1000.0);
                }
                conn.done();
            }

            sources.append(BSONObjBuilder::numStr(n++), bb.obj());
        }

        sources.done();

        replCoord->appendSlaveInfoData(&result);
    }
}
void runSyncThread() {
    Client::initThread("rsSync");
    AuthorizationSession::get(cc())->grantInternalAuthorization();
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    // Set initial indexPrefetch setting
    const std::string& prefetch = replCoord->getSettings().rsIndexPrefetch;
    if (!prefetch.empty()) {
        BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL;
        if (prefetch == "none")
            prefetchConfig = BackgroundSync::PREFETCH_NONE;
        else if (prefetch == "_id_only")
            prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY;
        else if (prefetch == "all")
            prefetchConfig = BackgroundSync::PREFETCH_ALL;
        else {
            warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting "
                      << "to \"all\"";
        }
        BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig);
    }

    while (!inShutdown()) {
        // After a reconfig, we may not be in the replica set anymore, so
        // check that we are in the set (and not an arbiter) before
        // trying to sync with other replicas.
        // TODO(spencer): Use a condition variable to await loading a config
        if (replCoord->getMemberState().startup()) {
            warning() << "did not receive a valid config yet";
            sleepsecs(1);
            continue;
        }

        const MemberState memberState = replCoord->getMemberState();

        // An arbiter can never transition to any other state, and doesn't replicate, ever
        if (memberState.arbiter()) {
            break;
        }

        // If we are removed then we don't belong to the set anymore
        if (memberState.removed()) {
            sleepsecs(5);
            continue;
        }

        try {
            if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) {
                sleepsecs(1);
                continue;
            }

            bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag();
            // Check criteria for doing an initial sync:
            // 1. If the oplog is empty, do an initial sync
            // 2. If minValid has _initialSyncFlag set, do an initial sync
            // 3. If initialSyncRequested is true
            if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() ||
                getInitialSyncFlag() || initialSyncRequested) {
                syncDoInitialSync();
                continue;  // start from top again in case sync failed.
            }

            if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
                continue;
            }

            /* we have some data.  continue tailing. */
            SyncTail tail(BackgroundSync::get(), multiSyncApply);
            tail.oplogApplication();
        } catch (const DBException& e) {
            log() << "Received exception while syncing: " << e.toString();
            sleepsecs(10);
        } catch (const std::exception& e) {
            log() << "Received exception while syncing: " << e.what();
            sleepsecs(10);
        }
    }
}
void ReplClientInfo::setLastOpToSystemLastOpTime(OperationContext* txn) {
    ReplicationCoordinator* replCoord =
        repl::ReplicationCoordinator::get(txn->getServiceContext());
    if (replCoord->isReplEnabled() && txn->writesAreReplicated()) {
        setLastOp(replCoord->getMyLastOptime());
    }
}
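// Hypothetical call-site sketch (not taken from the source above): a command that made no
// replicated write can still give write-concern waiters a meaningful optime to wait on by
// bumping the client's lastOp to the system's last optime. Assumes the usual
// ReplClientInfo::forClient() accessor; the surrounding names are illustrative only.
//
//     repl::ReplClientInfo::forClient(txn->getClient()).setLastOpToSystemLastOpTime(txn);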