void SyncTail::handleSlaveDelay(const BSONObj& lastOp) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs()); // ignore slaveDelay if the box is still initializing. once // it becomes secondary we can worry about it. if (slaveDelaySecs > 0 && replCoord->getMemberState().secondary()) { const Timestamp ts = lastOp["ts"].timestamp(); long long a = ts.getSecs(); long long b = time(0); long long lag = b - a; long long sleeptime = slaveDelaySecs - lag; if (sleeptime > 0) { uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000); if (sleeptime < 60) { sleepsecs((int)sleeptime); } else { warning() << "slavedelay causing a long sleep of " << sleeptime << " seconds"; // sleep(hours) would prevent reconfigs from taking effect & such! long long waitUntil = b + sleeptime; while (time(0) < waitUntil) { sleepsecs(6); // Handle reconfigs that changed the slave delay if (durationCount<Seconds>(replCoord->getSlaveDelaySecs()) != slaveDelaySecs) break; } } } } // endif slaveDelay }
void SyncSourceFeedback::run() { Client::initThread("SyncSourceFeedback"); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); while (true) { // breaks once _shutdownSignaled is true { stdx::unique_lock<stdx::mutex> lock(_mtx); while (!_positionChanged && !_shutdownSignaled) { if (_cond.wait_for(lock, _keepAliveInterval) == stdx::cv_status::timeout) { break; } } if (_shutdownSignaled) { break; } _positionChanged = false; } auto txn = cc().makeOperationContext(); MemberState state = replCoord->getMemberState(); if (state.primary() || state.startup()) { _resetConnection(); continue; } const HostAndPort target = BackgroundSync::get()->getSyncTarget(); if (_syncTarget != target) { _resetConnection(); _syncTarget = target; } if (!hasConnection()) { // fix connection if need be if (target.empty()) { sleepmillis(500); stdx::unique_lock<stdx::mutex> lock(_mtx); _positionChanged = true; continue; } if (!_connect(txn.get(), target)) { sleepmillis(500); stdx::unique_lock<stdx::mutex> lock(_mtx); _positionChanged = true; continue; } } Status status = updateUpstream(txn.get()); if (!status.isOK()) { sleepmillis(500); stdx::unique_lock<stdx::mutex> lock(_mtx); _positionChanged = true; } } }
bool SyncSourceFeedback::replHandshake(OperationContext* txn) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getMemberState().primary()) { // primary has no one to handshake to return true; } // construct a vector of handshake obj for us as well as all chained members std::vector<BSONObj> handshakeObjs; replCoord->prepareReplSetUpdatePositionCommandHandshakes(&handshakeObjs); LOG(1) << "handshaking upstream updater"; for (std::vector<BSONObj>::iterator it = handshakeObjs.begin(); it != handshakeObjs.end(); ++it) { BSONObj res; try { LOG(2) << "Sending to " << _connection.get()->toString() << " the replication " "handshake: " << *it; if (!_connection->runCommand("admin", *it, res)) { std::string errMsg = res["errmsg"].valuestrsafe(); massert(17447, "upstream updater is not supported by the member from which we" " are syncing, please update all nodes to 2.6 or later.", errMsg.find("no such cmd") == std::string::npos); error() << "Error while handshaking the upstream updater: " << errMsg; // sleep half a second if we are not in our sync source's config // TODO(dannenberg) after 3.0, remove the string comparison if (res["code"].numberInt() == ErrorCodes::NodeNotFound || errMsg.find("could not be found in replica set config while attempting " "to associate it with") != std::string::npos) { // black list sync target for 10 seconds and find a new one replCoord->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 10*1000)); BackgroundSync::get()->clearSyncTarget(); } _resetConnection(); return false; } } catch (const DBException& e) { log() << "SyncSourceFeedback error sending handshake: " << e.what() << endl; _resetConnection(); return false; } } return true; }
Status SyncSourceFeedback::updateUpstream(OperationContext* txn) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getMemberState().primary()) { // primary has no one to update to return Status::OK(); } BSONObjBuilder cmd; { boost::unique_lock<boost::mutex> lock(_mtx); if (_handshakeNeeded) { // Don't send updates if there are nodes that haven't yet been handshaked return Status(ErrorCodes::NodeNotFound, "Need to send handshake before updating position upstream"); } // the command could not be created, likely because the node was removed from the set if (!replCoord->prepareReplSetUpdatePositionCommand(&cmd)) { return Status::OK(); } } BSONObj res; LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done(); try { _connection->runCommand("admin", cmd.obj(), res); } catch (const DBException& e) { log() << "SyncSourceFeedback error sending update: " << e.what() << endl; // blacklist sync target for .5 seconds and find a new one replCoord->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 500)); BackgroundSync::get()->clearSyncTarget(); _resetConnection(); return e.toStatus(); } Status status = Command::getStatusFromCommandResult(res); if (!status.isOK()) { log() << "SyncSourceFeedback error sending update, response: " << res.toString() <<endl; // blacklist sync target for .5 seconds and find a new one replCoord->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 500)); BackgroundSync::get()->clearSyncTarget(); _resetConnection(); } return status; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { ScopedTransaction transaction(txn, MODE_X); Lock::GlobalWrite globalWriteLock(txn->lockState()); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (getGlobalReplicationCoordinator()->getSettings().usingReplSets()) { const MemberState memberState = replCoord->getMemberState(); if (memberState.startup()) { return appendCommandStatus(result, Status(ErrorCodes::NotYetInitialized, "no replication yet active")); } if (memberState.primary() || !replCoord->setFollowerMode(MemberState::RS_STARTUP2)) { return appendCommandStatus(result, Status(ErrorCodes::NotSecondary, "primaries cannot resync")); } BackgroundSync::get()->setInitialSyncRequestedFlag(true); return true; } // below this comment pertains only to master/slave replication if ( cmdObj.getBoolField( "force" ) ) { if ( !waitForSyncToFinish(txn, errmsg ) ) return false; replAllDead = "resync forced"; } // TODO(dannenberg) replAllDead is bad and should be removed when masterslave is removed if (!replAllDead) { errmsg = "not dead, no need to resync"; return false; } if ( !waitForSyncToFinish(txn, errmsg ) ) return false; ReplSource::forceResyncDead( txn, "client" ); result.append( "info", "triggered resync for all sources" ); return true; }
Status SyncSourceFeedback::updateUpstream(OperationContext* txn) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getMemberState().primary()) { // primary has no one to update to return Status::OK(); } BSONObjBuilder cmd; { stdx::unique_lock<stdx::mutex> lock(_mtx); // the command could not be created, likely because the node was removed from the set if (!replCoord->prepareReplSetUpdatePositionCommand(&cmd)) { return Status::OK(); } } BSONObj res; LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done(); try { _connection->runCommand("admin", cmd.obj(), res); } catch (const DBException& e) { log() << "SyncSourceFeedback error sending update: " << e.what() << endl; // blacklist sync target for .5 seconds and find a new one replCoord->blacklistSyncSource(_syncTarget, Date_t::now() + Milliseconds(500)); BackgroundSync::get()->clearSyncTarget(); _resetConnection(); return e.toStatus(); } Status status = Command::getStatusFromCommandResult(res); if (!status.isOK()) { log() << "SyncSourceFeedback error sending update, response: " << res.toString() <<endl; // blacklist sync target for .5 seconds and find a new one, unless we were rejected due // to the syncsource having a newer config if (status != ErrorCodes::InvalidReplicaSetConfig || res["cfgver"].eoo() || res["cfgver"].numberLong() < replCoord->getConfig().getConfigVersion()) { replCoord->blacklistSyncSource(_syncTarget, Date_t::now() + Milliseconds(500)); BackgroundSync::get()->clearSyncTarget(); _resetConnection(); } } return status; }
// Applies a batch of oplog entries, by using a set of threads to apply the operations and then // writes the oplog entries to the local oplog. OpTime SyncTail::multiApply(OperationContext* txn, const OpQueue& ops) { invariant(_applyFunc); if (getGlobalServiceContext()->getGlobalStorageEngine()->isMmapV1()) { // Use a ThreadPool to prefetch all the operations in a batch. prefetchOps(ops.getDeque(), &_prefetcherPool); } std::vector<std::vector<BSONObj>> writerVectors(replWriterThreadCount); fillWriterVectors(txn, ops.getDeque(), &writerVectors); LOG(2) << "replication batch size is " << ops.getDeque().size() << endl; // We must grab this because we're going to grab write locks later. // We hold this mutex the entire time we're writing; it doesn't matter // because all readers are blocked anyway. stdx::lock_guard<SimpleMutex> fsynclk(filesLockedFsync); // stop all readers until we're done Lock::ParallelBatchWriterMode pbwm(txn->lockState()); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getMemberState().primary() && !replCoord->isWaitingForApplierToDrain()) { severe() << "attempting to replicate ops while primary"; fassertFailed(28527); } applyOps(writerVectors, &_writerPool, _applyFunc, this); OpTime lastOpTime; { ON_BLOCK_EXIT([&] { _writerPool.join(); }); std::vector<BSONObj> raws; raws.reserve(ops.getDeque().size()); for (auto&& op : ops.getDeque()) { raws.emplace_back(op.raw); } lastOpTime = writeOpsToOplog(txn, raws); if (inShutdown()) { return OpTime(); } } // We have now written all database writes and updated the oplog to match. return lastOpTime; }
void runSyncThread() { Client::initThread("rsSync"); AuthorizationSession::get(cc())->grantInternalAuthorization(); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); // Set initial indexPrefetch setting const std::string& prefetch = replCoord->getSettings().rsIndexPrefetch; if (!prefetch.empty()) { BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL; if (prefetch == "none") prefetchConfig = BackgroundSync::PREFETCH_NONE; else if (prefetch == "_id_only") prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY; else if (prefetch == "all") prefetchConfig = BackgroundSync::PREFETCH_ALL; else { warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting " << "to \"all\""; } BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig); } while (!inShutdown()) { // After a reconfig, we may not be in the replica set anymore, so // check that we are in the set (and not an arbiter) before // trying to sync with other replicas. // TODO(spencer): Use a condition variable to await loading a config if (replCoord->getMemberState().startup()) { warning() << "did not receive a valid config yet"; sleepsecs(1); continue; } const MemberState memberState = replCoord->getMemberState(); // An arbiter can never transition to any other state, and doesn't replicate, ever if (memberState.arbiter()) { break; } // If we are removed then we don't belong to the set anymore if (memberState.removed()) { sleepsecs(5); continue; } try { if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) { sleepsecs(1); continue; } bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag(); // Check criteria for doing an initial sync: // 1. If the oplog is empty, do an initial sync // 2. If minValid has _initialSyncFlag set, do an initial sync // 3. If initialSyncRequested is true if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() || getInitialSyncFlag() || initialSyncRequested) { syncDoInitialSync(); continue; // start from top again in case sync failed. } if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { continue; } /* we have some data. continue tailing. */ SyncTail tail(BackgroundSync::get(), multiSyncApply); tail.oplogApplication(); } catch (const DBException& e) { log() << "Received exception while syncing: " << e.toString(); sleepsecs(10); } catch (const std::exception& e) { log() << "Received exception while syncing: " << e.what(); sleepsecs(10); } } }
void runSyncThread() { Client::initThread("rsSync"); AuthorizationSession::get(cc())->grantInternalAuthorization(); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); // Overwrite prefetch index mode in BackgroundSync if ReplSettings has a mode set. ReplSettings replSettings = replCoord->getSettings(); if (replSettings.isPrefetchIndexModeSet()) BackgroundSync::get()->setIndexPrefetchConfig(replSettings.getPrefetchIndexMode()); while (!inShutdown()) { // After a reconfig, we may not be in the replica set anymore, so // check that we are in the set (and not an arbiter) before // trying to sync with other replicas. // TODO(spencer): Use a condition variable to await loading a config if (replCoord->getMemberState().startup()) { warning() << "did not receive a valid config yet"; sleepsecs(1); continue; } const MemberState memberState = replCoord->getMemberState(); // An arbiter can never transition to any other state, and doesn't replicate, ever if (memberState.arbiter()) { break; } // If we are removed then we don't belong to the set anymore if (memberState.removed()) { sleepsecs(5); continue; } try { if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) { sleepsecs(1); continue; } bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag(); // Check criteria for doing an initial sync: // 1. If the oplog is empty, do an initial sync // 2. If minValid has _initialSyncFlag set, do an initial sync // 3. If initialSyncRequested is true if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() || getInitialSyncFlag() || initialSyncRequested) { syncDoInitialSync(); continue; // start from top again in case sync failed. } if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { continue; } /* we have some data. continue tailing. */ SyncTail tail(BackgroundSync::get(), multiSyncApply); tail.oplogApplication(); } catch (...) { std::terminate(); } } }
void SyncSourceFeedback::run() { Client::initThread("SyncSourceFeedback"); OperationContextImpl txn; bool positionChanged = false; bool handshakeNeeded = false; ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); while (!inShutdown()) { // TODO(spencer): Remove once legacy repl coordinator is gone. { boost::unique_lock<boost::mutex> lock(_mtx); while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) { _cond.wait(lock); } if (_shutdownSignaled) { break; } positionChanged = _positionChanged; handshakeNeeded = _handshakeNeeded; _positionChanged = false; _handshakeNeeded = false; } MemberState state = replCoord->getMemberState(); if (state.primary() || state.startup()) { _resetConnection(); continue; } const HostAndPort target = BackgroundSync::get()->getSyncTarget(); if (_syncTarget != target) { _resetConnection(); _syncTarget = target; } if (!hasConnection()) { // fix connection if need be if (target.empty()) { sleepmillis(500); continue; } if (!_connect(&txn, target)) { sleepmillis(500); continue; } handshakeNeeded = true; } if (handshakeNeeded) { positionChanged = true; if (!replHandshake(&txn)) { boost::unique_lock<boost::mutex> lock(_mtx); _handshakeNeeded = true; continue; } } if (positionChanged) { Status status = updateUpstream(&txn); if (!status.isOK()) { boost::unique_lock<boost::mutex> lock(_mtx); _positionChanged = true; if (status == ErrorCodes::NodeNotFound) { _handshakeNeeded = true; } } } } cc().shutdown(); }