Status SyncSourceFeedback::updateUpstream(OperationContext* txn) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getCurrentMemberState().primary()) { // primary has no one to update to return Status::OK(); } BSONObjBuilder cmd; { boost::unique_lock<boost::mutex> lock(_mtx); if (_handshakeNeeded) { // Don't send updates if there are nodes that haven't yet been handshaked return Status(ErrorCodes::NodeNotFound, "Need to send handshake before updating position upstream"); } replCoord->prepareReplSetUpdatePositionCommand(txn, &cmd); } BSONObj res; LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done(); try { _connection->runCommand("admin", cmd.obj(), res); } catch (const DBException& e) { log() << "SyncSourceFeedback error sending update: " << e.what() << endl; _resetConnection(); return e.toStatus(); } Status status = Command::getStatusFromCommandResult(res); if (!status.isOK()) { log() << "SyncSourceFeedback error sending update, response: " << res.toString() <<endl; _resetConnection(); } return status; }
void SyncSourceFeedback::run() { Client::initThread("SyncSourceFeedbackThread"); OperationContextImpl txn; bool positionChanged = false; bool handshakeNeeded = false; ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); while (!inShutdown()) { // TODO(spencer): Remove once legacy repl coordinator is gone. { boost::unique_lock<boost::mutex> lock(_mtx); while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) { _cond.wait(lock); } if (_shutdownSignaled) { break; } positionChanged = _positionChanged; handshakeNeeded = _handshakeNeeded; _positionChanged = false; _handshakeNeeded = false; } MemberState state = replCoord->getCurrentMemberState(); if (state.primary() || state.fatal() || state.startup()) { continue; } const Member* target = BackgroundSync::get()->getSyncTarget(); if (_syncTarget != target) { _resetConnection(); _syncTarget = target; } if (!hasConnection()) { // fix connection if need be if (!target) { sleepmillis(500); continue; } if (!_connect(&txn, target->fullName())) { sleepmillis(500); continue; } } if (handshakeNeeded) { if (!replHandshake(&txn)) { boost::unique_lock<boost::mutex> lock(_mtx); _handshakeNeeded = true; continue; } } if (positionChanged) { if (!updateUpstream(&txn)) { boost::unique_lock<boost::mutex> lock(_mtx); _positionChanged = true; } } } cc().shutdown(); }
void SyncTail::handleSlaveDelay(const BSONObj& lastOp) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds(); // ignore slaveDelay if the box is still initializing. once // it becomes secondary we can worry about it. if( slaveDelaySecs > 0 && replCoord->getCurrentMemberState().secondary() ) { const OpTime ts = lastOp["ts"]._opTime(); long long a = ts.getSecs(); long long b = time(0); long long lag = b - a; long long sleeptime = slaveDelaySecs - lag; if( sleeptime > 0 ) { uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000); if( sleeptime < 60 ) { sleepsecs((int) sleeptime); } else { warning() << "replSet slavedelay causing a long sleep of " << sleeptime << " seconds" << rsLog; // sleep(hours) would prevent reconfigs from taking effect & such! long long waitUntil = b + sleeptime; while(time(0) < waitUntil) { sleepsecs(6); // Handle reconfigs that changed the slave delay if (replCoord->getSlaveDelaySecs().total_seconds() != slaveDelaySecs) break; } } } } // endif slaveDelay }
bool SyncSourceFeedback::updateUpstream(OperationContext* txn) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getCurrentMemberState().primary()) { // primary has no one to update to return true; } BSONObjBuilder cmd; { boost::unique_lock<boost::mutex> lock(_mtx); if (_handshakeNeeded) { // Don't send updates if there are nodes that haven't yet been handshaked return false; } replCoord->prepareReplSetUpdatePositionCommand(txn, &cmd); } BSONObj res; LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done(); bool ok; try { ok = _connection->runCommand("admin", cmd.obj(), res); } catch (const DBException& e) { log() << "SyncSourceFeedback error sending update: " << e.what() << endl; _resetConnection(); return false; } if (!ok) { log() << "SyncSourceFeedback error sending update, response: " << res.toString() <<endl; _resetConnection(); return false; } return true; }
bool SyncSourceFeedback::replHandshake(OperationContext* txn) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getCurrentMemberState().primary()) { // primary has no one to handshake to return true; } // construct a vector of handshake obj for us as well as all chained members std::vector<BSONObj> handshakeObjs; replCoord->prepareReplSetUpdatePositionCommandHandshakes(txn, &handshakeObjs); LOG(1) << "handshaking upstream updater"; for (std::vector<BSONObj>::iterator it = handshakeObjs.begin(); it != handshakeObjs.end(); ++it) { BSONObj res; try { LOG(2) << "Sending to " << _connection.get()->toString() << " the replication " "handshake: " << *it; if (!_connection->runCommand("admin", *it, res)) { std::string errMsg = res["errmsg"].valuestrsafe(); massert(17447, "upstream updater is not supported by the member from which we" " are syncing, please update all nodes to 2.6 or later.", errMsg.find("no such cmd") == std::string::npos); log() << "replSet error while handshaking the upstream updater: " << errMsg; // sleep half a second if we are not in our sync source's config // TODO(dannenberg) after 2.8, remove the string comparison if (res["code"].numberInt() == ErrorCodes::NodeNotFound || errMsg.find("could not be found in replica set config while attempting " "to associate it with") != std::string::npos) { // black list sync target for 10 seconds and find a new one replCoord->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 10*1000)); BackgroundSync::get()->clearSyncTarget(); } _resetConnection(); return false; } } catch (const DBException& e) { log() << "SyncSourceFeedback error sending handshake: " << e.what() << endl; _resetConnection(); return false; } } return true; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { Lock::GlobalWrite globalWriteLock(txn->lockState()); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (getGlobalReplicationCoordinator()->getSettings().usingReplSets()) { if (replCoord->getReplicationMode() != ReplicationCoordinator::modeReplSet) { return appendCommandStatus(result, Status(ErrorCodes::NotYetInitialized, "no replication yet active")); } if (replCoord->getCurrentMemberState().primary() || !replCoord->setFollowerMode(MemberState::RS_STARTUP2)) { return appendCommandStatus(result, Status(ErrorCodes::NotSecondary, "primaries cannot resync")); } BackgroundSync::get()->setInitialSyncRequestedFlag(true); return true; } // below this comment pertains only to master/slave replication if ( cmdObj.getBoolField( "force" ) ) { if ( !waitForSyncToFinish(txn, errmsg ) ) return false; replAllDead = "resync forced"; } // TODO(dannenberg) replAllDead is bad and should be removed when masterslave is removed if (!replAllDead) { errmsg = "not dead, no need to resync"; return false; } if ( !waitForSyncToFinish(txn, errmsg ) ) return false; ReplSource::forceResyncDead( txn, "client" ); result.append( "info", "triggered resync for all sources" ); return true; }
bool SyncSourceFeedback::replHandshake(OperationContext* txn) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getCurrentMemberState().primary()) { // primary has no one to handshake to return true; } // construct a vector of handshake obj for us as well as all chained members std::vector<BSONObj> handshakeObjs; replCoord->prepareReplSetUpdatePositionCommandHandshakes(txn, &handshakeObjs); LOG(1) << "handshaking upstream updater"; for (std::vector<BSONObj>::iterator it = handshakeObjs.begin(); it != handshakeObjs.end(); ++it) { BSONObj res; try { LOG(2) << "Sending to " << _connection.get()->toString() << " the replication " "handshake: " << *it; if (!_connection->runCommand("admin", *it, res)) { massert(17447, "upstream updater is not supported by the member from which we" " are syncing, please update all nodes to 2.6 or later.", res["errmsg"].str().find("no such cmd") == std::string::npos); log() << "replSet error while handshaking the upstream updater: " << res["errmsg"].valuestrsafe(); _resetConnection(); return false; } } catch (const DBException& e) { log() << "SyncSourceFeedback error sending handshake: " << e.what() << endl; _resetConnection(); return false; } } return true; }
void runSyncThread() { Client::initThread("rsSync"); replLocalAuth(); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); // Set initial indexPrefetch setting std::string& prefetch = replCoord->getSettings().rsIndexPrefetch; if (!prefetch.empty()) { BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL; if (prefetch == "none") prefetchConfig = BackgroundSync::PREFETCH_NONE; else if (prefetch == "_id_only") prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY; else if (prefetch == "all") prefetchConfig = BackgroundSync::PREFETCH_ALL; else { warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting " << "to \"all\""; } BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig); } while (!inShutdown()) { // After a reconfig, we may not be in the replica set anymore, so // check that we are in the set (and not an arbiter) before // trying to sync with other replicas. // TODO(spencer): Use a condition variable to await loading a config if (replCoord->getReplicationMode() != ReplicationCoordinator::modeReplSet) { log() << "replSet warning did not receive a valid config yet, sleeping 5 seconds " << rsLog; sleepsecs(5); continue; } const MemberState memberState = replCoord->getCurrentMemberState(); if (replCoord->getCurrentMemberState().arbiter()) { break; } try { if (memberState.primary()) { sleepsecs(1); continue; } bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag(); // Check criteria for doing an initial sync: // 1. If the oplog is empty, do an initial sync // 2. If minValid has _initialSyncFlag set, do an initial sync // 3. If initialSyncRequested is true if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() || getInitialSyncFlag() || initialSyncRequested) { syncDoInitialSync(); continue; // start from top again in case sync failed. } replCoord->setFollowerMode(MemberState::RS_RECOVERING); /* we have some data. continue tailing. */ SyncTail tail(BackgroundSync::get(), multiSyncApply); tail.oplogApplication(); } catch(const DBException& e) { log() << "Received exception while syncing: " << e.toString(); sleepsecs(10); } catch(...) { sethbmsg("unexpected exception in syncThread()"); // TODO : SET NOT SECONDARY here? sleepsecs(60); } } cc().shutdown(); }
/* tail an oplog. ok to return, will be re-called. */ void SyncTail::oplogApplication() { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); while(!inShutdown()) { OpQueue ops; OperationContextImpl txn; Timer batchTimer; int lastTimeChecked = 0; do { int now = batchTimer.seconds(); // apply replication batch limits if (!ops.empty()) { if (now > replBatchLimitSeconds) break; if (ops.getDeque().size() > replBatchLimitOperations) break; } // occasionally check some things // (always checked in the first iteration of this do-while loop, because // ops is empty) if (ops.empty() || now > lastTimeChecked) { BackgroundSync* bgsync = BackgroundSync::get(); if (bgsync->getInitialSyncRequestedFlag()) { // got a resync command Lock::DBLock lk(txn.lockState(), "local", MODE_X); WriteUnitOfWork wunit(&txn); Client::Context ctx(&txn, "local"); ctx.db()->dropCollection(&txn, "local.oplog.rs"); // Note: the following order is important. // The bgsync thread uses an empty optime as a sentinel to know to wait // for initial sync (done in this thread after we return); thus, we must // ensure the lastAppliedOptime is empty before pausing the bgsync thread // via stop(). // We must clear the sync source blacklist after calling stop() // because the bgsync thread, while running, may update the blacklist. replCoord->setMyLastOptime(&txn, OpTime()); bgsync->stop(); replCoord->clearSyncSourceBlacklist(); wunit.commit(); return; } lastTimeChecked = now; // can we become secondary? // we have to check this before calling mgr, as we must be a secondary to // become primary tryToGoLiveAsASecondary(&txn, replCoord); // TODO(emilkie): This can be removed once we switch over from legacy; // this code is what moves 1-node sets to PRIMARY state. // normally msgCheckNewState gets called periodically, but in a single node // replset there are no heartbeat threads, so we do it here to be sure. this is // relevant if the singleton member has done a stepDown() and needs to come back // up. if (theReplSet && theReplSet->config().members.size() == 1 && theReplSet->myConfig().potentiallyHot()) { Manager* mgr = theReplSet->mgr; // When would mgr be null? During replsettest'ing, in which case we should // fall through and actually apply ops as if we were a real secondary. if (mgr) { mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr)); sleepsecs(1); // There should never be ops to sync in a 1-member set, anyway return; } } } const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds(); if (!ops.empty() && slaveDelaySecs > 0) { const BSONObj& lastOp = ops.getDeque().back(); const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs(); // Stop the batch as the lastOp is too new to be applied. If we continue // on, we can get ops that are way ahead of the delay and this will // make this thread sleep longer when handleSlaveDelay is called // and apply ops much sooner than we like. if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) { break; } } // keep fetching more ops as long as we haven't filled up a full batch yet } while (!tryPopAndWaitForMore(&ops, replCoord) && // tryPopAndWaitForMore returns true // when we need to end a batch early (ops.getSize() < replBatchLimitBytes) && !inShutdown()); // For pausing replication in tests while (MONGO_FAIL_POINT(rsSyncApplyStop)) { sleepmillis(0); } if (ops.empty()) { continue; } const BSONObj& lastOp = ops.getDeque().back(); handleSlaveDelay(lastOp); if (replCoord->getCurrentMemberState().primary() && !replCoord->isWaitingForApplierToDrain()) { severe() << "attempting to replicate ops while primary"; fassertFailed(28527); } // Set minValid to the last op to be applied in this next batch. // This will cause this node to go into RECOVERING state // if we should crash and restart before updating the oplog OpTime minValid = lastOp["ts"]._opTime(); setMinValid(&txn, minValid); multiApply(ops.getDeque()); applyOpsToOplog(&ops.getDeque()); // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication // coordinator if (theReplSet && !theReplSet->mgr) { BSONObj op; if (!peek(&op)) { return; } } } }
void appendReplicationInfo(OperationContext* txn, BSONObjBuilder& result, int level) { ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getSettings().usingReplSets()) { if (replCoord->getReplicationMode() != ReplicationCoordinator::modeReplSet || replCoord->getCurrentMemberState().shunned()) { result.append("ismaster", false); result.append("secondary", false); result.append("info", ReplSet::startupStatusMsg.get()); result.append( "isreplicaset" , true ); } else { theReplSet->fillIsMaster(result); } return; } if ( replAllDead ) { result.append("ismaster", 0); string s = string("dead: ") + replAllDead; result.append("info", s); } else { result.appendBool("ismaster", getGlobalReplicationCoordinator()->isMasterForReportingPurposes()); } if (level && replCoord->getSettings().usingReplSets()) { result.append( "info" , "is replica set" ); } else if ( level ) { BSONObjBuilder sources( result.subarrayStart( "sources" ) ); int n = 0; list<BSONObj> src; { const char* localSources = "local.sources"; Client::ReadContext ctx(txn, localSources); auto_ptr<PlanExecutor> exec( InternalPlanner::collectionScan(txn, localSources, ctx.ctx().db()->getCollection(txn, localSources))); BSONObj obj; Runner::RunnerState state; while (Runner::RUNNER_ADVANCED == (state = exec->getNext(&obj, NULL))) { src.push_back(obj); } } for( list<BSONObj>::const_iterator i = src.begin(); i != src.end(); i++ ) { BSONObj s = *i; BSONObjBuilder bb; bb.append( s["host"] ); string sourcename = s["source"].valuestr(); if ( sourcename != "main" ) bb.append( s["source"] ); { BSONElement e = s["syncedTo"]; BSONObjBuilder t( bb.subobjStart( "syncedTo" ) ); t.appendDate( "time" , e.timestampTime() ); t.append( "inc" , e.timestampInc() ); t.done(); } if ( level > 1 ) { wassert(txn->lockState()->threadState() == 0); // note: there is no so-style timeout on this connection; perhaps we should have one. ScopedDbConnection conn(s["host"].valuestr()); DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() ); if ( cliConn && replAuthenticate(cliConn) ) { BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename, Query().sort( BSON( "$natural" << 1 ) ) ); BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename, Query().sort( BSON( "$natural" << -1 ) ) ); bb.appendDate( "masterFirst" , first["ts"].timestampTime() ); bb.appendDate( "masterLast" , last["ts"].timestampTime() ); double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime()); bb.append( "lagSeconds" , lag / 1000 ); } conn.done(); } sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() ); } sources.done(); } }