/** thread for timing out old cursors */ void ClientCursorMonitor::run() { Client::initThread("clientcursormon"); Client& client = cc(); unsigned old = curTimeMillis(); const int Secs = 4; unsigned n = 0; while ( ! inShutdown() ) { unsigned now = curTimeMillis(); ClientCursor::idleTimeReport( now - old ); old = now; sleepsecs(Secs); if( ++n % (60/4) == 0 /*once a minute*/ ) { sayMemoryStatus(); } } client.shutdown(); }
TEST(LockManagerTest, TxShutdown) { LockManager lm; ClientTransaction t1(&lm, 1); ClientTransaction t2(&lm, 2); t1.acquire(kShared, 1, ACQUIRED); lm.shutdown(3000); // t1 can still do work while quiescing t1.release(kShared, 1); t1.acquire(kShared, 2, ACQUIRED); #ifdef TRANSACTION_REGISTRATION // t2 is new and should be refused t2.acquire(kShared, 3, ABORTED); #else t2.quit(); #endif // after the quiescing period, t1's request should be refused sleepsecs(3); t1.acquire(kShared, 4, ABORTED); }
void DataFileSync::run() { Client::initThread( name().c_str() ); if (mmapv1GlobalOptions.syncdelay == 0) { log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl; } else if (mmapv1GlobalOptions.syncdelay == 1) { log() << "--syncdelay 1" << endl; } else if (mmapv1GlobalOptions.syncdelay != 60) { LOG(1) << "--syncdelay " << mmapv1GlobalOptions.syncdelay << endl; } int time_flushing = 0; while ( ! inShutdown() ) { _diaglog.flush(); if (mmapv1GlobalOptions.syncdelay == 0) { // in case at some point we add an option to change at runtime sleepsecs(5); continue; } sleepmillis((long long) std::max(0.0, (mmapv1GlobalOptions.syncdelay * 1000) - time_flushing)); if ( inShutdown() ) { // occasional issue trying to flush during shutdown when sleep interrupted break; } Date_t start = jsTime(); StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); int numFiles = storageEngine->flushAllFiles( true ); time_flushing = (int) (jsTime() - start); _flushed(time_flushing); if( logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000 ) { log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl; } } }
/* note: not yet in mutex at this point. returns >= 0 if ok. return -1 if you want to reconnect. return value of zero indicates no sleep necessary before next call */ int ReplSource::sync(int& nApplied) { _sleepAdviceTime = 0; ReplInfo r("sync"); if ( !cmdLine.quiet ) { Nullstream& l = log(); l << "repl: from "; if( sourceName() != "main" ) { l << "source:" << sourceName() << ' '; } l << "host:" << hostName << endl; } nClonedThisPass = 0; // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName. if ( (string("localhost") == hostName || string("127.0.0.1") == hostName) && cmdLine.port == CmdLine::DefaultDBPort ) { log() << "repl: can't sync from self (localhost). sources configuration may be wrong." << endl; sleepsecs(5); return -1; } if ( !oplogReader.connect(hostName) ) { log(4) << "repl: can't connect to sync source" << endl; return -1; } /* // get current mtime at the server. BSONObj o = conn->findOne("admin.$cmd", opTimeQuery); BSONElement e = o.getField("optime"); if( e.eoo() ) { log() << "repl: failed to get cur optime from master" << endl; log() << " " << o.toString() << endl; return false; } uassert( 10124 , e.type() == Date ); OpTime serverCurTime; serverCurTime.asDate() = e.date(); */ return sync_pullOpLog(nApplied); }
virtual void run() { Client::initThread( name().c_str() ); while ( ! inShutdown() ) { sleepsecs( 60 ); LOG(3) << "TTLMonitor thread awake" << endl; if ( lockedForWriting() ) { // note: this is not perfect as you can go into fsync+lock between // this and actually doing the delete later LOG(3) << " locked for writing" << endl; continue; } // if part of replSet but not in a readable state (e.g. during initial sync), skip. if ( theReplSet && !theReplSet->state().readable() ) continue; set<string> dbs; { Lock::DBRead lk( "local" ); dbHolder().getAllShortNames( dbs ); } ttlPasses.increment(); for ( set<string>::const_iterator i=dbs.begin(); i!=dbs.end(); ++i ) { string db = *i; try { doTTLForDB( db ); } catch ( DBException& e ) { error() << "error processing ttl for db: " << db << " " << e << endl; } } } }
void syncDoInitialSync() { static const int maxFailedAttempts = 10; OperationContextImpl txn; createOplog(&txn); int failedAttempts = 0; while ( failedAttempts < maxFailedAttempts ) { try { _initialSync(); break; } catch(DBException& e) { failedAttempts++; mongoutils::str::stream msg; error() << "initial sync exception: " << e.toString() << " " << (maxFailedAttempts - failedAttempts) << " attempts remaining"; sleepsecs(5); } } fassert( 16233, failedAttempts < maxFailedAttempts); }
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { cc().curop()->suppressFromCurop(); cc().curop()->setExpectedLatencyMs( 30000 ); BSONElement e = cmdObj.firstElement(); if ( e.type() != jstOID ) { errmsg = "need oid as first value"; return 0; } // get the command issuer's (a mongos) serverID const OID id = e.__oid(); // the command issuer is blocked awaiting a response // we want to do return at least at every 5 minutes so sockets don't timeout BSONObj z; if ( writeBackManager.getWritebackQueue(id.str())->queue.blockingPop( z, 5 * 60 /* 5 minutes */ ) ) { LOG(1) << "WriteBackCommand got : " << z << endl; result.append( "data" , z ); } else { result.appendBool( "noop" , true ); } #ifdef _DEBUG PseudoRandom r(static_cast<int64_t>(time(0))); // Sleep a short amount of time usually int sleepFor = r.nextInt32( 10 ); sleepmillis( sleepFor ); // Sleep a longer amount of time every once and awhile int sleepLong = r.nextInt32( 50 ); if( sleepLong == 0 ) sleepsecs( 2 ); #endif return true; }
int run() { _sleep = getParam( "sleep" , _sleep ); auth(); BSONObj prev = getData(); while ( true ) { sleepsecs( _sleep ); BSONObj now; try { now = getData(); } catch ( std::exception& e ) { cout << "can't get data: " << e.what() << endl; continue; } if ( now.isEmpty() ) return -2; try { printDiff( prev , now ); } catch ( AssertionException& e ) { cout << "\nerror: " << e.what() << "\n" << now << endl; } prev = now; } return 0; }
void statsThread() { /*cout << "TEMP disabled statsthread" << endl; if( 1 ) return;*/ Client::initThread("stats"); unsigned long long timeLastPass = 0; while ( 1 ) { { /* todo: do we even need readlock here? if so for what? */ readlock lk(""); Top::completeSnapshot(); q = (q+1)%NStats; Timing timing; dbMutex.info().getTimingInfo(timing.start, timing.timeLocked); unsigned long long now = curTimeMicros64(); if ( timeLastPass ) { unsigned long long dt = now - timeLastPass; unsigned long long dlocked = timing.timeLocked - tlast.timeLocked; { stringstream ss; ss << dt / 1000 << '\t'; ss << dlocked / 1000 << '\t'; if ( dt ) ss << (dlocked*100)/dt << '%'; string s = ss.str(); if ( cmdLine.cpu ) log() << "cpu: " << s << endl; lockStats[q] = s; ClientCursor::idleTimeReport( (unsigned) ((dt - dlocked)/1000) ); } } timeLastPass = now; tlast = timing; } sleepsecs(4); } }
int run() { _sleep = getParam( "sleep" , _sleep ); if (isMongos()) { log() << "mongotop only works on instances of mongod." << endl; return EXIT_FAILURE; } NamespaceStats prev = getData(); while ( true ) { sleepsecs( _sleep ); NamespaceStats now; try { now = getData(); } catch ( std::exception& e ) { cout << "can't get data: " << e.what() << endl; continue; } if ( now.size() == 0 ) return -2; try { printDiff( prev , now ); } catch ( AssertionException& e ) { cout << "\nerror: " << e.what() << endl; } prev = now; } return 0; }
virtual void run(){ int minutesRunning = 0; std::string lastRunningTestName, currentTestName; { boost::lock_guard<boost::mutex> lk( globalCurrentTestNameMutex ); lastRunningTestName = globalCurrentTestName; } while (true) { sleepsecs(60); minutesRunning++; { boost::lock_guard<boost::mutex> lk( globalCurrentTestNameMutex ); currentTestName = globalCurrentTestName; } if (currentTestName != lastRunningTestName) { minutesRunning = 0; lastRunningTestName = currentTestName; } if (minutesRunning > 30){ log() << currentTestName << " has been running for more than 30 minutes. aborting." << endl; ::abort(); } else if (minutesRunning > 1){ warning() << currentTestName << " has been running for more than " << minutesRunning-1 << " minutes." << endl; // See what is stuck getGlobalLockManager()->dump(); } } }
void PeriodicTask::Runner::run() { int sleeptime = 60; DEV sleeptime = 5; // to catch race conditions while ( ! inShutdown() ) { sleepsecs( sleeptime ); scoped_spinlock lk( _lock ); size_t size = _tasks.size(); for ( size_t i=0; i<size; i++ ) { PeriodicTask * t = _tasks[i]; if ( ! t ) continue; if ( inShutdown() ) break; Timer timer; try { t->taskDoWork(); } catch ( std::exception& e ) { error() << "task: " << t->taskName() << " failed: " << e.what() << endl; } catch ( ... ) { error() << "task: " << t->taskName() << " failed with unknown error" << endl; } int ms = timer.millis(); LOG( ms <= 3 ? 3 : 0 ) << "task: " << t->taskName() << " took: " << ms << "ms" << endl; } } }
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() || inShutdownStrict()) { return; } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } OplogReader syncSourceReader; syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord); // no server found if (syncSourceReader.getHost().empty()) { sleepsecs(1); // if there is no one to sync from return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_pause) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; _syncSourceHost = syncSourceReader.getHost(); _replCoord->signalUpstreamUpdater(); } const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout); // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared // if sync source feedback fails. const HostAndPort source = syncSourceReader.getHost(); syncSourceReader.resetConnection(); // no more references to oplog reader from here on. // If this status is not OK after the fetcher returns from wait(), // proceed to execute rollback Status remoteOplogStartStatus = Status::OK(); auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback, this, stdx::placeholders::_1, stdx::placeholders::_3, stdx::cref(source), lastOpTimeFetched, lastHashFetched, &remoteOplogStartStatus); auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter" << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())) << "tailable" << true << "oplogReplay" << true << "awaitData" << true << "maxTimeMS" << int(fetcherMaxTimeMS.count())); Fetcher fetcher(taskExecutor, source, nsToDatabase(rsOplogName), cmdObj, fetcherCallback, rpc::makeEmptyMetadata()); auto scheduleStatus = fetcher.schedule(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } fetcher.wait(); // If the background sync is paused after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isPaused()) { return; } // Execute rollback if necessary. // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. if (!remoteOplogStartStatus.isOK()) { const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), oplogSocketTimeout)); }; return connection->get(); }; log() << "starting rollback: " << remoteOplogStartStatus; _rollback(txn, source, getConnection); stop(); } }
void runSyncThread() { Client::initThread("rsSync"); AuthorizationSession::get(cc())->grantInternalAuthorization(); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); // Set initial indexPrefetch setting const std::string& prefetch = replCoord->getSettings().rsIndexPrefetch; if (!prefetch.empty()) { BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL; if (prefetch == "none") prefetchConfig = BackgroundSync::PREFETCH_NONE; else if (prefetch == "_id_only") prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY; else if (prefetch == "all") prefetchConfig = BackgroundSync::PREFETCH_ALL; else { warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting " << "to \"all\""; } BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig); } while (!inShutdown()) { // After a reconfig, we may not be in the replica set anymore, so // check that we are in the set (and not an arbiter) before // trying to sync with other replicas. // TODO(spencer): Use a condition variable to await loading a config if (replCoord->getMemberState().startup()) { warning() << "did not receive a valid config yet"; sleepsecs(1); continue; } const MemberState memberState = replCoord->getMemberState(); // An arbiter can never transition to any other state, and doesn't replicate, ever if (memberState.arbiter()) { break; } // If we are removed then we don't belong to the set anymore if (memberState.removed()) { sleepsecs(5); continue; } try { if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) { sleepsecs(1); continue; } bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag(); // Check criteria for doing an initial sync: // 1. If the oplog is empty, do an initial sync // 2. If minValid has _initialSyncFlag set, do an initial sync // 3. If initialSyncRequested is true if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() || getInitialSyncFlag() || initialSyncRequested) { syncDoInitialSync(); continue; // start from top again in case sync failed. } if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { continue; } /* we have some data. continue tailing. */ SyncTail tail(BackgroundSync::get(), multiSyncApply); tail.oplogApplication(); } catch (const DBException& e) { log() << "Received exception while syncing: " << e.toString(); sleepsecs(10); } catch (const std::exception& e) { log() << "Received exception while syncing: " << e.what(); sleepsecs(10); } } }
void BackgroundSync::produce(OperationContext* txn) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { boost::unique_lock<boost::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } // Wait until we've applied the ops we have before we choose a sync target while (!_appliedBuffer) { _condvar.wait(lock); } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; { boost::unique_lock<boost::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } _syncSourceReader.resetConnection(); _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord); { boost::unique_lock<boost::mutex> lock(_mutex); // no server found if (_syncSourceReader.getHost().empty()) { lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = _syncSourceReader.getHost(); } _syncSourceReader.tailingQueryGTE(rsoplog, lastOpTimeFetched); // if target cut connections between connecting and querying (for // example, because it stepped down) we might not have a cursor if (!_syncSourceReader.haveCursor()) { return; } if (_rollbackIfNeeded(txn, _syncSourceReader)) { stop(); return; } while (!inShutdown()) { if (!_syncSourceReader.moreInCurrentBatch()) { // Check some things periodically // (whenever we run out of items in the // current cursor batch) int bs = _syncSourceReader.currentBatchMessageSize(); if( bs > 0 && bs < BatchIsSmallish ) { // on a very low latency network, if we don't wait a little, we'll be // getting ops to write almost one at a time. this will both be expensive // for the upstream server as well as potentially defeating our parallel // application of batches on the secondary. // // the inference here is basically if the batch is really small, we are // "caught up". // sleepmillis(SleepToAllowBatchingMillis); } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getCurrentMemberState().primary()) { return; } // re-evaluate quality of sync target if (shouldChangeSyncSource()) { return; } { //record time for each getmore TimerHolder batchTimer(&getmoreReplStats); // This calls receiveMore() on the oplogreader cursor. // It can wait up to five seconds for more data. _syncSourceReader.more(); } networkByteStats.increment(_syncSourceReader.currentBatchMessageSize()); if (!_syncSourceReader.moreInCurrentBatch()) { // If there is still no data from upstream, check a few more things // and then loop back for another pass at getting more data { boost::unique_lock<boost::mutex> lock(_mutex); if (_pause) { return; } } _syncSourceReader.tailCheck(); if( !_syncSourceReader.haveCursor() ) { LOG(1) << "replSet end syncTail pass" << rsLog; return; } continue; } } // At this point, we are guaranteed to have at least one thing to read out // of the oplogreader cursor. BSONObj o = _syncSourceReader.nextSafe().getOwned(); opsReadStats.increment(); { boost::unique_lock<boost::mutex> lock(_mutex); _appliedBuffer = false; } OCCASIONALLY { LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog; } // the blocking queue will wait (forever) until there's room for us to push _buffer.push(o); bufferCountGauge.increment(); bufferSizeGauge.increment(getSize(o)); { boost::unique_lock<boost::mutex> lock(_mutex); _lastFetchedHash = o["h"].numberLong(); _lastOpTimeFetched = o["ts"]._opTime(); LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty() << rsLog; } } }
void Balancer::run() { Client::initThread("Balancer"); // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer // thread prematurely. while (!inShutdown()) { auto txn = cc().makeOperationContext(); if (!_init(txn.get())) { log() << "will retry to initialize balancer in one minute"; sleepsecs(60); continue; } break; } Seconds balanceRoundInterval(kBalanceRoundDefaultInterval); while (!inShutdown()) { auto txn = cc().makeOperationContext(); BalanceRoundDetails roundDetails; try { // ping has to be first so we keep things in the config server in sync _ping(txn.get(), false); MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) { const BSONObj& data = scopedBalancerRoundInterval.getData(); balanceRoundInterval = Seconds(data["sleepSecs"].numberInt()); } // Use fresh shard state and balancer settings Grid::get(txn.get())->shardRegistry()->reload(txn.get()); auto balancerConfig = Grid::get(txn.get())->getBalancerConfiguration(); Status refreshStatus = balancerConfig->refreshAndCheck(txn.get()); if (!refreshStatus.isOK()) { warning() << "Skipping balancing round" << causedBy(refreshStatus); sleepFor(balanceRoundInterval); continue; } // now make sure we should even be running if (!balancerConfig->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) { LOG(1) << "skipping balancing round because balancing is disabled"; // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(balanceRoundInterval); continue; } uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get())); { auto scopedDistLock = grid.catalogManager(txn.get()) ->distLock(txn.get(), "balancer", "doing balance round", DistLockManager::kSingleLockAttemptTimeout); if (!scopedDistLock.isOK()) { LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus()); // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(balanceRoundInterval); // no need to wake up soon continue; } LOG(1) << "*** start balancing round. " << "waitForDelete: " << balancerConfig->waitForDelete() << ", secondaryThrottle: " << balancerConfig->getSecondaryThrottle().toBSON(); OCCASIONALLY warnOnMultiVersion( uassertStatusOK(_clusterStats->getStats(txn.get()))); Status status = _enforceTagRanges(txn.get()); if (!status.isOK()) { warning() << "Failed to enforce tag ranges" << causedBy(status); } else { LOG(1) << "Done enforcing tag range boundaries."; } const auto candidateChunks = uassertStatusOK( _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime)); if (candidateChunks.empty()) { LOG(1) << "no need to move any chunk"; _balancedLastTime = 0; } else { _balancedLastTime = _moveChunks(txn.get(), candidateChunks, balancerConfig->getSecondaryThrottle(), balancerConfig->waitForDelete()); roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()), _balancedLastTime); grid.catalogManager(txn.get()) ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON()); } LOG(1) << "*** End of balancing round"; } // Ping again so scripts can determine if we're active without waiting _ping(txn.get(), true); sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval); } catch (const std::exception& e) { log() << "caught exception while doing balance: " << e.what(); // Just to match the opening statement if in log level 1 LOG(1) << "*** End of balancing round"; // This round failed, tell the world! roundDetails.setFailed(e.what()); grid.catalogManager(txn.get()) ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON()); // Sleep a fair amount before retrying because of the error sleepFor(balanceRoundInterval); } } }
void BackgroundSync::_produce(OperationContext* txn) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() || inShutdownStrict()) { return; } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; HostAndPort source; { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } SyncSourceResolverResponse syncSourceResp = _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched); if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources were too stale. error() << "too stale to catch up -- entering maintenance mode"; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen}); auto status = _replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode."; } bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << _replCoord->getMemberState(); } return; } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceHost = syncSourceResp.getSyncSource(); source = _syncSourceHost; } else { if (!syncSourceResp.isOK()) { log() << "failed to find sync source, received error " << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. sleepsecs(1); return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_stopped) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; _replCoord->signalUpstreamUpdater(); } const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol(); // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election // timeout. This enables the sync source to communicate liveness of the primary to secondaries. // Under protocol version 0, use a default timeout of 2 seconds for awaitData. const Milliseconds fetcherMaxTimeMS( isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2)); Status fetcherReturnStatus = Status::OK(); auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback, this, stdx::placeholders::_1, stdx::placeholders::_3, stdx::cref(source), lastOpTimeFetched, lastHashFetched, fetcherMaxTimeMS, &fetcherReturnStatus); BSONObjBuilder cmdBob; cmdBob.append("find", nsToCollectionSubstring(rsOplogName)); cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))); cmdBob.append("tailable", true); cmdBob.append("oplogReplay", true); cmdBob.append("awaitData", true); cmdBob.append("maxTimeMS", durationCount<Milliseconds>(Minutes(1))); // 1 min initial find. BSONObjBuilder metadataBob; if (isV1ElectionProtocol) { cmdBob.append("term", _replCoord->getTerm()); metadataBob.append(rpc::kReplSetMetadataFieldName, 1); } auto dbName = nsToDatabase(rsOplogName); auto cmdObj = cmdBob.obj(); auto metadataObj = metadataBob.obj(); Fetcher fetcher(&_threadPoolTaskExecutor, source, dbName, cmdObj, fetcherCallback, metadataObj, _replCoord->getConfig().getElectionTimeoutPeriod()); LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at " << cmdObj["filter"]; auto scheduleStatus = fetcher.schedule(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } fetcher.wait(); LOG(1) << "fetcher stopped reading remote oplog on " << source; // If the background sync is stopped after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isStopped()) { return; } if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) { // This is bad because it means that our source // has not returned oplog entries in ascending ts order, and they need to be. warning() << fetcherReturnStatus.toString(); // Do not blacklist the server here, it will be blacklisted when we try to reuse it, // if it can't return a matching oplog start from the last fetch oplog ts field. return; } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing || fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) { // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), oplogSocketTimeout)); }; return connection->get(); }; { stdx::lock_guard<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; } log() << "Starting rollback due to " << fetcherReturnStatus; // Wait till all buffered oplog entries have drained and been applied. auto lastApplied = _replCoord->getMyLastAppliedOpTime(); if (lastApplied != lastOpTimeFetched) { log() << "Waiting for all operations from " << lastApplied << " until " << lastOpTimeFetched << " to be applied before starting rollback."; while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) { sleepmillis(10); if (isStopped() || inShutdown()) { return; } } } // check that we are at minvalid, otherwise we cannot roll back as we may be in an // inconsistent state BatchBoundaries boundaries = getMinValid(txn); if (!boundaries.start.isNull() || boundaries.end > lastApplied) { fassertNoTrace(18750, Status(ErrorCodes::UnrecoverableRollbackError, str::stream() << "need to rollback, but in inconsistent state. " << "minvalid: " << boundaries.end.toString() << " > our last optime: " << lastApplied.toString())); } _rollback(txn, source, getConnection); stop(); } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source " << source << " for " << blacklistDuration << "."; _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration); } else if (!fetcherReturnStatus.isOK()) { warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString(); } }
void Balancer::run() { // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely while ( ! inShutdown() ) { if ( ! _init() ) { log() << "will retry to initialize balancer in one minute" << endl; sleepsecs( 60 ); continue; } break; } int sleepTime = 30; // getConnectioString and dist lock constructor does not throw, which is what we expect on while // on the balancer thread ConnectionString config = configServer.getConnectionString(); DistributedLock balanceLock( config , "balancer" ); while ( ! inShutdown() ) { try { scoped_ptr<ScopedDbConnection> connPtr( ScopedDbConnection::getInternalScopedDbConnection( config.toString() ) ); ScopedDbConnection& conn = *connPtr; // ping has to be first so we keep things in the config server in sync _ping( conn.conn() ); // use fresh shard state Shard::reloadShardInfo(); // refresh chunk size (even though another balancer might be active) Chunk::refreshChunkSize(); BSONObj balancerConfig; // now make sure we should even be running if ( ! grid.shouldBalance( "", &balancerConfig ) ) { LOG(1) << "skipping balancing round because balancing is disabled" << endl; // Ping again so scripts can determine if we're active without waiting _ping( conn.conn(), true ); conn.done(); sleepsecs( sleepTime ); continue; } sleepTime = balancerConfig["_nosleep"].trueValue() ? 30 : 6; uassert( 13258 , "oids broken after resetting!" , _checkOIDs() ); { dist_lock_try lk( &balanceLock , "doing balance round" ); if ( ! lk.got() ) { LOG(1) << "skipping balancing round because another balancer is active" << endl; // Ping again so scripts can determine if we're active without waiting _ping( conn.conn(), true ); conn.done(); sleepsecs( sleepTime ); // no need to wake up soon continue; } LOG(1) << "*** start balancing round" << endl; vector<CandidateChunkPtr> candidateChunks; _doBalanceRound( conn.conn() , &candidateChunks ); if ( candidateChunks.size() == 0 ) { LOG(1) << "no need to move any chunk" << endl; _balancedLastTime = 0; } else { _balancedLastTime = _moveChunks( &candidateChunks, balancerConfig["_secondaryThrottle"].trueValue() ); } LOG(1) << "*** end of balancing round" << endl; } // Ping again so scripts can determine if we're active without waiting _ping( conn.conn(), true ); conn.done(); sleepsecs( _balancedLastTime ? sleepTime / 6 : sleepTime ); } catch ( std::exception& e ) { log() << "caught exception while doing balance: " << e.what() << endl; // Just to match the opening statement if in log level 1 LOG(1) << "*** End of balancing round" << endl; sleepsecs( sleepTime ); // sleep a fair amount b/c of error continue; } } }
void BackgroundSync::produce() { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced OplogReader r(false /* doHandshake */); // find a target to sync from the last op time written getOplogReader(r); // no server found { boost::unique_lock<boost::mutex> lock(_mutex); if (_currentSyncTarget == NULL) { lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } r.tailingQueryGTE(rsoplog, _lastOpTimeFetched); } // if target cut connections between connecting and querying (for // example, because it stepped down) we might not have a cursor if (!r.haveCursor()) { return; } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() ); if (isRollbackRequired(r)) { stop(); return; } while (!inShutdown()) { while (!inShutdown()) { if (!r.moreInCurrentBatch()) { int bs = r.currentBatchMessageSize(); if( bs > 0 && bs < BatchIsSmallish ) { // on a very low latency network, if we don't wait a little, we'll be // getting ops to write almost one at a time. this will both be expensive // for the upstream server as well as postentiallyd efating our parallel // application of batches on the secondary. // // the inference here is basically if the batch is really small, we are // "caught up". // dassert( !Lock::isLocked() ); sleepmillis(SleepToAllowBatchingMillis); } if (theReplSet->gotForceSync()) { return; } if (isAssumingPrimary() || theReplSet->isPrimary()) { return; } // re-evaluate quality of sync target if (shouldChangeSyncTarget()) { return; } //record time for each getmore { TimerHolder batchTimer(&getmoreReplStats); r.more(); } //increment networkByteStats.increment(r.currentBatchMessageSize()); } if (!r.more()) break; BSONObj o = r.nextSafe().getOwned(); opsReadStats.increment(); { boost::unique_lock<boost::mutex> lock(_mutex); _appliedBuffer = false; } OCCASIONALLY { LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog; } // the blocking queue will wait (forever) until there's room for us to push _buffer.push(o); bufferCountGauge.increment(); bufferSizeGauge.increment(getSize(o)); { boost::unique_lock<boost::mutex> lock(_mutex); _lastH = o["h"].numberLong(); _lastOpTimeFetched = o["ts"]._opTime(); } } // end while { boost::unique_lock<boost::mutex> lock(_mutex); if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) { return; } } r.tailCheck(); if( !r.haveCursor() ) { LOG(1) << "replSet end syncTail pass" << rsLog; return; } // looping back is ok because this is a tailable cursor } }
void BackgroundSync::getOplogReader(OplogReader& r) { const Member *target = NULL, *stale = NULL; BSONObj oldest; { boost::unique_lock<boost::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set _currentSyncTarget = NULL; return; } // Wait until we've applied the ops we have before we choose a sync target while (!_appliedBuffer) { _condvar.wait(lock); } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } verify(r.conn() == NULL); while ((target = theReplSet->getMemberToSyncTo()) != NULL) { string current = target->fullName(); if (!r.connect(current)) { LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog; r.resetConnection(); theReplSet->veto(current); sleepsecs(1); continue; } if (isStale(r, oldest)) { r.resetConnection(); theReplSet->veto(current, 600); stale = target; continue; } // if we made it here, the target is up and not stale { boost::unique_lock<boost::mutex> lock(_mutex); _currentSyncTarget = target; } boost::unique_lock<boost::mutex> oplogLockSSF(theReplSet->syncSourceFeedback.oplock); theReplSet->syncSourceFeedback.connect(target); return; } // the only viable sync target was stale if (stale) { theReplSet->goStale(stale, oldest); sleepsecs(120); } { boost::unique_lock<boost::mutex> lock(_mutex); _currentSyncTarget = NULL; } }
MONGO_FAIL_POINT_BLOCK(rsDelayHeartbeatResponse, delay) { const BSONObj& data = delay.getData(); sleepsecs(data["delay"].numberInt()); }
/* we reuse our existing objects so that we can keep our existing connection and cursor in effect. */ void ReplSource::loadAll(SourceVector &v) { Client::Context ctx("local.sources"); SourceVector old = v; v.clear(); if ( !cmdLine.source.empty() ) { // --source <host> specified. // check that no items are in sources other than that // add if missing shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); int n = 0; while ( c->ok() ) { n++; ReplSource tmp(c->current()); if ( tmp.hostName != cmdLine.source ) { log() << "repl: --source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl; log() << "repl: for instructions on changing this slave's source, see:" << endl; log() << "http://dochub.mongodb.org/core/masterslave" << endl; log() << "repl: terminating mongod after 30 seconds" << endl; sleepsecs(30); dbexit( EXIT_REPLICATION_ERROR ); } if ( tmp.only != cmdLine.only ) { log() << "--only " << cmdLine.only << " != " << tmp.only << " from local.sources collection" << endl; log() << "terminating after 30 seconds" << endl; sleepsecs(30); dbexit( EXIT_REPLICATION_ERROR ); } c->advance(); } uassert( 10002 , "local.sources collection corrupt?", n<2 ); if ( n == 0 ) { // source missing. add. ReplSource s; s.hostName = cmdLine.source; s.only = cmdLine.only; s.save(); } } else { try { massert( 10384 , "--only requires use of --source", cmdLine.only.empty()); } catch ( ... ) { dbexit( EXIT_BADOPTIONS ); } } shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); while ( c->ok() ) { ReplSource tmp(c->current()); if ( tmp.syncedTo.isNull() ) { DBDirectClient c; if ( c.exists( "local.oplog.$main" ) ) { BSONObj op = c.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) ); if ( !op.isEmpty() ) { tmp.syncedTo = op[ "ts" ].date(); } } } addSourceToList(v, tmp, old); c->advance(); } }
/* tail an oplog. ok to return, will be re-called. */ void SyncTail::oplogApplication() { while( 1 ) { OpQueue ops; verify( !Lock::isLocked() ); Timer batchTimer; int lastTimeChecked = 0; do { if (theReplSet->isPrimary()) { massert(16620, "there are ops to sync, but I'm primary", ops.empty()); return; } int now = batchTimer.seconds(); // apply replication batch limits if (!ops.empty()) { if (now > replBatchLimitSeconds) break; if (ops.getDeque().size() > replBatchLimitOperations) break; } // occasionally check some things // (always checked in the first iteration of this do-while loop, because // ops is empty) if (ops.empty() || now > lastTimeChecked) { { boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex); if (theReplSet->initialSyncRequested) { // got a resync command return; } } lastTimeChecked = now; // can we become secondary? // we have to check this before calling mgr, as we must be a secondary to // become primary if (!theReplSet->isSecondary()) { OpTime minvalid; OperationContextImpl txn; theReplSet->tryToGoLiveAsASecondary(&txn, minvalid); } // normally msgCheckNewState gets called periodically, but in a single node // replset there are no heartbeat threads, so we do it here to be sure. this is // relevant if the singleton member has done a stepDown() and needs to come back // up. if (theReplSet->config().members.size() == 1 && theReplSet->myConfig().potentiallyHot()) { Manager* mgr = theReplSet->mgr; // When would mgr be null? During replsettest'ing, in which case we should // fall through and actually apply ops as if we were a real secondary. if (mgr) { mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr)); sleepsecs(1); // There should never be ops to sync in a 1-member set, anyway return; } } } const int slaveDelaySecs = theReplSet->myConfig().slaveDelay; if (!ops.empty() && slaveDelaySecs > 0) { const BSONObj& lastOp = ops.getDeque().back(); const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs(); // Stop the batch as the lastOp is too new to be applied. If we continue // on, we can get ops that are way ahead of the delay and this will // make this thread sleep longer when handleSlaveDelay is called // and apply ops much sooner than we like. if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) { break; } } // keep fetching more ops as long as we haven't filled up a full batch yet } while (!tryPopAndWaitForMore(&ops) && // tryPopAndWaitForMore returns true // when we need to end a batch early (ops.getSize() < replBatchLimitBytes)); // For pausing replication in tests while (MONGO_FAIL_POINT(rsSyncApplyStop)) { sleepmillis(0); } const BSONObj& lastOp = ops.getDeque().back(); setOplogVersion(lastOp); handleSlaveDelay(lastOp); // Set minValid to the last op to be applied in this next batch. // This will cause this node to go into RECOVERING state // if we should crash and restart before updating the oplog theReplSet->setMinValid(lastOp); if (BackgroundSync::get()->isAssumingPrimary()) { LOG(1) << "about to apply batch up to optime: " << ops.getDeque().back()["ts"]._opTime().toStringPretty(); } multiApply(ops.getDeque(), multiSyncApply); if (BackgroundSync::get()->isAssumingPrimary()) { LOG(1) << "about to update oplog to optime: " << ops.getDeque().back()["ts"]._opTime().toStringPretty(); } applyOpsToOplog(&ops.getDeque()); // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue if (!theReplSet->mgr) { BSONObj op; if (!peek(&op)) { return; } } } }
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result, BSONObjBuilder* bob, const HostAndPort& source, OpTime lastOpTimeFetched, long long lastFetchedHash, Status* remoteOplogStartStatus) { // if target cut connections between connecting and querying (for // example, because it stepped down) we might not have a cursor if (!result.isOK()) { return; } if (inShutdown()) { return; } // Check if we have been paused. if (isPaused()) { return; } const auto& queryResponse = result.getValue(); const auto& documents = queryResponse.documents; auto documentBegin = documents.cbegin(); auto documentEnd = documents.cend(); // Check start of remote oplog and, if necessary, stop fetcher to execute rollback. if (queryResponse.first) { auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> { if (documentBegin == documentEnd) { return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing"); } return *(documentBegin++); }; *remoteOplogStartStatus = checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash); if (!remoteOplogStartStatus->isOK()) { // Stop fetcher and execute rollback. return; } // If this is the first batch and no rollback is needed, we should have advanced // the document iterator. invariant(documentBegin != documents.cbegin()); } // process documents int currentBatchMessageSize = 0; for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) { if (inShutdown()) { return; } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) { LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer"; return; } // At this point, we are guaranteed to have at least one thing to read out // of the fetcher. const BSONObj& o = *documentIter; currentBatchMessageSize += o.objsize(); opsReadStats.increment(); if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) { sleepsecs(20); } { stdx::unique_lock<stdx::mutex> lock(_mutex); _appliedBuffer = false; } OCCASIONALLY { LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes"; } bufferCountGauge.increment(); bufferSizeGauge.increment(getSize(o)); _buffer.push(o); { stdx::unique_lock<stdx::mutex> lock(_mutex); _lastFetchedHash = o["h"].numberLong(); _lastOpTimeFetched = extractOpTime(o); LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched; } } // record time for each batch getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count()); networkByteStats.increment(currentBatchMessageSize); // Check some things periodically // (whenever we run out of items in the // current cursor batch) if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) { // on a very low latency network, if we don't wait a little, we'll be // getting ops to write almost one at a time. this will both be expensive // for the upstream server as well as potentially defeating our parallel // application of batches on the secondary. // // the inference here is basically if the batch is really small, we are // "caught up". // sleepmillis(SleepToAllowBatchingMillis); } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) { return; } // re-evaluate quality of sync target if (_shouldChangeSyncSource(source)) { return; } // Check if we have been paused. if (isPaused()) { return; } // We fill in 'bob' to signal the fetcher to process with another getMore. invariant(bob); bob->append("getMore", queryResponse.cursorId); bob->append("collection", queryResponse.nss.coll()); bob->append("maxTimeMS", int(fetcherMaxTimeMS.count())); }
void BackgroundSync::produce() { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced OplogReader r; OpTime lastOpTimeFetched; // find a target to sync from the last op time written getOplogReader(r); // no server found { boost::unique_lock<boost::mutex> lock(_mutex); if (_currentSyncTarget == NULL) { lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } lastOpTimeFetched = _lastOpTimeFetched; } r.tailingQueryGTE(rsoplog, lastOpTimeFetched); // if target cut connections between connecting and querying (for // example, because it stepped down) we might not have a cursor if (!r.haveCursor()) { return; } uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() ); if (isRollbackRequired(r)) { stop(); return; } while (!inShutdown()) { if (!r.moreInCurrentBatch()) { // Check some things periodically // (whenever we run out of items in the // current cursor batch) int bs = r.currentBatchMessageSize(); if( bs > 0 && bs < BatchIsSmallish ) { // on a very low latency network, if we don't wait a little, we'll be // getting ops to write almost one at a time. this will both be expensive // for the upstream server as well as potentially defeating our parallel // application of batches on the secondary. // // the inference here is basically if the batch is really small, we are // "caught up". // dassert( !Lock::isLocked() ); sleepmillis(SleepToAllowBatchingMillis); } if (theReplSet->gotForceSync()) { return; } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (isAssumingPrimary() || theReplSet->isPrimary()) { return; } // re-evaluate quality of sync target if (shouldChangeSyncTarget()) { return; } { //record time for each getmore TimerHolder batchTimer(&getmoreReplStats); // This calls receiveMore() on the oplogreader cursor. // It can wait up to five seconds for more data. r.more(); } networkByteStats.increment(r.currentBatchMessageSize()); if (!r.moreInCurrentBatch()) { // If there is still no data from upstream, check a few more things // and then loop back for another pass at getting more data { boost::unique_lock<boost::mutex> lock(_mutex); if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) { return; } } r.tailCheck(); if( !r.haveCursor() ) { LOG(1) << "replSet end syncTail pass" << rsLog; return; } continue; } } // At this point, we are guaranteed to have at least one thing to read out // of the oplogreader cursor. BSONObj o = r.nextSafe().getOwned(); opsReadStats.increment(); { boost::unique_lock<boost::mutex> lock(_mutex); _appliedBuffer = false; } OCCASIONALLY { LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog; } // the blocking queue will wait (forever) until there's room for us to push _buffer.push(o); bufferCountGauge.increment(); bufferSizeGauge.increment(getSize(o)); { boost::unique_lock<boost::mutex> lock(_mutex); _lastH = o["h"].numberLong(); _lastOpTimeFetched = o["ts"]._opTime(); LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty() << rsLog; } } }
int runMany() { StateMap threads; { string orig = getParam( "host" ); bool showPorts = false; if ( orig == "" ) orig = "localhost"; if ( orig.find( ":" ) != string::npos || hasParam( "port" ) ) showPorts = true; StringSplitter ss( orig.c_str() , "," ); while ( ss.more() ) { string host = ss.next(); if ( showPorts && host.find( ":" ) == string::npos) { // port supplied, but not for this host. use default. if ( hasParam( "port" ) ) host += ":" + _params["port"].as<string>(); else host += ":27017"; } _add( threads , host ); } } sleepsecs(1); int row = 0; bool discover = hasParam( "discover" ); int maxLockedDbWidth = 0; while ( 1 ) { sleepsecs( (int)ceil(_statUtil.getSeconds()) ); // collect data vector<Row> rows; for ( map<string,shared_ptr<ServerState> >::iterator i=threads.begin(); i!=threads.end(); ++i ) { scoped_lock lk( i->second->lock ); if ( i->second->error.size() ) { rows.push_back( Row( i->first , i->second->error ) ); } else if ( i->second->prev.isEmpty() || i->second->now.isEmpty() ) { rows.push_back( Row( i->first ) ); } else { BSONObj out = _statUtil.doRow( i->second->prev , i->second->now ); rows.push_back( Row( i->first , out ) ); } if ( discover && ! i->second->now.isEmpty() ) { if ( _discover( threads , i->first , i->second ) ) break; } } // compute some stats unsigned longestHost = 0; BSONObj biggest; for ( unsigned i=0; i<rows.size(); i++ ) { if ( rows[i].host.size() > longestHost ) longestHost = rows[i].host.size(); if ( rows[i].data.nFields() > biggest.nFields() ) biggest = rows[i].data; // adjust width up as longer 'locked db' values appear setMaxLockedDbWidth( &rows[i].data, &maxLockedDbWidth ); } { // check for any headers not in biggest // TODO: we put any new headers at end, // ideally we would interleave set<string> seen; BSONObjBuilder b; { // iterate biggest BSONObjIterator i( biggest ); while ( i.more() ) { BSONElement e = i.next(); seen.insert( e.fieldName() ); b.append( e ); } } // now do the rest for ( unsigned j=0; j<rows.size(); j++ ) { BSONObjIterator i( rows[j].data ); while ( i.more() ) { BSONElement e = i.next(); if ( seen.count( e.fieldName() ) ) continue; seen.insert( e.fieldName() ); b.append( e ); } } biggest = b.obj(); } // display data cout << endl; // header if ( row++ % 5 == 0 && ! biggest.isEmpty() ) { cout << setw( longestHost ) << "" << "\t"; printHeaders( biggest ); } // rows for ( unsigned i=0; i<rows.size(); i++ ) { cout << setw( longestHost ) << rows[i].host << "\t"; if ( rows[i].err.size() ) cout << rows[i].err << endl; else if ( rows[i].data.isEmpty() ) cout << "no data" << endl; else printData( rows[i].data , biggest ); } } return 0; }
void run() { if ( _token.size() == 0 && _name.size() == 0 ) { log(1) << "mms not configured" << endl; return; } if ( _token.size() == 0 ) { log() << "no token for mms - not running" << endl; return; } if ( _name.size() == 0 ) { log() << "no name for mms - not running" << endl; return; } log() << "mms monitor staring... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl; Client::initThread( "mms" ); Client& c = cc(); // TODO: using direct client is bad, but easy for now while ( ! inShutdown() ) { sleepsecs( _secsToSleep ); try { stringstream url; url << _baseurl << "?" << "token=" << _token << "&" << "name=" << _name << "&" << "ts=" << time(0) ; BSONObjBuilder bb; // duplicated so the post has everything bb.append( "token" , _token ); bb.append( "name" , _name ); bb.appendDate( "ts" , jsTime() ); // any commands _add( bb , "buildinfo" ); _add( bb , "serverStatus" ); BSONObj postData = bb.obj(); log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;; HttpClient c; HttpClient::Result r; int rc = c.post( url.str() , postData.jsonString() , &r ); log(1) << "\t response code: " << rc << endl; if ( rc != 200 ) { log() << "mms error response code:" << rc << endl; log(1) << "mms error body:" << r.getEntireResponse() << endl; } } catch ( std::exception& e ) { log() << "mms exception: " << e.what() << endl; } } c.shutdown(); }
void BackgroundSync::_produce(OperationContext* txn) { while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) { sleepmillis(0); } // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (!_replCoord->isCatchingUp() && (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary())) { return; } if (_inShutdown_inlock()) { return; } } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; HostAndPort source; SyncSourceResolverResponse syncSourceResp; SyncSourceResolver* syncSourceResolver; OpTime minValid; if (_replCoord->getMemberState().recovering()) { auto minValidSaved = StorageInterface::get(txn)->getMinValid(txn); if (minValidSaved > lastOpTimeFetched) { minValid = minValidSaved; } } { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); _syncSourceResolver = stdx::make_unique<SyncSourceResolver>( _replicationCoordinatorExternalState->getTaskExecutor(), _replCoord, lastOpTimeFetched, minValid, [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; }); syncSourceResolver = _syncSourceResolver.get(); } // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to // acquired before BackgroundSync's. // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because // we do not destroy this instance outside of this function. auto status = _syncSourceResolver->startup(); if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) { return; } fassertStatusOK(40349, status); syncSourceResolver->join(); syncSourceResolver = nullptr; { stdx::unique_lock<stdx::mutex> lock(_mutex); _syncSourceResolver.reset(); } if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources were too stale. if (_replCoord->isCatchingUp()) { warning() << "Too stale to catch up."; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen << " from " << syncSourceResp.getSyncSource(); sleepsecs(1); return; } error() << "too stale to catch up -- entering maintenance mode"; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; auto status = _replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode: " << status; } bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << _replCoord->getMemberState(); } return; } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceHost = syncSourceResp.getSyncSource(); source = _syncSourceHost; } else { if (!syncSourceResp.isOK()) { log() << "failed to find sync source, received error " << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. sleepsecs(1); return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_stopped) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; if (!_replCoord->isCatchingUp()) { _replCoord->signalUpstreamUpdater(); } } // Set the applied point if unset. This is most likely the first time we've established a sync // source since stepping down or otherwise clearing the applied point. We need to set this here, // before the OplogWriter gets a chance to append to the oplog. if (StorageInterface::get(txn)->getAppliedThrough(txn).isNull()) { StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime()); } // "lastFetched" not used. Already set in _enqueueDocuments. Status fetcherReturnStatus = Status::OK(); DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState( _replCoord, _replicationCoordinatorExternalState, this); OplogFetcher* oplogFetcher; try { auto executor = _replicationCoordinatorExternalState->getTaskExecutor(); auto config = _replCoord->getConfig(); auto onOplogFetcherShutdownCallbackFn = [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) { fetcherReturnStatus = status; }; stdx::lock_guard<stdx::mutex> lock(_mutex); _oplogFetcher = stdx::make_unique<OplogFetcher>( executor, OpTimeWithHash(lastHashFetched, lastOpTimeFetched), source, NamespaceString(rsOplogName), config, _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(), &dataReplicatorExternalState, stdx::bind(&BackgroundSync::_enqueueDocuments, this, stdx::placeholders::_1, stdx::placeholders::_2, stdx::placeholders::_3), onOplogFetcherShutdownCallbackFn); oplogFetcher = _oplogFetcher.get(); } catch (const mongo::DBException& ex) { fassertFailedWithStatus(34440, exceptionToStatus()); } LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at " << oplogFetcher->getCommandObject_forTest()["filter"]; auto scheduleStatus = oplogFetcher->startup(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } oplogFetcher->join(); LOG(1) << "fetcher stopped reading remote oplog on " << source; // If the background sync is stopped after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isStopped()) { return; } if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) { // This is bad because it means that our source // has not returned oplog entries in ascending ts order, and they need to be. warning() << redact(fetcherReturnStatus); // Do not blacklist the server here, it will be blacklisted when we try to reuse it, // if it can't return a matching oplog start from the last fetch oplog ts field. return; } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing || fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) { if (_replCoord->isCatchingUp()) { warning() << "Rollback situation detected in catch-up mode; catch-up mode will end."; sleepsecs(1); return; } // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), kRollbackOplogSocketTimeout)); }; return connection->get(); }; { stdx::lock_guard<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; } log() << "Starting rollback due to " << redact(fetcherReturnStatus); // Wait till all buffered oplog entries have drained and been applied. auto lastApplied = _replCoord->getMyLastAppliedOpTime(); if (lastApplied != lastOpTimeFetched) { log() << "Waiting for all operations from " << lastApplied << " until " << lastOpTimeFetched << " to be applied before starting rollback."; while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) { sleepmillis(10); if (isStopped() || inShutdown()) { return; } } } // check that we are at minvalid, otherwise we cannot roll back as we may be in an // inconsistent state const auto minValid = StorageInterface::get(txn)->getMinValid(txn); if (lastApplied < minValid) { fassertNoTrace(18750, Status(ErrorCodes::UnrecoverableRollbackError, str::stream() << "need to rollback, but in inconsistent state. " << "minvalid: " << minValid.toString() << " > our last optime: " << lastApplied.toString())); } _rollback(txn, source, getConnection); stop(); } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source " << source << " for " << blacklistDuration << "."; _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration); } else if (!fetcherReturnStatus.isOK()) { warning() << "Fetcher stopped querying remote oplog with error: " << redact(fetcherReturnStatus); } }
bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { sleepsecs(100); return true; }
virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if( replSetBlind ) { if (theReplSet) { errmsg = str::stream() << theReplSet->selfFullName() << " is blind"; } return false; } MONGO_FAIL_POINT_BLOCK(rsDelayHeartbeatResponse, delay) { const BSONObj& data = delay.getData(); sleepsecs(data["delay"].numberInt()); } /* we don't call ReplSetCommand::check() here because heartbeat checks many things that are pre-initialization. */ if( !replSet ) { errmsg = "not running with --replSet"; return false; } /* we want to keep heartbeat connections open when relinquishing primary. tag them here. */ { AbstractMessagingPort *mp = cc().port(); if( mp ) mp->tag |= ScopedConn::keepOpen; } if( cmdObj["pv"].Int() != 1 ) { errmsg = "incompatible replset protocol version"; return false; } { string s = string(cmdObj.getStringField("replSetHeartbeat")); if (replSettings.ourSetName() != s) { errmsg = "repl set names do not match"; log() << "replSet set names do not match, our cmdline: " << replSettings.replSet << rsLog; log() << "replSet s: " << s << rsLog; result.append("mismatch", true); return false; } } result.append("rs", true); if( cmdObj["checkEmpty"].trueValue() ) { result.append("hasData", replHasDatabases()); } if( (theReplSet == 0) || (theReplSet->startupStatus == ReplSetImpl::LOADINGCONFIG) ) { string from( cmdObj.getStringField("from") ); if( !from.empty() ) { scoped_lock lck( replSettings.discoveredSeeds_mx ); replSettings.discoveredSeeds.insert(from); } result.append("hbmsg", "still initializing"); return true; } if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) { errmsg = "repl set names do not match (2)"; result.append("mismatch", true); return false; } result.append("set", theReplSet->name()); MemberState currentState = theReplSet->state(); result.append("state", currentState.s); if (currentState == MemberState::RS_PRIMARY) { result.appendDate("electionTime", theReplSet->getElectionTime().asDate()); } result.append("e", theReplSet->iAmElectable()); result.append("hbmsg", theReplSet->hbmsg()); result.append("time", (long long) time(0)); result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate()); const Member *syncTarget = replset::BackgroundSync::get()->getSyncTarget(); if (syncTarget) { result.append("syncingTo", syncTarget->fullName()); } int v = theReplSet->config().version; result.append("v", v); if( v > cmdObj["v"].Int() ) result << "config" << theReplSet->config().asBson(); Member* from = NULL; if (cmdObj.hasField("fromId")) { if (v == cmdObj["v"].Int()) { from = theReplSet->getMutableMember(cmdObj["fromId"].Int()); } } if (!from) { from = theReplSet->findByName(cmdObj.getStringField("from")); if (!from) { return true; } } // if we thought that this node is down, let it know if (!from->hbinfo().up()) { result.append("stateDisagreement", true); } // note that we got a heartbeat from this node theReplSet->mgr->send(boost::bind(&ReplSet::msgUpdateHBRecv, theReplSet, from->hbinfo().id(), time(0))); return true; }