void CollectionShardingState::_onConfigRefreshCompleteInvalidateCachedMetadataAndNotify(
    OperationContext* opCtx,
    const BSONObj& query,
    const BSONObj& update,
    const BSONObj& updatedDoc) {
    dassert(opCtx->lockState()->isCollectionLockedForMode(_nss.ns(), MODE_IX));
    invariant(serverGlobalParams.clusterRole == ClusterRole::ShardServer);

    // Extract which collection entry is being updated
    std::string refreshCollection;
    fassertStatusOK(
        40477,
        bsonExtractStringField(query, ShardCollectionType::uuid.name(), &refreshCollection));

    // Parse the '$set' update, which will contain the 'lastRefreshedCollectionVersion' if it is
    // present.
    BSONElement updateElement;
    fassertStatusOK(40478,
                    bsonExtractTypedField(update, StringData("$set"), Object, &updateElement));
    BSONObj setField = updateElement.Obj();

    // If 'lastRefreshedCollectionVersion' is present, then a refresh completed and the catalog
    // cache must be invalidated and the catalog cache loader notified of the new version.
    if (setField.hasField(ShardCollectionType::lastRefreshedCollectionVersion.name())) {
        opCtx->recoveryUnit()->registerChange(
            new CollectionVersionLogOpHandler(opCtx, NamespaceString(refreshCollection)));
    }
}
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 Timestamp oplogApplicationStartPoint,
                                                 Timestamp topOfOplog) {
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
        log() << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    DBDirectClient db(opCtx);
    auto cursor = db.query(NamespaceString::kRsOplogNamespace.ns(),
                           QUERY("ts" << BSON("$gte" << oplogApplicationStartPoint)),
                           /*batchSize*/ 0,
                           /*skip*/ 0,
                           /*projection*/ nullptr,
                           QueryOption_OplogReplay);

    // Check that the first document matches our appliedThrough point then skip it since it's
    // already been applied.
    if (!cursor->more()) {
        // This should really be impossible because we check above that the top of the oplog is
        // strictly > appliedThrough. If this fails it represents a serious bug in either the
        // storage engine or query's implementation of OplogReplay.
        severe() << "Couldn't find any entries in the oplog >= "
                 << oplogApplicationStartPoint.toBSON() << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }

    auto firstTimestampFound =
        fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe())).getTimestamp();
    if (firstTimestampFound != oplogApplicationStartPoint) {
        severe() << "Oplog entry at " << oplogApplicationStartPoint.toBSON()
                 << " is missing; actual entry found is " << firstTimestampFound.toBSON();
        fassertFailedNoTrace(40292);
    }

    // Apply remaining ops one at a time, but don't log them because they are already logged.
    UnreplicatedWritesBlock uwb(opCtx);

    while (cursor->more()) {
        auto entry = cursor->nextSafe();
        fassertStatusOK(40294,
                        SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering));
        _consistencyMarkers->setAppliedThrough(
            opCtx, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry)));
    }
}
OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* opCtx,
                                                                      bool isV1ElectionProtocol) {
    invariant(opCtx->lockState()->isW());

    // Clear the appliedThrough marker so on startup we'll use the top of the oplog. This must be
    // done before we add anything to our oplog.
    // We record this update at the 'lastAppliedOpTime'. If there are any outstanding
    // checkpoints being taken, they should only reflect this write if they see all writes up
    // to our 'lastAppliedOpTime'.
    invariant(
        _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull());
    auto lastAppliedOpTime = repl::ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime();
    _replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
        opCtx, lastAppliedOpTime.getTimestamp());

    if (isV1ElectionProtocol) {
        writeConflictRetry(opCtx, "logging transition to primary to oplog", "local.oplog.rs", [&] {
            WriteUnitOfWork wuow(opCtx);
            opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                opCtx, BSON("msg" << "new primary"));
            wuow.commit();
        });
    }
    const auto opTimeToReturn = fassertStatusOK(28665, loadLastOpTime(opCtx));

    _shardingOnTransitionToPrimaryHook(opCtx);
    _dropAllTempCollections(opCtx);

    serverGlobalParams.validateFeaturesAsMaster.store(true);

    return opTimeToReturn;
}
NetworkInterfaceMock::NetworkInterfaceMock()
    : _waitingToRunMask(0),
      _currentlyRunning(kNoThread),
      _now(fassertStatusOK(18653, dateFromISOString("2014-08-01T00:00:00Z"))),
      _hasStarted(false),
      _inShutdown(false),
      _executorNextWakeupDate(Date_t::max()) {}
OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* txn,
                                                                      bool isV1ElectionProtocol) {
    invariant(txn->lockState()->isW());

    // Clear the appliedThrough marker so on startup we'll use the top of the oplog. This must be
    // done before we add anything to our oplog.
    invariant(_storageInterface->getOplogDeleteFromPoint(txn).isNull());
    _storageInterface->setAppliedThrough(txn, {});

    if (isV1ElectionProtocol) {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);

            WriteUnitOfWork wuow(txn);
            txn->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                txn, BSON("msg" << "new primary"));
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
            txn, "logging transition to primary to oplog", "local.oplog.rs");
    }
    const auto opTimeToReturn = fassertStatusOK(28665, loadLastOpTime(txn));

    _shardingOnTransitionToPrimaryHook(txn);
    _dropAllTempCollections(txn);

    return opTimeToReturn;
}
void OplogBufferCollection::pushAllNonBlocking(OperationContext* txn,
                                               Batch::const_iterator begin,
                                               Batch::const_iterator end) {
    if (begin == end) {
        return;
    }
    size_t numDocs = std::distance(begin, end);
    Batch docsToInsert(numDocs);
    Timestamp ts;
    std::transform(begin, end, docsToInsert.begin(), [&ts](const Value& value) {
        auto pair = addIdToDocument(value);
        invariant(ts.isNull() || pair.second > ts);
        ts = pair.second;
        return pair.first;
    });

    stdx::lock_guard<stdx::mutex> lk(_mutex);
    auto status = _storageInterface->insertDocuments(txn, _nss, docsToInsert);
    fassertStatusOK(40161, status);

    _lastPushedTimestamp = ts;
    _count += numDocs;
    _size += std::accumulate(begin, end, 0U, [](const size_t& docSize, const Value& value) {
        return docSize + size_t(value.objsize());
    });
    _cvNoLongerEmpty.notify_all();
}
bool OplogBufferCollection::_peekOneSide_inlock(OperationContext* txn,
                                                Value* value,
                                                bool front) const {
    invariant(_count > 0);

    // If there is a sentinel, and it was pushed right after the last BSONObj to be popped was
    // pushed, then we return an empty BSONObj for the sentinel.
    if (!_sentinels.empty() && (_lastPoppedTimestamp == _sentinels.front())) {
        *value = BSONObj();
        return true;
    }

    auto scanDirection = front ? StorageInterface::ScanDirection::kForward
                               : StorageInterface::ScanDirection::kBackward;
    BSONObj startKey;
    auto boundInclusion = BoundInclusion::kIncludeStartKeyOnly;

    // Previously popped documents are not actually removed from the collection. When peeking at
    // the front of the buffer, we use the last popped timestamp to skip ahead to the first
    // document that has not been popped.
    if (front && !_lastPoppedTimestamp.isNull()) {
        startKey = BSON("" << _lastPoppedTimestamp);
        boundInclusion = BoundInclusion::kIncludeEndKeyOnly;
    }

    const auto docs =
        fassertStatusOK(40163,
                        _storageInterface->findDocuments(
                            txn, _nss, kIdIdxName, scanDirection, startKey, boundInclusion, 1U));
    invariant(1U == docs.size());

    *value = extractEmbeddedOplogDocument(docs.front()).getOwned();
    return true;
}
Status OplogReader::_compareRequiredOpTimeWithQueryResponse(const OpTime& requiredOpTime) {
    auto containsMinValid = more();
    if (!containsMinValid) {
        return Status(
            ErrorCodes::NoMatchingDocument,
            "remote oplog does not contain entry with optime matching our required optime");
    }
    auto doc = nextSafe();
    const auto opTime = fassertStatusOK(40351, OpTime::parseFromOplogEntry(doc));
    if (requiredOpTime != opTime) {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "remote oplog contains entry with matching timestamp "
                                    << opTime.getTimestamp().toString() << " but optime "
                                    << opTime.toString()
                                    << " does not match our required optime");
    }
    if (requiredOpTime.getTerm() != opTime.getTerm()) {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "remote oplog contains entry with term " << opTime.getTerm()
                                    << " that does not match the term in our required optime");
    }
    return Status::OK();
}
void Strategy::commandOp(OperationContext* txn,
                         const string& db,
                         const BSONObj& command,
                         int options,
                         const string& versionedNS,
                         const BSONObj& targetingQuery,
                         vector<CommandResult>* results) {
    QuerySpec qSpec(db + ".$cmd", command, BSONObj(), 0, 1, options);
    ParallelSortClusteredCursor cursor(qSpec, CommandInfo(versionedNS, targetingQuery));

    // Initialize the cursor
    cursor.init(txn);

    set<ShardId> shardIds;
    cursor.getQueryShardIds(shardIds);

    for (const ShardId& shardId : shardIds) {
        CommandResult result;
        result.shardTargetId = shardId;

        result.target = fassertStatusOK(
            28739, ConnectionString::parse(cursor.getShardCursor(shardId)->originalHost()));
        result.result = cursor.getShardCursor(shardId)->peekFirst().getOwned();
        results->push_back(result);
    }
}
executor::TaskExecutor::EventHandle AsyncResultsMerger::kill() {
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    if (_killCursorsScheduledEvent.isValid()) {
        invariant(_lifecycleState != kAlive);
        return _killCursorsScheduledEvent;
    }

    _lifecycleState = kKillStarted;

    // Make '_killCursorsScheduledEvent', which we will signal as soon as we have scheduled a
    // killCursors command to run on all the remote shards.
    auto statusWithEvent = _executor->makeEvent();
    if (ErrorCodes::isShutdownError(statusWithEvent.getStatus().code())) {
        // The underlying task executor is shutting down.
        if (!haveOutstandingBatchRequests_inlock()) {
            _lifecycleState = kKillComplete;
        }
        return executor::TaskExecutor::EventHandle();
    }
    fassertStatusOK(28716, statusWithEvent);
    _killCursorsScheduledEvent = statusWithEvent.getValue();

    // If we're not waiting for responses from remotes, we can schedule killCursors commands on
    // the remotes now. Otherwise, we have to wait until all responses are back, and then we can
    // kill the remote cursors.
    if (!haveOutstandingBatchRequests_inlock()) {
        scheduleKillCursors_inlock();
        _lifecycleState = kKillComplete;
        _executor->signalEvent(_killCursorsScheduledEvent);
    }

    return _killCursorsScheduledEvent;
}
static vector<ConnectionString> getHosts(const vector<string> hostStrings) {
    vector<ConnectionString> hosts;
    for (const auto& host : hostStrings) {
        hosts.push_back(fassertStatusOK(28738, ConnectionString::parse(host)));
    }
    return hosts;
}
WriteConcernOptions MigrationSecondaryThrottleOptions::getWriteConcern() const {
    invariant(_secondaryThrottle != kOff);
    invariant(_writeConcernBSON);

    WriteConcernOptions writeConcern;
    fassertStatusOK(34414, writeConcern.parse(*_writeConcernBSON));

    return writeConcern;
}
void createSystemIndexes(OperationContext* opCtx, Collection* collection) {
    invariant(collection);
    const NamespaceString& ns = collection->ns();
    if (ns == AuthorizationManager::usersCollectionNamespace) {
        auto indexSpec = fassertStatusOK(
            40455,
            index_key_validate::validateIndexSpec(
                v3SystemUsersIndexSpec.toBSON(), ns, serverGlobalParams.featureCompatibility));

        fassertStatusOK(
            40456, collection->getIndexCatalog()->createIndexOnEmptyCollection(opCtx, indexSpec));
    } else if (ns == AuthorizationManager::rolesCollectionNamespace) {
        auto indexSpec = fassertStatusOK(
            40457,
            index_key_validate::validateIndexSpec(
                v3SystemRolesIndexSpec.toBSON(), ns, serverGlobalParams.featureCompatibility));

        fassertStatusOK(
            40458, collection->getIndexCatalog()->createIndexOnEmptyCollection(opCtx, indexSpec));
    } else if (ns == sessionCollectionNamespace) {
        auto indexSpec = fassertStatusOK(
            40493,
            index_key_validate::validateIndexSpec(
                v1SystemSessionsIndexSpec.toBSON(), ns, serverGlobalParams.featureCompatibility));

        fassertStatusOK(
            40494, collection->getIndexCatalog()->createIndexOnEmptyCollection(opCtx, indexSpec));
    }
}
void ReplicationCoordinatorImpl::_recoverFromElectionTie(
    const executor::TaskExecutor::CallbackArgs& cbData) {
    stdx::unique_lock<stdx::mutex> lk(_mutex);

    auto now = _replExecutor->now();
    const auto status = _topCoord->checkShouldStandForElection(now);
    if (!status.isOK()) {
        LOG(2) << "ReplicationCoordinatorImpl::_recoverFromElectionTie -- " << status.reason();
    } else {
        fassertStatusOK(28817, _topCoord->becomeCandidateIfElectable(now, false));
        _startElectSelf_inlock();
    }
}
void CollectionShardingState::_onConfigDeleteInvalidateCachedMetadataAndNotify(
    OperationContext* opCtx, const BSONObj& query) {
    dassert(opCtx->lockState()->isCollectionLockedForMode(_nss.ns(), MODE_IX));
    invariant(serverGlobalParams.clusterRole == ClusterRole::ShardServer);

    // Extract which collection entry is being deleted from the _id field.
    std::string deletedCollection;
    fassertStatusOK(
        40479,
        bsonExtractStringField(query, ShardCollectionType::uuid.name(), &deletedCollection));

    opCtx->recoveryUnit()->registerChange(
        new CollectionVersionLogOpHandler(opCtx, NamespaceString(deletedCollection)));
}
void ReplicationCoordinatorImpl::_recoverFromElectionTie(
    const ReplicationExecutor::CallbackArgs& cbData) {
    LockGuard topoLock(_topoMutex);

    auto now = _replExecutor.now();
    auto lastOpApplied = getMyLastAppliedOpTime();
    const auto status = _topCoord->checkShouldStandForElection(now, lastOpApplied);
    if (!status.isOK()) {
        LOG(2) << "ReplicationCoordinatorImpl::_recoverFromElectionTie -- " << status.reason();
    } else {
        fassertStatusOK(28817, _topCoord->becomeCandidateIfElectable(now, lastOpApplied, false));
        _startElectSelf();
    }
}
bool DeferredWriter::insertDocument(BSONObj obj) {
    // We can't insert documents if we haven't been started up.
    invariant(_pool);

    stdx::lock_guard<stdx::mutex> lock(_mutex);

    // Check if we're allowed to insert this object.
    if (_numBytes + obj.objsize() >= _maxNumBytes) {
        // If not, drop it. We always drop new entries rather than old ones; that way the caller
        // knows at the time of the call that the entry was dropped.
        _logDroppedEntry();
        return false;
    }

    // Add the object to the buffer.
    _numBytes += obj.objsize();
    fassertStatusOK(40588,
                    _pool->schedule([this, obj] { _worker(InsertStatement(obj.getOwned())); }));
    return true;
}
void BackgroundSync::_produce(OperationContext* opCtx) {
    if (MONGO_FAIL_POINT(stopReplProducer)) {
        // This log output is used in js tests so please leave it.
        log() << "bgsync - stopReplProducer fail point "
                 "enabled. Blocking until fail point is disabled.";

        // TODO(SERVER-27120): Remove the return statement and uncomment the while loop.
        // Currently we cannot block here or we prevent primaries from being fully elected since
        // we'll never call _signalNoNewDataForApplier.
        //        while (MONGO_FAIL_POINT(stopReplProducer) && !inShutdown()) {
        //            mongo::sleepsecs(1);
        //        }
        mongo::sleepsecs(1);
        return;
    }

    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_state != ProducerState::Running) {
            return;
        }
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    HostAndPort oldSource = _syncSourceHost;
    SyncSourceResolverResponse syncSourceResp;
    {
        const OpTime minValidSaved =
            _replicationProcess->getConsistencyMarkers()->getMinValid(opCtx);

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_state != ProducerState::Running) {
            return;
        }
        const auto requiredOpTime = (minValidSaved > _lastOpTimeFetched) ? minValidSaved : OpTime();
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
        _syncSourceResolver = stdx::make_unique<SyncSourceResolver>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            _replCoord,
            lastOpTimeFetched,
            requiredOpTime,
            [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; });
    }
    // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls
    // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to be
    // acquired before BackgroundSync's.
    // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver
    // because we do not destroy this instance outside of this function, which is only called from
    // a single thread.
    auto status = _syncSourceResolver->startup();
    if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) {
        return;
    }
    fassertStatusOK(40349, status);
    _syncSourceResolver->join();
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceResolver.reset();
    }

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        if (_replCoord->getMemberState().primary()) {
            warning() << "Too stale to catch up.";
            log() << "Our newest OpTime : " << lastOpTimeFetched;
            log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
                  << " from " << syncSourceResp.getSyncSource();
            _replCoord->abortCatchupIfNeeded().transitional_ignore();
            return;
        }

        // We only need to mark ourselves as too stale once.
        if (_tooStale) {
            return;
        }

        // Mark yourself as too stale.
        _tooStale = true;

        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";

        // Activate maintenance mode and transition to RECOVERING.
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode: " << status;
        }
        status = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!status.isOK()) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState() << causedBy(status);
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            _syncSourceHost = syncSourceResp.getSyncSource();
            source = _syncSourceHost;
        }
        // If our sync source has not changed, it is likely caused by our heartbeat data map being
        // out of date. In that case we sleep for 1 second to reduce the amount we spin waiting
        // for our map to update.
        if (oldSource == source) {
            log() << "Chose same sync source candidate as last time, " << source
                  << ". Sleeping for 1 second to avoid immediately choosing a new sync source for "
                     "the same reason as last time.";
            sleepsecs(1);
        }
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    // If we find a good sync source after having gone too stale, disable maintenance mode so we
    // can transition to SECONDARY.
    if (_tooStale) {
        _tooStale = false;

        log() << "No longer too stale. Able to sync from " << _syncSourceHost;

        auto status = _replCoord->setMaintenanceMode(false);
        if (!status.isOK()) {
            warning() << "Failed to leave maintenance mode: " << status;
        }
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_state != ProducerState::Running) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
    }

    if (!_replCoord->getMemberState().primary()) {
        _replCoord->signalUpstreamUpdater();
    }

    // Set the applied point if unset. This is most likely the first time we've established a sync
    // source since stepping down or otherwise clearing the applied point. We need to set this
    // here, before the OplogWriter gets a chance to append to the oplog.
    if (_replicationProcess->getConsistencyMarkers()->getAppliedThrough(opCtx).isNull()) {
        _replicationProcess->getConsistencyMarkers()->setAppliedThrough(
            opCtx, _replCoord->getMyLastAppliedOpTime());
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, _replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto onOplogFetcherShutdownCallbackFn = [&fetcherReturnStatus](const Status& status) {
            fetcherReturnStatus = status;
        };
        // The construction of OplogFetcher has to be outside bgsync mutex, because it calls
        // replication coordinator.
        auto oplogFetcherPtr = stdx::make_unique<OplogFetcher>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString::kRsOplogNamespace,
            _replCoord->getConfig(),
            _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(),
            syncSourceResp.rbid,
            true /* requireFresherSyncSource */,
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3),
            onOplogFetcherShutdownCallbackFn,
            bgSyncOplogFetcherBatchSize);
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_state != ProducerState::Running) {
            return;
        }
        _oplogFetcher = std::move(oplogFetcherPtr);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    const auto logLevel = Command::testCommandsEnabled ? 0 : 1;
    LOG(logLevel) << "scheduling fetcher to read remote oplog on " << _syncSourceHost
                  << " starting at " << oplogFetcher->getFindQuery_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (getState() != ProducerState::Running) {
        log() << "Replication producer stopped after oplog fetcher finished returning a batch from "
                 "our sync source. Abandoning this batch of oplog entries and re-evaluating our "
                 "sync source.";
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.
        warning() << redact(fetcherReturnStatus);
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing) {
        auto storageInterface = StorageInterface::get(opCtx);
        _runRollback(opCtx, fetcherReturnStatus, source, syncSourceResp.rbid, storageInterface);
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher stopped querying remote oplog with error: "
                  << redact(fetcherReturnStatus);
    }
}
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp());

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, oldestOpTimeSeen);
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromBSON(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));

            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Got a valid sync source.
        return;
    }  // while (true)
}
Status dropDatabase(OperationContext* opCtx, const std::string& dbName) {
    uassert(ErrorCodes::IllegalOperation,
            "Cannot drop a database in read-only mode",
            !storageGlobalParams.readOnly);

    // TODO (Kal): OldClientContext legacy, needs to be removed
    {
        CurOp::get(opCtx)->ensureStarted();
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        CurOp::get(opCtx)->setNS_inlock(dbName);
    }

    auto replCoord = repl::ReplicationCoordinator::get(opCtx);
    std::size_t numCollectionsToDrop = 0;

    // We have to wait for the last drop-pending collection to be removed if there are no
    // collections to drop.
    repl::OpTime latestDropPendingOpTime;

    using Result = boost::optional<Status>;
    // Get an optional result--if it's there, early return; otherwise, wait for collections to
    // drop.
    auto result = writeConflictRetry(opCtx, "dropDatabase_collection", dbName, [&] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        Database* const db = autoDB.getDb();
        if (!db) {
            return Result(Status(ErrorCodes::NamespaceNotFound,
                                 str::stream() << "Could not drop database " << dbName
                                               << " because it does not exist"));
        }

        bool userInitiatedWritesAndNotPrimary =
            opCtx->writesAreReplicated() && !replCoord->canAcceptWritesForDatabase(opCtx, dbName);

        if (userInitiatedWritesAndNotPrimary) {
            return Result(
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while dropping database " << dbName));
        }

        log() << "dropDatabase " << dbName << " - starting";
        db->setDropPending(opCtx, true);

        // If Database::dropCollectionEvenIfSystem() fails, we should reset the drop-pending state
        // on Database.
        auto dropPendingGuard = MakeGuard([&db, opCtx] { db->setDropPending(opCtx, false); });

        for (auto collection : *db) {
            const auto& nss = collection->ns();
            if (nss.isDropPendingNamespace() && replCoord->isReplEnabled() &&
                opCtx->writesAreReplicated()) {
                log() << "dropDatabase " << dbName << " - found drop-pending collection: " << nss;
                latestDropPendingOpTime = std::max(
                    latestDropPendingOpTime, uassertStatusOK(nss.getDropPendingNamespaceOpTime()));
                continue;
            }
            if (replCoord->isOplogDisabledFor(opCtx, nss) || nss.isSystemDotIndexes()) {
                continue;
            }
            log() << "dropDatabase " << dbName << " - dropping collection: " << nss;
            WriteUnitOfWork wunit(opCtx);
            fassertStatusOK(40476, db->dropCollectionEvenIfSystem(opCtx, nss));
            wunit.commit();
            numCollectionsToDrop++;
        }
        dropPendingGuard.Dismiss();

        // If there are no collection drops to wait for, we complete the drop database operation.
        if (numCollectionsToDrop == 0U && latestDropPendingOpTime.isNull()) {
            return Result(_finishDropDatabase(opCtx, dbName, db));
        }

        return Result(boost::none);
    });

    if (result) {
        return *result;
    }

    // If waitForWriteConcern() returns an error or throws an exception, we should reset the
    // drop-pending state on Database.
    auto dropPendingGuardWhileAwaitingReplication = MakeGuard([dbName, opCtx] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        if (auto db = autoDB.getDb()) {
            db->setDropPending(opCtx, false);
        }
    });

    {
        // Holding of any locks is disallowed while awaiting replication because this can
        // potentially block for a long time while doing network activity.
        //
        // Even though dropDatabase() does not explicitly acquire any locks before awaiting
        // replication, it is possible that the caller of this function may already have acquired
        // a lock. The applyOps command is an example of a dropDatabase() caller that does this.
        // Therefore, we have to release any locks using a TempRelease RAII object.
        //
        // TODO: Remove the use of this TempRelease object when SERVER-29802 is completed.
        // The work in SERVER-29802 will adjust the locking rules around applyOps operations and
        // dropDatabase is expected to be one of the operations where we expect to no longer
        // acquire the global lock.
        Lock::TempRelease release(opCtx->lockState());

        if (numCollectionsToDrop > 0U) {
            auto status =
                replCoord->awaitReplicationOfLastOpForClient(opCtx, kDropDatabaseWriteConcern)
                    .status;
            if (!status.isOK()) {
                return Status(status.code(),
                              str::stream() << "dropDatabase " << dbName << " failed waiting for "
                                            << numCollectionsToDrop
                                            << " collection drops to replicate: "
                                            << status.reason());
            }

            log() << "dropDatabase " << dbName << " - successfully dropped "
                  << numCollectionsToDrop << " collections. dropping database";
        } else {
            invariant(!latestDropPendingOpTime.isNull());
            auto status =
                replCoord
                    ->awaitReplication(opCtx, latestDropPendingOpTime, kDropDatabaseWriteConcern)
                    .status;
            if (!status.isOK()) {
                return Status(
                    status.code(),
                    str::stream()
                        << "dropDatabase " << dbName
                        << " failed waiting for pending collection drops (most recent drop optime: "
                        << latestDropPendingOpTime.toString()
                        << ") to replicate: " << status.reason());
            }

            log() << "dropDatabase " << dbName
                  << " - pending collection drops completed. dropping database";
        }
    }

    dropPendingGuardWhileAwaitingReplication.Dismiss();

    return writeConflictRetry(opCtx, "dropDatabase_database", dbName, [&] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        if (auto db = autoDB.getDb()) {
            return _finishDropDatabase(opCtx, dbName, db);
        }

        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "Could not drop database " << dbName
                                    << " because it does not exist after dropping "
                                    << numCollectionsToDrop << " collection(s).");
    });
}
/* tail an oplog. ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    ApplyBatchFinalizer finalizer(replCoord);

    OperationContextImpl txn;
    OpTime originalEndOpTime(getMinValid(&txn).end);

    while (!inShutdown()) {
        OpQueue ops;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back();
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) &&  // tryPopAndWaitForMore returns
                                                                  // true when we need to end a
                                                                  // batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
            if (inShutdown())
                return;
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj lastOp = ops.back();
        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook(
    OperationContext* txn) {
    auto status = ShardingStateRecovery::recover(txn);

    if (ErrorCodes::isShutdownError(status.code())) {
        // Note: callers of this method don't expect exceptions, so throw only unexpected fatal
        // errors.
        return;
    }

    fassertStatusOK(40107, status);

    if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
        status = Grid::get(txn)->catalogManager()->initializeConfigDatabaseIfNeeded(txn);
        if (!status.isOK() && status != ErrorCodes::AlreadyInitialized) {
            if (ErrorCodes::isShutdownError(status.code())) {
                // Don't fassert if we're mid-shutdown, let the shutdown happen gracefully.
                return;
            }

            fassertFailedWithStatus(40184,
                                    Status(status.code(),
                                           str::stream()
                                               << "Failed to initialize config database on config "
                                                  "server's first transition to primary"
                                               << causedBy(status)));
        }

        if (status.isOK()) {
            // Load the clusterId into memory. Use local readConcern, since we can't use majority
            // readConcern in drain mode because the global lock prevents replication. This is
            // safe, since if the clusterId write is rolled back, any writes that depend on it
            // will also be rolled back.
            //
            // Since we *just* wrote the cluster ID to the config.version document (via
            // ShardingCatalogManager::initializeConfigDatabaseIfNeeded), this should always
            // succeed.
            status = ClusterIdentityLoader::get(txn)->loadClusterId(
                txn, repl::ReadConcernLevel::kLocalReadConcern);

            if (ErrorCodes::isShutdownError(status.code())) {
                // Don't fassert if we're mid-shutdown, let the shutdown happen gracefully.
                return;
            }

            fassertStatusOK(40217, status);
        }

        // For upgrade from 3.2 to 3.4, check if any shards in config.shards are not yet marked as
        // shard aware, and attempt to initialize sharding awareness on them.
        auto shardAwareInitializationStatus =
            Grid::get(txn)->catalogManager()->initializeShardingAwarenessOnUnawareShards(txn);
        if (!shardAwareInitializationStatus.isOK()) {
            warning() << "Error while attempting to initialize sharding awareness on sharding "
                         "unaware shards "
                      << causedBy(shardAwareInitializationStatus);
        }

        // If this is a config server node becoming a primary, start the balancer
        Balancer::get(txn)->startThread(txn);
    } else if (ShardingState::get(txn)->enabled()) {
        const auto configsvrConnStr =
            Grid::get(txn)->shardRegistry()->getConfigShard()->getConnString();
        auto status = ShardingState::get(txn)->updateShardIdentityConfigString(
            txn, configsvrConnStr.toString());
        if (!status.isOK()) {
            warning() << "error encountered while trying to update config connection string to "
                      << configsvrConnStr << causedBy(status);
        }
    }

    // There is a slight chance that some stale metadata might have been loaded before the latest
    // optime has been recovered, so throw out everything that we have up to now
    ShardingState::get(txn)->markCollectionsNotShardedAtStepdown();
}
Status MigrationSourceManager::commitDonateChunk(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCriticalSection);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Tell the recipient shard to fetch the latest changes
    Status commitCloneStatus = _cloneDriver->commitClone(txn);

    if (MONGO_FAIL_POINT(failMigrationCommit) && commitCloneStatus.isOK()) {
        commitCloneStatus = {ErrorCodes::InternalError,
                             "Failing _recvChunkCommit due to failpoint."};
    }

    if (!commitCloneStatus.isOK()) {
        return {commitCloneStatus.code(),
                str::stream() << "commit clone failed due to " << commitCloneStatus.toString()};
    }

    // Generate the next collection version.
    ChunkVersion uncommittedCollVersion = _committedMetadata->getCollVersion();
    uncommittedCollVersion.incMajor();

    // applyOps preparation for reflecting the uncommitted metadata on the config server

    // Preconditions
    BSONArrayBuilder preCond;
    {
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
        b.append("q",
                 BSON("query" << BSON(ChunkType::ns(_args.getNss().ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
        {
            BSONObjBuilder bb(b.subobjStart("res"));
            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(),
                               _committedMetadata->getCollVersion().toLong());
            bb.done();
        }
        preCond.append(b.obj());
    }

    // Update for the chunk which is being donated
    BSONArrayBuilder updates;
    {
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        uncommittedCollVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), _args.getMinKey());
        n.append(ChunkType::max(), _args.getMaxKey());
        n.append(ChunkType::shard(), _args.getToShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        q.done();

        updates.append(op.obj());
    }

    // Update for the chunk being moved

    // Version at which the next highest lastmod will be set. If the chunk being moved is the
    // last in the shard, nextVersion is that chunk's lastmod otherwise the highest version is
    // from the chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = uncommittedCollVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We
    // can figure that out by grabbing the metadata as it has been changed.
    if (_committedMetadata->getNumChunks() > 1) {
        ChunkType bumpChunk;
        invariant(_committedMetadata->getDifferentChunk(_args.getMinKey(), &bumpChunk));
        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();
        nextVersion.incMinor();

        dassert(bumpMin.woCompare(_args.getMinKey()) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _args.getFromShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        q.done();

        updates.append(op.obj());

        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _args.getNss().ns() << "'";
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _args.getNss().ns() << "'";
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeCommitMigration);

    Status applyOpsStatus = grid.catalogClient(txn)->applyChunkOpsDeprecated(
        txn, updates.arr(), preCond.arr(), _args.getNss().ns(), nextVersion);

    if (MONGO_FAIL_POINT(failCommitMigrationCommand)) {
        applyOpsStatus = Status(ErrorCodes::InternalError,
                                "Failpoint 'failCommitMigrationCommand' generated error");
    }

    if (applyOpsStatus.isOK()) {
        // Now that applyOps succeeded and the new collection version is committed, update the
        // collection metadata to the new collection version and forget the migrated chunk.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        ChunkType migratingChunkToForget;
        migratingChunkToForget.setMin(_args.getMinKey());
        migratingChunkToForget.setMax(_args.getMaxKey());
        _committedMetadata =
            _committedMetadata->cloneMigrate(migratingChunkToForget, uncommittedCollVersion);
        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMetadata(_committedMetadata);
    } else {
        // This could be an unrelated error (e.g. network error). Check whether the metadata
        // update succeeded by refreshing the collection metadata from the config server and
        // checking that the original chunks no longer exist.
        warning() << "Migration metadata commit may have failed: refreshing metadata to check"
                  << causedBy(applyOpsStatus);

        // Need to get the latest optime in case the refresh request goes to a secondary --
        // otherwise the read won't wait for the write that applyChunkOpsDeprecated may have done.
        Status status = grid.catalogClient(txn)->logChange(
            txn,
            "moveChunk.validating",
            _args.getNss().ns(),
            BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey() << "from"
                       << _args.getFromShardId() << "to" << _args.getToShardId()));
        if (!status.isOK()) {
            fassertStatusOK(
                40137,
                {status.code(),
                 str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey() << ","
                               << _args.getMaxKey() << ") due to " << causedBy(applyOpsStatus)
                               << ", and updating the optime with a write before refreshing the "
                               << "metadata also failed: " << causedBy(status)});
        }

        ShardingState* const shardingState = ShardingState::get(txn);
        ChunkVersion shardVersion;
        Status refreshStatus =
            shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
        fassertStatusOK(34431,
                        {refreshStatus.code(),
                         str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey()
                                       << "," << _args.getMaxKey() << ") due to "
                                       << causedBy(applyOpsStatus)
                                       << ", and refreshing collection metadata failed: "
                                       << causedBy(refreshStatus)});

        {
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

            auto css = CollectionShardingState::get(txn, _args.getNss());
            std::shared_ptr<CollectionMetadata> refreshedMetadata = css->getMetadata();

            if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
                invariant(refreshedMetadata->getCollVersion() ==
                          _committedMetadata->getCollVersion());

                // After refresh, the collection metadata indicates that the donor shard still
                // owns the chunk, so no migration changes were written to the config server
                // metadata.
                return {applyOpsStatus.code(),
                        str::stream() << "Migration was not committed, applyOps failed: "
                                      << causedBy(applyOpsStatus)};
            }

            ChunkVersion refreshedCollectionVersion = refreshedMetadata->getCollVersion();
            if (!refreshedCollectionVersion.equals(nextVersion)) {
                // The refreshed collection metadata's collection version does not match the
                // control chunk's updated collection version, which should now be the highest.
                // The control chunk was not committed, but the migrated chunk was. This state is
                // not recoverable.
                fassertStatusOK(40138,
                                {applyOpsStatus.code(),
                                 str::stream() << "Migration was partially committed, state is "
                                               << "unrecoverable. applyOps error: "
                                               << causedBy(applyOpsStatus)});
            }
        }
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection);

    scopedGuard.Dismiss();
    _cleanup(txn);

    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.commit",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from" << _args.getFromShardId() << "to"
                                                  << _args.getToShardId()));

    return Status::OK();
}
void BackgroundSync::_produce(OperationContext* txn) {
    while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) {
        sleepmillis(0);
    }

    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (!_replCoord->isCatchingUp() &&
            (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary())) {
            return;
        }

        if (_inShutdown_inlock()) {
            return;
        }
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    SyncSourceResolverResponse syncSourceResp;
    SyncSourceResolver* syncSourceResolver;
    OpTime minValid;
    if (_replCoord->getMemberState().recovering()) {
        auto minValidSaved = StorageInterface::get(txn)->getMinValid(txn);
        if (minValidSaved > lastOpTimeFetched) {
            minValid = minValidSaved;
        }
    }
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
        _syncSourceResolver = stdx::make_unique<SyncSourceResolver>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            _replCoord,
            lastOpTimeFetched,
            minValid,
            [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; });
        syncSourceResolver = _syncSourceResolver.get();
    }
    // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls
    // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to be
    // acquired before BackgroundSync's.
    // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver
    // because we do not destroy this instance outside of this function.
    auto status = _syncSourceResolver->startup();
    if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) {
        return;
    }
    fassertStatusOK(40349, status);
    syncSourceResolver->join();
    syncSourceResolver = nullptr;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        _syncSourceResolver.reset();
    }

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        if (_replCoord->isCatchingUp()) {
            warning() << "Too stale to catch up.";
            log() << "Our newest OpTime : " << lastOpTimeFetched;
            log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
                  << " from " << syncSourceResp.getSyncSource();
            sleepsecs(1);
            return;
        }

        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode: " << status;
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        if (!_replCoord->isCatchingUp()) {
            _replCoord->signalUpstreamUpdater();
        }
    }

    // Set the applied point if unset. This is most likely the first time we've established a sync
    // source since stepping down or otherwise clearing the applied point. We need to set this
    // here, before the OplogWriter gets a chance to append to the oplog.
    if (StorageInterface::get(txn)->getAppliedThrough(txn).isNull()) {
        StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime());
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, _replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto executor = _replicationCoordinatorExternalState->getTaskExecutor();
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher = stdx::make_unique<OplogFetcher>(
            executor,
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString(rsOplogName),
            config,
            _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(),
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3),
            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.
        warning() << redact(fetcherReturnStatus);
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        if (_replCoord->isCatchingUp()) {
            warning() << "Rollback situation detected in catch-up mode; catch-up mode will end.";
            sleepsecs(1);
            return;
        }

        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kRollbackOplogSocketTimeout));
            };
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << redact(fetcherReturnStatus);

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }

        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        const auto minValid = StorageInterface::get(txn)->getMinValid(txn);
        if (lastApplied < minValid) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream() << "need to rollback, but in inconsistent state. "
                                                << "minvalid: " << minValid.toString()
                                                << " > our last optime: "
                                                << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher stopped querying remote oplog with error: "
                  << redact(fetcherReturnStatus);
    }
}
StatusWith<Shard::QueryResponse> ShardLocal::_exhaustiveFindOnConfig(
    OperationContext* txn,
    const ReadPreferenceSetting& readPref,
    const repl::ReadConcernLevel& readConcernLevel,
    const NamespaceString& nss,
    const BSONObj& query,
    const BSONObj& sort,
    boost::optional<long long> limit) {
    auto replCoord = repl::ReplicationCoordinator::get(txn);

    if (readConcernLevel == repl::ReadConcernLevel::kMajorityReadConcern) {
        // Set up operation context with majority read snapshot so correct optime can be
        // retrieved.
        Status status = txn->recoveryUnit()->setReadFromMajorityCommittedSnapshot();

        // Wait for any writes performed by this ShardLocal instance to be committed and visible.
        Status readConcernStatus = replCoord->waitUntilOpTimeForRead(
            txn, repl::ReadConcernArgs{_getLastOpTime(), readConcernLevel});
        if (!readConcernStatus.isOK()) {
            if (readConcernStatus == ErrorCodes::ShutdownInProgress ||
                ErrorCodes::isInterruption(readConcernStatus.code())) {
                return readConcernStatus;
            }
            fassertStatusOK(40188, readConcernStatus);
        }

        // Inform the storage engine to read from the committed snapshot for the rest of this
        // operation.
        status = txn->recoveryUnit()->setReadFromMajorityCommittedSnapshot();
        fassertStatusOK(40189, status);
    } else {
        invariant(readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern);
    }

    DBDirectClient client(txn);
    Query fullQuery(query);
    if (!sort.isEmpty()) {
        fullQuery.sort(sort);
    }
    fullQuery.readPref(readPref.pref, BSONArray());

    try {
        std::unique_ptr<DBClientCursor> cursor =
            client.query(nss.ns().c_str(), fullQuery, limit.get_value_or(0));

        if (!cursor) {
            return {ErrorCodes::OperationFailed,
                    str::stream() << "Failed to establish a cursor for reading " << nss.ns()
                                  << " from local storage"};
        }

        std::vector<BSONObj> documentVector;
        while (cursor->more()) {
            BSONObj document = cursor->nextSafe().getOwned();
            documentVector.push_back(std::move(document));
        }

        return Shard::QueryResponse{std::move(documentVector),
                                    replCoord->getCurrentCommittedSnapshotOpTime()};
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
}
/* applies oplog from "now" until endOpTime using the applier threads for initial sync */
void InitialSync::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) {
    unsigned long long bytesApplied = 0;
    unsigned long long entriesApplied = 0;
    while (true) {
        OpQueue ops;
        auto replCoord = repl::ReplicationCoordinator::get(txn);

        while (!tryPopAndWaitForMore(txn, &ops)) {
            // nothing came back last time, so go again
            if (ops.empty())
                continue;

            // Check if we reached the end
            const BSONObj currentOp = ops.back().raw;
            const OpTime currentOpTime =
                fassertStatusOK(28772, OpTime::parseFromOplogEntry(currentOp));

            // When we reach the end return this batch
            if (currentOpTime == endOpTime) {
                break;
            } else if (currentOpTime > endOpTime) {
                severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime
                         << " without seeing it. Rollback?";
                fassertFailedNoTrace(18693);
            }

            // apply replication batch limits
            if (ops.getSize() > replBatchLimitBytes)
                break;
            if (ops.getDeque().size() > replBatchLimitOperations)
                break;
        };

        if (ops.empty()) {
            severe() << "got no ops for batch...";
            fassertFailedNoTrace(18692);
        }

        const BSONObj lastOp = ops.back().raw.getOwned();

        // Tally operation information
        bytesApplied += ops.getSize();
        entriesApplied += ops.getDeque().size();

        const OpTime lastOpTime = multiApply(txn, ops);

        replCoord->setMyLastAppliedOpTime(lastOpTime);
        setNewTimestamp(lastOpTime.getTimestamp());

        if (inShutdown()) {
            return;
        }

        // if the last op applied was our end, return
        if (lastOpTime == endOpTime) {
            LOG(1) << "SyncTail applied " << entriesApplied << " entries (" << bytesApplied
                   << " bytes) and finished at opTime " << endOpTime;
            return;
        }
    }  // end of while (true)
}
void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx) try {
    if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
        log() << "No recovery needed. Initial sync flag set.";
        return;  // Initial Sync will take over so no cleanup is needed.
    }

    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);

    if (!truncateAfterPoint.isNull()) {
        log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
        _truncateOplogTo(opCtx, truncateAfterPoint);
    }

    // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
    // erroneously.
    _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});

    // TODO (SERVER-30556): Delete this line since the old oplog delete from point cannot exist.
    _consistencyMarkers->removeOldOplogDeleteFromPointField(opCtx);

    auto topOfOplogSW = _getLastAppliedOpTime(opCtx);
    boost::optional<OpTime> topOfOplog = boost::none;
    if (topOfOplogSW.getStatus() != ErrorCodes::CollectionIsEmpty &&
        topOfOplogSW.getStatus() != ErrorCodes::NamespaceNotFound) {
        fassertStatusOK(40290, topOfOplogSW);
        topOfOplog = topOfOplogSW.getValue();
    }

    // If we have a checkpoint timestamp, then we recovered to a timestamp and should set the
    // initial data timestamp to that. Otherwise, we simply recovered the data on disk so we
    // should set the initial data timestamp to the top OpTime in the oplog once the data is
    // consistent there. If there is nothing in the oplog, then we do not set the initial data
    // timestamp.
    auto checkpointTimestamp = _consistencyMarkers->getCheckpointTimestamp(opCtx);
    if (!checkpointTimestamp.isNull()) {
        // If we have a checkpoint timestamp, we set the initial data timestamp now so that
        // the operations we apply below can be given the proper timestamps.
        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
                                                   SnapshotName(checkpointTimestamp));
    }

    // Oplog is empty. There are no oplog entries to apply, so we exit recovery. If there was a
    // checkpointTimestamp then we already set the initial data timestamp. Otherwise, there is
    // nothing to set it to.
    if (!topOfOplog) {
        log() << "No oplog entries to apply for recovery. Oplog is empty.";
        return;
    }

    if (auto startPoint = _getOplogApplicationStartPoint(checkpointTimestamp, appliedThrough)) {
        _applyToEndOfOplog(opCtx, startPoint.get(), topOfOplog->getTimestamp());
    }

    // If we don't have a checkpoint timestamp, then we are either not running a storage engine
    // that supports "recover to stable timestamp" or we just upgraded from a version that
    // didn't. In both cases, the data on disk is not consistent until we have applied all oplog
    // entries to the end of the oplog, since we do not know which ones actually got applied
    // before shutdown. As a result, we do not set the initial data timestamp until after we have
    // applied to the end of the oplog.
    if (checkpointTimestamp.isNull()) {
        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
                                                   SnapshotName(topOfOplog->getTimestamp()));
    }
} catch (...) {
    severe() << "Caught exception during replication recovery: " << exceptionToStatus();
    std::terminate();
}
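// A minimal sketch, not part of the server source, of the initial-data-timestamp decision that
// recoverFromOplog() makes above. The enum and helper are hypothetical and exist only to make the
// three cases explicit; the real function performs the checks inline against the consistency
// markers and the oplog.
enum class InitialDataTimestampChoice { kCheckpointBeforeReplay, kTopOfOplogAfterReplay, kNotSet };

InitialDataTimestampChoice chooseInitialDataTimestamp(bool haveCheckpointTimestamp,
                                                      bool oplogIsEmpty) {
    if (haveCheckpointTimestamp) {
        // Recovered to a timestamp: set it up front so the ops replayed below get proper
        // timestamps.
        return InitialDataTimestampChoice::kCheckpointBeforeReplay;
    }
    if (!oplogIsEmpty) {
        // Data on disk is only consistent once we have applied to the end of the oplog.
        return InitialDataTimestampChoice::kTopOfOplogAfterReplay;
    }
    return InitialDataTimestampChoice::kNotSet;  // empty oplog: nothing sensible to set it to
}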
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Milliseconds fetcherMaxTimeMS,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();

    // Forward metadata (containing liveness information) to replication coordinator.
    bool receivedMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedMetadata) {
        auto metadataResult =
            rpc::ReplSetMetadata::readFromMetadata(queryResponse.otherFields.metadata);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << source << ": "
                    << metadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
            return;
        }
        const auto& metadata = metadataResult.getValue();
        _replCoord->processReplSetMetadata(metadata);
        if (metadata.getPrimaryIndex() != rpc::ReplSetMetadata::kNoPrimary) {
            _replCoord->cancelAndRescheduleElectionTimeout();
        }
    }

    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = fassertStatusOK(28770, OpTime::parseFromOplogEntry(o));
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(durationCount<Milliseconds>(queryResponse.elapsedMillis));

    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time. this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to proceed with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", durationCount<Milliseconds>(fetcherMaxTimeMS));
    if (receivedMetadata) {
        bob->append("term", _replCoord->getTerm());
        _replCoord->getLastCommittedOpTime().append(bob, "lastKnownCommittedOpTime");
    }
}
void ReplicationCoordinatorExternalStateImpl::cleanUpLastApplyBatch(OperationContext* txn) {
    if (_storageInterface->getInitialSyncFlag(txn)) {
        return;  // Initial Sync will take over so no cleanup is needed.
    }

    const auto deleteFromPoint = _storageInterface->getOplogDeleteFromPoint(txn);
    const auto appliedThrough = _storageInterface->getAppliedThrough(txn);

    const bool needToDeleteEndOfOplog = !deleteFromPoint.isNull() &&
        // This version should never have a non-null deleteFromPoint with a null appliedThrough.
        // This scenario means that we downgraded after unclean shutdown, then the downgraded node
        // deleted the ragged end of our oplog, then did a clean shutdown.
        !appliedThrough.isNull() &&
        // Similarly we should never have an appliedThrough higher than the deleteFromPoint. This
        // means that the downgraded node deleted our ragged end then applied ahead of our
        // deleteFromPoint and then had an unclean shutdown before upgrading. We are ok with
        // applying these ops because older versions wrote to the oplog from a single thread so we
        // know they are in order.
        !(appliedThrough.getTimestamp() >= deleteFromPoint);
    if (needToDeleteEndOfOplog) {
        log() << "Removing unapplied entries starting at: " << deleteFromPoint;
        truncateOplogTo(txn, deleteFromPoint);
    }
    _storageInterface->setOplogDeleteFromPoint(txn, {});  // clear the deleteFromPoint

    if (appliedThrough.isNull()) {
        // No follow-up work to do.
        return;
    }

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    const auto topOfOplog = fassertStatusOK(40290, loadLastOpTime(txn));
    if (appliedThrough >= topOfOplog) {
        return;  // We've applied all the valid oplog we have.
    }

    log() << "Replaying stored operations from " << appliedThrough << " (exclusive) to "
          << topOfOplog << " (inclusive).";

    DBDirectClient db(txn);
    auto cursor = db.query(rsOplogName,
                           QUERY("ts" << BSON("$gte" << appliedThrough.getTimestamp())),
                           /*batchSize*/ 0,
                           /*skip*/ 0,
                           /*projection*/ nullptr,
                           QueryOption_OplogReplay);

    // Check that the first document matches our appliedThrough point then skip it since it's
    // already been applied.
    if (!cursor->more()) {
        // This should really be impossible because we check above that the top of the oplog is
        // strictly > appliedThrough. If this fails it represents a serious bug in either the
        // storage engine or query's implementation of OplogReplay.
        severe() << "Couldn't find any entries in the oplog >= " << appliedThrough
                 << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }
    auto firstOpTimeFound =
        fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe()));
    if (firstOpTimeFound != appliedThrough) {
        severe() << "Oplog entry at " << appliedThrough << " is missing; actual entry found is "
                 << firstOpTimeFound;
        fassertFailedNoTrace(40292);
    }

    // Apply remaining ops one at a time, but don't log them because they are already logged.
    const bool wereWritesReplicated = txn->writesAreReplicated();
    ON_BLOCK_EXIT([&] { txn->setReplicatedWrites(wereWritesReplicated); });
    txn->setReplicatedWrites(false);

    while (cursor->more()) {
        auto entry = cursor->nextSafe();
        fassertStatusOK(40294, SyncTail::syncApply(txn, entry, true));
        _storageInterface->setAppliedThrough(
            txn, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry)));
    }
}
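// A minimal sketch, not part of the server source, that pulls the needToDeleteEndOfOplog
// predicate above out into a hypothetical free function so the three guards read in isolation.
// The function name is an assumption; the real code evaluates the condition inline.
bool shouldTruncateRaggedEnd(const Timestamp& deleteFromPoint, const OpTime& appliedThrough) {
    if (deleteFromPoint.isNull()) {
        return false;  // nothing was ever marked for deletion
    }
    if (appliedThrough.isNull()) {
        return false;  // downgrade scenario described above: the ragged end was already removed
    }
    // Only truncate entries we know we never applied.
    return appliedThrough.getTimestamp() < deleteFromPoint;
}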
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Milliseconds fetcherMaxTimeMS,
                                      Status* returnStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        LOG(2) << "Error returned from oplog query: " << result.getStatus();
        *returnStatus = result.getStatus();
        return;
    }

    if (inShutdown()) {
        LOG(2) << "Interrupted by shutdown while querying oplog. 1";  // 1st instance.
        return;
    }

    // Check if we have been stopped.
    if (isStopped()) {
        LOG(2) << "Interrupted by stop request while querying the oplog. 1";  // 1st instance.
        return;
    }

    const auto& queryResponse = result.getValue();
    bool syncSourceHasSyncSource = false;
    OpTime sourcesLastOpTime;

    // Forward metadata (containing liveness information) to replication coordinator.
    bool receivedMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedMetadata) {
        auto metadataResult =
            rpc::ReplSetMetadata::readFromMetadata(queryResponse.otherFields.metadata);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << source << ": "
                    << metadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
            return;
        }
        const auto& metadata = metadataResult.getValue();
        _replCoord->processReplSetMetadata(metadata);
        if (metadata.getPrimaryIndex() != rpc::ReplSetMetadata::kNoPrimary) {
            _replCoord->cancelAndRescheduleElectionTimeout();
        }
        syncSourceHasSyncSource = metadata.getSyncSourceIndex() != -1;
        sourcesLastOpTime = metadata.getLastOpVisible();
    }

    const auto& documents = queryResponse.documents;
    auto firstDocToApply = documents.cbegin();
    auto lastDocToApply = documents.cend();

    if (!documents.empty()) {
        LOG(2) << "fetcher read " << documents.size()
               << " operations from remote oplog starting at " << documents.front()["ts"]
               << " and ending at " << documents.back()["ts"];
    } else {
        LOG(2) << "fetcher read 0 operations from remote oplog";
    }

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&firstDocToApply, lastDocToApply]() -> StatusWith<BSONObj> {
            if (firstDocToApply == lastDocToApply) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(firstDocToApply++);
        };

        *returnStatus = checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!returnStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(firstDocToApply != documents.cbegin());
    }

    // No work to do if we are draining/primary.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        LOG(2) << "Interrupted by waiting for applier to drain "
               << "or becoming primary while querying the oplog. 1";  // 1st instance.
        return;
    }

    // The count of the bytes of the documents read off the network.
    int networkDocumentBytes = 0;
    Timestamp lastTS;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        // If we are stopped then return without queueing this batch to apply.
        if (_stopped) {
            LOG(2) << "Interrupted by stop request while querying the oplog. 2";  // 2nd instance.
            return;
        }
        lastTS = _lastOpTimeFetched.getTimestamp();
    }

    int count = 0;
    for (auto&& doc : documents) {
        networkDocumentBytes += doc.objsize();
        ++count;

        // If this is the first response (to the $gte query) then we already applied the first doc.
        if (queryResponse.first && count == 1) {
            continue;
        }

        // Check to see if the oplog entry goes back in time for this document.
        const auto docOpTime = OpTime::parseFromOplogEntry(doc);
        fassertStatusOK(34362, docOpTime.getStatus());  // entries must have a "ts" field.
        const auto docTS = docOpTime.getValue().getTimestamp();

        if (lastTS >= docTS) {
            *returnStatus = Status(
                ErrorCodes::OplogOutOfOrder,
                str::stream() << "Reading the oplog from " << source.toString()
                              << " returned out of order entries. lastTS: " << lastTS.toString()
                              << " outOfOrderTS:" << docTS.toString()
                              << " at count:" << count);
            return;
        }
        lastTS = docTS;
    }

    // These numbers are for the documents we will apply.
    auto toApplyDocumentCount = documents.size();
    auto toApplyDocumentBytes = networkDocumentBytes;
    if (queryResponse.first) {
        // The count is one less since the first document found was already applied ($gte $ts
        // query) and we will not apply it again. We just needed to check it so we didn't
        // rollback, or error above.
        --toApplyDocumentCount;
        const auto alreadyAppliedDocument = documents.cbegin();
        toApplyDocumentBytes -= alreadyAppliedDocument->objsize();
    }

    if (toApplyDocumentBytes > 0) {
        // Wait for enough space.
        _buffer.waitForSpace(toApplyDocumentBytes);

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        // Buffer docs for later application.
        std::vector<BSONObj> objs{firstDocToApply, lastDocToApply};
        _buffer.pushAllNonBlocking(objs);

        // Inc stats.
        opsReadStats.increment(documents.size());  // we read all of the docs in the query.
        networkByteStats.increment(networkDocumentBytes);
        bufferCountGauge.increment(toApplyDocumentCount);
        bufferSizeGauge.increment(toApplyDocumentBytes);

        // Update last fetched info.
        auto lastDoc = objs.back();
        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = lastDoc["h"].numberLong();
            _lastOpTimeFetched = fassertStatusOK(28770, OpTime::parseFromOplogEntry(lastDoc));
            LOG(3) << "batch resetting _lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }
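    // Worked example (hypothetical numbers) of the first-batch accounting above: if the first
    // response to the $gte query returns 5 documents totalling 900 bytes, and the already-applied
    // first document is 120 bytes, then toApplyDocumentCount = 5 - 1 = 4 and
    // toApplyDocumentBytes = 900 - 120 = 780. On later batches nothing is skipped, so both values
    // equal the raw network totals.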