// Fetcher callback for one oplog query batch: validates the start of the remote
// oplog on the first batch (signalling rollback via 'remoteOplogStartStatus' if
// it does not line up with 'lastOpTimeFetched'/'lastFetchedHash'), pushes each
// fetched document into the bgsync buffer while updating fetch bookkeeping, and
// finally fills 'bob' with a getMore command to request the next batch.
// Returning WITHOUT filling 'bob' stops the fetcher.
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }
    if (inShutdown()) {
        return;
    }
    // Check if we have been paused.
    if (isPaused()) {
        return;
    }
    const auto& queryResponse = result.getValue();
    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        // The lambda captures 'documentBegin' by reference so that consuming the first
        // document inside checkRemoteOplogStart() advances the iterator seen below.
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };
        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }
        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }
        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }
        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();
        // Fail point: artificially stall the producer to exercise stepdown-while-draining.
        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }
        {
            // Mark the buffer dirty under the lock before enqueueing the new op.
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }
        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }
        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);
        {
            // Record this op's hash and optime as the latest fetched, under the lock.
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count());
    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time. this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        //
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to process with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    // NOTE(review): 'fetcherMaxTimeMS' is not a parameter of this overload — presumably a
    // member or file-scope constant; verify it is in scope at this definition.
    bob->append("maxTimeMS", int(fetcherMaxTimeMS.count()));
}
// Processes one successful oplog query batch: parses oplog-query metadata, checks the
// start of the remote oplog on the first batch (returning a non-OK status to trigger
// rollback when needed), validates and enqueues the documents, and returns the getMore
// command object for the next batch. Any non-OK return stops the fetcher; the status
// is surfaced through the AbstractOplogFetcher machinery.
StatusWith<BSONObj> OplogFetcher::_onSuccessfulBatch(const Fetcher::QueryResponse& queryResponse) {
    // Stop fetching and return on fail point.
    // This fail point makes the oplog fetcher ignore the downloaded batch of operations and not
    // error out. The FailPointEnabled error will be caught by the AbstractOplogFetcher.
    if (MONGO_FAIL_POINT(stopReplProducer)) {
        return Status(ErrorCodes::FailPointEnabled, "stopReplProducer fail point is enabled");
    }
    const auto& documents = queryResponse.documents;
    auto firstDocToApply = documents.cbegin();
    if (!documents.empty()) {
        LOG(2) << "oplog fetcher read " << documents.size()
               << " operations from remote oplog starting at " << documents.front()["ts"]
               << " and ending at " << documents.back()["ts"];
    } else {
        LOG(2) << "oplog fetcher read 0 operations from remote oplog";
    }
    auto oqMetadataResult = parseOplogQueryMetadata(queryResponse);
    if (!oqMetadataResult.isOK()) {
        error() << "invalid oplog query metadata from sync source " << _getSource() << ": "
                << oqMetadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
        return oqMetadataResult.getStatus();
    }
    // 'oqMetadata' is optional — downstream code tolerates its absence.
    auto oqMetadata = oqMetadataResult.getValue();

    // This lastFetched value is the last OpTime from the previous batch.
    auto lastFetched = _getLastOpTimeFetched();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto remoteRBID = oqMetadata ? boost::make_optional(oqMetadata->getRBID()) : boost::none;
        auto remoteLastApplied =
            oqMetadata ? boost::make_optional(oqMetadata->getLastOpApplied()) : boost::none;
        auto status = checkRemoteOplogStart(documents,
                                            lastFetched,
                                            remoteLastApplied,
                                            _requiredRBID,
                                            remoteRBID,
                                            _requireFresherSyncSource);
        if (!status.isOK()) {
            // Stop oplog fetcher and execute rollback if necessary.
            return status;
        }
        LOG(1) << "oplog fetcher successfully fetched from " << _getSource();
        // If this is the first batch, no rollback is needed and we don't want to enqueue the first
        // document, skip it.
        if (_startingPoint == StartingPoint::kSkipFirstDoc) {
            firstDocToApply++;
        }
    }
    auto validateResult = OplogFetcher::validateDocuments(
        documents, queryResponse.first, lastFetched.getTimestamp());
    if (!validateResult.isOK()) {
        return validateResult.getStatus();
    }
    auto info = validateResult.getValue();

    // Process replset metadata. It is important that this happen after we've validated the
    // first batch, so we don't progress our knowledge of the commit point from a
    // response that triggers a rollback.
    rpc::ReplSetMetadata replSetMetadata;
    bool receivedReplMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedReplMetadata) {
        const auto& metadataObj = queryResponse.otherFields.metadata;
        auto metadataResult = rpc::ReplSetMetadata::readFromMetadata(metadataObj);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << _getSource() << ": "
                    << metadataResult.getStatus() << ": " << metadataObj;
            return metadataResult.getStatus();
        }
        replSetMetadata = metadataResult.getValue();
        // We will only ever have OplogQueryMetadata if we have ReplSetMetadata, so it is safe
        // to call processMetadata() in this if block.
        _dataReplicatorExternalState->processMetadata(replSetMetadata, oqMetadata);
    }

    // Increment stats. We read all of the docs in the query.
    opsReadStats.increment(info.networkDocumentCount);
    networkByteStats.increment(info.networkDocumentBytes);

    // Record time for each batch.
    getmoreReplStats.recordMillis(durationCount<Milliseconds>(queryResponse.elapsedMillis));

    // TODO: back pressure handling will be added in SERVER-23499.
    auto status = _enqueueDocumentsFn(firstDocToApply, documents.cend(), info);
    if (!status.isOK()) {
        return status;
    }
    if (_dataReplicatorExternalState->shouldStopFetching(
            _getSource(), replSetMetadata, oqMetadata)) {
        // Build a detailed error describing why the sync source became invalid.
        str::stream errMsg;
        errMsg << "sync source " << _getSource().toString();
        errMsg << " (config version: " << replSetMetadata.getConfigVersion();
        // If OplogQueryMetadata was provided, its values were used to determine if we should
        // stop fetching from this sync source.
        if (oqMetadata) {
            errMsg << "; last applied optime: " << oqMetadata->getLastOpApplied().toString();
            errMsg << "; sync source index: " << oqMetadata->getSyncSourceIndex();
            errMsg << "; primary index: " << oqMetadata->getPrimaryIndex();
        } else {
            errMsg << "; last visible optime: " << replSetMetadata.getLastOpVisible().toString();
            errMsg << "; sync source index: " << replSetMetadata.getSyncSourceIndex();
            errMsg << "; primary index: " << replSetMetadata.getPrimaryIndex();
        }
        errMsg << ") is no longer valid";
        return Status(ErrorCodes::InvalidSyncSource, errMsg);
    }
    auto lastCommittedWithCurrentTerm =
        _dataReplicatorExternalState->getCurrentTermAndLastCommittedOpTime();
    return makeGetMoreCommandObject(queryResponse.nss,
                                    queryResponse.cursorId,
                                    lastCommittedWithCurrentTerm,
                                    _getGetMoreMaxTime(),
                                    _batchSize);
}
// Fetcher callback for one oplog query batch (stop/metadata-aware variant): forwards
// replset metadata to the replication coordinator, checks the start of the remote oplog
// on the first batch (reporting rollback via 'returnStatus'), verifies the batch is in
// timestamp order, then enqueues the applicable documents and updates last-fetched state.
// NOTE(review): this definition appears TRUNCATED in this chunk — the function's closing
// brace (and, presumably, the getMore re-request logic) is not visible; confirm against
// the full file before relying on this block's tail.
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Milliseconds fetcherMaxTimeMS,
                                      Status* returnStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        LOG(2) << "Error returned from oplog query: " << result.getStatus();
        *returnStatus = result.getStatus();
        return;
    }
    if (inShutdown()) {
        LOG(2) << "Interrupted by shutdown while querying oplog. 1";  // 1st instance.
        return;
    }
    // Check if we have been stopped.
    if (isStopped()) {
        LOG(2) << "Interrupted by stop request while querying the oplog. 1";  // 1st instance.
        return;
    }
    const auto& queryResponse = result.getValue();
    bool syncSourceHasSyncSource = false;
    OpTime sourcesLastOpTime;

    // Forward metadata (containing liveness information) to replication coordinator.
    bool receivedMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedMetadata) {
        auto metadataResult =
            rpc::ReplSetMetadata::readFromMetadata(queryResponse.otherFields.metadata);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << source << ": "
                    << metadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
            return;
        }
        const auto& metadata = metadataResult.getValue();
        _replCoord->processReplSetMetadata(metadata);
        // Seeing a primary in the metadata means an election is not needed right now.
        if (metadata.getPrimaryIndex() != rpc::ReplSetMetadata::kNoPrimary) {
            _replCoord->cancelAndRescheduleElectionTimeout();
        }
        syncSourceHasSyncSource = metadata.getSyncSourceIndex() != -1;
        sourcesLastOpTime = metadata.getLastOpVisible();
    }

    const auto& documents = queryResponse.documents;
    auto firstDocToApply = documents.cbegin();
    auto lastDocToApply = documents.cend();
    if (!documents.empty()) {
        // NOTE(review): the string literal below contains a raw line break — invalid in a
        // plain C++ string literal; likely meant " and ending at ". Looks like source
        // corruption from reformatting; confirm against version control.
        LOG(2) << "fetcher read " << documents.size()
               << " operations from remote oplog starting at " << documents.front()["ts"] << " 
and ending at " << documents.back()["ts"];
    } else {
        LOG(2) << "fetcher read 0 operations from remote oplog";
    }

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        // Capturing 'firstDocToApply' by reference lets checkRemoteOplogStart() consume the
        // first document, advancing the iterator used by the enqueue logic below.
        auto getNextOperation = [&firstDocToApply, lastDocToApply]() -> StatusWith<BSONObj> {
            if (firstDocToApply == lastDocToApply) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(firstDocToApply++);
        };
        *returnStatus = checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!returnStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }
        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(firstDocToApply != documents.cbegin());
    }

    // No work to do if we are draining/primary.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        LOG(2) << "Interrupted by waiting for applier to drain "
               << "or becoming primary while querying the oplog. 1";  // 1st instance.
        return;
    }

    // The count of the bytes of the documents read off the network.
    int networkDocumentBytes = 0;
    Timestamp lastTS;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        // If we are stopped then return without queueing this batch to apply.
        if (_stopped) {
            LOG(2) << "Interrupted by stop request while querying the oplog. 2";  // 2nd instance.
            return;
        }
        lastTS = _lastOpTimeFetched.getTimestamp();
    }
    int count = 0;
    // Verify every document's timestamp is strictly increasing relative to what we
    // last fetched, rejecting out-of-order oplog entries.
    for (auto&& doc : documents) {
        networkDocumentBytes += doc.objsize();
        ++count;
        // If this is the first response (to the $gte query) then we already applied the first doc.
        if (queryResponse.first && count == 1) {
            continue;
        }
        // Check to see if the oplog entry goes back in time for this document.
        const auto docOpTime = OpTime::parseFromOplogEntry(doc);
        fassertStatusOK(34362, docOpTime.getStatus());  // entries must have a "ts" field.
        const auto docTS = docOpTime.getValue().getTimestamp();
        if (lastTS >= docTS) {
            *returnStatus = Status(
                ErrorCodes::OplogOutOfOrder,
                str::stream() << "Reading the oplog from" << source.toString()
                              << " returned out of order entries. lastTS: " << lastTS.toString()
                              << " outOfOrderTS:" << docTS.toString()
                              << " at count:" << count);
            return;
        }
        lastTS = docTS;
    }

    // These numbers are for the documents we will apply.
    auto toApplyDocumentCount = documents.size();
    auto toApplyDocumentBytes = networkDocumentBytes;
    if (queryResponse.first) {
        // The count is one less since the first document found was already applied ($gte $ts query)
        // and we will not apply it again. We just needed to check it so we didn't rollback, or
        // error above.
        --toApplyDocumentCount;
        const auto alreadyAppliedDocument = documents.cbegin();
        toApplyDocumentBytes -= alreadyAppliedDocument->objsize();
    }
    if (toApplyDocumentBytes > 0) {
        // Wait for enough space.
        _buffer.waitForSpace(toApplyDocumentBytes);
        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }
        // Buffer docs for later application.
        std::vector<BSONObj> objs{firstDocToApply, lastDocToApply};
        _buffer.pushAllNonBlocking(objs);
        // Inc stats.
        opsReadStats.increment(documents.size());  // we read all of the docs in the query.
        networkByteStats.increment(networkDocumentBytes);
        bufferCountGauge.increment(toApplyDocumentCount);
        bufferSizeGauge.increment(toApplyDocumentBytes);
        // Update last fetched info.
        auto lastDoc = objs.back();
        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = lastDoc["h"].numberLong();
            _lastOpTimeFetched = fassertStatusOK(28770, OpTime::parseFromOplogEntry(lastDoc));
            LOG(3) << "batch resetting _lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }
    // NOTE(review): the remainder of this function (including its closing brace) is not
    // present in this chunk.
// Fetcher callback for one oplog query batch: forwards replset metadata to the data
// replicator external state, checks the start of the remote oplog on the first batch
// (shutting down with a rollback-triggering status on mismatch), validates and enqueues
// the documents, updates last-fetched bookkeeping, and either fills 'getMoreBob' to
// request the next batch or calls _onShutdown() to terminate with the final status.
void OplogFetcher::_callback(const Fetcher::QueryResponseStatus& result,
                             BSONObjBuilder* getMoreBob) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        LOG(2) << "Error returned from oplog query: " << result.getStatus();
        _onShutdown(result.getStatus());
        return;
    }
    const auto& queryResponse = result.getValue();
    rpc::ReplSetMetadata metadata;

    // Forward metadata (containing liveness information) to data replicator external state.
    bool receivedMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedMetadata) {
        const auto& metadataObj = queryResponse.otherFields.metadata;
        auto metadataResult = rpc::ReplSetMetadata::readFromMetadata(metadataObj);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << _fetcher.getSource()
                    << ": " << metadataResult.getStatus() << ": " << metadataObj;
            _onShutdown(metadataResult.getStatus());
            return;
        }
        metadata = metadataResult.getValue();
        _dataReplicatorExternalState->processMetadata(metadata);
    }

    const auto& documents = queryResponse.documents;
    auto firstDocToApply = documents.cbegin();
    if (!documents.empty()) {
        LOG(2) << "oplog fetcher read " << documents.size()
               << " operations from remote oplog starting at " << documents.front()["ts"]
               << " and ending at " << documents.back()["ts"];
    } else {
        LOG(2) << "oplog fetcher read 0 operations from remote oplog";
    }
    auto opTimeWithHash = getLastOpTimeWithHashFetched();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto status = checkRemoteOplogStart(documents, opTimeWithHash);
        if (!status.isOK()) {
            // Stop oplog fetcher and execute rollback.
            _onShutdown(status, opTimeWithHash);
            return;
        }
        // If this is the first batch and no rollback is needed, skip the first document.
        // (The first doc was already applied — it matched the last fetched optime above.)
        firstDocToApply++;
    }
    auto validateResult = OplogFetcher::validateDocuments(
        documents, queryResponse.first, opTimeWithHash.opTime.getTimestamp());
    if (!validateResult.isOK()) {
        _onShutdown(validateResult.getStatus(), opTimeWithHash);
        return;
    }
    auto info = validateResult.getValue();

    // TODO: back pressure handling will be added in SERVER-23499.
    _enqueueDocumentsFn(firstDocToApply, documents.cend(), info, queryResponse.elapsedMillis);

    // Update last fetched info.
    if (firstDocToApply != documents.cend()) {
        opTimeWithHash = info.lastDocument;
        LOG(3) << "batch resetting last fetched optime: " << opTimeWithHash.opTime
               << "; hash: " << opTimeWithHash.value;
        // Publish the new last-fetched position under the lock.
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        _lastFetched = opTimeWithHash;
    }

    if (_dataReplicatorExternalState->shouldStopFetching(_fetcher.getSource(), metadata)) {
        _onShutdown(Status(ErrorCodes::InvalidSyncSource,
                           str::stream() << "sync source " << _fetcher.getSource().toString()
                                         << " (last optime: "
                                         << metadata.getLastOpVisible().toString()
                                         << "; sync source index: "
                                         << metadata.getSyncSourceIndex()
                                         << "; primary index: " << metadata.getPrimaryIndex()
                                         << ") is no longer valid"),
                    opTimeWithHash);
        return;
    }

    // No more data. Stop processing and return Status::OK along with last
    // fetch info.
    if (!getMoreBob) {
        _onShutdown(Status::OK(), opTimeWithHash);
        return;
    }
    getMoreBob->appendElements(makeGetMoreCommandObject(_dataReplicatorExternalState,
                                                        queryResponse.nss,
                                                        queryResponse.cursorId,
                                                        _awaitDataTimeout));
}