Example #1
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();
    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count());

    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time.  this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        //
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to proceed with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", int(fetcherMaxTimeMS.count()));
}
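
A note on the getNextOperation lambda in Example #1: it captures the batch iterator by reference, so checkRemoteOplogStart consumes exactly the documents it inspects and the processing loop resumes wherever the lambda stopped. The standalone sketch below illustrates that hand-off; it is not MongoDB code, and it substitutes std::optional<std::string> for StatusWith<BSONObj> and plain strings for oplog documents.

// Minimal sketch of the iterator-advancing lambda pattern from Example #1.
// std::nullopt plays the role of the OplogStartMissing status.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> documents{"op1", "op2", "op3"};
    auto it = documents.cbegin();
    const auto end = documents.cend();

    // Captures 'it' by reference: each call hands out the next document and
    // leaves the iterator advanced for whoever reads next.
    auto getNextOperation = [&it, end]() -> std::optional<std::string> {
        if (it == end) {
            return std::nullopt;  // batch exhausted
        }
        return *(it++);
    };

    // A checker pulls only the first document to validate the oplog start...
    if (auto first = getNextOperation()) {
        std::cout << "checked start: " << *first << "\n";
    }

    // ...and the caller's loop resumes from the advanced iterator, which is
    // exactly why Example #1 can invariant() that the iterator moved.
    for (; it != end; ++it) {
        std::cout << "buffering: " << *it << "\n";
    }
    return 0;
}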
Example #2
StatusWith<BSONObj> OplogFetcher::_onSuccessfulBatch(const Fetcher::QueryResponse& queryResponse) {

    // Stop fetching and return on fail point.
    // This fail point makes the oplog fetcher ignore the downloaded batch of operations and not
    // error out. The FailPointEnabled error will be caught by the AbstractOplogFetcher.
    if (MONGO_FAIL_POINT(stopReplProducer)) {
        return Status(ErrorCodes::FailPointEnabled, "stopReplProducer fail point is enabled");
    }

    const auto& documents = queryResponse.documents;
    auto firstDocToApply = documents.cbegin();

    if (!documents.empty()) {
        LOG(2) << "oplog fetcher read " << documents.size()
               << " operations from remote oplog starting at " << documents.front()["ts"]
               << " and ending at " << documents.back()["ts"];
    } else {
        LOG(2) << "oplog fetcher read 0 operations from remote oplog";
    }

    auto oqMetadataResult = parseOplogQueryMetadata(queryResponse);
    if (!oqMetadataResult.isOK()) {
        error() << "invalid oplog query metadata from sync source " << _getSource() << ": "
                << oqMetadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
        return oqMetadataResult.getStatus();
    }
    auto oqMetadata = oqMetadataResult.getValue();

    // This lastFetched value is the last OpTime from the previous batch.
    auto lastFetched = _getLastOpTimeFetched();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto remoteRBID = oqMetadata ? boost::make_optional(oqMetadata->getRBID()) : boost::none;
        auto remoteLastApplied =
            oqMetadata ? boost::make_optional(oqMetadata->getLastOpApplied()) : boost::none;
        auto status = checkRemoteOplogStart(documents,
                                            lastFetched,
                                            remoteLastApplied,
                                            _requiredRBID,
                                            remoteRBID,
                                            _requireFresherSyncSource);
        if (!status.isOK()) {
            // Stop oplog fetcher and execute rollback if necessary.
            return status;
        }

        LOG(1) << "oplog fetcher successfully fetched from " << _getSource();

        // If this is the first batch and no rollback is needed, we don't want to enqueue the
        // first document, so skip it.
        if (_startingPoint == StartingPoint::kSkipFirstDoc) {
            firstDocToApply++;
        }
    }

    auto validateResult =
        OplogFetcher::validateDocuments(documents, queryResponse.first, lastFetched.getTimestamp());
    if (!validateResult.isOK()) {
        return validateResult.getStatus();
    }
    auto info = validateResult.getValue();

    // Process replset metadata.  It is important that this happen after we've validated the
    // first batch, so we don't progress our knowledge of the commit point from a
    // response that triggers a rollback.
    rpc::ReplSetMetadata replSetMetadata;
    bool receivedReplMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedReplMetadata) {
        const auto& metadataObj = queryResponse.otherFields.metadata;
        auto metadataResult = rpc::ReplSetMetadata::readFromMetadata(metadataObj);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << _getSource() << ": "
                    << metadataResult.getStatus() << ": " << metadataObj;
            return metadataResult.getStatus();
        }
        replSetMetadata = metadataResult.getValue();

        // We will only ever have OplogQueryMetadata if we have ReplSetMetadata, so it is safe
        // to call processMetadata() in this if block.
        _dataReplicatorExternalState->processMetadata(replSetMetadata, oqMetadata);
    }

    // Increment stats. We read all of the docs in the query.
    opsReadStats.increment(info.networkDocumentCount);
    networkByteStats.increment(info.networkDocumentBytes);

    // Record time for each batch.
    getmoreReplStats.recordMillis(durationCount<Milliseconds>(queryResponse.elapsedMillis));

    // TODO: back pressure handling will be added in SERVER-23499.
    auto status = _enqueueDocumentsFn(firstDocToApply, documents.cend(), info);
    if (!status.isOK()) {
        return status;
    }

    if (_dataReplicatorExternalState->shouldStopFetching(
            _getSource(), replSetMetadata, oqMetadata)) {
        str::stream errMsg;
        errMsg << "sync source " << _getSource().toString();
        errMsg << " (config version: " << replSetMetadata.getConfigVersion();
        // If OplogQueryMetadata was provided, its values were used to determine if we should
        // stop fetching from this sync source.
        if (oqMetadata) {
            errMsg << "; last applied optime: " << oqMetadata->getLastOpApplied().toString();
            errMsg << "; sync source index: " << oqMetadata->getSyncSourceIndex();
            errMsg << "; primary index: " << oqMetadata->getPrimaryIndex();
        } else {
            errMsg << "; last visible optime: " << replSetMetadata.getLastOpVisible().toString();
            errMsg << "; sync source index: " << replSetMetadata.getSyncSourceIndex();
            errMsg << "; primary index: " << replSetMetadata.getPrimaryIndex();
        }
        errMsg << ") is no longer valid";
        return Status(ErrorCodes::InvalidSyncSource, errMsg);
    }

    auto lastCommittedWithCurrentTerm =
        _dataReplicatorExternalState->getCurrentTermAndLastCommittedOpTime();
    return makeGetMoreCommandObject(queryResponse.nss,
                                    queryResponse.cursorId,
                                    lastCommittedWithCurrentTerm,
                                    _getGetMoreMaxTime(),
                                    _batchSize);
}
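
Example #2 guards every use of oqMetadata because a sync source may not send oplog query metadata at all; the boost::make_optional(...) : boost::none ternaries at the top of the first-batch branch keep "not provided" distinct from any real value. Below is a standalone sketch of that pattern, using std::optional and an invented OplogQueryMetadata stand-in rather than the real rpc::OplogQueryMetadata.

// Sketch of the conditional-optional pattern from Example #2 (std::optional
// in place of boost::optional; this struct is a stand-in, not the real type).
#include <iostream>
#include <optional>

struct OplogQueryMetadata {
    int rbid;  // stand-in for getRBID()
};

int main() {
    // Pretend the response did include metadata; try std::nullopt here to
    // exercise the "absent" path.
    std::optional<OplogQueryMetadata> oqMetadata = OplogQueryMetadata{7};

    // Mirrors: oqMetadata ? boost::make_optional(oqMetadata->getRBID())
    //                     : boost::none;
    std::optional<int> remoteRBID =
        oqMetadata ? std::optional<int>(oqMetadata->rbid) : std::nullopt;

    if (remoteRBID) {
        std::cout << "remote rollback id: " << *remoteRBID << "\n";
    } else {
        std::cout << "sync source sent no oplog query metadata\n";
    }
    return 0;
}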
Example #3
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Milliseconds fetcherMaxTimeMS,
                                      Status* returnStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        LOG(2) << "Error returned from oplog query: " << result.getStatus();
        *returnStatus = result.getStatus();
        return;
    }

    if (inShutdown()) {
        LOG(2) << "Interrupted by shutdown while querying oplog. 1";  // 1st instance.
        return;
    }

    // Check if we have been stopped.
    if (isStopped()) {
        LOG(2) << "Interrupted by stop request while querying the oplog. 1";  // 1st instance.
        return;
    }

    const auto& queryResponse = result.getValue();
    bool syncSourceHasSyncSource = false;
    OpTime sourcesLastOpTime;

    // Forward metadata (containing liveness information) to replication coordinator.
    bool receivedMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedMetadata) {
        auto metadataResult =
            rpc::ReplSetMetadata::readFromMetadata(queryResponse.otherFields.metadata);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << source << ": "
                    << metadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
            return;
        }
        const auto& metadata = metadataResult.getValue();
        _replCoord->processReplSetMetadata(metadata);
        if (metadata.getPrimaryIndex() != rpc::ReplSetMetadata::kNoPrimary) {
            _replCoord->cancelAndRescheduleElectionTimeout();
        }
        syncSourceHasSyncSource = metadata.getSyncSourceIndex() != -1;
        sourcesLastOpTime = metadata.getLastOpVisible();
    }

    const auto& documents = queryResponse.documents;
    auto firstDocToApply = documents.cbegin();
    auto lastDocToApply = documents.cend();

    if (!documents.empty()) {
        LOG(2) << "fetcher read " << documents.size()
               << " operations from remote oplog starting at " << documents.front()["ts"]
               << " and ending at " << documents.back()["ts"];
    } else {
        LOG(2) << "fetcher read 0 operations from remote oplog";
    }

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&firstDocToApply, lastDocToApply]() -> StatusWith<BSONObj> {
            if (firstDocToApply == lastDocToApply) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(firstDocToApply++);
        };

        *returnStatus = checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!returnStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(firstDocToApply != documents.cbegin());
    }

    // No work to do if we are draining/primary.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        LOG(2) << "Interrupted by waiting for applier to drain "
               << "or becoming primary while querying the oplog. 1";  // 1st instance.
        return;
    }

    // The count of the bytes of the documents read off the network.
    int networkDocumentBytes = 0;
    Timestamp lastTS;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        // If we are stopped then return without queueing this batch to apply.
        if (_stopped) {
            LOG(2) << "Interrupted by stop request while querying the oplog. 2";  // 2nd instance.
            return;
        }
        lastTS = _lastOpTimeFetched.getTimestamp();
    }
    int count = 0;
    for (auto&& doc : documents) {
        networkDocumentBytes += doc.objsize();
        ++count;

        // If this is the first response (to the $gte query) then we already applied the first doc.
        if (queryResponse.first && count == 1) {
            continue;
        }

        // Check to see if the oplog entry goes back in time for this document.
        const auto docOpTime = OpTime::parseFromOplogEntry(doc);
        fassertStatusOK(34362, docOpTime.getStatus());  // entries must have a "ts" field.
        const auto docTS = docOpTime.getValue().getTimestamp();

        if (lastTS >= docTS) {
            *returnStatus = Status(
                ErrorCodes::OplogOutOfOrder,
                str::stream() << "Reading the oplog from " << source.toString()
                              << " returned out of order entries. lastTS: " << lastTS.toString()
                              << " outOfOrderTS: " << docTS.toString() << " at count: " << count);
            return;
        }
        lastTS = docTS;
    }

    // These numbers are for the documents we will apply.
    auto toApplyDocumentCount = documents.size();
    auto toApplyDocumentBytes = networkDocumentBytes;
    if (queryResponse.first) {
        // The count is one less since the first document found was already applied ($gte $ts query)
        // and we will not apply it again. We just needed to check it so we didn't rollback, or
        // error above.
        --toApplyDocumentCount;
        const auto alreadyAppliedDocument = documents.cbegin();
        toApplyDocumentBytes -= alreadyAppliedDocument->objsize();
    }

    if (toApplyDocumentBytes > 0) {
        // Wait for enough space.
        _buffer.waitForSpace(toApplyDocumentBytes);

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        // Buffer docs for later application.
        std::vector<BSONObj> objs{firstDocToApply, lastDocToApply};
        _buffer.pushAllNonBlocking(objs);

        // Inc stats.
        opsReadStats.increment(documents.size());  // we read all of the docs in the query.
        networkByteStats.increment(networkDocumentBytes);
        bufferCountGauge.increment(toApplyDocumentCount);
        bufferSizeGauge.increment(toApplyDocumentBytes);

        // Update last fetched info.
        auto lastDoc = objs.back();
        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = lastDoc["h"].numberLong();
            _lastOpTimeFetched = fassertStatusOK(28770, OpTime::parseFromOplogEntry(lastDoc));
            LOG(3) << "batch resetting _lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // ... (remainder of the callback elided)
}
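
The validation loop in Example #3 enforces that oplog timestamps are strictly increasing across and within batches; a single out-of-order entry invalidates the whole batch. Here is a minimal standalone version of that check, with plain integers standing in for Timestamp values.

// Sketch of the OplogOutOfOrder check from Example #3. Integers stand in for
// Timestamp; the real code compares OpTimes parsed from BSON oplog entries.
#include <iostream>
#include <vector>

int main() {
    // 103 arrives after 105, so the batch must be rejected.
    std::vector<long long> batchTimestamps{100, 101, 105, 103};
    long long lastTS = 99;  // last timestamp fetched in the previous batch

    int count = 0;
    for (long long docTS : batchTimestamps) {
        ++count;
        if (lastTS >= docTS) {  // strictly increasing, so >= is out of order
            std::cerr << "out of order entries. lastTS: " << lastTS
                      << " outOfOrderTS: " << docTS << " at count: " << count << "\n";
            return 1;  // Example #3 returns ErrorCodes::OplogOutOfOrder here
        }
        lastTS = docTS;
    }
    std::cout << "batch ordering ok\n";
    return 0;
}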
Example #4
void OplogFetcher::_callback(const Fetcher::QueryResponseStatus& result,
                             BSONObjBuilder* getMoreBob) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        LOG(2) << "Error returned from oplog query: " << result.getStatus();
        _onShutdown(result.getStatus());
        return;
    }

    const auto& queryResponse = result.getValue();
    rpc::ReplSetMetadata metadata;

    // Forward metadata (containing liveness information) to data replicator external state.
    bool receivedMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedMetadata) {
        const auto& metadataObj = queryResponse.otherFields.metadata;
        auto metadataResult = rpc::ReplSetMetadata::readFromMetadata(metadataObj);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << _fetcher.getSource()
                    << ": " << metadataResult.getStatus() << ": " << metadataObj;
            _onShutdown(metadataResult.getStatus());
            return;
        }
        metadata = metadataResult.getValue();
        _dataReplicatorExternalState->processMetadata(metadata);
    }

    const auto& documents = queryResponse.documents;
    auto firstDocToApply = documents.cbegin();

    if (!documents.empty()) {
        LOG(2) << "oplog fetcher read " << documents.size()
               << " operations from remote oplog starting at " << documents.front()["ts"]
               << " and ending at " << documents.back()["ts"];
    } else {
        LOG(2) << "oplog fetcher read 0 operations from remote oplog";
    }

    auto opTimeWithHash = getLastOpTimeWithHashFetched();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto status = checkRemoteOplogStart(documents, opTimeWithHash);
        if (!status.isOK()) {
            // Stop oplog fetcher and execute rollback.
            _onShutdown(status, opTimeWithHash);
            return;
        }

        // If this is the first batch and no rollback is needed, skip the first document.
        firstDocToApply++;
    }

    auto validateResult = OplogFetcher::validateDocuments(
        documents, queryResponse.first, opTimeWithHash.opTime.getTimestamp());
    if (!validateResult.isOK()) {
        _onShutdown(validateResult.getStatus(), opTimeWithHash);
        return;
    }
    auto info = validateResult.getValue();

    // TODO: back pressure handling will be added in SERVER-23499.
    _enqueueDocumentsFn(firstDocToApply, documents.cend(), info, queryResponse.elapsedMillis);

    // Update last fetched info.
    if (firstDocToApply != documents.cend()) {
        opTimeWithHash = info.lastDocument;
        LOG(3) << "batch resetting last fetched optime: " << opTimeWithHash.opTime
               << "; hash: " << opTimeWithHash.value;

        stdx::unique_lock<stdx::mutex> lock(_mutex);
        _lastFetched = opTimeWithHash;
    }

    if (_dataReplicatorExternalState->shouldStopFetching(_fetcher.getSource(), metadata)) {
        _onShutdown(Status(ErrorCodes::InvalidSyncSource,
                           str::stream() << "sync source " << _fetcher.getSource().toString()
                                         << " (last optime: "
                                         << metadata.getLastOpVisible().toString()
                                         << "; sync source index: "
                                         << metadata.getSyncSourceIndex()
                                         << "; primary index: "
                                         << metadata.getPrimaryIndex()
                                         << ") is no longer valid"),
                    opTimeWithHash);
        return;
    }

    // No more data. Stop processing and return Status::OK along with last
    // fetch info.
    if (!getMoreBob) {
        _onShutdown(Status::OK(), opTimeWithHash);
        return;
    }

    getMoreBob->appendElements(makeGetMoreCommandObject(_dataReplicatorExternalState,
                                                        queryResponse.nss,
                                                        queryResponse.cursorId,
                                                        _awaitDataTimeout));
}
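
All four examples follow the same Fetcher callback contract: appending a getMore command to the builder asks the fetcher to schedule another batch, while leaving the builder empty (or being handed a null pointer, as in Example #4) ends the fetch loop. The standalone sketch below shows a driver loop honoring that contract; CommandBuilder and runFetchLoop are invented stand-ins, not MongoDB's Fetcher API.

// Sketch of the "fill the builder to continue" contract shared by the
// examples above. Everything here is a stand-in for the real Fetcher types.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct CommandBuilder {  // stand-in for BSONObjBuilder
    std::vector<std::pair<std::string, long long>> fields;
    void append(const std::string& name, long long value) {
        fields.emplace_back(name, value);
    }
};

// Drives a callback the way Fetcher does: keep issuing getMores while the
// callback fills in the builder; stop as soon as it leaves the builder empty.
template <typename Callback>
void runFetchLoop(Callback cb) {
    for (long long batch = 1;; ++batch) {
        CommandBuilder bob;
        cb(batch, &bob);
        if (bob.fields.empty()) {
            std::cout << "callback left builder empty; fetch loop ends\n";
            return;
        }
        std::cout << "scheduling '" << bob.fields.front().first
                  << "' after batch " << batch << "\n";
    }
}

int main() {
    runFetchLoop([](long long batch, CommandBuilder* bob) {
        if (batch < 3) {
            bob->append("getMore", 12345);  // made-up cursor id
        }
        // On the third batch we decline to append, which stops fetching.
    });
    return 0;
}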