StatusWith<OpTime> ReplicationCoordinatorExternalStateImpl::loadLastOpTime(
            OperationContext* txn) {

        // TODO: handle WriteConflictExceptions below
        try {
            BSONObj oplogEntry;
            if (!Helpers::getLast(txn, rsOplogName.c_str(), oplogEntry)) {
                return StatusWith<OpTime>(
                        ErrorCodes::NoMatchingDocument,
                        str::stream() << "Did not find any entries in " << rsOplogName);
            }
            BSONElement tsElement = oplogEntry[tsFieldName];
            if (tsElement.eoo()) {
                return StatusWith<OpTime>(
                        ErrorCodes::NoSuchKey,
                        str::stream() << "Most recent entry in " << rsOplogName << " missing \"" <<
                        tsFieldName << "\" field");
            }
            if (tsElement.type() != bsonTimestamp) {
                return StatusWith<OpTime>(
                        ErrorCodes::TypeMismatch,
                        str::stream() << "Expected type of \"" << tsFieldName <<
                        "\" in most recent " << rsOplogName <<
                        " entry to have type Timestamp, but found " << typeName(tsElement.type()));
            }
            return StatusWith<OpTime>(extractOpTime(oplogEntry));
        }
        catch (const DBException& ex) {
            return StatusWith<OpTime>(ex.toStatus());
        }
    }
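For context, callers unwrap the StatusWith<OpTime> before using the value. A minimal, hypothetical caller-side sketch (the surrounding names and the way the optime is consumed are illustrative, not taken from this file):

// Illustrative only: '_externalState' and the follow-up use of the optime are assumptions.
StatusWith<OpTime> lastOpTime = _externalState->loadLastOpTime(txn);
if (!lastOpTime.isOK()) {
    // A node with an empty oplog reports ErrorCodes::NoMatchingDocument.
    warning() << "could not load the last optime from " << rsOplogName << ": "
              << lastOpTime.getStatus();
    return;
}
OpTime myLastOpTime = lastOpTime.getValue();
// ... e.g. use myLastOpTime as the starting point for sync source selection.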
Example #2
    bool BackgroundSync::_rollbackIfNeeded(OperationContext* txn, OplogReader& r) {
        string hn = r.conn()->getServerAddress();

        if (!r.more()) {
            try {
                BSONObj theirLastOp = r.getLastOp(rsOplogName.c_str());
                if (theirLastOp.isEmpty()) {
                    error() << "empty query result from " << hn << " oplog";
                    sleepsecs(2);
                    return true;
                }
                OpTime theirOpTime = extractOpTime(theirLastOp);
                if (theirOpTime < _lastOpTimeFetched) {
                    log() << "we are ahead of the sync source, will try to roll back";
                    syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
                    return true;
                }
                /* we're not ahead?  maybe our new query got fresher data.  best to come back and try again */
                log() << "syncTail condition 1";
                sleepsecs(1);
            }
            catch(DBException& e) {
                error() << "querying " << hn << ' ' << e.toString();
                sleepsecs(2);
            }
            return true;
        }

        BSONObj o = r.nextSafe();
        OpTime opTime = extractOpTime(o);
        long long hash = o["h"].numberLong();
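        // The tailing query starts at _lastOpTimeFetched (GTE), so the first document
        // returned should be exactly the op we fetched last; a different optime or
        // hash means the two oplogs have diverged and a rollback is required.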
        if ( opTime != _lastOpTimeFetched || hash != _lastFetchedHash ) {
            log() << "our last op time fetched: " << _lastOpTimeFetched;
            log() << "source's GTE: " << opTime;
            syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
            return true;
        }

        return false;
    }
Example #3
Status BackgroundSync::_checkRemoteOplogStart(
    stdx::function<StatusWith<BSONObj>()> getNextOperation) {
    auto result = getNextOperation();
    if (!result.isOK()) {
        // The GTE query from upstream returns nothing, so we're ahead of the upstream.
        return Status(ErrorCodes::RemoteOplogStale,
                      "we are ahead of the sync source, will try to roll back");
    }
    BSONObj o = result.getValue();
    OpTime opTime = extractOpTime(o);
    long long hash = o["h"].numberLong();
    if (opTime != _lastOpTimeFetched || hash != _lastFetchedHash) {
        return Status(ErrorCodes::OplogStartMissing,
                      str::stream() << "our last op time fetched: " << _lastOpTimeFetched.toString()
                      << ". source's GTE: " << opTime.toString());
    }
    return Status::OK();
}
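The getNextOperation callback decouples this check from how documents are fetched; Example #4 below builds it over the fetcher's first batch. A minimal stand-alone sketch of the same idea (the 'batch' container and how it gets populated are hypothetical):

// Sketch only: feed _checkRemoteOplogStart from an in-memory batch. The real
// code iterates Fetcher::QueryResponse::documents (see Example #4).
std::deque<BSONObj> batch;  // hypothetical: documents from the first query reply
auto getNextOperation = [&batch]() -> StatusWith<BSONObj> {
    if (batch.empty()) {
        return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
    }
    BSONObj doc = batch.front();
    batch.pop_front();
    return doc;
};
Status status = _checkRemoteOplogStart(getNextOperation);
if (!status.isOK()) {
    // RemoteOplogStale or OplogStartMissing: stop fetching and start rollback.
}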
Example #4
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();
    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count());

    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time.  this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        //
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to process with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", int(fetcherMaxTimeMS.count()));
}
Example #5
    void BackgroundSync::produce(OperationContext* txn) {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer && !inShutdownStrict()) {
                _appliedBufferCondition.wait(lock);
            }
            if (inShutdownStrict()) {
                return;
            }
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }


        // find a target to sync from the last optime fetched
        OpTime lastOpTimeFetched;
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
            _syncSourceHost = HostAndPort();
        }
        _syncSourceReader.resetConnection();
        _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            // no server found
            if (_syncSourceReader.getHost().empty()) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }
            lastOpTimeFetched = _lastOpTimeFetched;
            _syncSourceHost = _syncSourceReader.getHost();
            _replCoord->signalUpstreamUpdater();
        }

        _syncSourceReader.tailingQueryGTE(rsOplogName.c_str(), lastOpTimeFetched.getTimestamp());

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!_syncSourceReader.haveCursor()) {
            return;
        }

        if (_rollbackIfNeeded(txn, _syncSourceReader)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            if (!_syncSourceReader.moreInCurrentBatch()) {
                // Check some things periodically
                // (whenever we run out of items in the
                // current cursor batch)

                int bs = _syncSourceReader.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be 
                    // getting ops to write almost one at a time.  this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel 
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are 
                    // "caught up".
                    //
                    sleepmillis(SleepToAllowBatchingMillis);
                }

                // If we are transitioning to primary state, we need to leave
                // this loop in order to go into bgsync-pause mode.
                if (_replCoord->isWaitingForApplierToDrain() || 
                    _replCoord->getMemberState().primary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncSource()) {
                    return;
                }

                {
                    //record time for each getmore
                    TimerHolder batchTimer(&getmoreReplStats);
                    
                    // This calls receiveMore() on the oplogreader cursor.
                    // It can wait up to five seconds for more data.
                    _syncSourceReader.more();
                }
                networkByteStats.increment(_syncSourceReader.currentBatchMessageSize());

                if (!_syncSourceReader.moreInCurrentBatch()) {
                    // If there is still no data from upstream, check a few more things
                    // and then loop back for another pass at getting more data
                    {
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        if (_pause) {
                            return;
                        }
                    }

                    _syncSourceReader.tailCheck();
                    if( !_syncSourceReader.haveCursor() ) {
                        LOG(1) << "replSet end syncTail pass";
                        return;
                    }

                    continue;
                }
            }

            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (_replCoord->isWaitingForApplierToDrain() ||
                _replCoord->getMemberState().primary()) {
                LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
                return;
            }

            // At this point, we are guaranteed to have at least one thing to read out
            // of the oplogreader cursor.
            BSONObj o = _syncSourceReader.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
            }

            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));
            _buffer.push(o);

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastFetchedHash = o["h"].numberLong();
                _lastOpTimeFetched = extractOpTime(o);
                LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
            }
        }
    }
Example #6
    void OplogReader::connectToSyncSource(OperationContext* txn,
                                          const OpTime& lastOpTimeFetched,
                                          ReplicationCoordinator* replCoord) {
        const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0);
        const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
        OpTime oldestOpTimeSeen = sentinel;
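        // oldestOpTimeSeen tracks the minimum "oldest available op" across candidates
        // that were too stale to sync from. The sentinel (current wall-clock time with
        // a maximal second field) compares greater than the oldest entry of any remote
        // oplog, so it doubles as the "nothing seen yet" marker checked below.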

        invariant(conn() == NULL);

        while (true) {
            HostAndPort candidate = replCoord->chooseNewSyncSource();

            if (candidate.empty()) {
                if (oldestOpTimeSeen == sentinel) {
                    // If, in this invocation of connectToSyncSource(), we did not successfully 
                    // connect to any node ahead of us,
                    // we apparently have no sync sources to connect to.
                    // This situation is common; e.g. if there are no writes to the primary at
                    // the moment.
                    return;
                }

                // Connected to at least one member, but in all cases we were too stale to use them
                // as a sync source.
                error() << "too stale to catch up";
                log() << "our last optime : " << lastOpTimeFetched;
                log() << "oldest available is " << oldestOpTimeSeen;
                log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
                setMinValid(txn, oldestOpTimeSeen);
                bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
                if (!worked) {
                    warning() << "Failed to transition into "
                              << MemberState(MemberState::RS_RECOVERING)
                              << ". Current state: " << replCoord->getMemberState();
                }
                return;
            }

            if (!connect(candidate)) {
                LOG(2) << "can't connect to " << candidate.toString() <<
                    " to read operations";
                resetConnection();
                replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
                continue;
            }
            // Read the first (oldest) op and confirm that it's not newer than our last
            // fetched op. Otherwise, we have fallen off the back of that source's oplog.
            BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
            OpTime remoteOldOpTime = extractOpTime(remoteOldestOp);

            if (lastOpTimeFetched < remoteOldOpTime) {
                // We're too stale to use this sync source.
                resetConnection();
                replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(10));
                if (oldestOpTimeSeen > remoteOldOpTime) {
                    warning() << "we are too stale to use " << candidate.toString() << 
                        " as a sync source";
                    oldestOpTimeSeen = remoteOldOpTime;
                }
                continue;
            }

            // Got a valid sync source.
            return;
        } // while (true)
    }