Exemplo n.º 1
0
void BackgroundSync::_rollback(OperationContext* txn,
                               const HostAndPort& source,
                               stdx::function<DBClientBase*()> getConnection) {
    // Abort only when syncRollback detects we are in a unrecoverable state.
    // In other cases, we log the message contained in the error status and retry later.
    auto status = syncRollback(txn,
                               OplogInterfaceLocal(txn, rsOplogName),
                               RollbackSourceImpl(getConnection, source, rsOplogName),
                               _replCoord);
    if (status.isOK()) {
        // When the syncTail thread sees there is no new data by adding something to the buffer.
        _signalNoNewDataForApplier(txn);
        // Wait until the buffer is empty.
        // This is an indication that syncTail has removed the sentinal marker from the buffer
        // and reset its local lastAppliedOpTime via the replCoord.
        while (!_oplogBuffer->isEmpty()) {
            sleepmillis(10);
            if (inShutdown()) {
                return;
            }
        }

        // At this point we are about to leave rollback.  Before we do, wait for any writes done
        // as part of rollback to be durable, and then do any necessary checks that we didn't
        // wind up rolling back something illegal.  We must wait for the rollback to be durable
        // so that if we wind up shutting down uncleanly in response to something we rolled back
        // we know that we won't wind up right back in the same situation when we start back up
        // because the rollback wasn't durable.
        txn->recoveryUnit()->waitUntilDurable();

        // If we detected that we rolled back the shardIdentity document as part of this rollback
        // then we must shut down to clear the in-memory ShardingState associated with the
        // shardIdentity document.
        if (ShardIdentityRollbackNotifier::get(txn)->didRollbackHappen()) {
            severe()
                << "shardIdentity document rollback detected.  Shutting down to clear "
                   "in-memory sharding state.  Restarting this process should safely return it "
                   "to a healthy state";
            fassertFailedNoTrace(40276);
        }

        // It is now safe to clear the ROLLBACK state, which may result in the applier thread
        // transitioning to SECONDARY.  This is safe because the applier thread has now reloaded
        // the new rollback minValid from the database.
        if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK)
                      << " but found self in " << _replCoord->getMemberState();
        }
        return;
    }
    if (ErrorCodes::UnrecoverableRollbackError == status.code()) {
        fassertNoTrace(28723, status);
    }
    warning() << "rollback cannot proceed at this time (retrying later): " << redact(status);
}
Exemplo n.º 2
0
void BackgroundSync::_rollback(OperationContext* txn,
                               const HostAndPort& source,
                               stdx::function<DBClientBase*()> getConnection) {
    // Abort only when syncRollback detects we are in a unrecoverable state.
    // In other cases, we log the message contained in the error status and retry later.
    auto status = syncRollback(txn,
                               OplogInterfaceLocal(txn, rsOplogName),
                               RollbackSourceImpl(getConnection, source, rsOplogName),
                               _replCoord);
    if (status.isOK()) {
        // When the syncTail thread sees there is no new data by adding something to the buffer.
        _signalNoNewDataForApplier();
        // Wait until the buffer is empty.
        // This is an indication that syncTail has removed the sentinal marker from the buffer
        // and reset its local lastAppliedOpTime via the replCoord.
        while (!_buffer.empty()) {
            sleepmillis(10);
            if (inShutdown()) {
                return;
            }
        }

        // It is now safe to clear the ROLLBACK state, which may result in the applier thread
        // transitioning to SECONDARY.  This is safe because the applier thread has now reloaded
        // the new rollback minValid from the database.
        if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK)
                      << " but found self in " << _replCoord->getMemberState();
        }
        return;
    }
    if (ErrorCodes::UnrecoverableRollbackError == status.code()) {
        fassertNoTrace(28723, status);
    }
    warning() << "rollback cannot proceed at this time (retrying later): " << status;
}
Exemplo n.º 3
0
Status RollbackImpl::_transitionToRollback(OperationContext* opCtx) {
    invariant(opCtx);
    if (_isInShutdown()) {
        return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
    }

    log() << "transition to ROLLBACK";
    {
        Lock::GlobalWrite globalWrite(opCtx);

        auto status = _replicationCoordinator->setFollowerMode(MemberState::RS_ROLLBACK);
        if (!status.isOK()) {
            status.addContext(str::stream() << "Cannot transition from "
                                            << _replicationCoordinator->getMemberState().toString()
                                            << " to "
                                            << MemberState(MemberState::RS_ROLLBACK).toString());
            log() << status;
            return status;
        }
    }
    return Status::OK();
}
Exemplo n.º 4
0
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp());

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, oldestOpTimeSeen);
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromBSON(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));
            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Got a valid sync source.
        return;
    }  // while (true)
}
Exemplo n.º 5
0
void BackgroundSync::_produce(OperationContext* txn) {

    while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) {
        sleepmillis(0);
    }

    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (!_replCoord->isCatchingUp() &&
            (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary())) {
            return;
        }

        if (_inShutdown_inlock()) {
            return;
        }
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    SyncSourceResolverResponse syncSourceResp;
    SyncSourceResolver* syncSourceResolver;
    OpTime minValid;
    if (_replCoord->getMemberState().recovering()) {
        auto minValidSaved = StorageInterface::get(txn)->getMinValid(txn);
        if (minValidSaved > lastOpTimeFetched) {
            minValid = minValidSaved;
        }
    }
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
        _syncSourceResolver = stdx::make_unique<SyncSourceResolver>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            _replCoord,
            lastOpTimeFetched,
            minValid,
            [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; });
        syncSourceResolver = _syncSourceResolver.get();
    }
    // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls
    // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to
    // acquired before BackgroundSync's.
    // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because
    // we do not destroy this instance outside of this function.
    auto status = _syncSourceResolver->startup();
    if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) {
        return;
    }
    fassertStatusOK(40349, status);
    syncSourceResolver->join();
    syncSourceResolver = nullptr;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        _syncSourceResolver.reset();
    }

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        if (_replCoord->isCatchingUp()) {
            warning() << "Too stale to catch up.";
            log() << "Our newest OpTime : " << lastOpTimeFetched;
            log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
                  << " from " << syncSourceResp.getSyncSource();
            sleepsecs(1);
            return;
        }

        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode: " << status;
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        if (!_replCoord->isCatchingUp()) {
            _replCoord->signalUpstreamUpdater();
        }
    }

    // Set the applied point if unset. This is most likely the first time we've established a sync
    // source since stepping down or otherwise clearing the applied point. We need to set this here,
    // before the OplogWriter gets a chance to append to the oplog.
    if (StorageInterface::get(txn)->getAppliedThrough(txn).isNull()) {
        StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime());
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, _replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto executor = _replicationCoordinatorExternalState->getTaskExecutor();
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher = stdx::make_unique<OplogFetcher>(
            executor,
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString(rsOplogName),
            config,
            _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(),
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3),
            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << redact(fetcherReturnStatus);
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        if (_replCoord->isCatchingUp()) {
            warning() << "Rollback situation detected in catch-up mode; catch-up mode will end.";
            sleepsecs(1);
            return;
        }

        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kRollbackOplogSocketTimeout));
            };
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << redact(fetcherReturnStatus);

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        const auto minValid = StorageInterface::get(txn)->getMinValid(txn);
        if (lastApplied < minValid) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream() << "need to rollback, but in inconsistent state. "
                                                << "minvalid: "
                                                << minValid.toString()
                                                << " > our last optime: "
                                                << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher stopped querying remote oplog with error: "
                  << redact(fetcherReturnStatus);
    }
}
Exemplo n.º 6
0
void BackgroundSync::_produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol();
    // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election
    // timeout. This enables the sync source to communicate liveness of the primary to secondaries.
    // Under protocol version 0, use a default timeout of 2 seconds for awaitData.
    const Milliseconds fetcherMaxTimeMS(
        isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2));

    Status fetcherReturnStatus = Status::OK();
    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      fetcherMaxTimeMS,
                                      &fetcherReturnStatus);


    BSONObjBuilder cmdBob;
    cmdBob.append("find", nsToCollectionSubstring(rsOplogName));
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(Minutes(1)));  // 1 min initial find.

    BSONObjBuilder metadataBob;
    if (isV1ElectionProtocol) {
        cmdBob.append("term", _replCoord->getTerm());
        metadataBob.append(rpc::kReplSetMetadataFieldName, 1);
    }

    auto dbName = nsToDatabase(rsOplogName);
    auto cmdObj = cmdBob.obj();
    auto metadataObj = metadataBob.obj();
    Fetcher fetcher(&_threadPoolTaskExecutor,
                    source,
                    dbName,
                    cmdObj,
                    fetcherCallback,
                    metadataObj,
                    _replCoord->getConfig().getElectionTimeoutPeriod());

    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << cmdObj["filter"];
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            };
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream()
                                      << "need to rollback, but in inconsistent state. "
                                      << "minvalid: " << boundaries.end.toString()
                                      << " > our last optime: " << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc, long long term) {
    // Old versions set this even though they returned not "ok"
    _mismatch = doc[kMismatchFieldName].trueValue();
    if (_mismatch)
        return Status(ErrorCodes::InconsistentReplicaSetNames, "replica set name doesn't match.");

    // Old versions sometimes set the replica set name ("set") but ok:0
    const BSONElement replSetNameElement = doc[kReplSetFieldName];
    if (replSetNameElement.eoo()) {
        _setName.clear();
    } else if (replSetNameElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kReplSetFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found "
                                    << typeName(replSetNameElement.type()));
    } else {
        _setName = replSetNameElement.String();
    }

    if (_setName.empty() && !doc[kOkFieldName].trueValue()) {
        std::string errMsg = doc[kErrMsgFieldName].str();

        BSONElement errCodeElem = doc[kErrorCodeFieldName];
        if (errCodeElem.ok()) {
            if (!errCodeElem.isNumber())
                return Status(ErrorCodes::BadValue, "Error code is not a number!");

            int errorCode = errCodeElem.numberInt();
            return Status(ErrorCodes::Error(errorCode), errMsg);
        }
        return Status(ErrorCodes::UnknownError, errMsg);
    }

    const BSONElement hasDataElement = doc[kHasDataFieldName];
    _hasDataSet = !hasDataElement.eoo();
    _hasData = hasDataElement.trueValue();

    const BSONElement electionTimeElement = doc[kElectionTimeFieldName];
    if (electionTimeElement.eoo()) {
        _electionTimeSet = false;
    } else if (electionTimeElement.type() == bsonTimestamp) {
        _electionTimeSet = true;
        _electionTime = electionTimeElement.timestamp();
    } else if (electionTimeElement.type() == Date) {
        _electionTimeSet = true;
        _electionTime = Timestamp(electionTimeElement.date());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kElectionTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(electionTimeElement.type()));
    }

    const BSONElement timeElement = doc[kTimeFieldName];
    if (timeElement.eoo()) {
        _timeSet = false;
    } else if (timeElement.isNumber()) {
        _timeSet = true;
        _time = Seconds(timeElement.numberLong());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have a numeric type, but found type "
                                    << typeName(timeElement.type()));
    }

    _isReplSet = doc[kIsReplSetFieldName].trueValue();

    Status termStatus = bsonExtractIntegerField(doc, kTermFieldName, &_term);
    if (!termStatus.isOK() && termStatus != ErrorCodes::NoSuchKey) {
        return termStatus;
    }

    // In order to support both the 3.0(V0) and 3.2(V1) heartbeats we must parse the OpTime
    // field based on its type. If it is a Date, we parse it as the timestamp and use
    // initialize's term argument to complete the OpTime type. If it is an Object, then it's
    // V1 and we construct an OpTime out of its nested fields.
    const BSONElement opTimeElement = doc[kOpTimeFieldName];
    if (opTimeElement.eoo()) {
        _opTimeSet = false;
    } else if (opTimeElement.type() == bsonTimestamp) {
        _opTimeSet = true;
        _opTime = OpTime(opTimeElement.timestamp(), term);
    } else if (opTimeElement.type() == Date) {
        _opTimeSet = true;
        _opTime = OpTime(Timestamp(opTimeElement.date()), term);
    } else if (opTimeElement.type() == Object) {
        Status status = bsonExtractOpTimeField(doc, kOpTimeFieldName, &_opTime);
        _opTimeSet = true;
        // since a v1 OpTime was in the response, the member must be part of a replset
        _isReplSet = true;
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kOpTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(opTimeElement.type()));
    }

    const BSONElement electableElement = doc[kIsElectableFieldName];
    if (electableElement.eoo()) {
        _electableSet = false;
    } else {
        _electableSet = true;
        _electable = electableElement.trueValue();
    }

    const BSONElement memberStateElement = doc[kMemberStateFieldName];
    if (memberStateElement.eoo()) {
        _stateSet = false;
    } else if (memberStateElement.type() != NumberInt && memberStateElement.type() != NumberLong) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream()
                          << "Expected \"" << kMemberStateFieldName
                          << "\" field in response to replSetHeartbeat "
                             "command to have type NumberInt or NumberLong, but found type "
                          << typeName(memberStateElement.type()));
    } else {
        long long stateInt = memberStateElement.numberLong();
        if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
            return Status(ErrorCodes::BadValue,
                          str::stream()
                              << "Value for \"" << kMemberStateFieldName
                              << "\" in response to replSetHeartbeat is "
                                 "out of range; legal values are non-negative and no more than "
                              << MemberState::RS_MAX);
        }
        _stateSet = true;
        _state = MemberState(static_cast<int>(stateInt));
    }

    _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue();


    // Not required for the case of uninitialized members -- they have no config
    const BSONElement configVersionElement = doc[kConfigVersionFieldName];

    // If we have an optime then we must have a configVersion
    if (_opTimeSet && configVersionElement.eoo()) {
        return Status(ErrorCodes::NoSuchKey,
                      str::stream() << "Response to replSetHeartbeat missing required \""
                                    << kConfigVersionFieldName
                                    << "\" field even though initialized");
    }

    // If there is a "v" (config version) then it must be an int.
    if (!configVersionElement.eoo() && configVersionElement.type() != NumberInt) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigVersionFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type NumberInt, but found "
                                    << typeName(configVersionElement.type()));
    }
    _configVersion = configVersionElement.numberInt();

    const BSONElement hbMsgElement = doc[kHbMessageFieldName];
    if (hbMsgElement.eoo()) {
        _hbmsg.clear();
    } else if (hbMsgElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kHbMessageFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found " << typeName(hbMsgElement.type()));
    } else {
        _hbmsg = hbMsgElement.String();
    }

    const BSONElement syncingToElement = doc[kSyncSourceFieldName];
    if (syncingToElement.eoo()) {
        _syncingTo = HostAndPort();
    } else if (syncingToElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kSyncSourceFieldName
                                    << "\" field in response to replSetHeartbeat to "
                                       "have type String, but found "
                                    << typeName(syncingToElement.type()));
    } else {
        _syncingTo = HostAndPort(syncingToElement.String());
    }

    const BSONElement rsConfigElement = doc[kConfigFieldName];
    if (rsConfigElement.eoo()) {
        _configSet = false;
        _config = ReplicaSetConfig();
        return Status::OK();
    } else if (rsConfigElement.type() != Object) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigFieldName
                                    << "\" in response to replSetHeartbeat to have type "
                                       "Object, but found " << typeName(rsConfigElement.type()));
    }
    _configSet = true;

    return _config.initialize(rsConfigElement.Obj());
}
 MemberState TopologyCoordinatorImpl::getMemberState() const {
     // TODO
     return MemberState();
 }
Exemplo n.º 9
0
void BackgroundSync::_produce(OperationContext* opCtx) {
    if (MONGO_FAIL_POINT(stopReplProducer)) {
        // This log output is used in js tests so please leave it.
        log() << "bgsync - stopReplProducer fail point "
                 "enabled. Blocking until fail point is disabled.";

        // TODO(SERVER-27120): Remove the return statement and uncomment the while loop.
        // Currently we cannot block here or we prevent primaries from being fully elected since
        // we'll never call _signalNoNewDataForApplier.
        //        while (MONGO_FAIL_POINT(stopReplProducer) && !inShutdown()) {
        //            mongo::sleepsecs(1);
        //        }
        mongo::sleepsecs(1);
        return;
    }

    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_state != ProducerState::Running) {
            return;
        }
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    HostAndPort oldSource = _syncSourceHost;
    SyncSourceResolverResponse syncSourceResp;
    {
        const OpTime minValidSaved =
            _replicationProcess->getConsistencyMarkers()->getMinValid(opCtx);

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_state != ProducerState::Running) {
            return;
        }
        const auto requiredOpTime = (minValidSaved > _lastOpTimeFetched) ? minValidSaved : OpTime();
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
        _syncSourceResolver = stdx::make_unique<SyncSourceResolver>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            _replCoord,
            lastOpTimeFetched,
            requiredOpTime,
            [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; });
    }
    // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls
    // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to
    // acquired before BackgroundSync's.
    // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because
    // we do not destroy this instance outside of this function which is only called from a single
    // thread.
    auto status = _syncSourceResolver->startup();
    if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) {
        return;
    }
    fassertStatusOK(40349, status);
    _syncSourceResolver->join();
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceResolver.reset();
    }

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        if (_replCoord->getMemberState().primary()) {
            warning() << "Too stale to catch up.";
            log() << "Our newest OpTime : " << lastOpTimeFetched;
            log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
                  << " from " << syncSourceResp.getSyncSource();
            _replCoord->abortCatchupIfNeeded().transitional_ignore();
            return;
        }

        // We only need to mark ourselves as too stale once.
        if (_tooStale) {
            return;
        }

        // Mark yourself as too stale.
        _tooStale = true;

        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";

        // Activate maintenance mode and transition to RECOVERING.
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode: " << status;
        }
        status = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!status.isOK()) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState() << causedBy(status);
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            _syncSourceHost = syncSourceResp.getSyncSource();
            source = _syncSourceHost;
        }
        // If our sync source has not changed, it is likely caused by our heartbeat data map being
        // out of date. In that case we sleep for 1 second to reduce the amount we spin waiting
        // for our map to update.
        if (oldSource == source) {
            log() << "Chose same sync source candidate as last time, " << source
                  << ". Sleeping for 1 second to avoid immediately choosing a new sync source for "
                     "the same reason as last time.";
            sleepsecs(1);
        }
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    // If we find a good sync source after having gone too stale, disable maintenance mode so we can
    // transition to SECONDARY.
    if (_tooStale) {

        _tooStale = false;

        log() << "No longer too stale. Able to sync from " << _syncSourceHost;

        auto status = _replCoord->setMaintenanceMode(false);
        if (!status.isOK()) {
            warning() << "Failed to leave maintenance mode: " << status;
        }
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_state != ProducerState::Running) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
    }

    if (!_replCoord->getMemberState().primary()) {
        _replCoord->signalUpstreamUpdater();
    }

    // Set the applied point if unset. This is most likely the first time we've established a sync
    // source since stepping down or otherwise clearing the applied point. We need to set this here,
    // before the OplogWriter gets a chance to append to the oplog.
    if (_replicationProcess->getConsistencyMarkers()->getAppliedThrough(opCtx).isNull()) {
        _replicationProcess->getConsistencyMarkers()->setAppliedThrough(
            opCtx, _replCoord->getMyLastAppliedOpTime());
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, _replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto onOplogFetcherShutdownCallbackFn = [&fetcherReturnStatus](const Status& status) {
            fetcherReturnStatus = status;
        };
        // The construction of OplogFetcher has to be outside bgsync mutex, because it calls
        // replication coordinator.
        auto oplogFetcherPtr = stdx::make_unique<OplogFetcher>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString::kRsOplogNamespace,
            _replCoord->getConfig(),
            _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(),
            syncSourceResp.rbid,
            true /* requireFresherSyncSource */,
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3),
            onOplogFetcherShutdownCallbackFn,
            bgSyncOplogFetcherBatchSize);
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_state != ProducerState::Running) {
            return;
        }
        _oplogFetcher = std::move(oplogFetcherPtr);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    const auto logLevel = Command::testCommandsEnabled ? 0 : 1;
    LOG(logLevel) << "scheduling fetcher to read remote oplog on " << _syncSourceHost
                  << " starting at " << oplogFetcher->getFindQuery_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (getState() != ProducerState::Running) {
        log() << "Replication producer stopped after oplog fetcher finished returning a batch from "
                 "our sync source.  Abandoning this batch of oplog entries and re-evaluating our "
                 "sync source.";
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << redact(fetcherReturnStatus);
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing) {
        auto storageInterface = StorageInterface::get(opCtx);
        _runRollback(opCtx, fetcherReturnStatus, source, syncSourceResp.rbid, storageInterface);
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher stopped querying remote oplog with error: "
                  << redact(fetcherReturnStatus);
    }
}
Status ReplSetHeartbeatResponseV1::initialize(const BSONObj& doc) {
    Status status = bsonCheckOnlyHasFields("ReplSetHeartbeatResponse",
                                           doc,
                                           kLegalHeartbeatFieldNames);
    if (!status.isOK())
        return status;

    status = bsonExtractBooleanField(doc, kIsReplSetFieldName, &_isReplSet);
    if (!status.isOK())
        return status;

    status = bsonExtractStringField(doc, kReplSetFieldName, &_setName);
    if (!status.isOK())
        return status;

    long long stateInt;
    status = bsonExtractIntegerField(doc, kMemberStateFieldName, &stateInt);
    if (!status.isOK())
        return status;
    if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
        return Status(ErrorCodes::BadValue, str::stream() << "Value for \"" <<
                      kMemberStateFieldName << "\" in response to replSetHeartbeat is "
                      "out of range; legal values are non-negative and no more than " <<
                      MemberState::RS_MAX);
    }
    _state = MemberState(static_cast<int>(stateInt));

    // extracting the lastCommittedOp is a bit of a process
    BSONObj lastOpTime = doc[kLastOpTimeFieldName].Obj();
    Timestamp ts;
    status = bsonExtractTimestampField(lastOpTime, kOpTimeFieldName, &ts);
    if (!status.isOK())
        return status;
    long long term;
    status = bsonExtractIntegerField(lastOpTime, kTermFieldName, &term);
    if (!status.isOK())
        return status;
    _lastOpTime = OpTime(lastOpTime[kOpTimeFieldName].timestamp(),
                         lastOpTime[kTermFieldName].Long());


    status = bsonExtractStringField(doc, kSyncSourceFieldName, &_syncingTo);
    if (!status.isOK())
        return status;

    status = bsonExtractIntegerField(doc, kConfigVersionFieldName, &_configVersion);
    if (!status.isOK())
        return status;

    status = bsonExtractIntegerField(doc, kPrimaryIdFieldName, &_primaryId);
    if (!status.isOK())
        return status;

    status = bsonExtractIntegerField(doc, kTermFieldName, &_term);
    if (!status.isOK())
        return status;

    const BSONElement hasDataElement = doc[kHasDataFieldName];
    _hasDataSet = !hasDataElement.eoo();
    _hasData = hasDataElement.trueValue();

    const BSONElement rsConfigElement = doc[kConfigFieldName];
    if (rsConfigElement.eoo()) {
        _configSet = false;
        _config = ReplicaSetConfig();
        return Status::OK();
    }
    else if (rsConfigElement.type() != Object) {
        return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                      kConfigFieldName << "\" in response to replSetHeartbeat to have type "
                      "Object, but found " << typeName(rsConfigElement.type()));
    }
    _configSet = true;
    return _config.initialize(rsConfigElement.Obj());
}
Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc) {
    // Old versions set this even though they returned not "ok"
    _mismatch = doc[kMismatchFieldName].trueValue();
    if (_mismatch)
        return Status(ErrorCodes::InconsistentReplicaSetNames, "replica set name doesn't match.");

    // Old versions sometimes set the replica set name ("set") but ok:0
    const BSONElement replSetNameElement = doc[kReplSetFieldName];
    if (replSetNameElement.eoo()) {
        _setName.clear();
    } else if (replSetNameElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kReplSetFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found "
                                    << typeName(replSetNameElement.type()));
    } else {
        _setName = replSetNameElement.String();
    }

    if (_setName.empty() && !doc[kOkFieldName].trueValue()) {
        std::string errMsg = doc[kErrMsgFieldName].str();

        BSONElement errCodeElem = doc[kErrorCodeFieldName];
        if (errCodeElem.ok()) {
            if (!errCodeElem.isNumber())
                return Status(ErrorCodes::BadValue, "Error code is not a number!");

            int errorCode = errCodeElem.numberInt();
            return Status(ErrorCodes::Error(errorCode), errMsg);
        }
        return Status(ErrorCodes::UnknownError, errMsg);
    }

    const BSONElement hasDataElement = doc[kHasDataFieldName];
    _hasDataSet = !hasDataElement.eoo();
    _hasData = hasDataElement.trueValue();

    const BSONElement electionTimeElement = doc[kElectionTimeFieldName];
    if (electionTimeElement.eoo()) {
        _electionTimeSet = false;
    } else if (electionTimeElement.type() == Timestamp) {
        _electionTimeSet = true;
        _electionTime = electionTimeElement._opTime();
    } else if (electionTimeElement.type() == Date) {
        _electionTimeSet = true;
        _electionTime = OpTime(electionTimeElement.date());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kElectionTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(electionTimeElement.type()));
    }

    const BSONElement timeElement = doc[kTimeFieldName];
    if (timeElement.eoo()) {
        _timeSet = false;
    } else if (timeElement.isNumber()) {
        _timeSet = true;
        _time = Seconds(timeElement.numberLong());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have a numeric type, but found type "
                                    << typeName(timeElement.type()));
    }

    const BSONElement opTimeElement = doc[kOpTimeFieldName];
    if (opTimeElement.eoo()) {
        _opTimeSet = false;
    } else if (opTimeElement.type() == Timestamp) {
        _opTimeSet = true;
        _opTime = opTimeElement._opTime();
    } else if (opTimeElement.type() == Date) {
        _opTimeSet = true;
        _opTime = OpTime(opTimeElement.date());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kOpTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(opTimeElement.type()));
    }

    const BSONElement electableElement = doc[kIsElectableFieldName];
    if (electableElement.eoo()) {
        _electableSet = false;
    } else {
        _electableSet = true;
        _electable = electableElement.trueValue();
    }

    _isReplSet = doc[kIsReplSetFieldName].trueValue();

    const BSONElement memberStateElement = doc[kMemberStateFieldName];
    if (memberStateElement.eoo()) {
        _stateSet = false;
    } else if (memberStateElement.type() != NumberInt && memberStateElement.type() != NumberLong) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream()
                          << "Expected \"" << kMemberStateFieldName
                          << "\" field in response to replSetHeartbeat "
                             "command to have type NumberInt or NumberLong, but found type "
                          << typeName(memberStateElement.type()));
    } else {
        long long stateInt = memberStateElement.numberLong();
        if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
            return Status(ErrorCodes::BadValue,
                          str::stream()
                              << "Value for \"" << kMemberStateFieldName
                              << "\" in response to replSetHeartbeat is "
                                 "out of range; legal values are non-negative and no more than "
                              << MemberState::RS_MAX);
        }
        _stateSet = true;
        _state = MemberState(static_cast<int>(stateInt));
    }

    _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue();


    // Not required for the case of uninitialized members -- they have no config
    const BSONElement versionElement = doc[kConfigVersionFieldName];

    // If we have an optime then we must have a version
    if (_opTimeSet && versionElement.eoo()) {
        return Status(ErrorCodes::NoSuchKey,
                      str::stream() << "Response to replSetHeartbeat missing required \""
                                    << kConfigVersionFieldName
                                    << "\" field even though initialized");
    }

    // If there is a "v" (config version) then it must be an int.
    if (!versionElement.eoo() && versionElement.type() != NumberInt) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigVersionFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type NumberInt, but found "
                                    << typeName(versionElement.type()));
    }
    _version = versionElement.numberInt();

    const BSONElement hbMsgElement = doc[kHbMessageFieldName];
    if (hbMsgElement.eoo()) {
        _hbmsg.clear();
    } else if (hbMsgElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kHbMessageFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found " << typeName(hbMsgElement.type()));
    } else {
        _hbmsg = hbMsgElement.String();
    }

    const BSONElement syncingToElement = doc[kSyncSourceFieldName];
    if (syncingToElement.eoo()) {
        _syncingTo.clear();
    } else if (syncingToElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kSyncSourceFieldName
                                    << "\" field in response to replSetHeartbeat to "
                                       "have type String, but found "
                                    << typeName(syncingToElement.type()));
    } else {
        _syncingTo = syncingToElement.String();
    }

    const BSONElement rsConfigElement = doc[kConfigFieldName];
    if (rsConfigElement.eoo()) {
        _configSet = false;
        _config = ReplicaSetConfig();
        return Status::OK();
    } else if (rsConfigElement.type() != Object) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigFieldName
                                    << "\" in response to replSetHeartbeat to have type "
                                       "Object, but found " << typeName(rsConfigElement.type()));
    }
    _configSet = true;
    return _config.initialize(rsConfigElement.Obj());
}
Exemplo n.º 12
0
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      OpTime lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const OpTime sentinel(Milliseconds(curTimeMillis64()).total_seconds(), 0);
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched);

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            log() << "replSet error RS102 too stale to catch up";
            log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong();
            log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong();
            log() << "replSet "
                     "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, oldestOpTimeSeen);
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "replSet can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10 * 1000));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsoplog, Query()));
        BSONElement tsElem(remoteOldestOp["ts"]);
        if (tsElem.type() != Timestamp) {
            // This member's got a bad op in its oplog.
            warning() << "oplog invalid format on node " << candidate.toString();
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000));
            continue;
        }
        OpTime remoteOldOpTime = tsElem._opTime();

        if (!lastOpTimeFetched.isNull() && lastOpTimeFetched < remoteOldOpTime) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000));
            if (oldestOpTimeSeen > remoteOldOpTime) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Got a valid sync source.
        return;
    }  // while (true)
}
Exemplo n.º 13
0
    Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc) {
        const BSONElement electionTimeElement = doc[kElectionTimeFieldName];
        if (electionTimeElement.eoo()) {
            _electionTimeSet = false;
        }
        else if (electionTimeElement.type() == Timestamp) {
            _electionTimeSet = true;
            _electionTime = electionTimeElement._opTime();
        }
        else if (electionTimeElement.type() == Date) {
            _electionTime = true;
            _electionTime = OpTime(electionTimeElement.date());
        }
        else {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kElectionTimeFieldName << "\" field in response to replSetHeartbeat "
                          "command to have type Date or Timestamp, but found type " <<
                          typeName(electionTimeElement.type()));
        }

        const BSONElement timeElement = doc[kTimeFieldName];
        if (timeElement.eoo()) {
            _timeSet = false;
        }
        else if (timeElement.isNumber()) {
            _timeSet = true;
            _time = Seconds(timeElement.numberLong());
        }
        else {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kTimeFieldName << "\" field in reponse to replSetHeartbeat "
                          "command to have a numeric type, but found type " <<
                          typeName(timeElement.type()));
        }

        const BSONElement opTimeElement = doc[kOpTimeFieldName];
        if (opTimeElement.eoo()) {
            _opTimeSet = false;
        }
        else if (opTimeElement.type() == Timestamp) {
            _opTimeSet = true;
            _opTime = opTimeElement._opTime();
        }
        else if (opTimeElement.type() == Date) {
            _opTimeSet = true;
            _opTime = OpTime(opTimeElement.date());
        }
        else {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kOpTimeFieldName << "\" field in response to replSetHeartbeat "
                          "command to have type Date or Timestamp, but found type " <<
                          typeName(opTimeElement.type()));
        }

        const BSONElement electableElement = doc[kIsElectableFieldName];
        if (electableElement.eoo()) {
            _electableSet = false;
        }
        else {
            _electableSet = true;
            _electable = electableElement.trueValue();
        }

        _mismatch = doc[kMismatchFieldName].trueValue();
        _isReplSet = doc[kIsReplSetFieldName].trueValue();

        const BSONElement memberStateElement = doc[kMemberStateFieldName];
        if (memberStateElement.eoo()) {
            _stateSet = false;
        }
        else if (memberStateElement.type() != NumberInt &&
                 memberStateElement.type() != NumberLong) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kMemberStateFieldName << "\" field in response to replSetHeartbeat "
                          " command to have type NumberInt or NumberLong, but found type " <<
                          typeName(memberStateElement.type()));
        }
        else {
            long long stateInt = memberStateElement.numberLong();
            if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
                return Status(ErrorCodes::BadValue, str::stream() << "Value for \"" <<
                              kMemberStateFieldName << "\" in response to replSetHeartbeat is "
                              " out of range; legal values are non-negative and no more than " <<
                              MemberState::RS_MAX);
            }
            _state = MemberState(static_cast<int>(stateInt));
        }

        _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue();

        const BSONElement versionElement = doc[kConfigVersionFieldName];
        if (versionElement.eoo()) {
            return Status(ErrorCodes::NoSuchKey, str::stream() <<
                          "Response to replSetHeartbeat missing required \"" <<
                          kConfigVersionFieldName << " field");
        }
        if (versionElement.type() != NumberInt) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kConfigVersionFieldName <<
                          "\" field in response to replSetHeartbeat to have "
                          "type NumberInt, but found " << typeName(versionElement.type()));
        }
        _version = versionElement.numberInt();

        const BSONElement replSetNameElement = doc[kReplSetFieldName];
        if (replSetNameElement.eoo()) {
            return Status(ErrorCodes::NoSuchKey, str::stream() <<
                          "Response to replSetHeartbeat missing required \"" <<
                          kReplSetFieldName << "\" field");
        }
        if (replSetNameElement.type() != String) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kReplSetFieldName << "\" field in response to replSetHeartbeat to have "
                          "type String, but found " << typeName(replSetNameElement.type()));
        }
        _setName = replSetNameElement.String();

        const BSONElement hbMsgElement = doc[kHbMessageFieldName];
        if (hbMsgElement.eoo()) {
            _hbmsg.clear();
        }
        else if (hbMsgElement.type() != String) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kHbMessageFieldName << "\" field in response to replSetHeartbeat to have "
                          "type String, but found " << typeName(hbMsgElement.type()));
        }
        _hbmsg = hbMsgElement.String();

        const BSONElement syncingToElement = doc[kSyncSourceFieldName];
        if (syncingToElement.eoo()) {
            _syncingTo.clear();
        }
        else if (syncingToElement.type() != String) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kSyncSourceFieldName << "\" field in response to replSetHeartbeat to "
                          "have type String, but found " << typeName(syncingToElement.type()));
        }
        _syncingTo = syncingToElement.String();

        const BSONElement rsConfigElement = doc[kConfigFieldName];
        if (rsConfigElement.eoo()) {
            _configSet = false;
            _config = ReplicaSetConfig();
        }
        else if (rsConfigElement.type() != Object) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kConfigFieldName << "\" in response to replSetHeartbeat to have type "
                          "Object, but found " << typeName(rsConfigElement.type()));
        }
        _configSet = true;
        return _config.initialize(rsConfigElement.Obj());
    }
Exemplo n.º 14
0
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      const OpTime& requiredOpTime,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Date_t::now().toDurationSinceEpoch()),
                                      0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched);

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up -- entering maintenance mode";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            auto status = replCoord->setMaintenanceMode(true);
            if (!status.isOK()) {
                warning() << "Failed to transition into maintenance mode: " << status;
            }
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime =
            fassertStatusOK(28776, OpTime::parseFromOplogEntry(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));
            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Check if sync source contains required optime.
        if (!requiredOpTime.isNull()) {
            // This query is structured so that it is executed on the sync source using the oplog
            // start hack (oplogReplay=true and $gt/$gte predicate over "ts").
            auto ts = requiredOpTime.getTimestamp();
            tailingQuery(rsOplogName.c_str(), BSON("ts" << BSON("$gte" << ts << "$lte" << ts)));
            auto status = _compareRequiredOpTimeWithQueryResponse(requiredOpTime);
            if (!status.isOK()) {
                const auto blacklistDuration = Seconds(60);
                const auto until = Date_t::now() + blacklistDuration;
                warning() << "We cannot use " << candidate.toString()
                          << " as a sync source because it does not contain the necessary "
                             "operations for us to reach a consistent state: "
                          << status << " last fetched optime: " << lastOpTimeFetched
                          << ". required optime: " << requiredOpTime
                          << ". Blacklisting this sync source for " << blacklistDuration
                          << " until: " << until;
                resetConnection();
                replCoord->blacklistSyncSource(candidate, until);
                continue;
            }
            resetCursor();
        }

        // TODO: If we were too stale (recovering with maintenance mode on), then turn it off, to
        //       allow becoming secondary/etc.

        // Got a valid sync source.
        return;
    }  // while (true)
}