Exemplo n.º 1
0
void SyncSourceResolver::_requiredOpTimeFetcherCallback(
    const StatusWith<Fetcher::QueryResponse>& queryResult,
    HostAndPort candidate,
    OpTime earliestOpTimeSeen,
    int rbid) {
    if (_isShuttingDown()) {
        _finishCallback(Status(ErrorCodes::CallbackCanceled,
                               str::stream() << "sync source resolver shut down while looking for "
                                                "required optime "
                                             << _requiredOpTime.toString()
                                             << " in candidate's oplog: "
                                             << candidate))
            .transitional_ignore();
        return;
    }

    if (ErrorCodes::CallbackCanceled == queryResult.getStatus()) {
        _finishCallback(queryResult.getStatus()).transitional_ignore();
        return;
    }

    if (!queryResult.isOK()) {
        // We got an error.
        const auto until = _taskExecutor->now() + kFetcherErrorBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to required optime fetcher error: '"
              << queryResult.getStatus() << "' for " << kFetcherErrorBlacklistDuration
              << " until: " << until << ". required optime: " << _requiredOpTime;
        _syncSourceSelector->blacklistSyncSource(candidate, until);

        _chooseAndProbeNextSyncSource(earliestOpTimeSeen).transitional_ignore();
        return;
    }

    const auto& queryResponse = queryResult.getValue();
    auto status = _compareRequiredOpTimeWithQueryResponse(queryResponse);
    if (!status.isOK()) {
        const auto until = _taskExecutor->now() + kNoRequiredOpTimeBlacklistDuration;
        warning() << "We cannot use " << candidate.toString()
                  << " as a sync source because it does not contain the necessary "
                     "operations for us to reach a consistent state: "
                  << status << " last fetched optime: " << _lastOpTimeFetched
                  << ". required optime: " << _requiredOpTime
                  << ". Blacklisting this sync source for " << kNoRequiredOpTimeBlacklistDuration
                  << " until: " << until;
        _syncSourceSelector->blacklistSyncSource(candidate, until);

        _chooseAndProbeNextSyncSource(earliestOpTimeSeen).transitional_ignore();
        return;
    }

    _finishCallback(candidate, rbid).ignore();
}
Exemplo n.º 2
0
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      const OpTime& requiredOpTime,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Date_t::now().toDurationSinceEpoch()),
                                      0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched);

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up -- entering maintenance mode";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            auto status = replCoord->setMaintenanceMode(true);
            if (!status.isOK()) {
                warning() << "Failed to transition into maintenance mode: " << status;
            }
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime =
            fassertStatusOK(28776, OpTime::parseFromOplogEntry(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));
            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Check if sync source contains required optime.
        if (!requiredOpTime.isNull()) {
            // This query is structured so that it is executed on the sync source using the oplog
            // start hack (oplogReplay=true and $gt/$gte predicate over "ts").
            auto ts = requiredOpTime.getTimestamp();
            tailingQuery(rsOplogName.c_str(), BSON("ts" << BSON("$gte" << ts << "$lte" << ts)));
            auto status = _compareRequiredOpTimeWithQueryResponse(requiredOpTime);
            if (!status.isOK()) {
                const auto blacklistDuration = Seconds(60);
                const auto until = Date_t::now() + blacklistDuration;
                warning() << "We cannot use " << candidate.toString()
                          << " as a sync source because it does not contain the necessary "
                             "operations for us to reach a consistent state: "
                          << status << " last fetched optime: " << lastOpTimeFetched
                          << ". required optime: " << requiredOpTime
                          << ". Blacklisting this sync source for " << blacklistDuration
                          << " until: " << until;
                resetConnection();
                replCoord->blacklistSyncSource(candidate, until);
                continue;
            }
            resetCursor();
        }

        // TODO: If we were too stale (recovering with maintenance mode on), then turn it off, to
        //       allow becoming secondary/etc.

        // Got a valid sync source.
        return;
    }  // while (true)
}