Example #1
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp());

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, oldestOpTimeSeen);
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromBSON(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));
            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Got a valid sync source.
        return;
    }  // while (true)
}
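
The sentinel here marks "no usable source seen yet": it is built from the current wall-clock time plus the maximum term, so any real optime reported by a peer compares below it. Below is a self-contained sketch of the same bookkeeping, with an illustrative FakeOpTime pair standing in for OpTime (none of these names are from the MongoDB tree):

#include <cstdint>
#include <ctime>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

// Illustrative stand-in for OpTime: (timestamp seconds, term).
using FakeOpTime = std::pair<uint64_t, long long>;

int main() {
    // Sentinel: current wall-clock seconds plus the maximum term, so any real
    // optime observed from a peer compares less than it.
    const FakeOpTime sentinel{static_cast<uint64_t>(std::time(nullptr)),
                              std::numeric_limits<long long>::max()};
    FakeOpTime oldestOpTimeSeen = sentinel;

    // Oldest ops reported by the peers we managed to contact but were too
    // stale to sync from.
    std::vector<FakeOpTime> remoteOldestOps = {{1699990000ULL, 1}, {1699980000ULL, 1}};
    for (const auto& remoteOldest : remoteOldestOps) {
        if (remoteOldest < oldestOpTimeSeen) {
            oldestOpTimeSeen = remoteOldest;  // track the minimum oldest-available optime
        }
    }

    if (oldestOpTimeSeen == sentinel) {
        std::cout << "no sync source contacted; nothing to do\n";
    } else {
        std::cout << "all sources too stale; oldest available ts = "
                  << oldestOpTimeSeen.first << "\n";
    }
}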
Example #2
File: sync_tail.cpp  Project: Jaryli/mongo
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    ApplyBatchFinalizer finalizer(replCoord);

    OperationContextImpl txn;
    OpTime originalEndOpTime(getMinValid(&txn).end);

    while (!inShutdown()) {
        OpQueue ops;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }
            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back();
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) &&  // tryPopAndWaitForMore returns
                                                                  // true when we need to end a
                                                                  // batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
            if (inShutdown())
                return;
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj lastOp = ops.back();
        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we crash and restart before the batch finishes applying.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
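
The minValid bookkeeping above takes end = max(originalEndOpTime, lastOpTime) so the point at which the node may become consistent never moves backwards, even when smaller batches are replayed after a restart. A minimal sketch of that arithmetic, with hypothetical integer optimes:

#include <algorithm>
#include <iostream>

int main() {
    // Hypothetical optimes as plain integers.
    int originalEndOpTime = 40;  // recovered end of a previously failed batch

    // Replaying smaller batches after a restart must not regress the end point.
    for (int lastOpTime : {25, 45}) {
        const int end = std::max(originalEndOpTime, lastOpTime);
        std::cout << "batch ends at " << lastOpTime
                  << ", minValid end = " << end << "\n";
        originalEndOpTime = end;
    }
    // Prints end = 40 for the 20-25 batch (no regression), then end = 45.
}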
Example #3
    void syncFixUp(OperationContext* txn,
                   FixUpInfo& fixUpInfo,
                   OplogReader* oplogreader,
                   ReplicationCoordinator* replCoord) {
        DBClientConnection* them = oplogreader->conn();

        // fetch all first so we needn't handle interruption in a fancy way

        unsigned long long totalSize = 0;

        list< pair<DocID, BSONObj> > goodVersions;

        BSONObj newMinValid;

        // fetch all the goodVersions of each document from current primary
        DocID doc;
        unsigned long long numFetched = 0;
        try {
            for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin();
                    it != fixUpInfo.toRefetch.end();
                    it++) {
                doc = *it;

                verify(!doc._id.eoo());

                {
                    // TODO : slow.  lots of round trips.
                    numFetched++;
                    BSONObj good = them->findOne(doc.ns, doc._id.wrap(),
                                                     NULL, QueryOption_SlaveOk).getOwned();
                    totalSize += good.objsize();
                    uassert(13410, "replSet too much data to roll back",
                            totalSize < 300 * 1024 * 1024);

                    // note good might be eoo, indicating we should delete it
                    goodVersions.push_back(pair<DocID, BSONObj>(doc,good));
                }
            }
            newMinValid = oplogreader->getLastOp(rsOplogName);
            if (newMinValid.isEmpty()) {
                error() << "rollback error newMinValid empty?";
                return;
            }
        }
        catch (DBException& e) {
            LOG(1) << "rollback re-get objects: " << e.toString();
            error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' '
                    << numFetched << '/' << fixUpInfo.toRefetch.size();
            throw e;
        }

        log() << "rollback 3.5";
        if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
            // our source rolled back itself.  so the data we received isn't necessarily consistent.
            warning() << "rollback rbid on source changed during rollback, cancelling this attempt";
            return;
        }

        // update them
        log() << "rollback 4 n:" << goodVersions.size();

        bool warn = false;

        invariant(!fixUpInfo.commonPointOurDiskloc.isNull());
        invariant(txn->lockState()->isW());

        // we have items we are writing that aren't from a point-in-time.  thus best not to come
        // online until we get to that point in freshness.
        Timestamp minValid = newMinValid["ts"].timestamp();
        log() << "minvalid=" << minValid.toStringLong();
        setMinValid(txn, minValid);

        // any full collection resyncs required?
        if (!fixUpInfo.collectionsToResyncData.empty()
                || !fixUpInfo.collectionsToResyncMetadata.empty()) {

            for (const string& ns : fixUpInfo.collectionsToResyncData) {
                log() << "rollback 4.1.1 coll resync " << ns;

                fixUpInfo.collectionsToResyncMetadata.erase(ns);

                const NamespaceString nss(ns);

                Database* db = dbHolder().openDb(txn, nss.db().toString());
                invariant(db);

                {
                    WriteUnitOfWork wunit(txn);
                    db->dropCollection(txn, ns);
                    wunit.commit();
                }

                {
                    string errmsg;

                    // This comes as a GlobalWrite lock, so there is no DB to be acquired after
                    // resume, so we can skip the DB stability checks. Also 
                    // copyCollectionFromRemote will acquire its own database pointer, under the
                    // appropriate locks, so just releasing and acquiring the lock is safe.
                    invariant(txn->lockState()->isW());
                    Lock::TempRelease release(txn->lockState());

                    bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg);
                    uassert(15909, str::stream() << "replSet rollback error resyncing collection "
                                                 << ns << ' ' << errmsg, ok);
                }
            }

            for (const string& ns : fixUpInfo.collectionsToResyncMetadata) {
                log() << "rollback 4.1.2 coll metadata resync " << ns;

                const NamespaceString nss(ns);
                auto db = dbHolder().openDb(txn, nss.db().toString());
                invariant(db);
                auto collection = db->getCollection(ns);
                invariant(collection);
                auto cce = collection->getCatalogEntry();

                const std::list<BSONObj> info =
                    them->getCollectionInfos(nss.db().toString(), BSON("name" << nss.coll()));

                if (info.empty()) {
                    // Collection dropped by "them" so we should drop it too.
                    log() << ns << " not found on remote host, dropping";
                    fixUpInfo.toDrop.insert(ns);
                    continue;
                }

                invariant(info.size() == 1);

                CollectionOptions options;
                auto status = options.parse(info.front());
                if (!status.isOK()) {
                    throw RSFatalException(str::stream() << "Failed to parse options "
                                                         << info.front() << ": "
                                                         << status.toString());
                }

                WriteUnitOfWork wuow(txn);
                if (options.flagsSet || cce->getCollectionOptions(txn).flagsSet) {
                    cce->updateFlags(txn, options.flags);
                }

                status = collection->setValidator(txn, options.validator);
                if (!status.isOK()) {
                    throw RSFatalException(str::stream() << "Failed to set validator: "
                                                         << status.toString());
                }
                wuow.commit();
            }

            // we did more reading from primary, so check it again for a rollback (which would mess
            // us up), and make minValid newer.
            log() << "rollback 4.2";

            string err;
            try {
                newMinValid = oplogreader->getLastOp(rsOplogName);
                if (newMinValid.isEmpty()) {
                    err = "can't get minvalid from sync source";
                }
                else {
                    Timestamp minValid = newMinValid["ts"].timestamp();
                    log() << "minvalid=" << minValid.toStringLong();
                    setMinValid(txn, minValid);
                }
            }
            catch (DBException& e) {
                err = "can't get/set minvalid: ";
                err += e.what();
            }
            if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
                // our source rolled back itself.  so the data we received isn't necessarily
                // consistent. however, we've now done writes.  thus we have a problem.
                err += "rbid at primary changed during resync/rollback";
            }
            if (!err.empty()) {
                severe() << "rolling back : " << err
                        << ". A full resync will be necessary.";
                // TODO: reset minvalid so that we are permanently in fatal state
                // TODO: don't be fatal, but rather, get all the data first.
                throw RSFatalException();
            }
            log() << "rollback 4.3";
        }

        map<string,shared_ptr<Helpers::RemoveSaver> > removeSavers;

        log() << "rollback 4.6";
        // drop collections to drop before doing individual fixups - that might make things faster
        // below actually if there were subsequent inserts to rollback
        for (set<string>::iterator it = fixUpInfo.toDrop.begin();
                it != fixUpInfo.toDrop.end();
                it++) {
            log() << "rollback drop: " << *it;

            Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it));
            if (db) {
                WriteUnitOfWork wunit(txn);

                shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it];
                if (!removeSaver)
                    removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it));

                // perform a collection scan and write all documents in the collection to disk
                boost::scoped_ptr<PlanExecutor> exec(
                        InternalPlanner::collectionScan(txn,
                                                        *it,
                                                        db->getCollection(*it)));
                BSONObj curObj;
                PlanExecutor::ExecState execState;
                while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) {
                    removeSaver->goingToDelete(curObj);
                }
                if (execState != PlanExecutor::IS_EOF) {
                    if (execState == PlanExecutor::FAILURE &&
                            WorkingSetCommon::isValidStatusMemberObject(curObj)) {
                        Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj);
                        severe() << "rolling back createCollection on " << *it
                                 << " failed with " << errorStatus
                                 << ". A full resync is necessary.";
                    }
                    else {
                        severe() << "rolling back createCollection on " << *it
                                 << " failed. A full resync is necessary.";
                    }
                            
                    throw RSFatalException();
                }

                db->dropCollection(txn, *it);
                wunit.commit();
            }
        }

        log() << "rollback 4.7";
        OldClientContext ctx(txn, rsOplogName);
        Collection* oplogCollection = ctx.db()->getCollection(rsOplogName);
        uassert(13423,
                str::stream() << "replSet error in rollback can't find " << rsOplogName,
                oplogCollection);

        unsigned deletes = 0, updates = 0;
        time_t lastProgressUpdate = time(0);
        time_t progressUpdateGap = 10;
        for (list<pair<DocID, BSONObj> >::iterator it = goodVersions.begin();
                it != goodVersions.end();
                it++) {
            time_t now = time(0);
            if (now - lastProgressUpdate > progressUpdateGap) {
                log() << deletes << " delete and "
                      << updates << " update operations processed out of "
                      << goodVersions.size() << " total operations";
                lastProgressUpdate = now;
            }
            const DocID& doc = it->first;
            BSONObj pattern = doc._id.wrap(); // { _id : ... }
            try {
                verify(doc.ns && *doc.ns);
                if (fixUpInfo.collectionsToResyncData.count(doc.ns)) {
                    // we just synced this entire collection
                    continue;
                }

                // keep an archive of items rolled back
                shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns];
                if (!removeSaver)
                    removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns));

                // todo: lots of overhead in context, this can be faster
                OldClientContext ctx(txn, doc.ns);

                // Add the doc to our rollback file
                BSONObj obj;
                Collection* collection = ctx.db()->getCollection(doc.ns);

                // Do not log an error when undoing an insert on a no longer existent collection.
                // It is likely that the collection was dropped as part of rolling back a
                // createCollection command and regardless, the document no longer exists.
                if (collection) {
                    bool found = Helpers::findOne(txn, collection, pattern, obj, false);
                    if (found) {
                        removeSaver->goingToDelete(obj);
                    }
                    else {
                        error() << "rollback cannot find object: " << pattern
                                << " in namespace " << doc.ns;
                    }
                }

                if (it->second.isEmpty()) {
                    // wasn't on the primary; delete.
                    // TODO 1.6 : can't delete from a capped collection.  need to handle that here.
                    deletes++;

                    if (collection) {
                        if (collection->isCapped()) {
                            // can't delete from a capped collection - so we truncate instead. if
                            // this item must go, so must all successors!!!
                            try {
                                // TODO: IIRC cappedTruncateAfter does not handle completely empty.
                                // this will crazy slow if no _id index.
                                long long start = Listener::getElapsedTimeMillis();
                                RecordId loc = Helpers::findOne(txn, collection, pattern, false);
                                if (Listener::getElapsedTimeMillis() - start > 200)
                                    warning() << "roll back slow no _id index for "
                                          << doc.ns << " perhaps?";
                                // would be faster but requires index:
                                // RecordId loc = Helpers::findById(nsd, pattern);
                                if (!loc.isNull()) {
                                    try {
                                        collection->temp_cappedTruncateAfter(txn, loc, true);
                                    }
                                    catch (DBException& e) {
                                        if (e.getCode() == 13415) {
                                            // hack: need to just make cappedTruncate do this...
                                            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                                                WriteUnitOfWork wunit(txn);
                                                uassertStatusOK(collection->truncate(txn));
                                                wunit.commit();
                                            } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
                                                                            txn,
                                                                            "truncate",
                                                                            collection->ns().ns());
                                        }
                                        else {
                                            throw e;
                                        }
                                    }
                                }
                            }
                            catch (DBException& e) {
                                error() << "rolling back capped collection rec "
                                      << doc.ns << ' ' << e.toString();
                            }
                        }
                        else {
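
Example #3 is cut off above at the else branch. The capped-collection fallback it shows leans on the MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN/END macros; here is a generic sketch of that retry shape in portable C++, with an illustrative exception type and operation rather than the real storage-engine macros:

#include <iostream>
#include <stdexcept>

// Illustrative stand-in for a storage-engine write conflict.
struct WriteConflictException : std::runtime_error {
    WriteConflictException() : std::runtime_error("write conflict") {}
};

template <typename Op>
void writeConflictRetry(const char* opName, Op op) {
    for (int attempt = 0;; ++attempt) {
        try {
            op();  // e.g. truncate the collection inside a unit of work
            return;
        } catch (const WriteConflictException&) {
            std::cerr << opName << ": write conflict, retrying (attempt "
                      << attempt + 1 << ")\n";
            // A real implementation would also back off and restart the
            // transaction before looping.
        }
    }
}

int main() {
    int failuresLeft = 2;
    writeConflictRetry("truncate", [&] {
        if (failuresLeft-- > 0) throw WriteConflictException();
        std::cout << "truncate committed\n";
    });
}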
Example #4
void BackgroundSync::_produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol();
    // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election
    // timeout. This enables the sync source to communicate liveness of the primary to secondaries.
    // Under protocol version 0, use a default timeout of 2 seconds for awaitData.
    const Milliseconds fetcherMaxTimeMS(
        isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2));

    Status fetcherReturnStatus = Status::OK();
    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      fetcherMaxTimeMS,
                                      &fetcherReturnStatus);


    BSONObjBuilder cmdBob;
    cmdBob.append("find", nsToCollectionSubstring(rsOplogName));
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(Minutes(1)));  // 1 min initial find.

    BSONObjBuilder metadataBob;
    if (isV1ElectionProtocol) {
        cmdBob.append("term", _replCoord->getTerm());
        metadataBob.append(rpc::kReplSetMetadataFieldName, 1);
    }

    auto dbName = nsToDatabase(rsOplogName);
    auto cmdObj = cmdBob.obj();
    auto metadataObj = metadataBob.obj();
    Fetcher fetcher(&_threadPoolTaskExecutor,
                    source,
                    dbName,
                    cmdObj,
                    fetcherCallback,
                    metadataObj,
                    _replCoord->getConfig().getElectionTimeoutPeriod());

    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << cmdObj["filter"];
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            };
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream()
                                      << "need to rollback, but in inconsistent state. "
                                      << "minvalid: " << boundaries.end.toString()
                                      << " > our last optime: " << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
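
The getConnection lambda in this example opens a connection lazily, so rollback only pays the dial-up cost if it actually runs. The same lazy-initialization shape in standalone C++ (FakeConnection is an illustrative stand-in, not a MongoDB type):

#include <iostream>
#include <memory>
#include <string>

struct FakeConnection {
    explicit FakeConnection(std::string h) : host(std::move(h)) {
        std::cout << "dialing " << host << "\n";  // expensive, done at most once
    }
    std::string host;
};

int main() {
    std::string source = "node2.example.net:27017";
    std::unique_ptr<FakeConnection> connection;

    // Capture the pointer by reference so repeated calls reuse the first
    // connection; capture the address by value, as the original does.
    auto getConnection = [&connection, source]() -> FakeConnection* {
        if (!connection) {
            connection = std::make_unique<FakeConnection>(source);
        }
        return connection.get();
    };

    getConnection();  // dials
    getConnection();  // reuses; no second "dialing" line
}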
Example #5
 void ReplSetImpl::setMinValid(OperationContext* ctx, BSONObj obj) {
     setMinValid(ctx, obj["ts"]._opTime());
 }
Example #6
    /* tail an oplog.  ok to return, will be re-called. */
    void SyncTail::oplogApplication() {
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

        while(!inShutdown()) {
            OpQueue ops;
            OperationContextImpl txn;

            Timer batchTimer;
            int lastTimeChecked = 0;

            do {
                int now = batchTimer.seconds();

                // apply replication batch limits
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
                // occasionally check some things
                // (always checked in the first iteration of this do-while loop, because
                // ops is empty)
                if (ops.empty() || now > lastTimeChecked) {
                    BackgroundSync* bgsync = BackgroundSync::get();
                    if (bgsync->getInitialSyncRequestedFlag()) {
                        // got a resync command
                        Lock::DBLock lk(txn.lockState(), "local", MODE_X);
                        WriteUnitOfWork wunit(&txn);
                        Client::Context ctx(&txn, "local");

                        ctx.db()->dropCollection(&txn, "local.oplog.rs");

                        // Note: the following order is important.
                        // The bgsync thread uses an empty optime as a sentinel to know to wait
                        // for initial sync (done in this thread after we return); thus, we must
                        // ensure the lastAppliedOptime is empty before pausing the bgsync thread
                        // via stop().
                        // We must clear the sync source blacklist after calling stop()
                        // because the bgsync thread, while running, may update the blacklist.
                        replCoord->setMyLastOptime(&txn, OpTime());
                        bgsync->stop();
                        replCoord->clearSyncSourceBlacklist();

                        wunit.commit();

                        return;
                    }
                    lastTimeChecked = now;
                    // can we become secondary?
                    // we have to check this before calling mgr, as we must be a secondary to
                    // become primary
                    tryToGoLiveAsASecondary(&txn, replCoord);

                    // TODO(emilkie): This can be removed once we switch over from legacy;
                    // this code is what moves 1-node sets to PRIMARY state.
                    // normally msgCheckNewState gets called periodically, but in a single node
                    // replset there are no heartbeat threads, so we do it here to be sure.  this is
                    // relevant if the singleton member has done a stepDown() and needs to come back
                    // up.
                    if (theReplSet &&
                            theReplSet->config().members.size() == 1 &&
                            theReplSet->myConfig().potentiallyHot()) {
                        Manager* mgr = theReplSet->mgr;
                        // When would mgr be null?  During replsettest'ing, in which case we should
                        // fall through and actually apply ops as if we were a real secondary.
                        if (mgr) { 
                            mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                            sleepsecs(1);
                            // There should never be ops to sync in a 1-member set, anyway
                            return;
                        }
                    }
                }

                const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
                if (!ops.empty() && slaveDelaySecs > 0) {
                    const BSONObj& lastOp = ops.getDeque().back();
                    const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                    // Stop the batch as the lastOp is too new to be applied. If we continue
                    // on, we can get ops that are way ahead of the delay and this will
                    // make this thread sleep longer when handleSlaveDelay is called
                    // and apply ops much sooner than we like.
                    if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                        break;
                    }
                }
                // keep fetching more ops as long as we haven't filled up a full batch yet
            } while (!tryPopAndWaitForMore(&ops, replCoord) && // tryPopAndWaitForMore returns true 
                                                               // when we need to end a batch early
                   (ops.getSize() < replBatchLimitBytes) &&
                   !inShutdown());

            // For pausing replication in tests
            while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                sleepmillis(0);
            }

            if (ops.empty()) {
                continue;
            }

            const BSONObj& lastOp = ops.getDeque().back();
            handleSlaveDelay(lastOp);

            if (replCoord->getCurrentMemberState().primary() && 
                !replCoord->isWaitingForApplierToDrain()) {
                severe() << "attempting to replicate ops while primary";
                fassertFailed(28527);
            }

            // Set minValid to the last op to be applied in this next batch.
            // This will cause this node to go into RECOVERING state
            // if we should crash and restart before updating the oplog
            OpTime minValid = lastOp["ts"]._opTime();
            setMinValid(&txn, minValid);

            multiApply(ops.getDeque());

            applyOpsToOplog(&ops.getDeque());

            // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
            // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication
            // coordinator
            if (theReplSet && !theReplSet->mgr) {
                BSONObj op;
                if (!peek(&op)) {
                    return;
                }
            }
        }
    }
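
The batching loops in Examples #2 and #6 gate on slaveDelay by comparing the op's timestamp against now minus the delay. A tiny sketch of that check (tooNewToApply is an illustrative name, not MongoDB's):

#include <ctime>
#include <iostream>

// Returns true if an op with this timestamp is still too new to apply under
// the configured delay (a sketch of the check inside the batching loop).
bool tooNewToApply(unsigned int opTimestampSecs, int slaveDelaySecs) {
    return slaveDelaySecs > 0 &&
           opTimestampSecs > static_cast<unsigned int>(std::time(nullptr) - slaveDelaySecs);
}

int main() {
    const int slaveDelaySecs = 3600;  // apply ops only once they are an hour old
    const unsigned int now = static_cast<unsigned int>(std::time(nullptr));
    std::cout << "fresh op blocked: " << tooNewToApply(now - 60, slaveDelaySecs) << "\n";    // 1
    std::cout << "old op blocked:   " << tooNewToApply(now - 7200, slaveDelaySecs) << "\n";  // 0
}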
Example #7
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    auto minValidBoundaries = getMinValid(&txn);
    OpTime originalEndOpTime(minValidBoundaries.end);
    OpTime lastWriteOpTime{replCoord->getMyLastOptime()};
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord, minValidBoundaries, lastWriteOpTime);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't become
            // ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we crash and restart before the batch finishes applying.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        lastWriteOpTime = multiApply(&txn, ops);
        setNewTimestamp(lastWriteOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        minValidBoundaries.start = {};
        minValidBoundaries.end = end;
        finalizer.record(lastWriteOpTime);
    }
}
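
This newer loop hands batching to OpQueueBatcher and polls with a one-second timeout so the shutdown and resync checks still run between waits. A standalone sketch of that poll-with-timeout shape, using a std::condition_variable in place of the real batcher:

#include <chrono>
#include <condition_variable>
#include <deque>
#include <iostream>
#include <mutex>
#include <thread>

// Minimal stand-in for OpQueueBatcher::getNextBatch(Seconds(1)).
class Batcher {
public:
    void push(int op) {
        { std::lock_guard<std::mutex> lk(_m); _q.push_back(op); }
        _cv.notify_one();
    }
    // Waits up to `timeout` for work; returns an empty batch on timeout so
    // the caller can run its periodic checks and loop again.
    std::deque<int> getNextBatch(std::chrono::seconds timeout) {
        std::unique_lock<std::mutex> lk(_m);
        _cv.wait_for(lk, timeout, [&] { return !_q.empty(); });
        std::deque<int> batch;
        batch.swap(_q);
        return batch;
    }
private:
    std::mutex _m;
    std::condition_variable _cv;
    std::deque<int> _q;
};

int main() {
    Batcher batcher;
    std::thread producer([&] {
        std::this_thread::sleep_for(std::chrono::milliseconds(1500));
        batcher.push(42);
    });
    std::deque<int> ops;
    do {
        // periodic checks (shutdown, resync flag, ...) would go here
        ops = batcher.getNextBatch(std::chrono::seconds(1));
    } while (ops.empty());
    std::cout << "applied batch of " << ops.size() << " op(s)\n";
    producer.join();
}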
Example #8
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      OpTime lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const OpTime sentinel(Milliseconds(curTimeMillis64()).total_seconds(), 0);
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched);

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            log() << "replSet error RS102 too stale to catch up";
            log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong();
            log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong();
            log() << "replSet "
                     "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, oldestOpTimeSeen);
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "replSet can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10 * 1000));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsoplog, Query()));
        BSONElement tsElem(remoteOldestOp["ts"]);
        if (tsElem.type() != Timestamp) {
            // This member's got a bad op in its oplog.
            warning() << "oplog invalid format on node " << candidate.toString();
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000));
            continue;
        }
        OpTime remoteOldOpTime = tsElem._opTime();

        if (!lastOpTimeFetched.isNull() && lastOpTimeFetched < remoteOldOpTime) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000));
            if (oldestOpTimeSeen > remoteOldOpTime) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Got a valid sync source.
        return;
    }  // while (true)
}
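
Example #8 is an earlier revision of Example #1: blacklist deadlines are built with raw millisecond arithmetic (curTimeMillis64() + 10 * 1000) where the newer code uses typed durations (Date_t::now() + Seconds(10)). A sketch of the same contrast in standard chrono:

#include <chrono>
#include <iostream>

int main() {
    using namespace std::chrono;

    // Raw arithmetic: the unit lives only in the programmer's head, so it is
    // easy to mix up (is 10 * 1000 milliseconds or microseconds?).
    auto rawDeadlineMs =
        duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count() +
        10 * 1000;

    // Typed durations: the unit travels with the value and conversions are checked.
    auto typedDeadline = system_clock::now() + seconds(10);

    std::cout << "raw ms deadline:        " << rawDeadlineMs << "\n";
    std::cout << "typed deadline pending: "
              << (typedDeadline > system_clock::now()) << "\n";
}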