Пример #1
0
    void syncFixUp(OperationContext* txn,
                   FixUpInfo& fixUpInfo,
                   OplogReader* oplogreader,
                   ReplicationCoordinator* replCoord) {
        DBClientConnection* them = oplogreader->conn();

        // fetch all first so we needn't handle interruption in a fancy way

        unsigned long long totalSize = 0;

        list< pair<DocID, BSONObj> > goodVersions;

        BSONObj newMinValid;

        // fetch all the goodVersions of each document from current primary
        DocID doc;
        unsigned long long numFetched = 0;
        try {
            for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin();
                    it != fixUpInfo.toRefetch.end();
                    it++) {
                doc = *it;

                verify(!doc._id.eoo());

                {
                    // TODO : slow.  lots of round trips.
                    numFetched++;
                    BSONObj good = them->findOne(doc.ns, doc._id.wrap(),
                                                     NULL, QueryOption_SlaveOk).getOwned();
                    totalSize += good.objsize();
                    uassert(13410, "replSet too much data to roll back",
                            totalSize < 300 * 1024 * 1024);

                    // note good might be eoo, indicating we should delete it
                    goodVersions.push_back(pair<DocID, BSONObj>(doc,good));
                }
            }
            newMinValid = oplogreader->getLastOp(rsOplogName);
            if (newMinValid.isEmpty()) {
                error() << "rollback error newMinValid empty?";
                return;
            }
        }
        catch (DBException& e) {
            LOG(1) << "rollback re-get objects: " << e.toString();
            error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' '
                    << numFetched << '/' << fixUpInfo.toRefetch.size();
            throw e;
        }

        log() << "rollback 3.5";
        if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
            // our source rolled back itself.  so the data we received isn't necessarily consistent.
            warning() << "rollback rbid on source changed during rollback, cancelling this attempt";
            return;
        }

        // update them
        log() << "rollback 4 n:" << goodVersions.size();

        bool warn = false;

        invariant(!fixUpInfo.commonPointOurDiskloc.isNull());
        invariant(txn->lockState()->isW());

        // we have items we are writing that aren't from a point-in-time.  thus best not to come
        // online until we get to that point in freshness.
        Timestamp minValid = newMinValid["ts"].timestamp();
        log() << "minvalid=" << minValid.toStringLong();
        setMinValid(txn, minValid);

        // any full collection resyncs required?
        if (!fixUpInfo.collectionsToResyncData.empty()
                || !fixUpInfo.collectionsToResyncMetadata.empty()) {

            for (const string& ns : fixUpInfo.collectionsToResyncData) {
                log() << "rollback 4.1.1 coll resync " << ns;

                fixUpInfo.collectionsToResyncMetadata.erase(ns);

                const NamespaceString nss(ns);

                Database* db = dbHolder().openDb(txn, nss.db().toString());
                invariant(db);

                {
                    WriteUnitOfWork wunit(txn);
                    db->dropCollection(txn, ns);
                    wunit.commit();
                }

                {
                    string errmsg;

                    // This comes as a GlobalWrite lock, so there is no DB to be acquired after
                    // resume, so we can skip the DB stability checks. Also 
                    // copyCollectionFromRemote will acquire its own database pointer, under the
                    // appropriate locks, so just releasing and acquiring the lock is safe.
                    invariant(txn->lockState()->isW());
                    Lock::TempRelease release(txn->lockState());

                    bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg);
                    uassert(15909, str::stream() << "replSet rollback error resyncing collection "
                                                 << ns << ' ' << errmsg, ok);
                }
            }

            for (const string& ns : fixUpInfo.collectionsToResyncMetadata) {
                log() << "rollback 4.1.2 coll metadata resync " << ns;

                const NamespaceString nss(ns);
                auto db = dbHolder().openDb(txn, nss.db().toString());
                invariant(db);
                auto collection = db->getCollection(ns);
                invariant(collection);
                auto cce = collection->getCatalogEntry();

                const std::list<BSONObj> info =
                    them->getCollectionInfos(nss.db().toString(), BSON("name" << nss.coll()));

                if (info.empty()) {
                    // Collection dropped by "them" so we should drop it too.
                    log() << ns << " not found on remote host, dropping";
                    fixUpInfo.toDrop.insert(ns);
                    continue;
                }

                invariant(info.size() == 1);

                CollectionOptions options;
                auto status = options.parse(info.front());
                if (!status.isOK()) {
                    throw RSFatalException(str::stream() << "Failed to parse options "
                                                         << info.front() << ": "
                                                         << status.toString());
                }

                WriteUnitOfWork wuow(txn);
                if (options.flagsSet || cce->getCollectionOptions(txn).flagsSet) {
                    cce->updateFlags(txn, options.flags);
                }

                status = collection->setValidator(txn, options.validator);
                if (!status.isOK()) {
                    throw RSFatalException(str::stream() << "Failed to set validator: "
                                                         << status.toString());
                }
                wuow.commit();
            }

            // we did more reading from primary, so check it again for a rollback (which would mess
            // us up), and make minValid newer.
            log() << "rollback 4.2";

            string err;
            try {
                newMinValid = oplogreader->getLastOp(rsOplogName);
                if (newMinValid.isEmpty()) {
                    err = "can't get minvalid from sync source";
                }
                else {
                    Timestamp minValid = newMinValid["ts"].timestamp();
                    log() << "minvalid=" << minValid.toStringLong();
                    setMinValid(txn, minValid);
                }
            }
            catch (DBException& e) {
                err = "can't get/set minvalid: ";
                err += e.what();
            }
            if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
                // our source rolled back itself.  so the data we received isn't necessarily
                // consistent. however, we've now done writes.  thus we have a problem.
                err += "rbid at primary changed during resync/rollback";
            }
            if (!err.empty()) {
                severe() << "rolling back : " << err
                        << ". A full resync will be necessary.";
                // TODO: reset minvalid so that we are permanently in fatal state
                // TODO: don't be fatal, but rather, get all the data first.
                throw RSFatalException();
            }
            log() << "rollback 4.3";
        }

        map<string,shared_ptr<Helpers::RemoveSaver> > removeSavers;

        log() << "rollback 4.6";
        // drop collections to drop before doing individual fixups - that might make things faster
        // below actually if there were subsequent inserts to rollback
        for (set<string>::iterator it = fixUpInfo.toDrop.begin();
                it != fixUpInfo.toDrop.end();
                it++) {
            log() << "rollback drop: " << *it;

            Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it));
            if (db) {
                WriteUnitOfWork wunit(txn);

                shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it];
                if (!removeSaver)
                    removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it));

                // perform a collection scan and write all documents in the collection to disk
                boost::scoped_ptr<PlanExecutor> exec(
                        InternalPlanner::collectionScan(txn,
                                                        *it,
                                                        db->getCollection(*it)));
                BSONObj curObj;
                PlanExecutor::ExecState execState;
                while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) {
                    removeSaver->goingToDelete(curObj);
                }
                if (execState != PlanExecutor::IS_EOF) {
                    if (execState == PlanExecutor::FAILURE &&
                            WorkingSetCommon::isValidStatusMemberObject(curObj)) {
                        Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj);
                        severe() << "rolling back createCollection on " << *it
                                 << " failed with " << errorStatus
                                 << ". A full resync is necessary.";
                    }
                    else {
                        severe() << "rolling back createCollection on " << *it
                                 << " failed. A full resync is necessary.";
                    }
                            
                    throw RSFatalException();
                }

                db->dropCollection(txn, *it);
                wunit.commit();
            }
        }

        log() << "rollback 4.7";
        OldClientContext ctx(txn, rsOplogName);
        Collection* oplogCollection = ctx.db()->getCollection(rsOplogName);
        uassert(13423,
                str::stream() << "replSet error in rollback can't find " << rsOplogName,
                oplogCollection);

        unsigned deletes = 0, updates = 0;
        time_t lastProgressUpdate = time(0);
        time_t progressUpdateGap = 10;
        for (list<pair<DocID, BSONObj> >::iterator it = goodVersions.begin();
                it != goodVersions.end();
                it++) {
            time_t now = time(0);
            if (now - lastProgressUpdate > progressUpdateGap) {
                log() << deletes << " delete and "
                      << updates << " update operations processed out of "
                      << goodVersions.size() << " total operations";
                lastProgressUpdate = now;
            }
            const DocID& doc = it->first;
            BSONObj pattern = doc._id.wrap(); // { _id : ... }
            try {
                verify(doc.ns && *doc.ns);
                if (fixUpInfo.collectionsToResyncData.count(doc.ns)) {
                    // we just synced this entire collection
                    continue;
                }

                // keep an archive of items rolled back
                shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns];
                if (!removeSaver)
                    removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns));

                // todo: lots of overhead in context, this can be faster
                OldClientContext ctx(txn, doc.ns);

                // Add the doc to our rollback file
                BSONObj obj;
                Collection* collection = ctx.db()->getCollection(doc.ns);

                // Do not log an error when undoing an insert on a no longer existent collection.
                // It is likely that the collection was dropped as part of rolling back a
                // createCollection command and regardless, the document no longer exists.
                if (collection) {
                    bool found = Helpers::findOne(txn, collection, pattern, obj, false);
                    if (found) {
                        removeSaver->goingToDelete(obj);
                    }
                    else {
                        error() << "rollback cannot find object: " << pattern
                                << " in namespace " << doc.ns;
                    }
                }

                if (it->second.isEmpty()) {
                    // wasn't on the primary; delete.
                    // TODO 1.6 : can't delete from a capped collection.  need to handle that here.
                    deletes++;

                    if (collection) {
                        if (collection->isCapped()) {
                            // can't delete from a capped collection - so we truncate instead. if
                            // this item must go, so must all successors!!!
                            try {
                                // TODO: IIRC cappedTruncateAfter does not handle completely empty.
                                // this will crazy slow if no _id index.
                                long long start = Listener::getElapsedTimeMillis();
                                RecordId loc = Helpers::findOne(txn, collection, pattern, false);
                                if (Listener::getElapsedTimeMillis() - start > 200)
                                    warning() << "roll back slow no _id index for "
                                          << doc.ns << " perhaps?";
                                // would be faster but requires index:
                                // RecordId loc = Helpers::findById(nsd, pattern);
                                if (!loc.isNull()) {
                                    try {
                                        collection->temp_cappedTruncateAfter(txn, loc, true);
                                    }
                                    catch (DBException& e) {
                                        if (e.getCode() == 13415) {
                                            // hack: need to just make cappedTruncate do this...
                                            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                                                WriteUnitOfWork wunit(txn);
                                                uassertStatusOK(collection->truncate(txn));
                                                wunit.commit();
                                            } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
                                                                            txn,
                                                                            "truncate",
                                                                            collection->ns().ns());
                                        }
                                        else {
                                            throw e;
                                        }
                                    }
                                }
                            }
                            catch (DBException& e) {
                                error() << "rolling back capped collection rec "
                                      << doc.ns << ' ' << e.toString();
                            }
                        }
                        else {
Пример #2
0
StatusWith<BSONObj> OplogFetcher::_onSuccessfulBatch(const Fetcher::QueryResponse& queryResponse) {

    // Stop fetching and return on fail point.
    // This fail point makes the oplog fetcher ignore the downloaded batch of operations and not
    // error out. The FailPointEnabled error will be caught by the AbstractOplogFetcher.
    if (MONGO_FAIL_POINT(stopReplProducer)) {
        return Status(ErrorCodes::FailPointEnabled, "stopReplProducer fail point is enabled");
    }

    const auto& documents = queryResponse.documents;
    auto firstDocToApply = documents.cbegin();

    if (!documents.empty()) {
        LOG(2) << "oplog fetcher read " << documents.size()
               << " operations from remote oplog starting at " << documents.front()["ts"]
               << " and ending at " << documents.back()["ts"];
    } else {
        LOG(2) << "oplog fetcher read 0 operations from remote oplog";
    }

    auto oqMetadataResult = parseOplogQueryMetadata(queryResponse);
    if (!oqMetadataResult.isOK()) {
        error() << "invalid oplog query metadata from sync source " << _getSource() << ": "
                << oqMetadataResult.getStatus() << ": " << queryResponse.otherFields.metadata;
        return oqMetadataResult.getStatus();
    }
    auto oqMetadata = oqMetadataResult.getValue();

    // This lastFetched value is the last OpTime from the previous batch.
    auto lastFetched = _getLastOpTimeFetched();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto remoteRBID = oqMetadata ? boost::make_optional(oqMetadata->getRBID()) : boost::none;
        auto remoteLastApplied =
            oqMetadata ? boost::make_optional(oqMetadata->getLastOpApplied()) : boost::none;
        auto status = checkRemoteOplogStart(documents,
                                            lastFetched,
                                            remoteLastApplied,
                                            _requiredRBID,
                                            remoteRBID,
                                            _requireFresherSyncSource);
        if (!status.isOK()) {
            // Stop oplog fetcher and execute rollback if necessary.
            return status;
        }

        LOG(1) << "oplog fetcher successfully fetched from " << _getSource();

        // If this is the first batch, no rollback is needed and we don't want to enqueue the first
        // document, skip it.
        if (_startingPoint == StartingPoint::kSkipFirstDoc) {
            firstDocToApply++;
        }
    }

    auto validateResult =
        OplogFetcher::validateDocuments(documents, queryResponse.first, lastFetched.getTimestamp());
    if (!validateResult.isOK()) {
        return validateResult.getStatus();
    }
    auto info = validateResult.getValue();

    // Process replset metadata.  It is important that this happen after we've validated the
    // first batch, so we don't progress our knowledge of the commit point from a
    // response that triggers a rollback.
    rpc::ReplSetMetadata replSetMetadata;
    bool receivedReplMetadata =
        queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName);
    if (receivedReplMetadata) {
        const auto& metadataObj = queryResponse.otherFields.metadata;
        auto metadataResult = rpc::ReplSetMetadata::readFromMetadata(metadataObj);
        if (!metadataResult.isOK()) {
            error() << "invalid replication metadata from sync source " << _getSource() << ": "
                    << metadataResult.getStatus() << ": " << metadataObj;
            return metadataResult.getStatus();
        }
        replSetMetadata = metadataResult.getValue();

        // We will only ever have OplogQueryMetadata if we have ReplSetMetadata, so it is safe
        // to call processMetadata() in this if block.
        _dataReplicatorExternalState->processMetadata(replSetMetadata, oqMetadata);
    }

    // Increment stats. We read all of the docs in the query.
    opsReadStats.increment(info.networkDocumentCount);
    networkByteStats.increment(info.networkDocumentBytes);

    // Record time for each batch.
    getmoreReplStats.recordMillis(durationCount<Milliseconds>(queryResponse.elapsedMillis));

    // TODO: back pressure handling will be added in SERVER-23499.
    auto status = _enqueueDocumentsFn(firstDocToApply, documents.cend(), info);
    if (!status.isOK()) {
        return status;
    }

    if (_dataReplicatorExternalState->shouldStopFetching(
            _getSource(), replSetMetadata, oqMetadata)) {
        str::stream errMsg;
        errMsg << "sync source " << _getSource().toString();
        errMsg << " (config version: " << replSetMetadata.getConfigVersion();
        // If OplogQueryMetadata was provided, its values were used to determine if we should
        // stop fetching from this sync source.
        if (oqMetadata) {
            errMsg << "; last applied optime: " << oqMetadata->getLastOpApplied().toString();
            errMsg << "; sync source index: " << oqMetadata->getSyncSourceIndex();
            errMsg << "; primary index: " << oqMetadata->getPrimaryIndex();
        } else {
            errMsg << "; last visible optime: " << replSetMetadata.getLastOpVisible().toString();
            errMsg << "; sync source index: " << replSetMetadata.getSyncSourceIndex();
            errMsg << "; primary index: " << replSetMetadata.getPrimaryIndex();
        }
        errMsg << ") is no longer valid";
        return Status(ErrorCodes::InvalidSyncSource, errMsg);
    }

    auto lastCommittedWithCurrentTerm =
        _dataReplicatorExternalState->getCurrentTermAndLastCommittedOpTime();
    return makeGetMoreCommandObject(queryResponse.nss,
                                    queryResponse.cursorId,
                                    lastCommittedWithCurrentTerm,
                                    _getGetMoreMaxTime(),
                                    _batchSize);
}