Example 1
    bool BackgroundSync::isRollbackRequired(OplogReader& r) {
        string hn = r.conn()->getServerAddress();

        if (!r.more()) {
            try {
                BSONObj theirLastOp = r.getLastOp(rsoplog);
                if (theirLastOp.isEmpty()) {
                    log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
                    sleepsecs(2);
                    return true;
                }
                OpTime theirTS = theirLastOp["ts"]._opTime();
                if (theirTS < _lastOpTimeFetched) {
                    log() << "replSet we are ahead of the sync source, will try to roll back"
                          << rsLog;
                    theReplSet->syncRollback(r);
                    return true;
                }
                /* we're not ahead?  maybe our new query got fresher data.  best to come back and try again */
                log() << "replSet syncTail condition 1" << rsLog;
                sleepsecs(1);
            }
            catch(DBException& e) {
                log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
                sleepsecs(2);
            }
            return true;
        }

        BSONObj o = r.nextSafe();
        OpTime ts = o["ts"]._opTime();
        long long h = o["h"].numberLong();
        if( ts != _lastOpTimeFetched || h != _lastH ) {
            log() << "replSet our last op time fetched: " << _lastOpTimeFetched.toStringPretty() << rsLog;
            log() << "replset source's GTE: " << ts.toStringPretty() << rsLog;
            theReplSet->syncRollback(r);
            return true;
        }

        return false;
    }
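
The heart of this check is the two-field comparison at the end of the function: matching timestamps alone do not prove that two oplogs share the same history, since divergent branches can carry entries with equal optimes, so the per-entry hash is compared as well. Below is a minimal sketch of that predicate, using simplified stand-in types rather than the real OpTime/BSON machinery:

    #include <cstdint>
    #include <utility>

    // Stand-in for OpTime: (seconds, increment).
    using OpTimeStamp = std::pair<uint32_t, uint32_t>;

    struct OplogEntryId {
        OpTimeStamp ts;   // logical timestamp of the oplog entry ("ts" field)
        int64_t hash;     // per-entry hash written by the primary ("h" field)
    };

    // True when our last fetched entry is not the entry the sync source has
    // at the same position, i.e. the histories diverged and a rollback is
    // required.
    inline bool historiesDiverged(const OplogEntryId& ours, const OplogEntryId& theirs) {
        return ours.ts != theirs.ts || ours.hash != theirs.hash;
    }

    int main() {
        OplogEntryId ours{{100, 1}, 7}, theirs{{100, 1}, 8};  // same ts, different hash
        return historiesDiverged(ours, theirs) ? 0 : 1;       // diverged: roll back
    }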
Example 2
    bool BackgroundSync::_rollbackIfNeeded(OperationContext* txn, OplogReader& r) {
        string hn = r.conn()->getServerAddress();

        if (!r.more()) {
            try {
                BSONObj theirLastOp = r.getLastOp(rsOplogName.c_str());
                if (theirLastOp.isEmpty()) {
                    error() << "empty query result from " << hn << " oplog";
                    sleepsecs(2);
                    return true;
                }
                OpTime theirOpTime = extractOpTime(theirLastOp);
                if (theirOpTime < _lastOpTimeFetched) {
                    log() << "we are ahead of the sync source, will try to roll back";
                    syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
                    return true;
                }
                /* we're not ahead?  maybe our new query got fresher data.  best to come back and try again */
                log() << "syncTail condition 1";
                sleepsecs(1);
            }
            catch(DBException& e) {
                error() << "querying " << hn << ' ' << e.toString();
                sleepsecs(2);
            }
            return true;
        }

        BSONObj o = r.nextSafe();
        OpTime opTime = extractOpTime(o);
        long long hash = o["h"].numberLong();
        if ( opTime != _lastOpTimeFetched || hash != _lastFetchedHash ) {
            log() << "our last op time fetched: " << _lastOpTimeFetched;
            log() << "source's GTE: " << opTime;
            syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
            return true;
        }

        return false;
    }
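
Beyond renaming, the structural change between the two variants is dependency injection: _rollbackIfNeeded receives an OperationContext and consults an injected _replCoord rather than reaching for the theReplSet global. A rough sketch of why that matters for testing, with hypothetical simplified interfaces in place of the real MongoDB classes:

    #include <iostream>

    // Minimal stand-ins; the real OperationContext and ReplicationCoordinator
    // interfaces are far larger.
    struct OperationContext {};

    class ReplicationCoordinator {
    public:
        virtual ~ReplicationCoordinator() = default;
        virtual long long getMyLastOptime() const = 0;
    };

    class FakeCoordinator : public ReplicationCoordinator {
    public:
        long long getMyLastOptime() const override { return 42; }
    };

    // Because the decision depends only on its parameters, a unit test can
    // drive it with a fake coordinator and canned source optimes.
    bool weAreAheadOfSource(OperationContext* /*txn*/,
                            const ReplicationCoordinator& coord,
                            long long sourceLastOptime) {
        return sourceLastOptime < coord.getMyLastOptime();
    }

    int main() {
        OperationContext txn;
        FakeCoordinator coord;
        std::cout << std::boolalpha
                  << weAreAheadOfSource(&txn, coord, 41) << '\n';  // true
    }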
Example 3
    /**
     * Do the initial sync for this member.  There are several steps to this process:
     *
     *     0. Add _initialSyncFlag to minValid to tell us to restart initial sync if we
     *        crash in the middle of this procedure
     *     1. Record start time.
     *     2. Clone.
     *     3. Set minValid1 to sync target's latest op time.
     *     4. Apply ops from start to minValid1, fetching missing docs as needed.
     *     5. Set minValid2 to sync target's latest op time.
     *     6. Apply ops from minValid1 to minValid2.
     *     7. Build indexes.
     *     8. Set minValid3 to sync target's latest op time.
     *     9. Apply ops from minValid2 to minValid3.
     *    10. Clean up minValid and remove _initialSyncFlag field
     *
     * At that point, initial sync is finished.  Note that the oplog from the sync target is applied
     * three times: steps 4, 6, and 9.  4 may involve refetching, 6 should not.  By the end of 6,
     * this member should have consistent data.  9 is "cosmetic," it is only to get this member
     * closer to the latest op time before it can transition to secondary state.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        replset::SyncTail tail(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        // written by applyToHead calls
        BSONObj minValid;

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            // Add field to minvalid document to tell us to restart initial sync if we crash
            theReplSet->setInitialSyncFlag();

            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            Cloner cloner;
            if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, true)) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);

            log() << "oplog sync 1 of 3" << endl;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // Now we sync to the latest op on the sync target _again_, as we may have recloned ops
            // that were "from the future" compared with minValid. During this second application,
            // nothing should need to be recloned.
            log() << "oplog sync 2 of 3" << endl;
            if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) {
                return;
            }
            // data should now be consistent

            lastOp = minValid;

            sethbmsg("initial sync building indexes",0);
            if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, false)) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        log() << "oplog sync 3 of 3" << endl;
        if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) {
            return;
        }
        
        // ---------

        Status status = getGlobalAuthorizationManager()->initialize();
        if (!status.isOK()) {
            warning() << "Failed to reinitialize auth data after initial sync. " << status;
            return;
        }

        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }

            // Initial sync is now complete.  Flag this by setting minValid to the last thing
            // we synced.
            theReplSet->setMinValid(minValid);

            // Clear the initial sync flag.
            theReplSet->clearInitialSyncFlag();

            cx.ctx().db()->flushFiles(true);
        }
        {
            boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
            theReplSet->initialSyncRequested = false;
        }

        // If we just cloned & there were no ops applied, we still want the primary to know where
        // we're up to
        replset::BackgroundSync::notify();

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
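
The _initialSyncFlag choreography in this variant implements the crash safety that steps 0 and 10 of the docstring describe: the flag is set before any destructive work begins, and cleared only after minValid records the last op synced. A simplified sketch of the protocol; the real code persists both fields in the local.replset.minvalid document, while plain globals stand in for durable storage here:

    #include <iostream>

    struct MinValidDoc {
        bool initialSyncFlag = false;
        long long minValidTs = 0;  // stand-in for the "ts" optime
    };

    MinValidDoc g_minValid;  // pretend durable storage

    void startInitialSync() {
        g_minValid.initialSyncFlag = true;  // step 0: survive a mid-sync crash
    }

    void finishInitialSync(long long lastSyncedTs) {
        g_minValid.minValidTs = lastSyncedTs;  // record where we are consistent
        g_minValid.initialSyncFlag = false;    // step 10: clear the flag last
    }

    bool mustRestartInitialSync() { return g_minValid.initialSyncFlag; }

    int main() {
        startInitialSync();
        // a crash here leaves the flag set, forcing a fresh initial sync
        finishInitialSync(42);
        std::cout << std::boolalpha << mustRestartInitialSync() << '\n';  // false
    }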
Example 4
    /**
     * Do the initial sync for this member.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);
            
            BSONObj minValid;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // reset state, as that "didn't count"
            emptyOplog(); 
            lastOpTimeWritten = OpTime();
            lastH = 0;

            sethbmsg("initial sync building indexes",0);
            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        sethbmsg("initial sync query minValid",0);

        BSONObj minValid;
        if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) {
            return;
        }
        
        // ---------


        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }
            Helpers::putSingleton("local.replset.minvalid", minValid);
            cx.ctx().db()->flushFiles(true);
        }

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
Example 5
    /**
     * Do the initial sync for this member.  There are several steps to this process:
     *
     *     1. Record start time.
     *     2. Clone.
     *     3. Set minValid1 to sync target's latest op time.
     *     4. Apply ops from start to minValid1, fetching missing docs as needed.
     *     5. Set minValid2 to sync target's latest op time.
     *     6. Apply ops from minValid1 to minValid2.
     *     7. Build indexes.
     *     8. Set minValid3 to sync target's latest op time.
     *     9. Apply ops from minValid2 to minValid3.
     *
     * At that point, initial sync is finished.  Note that the oplog from the sync target is applied
     * three times: steps 4, 6, and 9.  4 may involve refetching, 6 should not.  By the end of 6,
     * this member should have consistent data.  9 is "cosmetic," it is only to get this member
     * closer to the latest op time before it can transition to secondary state.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        replset::SyncTail tail(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        // written by applyToHead calls
        BSONObj minValid;

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);

            log() << "oplog sync 1 of 3" << endl;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // Now we sync to the latest op on the sync target _again_, as we may have recloned ops
            // that were "from the future" compared with minValid. During this second application,
            // nothing should need to be recloned.
            log() << "oplog sync 2 of 3" << endl;
            if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) {
                return;
            }
            // data should now be consistent

            lastOp = minValid;

            sethbmsg("initial sync building indexes",0);
            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        log() << "oplog sync 3 of 3" << endl;
        if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) {
            return;
        }
        
        // ---------


        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }

            theReplSet->setMinValid(minValid);

            cx.ctx().db()->flushFiles(true);
        }

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
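
Examples 3 and 5 share the same three-pass shape: pass 1 (InitialSync) may refetch documents that vanished out from under the cloner, pass 2 (SyncTail) should need no refetching and ends with consistent data, and pass 3 merely narrows the lag before the transition toward secondary state. A sketch of that structure, with a hypothetical applier interface standing in for the real _syncDoInitialSync_applyToHead:

    #include <iostream>

    struct Applier {
        bool allowRefetch;  // InitialSync refetches missing docs, SyncTail does not
    };

    // Apply ops from fromTs through the source's current last optime and
    // return the new minValid; stubbed out for the sketch.
    long long applyToHead(const Applier&, long long fromTs) {
        return fromTs + 1;
    }

    int main() {
        Applier init{true}, tail{false};
        long long minValid = applyToHead(init, 0);  // 1 of 3: may refetch
        minValid = applyToHead(tail, minValid);     // 2 of 3: data now consistent
        // ... build indexes here ...
        minValid = applyToHead(tail, minValid);     // 3 of 3: cosmetic catch-up
        std::cout << "minValid=" << minValid << '\n';
    }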
Example 6
    /**
     * Do the initial sync for this member.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }
        
        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }
        OpTime startingTS = lastOp["ts"]._opTime();

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();
            for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
                string db = *i;
                if( db != "local" ) {
                    sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
                    bool ok;
                    {
                        writelock lk(db);
                        Client::Context ctx(db);
                        ok = clone(sourceHostname.c_str(), db);
                    }
                    if( !ok ) {
                        sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0);
                        sleepsecs(300);
                        return;
                    }
                }
            }
        }

        sethbmsg("initial sync query minValid",0);

        isyncassert( "initial sync source must remain readable throughout our initial sync", source->state().readable() );

        /* our cloned copy will be strange until we apply oplog events that occurred
           through the process.  we note that time point here. */
        BSONObj minValid = r.getLastOp(rsoplog);
        isyncassert( "getLastOp is empty ", !minValid.isEmpty() );
        OpTime mvoptime = minValid["ts"]._opTime();
        assert( !mvoptime.isNull() );

        /* apply relevant portion of the oplog
        */
        {
            isyncassert( str::stream() << "initial sync source must remain readable throughout our initial sync [2] state now: " << source->state().toString() , source->state().readable() );
            if( ! initialSyncOplogApplication(source, /*applyGTE*/startingTS, /*minValid*/mvoptime) ) { // note we assume here that this call does not throw
                log() << "replSet initial sync failed during applyoplog" << rsLog;
                emptyOplog(); // otherwise we'll be up!
                
                lastOpTimeWritten = OpTime();
                lastH = 0;
                
                log() << "replSet cleaning up [1]" << rsLog;
                {
                    writelock lk("local.");
                    Client::Context cx( "local." );
                    cx.db()->flushFiles(true);
                }
                log() << "replSet cleaning up [2]" << rsLog;
                sleepsecs(5);
                return;
            }
        }

        sethbmsg("initial sync finishing up",0);

        assert( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            writelock lk("local.");
            Client::Context cx( "local." );
            cx.db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }
            Helpers::putSingleton("local.replset.minvalid", minValid);
            cx.db()->flushFiles(true);
        }

        sethbmsg("initial sync done",0);
    }
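
This earliest variant shows the minValid invariant most directly: the clone is not a point-in-time snapshot, so the copy becomes consistent only once every oplog entry between the clone's start time (startingTS) and the source's last optime (mvoptime) has been applied. A sketch of that replay window, with hypothetical simplified types in place of the real oplog reader:

    #include <vector>

    struct Op { long long ts; /* ... payload ... */ };

    void apply(const Op&) { /* apply one oplog entry, idempotently */ }

    // Replay everything in (startTs, minValidTs]; entries at or before startTs
    // are already reflected in the clone, and entries after minValidTs can be
    // left to steady-state replication.
    void applyThroughMinValid(const std::vector<Op>& oplog,
                              long long startTs, long long minValidTs) {
        for (const Op& op : oplog) {
            if (op.ts <= startTs) continue;
            if (op.ts > minValidTs) break;
            apply(op);
        }
    }

    int main() {
        std::vector<Op> oplog{{1}, {2}, {3}, {4}};
        applyThroughMinValid(oplog, /*startTs=*/1, /*minValidTs=*/3);  // applies ts 2, 3
    }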
Example 7
    void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
        DBClientConnection *them = r.conn();

        // fetch all first so we needn't handle interruption in a fancy way

        unsigned long long totSize = 0;

        list< pair<DocID,bo> > goodVersions;

        bo newMinValid;

        /* fetch all the goodVersions of each document from current primary */
        DocID d;
        unsigned long long n = 0;
        try {
            for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
                d = *i;

                assert( !d._id.eoo() );

                {
                    /* TODO : slow.  lots of round trips. */
                    n++;
                    bo good = them->findOne(d.ns, d._id.wrap()).getOwned();
                    totSize += good.objsize();
                    uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );

                    // note good might be eoo, indicating we should delete it
                    goodVersions.push_back(pair<DocID,bo>(d,good));
                }
            }
            newMinValid = r.getLastOp(rsoplog);
            if( newMinValid.isEmpty() ) {
                sethbmsg("rollback error newMinValid empty?");
                return;
            }
        }
        catch(DBException& e) {
            sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
            log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
            throw e;
        }

        MemoryMappedFile::flushAll(true);

        sethbmsg("rollback 3.5");
        if( h.rbid != getRBID(r.conn()) ) {
            // our source rolled back itself.  so the data we received isn't necessarily consistent.
            sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
            return;
        }

        // update them
        sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());

        bool warn = false;

        assert( !h.commonPointOurDiskloc.isNull() );

        dbMutex.assertWriteLocked();

        /* we have items we are writing that aren't from a point-in-time.  thus best not to come online
           until we get to that point in freshness. */
        setMinValid(newMinValid);

        /** any full collection resyncs required? */
        if( !h.collectionsToResync.empty() ) {
            for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
                string ns = *i;
                sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
                Client::Context c(*i, dbpath, 0, /*doauth*/false);
                try {
                    bob res;
                    string errmsg;
                    dropCollection(ns, errmsg, res);
                    {
                        dbtemprelease r;
                        bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false);
                        if( !ok ) {
                            log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog;
                            throw "rollback error resyncing rollection [1]";
                        }
                    }
                }
                catch(...) {
                    log() << "replset rollback error resyncing collection " << ns << rsLog;
                    throw "rollback error resyncing rollection [2]";
                }
            }

            /* we did more reading from primary, so check it again for a rollback (which would mess us up), and
               make minValid newer.
               */
            sethbmsg("rollback 4.2");
            {
                string err;
                try {
                    newMinValid = r.getLastOp(rsoplog);
                    if( newMinValid.isEmpty() ) {
                        err = "can't get minvalid from primary";
                    }
                    else {
                        setMinValid(newMinValid);
                    }
                }
                catch(...) {
                    err = "can't get/set minvalid";
                }
                if( h.rbid != getRBID(r.conn()) ) {
                    // our source rolled back itself.  so the data we received isn't necessarily consistent.
                    // however, we've now done writes.  thus we have a problem.
                    err += "rbid at primary changed during resync/rollback";
                }
                if( !err.empty() ) {
                    log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
                    /* todo: reset minvalid so that we are permanently in fatal state */
                    /* todo: don't be fatal, but rather, get all the data first. */
                    sethbmsg("rollback error");
                    throw rsfatal();
                }
            }
            sethbmsg("rollback 4.3");
        }

        sethbmsg("rollback 4.6");
        /** drop collections to drop before doing individual fixups - that might make things faster below actually if there were subsequent inserts to rollback */
        for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
            Client::Context c(*i, dbpath, 0, /*doauth*/false);
            try {
                bob res;
                string errmsg;
                log(1) << "replSet rollback drop: " << *i << rsLog;
                dropCollection(*i, errmsg, res);
            }
            catch(...) {
                log() << "replset rollback error dropping collection " << *i << rsLog;
            }
        }

        sethbmsg("rollback 4.7");
        Client::Context c(rsoplog, dbpath, 0, /*doauth*/false);
        NamespaceDetails *oplogDetails = nsdetails(rsoplog);
        uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);

        map<string,shared_ptr<RemoveSaver> > removeSavers;

        unsigned deletes = 0, updates = 0;
        for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
            const DocID& d = i->first;
            bo pattern = d._id.wrap(); // { _id : ... }
            try {
                assert( d.ns && *d.ns );
                if( h.collectionsToResync.count(d.ns) ) {
                    /* we just synced this entire collection */
                    continue;
                }

                /* keep an archive of items rolled back */
                shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
                if ( ! rs )
                    rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );

                // todo: lots of overhead in context, this can be faster
                Client::Context c(d.ns, dbpath, 0, /*doauth*/false);
                if( i->second.isEmpty() ) {
                    // wasn't on the primary; delete.
                    /* TODO1.6 : can't delete from a capped collection.  need to handle that here. */
                    deletes++;

                    NamespaceDetails *nsd = nsdetails(d.ns);
                    if( nsd ) {
                        if( nsd->capped ) {
                            /* can't delete from a capped collection - so we truncate instead. if this item must go,
                            so must all successors!!! */
                            try {
                                /** todo: IIRC cappedTruncateAfter does not handle a completely empty collection.  todo. */
                                // this will be crazy slow if there is no _id index.
                                long long start = Listener::getElapsedTimeMillis();
                                DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
                                if( Listener::getElapsedTimeMillis() - start > 200 )
                                    log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
                                //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
                                if( !loc.isNull() ) {
                                    try {
                                        nsd->cappedTruncateAfter(d.ns, loc, true);
                                    }
                                    catch(DBException& e) {
                                        if( e.getCode() == 13415 ) {
                                            // hack: need to just make cappedTruncate do this...
                                            nsd->emptyCappedCollection(d.ns);
                                        }
                                        else {
                                            throw;
                                        }
                                    }
                                }
                            }
                            catch(DBException& e) {
                                log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
                            }
                        }
                        else {
                            try {
                                deletes++;
                                deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
                            }
                            catch(...) {
                                log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
                            }
                        }
                        // did we just empty the collection?  if so let's check if it even exists on the source.
                        if( nsd->stats.nrecords == 0 ) {
                            try {
                                string sys = cc().database()->name + ".system.namespaces";
                                bo o = them->findOne(sys, QUERY("name"<<d.ns));
                                if( o.isEmpty() ) {
                                    // we should drop
                                    try {
                                        bob res;
                                        string errmsg;
                                        dropCollection(d.ns, errmsg, res);
                                    }
                                    catch(...) {
                                        log() << "replset error rolling back collection " << d.ns << rsLog;
                                    }
                                }
                            }
                            catch(DBException& ) {
                                /* this isn't *that* big a deal, but is bad. */
                                log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
                            }
                        }
                    }
                }
                else {
                    // todo faster...
                    OpDebug debug;
                    updates++;
                    _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
                }
            }
            catch(DBException& e) {
                log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
                warn = true;
            }
        }

        removeSavers.clear(); // this effectively closes all of them

        sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
        MemoryMappedFile::flushAll(true);
        sethbmsg("rollback 6");

        // clean up oplog
        log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
        // todo: fatal error if this throws?
        oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);

        /* reset cached lastoptimewritten and h value */
        loadLastOpTimeWritten();

        sethbmsg("rollback 7");
        MemoryMappedFile::flushAll(true);

        // done
        if( warn )
            sethbmsg("issues during syncRollback, see log");
        else
            sethbmsg("rollback done");
    }
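
For each refetched _id, the long middle loop reduces to a three-way decision: if the rollback source still has a version of the document, it is upserted over the local copy; if not, the document is deleted, except in capped collections, which cannot delete individual records and therefore truncate from that record onward. A condensed sketch of that decision, with simplified types in place of the real BSON and NamespaceDetails machinery:

    // "Good version" is whatever the rollback source currently holds for a
    // given _id; an empty result means the source no longer has the document.
    struct GoodVersion { bool emptyOnSource; /* BSONObj body ... */ };

    enum class FixupAction { Delete, TruncateCapped, Upsert };

    FixupAction chooseFixup(const GoodVersion& good, bool collectionIsCapped) {
        if (!good.emptyOnSource) return FixupAction::Upsert;
        return collectionIsCapped ? FixupAction::TruncateCapped : FixupAction::Delete;
    }

    int main() {
        GoodVersion gone{true};
        return chooseFixup(gone, /*collectionIsCapped=*/true)
                == FixupAction::TruncateCapped ? 0 : 1;
    }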
Example 8
/**
 * Do the initial sync for this member.
 * This code could use a little refactoring; it's a bit ugly.
 */
bool ReplSetImpl::_syncDoInitialSync() {
    sethbmsg("initial sync pending",0);
    bool needsFullSync = gtidManager->getLiveState().isInitial();
    bool needGapsFilled = needsFullSync || replSettings.fastsync;

    // if this is the first node, it may have already become primary
    if ( box.getState().primary() ) {
        sethbmsg("I'm already primary, no need for initial sync",0);
        return true;
    }

    const Member *source = NULL;
    OplogReader r;
    string sourceHostname;
    // only bother making a connection if we need to connect for some reason
    if (needGapsFilled) {
        source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return false;
        }

        sourceHostname = source->h().toString();
        if( !r.connect(sourceHostname, 0) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return false;
        }
    }

    if( needsFullSync ) {
        BSONObj lastOp = r.getLastOp();
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return false;
        }

        {
            LOCK_REASON(lockReason, "repl: initial sync drop all databases");
            Lock::GlobalWrite lk(lockReason);
            Client::Transaction dropTransaction(DB_SERIALIZABLE);
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();
            dropTransaction.commit();
        }

        // now deal with creation of oplog
        // first delete any existing data in the oplog

        {
            LOCK_REASON(lockReason, "repl: create oplog");
            Lock::DBWrite lk("local", lockReason);
            Client::Transaction fileOpsTransaction(DB_SERIALIZABLE);
            deleteOplogFiles();
            fileOpsTransaction.commit(0);
        }

        try {
            sethbmsg("initial sync clone all databases", 0);

            shared_ptr<DBClientConnection> conn(r.conn_shared());
            RemoteTransaction rtxn(*conn, "mvcc");

            list<string> dbs = conn->getDatabaseNamesForRepl();

            //
            // Not sure if it is necessary to have a separate fileOps
            // transaction and clone transaction. The cloneTransaction
            // has a higher chance of failing, and I don't know at the moment
            // if it is ok to do fileops successfully, and then an operation (cloning) that
            // later causes an abort. So, to be cautious, they are separate

            {
                LOCK_REASON(lockReason, "repl: initial sync");
                Lock::GlobalWrite lk(lockReason);
                Client::Transaction cloneTransaction(DB_SERIALIZABLE);
                bool ret = _syncDoInitialSync_clone(sourceHostname.c_str(), dbs, conn);

                if (!ret) {
                    veto(source->fullName(), 600);
                    cloneTransaction.abort();
                    sleepsecs(300);
                    return false;
                }

                // at this point, we have copied all of the data from the
                // remote machine. Now we need to copy the replication information
                // on the remote machine's local database, we need to copy
                // the entire (small) replInfo dictionary, and the necessary portion
                // of the oplog

                // first copy the replInfo, as we will use its information
                // to determine how much of the opLog to copy
                {
                    Client::Context ctx( "local" );
                    BSONObj q;
                    cloneCollection(conn,
                                    "local",
                                    rsReplInfo,
                                    q,
                                    true, //copyIndexes
                                    false //logForRepl
                                   );

                    // copy entire oplog (probably overkill)
                    cloneCollection(conn,
                                    "local",
                                    rsoplog,
                                    q,
                                    true, //copyIndexes
                                    false //logForRepl
                                   );

                    // copy entire oplog.refs (probably overkill)
                    cloneCollection(conn,
                                    "local",
                                    rsOplogRefs,
                                    q,
                                    true, //copyIndexes
                                    false //logForRepl
                                   );
                }
                cloneTransaction.commit(0);
            }

            bool ok = rtxn.commit();
            verify(ok);  // absolutely no reason this should fail, it was read only
            // data should now be consistent
        }
        catch (DBException &e) {
            sethbmsg("exception trying to copy data", 0);
            LOG(0) << e.getCode() << ": " << e.what() << endl;
            sleepsecs(1);
            return false;
        }
    }
    if (needGapsFilled) {
        _fillGaps(&r);
    }
    GTID dummy;
    applyMissingOpsInOplog(dummy);

    sethbmsg("initial sync done",0);

    return true;
}
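
What sets this last variant apart is that the clone runs inside serializable transactions, so a failed clone simply aborts instead of leaning on the _initialSyncFlag restart machinery of the other variants. A rough sketch of that shape only; the transaction type below is a hypothetical stand-in, not the real Client::Transaction API:

    #include <cstdio>

    struct Transaction {
        void commit() { committed = true; }
        void abort()  { /* roll back everything done under this txn */ }
        bool committed = false;
    };

    bool cloneAllDatabases() { /* ... copy data under the txn ... */ return true; }

    bool initialSyncClone() {
        Transaction txn;  // DB_SERIALIZABLE in the real code
        if (!cloneAllDatabases()) {
            txn.abort();  // nothing partial survives the failure
            return false;
        }
        txn.commit();
        return true;
    }

    int main() {
        std::printf("clone %s\n", initialSyncClone() ? "committed" : "aborted");
    }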