Exemplo n.º 1
0
BSONObj SyncTail::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) {
    OplogReader missingObjReader;  // why are we using OplogReader to run a non-oplog query?
    const char* ns = o.getStringField("ns");

    // capped collections
    Collection* collection = db->getCollection(ns);
    if (collection && collection->isCapped()) {
        log() << "missing doc, but this is okay for a capped collection (" << ns << ")";
        return BSONObj();
    }

    const int retryMax = 3;
    for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
        if (retryCount != 1) {
            // if we are retrying, sleep a bit to let the network possibly recover
            sleepsecs(retryCount * retryCount);
        }
        try {
            bool ok = missingObjReader.connect(HostAndPort(_hostname));
            if (!ok) {
                warning() << "network problem detected while connecting to the "
                          << "sync source, attempt " << retryCount << " of " << retryMax << endl;
                continue;  // try again
            }
        } catch (const SocketException&) {
            warning() << "network problem detected while connecting to the "
                      << "sync source, attempt " << retryCount << " of " << retryMax << endl;
            continue;  // try again
        }

        // get _id from oplog entry to create query to fetch document.
        const BSONElement opElem = o.getField("op");
        const bool isUpdate = !opElem.eoo() && opElem.str() == "u";
        const BSONElement idElem = o.getObjectField(isUpdate ? "o2" : "o")["_id"];

        if (idElem.eoo()) {
            severe() << "cannot fetch missing document without _id field: " << o.toString();
            fassertFailedNoTrace(28742);
        }

        BSONObj query = BSONObjBuilder().append(idElem).obj();
        BSONObj missingObj;
        try {
            missingObj = missingObjReader.findOne(ns, query);
        } catch (const SocketException&) {
            warning() << "network problem detected while fetching a missing document from the "
                      << "sync source, attempt " << retryCount << " of " << retryMax << endl;
            continue;  // try again
        } catch (DBException& e) {
            error() << "assertion fetching missing object: " << e.what() << endl;
            throw;
        }

        // success!
        return missingObj;
    }
    // retry count exceeded
    msgasserted(15916,
                str::stream() << "Can no longer connect to initial sync source: " << _hostname);
}
Exemplo n.º 2
0
    void ReplSetImpl::syncTail() { 
        // todo : locking vis a vis the mgr...

        const Member *primary = box.getPrimary();
        if( primary == 0 ) return;
        string hn = primary->h().toString();
        OplogReader r;
        if( !r.connect(primary->h().toString()) ) { 
            log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
            return;
        }

        /* first make sure we are not hopelessly out of sync by being very stale. */
        {
            BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
            OpTime ts = remoteOldestOp["ts"]._opTime();
            DEV log() << "remoteOldestOp: " << ts.toStringPretty() << endl;
            else log(3) << "remoteOldestOp: " << ts.toStringPretty() << endl;
            if( lastOpTimeWritten < ts ) { 
                log() << "replSet error too stale to catch up, at least from primary " << hn << rsLog;
                log() << "replSet our last optime : " << lastOpTimeWritten.toStringPretty() << rsLog;
                log() << "replSet oldest at " << hn << " : " << ts.toStringPretty() << rsLog;
                log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
                sethbmsg("error too stale to catch up");
                sleepsecs(120);
                return;
            }
        }
Exemplo n.º 3
0
    int run() {

        Client::initThread( "oplogreplay" );

        toolInfoLog() << "going to connect" << std::endl;
        
        OplogReader r;
        r.setTailingQueryOptions( QueryOption_SlaveOk | QueryOption_AwaitData );
        r.connect(mongoOplogGlobalParams.from);

        toolInfoLog() << "connected" << std::endl;

        OpTime start(time(0) - mongoOplogGlobalParams.seconds, 0);
        toolInfoLog() << "starting from " << start.toStringPretty() << std::endl;

        r.tailingQueryGTE(mongoOplogGlobalParams.ns.c_str(), start);

        int num = 0;
        while ( r.more() ) {
            BSONObj o = r.next();
            if (logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(2))) {
                toolInfoLog() << o << std::endl;
            }
            
            if ( o["$err"].type() ) {
                toolError() << "error getting oplog" << std::endl;
                toolError() << o << std::endl;
                return -1;
            }
                

            bool print = ++num % 100000 == 0;
            if (print) {
                toolInfoLog() << num << "\t" << o << std::endl;
            }
            
            if ( o["op"].String() == "n" )
                continue;

            BSONObjBuilder b( o.objsize() + 32 );
            BSONArrayBuilder updates( b.subarrayStart( "applyOps" ) );
            updates.append( o );
            updates.done();

            BSONObj c = b.obj();
            
            BSONObj res;
            bool ok = conn().runCommand( "admin" , c , res );
            if (!ok) {
                toolError() << res << std::endl;
            } else if (print) {
                toolInfoLog() << res << std::endl;
            }
        }

        return 0;
    }
Exemplo n.º 4
0
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL, *stale = NULL;
        BSONObj oldest;

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                _currentSyncTarget = NULL;
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer) {
                _condvar.wait(lock);
            }
        }

        verify(r.conn() == NULL);

        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                r.resetConnection();
                theReplSet->veto(current);
                continue;
            }

            if (isStale(r, oldest)) {
                r.resetConnection();
                theReplSet->veto(current, 600);
                stale = target;
                continue;
            }

            // if we made it here, the target is up and not stale
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }

            return;
        }

        // the only viable sync target was stale
        if (stale) {
            theReplSet->goStale(stale, oldest);
            sleepsecs(120);
        }

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
Exemplo n.º 5
0
Arquivo: sync.cpp Projeto: wjin/mongo
    BSONObj Sync::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) {
        OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query?
        const char *ns = o.getStringField("ns");

        // capped collections
        Collection* collection = db->getCollection(ns);
        if ( collection && collection->isCapped() ) {
            log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl;
            return BSONObj();
        }

        const int retryMax = 3;
        for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
            if (retryCount != 1) {
                // if we are retrying, sleep a bit to let the network possibly recover
                sleepsecs(retryCount * retryCount);
            }
            try {
                bool ok = missingObjReader.connect(HostAndPort(hn));
                if (!ok) {
                    warning() << "network problem detected while connecting to the "
                              << "sync source, attempt " << retryCount << " of "
                              << retryMax << endl;
                        continue;  // try again
                }
            } 
            catch (const SocketException&) {
                warning() << "network problem detected while connecting to the "
                          << "sync source, attempt " << retryCount << " of "
                          << retryMax << endl;
                continue; // try again
            }

            // might be more than just _id in the update criteria
            BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
            BSONObj missingObj;
            try {
                missingObj = missingObjReader.findOne(ns, query);
            } 
            catch (const SocketException&) {
                warning() << "network problem detected while fetching a missing document from the "
                          << "sync source, attempt " << retryCount << " of "
                          << retryMax << endl;
                continue; // try again
            } 
            catch (DBException& e) {
                log() << "replication assertion fetching missing object: " << e.what() << endl;
                throw;
            }

            // success!
            return missingObj;
        }
        // retry count exceeded
        msgasserted(15916, 
                    str::stream() << "Can no longer connect to initial sync source: " << hn);
    }
Exemplo n.º 6
0
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL, *stale = NULL;
        BSONObj oldest;

        verify(r.conn() == NULL);
        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                r.resetConnection();
                theReplSet->veto(current);
                continue;
            }

            // if we made it here, the target is up and not stale
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }

            return;
        }

        // the only viable sync target was stale
        if (stale) {
            GTID remoteOldestGTID = getGTIDFromBSON("_id", oldest);
            theReplSet->goStale(stale, remoteOldestGTID);
            // vanilla Mongo used to do a sleep of 120 seconds here
            // We removed it. It seems excessive, and if this machine is doing
            // nothing anyway, sleeping won't help. It might as well
            // return with a null sync target, and produce() will handle
            // that fact and sleep one second
        }

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
Exemplo n.º 7
0
    /**
     * Do the initial sync for this member.  There are several steps to this process:
     *
     *     0. Add _initialSyncFlag to minValid to tell us to restart initial sync if we
     *        crash in the middle of this procedure
     *     1. Record start time.
     *     2. Clone.
     *     3. Set minValid1 to sync target's latest op time.
     *     4. Apply ops from start to minValid1, fetching missing docs as needed.
     *     5. Set minValid2 to sync target's latest op time.
     *     6. Apply ops from minValid1 to minValid2.
     *     7. Build indexes.
     *     8. Set minValid3 to sync target's latest op time.
     *     9. Apply ops from minValid2 to minValid3.
          10. Clean up minValid and remove _initialSyncFlag field
     *
     * At that point, initial sync is finished.  Note that the oplog from the sync target is applied
     * three times: step 4, 6, and 8.  4 may involve refetching, 6 should not.  By the end of 6,
     * this member should have consistent data.  8 is "cosmetic," it is only to get this member
     * closer to the latest op time before it can transition to secondary state.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        replset::SyncTail tail(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        // written by applyToHead calls
        BSONObj minValid;

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            // Add field to minvalid document to tell us to restart initial sync if we crash
            theReplSet->setInitialSyncFlag();

            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            Cloner cloner;
            if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, true)) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);

            log() << "oplog sync 1 of 3" << endl;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // Now we sync to the latest op on the sync target _again_, as we may have recloned ops
            // that were "from the future" compared with minValid. During this second application,
            // nothing should need to be recloned.
            log() << "oplog sync 2 of 3" << endl;
            if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) {
                return;
            }
            // data should now be consistent

            lastOp = minValid;

            sethbmsg("initial sync building indexes",0);
            if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, false)) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        log() << "oplog sync 3 of 3" << endl;
        if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) {
            return;
        }
        
        // ---------

        Status status = getGlobalAuthorizationManager()->initialize();
        if (!status.isOK()) {
            warning() << "Failed to reinitialize auth data after initial sync. " << status;
            return;
        }

        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }

            // Initial sync is now complete.  Flag this by setting minValid to the last thing
            // we synced.
            theReplSet->setMinValid(minValid);

            // Clear the initial sync flag.
            theReplSet->clearInitialSyncFlag();

            cx.ctx().db()->flushFiles(true);
        }
        {
            boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
            theReplSet->initialSyncRequested = false;
        }

        // If we just cloned & there were no ops applied, we still want the primary to know where
        // we're up to
        replset::BackgroundSync::notify();

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
Exemplo n.º 8
0
    void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const {
        const Member *m = findById(server_id);
        if( m == 0 ) {
            ss << "Error : can't find a member with id: " << server_id << '\n';
            return;
        }

        ss << p("Server : " + m->fullName() + "<br>ns : " + rsoplog );

        //const bo fields = BSON( "o" << false << "o2" << false );
        const bo fields;

        /** todo fix we might want an so timeout here */
        OplogReader reader;

        if (reader.connect(m->h()) == false) {
            ss << "couldn't connect to " << m->fullName();
            return;
        }

        reader.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields);
        if ( !reader.haveCursor() ) {
            ss << "couldn't query " << rsoplog;
            return;
        }
        static const char *h[] = {"ts","optime","h","op","ns","rest",0};

        ss << "<style type=\"text/css\" media=\"screen\">"
           "table { font-size:75% }\n"
           // "th { background-color:#bbb; color:#000 }\n"
           // "td,th { padding:.25em }\n"
           "</style>\n";

        ss << table(h, true);
        //ss << "<pre>\n";
        int n = 0;
        OpTime otFirst;
        OpTime otLast;
        OpTime otEnd;
        while( reader.more() ) {
            bo o = reader.next();
            otLast = o["ts"]._opTime();
            if( otFirst.isNull() )
                otFirst = otLast;
            say(ss, o);
            n++;
        }
        if( n == 0 ) {
            ss << rsoplog << " is empty\n";
        }
        else {
            reader.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
            if( !reader.haveCursor() ) {
                ss << "couldn't query [2] " << rsoplog;
                return;
            }
            string x;
            bo o = reader.next();
            otEnd = o["ts"]._opTime();
            while( 1 ) {
                stringstream z;
                if( o["ts"]._opTime() == otLast )
                    break;
                say(z, o);
                x = z.str() + x;
                if( !reader.more() )
                    break;
                o = reader.next();
            }
            if( !x.empty() ) {
                ss << "<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>\n" << x;
                //ss << "\n...\n\n" << x;
            }
        }
        ss << _table();
        ss << p(time_t_to_String_short(time(0)) + " current time");

        if( !otEnd.isNull() ) {
            ss << "<p>Log length in time: ";
            unsigned d = otEnd.getSecs() - otFirst.getSecs();
            double h = d / 3600.0;
            ss.precision(3);
            if( h < 72 )
                ss << h << " hours";
            else
                ss << h / 24.0 << " days";
            ss << "</p>\n";
        }
    }
Exemplo n.º 9
0
    /**
     * Do the initial sync for this member.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);
            
            BSONObj minValid;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // reset state, as that "didn't count"
            emptyOplog(); 
            lastOpTimeWritten = OpTime();
            lastH = 0;

            sethbmsg("initial sync building indexes",0);
            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        sethbmsg("initial sync query minValid",0);

        BSONObj minValid;
        if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) {
            return;
        }
        
        // ---------


        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }
            Helpers::putSingleton("local.replset.minvalid", minValid);
            cx.ctx().db()->flushFiles(true);
        }

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
Exemplo n.º 10
0
    /* initial oplog application, during initial sync, after cloning. 
       @return false on failure.  
       this method returns an error and doesn't throw exceptions (i think).
    */
    bool ReplSetImpl::initialSyncOplogApplication(
        const Member *source,
        OpTime applyGTE,
        OpTime minValid)
    { 
        if( source == 0 ) return false;

        const string hn = source->h().toString();
        OpTime ts;
        try {
            OplogReader r;
            if( !r.connect(hn) ) { 
                log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
                return false;
            }

            {
                BSONObjBuilder q;
                q.appendDate("$gte", applyGTE.asDate());
                BSONObjBuilder query;
                query.append("ts", q.done());
                BSONObj queryObj = query.done();
                r.query(rsoplog, queryObj);
            }
            assert( r.haveCursor() );

            /* we lock outside the loop to avoid the overhead of locking on every operation.  server isn't usable yet anyway! */
            writelock lk("");

            {
                if( !r.more() ) { 
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElement().fieldName() == string("$err") ) { 
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }
            }

            // todo : use exhaust
            unsigned long long n = 0;
            while( 1 ) { 

                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                {
                    ts = o["ts"]._opTime();

                    /* if we have become primary, we dont' want to apply things from elsewhere
                        anymore. assumePrimary is in the db lock so we are safe as long as 
                        we check after we locked above. */
                    if( (source->state() != MemberState::RS_PRIMARY &&
                         source->state() != MemberState::RS_SECONDARY) ||
                        replSetForceInitialSyncFailure ) {
                        
                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) {
                        // optimes before we started copying need not be applied.
                        syncApply(o);
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }
                if( ++n % 100000 == 0 ) { 
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication " << n << rsLog;
                }
            }
        }
        catch(DBException& e) { 
            if( ts <= minValid ) {
                // didn't make it far enough
                log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
                return false;
            }
        }
        return true;
    }
Exemplo n.º 11
0
    BSONObj Sync::getMissingDoc(const BSONObj& o) {
        OplogReader missingObjReader;
        const char *ns = o.getStringField("ns");

        // capped collections
        NamespaceDetails *nsd = nsdetails(ns);
        if ( nsd && nsd->isCapped() ) {
            log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl;
            return BSONObj();
        }

        uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn));

        // might be more than just _id in the update criteria
        BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
        BSONObj missingObj;
        try {
            missingObj = missingObjReader.findOne(ns, query);
        } catch(DBException& e) {
            log() << "replication assertion fetching missing object: " << e.what() << endl;
            throw;
        }

        return missingObj;
    }
Exemplo n.º 12
0
    /**
     * Do the initial sync for this member.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }
        
        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }
        OpTime startingTS = lastOp["ts"]._opTime();

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();
            for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
                string db = *i;
                if( db != "local" ) {
                    sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
                    bool ok;
                    {
                        writelock lk(db);
                        Client::Context ctx(db);
                        ok = clone(sourceHostname.c_str(), db);
                    }
                    if( !ok ) {
                        sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0);
                        sleepsecs(300);
                        return;
                    }
                }
            }
        }

        sethbmsg("initial sync query minValid",0);

        isyncassert( "initial sync source must remain readable throughout our initial sync", source->state().readable() );

        /* our cloned copy will be strange until we apply oplog events that occurred
           through the process.  we note that time point here. */
        BSONObj minValid = r.getLastOp(rsoplog);
        isyncassert( "getLastOp is empty ", !minValid.isEmpty() );
        OpTime mvoptime = minValid["ts"]._opTime();
        assert( !mvoptime.isNull() );

        /* apply relevant portion of the oplog
        */
        {
            isyncassert( str::stream() << "initial sync source must remain readable throughout our initial sync [2] state now: " << source->state().toString() , source->state().readable() );
            if( ! initialSyncOplogApplication(source, /*applyGTE*/startingTS, /*minValid*/mvoptime) ) { // note we assume here that this call does not throw
                log() << "replSet initial sync failed during applyoplog" << rsLog;
                emptyOplog(); // otherwise we'll be up!
                
                lastOpTimeWritten = OpTime();
                lastH = 0;
                
                log() << "replSet cleaning up [1]" << rsLog;
                {
                    writelock lk("local.");
                    Client::Context cx( "local." );
                    cx.db()->flushFiles(true);
                }
                log() << "replSet cleaning up [2]" << rsLog;
                sleepsecs(5);
                return;
            }
        }

        sethbmsg("initial sync finishing up",0);

        assert( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            writelock lk("local.");
            Client::Context cx( "local." );
            cx.db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }
            Helpers::putSingleton("local.replset.minvalid", minValid);
            cx.db()->flushFiles(true);
        }

        sethbmsg("initial sync done",0);
    }
Exemplo n.º 13
0
    bool ReplSetImpl::_initialSyncOplogApplication(OplogReader& r, const Member *source,
        const OpTime& applyGTE, const OpTime& minValid) {

        const string hn = source->fullName();
        OplogReader missingObjReader;
        try {
            r.tailingQueryGTE( rsoplog, applyGTE );
            if ( !r.haveCursor() ) {
                log() << "replSet initial sync oplog query error" << rsLog;
                return false;
            }

            {
                if( !r.more() ) {
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElementFieldName() == string("$err") ) {
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }

                sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                         << t.toStringPretty() << " to " << minValid.toStringPretty());
            }
        }
        catch(DBException& e) {
            log() << "replSet initial sync failing: " << e.toString() << rsLog;
            return false;
        }

        /* we lock outside the loop to avoid the overhead of locking on every operation. */
        writelock lk("");

        // todo : use exhaust
        OpTime ts;
        time_t start = time(0);
        unsigned long long n = 0;
        while( 1 ) {
            try {
                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                ts = o["ts"]._opTime();

                {
                    if( (source->state() != MemberState::RS_PRIMARY &&
                            source->state() != MemberState::RS_SECONDARY) ||
                            replSetForceInitialSyncFailure ) {

                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) { // optimes before we started copying need not be applied.
                        bool failedUpdate = syncApply(o);
                        if( failedUpdate ) {
                            // we don't have the object yet, which is possible on initial sync.  get it.
                            log() << "replSet info adding missing object" << endl; // rare enough we can log
                            if( !missingObjReader.connect(hn) ) { // ok to call more than once
                                log() << "replSet initial sync fails, couldn't connect to " << hn << endl;
                                return false;
                            }
                            const char *ns = o.getStringField("ns");
                            BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); // might be more than just _id in the update criteria
                            BSONObj missingObj;
                            try {
                                missingObj = missingObjReader.findOne(
                                    ns, 
                                    query );
                            } catch(...) { 
                                log() << "replSet assertion fetching missing object" << endl;
                                throw;
                            }
                            if( missingObj.isEmpty() ) { 
                                log() << "replSet missing object not found on source. presumably deleted later in oplog" << endl;
                                log() << "replSet o2: " << o.getObjectField("o2").toString() << endl;
                                log() << "replSet o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
                            }
                            else {
                                Client::Context ctx(ns);
                                try {
                                    DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
                                    assert( !d.isNull() );
                                } catch(...) { 
                                    log() << "replSet assertion during insert of missing object" << endl;
                                    throw;
                                }
                                // now reapply the update from above
                                bool failed = syncApply(o);
                                if( failed ) {
                                    log() << "replSet update still fails after adding missing object " << ns << endl;
                                    assert(false);
                                }
                            }
                        }
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }

                if ( ++n % 1000 == 0 ) {
                    time_t now = time(0);
                    if (now - start > 10) {
                        // simple progress metering
                        log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                              << ts.toStringPretty() << rsLog;
                        start = now;
                    }
                }

                if ( ts > minValid ) {
                    break;
                }

                getDur().commitIfNeeded();
            }
            catch (DBException& e) {
                // skip duplicate key exceptions
                if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                    continue;
                }
                
                // handle cursor not found (just requery)
                if( e.getCode() == 13127 ) {
                    log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
                    r.resetCursor();
                    r.tailingQueryGTE(rsoplog, ts);
                    if( r.haveCursor() ) {
                        continue;
                    }
                }

                // TODO: handle server restart

                if( ts <= minValid ) {
                    // didn't make it far enough
                    log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
                    return false;
                }

                // otherwise, whatever
                break;
            }
        }
        return true;
    }
Exemplo n.º 14
0
    /* initial oplog application, during initial sync, after cloning.
       @return false on failure.
       this method returns an error and doesn't throw exceptions (i think).
    */
    bool ReplSetImpl::initialSyncOplogApplication(
        const Member *source,
        OpTime applyGTE,
        OpTime minValid) {
        if( source == 0 ) return false;

        const string hn = source->h().toString();
        OplogReader r;
        try {
            if( !r.connect(hn) ) {
                log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
                return false;
            }

            r.queryGTE( rsoplog, applyGTE );
            assert( r.haveCursor() );

            {
                if( !r.more() ) {
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElement().fieldName() == string("$err") ) {
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }

                sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                         << t.toStringPretty() << " to " << minValid.toStringPretty());
            }
        }
        catch(DBException& e) {
            log() << "replSet initial sync failing: " << e.toString() << rsLog;
            return false;
        }

        /* we lock outside the loop to avoid the overhead of locking on every operation. */
        writelock lk("");

        // todo : use exhaust
        OpTime ts;
        time_t start = time(0);
        unsigned long long n = 0;
        while( 1 ) {
            try {
                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                {
                    ts = o["ts"]._opTime();

                    /* if we have become primary, we dont' want to apply things from elsewhere
                        anymore. assumePrimary is in the db lock so we are safe as long as
                        we check after we locked above. */
                    if( (source->state() != MemberState::RS_PRIMARY &&
                            source->state() != MemberState::RS_SECONDARY) ||
                            replSetForceInitialSyncFailure ) {

                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) {
                        // optimes before we started copying need not be applied.
                        syncApply(o);
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }

                if ( ++n % 1000 == 0 ) {
                    time_t now = time(0);
                    if (now - start > 10) {
                        // simple progress metering
                        log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                              << ts.toStringPretty() << rsLog;
                        start = now;
                    }
                }
                
                getDur().commitIfNeeded();
            }
            catch (DBException& e) {
                // skip duplicate key exceptions
                if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                    continue;
                }
                
                // handle cursor not found (just requery)
                if( e.getCode() == 13127 ) {
                    r.resetCursor();
                    r.queryGTE(rsoplog, ts);
                    if( r.haveCursor() ) {
                        continue;
                    }
                }

                // TODO: handle server restart

                if( ts <= minValid ) {
                    // didn't make it far enough
                    log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
                    return false;
                }

                // otherwise, whatever
                break;
            }
        }
        return true;
    }
Exemplo n.º 15
0
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL, *stale = NULL;
        BSONObj oldest;

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                _currentSyncTarget = NULL;
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer) {
                _condvar.wait(lock);
            }
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }

        verify(r.conn() == NULL);

        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                r.resetConnection();
                theReplSet->veto(current);
                continue;
            }

            if (isStale(r, oldest)) {
                r.resetConnection();
                theReplSet->veto(current, 600);
                stale = target;
                continue;
            }

            // if we made it here, the target is up and not stale
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }
            {
                // prevent writers from blocking readers during fsync
                SimpleMutex::scoped_lock fsynclk(filesLockedFsync);
                // we don't need the local write lock yet, but it's needed by ensureMe()
                // so we take it preemptively to avoid deadlocking.
                Lock::DBWrite lk("local");

                theReplSet->syncSourceFeedback.connect(target);
            }

            return;
        }

        // the only viable sync target was stale
        if (stale) {
            theReplSet->goStale(stale, oldest);
            sleepsecs(120);
        }

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
Exemplo n.º 16
0
    /**
     * Do the initial sync for this member.  There are several steps to this process:
     *
     *     1. Record start time.
     *     2. Clone.
     *     3. Set minValid1 to sync target's latest op time.
     *     4. Apply ops from start to minValid1, fetching missing docs as needed.
     *     5. Set minValid2 to sync target's latest op time.
     *     6. Apply ops from minValid1 to minValid2.
     *     7. Build indexes.
     *     8. Set minValid3 to sync target's latest op time.
     *     9. Apply ops from minValid2 to minValid3.
     *
     * At that point, initial sync is finished.  Note that the oplog from the sync target is applied
     * three times: step 4, 6, and 8.  4 may involve refetching, 6 should not.  By the end of 6,
     * this member should have consistent data.  8 is "cosmetic," it is only to get this member
     * closer to the latest op time before it can transition to secondary state.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        replset::SyncTail tail(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        // written by applyToHead calls
        BSONObj minValid;

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);

            log() << "oplog sync 1 of 3" << endl;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // Now we sync to the latest op on the sync target _again_, as we may have recloned ops
            // that were "from the future" compared with minValid. During this second application,
            // nothing should need to be recloned.
            log() << "oplog sync 2 of 3" << endl;
            if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) {
                return;
            }
            // data should now be consistent

            lastOp = minValid;

            sethbmsg("initial sync building indexes",0);
            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        log() << "oplog sync 3 of 3" << endl;
        if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) {
            return;
        }
        
        // ---------


        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }

            theReplSet->setMinValid(minValid);

            cx.ctx().db()->flushFiles(true);
        }

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
Exemplo n.º 17
0
/**
 * Do the initial sync for this member.
 * This code can use a little refactoring. bit ugly
 */
bool ReplSetImpl::_syncDoInitialSync() {
    sethbmsg("initial sync pending",0);
    bool needsFullSync = gtidManager->getLiveState().isInitial();
    bool needGapsFilled = needsFullSync || replSettings.fastsync;

    // if this is the first node, it may have already become primary
    if ( box.getState().primary() ) {
        sethbmsg("I'm already primary, no need for initial sync",0);
        return true;
    }

    const Member *source = NULL;
    OplogReader r;
    string sourceHostname;
    // only bother making a connection if we need to connect for some reason
    if (needGapsFilled) {
        source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return false;
        }

        sourceHostname = source->h().toString();
        if( !r.connect(sourceHostname, 0) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return false;
        }
    }

    if( needsFullSync ) {
        BSONObj lastOp = r.getLastOp();
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return false;
        }

        {
            LOCK_REASON(lockReason, "repl: initial sync drop all databases");
            Lock::GlobalWrite lk(lockReason);
            Client::Transaction dropTransaction(DB_SERIALIZABLE);
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();
            dropTransaction.commit();
        }

        // now deal with creation of oplog
        // first delete any existing data in the oplog

        {
            LOCK_REASON(lockReason, "repl: create oplog");
            Lock::DBWrite lk("local", lockReason);
            Client::Transaction fileOpsTransaction(DB_SERIALIZABLE);
            deleteOplogFiles();
            fileOpsTransaction.commit(0);
        }

        try {
            sethbmsg("initial sync clone all databases", 0);

            shared_ptr<DBClientConnection> conn(r.conn_shared());
            RemoteTransaction rtxn(*conn, "mvcc");

            list<string> dbs = conn->getDatabaseNamesForRepl();

            //
            // Not sure if it is necessary to have a separate fileOps
            // transaction and clone transaction. The cloneTransaction
            // has a higher chance of failing, and I don't know at the moment
            // if it is ok to do fileops successfully, and then an operation (cloning) that
            // later causes an abort. So, to be cautious, they are separate

            {
                LOCK_REASON(lockReason, "repl: initial sync");
                Lock::GlobalWrite lk(lockReason);
                Client::Transaction cloneTransaction(DB_SERIALIZABLE);
                bool ret = _syncDoInitialSync_clone(sourceHostname.c_str(), dbs, conn);

                if (!ret) {
                    veto(source->fullName(), 600);
                    cloneTransaction.abort();
                    sleepsecs(300);
                    return false;
                }

                // at this point, we have copied all of the data from the
                // remote machine. Now we need to copy the replication information
                // on the remote machine's local database, we need to copy
                // the entire (small) replInfo dictionary, and the necessary portion
                // of the oplog

                // first copy the replInfo, as we will use its information
                // to determine  how much of the opLog to copy
                {
                    Client::Context ctx( "local" );
                    BSONObj q;
                    cloneCollection(conn,
                                    "local",
                                    rsReplInfo,
                                    q,
                                    true, //copyIndexes
                                    false //logForRepl
                                   );

                    // copy entire oplog (probably overkill)
                    cloneCollection(conn,
                                    "local",
                                    rsoplog,
                                    q,
                                    true, //copyIndexes
                                    false //logForRepl
                                   );

                    // copy entire oplog.refs (probably overkill)
                    cloneCollection(conn,
                                    "local",
                                    rsOplogRefs,
                                    q,
                                    true, //copyIndexes
                                    false //logForRepl
                                   );
                }
                cloneTransaction.commit(0);
            }

            bool ok = rtxn.commit();
            verify(ok);  // absolutely no reason this should fail, it was read only
            // data should now be consistent
        }
        catch (DBException &e) {
            sethbmsg("exception trying to copy data", 0);
            LOG(0) << e.getCode() << ": " << e.what() << endl;
            sleepsecs(1);
            return false;
        }
    }
    if (needGapsFilled) {
        _fillGaps(&r);
    }
    GTID dummy;
    applyMissingOpsInOplog(dummy);

    sethbmsg("initial sync done",0);

    return true;
}