Esempio n. 1
0
    bool BackgroundSync::isStale(OpTime lastOpTimeFetched, 
                                 OplogReader& r, 
                                 BSONObj& remoteOldestOp) {
        remoteOldestOp = r.findOne(rsoplog, Query());
        OpTime remoteTs = remoteOldestOp["ts"]._opTime();
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (lastOpTimeFetched >= remoteTs) {
                return false;
            }
            log() << "replSet remoteOldestOp:    " << remoteTs.toStringLong() << rsLog;
            log() << "replSet lastOpTimeFetched: " << lastOpTimeFetched.toStringLong() << rsLog;
        }

        return true;
    }
Esempio n. 2
0
 /**
  * Checks if the oplog given is too far ahead to read from.
  *
  * @param r the oplog
  * @param hn the hostname (for log messages)
  *
  * @return if we are stale compared to the oplog on hn
  */
 bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) {
     BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
     OpTime ts = remoteOldestOp["ts"]._opTime();
     DEV log() << "replSet remoteOldestOp:    " << ts.toStringLong() << rsLog;
     else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
Esempio n. 3
0
    /* slave: pull some data from the master's oplog
       note: not yet in db mutex at this point.
       @return -1 error
               0 ok, don't sleep
               1 ok, sleep
    */
    int ReplSource::sync_pullOpLog(int& nApplied) {
        int okResultCode = 1;
        string ns = string("local.oplog.$") + sourceName();
        log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n';

        bool tailing = true;
        oplogReader.tailCheck();

        bool initial = syncedTo.isNull();

        if ( !oplogReader.haveCursor() || initial ) {
            if ( initial ) {
                // Important to grab last oplog timestamp before listing databases.
                syncToTailOfRemoteLog();
                BSONObj info;
                bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
                massert( 10389 ,  "Unable to get database list", ok );
                BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
                while( i.moreWithEOO() ) {
                    BSONElement e = i.next();
                    if ( e.eoo() )
                        break;
                    string name = e.embeddedObject().getField( "name" ).valuestr();
                    if ( !e.embeddedObject().getBoolField( "empty" ) ) {
                        if ( name != "local" ) {
                            if ( only.empty() || only == name ) {
                                log( 2 ) << "adding to 'addDbNextPass': "******"$gte", syncedTo.asDate());
            BSONObjBuilder query;
            query.append("ts", q.done());
            if ( !only.empty() ) {
                // note we may here skip a LOT of data table scanning, a lot of work for the master.
                query.appendRegex("ns", string("^") + only); // maybe append "\\." here?
            }
            BSONObj queryObj = query.done();
            // e.g. queryObj = { ts: { $gte: syncedTo } }

            oplogReader.tailingQuery(ns.c_str(), queryObj);
            tailing = false;
        }
        else {
            log(2) << "repl: tailing=true\n";
        }

        if( !oplogReader.haveCursor() ) {
            problem() << "repl: dbclient::query returns null (conn closed?)" << endl;
            oplogReader.resetConnection();
            return -1;
        }

        // show any deferred database creates from a previous pass
        {
            set<string>::iterator i = addDbNextPass.begin();
            if ( i != addDbNextPass.end() ) {
                BSONObjBuilder b;
                b.append("ns", *i + '.');
                b.append("op", "db");
                BSONObj op = b.done();
                sync_pullOpLog_applyOperation(op, false);
            }
        }

        if ( !oplogReader.more() ) {
            if ( tailing ) {
                log(2) << "repl: tailing & no new activity\n";
                if( oplogReader.awaitCapable() )
                    okResultCode = 0; // don't sleep

            }
            else {
                log() << "repl:   " << ns << " oplog is empty\n";
            }
            {
                dblock lk;
                save();
            }
            return okResultCode;
        }

        OpTime nextOpTime;
        {
            BSONObj op = oplogReader.next();
            BSONElement ts = op.getField("ts");
            if ( ts.type() != Date && ts.type() != Timestamp ) {
                string err = op.getStringField("$err");
                if ( !err.empty() ) {
                    // 13051 is "tailable cursor requested on non capped collection"
                    if (op.getIntField("code") == 13051) {
                        problem() << "trying to slave off of a non-master" << '\n';
                        massert( 13344 ,  "trying to slave off of a non-master", false );
                    }
                    else {
                        problem() << "repl: $err reading remote oplog: " + err << '\n';
                        massert( 10390 ,  "got $err reading remote oplog", false );
                    }
                }
                else {
                    problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n';
                    massert( 10391 , "repl: bad object read from remote oplog", false);
                }
            }

            nextOpTime = OpTime( ts.date() );
            log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n';
            if ( initial ) {
                log(1) << "repl:   initial run\n";
            }
            if( tailing ) {
                if( !( syncedTo < nextOpTime ) ) {
                    log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl;
                    log() << "repl syncTo:     " << syncedTo.toStringLong() << endl;
                    log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl;
                    assert(false);
                }
                oplogReader.putBack( op ); // op will be processed in the loop below
                nextOpTime = OpTime(); // will reread the op below
            }
            else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error
                Nullstream& l = log();
                l << "repl:   nextOpTime " << nextOpTime.toStringLong() << ' ';
                if ( nextOpTime < syncedTo )
                    l << "<??";
                else
                    l << ">";

                l << " syncedTo " << syncedTo.toStringLong() << '\n';
                log() << "repl:   time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n";
                log() << "repl:   tailing: " << tailing << '\n';
                log() << "repl:   data too stale, halting replication" << endl;
                replInfo = replAllDead = "data too stale halted replication";
                assert( syncedTo < nextOpTime );
                throw SyncException();
            }
            else {
                /* t == syncedTo, so the first op was applied previously or it is the first op of initial query and need not be applied. */
            }
        }

        // apply operations
        {
            int n = 0;
            time_t saveLast = time(0);
            while ( 1 ) {

                bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to assure we actually process at least one op to get a sync point recorded in the first place.

                if ( moreInitialSyncsPending || !oplogReader.more() ) {
                    dblock lk;

                    // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior.
                    {
                        dbtemprelease t;
                        if ( !moreInitialSyncsPending && oplogReader.more() ) {
                            continue;
                        }
                        // otherwise, break out of loop so we can set to completed or clone more dbs
                    }
                    
                    if( oplogReader.awaitCapable() && tailing )
                        okResultCode = 0; // don't sleep
                    syncedTo = nextOpTime;
                    save(); // note how far we are synced up to now
                    log() << "repl:   applied " << n << " operations" << endl;
                    nApplied = n;
                    log() << "repl:  end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl;
                    break;
                }
                else {
                }

                OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) {
                    // periodically note our progress, in case we are doing a lot of work and crash
                    dblock lk;
                    syncedTo = nextOpTime;
                    // can't update local log ts since there are pending operations from our peer
                    save();
                    log() << "repl:   checkpoint applied " << n << " operations" << endl;
                    log() << "repl:   syncedTo: " << syncedTo.toStringLong() << endl;
                    saveLast = time(0);
                    n = 0;
                }

                BSONObj op = oplogReader.next();

                unsigned b = replApplyBatchSize;
                bool justOne = b == 1;
                scoped_ptr<writelock> lk( justOne ? 0 : new writelock() );
                while( 1 ) {

                    BSONElement ts = op.getField("ts");
                    if( !( ts.type() == Date || ts.type() == Timestamp ) ) {
                        log() << "sync error: problem querying remote oplog record" << endl;
                        log() << "op: " << op.toString() << endl;
                        log() << "halting replication" << endl;
                        replInfo = replAllDead = "sync error: no ts found querying remote oplog record";
                        throw SyncException();
                    }
                    OpTime last = nextOpTime;
                    nextOpTime = OpTime( ts.date() );
                    if ( !( last < nextOpTime ) ) {
                        log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl;
                        log() << " last:       " << last.toStringLong() << endl;
                        log() << " nextOpTime: " << nextOpTime.toStringLong() << endl;
                        log() << " halting replication" << endl;
                        replInfo = replAllDead = "sync error last >= nextOpTime";
                        uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false);
                    }
                    if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) {
                        assert( justOne );
                        oplogReader.putBack( op );
                        _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1;
                        dblock lk;
                        if ( n > 0 ) {
                            syncedTo = last;
                            save();
                        }
                        log() << "repl:   applied " << n << " operations" << endl;
                        log() << "repl:   syncedTo: " << syncedTo.toStringLong() << endl;
                        log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl;
                        return okResultCode;
                    }

                    sync_pullOpLog_applyOperation(op, !justOne);
                    n++;

                    if( --b == 0 )
                        break;
                    // if to here, we are doing mulpile applications in a singel write lock acquisition
                    if( !oplogReader.moreInCurrentBatch() ) {
                        // break if no more in batch so we release lock while reading from the master
                        break;
                    }
                    op = oplogReader.next();

                    getDur().commitIfNeeded();
                }
            }
        }

        return okResultCode;
    }
Esempio n. 4
0
 bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) {
     remoteOldestOp = r.findOne(rsoplog, Query());
     OpTime remoteTs = remoteOldestOp["ts"]._opTime();
     DEV log() << "replSet remoteOldestOp:    " << remoteTs.toStringLong() << rsLog;
     else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
Esempio n. 5
0
    void BackgroundSync::connectOplogReader(OperationContext* txn, 
                                            ReplicationCoordinatorImpl* replCoordImpl,
                                            OplogReader* reader) {
        OpTime lastOpTimeFetched;
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }
        Date_t now(curTimeMillis64());
        OpTime oldestOpTimeSeen(now,0);

        while (true) {
            HostAndPort candidate = replCoordImpl->chooseNewSyncSource();

            if (candidate.empty()) {
                if (oldestOpTimeSeen == OpTime(now,0)) {
                    // If, in this invocation of connectOplogReader(), we did not successfully 
                    // connect to any node ahead of us,
                    // we apparently have no sync sources to connect to.
                    // This situation is common; e.g. if there are no writes to the primary at
                    // the moment.
                    return;
                }

                // Connected to at least one member, but in all cases we were too stale to use them
                // as a sync source.
                log() << "replSet error RS102 too stale to catch up" << rsLog;
                log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong() << rsLog;
                log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong() <<
                    rsLog;
                log() << "replSet "
                    "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember" 
                      << rsLog;
                sethbmsg("error RS102 too stale to catch up");
                theReplSet->setMinValid(txn, oldestOpTimeSeen);
                replCoordImpl->setFollowerMode(MemberState::RS_RECOVERING);
                return;
            }

            if (!reader->connect(candidate)) {
                LOG(2) << "replSet can't connect to " << candidate.toString() << 
                    " to read operations" << rsLog;
                reader->resetConnection();
                replCoordImpl->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10*1000));
                continue;
            }
            // Read the first (oldest) op and confirm that it's not newer than our last
            // fetched op. Otherwise, we have fallen off the back of that source's oplog.
            BSONObj remoteOldestOp(reader->findOne(rsoplog, Query()));
            BSONElement tsElem(remoteOldestOp["ts"]);
            if (tsElem.type() != Timestamp) {
                // This member's got a bad op in its oplog.
                warning() << "oplog invalid format on node " << candidate.toString();
                reader->resetConnection();
                replCoordImpl->blacklistSyncSource(candidate, 
                                                    Date_t(curTimeMillis64() + 600*1000));
                continue;
            }
            OpTime remoteOldOpTime = tsElem._opTime();

            if (lastOpTimeFetched < remoteOldOpTime) {
                // We're too stale to use this sync source.
                reader->resetConnection();
                replCoordImpl->blacklistSyncSource(candidate, 
                                                    Date_t(curTimeMillis64() + 600*1000));
                if (oldestOpTimeSeen > remoteOldOpTime) {
                    warning() << "we are too stale to use " << candidate.toString() << 
                        " as a sync source";
                    oldestOpTimeSeen = remoteOldOpTime;
                }
                continue;
            }

            // Got a valid sync source.
            return;
        } // while (true)

    }
Esempio n. 6
0
    static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) {
        static time_t last;
        if( time(0)-last < 60 ) {
            throw "findcommonpoint waiting a while before trying again";
        }
        last = time(0);

        assert( dbMutex.atLeastReadLocked() );
        Client::Context c(rsoplog, dbpath, 0, false);
        NamespaceDetails *nsd = nsdetails(rsoplog);
        assert(nsd);
        ReverseCappedCursor u(nsd);
        if( !u.ok() )
            throw "our oplog empty or unreadable";

        const Query q = Query().sort(reverseNaturalObj);
        const bo fields = BSON( "ts" << 1 << "h" << 1 );

        //auto_ptr<DBClientCursor> u = us->query(rsoplog, q, 0, 0, &fields, 0, 0);

        h.rbid = getRBID(them);
        auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0);

        if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable";

        BSONObj ourObj = u.current();
        OpTime ourTime = ourObj["ts"]._opTime();
        BSONObj theirObj = t->nextSafe();
        OpTime theirTime = theirObj["ts"]._opTime();

        {
            long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs());
            /* diff could be positive, negative, or zero */
            log() << "replSet info rollback our last optime:   " << ourTime.toStringPretty() << rsLog;
            log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog;
            log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog;
            if( diff > 3600 ) {
                log() << "replSet rollback too long a time period for a rollback." << rsLog;
                throw "error not willing to roll back more than one hour of data";
            }
        }

        unsigned long long scanned = 0;
        while( 1 ) {
            scanned++;
            /* todo add code to assure no excessive scanning for too long */
            if( ourTime == theirTime ) {
                if( ourObj["h"].Long() == theirObj["h"].Long() ) {
                    // found the point back in time where we match.
                    // todo : check a few more just to be careful about hash collisions.
                    log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog;
                    log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog;
                    h.commonPoint = ourTime;
                    h.commonPointOurDiskloc = u.currLoc();
                    return;
                }

                refetch(h, ourObj);

                if( !t->more() ) {
                    log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
                    log() << "replSet   them:      " << them->toString() << " scanned: " << scanned << rsLog;
                    log() << "replSet   theirTime: " << theirTime.toStringLong() << rsLog;
                    log() << "replSet   ourTime:   " << ourTime.toStringLong() << rsLog;
                    throw "RS100 reached beginning of remote oplog [2]";
                }
                theirObj = t->nextSafe();
                theirTime = theirObj["ts"]._opTime();

                u.advance();
                if( !u.ok() ) {
                    log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
                    log() << "replSet   them:      " << them->toString() << " scanned: " << scanned << rsLog;
                    log() << "replSet   theirTime: " << theirTime.toStringLong() << rsLog;
                    log() << "replSet   ourTime:   " << ourTime.toStringLong() << rsLog;
                    throw "RS101 reached beginning of local oplog [1]";
                }
                ourObj = u.current();
                ourTime = ourObj["ts"]._opTime();
            }
            else if( theirTime > ourTime ) {
                if( !t->more() ) {
                    log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
                    log() << "replSet   them:      " << them->toString() << " scanned: " << scanned << rsLog;
                    log() << "replSet   theirTime: " << theirTime.toStringLong() << rsLog;
                    log() << "replSet   ourTime:   " << ourTime.toStringLong() << rsLog;
                    throw "RS100 reached beginning of remote oplog [1]";
                }
                theirObj = t->nextSafe();
                theirTime = theirObj["ts"]._opTime();
            }
            else {
                // theirTime < ourTime
                refetch(h, ourObj);
                u.advance();
                if( !u.ok() ) {
                    log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
                    log() << "replSet   them:      " << them->toString() << " scanned: " << scanned << rsLog;
                    log() << "replSet   theirTime: " << theirTime.toStringLong() << rsLog;
                    log() << "replSet   ourTime:   " << ourTime.toStringLong() << rsLog;
                    throw "RS101 reached beginning of local oplog [2]";
                }
                ourObj = u.current();
                ourTime = ourObj["ts"]._opTime();
            }
        }
    }
Esempio n. 7
0
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      OpTime lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const OpTime sentinel(Milliseconds(curTimeMillis64()).total_seconds(), 0);
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched);

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            log() << "replSet error RS102 too stale to catch up";
            log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong();
            log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong();
            log() << "replSet "
                     "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, oldestOpTimeSeen);
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "replSet can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10 * 1000));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsoplog, Query()));
        BSONElement tsElem(remoteOldestOp["ts"]);
        if (tsElem.type() != Timestamp) {
            // This member's got a bad op in its oplog.
            warning() << "oplog invalid format on node " << candidate.toString();
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000));
            continue;
        }
        OpTime remoteOldOpTime = tsElem._opTime();

        if (!lastOpTimeFetched.isNull() && lastOpTimeFetched < remoteOldOpTime) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000));
            if (oldestOpTimeSeen > remoteOldOpTime) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Got a valid sync source.
        return;
    }  // while (true)
}