コード例 #1
0
ファイル: sync_tail.cpp プロジェクト: Jaryli/mongo
BSONObj SyncTail::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) {
    OplogReader missingObjReader;  // why are we using OplogReader to run a non-oplog query?
    const char* ns = o.getStringField("ns");

    // capped collections
    Collection* collection = db->getCollection(ns);
    if (collection && collection->isCapped()) {
        log() << "missing doc, but this is okay for a capped collection (" << ns << ")";
        return BSONObj();
    }

    const int retryMax = 3;
    for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
        if (retryCount != 1) {
            // if we are retrying, sleep a bit to let the network possibly recover
            sleepsecs(retryCount * retryCount);
        }
        try {
            bool ok = missingObjReader.connect(HostAndPort(_hostname));
            if (!ok) {
                warning() << "network problem detected while connecting to the "
                          << "sync source, attempt " << retryCount << " of " << retryMax << endl;
                continue;  // try again
            }
        } catch (const SocketException&) {
            warning() << "network problem detected while connecting to the "
                      << "sync source, attempt " << retryCount << " of " << retryMax << endl;
            continue;  // try again
        }

        // get _id from oplog entry to create query to fetch document.
        const BSONElement opElem = o.getField("op");
        const bool isUpdate = !opElem.eoo() && opElem.str() == "u";
        const BSONElement idElem = o.getObjectField(isUpdate ? "o2" : "o")["_id"];

        if (idElem.eoo()) {
            severe() << "cannot fetch missing document without _id field: " << o.toString();
            fassertFailedNoTrace(28742);
        }

        BSONObj query = BSONObjBuilder().append(idElem).obj();
        BSONObj missingObj;
        try {
            missingObj = missingObjReader.findOne(ns, query);
        } catch (const SocketException&) {
            warning() << "network problem detected while fetching a missing document from the "
                      << "sync source, attempt " << retryCount << " of " << retryMax << endl;
            continue;  // try again
        } catch (DBException& e) {
            error() << "assertion fetching missing object: " << e.what() << endl;
            throw;
        }

        // success!
        return missingObj;
    }
    // retry count exceeded
    msgasserted(15916,
                str::stream() << "Can no longer connect to initial sync source: " << _hostname);
}
コード例 #2
0
ファイル: oplog.cpp プロジェクト: JakubOboza/mongo
    BSONObj Sync::getMissingDoc(const BSONObj& o) {
        OplogReader missingObjReader;
        const char *ns = o.getStringField("ns");

        // capped collections
        NamespaceDetails *nsd = nsdetails(ns);
        if ( nsd && nsd->isCapped() ) {
            log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl;
            return BSONObj();
        }

        uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn));

        // might be more than just _id in the update criteria
        BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
        BSONObj missingObj;
        try {
            missingObj = missingObjReader.findOne(ns, query);
        } catch(DBException& e) {
            log() << "replication assertion fetching missing object: " << e.what() << endl;
            throw;
        }

        return missingObj;
    }
コード例 #3
0
ファイル: bgsync.cpp プロジェクト: igagnidz/tokumx
    bool BackgroundSync::isRollbackRequired(OplogReader& r) {
        string hn = r.conn()->getServerAddress();
        if (!r.more()) {
            // In vanilla Mongo, this happened for one of the
            // following reasons:
            //  - we were ahead of what we are syncing from (don't
            //    think that is possible anymore)
            //  - remote oplog is empty for some weird reason
            // in either case, if it (strangely) happens, we'll just return
            // and our caller will simply try again after a short sleep.
            log() << "replSet error empty query result from " << hn << " oplog, attempting rollback" << rsLog;
             return true;
        }

        BSONObj o = r.nextSafe();
        uint64_t ts = o["ts"]._numberLong();
        uint64_t lastHash = o["h"].numberLong();
        GTID gtid = getGTIDFromBSON("_id", o);

        if( !theReplSet->gtidManager->rollbackNeeded(gtid, ts, lastHash)) {
            log() << "Rollback NOT needed! Our GTID" << gtid << endl;
            return false;
        }

        log() << "Rollback needed! Our GTID" <<
            theReplSet->gtidManager->getLiveState().toString() <<
            " remote GTID: " << gtid.toString() << ". Attempting rollback." << rsLog;

        runRollback(r, ts);
        return true;
    }
コード例 #4
0
ファイル: rs_rollback.cpp プロジェクト: 7segments/mongo
    bool isRollbackRequired(OplogReader& r, uint64_t *lastTS) {
        string hn = r.conn()->getServerAddress();
        verify(r.more());
        BSONObj rollbackStatus;
        bool found = getRollbackStatus(rollbackStatus);
        if (found) {
            // we have a rollback in progress,
            // must complete it
            log() << "Rollback needed, found rollbackStatus: " << rollbackStatus << rsLog;
            return true;
        }

        BSONObj o = r.nextSafe();
        uint64_t ts = o["ts"]._numberLong();
        uint64_t lastHash = o["h"].numberLong();
        GTID gtid = getGTIDFromBSON("_id", o);

        if (!theReplSet->gtidManager->rollbackNeeded(gtid, ts, lastHash)) {
            log() << "Rollback NOT needed! " << gtid << endl;
            return false;
        }

        log() << "Rollback needed! Our GTID: " <<
            theReplSet->gtidManager->getLiveState().toString() <<
            ", remote GTID: " << gtid.toString() << ". Attempting rollback." << rsLog;

        *lastTS = ts;
        return true;
    }
コード例 #5
0
ファイル: rs_sync.cpp プロジェクト: mgood/mongo
    void ReplSetImpl::syncTail() { 
        // todo : locking vis a vis the mgr...

        const Member *primary = box.getPrimary();
        if( primary == 0 ) return;
        string hn = primary->h().toString();
        OplogReader r;
        if( !r.connect(primary->h().toString()) ) { 
            log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
            return;
        }

        /* first make sure we are not hopelessly out of sync by being very stale. */
        {
            BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
            OpTime ts = remoteOldestOp["ts"]._opTime();
            DEV log() << "remoteOldestOp: " << ts.toStringPretty() << endl;
            else log(3) << "remoteOldestOp: " << ts.toStringPretty() << endl;
            if( lastOpTimeWritten < ts ) { 
                log() << "replSet error too stale to catch up, at least from primary " << hn << rsLog;
                log() << "replSet our last optime : " << lastOpTimeWritten.toStringPretty() << rsLog;
                log() << "replSet oldest at " << hn << " : " << ts.toStringPretty() << rsLog;
                log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
                sethbmsg("error too stale to catch up");
                sleepsecs(120);
                return;
            }
        }
コード例 #6
0
ファイル: bgsync.cpp プロジェクト: 10genReviews/mongo
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL, *stale = NULL;
        BSONObj oldest;

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                _currentSyncTarget = NULL;
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer) {
                _condvar.wait(lock);
            }
        }

        verify(r.conn() == NULL);

        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                r.resetConnection();
                theReplSet->veto(current);
                continue;
            }

            if (isStale(r, oldest)) {
                r.resetConnection();
                theReplSet->veto(current, 600);
                stale = target;
                continue;
            }

            // if we made it here, the target is up and not stale
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }

            return;
        }

        // the only viable sync target was stale
        if (stale) {
            theReplSet->goStale(stale, oldest);
            sleepsecs(120);
        }

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
コード例 #7
0
ファイル: sync.cpp プロジェクト: wjin/mongo
    BSONObj Sync::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) {
        OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query?
        const char *ns = o.getStringField("ns");

        // capped collections
        Collection* collection = db->getCollection(ns);
        if ( collection && collection->isCapped() ) {
            log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl;
            return BSONObj();
        }

        const int retryMax = 3;
        for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
            if (retryCount != 1) {
                // if we are retrying, sleep a bit to let the network possibly recover
                sleepsecs(retryCount * retryCount);
            }
            try {
                bool ok = missingObjReader.connect(HostAndPort(hn));
                if (!ok) {
                    warning() << "network problem detected while connecting to the "
                              << "sync source, attempt " << retryCount << " of "
                              << retryMax << endl;
                        continue;  // try again
                }
            } 
            catch (const SocketException&) {
                warning() << "network problem detected while connecting to the "
                          << "sync source, attempt " << retryCount << " of "
                          << retryMax << endl;
                continue; // try again
            }

            // might be more than just _id in the update criteria
            BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
            BSONObj missingObj;
            try {
                missingObj = missingObjReader.findOne(ns, query);
            } 
            catch (const SocketException&) {
                warning() << "network problem detected while fetching a missing document from the "
                          << "sync source, attempt " << retryCount << " of "
                          << retryMax << endl;
                continue; // try again
            } 
            catch (DBException& e) {
                log() << "replication assertion fetching missing object: " << e.what() << endl;
                throw;
            }

            // success!
            return missingObj;
        }
        // retry count exceeded
        msgasserted(15916, 
                    str::stream() << "Can no longer connect to initial sync source: " << hn);
    }
コード例 #8
0
ファイル: bgsync.cpp プロジェクト: 10genReviews/mongo
    bool BackgroundSync::isRollbackRequired(OplogReader& r) {
        string hn = r.conn()->getServerAddress();

        if (!r.more()) {
            try {
                BSONObj theirLastOp = r.getLastOp(rsoplog);
                if (theirLastOp.isEmpty()) {
                    log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
                    sleepsecs(2);
                    return true;
                }
                OpTime theirTS = theirLastOp["ts"]._opTime();
                if (theirTS < _lastOpTimeFetched) {
                    log() << "replSet we are ahead of the sync source, will try to roll back"
                          << rsLog;
                    theReplSet->syncRollback(r);
                    return true;
                }
                /* we're not ahead?  maybe our new query got fresher data.  best to come back and try again */
                log() << "replSet syncTail condition 1" << rsLog;
                sleepsecs(1);
            }
            catch(DBException& e) {
                log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
                sleepsecs(2);
            }
            return true;
        }

        BSONObj o = r.nextSafe();
        OpTime ts = o["ts"]._opTime();
        long long h = o["h"].numberLong();
        if( ts != _lastOpTimeFetched || h != _lastH ) {
            log() << "replSet our last op time fetched: " << _lastOpTimeFetched.toStringPretty() << rsLog;
            log() << "replset source's GTE: " << ts.toStringPretty() << rsLog;
            theReplSet->syncRollback(r);
            return true;
        }

        return false;
    }
コード例 #9
0
ファイル: bgsync.cpp プロジェクト: ArgusTek/mongo
    bool BackgroundSync::_rollbackIfNeeded(OperationContext* txn, OplogReader& r) {
        string hn = r.conn()->getServerAddress();

        if (!r.more()) {
            try {
                BSONObj theirLastOp = r.getLastOp(rsOplogName.c_str());
                if (theirLastOp.isEmpty()) {
                    error() << "empty query result from " << hn << " oplog";
                    sleepsecs(2);
                    return true;
                }
                OpTime theirOpTime = extractOpTime(theirLastOp);
                if (theirOpTime < _lastOpTimeFetched) {
                    log() << "we are ahead of the sync source, will try to roll back";
                    syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
                    return true;
                }
                /* we're not ahead?  maybe our new query got fresher data.  best to come back and try again */
                log() << "syncTail condition 1";
                sleepsecs(1);
            }
            catch(DBException& e) {
                error() << "querying " << hn << ' ' << e.toString();
                sleepsecs(2);
            }
            return true;
        }

        BSONObj o = r.nextSafe();
        OpTime opTime = extractOpTime(o);
        long long hash = o["h"].numberLong();
        if ( opTime != _lastOpTimeFetched || hash != _lastFetchedHash ) {
            log() << "our last op time fetched: " << _lastOpTimeFetched;
            log() << "source's GTE: " << opTime;
            syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
            return true;
        }

        return false;
    }
コード例 #10
0
ファイル: bgsync.cpp プロジェクト: igagnidz/tokumx
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL, *stale = NULL;
        BSONObj oldest;

        verify(r.conn() == NULL);
        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                r.resetConnection();
                theReplSet->veto(current);
                continue;
            }

            // if we made it here, the target is up and not stale
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }

            return;
        }

        // the only viable sync target was stale
        if (stale) {
            GTID remoteOldestGTID = getGTIDFromBSON("_id", oldest);
            theReplSet->goStale(stale, remoteOldestGTID);
            // vanilla Mongo used to do a sleep of 120 seconds here
            // We removed it. It seems excessive, and if this machine is doing
            // nothing anyway, sleeping won't help. It might as well
            // return with a null sync target, and produce() will handle
            // that fact and sleep one second
        }

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
コード例 #11
0
ファイル: oplog.cpp プロジェクト: aberg001/mongo
 // Copy a range of documents to the local oplog.refs collection
 static void copyOplogRefsRange(OplogReader &r, OID oid) {
     shared_ptr<DBClientCursor> c = r.getOplogRefsCursor(oid);
     Client::ReadContext ctx(rsOplogRefs);
     while (c->more()) {
         BSONObj b = c->next();
         BSONElement eOID = b.getFieldDotted("_id.oid");
         if (oid != eOID.OID()) {
             break;
         }
         LOG(6) << "copyOplogRefsRange " << b << endl;
         writeEntryToOplogRefs(b);
     }
 }
コード例 #12
0
ファイル: oplog.cpp プロジェクト: ChrisKozak/mongo
    int run() {

        Client::initThread( "oplogreplay" );

        toolInfoLog() << "going to connect" << std::endl;
        
        OplogReader r;
        r.setTailingQueryOptions( QueryOption_SlaveOk | QueryOption_AwaitData );
        r.connect(mongoOplogGlobalParams.from);

        toolInfoLog() << "connected" << std::endl;

        OpTime start(time(0) - mongoOplogGlobalParams.seconds, 0);
        toolInfoLog() << "starting from " << start.toStringPretty() << std::endl;

        r.tailingQueryGTE(mongoOplogGlobalParams.ns.c_str(), start);

        int num = 0;
        while ( r.more() ) {
            BSONObj o = r.next();
            if (logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(2))) {
                toolInfoLog() << o << std::endl;
            }
            
            if ( o["$err"].type() ) {
                toolError() << "error getting oplog" << std::endl;
                toolError() << o << std::endl;
                return -1;
            }
                

            bool print = ++num % 100000 == 0;
            if (print) {
                toolInfoLog() << num << "\t" << o << std::endl;
            }
            
            if ( o["op"].String() == "n" )
                continue;

            BSONObjBuilder b( o.objsize() + 32 );
            BSONArrayBuilder updates( b.subarrayStart( "applyOps" ) );
            updates.append( o );
            updates.done();

            BSONObj c = b.obj();
            
            BSONObj res;
            bool ok = conn().runCommand( "admin" , c , res );
            if (!ok) {
                toolError() << res << std::endl;
            } else if (print) {
                toolInfoLog() << res << std::endl;
            }
        }

        return 0;
    }
コード例 #13
0
ファイル: bgsync.cpp プロジェクト: OpenSourceiDRLPVTLTD/mongo
    bool BackgroundSync::isStale(OpTime lastOpTimeFetched, 
                                 OplogReader& r, 
                                 BSONObj& remoteOldestOp) {
        remoteOldestOp = r.findOne(rsoplog, Query());
        OpTime remoteTs = remoteOldestOp["ts"]._opTime();
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (lastOpTimeFetched >= remoteTs) {
                return false;
            }
            log() << "replSet remoteOldestOp:    " << remoteTs.toStringLong() << rsLog;
            log() << "replSet lastOpTimeFetched: " << lastOpTimeFetched.toStringLong() << rsLog;
        }

        return true;
    }
コード例 #14
0
ファイル: rs_rollback.cpp プロジェクト: 7segments/mongo
 bool canStartRollback(OplogReader& r, GTID idToRollbackTo) {
     shared_ptr<DBClientConnection> conn(r.conn_shared());
     // before we start rollback, let's make sure that the minUnapplied on the remote
     // server is past the id that we are rolling back to. Otherwise, the snapshot
     // we create will not be up to date, and the rollback algorithm will not work
     BSONObjBuilder b;
     b.append("_id", "minUnapplied");
     // Note that another way to get this information is to
     // request a heartbeat. That one will technically return
     // a more up to date value for minUnapplied
     BSONObj res = findOneFromConn(conn.get(), rsReplInfo, Query(b.done()));
     GTID minUnapplied = getGTIDFromBSON("GTID", res);
     if (GTID::cmp(minUnapplied, idToRollbackTo) < 0) {
         log() << "Remote server has minUnapplied " << minUnapplied.toString() << \
             " we want to rollback to " << idToRollbackTo.toString() << \
             ". Therefore, exiting and retrying." << rsLog;
         return false;
     }
     return true;
 }
コード例 #15
0
ファイル: bgsync.cpp プロジェクト: igagnidz/tokumx
    void BackgroundSync::runRollback(OplogReader& r, uint64_t oplogTS) {
        // starting from ourLast, we need to read the remote oplog
        // backwards until we find an entry in the remote oplog
        // that has the same GTID, timestamp, and hash as
        // what we have in our oplog. If we don't find one that is within
        // some reasonable timeframe, then we go fatal
        GTID ourLast = theReplSet->gtidManager->getLiveState();
        GTID idToRollbackTo;
        uint64_t rollbackPointTS = 0;
        uint64_t rollbackPointHash = 0;
        incRBID();
        try {
            shared_ptr<DBClientCursor> rollbackCursor = r.getRollbackCursor(ourLast);
            while (rollbackCursor->more()) {
                BSONObj remoteObj = rollbackCursor->next();
                GTID remoteGTID = getGTIDFromBSON("_id", remoteObj);
                uint64_t remoteTS = remoteObj["ts"]._numberLong();
                uint64_t remoteLastHash = remoteObj["h"].numberLong();
                if (remoteTS + 1800*1000 < oplogTS) {
                    log() << "Rollback takes us too far back, throwing exception. remoteTS: " << remoteTS << " oplogTS: " << oplogTS << rsLog;
                    throw RollbackOplogException("replSet rollback too long a time period for a rollback (at least 30 minutes).");
                    break;
                }
                //now try to find an entry in our oplog with that GTID
                BSONObjBuilder localQuery;
                BSONObj localObj;
                addGTIDToBSON("_id", remoteGTID, localQuery);
                bool foundLocally = false;
                {
                    LOCK_REASON(lockReason, "repl: looking up oplog entry for rollback");
                    Client::ReadContext ctx(rsoplog, lockReason);
                    Client::Transaction transaction(DB_SERIALIZABLE);
                    foundLocally = Collection::findOne(rsoplog, localQuery.done(), localObj);
                    transaction.commit();
                }
                if (foundLocally) {
                    GTID localGTID = getGTIDFromBSON("_id", localObj);
                    uint64_t localTS = localObj["ts"]._numberLong();
                    uint64_t localLastHash = localObj["h"].numberLong();
                    if (localLastHash == remoteLastHash &&
                        localTS == remoteTS &&
                        GTID::cmp(localGTID, remoteGTID) == 0
                        )
                    {
                        idToRollbackTo = localGTID;
                        rollbackPointTS = localTS;
                        rollbackPointHash = localLastHash;
                        log() << "found id to rollback to " << idToRollbackTo << rsLog;
                        break;
                    }
                }
            }
            // At this point, either we have found the point to try to rollback to,
            // or we have determined that we cannot rollback
            if (idToRollbackTo.isInitial()) {
                // we cannot rollback
                throw RollbackOplogException("could not find ID to rollback to");
            }
        }
        catch (DBException& e) {
            log() << "Caught DBException during rollback " << e.toString() << rsLog;
            throw RollbackOplogException("DBException while trying to find ID to rollback to: " + e.toString());
        }
        catch (std::exception& e2) {
            log() << "Caught std::exception during rollback " << e2.what() << rsLog;
            throw RollbackOplogException(str::stream() << "Exception while trying to find ID to rollback to: " << e2.what());
        }

        // proceed with the rollback to point idToRollbackTo
        // probably ought to grab a global write lock while doing this
        // I don't think we want oplog cursors reading from this machine
        // while we are rolling back. Or at least do something to protect against this

        // first, let's get all the operations that are being applied out of the way,
        // we don't want to rollback an item in the oplog while simultaneously,
        // the applier thread is applying it to the oplog
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            while (_deque.size() > 0) {
                log() << "waiting for applier to finish work before doing rollback " << rsLog;
                _queueDone.wait(lock);
            }
            verifySettled();
        }

        // now let's tell the system we are going to rollback, to do so,
        // abort live multi statement transactions, invalidate cursors, and
        // change the state to RS_ROLLBACK
        {
            // so we know nothing is simultaneously occurring
            RWLockRecursive::Exclusive e(operationLock);
            LOCK_REASON(lockReason, "repl: killing all operations for rollback");
            Lock::GlobalWrite lk(lockReason);
            ClientCursor::invalidateAllCursors();
            Client::abortLiveTransactions();
            theReplSet->goToRollbackState();
        }

        try {
            // now that we are settled, we have to take care of the GTIDManager
            // and the repl info thread.
            // We need to reset the state of the GTIDManager to the point
            // we intend to rollback to, and we need to make sure that the repl info thread
            // has captured this information.
            theReplSet->gtidManager->resetAfterInitialSync(
                idToRollbackTo,
                rollbackPointTS,
                rollbackPointHash
                );
            // now force an update of the repl info thread
            theReplSet->forceUpdateReplInfo();

            // at this point, everything should be settled, the applier should
            // have nothing left (and remain that way, because this is the only
            // thread that can put work on the applier). Now we can rollback
            // the data.
            while (true) {
                BSONObj o;
                {
                    LOCK_REASON(lockReason, "repl: checking for oplog data");
                    Lock::DBRead lk(rsoplog, lockReason);
                    Client::Transaction txn(DB_SERIALIZABLE);
                    // if there is nothing in the oplog, break
                    o = getLastEntryInOplog();
                    if( o.isEmpty() ) {
                        break;
                    }
                }
                GTID lastGTID = getGTIDFromBSON("_id", o);
                // if we have rolled back enough, break from while loop
                if (GTID::cmp(lastGTID, idToRollbackTo) <= 0) {
                    dassert(GTID::cmp(lastGTID, idToRollbackTo) == 0);
                    break;
                }
                rollbackTransactionFromOplog(o, true);
            }
            theReplSet->leaveRollbackState();
        }
        catch (DBException& e) {
            log() << "Caught DBException during rollback " << e.toString() << rsLog;
            throw RollbackOplogException("DBException while trying to run rollback: " + e.toString());
        }
        catch (std::exception& e2) {
            log() << "Caught std::exception during rollback " << e2.what() << rsLog;
            throw RollbackOplogException(str::stream() << "Exception while trying to run rollback: " << e2.what());
        }
        
    }
コード例 #16
0
    /**
     * Do the initial sync for this member.  There are several steps to this process:
     *
     *     0. Add _initialSyncFlag to minValid to tell us to restart initial sync if we
     *        crash in the middle of this procedure
     *     1. Record start time.
     *     2. Clone.
     *     3. Set minValid1 to sync target's latest op time.
     *     4. Apply ops from start to minValid1, fetching missing docs as needed.
     *     5. Set minValid2 to sync target's latest op time.
     *     6. Apply ops from minValid1 to minValid2.
     *     7. Build indexes.
     *     8. Set minValid3 to sync target's latest op time.
     *     9. Apply ops from minValid2 to minValid3.
          10. Clean up minValid and remove _initialSyncFlag field
     *
     * At that point, initial sync is finished.  Note that the oplog from the sync target is applied
     * three times: step 4, 6, and 8.  4 may involve refetching, 6 should not.  By the end of 6,
     * this member should have consistent data.  8 is "cosmetic," it is only to get this member
     * closer to the latest op time before it can transition to secondary state.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        replset::SyncTail tail(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        // written by applyToHead calls
        BSONObj minValid;

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            // Add field to minvalid document to tell us to restart initial sync if we crash
            theReplSet->setInitialSyncFlag();

            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            Cloner cloner;
            if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, true)) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);

            log() << "oplog sync 1 of 3" << endl;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // Now we sync to the latest op on the sync target _again_, as we may have recloned ops
            // that were "from the future" compared with minValid. During this second application,
            // nothing should need to be recloned.
            log() << "oplog sync 2 of 3" << endl;
            if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) {
                return;
            }
            // data should now be consistent

            lastOp = minValid;

            sethbmsg("initial sync building indexes",0);
            if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, false)) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        log() << "oplog sync 3 of 3" << endl;
        if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) {
            return;
        }
        
        // ---------

        Status status = getGlobalAuthorizationManager()->initialize();
        if (!status.isOK()) {
            warning() << "Failed to reinitialize auth data after initial sync. " << status;
            return;
        }

        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }

            // Initial sync is now complete.  Flag this by setting minValid to the last thing
            // we synced.
            theReplSet->setMinValid(minValid);

            // Clear the initial sync flag.
            theReplSet->clearInitialSyncFlag();

            cx.ctx().db()->flushFiles(true);
        }
        {
            boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
            theReplSet->initialSyncRequested = false;
        }

        // If we just cloned & there were no ops applied, we still want the primary to know where
        // we're up to
        replset::BackgroundSync::notify();

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
コード例 #17
0
ファイル: health.cpp プロジェクト: Aaron20141021/mongo
    void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const {
        const Member *m = findById(server_id);
        if( m == 0 ) {
            ss << "Error : can't find a member with id: " << server_id << '\n';
            return;
        }

        ss << p("Server : " + m->fullName() + "<br>ns : " + rsoplog );

        //const bo fields = BSON( "o" << false << "o2" << false );
        const bo fields;

        /** todo fix we might want an so timeout here */
        OplogReader reader;

        if (reader.connect(m->h()) == false) {
            ss << "couldn't connect to " << m->fullName();
            return;
        }

        reader.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields);
        if ( !reader.haveCursor() ) {
            ss << "couldn't query " << rsoplog;
            return;
        }
        static const char *h[] = {"ts","optime","h","op","ns","rest",0};

        ss << "<style type=\"text/css\" media=\"screen\">"
           "table { font-size:75% }\n"
           // "th { background-color:#bbb; color:#000 }\n"
           // "td,th { padding:.25em }\n"
           "</style>\n";

        ss << table(h, true);
        //ss << "<pre>\n";
        int n = 0;
        OpTime otFirst;
        OpTime otLast;
        OpTime otEnd;
        while( reader.more() ) {
            bo o = reader.next();
            otLast = o["ts"]._opTime();
            if( otFirst.isNull() )
                otFirst = otLast;
            say(ss, o);
            n++;
        }
        if( n == 0 ) {
            ss << rsoplog << " is empty\n";
        }
        else {
            reader.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
            if( !reader.haveCursor() ) {
                ss << "couldn't query [2] " << rsoplog;
                return;
            }
            string x;
            bo o = reader.next();
            otEnd = o["ts"]._opTime();
            while( 1 ) {
                stringstream z;
                if( o["ts"]._opTime() == otLast )
                    break;
                say(z, o);
                x = z.str() + x;
                if( !reader.more() )
                    break;
                o = reader.next();
            }
            if( !x.empty() ) {
                ss << "<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>\n" << x;
                //ss << "\n...\n\n" << x;
            }
        }
        ss << _table();
        ss << p(time_t_to_String_short(time(0)) + " current time");

        if( !otEnd.isNull() ) {
            ss << "<p>Log length in time: ";
            unsigned d = otEnd.getSecs() - otFirst.getSecs();
            double h = d / 3600.0;
            ss.precision(3);
            if( h < 72 )
                ss << h << " hours";
            else
                ss << h / 24.0 << " days";
            ss << "</p>\n";
        }
    }
コード例 #18
0
ファイル: rs_initialsync.cpp プロジェクト: JKO/mongo
    /**
     * Do the initial sync for this member.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);
            
            BSONObj minValid;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // reset state, as that "didn't count"
            emptyOplog(); 
            lastOpTimeWritten = OpTime();
            lastH = 0;

            sethbmsg("initial sync building indexes",0);
            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        sethbmsg("initial sync query minValid",0);

        BSONObj minValid;
        if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) {
            return;
        }
        
        // ---------


        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }
            Helpers::putSingleton("local.replset.minvalid", minValid);
            cx.ctx().db()->flushFiles(true);
        }

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
コード例 #19
0
ファイル: bgsync.cpp プロジェクト: 504com/mongo
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r;
        OpTime lastOpTimeFetched;
        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        r.tailingQueryGTE(rsoplog, lastOpTimeFetched);

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) {
                // Check some things periodically
                // (whenever we run out of items in the
                // current cursor batch)

                int bs = r.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be 
                    // getting ops to write almost one at a time.  this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel 
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are 
                    // "caught up".
                    //
                    dassert( !Lock::isLocked() );
                    sleepmillis(SleepToAllowBatchingMillis);
                }
  
                if (theReplSet->gotForceSync()) {
                    return;
                }
                // If we are transitioning to primary state, we need to leave
                // this loop in order to go into bgsync-pause mode.
                if (isAssumingPrimary() || theReplSet->isPrimary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncTarget()) {
                    return;
                }


                {
                    //record time for each getmore
                    TimerHolder batchTimer(&getmoreReplStats);
                    
                    // This calls receiveMore() on the oplogreader cursor.
                    // It can wait up to five seconds for more data.
                    r.more();
                }
                networkByteStats.increment(r.currentBatchMessageSize());

                if (!r.moreInCurrentBatch()) {
                    // If there is still no data from upstream, check a few more things
                    // and then loop back for another pass at getting more data
                    {
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        if (_pause || 
                            !_currentSyncTarget || 
                            !_currentSyncTarget->hbinfo().hbstate.readable()) {
                            return;
                        }
                    }

                    r.tailCheck();
                    if( !r.haveCursor() ) {
                        LOG(1) << "replSet end syncTail pass" << rsLog;
                        return;
                    }

                    continue;
                }
            }

            // At this point, we are guaranteed to have at least one thing to read out
            // of the oplogreader cursor.
            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }
            // the blocking queue will wait (forever) until there's room for us to push
            _buffer.push(o);
            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
                LOG(3) << "replSet lastOpTimeFetched: "
                       << _lastOpTimeFetched.toStringPretty() << rsLog;
            }
        }
    }
コード例 #20
0
ファイル: bgsync.cpp プロジェクト: alabid/mongo
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);
        // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();
    // no more references to oplog reader from here on.

    // If this status is not OK after the fetcher returns from wait(),
    // proceed to execute rollback
    Status remoteOplogStartStatus = Status::OK();

    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      &remoteOplogStartStatus);

    auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter"
                              << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))
                              << "tailable" << true << "oplogReplay" << true << "awaitData" << true
                              << "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    cmdObj,
                    fetcherCallback,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Execute rollback if necessary.
    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    if (!remoteOplogStartStatus.isOK()) {
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
                if (!connection.get()) {
                    connection.reset(new ConnectionPool::ConnectionPtr(
                        &connectionPool, source, Date_t::now(), oplogSocketTimeout));
                };
                return connection->get();
            };

        log() << "starting rollback: " << remoteOplogStartStatus;
        _rollback(txn, source, getConnection);
        stop();
    }
}
コード例 #21
0
ファイル: rs_sync.cpp プロジェクト: Ry4an/mongo
 bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) {
     remoteOldestOp = r.findOne(rsoplog, Query());
     OpTime remoteTs = remoteOldestOp["ts"]._opTime();
     DEV log() << "replSet remoteOldestOp:    " << remoteTs.toStringLong() << rsLog;
     else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
コード例 #22
0
ファイル: rs_sync.cpp プロジェクト: rauchg/mongo
    /* initial oplog application, during initial sync, after cloning. 
       @return false on failure.  
       this method returns an error and doesn't throw exceptions (i think).
    */
    bool ReplSetImpl::initialSyncOplogApplication(
        const Member *source,
        OpTime applyGTE,
        OpTime minValid)
    { 
        if( source == 0 ) return false;

        const string hn = source->h().toString();
        OpTime ts;
        try {
            OplogReader r;
            if( !r.connect(hn) ) { 
                log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
                return false;
            }

            {
                BSONObjBuilder q;
                q.appendDate("$gte", applyGTE.asDate());
                BSONObjBuilder query;
                query.append("ts", q.done());
                BSONObj queryObj = query.done();
                r.query(rsoplog, queryObj);
            }
            assert( r.haveCursor() );

            /* we lock outside the loop to avoid the overhead of locking on every operation.  server isn't usable yet anyway! */
            writelock lk("");

            {
                if( !r.more() ) { 
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElement().fieldName() == string("$err") ) { 
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }
            }

            // todo : use exhaust
            unsigned long long n = 0;
            while( 1 ) { 

                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                {
                    ts = o["ts"]._opTime();

                    /* if we have become primary, we dont' want to apply things from elsewhere
                        anymore. assumePrimary is in the db lock so we are safe as long as 
                        we check after we locked above. */
                    if( (source->state() != MemberState::RS_PRIMARY &&
                         source->state() != MemberState::RS_SECONDARY) ||
                        replSetForceInitialSyncFailure ) {
                        
                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) {
                        // optimes before we started copying need not be applied.
                        syncApply(o);
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }
                if( ++n % 100000 == 0 ) { 
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication " << n << rsLog;
                }
            }
        }
        catch(DBException& e) { 
            if( ts <= minValid ) {
                // didn't make it far enough
                log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
                return false;
            }
        }
        return true;
    }
コード例 #23
0
ファイル: rs_sync.cpp プロジェクト: rauchg/mongo
 /**
  * Checks if the oplog given is too far ahead to read from.
  *
  * @param r the oplog
  * @param hn the hostname (for log messages)
  *
  * @return if we are stale compared to the oplog on hn
  */
 bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) {
     BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
     OpTime ts = remoteOldestOp["ts"]._opTime();
     DEV log() << "replSet remoteOldestOp:    " << ts.toStringLong() << rsLog;
     else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
コード例 #24
0
ファイル: rs_sync.cpp プロジェクト: osheroff/mongo
    bool replset::InitialSync::oplogApplication(OplogReader& r, const Member* source,
        const OpTime& applyGTE, const OpTime& minValid) {

        const string hn = source->fullName();
        try {
            r.tailingQueryGTE( rsoplog, applyGTE );
            if ( !r.haveCursor() ) {
                log() << "replSet initial sync oplog query error" << rsLog;
                return false;
            }

            {
                if( !r.more() ) {
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElementFieldName() == string("$err") ) {
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }

                sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                         << t.toStringPretty() << " to " << minValid.toStringPretty());
            }
        }
        catch(DBException& e) {
            log() << "replSet initial sync failing: " << e.toString() << rsLog;
            return false;
        }

        /* we lock outside the loop to avoid the overhead of locking on every operation. */
        writelock lk("");

        // todo : use exhaust
        OpTime ts;
        time_t start = time(0);
        unsigned long long n = 0;
        int fails = 0;
        while( ts < minValid ) {
            try {
                // There are some special cases with initial sync (see the catch block), so we
                // don't want to break out of this while until we've reached minvalid. Thus, we'll
                // keep trying to requery.
                if( !r.more() ) {
                    OCCASIONALLY log() << "replSet initial sync oplog: no more records" << endl;
                    sleepsecs(1);

                    r.resetCursor();
                    r.tailingQueryGTE(rsoplog, theReplSet->lastOpTimeWritten);
                    if ( !r.haveCursor() ) {
                        if (fails++ > 30) {
                            log() << "replSet initial sync tried to query oplog 30 times, giving up" << endl;
                            return false;
                        }
                    }

                    continue;
                }

                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                ts = o["ts"]._opTime();

                {
                    if( (source->state() != MemberState::RS_PRIMARY &&
                            source->state() != MemberState::RS_SECONDARY) ||
                            replSetForceInitialSyncFailure ) {

                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    applyOp(o, applyGTE);
                }

                if ( ++n % 1000 == 0 ) {
                    time_t now = time(0);
                    if (now - start > 10) {
                        // simple progress metering
                        log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                              << ts.toStringPretty() << rsLog;
                        start = now;
                    }
                }

                getDur().commitIfNeeded();
            }
            catch (DBException& e) {
                // Skip duplicate key exceptions.
                // These are relatively common on initial sync: if a document is inserted
                // early in the clone step, the insert will be replayed but the document
                // will probably already have been cloned over.
                if( e.getCode() == 11000 || e.getCode() == 11001 || e.getCode() == 12582) {
                    continue;
                }
                
                // handle cursor not found (just requery)
                if( e.getCode() == 13127 ) {
                    log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
                    r.resetCursor();
                    r.tailingQueryGTE(rsoplog, ts);
                    if( r.haveCursor() ) {
                        continue;
                    }
                }

                // TODO: handle server restart

                if( ts <= minValid ) {
                    // didn't make it far enough
                    log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
                    return false;
                }

                // otherwise, whatever, we'll break out of the loop and catch
                // anything that's really wrong in syncTail
            }
        }
        return true;
    }
コード例 #25
0
ファイル: rs_initialsync.cpp プロジェクト: Xyand/mongo
    /**
     * Do the initial sync for this member.  There are several steps to this process:
     *
     *     1. Record start time.
     *     2. Clone.
     *     3. Set minValid1 to sync target's latest op time.
     *     4. Apply ops from start to minValid1, fetching missing docs as needed.
     *     5. Set minValid2 to sync target's latest op time.
     *     6. Apply ops from minValid1 to minValid2.
     *     7. Build indexes.
     *     8. Set minValid3 to sync target's latest op time.
     *     9. Apply ops from minValid2 to minValid3.
     *
     * At that point, initial sync is finished.  Note that the oplog from the sync target is applied
     * three times: step 4, 6, and 8.  4 may involve refetching, 6 should not.  By the end of 6,
     * this member should have consistent data.  8 is "cosmetic," it is only to get this member
     * closer to the latest op time before it can transition to secondary state.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        replset::SyncTail tail(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        // written by applyToHead calls
        BSONObj minValid;

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);

            log() << "oplog sync 1 of 3" << endl;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;

            // Now we sync to the latest op on the sync target _again_, as we may have recloned ops
            // that were "from the future" compared with minValid. During this second application,
            // nothing should need to be recloned.
            log() << "oplog sync 2 of 3" << endl;
            if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) {
                return;
            }
            // data should now be consistent

            lastOp = minValid;

            sethbmsg("initial sync building indexes",0);
            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        log() << "oplog sync 3 of 3" << endl;
        if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) {
            return;
        }
        
        // ---------


        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }

            theReplSet->setMinValid(minValid);

            cx.ctx().db()->flushFiles(true);
        }

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
コード例 #26
0
ファイル: rs_initialsync.cpp プロジェクト: dgriffith/mongo
    /**
     * Do the initial sync for this member.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }
        
        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }
        OpTime startingTS = lastOp["ts"]._opTime();

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();
            for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
                string db = *i;
                if( db != "local" ) {
                    sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
                    bool ok;
                    {
                        writelock lk(db);
                        Client::Context ctx(db);
                        ok = clone(sourceHostname.c_str(), db);
                    }
                    if( !ok ) {
                        sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0);
                        sleepsecs(300);
                        return;
                    }
                }
            }
        }

        sethbmsg("initial sync query minValid",0);

        isyncassert( "initial sync source must remain readable throughout our initial sync", source->state().readable() );

        /* our cloned copy will be strange until we apply oplog events that occurred
           through the process.  we note that time point here. */
        BSONObj minValid = r.getLastOp(rsoplog);
        isyncassert( "getLastOp is empty ", !minValid.isEmpty() );
        OpTime mvoptime = minValid["ts"]._opTime();
        assert( !mvoptime.isNull() );

        /* apply relevant portion of the oplog
        */
        {
            isyncassert( str::stream() << "initial sync source must remain readable throughout our initial sync [2] state now: " << source->state().toString() , source->state().readable() );
            if( ! initialSyncOplogApplication(source, /*applyGTE*/startingTS, /*minValid*/mvoptime) ) { // note we assume here that this call does not throw
                log() << "replSet initial sync failed during applyoplog" << rsLog;
                emptyOplog(); // otherwise we'll be up!
                
                lastOpTimeWritten = OpTime();
                lastH = 0;
                
                log() << "replSet cleaning up [1]" << rsLog;
                {
                    writelock lk("local.");
                    Client::Context cx( "local." );
                    cx.db()->flushFiles(true);
                }
                log() << "replSet cleaning up [2]" << rsLog;
                sleepsecs(5);
                return;
            }
        }

        sethbmsg("initial sync finishing up",0);

        assert( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            writelock lk("local.");
            Client::Context cx( "local." );
            cx.db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }
            Helpers::putSingleton("local.replset.minvalid", minValid);
            cx.db()->flushFiles(true);
        }

        sethbmsg("initial sync done",0);
    }
コード例 #27
0
    void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
        DBClientConnection *them = r.conn();

        // fetch all first so we needn't handle interruption in a fancy way

        unsigned long long totSize = 0;

        list< pair<DocID,bo> > goodVersions;

        bo newMinValid;

        /* fetch all the goodVersions of each document from current primary */
        DocID d;
        unsigned long long n = 0;
        try {
            for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
                d = *i;

                assert( !d._id.eoo() );

                {
                    /* TODO : slow.  lots of round trips. */
                    n++;
                    bo good= them->findOne(d.ns, d._id.wrap()).getOwned();
                    totSize += good.objsize();
                    uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );

                    // note good might be eoo, indicating we should delete it
                    goodVersions.push_back(pair<DocID,bo>(d,good));
                }
            }
            newMinValid = r.getLastOp(rsoplog);
            if( newMinValid.isEmpty() ) {
                sethbmsg("rollback error newMinValid empty?");
                return;
            }
        }
        catch(DBException& e) {
            sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
            log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
            throw e;
        }

        MemoryMappedFile::flushAll(true);

        sethbmsg("rollback 3.5");
        if( h.rbid != getRBID(r.conn()) ) {
            // our source rolled back itself.  so the data we received isn't necessarily consistent.
            sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
            return;
        }

        // update them
        sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());

        bool warn = false;

        assert( !h.commonPointOurDiskloc.isNull() );

        dbMutex.assertWriteLocked();

        /* we have items we are writing that aren't from a point-in-time.  thus best not to come online
           until we get to that point in freshness. */
        setMinValid(newMinValid);

        /** any full collection resyncs required? */
        if( !h.collectionsToResync.empty() ) {
            for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
                string ns = *i;
                sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
                Client::Context c(*i, dbpath, 0, /*doauth*/false);
                try {
                    bob res;
                    string errmsg;
                    dropCollection(ns, errmsg, res);
                    {
                        dbtemprelease r;
                        bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false);
                        if( !ok ) {
                            log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog;
                            throw "rollback error resyncing rollection [1]";
                        }
                    }
                }
                catch(...) {
                    log() << "replset rollback error resyncing collection " << ns << rsLog;
                    throw "rollback error resyncing rollection [2]";
                }
            }

            /* we did more reading from primary, so check it again for a rollback (which would mess us up), and
               make minValid newer.
               */
            sethbmsg("rollback 4.2");
            {
                string err;
                try {
                    newMinValid = r.getLastOp(rsoplog);
                    if( newMinValid.isEmpty() ) {
                        err = "can't get minvalid from primary";
                    }
                    else {
                        setMinValid(newMinValid);
                    }
                }
                catch(...) {
                    err = "can't get/set minvalid";
                }
                if( h.rbid != getRBID(r.conn()) ) {
                    // our source rolled back itself.  so the data we received isn't necessarily consistent.
                    // however, we've now done writes.  thus we have a problem.
                    err += "rbid at primary changed during resync/rollback";
                }
                if( !err.empty() ) {
                    log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
                    /* todo: reset minvalid so that we are permanently in fatal state */
                    /* todo: don't be fatal, but rather, get all the data first. */
                    sethbmsg("rollback error");
                    throw rsfatal();
                }
            }
            sethbmsg("rollback 4.3");
        }

        sethbmsg("rollback 4.6");
        /** drop collections to drop before doing individual fixups - that might make things faster below actually if there were subsequent inserts to rollback */
        for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
            Client::Context c(*i, dbpath, 0, /*doauth*/false);
            try {
                bob res;
                string errmsg;
                log(1) << "replSet rollback drop: " << *i << rsLog;
                dropCollection(*i, errmsg, res);
            }
            catch(...) {
                log() << "replset rollback error dropping collection " << *i << rsLog;
            }
        }

        sethbmsg("rollback 4.7");
        Client::Context c(rsoplog, dbpath, 0, /*doauth*/false);
        NamespaceDetails *oplogDetails = nsdetails(rsoplog);
        uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);

        map<string,shared_ptr<RemoveSaver> > removeSavers;

        unsigned deletes = 0, updates = 0;
        for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
            const DocID& d = i->first;
            bo pattern = d._id.wrap(); // { _id : ... }
            try {
                assert( d.ns && *d.ns );
                if( h.collectionsToResync.count(d.ns) ) {
                    /* we just synced this entire collection */
                    continue;
                }

                /* keep an archive of items rolled back */
                shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
                if ( ! rs )
                    rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );

                // todo: lots of overhead in context, this can be faster
                Client::Context c(d.ns, dbpath, 0, /*doauth*/false);
                if( i->second.isEmpty() ) {
                    // wasn't on the primary; delete.
                    /* TODO1.6 : can't delete from a capped collection.  need to handle that here. */
                    deletes++;

                    NamespaceDetails *nsd = nsdetails(d.ns);
                    if( nsd ) {
                        if( nsd->capped ) {
                            /* can't delete from a capped collection - so we truncate instead. if this item must go,
                            so must all successors!!! */
                            try {
                                /** todo: IIRC cappedTrunateAfter does not handle completely empty.  todo. */
                                // this will crazy slow if no _id index.
                                long long start = Listener::getElapsedTimeMillis();
                                DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
                                if( Listener::getElapsedTimeMillis() - start > 200 )
                                    log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
                                //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
                                if( !loc.isNull() ) {
                                    try {
                                        nsd->cappedTruncateAfter(d.ns, loc, true);
                                    }
                                    catch(DBException& e) {
                                        if( e.getCode() == 13415 ) {
                                            // hack: need to just make cappedTruncate do this...
                                            nsd->emptyCappedCollection(d.ns);
                                        }
                                        else {
                                            throw;
                                        }
                                    }
                                }
                            }
                            catch(DBException& e) {
                                log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
                            }
                        }
                        else {
                            try {
                                deletes++;
                                deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
                            }
                            catch(...) {
                                log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
                            }
                        }
                        // did we just empty the collection?  if so let's check if it even exists on the source.
                        if( nsd->stats.nrecords == 0 ) {
                            try {
                                string sys = cc().database()->name + ".system.namespaces";
                                bo o = them->findOne(sys, QUERY("name"<<d.ns));
                                if( o.isEmpty() ) {
                                    // we should drop
                                    try {
                                        bob res;
                                        string errmsg;
                                        dropCollection(d.ns, errmsg, res);
                                    }
                                    catch(...) {
                                        log() << "replset error rolling back collection " << d.ns << rsLog;
                                    }
                                }
                            }
                            catch(DBException& ) {
                                /* this isn't *that* big a deal, but is bad. */
                                log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
                            }
                        }
                    }
                }
                else {
                    // todo faster...
                    OpDebug debug;
                    updates++;
                    _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
                }
            }
            catch(DBException& e) {
                log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
                warn = true;
            }
        }

        removeSavers.clear(); // this effectively closes all of them

        sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
        MemoryMappedFile::flushAll(true);
        sethbmsg("rollback 6");

        // clean up oplog
        log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
        // todo: fatal error if this throws?
        oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);

        /* reset cached lastoptimewritten and h value */
        loadLastOpTimeWritten();

        sethbmsg("rollback 7");
        MemoryMappedFile::flushAll(true);

        // done
        if( warn )
            sethbmsg("issues during syncRollback, see log");
        else
            sethbmsg("rollback done");
    }
コード例 #28
0
ファイル: rs_sync.cpp プロジェクト: Ry4an/mongo
    bool ReplSetImpl::_initialSyncOplogApplication(OplogReader& r, const Member *source,
        const OpTime& applyGTE, const OpTime& minValid) {

        const string hn = source->fullName();
        OplogReader missingObjReader;
        try {
            r.tailingQueryGTE( rsoplog, applyGTE );
            if ( !r.haveCursor() ) {
                log() << "replSet initial sync oplog query error" << rsLog;
                return false;
            }

            {
                if( !r.more() ) {
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElementFieldName() == string("$err") ) {
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }

                sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                         << t.toStringPretty() << " to " << minValid.toStringPretty());
            }
        }
        catch(DBException& e) {
            log() << "replSet initial sync failing: " << e.toString() << rsLog;
            return false;
        }

        /* we lock outside the loop to avoid the overhead of locking on every operation. */
        writelock lk("");

        // todo : use exhaust
        OpTime ts;
        time_t start = time(0);
        unsigned long long n = 0;
        while( 1 ) {
            try {
                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                ts = o["ts"]._opTime();

                {
                    if( (source->state() != MemberState::RS_PRIMARY &&
                            source->state() != MemberState::RS_SECONDARY) ||
                            replSetForceInitialSyncFailure ) {

                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) { // optimes before we started copying need not be applied.
                        bool failedUpdate = syncApply(o);
                        if( failedUpdate ) {
                            // we don't have the object yet, which is possible on initial sync.  get it.
                            log() << "replSet info adding missing object" << endl; // rare enough we can log
                            if( !missingObjReader.connect(hn) ) { // ok to call more than once
                                log() << "replSet initial sync fails, couldn't connect to " << hn << endl;
                                return false;
                            }
                            const char *ns = o.getStringField("ns");
                            BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); // might be more than just _id in the update criteria
                            BSONObj missingObj;
                            try {
                                missingObj = missingObjReader.findOne(
                                    ns, 
                                    query );
                            } catch(...) { 
                                log() << "replSet assertion fetching missing object" << endl;
                                throw;
                            }
                            if( missingObj.isEmpty() ) { 
                                log() << "replSet missing object not found on source. presumably deleted later in oplog" << endl;
                                log() << "replSet o2: " << o.getObjectField("o2").toString() << endl;
                                log() << "replSet o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
                            }
                            else {
                                Client::Context ctx(ns);
                                try {
                                    DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
                                    assert( !d.isNull() );
                                } catch(...) { 
                                    log() << "replSet assertion during insert of missing object" << endl;
                                    throw;
                                }
                                // now reapply the update from above
                                bool failed = syncApply(o);
                                if( failed ) {
                                    log() << "replSet update still fails after adding missing object " << ns << endl;
                                    assert(false);
                                }
                            }
                        }
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }

                if ( ++n % 1000 == 0 ) {
                    time_t now = time(0);
                    if (now - start > 10) {
                        // simple progress metering
                        log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                              << ts.toStringPretty() << rsLog;
                        start = now;
                    }
                }

                if ( ts > minValid ) {
                    break;
                }

                getDur().commitIfNeeded();
            }
            catch (DBException& e) {
                // skip duplicate key exceptions
                if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                    continue;
                }
                
                // handle cursor not found (just requery)
                if( e.getCode() == 13127 ) {
                    log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
                    r.resetCursor();
                    r.tailingQueryGTE(rsoplog, ts);
                    if( r.haveCursor() ) {
                        continue;
                    }
                }

                // TODO: handle server restart

                if( ts <= minValid ) {
                    // didn't make it far enough
                    log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
                    return false;
                }

                // otherwise, whatever
                break;
            }
        }
        return true;
    }
コード例 #29
0
ファイル: rs_sync.cpp プロジェクト: LsRbls/mongo
    /* initial oplog application, during initial sync, after cloning.
       @return false on failure.
       this method returns an error and doesn't throw exceptions (i think).
    */
    bool ReplSetImpl::initialSyncOplogApplication(
        const Member *source,
        OpTime applyGTE,
        OpTime minValid) {
        if( source == 0 ) return false;

        const string hn = source->h().toString();
        OplogReader r;
        try {
            if( !r.connect(hn) ) {
                log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
                return false;
            }

            r.queryGTE( rsoplog, applyGTE );
            assert( r.haveCursor() );

            {
                if( !r.more() ) {
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElement().fieldName() == string("$err") ) {
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }

                sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                         << t.toStringPretty() << " to " << minValid.toStringPretty());
            }
        }
        catch(DBException& e) {
            log() << "replSet initial sync failing: " << e.toString() << rsLog;
            return false;
        }

        /* we lock outside the loop to avoid the overhead of locking on every operation. */
        writelock lk("");

        // todo : use exhaust
        OpTime ts;
        time_t start = time(0);
        unsigned long long n = 0;
        while( 1 ) {
            try {
                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                {
                    ts = o["ts"]._opTime();

                    /* if we have become primary, we dont' want to apply things from elsewhere
                        anymore. assumePrimary is in the db lock so we are safe as long as
                        we check after we locked above. */
                    if( (source->state() != MemberState::RS_PRIMARY &&
                            source->state() != MemberState::RS_SECONDARY) ||
                            replSetForceInitialSyncFailure ) {

                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) {
                        // optimes before we started copying need not be applied.
                        syncApply(o);
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }

                if ( ++n % 1000 == 0 ) {
                    time_t now = time(0);
                    if (now - start > 10) {
                        // simple progress metering
                        log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                              << ts.toStringPretty() << rsLog;
                        start = now;
                    }
                }
                
                getDur().commitIfNeeded();
            }
            catch (DBException& e) {
                // skip duplicate key exceptions
                if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                    continue;
                }
                
                // handle cursor not found (just requery)
                if( e.getCode() == 13127 ) {
                    r.resetCursor();
                    r.queryGTE(rsoplog, ts);
                    if( r.haveCursor() ) {
                        continue;
                    }
                }

                // TODO: handle server restart

                if( ts <= minValid ) {
                    // didn't make it far enough
                    log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
                    return false;
                }

                // otherwise, whatever
                break;
            }
        }
        return true;
    }
コード例 #30
0
    unsigned ReplSetImpl::_syncRollback(OplogReader&r) {
        assert( !lockedByMe() );
        assert( !dbMutex.atLeastReadLocked() );

        sethbmsg("rollback 0");

        writelocktry lk(rsoplog, 20000);
        if( !lk.got() ) {
            sethbmsg("rollback couldn't get write lock in a reasonable time");
            return 2;
        }

        if( box.getState().secondary() ) {
            /* by doing this, we will not service reads (return an error as we aren't in secondary staate.
               that perhaps is moot becasue of the write lock above, but that write lock probably gets deferred
               or removed or yielded later anyway.

               also, this is better for status reporting - we know what is happening.
               */
            box.change(MemberState::RS_ROLLBACK, _self);
        }

        HowToFixUp how;
        sethbmsg("rollback 1");
        {
            r.resetCursor();
            /*DBClientConnection us(false, 0, 0);
            string errmsg;
            if( !us.connect(HostAndPort::me().toString(),errmsg) ) {
                sethbmsg("rollback connect to self failure" + errmsg);
                return;
            }*/

            sethbmsg("rollback 2 FindCommonPoint");
            try {
                syncRollbackFindCommonPoint(r.conn(), how);
            }
            catch( const char *p ) {
                sethbmsg(string("rollback 2 error ") + p);
                return 10;
            }
            catch( rsfatal& ) {
                _fatal();
                return 2;
            }
            catch( DBException& e ) {
                sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min");
                dbtemprelease r;
                sleepsecs(60);
                throw;
            }
        }

        sethbmsg("replSet rollback 3 fixup");

        {
            incRBID();
            try {
                syncFixUp(how, r);
            }
            catch( rsfatal& ) {
                sethbmsg("rollback fixup error");
                _fatal();
                return 2;
            }
            catch(...) {
                incRBID(); throw;
            }
            incRBID();

            /* success - leave "ROLLBACK" state
               can go to SECONDARY once minvalid is achieved
            */
            box.change(MemberState::RS_RECOVERING, _self);
        }

        return 0;
    }