Beispiel #1
0
    /**
     * Checks the next entry delivered by the oplog reader against the local
     * GTIDManager state and, when they disagree, runs the rollback.
     *
     * @param r oplog reader; its connection is dereferenced immediately, so it
     *          must already be connected.
     * @return false when the GTIDManager reports that no rollback is needed.
     *         true when the reader returned no data at all, or after
     *         runRollback() has returned.
     */
    bool BackgroundSync::isRollbackRequired(OplogReader& r) {
        string hn = r.conn()->getServerAddress();
        if (!r.more()) {
            // In vanilla Mongo, this happened for one of the
            // following reasons:
            //  - we were ahead of what we are syncing from (don't
            //    think that is possible anymore)
            //  - remote oplog is empty for some weird reason
            // in either case, if it (strangely) happens, we'll just return
            // and our caller will simply try again after a short sleep.
            // NOTE(review): no rollback is actually run on this path even though
            // the message says "attempting rollback"; what the caller does with
            // the `true` result is not visible here -- confirm.
            log() << "replSet error empty query result from " << hn << " oplog, attempting rollback" << rsLog;
            return true;
        }

        // The entry read from the reader: its timestamp, hash and GTID are what
        // the GTIDManager compares against the local state.
        BSONObj o = r.nextSafe();
        uint64_t ts = o["ts"]._numberLong();
        uint64_t lastHash = o["h"].numberLong();
        GTID gtid = getGTIDFromBSON("_id", o);

        if (!theReplSet->gtidManager->rollbackNeeded(gtid, ts, lastHash)) {
            // gtid comes from the reader's entry, not from our own state, so it
            // is labelled as the remote GTID.
            log() << "Rollback NOT needed! Remote GTID: " << gtid << endl;
            return false;
        }

        log() << "Rollback needed! Our GTID: " <<
            theReplSet->gtidManager->getLiveState().toString() <<
            ", remote GTID: " << gtid.toString() << ". Attempting rollback." << rsLog;

        runRollback(r, ts);
        return true;
    }
Beispiel #2
0
    /**
     * Decides whether a rollback has to be performed before syncing can go on.
     *
     * A rollback is required when a rollback status record already exists
     * (a previous rollback did not finish), or when the GTIDManager reports
     * that the next entry from the reader conflicts with the local state.
     *
     * @param r      oplog reader; must have data available (enforced by verify()).
     * @param lastTS out-parameter; receives the timestamp of the entry read from
     *               the reader, but ONLY on the GTID-mismatch path. It is left
     *               untouched when false is returned or when an in-progress
     *               rollback status was found.
     * @return true when a rollback is required, false otherwise.
     */
    bool isRollbackRequired(OplogReader& r, uint64_t *lastTS) {
        verify(r.more());
        BSONObj rollbackStatus;
        bool found = getRollbackStatus(rollbackStatus);
        if (found) {
            // we have a rollback in progress,
            // must complete it
            log() << "Rollback needed, found rollbackStatus: " << rollbackStatus << rsLog;
            return true;
        }

        // Entry read from the reader; its GTID, timestamp and hash are what
        // the GTIDManager checks against the local state.
        BSONObj o = r.nextSafe();
        uint64_t ts = o["ts"]._numberLong();
        uint64_t lastHash = o["h"].numberLong();
        GTID gtid = getGTIDFromBSON("_id", o);

        if (!theReplSet->gtidManager->rollbackNeeded(gtid, ts, lastHash)) {
            log() << "Rollback NOT needed! " << gtid << endl;
            return false;
        }

        log() << "Rollback needed! Our GTID: " <<
            theReplSet->gtidManager->getLiveState().toString() <<
            ", remote GTID: " << gtid.toString() << ". Attempting rollback." << rsLog;

        *lastTS = ts;
        return true;
    }
Beispiel #3
0
 // Undoes oplog entries, newest first, until the newest remaining entry is
 // no longer past idToRollbackTo. Each undone entry is handed to
 // rollbackTransactionFromOplog() together with docsMap and rsSave.
 // Throws RollbackOplogException if the oplog turns out to be empty.
 void rollbackToGTID(GTID idToRollbackTo, RollbackDocsMap* docsMap, RollbackSaveData* rsSave) {
     // By the time we get here the applier is expected to be idle and to stay
     // idle, since this thread is the only one that feeds it work. That makes
     // it safe to start undoing data.
     for (;;) {
         BSONObj newestEntry;
         {
             // The read context and the transaction are scoped to the lookup
             // only; both are released before the entry is rolled back.
             LOCK_REASON(lockReason, "repl: checking for oplog data");
             Client::ReadContext ctx(rsoplog, lockReason);
             Client::Transaction txn(DB_SERIALIZABLE);
             newestEntry = getLastEntryInOplog();
             // an empty oplog means the target GTID can never be reached
             if (newestEntry.isEmpty()) {
                 throw RollbackOplogException("Oplog empty when rolling back to a GTID");
             }
         }
         GTID newestGTID = getGTIDFromBSON("_id", newestEntry);
         if (GTID::cmp(newestGTID, idToRollbackTo) > 0) {
             // still past the target: undo this entry and look again
             rollbackTransactionFromOplog(newestEntry, docsMap, rsSave);
             continue;
         }
         // rolled back far enough; we expect to land exactly on the target
         dassert(GTID::cmp(newestGTID, idToRollbackTo) == 0);
         break;
     }
     log() << "Rolling back to " << idToRollbackTo.toString() << " produced " <<
         docsMap->size() << " documents for which we need to retrieve a snapshot of." << rsLog;
 }
Beispiel #4
0
    // Records the GTID and timestamp of `curr` as the position this cursor has
    // read up to. Only done for cursors opened with QueryOption_OplogReplay,
    // and only when the document's "_id" is a BinData element.
    void ClientCursor::storeOpForSlave( BSONObj curr ) {
        const bool isOplogReplay = ( _queryOptions & QueryOption_OplogReplay ) != 0;
        if ( !isOplogReplay ) {
            return;
        }

        // documents whose _id is not binary data are ignored
        if ( curr["_id"].type() != BinData ) {
            return;
        }

        _slaveReadTill = getGTIDFromBSON("_id", curr);
        _slaveReadTillTS = curr["ts"]._numberLong();
    }
Beispiel #5
0
 // Returns true when the remote server's minUnapplied GTID is not behind
 // idToRollbackTo, false (after logging) otherwise.
 //
 // The check exists because a snapshot taken from a remote whose minUnapplied
 // is still before our rollback point would not be up to date, and the
 // rollback algorithm relies on that snapshot.
 bool canStartRollback(OplogReader& r, GTID idToRollbackTo) {
     shared_ptr<DBClientConnection> conn(r.conn_shared());

     // Look up the "minUnapplied" document in the remote repl info collection.
     // A heartbeat request would also carry this value (and technically a
     // fresher one), but a direct query is what is used here.
     BSONObjBuilder queryBuilder;
     queryBuilder.append("_id", "minUnapplied");
     BSONObj replInfo = findOneFromConn(conn.get(), rsReplInfo, Query(queryBuilder.done()));
     GTID minUnapplied = getGTIDFromBSON("GTID", replInfo);

     if (GTID::cmp(minUnapplied, idToRollbackTo) >= 0) {
         return true;
     }

     log() << "Remote server has minUnapplied " << minUnapplied.toString() <<
         " we want to rollback to " << idToRollbackTo.toString() <<
         ". Therefore, exiting and retrying." << rsLog;
     return false;
 }
Beispiel #6
0
    /**
     * Connects `r` to a member to sync from and records that member as the
     * current sync target.
     *
     * Candidates are taken from theReplSet->getMemberToSyncTo() one at a time.
     * A candidate we cannot connect to is vetoed and the next one is tried.
     * If no candidate is left, the current sync target is cleared (set to NULL)
     * and the reader stays unconnected.
     *
     * @param r oplog reader; must not be connected on entry (enforced by verify()).
     */
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL;

        verify(r.conn() == NULL);
        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                // drop the failed connection and exclude this member before retrying
                r.resetConnection();
                theReplSet->veto(current);
                continue;
            }

            // if we made it here, the target is up
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }

            return;
        }

        // No usable sync target. An earlier revision kept a "stale target"
        // branch here, but nothing in this function ever marked a member as
        // stale, so that branch could not execute and has been removed.
        // We simply return with a null sync target.
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
Beispiel #7
0
    /**
     * Rolls the local oplog back to the most recent entry it shares with the
     * remote oplog.
     *
     * Steps, in order:
     *  1. Walk the remote oplog backwards (via r.getRollbackCursor) looking for
     *     an entry whose GTID, timestamp and hash all match a local entry.
     *  2. Wait until the applier queue (_deque) is empty.
     *  3. Under the operation lock and a global write lock, invalidate all
     *     cursors, abort live transactions and enter the rollback state.
     *  4. Reset the GTIDManager to the rollback point, force a repl info
     *     update, then undo local oplog entries newest-first until the rollback
     *     point is reached, and leave the rollback state.
     *
     * @param r       reader connected to the member whose oplog we compare with.
     * @param oplogTS timestamp used as the reference for the "too far back"
     *                check: remote entries more than 1800*1000 older than it
     *                abort the rollback (30 minutes per the exception text).
     * @throws RollbackOplogException when no common point is found, when the
     *         search goes too far back, or when any DBException / std::exception
     *         is raised in step 1 or step 4 (those are caught and re-thrown
     *         wrapped in RollbackOplogException).
     */
    void BackgroundSync::runRollback(OplogReader& r, uint64_t oplogTS) {
        // starting from ourLast, we need to read the remote oplog
        // backwards until we find an entry in the remote oplog
        // that has the same GTID, timestamp, and hash as
        // what we have in our oplog. If we don't find one that is within
        // some reasonable timeframe, then we go fatal
        GTID ourLast = theReplSet->gtidManager->getLiveState();
        GTID idToRollbackTo;
        uint64_t rollbackPointTS = 0;
        uint64_t rollbackPointHash = 0;
        incRBID();
        try {
            shared_ptr<DBClientCursor> rollbackCursor = r.getRollbackCursor(ourLast);
            while (rollbackCursor->more()) {
                BSONObj remoteObj = rollbackCursor->next();
                GTID remoteGTID = getGTIDFromBSON("_id", remoteObj);
                uint64_t remoteTS = remoteObj["ts"]._numberLong();
                uint64_t remoteLastHash = remoteObj["h"].numberLong();
                // give up once the remote entries are older than the allowed window
                if (remoteTS + 1800*1000 < oplogTS) {
                    log() << "Rollback takes us too far back, throwing exception. remoteTS: " << remoteTS << " oplogTS: " << oplogTS << rsLog;
                    throw RollbackOplogException("replSet rollback too long a time period for a rollback (at least 30 minutes).");
                }
                //now try to find an entry in our oplog with that GTID
                BSONObjBuilder localQuery;
                BSONObj localObj;
                addGTIDToBSON("_id", remoteGTID, localQuery);
                bool foundLocally = false;
                {
                    // lock and transaction are held only for this single lookup
                    LOCK_REASON(lockReason, "repl: looking up oplog entry for rollback");
                    Client::ReadContext ctx(rsoplog, lockReason);
                    Client::Transaction transaction(DB_SERIALIZABLE);
                    foundLocally = Collection::findOne(rsoplog, localQuery.done(), localObj);
                    transaction.commit();
                }
                if (foundLocally) {
                    GTID localGTID = getGTIDFromBSON("_id", localObj);
                    uint64_t localTS = localObj["ts"]._numberLong();
                    uint64_t localLastHash = localObj["h"].numberLong();
                    // a common point requires hash, timestamp and GTID to all agree
                    if (localLastHash == remoteLastHash &&
                        localTS == remoteTS &&
                        GTID::cmp(localGTID, remoteGTID) == 0
                        )
                    {
                        idToRollbackTo = localGTID;
                        rollbackPointTS = localTS;
                        rollbackPointHash = localLastHash;
                        log() << "found id to rollback to " << idToRollbackTo << rsLog;
                        break;
                    }
                }
            }
            // At this point, either we have found the point to try to rollback to,
            // or we have determined that we cannot rollback
            if (idToRollbackTo.isInitial()) {
                // we cannot rollback
                throw RollbackOplogException("could not find ID to rollback to");
            }
        }
        catch (DBException& e) {
            log() << "Caught DBException during rollback " << e.toString() << rsLog;
            throw RollbackOplogException("DBException while trying to find ID to rollback to: " + e.toString());
        }
        catch (std::exception& e2) {
            log() << "Caught std::exception during rollback " << e2.what() << rsLog;
            throw RollbackOplogException(str::stream() << "Exception while trying to find ID to rollback to: " << e2.what());
        }

        // proceed with the rollback to point idToRollbackTo
        // probably ought to grab a global write lock while doing this
        // I don't think we want oplog cursors reading from this machine
        // while we are rolling back. Or at least do something to protect against this

        // first, let's get all the operations that are being applied out of the way,
        // we don't want to rollback an item in the oplog while simultaneously,
        // the applier thread is applying it to the oplog
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            while (_deque.size() > 0) {
                log() << "waiting for applier to finish work before doing rollback " << rsLog;
                _queueDone.wait(lock);
            }
            verifySettled();
        }

        // now let's tell the system we are going to rollback, to do so,
        // abort live multi statement transactions, invalidate cursors, and
        // change the state to RS_ROLLBACK
        {
            // so we know nothing is simultaneously occurring
            RWLockRecursive::Exclusive e(operationLock);
            LOCK_REASON(lockReason, "repl: killing all operations for rollback");
            Lock::GlobalWrite lk(lockReason);
            ClientCursor::invalidateAllCursors();
            Client::abortLiveTransactions();
            theReplSet->goToRollbackState();
        }

        try {
            // now that we are settled, we have to take care of the GTIDManager
            // and the repl info thread.
            // We need to reset the state of the GTIDManager to the point
            // we intend to rollback to, and we need to make sure that the repl info thread
            // has captured this information.
            theReplSet->gtidManager->resetAfterInitialSync(
                idToRollbackTo,
                rollbackPointTS,
                rollbackPointHash
                );
            // now force an update of the repl info thread
            theReplSet->forceUpdateReplInfo();

            // at this point, everything should be settled, the applier should
            // have nothing left (and remain that way, because this is the only
            // thread that can put work on the applier). Now we can rollback
            // the data.
            while (true) {
                BSONObj o;
                {
                    LOCK_REASON(lockReason, "repl: checking for oplog data");
                    Lock::DBRead lk(rsoplog, lockReason);
                    Client::Transaction txn(DB_SERIALIZABLE);
                    // if there is nothing in the oplog, break
                    // NOTE(review): an empty oplog ends the loop silently and
                    // the rollback state is then left as if we had succeeded --
                    // confirm this is intended.
                    o = getLastEntryInOplog();
                    if( o.isEmpty() ) {
                        break;
                    }
                }
                GTID lastGTID = getGTIDFromBSON("_id", o);
                // if we have rolled back enough, break from while loop
                if (GTID::cmp(lastGTID, idToRollbackTo) <= 0) {
                    dassert(GTID::cmp(lastGTID, idToRollbackTo) == 0);
                    break;
                }
                rollbackTransactionFromOplog(o, true);
            }
            // only reached when no exception was thrown above; on failure the
            // replica set is left in the rollback state
            theReplSet->leaveRollbackState();
        }
        catch (DBException& e) {
            log() << "Caught DBException during rollback " << e.toString() << rsLog;
            throw RollbackOplogException("DBException while trying to run rollback: " + e.toString());
        }
        catch (std::exception& e2) {
            log() << "Caught std::exception during rollback " << e2.what() << rsLog;
            throw RollbackOplogException(str::stream() << "Exception while trying to run rollback: " << e2.what());
        }

    }
Beispiel #8
0
 /**
  * Walks the remote oplog backwards, starting from our live GTID, until it
  * finds an entry that also exists locally with the same GTID, timestamp and
  * hash. That entry is the point to roll back to.
  *
  * @param r                 reader used to obtain the backwards cursor on the
  *                          remote oplog.
  * @param oplogTS           reference timestamp for the "too far back" check:
  *                          remote entries more than 1800*1000 older than it
  *                          abort the search (30 minutes per the exception text).
  * @param idToRollbackTo    out: GTID of the common entry.
  * @param rollbackPointTS   out: timestamp of the common entry.
  * @param rollbackPointHash out: hash of the common entry.
  *                          The three out-parameters are written only when a
  *                          common entry is found.
  * @throws RollbackOplogException when no common entry is found, when the
  *         search goes too far back, when no cursor could be obtained, or when
  *         any other DBException / std::exception is raised during the search
  *         (those are caught and re-thrown wrapped in RollbackOplogException).
  */
 void findRollbackPoint(
     OplogReader& r, uint64_t oplogTS,
     GTID* idToRollbackTo,
     uint64_t* rollbackPointTS,
     uint64_t* rollbackPointHash
     )
 {
     bool gtidFound = false;
     try {
         GTID ourLast = theReplSet->gtidManager->getLiveState();
         shared_ptr<DBClientCursor> rollbackCursor = r.getRollbackCursor(ourLast);
         uassert(17350, "rollback failed to get a cursor to start reading backwards from.", rollbackCursor.get());
         while (rollbackCursor->more()) {
             BSONObj remoteObj = rollbackCursor->next();
             GTID remoteGTID = getGTIDFromBSON("_id", remoteObj);
             uint64_t remoteTS = remoteObj["ts"]._numberLong();
             uint64_t remoteLastHash = remoteObj["h"].numberLong();
             // give up once the remote entries are older than the allowed window
             if (remoteTS + 1800*1000 < oplogTS) {
                 log() << "Rollback takes us too far back, throwing exception. remoteTS: " << remoteTS << " oplogTS: " << oplogTS << rsLog;
                 throw RollbackOplogException("replSet rollback too long a time period for a rollback (at least 30 minutes).");
             }
             //now try to find an entry in our oplog with that GTID
             BSONObjBuilder localQuery;
             BSONObj localObj;
             addGTIDToBSON("_id", remoteGTID, localQuery);
             bool foundLocally = false;
             {
                 // lock and transaction are held only for this single lookup
                 LOCK_REASON(lockReason, "repl: looking up oplog entry for rollback");
                 Client::ReadContext ctx(rsoplog, lockReason);
                 Client::Transaction transaction(DB_SERIALIZABLE);
                 foundLocally = Collection::findOne(rsoplog, localQuery.done(), localObj);
                 transaction.commit();
             }
             if (foundLocally) {
                 GTID localGTID = getGTIDFromBSON("_id", localObj);
                 uint64_t localTS = localObj["ts"]._numberLong();
                 uint64_t localLastHash = localObj["h"].numberLong();
                 // a common point requires hash, timestamp and GTID to all agree
                 if (localLastHash == remoteLastHash &&
                     localTS == remoteTS &&
                     GTID::cmp(localGTID, remoteGTID) == 0
                     )
                 {
                     *idToRollbackTo = localGTID;
                     *rollbackPointTS = localTS;
                     *rollbackPointHash = localLastHash;
                     gtidFound = true;
                     log() << "found id to rollback to " << idToRollbackTo->toString() << rsLog;
                     break;
                 }
             }
         }
         // At this point, either we have found the point to try to rollback to,
         // or we have determined that we cannot rollback
         if (!gtidFound) {
             // we cannot rollback
             throw RollbackOplogException("could not find ID to rollback to");
         }
     }
     catch (DBException& e) {
         log() << "Caught DBException during rollback " << e.toString() << rsLog;
         throw RollbackOplogException("DBException while trying to find ID to rollback to: " + e.toString());
     }
     catch (std::exception& e2) {
         log() << "Caught std::exception during rollback " << e2.what() << rsLog;
         throw RollbackOplogException(str::stream() << "Exception while trying to find ID to rollback to: " << e2.what());
     }
 }
Beispiel #9
0
 // Extracts the GTID stored under the "_id" field of an oplog entry.
 GTID getGTIDFromOplogEntry(BSONObj o) {
     GTID entryId = getGTIDFromBSON("_id", o);
     return entryId;
 }