Ejemplo n.º 1
0
 void rollbackToGTID(GTID idToRollbackTo, RollbackDocsMap* docsMap, RollbackSaveData* rsSave) {
     // at this point, everything should be settled, the applier should
     // have nothing left (and remain that way, because this is the only
     // thread that can put work on the applier). Now we can rollback
     // the data.
     while (true) {
         BSONObj o;
         {
             LOCK_REASON(lockReason, "repl: checking for oplog data");
             Client::ReadContext ctx(rsoplog, lockReason);
             Client::Transaction txn(DB_SERIALIZABLE);
             // if there is nothing in the oplog, break
             o = getLastEntryInOplog();
             if (o.isEmpty()) {
                 throw RollbackOplogException("Oplog empty when rolling back to a GTID");
             }
         }
         GTID lastGTID = getGTIDFromBSON("_id", o);
         // if we have rolled back enough, break from while loop
         if (GTID::cmp(lastGTID, idToRollbackTo) <= 0) {
             dassert(GTID::cmp(lastGTID, idToRollbackTo) == 0);
             break;
         }
         rollbackTransactionFromOplog(o, docsMap, rsSave);
     }
     log() << "Rolling back to " << idToRollbackTo.toString() << " produced " <<
         docsMap->size() << " documents for which we need to retrieve a snapshot of." << rsLog;
 }
Ejemplo n.º 2
0
    void BackgroundSync::runRollback(OplogReader& r, uint64_t oplogTS) {
        // starting from ourLast, we need to read the remote oplog
        // backwards until we find an entry in the remote oplog
        // that has the same GTID, timestamp, and hash as
        // what we have in our oplog. If we don't find one that is within
        // some reasonable timeframe, then we go fatal
        GTID ourLast = theReplSet->gtidManager->getLiveState();
        GTID idToRollbackTo;
        uint64_t rollbackPointTS = 0;
        uint64_t rollbackPointHash = 0;
        incRBID();
        try {
            shared_ptr<DBClientCursor> rollbackCursor = r.getRollbackCursor(ourLast);
            while (rollbackCursor->more()) {
                BSONObj remoteObj = rollbackCursor->next();
                GTID remoteGTID = getGTIDFromBSON("_id", remoteObj);
                uint64_t remoteTS = remoteObj["ts"]._numberLong();
                uint64_t remoteLastHash = remoteObj["h"].numberLong();
                if (remoteTS + 1800*1000 < oplogTS) {
                    log() << "Rollback takes us too far back, throwing exception. remoteTS: " << remoteTS << " oplogTS: " << oplogTS << rsLog;
                    throw RollbackOplogException("replSet rollback too long a time period for a rollback (at least 30 minutes).");
                    break;
                }
                //now try to find an entry in our oplog with that GTID
                BSONObjBuilder localQuery;
                BSONObj localObj;
                addGTIDToBSON("_id", remoteGTID, localQuery);
                bool foundLocally = false;
                {
                    LOCK_REASON(lockReason, "repl: looking up oplog entry for rollback");
                    Client::ReadContext ctx(rsoplog, lockReason);
                    Client::Transaction transaction(DB_SERIALIZABLE);
                    foundLocally = Collection::findOne(rsoplog, localQuery.done(), localObj);
                    transaction.commit();
                }
                if (foundLocally) {
                    GTID localGTID = getGTIDFromBSON("_id", localObj);
                    uint64_t localTS = localObj["ts"]._numberLong();
                    uint64_t localLastHash = localObj["h"].numberLong();
                    if (localLastHash == remoteLastHash &&
                        localTS == remoteTS &&
                        GTID::cmp(localGTID, remoteGTID) == 0
                        )
                    {
                        idToRollbackTo = localGTID;
                        rollbackPointTS = localTS;
                        rollbackPointHash = localLastHash;
                        log() << "found id to rollback to " << idToRollbackTo << rsLog;
                        break;
                    }
                }
            }
            // At this point, either we have found the point to try to rollback to,
            // or we have determined that we cannot rollback
            if (idToRollbackTo.isInitial()) {
                // we cannot rollback
                throw RollbackOplogException("could not find ID to rollback to");
            }
        }
        catch (DBException& e) {
            log() << "Caught DBException during rollback " << e.toString() << rsLog;
            throw RollbackOplogException("DBException while trying to find ID to rollback to: " + e.toString());
        }
        catch (std::exception& e2) {
            log() << "Caught std::exception during rollback " << e2.what() << rsLog;
            throw RollbackOplogException(str::stream() << "Exception while trying to find ID to rollback to: " << e2.what());
        }

        // proceed with the rollback to point idToRollbackTo
        // probably ought to grab a global write lock while doing this
        // I don't think we want oplog cursors reading from this machine
        // while we are rolling back. Or at least do something to protect against this

        // first, let's get all the operations that are being applied out of the way,
        // we don't want to rollback an item in the oplog while simultaneously,
        // the applier thread is applying it to the oplog
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            while (_deque.size() > 0) {
                log() << "waiting for applier to finish work before doing rollback " << rsLog;
                _queueDone.wait(lock);
            }
            verifySettled();
        }

        // now let's tell the system we are going to rollback, to do so,
        // abort live multi statement transactions, invalidate cursors, and
        // change the state to RS_ROLLBACK
        {
            // so we know nothing is simultaneously occurring
            RWLockRecursive::Exclusive e(operationLock);
            LOCK_REASON(lockReason, "repl: killing all operations for rollback");
            Lock::GlobalWrite lk(lockReason);
            ClientCursor::invalidateAllCursors();
            Client::abortLiveTransactions();
            theReplSet->goToRollbackState();
        }

        try {
            // now that we are settled, we have to take care of the GTIDManager
            // and the repl info thread.
            // We need to reset the state of the GTIDManager to the point
            // we intend to rollback to, and we need to make sure that the repl info thread
            // has captured this information.
            theReplSet->gtidManager->resetAfterInitialSync(
                idToRollbackTo,
                rollbackPointTS,
                rollbackPointHash
                );
            // now force an update of the repl info thread
            theReplSet->forceUpdateReplInfo();

            // at this point, everything should be settled, the applier should
            // have nothing left (and remain that way, because this is the only
            // thread that can put work on the applier). Now we can rollback
            // the data.
            while (true) {
                BSONObj o;
                {
                    LOCK_REASON(lockReason, "repl: checking for oplog data");
                    Lock::DBRead lk(rsoplog, lockReason);
                    Client::Transaction txn(DB_SERIALIZABLE);
                    // if there is nothing in the oplog, break
                    o = getLastEntryInOplog();
                    if( o.isEmpty() ) {
                        break;
                    }
                }
                GTID lastGTID = getGTIDFromBSON("_id", o);
                // if we have rolled back enough, break from while loop
                if (GTID::cmp(lastGTID, idToRollbackTo) <= 0) {
                    dassert(GTID::cmp(lastGTID, idToRollbackTo) == 0);
                    break;
                }
                rollbackTransactionFromOplog(o, true);
            }
            theReplSet->leaveRollbackState();
        }
        catch (DBException& e) {
            log() << "Caught DBException during rollback " << e.toString() << rsLog;
            throw RollbackOplogException("DBException while trying to run rollback: " + e.toString());
        }
        catch (std::exception& e2) {
            log() << "Caught std::exception during rollback " << e2.what() << rsLog;
            throw RollbackOplogException(str::stream() << "Exception while trying to run rollback: " << e2.what());
        }
        
    }
Ejemplo n.º 3
0
        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            log() << "replSet replSetInitiate admin command received from client" << rsLog;

            if( !replSet ) {
                errmsg = "server is not running with --replSet";
                return false;
            }
            if( theReplSet ) {
                errmsg = "already initialized";
                result.append("info", "try querying " + rsConfigNs + " to see current configuration");
                return false;
            }

            {
                // just make sure we can get a write lock before doing anything else.  we'll reacquire one
                // later.  of course it could be stuck then, but this check lowers the risk if weird things
                // are up.
                time_t t = time(0);
                Lock::GlobalWrite lk;
                if( time(0)-t > 10 ) {
                    errmsg = "took a long time to get write lock, so not initiating.  Initiate when server less busy?";
                    return false;
                }

                /* check that we don't already have an oplog.  that could cause issues.
                   it is ok if the initiating member has *other* data than that.
                   */
                Client::Transaction transaction(DB_SERIALIZABLE);
                BSONObj o = getLastEntryInOplog();
                if( !o.isEmpty() ) {
                    errmsg = rsoplog + string(" is not empty on the initiating member.  cannot initiate.");
                    return false;
                }
                transaction.commit();
            }

            if( ReplSet::startupStatus == ReplSet::BADCONFIG ) {
                errmsg = "server already in BADCONFIG state (check logs); not initiating";
                result.append("info", ReplSet::startupStatusMsg.get());
                return false;
            }
            if( ReplSet::startupStatus != ReplSet::EMPTYCONFIG ) {
                result.append("startupStatus", ReplSet::startupStatus);
                errmsg = "all members and seeds must be reachable to initiate set";
                result.append("info", cmdLine._replSet);
                return false;
            }

            BSONObj configObj;

            if( cmdObj["replSetInitiate"].type() != Object ) {
                result.append("info2", "no configuration explicitly specified -- making one");
                log() << "replSet info initiate : no configuration specified.  Using a default configuration for the set" << rsLog;

                string name;
                vector<HostAndPort> seeds;
                set<HostAndPort> seedSet;
                parseReplsetCmdLine(cmdLine._replSet, name, seeds, seedSet); // may throw...

                bob b;
                b.append("_id", name);
                b.append("protocolVersion", ReplSetConfig::CURRENT_PROTOCOL_VERSION);
                bob members;
                members.append("0", BSON( "_id" << 0 << "host" << HostAndPort::me().toString() ));
                result.append("me", HostAndPort::me().toString());
                for( unsigned i = 0; i < seeds.size(); i++ )
                    members.append(bob::numStr(i+1), BSON( "_id" << i+1 << "host" << seeds[i].toString()));
                b.appendArray("members", members.obj());
                configObj = b.obj();
                log() << "replSet created this configuration for initiation : " << configObj.toString() << rsLog;
            }
            else {
                configObj = ReplSetConfig::addProtocolVersionIfMissing(cmdObj["replSetInitiate"].Obj());
            }

            bool parsed = false;
            try {
                ReplSetConfig newConfig(configObj);
                parsed = true;

                if( newConfig.version > 1 ) {
                    errmsg = "can't initiate with a version number greater than 1";
                    return false;
                }

                log() << "replSet replSetInitiate config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;

                checkMembersUpForConfigChange(newConfig, result, true);

                log() << "replSet replSetInitiate all members seem up" << rsLog;

                Lock::GlobalWrite lk;
                {
                    Client::Transaction transaction(DB_SERIALIZABLE);
                    createOplog();
                    openOplogFiles();
                    GTID minLiveGTID;
                    GTID minUnappliedGTID;
                    logToReplInfo(minLiveGTID, minUnappliedGTID);
                    transaction.commit();
                }
                bo comment = BSON( "msg" << "initiating set");
                newConfig.saveConfigLocally(comment, true);
                log() << "replSet replSetInitiate config now saved locally.  Should come online in about a minute." << rsLog;
                result.append("info", "Config now saved locally.  Should come online in about a minute.");
                ReplSet::startupStatus = ReplSet::SOON;
                ReplSet::startupStatusMsg.set("Received replSetInitiate - should come online shortly.");
            }
            catch( DBException& e ) {
                log() << "replSet replSetInitiate exception: " << e.what() << rsLog;
                if( !parsed )
                    errmsg = string("couldn't parse cfg object ") + e.what();
                else
                    errmsg = string("couldn't initiate : ") + e.what();
                return false;
            }
            catch( string& e2 ) {
                log() << e2 << rsLog;
                errmsg = e2;
                return false;
            }

            return true;
        }