Example #1
    void ReplSetImpl::syncTail() { 
        // todo : locking vis a vis the mgr...

        const Member *primary = box.getPrimary();
        if( primary == 0 ) return;
        string hn = primary->h().toString();
        OplogReader r;
        if( !r.connect(primary->h().toString()) ) { 
            log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
            return;
        }

        /* first make sure we are not hopelessly out of sync by being very stale. */
        {
            BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
            OpTime ts = remoteOldestOp["ts"]._opTime();
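            // note: DEV is a macro expanding to an if(...) debug-build guard
            // in this codebase, so the "else" on the next line pairs with it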
            DEV log() << "remoteOldestOp: " << ts.toStringPretty() << endl;
            else log(3) << "remoteOldestOp: " << ts.toStringPretty() << endl;
            if( lastOpTimeWritten < ts ) { 
                log() << "replSet error too stale to catch up, at least from primary " << hn << rsLog;
                log() << "replSet our last optime : " << lastOpTimeWritten.toStringPretty() << rsLog;
                log() << "replSet oldest at " << hn << " : " << ts.toStringPretty() << rsLog;
                log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
                sethbmsg("error too stale to catch up");
                sleepsecs(120);
                return;
            }
        }
Example #2
 void logPosition() const {
     if (_player) {
         if (_player->thisTime() != OpTime()) {
             log() << "Exiting while processing operation with OpTime " << _player->thisTimeStr() << endl;
         }
         report();
         OpTime t = _player->maxOpTimeSynced();
         string tsString = mongoutils::str::stream() << t.getSecs() << ":" << t.getInc();
         log() << "Use --ts=" << tsString << " to resume." << endl;
         try {
             std::ofstream tsFile;
             tsFile.exceptions(std::ofstream::badbit | std::ofstream::failbit);
             tsFile.open(_tsFilename, std::ofstream::trunc);
             tsFile << tsString;
             tsFile.close();
             log() << "Saved timestamp to file "
                   << (boost::filesystem::current_path() / _tsFilename).string() << "." << endl;
             log() << "I'll automatically use this value next time if you run from this directory "
                   << "and don't pass --ts." << endl;
         }
         catch (std::exception &e) {
             warning() << "Error saving timestamp to file " << _tsFilename << ": " << e.what() << endl;
             warning() << "Make sure you save the timestamp somewhere, because I couldn't!" << endl;
         }
     }
 }
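The --ts value written above is plain "seconds:increment" text. A minimal sketch of reading it back, assuming the legacy two-argument OpTime(secs, inc) constructor (hypothetical helper, not part of the tool):

 OpTime parseTsString(const std::string& s) {
     // split the "secs:inc" string emitted by logPosition() above
     size_t colon = s.find(':');
     unsigned secs = static_cast<unsigned>(std::stoul(s.substr(0, colon)));
     unsigned inc = static_cast<unsigned>(std::stoul(s.substr(colon + 1)));
     return OpTime(secs, inc);
 }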
Example #3
 void report() const {
     const OpTime &maxOpTimeSynced = _player->maxOpTimeSynced();
     LOG(0) << "synced up to " << fmtOpTime(maxOpTimeSynced);
     if (!_rconn) {
         LOG(0) << endl;
         return;
     }
     Query lastQuery;
     lastQuery.sort("$natural", -1);
     BSONObj lastFields = BSON("ts" << 1);
     BSONObj lastObj = _rconn->conn().findOne(_oplogns, lastQuery, &lastFields);
     BSONElement tsElt = lastObj["ts"];
     if (!tsElt.ok()) {
         warning() << "couldn't find last oplog entry on remote host" << endl;
         LOG(0) << endl;
         return;
     }
     OpTime lastOpTime = OpTime(tsElt.date());
     LOG(0) << ", source has up to " << fmtOpTime(lastOpTime);
     if (maxOpTimeSynced == lastOpTime) {
         LOG(0) << ", fully synced." << endl;
     }
     else {
         int diff = lastOpTime.getSecs() - maxOpTimeSynced.getSecs();
         if (diff > 0) {
             LOG(0) << ", " << diff << " seconds behind source." << endl;
         }
         else {
             LOG(0) << ", less than 1 second behind source." << endl;
         }
     }
     _reportingTimer.reset();
 }
Example #4
        void addOp(const string& op, BSONObj o, BSONObj* o2 = NULL, const char* coll = NULL, 
                   int version = 0) {
            OpTime ts;
            {
                Lock::GlobalWrite lk;
                ts = OpTime::_now();
            }

            BSONObjBuilder b;
            b.appendTimestamp("ts", ts.asLL());
            if (version != 0) {
                b.append("v", version);
            }
            b.append("op", op);
            b.append("o", o);

            if (o2) {
                b.append("o2", *o2);
            }

            if (coll) {
                b.append("ns", coll);
            }
            else {
                b.append("ns", ns());
            }

            _bgsync->addDoc(b.done());
        }
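A usage sketch for the helper above (hypothetical values): enqueue an insert op for the fixture's namespace.

            // Hypothetical call from within the same fixture; "i" is the
            // oplog op type for an insert.
            addOp("i", BSON("_id" << 1 << "a" << 1));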
Example #5
    /**
     * Replays the sync target's oplog from lastOp to the latest op on the sync target.
     *
     * @param syncer either initial sync (can reclone missing docs) or "normal" sync (no recloning)
     * @param r      the oplog reader
     * @param source the sync target
     * @param lastOp the op to start syncing at.  replset::InitialSync writes this and then moves to
     *               the queue.  replset::SyncTail does not write this, it moves directly to the
     *               queue.
     * @param minValid populated by this function. The most recent op on the sync target's oplog,
     *                 this function syncs to this value (inclusive)
     * @return if applying the oplog succeeded
     */
    bool ReplSetImpl::_syncDoInitialSync_applyToHead( replset::SyncTail& syncer, OplogReader* r,
                                                      const Member* source, const BSONObj& lastOp ,
                                                      BSONObj& minValid ) {
        /* our cloned copy will be strange until we apply oplog events that occurred
           through the process.  we note that time point here. */

        try {
            // It may have been a long time since we last used this connection to
            // query the oplog, depending on the size of the databases we needed to clone.
            // A common problem is that TCP keepalives are set too infrequent, and thus
            // our connection here is terminated by a firewall due to inactivity.
            // Solution is to increase the TCP keepalive frequency.
            minValid = r->getLastOp(rsoplog);
        } catch ( SocketException & ) {
            log() << "connection lost to " << source->h().toString() << "; is your tcp keepalive interval set appropriately?";
            if( !r->connect(source->h().toString()) ) {
                sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
                throw;
            }
            // retry
            minValid = r->getLastOp(rsoplog);
        }

        isyncassert( "getLastOp is empty ", !minValid.isEmpty() );

        OpTime mvoptime = minValid["ts"]._opTime();
        verify( !mvoptime.isNull() );

        OpTime startingTS = lastOp["ts"]._opTime();
        verify( mvoptime >= startingTS );

        // apply startingTS..mvoptime portion of the oplog
        {
            try {
                minValid = syncer.oplogApplication(lastOp, minValid);
            }
            catch (const DBException&) {
                log() << "replSet initial sync failed during oplog application phase" << rsLog;

                emptyOplog(); // otherwise we'll be up!

                lastOpTimeWritten = OpTime();
                lastH = 0;

                log() << "replSet cleaning up [1]" << rsLog;
                {
                    Client::WriteContext cx( "local." );
                    cx.ctx().db()->flushFiles(true);
                }
                log() << "replSet cleaning up [2]" << rsLog;

                log() << "replSet initial sync failed will try again" << endl;

                sleepsecs(5);
                return false;
            }
        }
        
        return true;
    }
Example #6
        void run() {

            OpTime o;

            {
                mongo::mutex::scoped_lock lk2(OpTime::m);
                o = OpTime::now(lk2);
            }

            BSONObjBuilder b;
            b.append("ns","dummy");
            b.appendTimestamp("ts", o.asLL());
            BSONObj obj = b.obj();
            MockInitialSync mock;

            // all three should succeed
            std::vector<BSONObj> ops;
            ops.push_back(obj);
            replset::multiInitialSyncApply(ops, &mock);

            mock.failOnStep = MockInitialSync::FAIL_FIRST_APPLY;
            replset::multiInitialSyncApply(ops, &mock);

            mock.retry = false;
            replset::multiInitialSyncApply(ops, &mock);

            drop();
        }
Example #7
    void SyncTail::handleSlaveDelay(const BSONObj& lastOp) {
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
        int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();

        // ignore slaveDelay if the box is still initializing. once
        // it becomes secondary we can worry about it.
        if( slaveDelaySecs > 0 && replCoord->getCurrentMemberState().secondary() ) {
            const OpTime ts = lastOp["ts"]._opTime();
            long long a = ts.getSecs();
            long long b = time(0);
            long long lag = b - a;
            long long sleeptime = slaveDelaySecs - lag;
            if( sleeptime > 0 ) {
                uassert(12000, "rs slaveDelay differential too big check clocks and systems",
                        sleeptime < 0x40000000);
                if( sleeptime < 60 ) {
                    sleepsecs((int) sleeptime);
                }
                else {
                    warning() << "replSet slavedelay causing a long sleep of " << sleeptime
                              << " seconds" << rsLog;
                    // sleep(hours) would prevent reconfigs from taking effect & such!
                    long long waitUntil = b + sleeptime;
                    while(time(0) < waitUntil) {
                        sleepsecs(6);

                        // Handle reconfigs that changed the slave delay
                        if (replCoord->getSlaveDelaySecs().total_seconds() != slaveDelaySecs)
                            break;
                    }
                }
            }
        } // endif slaveDelay
    }
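To make the delay arithmetic above concrete (assumed numbers): with slaveDelaySecs = 600, a last op whose ts.getSecs() is 1000, and time(0) returning 1100, lag = 100 and sleeptime = 500. Since 500 >= 60, the code takes the loop path, sleeping in 6-second slices and re-reading the configured delay so a reconfig can cut the wait short.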
Example #8
    BSONObj SyncTail::oplogApplySegment(const BSONObj& applyGTEObj, const BSONObj& minValidObj,
                                     MultiSyncApplyFunc func) {
        OpTime applyGTE = applyGTEObj["ts"]._opTime();
        OpTime minValid = minValidObj["ts"]._opTime();

        // We have to keep track of the last op applied to the data, because there's no other easy
        // way of getting this data synchronously.  Batches may go past minValidObj, so we need to
        // know to bump minValid past minValidObj.
        BSONObj lastOp = applyGTEObj;
        OpTime ts = applyGTE;

        time_t start = time(0);
        time_t now = start;

        unsigned long long n = 0, lastN = 0;

        while( ts < minValid ) {
            OpQueue ops;

            // note when this batch started accumulating, so the time limit
            // below measures elapsed batch time rather than an absolute time
            time_t batchStart = time(0);
            while (ops.getSize() < replBatchLimitBytes) {
                if (tryPopAndWaitForMore(&ops)) {
                    break;
                }

                // apply replication batch limits
                now = time(0);
                if (!ops.empty()) {
                    if (now - batchStart > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
            }
            setOplogVersion(ops.getDeque().front());

            multiApply(ops.getDeque(), func);

            n += ops.getDeque().size();

            if ( n > lastN + 1000 ) {
                if (now - start > 10) {
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication applied "
                          << n << " operations, synced to "
                          << ts.toStringPretty() << rsLog;
                    start = now;
                    lastN = n;
                }
            }

            // we want to keep a record of the last op applied, to compare with minvalid
            lastOp = ops.getDeque().back();
            OpTime tempTs = lastOp["ts"]._opTime();
            applyOpsToOplog(&ops.getDeque());

            ts = tempTs;
        }

        return lastOp;
    }
Example #9
    void SyncTail::handleSlaveDelay(const BSONObj& lastOp) {
        int sd = theReplSet->myConfig().slaveDelay;

        // ignore slaveDelay if the box is still initializing. once
        // it becomes secondary we can worry about it.
        if( sd && theReplSet->isSecondary() ) {
            const OpTime ts = lastOp["ts"]._opTime();
            long long a = ts.getSecs();
            long long b = time(0);
            long long lag = b - a;
            long long sleeptime = sd - lag;
            if( sleeptime > 0 ) {
                uassert(12000, "rs slaveDelay differential too big check clocks and systems",
                        sleeptime < 0x40000000);
                if( sleeptime < 60 ) {
                    sleepsecs((int) sleeptime);
                }
                else {
                    log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog;
                    // sleep(hours) would prevent reconfigs from taking effect & such!
                    long long waitUntil = b + sleeptime;
                    while( 1 ) {
                        sleepsecs(6);
                        if( time(0) >= waitUntil )
                            break;

                        if( theReplSet->myConfig().slaveDelay != sd ) // reconf
                            break;
                    }
                }
            }
        } // endif slaveDelay
    }
Example #10
    /** write an op to the oplog that is already built.
        todo : make _logOpRS() call this so we don't repeat ourself?
        */
    void _logOpObjRS(const BSONObj& op) {
        DEV assertInWriteLock();

        const OpTime ts = op["ts"]._opTime();
        long long h = op["h"].numberLong();

        {
            const char *logns = rsoplog;
            if ( rsOplogDetails == 0 ) {
                Client::Context ctx( logns , dbpath, 0, false);
                localDB = ctx.db();
                assert( localDB );
                rsOplogDetails = nsdetails(logns);
                massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
            }
            Client::Context ctx( logns , localDB, false );
            {
                int len = op.objsize();
                Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
                memcpy(getDur().writingPtr(r->data, len), op.objdata(), len);
            }
            /* todo: now() has code to handle clock skew.  but if the skew server to server is large it will get unhappy.
                     this code (or code in now() maybe) should be improved.
                     */
            if( theReplSet ) {
                if( !(theReplSet->lastOpTimeWritten<ts) ) {
                    log() << "replSet error possible failover clock skew issue? " << theReplSet->lastOpTimeWritten.toString() << ' ' << ts.toString() << endl;
                }
                theReplSet->lastOpTimeWritten = ts;
                theReplSet->lastH = h;
                ctx.getClient()->setLastOp( ts.asDate() );
            }
        }
    }
Example #11
BSONObj OplogFetcher::_makeFindCommandObject(const NamespaceString& nss,
                                             OpTime lastOpTimeFetched,
                                             Milliseconds findMaxTime) const {
    auto lastCommittedWithCurrentTerm =
        _dataReplicatorExternalState->getCurrentTermAndLastCommittedOpTime();
    auto term = lastCommittedWithCurrentTerm.value;
    BSONObjBuilder cmdBob;
    cmdBob.append("find", nss.coll());
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(findMaxTime));
    cmdBob.append("batchSize", _batchSize);

    if (term != OpTime::kUninitializedTerm) {
        cmdBob.append("term", term);
    }

    // This ensures that the sync source never returns an empty batch of documents for the first set
    // of results.
    cmdBob.append("readConcern", BSON("afterClusterTime" << lastOpTimeFetched.getTimestamp()));

    return cmdBob.obj();
}
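For reference, the command object assembled above comes out roughly like this (illustrative placeholders, not values from the source):

// {
//   find: "oplog.rs",
//   filter: { ts: { $gte: <lastOpTimeFetched timestamp> } },
//   tailable: true,
//   oplogReplay: true,
//   awaitData: true,
//   maxTimeMS: <findMaxTime in milliseconds>,
//   batchSize: <_batchSize>,
//   term: <current term, only if initialized>,
//   readConcern: { afterClusterTime: <lastOpTimeFetched timestamp> }
// }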
Example #12
    // oplogdiags in web ui
    static void say(stringstream&ss, const bo& op) {
        ss << "<tr>";

        set<string> skip;
        be e = op["ts"];
        if( e.type() == Date || e.type() == Timestamp ) {
            OpTime ot = e._opTime();
            ss << td( time_t_to_String_short( ot.getSecs() ) );
            ss << td( ot.toString() );
            skip.insert("ts");
        }
        else ss << td("?") << td("?");

        e = op["h"];
        if( e.type() == NumberLong ) {
            ss << "<td>" << hex << e.Long() << "</td>\n";
            skip.insert("h");
        }
        else
            ss << td("?");

        ss << td(op["op"].valuestrsafe());
        ss << td(op["ns"].valuestrsafe());
        skip.insert("op");
        skip.insert("ns");

        ss << "<td>";
        for( bo::iterator i(op); i.more(); ) {
            be e = i.next();
            if( skip.count(e.fieldName()) ) continue;
            ss << e.toString() << ' ';
        }
        ss << "</td></tr>\n";
    }
Example #13
    /* Do we have the newest data of them all?
       @param allUp - set to true if all members are up.  Only set if true returned.
       @return true if we are freshest.  Note we may tie.
    */
    bool Consensus::_weAreFreshest(bool& allUp, int& nTies) {
        const OpTime ord = theReplSet->lastOpTimeWritten;
        nTies = 0;
        verify( !ord.isNull() );
        BSONObj cmd = BSON(
                          "replSetFresh" << 1 <<
                          "set" << rs.name() <<
                          "opTime" << Date_t(ord.asDate()) <<
                          "who" << rs._self->fullName() <<
                          "cfgver" << rs._cfg->version <<
                          "id" << rs._self->id());
        list<Target> L;
        int ver;
        /* the following queries arbiters, even though they are never fresh.  wonder if that makes sense.
           it doesn't, but it could, if they "know" what freshness is one day.  so consider removing
           arbiters from getTargets() here.  although getTargets is used elsewhere for elections; there
           arbiters are certainly targets - so an "includeArbs" bool would be necessary if we want to make
           not fetching them herein happen.
           */
        rs.getTargets(L, ver);
        _multiCommand(cmd, L);
        int nok = 0;
        allUp = true;
        for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
            if( i->ok ) {
                nok++;
                if( i->result["fresher"].trueValue() ) {
                    log() << "not electing self, we are not freshest" << rsLog;
                    return false;
                }
                OpTime remoteOrd( i->result["opTime"].Date() );
                if( remoteOrd == ord )
                    nTies++;
                verify( remoteOrd <= ord );

                if( i->result["veto"].trueValue() ) {
                    BSONElement msg = i->result["errmsg"];
                    if (!msg.eoo()) {
                        log() << "not electing self, " << i->toHost << " would veto with '" <<
                            msg.String() << "'" << rsLog;
                    }
                    else {
                        log() << "not electing self, " << i->toHost << " would veto" << rsLog;
                    }
                    return false;
                }
            }
            else {
                DEV log() << "replSet freshest returns " << i->result.toString() << rsLog;
                allUp = false;
            }
        }
        LOG(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog;
        verify( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working...
        return true;
    }
Example #14
 void ReplSetImpl::loadLastOpTimeWritten(OperationContext* txn, bool quiet) {
     Lock::DBRead lk(txn->lockState(), rsoplog);
     BSONObj o;
     if (Helpers::getLast(txn, rsoplog, o)) {
         OpTime lastOpTime = o["ts"]._opTime();
         uassert(13290, "bad replSet oplog entry?", quiet || !lastOpTime.isNull());
         getGlobalReplicationCoordinator()->setMyLastOptime(txn, lastOpTime);
     }
     else {
         getGlobalReplicationCoordinator()->setMyLastOptime(txn, OpTime());
     }
 }
Example #15
    /**
     * Replays the sync target's oplog from lastOp to the latest op on the sync target.
     *
     * @param syncer either initial sync (can reclone missing docs) or "normal" sync (no recloning)
     * @param r      the oplog reader
     * @param source the sync target
     * @return if applying the oplog succeeded
     */
    bool ReplSetImpl::_initialSyncApplyOplog( OperationContext* ctx,
                                              repl::SyncTail& syncer,
                                              OplogReader* r,
                                              const Member* source) {
        const OpTime startOpTime = lastOpTimeWritten;
        BSONObj lastOp;
        try {
            // It may have been a long time since we last used this connection to
            // query the oplog, depending on the size of the databases we needed to clone.
            // A common problem is that TCP keepalives are set too infrequent, and thus
            // our connection here is terminated by a firewall due to inactivity.
            // Solution is to increase the TCP keepalive frequency.
            lastOp = r->getLastOp(rsoplog);
        } catch ( SocketException & ) {
            log() << "connection lost to " << source->h().toString() << "; is your tcp keepalive interval set appropriately?";
            if( !r->connect(source->h()) ) {
                sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
                throw;
            }
            // retry
            lastOp = r->getLastOp(rsoplog);
        }

        isyncassert( "lastOp is empty ", !lastOp.isEmpty() );

        OpTime stopOpTime = lastOp["ts"]._opTime();

        // If we already have what we need then return.
        if (stopOpTime == startOpTime)
            return true;

        verify( !stopOpTime.isNull() );
        verify( stopOpTime > startOpTime );

        // apply till stopOpTime
        try {
            syncer.oplogApplication(ctx, stopOpTime);
        }
        catch (const DBException&) {
            log() << "replSet initial sync failed during oplog application phase, and will retry"
                  << rsLog;

            lastOpTimeWritten = OpTime();
            lastH = 0;

            sleepsecs(5);
            return false;
        }
        
        return true;
    }
Example #16
    bool ReplSetImpl::shouldChangeSyncTarget(const OpTime& targetOpTime) const {
        for (Member *m = _members.head(); m; m = m->next()) {
            if (m->syncable() &&
                targetOpTime.getSecs()+maxSyncSourceLagSecs < m->hbinfo().opTime.getSecs()) {
                log() << "changing sync target because current sync target's most recent OpTime is "
                      << targetOpTime.toStringPretty() << " which is more than "
                      << maxSyncSourceLagSecs << " seconds behind member " << m->fullName()
                      << " whose most recent OpTime is " << m->hbinfo().opTime.getSecs();
                return true;
            }
        }

        return false;
    }
Example #17
void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCtx,
                                                          Timestamp stableTimestamp,
                                                          OpTime appliedThrough,
                                                          OpTime topOfOplog) {
    invariant(!stableTimestamp.isNull());
    invariant(!topOfOplog.isNull());
    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    log() << "Recovering from stable timestamp: " << stableTimestamp
          << " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough
          << ", TruncateAfter: " << truncateAfterPoint << ")";

    log() << "Starting recovery oplog application at the stable timestamp: " << stableTimestamp;
    _applyToEndOfOplog(opCtx, stableTimestamp, topOfOplog.getTimestamp());
}
Example #18
    // Produce a reply to a RAFT-style RequestVote RPC; this is MongoDB ReplSetFresh command
    // The caller should validate that the message is for the correct set, and has the required data
    void TopologyCoordinatorImpl::prepareRequestVoteResponse(const Date_t now,
                                                             const BSONObj& cmdObj,
                                                             const OpTime& lastOpApplied,
                                                             std::string& errmsg,
                                                             BSONObjBuilder& result) {

        string who = cmdObj["who"].String();
        int cfgver = cmdObj["cfgver"].Int();
        OpTime opTime(cmdObj["opTime"].Date());

        bool weAreFresher = false;
        if( _currentConfig.getConfigVersion() > cfgver ) {
            log() << "replSet member " << who << " is not yet aware its cfg version "
                  << cfgver << " is stale";
            result.append("info", "config version stale");
            weAreFresher = true;
        }
        // check not only our own optime, but any other member we can reach
        else if( opTime < _commitOkayThrough ||
                 opTime < _latestKnownOpTime())  {
            weAreFresher = true;
        }
        result.appendDate("opTime", lastOpApplied.asDate());
        result.append("fresher", weAreFresher);

        bool doVeto = _shouldVeto(cmdObj, errmsg);
        result.append("veto",doVeto);
        if (doVeto) {
            result.append("errmsg", errmsg);
        }
    }
Example #19
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    // We assume all options have been validated earlier, if not, programming error
    dassert(validateWriteConcern(writeConcern).isOK());

    // Next handle blocking on disk

    Timer syncTimer;

    switch (writeConcern.syncMode) {
        case WriteConcernOptions::NONE:
            break;
        case WriteConcernOptions::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::JOURNAL:
            txn->recoveryUnit()->waitUntilDurable();
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        repl::getGlobalReplicationCoordinator()->awaitReplication(txn, replOpTime, writeConcern);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }
    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(replOpTime);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
Example #20
Status OplogReader::_compareRequiredOpTimeWithQueryResponse(const OpTime& requiredOpTime) {
    auto containsMinValid = more();
    if (!containsMinValid) {
        return Status(
            ErrorCodes::NoMatchingDocument,
            "remote oplog does not contain entry with optime matching our required optime");
    }
    auto doc = nextSafe();
    const auto opTime = fassertStatusOK(40351, OpTime::parseFromOplogEntry(doc));
    if (requiredOpTime != opTime) {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "remote oplog contains entry with matching timestamp "
                                    << opTime.getTimestamp().toString()
                                    << " but optime "
                                    << opTime.toString()
                                    << " does not "
                                       "match our required optime");
    }
    if (requiredOpTime.getTerm() != opTime.getTerm()) {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "remote oplog contains entry with term " << opTime.getTerm()
                                    << " that does not "
                                       "match the term in our required optime");
    }
    return Status::OK();
}
Example #21
 void OplogReader::tailingQueryGTE(const char *ns, OpTime optime, const BSONObj* fields ) {
     BSONObjBuilder gte;
     gte.appendTimestamp("$gte", optime.asDate());
     BSONObjBuilder query;
     query.append("ts", gte.done());
     tailingQuery(ns, query.done(), fields);
 }
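A minimal usage sketch of the query above (hypothetical; assumes an OplogReader already connected to the sync source and its usual more()/nextSafe() cursor methods):

 // Tail the replica-set oplog starting at the last optime we applied.
 void tailFrom(OplogReader& r, const OpTime& lastApplied) {
     r.tailingQueryGTE("local.oplog.rs", lastApplied);
     while (r.more()) {
         BSONObj op = r.nextSafe();
         // ... apply op and advance lastApplied ...
     }
 }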
Example #22
Status SyncSourceResolver::_chooseAndProbeNextSyncSource(OpTime earliestOpTimeSeen) {
    auto candidateResult = _chooseNewSyncSource();
    if (!candidateResult.isOK()) {
        return _finishCallback(candidateResult);
    }

    if (candidateResult.getValue().empty()) {
        if (earliestOpTimeSeen.isNull()) {
            return _finishCallback(candidateResult);
        }

        SyncSourceResolverResponse response;
        response.syncSourceStatus = {ErrorCodes::OplogStartMissing, "too stale to catch up"};
        response.earliestOpTimeSeen = earliestOpTimeSeen;
        return _finishCallback(response);
    }

    auto status = _scheduleFetcher(
        _makeFirstOplogEntryFetcher(candidateResult.getValue(), earliestOpTimeSeen));
    if (!status.isOK()) {
        return _finishCallback(status);
    }

    return Status::OK();
}
Example #23
    bool BackgroundSync::isStale(OpTime lastOpTimeFetched, 
                                 OplogReader& r, 
                                 BSONObj& remoteOldestOp) {
        remoteOldestOp = r.findOne(rsoplog, Query());
        OpTime remoteTs = remoteOldestOp["ts"]._opTime();
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (lastOpTimeFetched >= remoteTs) {
                return false;
            }
            log() << "replSet remoteOldestOp:    " << remoteTs.toStringLong() << rsLog;
            log() << "replSet lastOpTimeFetched: " << lastOpTimeFetched.toStringLong() << rsLog;
        }

        return true;
    }
Example #24
void ReplicationCoordinatorExternalStateImpl::updateCommittedSnapshot(
    const OpTime& newCommitPoint) {
    auto manager = _service->getGlobalStorageEngine()->getSnapshotManager();
    if (manager) {
        manager->setCommittedSnapshot(newCommitPoint.getTimestamp());
    }
    notifyOplogMetadataWaiters(newCommitPoint);
}
Example #25
Status BackgroundSync::_checkRemoteOplogStart(
    stdx::function<StatusWith<BSONObj>()> getNextOperation) {
    auto result = getNextOperation();
    if (!result.isOK()) {
        // The GTE query from upstream returns nothing, so we're ahead of the upstream.
        return Status(ErrorCodes::RemoteOplogStale,
                      "we are ahead of the sync source, will try to roll back");
    }
    BSONObj o = result.getValue();
    OpTime opTime = extractOpTime(o);
    long long hash = o["h"].numberLong();
    if (opTime != _lastOpTimeFetched || hash != _lastFetchedHash) {
        return Status(ErrorCodes::OplogStartMissing,
                      str::stream() << "our last op time fetched: " << _lastOpTimeFetched.toString()
                      << ". source's GTE: " << opTime.toString());
    }
    return Status::OK();
}
Example #26
 bool ReplSetImpl::shouldChangeSyncTarget(const HostAndPort& currentTarget) {
     lock lk(this);
     OpTime targetOpTime = findByName(currentTarget.toString())->hbinfo().opTime;
     for (Member *m = _members.head(); m; m = m->next()) {
         if (m->syncable() &&
             targetOpTime.getSecs()+maxSyncSourceLagSecs < m->hbinfo().opTime.getSecs()) {
             log() << "changing sync target because current sync target's most recent OpTime is "
                   << targetOpTime.toStringPretty() << " which is more than "
                   << maxSyncSourceLagSecs << " seconds behind member " << m->fullName()
                   << " whose most recent OpTime is " << m->hbinfo().opTime.getSecs();
             return true;
         }
     }
     if (gotForceSync()) {
         return true;
     }
     return false;
 }
Example #27
/**
 * Creates a create collection oplog entry with given optime.
 */
OplogEntry makeCreateCollectionOplogEntry(OpTime opTime,
                                          const NamespaceString& nss = NamespaceString("test.t")) {
    BSONObjBuilder bob;
    bob.appendElements(opTime.toBSON());
    bob.append("h", 1LL);
    bob.append("op", "c");
    bob.append("ns", nss.getCommandNS());
    bob.append("o", BSON("create" << nss.coll()));
    return OplogEntry(bob.obj());
}
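A usage sketch for the factory above (hypothetical values, assuming the newer OpTime(Timestamp, term) constructor):

// Build a create-collection entry at timestamp (1, 1) in term 1.
auto entry = makeCreateCollectionOplogEntry(OpTime(Timestamp(1, 1), 1),
                                            NamespaceString("test.coll"));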
Example #28
    bool ReplSetImpl::shouldChangeSyncTarget(const OpTime& targetOpTime) const {
        for (Member *m = _members.head(); m; m = m->next()) {
            if (m->syncable() &&
                targetOpTime.getSecs()+maxSyncSourceLagSecs < m->hbinfo().opTime.getSecs()) {
                return true;
            }
        }

        return false;
    }
Example #29
    /* applies oplog from "now" until endOpTime using the applier threads for initial sync*/
    void SyncTail::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) {
        unsigned long long bytesApplied = 0;
        unsigned long long entriesApplied = 0;
        while (true) {
            OpQueue ops;
            OperationContextImpl ctx;

            while (!tryPopAndWaitForMore(&ops, getGlobalReplicationCoordinator())) {
                // nothing came back last time, so go again
                if (ops.empty()) continue;

                // Check if we reached the end
                const BSONObj currentOp = ops.back();
                const OpTime currentOpTime = currentOp["ts"]._opTime();

                // When we reach the end return this batch
                if (currentOpTime == endOpTime) {
                    break;
                }
                else if (currentOpTime > endOpTime) {
                    severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime
                            << " without seeing it. Rollback?" << rsLog;
                    fassertFailedNoTrace(18693);
                }

                // apply replication batch limits
                if (ops.getSize() > replBatchLimitBytes)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            if (ops.empty()) {
                severe() << "got no ops for batch...";
                fassertFailedNoTrace(18692);
            }

            const BSONObj lastOp = ops.back().getOwned();

            // Tally operation information
            bytesApplied += ops.getSize();
            entriesApplied += ops.getDeque().size();

            multiApply(ops.getDeque());
            OpTime lastOpTime = applyOpsToOplog(&ops.getDeque());

            // if the last op applied was our end, return
            if (lastOpTime == endOpTime) {
                LOG(1) << "SyncTail applied " << entriesApplied
                       << " entries (" << bytesApplied << " bytes)"
                       << " and finished at opTime " << endOpTime.toStringPretty();
                return;
            }
        } // end of while (true)
    }
Example #30
 void SyncSourceFeedback::updateMap(const mongo::OID& rid, const OpTime& ot) {
     boost::unique_lock<boost::mutex> lock(_mtx);
     LOG(1) << "replSet last: " << _slaveMap[rid].toString() << " to " << ot.toString() << endl;
     // only update if ot is newer than what we have already
     if (ot > _slaveMap[rid]) {
         _slaveMap[rid] = ot;
         _positionChanged = true;
         LOG(2) << "now last is " << _slaveMap[rid].toString() << endl;
         _cond.notify_all();
     }
 }
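A usage sketch of the guard above (hypothetical setup): a second report carrying an older optime for the same replica id is ignored.

 // Assumes a SyncSourceFeedback instance `feedback` and the legacy
 // OpTime(secs, inc) constructor.
 mongo::OID rid = mongo::OID::gen();
 feedback.updateMap(rid, OpTime(100, 1));  // newer than default; stored
 feedback.updateMap(rid, OpTime(90, 1));   // older than stored value; no-op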