bool BackgroundSync::isStale(OpTime lastOpTimeFetched, OplogReader& r, BSONObj& remoteOldestOp) { remoteOldestOp = r.findOne(rsoplog, Query()); OpTime remoteTs = remoteOldestOp["ts"]._opTime(); { boost::unique_lock<boost::mutex> lock(_mutex); if (lastOpTimeFetched >= remoteTs) { return false; } log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog; log() << "replSet lastOpTimeFetched: " << lastOpTimeFetched.toStringLong() << rsLog; } return true; }
/** * Checks if the oplog given is too far ahead to read from. * * @param r the oplog * @param hn the hostname (for log messages) * * @return if we are stale compared to the oplog on hn */ bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) { BSONObj remoteOldestOp = r.findOne(rsoplog, Query()); OpTime ts = remoteOldestOp["ts"]._opTime(); DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog; else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
/* slave: pull some data from the master's oplog note: not yet in db mutex at this point. @return -1 error 0 ok, don't sleep 1 ok, sleep */ int ReplSource::sync_pullOpLog(int& nApplied) { int okResultCode = 1; string ns = string("local.oplog.$") + sourceName(); log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n'; bool tailing = true; oplogReader.tailCheck(); bool initial = syncedTo.isNull(); if ( !oplogReader.haveCursor() || initial ) { if ( initial ) { // Important to grab last oplog timestamp before listing databases. syncToTailOfRemoteLog(); BSONObj info; bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info ); massert( 10389 , "Unable to get database list", ok ); BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); while( i.moreWithEOO() ) { BSONElement e = i.next(); if ( e.eoo() ) break; string name = e.embeddedObject().getField( "name" ).valuestr(); if ( !e.embeddedObject().getBoolField( "empty" ) ) { if ( name != "local" ) { if ( only.empty() || only == name ) { log( 2 ) << "adding to 'addDbNextPass': "******"$gte", syncedTo.asDate()); BSONObjBuilder query; query.append("ts", q.done()); if ( !only.empty() ) { // note we may here skip a LOT of data table scanning, a lot of work for the master. query.appendRegex("ns", string("^") + only); // maybe append "\\." here? } BSONObj queryObj = query.done(); // e.g. queryObj = { ts: { $gte: syncedTo } } oplogReader.tailingQuery(ns.c_str(), queryObj); tailing = false; } else { log(2) << "repl: tailing=true\n"; } if( !oplogReader.haveCursor() ) { problem() << "repl: dbclient::query returns null (conn closed?)" << endl; oplogReader.resetConnection(); return -1; } // show any deferred database creates from a previous pass { set<string>::iterator i = addDbNextPass.begin(); if ( i != addDbNextPass.end() ) { BSONObjBuilder b; b.append("ns", *i + '.'); b.append("op", "db"); BSONObj op = b.done(); sync_pullOpLog_applyOperation(op, false); } } if ( !oplogReader.more() ) { if ( tailing ) { log(2) << "repl: tailing & no new activity\n"; if( oplogReader.awaitCapable() ) okResultCode = 0; // don't sleep } else { log() << "repl: " << ns << " oplog is empty\n"; } { dblock lk; save(); } return okResultCode; } OpTime nextOpTime; { BSONObj op = oplogReader.next(); BSONElement ts = op.getField("ts"); if ( ts.type() != Date && ts.type() != Timestamp ) { string err = op.getStringField("$err"); if ( !err.empty() ) { // 13051 is "tailable cursor requested on non capped collection" if (op.getIntField("code") == 13051) { problem() << "trying to slave off of a non-master" << '\n'; massert( 13344 , "trying to slave off of a non-master", false ); } else { problem() << "repl: $err reading remote oplog: " + err << '\n'; massert( 10390 , "got $err reading remote oplog", false ); } } else { problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; massert( 10391 , "repl: bad object read from remote oplog", false); } } nextOpTime = OpTime( ts.date() ); log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; if ( initial ) { log(1) << "repl: initial run\n"; } if( tailing ) { if( !( syncedTo < nextOpTime ) ) { log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl; log() << "repl syncTo: " << syncedTo.toStringLong() << endl; log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl; assert(false); } oplogReader.putBack( op ); // op will be processed in the loop below nextOpTime = OpTime(); // will reread the op below } else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error Nullstream& l = log(); l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '; if ( nextOpTime < syncedTo ) l << "<??"; else l << ">"; l << " syncedTo " << syncedTo.toStringLong() << '\n'; log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"; log() << "repl: tailing: " << tailing << '\n'; log() << "repl: data too stale, halting replication" << endl; replInfo = replAllDead = "data too stale halted replication"; assert( syncedTo < nextOpTime ); throw SyncException(); } else { /* t == syncedTo, so the first op was applied previously or it is the first op of initial query and need not be applied. */ } } // apply operations { int n = 0; time_t saveLast = time(0); while ( 1 ) { bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to assure we actually process at least one op to get a sync point recorded in the first place. if ( moreInitialSyncsPending || !oplogReader.more() ) { dblock lk; // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior. { dbtemprelease t; if ( !moreInitialSyncsPending && oplogReader.more() ) { continue; } // otherwise, break out of loop so we can set to completed or clone more dbs } if( oplogReader.awaitCapable() && tailing ) okResultCode = 0; // don't sleep syncedTo = nextOpTime; save(); // note how far we are synced up to now log() << "repl: applied " << n << " operations" << endl; nApplied = n; log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl; break; } else { } OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { // periodically note our progress, in case we are doing a lot of work and crash dblock lk; syncedTo = nextOpTime; // can't update local log ts since there are pending operations from our peer save(); log() << "repl: checkpoint applied " << n << " operations" << endl; log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; saveLast = time(0); n = 0; } BSONObj op = oplogReader.next(); unsigned b = replApplyBatchSize; bool justOne = b == 1; scoped_ptr<writelock> lk( justOne ? 0 : new writelock() ); while( 1 ) { BSONElement ts = op.getField("ts"); if( !( ts.type() == Date || ts.type() == Timestamp ) ) { log() << "sync error: problem querying remote oplog record" << endl; log() << "op: " << op.toString() << endl; log() << "halting replication" << endl; replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; throw SyncException(); } OpTime last = nextOpTime; nextOpTime = OpTime( ts.date() ); if ( !( last < nextOpTime ) ) { log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; log() << " last: " << last.toStringLong() << endl; log() << " nextOpTime: " << nextOpTime.toStringLong() << endl; log() << " halting replication" << endl; replInfo = replAllDead = "sync error last >= nextOpTime"; uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); } if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { assert( justOne ); oplogReader.putBack( op ); _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; dblock lk; if ( n > 0 ) { syncedTo = last; save(); } log() << "repl: applied " << n << " operations" << endl; log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; return okResultCode; } sync_pullOpLog_applyOperation(op, !justOne); n++; if( --b == 0 ) break; // if to here, we are doing mulpile applications in a singel write lock acquisition if( !oplogReader.moreInCurrentBatch() ) { // break if no more in batch so we release lock while reading from the master break; } op = oplogReader.next(); getDur().commitIfNeeded(); } } } return okResultCode; }
bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) { remoteOldestOp = r.findOne(rsoplog, Query()); OpTime remoteTs = remoteOldestOp["ts"]._opTime(); DEV log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog; else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
void BackgroundSync::connectOplogReader(OperationContext* txn, ReplicationCoordinatorImpl* replCoordImpl, OplogReader* reader) { OpTime lastOpTimeFetched; { boost::unique_lock<boost::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; } Date_t now(curTimeMillis64()); OpTime oldestOpTimeSeen(now,0); while (true) { HostAndPort candidate = replCoordImpl->chooseNewSyncSource(); if (candidate.empty()) { if (oldestOpTimeSeen == OpTime(now,0)) { // If, in this invocation of connectOplogReader(), we did not successfully // connect to any node ahead of us, // we apparently have no sync sources to connect to. // This situation is common; e.g. if there are no writes to the primary at // the moment. return; } // Connected to at least one member, but in all cases we were too stale to use them // as a sync source. log() << "replSet error RS102 too stale to catch up" << rsLog; log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong() << rsLog; log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong() << rsLog; log() << "replSet " "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember" << rsLog; sethbmsg("error RS102 too stale to catch up"); theReplSet->setMinValid(txn, oldestOpTimeSeen); replCoordImpl->setFollowerMode(MemberState::RS_RECOVERING); return; } if (!reader->connect(candidate)) { LOG(2) << "replSet can't connect to " << candidate.toString() << " to read operations" << rsLog; reader->resetConnection(); replCoordImpl->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10*1000)); continue; } // Read the first (oldest) op and confirm that it's not newer than our last // fetched op. Otherwise, we have fallen off the back of that source's oplog. BSONObj remoteOldestOp(reader->findOne(rsoplog, Query())); BSONElement tsElem(remoteOldestOp["ts"]); if (tsElem.type() != Timestamp) { // This member's got a bad op in its oplog. warning() << "oplog invalid format on node " << candidate.toString(); reader->resetConnection(); replCoordImpl->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600*1000)); continue; } OpTime remoteOldOpTime = tsElem._opTime(); if (lastOpTimeFetched < remoteOldOpTime) { // We're too stale to use this sync source. reader->resetConnection(); replCoordImpl->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600*1000)); if (oldestOpTimeSeen > remoteOldOpTime) { warning() << "we are too stale to use " << candidate.toString() << " as a sync source"; oldestOpTimeSeen = remoteOldOpTime; } continue; } // Got a valid sync source. return; } // while (true) }
static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) { static time_t last; if( time(0)-last < 60 ) { throw "findcommonpoint waiting a while before trying again"; } last = time(0); assert( dbMutex.atLeastReadLocked() ); Client::Context c(rsoplog, dbpath, 0, false); NamespaceDetails *nsd = nsdetails(rsoplog); assert(nsd); ReverseCappedCursor u(nsd); if( !u.ok() ) throw "our oplog empty or unreadable"; const Query q = Query().sort(reverseNaturalObj); const bo fields = BSON( "ts" << 1 << "h" << 1 ); //auto_ptr<DBClientCursor> u = us->query(rsoplog, q, 0, 0, &fields, 0, 0); h.rbid = getRBID(them); auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0); if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable"; BSONObj ourObj = u.current(); OpTime ourTime = ourObj["ts"]._opTime(); BSONObj theirObj = t->nextSafe(); OpTime theirTime = theirObj["ts"]._opTime(); { long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs()); /* diff could be positive, negative, or zero */ log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog; log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog; log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog; if( diff > 3600 ) { log() << "replSet rollback too long a time period for a rollback." << rsLog; throw "error not willing to roll back more than one hour of data"; } } unsigned long long scanned = 0; while( 1 ) { scanned++; /* todo add code to assure no excessive scanning for too long */ if( ourTime == theirTime ) { if( ourObj["h"].Long() == theirObj["h"].Long() ) { // found the point back in time where we match. // todo : check a few more just to be careful about hash collisions. log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog; log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog; h.commonPoint = ourTime; h.commonPointOurDiskloc = u.currLoc(); return; } refetch(h, ourObj); if( !t->more() ) { log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; throw "RS100 reached beginning of remote oplog [2]"; } theirObj = t->nextSafe(); theirTime = theirObj["ts"]._opTime(); u.advance(); if( !u.ok() ) { log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; throw "RS101 reached beginning of local oplog [1]"; } ourObj = u.current(); ourTime = ourObj["ts"]._opTime(); } else if( theirTime > ourTime ) { if( !t->more() ) { log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; throw "RS100 reached beginning of remote oplog [1]"; } theirObj = t->nextSafe(); theirTime = theirObj["ts"]._opTime(); } else { // theirTime < ourTime refetch(h, ourObj); u.advance(); if( !u.ok() ) { log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; throw "RS101 reached beginning of local oplog [2]"; } ourObj = u.current(); ourTime = ourObj["ts"]._opTime(); } } }
void OplogReader::connectToSyncSource(OperationContext* txn, OpTime lastOpTimeFetched, ReplicationCoordinator* replCoord) { const OpTime sentinel(Milliseconds(curTimeMillis64()).total_seconds(), 0); OpTime oldestOpTimeSeen = sentinel; invariant(conn() == NULL); while (true) { HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched); if (candidate.empty()) { if (oldestOpTimeSeen == sentinel) { // If, in this invocation of connectToSyncSource(), we did not successfully // connect to any node ahead of us, // we apparently have no sync sources to connect to. // This situation is common; e.g. if there are no writes to the primary at // the moment. return; } // Connected to at least one member, but in all cases we were too stale to use them // as a sync source. log() << "replSet error RS102 too stale to catch up"; log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong(); log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong(); log() << "replSet " "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; setMinValid(txn, oldestOpTimeSeen); bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << replCoord->getMemberState(); } return; } if (!connect(candidate)) { LOG(2) << "replSet can't connect to " << candidate.toString() << " to read operations"; resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10 * 1000)); continue; } // Read the first (oldest) op and confirm that it's not newer than our last // fetched op. Otherwise, we have fallen off the back of that source's oplog. BSONObj remoteOldestOp(findOne(rsoplog, Query())); BSONElement tsElem(remoteOldestOp["ts"]); if (tsElem.type() != Timestamp) { // This member's got a bad op in its oplog. warning() << "oplog invalid format on node " << candidate.toString(); resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000)); continue; } OpTime remoteOldOpTime = tsElem._opTime(); if (!lastOpTimeFetched.isNull() && lastOpTimeFetched < remoteOldOpTime) { // We're too stale to use this sync source. resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000)); if (oldestOpTimeSeen > remoteOldOpTime) { warning() << "we are too stale to use " << candidate.toString() << " as a sync source"; oldestOpTimeSeen = remoteOldOpTime; } continue; } // Got a valid sync source. return; } // while (true) }