void report() const { const OpTime &maxOpTimeSynced = _player->maxOpTimeSynced(); LOG(0) << "synced up to " << fmtOpTime(maxOpTimeSynced); if (!_rconn) { LOG(0) << endl; return; } Query lastQuery; lastQuery.sort("$natural", -1); BSONObj lastFields = BSON("ts" << 1); BSONObj lastObj = _rconn->conn().findOne(_oplogns, lastQuery, &lastFields); BSONElement tsElt = lastObj["ts"]; if (!tsElt.ok()) { warning() << "couldn't find last oplog entry on remote host" << endl; LOG(0) << endl; return; } OpTime lastOpTime = OpTime(tsElt.date()); LOG(0) << ", source has up to " << fmtOpTime(lastOpTime); if (maxOpTimeSynced == lastOpTime) { LOG(0) << ", fully synced." << endl; } else { int diff = lastOpTime.getSecs() - maxOpTimeSynced.getSecs(); if (diff > 0) { LOG(0) << ", " << (lastOpTime.getSecs() - maxOpTimeSynced.getSecs()) << " seconds behind source." << endl; } else { LOG(0) << ", less than 1 second behind source." << endl; } } _reportingTimer.reset(); }
// oplogdiags in web ui static void say(stringstream&ss, const bo& op) { ss << "<tr>"; set<string> skip; be e = op["ts"]; if( e.type() == Date || e.type() == Timestamp ) { OpTime ot = e._opTime(); ss << td( time_t_to_String_short( ot.getSecs() ) ); ss << td( ot.toString() ); skip.insert("ts"); } else ss << td("?") << td("?"); e = op["h"]; if( e.type() == NumberLong ) { ss << "<td>" << hex << e.Long() << "</td>\n"; skip.insert("h"); } else ss << td("?"); ss << td(op["op"].valuestrsafe()); ss << td(op["ns"].valuestrsafe()); skip.insert("op"); skip.insert("ns"); ss << "<td>"; for( bo::iterator i(op); i.more(); ) { be e = i.next(); if( skip.count(e.fieldName()) ) continue; ss << e.toString() << ' '; } ss << "</td></tr>\n"; }
// Throttle application of 'lastOp' to honor this node's configured
// slaveDelay: if the op's timestamp is newer than (now - slaveDelay), sleep
// until it is old enough to apply.
// @param lastOp the oplog entry about to be applied; only its "ts" field is read.
void SyncTail::handleSlaveDelay(const BSONObj& lastOp) {
    int sd = theReplSet->myConfig().slaveDelay;
    // ignore slaveDelay if the box is still initializing. once
    // it becomes secondary we can worry about it.
    if( sd && theReplSet->isSecondary() ) {
        const OpTime ts = lastOp["ts"]._opTime();
        long long a = ts.getSecs();
        long long b = time(0);
        long long lag = b - a;                // how old the op already is
        long long sleeptime = sd - lag;       // remaining delay to honor
        if( sleeptime > 0 ) {
            // sanity bound: a huge differential indicates clock skew
            uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000);
            if( sleeptime < 60 ) {
                sleepsecs((int) sleeptime);
            }
            else {
                log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog;
                // sleep(hours) would prevent reconfigs from taking effect & such!
                long long waitUntil = b + sleeptime;
                // Sleep in short increments so a reconfig that changes
                // slaveDelay can interrupt a long wait.
                while( 1 ) {
                    sleepsecs(6);
                    if( time(0) >= waitUntil )
                        break;
                    if( theReplSet->myConfig().slaveDelay != sd ) // reconf
                        break;
                }
            }
        }
    } // endif slaveDelay
}
void logPosition() const { if (_player) { if (_player->thisTime() != OpTime()) { log() << "Exiting while processing operation with OpTime " << _player->thisTimeStr() << endl; } report(); OpTime t = _player->maxOpTimeSynced(); string tsString = mongoutils::str::stream() << t.getSecs() << ":" << t.getInc(); log() << "Use --ts=" << tsString << " to resume." << endl; try { std::ofstream tsFile; tsFile.exceptions(std::ifstream::badbit | std::ifstream::failbit); tsFile.open(_tsFilename, std::ofstream::trunc); tsFile << tsString; tsFile.close(); log() << "Saved timestamp to file " << (boost::filesystem::current_path() / _tsFilename).string() << "." << endl; log() << "I'll automatically use this value next time if you run from this directory " << "and don't pass --ts." << endl; } catch (std::exception &e) { warning() << "Error saving timestamp to file " << _tsFilename << ": " << e.what() << endl; warning() << "Make sure you save the timestamp somewhere, because I couldn't!" << endl; } } }
// Throttle application of 'lastOp' to honor the configured slave delay,
// reading configuration and member state from the ReplicationCoordinator.
// Long delays are slept in interruptible 6-second increments so a reconfig
// that changes the delay takes effect promptly.
// @param lastOp the oplog entry about to be applied; only its "ts" field is read.
void SyncTail::handleSlaveDelay(const BSONObj& lastOp) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
    // ignore slaveDelay if the box is still initializing. once
    // it becomes secondary we can worry about it.
    if( slaveDelaySecs > 0 && replCoord->getCurrentMemberState().secondary() ) {
        const OpTime ts = lastOp["ts"]._opTime();
        long long a = ts.getSecs();
        long long b = time(0);
        long long lag = b - a;                       // how old the op already is
        long long sleeptime = slaveDelaySecs - lag;  // remaining delay to honor
        if( sleeptime > 0 ) {
            // sanity bound: a huge differential indicates clock skew
            uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000);
            if( sleeptime < 60 ) {
                sleepsecs((int) sleeptime);
            }
            else {
                warning() << "replSet slavedelay causing a long sleep of " << sleeptime << " seconds" << rsLog;
                // sleep(hours) would prevent reconfigs from taking effect & such!
                long long waitUntil = b + sleeptime;
                while(time(0) < waitUntil) {
                    sleepsecs(6);
                    // Handle reconfigs that changed the slave delay
                    if (replCoord->getSlaveDelaySecs().total_seconds() != slaveDelaySecs)
                        break;
                }
            }
        }
    } // endif slaveDelay
}
bool ReplSetImpl::shouldChangeSyncTarget(const OpTime& targetOpTime) const { for (Member *m = _members.head(); m; m = m->next()) { if (m->syncable() && targetOpTime.getSecs()+maxSyncSourceLagSecs < m->hbinfo().opTime.getSecs()) { return true; } } return false; }
// Returns true if we should switch sync targets because 'targetOpTime' (the
// current target's latest optime) is more than maxSyncSourceLagSecs seconds
// behind some other syncable member; logs the reason before returning true.
bool ReplSetImpl::shouldChangeSyncTarget(const OpTime& targetOpTime) const {
    for (Member *m = _members.head(); m; m = m->next()) {
        if (m->syncable() &&
            targetOpTime.getSecs()+maxSyncSourceLagSecs < m->hbinfo().opTime.getSecs()) {
            // CONSISTENCY FIX: log both optimes in the same pretty format.
            // Previously the member's optime was logged as raw seconds while
            // the target's used toStringPretty(), making the message confusing.
            log() << "changing sync target because current sync target's most recent OpTime is "
                  << targetOpTime.toStringPretty() << " which is more than "
                  << maxSyncSourceLagSecs << " seconds behind member " << m->fullName()
                  << " whose most recent OpTime is "
                  << m->hbinfo().opTime.toStringPretty();
            return true;
        }
    }
    return false;
}
bool ReplSetImpl::shouldChangeSyncTarget(const HostAndPort& currentTarget) { lock lk(this); OpTime targetOpTime = findByName(currentTarget.toString())->hbinfo().opTime; for (Member *m = _members.head(); m; m = m->next()) { if (m->syncable() && targetOpTime.getSecs()+maxSyncSourceLagSecs < m->hbinfo().opTime.getSecs()) { log() << "changing sync target because current sync target's most recent OpTime is " << targetOpTime.toStringPretty() << " which is more than " << maxSyncSourceLagSecs << " seconds behind member " << m->fullName() << " whose most recent OpTime is " << m->hbinfo().opTime.getSecs(); return true; } } if (gotForceSync()) { return true; } return false; }
// Format an OpTime as "<seconds>:<increment>" for human-readable logging.
static string fmtOpTime(const OpTime &t) {
    stringstream out;
    out << t.getSecs() << ':' << t.getInc();
    return out.str();
}
// Choose the member to sync from.
// Priority: (1) an explicitly requested force-sync target; (2) the primary,
// when chaining is disallowed; otherwise (3) the lowest-ping syncable member
// that is ahead of us. The first pass rejects members that are more than
// maxSyncSourceLagSecs behind the primary, are hidden, or have a larger
// slaveDelay than ours; the second pass relaxes those filters in case such
// members are the only reachable ones. Vetoed members are skipped until
// their veto expires. Returns NULL when no suitable candidate exists (e.g.
// while still waiting for 2N heartbeat pings).
// NOTE(review): this span was flattened onto two long lines in transit, so
// inline "//" comments swallow the code that followed them on the original
// lines; re-wrap from version control before editing the body.
const Member* ReplSetImpl::getMemberToSyncTo() { lock lk(this); // if we have a target we've requested to sync from, use it if (_forceSyncTarget) { Member* target = _forceSyncTarget; _forceSyncTarget = 0; sethbmsg( str::stream() << "syncing to: " << target->fullName() << " by request", 0); return target; } const Member* primary = box.getPrimary(); // wait for 2N pings before choosing a sync target if (_cfg) { int needMorePings = config().members.size()*2 - HeartbeatInfo::numPings; if (needMorePings > 0) { OCCASIONALLY log() << "waiting for " << needMorePings << " pings from other members before syncing" << endl; return NULL; } // If we are only allowed to sync from the primary, return that if (!_cfg->chainingAllowed()) { // Returns NULL if we cannot reach the primary return primary; } } // find the member with the lowest ping time that has more data than me // Find primary's oplog time. Reject sync candidates that are more than // maxSyncSourceLagSecs seconds behind. OpTime primaryOpTime; if (primary) primaryOpTime = primary->hbinfo().opTime; else // choose a time that will exclude no candidates, since we don't see a primary primaryOpTime = OpTime(maxSyncSourceLagSecs, 0); if (primaryOpTime.getSecs() < static_cast<unsigned int>(maxSyncSourceLagSecs)) { // erh - I think this means there was just a new election // and we don't yet know the new primary's optime primaryOpTime = OpTime(maxSyncSourceLagSecs, 0); } OpTime oldestSyncOpTime(primaryOpTime.getSecs() - maxSyncSourceLagSecs, 0); Member *closest = 0; time_t now = 0; // Make two attempts. The first attempt, we ignore those nodes with // slave delay higher than our own. The second attempt includes such // nodes, in case those are the only ones we can reach. // This loop attempts to set 'closest'. 
for (int attempts = 0; attempts < 2; ++attempts) { for (Member *m = _members.head(); m; m = m->next()) { if (!m->syncable()) continue; if (m->state() == MemberState::RS_SECONDARY) { // only consider secondaries that are ahead of where we are if (m->hbinfo().opTime <= lastOpTimeWritten) continue; // omit secondaries that are excessively behind, on the first attempt at least. if (attempts == 0 && m->hbinfo().opTime < oldestSyncOpTime) continue; } // omit nodes that are more latent than anything we've already considered if (closest && (m->hbinfo().ping > closest->hbinfo().ping)) continue; if (attempts == 0 && (myConfig().slaveDelay < m->config().slaveDelay || m->config().hidden)) { continue; // skip this one in the first attempt } map<string,time_t>::iterator vetoed = _veto.find(m->fullName()); if (vetoed != _veto.end()) { // Do some veto housekeeping if (now == 0) { now = time(0); } // if this was on the veto list, check if it was vetoed in the last "while". // if it was, skip. if (vetoed->second >= now) { if (time(0) % 5 == 0) { log() << "replSet not trying to sync from " << (*vetoed).first << ", it is vetoed for " << ((*vetoed).second - now) << " more seconds" << rsLog; } continue; } _veto.erase(vetoed); // fall through, this is a valid candidate now } // This candidate has passed all tests; set 'closest' closest = m; } if (closest) break; // no need for second attempt } if (!closest) { return NULL; } sethbmsg( str::stream() << "syncing to: " << closest->fullName(), 0); return closest; }
// Render an HTML diagnostic view of member 'server_id's oplog into 'ss':
// connects to the member, shows the first 20 entries (natural order) and the
// last 20 (reverse natural order, stopping at the last entry already shown),
// then appends the current time and the time span covered by the oplog.
// Writes an error message into 'ss' on any failure (unknown id, connect or
// query failure) rather than throwing.
// NOTE(review): this span was flattened in transit — a string literal is
// split across the two lines below ("<p>Log length in \n time: ") and inline
// "//" comments swallow trailing code; re-wrap from version control before
// editing the body.
void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { const Member *m = findById(server_id); if( m == 0 ) { ss << "Error : can't find a member with id: " << server_id << '\n'; return; } ss << p("Server : " + m->fullName() + "<br>ns : " + rsoplog ); //const bo fields = BSON( "o" << false << "o2" << false ); const bo fields; /** todo fix we might want an so timeout here */ DBClientConnection conn(false, 0, /*timeout*/ 20); { string errmsg; if( !conn.connect(m->fullName(), errmsg) ) { ss << "couldn't connect to " << m->fullName() << ' ' << errmsg; return; } } auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields); if( c.get() == 0 ) { ss << "couldn't query " << rsoplog; return; } static const char *h[] = {"ts","optime", "h","op","ns","rest",0}; ss << "<style type=\"text/css\" media=\"screen\">" "table { font-size:75% }\n" // "th { background-color:#bbb; color:#000 }\n" // "td,th { padding:.25em }\n" "</style>\n"; ss << table(h, true); //ss << "<pre>\n"; int n = 0; // we save ts as a timestamp, so even though we don't really // use OpTimes anymore, this code is fine OpTime otFirst; OpTime otLast; OpTime otEnd; while( c->more() ) { bo o = c->next(); otLast = o["ts"]._opTime(); if( otFirst.isNull() ) otFirst = otLast; say(ss, o); n++; } if( n == 0 ) { ss << rsoplog << " is empty\n"; } else { auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields); if( c.get() == 0 ) { ss << "couldn't query [2] " << rsoplog; return; } string x; bo o = c->next(); otEnd = o["ts"]._opTime(); while( 1 ) { stringstream z; if( o["ts"]._opTime() == otLast ) break; say(z, o); x = z.str() + x; if( !c->more() ) break; o = c->next(); } if( !x.empty() ) { ss << "<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>\n" << x; //ss << "\n...\n\n" << x; } } ss << _table(); ss << p(time_t_to_String_short(time(0)) + " current time"); if( !otEnd.isNull() ) { ss << "<p>Log length in 
time: "; unsigned d = otEnd.getSecs() - otFirst.getSecs(); double h = d / 3600.0; ss.precision(3); if( h < 72 ) ss << h << " hours"; else ss << h / 24.0 << " days"; ss << "</p>\n"; } }
// Pull and apply a batch of operations from the master's oplog (master/slave
// replication). Handles the initial-sync case (listing the master's
// databases and deferring their clone via addDbNextPass), tailing-cursor
// re-establishment, stale-data detection (throws SyncException), slaveDelay
// throttling, periodic checkpointing of syncedTo, and batched application
// under a write lock (replApplyBatchSize).
// @param nApplied out-param: number of operations applied this pass.
// @return -1 error; 0 ok, caller should not sleep; 1 ok, caller may sleep.
// NOTE(review): this span is corrupted — the "******" sequence below has
// replaced part of the initial-sync branch (the addDbNextPass insertion and
// the start of the $gte query construction), and flattening has let inline
// "//" comments swallow trailing code. Do NOT hand-reconstruct; restore this
// function from version control before making changes.
/* slave: pull some data from the master's oplog note: not yet in db mutex at this point. @return -1 error 0 ok, don't sleep 1 ok, sleep */ int ReplSource::sync_pullOpLog(int& nApplied) { int okResultCode = 1; string ns = string("local.oplog.$") + sourceName(); log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n'; bool tailing = true; oplogReader.tailCheck(); bool initial = syncedTo.isNull(); if ( !oplogReader.haveCursor() || initial ) { if ( initial ) { // Important to grab last oplog timestamp before listing databases. syncToTailOfRemoteLog(); BSONObj info; bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info ); massert( 10389 , "Unable to get database list", ok ); BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); while( i.moreWithEOO() ) { BSONElement e = i.next(); if ( e.eoo() ) break; string name = e.embeddedObject().getField( "name" ).valuestr(); if ( !e.embeddedObject().getBoolField( "empty" ) ) { if ( name != "local" ) { if ( only.empty() || only == name ) { log( 2 ) << "adding to 'addDbNextPass': "******"$gte", syncedTo.asDate()); BSONObjBuilder query; query.append("ts", q.done()); if ( !only.empty() ) { // note we may here skip a LOT of data table scanning, a lot of work for the master. query.appendRegex("ns", string("^") + only); // maybe append "\\." here? } BSONObj queryObj = query.done(); // e.g. 
queryObj = { ts: { $gte: syncedTo } } oplogReader.tailingQuery(ns.c_str(), queryObj); tailing = false; } else { log(2) << "repl: tailing=true\n"; } if( !oplogReader.haveCursor() ) { problem() << "repl: dbclient::query returns null (conn closed?)" << endl; oplogReader.resetConnection(); return -1; } // show any deferred database creates from a previous pass { set<string>::iterator i = addDbNextPass.begin(); if ( i != addDbNextPass.end() ) { BSONObjBuilder b; b.append("ns", *i + '.'); b.append("op", "db"); BSONObj op = b.done(); sync_pullOpLog_applyOperation(op, false); } } if ( !oplogReader.more() ) { if ( tailing ) { log(2) << "repl: tailing & no new activity\n"; if( oplogReader.awaitCapable() ) okResultCode = 0; // don't sleep } else { log() << "repl: " << ns << " oplog is empty\n"; } { dblock lk; save(); } return okResultCode; } OpTime nextOpTime; { BSONObj op = oplogReader.next(); BSONElement ts = op.getField("ts"); if ( ts.type() != Date && ts.type() != Timestamp ) { string err = op.getStringField("$err"); if ( !err.empty() ) { // 13051 is "tailable cursor requested on non capped collection" if (op.getIntField("code") == 13051) { problem() << "trying to slave off of a non-master" << '\n'; massert( 13344 , "trying to slave off of a non-master", false ); } else { problem() << "repl: $err reading remote oplog: " + err << '\n'; massert( 10390 , "got $err reading remote oplog", false ); } } else { problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; massert( 10391 , "repl: bad object read from remote oplog", false); } } nextOpTime = OpTime( ts.date() ); log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; if ( initial ) { log(1) << "repl: initial run\n"; } if( tailing ) { if( !( syncedTo < nextOpTime ) ) { log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl; log() << "repl syncTo: " << syncedTo.toStringLong() << endl; log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl; assert(false); } 
oplogReader.putBack( op ); // op will be processed in the loop below nextOpTime = OpTime(); // will reread the op below } else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error Nullstream& l = log(); l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '; if ( nextOpTime < syncedTo ) l << "<??"; else l << ">"; l << " syncedTo " << syncedTo.toStringLong() << '\n'; log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"; log() << "repl: tailing: " << tailing << '\n'; log() << "repl: data too stale, halting replication" << endl; replInfo = replAllDead = "data too stale halted replication"; assert( syncedTo < nextOpTime ); throw SyncException(); } else { /* t == syncedTo, so the first op was applied previously or it is the first op of initial query and need not be applied. */ } } // apply operations { int n = 0; time_t saveLast = time(0); while ( 1 ) { bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to assure we actually process at least one op to get a sync point recorded in the first place. if ( moreInitialSyncsPending || !oplogReader.more() ) { dblock lk; // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior. 
{ dbtemprelease t; if ( !moreInitialSyncsPending && oplogReader.more() ) { continue; } // otherwise, break out of loop so we can set to completed or clone more dbs } if( oplogReader.awaitCapable() && tailing ) okResultCode = 0; // don't sleep syncedTo = nextOpTime; save(); // note how far we are synced up to now log() << "repl: applied " << n << " operations" << endl; nApplied = n; log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl; break; } else { } OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { // periodically note our progress, in case we are doing a lot of work and crash dblock lk; syncedTo = nextOpTime; // can't update local log ts since there are pending operations from our peer save(); log() << "repl: checkpoint applied " << n << " operations" << endl; log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; saveLast = time(0); n = 0; } BSONObj op = oplogReader.next(); unsigned b = replApplyBatchSize; bool justOne = b == 1; scoped_ptr<writelock> lk( justOne ? 
0 : new writelock() ); while( 1 ) { BSONElement ts = op.getField("ts"); if( !( ts.type() == Date || ts.type() == Timestamp ) ) { log() << "sync error: problem querying remote oplog record" << endl; log() << "op: " << op.toString() << endl; log() << "halting replication" << endl; replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; throw SyncException(); } OpTime last = nextOpTime; nextOpTime = OpTime( ts.date() ); if ( !( last < nextOpTime ) ) { log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; log() << " last: " << last.toStringLong() << endl; log() << " nextOpTime: " << nextOpTime.toStringLong() << endl; log() << " halting replication" << endl; replInfo = replAllDead = "sync error last >= nextOpTime"; uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); } if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { assert( justOne ); oplogReader.putBack( op ); _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; dblock lk; if ( n > 0 ) { syncedTo = last; save(); } log() << "repl: applied " << n << " operations" << endl; log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; return okResultCode; } sync_pullOpLog_applyOperation(op, !justOne); n++; if( --b == 0 ) break; // if to here, we are doing mulpile applications in a singel write lock acquisition if( !oplogReader.moreInCurrentBatch() ) { // break if no more in batch so we release lock while reading from the master break; } op = oplogReader.next(); getDur().commitIfNeeded(); } } } return okResultCode; }
// mongorestore entry point: authenticates, validates the target (master,
// not a sharded full restore), parses --drop/--oplogReplay/--oplogLimit and
// related options, drills down into the dump directory, and finally replays
// oplog.bson when --oplogReplay was given.
// Returns EXIT_CLEAN on success, -1 on any validation failure.
// NOTE(review): flattening has let inline "//" comments swallow trailing
// code on these lines (e.g. the comment before the local.oplog.rs query);
// re-wrap from version control before editing the body.
virtual int doRun() { // authenticate enum Auth::Level authLevel = Auth::NONE; auth("", &authLevel); uassert(15935, "user does not have write access", authLevel == Auth::WRITE); boost::filesystem::path root = getParam("dir"); // check if we're actually talking to a machine that can write if (!isMaster()) { return -1; } if (isMongos() && _db == "" && exists(root / "config")) { log() << "Cannot do a full restore on a sharded system" << endl; return -1; } _drop = hasParam( "drop" ); _keepIndexVersion = hasParam("keepIndexVersion"); _restoreOptions = !hasParam("noOptionsRestore"); _restoreIndexes = !hasParam("noIndexRestore"); _w = getParam( "w" , 1 ); bool doOplog = hasParam( "oplogReplay" ); if (doOplog) { // fail early if errors if (_db != "") { log() << "Can only replay oplog on full restore" << endl; return -1; } if ( ! exists(root / "oplog.bson") ) { log() << "No oplog file to replay. Make sure you run mongodump with --oplog." << endl; return -1; } BSONObj out; if (! conn().simpleCommand("admin", &out, "buildinfo")) { log() << "buildinfo command failed: " << out["errmsg"].String() << endl; return -1; } StringData version = out["version"].valuestr(); if (versionCmp(version, "1.7.4-pre-") < 0) { log() << "Can only replay oplog to server version >= 1.7.4" << endl; return -1; } string oplogLimit = getParam( "oplogLimit", "" ); string oplogInc = "0"; if(!oplogLimit.empty()) { size_t i = oplogLimit.find_first_of(':'); if ( i != string::npos ) { if ( i + 1 < oplogLimit.length() ) { oplogInc = oplogLimit.substr(i + 1); } oplogLimit = oplogLimit.substr(0, i); } try { _oplogLimitTS.reset(new OpTime( boost::lexical_cast<unsigned long>(oplogLimit.c_str()), boost::lexical_cast<unsigned long>(oplogInc.c_str()))); } catch( const boost::bad_lexical_cast& error) { log() << "Could not parse oplogLimit into Timestamp from values ( " << oplogLimit << " , " << oplogInc << " )" << endl; return -1; } if (!oplogLimit.empty()) { // Only for a replica set as master will have no-op entries 
so we would need to // skip them all to find the real op scoped_ptr<DBClientCursor> cursor( conn().query("local.oplog.rs", Query().sort(BSON("$natural" << -1)), 1 /*return first*/)); OpTime tsOptime; // get newest oplog entry and make sure it is older than the limit to apply. if (cursor->more()) { tsOptime = cursor->next().getField("ts")._opTime(); if (tsOptime > *_oplogLimitTS.get()) { log() << "The oplogLimit is not newer than" << " the last oplog entry on the server." << endl; return -1; } } BSONObjBuilder tsRestrictBldr; if (!tsOptime.isNull()) tsRestrictBldr.appendTimestamp("$gt", tsOptime.asDate()); tsRestrictBldr.appendTimestamp("$lt", _oplogLimitTS->asDate()); BSONObj query = BSON("ts" << tsRestrictBldr.obj()); if (!tsOptime.isNull()) { log() << "Latest oplog entry on the server is " << tsOptime.getSecs() << ":" << tsOptime.getInc() << endl; log() << "Only applying oplog entries matching this criteria: " << query.jsonString() << endl; } _opmatcher.reset(new Matcher(query)); } } } /* If _db is not "" then the user specified a db name to restore as. * * In that case we better be given either a root directory that * contains only .bson files or a single .bson file (a db). * * In the case where a collection name is specified we better be * given either a root directory that contains only a single * .bson file, or a single .bson file itself (a collection). */ drillDown(root, _db != "", _coll != "", !(_oplogLimitTS.get() == NULL), true); // should this happen for oplog replay as well? conn().getLastError(_db == "" ? "admin" : _db); if (doOplog) { log() << "\t Replaying oplog" << endl; _curns = OPLOG_SENTINEL; processFile( root / "oplog.bson" ); log() << "Applied " << _oplogEntryApplies << " oplog entries out of " << _oplogEntryApplies + _oplogEntrySkips << " (" << _oplogEntrySkips << " skipped)." << endl; } return EXIT_CLEAN; }
// Rollback step 1: walk our oplog (reverse capped cursor) and the remote
// member's oplog (reverse natural query) in lock-step to find the newest
// entry both sides share — matching both "ts" and the "h" hash — recording
// it in h.commonPoint / h.commonPointOurDiskloc. Local-only entries passed
// along the way are queued for refetch via refetch(h, ourObj).
// Throws (const char* messages) on: rate-limit (one attempt per 60s), empty
// or unreadable oplog on either side, an end-of-log time gap over one hour,
// or reaching the beginning of either oplog without finding a common point.
// Preconditions: at least a read lock is held (asserted below).
// NOTE(review): flattening has let inline "//" comments swallow trailing
// code on these lines; re-wrap from version control before editing the body.
static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) { static time_t last; if( time(0)-last < 60 ) { throw "findcommonpoint waiting a while before trying again"; } last = time(0); assert( dbMutex.atLeastReadLocked() ); Client::Context c(rsoplog, dbpath, 0, false); NamespaceDetails *nsd = nsdetails(rsoplog); assert(nsd); ReverseCappedCursor u(nsd); if( !u.ok() ) throw "our oplog empty or unreadable"; const Query q = Query().sort(reverseNaturalObj); const bo fields = BSON( "ts" << 1 << "h" << 1 ); //auto_ptr<DBClientCursor> u = us->query(rsoplog, q, 0, 0, &fields, 0, 0); h.rbid = getRBID(them); auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0); if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable"; BSONObj ourObj = u.current(); OpTime ourTime = ourObj["ts"]._opTime(); BSONObj theirObj = t->nextSafe(); OpTime theirTime = theirObj["ts"]._opTime(); { long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs()); /* diff could be positive, negative, or zero */ log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog; log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog; log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog; if( diff > 3600 ) { log() << "replSet rollback too long a time period for a rollback." << rsLog; throw "error not willing to roll back more than one hour of data"; } } unsigned long long scanned = 0; while( 1 ) { scanned++; /* todo add code to assure no excessive scanning for too long */ if( ourTime == theirTime ) { if( ourObj["h"].Long() == theirObj["h"].Long() ) { // found the point back in time where we match. // todo : check a few more just to be careful about hash collisions. 
log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog; log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog; h.commonPoint = ourTime; h.commonPointOurDiskloc = u.currLoc(); return; } refetch(h, ourObj); if( !t->more() ) { log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; throw "RS100 reached beginning of remote oplog [2]"; } theirObj = t->nextSafe(); theirTime = theirObj["ts"]._opTime(); u.advance(); if( !u.ok() ) { log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; throw "RS101 reached beginning of local oplog [1]"; } ourObj = u.current(); ourTime = ourObj["ts"]._opTime(); } else if( theirTime > ourTime ) { if( !t->more() ) { log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; throw "RS100 reached beginning of remote oplog [1]"; } theirObj = t->nextSafe(); theirTime = theirObj["ts"]._opTime(); } else { // theirTime < ourTime refetch(h, ourObj); u.advance(); if( !u.ok() ) { log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog; log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; 
throw "RS101 reached beginning of local oplog [2]"; } ourObj = u.current(); ourTime = ourObj["ts"]._opTime(); } } }
// Serialize an OpTime under the current field name as a BSON Timestamp
// (seconds, increment).
void BuilderObj::append(const OpTime& ot) {
    const unsigned secs = ot.getSecs();
    const unsigned inc = ot.getInc();
    pBuilder->appendTimestamp(fieldName, secs, inc);
}
// Serialize an OpTime into the array as a BSON Timestamp (seconds,
// increment).
void BuilderArray::append(const OpTime& ot) {
    const unsigned secs = ot.getSecs();
    const unsigned inc = ot.getInc();
    pBuilder->appendTimestamp(secs, inc);
}