BSONObj SyncTail::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) {
    OplogReader missingObjReader;  // why are we using OplogReader to run a non-oplog query?
    const char* ns = o.getStringField("ns");

    // capped collections
    Collection* collection = db->getCollection(ns);
    if (collection && collection->isCapped()) {
        log() << "missing doc, but this is okay for a capped collection (" << ns << ")";
        return BSONObj();
    }

    const int retryMax = 3;
    for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
        if (retryCount != 1) {
            // if we are retrying, sleep a bit to let the network possibly recover
            sleepsecs(retryCount * retryCount);
        }
        try {
            bool ok = missingObjReader.connect(HostAndPort(_hostname));
            if (!ok) {
                warning() << "network problem detected while connecting to the "
                          << "sync source, attempt " << retryCount << " of " << retryMax << endl;
                continue;  // try again
            }
        } catch (const SocketException&) {
            warning() << "network problem detected while connecting to the "
                      << "sync source, attempt " << retryCount << " of " << retryMax << endl;
            continue;  // try again
        }

        // get _id from oplog entry to create query to fetch document.
        const BSONElement opElem = o.getField("op");
        const bool isUpdate = !opElem.eoo() && opElem.str() == "u";
        const BSONElement idElem = o.getObjectField(isUpdate ? "o2" : "o")["_id"];
        if (idElem.eoo()) {
            severe() << "cannot fetch missing document without _id field: " << o.toString();
            fassertFailedNoTrace(28742);
        }

        BSONObj query = BSONObjBuilder().append(idElem).obj();
        BSONObj missingObj;
        try {
            missingObj = missingObjReader.findOne(ns, query);
        } catch (const SocketException&) {
            warning() << "network problem detected while fetching a missing document from the "
                      << "sync source, attempt " << retryCount << " of " << retryMax << endl;
            continue;  // try again
        } catch (DBException& e) {
            error() << "assertion fetching missing object: " << e.what() << endl;
            throw;
        }

        // success!
        return missingObj;
    }
    // retry count exceeded
    msgasserted(15916,
                str::stream() << "Can no longer connect to initial sync source: " << _hostname);
}
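// A minimal standalone sketch of the retry pattern used in getMissingDoc above:
// up to retryMax attempts, sleeping retryCount^2 seconds between attempts so a
// flaky network has a chance to recover. Doc and the fetchMissingDoc callback
// are illustrative stand-ins, not MongoDB APIs.
#include <chrono>
#include <functional>
#include <stdexcept>
#include <string>
#include <thread>

struct Doc {
    std::string json;
};

Doc fetchWithRetry(const std::function<Doc()>& fetchMissingDoc) {
    const int retryMax = 3;
    for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
        if (retryCount != 1) {
            // quadratic backoff: 4s, then 9s
            std::this_thread::sleep_for(std::chrono::seconds(retryCount * retryCount));
        }
        try {
            return fetchMissingDoc();  // success
        } catch (const std::runtime_error&) {
            continue;  // treat as a transient network problem and try again
        }
    }
    throw std::runtime_error("can no longer reach the sync source");
}

int main() {
    Doc d = fetchWithRetry([] { return Doc{"{\"_id\": 1}"}; });
    return d.json.empty() ? 1 : 0;
}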
BSONObj Sync::getMissingDoc(const BSONObj& o) {
    OplogReader missingObjReader;
    const char *ns = o.getStringField("ns");

    // capped collections
    NamespaceDetails *nsd = nsdetails(ns);
    if ( nsd && nsd->isCapped() ) {
        log() << "replication missing doc, but this is okay for a capped collection (" << ns
              << ")" << endl;
        return BSONObj();
    }

    uassert(15916,
            str::stream() << "Can no longer connect to initial sync source: " << hn,
            missingObjReader.connect(hn));

    // might be more than just _id in the update criteria
    BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
    BSONObj missingObj;
    try {
        missingObj = missingObjReader.findOne(ns, query);
    } catch(DBException& e) {
        log() << "replication assertion fetching missing object: " << e.what() << endl;
        throw;
    }

    return missingObj;
}
bool BackgroundSync::isRollbackRequired(OplogReader& r) {
    string hn = r.conn()->getServerAddress();
    if (!r.more()) {
        // In vanilla Mongo, this happened for one of the
        // following reasons:
        //  - we were ahead of what we are syncing from (don't
        //    think that is possible anymore)
        //  - remote oplog is empty for some weird reason
        // in either case, if it (strangely) happens, we'll just return
        // and our caller will simply try again after a short sleep.
        log() << "replSet error empty query result from " << hn
              << " oplog, attempting rollback" << rsLog;
        return true;
    }

    BSONObj o = r.nextSafe();
    uint64_t ts = o["ts"]._numberLong();
    uint64_t lastHash = o["h"].numberLong();
    GTID gtid = getGTIDFromBSON("_id", o);

    if (!theReplSet->gtidManager->rollbackNeeded(gtid, ts, lastHash)) {
        log() << "Rollback NOT needed! Our GTID: " << gtid << endl;
        return false;
    }

    log() << "Rollback needed! Our GTID: "
          << theReplSet->gtidManager->getLiveState().toString()
          << " remote GTID: " << gtid.toString() << ". Attempting rollback." << rsLog;

    runRollback(r, ts);
    return true;
}
bool isRollbackRequired(OplogReader& r, uint64_t *lastTS) {
    string hn = r.conn()->getServerAddress();
    verify(r.more());

    BSONObj rollbackStatus;
    bool found = getRollbackStatus(rollbackStatus);
    if (found) {
        // we have a rollback in progress,
        // must complete it
        log() << "Rollback needed, found rollbackStatus: " << rollbackStatus << rsLog;
        return true;
    }

    BSONObj o = r.nextSafe();
    uint64_t ts = o["ts"]._numberLong();
    uint64_t lastHash = o["h"].numberLong();
    GTID gtid = getGTIDFromBSON("_id", o);

    if (!theReplSet->gtidManager->rollbackNeeded(gtid, ts, lastHash)) {
        log() << "Rollback NOT needed! " << gtid << endl;
        return false;
    }

    log() << "Rollback needed! Our GTID: "
          << theReplSet->gtidManager->getLiveState().toString()
          << ", remote GTID: " << gtid.toString() << ". Attempting rollback." << rsLog;

    *lastTS = ts;
    return true;
}
void ReplSetImpl::syncTail() { // todo : locking vis a vis the mgr... const Member *primary = box.getPrimary(); if( primary == 0 ) return; string hn = primary->h().toString(); OplogReader r; if( !r.connect(primary->h().toString()) ) { log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog; return; } /* first make sure we are not hopelessly out of sync by being very stale. */ { BSONObj remoteOldestOp = r.findOne(rsoplog, Query()); OpTime ts = remoteOldestOp["ts"]._opTime(); DEV log() << "remoteOldestOp: " << ts.toStringPretty() << endl; else log(3) << "remoteOldestOp: " << ts.toStringPretty() << endl; if( lastOpTimeWritten < ts ) { log() << "replSet error too stale to catch up, at least from primary " << hn << rsLog; log() << "replSet our last optime : " << lastOpTimeWritten.toStringPretty() << rsLog; log() << "replSet oldest at " << hn << " : " << ts.toStringPretty() << rsLog; log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog; sethbmsg("error too stale to catch up"); sleepsecs(120); return; } }
void BackgroundSync::getOplogReader(OplogReader& r) { const Member *target = NULL, *stale = NULL; BSONObj oldest; { boost::unique_lock<boost::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set _currentSyncTarget = NULL; return; } // Wait until we've applied the ops we have before we choose a sync target while (!_appliedBuffer) { _condvar.wait(lock); } } verify(r.conn() == NULL); while ((target = theReplSet->getMemberToSyncTo()) != NULL) { string current = target->fullName(); if (!r.connect(current)) { LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog; r.resetConnection(); theReplSet->veto(current); continue; } if (isStale(r, oldest)) { r.resetConnection(); theReplSet->veto(current, 600); stale = target; continue; } // if we made it here, the target is up and not stale { boost::unique_lock<boost::mutex> lock(_mutex); _currentSyncTarget = target; } return; } // the only viable sync target was stale if (stale) { theReplSet->goStale(stale, oldest); sleepsecs(120); } { boost::unique_lock<boost::mutex> lock(_mutex); _currentSyncTarget = NULL; } }
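// A standalone sketch of the sync-source selection loop in getOplogReader above:
// walk the candidates, briefly veto hosts we cannot reach, veto stale hosts for
// much longer, and take the first host that is both reachable and not stale.
// Candidate, connect and isStale here are illustrative stand-ins for the
// replica-set member list and the OplogReader checks.
#include <chrono>
#include <functional>
#include <string>
#include <vector>

struct Candidate {
    std::string host;
    std::chrono::seconds vetoedFor;  // how long to avoid this host; zero means usable
};

// Returns the chosen host, or an empty string if no viable source exists
// (the caller then sleeps and retries, as the producer loop does).
std::string chooseSyncSource(std::vector<Candidate>& candidates,
                             const std::function<bool(const std::string&)>& connect,
                             const std::function<bool(const std::string&)>& isStale) {
    for (Candidate& c : candidates) {
        if (c.vetoedFor.count() > 0) {
            continue;  // recently vetoed, skip for now
        }
        if (!connect(c.host)) {
            c.vetoedFor = std::chrono::seconds(10);  // brief veto: connection failure
            continue;
        }
        if (isStale(c.host)) {
            c.vetoedFor = std::chrono::seconds(600);  // long veto: source's oplog rolled past us
            continue;
        }
        return c.host;  // reachable and not stale
    }
    return std::string();
}

int main() {
    std::vector<Candidate> members{{"a:27017", std::chrono::seconds(0)},
                                   {"b:27017", std::chrono::seconds(0)}};
    auto reachable = [](const std::string& h) { return h == "b:27017"; };
    auto stale = [](const std::string&) { return false; };
    return chooseSyncSource(members, reachable, stale).empty() ? 1 : 0;
}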
BSONObj Sync::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) { OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query? const char *ns = o.getStringField("ns"); // capped collections Collection* collection = db->getCollection(ns); if ( collection && collection->isCapped() ) { log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl; return BSONObj(); } const int retryMax = 3; for (int retryCount = 1; retryCount <= retryMax; ++retryCount) { if (retryCount != 1) { // if we are retrying, sleep a bit to let the network possibly recover sleepsecs(retryCount * retryCount); } try { bool ok = missingObjReader.connect(HostAndPort(hn)); if (!ok) { warning() << "network problem detected while connecting to the " << "sync source, attempt " << retryCount << " of " << retryMax << endl; continue; // try again } } catch (const SocketException&) { warning() << "network problem detected while connecting to the " << "sync source, attempt " << retryCount << " of " << retryMax << endl; continue; // try again } // might be more than just _id in the update criteria BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); BSONObj missingObj; try { missingObj = missingObjReader.findOne(ns, query); } catch (const SocketException&) { warning() << "network problem detected while fetching a missing document from the " << "sync source, attempt " << retryCount << " of " << retryMax << endl; continue; // try again } catch (DBException& e) { log() << "replication assertion fetching missing object: " << e.what() << endl; throw; } // success! return missingObj; } // retry count exceeded msgasserted(15916, str::stream() << "Can no longer connect to initial sync source: " << hn); }
bool BackgroundSync::isRollbackRequired(OplogReader& r) {
    string hn = r.conn()->getServerAddress();

    if (!r.more()) {
        try {
            BSONObj theirLastOp = r.getLastOp(rsoplog);
            if (theirLastOp.isEmpty()) {
                log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
                sleepsecs(2);
                return true;
            }
            OpTime theirTS = theirLastOp["ts"]._opTime();
            if (theirTS < _lastOpTimeFetched) {
                log() << "replSet we are ahead of the sync source, will try to roll back"
                      << rsLog;
                theReplSet->syncRollback(r);
                return true;
            }
            /* we're not ahead? maybe our new query got fresher data. best to come back and try again */
            log() << "replSet syncTail condition 1" << rsLog;
            sleepsecs(1);
        } catch(DBException& e) {
            log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
            sleepsecs(2);
        }
        return true;
    }

    BSONObj o = r.nextSafe();
    OpTime ts = o["ts"]._opTime();
    long long h = o["h"].numberLong();
    if( ts != _lastOpTimeFetched || h != _lastH ) {
        log() << "replSet our last op time fetched: " << _lastOpTimeFetched.toStringPretty()
              << rsLog;
        log() << "replset source's GTE: " << ts.toStringPretty() << rsLog;
        theReplSet->syncRollback(r);
        return true;
    }

    return false;
}
bool BackgroundSync::_rollbackIfNeeded(OperationContext* txn, OplogReader& r) {
    string hn = r.conn()->getServerAddress();

    if (!r.more()) {
        try {
            BSONObj theirLastOp = r.getLastOp(rsOplogName.c_str());
            if (theirLastOp.isEmpty()) {
                error() << "empty query result from " << hn << " oplog";
                sleepsecs(2);
                return true;
            }
            OpTime theirOpTime = extractOpTime(theirLastOp);
            if (theirOpTime < _lastOpTimeFetched) {
                log() << "we are ahead of the sync source, will try to roll back";
                syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
                return true;
            }
            /* we're not ahead? maybe our new query got fresher data. best to come back and try again */
            log() << "syncTail condition 1";
            sleepsecs(1);
        } catch(DBException& e) {
            error() << "querying " << hn << ' ' << e.toString();
            sleepsecs(2);
        }
        return true;
    }

    BSONObj o = r.nextSafe();
    OpTime opTime = extractOpTime(o);
    long long hash = o["h"].numberLong();
    if ( opTime != _lastOpTimeFetched || hash != _lastFetchedHash ) {
        log() << "our last op time fetched: " << _lastOpTimeFetched;
        log() << "source's GTE: " << opTime;
        syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
        return true;
    }

    return false;
}
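// A minimal sketch of the rollback test above: after (re)querying the sync
// source starting at our last fetched optime, the first document returned must
// match the (optime, hash) pair we already have; any mismatch means the two
// oplogs have diverged and a rollback is required. The types here are
// illustrative, not the server's OpTime/BSONObj.
#include <cstdint>
#include <iostream>

struct OplogPosition {
    uint64_t opTime;  // stand-in for the "ts" optime
    int64_t hash;     // stand-in for the "h" field
};

bool rollbackRequired(const OplogPosition& lastFetched, const OplogPosition& firstRemote) {
    // Matching position: the remote oplog still contains the op we ended on,
    // so we can keep tailing from here; otherwise the histories diverged.
    return firstRemote.opTime != lastFetched.opTime || firstRemote.hash != lastFetched.hash;
}

int main() {
    OplogPosition ours{42, 7}, theirs{42, 9};
    std::cout << (rollbackRequired(ours, theirs) ? "rollback needed" : "keep syncing") << "\n";
}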
void BackgroundSync::getOplogReader(OplogReader& r) { const Member *target = NULL, *stale = NULL; BSONObj oldest; verify(r.conn() == NULL); while ((target = theReplSet->getMemberToSyncTo()) != NULL) { string current = target->fullName(); if (!r.connect(current)) { LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog; r.resetConnection(); theReplSet->veto(current); continue; } // if we made it here, the target is up and not stale { boost::unique_lock<boost::mutex> lock(_mutex); _currentSyncTarget = target; } return; } // the only viable sync target was stale if (stale) { GTID remoteOldestGTID = getGTIDFromBSON("_id", oldest); theReplSet->goStale(stale, remoteOldestGTID); // vanilla Mongo used to do a sleep of 120 seconds here // We removed it. It seems excessive, and if this machine is doing // nothing anyway, sleeping won't help. It might as well // return with a null sync target, and produce() will handle // that fact and sleep one second } { boost::unique_lock<boost::mutex> lock(_mutex); _currentSyncTarget = NULL; } }
// Copy a range of documents to the local oplog.refs collection
static void copyOplogRefsRange(OplogReader &r, OID oid) {
    shared_ptr<DBClientCursor> c = r.getOplogRefsCursor(oid);
    Client::ReadContext ctx(rsOplogRefs);
    while (c->more()) {
        BSONObj b = c->next();
        BSONElement eOID = b.getFieldDotted("_id.oid");
        if (oid != eOID.OID()) {
            break;
        }
        LOG(6) << "copyOplogRefsRange " << b << endl;
        writeEntryToOplogRefs(b);
    }
}
int run() { Client::initThread( "oplogreplay" ); toolInfoLog() << "going to connect" << std::endl; OplogReader r; r.setTailingQueryOptions( QueryOption_SlaveOk | QueryOption_AwaitData ); r.connect(mongoOplogGlobalParams.from); toolInfoLog() << "connected" << std::endl; OpTime start(time(0) - mongoOplogGlobalParams.seconds, 0); toolInfoLog() << "starting from " << start.toStringPretty() << std::endl; r.tailingQueryGTE(mongoOplogGlobalParams.ns.c_str(), start); int num = 0; while ( r.more() ) { BSONObj o = r.next(); if (logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(2))) { toolInfoLog() << o << std::endl; } if ( o["$err"].type() ) { toolError() << "error getting oplog" << std::endl; toolError() << o << std::endl; return -1; } bool print = ++num % 100000 == 0; if (print) { toolInfoLog() << num << "\t" << o << std::endl; } if ( o["op"].String() == "n" ) continue; BSONObjBuilder b( o.objsize() + 32 ); BSONArrayBuilder updates( b.subarrayStart( "applyOps" ) ); updates.append( o ); updates.done(); BSONObj c = b.obj(); BSONObj res; bool ok = conn().runCommand( "admin" , c , res ); if (!ok) { toolError() << res << std::endl; } else if (print) { toolInfoLog() << res << std::endl; } } return 0; }
bool BackgroundSync::isStale(OpTime lastOpTimeFetched, OplogReader& r, BSONObj& remoteOldestOp) {
    remoteOldestOp = r.findOne(rsoplog, Query());
    OpTime remoteTs = remoteOldestOp["ts"]._opTime();
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (lastOpTimeFetched >= remoteTs) {
            return false;
        }
        log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
        log() << "replSet lastOpTimeFetched: " << lastOpTimeFetched.toStringLong() << rsLog;
    }
    return true;
}
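// A standalone sketch of the staleness test above: we are "stale" when the
// oldest entry still present in the source's oplog is newer than the last op
// we fetched, i.e. the source has already rolled its oplog past the point we
// would need to resume from. Timestamps are plain integers here for
// illustration only.
#include <cstdint>
#include <iostream>

bool isStaleRelativeTo(uint64_t lastOpTimeFetched, uint64_t remoteOldestOpTime) {
    // Still usable as long as our position is at or past the source's oldest op.
    return lastOpTimeFetched < remoteOldestOpTime;
}

int main() {
    std::cout << std::boolalpha
              << isStaleRelativeTo(/*lastOpTimeFetched=*/100, /*remoteOldestOpTime=*/250)
              << "\n";  // true: the source no longer has the ops we need
}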
bool canStartRollback(OplogReader& r, GTID idToRollbackTo) {
    shared_ptr<DBClientConnection> conn(r.conn_shared());
    // before we start rollback, let's make sure that the minUnapplied on the remote
    // server is past the id that we are rolling back to. Otherwise, the snapshot
    // we create will not be up to date, and the rollback algorithm will not work
    BSONObjBuilder b;
    b.append("_id", "minUnapplied");
    // Note that another way to get this information is to
    // request a heartbeat. That one will technically return
    // a more up to date value for minUnapplied
    BSONObj res = findOneFromConn(conn.get(), rsReplInfo, Query(b.done()));
    GTID minUnapplied = getGTIDFromBSON("GTID", res);
    if (GTID::cmp(minUnapplied, idToRollbackTo) < 0) {
        log() << "Remote server has minUnapplied " << minUnapplied.toString()
              << " we want to rollback to " << idToRollbackTo.toString()
              << ". Therefore, exiting and retrying." << rsLog;
        return false;
    }
    return true;
}
void BackgroundSync::runRollback(OplogReader& r, uint64_t oplogTS) { // starting from ourLast, we need to read the remote oplog // backwards until we find an entry in the remote oplog // that has the same GTID, timestamp, and hash as // what we have in our oplog. If we don't find one that is within // some reasonable timeframe, then we go fatal GTID ourLast = theReplSet->gtidManager->getLiveState(); GTID idToRollbackTo; uint64_t rollbackPointTS = 0; uint64_t rollbackPointHash = 0; incRBID(); try { shared_ptr<DBClientCursor> rollbackCursor = r.getRollbackCursor(ourLast); while (rollbackCursor->more()) { BSONObj remoteObj = rollbackCursor->next(); GTID remoteGTID = getGTIDFromBSON("_id", remoteObj); uint64_t remoteTS = remoteObj["ts"]._numberLong(); uint64_t remoteLastHash = remoteObj["h"].numberLong(); if (remoteTS + 1800*1000 < oplogTS) { log() << "Rollback takes us too far back, throwing exception. remoteTS: " << remoteTS << " oplogTS: " << oplogTS << rsLog; throw RollbackOplogException("replSet rollback too long a time period for a rollback (at least 30 minutes)."); break; } //now try to find an entry in our oplog with that GTID BSONObjBuilder localQuery; BSONObj localObj; addGTIDToBSON("_id", remoteGTID, localQuery); bool foundLocally = false; { LOCK_REASON(lockReason, "repl: looking up oplog entry for rollback"); Client::ReadContext ctx(rsoplog, lockReason); Client::Transaction transaction(DB_SERIALIZABLE); foundLocally = Collection::findOne(rsoplog, localQuery.done(), localObj); transaction.commit(); } if (foundLocally) { GTID localGTID = getGTIDFromBSON("_id", localObj); uint64_t localTS = localObj["ts"]._numberLong(); uint64_t localLastHash = localObj["h"].numberLong(); if (localLastHash == remoteLastHash && localTS == remoteTS && GTID::cmp(localGTID, remoteGTID) == 0 ) { idToRollbackTo = localGTID; rollbackPointTS = localTS; rollbackPointHash = localLastHash; log() << "found id to rollback to " << idToRollbackTo << rsLog; break; } } } // At this point, either we have found the point to try to rollback to, // or we have determined that we cannot rollback if (idToRollbackTo.isInitial()) { // we cannot rollback throw RollbackOplogException("could not find ID to rollback to"); } } catch (DBException& e) { log() << "Caught DBException during rollback " << e.toString() << rsLog; throw RollbackOplogException("DBException while trying to find ID to rollback to: " + e.toString()); } catch (std::exception& e2) { log() << "Caught std::exception during rollback " << e2.what() << rsLog; throw RollbackOplogException(str::stream() << "Exception while trying to find ID to rollback to: " << e2.what()); } // proceed with the rollback to point idToRollbackTo // probably ought to grab a global write lock while doing this // I don't think we want oplog cursors reading from this machine // while we are rolling back. 
Or at least do something to protect against this // first, let's get all the operations that are being applied out of the way, // we don't want to rollback an item in the oplog while simultaneously, // the applier thread is applying it to the oplog { boost::unique_lock<boost::mutex> lock(_mutex); while (_deque.size() > 0) { log() << "waiting for applier to finish work before doing rollback " << rsLog; _queueDone.wait(lock); } verifySettled(); } // now let's tell the system we are going to rollback, to do so, // abort live multi statement transactions, invalidate cursors, and // change the state to RS_ROLLBACK { // so we know nothing is simultaneously occurring RWLockRecursive::Exclusive e(operationLock); LOCK_REASON(lockReason, "repl: killing all operations for rollback"); Lock::GlobalWrite lk(lockReason); ClientCursor::invalidateAllCursors(); Client::abortLiveTransactions(); theReplSet->goToRollbackState(); } try { // now that we are settled, we have to take care of the GTIDManager // and the repl info thread. // We need to reset the state of the GTIDManager to the point // we intend to rollback to, and we need to make sure that the repl info thread // has captured this information. theReplSet->gtidManager->resetAfterInitialSync( idToRollbackTo, rollbackPointTS, rollbackPointHash ); // now force an update of the repl info thread theReplSet->forceUpdateReplInfo(); // at this point, everything should be settled, the applier should // have nothing left (and remain that way, because this is the only // thread that can put work on the applier). Now we can rollback // the data. while (true) { BSONObj o; { LOCK_REASON(lockReason, "repl: checking for oplog data"); Lock::DBRead lk(rsoplog, lockReason); Client::Transaction txn(DB_SERIALIZABLE); // if there is nothing in the oplog, break o = getLastEntryInOplog(); if( o.isEmpty() ) { break; } } GTID lastGTID = getGTIDFromBSON("_id", o); // if we have rolled back enough, break from while loop if (GTID::cmp(lastGTID, idToRollbackTo) <= 0) { dassert(GTID::cmp(lastGTID, idToRollbackTo) == 0); break; } rollbackTransactionFromOplog(o, true); } theReplSet->leaveRollbackState(); } catch (DBException& e) { log() << "Caught DBException during rollback " << e.toString() << rsLog; throw RollbackOplogException("DBException while trying to run rollback: " + e.toString()); } catch (std::exception& e2) { log() << "Caught std::exception during rollback " << e2.what() << rsLog; throw RollbackOplogException(str::stream() << "Exception while trying to run rollback: " << e2.what()); } }
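// A standalone sketch of the rollback-point search in runRollback above: walk
// the remote oplog backwards from our last entry and stop at the first entry
// whose (id, ts, hash) triple also exists locally, refusing to look back more
// than 30 minutes. The in-memory vectors stand in for the remote rollback
// cursor and the local oplog lookup; OplogEntry is an illustrative type.
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

struct OplogEntry {
    uint64_t id;    // stand-in for the GTID
    uint64_t tsMs;  // wall-clock millis, as in the "ts" field above
    int64_t hash;
};

// Returns the entry to roll back to, or throws if none is found in range.
OplogEntry findRollbackPoint(const std::vector<OplogEntry>& remoteNewestFirst,
                             const std::vector<OplogEntry>& localOplog,
                             uint64_t ourLastTsMs) {
    const uint64_t maxLookbackMs = 1800 * 1000;  // 30 minutes, as in the code above
    for (const OplogEntry& remote : remoteNewestFirst) {
        if (remote.tsMs + maxLookbackMs < ourLastTsMs) {
            throw std::runtime_error("rollback would cover more than 30 minutes");
        }
        auto it = std::find_if(localOplog.begin(), localOplog.end(),
                               [&](const OplogEntry& local) {
                                   return local.id == remote.id &&
                                          local.tsMs == remote.tsMs &&
                                          local.hash == remote.hash;
                               });
        if (it != localOplog.end()) {
            return *it;  // common point: both oplogs agree up to here
        }
    }
    throw std::runtime_error("could not find an entry to roll back to");
}

int main() {
    std::vector<OplogEntry> remote{{5, 5000, 50}, {4, 4000, 40}};
    std::vector<OplogEntry> local{{4, 4000, 40}, {3, 3000, 30}};
    return findRollbackPoint(remote, local, /*ourLastTsMs=*/5000).id == 4 ? 0 : 1;
}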
/** * Do the initial sync for this member. There are several steps to this process: * * 0. Add _initialSyncFlag to minValid to tell us to restart initial sync if we * crash in the middle of this procedure * 1. Record start time. * 2. Clone. * 3. Set minValid1 to sync target's latest op time. * 4. Apply ops from start to minValid1, fetching missing docs as needed. * 5. Set minValid2 to sync target's latest op time. * 6. Apply ops from minValid1 to minValid2. * 7. Build indexes. * 8. Set minValid3 to sync target's latest op time. * 9. Apply ops from minValid2 to minValid3. 10. Clean up minValid and remove _initialSyncFlag field * * At that point, initial sync is finished. Note that the oplog from the sync target is applied * three times: step 4, 6, and 8. 4 may involve refetching, 6 should not. By the end of 6, * this member should have consistent data. 8 is "cosmetic," it is only to get this member * closer to the latest op time before it can transition to secondary state. */ void ReplSetImpl::_syncDoInitialSync() { replset::InitialSync init(replset::BackgroundSync::get()); replset::SyncTail tail(replset::BackgroundSync::get()); sethbmsg("initial sync pending",0); // if this is the first node, it may have already become primary if ( box.getState().primary() ) { sethbmsg("I'm already primary, no need for initial sync",0); return; } const Member *source = getMemberToSyncTo(); if (!source) { sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0); sleepsecs(15); return; } string sourceHostname = source->h().toString(); init.setHostname(sourceHostname); OplogReader r; if( !r.connect(sourceHostname) ) { sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0); sleepsecs(15); return; } BSONObj lastOp = r.getLastOp(rsoplog); if( lastOp.isEmpty() ) { sethbmsg("initial sync couldn't read remote oplog", 0); sleepsecs(15); return; } // written by applyToHead calls BSONObj minValid; if (replSettings.fastsync) { log() << "fastsync: skipping database clone" << rsLog; // prime oplog init.oplogApplication(lastOp, lastOp); return; } else { // Add field to minvalid document to tell us to restart initial sync if we crash theReplSet->setInitialSyncFlag(); sethbmsg("initial sync drop all databases", 0); dropAllDatabasesExceptLocal(); sethbmsg("initial sync clone all databases", 0); list<string> dbs = r.conn()->getDatabaseNames(); Cloner cloner; if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, true)) { veto(source->fullName(), 600); sleepsecs(300); return; } sethbmsg("initial sync data copy, starting syncup",0); log() << "oplog sync 1 of 3" << endl; if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) { return; } lastOp = minValid; // Now we sync to the latest op on the sync target _again_, as we may have recloned ops // that were "from the future" compared with minValid. During this second application, // nothing should need to be recloned. 
log() << "oplog sync 2 of 3" << endl; if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) { return; } // data should now be consistent lastOp = minValid; sethbmsg("initial sync building indexes",0); if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, false)) { veto(source->fullName(), 600); sleepsecs(300); return; } } log() << "oplog sync 3 of 3" << endl; if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) { return; } // --------- Status status = getGlobalAuthorizationManager()->initialize(); if (!status.isOK()) { warning() << "Failed to reinitialize auth data after initial sync. " << status; return; } sethbmsg("initial sync finishing up",0); verify( !box.getState().primary() ); // wouldn't make sense if we were. { Client::WriteContext cx( "local." ); cx.ctx().db()->flushFiles(true); try { log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog; } catch(...) { } // Initial sync is now complete. Flag this by setting minValid to the last thing // we synced. theReplSet->setMinValid(minValid); // Clear the initial sync flag. theReplSet->clearInitialSyncFlag(); cx.ctx().db()->flushFiles(true); } { boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex); theReplSet->initialSyncRequested = false; } // If we just cloned & there were no ops applied, we still want the primary to know where // we're up to replset::BackgroundSync::notify(); changeState(MemberState::RS_RECOVERING); sethbmsg("initial sync done",0); }
void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { const Member *m = findById(server_id); if( m == 0 ) { ss << "Error : can't find a member with id: " << server_id << '\n'; return; } ss << p("Server : " + m->fullName() + "<br>ns : " + rsoplog ); //const bo fields = BSON( "o" << false << "o2" << false ); const bo fields; /** todo fix we might want an so timeout here */ OplogReader reader; if (reader.connect(m->h()) == false) { ss << "couldn't connect to " << m->fullName(); return; } reader.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields); if ( !reader.haveCursor() ) { ss << "couldn't query " << rsoplog; return; } static const char *h[] = {"ts","optime","h","op","ns","rest",0}; ss << "<style type=\"text/css\" media=\"screen\">" "table { font-size:75% }\n" // "th { background-color:#bbb; color:#000 }\n" // "td,th { padding:.25em }\n" "</style>\n"; ss << table(h, true); //ss << "<pre>\n"; int n = 0; OpTime otFirst; OpTime otLast; OpTime otEnd; while( reader.more() ) { bo o = reader.next(); otLast = o["ts"]._opTime(); if( otFirst.isNull() ) otFirst = otLast; say(ss, o); n++; } if( n == 0 ) { ss << rsoplog << " is empty\n"; } else { reader.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields); if( !reader.haveCursor() ) { ss << "couldn't query [2] " << rsoplog; return; } string x; bo o = reader.next(); otEnd = o["ts"]._opTime(); while( 1 ) { stringstream z; if( o["ts"]._opTime() == otLast ) break; say(z, o); x = z.str() + x; if( !reader.more() ) break; o = reader.next(); } if( !x.empty() ) { ss << "<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>\n" << x; //ss << "\n...\n\n" << x; } } ss << _table(); ss << p(time_t_to_String_short(time(0)) + " current time"); if( !otEnd.isNull() ) { ss << "<p>Log length in time: "; unsigned d = otEnd.getSecs() - otFirst.getSecs(); double h = d / 3600.0; ss.precision(3); if( h < 72 ) ss << h << " hours"; else ss << h / 24.0 << " days"; ss << "</p>\n"; } }
/** * Do the initial sync for this member. */ void ReplSetImpl::_syncDoInitialSync() { replset::InitialSync init(replset::BackgroundSync::get()); sethbmsg("initial sync pending",0); // if this is the first node, it may have already become primary if ( box.getState().primary() ) { sethbmsg("I'm already primary, no need for initial sync",0); return; } const Member *source = getMemberToSyncTo(); if (!source) { sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0); sleepsecs(15); return; } string sourceHostname = source->h().toString(); init.setHostname(sourceHostname); OplogReader r; if( !r.connect(sourceHostname) ) { sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0); sleepsecs(15); return; } BSONObj lastOp = r.getLastOp(rsoplog); if( lastOp.isEmpty() ) { sethbmsg("initial sync couldn't read remote oplog", 0); sleepsecs(15); return; } if (replSettings.fastsync) { log() << "fastsync: skipping database clone" << rsLog; // prime oplog init.oplogApplication(lastOp, lastOp); return; } else { sethbmsg("initial sync drop all databases", 0); dropAllDatabasesExceptLocal(); sethbmsg("initial sync clone all databases", 0); list<string> dbs = r.conn()->getDatabaseNames(); if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) { veto(source->fullName(), 600); sleepsecs(300); return; } sethbmsg("initial sync data copy, starting syncup",0); BSONObj minValid; if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) { return; } lastOp = minValid; // reset state, as that "didn't count" emptyOplog(); lastOpTimeWritten = OpTime(); lastH = 0; sethbmsg("initial sync building indexes",0); if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) { veto(source->fullName(), 600); sleepsecs(300); return; } } sethbmsg("initial sync query minValid",0); BSONObj minValid; if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) { return; } // --------- sethbmsg("initial sync finishing up",0); verify( !box.getState().primary() ); // wouldn't make sense if we were. { Client::WriteContext cx( "local." ); cx.ctx().db()->flushFiles(true); try { log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog; } catch(...) { } Helpers::putSingleton("local.replset.minvalid", minValid); cx.ctx().db()->flushFiles(true); } changeState(MemberState::RS_RECOVERING); sethbmsg("initial sync done",0); }
void BackgroundSync::produce() { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced OplogReader r; OpTime lastOpTimeFetched; // find a target to sync from the last op time written getOplogReader(r); // no server found { boost::unique_lock<boost::mutex> lock(_mutex); if (_currentSyncTarget == NULL) { lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } lastOpTimeFetched = _lastOpTimeFetched; } r.tailingQueryGTE(rsoplog, lastOpTimeFetched); // if target cut connections between connecting and querying (for // example, because it stepped down) we might not have a cursor if (!r.haveCursor()) { return; } uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() ); if (isRollbackRequired(r)) { stop(); return; } while (!inShutdown()) { if (!r.moreInCurrentBatch()) { // Check some things periodically // (whenever we run out of items in the // current cursor batch) int bs = r.currentBatchMessageSize(); if( bs > 0 && bs < BatchIsSmallish ) { // on a very low latency network, if we don't wait a little, we'll be // getting ops to write almost one at a time. this will both be expensive // for the upstream server as well as potentially defeating our parallel // application of batches on the secondary. // // the inference here is basically if the batch is really small, we are // "caught up". // dassert( !Lock::isLocked() ); sleepmillis(SleepToAllowBatchingMillis); } if (theReplSet->gotForceSync()) { return; } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (isAssumingPrimary() || theReplSet->isPrimary()) { return; } // re-evaluate quality of sync target if (shouldChangeSyncTarget()) { return; } { //record time for each getmore TimerHolder batchTimer(&getmoreReplStats); // This calls receiveMore() on the oplogreader cursor. // It can wait up to five seconds for more data. r.more(); } networkByteStats.increment(r.currentBatchMessageSize()); if (!r.moreInCurrentBatch()) { // If there is still no data from upstream, check a few more things // and then loop back for another pass at getting more data { boost::unique_lock<boost::mutex> lock(_mutex); if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) { return; } } r.tailCheck(); if( !r.haveCursor() ) { LOG(1) << "replSet end syncTail pass" << rsLog; return; } continue; } } // At this point, we are guaranteed to have at least one thing to read out // of the oplogreader cursor. BSONObj o = r.nextSafe().getOwned(); opsReadStats.increment(); { boost::unique_lock<boost::mutex> lock(_mutex); _appliedBuffer = false; } OCCASIONALLY { LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog; } // the blocking queue will wait (forever) until there's room for us to push _buffer.push(o); bufferCountGauge.increment(); bufferSizeGauge.increment(getSize(o)); { boost::unique_lock<boost::mutex> lock(_mutex); _lastH = o["h"].numberLong(); _lastOpTimeFetched = o["ts"]._opTime(); LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty() << rsLog; } } }
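// A minimal sketch of the batching heuristic in the comment inside produce()
// above: when the current getmore batch is non-empty but small, we are
// effectively caught up, so pause briefly before asking for more. This lets the
// upstream node accumulate a bigger batch instead of shipping ops one at a
// time. The constants mirror the BatchIsSmallish / SleepToAllowBatchingMillis
// idea, but the values here are assumptions for illustration.
#include <chrono>
#include <thread>

void maybeSleepToAllowBatching(int currentBatchBytes) {
    const int kBatchIsSmallish = 40000;         // bytes; assumed threshold
    const int kSleepToAllowBatchingMillis = 5;  // assumed pause
    if (currentBatchBytes > 0 && currentBatchBytes < kBatchIsSmallish) {
        std::this_thread::sleep_for(std::chrono::milliseconds(kSleepToAllowBatchingMillis));
    }
}

int main() {
    maybeSleepToAllowBatching(1200);  // small batch: pause so ops accumulate upstream
    maybeSleepToAllowBatching(0);     // empty batch: no pause, go wait on the cursor
}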
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() || inShutdownStrict()) { return; } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } OplogReader syncSourceReader; syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord); // no server found if (syncSourceReader.getHost().empty()) { sleepsecs(1); // if there is no one to sync from return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_pause) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; _syncSourceHost = syncSourceReader.getHost(); _replCoord->signalUpstreamUpdater(); } const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout); // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared // if sync source feedback fails. const HostAndPort source = syncSourceReader.getHost(); syncSourceReader.resetConnection(); // no more references to oplog reader from here on. // If this status is not OK after the fetcher returns from wait(), // proceed to execute rollback Status remoteOplogStartStatus = Status::OK(); auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback, this, stdx::placeholders::_1, stdx::placeholders::_3, stdx::cref(source), lastOpTimeFetched, lastHashFetched, &remoteOplogStartStatus); auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter" << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())) << "tailable" << true << "oplogReplay" << true << "awaitData" << true << "maxTimeMS" << int(fetcherMaxTimeMS.count())); Fetcher fetcher(taskExecutor, source, nsToDatabase(rsOplogName), cmdObj, fetcherCallback, rpc::makeEmptyMetadata()); auto scheduleStatus = fetcher.schedule(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } fetcher.wait(); // If the background sync is paused after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isPaused()) { return; } // Execute rollback if necessary. // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. if (!remoteOplogStartStatus.isOK()) { const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), oplogSocketTimeout)); }; return connection->get(); }; log() << "starting rollback: " << remoteOplogStartStatus; _rollback(txn, source, getConnection); stop(); } }
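// A standalone sketch of the control flow at the end of _produce above: the
// fetcher callback only records whether the remote oplog no longer starts where
// ours ends; the synchronous, potentially long rollback runs on the producer
// thread after wait(), never inside the callback. std::async stands in for the
// task-executor-driven Fetcher, and the status string is illustrative.
#include <future>
#include <iostream>
#include <string>

int main() {
    std::string remoteOplogStartStatus = "OK";

    // "Fetcher callback": detects divergence and records it, nothing more.
    auto fetch = std::async(std::launch::async, [&remoteOplogStartStatus] {
        bool remoteOplogDiverged = true;  // pretend the first batch did not match
        if (remoteOplogDiverged) {
            remoteOplogStartStatus = "OplogStartMissing: we are ahead of the sync source";
        }
    });

    fetch.wait();  // equivalent of fetcher.wait()

    if (remoteOplogStartStatus != "OK") {
        // Rollback happens here, outside the callback, with its own connection.
        std::cout << "starting rollback: " << remoteOplogStartStatus << "\n";
    }
}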
bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) {
    remoteOldestOp = r.findOne(rsoplog, Query());
    OpTime remoteTs = remoteOldestOp["ts"]._opTime();
    DEV log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
    else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
/* initial oplog application, during initial sync, after cloning. @return false on failure. this method returns an error and doesn't throw exceptions (i think). */ bool ReplSetImpl::initialSyncOplogApplication( const Member *source, OpTime applyGTE, OpTime minValid) { if( source == 0 ) return false; const string hn = source->h().toString(); OpTime ts; try { OplogReader r; if( !r.connect(hn) ) { log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog; return false; } { BSONObjBuilder q; q.appendDate("$gte", applyGTE.asDate()); BSONObjBuilder query; query.append("ts", q.done()); BSONObj queryObj = query.done(); r.query(rsoplog, queryObj); } assert( r.haveCursor() ); /* we lock outside the loop to avoid the overhead of locking on every operation. server isn't usable yet anyway! */ writelock lk(""); { if( !r.more() ) { sethbmsg("replSet initial sync error reading remote oplog"); log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog; return false; } bo op = r.next(); OpTime t = op["ts"]._opTime(); r.putBack(op); if( op.firstElement().fieldName() == string("$err") ) { log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog; return false; } uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() ); if( t > applyGTE ) { sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync"); log() << "replSet initial sync expected first optime of " << applyGTE << rsLog; log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog; return false; } } // todo : use exhaust unsigned long long n = 0; while( 1 ) { if( !r.more() ) break; BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */ { ts = o["ts"]._opTime(); /* if we have become primary, we dont' want to apply things from elsewhere anymore. assumePrimary is in the db lock so we are safe as long as we check after we locked above. */ if( (source->state() != MemberState::RS_PRIMARY && source->state() != MemberState::RS_SECONDARY) || replSetForceInitialSyncFailure ) { int f = replSetForceInitialSyncFailure; if( f > 0 ) { replSetForceInitialSyncFailure = f-1; log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog; throw DBException("forced error",0); } log() << "replSet we are now primary" << rsLog; throw DBException("primary changed",0); } if( ts >= applyGTE ) { // optimes before we started copying need not be applied. syncApply(o); } _logOpObjRS(o); /* with repl sets we write the ops to our oplog too */ } if( ++n % 100000 == 0 ) { // simple progress metering log() << "replSet initialSyncOplogApplication " << n << rsLog; } } } catch(DBException& e) { if( ts <= minValid ) { // didn't make it far enough log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog; return false; } } return true; }
/**
 * Checks if the oplog given is too far ahead to read from.
 *
 * @param r the oplog
 * @param hn the hostname (for log messages)
 *
 * @return if we are stale compared to the oplog on hn
 */
bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) {
    BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
    OpTime ts = remoteOldestOp["ts"]._opTime();
    DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
    else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
bool replset::InitialSync::oplogApplication(OplogReader& r, const Member* source, const OpTime& applyGTE, const OpTime& minValid) { const string hn = source->fullName(); try { r.tailingQueryGTE( rsoplog, applyGTE ); if ( !r.haveCursor() ) { log() << "replSet initial sync oplog query error" << rsLog; return false; } { if( !r.more() ) { sethbmsg("replSet initial sync error reading remote oplog"); log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog; return false; } bo op = r.next(); OpTime t = op["ts"]._opTime(); r.putBack(op); if( op.firstElementFieldName() == string("$err") ) { log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog; return false; } uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() ); if( t > applyGTE ) { sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync"); log() << "replSet initial sync expected first optime of " << applyGTE << rsLog; log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog; return false; } sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at " << t.toStringPretty() << " to " << minValid.toStringPretty()); } } catch(DBException& e) { log() << "replSet initial sync failing: " << e.toString() << rsLog; return false; } /* we lock outside the loop to avoid the overhead of locking on every operation. */ writelock lk(""); // todo : use exhaust OpTime ts; time_t start = time(0); unsigned long long n = 0; int fails = 0; while( ts < minValid ) { try { // There are some special cases with initial sync (see the catch block), so we // don't want to break out of this while until we've reached minvalid. Thus, we'll // keep trying to requery. if( !r.more() ) { OCCASIONALLY log() << "replSet initial sync oplog: no more records" << endl; sleepsecs(1); r.resetCursor(); r.tailingQueryGTE(rsoplog, theReplSet->lastOpTimeWritten); if ( !r.haveCursor() ) { if (fails++ > 30) { log() << "replSet initial sync tried to query oplog 30 times, giving up" << endl; return false; } } continue; } BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */ ts = o["ts"]._opTime(); { if( (source->state() != MemberState::RS_PRIMARY && source->state() != MemberState::RS_SECONDARY) || replSetForceInitialSyncFailure ) { int f = replSetForceInitialSyncFailure; if( f > 0 ) { replSetForceInitialSyncFailure = f-1; log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog; throw DBException("forced error",0); } log() << "replSet we are now primary" << rsLog; throw DBException("primary changed",0); } applyOp(o, applyGTE); } if ( ++n % 1000 == 0 ) { time_t now = time(0); if (now - start > 10) { // simple progress metering log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to " << ts.toStringPretty() << rsLog; start = now; } } getDur().commitIfNeeded(); } catch (DBException& e) { // Skip duplicate key exceptions. // These are relatively common on initial sync: if a document is inserted // early in the clone step, the insert will be replayed but the document // will probably already have been cloned over. 
if( e.getCode() == 11000 || e.getCode() == 11001 || e.getCode() == 12582) { continue; } // handle cursor not found (just requery) if( e.getCode() == 13127 ) { log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl; r.resetCursor(); r.tailingQueryGTE(rsoplog, ts); if( r.haveCursor() ) { continue; } } // TODO: handle server restart if( ts <= minValid ) { // didn't make it far enough log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog; return false; } // otherwise, whatever, we'll break out of the loop and catch // anything that's really wrong in syncTail } } return true; }
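// A minimal sketch of the error handling in the catch block above: during
// initial sync, duplicate-key errors are expected (the document may already
// have been cloned) and are skipped, a lost cursor just triggers a requery, and
// anything else is only fatal if we have not yet reached minValid. The error
// codes are the ones listed above; ApplyError, applyOne and requery are
// illustrative stand-ins.
#include <cstdint>
#include <functional>
#include <set>
#include <stdexcept>

struct ApplyError : std::runtime_error {
    int code;
    ApplyError(int c, const char* what) : std::runtime_error(what), code(c) {}
};

bool applyWithInitialSyncTolerance(const std::function<void()>& applyOne,
                                   const std::function<void()>& requery,
                                   uint64_t ts, uint64_t minValid) {
    static const std::set<int> kDuplicateKeyCodes{11000, 11001, 12582};
    try {
        applyOne();
    } catch (const ApplyError& e) {
        if (kDuplicateKeyCodes.count(e.code)) {
            return true;  // benign on initial sync: skip this op
        }
        if (e.code == 13127) {
            requery();    // cursor not found: requery from ts and continue
            return true;
        }
        if (ts <= minValid) {
            return false;  // didn't make it far enough: initial sync fails
        }
        // past minValid: leave anything really wrong to the steady-state sync path
    }
    return true;
}

int main() {
    auto dupInsert = [] { throw ApplyError(11000, "duplicate key"); };
    return applyWithInitialSyncTolerance(dupInsert, [] {}, /*ts=*/5, /*minValid=*/10) ? 0 : 1;
}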
/** * Do the initial sync for this member. There are several steps to this process: * * 1. Record start time. * 2. Clone. * 3. Set minValid1 to sync target's latest op time. * 4. Apply ops from start to minValid1, fetching missing docs as needed. * 5. Set minValid2 to sync target's latest op time. * 6. Apply ops from minValid1 to minValid2. * 7. Build indexes. * 8. Set minValid3 to sync target's latest op time. * 9. Apply ops from minValid2 to minValid3. * * At that point, initial sync is finished. Note that the oplog from the sync target is applied * three times: step 4, 6, and 8. 4 may involve refetching, 6 should not. By the end of 6, * this member should have consistent data. 8 is "cosmetic," it is only to get this member * closer to the latest op time before it can transition to secondary state. */ void ReplSetImpl::_syncDoInitialSync() { replset::InitialSync init(replset::BackgroundSync::get()); replset::SyncTail tail(replset::BackgroundSync::get()); sethbmsg("initial sync pending",0); // if this is the first node, it may have already become primary if ( box.getState().primary() ) { sethbmsg("I'm already primary, no need for initial sync",0); return; } const Member *source = getMemberToSyncTo(); if (!source) { sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0); sleepsecs(15); return; } string sourceHostname = source->h().toString(); init.setHostname(sourceHostname); OplogReader r; if( !r.connect(sourceHostname) ) { sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0); sleepsecs(15); return; } BSONObj lastOp = r.getLastOp(rsoplog); if( lastOp.isEmpty() ) { sethbmsg("initial sync couldn't read remote oplog", 0); sleepsecs(15); return; } // written by applyToHead calls BSONObj minValid; if (replSettings.fastsync) { log() << "fastsync: skipping database clone" << rsLog; // prime oplog init.oplogApplication(lastOp, lastOp); return; } else { sethbmsg("initial sync drop all databases", 0); dropAllDatabasesExceptLocal(); sethbmsg("initial sync clone all databases", 0); list<string> dbs = r.conn()->getDatabaseNames(); if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) { veto(source->fullName(), 600); sleepsecs(300); return; } sethbmsg("initial sync data copy, starting syncup",0); log() << "oplog sync 1 of 3" << endl; if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) { return; } lastOp = minValid; // Now we sync to the latest op on the sync target _again_, as we may have recloned ops // that were "from the future" compared with minValid. During this second application, // nothing should need to be recloned. log() << "oplog sync 2 of 3" << endl; if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) { return; } // data should now be consistent lastOp = minValid; sethbmsg("initial sync building indexes",0); if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) { veto(source->fullName(), 600); sleepsecs(300); return; } } log() << "oplog sync 3 of 3" << endl; if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) { return; } // --------- sethbmsg("initial sync finishing up",0); verify( !box.getState().primary() ); // wouldn't make sense if we were. { Client::WriteContext cx( "local." ); cx.ctx().db()->flushFiles(true); try { log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog; } catch(...) 
{ } theReplSet->setMinValid(minValid); cx.ctx().db()->flushFiles(true); } changeState(MemberState::RS_RECOVERING); sethbmsg("initial sync done",0); }
/** * Do the initial sync for this member. */ void ReplSetImpl::_syncDoInitialSync() { sethbmsg("initial sync pending",0); // if this is the first node, it may have already become primary if ( box.getState().primary() ) { sethbmsg("I'm already primary, no need for initial sync",0); return; } const Member *source = getMemberToSyncTo(); if (!source) { sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0); sleepsecs(15); return; } string sourceHostname = source->h().toString(); OplogReader r; if( !r.connect(sourceHostname) ) { sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0); sleepsecs(15); return; } BSONObj lastOp = r.getLastOp(rsoplog); if( lastOp.isEmpty() ) { sethbmsg("initial sync couldn't read remote oplog", 0); sleepsecs(15); return; } OpTime startingTS = lastOp["ts"]._opTime(); if (replSettings.fastsync) { log() << "fastsync: skipping database clone" << rsLog; } else { sethbmsg("initial sync drop all databases", 0); dropAllDatabasesExceptLocal(); sethbmsg("initial sync clone all databases", 0); list<string> dbs = r.conn()->getDatabaseNames(); for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) { string db = *i; if( db != "local" ) { sethbmsg( str::stream() << "initial sync cloning db: " << db , 0); bool ok; { writelock lk(db); Client::Context ctx(db); ok = clone(sourceHostname.c_str(), db); } if( !ok ) { sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0); sleepsecs(300); return; } } } } sethbmsg("initial sync query minValid",0); isyncassert( "initial sync source must remain readable throughout our initial sync", source->state().readable() ); /* our cloned copy will be strange until we apply oplog events that occurred through the process. we note that time point here. */ BSONObj minValid = r.getLastOp(rsoplog); isyncassert( "getLastOp is empty ", !minValid.isEmpty() ); OpTime mvoptime = minValid["ts"]._opTime(); assert( !mvoptime.isNull() ); /* apply relevant portion of the oplog */ { isyncassert( str::stream() << "initial sync source must remain readable throughout our initial sync [2] state now: " << source->state().toString() , source->state().readable() ); if( ! initialSyncOplogApplication(source, /*applyGTE*/startingTS, /*minValid*/mvoptime) ) { // note we assume here that this call does not throw log() << "replSet initial sync failed during applyoplog" << rsLog; emptyOplog(); // otherwise we'll be up! lastOpTimeWritten = OpTime(); lastH = 0; log() << "replSet cleaning up [1]" << rsLog; { writelock lk("local."); Client::Context cx( "local." ); cx.db()->flushFiles(true); } log() << "replSet cleaning up [2]" << rsLog; sleepsecs(5); return; } } sethbmsg("initial sync finishing up",0); assert( !box.getState().primary() ); // wouldn't make sense if we were. { writelock lk("local."); Client::Context cx( "local." ); cx.db()->flushFiles(true); try { log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog; } catch(...) { } Helpers::putSingleton("local.replset.minvalid", minValid); cx.db()->flushFiles(true); } sethbmsg("initial sync done",0); }
void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
    DBClientConnection *them = r.conn();

    // fetch all first so we needn't handle interruption in a fancy way
    unsigned long long totSize = 0;
    list< pair<DocID,bo> > goodVersions;
    bo newMinValid;

    /* fetch all the goodVersions of each document from current primary */
    DocID d;
    unsigned long long n = 0;
    try {
        for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
            d = *i;
            assert( !d._id.eoo() );
            {
                /* TODO : slow. lots of round trips. */
                n++;
                bo good = them->findOne(d.ns, d._id.wrap()).getOwned();
                totSize += good.objsize();
                uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );

                // note good might be eoo, indicating we should delete it
                goodVersions.push_back(pair<DocID,bo>(d,good));
            }
        }
        newMinValid = r.getLastOp(rsoplog);
        if( newMinValid.isEmpty() ) {
            sethbmsg("rollback error newMinValid empty?");
            return;
        }
    }
    catch(DBException& e) {
        sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(), 0);
        log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
        throw e;
    }

    MemoryMappedFile::flushAll(true);

    sethbmsg("rollback 3.5");
    if( h.rbid != getRBID(r.conn()) ) {
        // our source rolled back itself. so the data we received isn't necessarily consistent.
        sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
        return;
    }

    // update them
    sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());

    bool warn = false;

    assert( !h.commonPointOurDiskloc.isNull() );

    dbMutex.assertWriteLocked();

    /* we have items we are writing that aren't from a point-in-time. thus best not to come
       online until we get to that point in freshness. */
    setMinValid(newMinValid);

    /** any full collection resyncs required? */
    if( !h.collectionsToResync.empty() ) {
        for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
            string ns = *i;
            sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
            Client::Context c(*i, dbpath, 0, /*doauth*/false);
            try {
                bob res;
                string errmsg;
                dropCollection(ns, errmsg, res);
                {
                    dbtemprelease r;
                    bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false);
                    if( !ok ) {
                        log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog;
                        throw "rollback error resyncing collection [1]";
                    }
                }
            }
            catch(...) {
                log() << "replset rollback error resyncing collection " << ns << rsLog;
                throw "rollback error resyncing collection [2]";
            }
        }

        /* we did more reading from primary, so check it again for a rollback (which would mess us up),
           and make minValid newer. */
        sethbmsg("rollback 4.2");
        {
            string err;
            try {
                newMinValid = r.getLastOp(rsoplog);
                if( newMinValid.isEmpty() ) {
                    err = "can't get minvalid from primary";
                }
                else {
                    setMinValid(newMinValid);
                }
            }
            catch(...) {
                err = "can't get/set minvalid";
            }
            if( h.rbid != getRBID(r.conn()) ) {
                // our source rolled back itself. so the data we received isn't necessarily consistent.
                // however, we've now done writes. thus we have a problem.
                err += "rbid at primary changed during resync/rollback";
            }
            if( !err.empty() ) {
                log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
                /* todo: reset minvalid so that we are permanently in fatal state */
                /* todo: don't be fatal, but rather, get all the data first. */
                sethbmsg("rollback error");
                throw rsfatal();
            }
        }
        sethbmsg("rollback 4.3");
    }

    sethbmsg("rollback 4.6");
    /** drop collections to drop before doing individual fixups - that might make things faster
        below actually if there were subsequent inserts to rollback */
    for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
        Client::Context c(*i, dbpath, 0, /*doauth*/false);
        try {
            bob res;
            string errmsg;
            log(1) << "replSet rollback drop: " << *i << rsLog;
            dropCollection(*i, errmsg, res);
        }
        catch(...) {
            log() << "replset rollback error dropping collection " << *i << rsLog;
        }
    }

    sethbmsg("rollback 4.7");
    Client::Context c(rsoplog, dbpath, 0, /*doauth*/false);
    NamespaceDetails *oplogDetails = nsdetails(rsoplog);
    uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);

    map<string,shared_ptr<RemoveSaver> > removeSavers;

    unsigned deletes = 0, updates = 0;
    for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
        const DocID& d = i->first;
        bo pattern = d._id.wrap(); // { _id : ... }
        try {
            assert( d.ns && *d.ns );
            if( h.collectionsToResync.count(d.ns) ) {
                /* we just synced this entire collection */
                continue;
            }

            /* keep an archive of items rolled back */
            shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
            if ( ! rs )
                rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );

            // todo: lots of overhead in context, this can be faster
            Client::Context c(d.ns, dbpath, 0, /*doauth*/false);
            if( i->second.isEmpty() ) {
                // wasn't on the primary; delete.
                /* TODO1.6 : can't delete from a capped collection. need to handle that here. */
                deletes++;
                NamespaceDetails *nsd = nsdetails(d.ns);
                if( nsd ) {
                    if( nsd->capped ) {
                        /* can't delete from a capped collection - so we truncate instead.
                           if this item must go, so must all successors!!! */
                        try {
                            /** todo: IIRC cappedTruncateAfter does not handle completely empty. todo. */
                            // this will be crazy slow if no _id index.
                            long long start = Listener::getElapsedTimeMillis();
                            DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
                            if( Listener::getElapsedTimeMillis() - start > 200 )
                                log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
                            //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
                            if( !loc.isNull() ) {
                                try {
                                    nsd->cappedTruncateAfter(d.ns, loc, true);
                                }
                                catch(DBException& e) {
                                    if( e.getCode() == 13415 ) {
                                        // hack: need to just make cappedTruncate do this...
                                        nsd->emptyCappedCollection(d.ns);
                                    }
                                    else {
                                        throw;
                                    }
                                }
                            }
                        }
                        catch(DBException& e) {
                            log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
                        }
                    }
                    else {
                        try {
                            deletes++;
                            deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
                        }
                        catch(...) {
                            log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
                        }
                    }
                    // did we just empty the collection? if so let's check if it even exists on the source.
                    if( nsd->stats.nrecords == 0 ) {
                        try {
                            string sys = cc().database()->name + ".system.namespaces";
                            bo o = them->findOne(sys, QUERY("name"<<d.ns));
                            if( o.isEmpty() ) {
                                // we should drop
                                try {
                                    bob res;
                                    string errmsg;
                                    dropCollection(d.ns, errmsg, res);
                                }
                                catch(...) {
                                    log() << "replset error rolling back collection " << d.ns << rsLog;
                                }
                            }
                        }
                        catch(DBException&) {
                            /* this isn't *that* big a deal, but is bad. */
                            log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
                        }
                    }
                }
            }
            else {
                // todo faster...
                OpDebug debug;
                updates++;
                _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
            }
        }
        catch(DBException& e) {
            log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
            warn = true;
        }
    }

    removeSavers.clear(); // this effectively closes all of them

    sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
    MemoryMappedFile::flushAll(true);
    sethbmsg("rollback 6");

    // clean up oplog
    log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
    // todo: fatal error if this throws?
    oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);

    /* reset cached lastoptimewritten and h value */
    loadLastOpTimeWritten();

    sethbmsg("rollback 7");
    MemoryMappedFile::flushAll(true);

    // done
    if( warn )
        sethbmsg("issues during syncRollback, see log");
    else
        sethbmsg("rollback done");
}
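// --- Illustrative sketch (not mongod source) ---------------------------------
// The refetch phase at the top of syncFixUp re-reads every document touched by
// the divergent ops from the sync source, keeps a running byte total, and
// aborts (uassert 13410) once that total reaches 300 MB, before any local
// writes happen. The helper below is a minimal, self-contained sketch of that
// policy; Doc, DocKey, RollbackTooLarge and the fetch callback are hypothetical
// stand-ins for BSONObj, DocID, the uassert and DBClientConnection::findOne.

#include <cstddef>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

struct Doc {
    std::string ns;      // namespace the document lives in
    std::string payload; // empty payload == document no longer exists on the source
};

typedef std::pair<std::string, std::string> DocKey; // (namespace, stringified _id)

struct RollbackTooLarge : std::runtime_error {
    RollbackTooLarge() : std::runtime_error("replSet too much data to roll back") {}
};

// Re-fetch the authoritative ("good") version of each touched document,
// enforcing the same 300 MB ceiling as the code above.
std::vector<Doc> refetchGoodVersions(const std::vector<DocKey>& toRefetch,
                                     Doc (*fetchFromSource)(const DocKey&)) {
    const std::size_t kMaxRollbackBytes = 300u * 1024u * 1024u;
    std::size_t totalBytes = 0;
    std::vector<Doc> goodVersions;
    for (std::size_t i = 0; i < toRefetch.size(); ++i) {
        Doc good = fetchFromSource(toRefetch[i]); // one round trip per document, as above
        totalBytes += good.payload.size();
        if (totalBytes >= kMaxRollbackBytes)
            throw RollbackTooLarge();             // mirrors uassert(13410, ...)
        goodVersions.push_back(good);             // empty payload => delete locally in fix-up
    }
    return goodVersions;
}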
bool ReplSetImpl::_initialSyncOplogApplication(OplogReader& r, const Member *source,
                                               const OpTime& applyGTE, const OpTime& minValid) {
    const string hn = source->fullName();
    OplogReader missingObjReader;
    try {
        r.tailingQueryGTE( rsoplog, applyGTE );
        if ( !r.haveCursor() ) {
            log() << "replSet initial sync oplog query error" << rsLog;
            return false;
        }

        {
            if( !r.more() ) {
                sethbmsg("replSet initial sync error reading remote oplog");
                log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                return false;
            }
            bo op = r.next();
            OpTime t = op["ts"]._opTime();
            r.putBack(op);

            if( op.firstElementFieldName() == string("$err") ) {
                log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                return false;
            }

            uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
            if( t > applyGTE ) {
                sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                return false;
            }

            sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                     << t.toStringPretty() << " to " << minValid.toStringPretty());
        }
    }
    catch(DBException& e) {
        log() << "replSet initial sync failing: " << e.toString() << rsLog;
        return false;
    }

    /* we lock outside the loop to avoid the overhead of locking on every operation. */
    writelock lk("");

    // todo : use exhaust
    OpTime ts;
    time_t start = time(0);
    unsigned long long n = 0;
    while( 1 ) {
        try {
            if( !r.more() )
                break;
            BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */

            ts = o["ts"]._opTime();
            {
                if( (source->state() != MemberState::RS_PRIMARY &&
                     source->state() != MemberState::RS_SECONDARY) ||
                    replSetForceInitialSyncFailure ) {

                    int f = replSetForceInitialSyncFailure;
                    if( f > 0 ) {
                        replSetForceInitialSyncFailure = f-1;
                        log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                        throw DBException("forced error",0);
                    }
                    log() << "replSet we are now primary" << rsLog;
                    throw DBException("primary changed",0);
                }

                if( ts >= applyGTE ) {
                    // optimes before we started copying need not be applied.
                    bool failedUpdate = syncApply(o);
                    if( failedUpdate ) {
                        // we don't have the object yet, which is possible on initial sync. get it.
                        log() << "replSet info adding missing object" << endl; // rare enough we can log
                        if( !missingObjReader.connect(hn) ) { // ok to call more than once
                            log() << "replSet initial sync fails, couldn't connect to " << hn << endl;
                            return false;
                        }
                        const char *ns = o.getStringField("ns");
                        BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); // might be more than just _id in the update criteria
                        BSONObj missingObj;
                        try {
                            missingObj = missingObjReader.findOne( ns, query );
                        }
                        catch(...) {
                            log() << "replSet assertion fetching missing object" << endl;
                            throw;
                        }
                        if( missingObj.isEmpty() ) {
                            log() << "replSet missing object not found on source. presumably deleted later in oplog" << endl;
                            log() << "replSet o2: " << o.getObjectField("o2").toString() << endl;
                            log() << "replSet o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
                        }
                        else {
                            Client::Context ctx(ns);
                            try {
                                DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
                                assert( !d.isNull() );
                            }
                            catch(...) {
                                log() << "replSet assertion during insert of missing object" << endl;
                                throw;
                            }
                            // now reapply the update from above
                            bool failed = syncApply(o);
                            if( failed ) {
                                log() << "replSet update still fails after adding missing object " << ns << endl;
                                assert(false);
                            }
                        }
                    }
                }
                _logOpObjRS(o); /* with repl sets we write the ops to our oplog too */
            }

            if ( ++n % 1000 == 0 ) {
                time_t now = time(0);
                if (now - start > 10) {
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to " << ts.toStringPretty() << rsLog;
                    start = now;
                }
            }

            if ( ts > minValid ) {
                break;
            }

            getDur().commitIfNeeded();
        }
        catch (DBException& e) {
            // skip duplicate key exceptions
            if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                continue;
            }

            // handle cursor not found (just requery)
            if( e.getCode() == 13127 ) {
                log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
                r.resetCursor();
                r.tailingQueryGTE(rsoplog, ts);
                if( r.haveCursor() ) {
                    continue;
                }
            }

            // TODO: handle server restart

            if( ts <= minValid ) {
                // didn't make it far enough
                log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
                return false;
            }

            // otherwise, whatever
            break;
        }
    }
    return true;
}
/* initial oplog application, during initial sync, after cloning.
   @return false on failure. this method returns an error and doesn't throw exceptions (i think).
*/
bool ReplSetImpl::initialSyncOplogApplication(const Member *source, OpTime applyGTE, OpTime minValid) {
    if( source == 0 ) return false;

    const string hn = source->h().toString();
    OplogReader r;
    try {
        if( !r.connect(hn) ) {
            log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
            return false;
        }

        r.queryGTE( rsoplog, applyGTE );
        assert( r.haveCursor() );

        {
            if( !r.more() ) {
                sethbmsg("replSet initial sync error reading remote oplog");
                log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                return false;
            }
            bo op = r.next();
            OpTime t = op["ts"]._opTime();
            r.putBack(op);

            if( op.firstElement().fieldName() == string("$err") ) {
                log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                return false;
            }

            uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
            if( t > applyGTE ) {
                sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                return false;
            }

            sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                     << t.toStringPretty() << " to " << minValid.toStringPretty());
        }
    }
    catch(DBException& e) {
        log() << "replSet initial sync failing: " << e.toString() << rsLog;
        return false;
    }

    /* we lock outside the loop to avoid the overhead of locking on every operation. */
    writelock lk("");

    // todo : use exhaust
    OpTime ts;
    time_t start = time(0);
    unsigned long long n = 0;
    while( 1 ) {
        try {
            if( !r.more() )
                break;
            BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
            {
                ts = o["ts"]._opTime();

                /* if we have become primary, we don't want to apply things from elsewhere
                   anymore. assumePrimary is in the db lock so we are safe as long as
                   we check after we locked above. */
                if( (source->state() != MemberState::RS_PRIMARY &&
                     source->state() != MemberState::RS_SECONDARY) ||
                    replSetForceInitialSyncFailure ) {

                    int f = replSetForceInitialSyncFailure;
                    if( f > 0 ) {
                        replSetForceInitialSyncFailure = f-1;
                        log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                        throw DBException("forced error",0);
                    }
                    log() << "replSet we are now primary" << rsLog;
                    throw DBException("primary changed",0);
                }

                if( ts >= applyGTE ) {
                    // optimes before we started copying need not be applied.
                    syncApply(o);
                }
                _logOpObjRS(o); /* with repl sets we write the ops to our oplog too */
            }
            if ( ++n % 1000 == 0 ) {
                time_t now = time(0);
                if (now - start > 10) {
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to " << ts.toStringPretty() << rsLog;
                    start = now;
                }
            }

            getDur().commitIfNeeded();
        }
        catch (DBException& e) {
            // skip duplicate key exceptions
            if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                continue;
            }

            // handle cursor not found (just requery)
            if( e.getCode() == 13127 ) {
                r.resetCursor();
                r.queryGTE(rsoplog, ts);
                if( r.haveCursor() ) {
                    continue;
                }
            }

            // TODO: handle server restart

            if( ts <= minValid ) {
                // didn't make it far enough
                log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
                return false;
            }

            // otherwise, whatever
            break;
        }
    }
    return true;
}
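// --- Illustrative sketch (not mongod source) ---------------------------------
// Both initial-sync variants above perform the same pre-flight check before
// applying anything: the first entry still present in the source's oplog must
// not be newer than applyGTE, the optime at which cloning began. If it is, the
// oplog wrapped during the clone and the copied data cannot be brought to a
// consistent point. The snippet below restates that check over a simplified
// optime type; OpTimeT is a hypothetical stand-in for OpTime.

#include <utility>

// (seconds, increment), compared lexicographically like OpTime
typedef std::pair<unsigned, unsigned> OpTimeT;

// true when it is still safe to start applying: the oldest op the source
// retains does not postdate the point at which we began copying data.
bool oplogStillCoversClone(const OpTimeT& firstRemoteOp, const OpTimeT& applyGTE) {
    return !(firstRemoteOp > applyGTE);
}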
unsigned ReplSetImpl::_syncRollback(OplogReader& r) {
    assert( !lockedByMe() );
    assert( !dbMutex.atLeastReadLocked() );

    sethbmsg("rollback 0");

    writelocktry lk(rsoplog, 20000);
    if( !lk.got() ) {
        sethbmsg("rollback couldn't get write lock in a reasonable time");
        return 2;
    }

    if( box.getState().secondary() ) {
        /* by doing this, we will not service reads (return an error as we aren't in secondary state).
           that perhaps is moot because of the write lock above, but that write lock probably gets
           deferred or removed or yielded later anyway.

           also, this is better for status reporting - we know what is happening. */
        box.change(MemberState::RS_ROLLBACK, _self);
    }

    HowToFixUp how;
    sethbmsg("rollback 1");
    {
        r.resetCursor();
        /*DBClientConnection us(false, 0, 0);
        string errmsg;
        if( !us.connect(HostAndPort::me().toString(),errmsg) ) {
            sethbmsg("rollback connect to self failure" + errmsg);
            return;
        }*/

        sethbmsg("rollback 2 FindCommonPoint");
        try {
            syncRollbackFindCommonPoint(r.conn(), how);
        }
        catch( const char *p ) {
            sethbmsg(string("rollback 2 error ") + p);
            return 10;
        }
        catch( rsfatal& ) {
            _fatal();
            return 2;
        }
        catch( DBException& e ) {
            sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min");
            dbtemprelease r;
            sleepsecs(60);
            throw;
        }
    }

    sethbmsg("replSet rollback 3 fixup");
    {
        incRBID();
        try {
            syncFixUp(how, r);
        }
        catch( rsfatal& ) {
            sethbmsg("rollback fixup error");
            _fatal();
            return 2;
        }
        catch(...) {
            incRBID();
            throw;
        }
        incRBID();

        /* success - leave "ROLLBACK" state
           can go to SECONDARY once minvalid is achieved */
        box.change(MemberState::RS_RECOVERING, _self);
    }

    return 0;
}
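// --- Illustrative sketch (not mongod source) ---------------------------------
// _syncRollback reports its outcome through an unsigned return code rather
// than exceptions: 0 on success (the node moves to RECOVERING), 2 when the
// attempt is fatal or the write lock could not be acquired, and 10 when
// finding the common point failed in a way that may be worth retrying. The
// wrapper below is a hypothetical caller showing one way such codes could be
// consumed; the retry-with-backoff policy is an assumption for illustration,
// not the actual ReplSetImpl behaviour.

#include <unistd.h>

bool rollbackWithRetry(unsigned (*attemptRollback)(), int maxAttempts) {
    for (int attempt = 1; attempt <= maxAttempts; ++attempt) {
        unsigned rc = attemptRollback(); // stand-in for _syncRollback(r)
        if (rc == 0)
            return true;                 // rollback finished
        if (rc == 2)
            return false;                // fatal (or lock timeout): give up; a full resync is likely needed
        sleep(attempt);                  // rc == 10: transient; back off and try again
    }
    return false;
}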