bool BackgroundSync::isRollbackRequired(OplogReader& r) {
    string hn = r.conn()->getServerAddress();

    if (!r.more()) {
        try {
            BSONObj theirLastOp = r.getLastOp(rsoplog);
            if (theirLastOp.isEmpty()) {
                log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
                sleepsecs(2);
                return true;
            }
            OpTime theirTS = theirLastOp["ts"]._opTime();
            if (theirTS < _lastOpTimeFetched) {
                log() << "replSet we are ahead of the sync source, will try to roll back"
                      << rsLog;
                theReplSet->syncRollback(r);
                return true;
            }
            /* we're not ahead?  maybe our new query got fresher data.  best to come back
               and try again */
            log() << "replSet syncTail condition 1" << rsLog;
            sleepsecs(1);
        }
        catch(DBException& e) {
            log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
            sleepsecs(2);
        }
        return true;
    }

    BSONObj o = r.nextSafe();
    OpTime ts = o["ts"]._opTime();
    long long h = o["h"].numberLong();
    if( ts != _lastOpTimeFetched || h != _lastH ) {
        log() << "replSet our last op time fetched: " << _lastOpTimeFetched.toStringPretty()
              << rsLog;
        log() << "replSet source's GTE: " << ts.toStringPretty() << rsLog;
        theReplSet->syncRollback(r);
        return true;
    }

    return false;
}
bool BackgroundSync::_rollbackIfNeeded(OperationContext* txn, OplogReader& r) {
    string hn = r.conn()->getServerAddress();

    if (!r.more()) {
        try {
            BSONObj theirLastOp = r.getLastOp(rsOplogName.c_str());
            if (theirLastOp.isEmpty()) {
                error() << "empty query result from " << hn << " oplog";
                sleepsecs(2);
                return true;
            }
            OpTime theirOpTime = extractOpTime(theirLastOp);
            if (theirOpTime < _lastOpTimeFetched) {
                log() << "we are ahead of the sync source, will try to roll back";
                syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
                return true;
            }
            /* we're not ahead?  maybe our new query got fresher data.  best to come back
               and try again */
            log() << "syncTail condition 1";
            sleepsecs(1);
        }
        catch(DBException& e) {
            error() << "querying " << hn << ' ' << e.toString();
            sleepsecs(2);
        }
        return true;
    }

    BSONObj o = r.nextSafe();
    OpTime opTime = extractOpTime(o);
    long long hash = o["h"].numberLong();
    if ( opTime != _lastOpTimeFetched || hash != _lastFetchedHash ) {
        log() << "our last op time fetched: " << _lastOpTimeFetched;
        log() << "source's GTE: " << opTime;
        syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord);
        return true;
    }

    return false;
}
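// The newer variant above calls extractOpTime(), which is not defined in this
// excerpt.  Judging by the theirLastOp["ts"]._opTime() lookup in the older
// variant, a minimal sketch (an assumption, not verbatim source) would be:
static OpTime extractOpTime(const BSONObj& op) {
    // oplog entries carry their optime in the "ts" field
    return op["ts"]._opTime();
}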
/**
 * Do the initial sync for this member.  There are several steps to this process:
 *
 *  0. Add _initialSyncFlag to minValid to tell us to restart initial sync if we
 *     crash in the middle of this procedure.
 *  1. Record start time.
 *  2. Clone.
 *  3. Set minValid1 to sync target's latest op time.
 *  4. Apply ops from start to minValid1, fetching missing docs as needed.
 *  5. Set minValid2 to sync target's latest op time.
 *  6. Apply ops from minValid1 to minValid2.
 *  7. Build indexes.
 *  8. Set minValid3 to sync target's latest op time.
 *  9. Apply ops from minValid2 to minValid3.
 * 10. Clean up minValid and remove the _initialSyncFlag field.
 *
 * At that point, initial sync is finished.  Note that the oplog from the sync target is
 * applied three times: in steps 4, 6, and 9.  Step 4 may involve refetching; step 6 should
 * not.  By the end of step 6 this member should have consistent data.  Step 9 is "cosmetic":
 * it only brings this member closer to the latest op time before it transitions to
 * secondary state.
 */
void ReplSetImpl::_syncDoInitialSync() {
    replset::InitialSync init(replset::BackgroundSync::get());
    replset::SyncTail tail(replset::BackgroundSync::get());

    sethbmsg("initial sync pending",0);

    // if this is the first node, it may have already become primary
    if ( box.getState().primary() ) {
        sethbmsg("I'm already primary, no need for initial sync",0);
        return;
    }

    const Member *source = getMemberToSyncTo();
    if (!source) {
        sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
        sleepsecs(15);
        return;
    }

    string sourceHostname = source->h().toString();
    init.setHostname(sourceHostname);
    OplogReader r;
    if( !r.connect(sourceHostname) ) {
        sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
        sleepsecs(15);
        return;
    }

    BSONObj lastOp = r.getLastOp(rsoplog);
    if( lastOp.isEmpty() ) {
        sethbmsg("initial sync couldn't read remote oplog", 0);
        sleepsecs(15);
        return;
    }

    // written by applyToHead calls
    BSONObj minValid;

    if (replSettings.fastsync) {
        log() << "fastsync: skipping database clone" << rsLog;

        // prime oplog
        init.oplogApplication(lastOp, lastOp);
        return;
    }
    else {
        // Add field to minvalid document to tell us to restart initial sync if we crash
        theReplSet->setInitialSyncFlag();

        sethbmsg("initial sync drop all databases", 0);
        dropAllDatabasesExceptLocal();

        sethbmsg("initial sync clone all databases", 0);

        list<string> dbs = r.conn()->getDatabaseNames();

        Cloner cloner;
        if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, true)) {
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }

        sethbmsg("initial sync data copy, starting syncup",0);

        log() << "oplog sync 1 of 3" << endl;
        if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
            return;
        }

        lastOp = minValid;

        // Now we sync to the latest op on the sync target _again_, as we may have recloned ops
        // that were "from the future" compared with minValid.  During this second application,
        // nothing should need to be recloned.
        log() << "oplog sync 2 of 3" << endl;
        if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) {
            return;
        }
        // data should now be consistent

        lastOp = minValid;

        sethbmsg("initial sync building indexes",0);
        if (!_syncDoInitialSync_clone(cloner, sourceHostname.c_str(), dbs, false)) {
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }
    }

    log() << "oplog sync 3 of 3" << endl;
    if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) {
        return;
    }

    // ---------

    Status status = getGlobalAuthorizationManager()->initialize();
    if (!status.isOK()) {
        warning() << "Failed to reinitialize auth data after initial sync. " << status;
        return;
    }

    sethbmsg("initial sync finishing up",0);

    verify( !box.getState().primary() ); // wouldn't make sense if we were.

    {
        Client::WriteContext cx( "local." );
        cx.ctx().db()->flushFiles(true);
        try {
            log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
        }
        catch(...) { }

        // Initial sync is now complete.  Flag this by setting minValid to the last thing
        // we synced.
        theReplSet->setMinValid(minValid);

        // Clear the initial sync flag.
        theReplSet->clearInitialSyncFlag();

        cx.ctx().db()->flushFiles(true);
    }
    {
        boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
        theReplSet->initialSyncRequested = false;
    }

    // If we just cloned & there were no ops applied, we still want the primary to know
    // where we're up to
    replset::BackgroundSync::notify();

    changeState(MemberState::RS_RECOVERING);
    sethbmsg("initial sync done",0);
}
/**
 * Do the initial sync for this member.
 */
void ReplSetImpl::_syncDoInitialSync() {
    replset::InitialSync init(replset::BackgroundSync::get());

    sethbmsg("initial sync pending",0);

    // if this is the first node, it may have already become primary
    if ( box.getState().primary() ) {
        sethbmsg("I'm already primary, no need for initial sync",0);
        return;
    }

    const Member *source = getMemberToSyncTo();
    if (!source) {
        sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
        sleepsecs(15);
        return;
    }

    string sourceHostname = source->h().toString();
    init.setHostname(sourceHostname);
    OplogReader r;
    if( !r.connect(sourceHostname) ) {
        sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
        sleepsecs(15);
        return;
    }

    BSONObj lastOp = r.getLastOp(rsoplog);
    if( lastOp.isEmpty() ) {
        sethbmsg("initial sync couldn't read remote oplog", 0);
        sleepsecs(15);
        return;
    }

    if (replSettings.fastsync) {
        log() << "fastsync: skipping database clone" << rsLog;

        // prime oplog
        init.oplogApplication(lastOp, lastOp);
        return;
    }
    else {
        sethbmsg("initial sync drop all databases", 0);
        dropAllDatabasesExceptLocal();

        sethbmsg("initial sync clone all databases", 0);

        list<string> dbs = r.conn()->getDatabaseNames();

        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }

        sethbmsg("initial sync data copy, starting syncup",0);

        BSONObj minValid;
        if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
            return;
        }

        lastOp = minValid;

        // reset state, as that "didn't count"
        emptyOplog();
        lastOpTimeWritten = OpTime();
        lastH = 0;

        sethbmsg("initial sync building indexes",0);
        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }
    }

    sethbmsg("initial sync query minValid",0);

    BSONObj minValid;
    if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) {
        return;
    }

    // ---------

    sethbmsg("initial sync finishing up",0);

    verify( !box.getState().primary() ); // wouldn't make sense if we were.

    {
        Client::WriteContext cx( "local." );
        cx.ctx().db()->flushFiles(true);
        try {
            log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
        }
        catch(...) { }
        Helpers::putSingleton("local.replset.minvalid", minValid);
        cx.ctx().db()->flushFiles(true);
    }

    changeState(MemberState::RS_RECOVERING);
    sethbmsg("initial sync done",0);
}
/**
 * Do the initial sync for this member.  There are several steps to this process:
 *
 *  1. Record start time.
 *  2. Clone.
 *  3. Set minValid1 to sync target's latest op time.
 *  4. Apply ops from start to minValid1, fetching missing docs as needed.
 *  5. Set minValid2 to sync target's latest op time.
 *  6. Apply ops from minValid1 to minValid2.
 *  7. Build indexes.
 *  8. Set minValid3 to sync target's latest op time.
 *  9. Apply ops from minValid2 to minValid3.
 *
 * At that point, initial sync is finished.  Note that the oplog from the sync target is
 * applied three times: in steps 4, 6, and 9.  Step 4 may involve refetching; step 6 should
 * not.  By the end of step 6 this member should have consistent data.  Step 9 is "cosmetic":
 * it only brings this member closer to the latest op time before it transitions to
 * secondary state.
 */
void ReplSetImpl::_syncDoInitialSync() {
    replset::InitialSync init(replset::BackgroundSync::get());
    replset::SyncTail tail(replset::BackgroundSync::get());

    sethbmsg("initial sync pending",0);

    // if this is the first node, it may have already become primary
    if ( box.getState().primary() ) {
        sethbmsg("I'm already primary, no need for initial sync",0);
        return;
    }

    const Member *source = getMemberToSyncTo();
    if (!source) {
        sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
        sleepsecs(15);
        return;
    }

    string sourceHostname = source->h().toString();
    init.setHostname(sourceHostname);
    OplogReader r;
    if( !r.connect(sourceHostname) ) {
        sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
        sleepsecs(15);
        return;
    }

    BSONObj lastOp = r.getLastOp(rsoplog);
    if( lastOp.isEmpty() ) {
        sethbmsg("initial sync couldn't read remote oplog", 0);
        sleepsecs(15);
        return;
    }

    // written by applyToHead calls
    BSONObj minValid;

    if (replSettings.fastsync) {
        log() << "fastsync: skipping database clone" << rsLog;

        // prime oplog
        init.oplogApplication(lastOp, lastOp);
        return;
    }
    else {
        sethbmsg("initial sync drop all databases", 0);
        dropAllDatabasesExceptLocal();

        sethbmsg("initial sync clone all databases", 0);

        list<string> dbs = r.conn()->getDatabaseNames();

        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }

        sethbmsg("initial sync data copy, starting syncup",0);

        log() << "oplog sync 1 of 3" << endl;
        if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
            return;
        }

        lastOp = minValid;

        // Now we sync to the latest op on the sync target _again_, as we may have recloned ops
        // that were "from the future" compared with minValid.  During this second application,
        // nothing should need to be recloned.
        log() << "oplog sync 2 of 3" << endl;
        if (!_syncDoInitialSync_applyToHead(tail, &r , source , lastOp , minValid)) {
            return;
        }
        // data should now be consistent

        lastOp = minValid;

        sethbmsg("initial sync building indexes",0);
        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }
    }

    log() << "oplog sync 3 of 3" << endl;
    if (!_syncDoInitialSync_applyToHead(tail, &r, source, lastOp, minValid)) {
        return;
    }

    // ---------

    sethbmsg("initial sync finishing up",0);

    verify( !box.getState().primary() ); // wouldn't make sense if we were.

    {
        Client::WriteContext cx( "local." );
        cx.ctx().db()->flushFiles(true);
        try {
            log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
        }
        catch(...) { }
        theReplSet->setMinValid(minValid);
        cx.ctx().db()->flushFiles(true);
    }

    changeState(MemberState::RS_RECOVERING);
    sethbmsg("initial sync done",0);
}
/**
 * Do the initial sync for this member.
 */
void ReplSetImpl::_syncDoInitialSync() {
    sethbmsg("initial sync pending",0);

    // if this is the first node, it may have already become primary
    if ( box.getState().primary() ) {
        sethbmsg("I'm already primary, no need for initial sync",0);
        return;
    }

    const Member *source = getMemberToSyncTo();
    if (!source) {
        sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
        sleepsecs(15);
        return;
    }

    string sourceHostname = source->h().toString();
    OplogReader r;
    if( !r.connect(sourceHostname) ) {
        sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
        sleepsecs(15);
        return;
    }

    BSONObj lastOp = r.getLastOp(rsoplog);
    if( lastOp.isEmpty() ) {
        sethbmsg("initial sync couldn't read remote oplog", 0);
        sleepsecs(15);
        return;
    }
    OpTime startingTS = lastOp["ts"]._opTime();

    if (replSettings.fastsync) {
        log() << "fastsync: skipping database clone" << rsLog;
    }
    else {
        sethbmsg("initial sync drop all databases", 0);
        dropAllDatabasesExceptLocal();

        sethbmsg("initial sync clone all databases", 0);

        list<string> dbs = r.conn()->getDatabaseNames();
        for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
            string db = *i;
            if( db != "local" ) {
                sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
                bool ok;
                {
                    writelock lk(db);
                    Client::Context ctx(db);
                    ok = clone(sourceHostname.c_str(), db);
                }
                if( !ok ) {
                    sethbmsg( str::stream() << "initial sync error clone of " << db
                                            << " failed sleeping 5 minutes" ,0);
                    sleepsecs(300);
                    return;
                }
            }
        }
    }

    sethbmsg("initial sync query minValid",0);

    isyncassert( "initial sync source must remain readable throughout our initial sync",
                 source->state().readable() );

    /* our cloned copy will be strange until we apply oplog events that occurred
       through the process.  we note that time point here. */
    BSONObj minValid = r.getLastOp(rsoplog);
    isyncassert( "getLastOp is empty", !minValid.isEmpty() );
    OpTime mvoptime = minValid["ts"]._opTime();
    assert( !mvoptime.isNull() );

    /* apply relevant portion of the oplog */
    {
        isyncassert( str::stream() << "initial sync source must remain readable throughout "
                                   << "our initial sync [2] state now: "
                                   << source->state().toString() ,
                     source->state().readable() );
        if( ! initialSyncOplogApplication(source, /*applyGTE*/startingTS, /*minValid*/mvoptime) ) {
            // note we assume here that this call does not throw
            log() << "replSet initial sync failed during applyoplog" << rsLog;
            emptyOplog(); // otherwise we'll be up!
            lastOpTimeWritten = OpTime();
            lastH = 0;
            log() << "replSet cleaning up [1]" << rsLog;
            {
                writelock lk("local.");
                Client::Context cx( "local." );
                cx.db()->flushFiles(true);
            }
            log() << "replSet cleaning up [2]" << rsLog;
            sleepsecs(5);
            return;
        }
    }

    sethbmsg("initial sync finishing up",0);

    assert( !box.getState().primary() ); // wouldn't make sense if we were.

    {
        writelock lk("local.");
        Client::Context cx( "local." );
        cx.db()->flushFiles(true);
        try {
            log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
        }
        catch(...) { }
        Helpers::putSingleton("local.replset.minvalid", minValid);
        cx.db()->flushFiles(true);
    }

    sethbmsg("initial sync done",0);
}
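// The oldest variant above relies on an isyncassert() helper that this excerpt
// does not define.  A plausible minimal shape, assuming it surfaces the message
// through sethbmsg() and then aborts the attempt with a uassert (the error code
// is a hypothetical placeholder):
static void isyncassert(const string& msg, bool expr) {
    if( !expr ) {
        string m = str::stream() << "initial sync " << msg;
        theReplSet->sethbmsg(m, 0);
        uasserted(13404, m); // abort this initial sync attempt; caller retries later
    }
}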
void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
    DBClientConnection *them = r.conn();

    // fetch all first so we needn't handle interruption in a fancy way
    unsigned long long totSize = 0;

    list< pair<DocID,bo> > goodVersions;

    bo newMinValid;

    /* fetch all the goodVersions of each document from current primary */
    DocID d;
    unsigned long long n = 0;
    try {
        for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
            d = *i;
            assert( !d._id.eoo() );
            {
                /* TODO : slow.  lots of round trips. */
                n++;
                bo good = them->findOne(d.ns, d._id.wrap()).getOwned();
                totSize += good.objsize();
                uassert( 13410, "replSet too much data to roll back",
                         totSize < 300 * 1024 * 1024 );

                // note good might be eoo, indicating we should delete it
                goodVersions.push_back(pair<DocID,bo>(d,good));
            }
        }
        newMinValid = r.getLastOp(rsoplog);
        if( newMinValid.isEmpty() ) {
            sethbmsg("rollback error newMinValid empty?");
            return;
        }
    }
    catch(DBException& e) {
        sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
        log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' '
              << n << '/' << h.toRefetch.size() << rsLog;
        throw e;
    }

    MemoryMappedFile::flushAll(true);

    sethbmsg("rollback 3.5");
    if( h.rbid != getRBID(r.conn()) ) {
        // our source rolled back itself.  so the data we received isn't necessarily consistent.
        sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
        return;
    }

    // update them
    sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());

    bool warn = false;

    assert( !h.commonPointOurDiskloc.isNull() );

    dbMutex.assertWriteLocked();

    /* we have items we are writing that aren't from a point-in-time.  thus best not to come
       online until we get to that point in freshness. */
    setMinValid(newMinValid);

    /** any full collection resyncs required? */
    if( !h.collectionsToResync.empty() ) {
        for( set<string>::iterator i = h.collectionsToResync.begin();
             i != h.collectionsToResync.end(); i++ ) {
            string ns = *i;
            sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
            Client::Context c(*i, dbpath, 0, /*doauth*/false);
            try {
                bob res;
                string errmsg;
                dropCollection(ns, errmsg, res);
                {
                    dbtemprelease r;
                    bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(),
                                                       errmsg, false);
                    if( !ok ) {
                        log() << "replSet rollback error resyncing collection " << ns
                              << ' ' << errmsg << rsLog;
                        throw "rollback error resyncing collection [1]";
                    }
                }
            }
            catch(...) {
                log() << "replSet rollback error resyncing collection " << ns << rsLog;
                throw "rollback error resyncing collection [2]";
            }
        }

        /* we did more reading from primary, so check it again for a rollback (which would
           mess us up), and make minValid newer. */
        sethbmsg("rollback 4.2");
        {
            string err;
            try {
                newMinValid = r.getLastOp(rsoplog);
                if( newMinValid.isEmpty() ) {
                    err = "can't get minvalid from primary";
                }
                else {
                    setMinValid(newMinValid);
                }
            }
            catch(...) {
                err = "can't get/set minvalid";
            }
            if( h.rbid != getRBID(r.conn()) ) {
                // our source rolled back itself, so the data we received isn't necessarily
                // consistent.  however, we've now done writes.  thus we have a problem.
                err += "rbid at primary changed during resync/rollback";
            }
            if( !err.empty() ) {
                log() << "replSet error rolling back : " << err
                      << ". A full resync will be necessary." << rsLog;
                /* todo: reset minvalid so that we are permanently in fatal state */
                /* todo: don't be fatal, but rather, get all the data first. */
                sethbmsg("rollback error");
                throw rsfatal();
            }
        }
        sethbmsg("rollback 4.3");
    }

    sethbmsg("rollback 4.6");
    /** drop collections to drop before doing individual fixups - that might make things
        faster below actually if there were subsequent inserts to rollback */
    for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
        Client::Context c(*i, dbpath, 0, /*doauth*/false);
        try {
            bob res;
            string errmsg;
            log(1) << "replSet rollback drop: " << *i << rsLog;
            dropCollection(*i, errmsg, res);
        }
        catch(...) {
            log() << "replSet rollback error dropping collection " << *i << rsLog;
        }
    }

    sethbmsg("rollback 4.7");
    Client::Context c(rsoplog, dbpath, 0, /*doauth*/false);
    NamespaceDetails *oplogDetails = nsdetails(rsoplog);
    uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog,
            oplogDetails);

    map<string,shared_ptr<RemoveSaver> > removeSavers;

    unsigned deletes = 0, updates = 0;
    for( list<pair<DocID,bo> >::iterator i = goodVersions.begin();
         i != goodVersions.end(); i++ ) {
        const DocID& d = i->first;
        bo pattern = d._id.wrap(); // { _id : ... }
        try {
            assert( d.ns && *d.ns );
            if( h.collectionsToResync.count(d.ns) ) {
                /* we just synced this entire collection */
                continue;
            }

            /* keep an archive of items rolled back */
            shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
            if ( ! rs )
                rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );

            // todo: lots of overhead in context, this can be faster
            Client::Context c(d.ns, dbpath, 0, /*doauth*/false);
            if( i->second.isEmpty() ) {
                // wasn't on the primary; delete.
                /* TODO1.6 : can't delete from a capped collection.  need to handle that
                   here. */
                deletes++;
                NamespaceDetails *nsd = nsdetails(d.ns);
                if( nsd ) {
                    if( nsd->capped ) {
                        /* can't delete from a capped collection - so we truncate instead.
                           if this item must go, so must all successors!!! */
                        try {
                            /** todo: IIRC cappedTruncateAfter does not handle completely
                                empty.  todo. */
                            // this will be crazy slow if there is no _id index.
                            long long start = Listener::getElapsedTimeMillis();
                            DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
                            if( Listener::getElapsedTimeMillis() - start > 200 )
                                log() << "replSet warning roll back slow no _id index for "
                                      << d.ns << " perhaps?" << rsLog;
                            // would be faster but requires index:
                            //   DiskLoc loc = Helpers::findById(nsd, pattern);
                            if( !loc.isNull() ) {
                                try {
                                    nsd->cappedTruncateAfter(d.ns, loc, true);
                                }
                                catch(DBException& e) {
                                    if( e.getCode() == 13415 ) {
                                        // hack: need to just make cappedTruncate do this...
                                        nsd->emptyCappedCollection(d.ns);
                                    }
                                    else {
                                        throw;
                                    }
                                }
                            }
                        }
                        catch(DBException& e) {
                            log() << "replSet error rolling back capped collection rec "
                                  << d.ns << ' ' << e.toString() << rsLog;
                        }
                    }
                    else {
                        try {
                            deletes++;
                            deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false,
                                          /*god*/true, rs.get() );
                        }
                        catch(...) {
                            log() << "replSet error rollback delete failed ns:" << d.ns
                                  << rsLog;
                        }
                    }
                    // did we just empty the collection?  if so let's check if it even
                    // exists on the source.
                    if( nsd->stats.nrecords == 0 ) {
                        try {
                            string sys = cc().database()->name + ".system.namespaces";
                            bo o = them->findOne(sys, QUERY("name"<<d.ns));
                            if( o.isEmpty() ) {
                                // we should drop
                                try {
                                    bob res;
                                    string errmsg;
                                    dropCollection(d.ns, errmsg, res);
                                }
                                catch(...) {
                                    log() << "replSet error rolling back collection "
                                          << d.ns << rsLog;
                                }
                            }
                        }
                        catch(DBException& ) {
                            /* this isn't *that* big a deal, but is bad. */
                            log() << "replSet warning rollback error querying for existence of "
                                  << d.ns << " at the primary, ignoring" << rsLog;
                        }
                    }
                }
            }
            else {
                // todo faster...
                OpDebug debug;
                updates++;
                _updateObjects(/*god*/true, d.ns, i->second, pattern,
                               /*upsert=*/true, /*multi=*/false, /*logtheop=*/false,
                               debug, rs.get() );
            }
        }
        catch(DBException& e) {
            log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString()
                  << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
            warn = true;
        }
    }

    removeSavers.clear(); // this effectively closes all of them

    sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
    MemoryMappedFile::flushAll(true);
    sethbmsg("rollback 6");

    // clean up oplog
    log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty()
           << rsLog;
    // todo: fatal error if this throws?
    oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);

    /* reset cached lastoptimewritten and h value */
    loadLastOpTimeWritten();

    sethbmsg("rollback 7");
    MemoryMappedFile::flushAll(true);

    // done
    if( warn )
        sethbmsg("issues during syncRollback, see log");
    else
        sethbmsg("rollback done");
}
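// syncFixUp() keys its refetch set and goodVersions list on a DocID type that is
// not shown in this excerpt.  A hypothetical reconstruction, inferred purely from
// how d.ns and d._id are used above (an (ns, _id) pair with an ordering so it can
// live in a std::set); field names mirror the usage but are not verbatim source:
struct DocID {
    const char* ns;   // namespace the document lives in
    be _id;           // the _id element identifying the document

    bool operator<(const DocID& rhs) const {
        int c = strcmp(ns, rhs.ns);
        if (c != 0)
            return c < 0;
        return _id.woCompare(rhs._id) < 0; // order by _id within a namespace
    }
};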
/**
 * Do the initial sync for this member.
 * This code could use a little refactoring; it is a bit ugly.
 */
bool ReplSetImpl::_syncDoInitialSync() {
    sethbmsg("initial sync pending",0);

    bool needsFullSync = gtidManager->getLiveState().isInitial();
    bool needGapsFilled = needsFullSync || replSettings.fastsync;

    // if this is the first node, it may have already become primary
    if ( box.getState().primary() ) {
        sethbmsg("I'm already primary, no need for initial sync",0);
        return true;
    }

    const Member *source = NULL;
    OplogReader r;
    string sourceHostname;

    // only bother making a connection if we need to connect for some reason
    if (needGapsFilled) {
        source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return false;
        }

        sourceHostname = source->h().toString();
        if( !r.connect(sourceHostname, 0) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return false;
        }
    }

    if( needsFullSync ) {
        BSONObj lastOp = r.getLastOp();
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return false;
        }

        {
            LOCK_REASON(lockReason, "repl: initial sync drop all databases");
            Lock::GlobalWrite lk(lockReason);
            Client::Transaction dropTransaction(DB_SERIALIZABLE);
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();
            dropTransaction.commit();
        }

        // now deal with creation of the oplog:
        // first delete any existing data in the oplog
        {
            LOCK_REASON(lockReason, "repl: create oplog");
            Lock::DBWrite lk("local", lockReason);
            Client::Transaction fileOpsTransaction(DB_SERIALIZABLE);
            deleteOplogFiles();
            fileOpsTransaction.commit(0);
        }

        try {
            sethbmsg("initial sync clone all databases", 0);

            shared_ptr<DBClientConnection> conn(r.conn_shared());
            RemoteTransaction rtxn(*conn, "mvcc");
            list<string> dbs = conn->getDatabaseNamesForRepl();

            //
            // Not sure if it is necessary to have a separate fileOps
            // transaction and clone transaction.  The cloneTransaction
            // has a higher chance of failing, and I don't know at the moment
            // if it is ok to do fileops successfully, and then an operation (cloning)
            // that later causes an abort.  So, to be cautious, they are separate.
            {
                LOCK_REASON(lockReason, "repl: initial sync");
                Lock::GlobalWrite lk(lockReason);
                Client::Transaction cloneTransaction(DB_SERIALIZABLE);
                bool ret = _syncDoInitialSync_clone(sourceHostname.c_str(), dbs, conn);
                if (!ret) {
                    veto(source->fullName(), 600);
                    cloneTransaction.abort();
                    sleepsecs(300);
                    return false;
                }

                // At this point, we have copied all of the data from the remote machine.
                // Now we need to copy the replication information from the remote machine's
                // local database: the entire (small) replInfo dictionary, and the necessary
                // portion of the oplog.

                // first copy the replInfo, as we will use its information
                // to determine how much of the opLog to copy
                {
                    Client::Context ctx( "local" );
                    BSONObj q;
                    cloneCollection(conn, "local", rsReplInfo, q,
                                    true,  // copyIndexes
                                    false  // logForRepl
                                    );
                    // copy entire oplog (probably overkill)
                    cloneCollection(conn, "local", rsoplog, q,
                                    true,  // copyIndexes
                                    false  // logForRepl
                                    );
                    // copy entire oplog.refs (probably overkill)
                    cloneCollection(conn, "local", rsOplogRefs, q,
                                    true,  // copyIndexes
                                    false  // logForRepl
                                    );
                }
                cloneTransaction.commit(0);
            }

            bool ok = rtxn.commit();
            verify(ok); // absolutely no reason this should fail, it was read only
            // data should now be consistent
        }
        catch (DBException &e) {
            sethbmsg("exception trying to copy data", 0);
            LOG(0) << e.getCode() << ": " << e.what() << endl;
            sleepsecs(1);
            return false;
        }
    }

    if (needGapsFilled) {
        _fillGaps(&r);
    }

    GTID dummy;
    applyMissingOpsInOplog(dummy);

    sethbmsg("initial sync done",0);
    return true;
}
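// The clone path above deliberately splits work across a fileOps transaction and a
// clone transaction, relying on RAII rollback for safety.  An illustrative-only
// sketch of that pattern, assuming Client::Transaction aborts in its destructor
// unless commit() was called (doRiskyWork() is a hypothetical stand-in):
void transactionScopeSketch() {
    Client::Transaction txn(DB_SERIALIZABLE);
    doRiskyWork();   // may throw; nothing is made durable yet
    txn.commit();    // reached only on success...
}                    // ...otherwise the destructor rolls the work back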