bool DBClientReplicaSet::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
    // Slave-ok queries are attempted against a secondary first (up to three
    // tries); any other operation -- or a query that exhausts its secondary
    // attempts -- is sent to the master instead.
    if ( toSend.operation() == dbQuery ) {
        // TODO: might be possible to do this faster by changing api
        DbMessage dbMsg( toSend );
        QueryMessage queryMsg( dbMsg );

        const bool slaveOk = ( queryMsg.queryOptions & QueryOption_SlaveOk ) != 0;
        if ( slaveOk ) {
            for ( int attempt = 0; attempt < 3; ++attempt ) {
                try {
                    DBClientConnection* slave = checkSlave();
                    if ( actualServer )
                        *actualServer = slave->getServerAddress();
                    return slave->call( toSend , response , assertOk );
                }
                catch ( DBException &e ) {
                    // Log the failed attempt, clear the reported server, and
                    // loop to try again (falling back to the master at the end).
                    LOG(1) << "can't call replica set slave " << attempt << " : " << _slaveHost << causedBy( e ) << endl;
                    if ( actualServer )
                        *actualServer = "";
                }
            }
        }
    }

    DBClientConnection* master = checkMaster();
    if ( actualServer )
        *actualServer = master->getServerAddress();
    return master->call( toSend , response , assertOk );
}
void syncFixUp(OperationContext* txn, FixUpInfo& fixUpInfo, OplogReader* oplogreader, ReplicationCoordinator* replCoord) { DBClientConnection* them = oplogreader->conn(); // fetch all first so we needn't handle interruption in a fancy way unsigned long long totalSize = 0; list< pair<DocID, BSONObj> > goodVersions; BSONObj newMinValid; // fetch all the goodVersions of each document from current primary DocID doc; unsigned long long numFetched = 0; try { for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin(); it != fixUpInfo.toRefetch.end(); it++) { doc = *it; verify(!doc._id.eoo()); { // TODO : slow. lots of round trips. numFetched++; BSONObj good = them->findOne(doc.ns, doc._id.wrap(), NULL, QueryOption_SlaveOk).getOwned(); totalSize += good.objsize(); uassert(13410, "replSet too much data to roll back", totalSize < 300 * 1024 * 1024); // note good might be eoo, indicating we should delete it goodVersions.push_back(pair<DocID, BSONObj>(doc,good)); } } newMinValid = oplogreader->getLastOp(rsOplogName); if (newMinValid.isEmpty()) { error() << "rollback error newMinValid empty?"; return; } } catch (DBException& e) { LOG(1) << "rollback re-get objects: " << e.toString(); error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' ' << numFetched << '/' << fixUpInfo.toRefetch.size(); throw e; } log() << "rollback 3.5"; if (fixUpInfo.rbid != getRBID(oplogreader->conn())) { // our source rolled back itself. so the data we received isn't necessarily consistent. warning() << "rollback rbid on source changed during rollback, cancelling this attempt"; return; } // update them log() << "rollback 4 n:" << goodVersions.size(); bool warn = false; invariant(!fixUpInfo.commonPointOurDiskloc.isNull()); invariant(txn->lockState()->isW()); // we have items we are writing that aren't from a point-in-time. thus best not to come // online until we get to that point in freshness. 
Timestamp minValid = newMinValid["ts"].timestamp(); log() << "minvalid=" << minValid.toStringLong(); setMinValid(txn, minValid); // any full collection resyncs required? if (!fixUpInfo.collectionsToResyncData.empty() || !fixUpInfo.collectionsToResyncMetadata.empty()) { for (const string& ns : fixUpInfo.collectionsToResyncData) { log() << "rollback 4.1.1 coll resync " << ns; fixUpInfo.collectionsToResyncMetadata.erase(ns); const NamespaceString nss(ns); Database* db = dbHolder().openDb(txn, nss.db().toString()); invariant(db); { WriteUnitOfWork wunit(txn); db->dropCollection(txn, ns); wunit.commit(); } { string errmsg; // This comes as a GlobalWrite lock, so there is no DB to be acquired after // resume, so we can skip the DB stability checks. Also // copyCollectionFromRemote will acquire its own database pointer, under the // appropriate locks, so just releasing and acquiring the lock is safe. invariant(txn->lockState()->isW()); Lock::TempRelease release(txn->lockState()); bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg); uassert(15909, str::stream() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg, ok); } } for (const string& ns : fixUpInfo.collectionsToResyncMetadata) { log() << "rollback 4.1.2 coll metadata resync " << ns; const NamespaceString nss(ns); auto db = dbHolder().openDb(txn, nss.db().toString()); invariant(db); auto collection = db->getCollection(ns); invariant(collection); auto cce = collection->getCatalogEntry(); const std::list<BSONObj> info = them->getCollectionInfos(nss.db().toString(), BSON("name" << nss.coll())); if (info.empty()) { // Collection dropped by "them" so we should drop it too. 
log() << ns << " not found on remote host, dropping"; fixUpInfo.toDrop.insert(ns); continue; } invariant(info.size() == 1); CollectionOptions options; auto status = options.parse(info.front()); if (!status.isOK()) { throw RSFatalException(str::stream() << "Failed to parse options " << info.front() << ": " << status.toString()); } WriteUnitOfWork wuow(txn); if (options.flagsSet || cce->getCollectionOptions(txn).flagsSet) { cce->updateFlags(txn, options.flags); } status = collection->setValidator(txn, options.validator); if (!status.isOK()) { throw RSFatalException(str::stream() << "Failed to set validator: " << status.toString()); } wuow.commit(); } // we did more reading from primary, so check it again for a rollback (which would mess // us up), and make minValid newer. log() << "rollback 4.2"; string err; try { newMinValid = oplogreader->getLastOp(rsOplogName); if (newMinValid.isEmpty()) { err = "can't get minvalid from sync source"; } else { Timestamp minValid = newMinValid["ts"].timestamp(); log() << "minvalid=" << minValid.toStringLong(); setMinValid(txn, minValid); } } catch (DBException& e) { err = "can't get/set minvalid: "; err += e.what(); } if (fixUpInfo.rbid != getRBID(oplogreader->conn())) { // our source rolled back itself. so the data we received isn't necessarily // consistent. however, we've now done writes. thus we have a problem. err += "rbid at primary changed during resync/rollback"; } if (!err.empty()) { severe() << "rolling back : " << err << ". A full resync will be necessary."; // TODO: reset minvalid so that we are permanently in fatal state // TODO: don't be fatal, but rather, get all the data first. 
throw RSFatalException(); } log() << "rollback 4.3"; } map<string,shared_ptr<Helpers::RemoveSaver> > removeSavers; log() << "rollback 4.6"; // drop collections to drop before doing individual fixups - that might make things faster // below actually if there were subsequent inserts to rollback for (set<string>::iterator it = fixUpInfo.toDrop.begin(); it != fixUpInfo.toDrop.end(); it++) { log() << "rollback drop: " << *it; Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it)); if (db) { WriteUnitOfWork wunit(txn); shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it]; if (!removeSaver) removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it)); // perform a collection scan and write all documents in the collection to disk boost::scoped_ptr<PlanExecutor> exec( InternalPlanner::collectionScan(txn, *it, db->getCollection(*it))); BSONObj curObj; PlanExecutor::ExecState execState; while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) { removeSaver->goingToDelete(curObj); } if (execState != PlanExecutor::IS_EOF) { if (execState == PlanExecutor::FAILURE && WorkingSetCommon::isValidStatusMemberObject(curObj)) { Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj); severe() << "rolling back createCollection on " << *it << " failed with " << errorStatus << ". A full resync is necessary."; } else { severe() << "rolling back createCollection on " << *it << " failed. 
A full resync is necessary."; } throw RSFatalException(); } db->dropCollection(txn, *it); wunit.commit(); } } log() << "rollback 4.7"; OldClientContext ctx(txn, rsOplogName); Collection* oplogCollection = ctx.db()->getCollection(rsOplogName); uassert(13423, str::stream() << "replSet error in rollback can't find " << rsOplogName, oplogCollection); unsigned deletes = 0, updates = 0; time_t lastProgressUpdate = time(0); time_t progressUpdateGap = 10; for (list<pair<DocID, BSONObj> >::iterator it = goodVersions.begin(); it != goodVersions.end(); it++) { time_t now = time(0); if (now - lastProgressUpdate > progressUpdateGap) { log() << deletes << " delete and " << updates << " update operations processed out of " << goodVersions.size() << " total operations"; lastProgressUpdate = now; } const DocID& doc = it->first; BSONObj pattern = doc._id.wrap(); // { _id : ... } try { verify(doc.ns && *doc.ns); if (fixUpInfo.collectionsToResyncData.count(doc.ns)) { // we just synced this entire collection continue; } // keep an archive of items rolled back shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns]; if (!removeSaver) removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns)); // todo: lots of overhead in context, this can be faster OldClientContext ctx(txn, doc.ns); // Add the doc to our rollback file BSONObj obj; Collection* collection = ctx.db()->getCollection(doc.ns); // Do not log an error when undoing an insert on a no longer existent collection. // It is likely that the collection was dropped as part of rolling back a // createCollection command and regardless, the document no longer exists. if (collection) { bool found = Helpers::findOne(txn, collection, pattern, obj, false); if (found) { removeSaver->goingToDelete(obj); } else { error() << "rollback cannot find object: " << pattern << " in namespace " << doc.ns; } } if (it->second.isEmpty()) { // wasn't on the primary; delete. // TODO 1.6 : can't delete from a capped collection. 
need to handle that here. deletes++; if (collection) { if (collection->isCapped()) { // can't delete from a capped collection - so we truncate instead. if // this item must go, so must all successors!!! try { // TODO: IIRC cappedTruncateAfter does not handle completely empty. // this will crazy slow if no _id index. long long start = Listener::getElapsedTimeMillis(); RecordId loc = Helpers::findOne(txn, collection, pattern, false); if (Listener::getElapsedTimeMillis() - start > 200) warning() << "roll back slow no _id index for " << doc.ns << " perhaps?"; // would be faster but requires index: // RecordId loc = Helpers::findById(nsd, pattern); if (!loc.isNull()) { try { collection->temp_cappedTruncateAfter(txn, loc, true); } catch (DBException& e) { if (e.getCode() == 13415) { // hack: need to just make cappedTruncate do this... MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { WriteUnitOfWork wunit(txn); uassertStatusOK(collection->truncate(txn)); wunit.commit(); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END( txn, "truncate", collection->ns().ns()); } else { throw e; } } } } catch (DBException& e) { error() << "rolling back capped collection rec " << doc.ns << ' ' << e.toString(); } } else {
/**
 * Second phase of replica set rollback: given the set of documents and
 * collections that diverged from the sync source (computed earlier into 'h'),
 * re-fetch the source's ("good") version of every affected document, then
 * rewrite local data to match -- resyncing whole collections, dropping
 * collections, deleting or upserting individual documents -- and finally
 * truncate the local oplog back to the common point.
 *
 * @param h which docs/collections to refetch, resync, or drop, plus the
 *          common oplog point/diskloc to truncate after.
 * @param r reader connected to the sync source (current primary).
 *
 * Requires the global write lock (dbMutex.assertWriteLocked() below).
 * Returns early (cancelling this attempt) when the source's rollback id
 * changes mid-way; throws rsfatal()/uasserts on unrecoverable problems.
 */
void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
    DBClientConnection *them = r.conn();

    // fetch all first so we needn't handle interruption in a fancy way
    unsigned long long totSize = 0;
    list< pair<DocID,bo> > goodVersions;
    bo newMinValid;

    /* fetch all the goodVersions of each document from current primary */
    DocID d;
    unsigned long long n = 0;
    try {
        for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
            d = *i;
            assert( !d._id.eoo() );
            {
                /* TODO : slow. lots of round trips. */
                n++;
                bo good= them->findOne(d.ns, d._id.wrap()).getOwned();
                totSize += good.objsize();
                // refuse to buffer more than 300MB of refetched documents in memory
                uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );

                // note good might be eoo, indicating we should delete it
                goodVersions.push_back(pair<DocID,bo>(d,good));
            }
        }
        newMinValid = r.getLastOp(rsoplog);
        if( newMinValid.isEmpty() ) {
            sethbmsg("rollback error newMinValid empty?");
            return;
        }
    }
    catch(DBException& e) {
        sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
        log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' '
              << n << '/' << h.toRefetch.size() << rsLog;
        throw e;
    }

    MemoryMappedFile::flushAll(true);

    sethbmsg("rollback 3.5");
    if( h.rbid != getRBID(r.conn()) ) {
        // our source rolled back itself. so the data we received isn't necessarily consistent.
        sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
        return;
    }

    // update them
    sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());

    bool warn = false;

    assert( !h.commonPointOurDiskloc.isNull() );
    dbMutex.assertWriteLocked();

    /* we have items we are writing that aren't from a point-in-time.
       thus best not to come online until we get to that point in freshness. */
    setMinValid(newMinValid);

    /** any full collection resyncs required? */
    if( !h.collectionsToResync.empty() ) {
        for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
            string ns = *i;
            sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
            Client::Context c(*i, dbpath, 0, /*doauth*/false);
            try {
                bob res;
                string errmsg;
                // drop the local copy, then recopy the whole collection from the source
                dropCollection(ns, errmsg, res);
                {
                    // NOTE(review): this dbtemprelease shadows the OplogReader
                    // parameter 'r' for the duration of the remote copy.
                    dbtemprelease r;
                    bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, bo(), errmsg, false);
                    if( !ok ) {
                        log() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg << rsLog;
                        throw "rollback error resyncing rollection [1]";
                    }
                }
            }
            catch(...) {
                log() << "replset rollback error resyncing collection " << ns << rsLog;
                throw "rollback error resyncing rollection [2]";
            }
        }

        /* we did more reading from primary, so check it again for a rollback (which
           would mess us up), and make minValid newer. */
        sethbmsg("rollback 4.2");
        {
            string err;
            try {
                newMinValid = r.getLastOp(rsoplog);
                if( newMinValid.isEmpty() ) {
                    err = "can't get minvalid from primary";
                }
                else {
                    setMinValid(newMinValid);
                }
            }
            catch(...) {
                err = "can't get/set minvalid";
            }
            if( h.rbid != getRBID(r.conn()) ) {
                // our source rolled back itself. so the data we received isn't necessarily consistent.
                // however, we've now done writes. thus we have a problem.
                err += "rbid at primary changed during resync/rollback";
            }
            if( !err.empty() ) {
                log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
                /* todo: reset minvalid so that we are permanently in fatal state */
                /* todo: don't be fatal, but rather, get all the data first. */
                sethbmsg("rollback error");
                throw rsfatal();
            }
        }
        sethbmsg("rollback 4.3");
    }

    sethbmsg("rollback 4.6");
    /** drop collections to drop before doing individual fixups - that might make
        things faster below actually if there were subsequent inserts to rollback */
    for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
        Client::Context c(*i, dbpath, 0, /*doauth*/false);
        try {
            bob res;
            string errmsg;
            log(1) << "replSet rollback drop: " << *i << rsLog;
            dropCollection(*i, errmsg, res);
        }
        catch(...) {
            // best-effort: a failed drop is logged but does not abort the rollback
            log() << "replset rollback error dropping collection " << *i << rsLog;
        }
    }

    sethbmsg("rollback 4.7");
    Client::Context c(rsoplog, dbpath, 0, /*doauth*/false);
    NamespaceDetails *oplogDetails = nsdetails(rsoplog);
    uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);

    // per-namespace archives of everything deleted/overwritten during rollback
    map<string,shared_ptr<RemoveSaver> > removeSavers;

    unsigned deletes = 0, updates = 0;
    for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
        const DocID& d = i->first;
        bo pattern = d._id.wrap(); // { _id : ... }
        try {
            assert( d.ns && *d.ns );
            if( h.collectionsToResync.count(d.ns) ) {
                /* we just synced this entire collection */
                continue;
            }

            /* keep an archive of items rolled back */
            shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
            if ( ! rs )
                rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );

            // todo: lots of overhead in context, this can be faster
            Client::Context c(d.ns, dbpath, 0, /*doauth*/false);
            if( i->second.isEmpty() ) {
                // wasn't on the primary; delete.
                /* TODO1.6 : can't delete from a capped collection. need to handle that here. */
                deletes++;
                NamespaceDetails *nsd = nsdetails(d.ns);
                if( nsd ) {
                    if( nsd->capped ) {
                        /* can't delete from a capped collection - so we truncate instead.
                           if this item must go, so must all successors!!! */
                        try {
                            /** todo: IIRC cappedTrunateAfter does not handle completely empty. todo. */
                            // this will crazy slow if no _id index.
                            long long start = Listener::getElapsedTimeMillis();
                            DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
                            if( Listener::getElapsedTimeMillis() - start > 200 )
                                log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
                            //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
                            if( !loc.isNull() ) {
                                try {
                                    nsd->cappedTruncateAfter(d.ns, loc, true);
                                }
                                catch(DBException& e) {
                                    // error 13415: truncate can't leave the collection empty;
                                    // fall back to emptying the capped collection wholesale.
                                    if( e.getCode() == 13415 ) {
                                        // hack: need to just make cappedTruncate do this...
                                        nsd->emptyCappedCollection(d.ns);
                                    }
                                    else {
                                        throw;
                                    }
                                }
                            }
                        }
                        catch(DBException& e) {
                            log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
                        }
                    }
                    else {
                        try {
                            // NOTE(review): 'deletes' was already incremented above, so
                            // this path counts each removal twice -- confirm intent.
                            deletes++;
                            deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
                        }
                        catch(...) {
                            log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
                        }
                    }
                    // did we just empty the collection? if so let's check if it even exists on the source.
                    if( nsd->stats.nrecords == 0 ) {
                        try {
                            string sys = cc().database()->name + ".system.namespaces";
                            bo o = them->findOne(sys, QUERY("name"<<d.ns));
                            if( o.isEmpty() ) {
                                // we should drop
                                try {
                                    bob res;
                                    string errmsg;
                                    dropCollection(d.ns, errmsg, res);
                                }
                                catch(...) {
                                    log() << "replset error rolling back collection " << d.ns << rsLog;
                                }
                            }
                        }
                        catch(DBException& ) {
                            /* this isn't *that* big a deal, but is bad. */
                            log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
                        }
                    }
                }
            }
            else {
                // todo faster...
                // document exists on the primary: upsert the good version locally
                OpDebug debug;
                updates++;
                _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
            }
        }
        catch(DBException& e) {
            // record the problem and keep going; 'warn' surfaces it at the end
            log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' '
                  << e.toString() << " ndeletes:" << deletes << rsLog;
            warn = true;
        }
    }

    removeSavers.clear(); // this effectively closes all of them

    sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
    MemoryMappedFile::flushAll(true);

    sethbmsg("rollback 6");

    // clean up oplog
    log(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
    // todo: fatal error if this throws?
    oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);

    /* reset cached lastoptimewritten and h value */
    loadLastOpTimeWritten();

    sethbmsg("rollback 7");
    MemoryMappedFile::flushAll(true);

    // done
    if( warn )
        sethbmsg("issues during syncRollback, see log");
    else
        sethbmsg("rollback done");
}
/**
 * Sends 'toSend' and reads the reply into 'response'.
 *
 * For dbQuery messages whose read preference permits secondary reads, the
 * request is routed to a node chosen via selectNodeUsingTags(), retrying up
 * to MAX_RETRY times; all other traffic goes to the master. Replies from the
 * master are additionally inspected for a "not master" error.
 *
 * @param toSend message to send; if a query, its read preference is
 *        extracted to decide routing.
 * @param response filled in with the reply on success.
 * @param assertOk forwarded to the underlying connection's call().
 * @param actualServer if non-NULL, set to the address of the host actually
 *        used (reset to "" when a secondary attempt throws).
 * @return false when no node could be reached or the underlying call()
 *         failed; otherwise true.
 */
bool DBClientReplicaSet::call(Message &toSend, Message &response, bool assertOk, string * actualServer) {
    const char * ns = 0;

    if (toSend.operation() == dbQuery) {
        // TODO: might be possible to do this faster by changing api
        DbMessage dm(toSend);
        QueryMessage qm(dm);
        ns = qm.ns;

        shared_ptr<ReadPreferenceSetting> readPref( _extractReadPref( qm.query, qm.queryOptions ) );
        if ( _isSecondaryQuery( ns, qm.query, *readPref ) ) {
            LOG( 3 ) << "dbclient_rs call using secondary or tagged node selection in "
                     << _getMonitor()->getName() << ", read pref is " << readPref->toBSON()
                     << " (primary : "
                     << ( _master.get() != NULL ? _master->getServerAddress() : "[not cached]" )
                     << ", lastTagged : "
                     << ( _lastSlaveOkConn.get() != NULL ?
                              _lastSlaveOkConn->getServerAddress() : "[not cached]" )
                     << ")" << endl;

            for (size_t retry = 0; retry < MAX_RETRY; retry++) {
                try {
                    DBClientConnection* conn = selectNodeUsingTags(readPref);

                    if (conn == NULL) {
                        // No node satisfies the read preference; don't retry.
                        return false;
                    }

                    if (actualServer != NULL) {
                        *actualServer = conn->getServerAddress();
                    }

                    return conn->call(toSend, response, assertOk);
                }
                catch ( const DBException& dbExcep ) {
                    LOG(1) << "can't call replica set node " << _lastSlaveOkHost << ": "
                           << causedBy( dbExcep ) << endl;

                    if ( actualServer ) *actualServer = "";

                    // Invalidate the cached slaveOk node so the next attempt
                    // re-selects a target.
                    invalidateLastSlaveOkCache();
                }
            }

            // Was not able to successfully send after max retries
            return false;
        }
    }

    LOG( 3 ) << "dbclient_rs call to primary node in " << _getMonitor()->getName() << endl;

    DBClientConnection* m = checkMaster();
    if ( actualServer ) *actualServer = m->getServerAddress();

    if ( ! m->call( toSend , response , assertOk ) )
        return false;

    // Queries that went to the master: inspect single-document replies for a
    // not-master error string and, if found, call isntMaster().
    if ( ns ) {
        QueryResult * res = (QueryResult*)response.singleData();

        if ( res->nReturned == 1 ) {
            BSONObj x(res->data() );
            if ( str::contains( ns , "$cmd" ) ) {
                // command replies carry the error in "errmsg"
                if ( isNotMasterErrorString( x["errmsg"] ) )
                    isntMaster();
            }
            else {
                // plain queries carry it in the $err field
                if ( isNotMasterErrorString( getErrField( x ) ) )
                    isntMaster();
            }
        }
    }

    return true;
}
/**
 * Lazily sends 'toSend' to an appropriate replica set member (the reply is
 * retrieved separately), recording how the request was routed in _lazyState
 * so the follow-up receive/retry logic knows which connection and op to use.
 *
 * Queries whose read preference permits secondaries are routed via
 * selectNodeUsingTags() with up to MAX_RETRY attempts; everything else goes
 * to the master.
 *
 * @param toSend message to send.
 * @param isRetry true when this is a retry of a prior say(); the existing
 *        _lazyState is then preserved rather than reset.
 * @param actualServer if non-NULL, set to the address of the host used.
 * @throws via uasserted(16380) when no usable secondary is found after
 *         MAX_RETRY attempts.
 */
void DBClientReplicaSet::say(Message& toSend, bool isRetry, string* actualServer) {
    if (!isRetry)
        _lazyState = LazyState();

    const int lastOp = toSend.operation();

    if (lastOp == dbQuery) {
        // TODO: might be possible to do this faster by changing api
        DbMessage dm(toSend);
        QueryMessage qm(dm);

        shared_ptr<ReadPreferenceSetting> readPref( _extractReadPref( qm.query, qm.queryOptions ) );
        if ( _isSecondaryQuery( qm.ns, qm.query, *readPref ) ) {
            LOG( 3 ) << "dbclient_rs say using secondary or tagged node selection in "
                     << _getMonitor()->getName() << ", read pref is " << readPref->toBSON()
                     << " (primary : "
                     << ( _master.get() != NULL ? _master->getServerAddress() : "[not cached]" )
                     << ", lastTagged : "
                     << ( _lastSlaveOkConn.get() != NULL ?
                              _lastSlaveOkConn->getServerAddress() : "[not cached]" )
                     << ")" << endl;

            string lastNodeErrMsg;

            for (size_t retry = 0; retry < MAX_RETRY; retry++) {
                _lazyState._retries = retry;
                try {
                    DBClientConnection* conn = selectNodeUsingTags(readPref);

                    if (conn == NULL) {
                        // No node satisfies the read preference; fail below.
                        break;
                    }

                    if (actualServer != NULL) {
                        *actualServer = conn->getServerAddress();
                    }

                    conn->say(toSend);

                    // Remember how this request was routed for the lazy recv.
                    _lazyState._lastOp = lastOp;
                    _lazyState._secondaryQueryOk = true;
                    _lazyState._lastClient = conn;
                }
                catch ( const DBException& DBExcep ) {
                    StringBuilder errMsgBuilder;
                    errMsgBuilder << "can't callLazy replica set node "
                                  << _lastSlaveOkHost.toString() << ": " << causedBy( DBExcep );

                    lastNodeErrMsg = errMsgBuilder.str();

                    LOG(1) << lastNodeErrMsg << endl;
                    // Invalidate the cached slaveOk node and try another.
                    invalidateLastSlaveOkCache();
                    continue;
                }

                // Sent successfully.
                return;
            }

            // Exhausted retries (or no candidate node): raise with the last
            // node error, if any, appended to the message.
            StringBuilder assertMsg;
            assertMsg << "Failed to call say, no good nodes in " << _getMonitor()->getName();
            if ( !lastNodeErrMsg.empty() ) {
                assertMsg << ", last error: " << lastNodeErrMsg;
            }

            uasserted(16380, assertMsg.str());
        }
    }

    LOG( 3 ) << "dbclient_rs say to primary node in " << _getMonitor()->getName() << endl;

    DBClientConnection* master = checkMaster();
    if (actualServer)
        *actualServer = master->getServerAddress();

    _lazyState._lastOp = lastOp;
    _lazyState._secondaryQueryOk = false;
    // Don't retry requests to primary since there is only one host to try
    _lazyState._retries = MAX_RETRY;
    _lazyState._lastClient = master;

    master->say(toSend);

    return;
}