void syncFixUp(OperationContext* txn, FixUpInfo& fixUpInfo, OplogReader* oplogreader, ReplicationCoordinator* replCoord) { DBClientConnection* them = oplogreader->conn(); // fetch all first so we needn't handle interruption in a fancy way unsigned long long totalSize = 0; list< pair<DocID, BSONObj> > goodVersions; BSONObj newMinValid; // fetch all the goodVersions of each document from current primary DocID doc; unsigned long long numFetched = 0; try { for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin(); it != fixUpInfo.toRefetch.end(); it++) { doc = *it; verify(!doc._id.eoo()); { // TODO : slow. lots of round trips. numFetched++; BSONObj good = them->findOne(doc.ns, doc._id.wrap(), NULL, QueryOption_SlaveOk).getOwned(); totalSize += good.objsize(); uassert(13410, "replSet too much data to roll back", totalSize < 300 * 1024 * 1024); // note good might be eoo, indicating we should delete it goodVersions.push_back(pair<DocID, BSONObj>(doc,good)); } } newMinValid = oplogreader->getLastOp(rsOplogName); if (newMinValid.isEmpty()) { error() << "rollback error newMinValid empty?"; return; } } catch (DBException& e) { LOG(1) << "rollback re-get objects: " << e.toString(); error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' ' << numFetched << '/' << fixUpInfo.toRefetch.size(); throw e; } log() << "rollback 3.5"; if (fixUpInfo.rbid != getRBID(oplogreader->conn())) { // our source rolled back itself. so the data we received isn't necessarily consistent. warning() << "rollback rbid on source changed during rollback, cancelling this attempt"; return; } // update them log() << "rollback 4 n:" << goodVersions.size(); bool warn = false; invariant(!fixUpInfo.commonPointOurDiskloc.isNull()); invariant(txn->lockState()->isW()); // we have items we are writing that aren't from a point-in-time. thus best not to come // online until we get to that point in freshness. Timestamp minValid = newMinValid["ts"].timestamp(); log() << "minvalid=" << minValid.toStringLong(); setMinValid(txn, minValid); // any full collection resyncs required? if (!fixUpInfo.collectionsToResyncData.empty() || !fixUpInfo.collectionsToResyncMetadata.empty()) { for (const string& ns : fixUpInfo.collectionsToResyncData) { log() << "rollback 4.1.1 coll resync " << ns; fixUpInfo.collectionsToResyncMetadata.erase(ns); const NamespaceString nss(ns); Database* db = dbHolder().openDb(txn, nss.db().toString()); invariant(db); { WriteUnitOfWork wunit(txn); db->dropCollection(txn, ns); wunit.commit(); } { string errmsg; // This comes as a GlobalWrite lock, so there is no DB to be acquired after // resume, so we can skip the DB stability checks. Also // copyCollectionFromRemote will acquire its own database pointer, under the // appropriate locks, so just releasing and acquiring the lock is safe. invariant(txn->lockState()->isW()); Lock::TempRelease release(txn->lockState()); bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg); uassert(15909, str::stream() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg, ok); } } for (const string& ns : fixUpInfo.collectionsToResyncMetadata) { log() << "rollback 4.1.2 coll metadata resync " << ns; const NamespaceString nss(ns); auto db = dbHolder().openDb(txn, nss.db().toString()); invariant(db); auto collection = db->getCollection(ns); invariant(collection); auto cce = collection->getCatalogEntry(); const std::list<BSONObj> info = them->getCollectionInfos(nss.db().toString(), BSON("name" << nss.coll())); if (info.empty()) { // Collection dropped by "them" so we should drop it too. log() << ns << " not found on remote host, dropping"; fixUpInfo.toDrop.insert(ns); continue; } invariant(info.size() == 1); CollectionOptions options; auto status = options.parse(info.front()); if (!status.isOK()) { throw RSFatalException(str::stream() << "Failed to parse options " << info.front() << ": " << status.toString()); } WriteUnitOfWork wuow(txn); if (options.flagsSet || cce->getCollectionOptions(txn).flagsSet) { cce->updateFlags(txn, options.flags); } status = collection->setValidator(txn, options.validator); if (!status.isOK()) { throw RSFatalException(str::stream() << "Failed to set validator: " << status.toString()); } wuow.commit(); } // we did more reading from primary, so check it again for a rollback (which would mess // us up), and make minValid newer. log() << "rollback 4.2"; string err; try { newMinValid = oplogreader->getLastOp(rsOplogName); if (newMinValid.isEmpty()) { err = "can't get minvalid from sync source"; } else { Timestamp minValid = newMinValid["ts"].timestamp(); log() << "minvalid=" << minValid.toStringLong(); setMinValid(txn, minValid); } } catch (DBException& e) { err = "can't get/set minvalid: "; err += e.what(); } if (fixUpInfo.rbid != getRBID(oplogreader->conn())) { // our source rolled back itself. so the data we received isn't necessarily // consistent. however, we've now done writes. thus we have a problem. err += "rbid at primary changed during resync/rollback"; } if (!err.empty()) { severe() << "rolling back : " << err << ". A full resync will be necessary."; // TODO: reset minvalid so that we are permanently in fatal state // TODO: don't be fatal, but rather, get all the data first. throw RSFatalException(); } log() << "rollback 4.3"; } map<string,shared_ptr<Helpers::RemoveSaver> > removeSavers; log() << "rollback 4.6"; // drop collections to drop before doing individual fixups - that might make things faster // below actually if there were subsequent inserts to rollback for (set<string>::iterator it = fixUpInfo.toDrop.begin(); it != fixUpInfo.toDrop.end(); it++) { log() << "rollback drop: " << *it; Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it)); if (db) { WriteUnitOfWork wunit(txn); shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it]; if (!removeSaver) removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it)); // perform a collection scan and write all documents in the collection to disk boost::scoped_ptr<PlanExecutor> exec( InternalPlanner::collectionScan(txn, *it, db->getCollection(*it))); BSONObj curObj; PlanExecutor::ExecState execState; while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) { removeSaver->goingToDelete(curObj); } if (execState != PlanExecutor::IS_EOF) { if (execState == PlanExecutor::FAILURE && WorkingSetCommon::isValidStatusMemberObject(curObj)) { Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj); severe() << "rolling back createCollection on " << *it << " failed with " << errorStatus << ". A full resync is necessary."; } else { severe() << "rolling back createCollection on " << *it << " failed. A full resync is necessary."; } throw RSFatalException(); } db->dropCollection(txn, *it); wunit.commit(); } } log() << "rollback 4.7"; OldClientContext ctx(txn, rsOplogName); Collection* oplogCollection = ctx.db()->getCollection(rsOplogName); uassert(13423, str::stream() << "replSet error in rollback can't find " << rsOplogName, oplogCollection); unsigned deletes = 0, updates = 0; time_t lastProgressUpdate = time(0); time_t progressUpdateGap = 10; for (list<pair<DocID, BSONObj> >::iterator it = goodVersions.begin(); it != goodVersions.end(); it++) { time_t now = time(0); if (now - lastProgressUpdate > progressUpdateGap) { log() << deletes << " delete and " << updates << " update operations processed out of " << goodVersions.size() << " total operations"; lastProgressUpdate = now; } const DocID& doc = it->first; BSONObj pattern = doc._id.wrap(); // { _id : ... } try { verify(doc.ns && *doc.ns); if (fixUpInfo.collectionsToResyncData.count(doc.ns)) { // we just synced this entire collection continue; } // keep an archive of items rolled back shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns]; if (!removeSaver) removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns)); // todo: lots of overhead in context, this can be faster OldClientContext ctx(txn, doc.ns); // Add the doc to our rollback file BSONObj obj; Collection* collection = ctx.db()->getCollection(doc.ns); // Do not log an error when undoing an insert on a no longer existent collection. // It is likely that the collection was dropped as part of rolling back a // createCollection command and regardless, the document no longer exists. if (collection) { bool found = Helpers::findOne(txn, collection, pattern, obj, false); if (found) { removeSaver->goingToDelete(obj); } else { error() << "rollback cannot find object: " << pattern << " in namespace " << doc.ns; } } if (it->second.isEmpty()) { // wasn't on the primary; delete. // TODO 1.6 : can't delete from a capped collection. need to handle that here. deletes++; if (collection) { if (collection->isCapped()) { // can't delete from a capped collection - so we truncate instead. if // this item must go, so must all successors!!! try { // TODO: IIRC cappedTruncateAfter does not handle completely empty. // this will crazy slow if no _id index. long long start = Listener::getElapsedTimeMillis(); RecordId loc = Helpers::findOne(txn, collection, pattern, false); if (Listener::getElapsedTimeMillis() - start > 200) warning() << "roll back slow no _id index for " << doc.ns << " perhaps?"; // would be faster but requires index: // RecordId loc = Helpers::findById(nsd, pattern); if (!loc.isNull()) { try { collection->temp_cappedTruncateAfter(txn, loc, true); } catch (DBException& e) { if (e.getCode() == 13415) { // hack: need to just make cappedTruncate do this... MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { WriteUnitOfWork wunit(txn); uassertStatusOK(collection->truncate(txn)); wunit.commit(); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END( txn, "truncate", collection->ns().ns()); } else { throw e; } } } } catch (DBException& e) { error() << "rolling back capped collection rec " << doc.ns << ' ' << e.toString(); } } else {
StatusWith<BSONObj> OplogFetcher::_onSuccessfulBatch(const Fetcher::QueryResponse& queryResponse) { // Stop fetching and return on fail point. // This fail point makes the oplog fetcher ignore the downloaded batch of operations and not // error out. The FailPointEnabled error will be caught by the AbstractOplogFetcher. if (MONGO_FAIL_POINT(stopReplProducer)) { return Status(ErrorCodes::FailPointEnabled, "stopReplProducer fail point is enabled"); } const auto& documents = queryResponse.documents; auto firstDocToApply = documents.cbegin(); if (!documents.empty()) { LOG(2) << "oplog fetcher read " << documents.size() << " operations from remote oplog starting at " << documents.front()["ts"] << " and ending at " << documents.back()["ts"]; } else { LOG(2) << "oplog fetcher read 0 operations from remote oplog"; } auto oqMetadataResult = parseOplogQueryMetadata(queryResponse); if (!oqMetadataResult.isOK()) { error() << "invalid oplog query metadata from sync source " << _getSource() << ": " << oqMetadataResult.getStatus() << ": " << queryResponse.otherFields.metadata; return oqMetadataResult.getStatus(); } auto oqMetadata = oqMetadataResult.getValue(); // This lastFetched value is the last OpTime from the previous batch. auto lastFetched = _getLastOpTimeFetched(); // Check start of remote oplog and, if necessary, stop fetcher to execute rollback. if (queryResponse.first) { auto remoteRBID = oqMetadata ? boost::make_optional(oqMetadata->getRBID()) : boost::none; auto remoteLastApplied = oqMetadata ? boost::make_optional(oqMetadata->getLastOpApplied()) : boost::none; auto status = checkRemoteOplogStart(documents, lastFetched, remoteLastApplied, _requiredRBID, remoteRBID, _requireFresherSyncSource); if (!status.isOK()) { // Stop oplog fetcher and execute rollback if necessary. return status; } LOG(1) << "oplog fetcher successfully fetched from " << _getSource(); // If this is the first batch, no rollback is needed and we don't want to enqueue the first // document, skip it. if (_startingPoint == StartingPoint::kSkipFirstDoc) { firstDocToApply++; } } auto validateResult = OplogFetcher::validateDocuments(documents, queryResponse.first, lastFetched.getTimestamp()); if (!validateResult.isOK()) { return validateResult.getStatus(); } auto info = validateResult.getValue(); // Process replset metadata. It is important that this happen after we've validated the // first batch, so we don't progress our knowledge of the commit point from a // response that triggers a rollback. rpc::ReplSetMetadata replSetMetadata; bool receivedReplMetadata = queryResponse.otherFields.metadata.hasElement(rpc::kReplSetMetadataFieldName); if (receivedReplMetadata) { const auto& metadataObj = queryResponse.otherFields.metadata; auto metadataResult = rpc::ReplSetMetadata::readFromMetadata(metadataObj); if (!metadataResult.isOK()) { error() << "invalid replication metadata from sync source " << _getSource() << ": " << metadataResult.getStatus() << ": " << metadataObj; return metadataResult.getStatus(); } replSetMetadata = metadataResult.getValue(); // We will only ever have OplogQueryMetadata if we have ReplSetMetadata, so it is safe // to call processMetadata() in this if block. _dataReplicatorExternalState->processMetadata(replSetMetadata, oqMetadata); } // Increment stats. We read all of the docs in the query. opsReadStats.increment(info.networkDocumentCount); networkByteStats.increment(info.networkDocumentBytes); // Record time for each batch. getmoreReplStats.recordMillis(durationCount<Milliseconds>(queryResponse.elapsedMillis)); // TODO: back pressure handling will be added in SERVER-23499. auto status = _enqueueDocumentsFn(firstDocToApply, documents.cend(), info); if (!status.isOK()) { return status; } if (_dataReplicatorExternalState->shouldStopFetching( _getSource(), replSetMetadata, oqMetadata)) { str::stream errMsg; errMsg << "sync source " << _getSource().toString(); errMsg << " (config version: " << replSetMetadata.getConfigVersion(); // If OplogQueryMetadata was provided, its values were used to determine if we should // stop fetching from this sync source. if (oqMetadata) { errMsg << "; last applied optime: " << oqMetadata->getLastOpApplied().toString(); errMsg << "; sync source index: " << oqMetadata->getSyncSourceIndex(); errMsg << "; primary index: " << oqMetadata->getPrimaryIndex(); } else { errMsg << "; last visible optime: " << replSetMetadata.getLastOpVisible().toString(); errMsg << "; sync source index: " << replSetMetadata.getSyncSourceIndex(); errMsg << "; primary index: " << replSetMetadata.getPrimaryIndex(); } errMsg << ") is no longer valid"; return Status(ErrorCodes::InvalidSyncSource, errMsg); } auto lastCommittedWithCurrentTerm = _dataReplicatorExternalState->getCurrentTermAndLastCommittedOpTime(); return makeGetMoreCommandObject(queryResponse.nss, queryResponse.cursorId, lastCommittedWithCurrentTerm, _getGetMoreMaxTime(), _batchSize); }