// Performs storage-side work for a node transitioning to primary: clears the appliedThrough
// marker, writes a "new primary" no-op oplog entry (protocol version 1 only), and returns the
// latest OpTime read back from the oplog. Caller must hold the global exclusive (W) lock.
OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* txn,
                                                                      bool isV1ElectionProtocol) {
    invariant(txn->lockState()->isW());

    // Clear the appliedThrough marker so on startup we'll use the top of the oplog. This must be
    // done before we add anything to our oplog.
    invariant(_storageInterface->getOplogDeleteFromPoint(txn).isNull());
    _storageInterface->setAppliedThrough(txn, {});

    if (isV1ElectionProtocol) {
        // Log a no-op so the new primary has an entry of its own; retried on
        // WriteConflictException by the macro pair.
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);

            WriteUnitOfWork wuow(txn);
            txn->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                txn,
                BSON("msg"
                     << "new primary"));
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
            txn, "logging transition to primary to oplog", "local.oplog.rs");
    }
    // Read back the top of the oplog; this is the OpTime reported for the new primary.
    const auto opTimeToReturn = fassertStatusOK(28665, loadLastOpTime(txn));

    _shardingOnTransitionToPrimaryHook(txn);
    _dropAllTempCollections(txn);

    return opTimeToReturn;
}
// Fetches the most recent oplog entry and returns the value of its hash field.
// Returns 0 when the oplog has no entries yet (e.g. before initial sync completes);
// fasserts if the oplog cannot be read or the newest entry lacks a valid hash field.
long long BackgroundSync::_readLastAppliedHash(OperationContext* txn) {
    BSONObj lastEntry;
    try {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_IX);
            Lock::DBLock dbLock(txn->lockState(), "local", MODE_X);
            const bool haveEntry = Helpers::getLast(txn, rsOplogName.c_str(), lastEntry);
            if (!haveEntry) {
                // This can happen when we are to do an initial sync. lastHash will be set
                // after the initial sync is complete.
                return 0;
            }
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "readLastAppliedHash", rsOplogName);
    } catch (const DBException& ex) {
        severe() << "Problem reading " << rsOplogName << ": " << redact(ex);
        fassertFailed(18904);
    }

    long long hashValue;
    const auto extractStatus = bsonExtractIntegerField(lastEntry, kHashFieldName, &hashValue);
    if (!extractStatus.isOK()) {
        severe() << "Most recent entry in " << rsOplogName << " is missing or has invalid \""
                 << kHashFieldName << "\" field. Oplog entry: " << redact(lastEntry) << ": "
                 << redact(extractStatus);
        fassertFailed(18902);
    }
    return hashValue;
}
// Drops the index(es) described by 'idxDescriptor' from collection 'ns', appending details to
// 'result'. Replicated (user-initiated) writes are refused with NotMaster when this node cannot
// accept writes for 'ns'. The whole operation is retried on WriteConflictException.
Status dropIndexes(OperationContext* txn,
                   const NamespaceString& ns,
                   const BSONObj& idxDescriptor,
                   BSONObjBuilder* result) {
    StringData dbName = ns.db();
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetDb autoDatabase(txn, dbName, MODE_X);

        const bool replicatedWriteOffPrimary = txn->writesAreReplicated() &&
            !repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns);
        if (replicatedWriteOffPrimary) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping indexes in "
                                        << ns.toString());
        }

        WriteUnitOfWork wunit(txn);
        const Status wrappedStatus =
            wrappedRun(txn, dbName, ns, autoDatabase.getDb(), idxDescriptor, result);
        if (!wrappedStatus.isOK()) {
            return wrappedStatus;
        }
        // Record the drop in the oplog before committing the unit of work.
        getGlobalServiceContext()->getOpObserver()->onDropIndex(
            txn, dbName.toString() + ".$cmd", idxDescriptor);
        wunit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "dropIndexes", dbName);
    return Status::OK();
}
// Drops every database on this node except "local".
// NOTE(review): the function's closing brace is not visible in this chunk; the text below ends
// immediately after the for-loop.
void dropAllDatabasesExceptLocal(OperationContext* txn) {
    // Global exclusive lock: nothing else may run while databases are being dropped.
    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite lk(txn->lockState());

    vector<string> n;
    StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
    storageEngine->listDatabases(&n);

    if (n.size() == 0)
        return;
    log() << "dropAllDatabasesExceptLocal " << n.size();

    // Drop all replication snapshots first; their underlying data is about to go away.
    repl::getGlobalReplicationCoordinator()->dropAllSnapshots();
    for (vector<string>::iterator i = n.begin(); i != n.end(); i++) {
        if (*i != "local") {
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                Database* db = dbHolder().get(txn, *i);
                // This is needed since dropDatabase can't be rolled back.
                // This is safe be replaced by "invariant(db);dropDatabase(txn, db);" once fixed
                if (db == nullptr) {
                    log() << "database disappeared after listDatabases but before drop: " << *i;
                } else {
                    Database::dropDatabase(txn, db);
                }
            }
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "dropAllDatabasesExceptLocal", *i);
        }
    }
// Reads the newest oplog entry and returns its hash field as a long long. An empty oplog yields
// 0 (expected before initial sync completes); an unreadable oplog, a missing hash field, or a
// hash field of the wrong BSON type is fatal.
long long BackgroundSync::_readLastAppliedHash(OperationContext* txn) {
    BSONObj lastOplogEntry;
    try {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_IX);
            Lock::DBLock dbLock(txn->lockState(), "local", MODE_X);
            if (!Helpers::getLast(txn, rsOplogName.c_str(), lastOplogEntry)) {
                // This can happen when we are to do an initial sync. lastHash will be set
                // after the initial sync is complete.
                return 0;
            }
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "readLastAppliedHash", rsOplogName);
    } catch (const DBException& ex) {
        severe() << "Problem reading " << rsOplogName << ": " << ex.toStatus();
        fassertFailed(18904);
    }

    const BSONElement hashField = lastOplogEntry[hashFieldName];
    if (hashField.eoo()) {
        severe() << "Most recent entry in " << rsOplogName << " missing \"" << hashFieldName
                 << "\" field";
        fassertFailed(18902);
    }
    if (hashField.type() != NumberLong) {
        severe() << "Expected type of \"" << hashFieldName << "\" in most recent " << rsOplogName
                 << " entry to have type NumberLong, but found " << typeName(hashField.type());
        fassertFailed(18903);
    }
    return hashField.safeNumberLong();
}
// dropIndexes command entry point (legacy signature carrying 'fromRepl'). Takes the database
// exclusively, gates user-initiated writes on primaryship, delegates the drop to wrappedRun(),
// and records the drop via the op observer unless the command was applied from replication.
bool run(OperationContext* txn,
         const string& dbname,
         BSONObj& jsobj,
         int,
         string& errmsg,
         BSONObjBuilder& result,
         bool fromRepl) {
    const std::string ns = parseNsCollectionRequired(dbname, jsobj);
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        ScopedTransaction transaction(txn, MODE_IX);
        AutoGetDb autoDb(txn, dbname, MODE_X);
        // Only user-initiated drops are gated on primaryship; replicated applies proceed.
        if (!fromRepl &&
            !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while dropping indexes in " << ns));
        }
        WriteUnitOfWork wunit(txn);
        bool ok = wrappedRun(txn, dbname, ns, autoDb.getDb(), jsobj, errmsg, result);
        if (!ok) {
            return false;
        }
        if (!fromRepl) {
            // Suppressed when applying a replicated drop.
            getGlobalEnvironment()->getOpObserver()->onDropIndex(txn, dbname + ".$cmd", jsobj);
        }
        wunit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "dropIndexes", dbname);
    return true;
}
// Inserts the documents in [begin, end) into the collection being bulk-loaded, feeding any
// in-progress id/secondary index builders. Each insert runs in its own WriteUnitOfWork and is
// retried on WriteConflictException. Returns the first failing insert status, releasing task
// resources on failure via _runTaskReleaseResourcesOnFailure.
//
// Fix: the retry-loop END previously referenced the member '_txn' while the BEGIN and loop body
// used the lambda's 'txn' parameter; the macro pair must operate on the same OperationContext,
// so both now use 'txn'. (Also restores the function's closing brace, absent in the original
// text.)
Status CollectionBulkLoaderImpl::insertDocuments(const std::vector<BSONObj>::const_iterator begin,
                                                 const std::vector<BSONObj>::const_iterator end) {
    int count = 0;
    return _runTaskReleaseResourcesOnFailure(
        [begin, end, &count, this](OperationContext* txn) -> Status {
            invariant(txn);

            for (auto iter = begin; iter != end; ++iter) {
                // Rebuild the indexer list each iteration; the blocks are optional members.
                std::vector<MultiIndexBlock*> indexers;
                if (_idIndexBlock) {
                    indexers.push_back(_idIndexBlock.get());
                }
                if (_secondaryIndexesBlock) {
                    indexers.push_back(_secondaryIndexesBlock.get());
                }

                MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                    WriteUnitOfWork wunit(txn);
                    const auto status = _coll->insertDocument(txn, *iter, indexers, false);
                    if (!status.isOK()) {
                        return status;
                    }
                    wunit.commit();
                }
                MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
                    txn, "CollectionBulkLoaderImpl::insertDocuments", _nss.ns());
                ++count;
            }
            return Status::OK();
        });
}
// Prepares local storage for a newly initiated replica set: creates the oplog, writes the config
// document plus an "initiating set" no-op entry in one unit of work, seeds the minValid document,
// and records the initial featureCompatibilityVersion. DBExceptions are returned as Statuses.
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config) {
    try {
        createOplog(txn);

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto msgObj = BSON("msg"
                                     << "initiating set");
            getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, msgObj);
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");

        // This initializes the minvalid document with a null "ts" because older versions (<=3.2)
        // get angry if the minValid document is present but doesn't have a "ts" field.
        // Consider removing this once we no longer need to support downgrading to 3.2.
        _storageInterface->setMinValidToAtLeast(txn, {});

        FeatureCompatibilityVersion::setIfCleanStartup(txn, _storageInterface);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
    return Status::OK();
}
// Sets the cluster featureCompatibilityVersion ("3.2" or "3.4").
// NOTE(review): fragment — the "3.2" branch opened at the end continues beyond this chunk.
// For "3.4": first build the v=2 index on admin.system.version, then update the version
// document with majority write concern, then flip the in-memory server parameter.
void FeatureCompatibilityVersion::set(OperationContext* txn, StringData version) {
    uassert(40284,
            "featureCompatibilityVersion must be '3.4' or '3.2'. See "
            "http://dochub.mongodb.org/core/3.4-feature-compatibility.",
            version == FeatureCompatibilityVersionCommandParser::kVersion34 ||
                version == FeatureCompatibilityVersionCommandParser::kVersion32);

    DBDirectClient client(txn);
    NamespaceString nss(FeatureCompatibilityVersion::kCollection);

    if (version == FeatureCompatibilityVersionCommandParser::kVersion34) {
        // We build a v=2 index on the "admin.system.version" collection as part of setting the
        // featureCompatibilityVersion to 3.4. This is a new index version that isn't supported by
        // versions of MongoDB earlier than 3.4 that will cause 3.2 secondaries to crash when it is
        // replicated.
        std::vector<BSONObj> indexSpecs{k32IncompatibleIndexSpec};

        {
            ScopedTransaction transaction(txn, MODE_IX);
            AutoGetOrCreateDb autoDB(txn, nss.db(), MODE_X);

            uassert(ErrorCodes::NotMaster,
                    str::stream() << "Cannot set featureCompatibilityVersion to '" << version
                                  << "'. Not primary while attempting to create index on: "
                                  << nss.ns(),
                    repl::ReplicationCoordinator::get(txn->getServiceContext())
                        ->canAcceptWritesFor(nss));

            IndexBuilder builder(k32IncompatibleIndexSpec, false);
            auto status = builder.buildInForeground(txn, autoDB.getDb());
            uassertStatusOK(status);

            // Log the index creation to the oplog; retried on write conflict.
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                WriteUnitOfWork wuow(txn);
                getGlobalServiceContext()->getOpObserver()->onCreateIndex(
                    txn, autoDB.getDb()->getSystemIndexesName(), k32IncompatibleIndexSpec, false);
                wuow.commit();
            }
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "FeatureCompatibilityVersion::set", nss.ns());
        }

        // We then update the featureCompatibilityVersion document stored in the
        // "admin.system.version" collection. We do this after creating the v=2 index in order to
        // maintain the invariant that if the featureCompatibilityVersion is 3.4, then
        // 'k32IncompatibleIndexSpec' index exists on the "admin.system.version" collection.
        BSONObj updateResult;
        client.runCommand(nss.db().toString(),
                          makeUpdateCommand(version, WriteConcernOptions::Majority),
                          updateResult);
        uassertStatusOK(getStatusFromCommandResult(updateResult));
        uassertStatusOK(getWriteConcernStatusFromCommandResult(updateResult));

        // We then update the value of the featureCompatibilityVersion server parameter.
        serverGlobalParams.featureCompatibility.version.store(
            ServerGlobalParams::FeatureCompatibility::Version::k34);
    } else if (version == FeatureCompatibilityVersionCommandParser::kVersion32) {
// Creates the oplog (if needed) and writes an "initiating set" no-op entry into it,
// retrying the write on WriteConflictException.
void ReplicationCoordinatorExternalStateImpl::initiateOplog(OperationContext* txn) {
    createOplog(txn);

    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        ScopedTransaction globalXact(txn, MODE_X);
        Lock::GlobalWrite globalLock(txn->lockState());

        WriteUnitOfWork wunit(txn);
        const auto initiateMsg = BSON("msg"
                                      << "initiating set");
        getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, initiateMsg);
        wunit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
}
bool getInitialSyncFlag() { OperationContextImpl txn; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { ScopedTransaction transaction(&txn, MODE_IX); Lock::DBLock lk(txn.lockState(), "local", MODE_X); BSONObj mv; bool found = Helpers::getSingleton(&txn, minvalidNS, mv); if (found) { return mv[initialSyncFlagString].trueValue(); } return false; } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(&txn, "getInitialSyncFlags", minvalidNS); }
// Upserts the replica set config document into the local config collection.
// WriteConflictException is retried by the macro pair; any other DBException is
// converted to a Status and returned.
Status ReplicationCoordinatorExternalStateImpl::storeLocalConfigDocument(OperationContext* txn,
                                                                         const BSONObj& config) {
    try {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_IX);
            Lock::DBLock dbLock(txn->lockState(), configDatabaseName, MODE_X);
            Helpers::putSingleton(txn, configCollectionName, config);
            return Status::OK();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "save replica set config", configCollectionName);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
}
// Test fixture helper: drops every database, including a manual drop of "local", which
// dropAllDatabasesExceptLocal() deliberately skips.
void ServiceContextMongoDTest::_dropAllDBs(OperationContext* txn) {
    dropAllDatabasesExceptLocal(txn);

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite lk(txn->lockState());
    AutoGetDb autoDBLocal(txn, "local", MODE_X);
    const auto localDB = autoDBLocal.getDb();
    if (localDB) {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            // Do not wrap in a WriteUnitOfWork until SERVER-17103 is addressed.
            autoDBLocal.getDb()->dropDatabase(txn, localDB);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "_dropAllDBs", "local");
    }
}
// Persists the node's last vote document into the local lastVote collection, retrying on
// write conflict. Any other storage exception is surfaced to the caller as a Status.
Status ReplicationCoordinatorExternalStateImpl::storeLocalLastVoteDocument(
    OperationContext* txn, const LastVote& lastVote) {
    const BSONObj voteDoc = lastVote.toBSON();
    try {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_IX);
            Lock::DBLock dbLock(txn->lockState(), lastVoteDatabaseName, MODE_X);
            Helpers::putSingleton(txn, lastVoteCollectionName, voteDoc);
            return Status::OK();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
            txn, "save replica set lastVote", lastVoteCollectionName);
        // Every loop iteration returns above; this silences the missing-return warning.
        MONGO_UNREACHABLE;
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
}
// Loads the locally stored replica set config document. Returns NoMatchingDocument when no
// config has been written yet; other storage errors come back as failed StatusWith values.
//
// Fix: every pass through the retry loop returns, but the compiler cannot see that through the
// macro pair; added MONGO_UNREACHABLE after the loop for a well-formed control path, matching
// storeLocalLastVoteDocument in this file (see SERVER-18219).
StatusWith<BSONObj> ReplicationCoordinatorExternalStateImpl::loadLocalConfigDocument(
    OperationContext* txn) {
    try {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            BSONObj config;
            if (!Helpers::getSingleton(txn, configCollectionName, config)) {
                return StatusWith<BSONObj>(
                    ErrorCodes::NoMatchingDocument,
                    str::stream() << "Did not find replica set configuration document in "
                                  << configCollectionName);
            }
            return StatusWith<BSONObj>(config);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "load replica set config", configCollectionName);

        MONGO_UNREACHABLE;
    } catch (const DBException& ex) {
        return StatusWith<BSONObj>(ex.toStatus());
    }
}
// Prepares local storage for replica set initiation: creates the oplog, then writes the config
// document and an "initiating set" no-op entry in one unit of work. When 'updateReplOpTime' is
// false the no-op is written via repl::_logOp directly instead of the op observer (see the
// in-line comment). DBExceptions are returned as Statuses.
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config,
                                                                         bool updateReplOpTime) {
    try {
        createOplog(txn, rsOplogName, true);

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto msgObj = BSON("msg"
                                     << "initiating set");
            if (updateReplOpTime) {
                getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, msgObj);
            } else {
                // 'updateReplOpTime' is false when called from the replSetInitiate command when the
                // server is running with replication disabled. We bypass onOpMessage to invoke
                // _logOp directly so that we can override the replication mode and keep _logO from
                // updating the replication coordinator's op time (illegal operation when
                // replication is not enabled).
                repl::oplogCheckCloseDatabase(txn, nullptr);
                repl::_logOp(txn,
                             "n",
                             "",
                             msgObj,
                             nullptr,
                             false,
                             rsOplogName,
                             ReplicationCoordinator::modeReplSet,
                             updateReplOpTime);
                repl::oplogCheckCloseDatabase(txn, nullptr);
            }
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
    return Status::OK();
}
// Attempts to repair a failed oplog application by fetching the referenced document (via
// getMissingDoc) and inserting it locally so the operation can be retried. Returns true when
// the missing document was fetched and inserted, false when it could not be found (presumably
// deleted later in the oplog).
bool SyncTail::shouldRetry(OperationContext* txn, const BSONObj& o) {
    const NamespaceString nss(o.getStringField("ns"));
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        // Take an X lock on the database in order to preclude other modifications.
        // Also, the database might not exist yet, so create it.
        AutoGetOrCreateDb autoDb(txn, nss.db(), MODE_X);
        Database* const db = autoDb.getDb();

        // we don't have the object yet, which is possible on initial sync. get it.
        log() << "adding missing object" << endl;  // rare enough we can log

        BSONObj missingObj = getMissingDoc(txn, db, o);

        if (missingObj.isEmpty()) {
            log() << "missing object not found on source."
                     " presumably deleted later in oplog";
            log() << "o2: " << o.getObjectField("o2").toString();
            log() << "o firstfield: " << o.getObjectField("o").firstElementFieldName();

            return false;
        } else {
            WriteUnitOfWork wunit(txn);

            Collection* const coll = db->getOrCreateCollection(txn, nss.toString());
            invariant(coll);

            Status status = coll->insertDocument(txn, missingObj, true);
            uassert(15917,
                    str::stream() << "failed to insert missing doc: " << status.toString(),
                    status.isOK());

            LOG(1) << "inserted missing doc: " << missingObj.toString() << endl;

            wunit.commit();
            return true;
        }
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "InsertRetry", nss.ns());

    // fixes compile errors on GCC - see SERVER-18219 for details
    MONGO_UNREACHABLE;
}
// SERVER-23299 repair pass: walks every collection in 'db' and clears the "temp" flag on any
// collection still marked temporary, making it permanent. Each flag write runs in its own unit
// of work and is retried on write conflict.
void handleSERVER23299ForDb(OperationContext* txn, Database* db) {
    log() << "Scanning " << db->name() << " db for SERVER-23299 eligibility";

    const auto dbEntry = db->getDatabaseCatalogEntry();
    list<string> collectionNames;
    dbEntry->getCollectionNamespaces(&collectionNames);

    for (const auto& collName : collectionNames) {
        const auto catalogEntry = dbEntry->getCollectionCatalogEntry(collName);
        if (!catalogEntry->getCollectionOptions(txn).temp) {
            continue;
        }

        log() << "Marking collection " << collName << " as permanent per SERVER-23299";
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            WriteUnitOfWork wunit(txn);
            catalogEntry->clearTempFlag(txn);
            wunit.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "repair SERVER-23299", catalogEntry->ns().ns());
    }

    log() << "Done scanning " << db->name() << " for SERVER-23299 eligibility";
}
// Loads the last vote document from local storage and parses it into a LastVote.
// Returns NoMatchingDocument when the document has never been written; other storage
// exceptions come back as failed StatusWith values.
StatusWith<LastVote> ReplicationCoordinatorExternalStateImpl::loadLocalLastVoteDocument(
    OperationContext* txn) {
    try {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            BSONObj lastVoteObj;
            if (!Helpers::getSingleton(txn, lastVoteCollectionName, lastVoteObj)) {
                return StatusWith<LastVote>(ErrorCodes::NoMatchingDocument,
                                            str::stream()
                                                << "Did not find replica set lastVote document in "
                                                << lastVoteCollectionName);
            }
            LastVote lastVote;
            // NOTE(review): the result of initialize() is ignored here — if it can fail on a
            // malformed document, that failure is silently dropped; confirm its signature.
            lastVote.initialize(lastVoteObj);
            return StatusWith<LastVote>(lastVote);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
            txn, "load replica set lastVote", lastVoteCollectionName);
    } catch (const DBException& ex) {
        return StatusWith<LastVote>(ex.toStatus());
    }
}
// Test fixture helper: drops all databases, then drops "local" explicitly (which
// dropAllDatabasesExceptLocal() skips), and finally closes all database holders so resources
// tracked for empty databases are not leaked at exit.
void ServiceContextMongoDTest::_dropAllDBs(OperationContext* txn) {
    dropAllDatabasesExceptLocal(txn);

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite lk(txn->lockState());
    AutoGetDb autoDBLocal(txn, "local", MODE_X);
    const auto localDB = autoDBLocal.getDb();
    if (localDB) {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            // Do not wrap in a WriteUnitOfWork until SERVER-17103 is addressed.
            autoDBLocal.getDb()->dropDatabase(txn, localDB);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "_dropAllDBs", "local");
    }

    // dropAllDatabasesExceptLocal() does not close empty databases. However the holder still
    // allocates resources to track these empty databases. These resources not released by
    // dropAllDatabasesExceptLocal() will be leaked at exit unless we call DatabaseHolder::closeAll.
    BSONObjBuilder unused;
    invariant(dbHolder().closeAll(txn, unused, false));
}
// Prepares local storage for replica set initiation: builds the oplog, then writes the config
// document plus an "initiating set" no-op entry in a single unit of work, retried on write
// conflict. DBExceptions are converted to Statuses.
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config) {
    try {
        createOplog(txn);

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction globalXact(txn, MODE_X);
            Lock::GlobalWrite globalLock(txn->lockState());

            WriteUnitOfWork wunit(txn);
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto initiateMsg = BSON("msg"
                                          << "initiating set");
            getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, initiateMsg);
            wunit.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
    return Status::OK();
}
// Confirms a linearizable read by writing a no-op into the oplog and waiting for it to be
// majority-replicated. Returns NotMaster if this node can no longer accept writes, and
// LinearizableReadConcernError if the no-op fails majority write concern.
//
// Fix: the retry-loop END string was "local.rs.oplog", a transposition of the oplog namespace;
// corrected to "local.oplog.rs", matching the collection lock above and every other retry loop
// in this file.
Status waitForLinearizableReadConcern(OperationContext* txn) {
    repl::ReplicationCoordinator* replCoord =
        repl::ReplicationCoordinator::get(txn->getClient()->getServiceContext());

    {
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock lk(txn->lockState(), "local", MODE_IX);
        Lock::CollectionLock lock(txn->lockState(), "local.oplog.rs", MODE_IX);

        if (!replCoord->canAcceptWritesForDatabase("admin")) {
            return {ErrorCodes::NotMaster,
                    "No longer primary when waiting for linearizable read concern"};
        }

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            WriteUnitOfWork uow(txn);
            txn->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                txn,
                BSON("msg"
                     << "linearizable read"));
            uow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
            txn, "waitForLinearizableReadConcern", "local.oplog.rs");
    }
    WriteConcernOptions wc = WriteConcernOptions(
        WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, 0);

    // Wait for the no-op just written to reach a majority; if that fails, the read cannot be
    // certified as linearizable.
    repl::OpTime lastOpApplied = repl::ReplClientInfo::forClient(txn->getClient()).getLastOp();
    auto awaitReplResult = replCoord->awaitReplication(txn, lastOpApplied, wc);
    if (awaitReplResult.status == ErrorCodes::WriteConcernFailed) {
        return Status(ErrorCodes::LinearizableReadConcernError,
                      "Failed to confirm that read was linearizable.");
    }
    return awaitReplResult.status;
}
// listIndexes command: validates the collection-name argument, reads all index specs from the
// catalog, stages them into a QueuedDataStage-backed plan executor, and returns the first batch
// (registering a cursor for getMore when more specs remain).
bool run(OperationContext* txn,
         const string& dbname,
         BSONObj& cmdObj,
         int,
         string& errmsg,
         BSONObjBuilder& result) {
    BSONElement first = cmdObj.firstElement();
    uassert(28528,
            str::stream() << "Argument to listIndexes must be of type String, not "
                          << typeName(first.type()),
            first.type() == String);
    StringData collectionName = first.valueStringData();
    uassert(28529,
            str::stream() << "Argument to listIndexes must be a collection name, "
                          << "not the empty string",
            !collectionName.empty());
    const NamespaceString ns(dbname, collectionName);

    const long long defaultBatchSize = std::numeric_limits<long long>::max();
    long long batchSize;
    Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize);
    if (!parseCursorStatus.isOK()) {
        return appendCommandStatus(result, parseCursorStatus);
    }

    AutoGetCollectionForRead autoColl(txn, ns);
    if (!autoColl.getDb()) {
        return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no database"));
    }

    const Collection* collection = autoColl.getCollection();
    if (!collection) {
        return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no collection"));
    }

    const CollectionCatalogEntry* cce = collection->getCatalogEntry();
    invariant(cce);

    vector<string> indexNames;
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        // Cleared each attempt so a write-conflict retry does not accumulate duplicates.
        indexNames.clear();
        cce->getAllIndexes(txn, &indexNames);
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

    std::unique_ptr<WorkingSet> ws(new WorkingSet());
    std::unique_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get()));

    for (size_t i = 0; i < indexNames.size(); i++) {
        BSONObj indexSpec;
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            indexSpec = cce->getIndexSpec(txn, indexNames[i]);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

        WorkingSetID id = ws->allocate();
        WorkingSetMember* member = ws->get(id);
        member->keyData.clear();
        member->loc = RecordId();
        // getOwned(): detach the spec from the storage snapshot it was read under.
        member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned());
        member->transitionToOwnedObj();
        root->pushBack(id);
    }

    std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "." << ns.coll();
    dassert(NamespaceString(cursorNamespace).isValid());
    dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS());
    dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes());

    auto statusWithPlanExecutor = PlanExecutor::make(
        txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
    if (!statusWithPlanExecutor.isOK()) {
        return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
    }
    std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

    BSONArrayBuilder firstBatch;

    // First batch is bounded by both the requested batchSize and the response byte limit.
    const int byteLimit = MaxBytesToReturnToClientAtOnce;
    for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit;
         objCount++) {
        BSONObj next;
        PlanExecutor::ExecState state = exec->getNext(&next, NULL);
        if (state == PlanExecutor::IS_EOF) {
            break;
        }
        invariant(state == PlanExecutor::ADVANCED);
        firstBatch.append(next);
    }

    CursorId cursorId = 0LL;
    if (!exec->isEOF()) {
        // More specs than fit in the first batch: register a cursor for getMore.
        exec->saveState();
        ClientCursor* cursor = new ClientCursor(
            CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace);
        cursorId = cursor->cursorid();
    }

    appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

    return true;
}
// Phase 2/3 of a bulk index build: drains the sorter produced during phase 1 and feeds each key
// into the storage engine's bulk builder. Depending on flags, overlong keys may be skipped and
// duplicate keys may be collected into 'dupsToDrop' instead of failing the build.
Status IndexAccessMethod::commitBulk(OperationContext* txn,
                                     std::unique_ptr<BulkBuilder> bulk,
                                     bool mayInterrupt,
                                     bool dupsAllowed,
                                     set<RecordId>* dupsToDrop) {
    Timer timer;

    std::unique_ptr<BulkBuilder::Sorter::Iterator> i(bulk->_sorter->done());

    // Progress meter registration requires the client lock; release it before the build loop.
    stdx::unique_lock<Client> lk(*txn->getClient());
    ProgressMeterHolder pm(*txn->setMessage_inlock("Index Bulk Build: (2/3) btree bottom up",
                                                   "Index: (2/3) BTree Bottom Up Progress",
                                                   bulk->_keysInserted,
                                                   10));
    lk.unlock();

    std::unique_ptr<SortedDataBuilderInterface> builder;

    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        WriteUnitOfWork wunit(txn);

        // Flag the index multikey up front if any document produced multiple keys.
        if (bulk->_everGeneratedMultipleKeys || isMultikeyFromPaths(bulk->_indexMultikeyPaths)) {
            _btreeState->setMultikey(txn, bulk->_indexMultikeyPaths);
        }

        builder.reset(_newInterface->getBulkBuilder(txn, dupsAllowed));
        wunit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "setting index multikey flag", "");

    while (i->more()) {
        if (mayInterrupt) {
            txn->checkForInterrupt();
        }

        WriteUnitOfWork wunit(txn);
        // Improve performance in the btree-building phase by disabling rollback tracking.
        // This avoids copying all the written bytes to a buffer that is only used to roll back.
        // Note that this is safe to do, as this entire index-build-in-progress will be cleaned
        // up by the index system.
        txn->recoveryUnit()->setRollbackWritesDisabled();

        // Get the next datum and add it to the builder.
        BulkBuilder::Sorter::Data d = i->next();
        Status status = builder->addKey(d.first, d.second);

        if (!status.isOK()) {
            // Overlong key that's OK to skip?
            if (status.code() == ErrorCodes::KeyTooLong && ignoreKeyTooLong(txn)) {
                continue;
            }

            // Check if this is a duplicate that's OK to skip
            if (status.code() == ErrorCodes::DuplicateKey) {
                invariant(!dupsAllowed);  // shouldn't be getting DupKey errors if dupsAllowed.

                if (dupsToDrop) {
                    dupsToDrop->insert(d.second);
                    continue;
                }
            }

            return status;
        }

        // If we're here either it's a dup and we're cool with it or the addKey went just
        // fine.
        pm.hit();
        wunit.commit();
    }

    pm.finished();

    {
        stdx::lock_guard<Client> lk(*txn->getClient());
        CurOp::get(txn)->setMessage_inlock("Index Bulk Build: (3/3) btree-middle",
                                           "Index: (3/3) BTree Middle Progress");
    }

    LOG(timer.seconds() > 10 ? 0 : 1) << "\t done building bottom layer, going to commit";

    builder->commit(mayInterrupt);
    return Status::OK();
}
// createIndexes command: validates the index specs in 'cmdObj', creates the target collection
// if needed, builds the requested indexes (backgrounded builds temporarily downgrade the DB
// lock to intent mode), and logs each index creation via the op observer.
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 string& errmsg,
                 BSONObjBuilder& result) {
    const NamespaceString ns(parseNs(dbname, cmdObj));

    Status status = userAllowedWriteNS(ns);
    if (!status.isOK())
        return appendCommandStatus(result, status);

    if (cmdObj["indexes"].type() != Array) {
        errmsg = "indexes has to be an array";
        result.append("cmdObj", cmdObj);
        return false;
    }

    std::vector<BSONObj> specs;
    {
        BSONObjIterator i(cmdObj["indexes"].Obj());
        while (i.more()) {
            BSONElement e = i.next();
            if (e.type() != Object) {
                errmsg = "everything in indexes has to be an Object";
                result.append("cmdObj", cmdObj);
                return false;
            }
            specs.push_back(e.Obj());
        }
    }

    if (specs.size() == 0) {
        errmsg = "no indexes to add";
        return false;
    }

    // check specs
    for (size_t i = 0; i < specs.size(); i++) {
        BSONObj spec = specs[i];
        if (spec["ns"].eoo()) {
            // Fill in the namespace when the client omitted it from the spec.
            spec = _addNsToSpec(ns, spec);
            specs[i] = spec;
        }

        if (spec["ns"].type() != String) {
            errmsg = "ns field must be a string";
            result.append("spec", spec);
            return false;
        }

        std::string nsFromUser = spec["ns"].String();
        if (nsFromUser.empty()) {
            errmsg = "ns field cannot be an empty string";
            result.append("spec", spec);
            return false;
        }

        if (ns != nsFromUser) {
            errmsg = str::stream() << "value of ns field '" << nsFromUser
                                   << "' doesn't match namespace " << ns.ns();
            result.append("spec", spec);
            return false;
        }
    }

    // now we know we have to create index(es)
    // Note: createIndexes command does not currently respect shard versioning.
    ScopedTransaction transaction(txn, MODE_IX);
    Lock::DBLock dbLock(txn->lockState(), ns.db(), MODE_X);
    if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
        return appendCommandStatus(
            result,
            Status(ErrorCodes::NotMaster,
                   str::stream() << "Not primary while creating indexes in " << ns.ns()));
    }

    Database* db = dbHolder().get(txn, ns.db());
    if (!db) {
        db = dbHolder().openDb(txn, ns.db());
    }

    Collection* collection = db->getCollection(ns.ns());
    if (collection) {
        result.appendBool("createdCollectionAutomatically", false);
    } else {
        // Implicitly create the collection the indexes will live on.
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            WriteUnitOfWork wunit(txn);
            collection = db->createCollection(txn, ns.ns(), CollectionOptions());
            invariant(collection);
            wunit.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());
        result.appendBool("createdCollectionAutomatically", true);
    }

    const int numIndexesBefore = collection->getIndexCatalog()->numIndexesTotal(txn);
    result.append("numIndexesBefore", numIndexesBefore);

    auto client = txn->getClient();
    // NOTE(review): guard dismissed on success below; presumably
    // setLastOpToSystemLastOpTime is only needed on early-return paths — confirm.
    ScopeGuard lastOpSetterGuard =
        MakeObjGuard(repl::ReplClientInfo::forClient(client),
                     &repl::ReplClientInfo::setLastOpToSystemLastOpTime,
                     txn);

    MultiIndexBlock indexer(txn, collection);
    indexer.allowBackgroundBuilding();
    indexer.allowInterruption();

    const size_t origSpecsSize = specs.size();
    indexer.removeExistingIndexes(&specs);

    if (specs.size() == 0) {
        result.append("numIndexesAfter", numIndexesBefore);
        result.append("note", "all indexes already exist");
        return true;
    }

    if (specs.size() != origSpecsSize) {
        result.append("note", "index already exists");
    }

    for (size_t i = 0; i < specs.size(); i++) {
        const BSONObj& spec = specs[i];
        if (spec["unique"].trueValue()) {
            status = checkUniqueIndexConstraints(txn, ns.ns(), spec["key"].Obj());

            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }
        if (spec["v"].isNumber() && spec["v"].numberInt() == 0) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::CannotCreateIndex,
                       str::stream() << "illegal index specification: " << spec << ". "
                                     << "The option v:0 cannot be passed explicitly"));
        }
    }

    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        uassertStatusOK(indexer.init(specs));
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());

    // If we're a background index, replace exclusive db lock with an intent lock, so that
    // other readers and writers can proceed during this phase.
    if (indexer.getBuildInBackground()) {
        txn->recoveryUnit()->abandonSnapshot();
        dbLock.relockWithMode(MODE_IX);
        // Primaryship may have been lost while the lock was released; re-check.
        if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while creating background indexes in "
                                     << ns.ns()));
        }
    }

    try {
        Lock::CollectionLock colLock(txn->lockState(), ns.ns(), MODE_IX);
        uassertStatusOK(indexer.insertAllDocumentsInCollection());
    } catch (const DBException& e) {
        invariant(e.getCode() != ErrorCodes::WriteConflict);
        // Must have exclusive DB lock before we clean up the index build via the
        // destructor of 'indexer'.
        if (indexer.getBuildInBackground()) {
            try {
                // This function cannot throw today, but we will preemptively prepare for
                // that day, to avoid data corruption due to lack of index cleanup.
                txn->recoveryUnit()->abandonSnapshot();
                dbLock.relockWithMode(MODE_X);
                if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
                    return appendCommandStatus(
                        result,
                        Status(ErrorCodes::NotMaster,
                               str::stream()
                                   << "Not primary while creating background indexes in "
                                   << ns.ns()
                                   << ": cleaning up index build failure due to "
                                   << e.toString()));
                }
            } catch (...) {
                std::terminate();
            }
        }
        throw;
    }
    // Need to return db lock back to exclusive, to complete the index build.
    if (indexer.getBuildInBackground()) {
        txn->recoveryUnit()->abandonSnapshot();
        dbLock.relockWithMode(MODE_X);
        uassert(ErrorCodes::NotMaster,
                str::stream() << "Not primary while completing index build in " << dbname,
                repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns));

        // The db/collection may have been dropped while the exclusive lock was released.
        Database* db = dbHolder().get(txn, ns.db());
        uassert(28551, "database dropped during index build", db);
        uassert(28552, "collection dropped during index build", db->getCollection(ns.ns()));
    }

    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        WriteUnitOfWork wunit(txn);

        indexer.commit();

        for (size_t i = 0; i < specs.size(); i++) {
            std::string systemIndexes = ns.getSystemIndexesCollection();
            getGlobalServiceContext()->getOpObserver()->onCreateIndex(
                txn, systemIndexes, specs[i]);
        }

        wunit.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());

    result.append("numIndexesAfter", collection->getIndexCatalog()->numIndexesTotal(txn));

    lastOpSetterGuard.Dismiss();

    return true;
}
// Command handler for "createIndexes": parses and validates the requested
// index specs, takes the database X lock, and implicitly creates the target
// collection when it does not exist yet.
// NOTE(review): fragment — the index-build portion of this handler continues
// beyond this chunk.
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool fromRepl = false) {
    // --- parse

    NamespaceString ns( dbname, cmdObj[name].String() );

    // Reject writes to namespaces users may not write to (e.g. reserved
    // system collections).
    Status status = userAllowedWriteNS( ns );
    if ( !status.isOK() )
        return appendCommandStatus( result, status );

    if ( cmdObj["indexes"].type() != Array ) {
        errmsg = "indexes has to be an array";
        result.append( "cmdObj", cmdObj );
        return false;
    }

    // Collect the individual index specs; every array element must be an
    // object.
    std::vector<BSONObj> specs;
    {
        BSONObjIterator i( cmdObj["indexes"].Obj() );
        while ( i.more() ) {
            BSONElement e = i.next();
            if ( e.type() != Object ) {
                errmsg = "everything in indexes has to be an Object";
                result.append( "cmdObj", cmdObj );
                return false;
            }
            specs.push_back( e.Obj() );
        }
    }

    if ( specs.size() == 0 ) {
        errmsg = "no indexes to add";
        return false;
    }

    // check specs
    for ( size_t i = 0; i < specs.size(); i++ ) {
        BSONObj spec = specs[i];
        // A spec with no "ns" field gets the command's namespace filled in.
        if ( spec["ns"].eoo() ) {
            spec = _addNsToSpec( ns, spec );
            specs[i] = spec;
        }
        if ( spec["ns"].type() != String ) {
            errmsg = "spec has no ns";
            result.append( "spec", spec );
            return false;
        }
        // Each spec must target the same namespace the command was sent for.
        if ( ns != spec["ns"].String() ) {
            errmsg = "namespace mismatch";
            result.append( "spec", spec );
            return false;
        }
    }

    // now we know we have to create index(es)
    // Note: createIndexes command does not currently respect shard versioning.

    // Intent transaction first, then the exclusive DB lock for the build /
    // implicit collection creation.
    ScopedTransaction transaction(txn, MODE_IX);
    Lock::DBLock dbLock(txn->lockState(), ns.db(), MODE_X);
    // User-initiated writes require this node to be primary; writes applied
    // by replication (fromRepl) bypass the check.
    if (!fromRepl &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
        return appendCommandStatus(
            result,
            Status(ErrorCodes::NotMaster,
                   str::stream() << "Not primary while creating indexes in " << ns.ns()));
    }

    Database* db = dbHolder().get(txn, ns.db());
    if (!db) {
        db = dbHolder().openDb(txn, ns.db());
    }

    Collection* collection = db->getCollection( ns.ns() );
    // Report to the client whether the collection had to be created implicitly.
    result.appendBool( "createdCollectionAutomatically", collection == NULL );
    if ( !collection ) {
        // Create the collection inside its own WriteUnitOfWork, retrying on
        // write conflicts; the op observer is notified only for user writes.
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            WriteUnitOfWork wunit(txn);
            collection = db->createCollection( txn, ns.ns() );
            invariant( collection );
            if (!fromRepl) {
                getGlobalEnvironment()->getOpObserver()->onCreateCollection(
                    txn, ns, CollectionOptions());
            }
            wunit.commit();
        } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());
    }
// Copies collection 'shortFrom' into a freshly created capped collection
// 'shortTo' (both inside 'db'), skipping leading documents that would not fit
// in the requested capped size anyway.
//
//   size: maximum size in bytes for the new capped collection.
//   temp: when true, the new collection is flagged temporary.
//
// Returns NamespaceNotFound if the source is missing, NamespaceExists if the
// destination already exists, or the status of the collection create;
// otherwise Status::OK() once the scan reaches EOF.
Status cloneCollectionAsCapped(OperationContext* txn,
                               Database* db,
                               const std::string& shortFrom,
                               const std::string& shortTo,
                               double size,
                               bool temp) {
    std::string fromNs = db->name() + "." + shortFrom;
    std::string toNs = db->name() + "." + shortTo;

    Collection* fromCollection = db->getCollection(fromNs);
    if (!fromCollection)
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNs << " does not exist");

    if (db->getCollection(toNs))
        return Status(ErrorCodes::NamespaceExists, "to collection already exists");

    // create new collection
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        // Start from the source collection's options but force capped:true and
        // the requested size first; appendElementsUnique then adds only the
        // source options we have not already set.
        const auto fromOptions =
            fromCollection->getCatalogEntry()->getCollectionOptions(txn).toBSON();
        OldClientContext ctx(txn, toNs);
        BSONObjBuilder spec;
        spec.appendBool("capped", true);
        spec.append("size", size);
        if (temp)
            spec.appendBool("temp", true);
        spec.appendElementsUnique(fromOptions);

        WriteUnitOfWork wunit(txn);
        Status status = userCreateNS(txn, ctx.db(), toNs, spec.done());
        if (!status.isOK())
            return status;
        wunit.commit();
    } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "cloneCollectionAsCapped", fromNs);

    Collection* toCollection = db->getCollection(toNs);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'
    long long allocatedSpaceGuess =
        std::max(static_cast<long long>(size * 2),
                 static_cast<long long>(toCollection->getRecordStore()->storageSize(txn) * 2));
    long long excessSize = fromCollection->dataSize(txn) - allocatedSpaceGuess;

    std::unique_ptr<PlanExecutor> exec(
        InternalPlanner::collectionScan(txn, fromNs, fromCollection, InternalPlanner::FORWARD));

    // Write conflicts are handled manually below (save/restore around the
    // snapshot abandon), so the executor must not retry internally.
    exec->setYieldPolicy(PlanExecutor::WRITE_CONFLICT_RETRY_ONLY);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(txn);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        // Only fetch a new document when the previous one was fully handled;
        // on a retry we re-process the document already in 'objToClone'/'loc'.
        if (!retries) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        switch (state) {
            case PlanExecutor::IS_EOF:
                return Status::OK();
            case PlanExecutor::ADVANCED: {
                // While still over budget, skip (don't copy) leading documents.
                if (excessSize > 0) {
                    // 4x is for padding, power of 2, etc...
                    excessSize -= (4 * objToClone.value().objsize());
                    continue;
                }
                break;
            }
            default:
                // Unreachable as:
                // 1) We require a read lock (at a minimum) on the "from" collection
                //    and won't yield, preventing collection drop and PlanExecutor::DEAD
                // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
                //    CollectionScan PlanStage does not have a FAILURE scenario.
                // 3) All other PlanExecutor states are handled above
                invariant(false);
        }

        try {
            // Make sure we are working with the latest version of the document.
            if (objToClone.snapshotId() != txn->recoveryUnit()->getSnapshotId() &&
                !fromCollection->findDoc(txn, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork wunit(txn);
            toCollection->insertDocument(txn, objToClone.value(), true, txn->writesAreReplicated());
            wunit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException& wce) {
            CurOp::get(txn)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            wce.logAndBackoff(retries, "cloneCollectionAsCapped", fromNs);

            // Can't use WRITE_CONFLICT_RETRY_LOOP macros since we need to save/restore exec
            // around call to abandonSnapshot.
            exec->saveState();
            txn->recoveryUnit()->abandonSnapshot();
            exec->restoreState(txn);  // Handles any WCEs internally.
        }
    }

    invariant(false);  // unreachable
}
// "Fix up" phase of replica-set rollback: refetches the authoritative version
// of every affected document from the sync source, resyncs or drops whole
// collections where required, and then removes/overwrites local documents so
// they match the source.  Throws RSFatalException when rollback can no longer
// proceed safely (a full resync is then necessary).
// Caller must hold the global write lock (asserted below).
// NOTE(review): fragment — the update/apply tail of this routine continues
// beyond this chunk.
void syncFixUp(OperationContext* txn,
               FixUpInfo& fixUpInfo,
               OplogReader* oplogreader,
               ReplicationCoordinator* replCoord) {
    DBClientConnection* them = oplogreader->conn();

    // fetch all first so we needn't handle interruption in a fancy way
    unsigned long long totalSize = 0;

    list< pair<DocID, BSONObj> > goodVersions;

    BSONObj newMinValid;

    // fetch all the goodVersions of each document from current primary
    DocID doc;
    unsigned long long numFetched = 0;
    try {
        for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin();
             it != fixUpInfo.toRefetch.end();
             it++) {
            doc = *it;
            verify(!doc._id.eoo());
            {
                // TODO : slow. lots of round trips.
                numFetched++;
                BSONObj good =
                    them->findOne(doc.ns, doc._id.wrap(), NULL, QueryOption_SlaveOk).getOwned();
                totalSize += good.objsize();
                // Hard cap on how much data we are willing to refetch for one
                // rollback attempt.
                uassert(13410, "replSet too much data to roll back", totalSize < 300 * 1024 * 1024);

                // note good might be eoo, indicating we should delete it
                goodVersions.push_back(pair<DocID, BSONObj>(doc,good));
            }
        }
        newMinValid = oplogreader->getLastOp(rsOplogName);
        if (newMinValid.isEmpty()) {
            error() << "rollback error newMinValid empty?";
            return;
        }
    } catch (DBException& e) {
        LOG(1) << "rollback re-get objects: " << e.toString();
        error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' '
                << numFetched << '/' << fixUpInfo.toRefetch.size();
        throw e;
    }

    log() << "rollback 3.5";
    if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
        // our source rolled back itself. so the data we received isn't necessarily consistent.
        warning() << "rollback rbid on source changed during rollback, cancelling this attempt";
        return;
    }

    // update them
    log() << "rollback 4 n:" << goodVersions.size();

    bool warn = false;

    invariant(!fixUpInfo.commonPointOurDiskloc.isNull());
    invariant(txn->lockState()->isW());

    // we have items we are writing that aren't from a point-in-time. thus best not to come
    // online until we get to that point in freshness.
    Timestamp minValid = newMinValid["ts"].timestamp();
    log() << "minvalid=" << minValid.toStringLong();
    setMinValid(txn, minValid);

    // any full collection resyncs required?
    if (!fixUpInfo.collectionsToResyncData.empty() ||
        !fixUpInfo.collectionsToResyncMetadata.empty()) {
        // Full data resync: drop the local collection and copy it back from
        // the sync source (this also refreshes metadata, so remove the ns from
        // the metadata-resync set).
        for (const string& ns : fixUpInfo.collectionsToResyncData) {
            log() << "rollback 4.1.1 coll resync " << ns;

            fixUpInfo.collectionsToResyncMetadata.erase(ns);

            const NamespaceString nss(ns);

            Database* db = dbHolder().openDb(txn, nss.db().toString());
            invariant(db);

            {
                WriteUnitOfWork wunit(txn);
                db->dropCollection(txn, ns);
                wunit.commit();
            }

            {
                string errmsg;

                // This comes as a GlobalWrite lock, so there is no DB to be acquired after
                // resume, so we can skip the DB stability checks. Also
                // copyCollectionFromRemote will acquire its own database pointer, under the
                // appropriate locks, so just releasing and acquiring the lock is safe.
                invariant(txn->lockState()->isW());
                Lock::TempRelease release(txn->lockState());

                bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg);
                uassert(15909,
                        str::stream() << "replSet rollback error resyncing collection " << ns
                                      << ' ' << errmsg,
                        ok);
            }
        }

        // Metadata-only resync: re-read the collection options from the sync
        // source and apply flags/validator locally.
        for (const string& ns : fixUpInfo.collectionsToResyncMetadata) {
            log() << "rollback 4.1.2 coll metadata resync " << ns;

            const NamespaceString nss(ns);
            auto db = dbHolder().openDb(txn, nss.db().toString());
            invariant(db);
            auto collection = db->getCollection(ns);
            invariant(collection);
            auto cce = collection->getCatalogEntry();

            const std::list<BSONObj> info =
                them->getCollectionInfos(nss.db().toString(), BSON("name" << nss.coll()));
            if (info.empty()) {
                // Collection dropped by "them" so we should drop it too.
                log() << ns << " not found on remote host, dropping";
                fixUpInfo.toDrop.insert(ns);
                continue;
            }

            invariant(info.size() == 1);

            CollectionOptions options;
            auto status = options.parse(info.front());
            if (!status.isOK()) {
                throw RSFatalException(str::stream() << "Failed to parse options "
                                                     << info.front() << ": "
                                                     << status.toString());
            }

            WriteUnitOfWork wuow(txn);
            if (options.flagsSet || cce->getCollectionOptions(txn).flagsSet) {
                cce->updateFlags(txn, options.flags);
            }

            status = collection->setValidator(txn, options.validator);
            if (!status.isOK()) {
                throw RSFatalException(str::stream() << "Failed to set validator: "
                                                     << status.toString());
            }
            wuow.commit();
        }

        // we did more reading from primary, so check it again for a rollback (which would mess
        // us up), and make minValid newer.
        log() << "rollback 4.2";

        string err;
        try {
            newMinValid = oplogreader->getLastOp(rsOplogName);
            if (newMinValid.isEmpty()) {
                err = "can't get minvalid from sync source";
            } else {
                Timestamp minValid = newMinValid["ts"].timestamp();
                log() << "minvalid=" << minValid.toStringLong();
                setMinValid(txn, minValid);
            }
        } catch (DBException& e) {
            err = "can't get/set minvalid: ";
            err += e.what();
        }
        if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
            // our source rolled back itself. so the data we received isn't necessarily
            // consistent. however, we've now done writes. thus we have a problem.
            err += "rbid at primary changed during resync/rollback";
        }
        if (!err.empty()) {
            severe() << "rolling back : " << err << ". A full resync will be necessary.";
            // TODO: reset minvalid so that we are permanently in fatal state
            // TODO: don't be fatal, but rather, get all the data first.
            throw RSFatalException();
        }
        log() << "rollback 4.3";
    }

    // One RemoveSaver per namespace archives every document we delete so it
    // can be recovered from the rollback files on disk.
    map<string,shared_ptr<Helpers::RemoveSaver> > removeSavers;

    log() << "rollback 4.6";
    // drop collections to drop before doing individual fixups - that might make things faster
    // below actually if there were subsequent inserts to rollback
    for (set<string>::iterator it = fixUpInfo.toDrop.begin(); it != fixUpInfo.toDrop.end(); it++) {
        log() << "rollback drop: " << *it;

        Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it));
        if (db) {
            WriteUnitOfWork wunit(txn);

            shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it];
            if (!removeSaver)
                removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it));

            // perform a collection scan and write all documents in the collection to disk
            boost::scoped_ptr<PlanExecutor> exec(
                InternalPlanner::collectionScan(txn, *it, db->getCollection(*it)));
            BSONObj curObj;
            PlanExecutor::ExecState execState;
            while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) {
                removeSaver->goingToDelete(curObj);
            }
            if (execState != PlanExecutor::IS_EOF) {
                if (execState == PlanExecutor::FAILURE &&
                    WorkingSetCommon::isValidStatusMemberObject(curObj)) {
                    Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj);
                    severe() << "rolling back createCollection on " << *it << " failed with "
                             << errorStatus << ". A full resync is necessary.";
                } else {
                    severe() << "rolling back createCollection on " << *it
                             << " failed. A full resync is necessary.";
                }

                throw RSFatalException();
            }

            db->dropCollection(txn, *it);
            wunit.commit();
        }
    }

    log() << "rollback 4.7";
    OldClientContext ctx(txn, rsOplogName);
    Collection* oplogCollection = ctx.db()->getCollection(rsOplogName);
    uassert(13423,
            str::stream() << "replSet error in rollback can't find " << rsOplogName,
            oplogCollection);

    // Walk the refetched documents and make the local copies match the sync
    // source: delete locally when the source has no copy, otherwise overwrite.
    unsigned deletes = 0, updates = 0;
    time_t lastProgressUpdate = time(0);
    time_t progressUpdateGap = 10;  // seconds between progress log lines
    for (list<pair<DocID, BSONObj> >::iterator it = goodVersions.begin();
         it != goodVersions.end();
         it++) {
        time_t now = time(0);
        if (now - lastProgressUpdate > progressUpdateGap) {
            log() << deletes << " delete and " << updates
                  << " update operations processed out of " << goodVersions.size()
                  << " total operations";
            lastProgressUpdate = now;
        }
        const DocID& doc = it->first;
        BSONObj pattern = doc._id.wrap();  // { _id : ... }
        try {
            verify(doc.ns && *doc.ns);
            if (fixUpInfo.collectionsToResyncData.count(doc.ns)) {
                // we just synced this entire collection
                continue;
            }

            // keep an archive of items rolled back
            shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns];
            if (!removeSaver)
                removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns));

            // todo: lots of overhead in context, this can be faster
            OldClientContext ctx(txn, doc.ns);

            // Add the doc to our rollback file
            BSONObj obj;
            Collection* collection = ctx.db()->getCollection(doc.ns);

            // Do not log an error when undoing an insert on a no longer existent collection.
            // It is likely that the collection was dropped as part of rolling back a
            // createCollection command and regardless, the document no longer exists.
            if (collection) {
                bool found = Helpers::findOne(txn, collection, pattern, obj, false);
                if (found) {
                    removeSaver->goingToDelete(obj);
                } else {
                    error() << "rollback cannot find object: " << pattern << " in namespace "
                            << doc.ns;
                }
            }

            if (it->second.isEmpty()) {
                // wasn't on the primary; delete.
                // TODO 1.6 : can't delete from a capped collection. need to handle that here.
                deletes++;

                if (collection) {
                    if (collection->isCapped()) {
                        // can't delete from a capped collection - so we truncate instead. if
                        // this item must go, so must all successors!!!
                        try {
                            // TODO: IIRC cappedTruncateAfter does not handle completely empty.
                            // this will crazy slow if no _id index.
                            long long start = Listener::getElapsedTimeMillis();
                            RecordId loc = Helpers::findOne(txn, collection, pattern, false);
                            if (Listener::getElapsedTimeMillis() - start > 200)
                                warning() << "roll back slow no _id index for " << doc.ns
                                          << " perhaps?";
                            // would be faster but requires index:
                            // RecordId loc = Helpers::findById(nsd, pattern);
                            if (!loc.isNull()) {
                                try {
                                    collection->temp_cappedTruncateAfter(txn, loc, true);
                                } catch (DBException& e) {
                                    if (e.getCode() == 13415) {
                                        // hack: need to just make cappedTruncate do this...
                                        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                                            WriteUnitOfWork wunit(txn);
                                            uassertStatusOK(collection->truncate(txn));
                                            wunit.commit();
                                        } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
                                            txn, "truncate", collection->ns().ns());
                                    } else {
                                        throw e;
                                    }
                                }
                            }
                        } catch (DBException& e) {
                            error() << "rolling back capped collection rec " << doc.ns << ' '
                                    << e.toString();
                        }
                    } else {
// Command handler for "findAndModify": parses and validates the request,
// dispatches to runImpl() inside a write-conflict retry loop, and — when
// runImpl reports the collection is missing — creates the collection under an
// X lock and runs the operation again.
// NOTE(review): fragment — the final return/result handling continues beyond
// this chunk.
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool fromRepl) {
    const std::string ns = parseNsCollectionRequired(dbname, cmdObj);

    Status allowedWriteStatus = userAllowedWriteNS(ns);
    if (!allowedWriteStatus.isOK()) {
        return appendCommandStatus(result, allowedWriteStatus);
    }

    const BSONObj query = cmdObj.getObjectField("query");
    const BSONObj fields = cmdObj.getObjectField("fields");
    const BSONObj update = cmdObj.getObjectField("update");
    const BSONObj sort = cmdObj.getObjectField("sort");

    bool upsert = cmdObj["upsert"].trueValue();
    bool returnNew = cmdObj["new"].trueValue();
    bool remove = cmdObj["remove"].trueValue();

    // "remove" is mutually exclusive with upsert/update/new; otherwise an
    // "update" document is required.
    if (remove) {
        if (upsert) {
            errmsg = "remove and upsert can't co-exist";
            return false;
        }
        if (!update.isEmpty()) {
            errmsg = "remove and update can't co-exist";
            return false;
        }
        if (returnNew) {
            errmsg = "remove and returnNew can't co-exist";
            return false;
        }
    } else if (!cmdObj.hasField("update")) {
        errmsg = "need remove or update";
        return false;
    }

    bool ok = false;
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        errmsg = "";

        // We can always retry because we only ever modify one document
        ok = runImpl(txn,
                     dbname,
                     ns,
                     query,
                     fields,
                     update,
                     sort,
                     upsert,
                     returnNew,
                     remove,
                     result,
                     errmsg);
    } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "findAndModify", ns);

    // runImpl signals a missing collection through the "no-collection" errmsg
    // sentinel; create it and retry once.
    if (!ok && errmsg == "no-collection") {
        // Take X lock so we can create collection, then re-run operation.
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock lk(txn->lockState(), dbname, MODE_X);
        Client::Context ctx(txn, ns, false /* don't check version */);
        // User writes require a primary; replicated writes (fromRepl) bypass
        // the check.
        if (!fromRepl &&
            !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while creating collection " << ns
                                     << " during findAndModify"));
        }
        Database* db = ctx.db();
        if (db->getCollection(ns)) {
            // someone else beat us to it, that's ok
            // we might race while we unlock if someone drops
            // but that's ok, we'll just do nothing and error out
        } else {
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                WriteUnitOfWork wuow(txn);
                uassertStatusOK(userCreateNS(txn, db, ns, BSONObj(), !fromRepl));
                wuow.commit();
            } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "findAndModify", ns);
        }

        errmsg = "";
        ok = runImpl(txn,
                     dbname,
                     ns,
                     query,
                     fields,
                     update,
                     sort,
                     upsert,
                     returnNew,
                     remove,
                     result,
                     errmsg);
    }
// static Status SyncTail::syncApply(OperationContext* txn, const BSONObj& op, bool convertUpdateToUpsert, ApplyOperationInLockFn applyOperationInLock, ApplyCommandInLockFn applyCommandInLock, IncrementOpsAppliedStatsFn incrementOpsAppliedStats) { if (inShutdown()) { return Status::OK(); } // Count each log op application as a separate operation, for reporting purposes CurOp individualOp(txn); const char* ns = op.getStringField("ns"); verify(ns); const char* opType = op["op"].valuestrsafe(); bool isCommand(opType[0] == 'c'); bool isNoOp(opType[0] == 'n'); if ((*ns == '\0') || (*ns == '.')) { // this is ugly // this is often a no-op // but can't be 100% sure if (!isNoOp) { error() << "skipping bad op in oplog: " << op.toString(); } return Status::OK(); } if (isCommand) { MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { // a command may need a global write lock. so we will conservatively go // ahead and grab one here. suboptimal. :-( Lock::GlobalWrite globalWriteLock(txn->lockState()); // special case apply for commands to avoid implicit database creation Status status = applyCommandInLock(txn, op); incrementOpsAppliedStats(); return status; } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "syncApply_command", ns); } auto applyOp = [&](Database* db) { // For non-initial-sync, we convert updates to upserts // to suppress errors when replaying oplog entries. txn->setReplicatedWrites(false); DisableDocumentValidation validationDisabler(txn); Status status = applyOperationInLock(txn, db, op, convertUpdateToUpsert, incrementOpsAppliedStats); if (!status.isOK() && status.code() == ErrorCodes::WriteConflict) { throw WriteConflictException(); } return status; }; if (isNoOp || (opType[0] == 'i' && nsToCollectionSubstring(ns) == "system.indexes")) { auto opStr = isNoOp ? 
"syncApply_noop" : "syncApply_indexBuild"; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_X); OldClientContext ctx(txn, ns); return applyOp(ctx.db()); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, opStr, ns); }