/**
 * Removes the drop-pending namespace derived from ('collectionNamespace', 'opTime') from the
 * reaper's registry so the collection drop can be rolled back.
 *
 * Returns true if a matching drop-pending entry was found and removed; false (with a warning)
 * if no such entry exists.
 */
bool DropPendingCollectionReaper::rollBackDropPendingCollection(
    OperationContext* opCtx, const OpTime& opTime, const NamespaceString& collectionNamespace) {
    // This is an internal bookkeeping operation; any writes it triggers must not be replicated.
    UnreplicatedWritesBlock uwb(opCtx);

    const auto pendingNss = collectionNamespace.makeDropPendingNamespace(opTime);
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);

        // Several collections may share a drop optime, so scan the whole bucket for the one
        // whose drop-pending name matches.
        auto range = _dropPendingNamespaces.equal_range(opTime);
        auto found = range.second;
        for (auto iter = range.first; iter != range.second; ++iter) {
            if (iter->second == pendingNss) {
                found = iter;
                break;
            }
        }

        if (found == range.second) {
            warning() << "Cannot find drop-pending namespace at OpTime " << opTime
                      << " for collection " << collectionNamespace << " to roll back.";
            return false;
        }

        _dropPendingNamespaces.erase(found);
    }

    log() << "Rolling back collection drop for " << pendingNss << " with drop OpTime " << opTime
          << " to namespace " << collectionNamespace;
    return true;
}
/**
 * Replays stored oplog entries from 'oplogApplicationStartPoint' (exclusive) through
 * 'topOfOplog' (inclusive) as part of startup recovery.
 *
 * Preconditions (invariants): both timestamps are non-null.
 * Crashes the process (fassertFailedNoTrace) if the start point is past the top of the oplog,
 * or if the oplog entry at the start point cannot be found — either indicates corruption or a
 * bug in the storage engine / query's OplogReplay implementation.
 */
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 Timestamp oplogApplicationStartPoint,
                                                 Timestamp topOfOplog) {
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
        log() << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        // The applied-through point being ahead of the oplog top means the entry we claim to
        // have applied no longer exists — unrecoverable.
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    // Query the oplog starting at (and including) the start point; the first entry returned is
    // expected to be the one already applied and is skipped below.
    DBDirectClient db(opCtx);
    auto cursor = db.query(NamespaceString::kRsOplogNamespace.ns(),
                           QUERY("ts" << BSON("$gte" << oplogApplicationStartPoint)),
                           /*batchSize*/ 0,
                           /*skip*/ 0,
                           /*projection*/ nullptr,
                           QueryOption_OplogReplay);

    // Check that the first document matches our appliedThrough point then skip it since it's
    // already been applied.
    if (!cursor->more()) {
        // This should really be impossible because we check above that the top of the oplog is
        // strictly > appliedThrough. If this fails it represents a serious bug in either the
        // storage engine or query's implementation of OplogReplay.
        severe() << "Couldn't find any entries in the oplog >= "
                 << oplogApplicationStartPoint.toBSON() << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }
    auto firstTimestampFound =
        fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe())).getTimestamp();
    if (firstTimestampFound != oplogApplicationStartPoint) {
        severe() << "Oplog entry at " << oplogApplicationStartPoint.toBSON()
                 << " is missing; actual entry found is " << firstTimestampFound.toBSON();
        fassertFailedNoTrace(40292);
    }

    // Apply remaining ops one at a time, but don't log them because they are already logged.
    UnreplicatedWritesBlock uwb(opCtx);

    while (cursor->more()) {
        auto entry = cursor->nextSafe();
        fassertStatusOK(40294,
                        SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering));
        // Advance appliedThrough after every entry so a crash mid-replay resumes from the last
        // successfully applied op rather than re-running the whole range.
        _consistencyMarkers->setAppliedThrough(
            opCtx, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry)));
    }
}
// Fills the 'dis' buffer with uwb(nodes[i]) for each of the 'num' nodes and returns a
// pointer to that buffer. The buffer is not owned by the caller; values should be copied
// out before the next call overwrites them.
// NOTE(review): assumes 'dis' has capacity for at least 'num' entries — confirm at the
// definitions of 'dis'/'num', which are not visible in this chunk.
int* measure() {
    // Scope the index to the loop; the post-loop value of the old file-style
    // 'int i' declaration was never used.
    for (int i = 0; i < num; ++i) {
        dis[i] = uwb(nodes[i]);
    }
    return dis;
}
/**
 * Completes (physically drops) all drop-pending collections whose drop optime is at or before
 * 'opTime', then removes their entries from the registry.
 *
 * The registry mutex is released while the drops run because dropCollection acquires database
 * locks and may block or throw; entries are only erased afterwards so that
 * getEarliestDropOpTime() keeps returning correct results until the drops are done.
 */
void DropPendingCollectionReaper::dropCollectionsOlderThan(OperationContext* opCtx,
                                                           const OpTime& opTime) {
    // Snapshot the eligible entries under the mutex; the map is ordered by optime, so we can
    // stop at the first entry past 'opTime'.
    DropPendingNamespaces toDrop;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        for (auto it = _dropPendingNamespaces.cbegin();
             it != _dropPendingNamespaces.cend() && it->first <= opTime;
             ++it) {
            toDrop.insert(*it);
        }
    }

    if (toDrop.empty()) {
        return;
    }

    {
        // Every node cleans up its own drop-pending collections. We should never replicate these
        // drops because these are internal operations.
        UnreplicatedWritesBlock uwb(opCtx);
        for (const auto& opTimeAndNamespace : toDrop) {
            const auto& dropOpTime = opTimeAndNamespace.first;
            const auto& nss = opTimeAndNamespace.second;
            log() << "Completing collection drop for " << nss << " with drop optime "
                  << dropOpTime << " (notification optime: " << opTime << ")";
            Status status = Status::OK();
            try {
                // dropCollection could throw an interrupt exception, since it acquires db locks.
                status = _storageInterface->dropCollection(opCtx, nss);
            } catch (...) {
                status = exceptionToStatus();
            }
            if (!status.isOK()) {
                // Best-effort cleanup: log and continue so one failed drop does not block the
                // rest; the entry below is still removed, matching the original behavior.
                warning() << "Failed to remove drop-pending collection " << nss
                          << " with drop optime " << dropOpTime
                          << " (notification optime: " << opTime << "): " << status;
            }
        }
    }

    {
        // Entries must be removed AFTER drops are completed, so that getEarliestDropOpTime()
        // returns appropriate results.
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        auto it = _dropPendingNamespaces.cbegin();
        while (it != _dropPendingNamespaces.cend() && it->first <= opTime) {
            // Match on BOTH the optime and the namespace. An entry registered under the same
            // optime while the mutex was released above was never dropped, so it must be kept.
            // (The previous key-only toDrop.find(it->first) check could erase such an entry
            // without it ever having been dropped.)
            const auto range = toDrop.equal_range(it->first);
            bool wasDropped = false;
            for (auto dropped = range.first; dropped != range.second; ++dropped) {
                if (dropped->second == it->second) {
                    wasDropped = true;
                    break;
                }
            }
            if (wasDropped) {
                it = _dropPendingNamespaces.erase(it);
            } else {
                ++it;
            }
        }
    }
}
/**
 * Re-creates in-memory prepared transactions during startup recovery.
 *
 * Scans the session transactions table for records in the 'prepared' state, walks each
 * session's oplog history back to its prepareTransaction entry, and re-applies that entry on a
 * fresh client/opCtx so the transaction is left in the prepared state again.
 */
void ReplicationRecoveryImpl::_reconstructPreparedTransactions(OperationContext* opCtx) {
    DBDirectClient client(opCtx);
    const auto cursor = client.query(NamespaceString::kSessionTransactionsTableNamespace,
                                     {BSON("state"
                                           << "prepared")});

    // Iterate over each entry in the transactions table that has a prepared transaction.
    while (cursor->more()) {
        const auto txnRecordObj = cursor->next();
        const auto txnRecord = SessionTxnRecord::parse(
            IDLParserErrorContext("recovering prepared transaction"), txnRecordObj);
        invariant(txnRecord.getState() == DurableTxnStateEnum::kPrepared);

        // Get the prepareTransaction oplog entry corresponding to this transactions table entry.
        // The read must not happen at a point-in-time timestamp here.
        invariant(!opCtx->recoveryUnit()->getPointInTimeReadTimestamp());
        const auto prepareOpTime = txnRecord.getLastWriteOpTime();
        invariant(!prepareOpTime.isNull());
        TransactionHistoryIterator iter(prepareOpTime);
        invariant(iter.hasNext());
        const auto prepareOplogEntry = iter.next(opCtx);

        {
            // Make a new opCtx so that we can set the lsid when applying the prepare transaction
            // oplog entry.
            auto newClient =
                opCtx->getServiceContext()->makeClient("reconstruct-prepared-transactions");
            AlternativeClientRegion acr(newClient);
            const auto newOpCtx = cc().makeOperationContext();
            // Re-applying a prepare during recovery is an internal operation and must not be
            // replicated.
            repl::UnreplicatedWritesBlock uwb(newOpCtx.get());

            // Snapshot transaction can never conflict with the PBWM lock.
            newOpCtx->lockState()->setShouldConflictWithSecondaryBatchApplication(false);

            // TODO: SERVER-40177 This should be removed once it is guaranteed operations applied on
            // recovering nodes cannot encounter unnecessary prepare conflicts.
            newOpCtx->recoveryUnit()->setIgnorePrepared(true);

            // Checks out the session, applies the operations and prepares the transactions.
            uassertStatusOK(applyRecoveredPrepareTransaction(newOpCtx.get(), prepareOplogEntry));
        }
    }
}
/**
 * Runs a repair pass over the 'local' database at startup.
 *
 * This is only required for the MMAPv1 storage engine; for any other engine the function is a
 * no-op returning OK. Runs under the global write lock with replication suppressed, and
 * reopens the database afterwards.
 *
 * Returns the repair status, or the status of any DBException thrown while locking, repairing,
 * or reopening the database.
 */
Status ReplicationCoordinatorExternalStateImpl::runRepairOnLocalDB(OperationContext* opCtx) {
    try {
        Lock::GlobalWrite globalWrite(opCtx);
        StorageEngine* engine = getGlobalServiceContext()->getGlobalStorageEngine();

        // Only MMAPv1 needs this repair pass.
        if (!engine->isMmapV1()) {
            return Status::OK();
        }

        // Repair is a node-local internal operation; its writes must not be replicated.
        UnreplicatedWritesBlock uwb(opCtx);
        Status status = repairDatabase(opCtx, engine, localDbName, false, false);

        // Open database before returning.
        dbHolder().openDb(opCtx, localDbName);

        // Propagate repair failures to the caller instead of silently discarding them.
        // (Previously 'status' was computed but never checked, so a failed repair
        // still reported OK.)
        if (!status.isOK()) {
            return status;
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
    return Status::OK();
}
/**
 * Fetcher callback for 'listIndexes' responses from the sync source.
 *
 * Handles three cases:
 *  - NamespaceNotFound: the source collection is empty/missing, so schedule creation of an
 *    empty destination collection and finish.
 *  - Any other error: finish with that error, annotated with the collection name.
 *  - Success: accumulate index specs (rewriting their 'ns' field to the source namespace),
 *    request further batches via 'getMoreBob' while the fetcher indicates kGetMore, and on the
 *    final batch schedule _beginCollectionCallback to start cloning the data.
 */
void CollectionCloner::_listIndexesCallback(const Fetcher::QueryResponseStatus& fetchResult,
                                            Fetcher::NextAction* nextAction,
                                            BSONObjBuilder* getMoreBob) {
    const bool collectionIsEmpty = fetchResult == ErrorCodes::NamespaceNotFound;
    if (collectionIsEmpty) {
        // Schedule collection creation and finish callback.
        auto&& scheduleResult =
            _scheduleDbWorkFn([this](const executor::TaskExecutor::CallbackArgs& cbd) {
                if (!cbd.status.isOK()) {
                    _finishCallback(cbd.status);
                    return;
                }
                auto opCtx = cbd.opCtx;
                // Creating the destination collection is part of cloning and must not be
                // replicated.
                UnreplicatedWritesBlock uwb(opCtx);
                auto&& createStatus = _storageInterface->createCollection(opCtx, _destNss, _options);
                _finishCallback(createStatus);
            });
        if (!scheduleResult.isOK()) {
            _finishCallback(scheduleResult.getStatus());
        }
        return;
    };
    if (!fetchResult.isOK()) {
        _finishCallback(fetchResult.getStatus().withContext(
            str::stream() << "listIndexes call failed on collection '" << _sourceNss.ns() << "'"));
        return;
    }

    auto batchData(fetchResult.getValue());
    auto&& documents = batchData.documents;

    if (documents.empty()) {
        warning() << "No indexes found for collection " << _sourceNss.ns() << " while cloning from "
                  << _source;
    }

    UniqueLock lk(_mutex);

    // When listing indexes by UUID, the sync source may use a different name for the collection
    // as result of renaming or two-phase drop. As the index spec also includes a 'ns' field, this
    // must be rewritten.
    BSONObjBuilder nsFieldReplacementBuilder;
    nsFieldReplacementBuilder.append("ns", _sourceNss.ns());
    BSONElement nsFieldReplacementElem = nsFieldReplacementBuilder.done().firstElement();

    // We may be called with multiple batches leading to a need to grow _indexSpecs.
    _indexSpecs.reserve(_indexSpecs.size() + documents.size());
    for (auto&& doc : documents) {
        // The addField replaces the 'ns' field with the correct name, see above.
        // The _id index spec is kept separately from the secondary index specs.
        if (StringData("_id_") == doc["name"].str()) {
            _idIndexSpec = doc.addField(nsFieldReplacementElem);
            continue;
        }
        _indexSpecs.push_back(doc.addField(nsFieldReplacementElem));
    }
    // Release the lock before interacting with the fetcher/executor below.
    lk.unlock();

    // The fetcher will continue to call with kGetMore until an error or the last batch.
    if (*nextAction == Fetcher::NextAction::kGetMore) {
        invariant(getMoreBob);
        getMoreBob->append("getMore", batchData.cursorId);
        getMoreBob->append("collection", batchData.nss.coll());
        return;
    }

    // We have all of the indexes now, so we can start cloning the collection data.
    auto&& scheduleResult = _scheduleDbWorkFn(
        [=](const executor::TaskExecutor::CallbackArgs& cbd) { _beginCollectionCallback(cbd); });
    if (!scheduleResult.isOK()) {
        _finishCallback(scheduleResult.getStatus());
        return;
    }
}
/**
 * Applies the operations in the 'applyOps' command object 'applyOpCmd' against database
 * 'dbName', writing command output into 'result'.
 *
 * Mode selection: operations run atomically (single WriteUnitOfWork, one collective oplog
 * entry) when the command allows it ('allowAtomic', default true) AND all ops are CRUD-only;
 * otherwise each op is applied individually via _applyOps. Locking: a database X lock suffices
 * only for the non-atomic, CRUD-only, no-precondition case; every other case takes the global
 * write lock.
 *
 * Returns NotMaster for user-initiated writes on a non-primary, a precondition-check failure
 * status, the status of the non-atomic _applyOps path, or UnknownError (with per-op details
 * appended to 'result') if the atomic path throws.
 */
Status applyOps(OperationContext* opCtx,
                const std::string& dbName,
                const BSONObj& applyOpCmd,
                BSONObjBuilder* result) {
    bool allowAtomic = false;
    uassertStatusOK(
        bsonExtractBooleanFieldWithDefault(applyOpCmd, "allowAtomic", true, &allowAtomic));
    auto areOpsCrudOnly = _areOpsCrudOnly(applyOpCmd);
    auto isAtomic = allowAtomic && areOpsCrudOnly;
    auto hasPrecondition = _hasPrecondition(applyOpCmd);

    boost::optional<Lock::GlobalWrite> globalWriteLock;
    boost::optional<Lock::DBLock> dbWriteLock;

    // There's only one case where we are allowed to take the database lock instead of the global
    // lock - no preconditions; only CRUD ops; and non-atomic mode.
    if (!hasPrecondition && areOpsCrudOnly && !allowAtomic) {
        dbWriteLock.emplace(opCtx, dbName, MODE_X);
    } else {
        globalWriteLock.emplace(opCtx);
    }

    // The primacy check must happen after the lock is taken so the answer cannot change
    // underneath us.
    bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(opCtx, dbName);

    if (userInitiatedWritesAndNotPrimary)
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while applying ops to database " << dbName);

    if (hasPrecondition) {
        auto status = _checkPrecondition(opCtx, applyOpCmd, result);
        if (!status.isOK()) {
            return status;
        }
    }

    int numApplied = 0;
    if (!isAtomic)
        return _applyOps(opCtx, dbName, applyOpCmd, result, &numApplied);

    // Perform write ops atomically
    invariant(globalWriteLock);
    try {
        writeConflictRetry(opCtx, "applyOps", dbName, [&] {
            BSONObjBuilder intermediateResult;
            WriteUnitOfWork wunit(opCtx);
            // Reset on every retry of the write-conflict loop.
            numApplied = 0;
            {
                // Suppress replication for atomic operations until end of applyOps.
                repl::UnreplicatedWritesBlock uwb(opCtx);
                uassertStatusOK(
                    _applyOps(opCtx, dbName, applyOpCmd, &intermediateResult, &numApplied));
            }
            // Generate oplog entry for all atomic ops collectively.
            if (opCtx->writesAreReplicated()) {
                // We want this applied atomically on slaves so we rewrite the oplog entry without
                // the pre-condition for speed.
                BSONObjBuilder cmdBuilder;
                for (auto elem : applyOpCmd) {
                    auto name = elem.fieldNameStringData();
                    if (name == kPreconditionFieldName)
                        continue;
                    if (name == "bypassDocumentValidation")
                        continue;
                    cmdBuilder.append(elem);
                }
                const BSONObj cmdRewritten = cmdBuilder.done();
                auto opObserver = getGlobalServiceContext()->getOpObserver();
                invariant(opObserver);
                opObserver->onApplyOps(opCtx, dbName, cmdRewritten);
            }
            wunit.commit();
            result->appendElements(intermediateResult.obj());
        });
    } catch (const DBException& ex) {
        if (ex.getCode() == ErrorCodes::NamespaceNotFound) {
            // Retry in non-atomic mode, since MMAP cannot implicitly create a new database
            // within an active WriteUnitOfWork.
            return _applyOps(opCtx, dbName, applyOpCmd, result, &numApplied);
        }
        // Report 'false' for each op up to and including the one that failed, plus the error
        // details, in the command result.
        BSONArrayBuilder ab;
        ++numApplied;
        for (int j = 0; j < numApplied; j++)
            ab.append(false);
        result->append("applied", numApplied);
        result->append("code", ex.getCode());
        result->append("codeName", ErrorCodes::errorString(ErrorCodes::fromInt(ex.getCode())));
        result->append("errmsg", ex.what());
        result->append("results", ab.arr());
        return Status(ErrorCodes::UnknownError, ex.what());
    }

    return Status::OK();
}