/**
 * Worker routine that inserts one deferred document into the target collection.
 *
 * Creates its own OperationContext from the current Client, resolves the collection through
 * _getCollection(), and performs the insert inside a write-conflict retry loop. On every path
 * the size of 'stmt.doc' is subtracted from _numBytes while holding _mutex, and any failure is
 * handed to _logFailure().
 *
 * @param stmt  the insert statement to apply; taken by value.
 */
void DeferredWriter::_worker(InsertStatement stmt) {
    auto uniqueOpCtx = Client::getCurrent()->makeOperationContext();
    OperationContext* opCtx = uniqueOpCtx.get();
    auto result = _getCollection(opCtx);

    if (!result.isOK()) {
        // This document will never be written, so it must stop counting against _numBytes just
        // like a document whose insert was attempted. Skipping the decrement here would leave
        // the counter permanently inflated by every document dropped on this path.
        // NOTE(review): assumes _numBytes was increased by this document's size when the work
        // was queued, the same assumption the unconditional decrement below relies on.
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _numBytes -= stmt.doc.objsize();
        _logFailure(result.getStatus());
        return;
    }

    auto agc = std::move(result.getValue());

    Collection& collection = *agc->getCollection();

    Status status = writeConflictRetry(opCtx, "deferred insert", _nss.ns(), [&] {
        WriteUnitOfWork wuow(opCtx);
        Status insertStatus = collection.insertDocument(opCtx, stmt, nullptr, false);
        if (!insertStatus.isOK()) {
            // Leaving without commit() lets the WriteUnitOfWork go out of scope uncommitted.
            return insertStatus;
        }

        wuow.commit();
        return Status::OK();
    });

    stdx::lock_guard<stdx::mutex> lock(_mutex);

    // The document has been processed (successfully or not); release its bytes.
    _numBytes -= stmt.doc.objsize();

    // If a write to a deferred collection fails, periodically tell the log.
    if (!status.isOK()) {
        _logFailure(status);
    }
}
Exemple #2
0
/**
 * Fetches the most recent oplog entry and returns its hash paired with its OpTime.
 *
 * Returns OpTimeWithHash(0) when no entry could be read. A DBException while reading, or a
 * missing/invalid hash field in the entry, is logged as severe and triggers an fassert.
 */
OpTimeWithHash BackgroundSync::_readLastAppliedOpTimeWithHash(OperationContext* opCtx) {
    const auto& oplogNs = NamespaceString::kRsOplogNamespace.ns();

    BSONObj lastEntry;
    try {
        const bool found = writeConflictRetry(opCtx, "readLastAppliedHash", oplogNs, [&] {
            Lock::DBLock localDbLock(opCtx, "local", MODE_X);
            return Helpers::getLast(opCtx, oplogNs.c_str(), lastEntry);
        });

        if (!found) {
            // This can happen when we are to do an initial sync.  lastHash will be set
            // after the initial sync is complete.
            return OpTimeWithHash(0);
        }
    } catch (const DBException& ex) {
        severe() << "Problem reading " << oplogNs << ": " << redact(ex);
        fassertFailed(18904);
    }

    // Pull the hash out of the raw entry before parsing the rest of it.
    long long hash;
    const auto extractStatus = bsonExtractIntegerField(lastEntry, kHashFieldName, &hash);
    if (!extractStatus.isOK()) {
        severe() << "Most recent entry in " << oplogNs << " is missing or has invalid \""
                 << kHashFieldName << "\" field. Oplog entry: " << redact(lastEntry) << ": "
                 << redact(extractStatus);
        fassertFailed(18902);
    }

    OplogEntry parsed(lastEntry);
    return OpTimeWithHash(hash, parsed.getOpTime());
}
/**
 * Performs the storage-side work of becoming primary and returns the last optime loaded from
 * the oplog afterwards.
 *
 * Must be called with the global lock held in W mode (checked by invariant). Clears the
 * appliedThrough marker, optionally writes a "new primary" message to the oplog, then runs the
 * sharding hook, drops temp collections and enables feature validation as master.
 */
OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* opCtx,
                                                                      bool isV1ElectionProtocol) {
    invariant(opCtx->lockState()->isW());

    // The appliedThrough marker is cleared so that startup falls back to the top of the oplog;
    // this has to happen before anything is added to our oplog.
    // The update is recorded at the last applied optime: an outstanding checkpoint should only
    // reflect this write if it also sees every write up to that optime.
    invariant(
        _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull());
    const auto appliedOpTime = repl::ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime();
    _replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
        opCtx, appliedOpTime.getTimestamp());

    if (isV1ElectionProtocol) {
        const auto logNewPrimary = [&] {
            WriteUnitOfWork unitOfWork(opCtx);
            auto* const observer = opCtx->getClient()->getServiceContext()->getOpObserver();
            observer->onOpMessage(opCtx,
                                  BSON("msg"
                                       << "new primary"));
            unitOfWork.commit();
        };
        writeConflictRetry(
            opCtx, "logging transition to primary to oplog", "local.oplog.rs", logNewPrimary);
    }

    // Captured before the hooks below run; this is the value handed back to the caller.
    const auto lastOpTime = fassertStatusOK(28665, loadLastOpTime(opCtx));

    _shardingOnTransitionToPrimaryHook(opCtx);
    _dropAllTempCollections(opCtx);

    serverGlobalParams.validateFeaturesAsMaster.store(true);

    return lastOpTime;
}
/**
 * Drops the database 'db': notifies Top about each of its collections, closes the database in
 * this holder and then asks the storage engine to drop it.
 *
 * The caller must hold the database lock in MODE_X and no background operation may be in
 * progress for the database; both are asserted.
 */
void DatabaseHolderImpl::dropDb(OperationContext* opCtx, Database* db) {
    invariant(db);

    // Keep a copy of the name so that we still have it after the db object is deleted.
    const auto dbName = db->name();

    LOG(1) << "dropDatabase " << dbName;

    invariant(opCtx->lockState()->isDbLockedForMode(dbName, MODE_X));

    BackgroundOperation::assertNoBgOpInProgForDb(dbName);

    audit::logDropDatabase(opCtx->getClient(), dbName);

    auto const svcCtx = opCtx->getServiceContext();

    // Report every collection as dropped; a null entry ends the walk early.
    for (auto it = db->begin(opCtx); it != db->end(opCtx); ++it) {
        auto collection = *it;
        if (!collection) {
            break;
        }

        Top::get(svcCtx).collectionDropped(collection->ns().ns(), true);
    }

    close(opCtx, dbName);

    auto const engine = svcCtx->getStorageEngine();
    writeConflictRetry(opCtx, "dropDatabase", dbName, [&] {
        // The status returned by the storage engine is deliberately ignored here.
        engine->dropDatabase(opCtx, dbName).transitional_ignore();
    });
}
/**
 * Prepares local storage for a freshly initiated replica set: creates the oplog, stores
 * 'config' as the config document together with an "initiating set" oplog message, and sets up
 * UUIDs / featureCompatibilityVersion on a clean start-up.
 *
 * Any DBException thrown along the way is converted to the returned Status.
 */
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* opCtx,
                                                                         const BSONObj& config) {
    try {
        createOplog(opCtx);

        const auto writeInitiateEntry = [&] {
            Lock::GlobalWrite globalLock(opCtx);

            WriteUnitOfWork unitOfWork(opCtx);
            Helpers::putSingleton(opCtx, configCollectionName, config);
            const auto initiateMsg = BSON("msg"
                                          << "initiating set");
            _service->getOpObserver()->onOpMessage(opCtx, initiateMsg);
            unitOfWork.commit();

            // ReplSetTest assumes that other nodes can initial sync, without retries, as soon
            // as replSetInitiate returns. Initial sync fails against a sync source whose oplog
            // looks empty, so block here until the seed document is visible in our oplog.
            AutoGetCollection oplogColl(opCtx, NamespaceString::kRsOplogNamespace, MODE_IS);
            waitForAllEarlierOplogWritesToBeVisible(opCtx);
        };
        writeConflictRetry(opCtx,
                           "initiate oplog entry",
                           NamespaceString::kRsOplogNamespace.toString(),
                           writeInitiateEntry);

        // Assign UUIDs to all non-replicated collections. Independent replica sets and config
        // server replica sets started without data files need this: collections in local are
        // created before featureCompatibilityVersion becomes 3.6 and therefore lack UUIDs.
        //
        // ShardServers are excluded on the primary. They start with featureCompatibilityVersion
        // 3.4 by default, so they must not get UUIDs until the config server explicitly sets the
        // cluster's featureCompatibilityVersion to 3.6; at that point the shard primary adds
        // UUIDs to every collection that needs one.
        //
        // This addition only happens on the primary; secondaries add UUIDs to non-replicated
        // collections during InitialSync. That also covers the special case of a new --shardsvr
        // node joining a shard that is already at 3.6: the node starts at 3.4, and because it
        // is a secondary its collections receive UUIDs during InitialSync.
        const bool needsNonReplicatedUuids =
            serverGlobalParams.clusterRole != ClusterRole::ShardServer &&
            FeatureCompatibilityVersion::isCleanStartUp();
        if (needsNonReplicatedUuids) {
            auto uuidStatus = updateUUIDSchemaVersionNonReplicated(opCtx, true);
            if (!uuidStatus.isOK()) {
                return uuidStatus;
            }
        }
        FeatureCompatibilityVersion::setIfCleanStartup(opCtx, _storageInterface);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
    return Status::OK();
}
/**
 * Writes 'config' as the singleton document of the replica set config collection while
 * holding the config database lock in MODE_X.
 *
 * Returns Status::OK() on success, or the status of any DBException thrown by the write.
 */
Status ReplicationCoordinatorExternalStateImpl::storeLocalConfigDocument(OperationContext* opCtx,
                                                                         const BSONObj& config) {
    try {
        const auto saveConfig = [&] {
            Lock::DBLock configDbLock(opCtx, configDatabaseName, MODE_X);
            Helpers::putSingleton(opCtx, configCollectionName, config);
        };
        writeConflictRetry(opCtx, "save replica set config", configCollectionName, saveConfig);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    return Status::OK();
}
Exemple #7
0
/**
 * Drops indexes from the collection 'nss' as described by 'cmdObj', delegating the actual work
 * to wrappedRun() inside a write unit of work. The whole operation is retried on write conflict.
 *
 * Returns NotMaster when replicated writes are not accepted for 'nss',
 * CommandNotSupportedOnView when 'nss' is a view, NamespaceNotFound when it does not exist,
 * otherwise the status produced by wrappedRun().
 */
Status dropIndexes(OperationContext* opCtx,
                   const NamespaceString& nss,
                   const BSONObj& cmdObj,
                   BSONObjBuilder* result) {
    return writeConflictRetry(opCtx, "dropIndexes", nss.db(), [&] {
        AutoGetCollection collLock(opCtx, nss, MODE_IX, MODE_X);

        const bool notPrimaryForUserWrite = opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, nss);
        if (notPrimaryForUserWrite) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping indexes in " << nss);
        }

        if (!serverGlobalParams.quiet.load()) {
            LOG(0) << "CMD: dropIndexes " << nss << ": " << cmdObj[kIndexFieldName].toString(false);
        }

        // Nothing to do when the namespace is missing; a view gets its own error code.
        Database* const database = collLock.getDb();
        Collection* const coll = collLock.getCollection();
        if (!coll) {
            const bool isView = database && ViewCatalog::get(database)->lookup(opCtx, nss.ns());
            if (isView) {
                return Status(ErrorCodes::CommandNotSupportedOnView,
                              str::stream() << "Cannot drop indexes on view " << nss);
            }

            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        BackgroundOperation::assertNoBgOpInProgForNs(nss);
        IndexBuildsCoordinator::get(opCtx)->assertNoIndexBuildInProgForCollection(
            coll->uuid().get());

        WriteUnitOfWork unitOfWork(opCtx);
        OldClientContext clientCtx(opCtx, nss.ns());

        const Status runStatus = wrappedRun(opCtx, coll, cmdObj, result);
        if (runStatus.isOK()) {
            unitOfWork.commit();
            return Status::OK();
        }

        // Leaving without commit() lets the unit of work go out of scope uncommitted.
        return Status(runStatus);
    });
}
/**
 * Reads the singleton replica set configuration document.
 *
 * Returns the document, NoMatchingDocument when the config collection holds none, or the
 * status of any DBException thrown while reading.
 */
StatusWith<BSONObj> ReplicationCoordinatorExternalStateImpl::loadLocalConfigDocument(
    OperationContext* opCtx) {
    try {
        const auto readConfig = [opCtx] {
            BSONObj configDoc;
            if (Helpers::getSingleton(opCtx, configCollectionName, configDoc)) {
                return StatusWith<BSONObj>(configDoc);
            }
            return StatusWith<BSONObj>(
                ErrorCodes::NoMatchingDocument,
                str::stream() << "Did not find replica set configuration document in "
                              << configCollectionName);
        };
        return writeConflictRetry(
            opCtx, "load replica set config", configCollectionName, readConfig);
    } catch (const DBException& ex) {
        return StatusWith<BSONObj>(ex.toStatus());
    }
}
Exemple #9
0
/**
 * Test helper: drops every database, including "local", and then closes all databases still
 * tracked by the database holder.
 */
void ServiceContextMongoDTest::_dropAllDBs(OperationContext* opCtx) {
    dropAllDatabasesExceptLocal(opCtx);

    Lock::GlobalWrite globalLock(opCtx);
    AutoGetDb autoDBLocal(opCtx, "local", MODE_X);
    const auto localDB = autoDBLocal.getDb();
    if (localDB) {
        const auto dropLocal = [&] {
            // Do not wrap in a WriteUnitOfWork until SERVER-17103 is addressed.
            autoDBLocal.getDb()->dropDatabase(opCtx, localDB);
        };
        writeConflictRetry(opCtx, "_dropAllDBs", "local", dropLocal);
    }

    // Empty databases are not closed by dropAllDatabasesExceptLocal(), yet the holder keeps
    // resources allocated for tracking them. Calling DatabaseHolder::closeAll releases those
    // resources, which would otherwise be leaked at exit.
    dbHolder().closeAll(opCtx, "all databases dropped");
}
/**
 * Reads and parses the singleton lastVote document.
 *
 * Returns the parse result from LastVote::readFromLastVote(), NoMatchingDocument when the
 * lastVote collection holds no document, or the status of any DBException thrown while reading.
 */
StatusWith<LastVote> ReplicationCoordinatorExternalStateImpl::loadLocalLastVoteDocument(
    OperationContext* opCtx) {
    try {
        const auto readLastVote = [opCtx] {
            BSONObj voteDoc;
            if (Helpers::getSingleton(opCtx, lastVoteCollectionName, voteDoc)) {
                return LastVote::readFromLastVote(voteDoc);
            }
            return StatusWith<LastVote>(
                ErrorCodes::NoMatchingDocument,
                str::stream() << "Did not find replica set lastVote document in "
                              << lastVoteCollectionName);
        };
        return writeConflictRetry(
            opCtx, "load replica set lastVote", lastVoteCollectionName, readLastVote);
    } catch (const DBException& ex) {
        return StatusWith<LastVote>(ex.toStatus());
    }
}
Exemple #11
0
/**
 * Writes a no-op message to the oplog when the last applied optime has not moved since the
 * previous invocation and periodic no-op writes are enabled, then remembers the current last
 * applied optime in _lastKnownOpTime.
 *
 * Returns without doing anything when the global lock cannot be acquired by the deadline or
 * when this node does not accept writes for the "admin" database.
 */
void NoopWriter::_writeNoop(OperationContext* opCtx) {
    // GlobalLock + lockMMAPV1Flush is used rather than DBLock so that we can bail out when the
    // lock is unavailable, e.g. when the primary steps down and a shared global lock is taken.
    Lock::GlobalLock globalLock(
        opCtx, MODE_IX, Date_t::now() + Milliseconds(1), Lock::InterruptBehavior::kLeaveUnlocked);
    if (!globalLock.isLocked()) {
        LOG(1) << "Global lock is not available skipping noopWrite";
        return;
    }
    opCtx->lockState()->lockMMAPV1Flush();

    auto coordinator = ReplicationCoordinator::get(opCtx);
    // Accepting writes for "admin" is used as a proxy for being a primary.
    if (!coordinator->canAcceptWritesForDatabase(opCtx, "admin")) {
        LOG(1) << "Not a primary, skipping the noop write";
        return;
    }

    const auto appliedOpTime = coordinator->getMyLastAppliedOpTime();

    // _lastKnownOpTime needs no lock because only one thread ever touches it.
    if (appliedOpTime != _lastKnownOpTime) {
        LOG(1) << "Not scheduling a noop write. Last known OpTime: " << _lastKnownOpTime
               << " != last primary OpTime: " << appliedOpTime;
    } else if (writePeriodicNoops.load()) {
        const auto verbosity = getTestCommandsEnabled() ? 0 : 1;
        LOG(verbosity)
            << "Writing noop to oplog as there has been no writes to this replica set in over "
            << _writeInterval;

        const auto logNoop = [opCtx] {
            WriteUnitOfWork unitOfWork(opCtx);
            auto* const observer = opCtx->getClient()->getServiceContext()->getOpObserver();
            observer->onOpMessage(opCtx, kMsgObj);
            unitOfWork.commit();
        };
        writeConflictRetry(opCtx, "writeNoop", NamespaceString::kRsOplogNamespace.ns(), logNoop);
    }

    // Re-read the optime so that a no-op written above is reflected in the stored value.
    _lastKnownOpTime = coordinator->getMyLastAppliedOpTime();
    LOG(1) << "Set last known op time to " << _lastKnownOpTime;
}
/**
 * Builds the list of per-session oplog iterators from the session transactions table, writes a
 * "sessionMigrateCloneStart" message to the oplog and waits until that write is majority
 * committed.
 *
 * Throws (via uassertStatusOK) when waiting for the majority write concern fails.
 */
SessionCatalogMigrationSource::SessionCatalogMigrationSource(OperationContext* opCtx,
                                                             NamespaceString ns)
    : _ns(std::move(ns)), _rollbackIdAtInit(repl::ReplicationProcess::get(opCtx)->getRollbackID()) {
    // Exclude entries for transaction.
    Query sessionQuery;
    // Sorting is not required for correctness; it only makes deterministic tests easier to
    // write.
    sessionQuery.sort(BSON("_id" << 1));

    DBDirectClient directClient(opCtx);
    auto sessionCursor =
        directClient.query(NamespaceString::kSessionTransactionsTableNamespace, sessionQuery);

    while (sessionCursor->more()) {
        auto record = SessionTxnRecord::parse(IDLParserErrorContext("Session migration cloning"),
                                              sessionCursor->next());
        // Sessions whose last write optime is null get no iterator.
        if (record.getLastWriteOpTime().isNull()) {
            continue;
        }
        _sessionOplogIterators.push_back(
            stdx::make_unique<SessionOplogIterator>(std::move(record), _rollbackIdAtInit));
    }

    {
        AutoGetCollection oplogLock(opCtx, NamespaceString::kRsOplogNamespace, MODE_IX);
        const auto writeBarrier = [&] {
            const auto startMsg = BSON("sessionMigrateCloneStart" << _ns.ns());

            WriteUnitOfWork unitOfWork(opCtx);
            auto* const observer = opCtx->getClient()->getServiceContext()->getOpObserver();
            observer->onInternalOpMessage(opCtx, _ns, {}, {}, startMsg);
            unitOfWork.commit();
        };
        writeConflictRetry(opCtx,
                           "session migration initialization majority commit barrier",
                           NamespaceString::kRsOplogNamespace.ns(),
                           writeBarrier);
    }

    // Wait for this client's last op to become majority committed.
    const auto barrierOpTime = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
    WriteConcernResult wcResult;
    WriteConcernOptions majorityWc(
        WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, 0);
    uassertStatusOK(waitForWriteConcern(opCtx, barrierOpTime, majorityWc, &wcResult));
}
/**
 * Persists 'lastVote' as the singleton lastVote document and waits for the write to become
 * durable.
 *
 * An existing document is only replaced when 'lastVote' carries a strictly higher term. Returns
 * the parse error if the stored document cannot be read as a LastVote, or the status of any
 * DBException thrown along the way.
 */
Status ReplicationCoordinatorExternalStateImpl::storeLocalLastVoteDocument(
    OperationContext* opCtx, const LastVote& lastVote) {
    BSONObj newVoteObj = lastVote.toBSON();
    try {
        const auto saveVote = [&] {
            Lock::DBLock voteDbLock(opCtx, lastVoteDatabaseName, MODE_X);

            // Store the document if none exists yet; otherwise replace it only when the new
            // vote has a higher term. The term check and the write both happen under the
            // DBLock so that the two operations are synchronized.
            BSONObj storedObj;
            if (Helpers::getSingleton(opCtx, lastVoteCollectionName, storedObj)) {
                StatusWith<LastVote> storedVote = LastVote::readFromLastVote(storedObj);
                if (!storedVote.isOK()) {
                    return storedVote.getStatus();
                }
                if (!(lastVote.getTerm() > storedVote.getValue().getTerm())) {
                    // Stored vote is at least as recent; leave it untouched.
                    return Status::OK();
                }
            }

            Helpers::putSingleton(opCtx, lastVoteCollectionName, newVoteObj);
            return Status::OK();
        };
        Status saveStatus =
            writeConflictRetry(opCtx, "save replica set lastVote", lastVoteCollectionName, saveVote);

        if (saveStatus.isOK()) {
            opCtx->recoveryUnit()->waitUntilDurable();
        }

        return saveStatus;
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
}
/**
 * Deletes documents of 'collection' whose shard key falls inside 'range', one document per
 * write unit of work, and returns how many deletions were counted.
 *
 * @param opCtx        operation context used for all reads and writes.
 * @param collection   collection to delete from; must not be null.
 * @param keyPattern   shard key pattern used to locate a shard-key-prefixed index.
 * @param range        chunk range to remove; its bounds are extended to the chosen index's key
 *                     pattern and scanned with the start key included and the end key excluded.
 * @param maxToDelete  the loop stops once the count reaches this value. Because the loop is a
 *                     do/while, one deletion is attempted even when this is <= 1.
 * @return the number of counted deletions, or InternalError when no suitable index is found or
 *         the index can no longer be found by name.
 */
StatusWith<int> CollectionRangeDeleter::_doDeletion(OperationContext* opCtx,
                                                    Collection* collection,
                                                    BSONObj const& keyPattern,
                                                    ChunkRange const& range,
                                                    int maxToDelete) {
    invariant(collection != nullptr);
    invariant(!isEmpty());

    auto const& nss = collection->ns();

    // The IndexChunk has a keyPattern that may apply to more than one index - we need to
    // select the index and get the full index keyPattern here.
    auto catalog = collection->getIndexCatalog();
    const IndexDescriptor* idx = catalog->findShardKeyPrefixedIndex(opCtx, keyPattern, false);
    if (!idx) {
        std::string msg = str::stream() << "Unable to find shard key index for "
                                        << keyPattern.toString() << " in " << nss.ns();
        LOG(0) << msg;
        return {ErrorCodes::InternalError, msg};
    }

    // Extend bounds to match the index we found
    const KeyPattern indexKeyPattern(idx->keyPattern());
    const auto extend = [&](const auto& key) {
        return Helpers::toKeyFormat(indexKeyPattern.extendRangeBound(key, false));
    };

    const auto min = extend(range.getMin());
    const auto max = extend(range.getMax());

    LOG(1) << "begin removal of " << min << " to " << max << " in " << nss.ns();

    // Look the index up again by name to obtain a non-const descriptor for the scan below.
    const auto indexName = idx->indexName();
    IndexDescriptor* descriptor = collection->getIndexCatalog()->findIndexByName(opCtx, indexName);
    if (!descriptor) {
        std::string msg = str::stream() << "shard key index with name " << indexName << " on '"
                                        << nss.ns() << "' was dropped";
        LOG(0) << msg;
        return {ErrorCodes::InternalError, msg};
    }

    // With moveParanoia enabled, every document is handed to the saver before it is deleted.
    boost::optional<Helpers::RemoveSaver> saver;
    if (serverGlobalParams.moveParanoia) {
        saver.emplace("moveChunk", nss.ns(), "cleaning");
    }

    auto halfOpen = BoundInclusion::kIncludeStartKeyOnly;
    auto manual = PlanExecutor::YIELD_MANUAL;
    auto forward = InternalPlanner::FORWARD;
    auto fetch = InternalPlanner::IXSCAN_FETCH;

    auto exec = InternalPlanner::indexScan(
        opCtx, collection, descriptor, min, max, halfOpen, manual, forward, fetch);

    int numDeleted = 0;
    do {
        RecordId rloc;
        BSONObj obj;
        PlanExecutor::ExecState state = exec->getNext(&obj, &rloc);
        if (state == PlanExecutor::IS_EOF) {
            break;
        }
        // A failed or dead executor ends the batch; what was counted so far is still returned.
        if (state == PlanExecutor::FAILURE || state == PlanExecutor::DEAD) {
            warning() << PlanExecutor::statestr(state) << " - cursor error while trying to delete "
                      << redact(min) << " to " << redact(max) << " in " << nss << ": "
                      << redact(WorkingSetCommon::toStatusString(obj))
                      << ", stats: " << Explain::getWinningPlanStats(exec.get());
            break;
        }
        invariant(PlanExecutor::ADVANCED == state);

        // The executor state is saved before the write and restored after it.
        exec->saveState();

        writeConflictRetry(opCtx, "delete range", nss.ns(), [&] {
            WriteUnitOfWork wuow(opCtx);
            if (saver) {
                uassertStatusOK(saver->goingToDelete(obj));
            }
            collection->deleteDocument(opCtx, kUninitializedStmtId, rloc, nullptr, true);
            wuow.commit();
        });

        // NOTE(review): if restoreState() throws, the loop exits before the statistic and
        // numDeleted are incremented, so the document deleted just above is not reflected in
        // the returned count. Confirm callers tolerate this undercount.
        try {
            exec->restoreState();
        } catch (const DBException& ex) {
            warning() << "error restoring cursor state while trying to delete " << redact(min)
                      << " to " << redact(max) << " in " << nss
                      << ", stats: " << Explain::getWinningPlanStats(exec.get()) << ": "
                      << redact(ex.toStatus());
            break;
        }
        ShardingStatistics::get(opCtx).countDocsDeletedOnDonor.addAndFetch(1);

    } while (++numDeleted < maxToDelete);

    return numDeleted;
}
Exemple #15
0
/**
 * Applies a "create collection" command coming from applyOps / oplog replay.
 *
 * When a UUID element 'ui' is supplied, the function first reconciles the catalog with it
 * inside a write-conflict retry loop:
 *   - if the UUID already maps to the requested name, nothing is done and OK is returned;
 *   - if a different collection currently occupies the requested name, it is renamed to a
 *     generated temporary name;
 *   - if a collection with the UUID exists under another name, it is renamed into place and OK
 *     is returned;
 *   - otherwise the UUID is added to the command and creation proceeds.
 * Without a UUID, or after the last case above, createCollection() is invoked.
 *
 * The caller must hold the lock on 'dbName' in MODE_X (asserted).
 *
 * @param opCtx    operation context.
 * @param dbName   database in which the collection is created.
 * @param ui       UUID element from the oplog entry; may be EOO (not ok()).
 * @param cmdObj   the create command object.
 * @param idIndex  _id index specification forwarded to createCollection().
 * @return the status of the rename/creation; invalid UUIDs are reported through uassert.
 */
Status createCollectionForApplyOps(OperationContext* opCtx,
                                   const std::string& dbName,
                                   const BSONElement& ui,
                                   const BSONObj& cmdObj,
                                   const BSONObj& idIndex) {
    invariant(opCtx->lockState()->isDbLockedForMode(dbName, MODE_X));
    auto db = dbHolder().get(opCtx, dbName);
    const NamespaceString newCollName(Command::parseNsCollectionRequired(dbName, cmdObj));
    auto newCmd = cmdObj;

    // If a UUID is given, see if we need to rename a collection out of the way, and whether the
    // collection already exists under a different name. If so, rename it into place. As this is
    // done during replay of the oplog, the operations do not need to be atomic, just idempotent.
    // We need to do the renaming part in a separate transaction, as we cannot transactionally
    // create a database on MMAPv1, which could result in createCollection failing if the database
    // does not yet exist.
    if (ui.ok()) {
        // Return an optional, indicating whether we need to early return (if the collection already
        // exists, or in case of an error).
        using Result = boost::optional<Status>;
        auto result =
            writeConflictRetry(opCtx, "createCollectionForApplyOps", newCollName.ns(), [&] {
                // Any return below that is not preceded by wunit.commit() leaves this unit of
                // work uncommitted.
                WriteUnitOfWork wunit(opCtx);
                // Options need the field to be named "uuid", so parse/recreate.
                auto uuid = uassertStatusOK(UUID::parse(ui));
                uassert(ErrorCodes::InvalidUUID,
                        "Invalid UUID in applyOps create command: " + uuid.toString(),
                        uuid.isRFC4122v4());

                auto& catalog = UUIDCatalog::get(opCtx);
                auto currentName = catalog.lookupNSSByUUID(uuid);
                OpObserver* opObserver = getGlobalServiceContext()->getOpObserver();
                // The UUID is already registered under the requested name: nothing to do.
                if (currentName == newCollName)
                    return Result(Status::OK());

                // In the case of oplog replay, a future command may have created or renamed a
                // collection with that same name. In that case, renaming this future collection to
                // a random temporary name is correct: once all entries are replayed no temporary
                // names will remain.  On MMAPv1 the rename can result in index names that are too
                // long. However this should only happen for initial sync and "resync collection"
                // for rollback, so we can let the error propagate resulting in an abort and restart
                // of the initial sync or result in rollback to fassert, requiring a resync of that
                // node.
                const bool stayTemp = true;
                if (auto futureColl = db ? db->getCollection(opCtx, newCollName) : nullptr) {
                    auto tmpNameResult = db->makeUniqueCollectionNamespace(opCtx, "tmp%%%%%");
                    if (!tmpNameResult.isOK()) {
                        return Result(Status(tmpNameResult.getStatus().code(),
                                             str::stream() << "Cannot generate temporary "
                                                              "collection namespace for applyOps "
                                                              "create command: collection: "
                                                           << newCollName.ns()
                                                           << ". error: "
                                                           << tmpNameResult.getStatus().reason()));
                    }
                    const auto& tmpName = tmpNameResult.getValue();
                    Status status =
                        db->renameCollection(opCtx, newCollName.ns(), tmpName.ns(), stayTemp);
                    if (!status.isOK())
                        return Result(status);
                    // Tell the op observer about the rename out of the way.
                    opObserver->onRenameCollection(opCtx,
                                                   newCollName,
                                                   tmpName,
                                                   futureColl->uuid(),
                                                   /*dropTarget*/ false,
                                                   /*dropTargetUUID*/ {},
                                                   stayTemp);
                }

                // If the collection with the requested UUID already exists, but with a different
                // name, just rename it to 'newCollName'.
                // NOTE(review): 'db' is dereferenced here without the null check used above;
                // confirm a catalog hit implies the database exists.
                if (catalog.lookupCollectionByUUID(uuid)) {
                    Status status =
                        db->renameCollection(opCtx, currentName.ns(), newCollName.ns(), stayTemp);
                    if (!status.isOK())
                        return Result(status);
                    opObserver->onRenameCollection(opCtx,
                                                   currentName,
                                                   newCollName,
                                                   uuid,
                                                   /*dropTarget*/ false,
                                                   /*dropTargetUUID*/ {},
                                                   stayTemp);

                    wunit.commit();
                    return Result(Status::OK());
                }

                // A new collection with the specific UUID must be created, so add the UUID to the
                // creation options. Regular user collection creation commands cannot do this.
                auto uuidObj = uuid.toBSON();
                newCmd = cmdObj.addField(uuidObj.firstElement());
                wunit.commit();

                // boost::none tells the caller to fall through to createCollection() below.
                return Result(boost::none);
            });

        if (result) {
            return *result;
        }
    }

    return createCollection(
        opCtx, newCollName, newCmd, idIndex, CollectionOptions::parseForStorage);
}
Exemple #16
0
/**
 * Drops the collection or view named 'collectionName', retrying on write conflict.
 *
 * On success 'result' receives an "ns" field (unless it already has one) and, for collections,
 * an "nIndexesWas" field holding the index count before the drop.
 *
 * @param opCtx                 operation context.
 * @param collectionName        namespace of the collection or view to drop.
 * @param result                builder that receives the reply fields.
 * @param dropOpTime            optime forwarded to the database-level drop call.
 * @param systemCollectionMode  selects dropCollection() versus dropCollectionEvenIfSystem().
 * @return NamespaceNotFound when neither a collection nor a view exists, NotMaster when
 *         replicated writes are not accepted for the namespace, otherwise the drop's status.
 */
Status dropCollection(OperationContext* opCtx,
                      const NamespaceString& collectionName,
                      BSONObjBuilder& result,
                      const repl::OpTime& dropOpTime,
                      DropCollectionSystemCollectionMode systemCollectionMode) {
    if (!serverGlobalParams.quiet.load()) {
        log() << "CMD: drop " << collectionName;
    }

    const auto doDrop = [&] {
        AutoGetDb dbLock(opCtx, collectionName.db(), MODE_X);
        Database* const db = dbLock.getDb();
        Collection* collection = db ? db->getCollection(opCtx, collectionName) : nullptr;
        // A view is only looked up when the database exists and no collection has this name.
        auto view = db && !collection
            ? db->getViewCatalog()->lookup(opCtx, collectionName.ns())
            : nullptr;

        if (MONGO_FAIL_POINT(hangDuringDropCollection)) {
            log() << "hangDuringDropCollection fail point enabled. Blocking until fail point is "
                     "disabled.";
            MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangDuringDropCollection);
        }

        if (!db || (!collection && !view)) {
            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        const bool checkShardVersion = true;
        OldClientContext clientCtx(opCtx, collectionName.ns(), checkShardVersion);

        const bool notPrimaryForUserWrite = opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, collectionName);
        if (notPrimaryForUserWrite) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping collection "
                                        << collectionName);
        }

        WriteUnitOfWork unitOfWork(opCtx);
        // The guard keeps a retried attempt from appending "ns" a second time.
        if (!result.hasField("ns")) {
            result.append("ns", collectionName.ns());
        }

        if (!collection) {
            invariant(view);
            Status viewStatus = db->dropView(opCtx, collectionName.ns());
            if (!viewStatus.isOK()) {
                return viewStatus;
            }
        } else {
            invariant(!view);
            // Capture the index count before the collection goes away.
            int indexCount = collection->getIndexCatalog()->numIndexesTotal(opCtx);

            BackgroundOperation::assertNoBgOpInProgForNs(collectionName.ns());

            const bool systemDropsDisallowed = systemCollectionMode ==
                DropCollectionSystemCollectionMode::kDisallowSystemCollectionDrops;
            Status dropStatus = systemDropsDisallowed
                ? db->dropCollection(opCtx, collectionName.ns(), dropOpTime)
                : db->dropCollectionEvenIfSystem(opCtx, collectionName, dropOpTime);

            if (!dropStatus.isOK()) {
                return dropStatus;
            }

            result.append("nIndexesWas", indexCount);
        }
        unitOfWork.commit();

        return Status::OK();
    };

    return writeConflictRetry(opCtx, "drop", collectionName.ns(), doDrop);
}
Exemple #17
0
// Drops the database 'dbName' along with its collections.
//
// The work is done in up to three stages:
//   1. Under the global write lock, mark the database drop-pending and drop every eligible
//      collection, each in its own WriteUnitOfWork. If there is nothing to wait for, the drop is
//      completed immediately through _finishDropDatabase().
//   2. With any held locks temporarily released, wait for the collection drops to replicate.
//   3. Re-acquire the global write lock and complete the drop through _finishDropDatabase().
//
// Returns:
//   - NamespaceNotFound if the database does not exist, either up front or after the wait.
//   - NotMaster if writes are replicated and this node cannot accept writes for 'dbName'.
//   - The replication wait's error code, with added context, if waiting fails.
//   - Otherwise whatever _finishDropDatabase() returns.
// uasserts with IllegalOperation if storage is in read-only mode.
Status dropDatabase(OperationContext* opCtx, const std::string& dbName) {
    uassert(ErrorCodes::IllegalOperation,
            "Cannot drop a database in read-only mode",
            !storageGlobalParams.readOnly);
    // TODO (Kal): OldClientContext legacy, needs to be removed
    {
        // Record the target database on the current operation; the namespace is set while
        // holding the Client lock.
        CurOp::get(opCtx)->ensureStarted();
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        CurOp::get(opCtx)->setNS_inlock(dbName);
    }

    auto replCoord = repl::ReplicationCoordinator::get(opCtx);
    // Number of collections dropped by this call (incremented in the first stage below).
    // NOTE(review): this counter and 'latestDropPendingOpTime' are captured by reference and are
    // not reset inside the first lambda - confirm that is intended if the lambda is re-run.
    std::size_t numCollectionsToDrop = 0;

    // We have to wait for the last drop-pending collection to be removed if there are no
    // collections to drop.
    repl::OpTime latestDropPendingOpTime;

    using Result = boost::optional<Status>;
    // Get an optional result--if it's there, early return; otherwise, wait for collections to drop.
    auto result = writeConflictRetry(opCtx, "dropDatabase_collection", dbName, [&] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        Database* const db = autoDB.getDb();
        if (!db) {
            return Result(Status(ErrorCodes::NamespaceNotFound,
                                 str::stream() << "Could not drop database " << dbName
                                               << " because it does not exist"));
        }

        // Only refuse when the writes would be replicated; unreplicated writes skip this check.
        bool userInitiatedWritesAndNotPrimary =
            opCtx->writesAreReplicated() && !replCoord->canAcceptWritesForDatabase(opCtx, dbName);

        if (userInitiatedWritesAndNotPrimary) {
            return Result(
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while dropping database " << dbName));
        }

        log() << "dropDatabase " << dbName << " - starting";
        db->setDropPending(opCtx, true);

        // If Database::dropCollectionEvenIfSystem() fails, we should reset the drop-pending state
        // on Database.
        auto dropPendingGuard = MakeGuard([&db, opCtx] { db->setDropPending(opCtx, false); });

        for (auto collection : *db) {
            const auto& nss = collection->ns();
            // Collections already in a drop-pending namespace are not dropped again; only the
            // most recent of their drop optimes is remembered so it can be waited on later.
            if (nss.isDropPendingNamespace() && replCoord->isReplEnabled() &&
                opCtx->writesAreReplicated()) {
                log() << "dropDatabase " << dbName << " - found drop-pending collection: " << nss;
                latestDropPendingOpTime = std::max(
                    latestDropPendingOpTime, uassertStatusOK(nss.getDropPendingNamespaceOpTime()));
                continue;
            }
            // Collections with the oplog disabled, and system.indexes, are skipped here.
            if (replCoord->isOplogDisabledFor(opCtx, nss) || nss.isSystemDotIndexes()) {
                continue;
            }
            log() << "dropDatabase " << dbName << " - dropping collection: " << nss;
            // Each collection is dropped and committed in its own unit of work; a failed drop
            // is fatal (fassert 40476).
            WriteUnitOfWork wunit(opCtx);
            fassertStatusOK(40476, db->dropCollectionEvenIfSystem(opCtx, nss));
            wunit.commit();
            numCollectionsToDrop++;
        }
        // All collection drops succeeded, so the database stays drop-pending from here on.
        dropPendingGuard.Dismiss();

        // If there are no collection drops to wait for, we complete the drop database operation.
        if (numCollectionsToDrop == 0U && latestDropPendingOpTime.isNull()) {
            return Result(_finishDropDatabase(opCtx, dbName, db));
        }

        // An empty result tells the caller below to go on and wait for replication.
        return Result(boost::none);
    });

    if (result) {
        return *result;
    }

    // If waitForWriteConcern() returns an error or throws an exception, we should reset the
    // drop-pending state on Database.
    auto dropPendingGuardWhileAwaitingReplication = MakeGuard([dbName, opCtx] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        if (auto db = autoDB.getDb()) {
            db->setDropPending(opCtx, false);
        }
    });

    {
        // Holding of any locks is disallowed while awaiting replication because this can
        // potentially block for long time while doing network activity.
        //
        // Even though dropDatabase() does not explicitly acquire any locks before awaiting
        // replication, it is possible that the caller of this function may already have acquired
        // a lock. The applyOps command is an example of a dropDatabase() caller that does this.
        // Therefore, we have to release any locks using a TempRelease RAII object.
        //
        // TODO: Remove the use of this TempRelease object when SERVER-29802 is completed.
        // The work in SERVER-29802 will adjust the locking rules around applyOps operations and
        // dropDatabase is expected to be one of the operations where we expect to no longer acquire
        // the global lock.
        Lock::TempRelease release(opCtx->lockState());

        if (numCollectionsToDrop > 0U) {
            // This call dropped collections itself, so wait on this client's last operation.
            auto status =
                replCoord->awaitReplicationOfLastOpForClient(opCtx, kDropDatabaseWriteConcern)
                    .status;
            if (!status.isOK()) {
                return Status(status.code(),
                              str::stream() << "dropDatabase " << dbName << " failed waiting for "
                                            << numCollectionsToDrop
                                            << " collection drops to replicate: "
                                            << status.reason());
            }

            log() << "dropDatabase " << dbName << " - successfully dropped " << numCollectionsToDrop
                  << " collections. dropping database";
        } else {
            // Nothing was dropped by this call, so the only thing left to wait for is the most
            // recent drop-pending collection found earlier.
            invariant(!latestDropPendingOpTime.isNull());
            auto status =
                replCoord
                    ->awaitReplication(opCtx, latestDropPendingOpTime, kDropDatabaseWriteConcern)
                    .status;
            if (!status.isOK()) {
                return Status(
                    status.code(),
                    str::stream()
                        << "dropDatabase "
                        << dbName
                        << " failed waiting for pending collection drops (most recent drop optime: "
                        << latestDropPendingOpTime.toString()
                        << ") to replicate: "
                        << status.reason());
            }

            log() << "dropDatabase " << dbName
                  << " - pending collection drops completed. dropping database";
        }
    }

    // The replication wait succeeded, so the drop-pending state must no longer be rolled back.
    dropPendingGuardWhileAwaitingReplication.Dismiss();

    // Final stage: look the database up again under the global write lock, since locks were
    // released during the wait above, and complete the drop.
    return writeConflictRetry(opCtx, "dropDatabase_database", dbName, [&] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        if (auto db = autoDB.getDb()) {
            return _finishDropDatabase(opCtx, dbName, db);
        }

        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "Could not drop database " << dbName
                                    << " because it does not exist after dropping "
                                    << numCollectionsToDrop
                                    << " collection(s).");
    });
}
// Feeds the sorted keys accumulated in 'bulk' into a bulk builder for this index and commits it.
//
//   mayInterrupt    - when true, the operation is checked for interruption before each key and
//                     the flag is forwarded to the builder's commit().
//   dupsAllowed     - forwarded to getBulkBuilder(); a DuplicateKey error is not expected when
//                     this is true (enforced by an invariant).
//   dupsToDrop      - optional; when non-null, the RecordId of every duplicate key is inserted
//                     here and the key is skipped instead of failing the build.
//   assignTimestamp - when true, each write is timestamped with the current cluster time.
//
// Returns the first addKey() error that cannot be skipped, otherwise Status::OK().
Status IndexAccessMethod::commitBulk(OperationContext* opCtx,
                                     std::unique_ptr<BulkBuilder> bulk,
                                     bool mayInterrupt,
                                     bool dupsAllowed,
                                     set<RecordId>* dupsToDrop,
                                     bool assignTimestamp) {
    // Multikey path info is not tracked for index builds. Tracking is switched back on when
    // this function exits, but only if it was on when we entered.
    ScopeGuard resumeTracking =
        MakeGuard([opCtx] { MultikeyPathTracker::get(opCtx).startTrackingMultikeyPathInfo(); });
    const bool wasTracking = MultikeyPathTracker::get(opCtx).isTrackingMultikeyPathInfo();
    if (!wasTracking) {
        resumeTracking.Dismiss();
    }
    MultikeyPathTracker::get(opCtx).stopTrackingMultikeyPathInfo();

    Timer elapsed;

    std::unique_ptr<BulkBuilder::Sorter::Iterator> sortedKeys(bulk->_sorter->done());

    // The progress message is installed under the Client lock; the meter itself outlives it.
    stdx::unique_lock<Client> clientLock(*opCtx->getClient());
    ProgressMeterHolder progress(
        CurOp::get(opCtx)->setMessage_inlock("Index Bulk Build: (2/3) btree bottom up",
                                             "Index: (2/3) BTree Bottom Up Progress",
                                             bulk->_keysInserted,
                                             10));
    clientLock.unlock();

    std::unique_ptr<SortedDataBuilderInterface> builder;

    // Set the multikey flag if needed and create the bulk builder in one unit of work.
    writeConflictRetry(opCtx, "setting index multikey flag", "", [&] {
        WriteUnitOfWork setupUnit(opCtx);

        const bool indexIsMultikey =
            bulk->_everGeneratedMultipleKeys || isMultikeyFromPaths(bulk->_indexMultikeyPaths);
        if (indexIsMultikey) {
            _btreeState->setMultikey(opCtx, bulk->_indexMultikeyPaths);
        }

        builder.reset(_newInterface->getBulkBuilder(opCtx, dupsAllowed));
        if (assignTimestamp) {
            fassertStatusOK(50705,
                            opCtx->recoveryUnit()->setTimestamp(
                                LogicalClock::get(opCtx)->getClusterTime().asTimestamp()));
        }
        setupUnit.commit();
    });

    while (sortedKeys->more()) {
        if (mayInterrupt) {
            opCtx->checkForInterrupt();
        }

        WriteUnitOfWork keyUnit(opCtx);
        // Rollback tracking is turned off while building the btree so that written bytes are
        // not copied into a buffer that only exists for rollback. This is safe because an
        // in-progress index build is cleaned up as a whole by the index system.
        opCtx->recoveryUnit()->setRollbackWritesDisabled();

        // Pull the next sorted entry and hand it to the builder.
        BulkBuilder::Sorter::Data entry = sortedKeys->next();
        Status addStatus = builder->addKey(entry.first, entry.second);

        if (addStatus.isOK()) {
            // The key was added; count it and commit this key's unit of work.
            progress.hit();
            if (assignTimestamp) {
                fassertStatusOK(50704,
                                opCtx->recoveryUnit()->setTimestamp(
                                    LogicalClock::get(opCtx)->getClusterTime().asTimestamp()));
            }
            keyUnit.commit();
            continue;
        }

        // An overlong key may be skipped when the server is configured to ignore it.
        if (addStatus.code() == ErrorCodes::KeyTooLong && ignoreKeyTooLong(opCtx)) {
            continue;
        }

        // A duplicate may be skipped if the caller asked to collect duplicates.
        if (addStatus.code() == ErrorCodes::DuplicateKey) {
            invariant(!dupsAllowed);  // DupKey errors must not occur when dups are allowed.

            if (dupsToDrop) {
                dupsToDrop->insert(entry.second);
                continue;
            }
        }

        return addStatus;
    }

    progress.finished();

    {
        stdx::lock_guard<Client> clientGuard(*opCtx->getClient());
        CurOp::get(opCtx)->setMessage_inlock("Index Bulk Build: (3/3) btree-middle",
                                             "Index: (3/3) BTree Middle Progress");
    }

    LOG(elapsed.seconds() > 10 ? 0 : 1) << "\t done building bottom layer, going to commit";

    // When timestamps are requested, keep a TimestampBlock alive across the builder's commit.
    std::unique_ptr<TimestampBlock> commitTimestamp;
    if (assignTimestamp) {
        commitTimestamp = stdx::make_unique<TimestampBlock>(
            opCtx, LogicalClock::get(opCtx)->getClusterTime().asTimestamp());
    }
    builder->commit(mayInterrupt);
    return Status::OK();
}
// Applies the writes buffered in the side writes table to the index, in batches.
//
// Records are read through a cursor on the side writes table and grouped into batches of at most
// 1000 documents and at most BSONObjMaxInternalSize bytes. Each batch is applied with
// _applyWrite() and removed from the side writes table inside a single WriteUnitOfWork, which is
// retried through writeConflictRetry().
//
//   options    - forwarded to _applyWrite() for every buffered operation.
//   readSource - when not kUnset, the recovery unit is switched to this read source for the
//                duration of the drain and restored to its previous value on exit.
//
// Must not be called inside a WriteUnitOfWork (enforced by an invariant). Returns the first
// non-OK status produced by _applyWrite(), otherwise Status::OK().
Status IndexBuildInterceptor::drainWritesIntoIndex(OperationContext* opCtx,
                                                   const InsertDeleteOptions& options,
                                                   RecoveryUnit::ReadSource readSource) {
    invariant(!opCtx->lockState()->inAWriteUnitOfWork());

    // Callers may request to read at a specific timestamp so that no drained writes are timestamped
    // earlier than their original write timestamp. Also ensure that leaving this function resets
    // the ReadSource to its original value.
    auto resetReadSourceGuard =
        makeGuard([opCtx, prevReadSource = opCtx->recoveryUnit()->getTimestampReadSource()] {
            opCtx->recoveryUnit()->abandonSnapshot();
            opCtx->recoveryUnit()->setTimestampReadSource(prevReadSource);
        });

    if (readSource != RecoveryUnit::ReadSource::kUnset) {
        opCtx->recoveryUnit()->abandonSnapshot();
        opCtx->recoveryUnit()->setTimestampReadSource(readSource);
    } else {
        // The read source is left untouched, so there is nothing to restore on exit.
        resetReadSourceGuard.dismiss();
    }

    // These are used for logging only.
    int64_t totalDeleted = 0;
    int64_t totalInserted = 0;
    Timer timer;

    const int64_t appliedAtStart = _numApplied;

    // Set up the progress meter. This will never be completely accurate, because more writes can be
    // read from the side writes table than are observed before draining.
    static const char* curopMessage = "Index Build: draining writes received during build";
    ProgressMeterHolder progress;
    {
        stdx::unique_lock<Client> lk(*opCtx->getClient());
        progress.set(CurOp::get(opCtx)->setProgress_inlock(curopMessage));
    }

    // Force the progress meter to log at the end of every batch. By default, the progress meter
    // only logs after a large number of calls to hit(), but since we batch inserts by up to
    // 1000 records, progress would rarely be displayed.
    progress->reset(_sideWritesCounter.load() - appliedAtStart /* total */,
                    3 /* secondsBetween */,
                    1 /* checkInterval */);

    // Buffer operations into batches to insert per WriteUnitOfWork. Impose an upper limit on the
    // number of documents and the total size of the batch.
    const int32_t kBatchMaxSize = 1000;
    const int64_t kBatchMaxBytes = BSONObjMaxInternalSize;

    int64_t batchSizeBytes = 0;

    std::vector<SideWriteRecord> batch;
    batch.reserve(kBatchMaxSize);

    // Hold on to documents that would exceed the per-batch memory limit. Always insert this first
    // into the next batch.
    boost::optional<SideWriteRecord> stashed;

    auto cursor = _sideWritesTable->rs()->getCursor(opCtx);

    bool atEof = false;
    while (!atEof) {
        opCtx->checkForInterrupt();

        // Stashed records should be inserted into a batch first.
        if (stashed) {
            invariant(batch.empty());
            // The stashed document counts toward this batch's byte budget; without this, the
            // batch could exceed kBatchMaxBytes by the size of the stashed document.
            batchSizeBytes += stashed->second.objsize();
            batch.push_back(std::move(stashed.get()));
            stashed.reset();
        }

        auto record = cursor->next();

        if (record) {
            RecordId currentRecordId = record->id;
            BSONObj docOut = record->data.toBson().getOwned();

            // If the total batch size in bytes would be too large, stash this document and let the
            // current batch insert.
            int objSize = docOut.objsize();
            if (batchSizeBytes + objSize > kBatchMaxBytes) {
                invariant(!stashed);

                // Stash this document to be inserted in the next batch.
                stashed.emplace(currentRecordId, std::move(docOut));
            } else {
                batchSizeBytes += objSize;
                batch.emplace_back(currentRecordId, std::move(docOut));

                // Continue if there is more room in the batch.
                if (batch.size() < kBatchMaxSize) {
                    continue;
                }
            }
        } else {
            atEof = true;
            if (batch.empty())
                break;
        }

        invariant(!batch.empty());

        // Save the cursor position before writing; it is restored after the batch is applied.
        cursor->save();

        // If we are here, either we have reached the end of the table or the batch is full, so
        // insert everything in one WriteUnitOfWork, and delete each inserted document from the side
        // writes table.
        auto status = writeConflictRetry(opCtx, "index build drain", _indexCatalogEntry->ns(), [&] {
            // Count per attempt and fold into the totals only after a successful commit, so that
            // a retried attempt is not counted twice in the log line below.
            // NOTE(review): assumes _applyWrite() increments these counters - confirm.
            int64_t batchInserted = 0;
            int64_t batchDeleted = 0;

            WriteUnitOfWork wuow(opCtx);
            for (auto& operation : batch) {
                auto applyStatus =
                    _applyWrite(opCtx, operation.second, options, &batchInserted, &batchDeleted);
                if (!applyStatus.isOK()) {
                    return applyStatus;
                }

                // Delete the document from the table as soon as it has been inserted into the
                // index. This ensures that no key is ever inserted twice and no keys are skipped.
                _sideWritesTable->rs()->deleteRecord(opCtx, operation.first);
            }

            // For rollback to work correctly, these writes need to be timestamped. The actual time
            // is not important, as long as it not older than the most recent visible side write.
            IndexTimestampHelper::setGhostCommitTimestampForWrite(
                opCtx, NamespaceString(_indexCatalogEntry->ns()));

            wuow.commit();

            totalInserted += batchInserted;
            totalDeleted += batchDeleted;
            return Status::OK();
        });
        if (!status.isOK()) {
            return status;
        }

        progress->hit(batch.size());

        // Lock yielding will only happen if we are holding intent locks.
        _tryYield(opCtx);
        cursor->restore();

        // Account for more writes coming in during a batch.
        progress->setTotalWhileRunning(_sideWritesCounter.loadRelaxed() - appliedAtStart);

        _numApplied += batch.size();
        batch.clear();
        batchSizeBytes = 0;
    }

    progress->finished();

    // Log at the default level only when this drain actually applied something.
    int logLevel = (_numApplied - appliedAtStart > 0) ? 0 : 1;
    LOG(logLevel) << "index build: drain applied " << (_numApplied - appliedAtStart)
                  << " side writes (inserted: " << totalInserted << ", deleted: " << totalDeleted
                  << ") for '" << _indexCatalogEntry->descriptor()->indexName() << "' in "
                  << timer.millis() << " ms";

    return Status::OK();
}
Exemple #20
0
// Applies the operations described by 'applyOpCmd' to database 'dbName'.
//
// When the command allows atomic application and contains only CRUD operations, all operations
// are applied inside a single WriteUnitOfWork with replication suppressed, and one collective
// oplog entry is generated through the op observer. Otherwise the operations are handed straight
// to _applyOps().
//
// 'result' receives the command reply fields. On a failed atomic attempt it receives "applied",
// "code", "codeName", "errmsg" and "results", and UnknownError is returned. A NamespaceNotFound
// failure of the atomic attempt falls back to non-atomic application instead.
Status applyOps(OperationContext* opCtx,
                const std::string& dbName,
                const BSONObj& applyOpCmd,
                BSONObjBuilder* result) {
    // "allowAtomic" is optional in the command and defaults to true.
    bool allowAtomic = false;
    uassertStatusOK(
        bsonExtractBooleanFieldWithDefault(applyOpCmd, "allowAtomic", true, &allowAtomic));

    auto crudOnly = _areOpsCrudOnly(applyOpCmd);
    auto runAtomically = allowAtomic && crudOnly;
    auto preconditionPresent = _hasPrecondition(applyOpCmd);

    boost::optional<Lock::GlobalWrite> globalWriteLock;
    boost::optional<Lock::DBLock> dbWriteLock;

    // The database lock is sufficient in exactly one situation: no preconditions, CRUD-only
    // operations, and non-atomic mode. Every other combination takes the global lock.
    const bool dbLockSuffices = !preconditionPresent && crudOnly && !allowAtomic;
    if (dbLockSuffices) {
        dbWriteLock.emplace(opCtx, dbName, MODE_X);
    } else {
        globalWriteLock.emplace(opCtx);
    }

    // Replicated writes are refused when this node cannot accept writes for the database.
    if (opCtx->writesAreReplicated() &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(opCtx, dbName)) {
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while applying ops to database " << dbName);
    }

    if (preconditionPresent) {
        auto preconditionStatus = _checkPrecondition(opCtx, applyOpCmd, result);
        if (!preconditionStatus.isOK()) {
            return preconditionStatus;
        }
    }

    int numApplied = 0;
    if (!runAtomically) {
        return _applyOps(opCtx, dbName, applyOpCmd, result, &numApplied);
    }

    // Atomic path: everything below runs under the global write lock.
    invariant(globalWriteLock);
    try {
        writeConflictRetry(opCtx, "applyOps", dbName, [&] {
            // Results are gathered per attempt and copied to 'result' only after the commit.
            BSONObjBuilder attemptResult;
            WriteUnitOfWork wuow(opCtx);
            numApplied = 0;
            {
                // Replication of the individual operations is suppressed; the whole applyOps
                // is logged collectively below.
                repl::UnreplicatedWritesBlock suppressReplication(opCtx);
                uassertStatusOK(
                    _applyOps(opCtx, dbName, applyOpCmd, &attemptResult, &numApplied));
            }

            if (opCtx->writesAreReplicated()) {
                // The oplog entry is the original command minus the precondition and the
                // bypassDocumentValidation field, so that secondaries apply it atomically
                // without re-checking the precondition.
                BSONObjBuilder oplogCmdBuilder;
                for (auto field : applyOpCmd) {
                    auto fieldName = field.fieldNameStringData();
                    const bool omitFromOplog = fieldName == kPreconditionFieldName ||
                        fieldName == "bypassDocumentValidation";
                    if (!omitFromOplog) {
                        oplogCmdBuilder.append(field);
                    }
                }

                const BSONObj oplogCmd = oplogCmdBuilder.done();

                auto observer = getGlobalServiceContext()->getOpObserver();
                invariant(observer);
                observer->onApplyOps(opCtx, dbName, oplogCmd);
            }

            wuow.commit();
            result->appendElements(attemptResult.obj());
        });
    } catch (const DBException& ex) {
        if (ex.getCode() == ErrorCodes::NamespaceNotFound) {
            // Fall back to non-atomic mode, since MMAP cannot implicitly create a new database
            // within an active WriteUnitOfWork.
            return _applyOps(opCtx, dbName, applyOpCmd, result, &numApplied);
        }

        // Report the failing operation plus everything applied before it as unsuccessful.
        ++numApplied;
        BSONArrayBuilder failedResults;
        for (int remaining = numApplied; remaining > 0; --remaining) {
            failedResults.append(false);
        }

        result->append("applied", numApplied);
        result->append("code", ex.getCode());
        result->append("codeName", ErrorCodes::errorString(ErrorCodes::fromInt(ex.getCode())));
        result->append("errmsg", ex.what());
        result->append("results", failedResults.arr());
        return Status(ErrorCodes::UnknownError, ex.what());
    }

    return Status::OK();
}