ChunkVersion forceShardFilteringMetadataRefresh(OperationContext* opCtx,
                                                const NamespaceString& nss,
                                                bool forceRefreshFromThisThread) {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    const auto routingInfo =
        uassertStatusOK(Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh(
            opCtx, nss, forceRefreshFromThisThread));
    const auto cm = routingInfo.cm();

    if (!cm) {
        // No chunk manager, so unsharded.

        // Exclusive collection lock needed since we're now changing the metadata
        AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(opCtx, nss);
        css->refreshMetadata(opCtx, nullptr);

        return ChunkVersion::UNSHARDED();
    }

    {
        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
        auto metadata = CollectionShardingState::get(opCtx, nss)->getMetadata(opCtx);

        // We already have a newer version
        if (metadata && metadata->getCollVersion().epoch() == cm->getVersion().epoch() &&
            metadata->getCollVersion() >= cm->getVersion()) {
            LOG(1) << "Skipping refresh of metadata for " << nss << " "
                   << metadata->getCollVersion() << " with an older " << cm->getVersion();
            return metadata->getShardVersion();
        }
    }

    // Exclusive collection lock needed since we're now changing the metadata
    AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X);

    auto css = CollectionShardingState::get(opCtx, nss);
    auto metadata = css->getMetadata(opCtx);

    // We already have a newer version
    if (metadata && metadata->getCollVersion().epoch() == cm->getVersion().epoch() &&
        metadata->getCollVersion() >= cm->getVersion()) {
        LOG(1) << "Skipping refresh of metadata for " << nss << " " << metadata->getCollVersion()
               << " with an older " << cm->getVersion();
        return metadata->getShardVersion();
    }

    std::unique_ptr<CollectionMetadata> newCollectionMetadata =
        stdx::make_unique<CollectionMetadata>(cm, shardingState->getShardName());

    css->refreshMetadata(opCtx, std::move(newCollectionMetadata));

    return css->getMetadata(opCtx)->getShardVersion();
}
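The function above compares the cached collection version against the freshly fetched one twice: once under a shared (MODE_IS) lock, to skip the refresh cheaply, and again after taking the exclusive lock, because another thread may have installed newer metadata in between. A minimal, hypothetical sketch of that double-checked pattern using only standard C++ primitives (SharedVersionCache and fetchRemoteVersion are illustrative names, not MongoDB APIs):

#include <functional>
#include <optional>
#include <shared_mutex>

// Hypothetical cache demonstrating the check / upgrade-lock / re-check sequence.
class SharedVersionCache {
public:
    int refreshIfStale(int remoteVersion, const std::function<int()>& fetchRemoteVersion) {
        {
            std::shared_lock<std::shared_mutex> sharedLock(_mutex);
            if (_version && *_version >= remoteVersion)
                return *_version;  // Already newer; skip the expensive refresh.
        }

        std::unique_lock<std::shared_mutex> exclusiveLock(_mutex);
        if (_version && *_version >= remoteVersion)
            return *_version;  // Another thread refreshed while we reacquired the lock.

        _version = fetchRemoteVersion();  // Install the newly fetched version.
        return *_version;
    }

private:
    std::shared_mutex _mutex;
    std::optional<int> _version;
};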
Status MigrationSourceManager::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCreated);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.start",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    _cloneDriver = stdx::make_unique<MigrationChunkClonerSourceLegacy>(
        _args, _committedMetadata->getKeyPattern());

    {
        // Register for notifications from the replication subsystem
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMigrationSourceManager(txn, this);
    }

    Status startCloneStatus = _cloneDriver->startClone(txn);
    if (!startCloneStatus.isOK()) {
        return startCloneStatus;
    }

    _state = kCloning;
    scopedGuard.Dismiss();
    return Status::OK();
}
BSONObj ActiveMigrationsRegistry::getActiveMigrationStatusReport(OperationContext* opCtx) {
    boost::optional<NamespaceString> nss;
    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);

        if (_activeMoveChunkState) {
            nss = _activeMoveChunkState->args.getNss();
        }
    }

    // The state of the MigrationSourceManager could change between taking and releasing the mutex
    // above and then taking the collection lock here, but that's fine because it isn't important to
    // return information on a migration that just ended or started. This is just best effort, and
    // is desirable for reporting, and then diagnosing, migrations that are stuck.
    if (nss) {
        // Lock the collection so nothing changes while we're getting the migration report.
        AutoGetCollection autoColl(opCtx, nss.get(), MODE_IS);

        auto css = CollectionShardingState::get(opCtx, nss.get());
        if (css->getMigrationSourceManager()) {
            return css->getMigrationSourceManager()->getMigrationStatusReport();
        }
    }

    return BSONObj();
}
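Note how the report copies the namespace while holding the registry mutex and only acquires the collection lock afterwards, accepting that the active migration may change in between. A small, hypothetical sketch of this "copy under the mutex, act outside it" pattern (Registry and _activeNs are illustrative names, not MongoDB types):

#include <mutex>
#include <optional>
#include <string>

class Registry {
public:
    std::string report() {
        std::optional<std::string> ns;
        {
            std::lock_guard<std::mutex> lk(_mutex);  // Hold the mutex only long enough to copy.
            ns = _activeNs;
        }

        // _activeNs may change after the mutex is released; that is acceptable because the
        // report is best effort.
        return ns ? "migrating " + *ns : "";
    }

private:
    std::mutex _mutex;
    std::optional<std::string> _activeNs;
};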
Example #4
void OpObserverShardingImpl::shardObserveTransactionPrepareOrUnpreparedCommit(
    OperationContext* opCtx, const std::vector<repl::ReplOperation>& stmts) {

    for (const auto& stmt : stmts) {
        auto const nss = stmt.getNss();

        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
        auto csr = CollectionShardingRuntime::get(opCtx, nss);
        auto csrLock = CollectionShardingRuntime::CSRLock::lock(opCtx, csr);
        auto msm = MigrationSourceManager::get(csr, csrLock);
        if (!msm) {
            continue;
        }

        auto const opType = stmt.getOpType();

        // We pass an empty opTime to observers because retryable write history doesn't care about
        // writes in transactions.
        if (opType == repl::OpTypeEnum::kInsert) {
            msm->getCloner()->onInsertOp(opCtx, stmt.getObject(), {});
        } else if (opType == repl::OpTypeEnum::kUpdate) {
            if (auto updateDoc = stmt.getObject2()) {
                msm->getCloner()->onUpdateOp(
                    opCtx, stmt.getPreImageDocumentKey(), *updateDoc, {}, {});
            }
        } else if (opType == repl::OpTypeEnum::kDelete) {
            if (isMigratingWithCSRLock(csr, csrLock, stmt.getObject())) {
                msm->getCloner()->onDeleteOp(
                    opCtx, getDocumentKey(opCtx, nss, stmt.getObject()), {}, {});
            }
        }
    }
}
    int reap(OperationContext* opCtx) override {
        auto const coord = mongo::repl::ReplicationCoordinator::get(opCtx);

        Handler handler(opCtx, *_collection);
        if (!handler.initialize()) {
            return 0;
        }

        AutoGetCollection autoColl(
            opCtx, NamespaceString::kSessionTransactionsTableNamespace, MODE_IS);

        // Only start reaping if the shard or config server node is currently the primary
        if (!coord->canAcceptWritesForDatabase(
                opCtx, NamespaceString::kSessionTransactionsTableNamespace.db())) {
            return 0;
        }

        DBDirectClient client(opCtx);

        auto query = makeQuery(opCtx->getServiceContext()->getFastClockSource()->now());
        auto cursor = client.query(
            NamespaceString::kSessionTransactionsTableNamespace, query, 0, 0, &kIdProjection);

        while (cursor->more()) {
            auto transactionSession = SessionsCollectionFetchResultIndividualResult::parse(
                "TransactionSession"_sd, cursor->next());

            handler.handleLsid(transactionSession.get_id());
        }

        // Before the handler goes out of scope, flush its last batch to disk and collect stats.
        return handler.finalize();
    }
void MigrationSourceManager::_cleanup(OperationContext* txn) {
    invariant(_state != kDone);

    {
        // Unregister from the collection's sharding state
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());

        // The migration source manager is not visible anymore after it is unregistered from the
        // collection
        css->clearMigrationSourceManager(txn);

        // Leave the critical section.
        if (_state == kCriticalSection) {
            _critSecSignal->set();
        }
    }

    // Decrement the metadata op counter outside of the collection lock in order to hold the lock
    // for as short a time as possible.
    if (_state == kCriticalSection) {
        ShardingStateRecovery::endMetadataOp(txn);
    }

    if (_cloneDriver) {
        _cloneDriver->cancelClone(txn);
        _cloneDriver.reset();
    }

    _state = kDone;
}
Example #7
Status ShardingStateRecovery::recover(OperationContext* opCtx) {
    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
        return Status::OK();
    }

    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(opCtx, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                opCtx, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    const auto recoveryDoc = std::move(recoveryDocStatus.getValue());

    log() << "Sharding state recovery process found document " << redact(recoveryDoc.toBSON());

    ShardingState* const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->enabled());

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogClient(opCtx)->logChange(opCtx,
                                             "Sharding minOpTime recovery",
                                             NamespaceString::kConfigCollectionNamespace.ns(),
                                             recoveryDocBSON,
                                             ShardingCatalogClient::kMajorityWriteConcern);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is " << grid.configOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(opCtx, RecoveryDocument::Clear, kLocalWriteConcern);
    if (!status.isOK()) {
        warning() << "Failed to reset sharding state recovery document due to " << redact(status);
    }

    return Status::OK();
}
Example #8
Status ShardingStateRecovery::recover(OperationContext* txn) {
    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(txn, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                txn, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    const auto recoveryDoc = std::move(recoveryDocStatus.getValue());

    log() << "Sharding state recovery process found document " << recoveryDoc.toBSON();

    // Make sure the sharding state is initialized
    ShardingState* const shardingState = ShardingState::get(txn);

    shardingState->initialize(txn, recoveryDoc.getConfigsvr().toString());
    shardingState->setShardName(recoveryDoc.getShardName());

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.shardRegistry()->advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogManager(txn)->logChange(txn,
                                            "Sharding recovery thread",
                                            "Sharding minOpTime recovery",
                                            NamespaceString::kConfigCollectionNamespace.ns(),
                                            recoveryDocBSON);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is "
          << grid.shardRegistry()->getConfigOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(txn, RecoveryDocument::Clear, kMajorityWriteConcern);
    if (!status.isOK()) {
        warning() << "Failed to reset sharding state recovery document due to " << status;
    }

    return Status::OK();
}
Example #9
boost::optional<UUID> MongoDSessionCatalog::getTransactionTableUUID(OperationContext* opCtx) {
    AutoGetCollection autoColl(opCtx, NamespaceString::kSessionTransactionsTableNamespace, MODE_IS);

    const auto coll = autoColl.getCollection();
    if (!coll) {
        return boost::none;
    }

    return coll->uuid();
}
Example #10
    bool Helpers::getLast(OperationContext* txn, const char *ns, BSONObj& result) {
        AutoGetCollectionForRead autoColl(txn, ns);
        auto_ptr<PlanExecutor> exec(InternalPlanner::collectionScan(txn,
                                                                    ns,
                                                                    autoColl.getCollection(),
                                                                    InternalPlanner::BACKWARD));

        PlanExecutor::ExecState state = exec->getNext(&result, NULL);
        return PlanExecutor::ADVANCED == state;
    }
void MockReplCoordServerFixture::insertOplogEntry(const repl::OplogEntry& entry) {
    AutoGetCollection autoColl(opCtx(), NamespaceString::kRsOplogNamespace, MODE_IX);
    auto coll = autoColl.getCollection();
    ASSERT_TRUE(coll != nullptr);

    auto status = coll->insertDocument(opCtx(),
                                       InsertStatement(entry.toBSON()),
                                       &CurOp::get(opCtx())->debug(),
                                       /* fromMigrate */ false);
    ASSERT_OK(status);
}
Example #12
bool Helpers::getLast(OperationContext* txn, const char* ns, BSONObj& result) {
    AutoGetCollectionForRead autoColl(txn, ns);
    unique_ptr<PlanExecutor> exec(InternalPlanner::collectionScan(
        txn, ns, autoColl.getCollection(), PlanExecutor::YIELD_MANUAL, InternalPlanner::BACKWARD));
    PlanExecutor::ExecState state = exec->getNext(&result, NULL);

    if (PlanExecutor::ADVANCED == state) {
        result = result.getOwned();
        return true;
    }
    return false;
}
Example #13
/**
 * Due to SERVER-23274, versions 3.2.0 through 3.2.4 of MongoDB incorrectly mark the final output
 * collections of aggregations with $out stages as temporary on most replica set secondaries. Rather
 * than risk deleting collections that the user did not intend to be temporary when newer nodes
 * start up or get promoted to be replica set primaries, newer nodes clear the temp flags left by
 * these versions.
 */
bool isSubjectToSERVER23299(OperationContext* txn) {
    // We are already called under global X lock as part of the startup sequence
    invariant(txn->lockState()->isW());

    if (storageGlobalParams.readOnly) {
        return false;
    }

    // Ensure that the local database is open since we are still early in the server startup
    // sequence
    dbHolder().openDb(txn, startupLogCollectionName.db());

    // Only used as a shortcut to obtain a reference to the startup log collection
    AutoGetCollection autoColl(txn, startupLogCollectionName, MODE_IS);

    // No startup log or an empty one means either that the user was not running an affected
    // version, or that they manually deleted the startup collection since they last started an
    // affected version.
    LOG(1) << "Checking node for SERVER-23299 eligibility";
    if (!autoColl.getCollection()) {
        LOG(1) << "Didn't find " << startupLogCollectionName;
        return false;
    }
    LOG(1) << "Checking node for SERVER-23299 applicability - reading startup log";
    BSONObj lastStartupLogDoc;
    if (!Helpers::getLast(txn, startupLogCollectionName.ns().c_str(), lastStartupLogDoc)) {
        return false;
    }
    std::vector<int> versionComponents;
    try {
        for (auto elem : lastStartupLogDoc["buildinfo"]["versionArray"].Obj()) {
            versionComponents.push_back(elem.Int());
        }
        uassert(40050,
                str::stream() << "Expected three elements in buildinfo.versionArray; found "
                              << versionComponents.size(),
                versionComponents.size() >= 3);
    } catch (const DBException& ex) {
        log() << "Last entry of " << startupLogCollectionName
              << " has no well-formed  buildinfo.versionArray field; ignoring " << causedBy(ex);
        return false;
    }
    LOG(1)
        << "Checking node for SERVER-23299 applicability - checking version 3.2.x for x in [0, 4]";
    if (versionComponents[0] != 3)
        return false;
    if (versionComponents[1] != 2)
        return false;
    if (versionComponents[2] > 4)
        return false;
    LOG(1) << "Node eligible for SERVER-23299";
    return true;
}
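The three early returns at the end amount to a single predicate: the node is affected only when buildinfo.versionArray begins with 3.2.x for x in [0, 4]. A hypothetical standalone helper expressing the same check:

#include <vector>

// Returns true only for versions 3.2.0 through 3.2.4, the range affected by SERVER-23274.
bool isAffectedVersion(const std::vector<int>& versionComponents) {
    return versionComponents.size() >= 3 && versionComponents[0] == 3 &&
        versionComponents[1] == 2 && versionComponents[2] <= 4;
}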
Example #14
    void DocumentSourceCursor::loadBatch() {
        if (!_exec) {
            dispose();
            return;
        }

        // We have already validated the sharding version when we constructed the PlanExecutor,
        // so we shouldn't check it again.
        const NamespaceString nss(_ns);
        AutoGetCollectionForRead autoColl(pExpCtx->opCtx, nss);

        _exec->restoreState(pExpCtx->opCtx);

        int memUsageBytes = 0;
        BSONObj obj;
        PlanExecutor::ExecState state;
        while ((state = _exec->getNext(&obj, NULL)) == PlanExecutor::ADVANCED) {
            if (_dependencies) {
                _currentBatch.push_back(_dependencies->extractFields(obj));
            }
            else {
                _currentBatch.push_back(Document::fromBsonWithMetaData(obj));
            }

            if (_limit) {
                if (++_docsAddedToBatches == _limit->getLimit()) {
                    break;
                }
                verify(_docsAddedToBatches < _limit->getLimit());
            }

            memUsageBytes += _currentBatch.back().getApproximateSize();

            if (memUsageBytes > MaxBytesToReturnToClientAtOnce) {
                // End this batch and prepare PlanExecutor for yielding.
                _exec->saveState();
                return;
            }
        }

        // If we got here, there won't be any more documents, so destroy the executor. Can't use
        // dispose since we want to keep the _currentBatch.
        _exec.reset();

        uassert(16028, "collection or index disappeared when cursor yielded",
                state != PlanExecutor::DEAD);

        uassert(17285, "cursor encountered an error: " + WorkingSetCommon::toStatusString(obj),
                state != PlanExecutor::EXEC_ERROR);

        massert(17286, str::stream() << "Unexpected return from PlanExecutor::getNext: " << state,
                state == PlanExecutor::IS_EOF || state == PlanExecutor::ADVANCED);
    }
void MigrationChunkClonerSourceLegacy::_cleanup(OperationContext* txn) {
    {
        stdx::lock_guard<stdx::mutex> sl(_mutex);
        _cloneCompleted = true;
    }

    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

    if (_deleteNotifyExec) {
        _deleteNotifyExec.reset();
    }
}
Example #16
void RollbackResyncsCollectionOptionsTest::resyncCollectionOptionsTest(
    CollectionOptions localCollOptions,
    BSONObj remoteCollOptionsObj,
    BSONObj collModCmd,
    std::string collName) {
    createOplog(_opCtx.get());

    auto dbName = "test";
    auto nss = NamespaceString(dbName, collName);

    auto coll = _createCollection(_opCtx.get(), nss.toString(), localCollOptions);

    auto commonOpUuid = unittest::assertGet(UUID::parse("f005ba11-cafe-bead-f00d-123456789abc"));
    auto commonOpBson = BSON("ts" << Timestamp(1, 1) << "t" << 1LL << "op"
                                  << "n"
                                  << "o"
                                  << BSONObj()
                                  << "ns"
                                  << "rollback_test.test"
                                  << "ui"
                                  << commonOpUuid);

    auto commonOperation = std::make_pair(commonOpBson, RecordId(1));

    auto collectionModificationOperation =
        makeCommandOp(Timestamp(Seconds(2), 0), coll->uuid(), nss.toString(), collModCmd, 2);

    RollbackSourceWithCollectionOptions rollbackSource(
        std::unique_ptr<OplogInterface>(new OplogInterfaceMock({commonOperation})),
        remoteCollOptionsObj);

    ASSERT_OK(syncRollback(_opCtx.get(),
                           OplogInterfaceMock({collectionModificationOperation, commonOperation}),
                           rollbackSource,
                           {},
                           _coordinator,
                           _replicationProcess.get()));

    // Make sure the collection options are correct.
    AutoGetCollectionForReadCommand autoColl(_opCtx.get(), NamespaceString(nss.toString()));
    auto collAfterRollbackOptions =
        autoColl.getCollection()->getCatalogEntry()->getCollectionOptions(_opCtx.get());

    BSONObjBuilder expectedOptionsBob;
    if (localCollOptions.uuid) {
        localCollOptions.uuid.get().appendToBuilder(&expectedOptionsBob, "uuid");
    }
    expectedOptionsBob.appendElements(remoteCollOptionsObj);

    ASSERT_BSONOBJ_EQ(expectedOptionsBob.obj(), collAfterRollbackOptions.toBSON());
}
Status onShardVersionMismatch(OperationContext* opCtx,
                              const NamespaceString& nss,
                              ChunkVersion shardVersionReceived,
                              bool forceRefreshFromThisThread) noexcept {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    LOG(2) << "Metadata refresh requested for " << nss.ns() << " at shard version "
           << shardVersionReceived;

    ShardingStatistics::get(opCtx).countStaleConfigErrors.addAndFetch(1);

    // Ensure any ongoing migrations have completed before trying to do the refresh. This wait is
    // just an optimization so that MongoS does not exhaust its maximum number of StaleConfig retry
    // attempts while the migration is being committed.
    try {
        auto& oss = OperationShardingState::get(opCtx);
        oss.waitForMigrationCriticalSectionSignal(opCtx);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto currentShardVersion = [&] {
        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
        const auto currentMetadata = CollectionShardingState::get(opCtx, nss)->getMetadata(opCtx);
        if (currentMetadata) {
            return currentMetadata->getShardVersion();
        }

        return ChunkVersion::UNSHARDED();
    }();

    if (currentShardVersion.epoch() == shardVersionReceived.epoch() &&
        currentShardVersion.majorVersion() >= shardVersionReceived.majorVersion()) {
        // Don't need to remotely reload if we're in the same epoch and the requested version is
        // smaller than the one we know about. This means that the remote side is behind.
        return Status::OK();
    }

    try {
        forceShardFilteringMetadataRefresh(opCtx, nss, forceRefreshFromThisThread);
        return Status::OK();
    } catch (const DBException& ex) {
        log() << "Failed to refresh metadata for collection" << nss << causedBy(redact(ex));
        return ex.toStatus();
    }
}
Example #18
        bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool /*fromRepl*/) {

            BSONElement first = cmdObj.firstElement();
            uassert(
                28528,
                str::stream() << "Argument to listIndexes must be of type String, not "
                              << typeName(first.type()),
                first.type() == String);
            const NamespaceString ns(parseNs(dbname, cmdObj));
            uassert(
                28529,
                str::stream() << "Argument to listIndexes must be a collection name, "
                              << "not the empty string",
                !ns.coll().empty());

            AutoGetCollectionForRead autoColl(txn, ns);
            if (!autoColl.getDb()) {
                return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound,
                                                            "no database" ) );
            }

            const Collection* collection = autoColl.getCollection();
            if (!collection) {
                return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound,
                                                            "no collection" ) );
            }

            const CollectionCatalogEntry* cce = collection->getCatalogEntry();
            invariant(cce);

            vector<string> indexNames;
            cce->getAllIndexes( txn, &indexNames );

            BSONArrayBuilder arr;
            for ( size_t i = 0; i < indexNames.size(); i++ ) {
                arr.append( cce->getIndexSpec( txn, indexNames[i] ) );
            }

            result.append( "indexes", arr.arr() );

            return true;
        }
Example #19
bool Helpers::getLast(OperationContext* opCtx, const char* ns, BSONObj& result) {
    AutoGetCollectionForReadCommand autoColl(opCtx, NamespaceString(ns));
    auto exec = InternalPlanner::collectionScan(
        opCtx, ns, autoColl.getCollection(), PlanExecutor::NO_YIELD, InternalPlanner::BACKWARD);
    PlanExecutor::ExecState state = exec->getNext(&result, NULL);

    // Non-yielding collection scans from InternalPlanner will never error.
    invariant(PlanExecutor::ADVANCED == state || PlanExecutor::IS_EOF == state);

    if (PlanExecutor::ADVANCED == state) {
        result = result.getOwned();
        return true;
    }

    return false;
}
Example #20
Status dropIndexes(OperationContext* opCtx,
                   const NamespaceString& nss,
                   const BSONObj& cmdObj,
                   BSONObjBuilder* result) {
    return writeConflictRetry(opCtx, "dropIndexes", nss.db(), [opCtx, &nss, &cmdObj, result] {
        AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X);

        bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, nss);

        if (userInitiatedWritesAndNotPrimary) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping indexes in " << nss);
        }

        if (!serverGlobalParams.quiet.load()) {
            LOG(0) << "CMD: dropIndexes " << nss << ": " << cmdObj[kIndexFieldName].toString(false);
        }

        // If db/collection does not exist, short circuit and return.
        Database* db = autoColl.getDb();
        Collection* collection = autoColl.getCollection();
        if (!collection) {
            if (db && ViewCatalog::get(db)->lookup(opCtx, nss.ns())) {
                return Status(ErrorCodes::CommandNotSupportedOnView,
                              str::stream() << "Cannot drop indexes on view " << nss);
            }

            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        BackgroundOperation::assertNoBgOpInProgForNs(nss);
        IndexBuildsCoordinator::get(opCtx)->assertNoIndexBuildInProgForCollection(
            collection->uuid().get());

        WriteUnitOfWork wunit(opCtx);
        OldClientContext ctx(opCtx, nss.ns());

        Status status = wrappedRun(opCtx, collection, cmdObj, result);
        if (!status.isOK()) {
            return status;
        }

        wunit.commit();
        return Status::OK();
    });
}
Example #21
    Value DocumentSourceCursor::serialize(bool explain) const {
        // we never parse a documentSourceCursor, so we only serialize for explain
        if (!explain)
            return Value();

        // Get planner-level explain info from the underlying PlanExecutor.
        BSONObjBuilder explainBuilder;
        Status explainStatus(ErrorCodes::InternalError, "");
        {
            const NamespaceString nss(_ns);
            AutoGetCollectionForRead autoColl(pExpCtx->opCtx, nss);

            massert(17392, "No _exec. Were we disposed before explained?", _exec);

            _exec->restoreState(pExpCtx->opCtx);
            explainStatus = Explain::explainStages(pExpCtx->opCtx,
                                                   _exec.get(),
                                                   ExplainCommon::QUERY_PLANNER,
                                                   &explainBuilder);
            _exec->saveState();
        }

        MutableDocument out;
        out["query"] = Value(_query);

        if (!_sort.isEmpty())
            out["sort"] = Value(_sort);

        if (_limit)
            out["limit"] = Value(_limit->getLimit());

        if (!_projection.isEmpty())
            out["fields"] = Value(_projection);

        // Add explain results from the query system into the agg explain output.
        if (explainStatus.isOK()) {
            BSONObj explainObj = explainBuilder.obj();
            invariant(explainObj.hasField("queryPlanner"));
            out["queryPlanner"] = Value(explainObj["queryPlanner"]);
        }
        else {
            out["planError"] = Value(explainStatus.toString());
        }

        return Value(DOC(getSourceName() << out.freezeToValue()));
    }
Example #22
/**
 * Due to SERVER-23274, versions 3.2.0 through 3.2.4 of MongoDB incorrectly mark the final output
 * collections of aggregations with $out stages as temporary on most replica set secondaries. Rather
 * than risk deleting collections that the user did not intend to be temporary when newer nodes
 * start up or get promoted to be replica set primaries, newer nodes clear the temp flags left by
 * these versions.
 */
bool isSubjectToSERVER23299(OperationContext* txn) {
    if (storageGlobalParams.readOnly) {
        return false;
    }
    dbHolder().openDb(txn, startupLogCollectionName.db());
    AutoGetCollectionForRead autoColl(txn, startupLogCollectionName);
    // No startup log or an empty one means either that the user was not running an affected
    // version, or that they manually deleted the startup collection since they last started an
    // affected version.
    LOG(1) << "Checking node for SERVER-23299 eligibility";
    if (!autoColl.getCollection()) {
        LOG(1) << "Didn't find " << startupLogCollectionName;
        return false;
    }
    LOG(1) << "Checking node for SERVER-23299 applicability - reading startup log";
    BSONObj lastStartupLogDoc;
    if (!Helpers::getLast(txn, startupLogCollectionName.ns().c_str(), lastStartupLogDoc)) {
        return false;
    }
    std::vector<int> versionComponents;
    try {
        for (auto elem : lastStartupLogDoc["buildinfo"]["versionArray"].Obj()) {
            versionComponents.push_back(elem.Int());
        }
        uassert(40050,
                str::stream() << "Expected three elements in buildinfo.versionArray; found "
                              << versionComponents.size(),
                versionComponents.size() >= 3);
    } catch (const DBException& ex) {
        log() << "Last entry of " << startupLogCollectionName
              << " has no well-formed  buildinfo.versionArray field; ignoring " << causedBy(ex);
        return false;
    }
    LOG(1)
        << "Checking node for SERVER-23299 applicability - checking version 3.2.x for x in [0, 4]";
    if (versionComponents[0] != 3)
        return false;
    if (versionComponents[1] != 2)
        return false;
    if (versionComponents[2] > 4)
        return false;
    LOG(1) << "Node eligible for SERVER-23299";
    return true;
}
SessionCatalogMigrationSource::SessionCatalogMigrationSource(OperationContext* opCtx,
                                                             NamespaceString ns)
    : _ns(std::move(ns)), _rollbackIdAtInit(repl::ReplicationProcess::get(opCtx)->getRollbackID()) {
    // Exclude entries for transactions.
    Query query;
    // Sort is not needed for correctness. This is just for making it easier to write deterministic
    // tests.
    query.sort(BSON("_id" << 1));

    DBDirectClient client(opCtx);
    auto cursor = client.query(NamespaceString::kSessionTransactionsTableNamespace, query);

    while (cursor->more()) {
        auto nextSession = SessionTxnRecord::parse(
            IDLParserErrorContext("Session migration cloning"), cursor->next());
        if (!nextSession.getLastWriteOpTime().isNull()) {
            _sessionOplogIterators.push_back(
                stdx::make_unique<SessionOplogIterator>(std::move(nextSession), _rollbackIdAtInit));
        }
    }

    {
        AutoGetCollection autoColl(opCtx, NamespaceString::kRsOplogNamespace, MODE_IX);
        writeConflictRetry(
            opCtx,
            "session migration initialization majority commit barrier",
            NamespaceString::kRsOplogNamespace.ns(),
            [&] {
                const auto message = BSON("sessionMigrateCloneStart" << _ns.ns());

                WriteUnitOfWork wuow(opCtx);
                opCtx->getClient()->getServiceContext()->getOpObserver()->onInternalOpMessage(
                    opCtx, _ns, {}, {}, message);
                wuow.commit();
            });
    }

    auto opTimeToWait = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
    WriteConcernResult result;
    WriteConcernOptions majority(
        WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, 0);
    uassertStatusOK(waitForWriteConcern(opCtx, opTimeToWait, majority, &result));
}
Example #24
Status CollectionShardingState::waitForClean(OperationContext* opCtx,
                                             const NamespaceString& nss,
                                             OID const& epoch,
                                             ChunkRange orphanRange) {
    while (true) {
        boost::optional<CleanupNotification> stillScheduled;

        {
            AutoGetCollection autoColl(opCtx, nss, MODE_IX);
            auto css = CollectionShardingState::get(opCtx, nss);

            {
                // First, see if the collection was dropped, but do it in a separate scope in order
                // not to hold a reference to it, which would make it appear to be in use
                auto metadata = css->_metadataManager->getActiveMetadata(css->_metadataManager);
                if (!metadata || metadata->getCollVersion().epoch() != epoch) {
                    return {ErrorCodes::StaleShardVersion, "Collection being migrated was dropped"};
                }
            }

            stillScheduled = css->trackOrphanedDataCleanup(orphanRange);
            if (!stillScheduled) {
                log() << "Finished deleting " << nss.ns() << " range "
                      << redact(orphanRange.toString());
                return Status::OK();
            }
        }

        log() << "Waiting for deletion of " << nss.ns() << " range " << orphanRange;

        Status result = stillScheduled->waitStatus(opCtx);
        if (!result.isOK()) {
            return result.withContext(str::stream() << "Failed to delete orphaned " << nss.ns()
                                                    << " range "
                                                    << orphanRange.toString());
        }
    }

    MONGO_UNREACHABLE;
}
Example #25
Status MigrationSourceManager::enterCriticalSection(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCloneCaughtUp);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Mark the shard as running a critical operation, which requires recovery on crash
    Status status = ShardingStateRecovery::startMetadataOp(txn);
    if (!status.isOK()) {
        return status;
    }

    {
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        if (!css->getMetadata() ||
            !css->getMetadata()->getCollVersion().equals(_committedMetadata->getCollVersion())) {
            return {ErrorCodes::IncompatibleShardingMetadata,
                    str::stream()
                        << "Sharding metadata changed while holding distributed lock. Expected: "
                        << _committedMetadata->getCollVersion().toString()
                        << ", actual: "
                        << css->getMetadata()->getCollVersion().toString()};
        }

        // IMPORTANT: After this line, the critical section is in place and needs to be rolled back
        // if anything fails, which would prevent commit to the config servers.
        _critSecSignal = std::make_shared<Notification<void>>();
    }

    log() << "Successfully entered critical section.";

    _state = kCriticalSection;
    scopedGuard.Dismiss();
    return Status::OK();
}
Example #26
/* static */
Status CollectionShardingState::waitForClean(OperationContext* opCtx,
                                             NamespaceString nss,
                                             OID const& epoch,
                                             ChunkRange orphanRange) {
    do {
        auto stillScheduled = boost::optional<CleanupNotification>();
        {
            AutoGetCollection autoColl(opCtx, nss, MODE_IX);
            // First, see if collection was dropped.
            auto css = CollectionShardingState::get(opCtx, nss);
            {
                auto metadata = css->_metadataManager->getActiveMetadata(css->_metadataManager);
                if (!metadata || metadata->getCollVersion().epoch() != epoch) {
                    return {ErrorCodes::StaleShardVersion, "Collection being migrated was dropped"};
                }
            }  // drop metadata
            stillScheduled = css->trackOrphanedDataCleanup(orphanRange);
            if (!stillScheduled) {
                log() << "Finished deleting " << nss.ns() << " range "
                      << redact(orphanRange.toString());
                return Status::OK();
            }
        }  // drop collection lock

        log() << "Waiting for deletion of " << nss.ns() << " range " << orphanRange;
        Status result = stillScheduled->waitStatus(opCtx);
        if (!result.isOK()) {
            return Status{result.code(),
                          str::stream() << "Failed to delete orphaned " << nss.ns() << " range "
                                        << orphanRange.toString()
                                        << ": "
                                        << result.reason()};
        }
    } while (true);
    MONGO_UNREACHABLE;
}
Example #27
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& cmdObj,
             int,
             string& errmsg,
             BSONObjBuilder& result) {
        BSONElement first = cmdObj.firstElement();
        uassert(28528,
                str::stream() << "Argument to listIndexes must be of type String, not "
                              << typeName(first.type()),
                first.type() == String);
        StringData collectionName = first.valueStringData();
        uassert(28529,
                str::stream() << "Argument to listIndexes must be a collection name, "
                              << "not the empty string",
                !collectionName.empty());
        const NamespaceString ns(dbname, collectionName);

        const long long defaultBatchSize = std::numeric_limits<long long>::max();
        long long batchSize;
        Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize);
        if (!parseCursorStatus.isOK()) {
            return appendCommandStatus(result, parseCursorStatus);
        }

        AutoGetCollectionForRead autoColl(txn, ns);
        if (!autoColl.getDb()) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no database"));
        }

        const Collection* collection = autoColl.getCollection();
        if (!collection) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no collection"));
        }

        const CollectionCatalogEntry* cce = collection->getCatalogEntry();
        invariant(cce);

        vector<string> indexNames;
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            indexNames.clear();
            cce->getAllIndexes(txn, &indexNames);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

        std::unique_ptr<WorkingSet> ws(new WorkingSet());
        std::unique_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get()));

        for (size_t i = 0; i < indexNames.size(); i++) {
            BSONObj indexSpec;
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                indexSpec = cce->getIndexSpec(txn, indexNames[i]);
            }
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

            WorkingSetID id = ws->allocate();
            WorkingSetMember* member = ws->get(id);
            member->keyData.clear();
            member->loc = RecordId();
            member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned());
            member->transitionToOwnedObj();
            root->pushBack(id);
        }

        std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "."
                                                    << ns.coll();
        dassert(NamespaceString(cursorNamespace).isValid());
        dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS());
        dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes());

        auto statusWithPlanExecutor = PlanExecutor::make(
            txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }
        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        BSONArrayBuilder firstBatch;

        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit;
             objCount++) {
            BSONObj next;
            PlanExecutor::ExecState state = exec->getNext(&next, NULL);
            if (state == PlanExecutor::IS_EOF) {
                break;
            }
            invariant(state == PlanExecutor::ADVANCED);
            firstBatch.append(next);
        }

        CursorId cursorId = 0LL;
        if (!exec->isEOF()) {
            exec->saveState();
            ClientCursor* cursor = new ClientCursor(
                CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace);
            cursorId = cursor->cursorid();
        }

        appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

        return true;
    }
bool ShardingInitializationMongoD::initializeShardingAwarenessIfNeeded(OperationContext* opCtx) {
    invariant(!opCtx->lockState()->isLocked());

    // In sharded readOnly mode, we ignore the shardIdentity document on disk and instead *require*
    // a shardIdentity document to be passed through --overrideShardIdentity
    if (storageGlobalParams.readOnly) {
        if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) {
            uassert(ErrorCodes::InvalidOptions,
                    "If started with --shardsvr in queryableBackupMode, a shardIdentity document "
                    "must be provided through --overrideShardIdentity",
                    !serverGlobalParams.overrideShardIdentity.isEmpty());

            auto overrideShardIdentity =
                uassertStatusOK(ShardIdentityType::fromShardIdentityDocument(
                    serverGlobalParams.overrideShardIdentity));

            {
                // Global lock is required to call initializeFromShardIdentity
                Lock::GlobalWrite lk(opCtx);
                initializeFromShardIdentity(opCtx, overrideShardIdentity);
            }

            return true;
        } else {
            // Error if --overrideShardIdentity is used but *not* started with --shardsvr
            uassert(ErrorCodes::InvalidOptions,
                    str::stream()
                        << "Not started with --shardsvr, but a shardIdentity document was provided "
                           "through --overrideShardIdentity: "
                        << serverGlobalParams.overrideShardIdentity,
                    serverGlobalParams.overrideShardIdentity.isEmpty());
            return false;
        }

        MONGO_UNREACHABLE;
    }

    // In sharded *non*-readOnly mode, error if --overrideShardIdentity is provided
    uassert(ErrorCodes::InvalidOptions,
            str::stream() << "--overrideShardIdentity is only allowed in sharded "
                             "queryableBackupMode. If not in queryableBackupMode, you can edit "
                             "the shardIdentity document by starting the server *without* "
                             "--shardsvr, manually updating the shardIdentity document in the "
                          << NamespaceString::kServerConfigurationNamespace.toString()
                          << " collection, and restarting the server with --shardsvr.",
            serverGlobalParams.overrideShardIdentity.isEmpty());

    // Use the shardIdentity document on disk if one exists, but it is okay if no shardIdentity
    // document is available at all (sharding awareness will be initialized when a shardIdentity
    // document is inserted)
    BSONObj shardIdentityBSON;
    const bool foundShardIdentity = [&] {
        AutoGetCollection autoColl(opCtx, NamespaceString::kServerConfigurationNamespace, MODE_IS);
        return Helpers::findOne(opCtx,
                                autoColl.getCollection(),
                                BSON("_id" << ShardIdentityType::IdName),
                                shardIdentityBSON);
    }();

    if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) {
        if (!foundShardIdentity) {
            warning() << "Started with --shardsvr, but no shardIdentity document was found on "
                         "disk in "
                      << NamespaceString::kServerConfigurationNamespace
                      << ". This most likely means this server has not yet been added to a "
                         "sharded cluster.";
            return false;
        }

        invariant(!shardIdentityBSON.isEmpty());

        auto shardIdentity =
            uassertStatusOK(ShardIdentityType::fromShardIdentityDocument(shardIdentityBSON));

        {
            // Global lock is required to call initializeFromShardIdentity
            Lock::GlobalWrite lk(opCtx);
            initializeFromShardIdentity(opCtx, shardIdentity);
        }

        return true;
    } else {
        // Warn if a shardIdentity document is found on disk but *not* started with --shardsvr.
        if (!shardIdentityBSON.isEmpty()) {
            warning() << "Not started with --shardsvr, but a shardIdentity document was found "
                         "on disk in "
                      << NamespaceString::kServerConfigurationNamespace << ": "
                      << shardIdentityBSON;
        }
        return false;
    }
}
Example #29
Status ShardingStateRecovery::recover(OperationContext* txn) {
    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
        return Status::OK();
    }

    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(txn, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                txn, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    const auto recoveryDoc = std::move(recoveryDocStatus.getValue());

    log() << "Sharding state recovery process found document " << recoveryDoc.toBSON();

    // Make sure the sharding state is initialized
    ShardingState* const shardingState = ShardingState::get(txn);

    // For backwards compatibility. Shards added by v3.4 cluster should have been initialized by
    // the shard identity document.
    // TODO(SERVER-25276): Remove this after 3.4 since 3.4 shards should always have ShardingState
    // initialized by this point.
    if (!shardingState->enabled()) {
        shardingState->initializeFromConfigConnString(txn, recoveryDoc.getConfigsvr().toString());
        shardingState->setShardName(recoveryDoc.getShardName());
    }

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogClient(txn)->logChange(txn,
                                           "Sharding minOpTime recovery",
                                           NamespaceString::kConfigCollectionNamespace.ns(),
                                           recoveryDocBSON,
                                           ShardingCatalogClient::kMajorityWriteConcern);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is " << grid.configOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(txn, RecoveryDocument::Clear, kLocalWriteConcern);
    if (!status.isOK()) {
        warning() << "Failed to reset sharding state recovery document due to " << status;
    }

    return Status::OK();
}
Status MigrationChunkClonerSourceLegacy::_storeCurrentLocs(OperationContext* txn) {
    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

    Collection* const collection = autoColl.getCollection();
    if (!collection) {
        return {ErrorCodes::NamespaceNotFound,
                str::stream() << "Collection " << _args.getNss().ns() << " does not exist."};
    }

    // Allow multiKey based on the invariant that shard keys must be single-valued. Therefore, any
    // multi-key index prefixed by shard key cannot be multikey over the shard key fields.
    IndexDescriptor* idx =
        collection->getIndexCatalog()->findShardKeyPrefixedIndex(txn,
                                                                 _shardKeyPattern.toBSON(),
                                                                 false);  // requireSingleKey
    if (!idx) {
        return {ErrorCodes::IndexNotFound,
                str::stream() << "can't find index with prefix " << _shardKeyPattern.toBSON()
                              << " in storeCurrentLocs for "
                              << _args.getNss().ns()};
    }

    // Install the stage, which will listen for notifications on the collection
    {
        stdx::lock_guard<stdx::mutex> sl(_mutex);

        invariant(!_deleteNotifyExec);

        // Takes ownership of 'ws' and 'dns'.
        auto statusWithPlanExecutor =
            PlanExecutor::make(txn,
                               stdx::make_unique<WorkingSet>(),
                               stdx::make_unique<DeleteNotificationStage>(this, txn),
                               collection,
                               PlanExecutor::YIELD_MANUAL);
        invariant(statusWithPlanExecutor.isOK());

        _deleteNotifyExec = std::move(statusWithPlanExecutor.getValue());
        _deleteNotifyExec->registerExec(collection);
    }

    // Assume both min and max are non-empty; append MinKey values to make them fit the chosen index
    const KeyPattern kp(idx->keyPattern());

    BSONObj min = Helpers::toKeyFormat(kp.extendRangeBound(_args.getMinKey(), false));
    BSONObj max = Helpers::toKeyFormat(kp.extendRangeBound(_args.getMaxKey(), false));

    std::unique_ptr<PlanExecutor> exec(InternalPlanner::indexScan(txn,
                                                                  collection,
                                                                  idx,
                                                                  min,
                                                                  max,
                                                                  false,  // endKeyInclusive
                                                                  PlanExecutor::YIELD_MANUAL));

    // We can afford to yield here because any change to the base data that we might miss is already
    // being queued and will migrate in the 'transferMods' stage.
    exec->setYieldPolicy(PlanExecutor::YIELD_AUTO, collection);

    // Use the average object size to estimate how many objects a full chunk would carry, and do
    // that while traversing the chunk's range using the sharding index. Below, there is a fair
    // amount of slack before we determine a chunk is too large, because object sizes will vary.
    unsigned long long maxRecsWhenFull;
    long long avgRecSize;

    const long long totalRecs = collection->numRecords(txn);
    if (totalRecs > 0) {
        avgRecSize = collection->dataSize(txn) / totalRecs;
        maxRecsWhenFull = _args.getMaxChunkSizeBytes() / avgRecSize;
        maxRecsWhenFull = std::min((unsigned long long)(Chunk::MaxObjectPerChunk + 1),
                                   130 * maxRecsWhenFull / 100 /* slack */);
    } else {
        avgRecSize = 0;
        maxRecsWhenFull = Chunk::MaxObjectPerChunk + 1;
    }

    // Do a full traversal of the chunk and don't stop even if we think it is a large chunk; we
    // want the accurate number of records in order to report it in that case.
    bool isLargeChunk = false;
    unsigned long long recCount = 0;

    BSONObj obj;
    RecordId recordId;
    PlanExecutor::ExecState state;
    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, &recordId))) {
        if (!isLargeChunk) {
            stdx::lock_guard<stdx::mutex> lk(_mutex);
            _cloneLocs.insert(recordId);
        }

        if (++recCount > maxRecsWhenFull) {
            isLargeChunk = true;
            // Continue on despite knowing that it will fail, just to get the correct value for
            // recCount
        }
    }

    if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
        return {ErrorCodes::InternalError,
                str::stream() << "Executor error while scanning for documents belonging to chunk: "
                              << WorkingSetCommon::toStatusString(obj)};
    }

    exec.reset();

    if (isLargeChunk) {
        return {
            ErrorCodes::ChunkTooBig,
            str::stream() << "Cannot move chunk: the maximum number of documents for a chunk is "
                          << maxRecsWhenFull
                          << ", the maximum chunk size is "
                          << _args.getMaxChunkSizeBytes()
                          << ", average document size is "
                          << avgRecSize
                          << ". Found "
                          << recCount
                          << " documents in chunk "
                          << " ns: "
                          << _args.getNss().ns()
                          << " "
                          << _args.getMinKey()
                          << " -> "
                          << _args.getMaxKey()};
    }

    _averageObjectSizeForCloneLocs = static_cast<uint64_t>(collection->averageObjectSize(txn) + 12);

    return Status::OK();
}
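To make the chunk-size estimate in _storeCurrentLocs concrete, here is a worked example with hypothetical numbers: a 64 MB maximum chunk size, one million documents averaging 1 KB each, and a placeholder value standing in for Chunk::MaxObjectPerChunk:

#include <algorithm>
#include <iostream>

int main() {
    const unsigned long long maxChunkSizeBytes = 64ULL * 1024 * 1024;  // Hypothetical 64 MB limit.
    const long long totalRecs = 1000000;                               // Hypothetical record count.
    const long long dataSize = totalRecs * 1024LL;                     // ~1 KB average documents.
    const unsigned long long maxObjectPerChunk = 250000;               // Placeholder constant.

    // avgRecSize      = 1024 bytes
    // maxRecsWhenFull = 67108864 / 1024 = 65536, then 130% slack -> 85196
    const long long avgRecSize = dataSize / totalRecs;
    unsigned long long maxRecsWhenFull = maxChunkSizeBytes / avgRecSize;
    maxRecsWhenFull = std::min(maxObjectPerChunk + 1, 130 * maxRecsWhenFull / 100);

    std::cout << "Chunk flagged as too large after " << maxRecsWhenFull << " documents\n";
    return 0;
}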