namespace QueryStageMultiPlan {

// Pull the smart-pointer and container names used throughout this file into the namespace.
using std::unique_ptr;
using std::vector;
using stdx::make_unique;

// Namespace of the scratch collection that the fixtures below create, populate and drop.
static const NamespaceString nss("unittests.QueryStageMultiPlan");

/**
 * Builds a bare QuerySolution whose plan cache data is tagged as a collection scan solution and
 * carries an empty index tree. The caller owns the returned object.
 */
QuerySolution* createQuerySolution() {
    // Assemble the cache data first, then hand it over to the solution.
    unique_ptr<SolutionCacheData> cacheData(new SolutionCacheData());
    cacheData->solnType = SolutionCacheData::COLLSCAN_SOLN;
    cacheData->tree.reset(new PlanCacheIndexTree());

    unique_ptr<QuerySolution> solution(new QuerySolution());
    solution->cacheData.reset(cacheData.release());
    return solution.release();
}

/**
 * Common fixture for the multi-plan stage tests. It owns an operation context and a direct
 * client, and drops the test collection both when it is constructed and when it is destroyed.
 */
class QueryStageMultiPlanBase {
public:
    QueryStageMultiPlanBase() : _client(&_txn) {
        dropTestCollection();
    }

    virtual ~QueryStageMultiPlanBase() {
        dropTestCollection();
    }

    /**
     * Creates an index with key pattern 'obj' on the test collection and asserts success.
     */
    void addIndex(const BSONObj& obj) {
        ASSERT_OK(dbtests::createIndex(&_txn, nss.ns(), obj));
    }

    /**
     * Inserts 'obj' into the test collection under a write context.
     */
    void insert(const BSONObj& obj) {
        OldClientWriteContext writeCtx(&_txn, nss.ns());
        _client.insert(nss.ns(), obj);
    }

    /**
     * Issues a remove with query 'obj' against the test collection under a write context.
     */
    void remove(const BSONObj& obj) {
        OldClientWriteContext writeCtx(&_txn, nss.ns());
        _client.remove(nss.ns(), obj);
    }

    /**
     * Returns the operation context owned by this fixture.
     */
    OperationContext* txn() {
        return &_txn;
    }

protected:
    // Declaration order matters: '_txn' refers into '_txnPtr', and '_client' is built from '_txn'.
    const ServiceContext::UniqueOperationContext _txnPtr = cc().makeOperationContext();
    OperationContext& _txn = *_txnPtr;
    DBDirectClient _client;

private:
    /**
     * Drops the test collection under a write context. Shared by the constructor and destructor.
     */
    void dropTestCollection() {
        OldClientWriteContext writeCtx(&_txn, nss.ns());
        _client.dropCollection(nss.ns());
    }
};


// Basic ranking test: collection scan vs. highly selective index scan.  Make sure we also get
// all expected results out as well.
class MPSCollectionScanVsHighlySelectiveIXScan : public QueryStageMultiPlanBase {
public:
    void run() {
        const int N = 5000;
        for (int i = 0; i < N; ++i) {
            insert(BSON("foo" << (i % 10)));
        }

        addIndex(BSON("foo" << 1));

        AutoGetCollectionForRead ctx(&_txn, nss.ns());
        const Collection* coll = ctx.getCollection();

        // Plan 0: IXScan over foo == 7
        // Every call to work() returns something so this should clearly win (by current scoring
        // at least).
        IndexScanParams ixparams;
        ixparams.descriptor =
            coll->getIndexCatalog()->findIndexByKeyPattern(&_txn, BSON("foo" << 1));
        ixparams.bounds.isSimpleRange = true;
        ixparams.bounds.startKey = BSON("" << 7);
        ixparams.bounds.endKey = BSON("" << 7);
        ixparams.bounds.endKeyInclusive = true;
        ixparams.direction = 1;

        unique_ptr<WorkingSet> sharedWs(new WorkingSet());
        IndexScan* ix = new IndexScan(&_txn, ixparams, sharedWs.get(), NULL);
        unique_ptr<PlanStage> firstRoot(new FetchStage(&_txn, sharedWs.get(), ix, NULL, coll));

        // Plan 1: CollScan with matcher.
        CollectionScanParams csparams;
        csparams.collection = coll;
        csparams.direction = CollectionScanParams::FORWARD;

        // Make the filter.
        BSONObj filterObj = BSON("foo" << 7);
        const CollatorInterface* collator = nullptr;
        StatusWithMatchExpression statusWithMatcher = MatchExpressionParser::parse(
            filterObj, ExtensionsCallbackDisallowExtensions(), collator);
        verify(statusWithMatcher.isOK());
        unique_ptr<MatchExpression> filter = std::move(statusWithMatcher.getValue());
        // Make the stage.
        unique_ptr<PlanStage> secondRoot(
            new CollectionScan(&_txn, csparams, sharedWs.get(), filter.get()));

        // Hand the plans off to the MPS.
        auto statusWithCQ = CanonicalQuery::canonicalize(
            txn(), nss, BSON("foo" << 7), ExtensionsCallbackDisallowExtensions());
        verify(statusWithCQ.isOK());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        verify(NULL != cq.get());

        unique_ptr<MultiPlanStage> mps =
            make_unique<MultiPlanStage>(&_txn, ctx.getCollection(), cq.get());
        mps->addPlan(createQuerySolution(), firstRoot.release(), sharedWs.get());
        mps->addPlan(createQuerySolution(), secondRoot.release(), sharedWs.get());

        // Plan 0 aka the first plan aka the index scan should be the best.
        PlanYieldPolicy yieldPolicy(PlanExecutor::YIELD_MANUAL, clockSource.get());
        mps->pickBestPlan(&yieldPolicy);
        ASSERT(mps->bestPlanChosen());
        ASSERT_EQUALS(0, mps->bestPlanIdx());

        // Takes ownership of arguments other than 'collection'.
        auto statusWithPlanExecutor = PlanExecutor::make(&_txn,
                                                         std::move(sharedWs),
                                                         std::move(mps),
                                                         std::move(cq),
                                                         coll,
                                                         PlanExecutor::YIELD_MANUAL);
        ASSERT_OK(statusWithPlanExecutor.getStatus());
        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        // Get all our results out.
        int results = 0;
        BSONObj obj;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            ASSERT_EQUALS(obj["foo"].numberInt(), 7);
            ++results;
        }
        ASSERT_EQUALS(PlanExecutor::IS_EOF, state);
        ASSERT_EQUALS(results, N / 10);
    }
};

// Case in which we select a blocking plan as the winner, and a non-blocking plan
// is available as a backup.
class MPSBackupPlan : public QueryStageMultiPlanBase {
public:
    void run() {
        // Data is just a single {_id: 1, a: 1, b: 1} document.
        insert(BSON("_id" << 1 << "a" << 1 << "b" << 1));

        // Indices on 'a' and 'b'.
        addIndex(BSON("a" << 1));
        addIndex(BSON("b" << 1));

        AutoGetCollectionForRead ctx(&_txn, nss.ns());
        Collection* collection = ctx.getCollection();

        // Query for both 'a' and 'b' and sort on 'b'.
        auto statusWithCQ = CanonicalQuery::canonicalize(txn(),
                                                         nss,
                                                         BSON("a" << 1 << "b" << 1),  // query
                                                         BSON("b" << 1),              // sort
                                                         BSONObj(),                   // proj
                                                         ExtensionsCallbackDisallowExtensions());
        verify(statusWithCQ.isOK());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(nullptr != cq.get());

        // Force index intersection for the rest of this test. The guard puts the previous value
        // back when it goes out of scope, so the global setting is restored on every exit path
        // rather than only when the test runs to completion. (A failed assertion is assumed to
        // leave run() by throwing; previously that would have left the flag forced on.)
        struct ForceIntersectionPlansGuard {
            ForceIntersectionPlansGuard() : oldValue(internalQueryForceIntersectionPlans) {
                internalQueryForceIntersectionPlans = true;
            }
            ~ForceIntersectionPlansGuard() {
                internalQueryForceIntersectionPlans = oldValue;
            }
            ForceIntersectionPlansGuard(const ForceIntersectionPlansGuard&) = delete;
            ForceIntersectionPlansGuard& operator=(const ForceIntersectionPlansGuard&) = delete;

            const bool oldValue;
        } forceIxisectGuard;

        // Get planner params.
        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_txn, collection, cq.get(), &plannerParams);
        // Turn this off otherwise it pops up in some plans.
        plannerParams.options &= ~QueryPlannerParams::KEEP_MUTATIONS;

        // Plan.
        vector<QuerySolution*> solutions;
        Status status = QueryPlanner::plan(*cq, plannerParams, &solutions);
        ASSERT_OK(status);

        // We expect a plan using index {a: 1} and plan using index {b: 1} and
        // an index intersection plan.
        ASSERT_EQUALS(solutions.size(), 3U);

        // Fill out the MultiPlanStage.
        unique_ptr<MultiPlanStage> mps(new MultiPlanStage(&_txn, collection, cq.get()));
        unique_ptr<WorkingSet> ws(new WorkingSet());
        // Put each solution from the planner into the MPS.
        for (size_t i = 0; i < solutions.size(); ++i) {
            PlanStage* root;
            ASSERT(StageBuilder::build(&_txn, collection, *cq, *solutions[i], ws.get(), &root));
            // Takes ownership of 'solutions[i]' and 'root'.
            mps->addPlan(solutions[i], root, ws.get());
        }

        // This sets a backup plan.
        PlanYieldPolicy yieldPolicy(PlanExecutor::YIELD_MANUAL, clockSource.get());
        mps->pickBestPlan(&yieldPolicy);
        ASSERT(mps->bestPlanChosen());
        ASSERT(mps->hasBackupPlan());

        // We should have picked the index intersection plan due to forcing ixisect.
        QuerySolution* soln = mps->bestSolution();
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{sort: {pattern: {b: 1}, limit: 0, node: {sortKeyGen: {node:"
            "{fetch: {node: {andSorted: {nodes: ["
            "{ixscan: {filter: null, pattern: {a:1}}},"
            "{ixscan: {filter: null, pattern: {b:1}}}]}}}}}}}}",
            soln->root.get()));

        // Get the resulting document. Work the stage until it produces a result.
        PlanStage::StageState state = PlanStage::NEED_TIME;
        WorkingSetID wsid = WorkingSet::INVALID_ID;
        while (state != PlanStage::ADVANCED) {
            state = mps->work(&wsid);
            // Exactly one document is expected. Hitting EOF before it arrives is a test failure,
            // and without this check the loop would never terminate.
            ASSERT(PlanStage::IS_EOF != state);
        }
        WorkingSetMember* member = ws->get(wsid);

        // Check the document returned by the query.
        ASSERT(member->hasObj());
        BSONObj expectedDoc = BSON("_id" << 1 << "a" << 1 << "b" << 1);
        ASSERT(expectedDoc.woCompare(member->obj.value()) == 0);

        // The blocking plan became unblocked, so we should no longer have a backup plan,
        // and the winning plan should still be the index intersection one.
        ASSERT(!mps->hasBackupPlan());
        soln = mps->bestSolution();
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{sort: {pattern: {b: 1}, limit: 0, node: {sortKeyGen: {node:"
            "{fetch: {node: {andSorted: {nodes: ["
            "{ixscan: {filter: null, pattern: {a:1}}},"
            "{ixscan: {filter: null, pattern: {b:1}}}]}}}}}}}}",
            soln->root.get()));

        // The index intersection force parameter is restored by 'forceIxisectGuard' on scope exit.
    }
};

// Test the structure and values of the explain output.
class MPSExplainAllPlans : public QueryStageMultiPlanBase {
public:
    void run() {
        // Insert a document to create the collection.
        insert(BSON("x" << 1));

        const int nDocs = 500;

        auto ws = stdx::make_unique<WorkingSet>();
        auto firstPlan = stdx::make_unique<QueuedDataStage>(&_txn, ws.get());
        auto secondPlan = stdx::make_unique<QueuedDataStage>(&_txn, ws.get());

        for (int i = 0; i < nDocs; ++i) {
            addMember(firstPlan.get(), ws.get(), BSON("x" << 1));

            // Make the second plan slower by inserting a NEED_TIME between every result.
            addMember(secondPlan.get(), ws.get(), BSON("x" << 1));
            secondPlan->pushBack(PlanStage::NEED_TIME);
        }

        AutoGetCollectionForRead ctx(&_txn, nss.ns());

        auto cq = uassertStatusOK(CanonicalQuery::canonicalize(
            txn(), nss, BSON("x" << 1), ExtensionsCallbackDisallowExtensions()));
        unique_ptr<MultiPlanStage> mps =
            make_unique<MultiPlanStage>(&_txn, ctx.getCollection(), cq.get());

        // Put each plan into the MultiPlanStage, which receives the released solutions and plan
        // stages.
        auto firstSoln = stdx::make_unique<QuerySolution>();
        auto secondSoln = stdx::make_unique<QuerySolution>();
        mps->addPlan(firstSoln.release(), firstPlan.release(), ws.get());
        mps->addPlan(secondSoln.release(), secondPlan.release(), ws.get());

        // Making a PlanExecutor chooses the best plan.
        auto exec = uassertStatusOK(PlanExecutor::make(
            &_txn, std::move(ws), std::move(mps), ctx.getCollection(), PlanExecutor::YIELD_MANUAL));

        auto root = static_cast<MultiPlanStage*>(exec->getRootStage());
        ASSERT_TRUE(root->bestPlanChosen());
        // The first QueuedDataStage should have won.
        ASSERT_EQ(root->bestPlanIdx(), 0);

        BSONObjBuilder bob;
        Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &bob);
        BSONObj explained = bob.done();

        ASSERT_EQ(explained["executionStats"]["nReturned"].Int(), nDocs);
        ASSERT_EQ(explained["executionStats"]["executionStages"]["needTime"].Int(), 0);
        auto allPlansStats = explained["executionStats"]["allPlansExecution"].Array();
        ASSERT_EQ(allPlansStats.size(), 2UL);

        // The trial-period result cap is the same for every plan, so read it once up front.
        const int maxEvaluationResults = internalQueryPlanEvaluationMaxResults;
        for (auto&& planStats : allPlansStats) {
            ASSERT_EQ(planStats["executionStages"]["stage"].String(), "QUEUED_DATA");
            if (planStats["executionStages"]["needTime"].Int() > 0) {
                // This is the losing plan. Should only have advanced about half the time.
                ASSERT_LT(planStats["nReturned"].Int(), maxEvaluationResults);
            } else {
                // This is the winning plan. Stats here should be from the trial period.
                ASSERT_EQ(planStats["nReturned"].Int(), maxEvaluationResults);
            }
        }
    }

private:
    /**
     * Allocates a new WorkingSetMember with data 'dataObj' in 'ws', and adds the WorkingSetMember
     * to 'qds'.
     *
     * The member stores an owned copy of 'dataObj' (previously the argument was ignored and a
     * hard-coded {x: 1} document was stored instead).
     */
    void addMember(QueuedDataStage* qds, WorkingSet* ws, BSONObj dataObj) {
        WorkingSetID id = ws->allocate();
        WorkingSetMember* wsm = ws->get(id);
        // getOwned() so the member holds owned data before it is moved to the owned-object state.
        wsm->obj = Snapshotted<BSONObj>(SnapshotId(), dataObj.getOwned());
        wsm->transitionToOwnedObj();
        qds->pushBack(id);
    }
};

// Test that the plan summary only includes stats from the winning plan.
//
// This is a regression test for SERVER-20111.
class MPSSummaryStats : public QueryStageMultiPlanBase {
public:
    void run() {
        const int N = 5000;
        for (int i = 0; i < N; ++i) {
            insert(BSON("foo" << (i % 10)));
        }

        // Add two indices to give more plans.
        addIndex(BSON("foo" << 1));
        addIndex(BSON("foo" << -1 << "bar" << 1));

        AutoGetCollectionForRead ctx(&_txn, nss.ns());
        Collection* coll = ctx.getCollection();

        // Create the executor (Matching all documents).
        auto queryObj = BSON("foo" << BSON("$gte" << 0));
        auto cq = uassertStatusOK(CanonicalQuery::canonicalize(
            txn(), nss, queryObj, ExtensionsCallbackDisallowExtensions()));
        auto exec =
            uassertStatusOK(getExecutor(&_txn, coll, std::move(cq), PlanExecutor::YIELD_MANUAL));

        ASSERT_EQ(exec->getRootStage()->stageType(), STAGE_MULTI_PLAN);

        exec->executePlan();

        PlanSummaryStats stats;
        Explain::getSummaryStats(*exec, &stats);

        // If only the winning plan's stats are recorded, we should not have examined more than the
        // total number of documents/index keys.
        ASSERT_LTE(stats.totalDocsExamined, static_cast<size_t>(N));
        ASSERT_LTE(stats.totalKeysExamined, static_cast<size_t>(N));
    }
};

class All : public Suite {
public:
    All() : Suite("query_stage_multiplan") {}

    void setupTests() {
        add<MPSCollectionScanVsHighlySelectiveIXScan>();
        add<MPSBackupPlan>();
        add<MPSExplainAllPlans>();
        add<MPSSummaryStats>();
    }
};

SuiteInstance<All> queryStageMultiPlanAll;

}  // namespace QueryStageMultiPlan
Beispiel #2
0
    void run() {
        // Populate the collection.
        for (int i = 0; i < 50; ++i) {
            insert(BSON("_id" << i << "foo" << i));
        }
        ASSERT_EQUALS(50U, count(BSONObj()));

        // Various variables we'll need.
        OldClientWriteContext ctx(&_txn, nss.ns());
        OpDebug* opDebug = &CurOp::get(_txn)->debug();
        Collection* coll = ctx.getCollection();
        UpdateLifecycleImpl updateLifecycle(false, nss);
        UpdateRequest request(nss);
        UpdateDriver driver((UpdateDriver::Options()));
        const int targetDocIndex = 10;
        const BSONObj query = BSON("foo" << BSON("$gte" << targetDocIndex));
        const unique_ptr<WorkingSet> ws(stdx::make_unique<WorkingSet>());
        const unique_ptr<CanonicalQuery> cq(canonicalize(query));

        // Get the RecordIds that would be returned by an in-order scan.
        vector<RecordId> locs;
        getLocs(coll, CollectionScanParams::FORWARD, &locs);

        // Populate the request.
        request.setQuery(query);
        request.setUpdates(fromjson("{$set: {x: 0}}"));
        request.setSort(BSONObj());
        request.setMulti(false);
        request.setReturnDocs(UpdateRequest::RETURN_NEW);
        request.setLifecycle(&updateLifecycle);

        ASSERT_OK(driver.parse(request.getUpdates(), request.isMulti()));

        // Configure a QueuedDataStage to pass the first object in the collection back in a
        // LOC_AND_OBJ state.
        std::unique_ptr<QueuedDataStage> qds(stdx::make_unique<QueuedDataStage>(ws.get()));
        WorkingSetID id = ws->allocate();
        WorkingSetMember* member = ws->get(id);
        member->loc = locs[targetDocIndex];
        const BSONObj oldDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex);
        member->obj = Snapshotted<BSONObj>(SnapshotId(), oldDoc);
        ws->transitionToLocAndObj(id);
        qds->pushBack(id);

        // Configure the update.
        UpdateStageParams updateParams(&request, &driver, opDebug);
        updateParams.canonicalQuery = cq.get();

        unique_ptr<UpdateStage> updateStage(
            stdx::make_unique<UpdateStage>(&_txn, updateParams, ws.get(), coll, qds.release()));

        // Should return advanced.
        id = WorkingSet::INVALID_ID;
        PlanStage::StageState state = updateStage->work(&id);
        ASSERT_EQUALS(PlanStage::ADVANCED, state);

        // Make sure the returned value is what we expect it to be.

        // Should give us back a valid id.
        ASSERT_TRUE(WorkingSet::INVALID_ID != id);
        WorkingSetMember* resultMember = ws->get(id);
        // With an owned copy of the object, with no RecordId.
        ASSERT_TRUE(resultMember->hasOwnedObj());
        ASSERT_FALSE(resultMember->hasLoc());
        ASSERT_EQUALS(resultMember->getState(), WorkingSetMember::OWNED_OBJ);
        ASSERT_TRUE(resultMember->obj.value().isOwned());

        // Should be the new value.
        BSONObj newDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex << "x" << 0);
        ASSERT_EQUALS(resultMember->obj.value(), newDoc);

        // Should have done the update.
        vector<BSONObj> objs;
        getCollContents(coll, &objs);
        ASSERT_EQUALS(objs[targetDocIndex], newDoc);

        // That should be it.
        id = WorkingSet::INVALID_ID;
        ASSERT_EQUALS(PlanStage::IS_EOF, updateStage->work(&id));
    }
Beispiel #3
0
 // Drops the test collection, under a write context, when the fixture is destroyed.
 virtual ~QueryStageUpdateBase() {
     const auto& ns = nss.ns();
     OldClientWriteContext writeCtx(&_txn, ns);
     _client.dropCollection(ns);
 }
Beispiel #4
0
 // Creates an index with key pattern 'obj' on the test namespace and asserts that it succeeded.
 void addIndex(const BSONObj& obj) {
     const auto createStatus = dbtests::createIndex(&_opCtx, nss.ns(), obj);
     ASSERT_OK(createStatus);
 }
Beispiel #5
0
        /**
         * Validates and executes the write batch in 'request', filling in 'response'.
         *
         * The work happens in phases:
         *   1. Validation of the namespace, write permission, index-insert form, write concern
         *      and batch size. Any failure is reported through toBatchError() and the function
         *      returns without executing anything.
         *   2. Execution of the batch via bulkExecute(), collecting upsert details and
         *      per-item write errors.
         *   3. Waiting for the write concern when the batch fully succeeded, or partially
         *      succeeded and is unordered.
         *   4. Refreshing sharding metadata when the last write error is a stale shard version.
         *   5. Building the response from the collected results and '_stats'.
         */
        void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                               BatchedCommandResponse* response ) {

            // Validate namespace
            const NamespaceString nss = NamespaceString( request.getNS() );
            if ( !nss.isValid() ) {
                toBatchError( Status( ErrorCodes::InvalidNamespace,
                                      nss.ns() + " is not a valid namespace" ),
                              response );
                return;
            }

            // Make sure we can write to the namespace
            Status allowedStatus = userAllowedWriteNS( nss );
            if ( !allowedStatus.isOK() ) {
                toBatchError( allowedStatus, response );
                return;
            }

            // Validate insert index requests
            // TODO: Push insert index requests through createIndex once all upgrade paths support it
            string errMsg;
            if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
                toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
                return;
            }

            // Validate write concern
            // TODO: Lift write concern parsing out of this entirely
            WriteConcernOptions writeConcern;

            // Stays empty unless the request carries its own write concern.
            BSONObj wcDoc;
            if ( request.isWriteConcernSet() ) {
                wcDoc = request.getWriteConcern();
            }

            Status wcStatus = Status::OK();
            if ( wcDoc.isEmpty() ) {

                // The default write concern if empty is w : 1
                // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1

                wcStatus = writeConcern.parse(
                    _defaultWriteConcern.isEmpty() ?
                        WriteConcernOptions::Acknowledged : _defaultWriteConcern );

                if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
                    writeConcern.wNumNodes = 1;
                }
            }
            else {
                wcStatus = writeConcern.parse( wcDoc );
            }

            // Only validate a write concern that parsed successfully.
            if ( wcStatus.isOK() ) {
                wcStatus = validateWriteConcern( writeConcern );
            }

            if ( !wcStatus.isOK() ) {
                toBatchError( wcStatus, response );
                return;
            }

            // Reject empty batches
            if ( request.sizeWriteOps() == 0u ) {
                toBatchError( Status( ErrorCodes::InvalidLength,
                                      "no write ops were included in the batch" ),
                              response );
                return;
            }

            // Validate batch size
            if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
                toBatchError( Status( ErrorCodes::InvalidLength,
                                      stream() << "exceeded maximum write batch size of "
                                               << BatchedCommandRequest::kMaxWriteBatchSize ),
                              response );
                return;
            }

            //
            // End validation
            //

            // True when the write concern has no mode, zero nodes and no sync mode. In that case
            // the response built at the end carries only the ok flag and no details.
            bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                            && writeConcern.syncMode == WriteConcernOptions::NONE;

            Timer commandTimer;

            // The owning vectors hold the details; the references below are views onto them that
            // bulkExecute() fills in.
            OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
            vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

            OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
            vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

            //
            // Apply each batch item, possibly bulking some items together in the write lock.
            // Stops on error if batch is ordered.
            //

            bulkExecute( request, &upserted, &writeErrors );

            //
            // Try to enforce the write concern if everything succeeded (unordered or ordered)
            // OR if something succeeded and we're unordered.
            //

            auto_ptr<WCErrorDetail> wcError;
            bool needToEnforceWC = writeErrors.empty()
                                   || ( !request.getOrdered()
                                        && writeErrors.size() < request.sizeWriteOps() );

            if ( needToEnforceWC ) {

                _client->curop()->setMessage( "waiting for write concern" );

                WriteConcernResult res;
                Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

                // A write concern failure is recorded for the response rather than aborting.
                if ( !status.isOK() ) {
                    wcError.reset( toWriteConcernError( status, res ) );
                }
            }

            //
            // Refresh metadata if needed
            //

            // Only the last write error is inspected for staleness.
            bool staleBatch = !writeErrors.empty()
                              && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

            if ( staleBatch ) {

                const BatchedRequestMetadata* requestMetadata = request.getMetadata();
                // NOTE(review): if dassert is compiled out in some builds, 'requestMetadata' is
                // dereferenced below without a null check there - confirm.
                dassert( requestMetadata );

                // Make sure our shard name is set or is the same as what was set previously
                if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                    //
                    // First, we refresh metadata if we need to based on the requested version.
                    //

                    ChunkVersion latestShardVersion;
                    shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                           requestMetadata->getShardVersion(),
                                                           &latestShardVersion );

                    // Report if we're still changing our metadata
                    // TODO: Better reporting per-collection
                    if ( shardingState.inCriticalMigrateSection() ) {
                        noteInCriticalSection( writeErrors.back() );
                    }

                    if ( queueForMigrationCommit ) {

                        //
                        // Queue up for migration to end - this allows us to be sure that clients will
                        // not repeatedly try to refresh metadata that is not yet written to the config
                        // server.  Not necessary for correctness.
                        // Exposed as optional parameter to allow testing of queuing behavior with
                        // different network timings.
                        //

                        const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                        //
                        // Only wait if we're an older version (in the current collection epoch) and
                        // we're not write compatible, implying that the current migration is affecting
                        // writes.
                        //

                        if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                             !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                            // Keep waiting, logging once per pass, until the critical section ends.
                            while ( shardingState.inCriticalMigrateSection() ) {

                                log() << "write request to old shard version "
                                      << requestMetadata->getShardVersion().toString()
                                      << " waiting for migration commit" << endl;

                                shardingState.waitTillNotInCriticalSection( 10 /* secs */);
                            }
                        }
                    }
                }
                else {
                    // If our shard name is stale, our version must have been stale as well
                    dassert( writeErrors.size() == request.sizeWriteOps() );
                }
            }

            //
            // Construct response
            //

            // Once validation has passed the batch itself is reported as ok; individual failures
            // travel in the error details set below.
            response->setOk( true );

            if ( !silentWC ) {

                if ( upserted.size() ) {
                    response->setUpsertDetails( upserted );
                }

                if ( writeErrors.size() ) {
                    response->setErrDetails( writeErrors );
                }

                // The released write concern error pointer is handed to the response.
                if ( wcError.get() ) {
                    response->setWriteConcernError( wcError.release() );
                }

                if (replset::anyReplEnabled()) {
                    response->setLastOp( _client->getLastOp() );
                    if (replset::theReplSet) {
                        response->setElectionId(replset::theReplSet->getElectionId());
                    }
                }

                // Set the stats for the response
                response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                                + _stats->numDeleted );
                if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                    response->setNModified( _stats->numModified );
            }

            dassert( response->isValid( NULL ) );
        }
Beispiel #6
0
StatusWith<BSONObj> validateIndexSpec(
    OperationContext* opCtx,
    const BSONObj& indexSpec,
    const NamespaceString& expectedNamespace,
    const ServerGlobalParams::FeatureCompatibility& featureCompatibility) {
    bool hasKeyPatternField = false;
    bool hasIndexNameField = false;
    bool hasNamespaceField = false;
    bool hasVersionField = false;
    bool hasCollationField = false;

    auto fieldNamesValidStatus = validateIndexSpecFieldNames(indexSpec);
    if (!fieldNamesValidStatus.isOK()) {
        return fieldNamesValidStatus;
    }

    boost::optional<IndexVersion> resolvedIndexVersion;

    for (auto&& indexSpecElem : indexSpec) {
        auto indexSpecElemFieldName = indexSpecElem.fieldNameStringData();
        if (IndexDescriptor::kKeyPatternFieldName == indexSpecElemFieldName) {
            if (indexSpecElem.type() != BSONType::Object) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kKeyPatternFieldName
                                      << "' must be an object, but got "
                                      << typeName(indexSpecElem.type())};
            }

            std::vector<StringData> keys;
            for (auto&& keyElem : indexSpecElem.Obj()) {
                auto keyElemFieldName = keyElem.fieldNameStringData();
                if (std::find(keys.begin(), keys.end(), keyElemFieldName) != keys.end()) {
                    return {ErrorCodes::BadValue,
                            str::stream() << "The field '" << keyElemFieldName
                                          << "' appears multiple times in the index key pattern "
                                          << indexSpecElem.Obj()};
                }
                keys.push_back(keyElemFieldName);
            }

            // Here we always validate the key pattern according to the most recent rules, in order
            // to enforce that all new indexes have well-formed key patterns.
            Status keyPatternValidateStatus =
                validateKeyPattern(indexSpecElem.Obj(), IndexDescriptor::kLatestIndexVersion);
            if (!keyPatternValidateStatus.isOK()) {
                return keyPatternValidateStatus;
            }

            hasKeyPatternField = true;
        } else if (IndexDescriptor::kIndexNameFieldName == indexSpecElemFieldName) {
            if (indexSpecElem.type() != BSONType::String) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kIndexNameFieldName
                                      << "' must be a string, but got "
                                      << typeName(indexSpecElem.type())};
            }

            hasIndexNameField = true;
        } else if (IndexDescriptor::kNamespaceFieldName == indexSpecElemFieldName) {
            if (indexSpecElem.type() != BSONType::String) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kNamespaceFieldName
                                      << "' must be a string, but got "
                                      << typeName(indexSpecElem.type())};
            }

            StringData ns = indexSpecElem.valueStringData();
            if (ns.empty()) {
                return {ErrorCodes::BadValue,
                        str::stream() << "The field '" << IndexDescriptor::kNamespaceFieldName
                                      << "' cannot be an empty string"};
            }

            if (ns != expectedNamespace.ns()) {
                return {ErrorCodes::BadValue,
                        str::stream() << "The value of the field '"
                                      << IndexDescriptor::kNamespaceFieldName
                                      << "' ("
                                      << ns
                                      << ") doesn't match the namespace '"
                                      << expectedNamespace.ns()
                                      << "'"};
            }

            hasNamespaceField = true;
        } else if (IndexDescriptor::kIndexVersionFieldName == indexSpecElemFieldName) {
            if (!indexSpecElem.isNumber()) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kIndexVersionFieldName
                                      << "' must be a number, but got "
                                      << typeName(indexSpecElem.type())};
            }

            auto requestedIndexVersionAsInt = representAs<int>(indexSpecElem.number());
            if (!requestedIndexVersionAsInt) {
                return {ErrorCodes::BadValue,
                        str::stream()
                            << "Index version must be representable as a 32-bit integer, but got "
                            << indexSpecElem.toString(false, false)};
            }

            const IndexVersion requestedIndexVersion =
                static_cast<IndexVersion>(*requestedIndexVersionAsInt);
            auto creationAllowedStatus = IndexDescriptor::isIndexVersionAllowedForCreation(
                requestedIndexVersion, featureCompatibility, indexSpec);
            if (!creationAllowedStatus.isOK()) {
                return creationAllowedStatus;
            }

            hasVersionField = true;
            resolvedIndexVersion = requestedIndexVersion;
        } else if (IndexDescriptor::kCollationFieldName == indexSpecElemFieldName) {
            if (indexSpecElem.type() != BSONType::Object) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kCollationFieldName
                                      << "' must be an object, but got "
                                      << typeName(indexSpecElem.type())};
            }

            if (indexSpecElem.Obj().isEmpty()) {
                return {ErrorCodes::BadValue,
                        str::stream() << "The field '" << IndexDescriptor::kCollationFieldName
                                      << "' cannot be an empty object."};
            }

            hasCollationField = true;
        } else if (IndexDescriptor::kPartialFilterExprFieldName == indexSpecElemFieldName) {
            if (indexSpecElem.type() != BSONType::Object) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '"
                                      << IndexDescriptor::kPartialFilterExprFieldName
                                      << "' must be an object, but got "
                                      << typeName(indexSpecElem.type())};
            }

            // Just use the simple collator, even though the index may have a separate collation
            // specified or may inherit the default collation from the collection. It's legal to
            // parse with the wrong collation, since the collation can be set on a MatchExpression
            // after the fact. Here, we don't bother checking the collation after the fact, since
            // this invocation of the parser is just for validity checking.
            auto simpleCollator = nullptr;
            boost::intrusive_ptr<ExpressionContext> expCtx(
                new ExpressionContext(opCtx, simpleCollator));

            // Special match expression features (e.g. $jsonSchema, $expr, ...) are not allowed in
            // a partialFilterExpression on index creation.
            auto statusWithMatcher =
                MatchExpressionParser::parse(indexSpecElem.Obj(),
                                             std::move(expCtx),
                                             ExtensionsCallbackNoop(),
                                             MatchExpressionParser::kBanAllSpecialFeatures);
            if (!statusWithMatcher.isOK()) {
                return statusWithMatcher.getStatus();
            }
        } else {
            // We can assume field name is valid at this point. Validation of fieldname is handled
            // prior to this in validateIndexSpecFieldNames().
            continue;
        }
    }

    if (!resolvedIndexVersion) {
        resolvedIndexVersion = IndexDescriptor::getDefaultIndexVersion();
    }

    if (!hasKeyPatternField) {
        return {ErrorCodes::FailedToParse,
                str::stream() << "The '" << IndexDescriptor::kKeyPatternFieldName
                              << "' field is a required property of an index specification"};
    }

    if (!hasIndexNameField) {
        return {ErrorCodes::FailedToParse,
                str::stream() << "The '" << IndexDescriptor::kIndexNameFieldName
                              << "' field is a required property of an index specification"};
    }

    if (hasCollationField && *resolvedIndexVersion < IndexVersion::kV2) {
        return {ErrorCodes::CannotCreateIndex,
                str::stream() << "Invalid index specification " << indexSpec
                              << "; cannot create an index with the '"
                              << IndexDescriptor::kCollationFieldName
                              << "' option and "
                              << IndexDescriptor::kIndexVersionFieldName
                              << "="
                              << static_cast<int>(*resolvedIndexVersion)};
    }

    if (!hasNamespaceField || !hasVersionField) {
        BSONObjBuilder bob;

        if (!hasNamespaceField) {
            // We create a new index specification with the 'ns' field set as 'expectedNamespace' if
            // the field was omitted.
            bob.append(IndexDescriptor::kNamespaceFieldName, expectedNamespace.ns());
        }

        if (!hasVersionField) {
            // We create a new index specification with the 'v' field set as 'defaultIndexVersion'
            // if the field was omitted.
            bob.append(IndexDescriptor::kIndexVersionFieldName,
                       static_cast<int>(*resolvedIndexVersion));
        }

        bob.appendElements(indexSpec);
        return bob.obj();
    }

    return indexSpec;
}
Beispiel #7
0
namespace PlanRankingTests {

// Names from the standard library that the tests below use unqualified.
using std::unique_ptr;
using std::vector;

// Namespace of the collection that the fixture below drops, populates, indexes and queries.
static const NamespaceString nss("unittests.PlanRankingTests");

/**
 * Common fixture for the plan ranking tests. It provides helpers to populate and index the test
 * collection and to run the planner's candidate solutions through a MultiPlanStage.
 *
 * The constructor turns hash-based index intersection on and drops the test collection. The
 * destructor restores the query knobs that the fixture (or an individual test) may have changed.
 */
class PlanRankingTestBase {
public:
    PlanRankingTestBase()
        : _internalQueryForceIntersectionPlans(internalQueryForceIntersectionPlans.load()),
          _enableHashIntersection(internalQueryPlannerEnableHashIntersection.load()),
          _client(&_opCtx) {
        // Run all tests with hash-based intersection enabled.
        internalQueryPlannerEnableHashIntersection.store(true);

        // Ensure N is significantly larger than internalQueryPlanEvaluationWorks.
        ASSERT_GTE(N, internalQueryPlanEvaluationWorks.load() + 1000);

        // Drop whatever an earlier test left behind in the collection.
        OldClientWriteContext ctx(&_opCtx, nss.ns());
        _client.dropCollection(nss.ns());
    }

    virtual ~PlanRankingTestBase() {
        // Restore external setParameter testing bools.
        internalQueryForceIntersectionPlans.store(_internalQueryForceIntersectionPlans);
        internalQueryPlannerEnableHashIntersection.store(_enableHashIntersection);
    }

    /**
     * Inserts 'obj' into the test collection.
     */
    void insert(const BSONObj& obj) {
        OldClientWriteContext ctx(&_opCtx, nss.ns());
        _client.insert(nss.ns(), obj);
    }

    /**
     * Builds an index with key pattern 'obj' on the test collection and asserts that it succeeded.
     */
    void addIndex(const BSONObj& obj) {
        ASSERT_OK(dbtests::createIndex(&_opCtx, nss.ns(), obj));
    }

    /**
     * Use the MultiPlanRunner to pick the best plan for the query 'cq'.  Goes through
     * normal planning to generate solutions and feeds them to the MPR.
     *
     * Does NOT take ownership of 'cq'.  Caller DOES NOT own the returned QuerySolution*.
     */
    QuerySolution* pickBestPlan(CanonicalQuery* cq) {
        AutoGetCollectionForReadCommand ctx(&_opCtx, nss);
        Collection* collection = ctx.getCollection();

        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_opCtx, collection, cq, &plannerParams);
        // Turn this off otherwise it pops up in some plans.
        plannerParams.options &= ~QueryPlannerParams::KEEP_MUTATIONS;

        // Plan. ASSERT_OK (rather than ASSERT(isOK())) reports the planner's error on failure.
        vector<QuerySolution*> solutions;
        Status status = QueryPlanner::plan(*cq, plannerParams, &solutions);
        ASSERT_OK(status);

        ASSERT_GREATER_THAN_OR_EQUALS(solutions.size(), 1U);

        // Fill out the MPR.
        _mps.reset(new MultiPlanStage(&_opCtx, collection, cq));
        // NOTE(review): this working set is destroyed when the function returns, while '_mps'
        // keeps the candidates that were built against it. Nothing visible in this fixture uses
        // them afterwards -- confirm before extending the fixture.
        unique_ptr<WorkingSet> ws(new WorkingSet());
        // Put each solution from the planner into the MPR.
        for (QuerySolution* solution : solutions) {
            PlanStage* root = nullptr;
            ASSERT(StageBuilder::build(&_opCtx, collection, *cq, *solution, ws.get(), &root));
            // Takes ownership of all (actually some) arguments.
            _mps->addPlan(solution, root, ws.get());
        }
        // This is what sets a backup plan, should we test for it.
        PlanYieldPolicy yieldPolicy(PlanExecutor::NO_YIELD,
                                    _opCtx.getServiceContext()->getFastClockSource());
        // Surface a ranking failure directly instead of silently discarding the Status.
        ASSERT_OK(_mps->pickBestPlan(&yieldPolicy));
        ASSERT(_mps->bestPlanChosen());

        size_t bestPlanIdx = _mps->bestPlanIdx();
        ASSERT_LESS_THAN(bestPlanIdx, solutions.size());

        // And return a pointer to the best solution.
        return _mps->bestSolution();
    }

    /**
     * Was a backup plan picked during the ranking process?
     *
     * Must only be called after pickBestPlan() has created the MultiPlanStage.
     */
    bool hasBackupPlan() const {
        ASSERT(nullptr != _mps.get());
        return _mps->hasBackupPlan();
    }

    OperationContext* opCtx() {
        return &_opCtx;
    }

protected:
    // A large number, which must be larger than the number of times
    // candidate plans are worked by the multi plan runner. Used for
    // determining the number of documents in the tests below.
    const int N = 12000;

    const ServiceContext::UniqueOperationContext _txnPtr = cc().makeOperationContext();
    OperationContext& _opCtx = *_txnPtr;

private:
    // Holds the value of global "internalQueryForceIntersectionPlans" setParameter flag.
    // Restored at end of test invocation regardless of test result.
    bool _internalQueryForceIntersectionPlans;

    // Holds the value of the global set parameter so it can be restored at the end
    // of the test.
    bool _enableHashIntersection;

    // The stage created by the most recent call to pickBestPlan().
    unique_ptr<MultiPlanStage> _mps;

    DBDirectClient _client;
};

/**
 * Test that the "prefer ixisect" parameter works.
 */
class PlanRankingIntersectOverride : public PlanRankingTestBase {
public:
    void run() {
        // 'a' is very selective, 'b' is not.
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << i << "b" << 1));
        }

        // Add indices on 'a' and 'b'.
        addIndex(BSON("a" << 1));
        addIndex(BSON("b" << 1));

        unique_ptr<CanonicalQuery> cq;

        // Run the query {a:4, b:1}.
        {
            auto qr = stdx::make_unique<QueryRequest>(nss);
            qr->setFilter(BSON("a" << 100 << "b" << 1));
            auto statusWithCQ = CanonicalQuery::canonicalize(
                opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
            verify(statusWithCQ.isOK());
            cq = std::move(statusWithCQ.getValue());
            ASSERT(cq.get());
        }

        // {a:100} is super selective so choose that.
        QuerySolution* soln = pickBestPlan(cq.get());
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{fetch: {filter: {b:1}, node: {ixscan: {pattern: {a: 1}}}}}", soln->root.get()));

        // Turn on the "force intersect" option.
        // This will be reverted by PlanRankingTestBase's destructor when the test completes.
        internalQueryForceIntersectionPlans.store(true);

        // And run the same query again.
        {
            auto qr = stdx::make_unique<QueryRequest>(nss);
            qr->setFilter(BSON("a" << 100 << "b" << 1));
            auto statusWithCQ = CanonicalQuery::canonicalize(
                opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
            verify(statusWithCQ.isOK());
            cq = std::move(statusWithCQ.getValue());
        }

        // With the "ranking picks ixisect always" option we pick an intersection plan that uses
        // both the {a:1} and {b:1} indices even though it performs poorly.

        soln = pickBestPlan(cq.get());
        ASSERT(
            QueryPlannerTestLib::solutionMatches("{fetch: {node: {andSorted: {nodes: ["
                                                 "{ixscan: {filter: null, pattern: {a:1}}},"
                                                 "{ixscan: {filter: null, pattern: {b:1}}}]}}}}",
                                                 soln->root.get()));
    }
};

/**
 * Test that a hashed AND solution plan is picked along with a non-blocking backup solution.
 */
class PlanRankingIntersectWithBackup : public PlanRankingTestBase {
public:
    void run() {
        // 'a' is very selective, 'b' is not.
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << i << "b" << 1));
        }

        // Add indices on 'a' and 'b'.
        addIndex(BSON("a" << 1));
        addIndex(BSON("b" << 1));

        // Run the query {a:1, b:{$gt:1}.
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("a" << 1 << "b" << BSON("$gt" << 1)));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        verify(statusWithCQ.isOK());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        // Turn on the "force intersect" option.
        // This will be reverted by PlanRankingTestBase's destructor when the test completes.
        internalQueryForceIntersectionPlans.store(true);

        QuerySolution* soln = pickBestPlan(cq.get());
        ASSERT(
            QueryPlannerTestLib::solutionMatches("{fetch: {node: {andHash: {nodes: ["
                                                 "{ixscan: {filter: null, pattern: {a:1}}},"
                                                 "{ixscan: {filter: null, pattern: {b:1}}}]}}}}",
                                                 soln->root.get()));

        // Confirm that a backup plan is available.
        ASSERT(hasBackupPlan());
    }
};

/**
 * Two plans hit EOF at the same time, but one is covered. Make sure that we prefer the covered
 * plan.
 */
class PlanRankingPreferCovered : public PlanRankingTestBase {
public:
    void run() {
        // Insert data {a:i, b:i}.  Index {a:1} and {a:1, b:1}, query on 'a', projection on 'a'
        // and 'b'.  Should prefer the second index as we can pull the 'b' data out.
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << i << "b" << i));
        }

        addIndex(BSON("a" << 1));
        addIndex(BSON("a" << 1 << "b" << 1));

        // Query for a==27 with projection that wants 'a' and 'b'.
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("a" << 27));
        qr->setProj(BSON("_id" << 0 << "a" << 1 << "b" << 1));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        ASSERT_OK(statusWithCQ.getStatus());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        QuerySolution* soln = pickBestPlan(cq.get());

        // Prefer the fully covered plan.
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{proj: {spec: {_id:0, a:1, b:1}, node: {ixscan: {pattern: {a: 1, b:1}}}}}",
            soln->root.get()));
    }
};

/**
 * No plan produces any results or hits EOF. In this case we should never choose an index
 * intersection solution.
 */
class PlanRankingAvoidIntersectIfNoResults : public PlanRankingTestBase {
public:
    void run() {
        // We insert lots of copies of {a:1, b:1, c: 20}.  We have the indices {a:1} and {b:1},
        // and the query is {a:1, b:1, c: 99}.  No data that matches the query but we won't
        // know that during plan ranking.  We don't want to choose an intersection plan here.
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << 1 << "b" << 1 << "c" << 20));
        }

        addIndex(BSON("a" << 1));
        addIndex(BSON("b" << 1));

        // There is no data that matches this query but we don't know that until EOF.
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("a" << 1 << "b" << 1 << "c" << 99));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        ASSERT_OK(statusWithCQ.getStatus());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(nullptr != cq.get());

        QuerySolution* soln = pickBestPlan(cq.get());

        // Anti-prefer the intersection plan: either single-index plan is acceptable.
        bool bestIsScanOverA = QueryPlannerTestLib::solutionMatches(
            "{fetch: {node: {ixscan: {pattern: {a: 1}}}}}", soln->root.get());
        bool bestIsScanOverB = QueryPlannerTestLib::solutionMatches(
            "{fetch: {node: {ixscan: {pattern: {b: 1}}}}}", soln->root.get());
        ASSERT(bestIsScanOverA || bestIsScanOverB);
    }
};

/**
 * No plan produces any results or hits EOF. In this case we should prefer covered solutions to
 * non-covered solutions.
 */
class PlanRankingPreferCoveredEvenIfNoResults : public PlanRankingTestBase {
public:
    void run() {
        // We insert lots of copies of {a:1, b:1}.  We have the indices {a:1} and {a:1, b:1},
        // the query is for a doc that doesn't exist, but there is a projection over 'a' and
        // 'b'.  We should prefer the index that provides a covered query.
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << 1 << "b" << 1));
        }

        addIndex(BSON("a" << 1));
        addIndex(BSON("a" << 1 << "b" << 1));

        // There is no data that matches this query ({a:2}).  Both scans will hit EOF before
        // returning any data.
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("a" << 2));
        qr->setProj(BSON("_id" << 0 << "a" << 1 << "b" << 1));

        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        ASSERT_OK(statusWithCQ.getStatus());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        QuerySolution* soln = pickBestPlan(cq.get());
        // Prefer the fully covered plan.
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{proj: {spec: {_id:0, a:1, b:1}, node: {ixscan: {pattern: {a: 1, b:1}}}}}",
            soln->root.get()));
    }
};

/**
 * We have an index on "a" which is somewhat selective and an index on "b" which is highly
 * selective (will cause an immediate EOF). Make sure that a query with predicates on both "a"
 * and "b" will use the index on "b".
 */
class PlanRankingPreferImmediateEOF : public PlanRankingTestBase {
public:
    void run() {
        // 'a' is very selective, 'b' is not.
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << i << "b" << 1));
        }

        // Add indices on 'a' and 'b'.
        addIndex(BSON("a" << 1));
        addIndex(BSON("b" << 1));

        // Run the query {a:N+1, b:1}.  (No such document.)
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("a" << N + 1 << "b" << 1));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        verify(statusWithCQ.isOK());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        // {a: 100} is super selective so choose that.
        QuerySolution* soln = pickBestPlan(cq.get());
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{fetch: {filter: {b:1}, node: {ixscan: {pattern: {a: 1}}}}}", soln->root.get()));
    }
};

/**
 * Same as PlanRankingPreferImmediateEOF, but substitute a range predicate on "a" for the
 * equality predicate on "a".  The presence of the range predicate has an impact on the
 * intersection plan that is raced against the single-index plans: since "a" no longer generates
 * point interval bounds, the results of the index scan aren't guaranteed to be returned in
 * RecordId order, and so the intersection plan uses the AND_HASHED stage instead of the
 * AND_SORTED stage.  It is still the case that the query should pick the plan that uses index
 * "b", instead of the plan that uses index "a" or the (hashed) intersection plan.
 */
class PlanRankingPreferImmediateEOFAgainstHashed : public PlanRankingTestBase {
public:
    void run() {
        // 'a' is very selective, 'b' is not.
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << i << "b" << 1));
        }

        // Add indices on 'a' and 'b'.
        addIndex(BSON("a" << 1));
        addIndex(BSON("b" << 1));

        // Run the query {a:N+1, b:1}.  (No such document.)
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("a" << BSON("$gte" << N + 1) << "b" << 1));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        verify(statusWithCQ.isOK());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        // {a: 100} is super selective so choose that.
        QuerySolution* soln = pickBestPlan(cq.get());
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{fetch: {filter: {b:1}, node: {ixscan: {pattern: {a: 1}}}}}", soln->root.get()));
    }
};

/**
 * We have an index on _id and a query over _id with a sort.  Ensure that we don't pick a
 * collscan as the best plan even though the _id-scanning solution doesn't produce any results.
 */
class PlanRankingNoCollscan : public PlanRankingTestBase {
public:
    void run() {
        for (int i = 0; i < N; ++i) {
            insert(BSON("_id" << i));
        }

        addIndex(BSON("_id" << 1));

        // Run a query with a sort.  The blocking sort won't produce any data during the
        // evaluation period.
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("_id" << BSON("$gte" << 20 << "$lte" << 200)));
        qr->setSort(BSON("c" << 1));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        ASSERT_OK(statusWithCQ.getStatus());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        QuerySolution* soln = pickBestPlan(cq.get());

        // The best must not be a collscan.
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{sort: {pattern: {c: 1}, limit: 0, node: {sortKeyGen: {node:"
            "{fetch: {filter: null, node: "
            "{ixscan: {filter: null, pattern: {_id: 1}}}}}}}}}",
            soln->root.get()));
    }
};

/**
 * No indices are available, output a collscan.
 */
class PlanRankingCollscan : public PlanRankingTestBase {
public:
    void run() {
        // Insert data for which we have no index.
        for (int i = 0; i < N; ++i) {
            insert(BSON("foo" << i));
        }

        // Look for A Space Odyssey.
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("foo" << 2001));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        verify(statusWithCQ.isOK());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        QuerySolution* soln = pickBestPlan(cq.get());

        // The best must be a collscan.
        ASSERT(QueryPlannerTestLib::solutionMatches("{cscan: {dir: 1, filter: {foo: 2001}}}",
                                                    soln->root.get()));
    }
};

/**
 * When no other information is available, prefer solutions without
 * a blocking sort stage.
 */
class PlanRankingAvoidBlockingSort : public PlanRankingTestBase {
public:
    void run() {
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << 1 << "d" << i));
        }

        // The index {d: 1, e: 1} provides the desired sort order,
        // while index {a: 1, b: 1} can be used to answer the
        // query predicate, but does not provide the sort.
        addIndex(BSON("a" << 1 << "b" << 1));
        addIndex(BSON("d" << 1 << "e" << 1));

        // Query: find({a: 1}).sort({d: 1})
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(BSON("a" << 1));
        qr->setSort(BSON("d" << 1));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        ASSERT_OK(statusWithCQ.getStatus());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        // No results will be returned during the trial period,
        // so we expect to choose {d: 1, e: 1}, as it allows us
        // to avoid the sort stage.
        QuerySolution* soln = pickBestPlan(cq.get());
        ASSERT(
            QueryPlannerTestLib::solutionMatches("{fetch: {filter: {a:1}, node: "
                                                 "{ixscan: {filter: null, pattern: {d:1,e:1}}}}}",
                                                 soln->root.get()));
    }
};

/**
 * Make sure we run candidate plans for long enough when none of the
 * plans are producing results.
 */
class PlanRankingWorkPlansLongEnough : public PlanRankingTestBase {
public:
    void run() {
        for (int i = 0; i < N; ++i) {
            insert(BSON("a" << 1));
            insert(BSON("a" << 1 << "b" << 1 << "c" << i));
        }

        // Indices on 'a' and 'b'.
        addIndex(BSON("a" << 1));
        addIndex(BSON("b" << 1));

        // Solutions using either 'a' or 'b' will take a long time to start producing
        // results. However, an index scan on 'b' will start producing results sooner
        // than an index scan on 'a'.
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(fromjson("{a: 1, b: 1, c: {$gte: 5000}}"));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        ASSERT_OK(statusWithCQ.getStatus());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        // Use index on 'b'.
        QuerySolution* soln = pickBestPlan(cq.get());
        ASSERT(QueryPlannerTestLib::solutionMatches("{fetch: {node: {ixscan: {pattern: {b: 1}}}}}",
                                                    soln->root.get()));
    }
};

/**
 * Suppose we have two plans which are roughly equivalent, other than that
 * one uses an index which involves doing a lot more skipping of index keys.
 * Prefer the plan which does not have to do this index key skipping.
 */
class PlanRankingAccountForKeySkips : public PlanRankingTestBase {
public:
    void run() {
        for (int i = 0; i < 100; ++i) {
            insert(BSON("a" << i << "b" << i << "c" << i));
        }

        // These indices look equivalent to the ranker for the query below unless we account
        // for key skipping. We should pick index {a: 1} if we account for key skipping
        // properly.
        addIndex(BSON("b" << 1 << "c" << 1));
        addIndex(BSON("a" << 1));

        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(fromjson("{a: 9, b: {$ne: 10}, c: 9}"));
        auto statusWithCQ = CanonicalQuery::canonicalize(
            opCtx(), std::move(qr), ExtensionsCallbackDisallowExtensions());
        ASSERT_OK(statusWithCQ.getStatus());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(nullptr != cq.get());

        // Expect to use index {a: 1}.
        QuerySolution* soln = pickBestPlan(cq.get());
        ASSERT(QueryPlannerTestLib::solutionMatches("{fetch: {node: {ixscan: {pattern: {a: 1}}}}}",
                                                    soln->root.get()));
    }
};

/**
 * Test suite named "query_plan_ranking". setupTests() adds each plan ranking test class to the
 * suite, in the order listed, via add<>().
 */
class All : public Suite {
public:
    All() : Suite("query_plan_ranking") {}

    void setupTests() {
        add<PlanRankingIntersectOverride>();
        add<PlanRankingIntersectWithBackup>();
        add<PlanRankingPreferCovered>();
        add<PlanRankingAvoidIntersectIfNoResults>();
        add<PlanRankingPreferCoveredEvenIfNoResults>();
        add<PlanRankingPreferImmediateEOF>();
        add<PlanRankingPreferImmediateEOFAgainstHashed>();
        add<PlanRankingNoCollscan>();
        add<PlanRankingCollscan>();
        add<PlanRankingAvoidBlockingSort>();
        add<PlanRankingWorkPlansLongEnough>();
        add<PlanRankingAccountForKeySkips>();
    }
};

// Namespace-scope instance of the suite. NOTE(review): presumably this is what registers the suite
// with the test framework -- confirm against SuiteInstance.
SuiteInstance<All> planRankingAll;

}  // namespace PlanRankingTest
Beispiel #8
0
/**
 * Runs the legacy query message 'q' against 'nss' and stores the reply in 'result'.
 *
 * The message is canonicalized, an executor is obtained through getExecutorLegacyFind(), and
 * then either
 *   - an explain reply is produced (if the request is an explain), or
 *   - results are pulled from the executor into a QueryResult reply buffer until the executor
 *     stops advancing, the next result no longer fits, or there is enough for the first batch.
 *
 * If shouldSaveCursor() returns true, the executor is registered with the collection's cursor
 * manager and the resulting cursor id is written into the reply; otherwise the cursor id is 0.
 *
 * Errors are reported by throwing (uassert / uassertStatusOK): invalid namespace, failure to
 * canonicalize, the node not being allowed to serve the read, setting maxTimeMS from within a
 * DBDirectClient, interruption, or an executor that ends in the FAILURE or DEAD state.
 *
 * Returns nss.ns() when curOp.debug().exhaust is set (which this function does when it saves a
 * cursor for an exhaust query), and an empty string otherwise.
 */
std::string runQuery(OperationContext* opCtx,
                     QueryMessage& q,
                     const NamespaceString& nss,
                     Message& result) {
    CurOp& curOp = *CurOp::get(opCtx);
    curOp.ensureStarted();

    uassert(ErrorCodes::InvalidNamespace,
            str::stream() << "Invalid ns [" << nss.ns() << "]",
            nss.isValid());
    invariant(!nss.isCommand());

    // Set CurOp information.
    const auto upconvertedQuery = upconvertQueryEntry(q.query, nss, q.ntoreturn, q.ntoskip);
    beginQueryOp(opCtx, nss, upconvertedQuery, q.ntoreturn, q.ntoskip);

    // Parse the qm into a CanonicalQuery.
    const boost::intrusive_ptr<ExpressionContext> expCtx;
    auto cq = uassertStatusOKWithContext(
        CanonicalQuery::canonicalize(opCtx,
                                     q,
                                     expCtx,
                                     ExtensionsCallbackReal(opCtx, &nss),
                                     MatchExpressionParser::kAllowAllSpecialFeatures),
        "Can't canonicalize query");
    invariant(cq.get());

    LOG(5) << "Running query:\n" << redact(cq->toString());
    LOG(2) << "Running query: " << redact(cq->toStringShort());

    // Parse, canonicalize, plan, transcribe, and get a plan executor.
    AutoGetCollectionForReadCommand ctx(opCtx, nss, AutoGetCollection::ViewMode::kViewsForbidden);
    Collection* const collection = ctx.getCollection();

    {
        const QueryRequest& qr = cq->getQueryRequest();

        // Allow the query to run on secondaries if the read preference permits it. If no read
        // preference was specified, allow the query to run iff slaveOk has been set.
        const bool slaveOK = qr.hasReadPref()
            ? uassertStatusOK(ReadPreferenceSetting::fromContainingBSON(q.query))
                  .canRunOnSecondary()
            : qr.isSlaveOk();
        uassertStatusOK(
            repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, slaveOK));
    }

    // We have a parsed query. Time to get the execution plan for it.
    auto exec = uassertStatusOK(getExecutorLegacyFind(opCtx, collection, nss, std::move(cq)));

    // 'cq' was moved into the executor above, so the request is re-read from the executor.
    const QueryRequest& qr = exec->getCanonicalQuery()->getQueryRequest();

    // If it's actually an explain, do the explain and return rather than falling through
    // to the normal query execution loop.
    if (qr.isExplain()) {
        BufBuilder bb;
        bb.skip(sizeof(QueryResult::Value));

        BSONObjBuilder explainBob;
        Explain::explainStages(
            exec.get(), collection, ExplainOptions::Verbosity::kExecAllPlans, &explainBob);

        // Add the resulting object to the return buffer.
        BSONObj explainObj = explainBob.obj();
        bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

        // Set query result fields. The view gets its own name so that it does not shadow the
        // QueryRequest reference 'qr' declared above.
        QueryResult::View explainResultView = bb.buf();
        explainResultView.setResultFlagsToOk();
        explainResultView.msgdata().setLen(bb.len());
        curOp.debug().responseLength = bb.len();
        explainResultView.msgdata().setOperation(opReply);
        explainResultView.setCursorId(0);
        explainResultView.setStartingFrom(0);
        explainResultView.setNReturned(1);
        result.setData(bb.release());
        return "";
    }

    // Handle query option $maxTimeMS (not used with commands).
    if (qr.getMaxTimeMS() > 0) {
        uassert(40116,
                "Illegal attempt to set operation deadline within DBDirectClient",
                !opCtx->getClient()->isInDirectClient());
        opCtx->setDeadlineAfterNowBy(Milliseconds{qr.getMaxTimeMS()});
    }
    opCtx->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

    // Run the query.
    // bb is used to hold query results
    // this buffer should contain either requested documents per query or
    // explain information, but not both
    BufBuilder bb(FindCommon::kInitReplyBufferSize);
    bb.skip(sizeof(QueryResult::Value));

    // How many results have we obtained from the executor?
    int numResults = 0;

    BSONObj obj;
    PlanExecutor::ExecState state;

    // Get summary info about which plan the executor is using.
    {
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        curOp.setPlanSummary_inlock(Explain::getPlanSummary(exec.get()));
    }

    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, nullptr))) {
        // If we can't fit this result inside the current batch, then we stash it for later.
        if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) {
            exec->enqueue(obj);
            break;
        }

        // Add result to output buffer.
        bb.appendBuf((void*)obj.objdata(), obj.objsize());

        // Count the result.
        ++numResults;

        if (FindCommon::enoughForFirstBatch(qr, numResults)) {
            LOG(5) << "Enough for first batch, wantMore=" << qr.wantMore()
                   << " ntoreturn=" << qr.getNToReturn().value_or(0)
                   << " numResults=" << numResults;
            break;
        }
    }

    // Caller expects exceptions thrown in certain cases.
    if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
        error() << "Plan executor error during find: " << PlanExecutor::statestr(state)
                << ", stats: " << redact(Explain::getWinningPlanStats(exec.get()));
        uassertStatusOKWithContext(WorkingSetCommon::getMemberObjectStatus(obj),
                                   "Executor error during OP_QUERY find");
        MONGO_UNREACHABLE;
    }

    // Before saving the cursor, ensure that whatever plan we established happened with the expected
    // collection version
    auto css = CollectionShardingState::get(opCtx, nss);
    css->checkShardVersionOrThrow(opCtx);

    // Fill out CurOp based on query results. If we have a cursorid, we will fill out CurOp with
    // this cursorid later.
    long long ccId = 0;

    if (shouldSaveCursor(opCtx, collection, state, exec.get())) {
        // We won't use the executor until it's getMore'd.
        exec->saveState();
        exec->detachFromOperationContext();

        // Allocate a new ClientCursor and register it with the cursor manager.
        ClientCursorPin pinnedCursor = collection->getCursorManager()->registerCursor(
            opCtx,
            {std::move(exec),
             nss,
             AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(),
             opCtx->recoveryUnit()->getReadConcernLevel(),
             upconvertedQuery});
        ccId = pinnedCursor.getCursor()->cursorid();

        LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults
               << " results";

        // Record on CurOp that this is an exhaust query; the return value at the bottom of this
        // function is derived from this flag.
        if (qr.isExhaust()) {
            curOp.debug().exhaust = true;
        }

        pinnedCursor.getCursor()->setPos(numResults);

        // We assume that cursors created through a DBDirectClient are always used from their
        // original OperationContext, so we do not need to move time to and from the cursor.
        if (!opCtx->getClient()->isInDirectClient()) {
            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            pinnedCursor.getCursor()->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros());
        }

        // 'exec' was moved into the cursor, so the executor is reached through the cursor here.
        endQueryOp(opCtx, collection, *pinnedCursor.getCursor()->getExecutor(), numResults, ccId);
    } else {
        LOG(5) << "Not caching executor but returning " << numResults << " results.";
        endQueryOp(opCtx, collection, *exec, numResults, ccId);
    }

    // Fill out the output buffer's header.
    QueryResult::View queryResultView = bb.buf();
    queryResultView.setCursorId(ccId);
    queryResultView.setResultFlagsToOk();
    queryResultView.msgdata().setLen(bb.len());
    queryResultView.msgdata().setOperation(opReply);
    queryResultView.setStartingFrom(0);
    queryResultView.setNReturned(numResults);

    // Add the results from the query into the output buffer.
    result.setData(bb.release());

    // curOp.debug().exhaust is set above.
    return curOp.debug().exhaust ? nss.ns() : "";
}
Beispiel #9
0
/**
 * Appends storage statistics for the collection 'nss' to 'result'.
 *
 * 'param' may carry a numeric "scale" (must be >= 1; sizes are divided by it) and a "verbose"
 * flag. A "scale" that is a number below 1, or a truthy non-number, yields BadValue.
 *
 * If the database or the collection does not exist, zeroed/empty placeholder fields are appended
 * and NamespaceNotFound is returned. Otherwise the fields size, count, avgObjSize (only when the
 * collection has records), storageSize, any custom record store stats, nindexes, indexDetails,
 * totalIndexSize and indexSizes are appended and OK is returned.
 */
Status appendCollectionStorageStats(OperationContext* opCtx,
                                    const NamespaceString& nss,
                                    const BSONObj& param,
                                    BSONObjBuilder* result) {
    // Validate the optional scaling factor first; nothing is appended on a bad value.
    int scale = 1;
    const auto scaleElem = param["scale"];
    if (scaleElem.isNumber()) {
        scale = scaleElem.numberInt();
        if (scale < 1) {
            return {ErrorCodes::BadValue, "scale has to be >= 1"};
        }
    } else if (scaleElem.trueValue()) {
        return {ErrorCodes::BadValue, "scale has to be a number >= 1"};
    }

    const bool verbose = param["verbose"].trueValue();

    AutoGetCollectionForReadCommand ctx(opCtx, nss);
    Collection* collection = ctx.getCollection();  // Will be set if present
    const bool dbMissing = !ctx.getDb();
    if (dbMissing || !collection) {
        // Emit an all-zero skeleton so the reply still has the usual fields.
        result->appendNumber("size", 0);
        result->appendNumber("count", 0);
        result->appendNumber("storageSize", 0);
        result->append("nindexes", 0);
        result->appendNumber("totalIndexSize", 0);
        result->append("indexDetails", BSONObj());
        result->append("indexSizes", BSONObj());

        std::string errmsg;
        if (dbMissing) {
            errmsg = "Database [" + nss.db().toString() + "] not found.";
        } else {
            errmsg = "Collection [" + nss.toString() + "] not found.";
        }
        return {ErrorCodes::NamespaceNotFound, errmsg};
    }

    // Data size and record count.
    long long scaledDataSize = collection->dataSize(opCtx) / scale;
    result->appendNumber("size", scaledDataSize);

    long long recordCount = collection->numRecords(opCtx);
    result->appendNumber("count", recordCount);
    if (recordCount != 0) {
        result->append("avgObjSize", collection->averageObjectSize(opCtx));
    }

    // Record store level statistics.
    RecordStore* recordStore = collection->getRecordStore();
    const int storageSizeInfoLevel = verbose ? 1 : 0;
    result->appendNumber(
        "storageSize",
        static_cast<long long>(recordStore->storageSize(opCtx, result, storageSizeInfoLevel)) /
            scale);
    recordStore->appendCustomStats(opCtx, result, scale);

    // Index statistics.
    IndexCatalog* indexCatalog = collection->getIndexCatalog();
    result->append("nindexes", indexCatalog->numIndexesReady(opCtx));

    BSONObjBuilder indexDetails;
    for (std::unique_ptr<IndexCatalog::IndexIterator> indexIt =
             indexCatalog->getIndexIterator(opCtx, false);
         indexIt->more();) {
        const IndexCatalogEntry* catalogEntry = indexIt->next();
        const IndexDescriptor* descriptor = catalogEntry->descriptor();
        const IndexAccessMethod* accessMethod = catalogEntry->accessMethod();
        invariant(accessMethod);

        // Only indexes whose access method reports custom stats get an entry.
        BSONObjBuilder customStats;
        if (accessMethod->appendCustomStats(opCtx, &customStats, scale)) {
            indexDetails.append(descriptor->indexName(), customStats.obj());
        }
    }
    result->append("indexDetails", indexDetails.obj());

    BSONObjBuilder indexSizes;
    long long totalIndexSize = collection->getIndexSize(opCtx, &indexSizes, scale);
    result->appendNumber("totalIndexSize", totalIndexSize / scale);
    result->append("indexSizes", indexSizes.obj());

    return Status::OK();
}
Beispiel #10
0
 /**
  * Looks up the ident that the storage engine's catalog holds for index 'indexName' of
  * collection 'nss' and passes it to dropIdent(), returning dropIdent()'s status.
  */
 Status dropIndexTable(OperationContext* opCtx, NamespaceString nss, std::string indexName) {
     auto&& catalog = _storageEngine->getCatalog();
     std::string identToDrop = catalog->getIndexIdent(opCtx, nss.ns(), indexName);
     return dropIdent(opCtx, identToDrop);
 }
Beispiel #11
0
/**
 * Runs findOne() for 'filter' on namespace 'nss' over the connection returned by
 * _getConnection(), with the QueryOption_SlaveOk option and a null third argument, and returns
 * the result of calling getOwned() on the document that comes back.
 */
BSONObj RollbackSourceImpl::findOne(const NamespaceString& nss, const BSONObj& filter) const {
    return _getConnection()
        ->findOne(nss.toString(), filter, nullptr, QueryOption_SlaveOk)
        .getOwned();
}
Beispiel #12
0
 /**
  * Create a collection table in the KVEngine not reflected in the KVCatalog.
  *
  * The record store is created directly on the engine under the ident
  * "collection-" + collName.ns(), with default CollectionOptions and KVPrefix::kNotPrefixed.
  * Returns the status reported by createGroupedRecordStore().
  */
 Status createCollTable(OperationContext* opCtx, NamespaceString collName) {
     const auto& ns = collName.ns();
     const std::string ident = "collection-" + ns;

     auto&& engine = _storageEngine->getEngine();
     return engine->createGroupedRecordStore(
         opCtx, ns, ident, CollectionOptions(), KVPrefix::kNotPrefixed);
 }
Beispiel #13
0
/**
 * Adds 'dataWritten' to the bytes-written counter of 'chunk' and, if the chunk reports that it
 * should be split, attempts an auto-split.
 *
 * The attempt is skipped when no split ticket can be acquired from the manager's throttle or
 * when the refreshed balancer configuration says auto-split is disabled. Otherwise split points
 * are selected on the owning shard, the chunk is split at them, the routing info for the
 * collection is refreshed and, if balancing is allowed and the shard suggested a chunk to
 * migrate, that chunk is handed to the config server for rebalancing.
 *
 * Does not throw DBException: any DBException raised during the split attempt is caught, the
 * chunk's bytes-written counter is cleared and, for stale sharding errors, the cached routing
 * table entry for the collection is invalidated.
 */
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx,
                                           ChunkManager* manager,
                                           Chunk* chunk,
                                           long dataWritten) {
    // Disable lastError tracking so that any errors, which occur during auto-split do not get
    // bubbled up on the client connection doing a write
    LastError::Disabled disableLastError(&LastError::get(opCtx->getClient()));

    const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration();

    // Whether this chunk is the first / last chunk of the shard key space.
    const bool minIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin()));
    const bool maxIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax()));

    const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten);

    const uint64_t desiredChunkSize =
        calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks());

    if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) {
        return;
    }

    const NamespaceString nss(manager->getns());

    if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) {
        LOG(1) << "won't auto split because not enough tickets: " << nss;
        return;
    }

    // Hands the ticket acquired above back when this function exits.
    TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets));

    const ChunkRange chunkRange(chunk->getMin(), chunk->getMax());

    try {
        // Ensure we have the most up-to-date balancer configuration
        uassertStatusOK(balancerConfig->refreshAndCheck(opCtx));

        if (!balancerConfig->getShouldAutoSplit()) {
            return;
        }

        LOG(1) << "about to initiate autosplit: " << redact(chunk->toString())
               << " dataWritten: " << chunkBytesWritten
               << " desiredChunkSize: " << desiredChunkSize;

        const uint64_t chunkSizeToUse = [&]() {
            const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2;

            if (estNumSplitPoints >= kTooManySplitPoints) {
                // The current desired chunk size will split the chunk into lots of small chunk and
                // at the worst case this can result into thousands of chunks. So check and see if a
                // bigger value can be used.
                return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes());
            } else {
                return desiredChunkSize;
            }
        }();

        auto splitPoints =
            uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx,
                                                              chunk->getShardId(),
                                                              nss,
                                                              manager->getShardKeyPattern(),
                                                              chunkRange,
                                                              chunkSizeToUse,
                                                              boost::none));

        if (splitPoints.size() <= 1) {
            // No split points means there isn't enough data to split on; 1 split point means we
            // have between half the chunk size to full chunk size so there is no need to split
            // yet
            chunk->clearBytesWritten();
            return;
        }

        if (minIsInf || maxIsInf) {
            // We don't want to reset _dataWritten since we want to check the other side right away
        } else {
            // We're splitting, so should wait a bit
            chunk->clearBytesWritten();
        }

        // We assume that if the chunk being split is the first (or last) one on the collection,
        // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the
        // very first (or last) key as a split point.
        //
        // This heuristic is skipped for "special" shard key patterns that are not likely to produce
        // monotonically increasing or decreasing values (e.g. hashed shard keys).
        if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) {
            if (minIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true);
                if (!key.isEmpty()) {
                    splitPoints.front() = key.getOwned();
                }
            } else if (maxIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false);
                if (!key.isEmpty()) {
                    splitPoints.back() = key.getOwned();
                }
            }
        }

        const auto suggestedMigrateChunk =
            uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx,
                                                                  chunk->getShardId(),
                                                                  nss,
                                                                  manager->getShardKeyPattern(),
                                                                  manager->getVersion(),
                                                                  chunkRange,
                                                                  splitPoints));

        // Balance the resulting chunks if the option is enabled and if the shard suggested a chunk
        // to balance
        const bool shouldBalance = [&]() {
            if (!balancerConfig->shouldBalanceForAutoSplit())
                return false;

            auto collStatus =
                Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns());
            if (!collStatus.isOK()) {
                log() << "Auto-split for " << nss << " failed to load collection metadata"
                      << causedBy(redact(collStatus.getStatus()));
                return false;
            }

            return collStatus.getValue().value.getAllowBalance();
        }();

        // The "migrate suggested" suffix is only appended when the shard actually suggested a
        // chunk to migrate.
        log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into "
              << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")"
              << (suggestedMigrateChunk
                      ? std::string(" (migrate suggested") +
                          (shouldBalance ? ")" : ", but no migrations allowed)")
                      : std::string());

        // Reload the chunk manager after the split
        auto routingInfo = uassertStatusOK(
            Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx,
                                                                                         nss));

        if (!shouldBalance || !suggestedMigrateChunk) {
            return;
        }

        // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot
        // spot from staying on a single shard. This is based on the assumption that succeeding
        // inserts will fall on the top chunk.

        // We need to use the latest chunk manager (after the split) in order to have the most
        // up-to-date view of the chunk we are about to move
        auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation(
            suggestedMigrateChunk->getMin());

        ChunkType chunkToMove;
        chunkToMove.setNS(nss.ns());
        chunkToMove.setShard(suggestedChunk->getShardId());
        chunkToMove.setMin(suggestedChunk->getMin());
        chunkToMove.setMax(suggestedChunk->getMax());
        chunkToMove.setVersion(suggestedChunk->getLastmod());

        uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove));

        // Ensure the collection gets reloaded because of the move
        Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
    } catch (const DBException& ex) {
        // Auto-split is best effort: reset the counter and do not propagate the error.
        chunk->clearBytesWritten();

        if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) {
            log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex)
                  << ", going to invalidate routing table entry for " << nss;
            Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
        }
    }
}
 // Constructs the fixture: _client is initialized with &_txn, then the collection 'nss' is
 // dropped through it while an OldClientWriteContext for that namespace is in scope.
 QueryStageMultiPlanBase() : _client(&_txn) {
     OldClientWriteContext ctx(&_txn, nss.ns());
     _client.dropCollection(nss.ns());
 }
Beispiel #15
0
/**
 * Builds the index described by '_index' on the collection named by _index["ns"] in 'db'.
 *
 * The collection must already exist (fassert 40409). One attempt consists of initializing a
 * MultiIndexBlock with '_index', inserting all documents of the collection into it, and
 * committing inside a WriteUnitOfWork. The attempt is repeated for as long as it ends with a
 * WriteConflict status; any other status is returned to the caller.
 *
 * When 'allowBackgroundBuilding' is true, 'dbLock' is dereferenced to change the lock mode
 * (MODE_IX while inserting documents, MODE_X otherwise), and _setBgIndexStarting() is called.
 *
 * Returns OK without building if init reports IndexAlreadyExists, or IndexOptionsConflict while
 * '_relaxConstraints' is set.
 */
Status IndexBuilder::_build(OperationContext* opCtx,
                            Database* db,
                            bool allowBackgroundBuilding,
                            Lock::DBLock* dbLock) const {
    const NamespaceString ns(_index["ns"].String());

    Collection* c = db->getCollection(opCtx, ns);

    // Collections should not be implicitly created by the index builder.
    fassert(40409, c);

    {
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        // Show which index we're building in the curop display.
        CurOp::get(opCtx)->setOpDescription_inlock(_index);
    }

    // Ensures _setBgIndexStarting() is called at most once across retries of a successful init.
    bool haveSetBgIndexStarting = false;
    while (true) {
        Status status = Status::OK();
        try {
            MultiIndexBlock indexer(opCtx, c);
            indexer.allowInterruption();

            if (allowBackgroundBuilding)
                indexer.allowBackgroundBuilding();

            try {
                status = indexer.init(_index).getStatus();
                // These init errors are treated as success rather than as a failed build.
                if (status == ErrorCodes::IndexAlreadyExists ||
                    (status == ErrorCodes::IndexOptionsConflict && _relaxConstraints)) {
                    LOG(1) << "Ignoring indexing error: " << redact(status);
                    if (allowBackgroundBuilding) {
                        // Must set this in case anyone is waiting for this build.
                        _setBgIndexStarting();
                    }
                    return Status::OK();
                }

                if (status.isOK()) {
                    if (allowBackgroundBuilding) {
                        if (!haveSetBgIndexStarting) {
                            _setBgIndexStarting();
                            haveSetBgIndexStarting = true;
                        }
                        invariant(dbLock);
                        // Relock the database in MODE_IX for the document insertion phase.
                        dbLock->relockWithMode(MODE_IX);
                    }

                    Lock::CollectionLock colLock(opCtx->lockState(), ns.ns(), MODE_IX);
                    status = indexer.insertAllDocumentsInCollection();
                }

                if (status.isOK()) {
                    if (allowBackgroundBuilding) {
                        // Relock the database in MODE_X before committing the index.
                        dbLock->relockWithMode(MODE_X);
                    }
                    WriteUnitOfWork wunit(opCtx);
                    indexer.commit();
                    wunit.commit();
                }
                if (!status.isOK()) {
                    error() << "bad status from index build: " << redact(status);
                }
            } catch (const DBException& e) {
                // Convert the exception to a status so the handling below still runs.
                status = e.toStatus();
            }

            if (allowBackgroundBuilding) {
                dbLock->relockWithMode(MODE_X);
                // After relocking, check that the database and collection can still be found.
                Database* reloadDb = dbHolder().get(opCtx, ns.db());
                fassert(28553, reloadDb);
                fassert(28554, reloadDb->getCollection(opCtx, ns));
            }

            if (status.code() == ErrorCodes::InterruptedAtShutdown) {
                // leave it as-if kill -9 happened. This will be handled on restart.
                invariant(allowBackgroundBuilding);  // Foreground builds aren't interrupted.
                indexer.abortWithoutCleanup();
            }
        } catch (const WriteConflictException& wce) {
            status = wce.toStatus();
        }

        // Anything other than a write conflict (including success) ends the retry loop.
        if (status.code() != ErrorCodes::WriteConflict)
            return status;


        LOG(2) << "WriteConflictException while creating index in IndexBuilder, retrying.";
        opCtx->recoveryUnit()->abandonSnapshot();
    }
}
Beispiel #16
0
/**
 * Stores 'ns' in '_ns'. 'ns' must satisfy isValid(); this is enforced with an invariant.
 */
void TagsType::setNS(const NamespaceString& ns) {
    invariant(ns.isValid());
    _ns = ns;
}
Beispiel #17
0
/**
 * Merges the chunks of collection 'nss' that cover the range [minKey, maxKey) into one chunk.
 *
 * Steps, in order:
 *   1. take the distributed lock for the collection;
 *   2. refresh the collection metadata and, if 'epoch' is set, check it against the refreshed
 *      shard version's epoch;
 *   3. collect the chunks in the range from the metadata and validate that the range starts and
 *      ends exactly on chunk boundaries, spans more than one chunk and has no holes;
 *   4. run the applyOps command built by runApplyOpsCmd();
 *   5. install the merged chunk in the sharding state and log the change.
 *
 * @param errMsg must be non-null; it is filled with a description on every failure path.
 * @return true if the merge was performed, false otherwise (with '*errMsg' set and a warning
 *         logged).
 */
bool mergeChunks(OperationContext* txn,
                 const NamespaceString& nss,
                 const BSONObj& minKey,
                 const BSONObj& maxKey,
                 const OID& epoch,
                 string* errMsg) {
    // Get the distributed lock
    string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to "
                                 << maxKey;
    auto scopedDistLock = grid.catalogManager(txn)->distLock(
        txn, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout);

    if (!scopedDistLock.isOK()) {
        *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                           << " to merge chunks in [" << minKey << "," << maxKey << ")"
                           << causedBy(scopedDistLock.getStatus());

        warning() << *errMsg;
        return false;
    }

    ShardingState* shardingState = ShardingState::get(txn);

    //
    // We now have the collection lock, refresh metadata to latest version and sanity check
    //

    ChunkVersion shardVersion;
    Status status = shardingState->refreshMetadataNow(txn, nss.ns(), &shardVersion);

    if (!status.isOK()) {
        *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                << nss.ns() << causedBy(status.reason());

        warning() << *errMsg;
        return false;
    }

    if (epoch.isSet() && shardVersion.epoch() != epoch) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed"
                           << " since merge was sent"
                           << "(sent epoch : " << epoch.toString()
                           << ", current epoch : " << shardVersion.epoch().toString() << ")";

        warning() << *errMsg;
        return false;
    }

    shared_ptr<CollectionMetadata> metadata = shardingState->getCollectionMetadata(nss.ns());

    if (!metadata || metadata->getKeyPattern().isEmpty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " is not sharded";

        warning() << *errMsg;
        return false;
    }

    dassert(metadata->getShardVersion().equals(shardVersion));

    if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) {
        *errMsg = stream() << "could not merge chunks, the range " << rangeToString(minKey, maxKey)
                           << " is not valid"
                           << " for collection " << nss.ns() << " with key pattern "
                           << metadata->getKeyPattern();

        warning() << *errMsg;
        return false;
    }

    //
    // Get merged chunk information
    //

    ChunkVersion mergeVersion = metadata->getCollVersion();
    mergeVersion.incMinor();

    std::vector<ChunkType> chunksToMerge;

    // 'itChunk' starts as the empty range [minKey, minKey) so that the first getNextChunk() call
    // below looks up the chunk following minKey.
    ChunkType itChunk;
    itChunk.setMin(minKey);
    itChunk.setMax(minKey);
    itChunk.setNS(nss.ns());
    itChunk.setShard(shardingState->getShardName());

    while (itChunk.getMax().woCompare(maxKey) < 0 &&
           metadata->getNextChunk(itChunk.getMax(), &itChunk)) {
        chunksToMerge.push_back(itChunk);
    }

    if (chunksToMerge.empty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " and ending at " << maxKey
                           << " does not belong to shard " << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    //
    // Validate the range starts and ends at chunks and has no holes, error if not valid
    //

    BSONObj firstDocMin = chunksToMerge.front().getMin();
    BSONObj firstDocMax = chunksToMerge.front().getMax();
    // minKey is inclusive
    bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey);

    if (!minKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " does not belong to shard "
                           << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    BSONObj lastDocMin = chunksToMerge.back().getMin();
    BSONObj lastDocMax = chunksToMerge.back().getMax();
    // maxKey is exclusive
    bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0;

    if (!maxKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range ending at " << maxKey << " does not belong to shard "
                           << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0;
    bool validRangeEndKey = lastDocMax.woCompare(maxKey) == 0;

    if (!validRangeStartKey || !validRangeEndKey) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " does not contain a chunk "
                           << (!validRangeStartKey ? "starting at " + minKey.toString() : "")
                           << (!validRangeStartKey && !validRangeEndKey ? " or " : "")
                           << (!validRangeEndKey ? "ending at " + maxKey.toString() : "");

        warning() << *errMsg;
        return false;
    }

    if (chunksToMerge.size() == 1) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " already contains chunk for " << rangeToString(minKey, maxKey);

        warning() << *errMsg;
        return false;
    }

    // Look for hole in range
    for (size_t i = 1; i < chunksToMerge.size(); ++i) {
        if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) {
            *errMsg =
                stream() << "could not merge chunks, collection " << nss.ns()
                         << " has a hole in the range " << rangeToString(minKey, maxKey) << " at "
                         << rangeToString(chunksToMerge[i - 1].getMax(), chunksToMerge[i].getMin());

            warning() << *errMsg;
            return false;
        }
    }

    //
    // Run apply ops command
    //
    Status applyOpsStatus = runApplyOpsCmd(txn, chunksToMerge, shardVersion, mergeVersion);
    if (!applyOpsStatus.isOK()) {
        // Report the failure through 'errMsg' like every other failure path, so that the caller
        // never sees a false return with an empty error message.
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " failed to apply the merge" << causedBy(applyOpsStatus);

        warning() << *errMsg;
        return false;
    }

    //
    // Install merged chunk metadata
    //

    {
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X);

        shardingState->mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
    }

    //
    // Log change
    //

    BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion);

    grid.catalogManager(txn)->logChange(txn, "merge", nss.ns(), mergeLogEntry);

    return true;
}
Beispiel #18
0
    /**
     * Runs the compact command on the collection named by 'db' and 'cmdObj'.
     *
     * Returns false with 'errmsg' set when the request is refused up front: this node is a
     * replica set primary and 'force' is not set, the namespace is not normal or is a system
     * namespace, or the padding options conflict or are out of range. Throws (via uasserted /
     * uassertStatusOK) when the target is a view, does not exist, or compaction itself fails.
     * On success returns true, appending 'invalidObjects' to 'result' if corrupt documents were
     * reported.
     */
    virtual bool errmsgRun(OperationContext* opCtx,
                           const string& db,
                           const BSONObj& cmdObj,
                           string& errmsg,
                           BSONObjBuilder& result) {
        NamespaceString nss = CommandHelpers::parseNsCollectionRequired(db, cmdObj);

        // A primary only compacts when the caller explicitly asks for it with force:true.
        repl::ReplicationCoordinator* replCoord = repl::ReplicationCoordinator::get(opCtx);
        if (replCoord->getMemberState().primary()) {
            if (!cmdObj["force"].trueValue()) {
                errmsg =
                    "will not run compact on an active replica set primary as this is a slow blocking "
                    "operation. use force:true to force";
                return false;
            }
        }

        if (!nss.isNormal()) {
            errmsg = "bad namespace name";
            return false;
        }

        if (nss.isSystem()) {
            // items in system.* cannot be moved as there might be pointers to them
            // i.e. system.indexes entries are pointed to from NamespaceDetails
            errmsg = "can't compact a system namespace";
            return false;
        }

        CompactOptions compactOptions;

        // Padding is either preserved as-is or set manually; the two styles are exclusive.
        const bool hasPaddingFactor = cmdObj.hasElement("paddingFactor");
        const bool hasPaddingBytes = cmdObj.hasElement("paddingBytes");
        const bool wantsManualPadding = hasPaddingFactor || hasPaddingBytes;

        if (cmdObj["preservePadding"].trueValue()) {
            compactOptions.paddingMode = CompactOptions::PRESERVE;
            if (wantsManualPadding) {
                errmsg = "cannot mix preservePadding and paddingFactor|paddingBytes";
                return false;
            }
        } else if (wantsManualPadding) {
            compactOptions.paddingMode = CompactOptions::MANUAL;

            if (hasPaddingFactor) {
                compactOptions.paddingFactor = cmdObj["paddingFactor"].Number();
                if (compactOptions.paddingFactor < 1 || compactOptions.paddingFactor > 4) {
                    errmsg = "invalid padding factor";
                    return false;
                }
            }

            if (hasPaddingBytes) {
                compactOptions.paddingBytes = cmdObj["paddingBytes"].numberInt();
                if (compactOptions.paddingBytes < 0 ||
                    compactOptions.paddingBytes > (1024 * 1024)) {
                    errmsg = "invalid padding bytes";
                    return false;
                }
            }
        }

        if (cmdObj.hasElement("validate")) {
            compactOptions.validateDocuments = cmdObj["validate"].trueValue();
        }

        AutoGetDb dbGuard(opCtx, db, MODE_X);
        Database* const database = dbGuard.getDb();

        Collection* target = database ? database->getCollection(opCtx, nss) : nullptr;
        auto view =
            database && !target ? database->getViewCatalog()->lookup(opCtx, nss.ns()) : nullptr;

        // 'target' is null both when the database is missing and when the collection is missing,
        // so a single check covers the "nothing to compact" cases.
        if (!target) {
            if (view) {
                uasserted(ErrorCodes::CommandNotSupportedOnView, "can't compact a view");
            }
            uasserted(ErrorCodes::NamespaceNotFound, "collection does not exist");
        }

        OldClientContext ctx(opCtx, nss.ns());
        BackgroundOperation::assertNoBgOpInProgForNs(nss.ns());

        log() << "compact " << nss.ns() << " begin, options: " << compactOptions;

        StatusWith<CompactStats> compactResult = target->compact(opCtx, &compactOptions);
        uassertStatusOK(compactResult.getStatus());

        // Only report corrupt documents when there were some.
        if (compactResult.getValue().corruptDocuments > 0) {
            result.append("invalidObjects", compactResult.getValue().corruptDocuments);
        }

        log() << "compact " << nss.ns() << " end";

        return true;
    }
Beispiel #19
0
/**
 * Runs the find described by the query message 'q' against namespace 'nss' and writes the reply
 * (header plus result documents) into 'result'.
 *
 * If the parsed query is an explain, the reply holds a single explain document and the function
 * returns an empty string without running the normal execution loop.
 *
 * Otherwise results are pulled from the plan executor until the first batch is full or the
 * executor stops advancing. When shouldSaveCursor() says so, the executor is handed to a new
 * ClientCursor and that cursor's id is written into the reply; otherwise the cursor id is 0.
 *
 * Throws (uassert/uasserted) on an invalid namespace, a query that cannot be canonicalized, an
 * executor that ends in FAILURE or DEAD, or when reads cannot be served / the shard version check
 * fails.
 *
 * Returns nss.ns() when curop.debug().exhaust is set (set in this function only for an exhaust
 * query whose cursor was saved), and an empty string otherwise.
 */
std::string runQuery(OperationContext* txn,
                     QueryMessage& q,
                     const NamespaceString& nss,
                     Message& result) {
    CurOp& curop = *CurOp::get(txn);

    uassert(ErrorCodes::InvalidNamespace,
            str::stream() << "Invalid ns [" << nss.ns() << "]",
            nss.isValid());
    invariant(!nss.isCommand());

    // Set curop information.
    beginQueryOp(txn, nss, q.query, q.ntoreturn, q.ntoskip);

    // Parse the qm into a CanonicalQuery.

    auto statusWithCQ = CanonicalQuery::canonicalize(q, ExtensionsCallbackReal(txn, &nss));
    if (!statusWithCQ.isOK()) {
        uasserted(
            17287,
            str::stream() << "Can't canonicalize query: " << statusWithCQ.getStatus().toString());
    }
    unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
    invariant(cq.get());

    LOG(5) << "Running query:\n" << cq->toString();
    LOG(2) << "Running query: " << cq->toStringShort();

    // Parse, canonicalize, plan, transcribe, and get a plan executor.
    AutoGetCollectionForRead ctx(txn, nss);
    Collection* collection = ctx.getCollection();

    // Fall back to the server-wide default profiling level when the database does not exist.
    const int dbProfilingLevel =
        ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

    // We have a parsed query. Time to get the execution plan for it.
    std::unique_ptr<PlanExecutor> exec = uassertStatusOK(
        getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO));

    // 'cq' was moved into the executor above, so the parsed query is read back through 'exec'.
    const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

    // If it's actually an explain, do the explain and return rather than falling through
    // to the normal query execution loop.
    if (pq.isExplain()) {
        BufBuilder bb;
        // Leave room at the front of the buffer for the reply header, filled in via 'qr' below.
        bb.skip(sizeof(QueryResult::Value));

        BSONObjBuilder explainBob;
        Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

        // Add the resulting object to the return buffer.
        BSONObj explainObj = explainBob.obj();
        bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

        // TODO: Does this get overwritten/do we really need to set this twice?
        curop.debug().query = q.query;

        // Set query result fields.
        QueryResult::View qr = bb.buf();
        // NOTE(review): decouple() presumably releases the builder's ownership of the buffer so
        // that 'result' can take it over in setData() below — confirm.
        bb.decouple();
        qr.setResultFlagsToOk();
        qr.msgdata().setLen(bb.len());
        curop.debug().responseLength = bb.len();
        qr.msgdata().setOperation(opReply);
        qr.setCursorId(0);
        qr.setStartingFrom(0);
        qr.setNReturned(1);
        result.setData(qr.view2ptr(), true);
        return "";
    }

    // Handle query option $maxTimeMS (not used with commands).
    curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
    txn->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

    // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
    bool slaveOK = pq.isSlaveOk() || pq.hasReadPref();
    Status serveReadsStatus =
        repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, slaveOK);
    uassertStatusOK(serveReadsStatus);

    // Run the query.
    // bb is used to hold query results
    // this buffer should contain either requested documents per query or
    // explain information, but not both
    BufBuilder bb(FindCommon::kInitReplyBufferSize);
    bb.skip(sizeof(QueryResult::Value));

    // How many results have we obtained from the executor?
    int numResults = 0;

    // If we're replaying the oplog, we save the last time that we read.
    Timestamp slaveReadTill;

    BSONObj obj;
    PlanExecutor::ExecState state;

    // Get summary info about which plan the executor is using.
    {
        stdx::lock_guard<Client> lk(*txn->getClient());
        curop.setPlanSummary_inlock(Explain::getPlanSummary(exec.get()));
    }

    // Fill the first batch. 'state' keeps the last executor state so it can be inspected after the
    // loop, whether the loop ended by a break or because the executor stopped advancing.
    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
        // If we can't fit this result inside the current batch, then we stash it for later.
        if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) {
            exec->enqueue(obj);
            break;
        }

        // Add result to output buffer.
        bb.appendBuf((void*)obj.objdata(), obj.objsize());

        // Count the result.
        ++numResults;

        // Possibly note slave's position in the oplog.
        if (pq.isOplogReplay()) {
            BSONElement e = obj["ts"];
            if (Date == e.type() || bsonTimestamp == e.type()) {
                slaveReadTill = e.timestamp();
            }
        }

        if (FindCommon::enoughForFirstBatch(pq, numResults)) {
            LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore()
                   << " ntoreturn=" << pq.getNToReturn().value_or(0) << " numResults=" << numResults
                   << endl;
            break;
        }
    }

    // If we cache the executor later, we want to deregister it as it receives notifications
    // anyway by virtue of being cached.
    //
    // If we don't cache the executor later, we are deleting it, so it must be deregistered.
    //
    // So, no matter what, deregister the executor.
    exec->deregisterExec();

    // Caller expects exceptions thrown in certain cases.
    if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
        error() << "Plan executor error during find: " << PlanExecutor::statestr(state)
                << ", stats: " << Explain::getWinningPlanStats(exec.get());
        uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
    }

    // Before saving the cursor, ensure that whatever plan we established happened with the expected
    // collection version
    auto css = CollectionShardingState::get(txn, nss);
    css->checkShardVersionOrThrow(txn);

    // Fill out curop based on query results. If we have a cursorid, we will fill out curop with
    // this cursorid later.
    long long ccId = 0;

    if (shouldSaveCursor(txn, collection, state, exec.get())) {
        // We won't use the executor until it's getMore'd.
        exec->saveState();
        exec->detachFromOperationContext();

        // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
        // inserted into a global map by its ctor.
        // 'exec' is released into the cursor here, so from this point the executor must be
        // reached through 'cc' rather than 'exec'.
        ClientCursor* cc =
            new ClientCursor(collection->getCursorManager(),
                             exec.release(),
                             nss.ns(),
                             txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                             pq.getOptions(),
                             pq.getFilter());
        ccId = cc->cursorid();

        LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults
               << " results" << endl;

        // TODO document
        if (pq.isOplogReplay() && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        // TODO document
        if (pq.isExhaust()) {
            curop.debug().exhaust = true;
        }

        cc->setPos(numResults);

        // If the query had a time limit, remaining time is "rolled over" to the cursor (for
        // use by future getmore ops).
        cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

        endQueryOp(txn, collection, *cc->getExecutor(), dbProfilingLevel, numResults, ccId);
    } else {
        LOG(5) << "Not caching executor but returning " << numResults << " results.\n";
        endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, ccId);
    }

    // Add the results from the query into the output buffer.
    result.appendData(bb.buf(), bb.len());
    bb.decouple();

    // Fill out the output buffer's header.
    QueryResult::View qr = result.header().view2ptr();
    qr.setCursorId(ccId);
    qr.setResultFlagsToOk();
    qr.msgdata().setOperation(opReply);
    qr.setStartingFrom(0);
    qr.setNReturned(numResults);

    // curop.debug().exhaust is set above.
    return curop.debug().exhaust ? nss.ns() : "";
}
Beispiel #20
0
/**
 * Validates the index specification 'indexSpec' and returns the specification to use.
 *
 * Checks performed, in order of appearance of the fields in 'indexSpec':
 *  - the key pattern field must be an object with no repeated field names;
 *  - the namespace field must be a non-empty string equal to 'expectedNamespace';
 *  - the index version field must be a number representable as an int and allowed for creation
 *    under 'featureCompatibility';
 *  - the collation field must be an object.
 * All other fields are skipped here; their names are checked up front by
 * validateIndexSpecFieldNames().
 *
 * After the scan, the key pattern is required, and a collation is rejected for index versions
 * below IndexVersion::kV2.
 *
 * Returns 'indexSpec' unchanged when it already carries both the namespace and index version
 * fields. Otherwise returns a new object with the missing field(s) prepended: the namespace set
 * to 'expectedNamespace' and/or the version set to the default for 'featureCompatibility'.
 * Returns a non-OK status describing the first problem found on failure.
 */
StatusWith<BSONObj> validateIndexSpec(
    const BSONObj& indexSpec,
    const NamespaceString& expectedNamespace,
    const ServerGlobalParams::FeatureCompatibility& featureCompatibility) {
    bool hasKeyPatternField = false;
    bool hasNamespaceField = false;
    bool hasVersionField = false;
    bool hasCollationField = false;

    auto fieldNamesValidStatus = validateIndexSpecFieldNames(indexSpec);
    if (!fieldNamesValidStatus.isOK()) {
        return fieldNamesValidStatus;
    }

    // Set from the spec's version field when present; defaulted after the loop otherwise.
    boost::optional<IndexVersion> resolvedIndexVersion;

    for (auto&& indexSpecElem : indexSpec) {
        auto indexSpecElemFieldName = indexSpecElem.fieldNameStringData();
        if (IndexDescriptor::kKeyPatternFieldName == indexSpecElemFieldName) {
            if (indexSpecElem.type() != BSONType::Object) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kKeyPatternFieldName
                                      << "' must be an object, but got "
                                      << typeName(indexSpecElem.type())};
            }

            // Reject key patterns that name the same field more than once.
            std::vector<StringData> keys;
            for (auto&& keyElem : indexSpecElem.Obj()) {
                auto keyElemFieldName = keyElem.fieldNameStringData();
                if (std::find(keys.begin(), keys.end(), keyElemFieldName) != keys.end()) {
                    return {ErrorCodes::BadValue,
                            str::stream() << "The field '" << keyElemFieldName
                                          << "' appears multiple times in the index key pattern "
                                          << indexSpecElem.Obj()};
                }
                keys.push_back(keyElemFieldName);
            }

            hasKeyPatternField = true;
        } else if (IndexDescriptor::kNamespaceFieldName == indexSpecElemFieldName) {
            if (indexSpecElem.type() != BSONType::String) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kNamespaceFieldName
                                      << "' must be a string, but got "
                                      << typeName(indexSpecElem.type())};
            }

            StringData ns = indexSpecElem.valueStringData();
            if (ns.empty()) {
                return {ErrorCodes::BadValue,
                        str::stream() << "The field '" << IndexDescriptor::kNamespaceFieldName
                                      << "' cannot be an empty string"};
            }

            if (ns != expectedNamespace.ns()) {
                return {ErrorCodes::BadValue,
                        str::stream() << "The value of the field '"
                                      << IndexDescriptor::kNamespaceFieldName
                                      << "' ("
                                      << ns
                                      << ") doesn't match the namespace '"
                                      << expectedNamespace.ns()
                                      << "'"};
            }

            hasNamespaceField = true;
        } else if (IndexDescriptor::kIndexVersionFieldName == indexSpecElemFieldName) {
            if (!indexSpecElem.isNumber()) {
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kIndexVersionFieldName
                                      << "' must be a number, but got "
                                      << typeName(indexSpecElem.type())};
            }

            auto requestedIndexVersionAsInt = representAs<int>(indexSpecElem.number());
            if (!requestedIndexVersionAsInt) {
                return {ErrorCodes::BadValue,
                        str::stream()
                            << "Index version must be representable as a 32-bit integer, but got "
                            << indexSpecElem.toString(false, false)};
            }

            const IndexVersion requestedIndexVersion =
                static_cast<IndexVersion>(*requestedIndexVersionAsInt);
            auto creationAllowedStatus = IndexDescriptor::isIndexVersionAllowedForCreation(
                requestedIndexVersion, featureCompatibility, indexSpec);
            if (!creationAllowedStatus.isOK()) {
                return creationAllowedStatus;
            }

            hasVersionField = true;
            resolvedIndexVersion = requestedIndexVersion;
        } else if (IndexDescriptor::kCollationFieldName == indexSpecElemFieldName) {
            if (indexSpecElem.type() != BSONType::Object) {
                // Report the collation field here; naming the namespace field would point the
                // user at the wrong part of their specification.
                return {ErrorCodes::TypeMismatch,
                        str::stream() << "The field '" << IndexDescriptor::kCollationFieldName
                                      << "' must be an object, but got "
                                      << typeName(indexSpecElem.type())};
            }

            hasCollationField = true;
        } else {
            // We can assume field name is valid at this point. Validation of fieldname is handled
            // prior to this in validateIndexSpecFieldNames().
            continue;
        }
    }

    if (!resolvedIndexVersion) {
        resolvedIndexVersion =
            IndexDescriptor::getDefaultIndexVersion(featureCompatibility.version.load());
    }

    if (!hasKeyPatternField) {
        return {ErrorCodes::FailedToParse,
                str::stream() << "The '" << IndexDescriptor::kKeyPatternFieldName
                              << "' field is a required property of an index specification"};
    }

    // This check runs against the resolved version, so it also covers a spec that omitted the
    // version field and picked up the default.
    if (hasCollationField && *resolvedIndexVersion < IndexVersion::kV2) {
        return {ErrorCodes::CannotCreateIndex,
                str::stream() << "Invalid index specification " << indexSpec
                              << "; cannot create an index with the '"
                              << IndexDescriptor::kCollationFieldName
                              << "' option and "
                              << IndexDescriptor::kIndexVersionFieldName
                              << "="
                              << static_cast<int>(*resolvedIndexVersion)};
    }

    if (!hasNamespaceField || !hasVersionField) {
        BSONObjBuilder bob;

        if (!hasNamespaceField) {
            // We create a new index specification with the namespace field set as
            // 'expectedNamespace' if the field was omitted.
            bob.append(IndexDescriptor::kNamespaceFieldName, expectedNamespace.ns());
        }

        if (!hasVersionField) {
            // We create a new index specification with the version field set to the resolved
            // (default) index version if the field was omitted.
            bob.append(IndexDescriptor::kIndexVersionFieldName,
                       static_cast<int>(*resolvedIndexVersion));
        }

        // The filled-in fields come first, followed by everything the caller supplied.
        bob.appendElements(indexSpec);
        return bob.obj();
    }

    return indexSpec;
}
Beispiel #21
0
 /**
  * Inserts 'obj' into the 'nss' namespace through the direct client, holding a write context on
  * that namespace for the duration of the call.
  */
 void insert(const BSONObj& obj) {
     const auto& targetNs = nss.ns();
     OldClientWriteContext writeCtx(&_opCtx, targetNs);
     _client.insert(targetNs, obj);
 }
Beispiel #22
0
/**
 * Truncates the collection 'collectionName' inside a single write unit of work and notifies the
 * op observer.
 *
 * Returns a non-OK status when this node cannot accept replicated writes for the collection, when
 * the collection is a system collection other than system.profile, when it is virtualized, when
 * it is the oplog while replication is enabled, or when the truncate itself fails. Throws (via
 * uassert) when the database or collection does not exist or the namespace is a view.
 */
mongo::Status mongo::emptyCapped(OperationContext* opCtx, const NamespaceString& collectionName) {
    AutoGetDb dbGuard(opCtx, collectionName.db(), MODE_X);

    // A replicated (user-initiated) write is only allowed where this node accepts writes.
    if (opCtx->writesAreReplicated() &&
        !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, collectionName)) {
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while truncating collection: "
                                    << collectionName.ns());
    }

    Database* database = dbGuard.getDb();
    uassert(ErrorCodes::NamespaceNotFound, "no such database", database);

    Collection* target = database->getCollection(opCtx, collectionName);

    // Distinguish "this is a view" from "this does not exist" before giving up on the lookup.
    const bool isView =
        !target && database->getViewCatalog()->lookup(opCtx, collectionName.ns());
    uassert(ErrorCodes::CommandNotSupportedOnView,
            str::stream() << "emptycapped not supported on view: " << collectionName.ns(),
            !isView);
    uassert(ErrorCodes::NamespaceNotFound, "no such collection", target);

    // system.profile is the one system collection that may be truncated.
    const bool protectedSystemCollection =
        collectionName.isSystem() && !collectionName.isSystemDotProfile();
    if (protectedSystemCollection) {
        return Status(ErrorCodes::IllegalOperation,
                      str::stream() << "Cannot truncate a system collection: "
                                    << collectionName.ns());
    }

    if (collectionName.isVirtualized()) {
        return Status(ErrorCodes::IllegalOperation,
                      str::stream() << "Cannot truncate a virtual collection: "
                                    << collectionName.ns());
    }

    const bool replicationEnabled =
        repl::ReplicationCoordinator::get(opCtx)->getReplicationMode() !=
        repl::ReplicationCoordinator::modeNone;
    if (replicationEnabled && collectionName.isOplog()) {
        return Status(ErrorCodes::OplogOperationUnsupported,
                      str::stream() << "Cannot truncate a live oplog while replicating: "
                                    << collectionName.ns());
    }

    BackgroundOperation::assertNoBgOpInProgForNs(collectionName.ns());

    // Leaving this scope without commit() abandons the unit of work, which is what happens on the
    // early return below.
    WriteUnitOfWork unitOfWork(opCtx);

    Status truncateStatus = target->truncate(opCtx);
    if (!truncateStatus.isOK()) {
        return truncateStatus;
    }

    getGlobalServiceContext()->getOpObserver()->onEmptyCapped(
        opCtx, target->ns(), target->uuid());

    unitOfWork.commit();

    return Status::OK();
}
Beispiel #23
0
    /**
     * Converts the collection 'collectionName' into a capped collection of 'size'.
     *
     * The conversion clones the collection into a temporary collection named
     * "tmp.convertToCapped.<coll>" via cloneCollectionAsCapped(), drops the original, renames the
     * temporary collection to the original name, and then notifies the op observer. All of this
     * happens inside one WriteUnitOfWork that is committed only at the end.
     *
     * Returns NotMaster if writes are replicated and this node cannot accept writes for the
     * database, DatabaseNotFound if the database does not exist, or the first non-OK status from
     * the drop / clone / rename steps. Returns Status::OK() on success.
     */
    Status convertToCapped(OperationContext* txn,
                           const NamespaceString& collectionName,
                           double size) {

        StringData dbname = collectionName.db();
        StringData shortSource = collectionName.coll();

        ScopedTransaction transaction(txn, MODE_IX);
        AutoGetDb autoDb(txn, collectionName.db(), MODE_X);

        bool userInitiatedWritesAndNotPrimary = txn->writesAreReplicated() &&
            !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname);

        if (userInitiatedWritesAndNotPrimary) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while converting "
                                        << collectionName.ns() << " to a capped collection");
        }

        Database* const db = autoDb.getDb();
        if (!db) {
            return Status(ErrorCodes::DatabaseNotFound,
                          str::stream() << "database " << dbname << " not found");
        }

        stopIndexBuildsConvertToCapped(txn, db, collectionName);
        BackgroundOperation::assertNoBgOpInProgForDb(dbname);

        // Short name is relative to the database; long name is the full "<db>.<coll>" namespace.
        std::string shortTmpName = str::stream() << "tmp.convertToCapped." << shortSource;
        std::string longTmpName = str::stream() << dbname << "." << shortTmpName;

        WriteUnitOfWork wunit(txn);
        // Remove a temporary collection of the same name if one already exists.
        if (db->getCollection(longTmpName)) {
            Status status = db->dropCollection(txn, longTmpName);
            if (!status.isOK())
                return status;
        }


        // Turn off replication of writes for the clone and for the drop of the original; the
        // caller's setting is put back explicitly after the drop below. ON_BLOCK_EXIT also
        // restores it when the function exits (assuming the usual scope-guard semantics), which
        // covers the early-return paths.
        const bool shouldReplicateWrites = txn->writesAreReplicated();
        txn->setReplicatedWrites(false);
        ON_BLOCK_EXIT(&OperationContext::setReplicatedWrites, txn, shouldReplicateWrites);
        Status status = cloneCollectionAsCapped(txn,
                                                db,
                                                shortSource.toString(),
                                                shortTmpName,
                                                size,
                                                true);

        if (!status.isOK()) {
            return status;
        }

        // The clone must have produced the temporary collection.
        verify(db->getCollection(longTmpName));

        status = db->dropCollection(txn, collectionName.ns());
        txn->setReplicatedWrites(shouldReplicateWrites);
        if (!status.isOK())
            return status;

        status = db->renameCollection(txn, longTmpName, collectionName.ns(), false);
        if (!status.isOK())
            return status;

        getGlobalServiceContext()->getOpObserver()->onConvertToCapped(
                txn,
                NamespaceString(collectionName),
                size);

        wunit.commit();
        return Status::OK();
    }
Beispiel #24
0
    /**
     * Builds the index described by '_index' on the collection named in its "ns" field, creating
     * the collection first if it does not exist.
     *
     * When 'allowBackgroundBuilding' is true the build is registered as a background build and
     * 'dbLock' is relocked to MODE_IX while documents are inserted, then back to MODE_X for the
     * commit and for unregistering the build.
     *
     * Both the collection creation and the index build are retried when a
     * WriteConflictException / WriteConflict status occurs.
     *
     * Returns Status::OK() if the index already exists or was built, otherwise the failing status.
     */
    Status IndexBuilder::_build(OperationContext* txn,
                                Database* db,
                                bool allowBackgroundBuilding,
                                Lock::DBLock* dbLock) const {
        const NamespaceString ns(_index["ns"].String());

        Collection* c = db->getCollection( ns.ns() );
        if ( !c ) {
            // Create the collection, retrying for as long as the attempt hits a write conflict.
            while (true) {
                try {
                    WriteUnitOfWork wunit(txn);
                    c = db->getOrCreateCollection( txn, ns.ns() );
                    verify(c);
                    wunit.commit();
                    break;
                }
                catch (const WriteConflictException& wce) {
                    LOG(2) << "WriteConflictException while creating collection in IndexBuilder"
                           << ", retrying.";
                    txn->recoveryUnit()->commitAndRestart();
                    continue;
                }
            }
        }

        // Show which index we're building in the curop display.
        txn->getCurOp()->setQuery(_index);

        // Tracks whether _setBgIndexStarting() has already been called, so that retries of the
        // loop below do not call it again.
        bool haveSetBgIndexStarting = false;
        while (true) {
            Status status = Status::OK();
            try {
                MultiIndexBlock indexer(txn, c);
                indexer.allowInterruption();

                if (allowBackgroundBuilding)
                    indexer.allowBackgroundBuilding();


                // Only set for background builds, by registerIndexBuild() below; otherwise stays
                // NULL.
                IndexDescriptor* descriptor(NULL);
                try {
                    status = indexer.init(_index);
                    if ( status.code() == ErrorCodes::IndexAlreadyExists ) {
                        if (allowBackgroundBuilding) {
                            // Must set this in case anyone is waiting for this build.
                            _setBgIndexStarting();
                        }
                        return Status::OK();
                    }

                    if (status.isOK()) {
                        if (allowBackgroundBuilding) {
                            descriptor = indexer.registerIndexBuild();
                            if (!haveSetBgIndexStarting) {
                                _setBgIndexStarting();
                                haveSetBgIndexStarting = true;
                            }
                            invariant(dbLock);
                            dbLock->relockWithMode(MODE_IX);
                        }

                        Lock::CollectionLock colLock(txn->lockState(), ns.ns(), MODE_IX);
                        status = indexer.insertAllDocumentsInCollection();
                    }

                    if (status.isOK()) {
                        if (allowBackgroundBuilding) {
                            dbLock->relockWithMode(MODE_X);
                        }
                        WriteUnitOfWork wunit(txn);
                        indexer.commit();
                        wunit.commit();
                    }
                }
                catch (const DBException& e) {
                    // Convert the exception to a status so the cleanup below still runs.
                    status = e.toStatus();
                }

                if (allowBackgroundBuilding) {
                    // NOTE(review): 'dbLock' is only checked with invariant() on the init-success
                    // path above; this dereference assumes callers always pass a non-null lock
                    // when allowBackgroundBuilding is true — confirm. Likewise 'descriptor' is
                    // still NULL here if init() failed.
                    dbLock->relockWithMode(MODE_X);
                    // The database and collection are looked up again after relocking and must
                    // still exist.
                    Database* reloadDb = dbHolder().get(txn, ns.db());
                    fassert(28553, reloadDb);
                    fassert(28554, reloadDb->getCollection(ns.ns()));
                    indexer.unregisterIndexBuild(descriptor);
                }

                if (status.code() == ErrorCodes::InterruptedAtShutdown) {
                    // leave it as-if kill -9 happened. This will be handled on restart.
                    indexer.abortWithoutCleanup();
                }
            }
            catch (const WriteConflictException& wce) {
                status = wce.toStatus();
            }

            // Anything other than a write conflict (including success) ends the retry loop.
            if (status.code() != ErrorCodes::WriteConflict)
                return status;


            LOG(2) << "WriteConflictException while creating index in IndexBuilder, retrying.";
            txn->recoveryUnit()->commitAndRestart();
        }
    }
Beispiel #25
0
    /**
     * Exercises an UpdateStage over a collection scan when a document that the update has not yet
     * modified is invalidated and removed between saveState() and restoreState().
     *
     * Inserts ten documents {_id: i, foo: i}, runs a multi-update setting bar to 3 where foo < 5
     * until three documents have been modified, removes the document at locs[3], then finishes
     * the update. Asserts that four documents were matched and modified and that nine documents
     * remain with the expected contents.
     */
    void run() {
        // Run the update.
        {
            OldClientWriteContext ctx(&_txn, nss.ns());

            // Populate the collection.
            for (int i = 0; i < 10; ++i) {
                insert(BSON("_id" << i << "foo" << i));
            }
            ASSERT_EQUALS(10U, count(BSONObj()));

            CurOp& curOp = *CurOp::get(_txn);
            OpDebug* opDebug = &curOp.debug();
            UpdateDriver driver((UpdateDriver::Options()));
            Database* db = ctx.db();
            Collection* coll = db->getCollection(nss.ns());

            // Get the RecordIds that would be returned by an in-order scan.
            vector<RecordId> locs;
            getLocs(coll, CollectionScanParams::FORWARD, &locs);

            UpdateRequest request(nss);
            UpdateLifecycleImpl updateLifecycle(false, nss);
            request.setLifecycle(&updateLifecycle);

            // Update is a multi-update that sets 'bar' to 3 in every document
            // where foo is less than 5.
            BSONObj query = fromjson("{foo: {$lt: 5}}");
            BSONObj updates = fromjson("{$set: {bar: 3}}");

            request.setMulti();
            request.setQuery(query);
            request.setUpdates(updates);

            ASSERT_OK(driver.parse(request.getUpdates(), request.isMulti()));

            // Configure the scan.
            CollectionScanParams collScanParams;
            collScanParams.collection = coll;
            collScanParams.direction = CollectionScanParams::FORWARD;
            collScanParams.tailable = false;

            // Configure the update.
            UpdateStageParams updateParams(&request, &driver, opDebug);
            unique_ptr<CanonicalQuery> cq(canonicalize(query));
            updateParams.canonicalQuery = cq.get();

            unique_ptr<WorkingSet> ws(new WorkingSet());
            unique_ptr<CollectionScan> cs(
                new CollectionScan(&_txn, collScanParams, ws.get(), cq->root()));

            // The scan is released into the update stage, which becomes the root of the plan.
            unique_ptr<UpdateStage> updateStage(
                new UpdateStage(&_txn, updateParams, ws.get(), coll, cs.release()));

            const UpdateStats* stats =
                static_cast<const UpdateStats*>(updateStage->getSpecificStats());

            // Index into 'locs' of the document to delete, and also the number of documents to
            // modify before deleting it.
            const size_t targetDocIndex = 3;

            // Work the stage until exactly 'targetDocIndex' documents have been modified.
            while (stats->nModified < targetDocIndex) {
                WorkingSetID id = WorkingSet::INVALID_ID;
                PlanStage::StageState state = updateStage->work(&id);
                ASSERT_EQUALS(PlanStage::NEED_TIME, state);
            }

            // Remove locs[targetDocIndex];
            updateStage->saveState();
            updateStage->invalidate(&_txn, locs[targetDocIndex], INVALIDATION_DELETION);
            BSONObj targetDoc = coll->docFor(&_txn, locs[targetDocIndex]).value();
            ASSERT(!targetDoc.isEmpty());
            remove(targetDoc);
            updateStage->restoreState();

            // Do the remaining updates.
            while (!updateStage->isEOF()) {
                WorkingSetID id = WorkingSet::INVALID_ID;
                PlanStage::StageState state = updateStage->work(&id);
                ASSERT(PlanStage::NEED_TIME == state || PlanStage::IS_EOF == state);
            }

            // 4 of the 5 matching documents should have been modified (one was deleted).
            ASSERT_EQUALS(4U, stats->nModified);
            ASSERT_EQUALS(4U, stats->nMatched);
        }

        // Check the contents of the collection.
        {
            AutoGetCollectionForRead ctx(&_txn, nss.ns());
            Collection* collection = ctx.getCollection();

            vector<BSONObj> objs;
            getCollContents(collection, &objs);

            // Verify that the collection now has 9 docs (one was deleted).
            ASSERT_EQUALS(9U, objs.size());

            // Make sure that the collection has certain documents.
            // The {_id: 3} document is deliberately absent from this list: it is expected to be
            // the one removed above (assuming the forward scan order matches insertion order).
            assertHasDoc(objs, fromjson("{_id: 0, foo: 0, bar: 3}"));
            assertHasDoc(objs, fromjson("{_id: 1, foo: 1, bar: 3}"));
            assertHasDoc(objs, fromjson("{_id: 2, foo: 2, bar: 3}"));
            assertHasDoc(objs, fromjson("{_id: 4, foo: 4, bar: 3}"));
            assertHasDoc(objs, fromjson("{_id: 5, foo: 5}"));
            assertHasDoc(objs, fromjson("{_id: 6, foo: 6}"));
        }
    }
Beispiel #26
0
/**
 * Runs a find command for 'nss' against a config server host selected with 'readPref' and
 * accumulates the returned documents into a single QueryResponse.
 *
 * Preconditions (enforced with invariants): this shard's id is "config" and 'readConcernLevel'
 * is kMajorityReadConcern.
 *
 * Returns:
 *  - the targeting error if no host could be found for 'readPref';
 *  - the scheduling error if the fetcher could not be scheduled;
 *  - the status recorded by the fetcher callback if it is not OK;
 *  - otherwise the accumulated documents, plus the last committed opTime parsed from the
 *    response's replica set metadata when that metadata is present.
 */
StatusWith<Shard::QueryResponse> ShardRemote::_exhaustiveFindOnConfig(
    OperationContext* txn,
    const ReadPreferenceSetting& readPref,
    const repl::ReadConcernLevel& readConcernLevel,
    const NamespaceString& nss,
    const BSONObj& query,
    const BSONObj& sort,
    boost::optional<long long> limit) {
    // Do not allow exhaustive finds to be run against regular shards.
    invariant(getId() == "config");

    const auto host =
        _targeter->findHost(readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(txn));
    if (!host.isOK()) {
        return host.getStatus();
    }

    QueryResponse response;

    // If for some reason the callback never gets invoked, we will return this status in response.
    Status status = Status(ErrorCodes::InternalError, "Internal error running find command");

    // Invoked by the fetcher with each batch result. 'status' and 'response' are captured by
    // reference and are only read by this function after fetcher.wait() returns.
    auto fetcherCallback =
        [this, &status, &response](const Fetcher::QueryResponseStatus& dataStatus,
                                   Fetcher::NextAction* nextAction,
                                   BSONObjBuilder* getMoreBob) {

            // Throw out any accumulated results on error
            if (!dataStatus.isOK()) {
                status = dataStatus.getStatus();
                response.docs.clear();
                return;
            }

            auto& data = dataStatus.getValue();
            if (data.otherFields.metadata.hasField(rpc::kReplSetMetadataFieldName)) {
                auto replParseStatus =
                    rpc::ReplSetMetadata::readFromMetadata(data.otherFields.metadata);

                // Unparseable metadata is treated like any other failure: discard the results.
                if (!replParseStatus.isOK()) {
                    status = replParseStatus.getStatus();
                    response.docs.clear();
                    return;
                }

                response.opTime = replParseStatus.getValue().getLastOpCommitted();

                // We return the config opTime that was returned for this particular request, but as
                // a safeguard we ensure our global configOpTime is at least as large as it.
                invariant(grid.configOpTime() >= response.opTime);
            }

            // Store owned copies of the documents in this batch.
            for (const BSONObj& doc : data.documents) {
                response.docs.push_back(doc.getOwned());
            }

            status = Status::OK();

            // Without a builder there is no getMore request to fill in.
            if (!getMoreBob) {
                return;
            }
            getMoreBob->append("getMore", data.cursorId);
            getMoreBob->append("collection", data.nss.coll());
        };

    // Build the read concern sub-object: the requested level together with the current config
    // opTime.
    BSONObj readConcernObj;
    {
        invariant(readConcernLevel == repl::ReadConcernLevel::kMajorityReadConcern);
        const repl::ReadConcernArgs readConcern{grid.configOpTime(), readConcernLevel};
        BSONObjBuilder bob;
        readConcern.appendInfo(&bob);
        readConcernObj =
            bob.done().getObjectField(repl::ReadConcernArgs::kReadConcernFieldName).getOwned();
    }

    auto qr = stdx::make_unique<QueryRequest>(nss);
    qr->setFilter(query);
    qr->setSort(sort);
    qr->setReadConcern(readConcernObj);
    qr->setLimit(limit);

    BSONObjBuilder findCmdBuilder;
    qr->asFindCommand(&findCmdBuilder);

    // Use whichever is shorter: the config command timeout or the operation's remaining time.
    Microseconds maxTime = std::min(duration_cast<Microseconds>(kConfigCommandTimeout),
                                    txn->getRemainingMaxTimeMicros());
    if (maxTime < Milliseconds{1}) {
        // If there is less than 1ms remaining before the maxTime timeout expires, set the max time
        // to 1ms, since setting maxTimeMS to 0 in a find command means "no max time".
        maxTime = Milliseconds{1};
    }

    findCmdBuilder.append(QueryRequest::cmdOptionMaxTimeMS, durationCount<Milliseconds>(maxTime));

    Fetcher fetcher(Grid::get(txn)->getExecutorPool()->getFixedExecutor(),
                    host.getValue(),
                    nss.db().toString(),
                    findCmdBuilder.done(),
                    fetcherCallback,
                    _getMetadataForCommand(readPref),
                    duration_cast<Milliseconds>(maxTime));
    Status scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        return scheduleStatus;
    }

    // Block until the fetcher is done; 'status' and 'response' are final after this point.
    fetcher.wait();

    // Report the outcome for the targeted host, whether it succeeded or failed.
    updateReplSetMonitor(host.getValue(), status);

    if (!status.isOK()) {
        if (status.compareCode(ErrorCodes::ExceededTimeLimit)) {
            LOG(0) << "Operation timed out with status " << status;
        }
        return status;
    }

    return response;
}
Beispiel #27
0
namespace QueryStageUpdate {

using std::unique_ptr;
using std::vector;

static const NamespaceString nss("unittests.QueryStageUpdate");

/**
 * Common fixture for the update stage tests.
 *
 * The constructor drops and re-creates the test collection, and the destructor drops it again.
 * The helpers below run simple operations against that collection through a DBDirectClient or
 * through collection scan stages.
 */
class QueryStageUpdateBase {
public:
    QueryStageUpdateBase() : _client(&_txn) {
        OldClientWriteContext ctx(&_txn, nss.ns());
        _client.dropCollection(nss.ns());
        _client.createCollection(nss.ns());
    }

    virtual ~QueryStageUpdateBase() {
        OldClientWriteContext ctx(&_txn, nss.ns());
        _client.dropCollection(nss.ns());
    }

    /**
     * Inserts 'doc' into the test collection.
     */
    void insert(const BSONObj& doc) {
        _client.insert(nss.ns(), doc);
    }

    /**
     * Removes the documents matching 'obj' from the test collection.
     */
    void remove(const BSONObj& obj) {
        _client.remove(nss.ns(), obj);
    }

    /**
     * Returns the number of documents in the test collection that match 'query'.
     */
    size_t count(const BSONObj& query) {
        return _client.count(nss.ns(), query, 0, 0, 0);
    }

    /**
     * Canonicalizes 'query' against the test namespace, asserting that canonicalization succeeds.
     */
    unique_ptr<CanonicalQuery> canonicalize(const BSONObj& query) {
        auto statusWithCQ = CanonicalQuery::canonicalize(nss, query);
        ASSERT_OK(statusWithCQ.getStatus());
        return std::move(statusWithCQ.getValue());
    }

    /**
     * Runs the update operation by calling work until EOF. Asserts that
     * the update stage always returns NEED_TIME.
     */
    void runUpdate(UpdateStage* updateStage) {
        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState state = PlanStage::NEED_TIME;
        while (PlanStage::IS_EOF != state) {
            ASSERT_EQUALS(PlanStage::NEED_TIME, state);
            state = updateStage->work(&id);
        }
    }

    /**
     * Returns a vector of all of the documents currently in 'collection'.
     *
     * Uses a forward collection scan stage to get the docs, and populates 'out' with
     * owned copies of the results.
     */
    void getCollContents(Collection* collection, vector<BSONObj>* out) {
        WorkingSet ws;

        CollectionScanParams params;
        params.collection = collection;
        params.direction = CollectionScanParams::FORWARD;
        params.tailable = false;

        unique_ptr<CollectionScan> scan(new CollectionScan(&_txn, params, &ws, nullptr));
        while (!scan->isEOF()) {
            WorkingSetID id = WorkingSet::INVALID_ID;
            PlanStage::StageState state = scan->work(&id);
            if (PlanStage::ADVANCED == state) {
                WorkingSetMember* member = ws.get(id);
                verify(member->hasObj());
                // Store an owned copy: 'out' outlives both 'ws' and 'scan', and the object held
                // by the working set member is not guaranteed to own its buffer.
                out->push_back(member->obj.value().getOwned());
            }
        }
    }

    /**
     * Appends to 'out' the RecordId of every document in 'collection', in the order produced by
     * a collection scan in the given 'direction'.
     */
    void getLocs(Collection* collection,
                 CollectionScanParams::Direction direction,
                 vector<RecordId>* out) {
        WorkingSet ws;

        CollectionScanParams params;
        params.collection = collection;
        params.direction = direction;
        params.tailable = false;

        unique_ptr<CollectionScan> scan(new CollectionScan(&_txn, params, &ws, nullptr));
        while (!scan->isEOF()) {
            WorkingSetID id = WorkingSet::INVALID_ID;
            PlanStage::StageState state = scan->work(&id);
            if (PlanStage::ADVANCED == state) {
                WorkingSetMember* member = ws.get(id);
                verify(member->hasLoc());
                out->push_back(member->loc);
            }
        }
    }

    /**
     * Asserts that 'objs' contains 'expectedDoc'.
     */
    void assertHasDoc(const vector<BSONObj>& objs, const BSONObj& expectedDoc) {
        bool foundDoc = false;
        for (size_t i = 0; i < objs.size(); i++) {
            if (0 == objs[i].woCompare(expectedDoc)) {
                foundDoc = true;
                break;
            }
        }
        ASSERT(foundDoc);
    }

protected:
    OperationContextImpl _txn;

private:
    DBDirectClient _client;
};

/**
 * Test an upsert into an empty collection.
 */
class QueryStageUpdateUpsertEmptyColl : public QueryStageUpdateBase {
public:
    void run() {
        // Run the update.
        {
            OldClientWriteContext ctx(&_txn, nss.ns());
            CurOp& curOp = *CurOp::get(_txn);
            OpDebug* opDebug = &curOp.debug();
            UpdateDriver driver((UpdateDriver::Options()));
            Collection* collection = ctx.getCollection();

            // Collection should be empty.
            ASSERT_EQUALS(0U, count(BSONObj()));

            UpdateRequest request(nss);
            UpdateLifecycleImpl updateLifecycle(false, nss);
            request.setLifecycle(&updateLifecycle);

            // Update is the upsert {_id: 0, x: 1}, {$set: {y: 2}}.
            BSONObj query = fromjson("{_id: 0, x: 1}");
            BSONObj updates = fromjson("{$set: {y: 2}}");

            request.setUpsert();
            request.setQuery(query);
            request.setUpdates(updates);

            ASSERT_OK(driver.parse(request.getUpdates(), request.isMulti()));

            // Setup update params.
            UpdateStageParams params(&request, &driver, opDebug);
            unique_ptr<CanonicalQuery> cq(canonicalize(query));
            params.canonicalQuery = cq.get();

            unique_ptr<WorkingSet> ws(new WorkingSet());
            unique_ptr<EOFStage> eofStage(new EOFStage());

            unique_ptr<UpdateStage> updateStage(
                new UpdateStage(&_txn, params, ws.get(), collection, eofStage.release()));

            runUpdate(updateStage.get());
        }

        // Verify the contents of the resulting collection.
        {
            AutoGetCollectionForRead ctx(&_txn, nss.ns());
            Collection* collection = ctx.getCollection();

            vector<BSONObj> objs;
            getCollContents(collection, &objs);

            // Expect a single document, {_id: 0, x: 1, y: 2}.
            ASSERT_EQUALS(1U, objs.size());
            ASSERT_EQUALS(objs[0], fromjson("{_id: 0, x: 1, y: 2}"));
        }
    }
};

/**
 * Test receipt of an invalidation: case in which the document about to be updated
 * is deleted.
 *
 * A multi-update over the five documents with foo < 5 is paused after three documents have been
 * modified. The next document in scan order is then invalidated and removed, and the update is
 * resumed. The deleted document must be skipped and the remaining matching document updated.
 */
class QueryStageUpdateSkipInvalidatedDoc : public QueryStageUpdateBase {
public:
    void run() {
        // Run the update.
        {
            OldClientWriteContext ctx(&_txn, nss.ns());

            // Populate the collection.
            for (int i = 0; i < 10; ++i) {
                insert(BSON("_id" << i << "foo" << i));
            }
            ASSERT_EQUALS(10U, count(BSONObj()));

            CurOp& curOp = *CurOp::get(_txn);
            OpDebug* opDebug = &curOp.debug();
            UpdateDriver driver((UpdateDriver::Options()));
            Database* db = ctx.db();
            Collection* coll = db->getCollection(nss.ns());

            // Get the RecordIds that would be returned by an in-order scan.
            vector<RecordId> locs;
            getLocs(coll, CollectionScanParams::FORWARD, &locs);

            UpdateRequest request(nss);
            UpdateLifecycleImpl updateLifecycle(false, nss);
            request.setLifecycle(&updateLifecycle);

            // Update is a multi-update that sets 'bar' to 3 in every document
            // where foo is less than 5.
            BSONObj query = fromjson("{foo: {$lt: 5}}");
            BSONObj updates = fromjson("{$set: {bar: 3}}");

            request.setMulti();
            request.setQuery(query);
            request.setUpdates(updates);

            ASSERT_OK(driver.parse(request.getUpdates(), request.isMulti()));

            // Configure the scan.
            CollectionScanParams collScanParams;
            collScanParams.collection = coll;
            collScanParams.direction = CollectionScanParams::FORWARD;
            collScanParams.tailable = false;

            // Configure the update.
            UpdateStageParams updateParams(&request, &driver, opDebug);
            unique_ptr<CanonicalQuery> cq(canonicalize(query));
            updateParams.canonicalQuery = cq.get();

            // The collection scan filters on the canonical query's root and is handed to the
            // update stage as its child.
            unique_ptr<WorkingSet> ws(new WorkingSet());
            unique_ptr<CollectionScan> cs(
                new CollectionScan(&_txn, collScanParams, ws.get(), cq->root()));

            unique_ptr<UpdateStage> updateStage(
                new UpdateStage(&_txn, updateParams, ws.get(), coll, cs.release()));

            const UpdateStats* stats =
                static_cast<const UpdateStats*>(updateStage->getSpecificStats());

            // Index into 'locs' of the document that will be deleted mid-update.
            const size_t targetDocIndex = 3;

            // Work the stage until the documents before the target have been modified.
            while (stats->nModified < targetDocIndex) {
                WorkingSetID id = WorkingSet::INVALID_ID;
                PlanStage::StageState state = updateStage->work(&id);
                ASSERT_EQUALS(PlanStage::NEED_TIME, state);
            }

            // Remove locs[targetDocIndex];
            // The stage's state is saved around the deletion, and the stage is told about the
            // invalidation before the document is actually removed.
            updateStage->saveState();
            updateStage->invalidate(&_txn, locs[targetDocIndex], INVALIDATION_DELETION);
            BSONObj targetDoc = coll->docFor(&_txn, locs[targetDocIndex]).value();
            ASSERT(!targetDoc.isEmpty());
            remove(targetDoc);
            updateStage->restoreState();

            // Do the remaining updates.
            while (!updateStage->isEOF()) {
                WorkingSetID id = WorkingSet::INVALID_ID;
                PlanStage::StageState state = updateStage->work(&id);
                ASSERT(PlanStage::NEED_TIME == state || PlanStage::IS_EOF == state);
            }

            // 4 of the 5 matching documents should have been modified (one was deleted).
            ASSERT_EQUALS(4U, stats->nModified);
            ASSERT_EQUALS(4U, stats->nMatched);
        }

        // Check the contents of the collection.
        {
            AutoGetCollectionForRead ctx(&_txn, nss.ns());
            Collection* collection = ctx.getCollection();

            vector<BSONObj> objs;
            getCollContents(collection, &objs);

            // Verify that the collection now has 9 docs (one was deleted).
            ASSERT_EQUALS(9U, objs.size());

            // Make sure that the collection has certain documents.
            // Documents 0, 1, 2 and 4 matched the query and must carry the new 'bar' field;
            // documents 5 and 6 did not match and must be unchanged.
            assertHasDoc(objs, fromjson("{_id: 0, foo: 0, bar: 3}"));
            assertHasDoc(objs, fromjson("{_id: 1, foo: 1, bar: 3}"));
            assertHasDoc(objs, fromjson("{_id: 2, foo: 2, bar: 3}"));
            assertHasDoc(objs, fromjson("{_id: 4, foo: 4, bar: 3}"));
            assertHasDoc(objs, fromjson("{_id: 5, foo: 5}"));
            assertHasDoc(objs, fromjson("{_id: 6, foo: 6}"));
        }
    }
};

/**
 * Test that the update stage returns an owned copy of the original document if
 * ReturnDocOption::RETURN_OLD is specified.
 */
class QueryStageUpdateReturnOldDoc : public QueryStageUpdateBase {
public:
    void run() {
        // Populate the collection.
        for (int i = 0; i < 10; ++i) {
            insert(BSON("_id" << i << "foo" << i));
        }
        ASSERT_EQUALS(10U, count(BSONObj()));

        // Various variables we'll need.
        OldClientWriteContext ctx(&_txn, nss.ns());
        OpDebug* opDebug = &CurOp::get(_txn)->debug();
        Collection* coll = ctx.getCollection();
        UpdateLifecycleImpl updateLifecycle(false, nss);
        UpdateRequest request(nss);
        UpdateDriver driver((UpdateDriver::Options()));
        const int targetDocIndex = 0;  // We'll be working with the first doc in the collection.
        const BSONObj query = BSON("foo" << BSON("$gte" << targetDocIndex));
        const unique_ptr<WorkingSet> ws(stdx::make_unique<WorkingSet>());
        const unique_ptr<CanonicalQuery> cq(canonicalize(query));

        // Get the RecordIds that would be returned by an in-order scan.
        vector<RecordId> locs;
        getLocs(coll, CollectionScanParams::FORWARD, &locs);

        // Populate the request.
        request.setQuery(query);
        request.setUpdates(fromjson("{$set: {x: 0}}"));
        request.setSort(BSONObj());
        request.setMulti(false);
        request.setReturnDocs(UpdateRequest::RETURN_OLD);
        request.setLifecycle(&updateLifecycle);

        ASSERT_OK(driver.parse(request.getUpdates(), request.isMulti()));

        // Configure a QueuedDataStage to pass the first object in the collection back in a
        // LOC_AND_OBJ state.
        std::unique_ptr<QueuedDataStage> qds(stdx::make_unique<QueuedDataStage>(ws.get()));
        WorkingSetID id = ws->allocate();
        WorkingSetMember* member = ws->get(id);
        member->loc = locs[targetDocIndex];
        const BSONObj oldDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex);
        member->obj = Snapshotted<BSONObj>(SnapshotId(), oldDoc);
        ws->transitionToLocAndObj(id);
        qds->pushBack(id);

        // Configure the update.
        UpdateStageParams updateParams(&request, &driver, opDebug);
        updateParams.canonicalQuery = cq.get();

        const unique_ptr<UpdateStage> updateStage(
            stdx::make_unique<UpdateStage>(&_txn, updateParams, ws.get(), coll, qds.release()));

        // Should return advanced.
        id = WorkingSet::INVALID_ID;
        PlanStage::StageState state = updateStage->work(&id);
        ASSERT_EQUALS(PlanStage::ADVANCED, state);

        // Make sure the returned value is what we expect it to be.

        // Should give us back a valid id.
        ASSERT_TRUE(WorkingSet::INVALID_ID != id);
        WorkingSetMember* resultMember = ws->get(id);
        // With an owned copy of the object, with no RecordId.
        ASSERT_TRUE(resultMember->hasOwnedObj());
        ASSERT_FALSE(resultMember->hasLoc());
        ASSERT_EQUALS(resultMember->getState(), WorkingSetMember::OWNED_OBJ);
        ASSERT_TRUE(resultMember->obj.value().isOwned());

        // Should be the old value.
        ASSERT_EQUALS(resultMember->obj.value(), oldDoc);

        // Should have done the update.
        BSONObj newDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex << "x" << 0);
        vector<BSONObj> objs;
        getCollContents(coll, &objs);
        ASSERT_EQUALS(objs[targetDocIndex], newDoc);

        // That should be it.
        id = WorkingSet::INVALID_ID;
        ASSERT_EQUALS(PlanStage::IS_EOF, updateStage->work(&id));
    }
};

/**
 * Test that the update stage returns an owned copy of the updated document if
 * ReturnDocOption::RETURN_NEW is specified.
 */
class QueryStageUpdateReturnNewDoc : public QueryStageUpdateBase {
public:
    void run() {
        // Populate the collection.
        for (int i = 0; i < 50; ++i) {
            insert(BSON("_id" << i << "foo" << i));
        }
        ASSERT_EQUALS(50U, count(BSONObj()));

        // Various variables we'll need.
        OldClientWriteContext ctx(&_txn, nss.ns());
        OpDebug* opDebug = &CurOp::get(_txn)->debug();
        Collection* coll = ctx.getCollection();
        UpdateLifecycleImpl updateLifecycle(false, nss);
        UpdateRequest request(nss);
        UpdateDriver driver((UpdateDriver::Options()));
        const int targetDocIndex = 10;
        const BSONObj query = BSON("foo" << BSON("$gte" << targetDocIndex));
        const unique_ptr<WorkingSet> ws(stdx::make_unique<WorkingSet>());
        const unique_ptr<CanonicalQuery> cq(canonicalize(query));

        // Get the RecordIds that would be returned by an in-order scan.
        vector<RecordId> locs;
        getLocs(coll, CollectionScanParams::FORWARD, &locs);

        // Populate the request.
        request.setQuery(query);
        request.setUpdates(fromjson("{$set: {x: 0}}"));
        request.setSort(BSONObj());
        request.setMulti(false);
        request.setReturnDocs(UpdateRequest::RETURN_NEW);
        request.setLifecycle(&updateLifecycle);

        ASSERT_OK(driver.parse(request.getUpdates(), request.isMulti()));

        // Configure a QueuedDataStage to pass the first object in the collection back in a
        // LOC_AND_OBJ state.
        std::unique_ptr<QueuedDataStage> qds(stdx::make_unique<QueuedDataStage>(ws.get()));
        WorkingSetID id = ws->allocate();
        WorkingSetMember* member = ws->get(id);
        member->loc = locs[targetDocIndex];
        const BSONObj oldDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex);
        member->obj = Snapshotted<BSONObj>(SnapshotId(), oldDoc);
        ws->transitionToLocAndObj(id);
        qds->pushBack(id);

        // Configure the update.
        UpdateStageParams updateParams(&request, &driver, opDebug);
        updateParams.canonicalQuery = cq.get();

        unique_ptr<UpdateStage> updateStage(
            stdx::make_unique<UpdateStage>(&_txn, updateParams, ws.get(), coll, qds.release()));

        // Should return advanced.
        id = WorkingSet::INVALID_ID;
        PlanStage::StageState state = updateStage->work(&id);
        ASSERT_EQUALS(PlanStage::ADVANCED, state);

        // Make sure the returned value is what we expect it to be.

        // Should give us back a valid id.
        ASSERT_TRUE(WorkingSet::INVALID_ID != id);
        WorkingSetMember* resultMember = ws->get(id);
        // With an owned copy of the object, with no RecordId.
        ASSERT_TRUE(resultMember->hasOwnedObj());
        ASSERT_FALSE(resultMember->hasLoc());
        ASSERT_EQUALS(resultMember->getState(), WorkingSetMember::OWNED_OBJ);
        ASSERT_TRUE(resultMember->obj.value().isOwned());

        // Should be the new value.
        BSONObj newDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex << "x" << 0);
        ASSERT_EQUALS(resultMember->obj.value(), newDoc);

        // Should have done the update.
        vector<BSONObj> objs;
        getCollContents(coll, &objs);
        ASSERT_EQUALS(objs[targetDocIndex], newDoc);

        // That should be it.
        id = WorkingSet::INVALID_ID;
        ASSERT_EQUALS(PlanStage::IS_EOF, updateStage->work(&id));
    }
};

/**
 * Test that the update stage does not update or return WorkingSetMembers that it gets back from
 * a child in the OWNED_OBJ state.
 */
class QueryStageUpdateSkipOwnedObjects : public QueryStageUpdateBase {
public:
    void run() {
        // Various variables we'll need.
        OldClientWriteContext ctx(&_txn, nss.ns());
        OpDebug* opDebug = &CurOp::get(_txn)->debug();
        Collection* coll = ctx.getCollection();
        UpdateLifecycleImpl updateLifecycle(false, nss);
        UpdateRequest request(nss);
        UpdateDriver driver((UpdateDriver::Options()));
        const BSONObj query = BSONObj();
        const unique_ptr<WorkingSet> ws(stdx::make_unique<WorkingSet>());
        const unique_ptr<CanonicalQuery> cq(canonicalize(query));

        // Populate the request.
        request.setQuery(query);
        request.setUpdates(fromjson("{$set: {x: 0}}"));
        request.setSort(BSONObj());
        request.setMulti(false);
        request.setReturnDocs(UpdateRequest::ReturnDocOption::RETURN_OLD);
        request.setLifecycle(&updateLifecycle);

        ASSERT_OK(driver.parse(request.getUpdates(), request.isMulti()));

        // Configure a QueuedDataStage to pass an OWNED_OBJ to the update stage.
        unique_ptr<QueuedDataStage> qds(stdx::make_unique<QueuedDataStage>(ws.get()));
        {
            WorkingSetID id = ws->allocate();
            WorkingSetMember* member = ws->get(id);
            member->obj = Snapshotted<BSONObj>(SnapshotId(), fromjson("{x: 1}"));
            member->transitionToOwnedObj();
            qds->pushBack(id);
        }

        // Configure the update.
        UpdateStageParams updateParams(&request, &driver, opDebug);
        updateParams.canonicalQuery = cq.get();

        const unique_ptr<UpdateStage> updateStage(
            stdx::make_unique<UpdateStage>(&_txn, updateParams, ws.get(), coll, qds.release()));
        const UpdateStats* stats = static_cast<const UpdateStats*>(updateStage->getSpecificStats());

        // Call work, passing the set up member to the update stage.
        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState state = updateStage->work(&id);

        // Should return NEED_TIME, not modifying anything.
        ASSERT_EQUALS(PlanStage::NEED_TIME, state);
        ASSERT_EQUALS(stats->nModified, 0U);

        id = WorkingSet::INVALID_ID;
        state = updateStage->work(&id);
        ASSERT_EQUALS(PlanStage::IS_EOF, state);
    }
};

/**
 * The "query_stage_update" suite: registers each update stage test defined in this namespace.
 */
class All : public Suite {
public:
    All() : Suite("query_stage_update") {}

    void setupTests() {
        // One entry per test class, in the order the classes are defined.
        add<QueryStageUpdateUpsertEmptyColl>();
        add<QueryStageUpdateSkipInvalidatedDoc>();
        add<QueryStageUpdateReturnOldDoc>();
        add<QueryStageUpdateReturnNewDoc>();
        add<QueryStageUpdateSkipOwnedObjects>();
    }
};

// Instantiates the suite.
SuiteInstance<All> all;

}  // namespace QueryStageUpdate
namespace QueryStageSubplan {

static const NamespaceString nss("unittests.QueryStageSubplan");

/**
 * Common fixture for the subplan stage tests. Provides helpers for building indexes, inserting
 * documents and canonicalizing find commands against the test namespace. The test collection is
 * dropped when the fixture is destroyed.
 */
class QueryStageSubplanBase {
public:
    QueryStageSubplanBase() : _client(&_txn) {}

    virtual ~QueryStageSubplanBase() {
        OldClientWriteContext ctx(&_txn, nss.ns());
        _client.dropCollection(nss.ns());
    }

    /**
     * Creates an index with key pattern 'obj' on the test collection, asserting success.
     */
    void addIndex(const BSONObj& obj) {
        ASSERT_OK(dbtests::createIndex(&_txn, nss.ns(), obj));
    }

    /**
     * Inserts 'doc' into the test collection.
     */
    void insert(const BSONObj& doc) {
        _client.insert(nss.ns(), doc);
    }

protected:
    /**
     * Builds a CanonicalQuery from 'findCmd', a json string describing a find command. Asserts
     * that both parsing and canonicalization succeed.
     */
    std::unique_ptr<CanonicalQuery> cqFromFindCommand(const std::string& findCmd) {
        const bool isExplain = false;
        BSONObj cmdObj = fromjson(findCmd);

        auto parsedQuery =
            unittest::assertGet(LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain));

        return unittest::assertGet(
            CanonicalQuery::canonicalize(parsedQuery.release(), ExtensionsCallbackNoop()));
    }

    OperationContextImpl _txn;

private:
    DBDirectClient _client;
};

/**
 * SERVER-15012: test that the subplan stage does not crash when the winning solution
 * for an $or clause uses a '2d' index. We don't produce cache data for '2d'. The subplanner
 * should gracefully fail after finding that no cache data is available, allowing us to fall
 * back to regular planning.
 */
class QueryStageSubplanGeo2dOr : public QueryStageSubplanBase {
public:
    void run() {
        OldClientWriteContext ctx(&_txn, nss.ns());
        addIndex(BSON("a"
                      << "2d"
                      << "b" << 1));
        addIndex(BSON("a"
                      << "2d"));

        BSONObj query = fromjson(
            "{$or: [{a: {$geoWithin: {$centerSphere: [[0,0],10]}}},"
            "{a: {$geoWithin: {$centerSphere: [[1,1],10]}}}]}");

        auto statusWithCQ = CanonicalQuery::canonicalize(nss, query);
        ASSERT_OK(statusWithCQ.getStatus());
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        Collection* collection = ctx.getCollection();

        // Get planner params.
        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_txn, collection, cq.get(), &plannerParams);

        WorkingSet ws;
        std::unique_ptr<SubplanStage> subplan(
            new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        // Plan selection should succeed due to falling back on regular planning.
        PlanYieldPolicy yieldPolicy(NULL, PlanExecutor::YIELD_MANUAL);
        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));
    }
};

/**
 * Test the SubplanStage's ability to plan an individual branch using the plan cache.
 */
class QueryStageSubplanPlanFromCache : public QueryStageSubplanBase {
public:
    void run() {
        OldClientWriteContext ctx(&_txn, nss.ns());

        addIndex(BSON("a" << 1));
        addIndex(BSON("a" << 1 << "b" << 1));
        addIndex(BSON("c" << 1));

        for (int i = 0; i < 10; i++) {
            insert(BSON("a" << 1 << "b" << i << "c" << i));
        }

        // This query should result in a plan cache entry for the first $or branch, because
        // there are two competing indices. The second branch has only one relevant index, so
        // its winning plan should not be cached.
        BSONObj query = fromjson("{$or: [{a: 1, b: 3}, {c: 1}]}");

        Collection* collection = ctx.getCollection();

        auto statusWithCQ = CanonicalQuery::canonicalize(nss, query);
        ASSERT_OK(statusWithCQ.getStatus());
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // Get planner params.
        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_txn, collection, cq.get(), &plannerParams);

        WorkingSet ws;
        std::unique_ptr<SubplanStage> subplan(
            new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        PlanYieldPolicy yieldPolicy(NULL, PlanExecutor::YIELD_MANUAL);
        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));

        // Nothing is in the cache yet, so neither branch should have been planned from
        // the plan cache.
        ASSERT_FALSE(subplan->branchPlannedFromCache(0));
        ASSERT_FALSE(subplan->branchPlannedFromCache(1));

        // If we repeat the same query, the plan for the first branch should have come from
        // the cache.
        ws.clear();
        subplan.reset(new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));

        ASSERT_TRUE(subplan->branchPlannedFromCache(0));
        ASSERT_FALSE(subplan->branchPlannedFromCache(1));
    }
};

/**
 * Ensure that the subplan stage doesn't create a plan cache entry if there are no query results.
 */
class QueryStageSubplanDontCacheZeroResults : public QueryStageSubplanBase {
public:
    void run() {
        OldClientWriteContext ctx(&_txn, nss.ns());

        addIndex(BSON("a" << 1 << "b" << 1));
        addIndex(BSON("a" << 1));
        addIndex(BSON("c" << 1));

        for (int i = 0; i < 10; i++) {
            insert(BSON("a" << 1 << "b" << i << "c" << i));
        }

        // Running this query should not create any cache entries. For the first branch, it's
        // because there are no matching results. For the second branch it's because there is only
        // one relevant index.
        BSONObj query = fromjson("{$or: [{a: 1, b: 15}, {c: 1}]}");

        Collection* collection = ctx.getCollection();

        auto statusWithCQ = CanonicalQuery::canonicalize(nss, query);
        ASSERT_OK(statusWithCQ.getStatus());
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // Get planner params.
        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_txn, collection, cq.get(), &plannerParams);

        WorkingSet ws;
        std::unique_ptr<SubplanStage> subplan(
            new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        PlanYieldPolicy yieldPolicy(nullptr, PlanExecutor::YIELD_MANUAL);
        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));

        // Nothing is in the cache yet, so neither branch should have been planned from
        // the plan cache.
        ASSERT_FALSE(subplan->branchPlannedFromCache(0));
        ASSERT_FALSE(subplan->branchPlannedFromCache(1));

        // If we run the query again, it should again be the case that neither branch gets planned
        // from the cache (because the first call to pickBestPlan() refrained from creating any
        // cache entries).
        ws.clear();
        subplan.reset(new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));

        ASSERT_FALSE(subplan->branchPlannedFromCache(0));
        ASSERT_FALSE(subplan->branchPlannedFromCache(1));
    }
};

/**
 * Ensure that the subplan stage doesn't create a plan cache entry if the candidate plans for a
 * branch tie during plan ranking.
 */
class QueryStageSubplanDontCacheTies : public QueryStageSubplanBase {
public:
    void run() {
        OldClientWriteContext ctx(&_txn, nss.ns());

        addIndex(BSON("a" << 1 << "b" << 1));
        addIndex(BSON("a" << 1 << "c" << 1));
        addIndex(BSON("d" << 1));

        for (int i = 0; i < 10; i++) {
            insert(BSON("a" << 1 << "e" << 1 << "d" << 1));
        }

        // Running this query should not create any cache entries. For the first branch, it's
        // because plans using the {a: 1, b: 1} and {a: 1, c: 1} indices should tie during plan
        // ranking. For the second branch it's because there is only one relevant index.
        BSONObj query = fromjson("{$or: [{a: 1, e: 1}, {d: 1}]}");

        Collection* collection = ctx.getCollection();

        auto statusWithCQ = CanonicalQuery::canonicalize(nss, query);
        ASSERT_OK(statusWithCQ.getStatus());
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // Get planner params.
        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_txn, collection, cq.get(), &plannerParams);

        WorkingSet ws;
        std::unique_ptr<SubplanStage> subplan(
            new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        PlanYieldPolicy yieldPolicy(nullptr, PlanExecutor::YIELD_MANUAL);
        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));

        // Nothing is in the cache yet, so neither branch should have been planned from
        // the plan cache.
        ASSERT_FALSE(subplan->branchPlannedFromCache(0));
        ASSERT_FALSE(subplan->branchPlannedFromCache(1));

        // If we run the query again, it should again be the case that neither branch gets planned
        // from the cache (because the first call to pickBestPlan() refrained from creating any
        // cache entries).
        ws.clear();
        subplan.reset(new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));

        ASSERT_FALSE(subplan->branchPlannedFromCache(0));
        ASSERT_FALSE(subplan->branchPlannedFromCache(1));
    }
};

/**
 * Unit test the subplan stage's canUseSubplanning() method.
 */
class QueryStageSubplanCanUseSubplanning : public QueryStageSubplanBase {
public:
    void run() {
        // We won't try and subplan something that doesn't have an $or.
        {
            std::string findCmd = "{find: 'testns', filter: {$and:[{a:1}, {b:1}]}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Don't try and subplan if there is no filter.
        {
            std::string findCmd = "{find: 'testns'}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // We won't try and subplan two contained ORs.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {$or:[{a:1}, {b:1}], $or:[{c:1}, {d:1}], e:1}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can't use subplanning if there is a hint.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {$or: [{a:1, b:1}, {c:1, d:1}]},"
                "hint: {a:1, b:1}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can't use subplanning with min.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {$or: [{a:1, b:1}, {c:1, d:1}]},"
                "min: {a:1, b:1}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can't use subplanning with max.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {$or: [{a:1, b:1}, {c:1, d:1}]},"
                "max: {a:2, b:2}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can't use subplanning with tailable.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {$or: [{a:1, b:1}, {c:1, d:1}]},"
                "tailable: true}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can't use subplanning with snapshot.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {$or: [{a:1, b:1}, {c:1, d:1}]},"
                "snapshot: true}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can use subplanning for rooted $or.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {$or: [{a:1, b:1}, {c:1, d:1}]}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_TRUE(SubplanStage::canUseSubplanning(*cq));

            std::string findCmd2 =
                "{find: 'testns',"
                "filter: {$or: [{a:1}, {c:1}]}}";
            std::unique_ptr<CanonicalQuery> cq2 = cqFromFindCommand(findCmd2);
            ASSERT_TRUE(SubplanStage::canUseSubplanning(*cq2));
        }

        // Can't use subplanning for a single contained $or.
        //
        // TODO: Consider allowing this to use subplanning (see SERVER-13732).
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {e: 1, $or: [{a:1, b:1}, {c:1, d:1}]}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can't use subplanning if the contained $or query has a geo predicate.
        //
        // TODO: Consider allowing this to use subplanning (see SERVER-13732).
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {loc: {$geoWithin: {$centerSphere: [[0,0], 1]}},"
                "e: 1, $or: [{a:1, b:1}, {c:1, d:1}]}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can't use subplanning if the contained $or query also has a $text predicate.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {$text: {$search: 'foo'},"
                "e: 1, $or: [{a:1, b:1}, {c:1, d:1}]}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }

        // Can't use subplanning if the contained $or query also has a $near predicate.
        {
            std::string findCmd =
                "{find: 'testns',"
                "filter: {loc: {$near: [0, 0]},"
                "e: 1, $or: [{a:1, b:1}, {c:1, d:1}]}}";
            std::unique_ptr<CanonicalQuery> cq = cqFromFindCommand(findCmd);
            ASSERT_FALSE(SubplanStage::canUseSubplanning(*cq));
        }
    }
};

/**
 * Unit test the subplan stage's rewriteToRootedOr() method.
 */
class QueryStageSubplanRewriteToRootedOr : public QueryStageSubplanBase {
public:
    void run() {
        // Rewrite (AND (OR a b) e) => (OR (AND a e) (AND b e))
        {
            BSONObj queryObj = fromjson("{$or:[{a:1}, {b:1}], e:1}");
            StatusWithMatchExpression expr = MatchExpressionParser::parse(queryObj);
            ASSERT_OK(expr.getStatus());
            std::unique_ptr<MatchExpression> rewrittenExpr =
                SubplanStage::rewriteToRootedOr(std::move(expr.getValue()));

            std::string findCmdRewritten =
                "{find: 'testns',"
                "filter: {$or:[{a:1,e:1}, {b:1,e:1}]}}";
            std::unique_ptr<CanonicalQuery> cqRewritten = cqFromFindCommand(findCmdRewritten);

            ASSERT(rewrittenExpr->equivalent(cqRewritten->root()));
        }

        // Rewrite (AND (OR a b) e f) => (OR (AND a e f) (AND b e f))
        {
            BSONObj queryObj = fromjson("{$or:[{a:1}, {b:1}], e:1, f:1}");
            StatusWithMatchExpression expr = MatchExpressionParser::parse(queryObj);
            ASSERT_OK(expr.getStatus());
            std::unique_ptr<MatchExpression> rewrittenExpr =
                SubplanStage::rewriteToRootedOr(std::move(expr.getValue()));

            std::string findCmdRewritten =
                "{find: 'testns',"
                "filter: {$or:[{a:1,e:1,f:1}, {b:1,e:1,f:1}]}}";
            std::unique_ptr<CanonicalQuery> cqRewritten = cqFromFindCommand(findCmdRewritten);

            ASSERT(rewrittenExpr->equivalent(cqRewritten->root()));
        }

        // Rewrite (AND (OR (AND a b) (AND c d) e f) => (OR (AND a b e f) (AND c d e f))
        {
            BSONObj queryObj = fromjson("{$or:[{a:1,b:1}, {c:1,d:1}], e:1,f:1}");
            StatusWithMatchExpression expr = MatchExpressionParser::parse(queryObj);
            ASSERT_OK(expr.getStatus());
            std::unique_ptr<MatchExpression> rewrittenExpr =
                SubplanStage::rewriteToRootedOr(std::move(expr.getValue()));

            std::string findCmdRewritten =
                "{find: 'testns',"
                "filter: {$or:[{a:1,b:1,e:1,f:1},"
                "{c:1,d:1,e:1,f:1}]}}";
            std::unique_ptr<CanonicalQuery> cqRewritten = cqFromFindCommand(findCmdRewritten);

            ASSERT(rewrittenExpr->equivalent(cqRewritten->root()));
        }
    }
};

/**
 * Test the subplan stage's ability to answer a contained $or query.
 */
class QueryStageSubplanPlanContainedOr : public QueryStageSubplanBase {
public:
    void run() {
        OldClientWriteContext ctx(&_txn, nss.ns());
        addIndex(BSON("b" << 1 << "a" << 1));
        addIndex(BSON("c" << 1 << "a" << 1));

        BSONObj query = fromjson("{a: 1, $or: [{b: 2}, {c: 3}]}");

        // Two of these documents match.
        insert(BSON("_id" << 1 << "a" << 1 << "b" << 2));
        insert(BSON("_id" << 2 << "a" << 2 << "b" << 2));
        insert(BSON("_id" << 3 << "a" << 1 << "c" << 3));
        insert(BSON("_id" << 4 << "a" << 1 << "c" << 4));

        auto cq = unittest::assertGet(CanonicalQuery::canonicalize(nss, query));

        Collection* collection = ctx.getCollection();

        // Get planner params.
        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_txn, collection, cq.get(), &plannerParams);

        WorkingSet ws;
        std::unique_ptr<SubplanStage> subplan(
            new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        // Plan selection should succeed due to falling back on regular planning.
        PlanYieldPolicy yieldPolicy(nullptr, PlanExecutor::YIELD_MANUAL);
        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));

        // Work the stage until it produces all results.
        size_t numResults = 0;
        PlanStage::StageState stageState = PlanStage::NEED_TIME;
        while (stageState != PlanStage::IS_EOF) {
            WorkingSetID id = WorkingSet::INVALID_ID;
            stageState = subplan->work(&id);
            ASSERT_NE(stageState, PlanStage::DEAD);
            ASSERT_NE(stageState, PlanStage::FAILURE);

            if (stageState == PlanStage::ADVANCED) {
                ++numResults;
                WorkingSetMember* member = ws.get(id);
                ASSERT(member->hasObj());
                ASSERT(member->obj.value() == BSON("_id" << 1 << "a" << 1 << "b" << 2) ||
                       member->obj.value() == BSON("_id" << 3 << "a" << 1 << "c" << 3));
            }
        }

        ASSERT_EQ(numResults, 2U);
    }
};

/**
 * Test the subplan stage's ability to answer a rooted $or query with a $ne and a sort.
 *
 * Regression test for SERVER-19388.
 */
class QueryStageSubplanPlanRootedOrNE : public QueryStageSubplanBase {
public:
    void run() {
        OldClientWriteContext ctx(&_txn, nss.ns());
        addIndex(BSON("a" << 1 << "b" << 1));
        addIndex(BSON("a" << 1 << "c" << 1));

        // Every doc matches.
        insert(BSON("_id" << 1 << "a" << 1));
        insert(BSON("_id" << 2 << "a" << 2));
        insert(BSON("_id" << 3 << "a" << 3));
        insert(BSON("_id" << 4));

        BSONObj query = fromjson("{$or: [{a: 1}, {a: {$ne:1}}]}");
        BSONObj sort = BSON("d" << 1);
        BSONObj projection;
        auto cq = unittest::assertGet(CanonicalQuery::canonicalize(nss, query, sort, projection));

        Collection* collection = ctx.getCollection();

        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_txn, collection, cq.get(), &plannerParams);

        WorkingSet ws;
        std::unique_ptr<SubplanStage> subplan(
            new SubplanStage(&_txn, collection, &ws, plannerParams, cq.get()));

        PlanYieldPolicy yieldPolicy(nullptr, PlanExecutor::YIELD_MANUAL);
        ASSERT_OK(subplan->pickBestPlan(&yieldPolicy));

        size_t numResults = 0;
        PlanStage::StageState stageState = PlanStage::NEED_TIME;
        while (stageState != PlanStage::IS_EOF) {
            WorkingSetID id = WorkingSet::INVALID_ID;
            stageState = subplan->work(&id);
            ASSERT_NE(stageState, PlanStage::DEAD);
            ASSERT_NE(stageState, PlanStage::FAILURE);
            if (stageState == PlanStage::ADVANCED) {
                ++numResults;
            }
        }

        ASSERT_EQ(numResults, 4U);
    }
};

/**
 * Test suite named "query_stage_subplan". setupTests() adds each of the subplan stage test
 * classes listed below to the suite.
 */
class All : public Suite {
public:
    All() : Suite("query_stage_subplan") {}

    // Adds one entry per test class.
    // NOTE(review): presumably the tests run in the order they are added -- confirm against
    // the Suite implementation before relying on it.
    void setupTests() {
        add<QueryStageSubplanGeo2dOr>();
        add<QueryStageSubplanPlanFromCache>();
        add<QueryStageSubplanDontCacheZeroResults>();
        add<QueryStageSubplanDontCacheTies>();
        add<QueryStageSubplanCanUseSubplanning>();
        add<QueryStageSubplanRewriteToRootedOr>();
        add<QueryStageSubplanPlanContainedOr>();
        add<QueryStageSubplanPlanRootedOrNE>();
    }
};

// Namespace-scope instance of the suite.
// NOTE(review): presumably constructing it is what makes the suite known to the test runner --
// confirm against SuiteInstance.
SuiteInstance<All> all;

}  // namespace QueryStageSubplan
// Example #29
// 0
 /**
  * Inserts 'doc' into the test namespace 'nss' via '_client'.
  */
 void insert(const BSONObj& doc) {
     const auto& targetNs = nss.ns();
     _client.insert(targetNs, doc);
 }
    void run() {
        // Data is just a single {_id: 1, a: 1, b: 1} document.
        insert(BSON("_id" << 1 << "a" << 1 << "b" << 1));

        // Indices on 'a' and 'b'.
        addIndex(BSON("a" << 1));
        addIndex(BSON("b" << 1));

        AutoGetCollectionForRead ctx(&_txn, nss.ns());
        Collection* collection = ctx.getCollection();

        // Query for both 'a' and 'b' and sort on 'b'.
        auto statusWithCQ = CanonicalQuery::canonicalize(txn(),
                                                         nss,
                                                         BSON("a" << 1 << "b" << 1),  // query
                                                         BSON("b" << 1),              // sort
                                                         BSONObj(),                   // proj
                                                         ExtensionsCallbackDisallowExtensions());
        verify(statusWithCQ.isOK());
        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        ASSERT(NULL != cq.get());

        // Force index intersection.
        bool forceIxisectOldValue = internalQueryForceIntersectionPlans;
        internalQueryForceIntersectionPlans = true;

        // Get planner params.
        QueryPlannerParams plannerParams;
        fillOutPlannerParams(&_txn, collection, cq.get(), &plannerParams);
        // Turn this off otherwise it pops up in some plans.
        plannerParams.options &= ~QueryPlannerParams::KEEP_MUTATIONS;

        // Plan.
        vector<QuerySolution*> solutions;
        Status status = QueryPlanner::plan(*cq, plannerParams, &solutions);
        ASSERT(status.isOK());

        // We expect a plan using index {a: 1} and plan using index {b: 1} and
        // an index intersection plan.
        ASSERT_EQUALS(solutions.size(), 3U);

        // Fill out the MultiPlanStage.
        unique_ptr<MultiPlanStage> mps(new MultiPlanStage(&_txn, collection, cq.get()));
        unique_ptr<WorkingSet> ws(new WorkingSet());
        // Put each solution from the planner into the MPR.
        for (size_t i = 0; i < solutions.size(); ++i) {
            PlanStage* root;
            ASSERT(StageBuilder::build(&_txn, collection, *cq, *solutions[i], ws.get(), &root));
            // Takes ownership of 'solutions[i]' and 'root'.
            mps->addPlan(solutions[i], root, ws.get());
        }

        // This sets a backup plan.
        PlanYieldPolicy yieldPolicy(PlanExecutor::YIELD_MANUAL, clockSource.get());
        mps->pickBestPlan(&yieldPolicy);
        ASSERT(mps->bestPlanChosen());
        ASSERT(mps->hasBackupPlan());

        // We should have picked the index intersection plan due to forcing ixisect.
        QuerySolution* soln = mps->bestSolution();
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{sort: {pattern: {b: 1}, limit: 0, node: {sortKeyGen: {node:"
            "{fetch: {node: {andSorted: {nodes: ["
            "{ixscan: {filter: null, pattern: {a:1}}},"
            "{ixscan: {filter: null, pattern: {b:1}}}]}}}}}}}}",
            soln->root.get()));

        // Get the resulting document.
        PlanStage::StageState state = PlanStage::NEED_TIME;
        WorkingSetID wsid;
        while (state != PlanStage::ADVANCED) {
            state = mps->work(&wsid);
        }
        WorkingSetMember* member = ws->get(wsid);

        // Check the document returned by the query.
        ASSERT(member->hasObj());
        BSONObj expectedDoc = BSON("_id" << 1 << "a" << 1 << "b" << 1);
        ASSERT(expectedDoc.woCompare(member->obj.value()) == 0);

        // The blocking plan became unblocked, so we should no longer have a backup plan,
        // and the winning plan should still be the index intersection one.
        ASSERT(!mps->hasBackupPlan());
        soln = mps->bestSolution();
        ASSERT(QueryPlannerTestLib::solutionMatches(
            "{sort: {pattern: {b: 1}, limit: 0, node: {sortKeyGen: {node:"
            "{fetch: {node: {andSorted: {nodes: ["
            "{ixscan: {filter: null, pattern: {a:1}}},"
            "{ixscan: {filter: null, pattern: {b:1}}}]}}}}}}}}",
            soln->root.get()));

        // Restore index intersection force parameter.
        internalQueryForceIntersectionPlans = forceIxisectOldValue;
    }