Example #1
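/* Exercises the qlog ring buffer with different entry kinds (plain text,
 * hex dump, backtrace, or all three at once) selected by 'type', and
 * resets the buffer after each outer iteration. */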
void test7(int type){
    int i = 0, j = 0;
    char buffer[128] = {0};   /* scratch bytes for the QLOG_HEX cases */
    qlog_init(50);
    qlog_thread_init("test_6_main_thread");
    for (i = 0; i < 10; i++){
        for (j = 0; j< 1500; j++){
            switch (type){
                case 1:
                    QLOG("log entry");
                    break;
                case 2:
                    QLOG_HEX(&buffer, 15);
                    break;
                case 3:
                    QLOG_BT;
                    break;
                case 4:
                    QLOG("log entry sdlkfjsdklfjsdlkfjsdklfj");
                    QLOG_HEX(&buffer, 98);
                    QLOG_BT;
                    break;
                default:
                    break;
            }
        }
        /*qlog_display_print_buffer(stdout);*/
        qlog_reset();
    }
    qlog_cleanup();
}
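For reference, a minimal sketch of the init/log/print/cleanup sequence these tests follow; the header name and the buffer size are assumptions, while the calls themselves are the ones used in the examples above:

#include <stdio.h>
#include "qlog.h"   /* assumed header exposing the qlog_* functions and QLOG* macros */

int main(void){
    qlog_init(16);                       /* ring buffer of 16 entries (size is illustrative) */
    qlog_thread_init("main thread");     /* register a readable name for this thread */

    QLOG("started");                     /* plain text entry */
    QLOG_HEX("abc", 3);                  /* hex dump of 3 bytes */
    QLOG_BT;                             /* backtrace entry */

    qlog_display_print_buffer(stdout);   /* print everything collected so far */
    qlog_cleanup();
    return 0;
}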
Example #2
    Status SubplanRunner::planSubqueries() {
        MatchExpression* theOr = _query->root();

        for (size_t i = 0; i < _plannerParams.indices.size(); ++i) {
            const IndexEntry& ie = _plannerParams.indices[i];
            _indexMap[ie.keyPattern] = i;
            QLOG() << "Subplanner: index " << i << " is " << ie.toString() << endl;
        }

        const WhereCallbackReal whereCallback(_collection->ns().db());

        for (size_t i = 0; i < theOr->numChildren(); ++i) {
            // Turn the i-th child into its own query.
            MatchExpression* orChild = theOr->getChild(i);
            CanonicalQuery* orChildCQ;
            Status childCQStatus = CanonicalQuery::canonicalize(*_query,
                                                                orChild,
                                                                &orChildCQ,
                                                                whereCallback);
            if (!childCQStatus.isOK()) {
                mongoutils::str::stream ss;
                ss << "Subplanner: Can't canonicalize subchild " << orChild->toString()
                   << " " << childCQStatus.reason();
                return Status(ErrorCodes::BadValue, ss);
            }

            // Make sure it gets cleaned up.
            auto_ptr<CanonicalQuery> safeOrChildCQ(orChildCQ);

            // Plan the i-th child.
            vector<QuerySolution*> solutions;

            // We don't set NO_TABLE_SCAN because peeking at the cache data will keep us from 
            // considering any plan that's a collscan.
            QLOG() << "Subplanner: planning child " << i << " of " << theOr->numChildren();
            Status status = QueryPlanner::plan(*safeOrChildCQ, _plannerParams, &solutions);

            if (!status.isOK()) {
                mongoutils::str::stream ss;
                ss << "Subplanner: Can't plan for subchild " << orChildCQ->toString()
                   << " " << status.reason();
                return Status(ErrorCodes::BadValue, ss);
            }
            QLOG() << "Subplanner: got " << solutions.size() << " solutions";

            if (0 == solutions.size()) {
                // If one child doesn't have an indexed solution, bail out.
                mongoutils::str::stream ss;
                ss << "Subplanner: No solutions for subchild " << orChildCQ->toString();
                return Status(ErrorCodes::BadValue, ss);
            }

            // Hang onto the canonicalized subqueries and the corresponding query solutions
            // so that they can be used in subplan running later on.
            _cqs.push(safeOrChildCQ.release());
            _solutions.push(solutions);
        }

        return Status::OK();
    }
Example #3
    PlanStage::StageState MultiPlanStage::work(WorkingSetID* out) {
        // Adds the amount of time taken by work() to executionTimeMillis.
        ScopedTimer timer(&_commonStats.executionTimeMillis);

        if (_failure) {
            *out = _statusMemberId;
            return PlanStage::FAILURE;
        }

        CandidatePlan& bestPlan = _candidates[_bestPlanIdx];

        // Look for an already produced result that provides the data the caller wants.
        if (!bestPlan.results.empty()) {
            *out = bestPlan.results.front();
            bestPlan.results.pop_front();
            _commonStats.advanced++;
            return PlanStage::ADVANCED;
        }

        // best plan had no (or has no more) cached results

        StageState state = bestPlan.root->work(out);

        if (PlanStage::FAILURE == state && hasBackupPlan()) {
            QLOG() << "Best plan errored out switching to backup\n";
            // Uncache the bad solution if we fall back
            // on the backup solution.
            //
            // XXX: Instead of uncaching we should find a way for the
            // cached plan runner to fall back on a different solution
            // if the best solution fails. Alternatively we could try to
            // defer cache insertion to be after the first produced result.

            _collection->infoCache()->getPlanCache()->remove(*_query);

            _bestPlanIdx = _backupPlanIdx;
            _backupPlanIdx = kNoSuchPlan;

            return _candidates[_bestPlanIdx].root->work(out);
        }

        if (hasBackupPlan() && PlanStage::ADVANCED == state) {
            QLOG() << "Best plan had a blocking stage, became unblocked\n";
            _backupPlanIdx = kNoSuchPlan;
        }

        // Increment stats.
        if (PlanStage::ADVANCED == state) {
            _commonStats.advanced++;
        }
        else if (PlanStage::NEED_TIME == state) {
            _commonStats.needTime++;
        }

        return state;
    }
Example #4
CCefWindow::CCefWindow(const QString& url, QCefView* host, QWindow *parent /*= 0*/)
	: QWindow(parent)
	, pQCefViewWidget_(host)
	, hwndCefBrowser_(NULL)
	, pQCefViewHandler(NULL)
{
	CCefManager::getInstance().AddBrowserRefCount();

	setFlags(Qt::FramelessWindowHint);
	
	// Create native window
	create();

	// Set window info
	CefWindowInfo window_info;
	RECT rc = { 0 };
	window_info.SetAsChild((HWND)winId(), rc);

	CefBrowserSettings browserSettings;
	browserSettings.plugins = STATE_DISABLED;	// disable all plugins

	pQCefViewHandler = new QCefViewBrowserHandler(host);
	instance_map_[(HWND)winId()] = pQCefViewWidget_;

	// Create the main browser window.
	if (!CefBrowserHost::CreateBrowser(
		window_info,			// window info
		pQCefViewHandler.get(),	// handler
		url.toStdString(),		// url
		browserSettings,		// settings
		NULL))
	{
		QLOG() << QStringLiteral("Failed to create browser.");
	}
}
Example #5
    void CachedPlanRunner::updateCache() {
        _updatedCache = true;

        if (_killed) {
            return;
        }

        Database* db = cc().database();
        // XXX: We need to check for NULL because this is called upon
        // destruction of the CachedPlanRunner. In some cases, the db
        // or collection could be dropped without kill() being called
        // on the runner (for example, timeout of a ClientCursor holding
        // the runner).
        if (NULL == db) { return; }
        Collection* collection = db->getCollection(_canonicalQuery->ns());
        if (NULL == collection) { return; }
        PlanCache* cache = collection->infoCache()->getPlanCache();

        std::auto_ptr<PlanCacheEntryFeedback> feedback(new PlanCacheEntryFeedback());
        // XXX: what else can we provide here?
        feedback->stats.reset(_exec->getStats());
        feedback->score = PlanRanker::scoreTree(feedback->stats.get());

        Status fbs = cache->feedback(*_canonicalQuery, feedback.release());

        if (!fbs.isOK()) {
            QLOG() << _canonicalQuery->ns() << ": Failed to update cache with feedback: "
                   << fbs.toString() << " - "
                   << "(query: " << _canonicalQuery->getQueryObj()
                   << "; sort: " << _canonicalQuery->getParsed().getSort()
                   << "; projection: " << _canonicalQuery->getParsed().getProj()
                   << ") is no longer in plan cache.";
        }
    }
Example #6
    // static 
    double PlanRanker::scoreTree(const PlanStageStats* stats) {
        // We start all scores at 1.  Our "no plan selected" score is 0 and we want all plans to
        // be greater than that.
        double baseScore = 1;

        // How much did a plan produce?
        // Range: [0, 1]
        double productivity = static_cast<double>(stats->common.advanced)
                            / static_cast<double>(stats->common.works);

        // Does a plan have a sort?
        // bool sort = hasSort(stats);

        // How selective do we think an index is?
        //double selectivity = computeSelectivity(stats);
        //return baseScore + productivity + selectivity;

        //double sortPenalty = sort ? 0.5 : 0;
        //double score = baseScore + productivity - sortPenalty;
        double score = baseScore + productivity;

        QLOG() << "score (" << score << ") = baseScore (" << baseScore << ") + productivity(" << productivity << ")\n";

        return score;
    }
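For instance, with this formula a plan that advanced 40 results over 100 calls to work() scores 1 + 40/100 = 1.4, while one that advanced only 5 scores 1.05, so the more productive plan wins the ranking.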
Example #7
    // static
    Status QueryPlanner::planFromCache(const CanonicalQuery& query,
                                       const QueryPlannerParams& params,
                                       CachedSolution* cachedSoln,
                                       QuerySolution** out) {

        // Create a copy of the expression tree.  We use cachedSoln to annotate this with indices.
        MatchExpression* clone = query.root()->shallowClone();

        // XXX: Use data in cachedSoln to tag 'clone' with the indices used.  The tags use an index
        // ID which is an index into some vector of IndexEntry(s).  How do we maintain this across
        // calls to plan?  Do we want to store in the soln the keypatterns of the indices and just
        // map those to an index into params.indices?  Might be easiest thing to do, and certainly
        // most intelligible for debugging.

        // Use the cached index assignments to build solnRoot.  Takes ownership of clone.
        QuerySolutionNode* solnRoot =
            QueryPlannerAccess::buildIndexedDataAccess(query, clone, false, params.indices);

        // XXX: are the NULL cases an error/when does this happen / can this happen?
        if (NULL != solnRoot) {
            QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(query, params, solnRoot);
            if (NULL != soln) {
                QLOG() << "Planner: adding cached solution:\n" << soln->toString() << endl;
                *out = soln;
            }
        }

        // XXX: if any NULLs return error status?
        return Status::OK();
    }
Example #8
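/* Exercises the extended-event API: registers several callbacks with
 * qlog_ext_register_event(), logs structs through qlog_ext_log() (including
 * one call with a hard-coded event type of 1000), and mixes in plain, hex,
 * backtrace and formatted entries before printing the buffer. */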
void test6(){
    int data[100];
    int i = 0;
    alma a;
    alma2 a2;
    qlog_ext_event_type_t et = 0, et2 = 0;
    a.alma1 = 100;
    a.alma2 = 200;

    a2.a21 = 10923;
    a2.a22 = 987;

    for (i = 0; i < 100; i++) {
        data[i] = 1879*i;
    }
    qlog_init(10);
    qlog_thread_init("main thread");
    QLOG("message1");
    QLOG_BT;
    QLOG("message2");
    QLOG_HEX(data, sizeof(data));
    QLOG("message3");

    et = qlog_ext_register_event(test6_print_cb);
    printf("event type: %d\n", et);
    et = qlog_ext_register_event(test6_print_cb);
    printf("event type: %d\n", et);
    et = qlog_ext_register_event(test6_print_cb);
    printf("event type: %d\n", et);
    et = qlog_ext_register_event(test6_print_cb);
    printf("event type: %d\n", et);
    et = qlog_ext_register_event(test6_print_cb);
    printf("event type: %d\n", et);

    et2 = qlog_ext_register_event(test6_print_cb2);
    printf("event type: %d\n", et2);
    qlog_ext_log(et, &a, sizeof(a), "Hoki alma");
    qlog_ext_log(1000, &a, sizeof(a), "Hoki2 alma");
    qlog_ext_log(et2, &a2, sizeof(a2), "Hoki2222 alma");
    QLOG_HEX(&a, sizeof(a));

    QLOG_BT;
    QLOG_VA("This is a formatted message: %d, %s, %d", 1023, "alma", 12);
    qlog_display_print_buffer(stdout);
    qlog_cleanup();
}
Example #9
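/* Demonstrates entry/leave tracing: QLOG_ENTRY and QLOG_LEAVE bracket the
 * calls to test10_a() and test10_c() before the buffer is printed. */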
void test10(void){
    int a = 0;
    int b = 0;
    qlog_init(25);
    qlog_thread_init("main thread");
    QLOG("test10 has been started");
    QLOG_ENTRY;
    a = test10_a();
    printf("%s, %d\n", __FUNCTION__, a);
    QLOG("test10_a has been returned");
    b = test10_c(86);
    printf("%s, %d\n", __FUNCTION__, b);
    QLOG_LEAVE;
    QLOG("test10 has been finished");
    qlog_display_print_buffer(stdout);
    qlog_cleanup();

}
Example #10
    bool PlanEnumerator::getNext(MatchExpression** tree) {
        if (_done) { return false; }

        // Tag with our first solution.
        tagMemo(_nodeToId[_root]);

        *tree = _root->shallowClone();
        tagForSort(*tree);
        sortUsingTags(*tree);

        _root->resetTag();
        QLOG() << "Enumerator: memo right before moving:\n";
        dumpMemo();
        _done = nextMemo(_nodeToId[_root]);
        QLOG() << "Enumerator: memo right after moving:\n";
        dumpMemo();
        return true;
    }
Example #11
    // static 
    size_t PlanRanker::pickBestPlan(const vector<CandidatePlan>& candidates,
                                    PlanRankingDecision* why) {
        // Each plan will have a stat tree.
        vector<PlanStageStats*> statTrees;

        // Get stat trees from each plan.
        for (size_t i = 0; i < candidates.size(); ++i) {
            statTrees.push_back(candidates[i].root->getStats());
        }

        // Compute score for each tree.  Record the best.
        double maxScore = 0;
        size_t bestChild = numeric_limits<size_t>::max();
        for (size_t i = 0; i < statTrees.size(); ++i) {
            QLOG() << "scoring plan " << i << ":\n" << candidates[i].solution->toString();
            double score = scoreTree(statTrees[i]);
            QLOG() << "score = " << score << endl;
            if (score > maxScore) {
                maxScore = score;
                bestChild = i;
            }
        }

        // Make sure we got something.
        verify(numeric_limits<size_t>::max() != bestChild);

        if (NULL != why) {
            // Record the stats of the winner.
            why->statsOfWinner = statTrees[bestChild];
        }

        // Clean up stats of losers.
        for (size_t i = 0; i < statTrees.size(); ++i) {
            // If why is null we're not saving the bestChild's stats and we can delete it.
            if (i != bestChild || NULL == why) {
                delete statTrees[i];
            }
        }

        return bestChild;
    }
Example #12
    bool MultiPlanRunner::pickBestPlan(size_t* out) {
        static const int timesEachPlanIsWorked = 100;

        // Run each plan some number of times.
        for (int i = 0; i < timesEachPlanIsWorked; ++i) {
            bool moreToDo = workAllPlans();
            if (!moreToDo) { break; }
        }

        if (_failure || _killed) { return false; }

        size_t bestChild = PlanRanker::pickBestPlan(_candidates, NULL);

        // Run the best plan.  Store it.
        _bestPlan.reset(new PlanExecutor(_candidates[bestChild].ws,
                                         _candidates[bestChild].root));
        _bestPlan->setYieldPolicy(_policy);
        _alreadyProduced = _candidates[bestChild].results;
        _bestSolution.reset(_candidates[bestChild].solution);

        QLOG() << "Winning solution:\n" << _bestSolution->toString() << endl;

        // TODO:
        // Store the choice we just made in the cache.
        // QueryPlanCache* cache = PlanCache::get(somenamespace);
        // cache->add(_query, *_candidates[bestChild]->solution, decision->bestPlanStats);
        // delete decision;

        // Clear out the candidate plans, leaving only stats as we're all done w/them.
        for (size_t i = 0; i < _candidates.size(); ++i) {
            if (i == bestChild) { continue; }
            delete _candidates[i].solution;

            // Remember the stats for the candidate plan because we always show it on an
            // explain. (The {verbose:false} in explain() is a client-side trick; we always
            // generate a "verbose" explain.)
            PlanStageStats* stats = _candidates[i].root->getStats();
            if (stats) {
                _candidateStats.push_back(stats);
            }
            delete _candidates[i].root;

            // ws must die after the root.
            delete _candidates[i].ws;
        }

        _candidates.clear();
        if (NULL != out) { *out = bestChild; }
        return true;
    }
Example #13
    Status PlanEnumerator::init() {
        _inOrderCount = 0;
        _done = false;

        QLOG() << "enumerator received root:\n" << _root->toString() << endl;

        // Fill out our memo structure from the tagged _root.
        _done = !prepMemo(_root);

        // Dump the tags.  We replace them with IndexTag instances.
        _root->resetTag();

        return Status::OK();
    }
Example #14
    void PlanEnumerator::tagMemo(size_t id) {
        QLOG() << "Tagging memoID " << id << endl;
        NodeAssignment* assign = _memo[id];
        verify(NULL != assign);

        if (NULL != assign->pred) {
            PredicateAssignment* pa = assign->pred.get();
            verify(NULL == pa->expr->getTag());
            verify(pa->indexToAssign < pa->first.size());
            pa->expr->setTag(new IndexTag(pa->first[pa->indexToAssign]));
        }
        else if (NULL != assign->orAssignment) {
            OrAssignment* oa = assign->orAssignment.get();
            for (size_t i = 0; i < oa->subnodes.size(); ++i) {
                tagMemo(oa->subnodes[i]);
            }
        }
        else if (NULL != assign->newAnd) {
            AndAssignment* aa = assign->newAnd.get();

            if (AndAssignment::MANDATORY == aa->state) {
                verify(aa->counter < aa->mandatory.size());
                const OneIndexAssignment& assign = aa->mandatory[aa->counter];
                for (size_t i = 0; i < assign.preds.size(); ++i) {
                    MatchExpression* pred = assign.preds[i];
                    verify(NULL == pred->getTag());
                    pred->setTag(new IndexTag(assign.index, assign.positions[i]));
                }
            }
            else if (AndAssignment::PRED_CHOICES == aa->state) {
                verify(aa->counter < aa->predChoices.size());
                const OneIndexAssignment& assign = aa->predChoices[aa->counter];
                for (size_t i = 0; i < assign.preds.size(); ++i) {
                    MatchExpression* pred = assign.preds[i];
                    verify(NULL == pred->getTag());
                    pred->setTag(new IndexTag(assign.index, assign.positions[i]));
                }
            }
            else {
                verify(AndAssignment::SUBNODES == aa->state);
                verify(aa->counter < aa->subnodes.size());
                tagMemo(aa->subnodes[aa->counter]);
            }
        }
        else {
            verify(0);
        }
    }
Example #15
void CachedPlanStage::updateCache() {
    _updatedCache = true;

    std::auto_ptr<PlanCacheEntryFeedback> feedback(new PlanCacheEntryFeedback());
    feedback->stats.reset(getStats());
    feedback->score = PlanRanker::scoreTree(feedback->stats.get());

    PlanCache* cache = _collection->infoCache()->getPlanCache();
    Status fbs = cache->feedback(*_canonicalQuery, feedback.release());

    if (!fbs.isOK()) {
        QLOG() << _canonicalQuery->ns() << ": Failed to update cache with feedback: "
               << fbs.toString() << " - "
               << "(query: " << _canonicalQuery->getQueryObj()
               << "; sort: " << _canonicalQuery->getParsed().getSort()
               << "; projection: " << _canonicalQuery->getParsed().getProj()
               << ") is no longer in plan cache.";
    }
}
Example #16
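/* Worker thread body: names the thread with qlog_thread_init(), then logs a
 * text entry, a hex dump of the thread name and a backtrace roughly once per
 * millisecond until test8_run is cleared. */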
void* test8_thr(void* data){
    char* thread_name = (char*) data;
    struct timespec sleep_time;
    struct timespec remaining_time;
    unsigned long counter = 0;
    sleep_time.tv_sec = 0;
    sleep_time.tv_nsec = 1000000;

    qlog_thread_init(thread_name);
    printf("Thread started: %s\n", thread_name);
    while(1){
        if (test8_run == 0) {
            printf("Loop count in thread %s: %lu\n", thread_name, counter);
            break;
        }
        counter++;
        QLOG("thread message");
        QLOG_HEX(thread_name, 10);
        QLOG_BT;
        nanosleep(&sleep_time, &remaining_time);
    }
    return NULL;
}
Example #17
    // static
    QuerySolution* QueryPlannerAnalysis::analyzeDataAccess(const CanonicalQuery& query,
                                                   const QueryPlannerParams& params,
                                                   QuerySolutionNode* solnRoot) {
        auto_ptr<QuerySolution> soln(new QuerySolution());
        soln->filterData = query.getQueryObj();
        verify(soln->filterData.isOwned());
        soln->ns = query.ns();
        soln->indexFilterApplied = params.indexFiltersApplied;

        solnRoot->computeProperties();

        // solnRoot finds all our results.  Let's see what transformations we must perform to the
        // data.

        // If we're answering a query on a sharded system, we need to drop documents that aren't
        // logically part of our shard (XXX GREG elaborate more precisely)
        if (params.options & QueryPlannerParams::INCLUDE_SHARD_FILTER) {
            // XXX TODO: use params.shardKey to do fetch analysis instead of always fetching.
            if (!solnRoot->fetched()) {
                FetchNode* fetch = new FetchNode();
                fetch->children.push_back(solnRoot);
                solnRoot = fetch;
            }
            ShardingFilterNode* sfn = new ShardingFilterNode();
            sfn->children.push_back(solnRoot);
            solnRoot = sfn;
        }

        solnRoot = analyzeSort(query, params, solnRoot, &soln->hasSortStage);
        // This can happen if we need to create a blocking sort stage and we're not allowed to.
        if (NULL == solnRoot) { return NULL; }

        // If we can (and should), add the keep mutations stage.

        // We cannot keep mutated documents if:
        //
        // 1. The query requires an index to evaluate the predicate ($text).  We can't tell whether
        // or not the doc actually satisfies the $text predicate since we can't evaluate a
        // text MatchExpression.
        //
        // 2. The query implies a sort ($geoNear).  It would be rather expensive and hacky to merge
        // the document at the right place.
        //
        // 3. There is an index-provided sort.  Ditto above comment about merging.
        // XXX; do we want some kind of static init for a set of stages we care about & pass that
        // set into hasNode?
        bool cannotKeepFlagged = hasNode(solnRoot, STAGE_TEXT)
                              || hasNode(solnRoot, STAGE_GEO_NEAR_2D)
                              || hasNode(solnRoot, STAGE_GEO_NEAR_2DSPHERE)
                              || (!query.getParsed().getSort().isEmpty() && !soln->hasSortStage);

        // Only these stages can produce flagged results.  A stage has to hold state past one call
        // to work(...) in order to possibly flag a result.
        bool couldProduceFlagged = hasNode(solnRoot, STAGE_GEO_2D)
                                || hasNode(solnRoot, STAGE_AND_HASH)
                                || hasNode(solnRoot, STAGE_AND_SORTED)
                                || hasNode(solnRoot, STAGE_FETCH);

        bool shouldAddMutation = !cannotKeepFlagged && couldProduceFlagged;

        if (shouldAddMutation && (params.options & QueryPlannerParams::KEEP_MUTATIONS)) {
            KeepMutationsNode* keep = new KeepMutationsNode();

            // We must run the entire expression tree to make sure the document is still valid.
            keep->filter.reset(query.root()->shallowClone());

            if (STAGE_SORT == solnRoot->getType()) {
                // We want to insert the invalidated results before the sort stage, if there is one.
                verify(1 == solnRoot->children.size());
                keep->children.push_back(solnRoot->children[0]);
                solnRoot->children[0] = keep;
            }
            else {
                keep->children.push_back(solnRoot);
                solnRoot = keep;
            }
        }

        // Project the results.
        if (NULL != query.getProj()) {
            QLOG() << "PROJECTION: fetched status: " << solnRoot->fetched() << endl;
            QLOG() << "PROJECTION: Current plan is:\n" << solnRoot->toString() << endl;
            if (query.getProj()->requiresDocument()) {
                QLOG() << "PROJECTION: claims to require doc adding fetch.\n";
                // If the projection requires the entire document, somebody must fetch.
                if (!solnRoot->fetched()) {
                    FetchNode* fetch = new FetchNode();
                    fetch->children.push_back(solnRoot);
                    solnRoot = fetch;
                }
            }
            else {
                QLOG() << "PROJECTION: requires fields\n";
                const vector<string>& fields = query.getProj()->getRequiredFields();
                bool covered = true;
                for (size_t i = 0; i < fields.size(); ++i) {
                    if (!solnRoot->hasField(fields[i])) {
                        QLOG() << "PROJECTION: not covered cuz doesn't have field "
                             << fields[i] << endl;
                        covered = false;
                        break;
                    }
                }
                QLOG() << "PROJECTION: is covered?: = " << covered << endl;
                // If any field is missing from the list of fields the projection wants,
                // a fetch is required.
                if (!covered) {
                    FetchNode* fetch = new FetchNode();
                    fetch->children.push_back(solnRoot);
                    solnRoot = fetch;
                }
            }

            // We now know we have whatever data is required for the projection.
            ProjectionNode* projNode = new ProjectionNode();
            projNode->children.push_back(solnRoot);
            projNode->fullExpression = query.root();
            projNode->projection = query.getParsed().getProj();
            solnRoot = projNode;
        }
        else {
            // If there's no projection, we must fetch, as the user wants the entire doc.
            if (!solnRoot->fetched()) {
                FetchNode* fetch = new FetchNode();
                fetch->children.push_back(solnRoot);
                solnRoot = fetch;
            }
        }

        if (0 != query.getParsed().getSkip()) {
            SkipNode* skip = new SkipNode();
            skip->skip = query.getParsed().getSkip();
            skip->children.push_back(solnRoot);
            solnRoot = skip;
        }

        // When there is both a blocking sort and a limit, the limit will
        // be enforced by the blocking sort.
        // Otherwise, we need to limit the results in the case of a hard limit
        // (ie. limit in raw query is negative)
        if (0 != query.getParsed().getNumToReturn() &&
            !soln->hasSortStage &&
            !query.getParsed().wantMore()) {

            LimitNode* limit = new LimitNode();
            limit->limit = query.getParsed().getNumToReturn();
            limit->children.push_back(solnRoot);
            solnRoot = limit;
        }

        soln->root.reset(solnRoot);
        return soln.release();
    }
Example #18
    // static
    QuerySolutionNode* QueryPlannerAnalysis::analyzeSort(const CanonicalQuery& query,
                                                         const QueryPlannerParams& params,
                                                         QuerySolutionNode* solnRoot,
                                                         bool* blockingSortOut) {
        *blockingSortOut = false;

        const BSONObj& sortObj = query.getParsed().getSort();

        if (sortObj.isEmpty()) {
            return solnRoot;
        }

        // TODO: We could check sortObj for any projections other than :1 and :-1
        // and short-cut some of this.

        // If the sort is $natural, we ignore it, assuming that the caller has detected that and
        // outputted a collscan to satisfy the desired order.
        BSONElement natural = sortObj.getFieldDotted("$natural");
        if (!natural.eoo()) {
            return solnRoot;
        }

        // See if solnRoot gives us the sort.  If so, we're done.
        BSONObjSet sorts = solnRoot->getSort();

        // If the sort we want is in the set of sort orders provided already, bail out.
        if (sorts.end() != sorts.find(sortObj)) {
            return solnRoot;
        }

        // Sort is not provided.  See if we provide the reverse of our sort pattern.
        // If so, we can reverse the scan direction(s).
        BSONObj reverseSort = QueryPlannerCommon::reverseSortObj(sortObj);
        if (sorts.end() != sorts.find(reverseSort)) {
            QueryPlannerCommon::reverseScans(solnRoot);
            QLOG() << "Reversing ixscan to provide sort.  Result: "
                   << solnRoot->toString() << endl;
            return solnRoot;
        }

        // Sort not provided, can't reverse scans to get the sort.  One last trick: We can "explode"
        // index scans over point intervals to an OR of sub-scans in order to pull out a sort.
        // Let's try this.
        if (explodeForSort(query, params, &solnRoot)) {
            return solnRoot;
        }

        // If we're here, we need to add a sort stage.

        // If we're not allowed to put a blocking sort in, bail out.
        if (params.options & QueryPlannerParams::NO_BLOCKING_SORT) {
            delete solnRoot;
            return NULL;
        }

        // Add a fetch stage so we have the full object when we hit the sort stage.  XXX TODO: Can
        // we pull values out of the key and if so in what cases?  (covered_index_sort_3.js)
        if (!solnRoot->fetched()) {
            FetchNode* fetch = new FetchNode();
            fetch->children.push_back(solnRoot);
            solnRoot = fetch;
        }

        // And build the full sort stage.
        SortNode* sort = new SortNode();
        sort->pattern = sortObj;
        sort->query = query.getParsed().getFilter();
        // When setting the limit on the sort, we need to consider both
        // the limit N and skip count M. The sort should return an ordered list
        // N + M items so that the skip stage can discard the first M results.
        if (0 != query.getParsed().getNumToReturn()) {
            sort->limit = query.getParsed().getNumToReturn() +
                          query.getParsed().getSkip();
        }
        else {
            sort->limit = 0;
        }
        sort->children.push_back(solnRoot);
        solnRoot = sort;
        *blockingSortOut = true;

        return solnRoot;
    }
Example #19
    bool QueryPlannerAnalysis::explodeForSort(const CanonicalQuery& query,
                                              const QueryPlannerParams& params,
                                              QuerySolutionNode** solnRoot) {
        vector<QuerySolutionNode*> leafNodes;

        if (!structureOKForExplode(*solnRoot)) {
            return false;
        }

        getLeafNodes(*solnRoot, &leafNodes);

        const BSONObj& desiredSort = query.getParsed().getSort();

        // How many scan leaves will result from our expansion?
        size_t totalNumScans = 0;

        // The value of entry i is how many scans we want to blow up for leafNodes[i].
        // We calculate this in the loop below and might as well reuse it if we blow up
        // that scan.
        vector<size_t> fieldsToExplode;

        // The sort order we're looking for has to possibly be provided by each of the index scans
        // upon explosion.
        for (size_t i = 0; i < leafNodes.size(); ++i) {
            // We can do this because structureOKForExplode is only true if the leaves are index
            // scans.
            IndexScanNode* isn = static_cast<IndexScanNode*>(leafNodes[i]);
            const IndexBounds& bounds = isn->bounds;

            // Not a point interval prefix, can't try to rewrite.
            if (bounds.isSimpleRange) {
                return false;
            }

            // How many scans will we create if we blow up this ixscan?
            size_t numScans = 1;

            // Skip every field that is a union of point intervals and build the resulting sort
            // order from the remaining fields.
            BSONObjIterator kpIt(isn->indexKeyPattern);
            size_t boundsIdx = 0;
            while (kpIt.more()) {
                const OrderedIntervalList& oil = bounds.fields[boundsIdx];
                if (!isUnionOfPoints(oil)) {
                    break;
                }
                numScans *= oil.intervals.size();
                kpIt.next();
                ++boundsIdx;
            }

            // There's no sort order left to gain by exploding.  Just go home.  TODO: verify nothing
            // clever we can do here.
            if (!kpIt.more()) {
                return false;
            }

            // The rest of the fields define the sort order we could obtain by exploding
            // the bounds.
            BSONObjBuilder resultingSortBob;
            while (kpIt.more()) {
                resultingSortBob.append(kpIt.next());
            }

            // See if it's the order we're looking for.
            BSONObj possibleSort = resultingSortBob.obj();
            if (0 != possibleSort.woCompare(desiredSort)) {
                return false;
            }

            // Do some bookkeeping to see how many ixscans we'll create total.
            totalNumScans += numScans;

            // And for this scan how many fields we expand.
            fieldsToExplode.push_back(boundsIdx);
        }

        // Too many ixscans spoil the performance.
        if (totalNumScans > QueryPlannerAnalysis::kMaxScansToExplode) {
            QLOG() << "Could expand ixscans to pull out sort order but resulting scan count"
                   << "(" << totalNumScans << ") is too high.";
            return false;
        }

        // If we're here, we can (probably?  depends on how restrictive the structure check is)
        // get our sort order via ixscan blow-up.
        for (size_t i = 0; i < leafNodes.size(); ++i) {
            IndexScanNode* isn = static_cast<IndexScanNode*>(leafNodes[i]);
            QuerySolutionNode* newNode = explodeScan(isn, desiredSort, fieldsToExplode[i]);
            // Replace 'isn' with 'newNode'
            replaceNodeInTree(solnRoot, isn, newNode);
            // And get rid of the old data access node.
            delete isn;
        }

        return true;
    }
Example #20
    // static
    void QueryPlanner::plan(const CanonicalQuery& query,
                            const QueryPlannerParams& params,
                            vector<QuerySolution*>* out) {
        QLOG() << "=============================\n"
               << "Beginning planning, options = " << optionString(params.options) << endl
               << "Canonical query:\n" << query.toString() << endl
               << "============================="
               << endl;

        // The shortcut formerly known as IDHACK.  See if it's a simple _id query.  If so we might
        // just make an ixscan over the _id index and bypass the rest of planning entirely.
        if (!query.getParsed().isExplain() && !query.getParsed().showDiskLoc()
            && isSimpleIdQuery(query.getParsed().getFilter())
            && !query.getParsed().hasOption(QueryOption_CursorTailable)) {

            // See if we can find an _id index.
            for (size_t i = 0; i < params.indices.size(); ++i) {
                if (isIdIndex(params.indices[i].keyPattern)) {
                    const IndexEntry& index = params.indices[i];
                    QLOG() << "IDHACK using index " << index.toString() << endl;

                    // If so, we make a simple scan to find the doc.
                    IndexScanNode* isn = new IndexScanNode();
                    isn->indexKeyPattern = index.keyPattern;
                    isn->indexIsMultiKey = index.multikey;
                    isn->direction = 1;
                    isn->bounds.isSimpleRange = true;
                    BSONObj key = getKeyFromQuery(index.keyPattern, query.getParsed().getFilter());
                    isn->bounds.startKey = isn->bounds.endKey = key;
                    isn->bounds.endKeyInclusive = true;
                    isn->computeProperties();

                    QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(query, params, isn);

                    if (NULL != soln) {
                        out->push_back(soln);
                        QLOG() << "IDHACK solution is:\n" << (*out)[0]->toString() << endl;
                        // And that's it.
                        return;
                    }
                }
            }
        }

        for (size_t i = 0; i < params.indices.size(); ++i) {
            QLOG() << "idx " << i << " is " << params.indices[i].toString() << endl;
        }

        bool canTableScan = !(params.options & QueryPlannerParams::NO_TABLE_SCAN);

        // If the query requests a tailable cursor, the only solution is a collscan + filter with
        // tailable set on the collscan.  TODO: This is a policy departure.  Previously I think you
        // could ask for a tailable cursor and it just tried to give you one.  Now, we fail if we
        // can't provide one.  Is this what we want?
        if (query.getParsed().hasOption(QueryOption_CursorTailable)) {
            if (!QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR)
                && canTableScan) {
                QuerySolution* soln = buildCollscanSoln(query, true, params);
                if (NULL != soln) {
                    out->push_back(soln);
                }
            }
            return;
        }

        // The hint can be $natural: 1.  If this happens, output a collscan.  It's a weird way of
        // saying "table scan for two, please."
        if (!query.getParsed().getHint().isEmpty()) {
            BSONElement natural = query.getParsed().getHint().getFieldDotted("$natural");
            if (!natural.eoo()) {
                QLOG() << "forcing a table scan due to hinted $natural\n";
                if (canTableScan) {
                    QuerySolution* soln = buildCollscanSoln(query, false, params);
                    if (NULL != soln) {
                        out->push_back(soln);
                    }
                }
                return;
            }
        }

        // NOR and NOT we can't handle well with indices.  If we see them here, they weren't
        // rewritten to remove the negation.  Just output a collscan for those.
        if (QueryPlannerCommon::hasNode(query.root(), MatchExpression::NOT)
            || QueryPlannerCommon::hasNode(query.root(), MatchExpression::NOR)) {

            // If there's a near predicate, we can't handle this.
            // TODO: Should canonicalized query detect this?
            if (QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR)) {
                warning() << "Can't handle NOT/NOR with GEO_NEAR";
                return;
            }
            QLOG() << "NOT/NOR in plan, just outtping a collscan\n";
            if (canTableScan) {
                QuerySolution* soln = buildCollscanSoln(query, false, params);
                if (NULL != soln) {
                    out->push_back(soln);
                }
            }
            return;
        }

        // Figure out what fields we care about.
        unordered_set<string> fields;
        QueryPlannerIXSelect::getFields(query.root(), "", &fields);

        for (unordered_set<string>::const_iterator it = fields.begin(); it != fields.end(); ++it) {
            QLOG() << "predicate over field " << *it << endl;
        }

        // Filter our indices so we only look at indices that are over our predicates.
        vector<IndexEntry> relevantIndices;

        // Hints require us to only consider the hinted index.
        BSONObj hintIndex = query.getParsed().getHint();

        // Snapshot is a form of a hint.  If snapshot is set, try to use _id index to make a real
        // plan.  If that fails, just scan the _id index.
        if (query.getParsed().isSnapshot()) {
            // Find the ID index in indexKeyPatterns.  It's our hint.
            for (size_t i = 0; i < params.indices.size(); ++i) {
                if (isIdIndex(params.indices[i].keyPattern)) {
                    hintIndex = params.indices[i].keyPattern;
                    break;
                }
            }
        }

        size_t hintIndexNumber = numeric_limits<size_t>::max();

        if (!hintIndex.isEmpty()) {
            // Sigh.  If the hint is specified it might be using the index name.
            BSONElement firstHintElt = hintIndex.firstElement();
            if (str::equals("$hint", firstHintElt.fieldName()) && String == firstHintElt.type()) {
                string hintName = firstHintElt.String();
                for (size_t i = 0; i < params.indices.size(); ++i) {
                    if (params.indices[i].name == hintName) {
                        QLOG() << "hint by name specified, restricting indices to "
                             << params.indices[i].keyPattern.toString() << endl;
                        relevantIndices.clear();
                        relevantIndices.push_back(params.indices[i]);
                        hintIndexNumber = i;
                        hintIndex = params.indices[i].keyPattern;
                        break;
                    }
                }
            }
            else {
                for (size_t i = 0; i < params.indices.size(); ++i) {
                    if (0 == params.indices[i].keyPattern.woCompare(hintIndex)) {
                        relevantIndices.clear();
                        relevantIndices.push_back(params.indices[i]);
                        QLOG() << "hint specified, restricting indices to " << hintIndex.toString()
                             << endl;
                        hintIndexNumber = i;
                        break;
                    }
                }
            }

            if (hintIndexNumber == numeric_limits<size_t>::max()) {
                // This is supposed to be an error.
                warning() << "Can't find hint for " << hintIndex.toString();
                return;
            }
        }
        else {
            QLOG() << "Finding relevant indices\n";
            QueryPlannerIXSelect::findRelevantIndices(fields, params.indices, &relevantIndices);
        }

        for (size_t i = 0; i < relevantIndices.size(); ++i) {
            QLOG() << "relevant idx " << i << " is " << relevantIndices[i].toString() << endl;
        }

        // Figure out how useful each index is to each predicate.
        // query.root() is now annotated with RelevantTag(s).
        QueryPlannerIXSelect::rateIndices(query.root(), "", relevantIndices);

        QLOG() << "rated tree" << endl;
        QLOG() << query.root()->toString() << endl;

        // If there is a GEO_NEAR it must have an index it can use directly.
        // XXX: move into data access?
        MatchExpression* gnNode = NULL;
        if (QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR, &gnNode)) {
            // No index for GEO_NEAR?  No query.
            RelevantTag* tag = static_cast<RelevantTag*>(gnNode->getTag());
            if (0 == tag->first.size() && 0 == tag->notFirst.size()) {
                return;
            }

            GeoNearMatchExpression* gnme = static_cast<GeoNearMatchExpression*>(gnNode);

            vector<size_t> newFirst;

            // 2d + GEO_NEAR is annoying.  Because 2d's GEO_NEAR isn't streaming we have to embed
            // the full query tree inside it as a matcher.
            for (size_t i = 0; i < tag->first.size(); ++i) {
                // GEO_NEAR has a non-2d index it can use.  We can deal w/that in normal planning.
                if (!is2DIndex(relevantIndices[tag->first[i]].keyPattern)) {
                    newFirst.push_back(i);
                    continue;
                }

                // If we're here, GEO_NEAR has a 2d index.  We create a 2dgeonear plan with the
                // entire tree as a filter, if possible.

                GeoNear2DNode* solnRoot = new GeoNear2DNode();
                solnRoot->nq = gnme->getData();

                if (MatchExpression::GEO_NEAR != query.root()->matchType()) {
                    // root is an AND, clone and delete the GEO_NEAR child.
                    MatchExpression* filterTree = query.root()->shallowClone();
                    verify(MatchExpression::AND == filterTree->matchType());

                    bool foundChild = false;
                    for (size_t i = 0; i < filterTree->numChildren(); ++i) {
                        if (MatchExpression::GEO_NEAR == filterTree->getChild(i)->matchType()) {
                            foundChild = true;
                            filterTree->getChildVector()->erase(filterTree->getChildVector()->begin() + i);
                            break;
                        }
                    }
                    verify(foundChild);
                    solnRoot->filter.reset(filterTree);
                }

                solnRoot->numWanted = query.getParsed().getNumToReturn();
                if (0 == solnRoot->numWanted) {
                    solnRoot->numWanted = 100;
                }
                solnRoot->indexKeyPattern = relevantIndices[tag->first[i]].keyPattern;

                // Remove the 2d index.  2d can only be the first field, and we know there is
                // only one GEO_NEAR, so we don't care if anyone else was assigned it; it'll
                // only be first for gnNode.
                tag->first.erase(tag->first.begin() + i);

                QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(query, params, solnRoot);

                if (NULL != soln) {
                    out->push_back(soln);
                }
            }

            // Continue planning w/non-2d indices tagged for this pred.
            tag->first.swap(newFirst);

            if (0 == tag->first.size() && 0 == tag->notFirst.size()) {
                return;
            }
        }

        // Likewise, if there is a TEXT it must have an index it can use directly.
        MatchExpression* textNode;
        if (QueryPlannerCommon::hasNode(query.root(), MatchExpression::TEXT, &textNode)) {
            RelevantTag* tag = static_cast<RelevantTag*>(textNode->getTag());
            if (0 == tag->first.size() && 0 == tag->notFirst.size()) {
                return;
            }
        }

        // If we have any relevant indices, we try to create indexed plans.
        if (0 < relevantIndices.size()) {
            // The enumerator spits out trees tagged with IndexTag(s).
            PlanEnumerator isp(query.root(), &relevantIndices);
            isp.init();

            MatchExpression* rawTree;
            while (isp.getNext(&rawTree)) {
                QLOG() << "about to build solntree from tagged tree:\n" << rawTree->toString()
                       << endl;

                // This can fail if enumeration makes a mistake.
                QuerySolutionNode* solnRoot =
                    QueryPlannerAccess::buildIndexedDataAccess(query, rawTree, false, relevantIndices);

                if (NULL == solnRoot) { continue; }

                QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(query, params, solnRoot);
                if (NULL != soln) {
                    QLOG() << "Planner: adding solution:\n" << soln->toString() << endl;
                    out->push_back(soln);
                }
            }
        }

        QLOG() << "Planner: outputted " << out->size() << " indexed solutions.\n";

        // An index was hinted.  If there are any solutions, they use the hinted index.  If not, we
        // scan the entire index to provide results and output that as our plan.  This is the
        // desired behavior when an index is hinted that is not relevant to the query.
        if (!hintIndex.isEmpty() && (0 == out->size())) {
            QuerySolution* soln = buildWholeIXSoln(params.indices[hintIndexNumber], query, params);
            if (NULL != soln) {
                QLOG() << "Planner: outputting soln that uses hinted index as scan." << endl;
                out->push_back(soln);
            }
            return;
        }

        // If a sort order is requested, there may be an index that provides it, even if that
        // index is not over any predicates in the query.
        //
        // XXX XXX: Can we do this even if the index is sparse?  Might we miss things?
        if (!query.getParsed().getSort().isEmpty()
            && !QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR)
            && !QueryPlannerCommon::hasNode(query.root(), MatchExpression::TEXT)) {

            // See if we have a sort provided from an index already.
            bool usingIndexToSort = false;
            for (size_t i = 0; i < out->size(); ++i) {
                QuerySolution* soln = (*out)[i];
                if (!soln->hasSortStage) {
                    usingIndexToSort = true;
                    break;
                }
            }

            if (!usingIndexToSort) {
                for (size_t i = 0; i < params.indices.size(); ++i) {
                    const BSONObj& kp = params.indices[i].keyPattern;
                    if (providesSort(query, kp)) {
                        QLOG() << "Planner: outputting soln that uses index to provide sort."
                               << endl;
                        QuerySolution* soln = buildWholeIXSoln(params.indices[i], query, params);
                        if (NULL != soln) {
                            out->push_back(soln);
                            break;
                        }
                    }
                    if (providesSort(query, QueryPlannerCommon::reverseSortObj(kp))) {
                        QLOG() << "Planner: outputting soln that uses (reverse) index "
                               << "to provide sort." << endl;
                        QuerySolution* soln = buildWholeIXSoln(params.indices[i], query, params, -1);
                        if (NULL != soln) {
                            out->push_back(soln);
                            break;
                        }
                    }
                }
            }
        }

        // TODO: Do we always want to offer a collscan solution?
        // XXX: currently disabling the always-use-a-collscan in order to find more planner bugs.
        if (    !QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR)
             && !QueryPlannerCommon::hasNode(query.root(), MatchExpression::TEXT)
             && ((params.options & QueryPlannerParams::INCLUDE_COLLSCAN) || (0 == out->size() && canTableScan)))
        {
            QuerySolution* collscan = buildCollscanSoln(query, false, params);
            if (NULL != collscan) {
                out->push_back(collscan);
                QLOG() << "Planner: outputting a collscan:\n";
                QLOG() << collscan->toString() << endl;
            }
        }
    }
Example #21
    /**
     * For a given query, get a runner.  The runner could be a SingleSolutionRunner, a
     * CachedPlanRunner, or a MultiPlanRunner, depending on the cache/query solver/etc.
     */
    Status getRunner(Collection* collection,
                     CanonicalQuery* rawCanonicalQuery,
                     Runner** out,
                     size_t plannerOptions) {

        verify(rawCanonicalQuery);
        auto_ptr<CanonicalQuery> canonicalQuery(rawCanonicalQuery);

        // This can happen as we're called by internal clients as well.
        if (NULL == collection) {
            const string& ns = canonicalQuery->ns();
            LOG(2) << "Collection " << ns << " does not exist."
                   << " Using EOF runner: " << canonicalQuery->toStringShort();
            *out = new EOFRunner(canonicalQuery.release(), ns);
            return Status::OK();
        }

        // If we have an _id index we can use the idhack runner.
        if (IDHackRunner::supportsQuery(*canonicalQuery) &&
            collection->getIndexCatalog()->findIdIndex()) {
            LOG(2) << "Using idhack: " << canonicalQuery->toStringShort();
            *out = new IDHackRunner(collection, canonicalQuery.release());
            return Status::OK();
        }

        // Tailable: If the query requests tailable the collection must be capped.
        if (canonicalQuery->getParsed().hasOption(QueryOption_CursorTailable)) {
            if (!collection->isCapped()) {
                return Status(ErrorCodes::BadValue,
                              "error processing query: " + canonicalQuery->toString() +
                              " tailable cursor requested on non capped collection");
            }

            // If a sort is specified it must be equal to expectedSort.
            const BSONObj expectedSort = BSON("$natural" << 1);
            const BSONObj& actualSort = canonicalQuery->getParsed().getSort();
            if (!actualSort.isEmpty() && !(actualSort == expectedSort)) {
                return Status(ErrorCodes::BadValue,
                              "error processing query: " + canonicalQuery->toString() +
                              " invalid sort specified for tailable cursor: "
                              + actualSort.toString());
            }
        }

        // Fill out the planning params.  We use these for both cached solutions and non-cached.
        QueryPlannerParams plannerParams;
        plannerParams.options = plannerOptions;
        fillOutPlannerParams(collection, rawCanonicalQuery, &plannerParams);

        // See if the cache has what we're looking for.
        Status cacheStatus = getRunnerFromCache(canonicalQuery.get(),
                                                collection,
                                                plannerParams,
                                                out);

        // This can be not-OK and we can carry on.  It just means the query wasn't cached.
        if (cacheStatus.isOK()) {
            // We got a cached runner.
            canonicalQuery.release();
            return cacheStatus;
        }

        if (internalQueryPlanOrChildrenIndependently
            && SubplanRunner::canUseSubplanRunner(*canonicalQuery)) {

            QLOG() << "Running query as sub-queries: " << canonicalQuery->toStringShort();
            LOG(2) << "Running query as sub-queries: " << canonicalQuery->toStringShort();

            SubplanRunner* runner;
            Status runnerStatus = SubplanRunner::make(collection, plannerParams,
                                                      canonicalQuery.release(), &runner);
            if (!runnerStatus.isOK()) {
                return runnerStatus;
            }

            *out = runner;
            return Status::OK();
        }

        return getRunnerAlwaysPlan(collection, canonicalQuery.release(), plannerParams, out);
    }
Example #22
    bool MultiPlanRunner::pickBestPlan(size_t* out, BSONObj* objOut) {
        static const int timesEachPlanIsWorked = 100;

        // Run each plan some number of times.
        for (int i = 0; i < timesEachPlanIsWorked; ++i) {
            bool moreToDo = workAllPlans(objOut);
            if (!moreToDo) { break; }
        }

        if (_failure || _killed) { return false; }

        // After picking best plan, ranking will own plan stats from
        // candidate solutions (winner and losers).
        std::auto_ptr<PlanRankingDecision> ranking(new PlanRankingDecision);
        size_t bestChild = PlanRanker::pickBestPlan(_candidates, ranking.get());

        // Copy candidate order. We will need this to sort candidate stats for explain
        // after transferring ownership of 'ranking' to plan cache.
        std::vector<size_t> candidateOrder = ranking->candidateOrder;

        // Run the best plan.  Store it.
        _bestPlan.reset(new PlanExecutor(_candidates[bestChild].ws,
                                         _candidates[bestChild].root));
        _bestPlan->setYieldPolicy(_policy);
        _alreadyProduced = _candidates[bestChild].results;
        _bestSolution.reset(_candidates[bestChild].solution);

        QLOG() << "Winning solution:\n" << _bestSolution->toString() << endl;

        size_t backupChild = bestChild;
        if (_bestSolution->hasBlockingStage && (0 == _alreadyProduced.size())) {
            QLOG() << "Winner has blocking stage, looking for backup plan...\n";
            for (size_t i = 0; i < _candidates.size(); ++i) {
                if (!_candidates[i].solution->hasBlockingStage) {
                    QLOG() << "Candidate " << i << " is backup child\n";
                    backupChild = i;
                    _backupSolution = _candidates[i].solution;
                    _backupAlreadyProduced = _candidates[i].results;
                    _backupPlan = new PlanExecutor(_candidates[i].ws, _candidates[i].root);
                    _backupPlan->setYieldPolicy(_policy);
                    break;
                }
            }
        }

        // Store the choice we just made in the cache. We do not cache the query if:
        //   1) the query is of a type that is not safe to cache, or
        //   2) the winning plan did not produce any results and did not hit EOF;
        //      in that case we have no information suggesting the plan is good.
        const PlanStageStats* bestStats = ranking->stats.vector()[0];
        if (PlanCache::shouldCacheQuery(*_query)
            && (!_alreadyProduced.empty() || bestStats->common.isEOF)) {
            Database* db = cc().database();
            verify(NULL != db);
            Collection* collection = db->getCollection(_query->ns());
            verify(NULL != collection);
            PlanCache* cache = collection->infoCache()->getPlanCache();
            // Create list of candidate solutions for the cache with
            // the best solution at the front.
            std::vector<QuerySolution*> solutions;

            // Generate solutions and ranking decisions sorted by score.
            for (size_t orderingIndex = 0;
                 orderingIndex < candidateOrder.size(); ++orderingIndex) {
                // index into candidates/ranking
                size_t i = candidateOrder[orderingIndex];
                solutions.push_back(_candidates[i].solution);
            }

            // Check the solution cache data. Do not add to the cache if any
            // solution has invalid SolutionCacheData.
            // XXX: One known example is 2D queries.
            bool validSolutions = true;
            for (size_t i = 0; i < solutions.size(); ++i) {
                if (NULL == solutions[i]->cacheData.get()) {
                    QLOG() << "Not caching query because this solution has no cache data: "
                           << solutions[i]->toString();
                    validSolutions = false;
                    break;
                }
            }

            if (validSolutions) {
                cache->add(*_query, solutions, ranking.release());
            }
        }

        // Clear out the candidate plans, leaving only stats, as we're all done with them.
        // Traverse candidate plans in order of score.
        for (size_t orderingIndex = 0;
             orderingIndex < candidateOrder.size(); ++orderingIndex) {
            // index into candidates/ranking
            size_t i = candidateOrder[orderingIndex];

            if (i == bestChild) { continue; }
            if (i == backupChild) { continue; }

            delete _candidates[i].solution;

            // Remember the stats for the candidate plan because we always show them in an
            // explain. (The {verbose:false} in explain() is a client-side trick; we always
            // generate a "verbose" explain.)
            PlanStageStats* stats = _candidates[i].root->getStats();
            if (stats) {
                _candidateStats.push_back(stats);
            }
            delete _candidates[i].root;

            // ws must die after the root.
            delete _candidates[i].ws;
        }

        _candidates.clear();
        if (NULL != out) { *out = bestChild; }
        return true;
    }
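pickBestPlan() above works every candidate a fixed number of times and then asks the ranker for a winner. A minimal sketch of that trial-and-rank loop, with a hypothetical Candidate type standing in for the real candidate plans and a simple score in place of PlanRanker:

#include <cstddef>
#include <vector>

struct Candidate { double score = 0.0; bool done = false; };

std::size_t pickBestSketch(std::vector<Candidate>& candidates) {
    static const int timesEachPlanIsWorked = 100;
    for (int round = 0; round < timesEachPlanIsWorked; ++round) {
        bool moreToDo = false;
        for (Candidate& c : candidates) {
            if (!c.done) {
                c.score += 1.0;      // stand-in for one unit of work on this plan
                moreToDo = true;
            }
        }
        if (!moreToDo) break;        // every plan finished early
    }
    std::size_t best = 0;            // highest score wins
    for (std::size_t i = 1; i < candidates.size(); ++i)
        if (candidates[i].score > candidates[best].score) best = i;
    return best;
}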
Example #23
0
    Runner::RunnerState MultiPlanRunner::getNext(BSONObj* objOut, DiskLoc* dlOut) {
        if (_killed) { return Runner::RUNNER_DEAD; }
        if (_failure) { return Runner::RUNNER_ERROR; }

        // If we haven't picked the best plan yet...
        if (NULL == _bestPlan) {
            if (!pickBestPlan(NULL, objOut)) {
                verify(_failure || _killed);
                if (_killed) { return Runner::RUNNER_DEAD; }
                if (_failure) { return Runner::RUNNER_ERROR; }
            }
        }

        // Look for an already produced result that provides the data the caller wants.
        while (!_alreadyProduced.empty()) {
            WorkingSetID id = _alreadyProduced.front();
            _alreadyProduced.pop_front();

            WorkingSetMember* member = _bestPlan->getWorkingSet()->get(id);

            // Note that this copies code from PlanExecutor.
            if (NULL != objOut) {
                if (WorkingSetMember::LOC_AND_IDX == member->state) {
                    if (1 != member->keyData.size()) {
                        _bestPlan->getWorkingSet()->free(id);
                        // If the caller needs the key data and the WSM doesn't have it, drop the
                        // result and carry on.
                        continue;
                    }
                    *objOut = member->keyData[0].keyData;
                }
                else if (member->hasObj()) {
                    *objOut = member->obj;
                }
                else {
                    // If the caller needs an object and the WSM doesn't have it, drop and
                    // try the next result.
                    _bestPlan->getWorkingSet()->free(id);
                    continue;
                }
            }

            if (NULL != dlOut) {
                if (member->hasLoc()) {
                    *dlOut = member->loc;
                }
                else {
                    // If the caller needs a DiskLoc and the WSM doesn't have it, drop and carry on.
                    _bestPlan->getWorkingSet()->free(id);
                    continue;
                }
            }

            // If we're here, the caller has all the data needed and we've set the out
            // parameters.  Remove the result from the WorkingSet.
            _bestPlan->getWorkingSet()->free(id);
            return Runner::RUNNER_ADVANCED;
        }

        RunnerState state = _bestPlan->getNext(objOut, dlOut);

        if (Runner::RUNNER_ERROR == state && (NULL != _backupSolution)) {
            QLOG() << "Best plan errored out switching to backup\n";
            // Uncache the bad solution if we fall back
            // on the backup solution.
            //
            // XXX: Instead of uncaching we should find a way for the
            // cached plan runner to fall back on a different solution
            // if the best solution fails. Alternatively we could try to
            // defer cache insertion to be after the first produced result.
            Database* db = cc().database();
            verify(NULL != db);
            Collection* collection = db->getCollection(_query->ns());
            verify(NULL != collection);
            PlanCache* cache = collection->infoCache()->getPlanCache();
            cache->remove(*_query);

            _bestPlan.reset(_backupPlan);
            _backupPlan = NULL;
            _bestSolution.reset(_backupSolution);
            _backupSolution = NULL;
            _alreadyProduced = _backupAlreadyProduced;
            return getNext(objOut, dlOut);
        }

        if (NULL != _backupSolution && Runner::RUNNER_ADVANCED == state) {
            QLOG() << "Best plan had a blocking sort, became unblocked, deleting backup plan\n";
            delete _backupSolution;
            delete _backupPlan;
            _backupSolution = NULL;
            _backupPlan = NULL;
            // TODO: free from WS?
            _backupAlreadyProduced.clear();
        }

        return state;
    }
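getNext() above first drains results that were buffered while the plans competed, and only then advances the winning plan. A minimal sketch of that ordering, with std::string standing in for a result document and a callable standing in for the winning plan:

#include <deque>
#include <functional>
#include <optional>
#include <string>

std::optional<std::string> nextResult(std::deque<std::string>& alreadyProduced,
                                      const std::function<std::optional<std::string>()>& winningPlan) {
    if (!alreadyProduced.empty()) {
        std::string out = alreadyProduced.front();  // buffered during plan ranking
        alreadyProduced.pop_front();
        return out;
    }
    return winningPlan();                           // otherwise advance the best plan
}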
Example #24
0
    Status getExecutor(OperationContext* txn,
                      Collection* collection,
                      CanonicalQuery* rawCanonicalQuery,
                      PlanExecutor** out,
                      size_t plannerOptions) {
        invariant(rawCanonicalQuery);
        auto_ptr<CanonicalQuery> canonicalQuery(rawCanonicalQuery);

        // This can happen as we're called by internal clients as well.
        if (NULL == collection) {
            const string& ns = canonicalQuery->ns();
            LOG(2) << "Collection " << ns << " does not exist."
                   << " Using EOF runner: " << canonicalQuery->toStringShort();
            EOFStage* eofStage = new EOFStage();
            WorkingSet* ws = new WorkingSet();
            *out = new PlanExecutor(ws, eofStage, canonicalQuery.release(), collection);
            return Status::OK();
        }

        // Fill out the planning params.  We use these for both cached solutions and non-cached.
        QueryPlannerParams plannerParams;
        plannerParams.options = plannerOptions;
        fillOutPlannerParams(collection, canonicalQuery.get(), &plannerParams);

        // If we have an _id index we can use the idhack runner.
        if (IDHackStage::supportsQuery(*canonicalQuery.get()) &&
            collection->getIndexCatalog()->findIdIndex()) {
            return getExecutorIDHack(txn, collection, canonicalQuery.release(), plannerParams, out);
        }

        // Tailable: If the query requests tailable the collection must be capped.
        if (canonicalQuery->getParsed().hasOption(QueryOption_CursorTailable)) {
            if (!collection->isCapped()) {
                return Status(ErrorCodes::BadValue,
                              "error processing query: " + canonicalQuery->toString() +
                              " tailable cursor requested on non capped collection");
            }

            // If a sort is specified it must be equal to expectedSort.
            const BSONObj expectedSort = BSON("$natural" << 1);
            const BSONObj& actualSort = canonicalQuery->getParsed().getSort();
            if (!actualSort.isEmpty() && !(actualSort == expectedSort)) {
                return Status(ErrorCodes::BadValue,
                              "error processing query: " + canonicalQuery->toString() +
                              " invalid sort specified for tailable cursor: "
                              + actualSort.toString());
            }
        }

        // Try to look up a cached solution for the query.

        CachedSolution* rawCS;
        if (PlanCache::shouldCacheQuery(*canonicalQuery) &&
            collection->infoCache()->getPlanCache()->get(*canonicalQuery.get(), &rawCS).isOK()) {
            // We have a CachedSolution.  Have the planner turn it into a QuerySolution.
            boost::scoped_ptr<CachedSolution> cs(rawCS);
            QuerySolution *qs, *backupQs;
            QuerySolution*& chosenSolution=qs; // either qs or backupQs
            Status status = QueryPlanner::planFromCache(*canonicalQuery.get(), plannerParams, *cs,
                                                        &qs, &backupQs);

            if (status.isOK()) {
                // the working set will be shared by the root and backupRoot plans
                // and owned by the containing single-solution-runner
                //
                WorkingSet* sharedWs = new WorkingSet();

                PlanStage *root, *backupRoot=NULL;
                verify(StageBuilder::build(txn, collection, *qs, sharedWs, &root));
                if ((plannerParams.options & QueryPlannerParams::PRIVATE_IS_COUNT)
                    && turnIxscanIntoCount(qs)) {
                    LOG(2) << "Using fast count: " << canonicalQuery->toStringShort()
                           << ", planSummary: " << getPlanSummary(*qs);

                    if (NULL != backupQs) {
                        delete backupQs;
                    }
                }
                else if (NULL != backupQs) {
                    verify(StageBuilder::build(txn, collection, *backupQs, sharedWs, &backupRoot));
                }

                // add a CachedPlanStage on top of the previous root
                root = new CachedPlanStage(collection, canonicalQuery.get(), root, backupRoot);

                *out = new PlanExecutor(sharedWs, root, chosenSolution, canonicalQuery.release(),
                                        collection);
                return Status::OK();
            }
        }

        if (internalQueryPlanOrChildrenIndependently
            && SubplanStage::canUseSubplanning(*canonicalQuery)) {

            QLOG() << "Running query as sub-queries: " << canonicalQuery->toStringShort();

            auto_ptr<WorkingSet> ws(new WorkingSet());

            SubplanStage* subplan;
            Status subplanStatus = SubplanStage::make(txn, collection, ws.get(), plannerParams,
                                                      canonicalQuery.get(), &subplan);
            if (subplanStatus.isOK()) {
                LOG(2) << "Running query as sub-queries: " << canonicalQuery->toStringShort();
                *out = new PlanExecutor(ws.release(), subplan, canonicalQuery.release(),
                                        collection);
                return Status::OK();
            }
            else {
                QLOG() << "Subplanner: " << subplanStatus.reason();
            }
        }

        return getExecutorAlwaysPlan(txn, collection, canonicalQuery.release(), plannerParams, out);
    }
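Both runner factories above validate tailable cursors the same way: the collection must be capped, and any sort must be {$natural: 1}. A minimal sketch of that check with a hypothetical TailableRequest struct; the real code compares BSON objects rather than strings:

#include <string>

struct TailableRequest {
    bool tailable;
    bool collectionIsCapped;
    std::string sort;   // empty means no sort was specified
};

bool validateTailable(const TailableRequest& req, std::string* errmsg) {
    if (!req.tailable) return true;
    if (!req.collectionIsCapped) {
        *errmsg = "tailable cursor requested on non capped collection";
        return false;
    }
    if (!req.sort.empty() && req.sort != "{ $natural: 1 }") {
        *errmsg = "invalid sort specified for tailable cursor: " + req.sort;
        return false;
    }
    return true;
}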
Example #25
0
    /**
     * Called by db/instance.cpp.  This is the getMore entry point.
     *
     * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls 
     *        when this method returns an empty result, incrementing pass on each call.  
     *        Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
     */
    QueryResult::View newGetMore(OperationContext* txn,
                            const char* ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust,
                            bool* isCursorAuthorized,
                            bool fromDBDirectClient) {

        // For testing, we may want to fail if we receive a getmore.
        if (MONGO_FAIL_POINT(failReceivedGetmore)) {
            invariant(0);
        }

        exhaust = false;

        // This is a read lock.
        const NamespaceString nss(ns);
        scoped_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, nss));
        Collection* collection = ctx->getCollection();
        uassert( 17356, "collection dropped between getMore calls", collection );

        QLOG() << "Running getMore, cursorid: " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                true);
        uassertStatusOK(status);

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(collection, cursorid);
        ClientCursor* cc = ccPin.c();

        // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
        // used to create the execution machinery inside the cursor with our OperationContext.
        // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
        // that further calls to getMore won't fail, and that the provided OperationContext
        // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
        //
        // This must be destroyed before the ClientCursor is destroyed.
        std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        const int InitialBufSize =
            512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(InitialBufSize);
        bb.skip(sizeof(QueryResult::Value));

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // Restore the RecoveryUnit if we need to.
            if (fromDBDirectClient) {
                if (cc->hasRecoveryUnit())
                    invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
            }
            else {
                if (!cc->hasRecoveryUnit()) {
                    // Start using a new RecoveryUnit
                    cc->setOwnedRecoveryUnit(
                        getGlobalEnvironment()->getGlobalStorageEngine()->newRecoveryUnit(txn));

                }
                // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
                ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
            }

            // Reset timeout timer on the cursor since the cursor is still in use.
            cc->setIdleTime(0);

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (0 == pass) { 
                cc->updateSlaveLocation(txn, curop); 
            }

            if (cc->isAggCursor) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            PlanExecutor* exec = cc->getExecutor();
            const int queryOptions = cc->queryOptions();

            // Get results out of the executor.
            exec->restoreState(txn);

            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            // We save the client cursor when there might be more results, and hence we may receive
            // another getmore. If we receive an EOF or an error, or 'exec' is dead, then we know
            // that we will not be producing more results. We indicate that the cursor is closed by
            // sending a cursorId of 0 back to the client.
            //
            // On the other hand, if we retrieve all results necessary for this batch, then
            // 'saveClientCursor' is true and we send a valid cursorId back to the client. In
            // this case, there may or may not actually be more results (for example, the next call
            // to getNext(...) might just return EOF).
            bool saveClientCursor = false;

            if (PlanExecutor::DEAD == state || PlanExecutor::EXEC_ERROR == state) {
                // Propagate this error to caller.
                if (PlanExecutor::EXEC_ERROR == state) {
                    scoped_ptr<PlanStageStats> stats(exec->getStats());
                    error() << "Plan executor error, stats: "
                            << Explain::statsToBSON(*stats);
                    uasserted(17406, "getMore executor error: " +
                              WorkingSetCommon::toStatusString(obj));
                }

                // If we're dead there's no way to get more results.
                saveClientCursor = false;

                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                //
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }
            else if (PlanExecutor::IS_EOF == state) {
                // EOF is also end of the line unless it's tailable.
                saveClientCursor = queryOptions & QueryOption_CursorTailable;
            }
            else {
                verify(PlanExecutor::ADVANCED == state);
                saveClientCursor = true;
            }

            if (!saveClientCursor) {
                ruSwapper.reset();
                ccPin.deleteUnderlying();
                // cc is now invalid, as is the executor
                cursorid = 0;
                cc = NULL;
                QLOG() << "getMore NOT saving client cursor, ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                exec->saveState();
                QLOG() << "getMore saving client cursor ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;

                if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                    if (!fromDBDirectClient) {
                        // Don't stash the RU. Get a new one on the next getMore.
                        ruSwapper.reset();
                        delete cc->releaseOwnedRecoveryUnit();
                    }

                    if ((queryOptions & QueryOption_AwaitData)
                            && (numResults == 0)
                            && (pass < 1000)) {
                        // Bubble up to the AwaitData handling code in receivedGetMore which will
                        // try again.
                        return NULL;
                    }
                }

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult::View qr = bb.buf();
        qr.msgdata().setLen(bb.len());
        qr.msgdata().setOperation(opReply);
        qr.setResultFlags(resultFlags);
        qr.setCursorId(cursorid);
        qr.setStartingFrom(startingResult);
        qr.setNReturned(numResults);
        bb.decouple();
        QLOG() << "getMore returned " << numResults << " results\n";
        return qr;
    }
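The batching loop above stops on whichever limit is hit first: the requested number of results or the byte budget for one reply. A minimal sketch of that loop, with std::string documents and hypothetical limits standing in for BufBuilder and MaxBytesToReturnToClientAtOnce:

#include <cstddef>
#include <string>
#include <vector>

int fillBatch(const std::vector<std::string>& docs, std::string& buffer,
              int ntoreturn, std::size_t maxBytes) {
    int numResults = 0;
    for (const std::string& doc : docs) {
        buffer += doc;                 // stand-in for bb.appendBuf(...)
        ++numResults;
        if ((ntoreturn && numResults >= ntoreturn) || buffer.size() > maxBytes)
            break;
    }
    return numResults;
}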
Example #26
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        QLOG() << "running getMore in new system, cursorid " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        replVerifyReadsOk();

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            Runner* runner = cc->getRunner();
            const int queryOptions = cc->queryOptions();

            // Get results out of the runner.
            runner->restoreState();

            BSONObj obj;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            if (Runner::RUNNER_EOF == state && 0 == numResults
                && (queryOptions & QueryOption_CursorTailable)
                && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
                // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
                // data some # of times first.
                return 0;
            }

            bool saveClientCursor = false;

            if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) {
                // If we're dead there's no way to get more results.
                saveClientCursor = false;
                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                // 
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }
            else if (Runner::RUNNER_EOF == state) {
                // EOF is also end of the line unless it's tailable.
                saveClientCursor = queryOptions & QueryOption_CursorTailable;
            }
            else {
                verify(Runner::RUNNER_ADVANCED == state);
                saveClientCursor = true;
            }

            if (!saveClientCursor) {
                ccPin.deleteUnderlying();
                // cc is now invalid, as is the runner
                cursorid = 0;
                cc = NULL;
                QLOG() << "getMore NOT saving client cursor, ended w/state "
                       << Runner::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                runner->saveState();
                QLOG() << "getMore saving client cursor ended w/state "
                       << Runner::statestr(state)
                       << endl;

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        QLOG() << "getMore returned " << numResults << " results\n";
        return qr;
    }
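Both getMore variants above decide whether to keep the ClientCursor with the same rules. A minimal sketch of that decision (the enum here is a hypothetical stand-in for Runner::RunnerState or PlanExecutor::ExecState):

enum class ExecState { ADVANCED, IS_EOF, DEAD, EXEC_ERROR };

bool shouldSaveCursor(ExecState state, bool tailable) {
    switch (state) {
        case ExecState::DEAD:
        case ExecState::EXEC_ERROR:
            return false;            // no way to get more results
        case ExecState::IS_EOF:
            return tailable;         // EOF keeps only a tailable cursor alive
        case ExecState::ADVANCED:
            return true;             // more results may follow
    }
    return false;
}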
Example #27
0
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) {
        QLOG() << "Running query on new system: " << cq->toString();

        // This is a read lock.
        Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // Need to call cq->toString() now, since upon error getRunner doesn't guarantee
        // cq is in a consistent state.
        string cqStr = cq->toString();

        // We'll now try to get the query runner that will execute this query for us. There
        // are a few cases in which we know upfront which runner we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we get a special runner
        // that returns no results, the EOFRunner.
        //
        // (b) If the query is replication's initial sync, we get a SingleSolutionRunner
        // that uses a specifically designed stage that skips extents faster (see details in
        // exec/oplogstart.h).
        //
        // Otherwise we go through the selection of which runner is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) {
            rawRunner = new EOFRunner(cq, cq->ns());
        }
        else if (pq.hasOption(QueryOption_OplogReplay)) {
            status = getOplogStartHack(cq, &rawRunner);
        }
        else {
            // Takes ownership of cq.
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            status = getRunner(cq, &rawRunner, options);
        }

        if (!status.isOK()) {
            uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr);
        }

        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results; it should contain either the requested
        // documents or explain information, but not both.
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        ClientCursor::registerRunner(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);
        auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety(
            new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

        BSONObj obj;
        Runner::RunnerState state;
        // uint64_t numMisplacedDocs = 0;

        // Set this outside the loop: we need it both inside the loop and when deciding
        // whether to fill in explain information.
        const bool isExplain = pq.isExplain();

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer. This is unnecessary if explain info is requested
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " runner EOF=" << runner->isEOF() << endl;
                    saveClientCursor = !runner->isEOF();
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases:
        // * in-memory sort using too much RAM.
        if (Runner::RUNNER_ERROR == state) {
            uasserted(17144, "Runner error, memory limit for sort probably exceeded");
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable)) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor are that the
            // client will keep trying to read from it. So we'll keep it around.
            Collection* collection = ctx.ctx().db()->getCollection(cq->ns());
            if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // If the version changed during the query, we might be missing some data, and it's
            // safe to send this since mongos can resend at this point.
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        // Append explain information to query results by asking the runner to produce them.
        if (isExplain) {
            TypeExplain* bareExplain;
            Status res = runner->getExplainPlan(&bareExplain);

            if (!res.isOK()) {
                error() << "could not produce explain of query '" << pq.getFilter()
                        << "', error: " << res.reason();
                // If numResults and the data in bb don't correspond, we'll crash later when rooting
                // through the reply msg.
                BSONObj emptyObj;
                bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
                // The explain output is actually a result.
                numResults = 1;
                // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
            }
            else {
                boost::scoped_ptr<TypeExplain> explain(bareExplain);

                // Fill in the missing run-time fields in explain, starting with properties of
                // the process running the query.
                std::string server = mongoutils::str::stream()
                    << getHostNameCached() << ":" << serverGlobalParams.port;
                explain->setServer(server);

                // We might have skipped some results due to chunk migration etc., so the count
                // we report is the number of results actually returned.
                explain->setN(numResults);

                // Clock the whole operation.
                explain->setMillis(curop.elapsedMillis());

                BSONObj explainObj = explain->toBSON();
                bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

                // The explain output is actually a result.
                numResults = 1;
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            QLOG() << "caching runner with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "not caching runner but returning " << numResults << " results\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
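When the query result is cached for getMore, the runner above is released only after the ClientCursor has taken ownership, so nothing leaks if an earlier step throws. A minimal sketch of that handoff using std::unique_ptr in place of auto_ptr (the Runner and ClientCursor types here are hypothetical stand-ins):

#include <memory>

struct Runner {};

struct ClientCursor {
    explicit ClientCursor(Runner* r) : runner(r) {}
    std::unique_ptr<Runner> runner;
};

ClientCursor* cacheRunnerSketch(std::unique_ptr<Runner> runner) {
    ClientCursor* cc = new ClientCursor(runner.get()); // cursor now references the runner
    runner.release();                                  // hand over ownership; cc deletes it later
    return cc;
}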
Example #28
0
    bool SubplanRunner::runSubplans() {
        // This is what we annotate with the index selections and then turn into a solution.
        auto_ptr<OrMatchExpression> theOr(
            static_cast<OrMatchExpression*>(_query->root()->shallowClone()));

        // This is the skeleton of index selections that is inserted into the cache.
        auto_ptr<PlanCacheIndexTree> cacheData(new PlanCacheIndexTree());

        for (size_t i = 0; i < theOr->numChildren(); ++i) {
            MatchExpression* orChild = theOr->getChild(i);

            auto_ptr<CanonicalQuery> orChildCQ(_cqs.front());
            _cqs.pop();

            // 'solutions' is owned by the SubplanRunner instance until
            // it is popped from the queue.
            vector<QuerySolution*> solutions = _solutions.front();
            _solutions.pop();

            // We already checked for zero solutions in planSubqueries(...).
            invariant(!solutions.empty());

            if (1 == solutions.size()) {
                // There is only one solution. Transfer ownership to an auto_ptr.
                auto_ptr<QuerySolution> autoSoln(solutions[0]);

                // We want a well-formed *indexed* solution.
                if (NULL == autoSoln->cacheData.get()) {
                    // For example, we don't cache things for 2d indices.
                    QLOG() << "Subplanner: No cache data for subchild " << orChild->toString();
                    return false;
                }

                if (SolutionCacheData::USE_INDEX_TAGS_SOLN != autoSoln->cacheData->solnType) {
                    QLOG() << "Subplanner: No indexed cache data for subchild "
                           << orChild->toString();
                    return false;
                }

                // Add the index assignments to our original query.
                Status tagStatus = QueryPlanner::tagAccordingToCache(
                    orChild, autoSoln->cacheData->tree.get(), _indexMap);

                if (!tagStatus.isOK()) {
                    QLOG() << "Subplanner: Failed to extract indices from subchild "
                           << orChild->toString();
                    return false;
                }

                // Add the child's cache data to the cache data we're creating for the main query.
                cacheData->children.push_back(autoSoln->cacheData->tree->clone());
            }
            else {
                // N solutions, rank them.  Takes ownership of orChildCQ.

                // the working set will be shared by the candidate plans and owned by the runner
                WorkingSet* sharedWorkingSet = new WorkingSet();

                MultiPlanStage* multiPlanStage = new MultiPlanStage(_collection,
                                                                    orChildCQ.get());

                // Dump all the solutions into the MPR.
                for (size_t ix = 0; ix < solutions.size(); ++ix) {
                    PlanStage* nextPlanRoot;
                    verify(StageBuilder::build(_txn,
                                               _collection,
                                               *solutions[ix],
                                               sharedWorkingSet,
                                               &nextPlanRoot));

                    // Owns first two arguments
                    multiPlanStage->addPlan(solutions[ix], nextPlanRoot, sharedWorkingSet);
                }

                multiPlanStage->pickBestPlan();
                if (! multiPlanStage->bestPlanChosen()) {
                    QLOG() << "Subplanner: Failed to pick best plan for subchild "
                           << orChildCQ->toString();
                    return false;
                }

                Runner* mpr = new SingleSolutionRunner(_collection,
                                                       orChildCQ.release(),
                                                       multiPlanStage->bestSolution(),
                                                       multiPlanStage,
                                                       sharedWorkingSet);

                _underlyingRunner.reset(mpr);

                if (_killed) {
                    QLOG() << "Subplanner: Killed while picking best plan for subchild "
                           << orChild->toString();
                    return false;
                }

                QuerySolution* bestSoln = multiPlanStage->bestSolution();

                if (SolutionCacheData::USE_INDEX_TAGS_SOLN != bestSoln->cacheData->solnType) {
                    QLOG() << "Subplanner: No indexed cache data for subchild "
                           << orChild->toString();
                    return false;
                }

                // Add the index assignments to our original query.
                Status tagStatus = QueryPlanner::tagAccordingToCache(
                    orChild, bestSoln->cacheData->tree.get(), _indexMap);

                if (!tagStatus.isOK()) {
                    QLOG() << "Subplanner: Failed to extract indices from subchild "
                           << orChild->toString();
                    return false;
                }

                cacheData->children.push_back(bestSoln->cacheData->tree->clone());
            }
        }

        // Must do this before using the planner functionality.
        sortUsingTags(theOr.get());

        // Use the cached index assignments to build solnRoot.  Takes ownership of 'theOr'
        QuerySolutionNode* solnRoot = QueryPlannerAccess::buildIndexedDataAccess(
            *_query, theOr.release(), false, _plannerParams.indices);

        if (NULL == solnRoot) {
            QLOG() << "Subplanner: Failed to build indexed data path for subplanned query\n";
            return false;
        }

        QLOG() << "Subplanner: fully tagged tree is " << solnRoot->toString();

        // Takes ownership of 'solnRoot'
        QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(*_query,
                                                                      _plannerParams,
                                                                      solnRoot);

        if (NULL == soln) {
            QLOG() << "Subplanner: Failed to analyze subplanned query";
            return false;
        }

        // We want our franken-solution to be cached.
        SolutionCacheData* scd = new SolutionCacheData();
        scd->tree.reset(cacheData.release());
        soln->cacheData.reset(scd);

        QLOG() << "Subplanner: Composite solution is " << soln->toString() << endl;

        // We use one of these even if there is only one plan.  We do this so that the entry is cached
        // with stats obtained in the same fashion as a competitive ranking would have obtained
        // them.
        MultiPlanStage* multiPlanStage = new MultiPlanStage(_collection, _query.get());
        WorkingSet* ws = new WorkingSet();
        PlanStage* root;
        verify(StageBuilder::build(_txn, _collection, *soln, ws, &root));
        multiPlanStage->addPlan(soln, root, ws); // Takes ownership of the first two arguments.

        multiPlanStage->pickBestPlan();
        if (! multiPlanStage->bestPlanChosen()) {
            QLOG() << "Subplanner: Failed to pick best plan for subchild "
                   << _query->toString();
            return false;
        }

        Runner* mpr = new SingleSolutionRunner(_collection,
                                               _query.release(),
                                               multiPlanStage->bestSolution(),
                                               multiPlanStage,
                                               ws);
        _underlyingRunner.reset(mpr);

        return true;
    }
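runSubplans() above handles each $or child differently depending on how many indexed solutions it has: a single solution is used as-is, while multiple solutions go through a ranking step. A minimal sketch of that per-branch choice, with strings standing in for query solutions and a caller-supplied ranking function in place of the MultiPlanStage:

#include <cstddef>
#include <string>
#include <vector>

std::string chooseBranchIndex(const std::vector<std::string>& solutions,
                              std::size_t (*rank)(const std::vector<std::string>&)) {
    if (solutions.size() == 1) {
        return solutions[0];             // single indexed solution: take it directly
    }
    std::size_t best = rank(solutions);  // several solutions: rank and take the winner
    return solutions[best];
}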
Example #29
0
    std::string newRunQuery(OperationContext* txn,
                            Message& m,
                            QueryMessage& q,
                            CurOp& curop,
                            Message &result,
                            bool fromDBDirectClient) {
        // Validate the namespace.
        const char *ns = q.ns;
        uassert(16332, "can't have an empty ns", ns[0]);

        const NamespaceString nsString(ns);
        uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid());

        // Set curop information.
        curop.debug().ns = ns;
        curop.debug().ntoreturn = q.ntoreturn;
        curop.debug().query = q.query;
        curop.setQuery(q.query);

        // If the query is really a command, run it.
        if (nsString.isCommand()) {
            int nToReturn = q.ntoreturn;
            uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn
                                         << ") for $cmd type ns - can only be 1 or -1",
                    nToReturn == 1 || nToReturn == -1);

            curop.markCommand();

            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder cmdResBuf;
            if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) {
                uasserted(13530, "bad or malformed command request?");
            }

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        const NamespaceString nss(q.ns);

        // Parse the qm into a CanonicalQuery.
        CanonicalQuery* cq;
        Status canonStatus = CanonicalQuery::canonicalize(
                                    q, &cq, WhereCallbackReal(txn, StringData(nss.db())));
        if (!canonStatus.isOK()) {
            uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString());
        }

        QLOG() << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a plan executor.
        PlanExecutor* rawExec = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        AutoGetCollectionForRead ctx(txn, nss);

        const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() :
                                                             serverGlobalParams.defaultProfile;

        Collection* collection = ctx.getCollection();

        // We'll now try to get the query executor that will execute this query for us. There
        // are a few cases in which we know upfront which executor we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we use an EOFStage.
        //
        // (b) If the query is replication's initial sync, we use a specifically designed
        // stage that skips extents faster (see details in exec/oplogstart.h).
        //
        // Otherwise we go through the selection of which executor is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (NULL != collection && pq.getOptions().oplogReplay) {
            // Takes ownership of 'cq'.
            status = getOplogStartHack(txn, collection, cq, &rawExec);
        }
        else {
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            // Takes ownership of 'cq'.
            status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options);
        }

        if (!status.isOK()) {
            // NOTE: Do not access cq as getExecutor has deleted it.
            uasserted(17007, "Unable to execute query: " + status.reason());
        }

        verify(NULL != rawExec);
        auto_ptr<PlanExecutor> exec(rawExec);

        // If it's actually an explain, do the explain and return rather than falling through
        // to the normal query execution loop.
        if (pq.isExplain()) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder explainBob;
            Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

            // Add the resulting object to the return buffer.
            BSONObj explainObj = explainBob.obj();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            // Set query result fields.
            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref();
        status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                NamespaceString(cq->ns()),
                slaveOK);
        uassertStatusOK(status);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // 'bb' holds the query results. It should contain either the documents requested by
        // the query or explain information, but never both.
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult::Value));

        // How many results have we obtained from the executor?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the PlanExecutor in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        BSONObj obj;
        PlanExecutor::ExecState state;
        // uint64_t numMisplacedDocs = 0;

        // Get summary info about which plan the executor is using.
        curop.debug().planSummary = Explain::getPlanSummary(exec.get());

        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.getOptions().oplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (!supportsGetMore && (enough(pq, numResults)
                                     || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result is requested, assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " executor EOF=" << exec->isEOF() << endl;
                    saveClientCursor = !exec->isEOF();
                }
                break;
            }
        }

        // If we cache the executor later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the executor later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the executor.
        exec->deregisterExec();

        // Caller expects exceptions thrown in certain cases.
        if (PlanExecutor::EXEC_ERROR == state) {
            scoped_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error, stats: "
                    << Explain::statsToBSON(*stats);
            uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
        }

        // Why save a dead executor?
        if (PlanExecutor::DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.getOptions().tailable) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of a tailable cursor are that the
            // client will keep trying to read from it, so we keep it around.
            if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // If the version changed during the query, we might be missing some data and it's
            // safe to send this exception: mongos can resend the query at this point.
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery;
        const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1);

        PlanSummaryStats summaryStats;
        Explain::getSummaryStats(exec.get(), &summaryStats);

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;
        curop.debug().scanAndOrder = summaryStats.hasSortStage;
        curop.debug().nscanned = summaryStats.totalKeysExamined;
        curop.debug().nscannedObjects = summaryStats.totalDocsExamined;
        curop.debug().idhack = summaryStats.isIdhack;

        // Set debug information for consumption by the profiler.
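        // That is, a query is profiled here if the db's profiling level is nonzero, if it ran
        // longer than the slow-query threshold (serverGlobalParams.slowMS), or if the query
        // log component is at debug level 1 or higher.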
        if (dbProfilingLevel > 0 ||
            curop.elapsedMillis() > serverGlobalParams.slowMS ||
            logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) {
            // Get BSON stats.
            scoped_ptr<PlanStageStats> execStats(exec->getStats());
            BSONObjBuilder statsBob;
            Explain::statsToBSON(*execStats, &statsBob);
            curop.debug().execStats.set(statsBob.obj());

            // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj.
            if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) {
                BSONObjBuilder bob;
                bob.append("summary", curop.debug().planSummary.toString());
                curop.debug().execStats.set(bob.done());
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the executor until it's getMore'd.
            exec->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection, exec.get(),
                                                cq->getParsed().getOptions().toInt(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            if (fromDBDirectClient) {
                cc->setUnownedRecoveryUnit(txn->recoveryUnit());
            }
            else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) {
                // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their
                // next getMore.
            }
            else {
                // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
                // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
                cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
                StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
                txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn));
            }

            QLOG() << "caching executor with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of executor.  Release to make sure it's not deleted.
            exec.release();

            // TODO document
            if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.getOptions().exhaust) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "Not caching executor but returning " << numResults << " results.\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult::View qr = result.header().view2ptr();
        qr.setCursorId(ccId);
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr.setResultFlagsToOk();
        qr.msgdata().setOperation(opReply);
        qr.setStartingFrom(0);
        qr.setNReturned(numResults);

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
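The first-batch loop above keeps appending documents to the reply buffer until either the requested number of results or a byte cap is reached, and only then decides whether a cursor must be kept around for getMore. Below is a minimal, self-contained sketch of that idea; the Doc type, the kMaxBatchBytes cap, and the enoughForFirstBatch predicate here are simplified stand-ins for illustration, not MongoDB's actual helpers.

// first_batch_sketch.cpp -- simplified illustration only; not MongoDB code.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct Doc { std::string bson; };            // stand-in for a BSONObj
const std::size_t kMaxBatchBytes = 4096;     // hypothetical per-batch byte cap

// The first batch is full once the requested count (if any) is reached or the
// buffered reply grows past the byte cap.
bool enoughForFirstBatch(std::size_t numToReturn, std::size_t numResults, std::size_t bytes) {
    if (numToReturn != 0 && numResults >= numToReturn) return true;
    return bytes >= kMaxBatchBytes;
}

int main() {
    Doc d;
    d.bson = std::string(200, 'x');
    std::vector<Doc> executorOutput(50, d);  // fake PlanExecutor output
    std::vector<Doc> batch;
    std::size_t bytes = 0;
    const std::size_t numToReturn = 10;
    bool saveCursor = false;

    for (std::size_t i = 0; i < executorOutput.size(); ++i) {
        batch.push_back(executorOutput[i]);
        bytes += executorOutput[i].bson.size();
        if (enoughForFirstBatch(numToReturn, batch.size(), bytes)) {
            saveCursor = (i + 1 < executorOutput.size());  // more to read -> keep a cursor
            break;
        }
    }
    std::cout << "batch=" << batch.size() << " bytes=" << bytes
              << " saveCursor=" << std::boolalpha << saveCursor << std::endl;
    return 0;
}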
Example #30
0
    void MultiPlanStage::pickBestPlan() {
        // Run each plan some number of times. This number is at least as great as
        // 'internalQueryPlanEvaluationWorks', but may be larger for big collections.
        size_t numWorks = internalQueryPlanEvaluationWorks;
        if (NULL != _collection) {
            // For large collections, the number of works is set to be this
            // fraction of the collection size.
            double fraction = internalQueryPlanEvaluationCollFraction;

            numWorks = std::max(size_t(internalQueryPlanEvaluationWorks),
                                size_t(fraction * _collection->numRecords()));
        }
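        // For instance, with hypothetical settings of internalQueryPlanEvaluationWorks = 10,000
        // and internalQueryPlanEvaluationCollFraction = 0.3, a 1,000-record collection keeps
        // the 10,000-work floor, while a 1,000,000-record collection gets
        // 0.3 * 1,000,000 = 300,000 works.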

        // We treat ntoreturn as though it is a limit during plan ranking.
        // This means that ranking might not be great for sort + batchSize.
        // But it also means that we don't buffer too much data for sort + limit.
        // See SERVER-14174 for details.
        size_t numToReturn = _query->getParsed().getNumToReturn();

        // Determine the number of results which we will produce during the plan
        // ranking phase before stopping.
        size_t numResults = (size_t)internalQueryPlanEvaluationMaxResults;
        if (numToReturn > 0) {
            numResults = std::min(numToReturn, numResults);
        }
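        // For example, with a hypothetical internalQueryPlanEvaluationMaxResults of 101 and a
        // numToReturn of 10, the trial stops after 10 results; with numToReturn = 0 (no limit),
        // it stops after 101.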

        // Work the plans, stopping when a plan hits EOF or returns some
        // fixed number of results.
        for (size_t ix = 0; ix < numWorks; ++ix) {
            bool moreToDo = workAllPlans(numResults);
            if (!moreToDo) { break; }
        }

        if (_failure) { return; }

        // After picking the best plan, the ranking decision owns the plan stats from the
        // candidate solutions (winner and losers).
        std::auto_ptr<PlanRankingDecision> ranking(new PlanRankingDecision);
        _bestPlanIdx = PlanRanker::pickBestPlan(_candidates, ranking.get());
        verify(_bestPlanIdx >= 0 && _bestPlanIdx < static_cast<int>(_candidates.size()));

        // Copy candidate order. We will need this to sort candidate stats for explain
        // after transferring ownership of 'ranking' to plan cache.
        std::vector<size_t> candidateOrder = ranking->candidateOrder;

        CandidatePlan& bestCandidate = _candidates[_bestPlanIdx];
        std::list<WorkingSetID>& alreadyProduced = bestCandidate.results;
        QuerySolution* bestSolution = bestCandidate.solution;

        QLOG() << "Winning solution:\n" << bestSolution->toString() << endl;
        LOG(2) << "Winning plan: " << getPlanSummary(*bestSolution);

        _backupPlanIdx = kNoSuchPlan;
        if (bestSolution->hasBlockingStage && (0 == alreadyProduced.size())) {
            QLOG() << "Winner has blocking stage, looking for backup plan...\n";
            for (size_t ix = 0; ix < _candidates.size(); ++ix) {
                if (!_candidates[ix].solution->hasBlockingStage) {
                    QLOG() << "Candidate " << ix << " is backup child\n";
                    _backupPlanIdx = ix;
                    break;
                }
            }
        }

        // Store the choice we just made in the cache. In order to do so,
        //   1) the query must be of a type that is safe to cache, and
        //   2) two or more plans cannot have tied for the win. Caching in the
        //   case of ties can cause successive queries of the same shape to
        //   use a bad index.
        if (PlanCache::shouldCacheQuery(*_query) && !ranking->tieForBest) {
            // Create list of candidate solutions for the cache with
            // the best solution at the front.
            std::vector<QuerySolution*> solutions;

            // Generate solutions and ranking decisions sorted by score.
            for (size_t orderingIndex = 0;
                 orderingIndex < candidateOrder.size(); ++orderingIndex) {
                // index into candidates/ranking
                size_t ix = candidateOrder[orderingIndex];
                solutions.push_back(_candidates[ix].solution);
            }

            // Check the solution cache data. Do not add to the cache if any
            // solution has invalid SolutionCacheData.
            // XXX: One known example is 2D queries.
            bool validSolutions = true;
            for (size_t ix = 0; ix < solutions.size(); ++ix) {
                if (NULL == solutions[ix]->cacheData.get()) {
                    QLOG() << "Not caching query because this solution has no cache data: "
                           << solutions[ix]->toString();
                    validSolutions = false;
                    break;
                }
            }

            if (validSolutions) {
                _collection->infoCache()->getPlanCache()->add(*_query, solutions, ranking.release());
            }
        }
    }
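As a rough mental model of the trial phase in pickBestPlan, the sketch below round-robins a fixed work budget across candidate plans, then prefers a plan that reached EOF and falls back to the one that produced the most results. It is a deliberately simplified, self-contained illustration: the Candidate type, its eofAfter field, and the tie-breaking rule are invented for this example and do not mirror PlanRanker's actual scoring.

// plan_trial_sketch.cpp -- simplified illustration only; not MongoDB's PlanRanker.
#include <cstddef>
#include <iostream>
#include <vector>

struct Candidate {
    std::size_t eofAfter;   // toy stand-in: work() calls needed to finish the plan
    std::size_t produced;   // results produced during the trial
    bool hitEOF;
    explicit Candidate(std::size_t e) : eofAfter(e), produced(0), hitEOF(false) {}
};

// Give every candidate the same work budget, then prefer a plan that finished,
// falling back to the one that produced the most results.
std::size_t pickBest(std::vector<Candidate>& cands, std::size_t budget) {
    for (std::size_t round = 0; round < budget; ++round) {
        for (std::size_t i = 0; i < cands.size(); ++i) {
            Candidate& c = cands[i];
            if (c.hitEOF) continue;
            ++c.produced;                                // pretend one work() call advanced it
            if (c.produced >= c.eofAfter) c.hitEOF = true;
        }
    }
    std::size_t best = 0;
    for (std::size_t i = 1; i < cands.size(); ++i) {
        bool better = (cands[i].hitEOF && !cands[best].hitEOF) ||
                      (cands[i].hitEOF == cands[best].hitEOF &&
                       cands[i].produced > cands[best].produced);
        if (better) best = i;
    }
    return best;
}

int main() {
    std::vector<Candidate> cands;
    cands.push_back(Candidate(40));  // plan 0 would finish after 40 works
    cands.push_back(Candidate(8));   // plan 1 finishes within the budget
    cands.push_back(Candidate(25));  // plan 2 would finish after 25 works
    std::cout << "winner index: " << pickBest(cands, 20) << std::endl;  // prints 1
    return 0;
}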