/*
 * Stress test: logs 1500 entries of the kind selected by 'type', resets the
 * log, and repeats that ten times before cleaning up.
 *
 *   type 1 - plain text entries
 *   type 2 - 15-byte hex dumps
 *   type 3 - backtrace entries
 *   type 4 - a text entry, a 98-byte hex dump and a backtrace per iteration
 *   other  - nothing is logged; only init/reset/cleanup are exercised
 */
void test7(int type){
    int i = 0, j = 0;
    /*
     * Backing storage for the hex dumps. The dump lengths used below (15 and
     * 98 bytes) must stay within this array; dumping from the address of a
     * bare pointer variable would read past the end of that object.
     */
    char buffer[98] = {0};

    qlog_init(50);
    qlog_thread_init("test_6_main_thread");

    for (i = 0; i < 10; i++){
        for (j = 0; j < 1500; j++){
            switch (type){
            case 1:
                QLOG("log entry");
                break;
            case 2:
                QLOG_HEX(buffer, 15);
                break;
            case 3:
                QLOG_BT;
                break;
            case 4:
                QLOG("log entry sdlkfjsdklfjsdlkfjsdklfj");
                QLOG_HEX(buffer, sizeof(buffer));
                QLOG_BT;
                break;
            default:
                break;
            }
        }
        /*qlog_display_print_buffer(stdout);*/
        qlog_reset();
    }

    qlog_cleanup();
}
/**
 * Canonicalizes and plans every child of the query's root expression as a
 * stand-alone query.
 *
 * On success the canonical query of each child is pushed onto _cqs and its
 * vector of solutions onto _solutions, in child order.
 *
 * @return Status::OK() when every child was planned; a BadValue status when a
 *         child cannot be canonicalized, cannot be planned, or yields no
 *         solutions.
 */
Status SubplanRunner::planSubqueries() {
    MatchExpression* theOr = _query->root();

    // Remember the position of every index, keyed by its key pattern.
    for (size_t idx = 0; idx < _plannerParams.indices.size(); ++idx) {
        const IndexEntry& entry = _plannerParams.indices[idx];
        _indexMap[entry.keyPattern] = idx;
        QLOG() << "Subplanner: index " << idx << " is " << entry.toString() << endl;
    }

    const WhereCallbackReal whereCallback(_collection->ns().db());

    for (size_t childIdx = 0; childIdx < theOr->numChildren(); ++childIdx) {
        MatchExpression* branch = theOr->getChild(childIdx);

        // Step 1: make the branch a query of its own.
        CanonicalQuery* rawBranchCQ;
        Status canonStatus = CanonicalQuery::canonicalize(*_query,
                                                          branch,
                                                          &rawBranchCQ,
                                                          whereCallback);
        if (!canonStatus.isOK()) {
            mongoutils::str::stream msg;
            msg << "Subplanner: Can't canonicalize subchild " << branch->toString()
                << " " << canonStatus.reason();
            return Status(ErrorCodes::BadValue, msg);
        }

        // From here on the canonical query is released automatically on any early return.
        auto_ptr<CanonicalQuery> branchCQ(rawBranchCQ);

        // Step 2: plan the branch. NO_TABLE_SCAN is deliberately not set: peeking at
        // the cache data keeps us from considering any plan that is a collscan.
        vector<QuerySolution*> branchSolutions;
        QLOG() << "Subplanner: planning child " << childIdx << " of " << theOr->numChildren();

        Status planStatus = QueryPlanner::plan(*branchCQ, _plannerParams, &branchSolutions);
        if (!planStatus.isOK()) {
            mongoutils::str::stream msg;
            msg << "Subplanner: Can't plan for subchild " << branchCQ->toString()
                << " " << planStatus.reason();
            return Status(ErrorCodes::BadValue, msg);
        }

        QLOG() << "Subplanner: got " << branchSolutions.size() << " solutions";

        // A single branch without a solution makes the whole subplan unusable.
        if (branchSolutions.empty()) {
            mongoutils::str::stream msg;
            msg << "Subplanner: No solutions for subchild " << branchCQ->toString();
            return Status(ErrorCodes::BadValue, msg);
        }

        // Step 3: keep the canonical query and its solutions for the later run phase.
        _cqs.push(branchCQ.release());
        _solutions.push(branchSolutions);
    }

    return Status::OK();
}
/**
 * Produces the next result of the winning plan.
 *
 * Buffered results of the best plan are handed out first. Once they are
 * exhausted the best plan's root stage is worked. If that fails and a backup
 * plan exists, the cached entry for the query is removed, the backup plan
 * becomes the best plan and is worked instead.
 *
 * @param out receives the working set id of the produced member (or the
 *            status member id when this stage has already failed).
 * @return the resulting stage state.
 */
PlanStage::StageState MultiPlanStage::work(WorkingSetID* out) {
    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (_failure) {
        *out = _statusMemberId;
        return PlanStage::FAILURE;
    }

    CandidatePlan& bestPlan = _candidates[_bestPlanIdx];

    // Look for an already produced result that provides the data the caller wants.
    if (!bestPlan.results.empty()) {
        *out = bestPlan.results.front();
        bestPlan.results.pop_front();
        _commonStats.advanced++;
        return PlanStage::ADVANCED;
    }

    // The best plan had no (or has no more) buffered results.
    StageState state = bestPlan.root->work(out);

    if (PlanStage::FAILURE == state && hasBackupPlan()) {
        QLOG() << "Best plan errored out switching to backup\n";

        // Uncache the bad solution if we fall back on the backup solution.
        //
        // XXX: Instead of uncaching we should find a way for the cached plan runner to
        // fall back on a different solution if the best solution fails. Alternatively we
        // could try to defer cache insertion to be after the first produced result.
        _collection->infoCache()->getPlanCache()->remove(*_query);

        _bestPlanIdx = _backupPlanIdx;
        _backupPlanIdx = kNoSuchPlan;

        // Keep the backup's state instead of returning it directly, so that its result
        // is counted in the stats below like any other result of this stage.
        state = _candidates[_bestPlanIdx].root->work(out);
    }
    else if (hasBackupPlan() && PlanStage::ADVANCED == state) {
        QLOG() << "Best plan had a blocking stage, became unblocked\n";
        _backupPlanIdx = kNoSuchPlan;
    }

    // Increment stats.
    if (PlanStage::ADVANCED == state) {
        _commonStats.advanced++;
    }
    else if (PlanStage::NEED_TIME == state) {
        _commonStats.needTime++;
    }

    return state;
}
/**
 * Creates a frameless native window and asks CEF to create a browser as a
 * child of it.
 *
 * @param url    address passed to CefBrowserHost::CreateBrowser.
 * @param host   view stored in pQCefViewWidget_, handed to the browser handler
 *               and registered in instance_map_ under this window's handle.
 * @param parent forwarded to the QWindow base class.
 */
CCefWindow::CCefWindow(const QString& url, QCefView* host, QWindow *parent /*= 0*/)
    : QWindow(parent)
    , pQCefViewWidget_(host)
    , hwndCefBrowser_(NULL)
    , pQCefViewHandler(NULL)
{
    CCefManager::getInstance().AddBrowserRefCount();

    setFlags(Qt::FramelessWindowHint);

    // The native window must exist before its handle is given to CEF.
    create();
    const HWND nativeHandle = (HWND)winId();

    // Describe the browser as a child of our native window.
    CefWindowInfo windowInfo;
    RECT childRect = { 0 };
    windowInfo.SetAsChild(nativeHandle, childRect);

    // Browser settings: all plugins are disabled.
    CefBrowserSettings settings;
    settings.plugins = STATE_DISABLED;

    pQCefViewHandler = new QCefViewBrowserHandler(host);
    instance_map_[nativeHandle] = pQCefViewWidget_;

    // Create the main browser window; a failure is only logged.
    const bool browserCreated = CefBrowserHost::CreateBrowser(windowInfo,
                                                              pQCefViewHandler.get(),
                                                              url.toStdString(),
                                                              settings,
                                                              NULL);
    if (!browserCreated) {
        QLOG() << QStringLiteral("Failed to create browser.");
    }
}
void CachedPlanRunner::updateCache() { _updatedCache = true; if (_killed) { return; } Database* db = cc().database(); // XXX: We need to check for NULL because this is called upon // destruction of the CachedPlanRunner. In some cases, the db // or collection could be dropped without kill() being called // on the runner (for example, timeout of a ClientCursor holding // the runner). if (NULL == db) { return; } Collection* collection = db->getCollection(_canonicalQuery->ns()); if (NULL == collection) { return; } PlanCache* cache = collection->infoCache()->getPlanCache(); std::auto_ptr<PlanCacheEntryFeedback> feedback(new PlanCacheEntryFeedback()); // XXX: what else can we provide here? feedback->stats.reset(_exec->getStats()); feedback->score = PlanRanker::scoreTree(feedback->stats.get()); Status fbs = cache->feedback(*_canonicalQuery, feedback.release()); if (!fbs.isOK()) { QLOG() << _canonicalQuery->ns() << ": Failed to update cache with feedback: " << fbs.toString() << " - " << "(query: " << _canonicalQuery->getQueryObj() << "; sort: " << _canonicalQuery->getParsed().getSort() << "; projection: " << _canonicalQuery->getParsed().getProj() << ") is no longer in plan cache."; } }
// static double PlanRanker::scoreTree(const PlanStageStats* stats) { // We start all scores at 1. Our "no plan selected" score is 0 and we want all plans to // be greater than that. double baseScore = 1; // How much did a plan produce? // Range: [0, 1] double productivity = static_cast<double>(stats->common.advanced) / static_cast<double>(stats->common.works); // Does a plan have a sort? // bool sort = hasSort(stats); // How selective do we think an index is? //double selectivity = computeSelectivity(stats); //return baseScore + productivity + selectivity; //double sortPenalty = sort ? 0.5 : 0; //double score = baseScore + productivity - sortPenalty; double score = baseScore + productivity; QLOG() << "score (" << score << ") = baseScore (" << baseScore << ") + productivity(" << productivity << ")\n"; return score; }
// static Status QueryPlanner::planFromCache(const CanonicalQuery& query, const QueryPlannerParams& params, CachedSolution* cachedSoln, QuerySolution** out) { // Create a copy of the expression tree. We use cachedSoln to annotate this with indices. MatchExpression* clone = query.root()->shallowClone(); // XXX: Use data in cachedSoln to tag 'clone' with the indices used. The tags use an index // ID which is an index into some vector of IndexEntry(s). How do we maintain this across // calls to plan? Do we want to store in the soln the keypatterns of the indices and just // map those to an index into params.indices? Might be easiest thing to do, and certainly // most intelligible for debugging. // Use the cached index assignments to build solnRoot. Takes ownership of clone. QuerySolutionNode* solnRoot = QueryPlannerAccess::buildIndexedDataAccess(query, clone, false, params.indices); // XXX: are the NULL cases an error/when does this happen / can this happen? if (NULL != solnRoot) { QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(query, params, solnRoot); if (NULL != soln) { QLOG() << "Planner: adding cached solution:\n" << soln->toString() << endl; *out = soln; } } // XXX: if any NULLs return error status? return Status::OK(); }
void test6(){ int data[100]; int i = 0; alma a; alma2 a2; qlog_ext_event_type_t et = 0, et2 = 0; a.alma1 = 100; a.alma2 = 200; a2.a21 = 10923; a2.a22 = 987; for (i = 0; i < 100; i++) { data[i] = 1879*i; } qlog_init(10); qlog_thread_init("main thread"); QLOG("message1"); QLOG_BT; QLOG("message2"); QLOG_HEX(data, sizeof(data)); QLOG("message3"); et = qlog_ext_register_event(test6_print_cb); printf("event type: %d\n", et); et = qlog_ext_register_event(test6_print_cb); printf("event type: %d\n", et); et = qlog_ext_register_event(test6_print_cb); printf("event type: %d\n", et); et = qlog_ext_register_event(test6_print_cb); printf("event type: %d\n", et); et = qlog_ext_register_event(test6_print_cb); printf("event type: %d\n", et); et2 = qlog_ext_register_event(test6_print_cb2); printf("event type: %d\n", et2); qlog_ext_log(et, &a, sizeof(a), "Hoki alma"); qlog_ext_log(1000, &a, sizeof(a), "Hoki2 alma"); qlog_ext_log(et2, &a2, sizeof(a2), "Hoki2222 alma"); QLOG_HEX(&a, sizeof(a)); QLOG_BT; QLOG_VA("This is a formatted message: %d, %s, %d", 1023, "alma", 12); qlog_display_print_buffer(stdout); qlog_cleanup(); }
/*
 * Logs entry into and exit from this function around calls to test10_a() and
 * test10_c(86), prints both return values, then prints the log buffer to
 * stdout and cleans up.
 */
void test10(void){
    int result_a = 0;
    int result_c = 0;

    qlog_init(25);
    qlog_thread_init("main thread");

    QLOG("test10 has been started");
    QLOG_ENTRY;

    result_a = test10_a();
    printf("%s, %d\n", __FUNCTION__, result_a);
    QLOG("test10_a has been returned");

    result_c = test10_c(86);
    printf("%s, %d\n", __FUNCTION__, result_c);

    QLOG_LEAVE;
    QLOG("test10 has been finished");

    qlog_display_print_buffer(stdout);
    qlog_cleanup();
}
/**
 * Produces the next tagged tree of the enumeration.
 *
 * @param tree receives a shallow clone of _root, tagged according to the
 *             current memo state and processed by tagForSort/sortUsingTags.
 * @return false, without touching 'tree', once the enumeration is done;
 *         true otherwise.
 */
bool PlanEnumerator::getNext(MatchExpression** tree) {
    if (_done) {
        return false;
    }

    // Apply the tags of the current memo state to _root.
    tagMemo(_nodeToId[_root]);

    // Hand out a clone carrying those tags.
    MatchExpression* const tagged = _root->shallowClone();
    *tree = tagged;
    tagForSort(tagged);
    sortUsingTags(tagged);

    // Clear the tags on _root again.
    _root->resetTag();

    QLOG() << "Enumerator: memo right before moving:\n";
    dumpMemo();

    // Move the memo on; the result of nextMemo becomes the new _done flag.
    _done = nextMemo(_nodeToId[_root]);

    QLOG() << "Enumerator: memo right after moving:\n";
    dumpMemo();

    return true;
}
// static size_t PlanRanker::pickBestPlan(const vector<CandidatePlan>& candidates, PlanRankingDecision* why) { // Each plan will have a stat tree. vector<PlanStageStats*> statTrees; // Get stat trees from each plan. for (size_t i = 0; i < candidates.size(); ++i) { statTrees.push_back(candidates[i].root->getStats()); } // Compute score for each tree. Record the best. double maxScore = 0; size_t bestChild = numeric_limits<size_t>::max(); for (size_t i = 0; i < statTrees.size(); ++i) { QLOG() << "scoring plan " << i << ":\n" << candidates[i].solution->toString(); double score = scoreTree(statTrees[i]); QLOG() << "score = " << score << endl; if (score > maxScore) { maxScore = score; bestChild = i; } } // Make sure we got something. verify(numeric_limits<size_t>::max() != bestChild); if (NULL != why) { // Record the stats of the winner. why->statsOfWinner = statTrees[bestChild]; } // Clean up stats of losers. for (size_t i = 0; i < statTrees.size(); ++i) { // If why is null we're not saving the bestChild's stats and we can delete it. if (i != bestChild || NULL == why) { delete statTrees[i]; } } return bestChild; }
bool MultiPlanRunner::pickBestPlan(size_t* out) { static const int timesEachPlanIsWorked = 100; // Run each plan some number of times. for (int i = 0; i < timesEachPlanIsWorked; ++i) { bool moreToDo = workAllPlans(); if (!moreToDo) { break; } } if (_failure || _killed) { return false; } size_t bestChild = PlanRanker::pickBestPlan(_candidates, NULL); // Run the best plan. Store it. _bestPlan.reset(new PlanExecutor(_candidates[bestChild].ws, _candidates[bestChild].root)); _bestPlan->setYieldPolicy(_policy); _alreadyProduced = _candidates[bestChild].results; _bestSolution.reset(_candidates[bestChild].solution); QLOG() << "Winning solution:\n" << _bestSolution->toString() << endl; // TODO: // Store the choice we just made in the cache. // QueryPlanCache* cache = PlanCache::get(somenamespace); // cache->add(_query, *_candidates[bestChild]->solution, decision->bestPlanStats); // delete decision; // Clear out the candidate plans, leaving only stats as we're all done w/them. for (size_t i = 0; i < _candidates.size(); ++i) { if (i == bestChild) { continue; } delete _candidates[i].solution; // Remember the stats for the candidate plan because we always show it on an // explain. (The {verbose:false} in explain() is client-side trick; we always // generate a "verbose" explain.) PlanStageStats* stats = _candidates[i].root->getStats(); if (stats) { _candidateStats.push_back(stats); } delete _candidates[i].root; // ws must die after the root. delete _candidates[i].ws; } _candidates.clear(); if (NULL != out) { *out = bestChild; } return true; }
/**
 * Prepares the enumerator: resets its counters, builds the memo structure from
 * the tagged _root and then clears the tags on _root.
 *
 * The enumerator is marked done right away when prepMemo(_root) returns false.
 *
 * @return always Status::OK().
 */
Status PlanEnumerator::init() {
    _inOrderCount = 0;
    _done = false;

    QLOG() << "enumerator received root:\n" << _root->toString() << endl;

    // Fill out our memo structure from the tagged _root.
    const bool memoReady = prepMemo(_root);
    _done = !memoReady;

    // Dump the tags. We replace them with IndexTag instances.
    _root->resetTag();

    return Status::OK();
}
void PlanEnumerator::tagMemo(size_t id) { QLOG() << "Tagging memoID " << id << endl; NodeAssignment* assign = _memo[id]; verify(NULL != assign); if (NULL != assign->pred) { PredicateAssignment* pa = assign->pred.get(); verify(NULL == pa->expr->getTag()); verify(pa->indexToAssign < pa->first.size()); pa->expr->setTag(new IndexTag(pa->first[pa->indexToAssign])); } else if (NULL != assign->orAssignment) { OrAssignment* oa = assign->orAssignment.get(); for (size_t i = 0; i < oa->subnodes.size(); ++i) { tagMemo(oa->subnodes[i]); } } else if (NULL != assign->newAnd) { AndAssignment* aa = assign->newAnd.get(); if (AndAssignment::MANDATORY == aa->state) { verify(aa->counter < aa->mandatory.size()); const OneIndexAssignment& assign = aa->mandatory[aa->counter]; for (size_t i = 0; i < assign.preds.size(); ++i) { MatchExpression* pred = assign.preds[i]; verify(NULL == pred->getTag()); pred->setTag(new IndexTag(assign.index, assign.positions[i])); } } else if (AndAssignment::PRED_CHOICES == aa->state) { verify(aa->counter < aa->predChoices.size()); const OneIndexAssignment& assign = aa->predChoices[aa->counter]; for (size_t i = 0; i < assign.preds.size(); ++i) { MatchExpression* pred = assign.preds[i]; verify(NULL == pred->getTag()); pred->setTag(new IndexTag(assign.index, assign.positions[i])); } } else { verify(AndAssignment::SUBNODES == aa->state); verify(aa->counter < aa->subnodes.size()); tagMemo(aa->subnodes[aa->counter]); } } else { verify(0); } }
void CachedPlanStage::updateCache() { _updatedCache = true; std::auto_ptr<PlanCacheEntryFeedback> feedback(new PlanCacheEntryFeedback()); feedback->stats.reset(getStats()); feedback->score = PlanRanker::scoreTree(feedback->stats.get()); PlanCache* cache = _collection->infoCache()->getPlanCache(); Status fbs = cache->feedback(*_canonicalQuery, feedback.release()); if (!fbs.isOK()) { QLOG() << _canonicalQuery->ns() << ": Failed to update cache with feedback: " << fbs.toString() << " - " << "(query: " << _canonicalQuery->getQueryObj() << "; sort: " << _canonicalQuery->getParsed().getSort() << "; projection: " << _canonicalQuery->getParsed().getProj() << ") is no longer in plan cache."; } }
/*
 * Thread body for test8: registers the thread with the logger under the name
 * passed in 'data', then logs a text entry, a 10-byte hex dump of the name and
 * a backtrace roughly every millisecond for as long as test8_run is non-zero.
 * Prints the number of completed iterations before returning NULL.
 *
 * data: the thread name, passed as char*.
 *       NOTE(review): the hex dump always reads 10 bytes from the name -
 *       confirm callers pass buffers at least that long.
 */
void* test8_thr(void* data){
    char* name = (char*) data;
    struct timespec tick;
    struct timespec leftover;
    unsigned long iterations = 0;

    /* 1,000,000 ns between iterations. */
    tick.tv_sec = 0;
    tick.tv_nsec = 1000000;

    qlog_thread_init(name);
    printf("Thread started: %s\n", name);

    while (test8_run != 0) {
        iterations++;
        QLOG("thread message");
        QLOG_HEX(name, 10);
        QLOG_BT;
        nanosleep(&tick, &leftover);
    }

    printf("Loop count in thread %s: %lu\n", name, iterations);
    return NULL;
}
// static QuerySolution* QueryPlannerAnalysis::analyzeDataAccess(const CanonicalQuery& query, const QueryPlannerParams& params, QuerySolutionNode* solnRoot) { auto_ptr<QuerySolution> soln(new QuerySolution()); soln->filterData = query.getQueryObj(); verify(soln->filterData.isOwned()); soln->ns = query.ns(); soln->indexFilterApplied = params.indexFiltersApplied; solnRoot->computeProperties(); // solnRoot finds all our results. Let's see what transformations we must perform to the // data. // If we're answering a query on a sharded system, we need to drop documents that aren't // logically part of our shard (XXX GREG elaborate more precisely) if (params.options & QueryPlannerParams::INCLUDE_SHARD_FILTER) { // XXX TODO: use params.shardKey to do fetch analysis instead of always fetching. if (!solnRoot->fetched()) { FetchNode* fetch = new FetchNode(); fetch->children.push_back(solnRoot); solnRoot = fetch; } ShardingFilterNode* sfn = new ShardingFilterNode(); sfn->children.push_back(solnRoot); solnRoot = sfn; } solnRoot = analyzeSort(query, params, solnRoot, &soln->hasSortStage); // This can happen if we need to create a blocking sort stage and we're not allowed to. if (NULL == solnRoot) { return NULL; } // If we can (and should), add the keep mutations stage. // We cannot keep mutated documents if: // // 1. The query requires an index to evaluate the predicate ($text). We can't tell whether // or not the doc actually satisfies the $text predicate since we can't evaluate a // text MatchExpression. // // 2. The query implies a sort ($geoNear). It would be rather expensive and hacky to merge // the document at the right place. // // 3. There is an index-provided sort. Ditto above comment about merging. // XXX; do we want some kind of static init for a set of stages we care about & pass that // set into hasNode? 
bool cannotKeepFlagged = hasNode(solnRoot, STAGE_TEXT) || hasNode(solnRoot, STAGE_GEO_NEAR_2D) || hasNode(solnRoot, STAGE_GEO_NEAR_2DSPHERE) || (!query.getParsed().getSort().isEmpty() && !soln->hasSortStage); // Only these stages can produce flagged results. A stage has to hold state past one call // to work(...) in order to possibly flag a result. bool couldProduceFlagged = hasNode(solnRoot, STAGE_GEO_2D) || hasNode(solnRoot, STAGE_AND_HASH) || hasNode(solnRoot, STAGE_AND_SORTED) || hasNode(solnRoot, STAGE_FETCH); bool shouldAddMutation = !cannotKeepFlagged && couldProduceFlagged; if (shouldAddMutation && (params.options & QueryPlannerParams::KEEP_MUTATIONS)) { KeepMutationsNode* keep = new KeepMutationsNode(); // We must run the entire expression tree to make sure the document is still valid. keep->filter.reset(query.root()->shallowClone()); if (STAGE_SORT == solnRoot->getType()) { // We want to insert the invalidated results before the sort stage, if there is one. verify(1 == solnRoot->children.size()); keep->children.push_back(solnRoot->children[0]); solnRoot->children[0] = keep; } else { keep->children.push_back(solnRoot); solnRoot = keep; } } // Project the results. if (NULL != query.getProj()) { QLOG() << "PROJECTION: fetched status: " << solnRoot->fetched() << endl; QLOG() << "PROJECTION: Current plan is:\n" << solnRoot->toString() << endl; if (query.getProj()->requiresDocument()) { QLOG() << "PROJECTION: claims to require doc adding fetch.\n"; // If the projection requires the entire document, somebody must fetch. 
if (!solnRoot->fetched()) { FetchNode* fetch = new FetchNode(); fetch->children.push_back(solnRoot); solnRoot = fetch; } } else { QLOG() << "PROJECTION: requires fields\n"; const vector<string>& fields = query.getProj()->getRequiredFields(); bool covered = true; for (size_t i = 0; i < fields.size(); ++i) { if (!solnRoot->hasField(fields[i])) { QLOG() << "PROJECTION: not covered cuz doesn't have field " << fields[i] << endl; covered = false; break; } } QLOG() << "PROJECTION: is covered?: = " << covered << endl; // If any field is missing from the list of fields the projection wants, // a fetch is required. if (!covered) { FetchNode* fetch = new FetchNode(); fetch->children.push_back(solnRoot); solnRoot = fetch; } } // We now know we have whatever data is required for the projection. ProjectionNode* projNode = new ProjectionNode(); projNode->children.push_back(solnRoot); projNode->fullExpression = query.root(); projNode->projection = query.getParsed().getProj(); solnRoot = projNode; } else { // If there's no projection, we must fetch, as the user wants the entire doc. if (!solnRoot->fetched()) { FetchNode* fetch = new FetchNode(); fetch->children.push_back(solnRoot); solnRoot = fetch; } } if (0 != query.getParsed().getSkip()) { SkipNode* skip = new SkipNode(); skip->skip = query.getParsed().getSkip(); skip->children.push_back(solnRoot); solnRoot = skip; } // When there is both a blocking sort and a limit, the limit will // be enforced by the blocking sort. // Otherwise, we need to limit the results in the case of a hard limit // (ie. limit in raw query is negative) if (0 != query.getParsed().getNumToReturn() && !soln->hasSortStage && !query.getParsed().wantMore()) { LimitNode* limit = new LimitNode(); limit->limit = query.getParsed().getNumToReturn(); limit->children.push_back(solnRoot); solnRoot = limit; } soln->root.reset(solnRoot); return soln.release(); }
// static QuerySolutionNode* QueryPlannerAnalysis::analyzeSort(const CanonicalQuery& query, const QueryPlannerParams& params, QuerySolutionNode* solnRoot, bool* blockingSortOut) { *blockingSortOut = false; const BSONObj& sortObj = query.getParsed().getSort(); if (sortObj.isEmpty()) { return solnRoot; } // TODO: We could check sortObj for any projections other than :1 and :-1 // and short-cut some of this. // If the sort is $natural, we ignore it, assuming that the caller has detected that and // outputted a collscan to satisfy the desired order. BSONElement natural = sortObj.getFieldDotted("$natural"); if (!natural.eoo()) { return solnRoot; } // See if solnRoot gives us the sort. If so, we're done. BSONObjSet sorts = solnRoot->getSort(); // If the sort we want is in the set of sort orders provided already, bail out. if (sorts.end() != sorts.find(sortObj)) { return solnRoot; } // Sort is not provided. See if we provide the reverse of our sort pattern. // If so, we can reverse the scan direction(s). BSONObj reverseSort = QueryPlannerCommon::reverseSortObj(sortObj); if (sorts.end() != sorts.find(reverseSort)) { QueryPlannerCommon::reverseScans(solnRoot); QLOG() << "Reversing ixscan to provide sort. Result: " << solnRoot->toString() << endl; return solnRoot; } // Sort not provided, can't reverse scans to get the sort. One last trick: We can "explode" // index scans over point intervals to an OR of sub-scans in order to pull out a sort. // Let's try this. if (explodeForSort(query, params, &solnRoot)) { return solnRoot; } // If we're here, we need to add a sort stage. // If we're not allowed to put a blocking sort in, bail out. if (params.options & QueryPlannerParams::NO_BLOCKING_SORT) { delete solnRoot; return NULL; } // Add a fetch stage so we have the full object when we hit the sort stage. XXX TODO: Can // we pull values out of the key and if so in what cases? 
(covered_index_sort_3.js) if (!solnRoot->fetched()) { FetchNode* fetch = new FetchNode(); fetch->children.push_back(solnRoot); solnRoot = fetch; } // And build the full sort stage. SortNode* sort = new SortNode(); sort->pattern = sortObj; sort->query = query.getParsed().getFilter(); // When setting the limit on the sort, we need to consider both // the limit N and skip count M. The sort should return an ordered list // N + M items so that the skip stage can discard the first M results. if (0 != query.getParsed().getNumToReturn()) { sort->limit = query.getParsed().getNumToReturn() + query.getParsed().getSkip(); } else { sort->limit = 0; } sort->children.push_back(solnRoot); solnRoot = sort; *blockingSortOut = true; return solnRoot; }
bool QueryPlannerAnalysis::explodeForSort(const CanonicalQuery& query, const QueryPlannerParams& params, QuerySolutionNode** solnRoot) { vector<QuerySolutionNode*> leafNodes; if (!structureOKForExplode(*solnRoot)) { return false; } getLeafNodes(*solnRoot, &leafNodes); const BSONObj& desiredSort = query.getParsed().getSort(); // How many scan leaves will result from our expansion? size_t totalNumScans = 0; // The value of entry i is how many scans we want to blow up for leafNodes[i]. // We calculate this in the loop below and might as well reuse it if we blow up // that scan. vector<size_t> fieldsToExplode; // The sort order we're looking for has to possibly be provided by each of the index scans // upon explosion. for (size_t i = 0; i < leafNodes.size(); ++i) { // We can do this because structureOKForExplode is only true if the leaves are index // scans. IndexScanNode* isn = static_cast<IndexScanNode*>(leafNodes[i]); const IndexBounds& bounds = isn->bounds; // Not a point interval prefix, can't try to rewrite. if (bounds.isSimpleRange) { return false; } // How many scans will we create if we blow up this ixscan? size_t numScans = 1; // Skip every field that is a union of point intervals and build the resulting sort // order from the remaining fields. BSONObjIterator kpIt(isn->indexKeyPattern); size_t boundsIdx = 0; while (kpIt.more()) { const OrderedIntervalList& oil = bounds.fields[boundsIdx]; if (!isUnionOfPoints(oil)) { break; } numScans *= oil.intervals.size(); kpIt.next(); ++boundsIdx; } // There's no sort order left to gain by exploding. Just go home. TODO: verify nothing // clever we can do here. if (!kpIt.more()) { return false; } // The rest of the fields define the sort order we could obtain by exploding // the bounds. BSONObjBuilder resultingSortBob; while (kpIt.more()) { resultingSortBob.append(kpIt.next()); } // See if it's the order we're looking for. 
BSONObj possibleSort = resultingSortBob.obj(); if (0 != possibleSort.woCompare(desiredSort)) { return false; } // Do some bookkeeping to see how many ixscans we'll create total. totalNumScans += numScans; // And for this scan how many fields we expand. fieldsToExplode.push_back(boundsIdx); } // Too many ixscans spoil the performance. if (totalNumScans > QueryPlannerAnalysis::kMaxScansToExplode) { QLOG() << "Could expand ixscans to pull out sort order but resulting scan count" << "(" << totalNumScans << ") is too high."; return false; } // If we're here, we can (probably? depends on how restrictive the structure check is) // get our sort order via ixscan blow-up. for (size_t i = 0; i < leafNodes.size(); ++i) { IndexScanNode* isn = static_cast<IndexScanNode*>(leafNodes[i]); QuerySolutionNode* newNode = explodeScan(isn, desiredSort, fieldsToExplode[i]); // Replace 'isn' with 'newNode' replaceNodeInTree(solnRoot, isn, newNode); // And get rid of the old data access node. delete isn; } return true; }
// static void QueryPlanner::plan(const CanonicalQuery& query, const QueryPlannerParams& params, vector<QuerySolution*>* out) { QLOG() << "=============================\n" << "Beginning planning, options = " << optionString(params.options) << endl << "Canonical query:\n" << query.toString() << endl << "=============================" << endl; // The shortcut formerly known as IDHACK. See if it's a simple _id query. If so we might // just make an ixscan over the _id index and bypass the rest of planning entirely. if (!query.getParsed().isExplain() && !query.getParsed().showDiskLoc() && isSimpleIdQuery(query.getParsed().getFilter()) && !query.getParsed().hasOption(QueryOption_CursorTailable)) { // See if we can find an _id index. for (size_t i = 0; i < params.indices.size(); ++i) { if (isIdIndex(params.indices[i].keyPattern)) { const IndexEntry& index = params.indices[i]; QLOG() << "IDHACK using index " << index.toString() << endl; // If so, we make a simple scan to find the doc. IndexScanNode* isn = new IndexScanNode(); isn->indexKeyPattern = index.keyPattern; isn->indexIsMultiKey = index.multikey; isn->direction = 1; isn->bounds.isSimpleRange = true; BSONObj key = getKeyFromQuery(index.keyPattern, query.getParsed().getFilter()); isn->bounds.startKey = isn->bounds.endKey = key; isn->bounds.endKeyInclusive = true; isn->computeProperties(); QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(query, params, isn); if (NULL != soln) { out->push_back(soln); QLOG() << "IDHACK solution is:\n" << (*out)[0]->toString() << endl; // And that's it. return; } } } } for (size_t i = 0; i < params.indices.size(); ++i) { QLOG() << "idx " << i << " is " << params.indices[i].toString() << endl; } bool canTableScan = !(params.options & QueryPlannerParams::NO_TABLE_SCAN); // If the query requests a tailable cursor, the only solution is a collscan + filter with // tailable set on the collscan. TODO: This is a policy departure. 
Previously I think you // could ask for a tailable cursor and it just tried to give you one. Now, we fail if we // can't provide one. Is this what we want? if (query.getParsed().hasOption(QueryOption_CursorTailable)) { if (!QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR) && canTableScan) { QuerySolution* soln = buildCollscanSoln(query, true, params); if (NULL != soln) { out->push_back(soln); } } return; } // The hint can be $natural: 1. If this happens, output a collscan. It's a weird way of // saying "table scan for two, please." if (!query.getParsed().getHint().isEmpty()) { BSONElement natural = query.getParsed().getHint().getFieldDotted("$natural"); if (!natural.eoo()) { QLOG() << "forcing a table scan due to hinted $natural\n"; if (canTableScan) { QuerySolution* soln = buildCollscanSoln(query, false, params); if (NULL != soln) { out->push_back(soln); } } return; } } // NOR and NOT we can't handle well with indices. If we see them here, they weren't // rewritten to remove the negation. Just output a collscan for those. if (QueryPlannerCommon::hasNode(query.root(), MatchExpression::NOT) || QueryPlannerCommon::hasNode(query.root(), MatchExpression::NOR)) { // If there's a near predicate, we can't handle this. // TODO: Should canonicalized query detect this? if (QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR)) { warning() << "Can't handle NOT/NOR with GEO_NEAR"; return; } QLOG() << "NOT/NOR in plan, just outtping a collscan\n"; if (canTableScan) { QuerySolution* soln = buildCollscanSoln(query, false, params); if (NULL != soln) { out->push_back(soln); } } return; } // Figure out what fields we care about. unordered_set<string> fields; QueryPlannerIXSelect::getFields(query.root(), "", &fields); for (unordered_set<string>::const_iterator it = fields.begin(); it != fields.end(); ++it) { QLOG() << "predicate over field " << *it << endl; } // Filter our indices so we only look at indices that are over our predicates. 
vector<IndexEntry> relevantIndices; // Hints require us to only consider the hinted index. BSONObj hintIndex = query.getParsed().getHint(); // Snapshot is a form of a hint. If snapshot is set, try to use _id index to make a real // plan. If that fails, just scan the _id index. if (query.getParsed().isSnapshot()) { // Find the ID index in indexKeyPatterns. It's our hint. for (size_t i = 0; i < params.indices.size(); ++i) { if (isIdIndex(params.indices[i].keyPattern)) { hintIndex = params.indices[i].keyPattern; break; } } } size_t hintIndexNumber = numeric_limits<size_t>::max(); if (!hintIndex.isEmpty()) { // Sigh. If the hint is specified it might be using the index name. BSONElement firstHintElt = hintIndex.firstElement(); if (str::equals("$hint", firstHintElt.fieldName()) && String == firstHintElt.type()) { string hintName = firstHintElt.String(); for (size_t i = 0; i < params.indices.size(); ++i) { if (params.indices[i].name == hintName) { QLOG() << "hint by name specified, restricting indices to " << params.indices[i].keyPattern.toString() << endl; relevantIndices.clear(); relevantIndices.push_back(params.indices[i]); hintIndexNumber = i; hintIndex = params.indices[i].keyPattern; break; } } } else { for (size_t i = 0; i < params.indices.size(); ++i) { if (0 == params.indices[i].keyPattern.woCompare(hintIndex)) { relevantIndices.clear(); relevantIndices.push_back(params.indices[i]); QLOG() << "hint specified, restricting indices to " << hintIndex.toString() << endl; hintIndexNumber = i; break; } } } if (hintIndexNumber == numeric_limits<size_t>::max()) { // This is supposed to be an error. 
warning() << "Can't find hint for " << hintIndex.toString(); return; } } else { QLOG() << "Finding relevant indices\n"; QueryPlannerIXSelect::findRelevantIndices(fields, params.indices, &relevantIndices); } for (size_t i = 0; i < relevantIndices.size(); ++i) { QLOG() << "relevant idx " << i << " is " << relevantIndices[i].toString() << endl; } // Figure out how useful each index is to each predicate. // query.root() is now annotated with RelevantTag(s). QueryPlannerIXSelect::rateIndices(query.root(), "", relevantIndices); QLOG() << "rated tree" << endl; QLOG() << query.root()->toString() << endl; // If there is a GEO_NEAR it must have an index it can use directly. // XXX: move into data access? MatchExpression* gnNode = NULL; if (QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR, &gnNode)) { // No index for GEO_NEAR? No query. RelevantTag* tag = static_cast<RelevantTag*>(gnNode->getTag()); if (0 == tag->first.size() && 0 == tag->notFirst.size()) { return; } GeoNearMatchExpression* gnme = static_cast<GeoNearMatchExpression*>(gnNode); vector<size_t> newFirst; // 2d + GEO_NEAR is annoying. Because 2d's GEO_NEAR isn't streaming we have to embed // the full query tree inside it as a matcher. for (size_t i = 0; i < tag->first.size(); ++i) { // GEO_NEAR has a non-2d index it can use. We can deal w/that in normal planning. if (!is2DIndex(relevantIndices[tag->first[i]].keyPattern)) { newFirst.push_back(i); continue; } // If we're here, GEO_NEAR has a 2d index. We create a 2dgeonear plan with the // entire tree as a filter, if possible. GeoNear2DNode* solnRoot = new GeoNear2DNode(); solnRoot->nq = gnme->getData(); if (MatchExpression::GEO_NEAR != query.root()->matchType()) { // root is an AND, clone and delete the GEO_NEAR child. 
MatchExpression* filterTree = query.root()->shallowClone(); verify(MatchExpression::AND == filterTree->matchType()); bool foundChild = false; for (size_t i = 0; i < filterTree->numChildren(); ++i) { if (MatchExpression::GEO_NEAR == filterTree->getChild(i)->matchType()) { foundChild = true; filterTree->getChildVector()->erase(filterTree->getChildVector()->begin() + i); break; } } verify(foundChild); solnRoot->filter.reset(filterTree); } solnRoot->numWanted = query.getParsed().getNumToReturn(); if (0 == solnRoot->numWanted) { solnRoot->numWanted = 100; } solnRoot->indexKeyPattern = relevantIndices[tag->first[i]].keyPattern; // Remove the 2d index. 2d can only be the first field, and we know there is // only one GEO_NEAR, so we don't care if anyone else was assigned it; it'll // only be first for gnNode. tag->first.erase(tag->first.begin() + i); QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(query, params, solnRoot); if (NULL != soln) { out->push_back(soln); } } // Continue planning w/non-2d indices tagged for this pred. tag->first.swap(newFirst); if (0 == tag->first.size() && 0 == tag->notFirst.size()) { return; } } // Likewise, if there is a TEXT it must have an index it can use directly. MatchExpression* textNode; if (QueryPlannerCommon::hasNode(query.root(), MatchExpression::TEXT, &textNode)) { RelevantTag* tag = static_cast<RelevantTag*>(textNode->getTag()); if (0 == tag->first.size() && 0 == tag->notFirst.size()) { return; } } // If we have any relevant indices, we try to create indexed plans. if (0 < relevantIndices.size()) { // The enumerator spits out trees tagged with IndexTag(s). PlanEnumerator isp(query.root(), &relevantIndices); isp.init(); MatchExpression* rawTree; while (isp.getNext(&rawTree)) { QLOG() << "about to build solntree from tagged tree:\n" << rawTree->toString() << endl; // This can fail if enumeration makes a mistake. 
QuerySolutionNode* solnRoot = QueryPlannerAccess::buildIndexedDataAccess(query, rawTree, false, relevantIndices); if (NULL == solnRoot) { continue; } QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(query, params, solnRoot); if (NULL != soln) { QLOG() << "Planner: adding solution:\n" << soln->toString() << endl; out->push_back(soln); } } } QLOG() << "Planner: outputted " << out->size() << " indexed solutions.\n"; // An index was hinted. If there are any solutions, they use the hinted index. If not, we // scan the entire index to provide results and output that as our plan. This is the // desired behavior when an index is hinted that is not relevant to the query. if (!hintIndex.isEmpty() && (0 == out->size())) { QuerySolution* soln = buildWholeIXSoln(params.indices[hintIndexNumber], query, params); if (NULL != soln) { QLOG() << "Planner: outputting soln that uses hinted index as scan." << endl; out->push_back(soln); } return; } // If a sort order is requested, there may be an index that provides it, even if that // index is not over any predicates in the query. // // XXX XXX: Can we do this even if the index is sparse? Might we miss things? if (!query.getParsed().getSort().isEmpty() && !QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR) && !QueryPlannerCommon::hasNode(query.root(), MatchExpression::TEXT)) { // See if we have a sort provided from an index already. bool usingIndexToSort = false; for (size_t i = 0; i < out->size(); ++i) { QuerySolution* soln = (*out)[i]; if (!soln->hasSortStage) { usingIndexToSort = true; break; } } if (!usingIndexToSort) { for (size_t i = 0; i < params.indices.size(); ++i) { const BSONObj& kp = params.indices[i].keyPattern; if (providesSort(query, kp)) { QLOG() << "Planner: outputting soln that uses index to provide sort." 
<< endl; QuerySolution* soln = buildWholeIXSoln(params.indices[i], query, params); if (NULL != soln) { out->push_back(soln); break; } } if (providesSort(query, QueryPlannerCommon::reverseSortObj(kp))) { QLOG() << "Planner: outputting soln that uses (reverse) index " << "to provide sort." << endl; QuerySolution* soln = buildWholeIXSoln(params.indices[i], query, params, -1); if (NULL != soln) { out->push_back(soln); break; } } } } } // TODO: Do we always want to offer a collscan solution? // XXX: currently disabling the always-use-a-collscan in order to find more planner bugs. if ( !QueryPlannerCommon::hasNode(query.root(), MatchExpression::GEO_NEAR) && !QueryPlannerCommon::hasNode(query.root(), MatchExpression::TEXT) && ((params.options & QueryPlannerParams::INCLUDE_COLLSCAN) || (0 == out->size() && canTableScan))) { QuerySolution* collscan = buildCollscanSoln(query, false, params); if (NULL != collscan) { out->push_back(collscan); QLOG() << "Planner: outputting a collscan:\n"; QLOG() << collscan->toString() << endl; } } }
/**
 * Creates a Runner for 'rawCanonicalQuery' over 'collection' and stores it in '*out'.
 *
 * Which runner is produced depends on what is available, tried in this order: an
 * EOFRunner when 'collection' is NULL, an IDHackRunner when the query supports it and an
 * _id index exists, a runner built from the plan cache, a SubplanRunner, and finally
 * whatever getRunnerAlwaysPlan() builds.
 *
 * Returns BadValue when a tailable cursor is requested on a non-capped collection or
 * with a sort other than {$natural: 1}.
 *
 * 'rawCanonicalQuery' must not be NULL.  It is held by an auto_ptr for the duration of
 * the call and released to whichever runner or helper ends up handling the query.
 */
Status getRunner(Collection* collection,
                 CanonicalQuery* rawCanonicalQuery,
                 Runner** out,
                 size_t plannerOptions) {
    verify(rawCanonicalQuery);
    auto_ptr<CanonicalQuery> ownedQuery(rawCanonicalQuery);

    // Internal clients may call us for a collection that does not exist.
    if (!collection) {
        const string& nsName = ownedQuery->ns();
        LOG(2) << "Collection " << nsName << " does not exist."
               << " Using EOF runner: " << ownedQuery->toStringShort();
        *out = new EOFRunner(ownedQuery.release(), nsName);
        return Status::OK();
    }

    // The idhack runner needs both a supported query shape and an _id index.
    const bool idHackUsable = IDHackRunner::supportsQuery(*ownedQuery)
                              && collection->getIndexCatalog()->findIdIndex();
    if (idHackUsable) {
        LOG(2) << "Using idhack: " << ownedQuery->toStringShort();
        *out = new IDHackRunner(collection, ownedQuery.release());
        return Status::OK();
    }

    // Tailable cursors: the collection must be capped and any sort must be {$natural: 1}.
    const bool tailableRequested =
        ownedQuery->getParsed().hasOption(QueryOption_CursorTailable);
    if (tailableRequested) {
        if (!collection->isCapped()) {
            return Status(ErrorCodes::BadValue,
                          "error processing query: " + ownedQuery->toString() +
                          " tailable cursor requested on non capped collection");
        }

        const BSONObj expectedSort = BSON("$natural" << 1);
        const BSONObj& actualSort = ownedQuery->getParsed().getSort();
        const bool sortAcceptable = actualSort.isEmpty() || (actualSort == expectedSort);
        if (!sortAcceptable) {
            return Status(ErrorCodes::BadValue,
                          "error processing query: " + ownedQuery->toString() +
                          " invalid sort specified for tailable cursor: " +
                          actualSort.toString());
        }
    }

    // The planner parameters are shared by the cached and the non-cached paths.
    QueryPlannerParams plannerParams;
    plannerParams.options = plannerOptions;
    fillOutPlannerParams(collection, ownedQuery.get(), &plannerParams);

    // A not-OK status here only means there was no usable cache entry; we keep going.
    Status cacheStatus = getRunnerFromCache(ownedQuery.get(), collection, plannerParams, out);
    if (cacheStatus.isOK()) {
        // We got a cached runner, so give up our hold on the query.
        ownedQuery.release();
        return cacheStatus;
    }

    const bool planChildrenSeparately = internalQueryPlanOrChildrenIndependently
                                        && SubplanRunner::canUseSubplanRunner(*ownedQuery);
    if (!planChildrenSeparately) {
        return getRunnerAlwaysPlan(collection, ownedQuery.release(), plannerParams, out);
    }

    QLOG() << "Running query as sub-queries: " << ownedQuery->toStringShort();
    LOG(2) << "Running query as sub-queries: " << ownedQuery->toStringShort();

    SubplanRunner* subplanRunner;
    Status makeStatus = SubplanRunner::make(collection,
                                            plannerParams,
                                            ownedQuery.release(),
                                            &subplanRunner);
    if (!makeStatus.isOK()) {
        return makeStatus;
    }

    *out = subplanRunner;
    return Status::OK();
}
bool MultiPlanRunner::pickBestPlan(size_t* out, BSONObj* objOut) { static const int timesEachPlanIsWorked = 100; // Run each plan some number of times. for (int i = 0; i < timesEachPlanIsWorked; ++i) { bool moreToDo = workAllPlans(objOut); if (!moreToDo) { break; } } if (_failure || _killed) { return false; } // After picking best plan, ranking will own plan stats from // candidate solutions (winner and losers). std::auto_ptr<PlanRankingDecision> ranking(new PlanRankingDecision); size_t bestChild = PlanRanker::pickBestPlan(_candidates, ranking.get()); // Copy candidate order. We will need this to sort candidate stats for explain // after transferring ownership of 'ranking' to plan cache. std::vector<size_t> candidateOrder = ranking->candidateOrder; // Run the best plan. Store it. _bestPlan.reset(new PlanExecutor(_candidates[bestChild].ws, _candidates[bestChild].root)); _bestPlan->setYieldPolicy(_policy); _alreadyProduced = _candidates[bestChild].results; _bestSolution.reset(_candidates[bestChild].solution); QLOG() << "Winning solution:\n" << _bestSolution->toString() << endl; size_t backupChild = bestChild; if (_bestSolution->hasBlockingStage && (0 == _alreadyProduced.size())) { QLOG() << "Winner has blocking stage, looking for backup plan...\n"; for (size_t i = 0; i < _candidates.size(); ++i) { if (!_candidates[i].solution->hasBlockingStage) { QLOG() << "Candidate " << i << " is backup child\n"; backupChild = i; _backupSolution = _candidates[i].solution; _backupAlreadyProduced = _candidates[i].results; _backupPlan = new PlanExecutor(_candidates[i].ws, _candidates[i].root); _backupPlan->setYieldPolicy(_policy); break; } } } // Store the choice we just made in the cache. We do // not cache the query if: // 1) The query is of a type that is not safe to cache, or // 2) the winning plan did not actually produce any results, // without hitting EOF. In this case, we have no information to // suggest that this plan is good. 
const PlanStageStats* bestStats = ranking->stats.vector()[0]; if (PlanCache::shouldCacheQuery(*_query) && (!_alreadyProduced.empty() || bestStats->common.isEOF)) { Database* db = cc().database(); verify(NULL != db); Collection* collection = db->getCollection(_query->ns()); verify(NULL != collection); PlanCache* cache = collection->infoCache()->getPlanCache(); // Create list of candidate solutions for the cache with // the best solution at the front. std::vector<QuerySolution*> solutions; // Generate solutions and ranking decisions sorted by score. for (size_t orderingIndex = 0; orderingIndex < candidateOrder.size(); ++orderingIndex) { // index into candidates/ranking size_t i = candidateOrder[orderingIndex]; solutions.push_back(_candidates[i].solution); } // Check solution cache data. Do not add to cache if // we have any invalid SolutionCacheData data. // XXX: One known example is 2D queries bool validSolutions = true; for (size_t i = 0; i < solutions.size(); ++i) { if (NULL == solutions[i]->cacheData.get()) { QLOG() << "Not caching query because this solution has no cache data: " << solutions[i]->toString(); validSolutions = false; break; } } if (validSolutions) { cache->add(*_query, solutions, ranking.release()); } } // Clear out the candidate plans, leaving only stats as we're all done w/them. // Traverse candidate plans in order or score for (size_t orderingIndex = 0; orderingIndex < candidateOrder.size(); ++orderingIndex) { // index into candidates/ranking size_t i = candidateOrder[orderingIndex]; if (i == bestChild) { continue; } if (i == backupChild) { continue; } delete _candidates[i].solution; // Remember the stats for the candidate plan because we always show it on an // explain. (The {verbose:false} in explain() is client-side trick; we always // generate a "verbose" explain.) PlanStageStats* stats = _candidates[i].root->getStats(); if (stats) { _candidateStats.push_back(stats); } delete _candidates[i].root; // ws must die after the root. 
delete _candidates[i].ws; } _candidates.clear(); if (NULL != out) { *out = bestChild; } return true; }
/**
 * Produces the next result of the winning plan.
 *
 * Picks the best plan first if that has not happened yet.  Results buffered during plan
 * selection are returned before the best plan is asked for more; a buffered result that
 * cannot supply what the caller asked for ('objOut' and/or 'dlOut') is dropped.
 *
 * If the best plan reports an error while a backup solution is held, the query is
 * removed from the plan cache, the backup becomes the best plan and the call is retried.
 * If the best plan advances while a backup solution is held, the backup is discarded.
 *
 * Returns RUNNER_DEAD if killed, RUNNER_ERROR if a failure was recorded, and otherwise
 * RUNNER_ADVANCED for a buffered result or whatever state the best plan reports.
 */
Runner::RunnerState MultiPlanRunner::getNext(BSONObj* objOut, DiskLoc* dlOut) {
    if (_killed) {
        return Runner::RUNNER_DEAD;
    }
    if (_failure) {
        return Runner::RUNNER_ERROR;
    }

    // Lazily choose the winner on the first call.
    if (NULL == _bestPlan && !pickBestPlan(NULL, objOut)) {
        verify(_failure || _killed);
        if (_killed) {
            return Runner::RUNNER_DEAD;
        }
        if (_failure) {
            return Runner::RUNNER_ERROR;
        }
    }

    // Drain results that were buffered while the plans were being compared.
    while (!_alreadyProduced.empty()) {
        const WorkingSetID bufferedId = _alreadyProduced.front();
        _alreadyProduced.pop_front();

        WorkingSetMember* wsm = _bestPlan->getWorkingSet()->get(bufferedId);

        // Becomes false as soon as the member lacks something the caller asked for.
        // Note that this copies code from PlanExecutor.
        bool satisfiesCaller = true;

        if (NULL != objOut) {
            if (WorkingSetMember::LOC_AND_IDX == wsm->state) {
                // Key data is only usable when there is exactly one key.
                if (1 == wsm->keyData.size()) {
                    *objOut = wsm->keyData[0].keyData;
                }
                else {
                    satisfiesCaller = false;
                }
            }
            else if (wsm->hasObj()) {
                *objOut = wsm->obj;
            }
            else {
                satisfiesCaller = false;
            }
        }

        if (satisfiesCaller && NULL != dlOut) {
            if (wsm->hasLoc()) {
                *dlOut = wsm->loc;
            }
            else {
                satisfiesCaller = false;
            }
        }

        // Whether it is returned or dropped, the member leaves the working set.
        _bestPlan->getWorkingSet()->free(bufferedId);

        if (satisfiesCaller) {
            return Runner::RUNNER_ADVANCED;
        }
    }

    const RunnerState planState = _bestPlan->getNext(objOut, dlOut);
    const bool holdingBackup = (NULL != _backupSolution);

    if (holdingBackup && Runner::RUNNER_ERROR == planState) {
        QLOG() << "Best plan errored out switching to backup\n";

        // The failed solution is evicted from the cache before we switch to the backup.
        //
        // XXX: Instead of uncaching we should find a way for the
        // cached plan runner to fall back on a different solution
        // if the best solution fails. Alternatively we could try to
        // defer cache insertion to be after the first produced result.
        Database* db = cc().database();
        verify(NULL != db);
        Collection* collection = db->getCollection(_query->ns());
        verify(NULL != collection);
        PlanCache* planCache = collection->infoCache()->getPlanCache();
        planCache->remove(*_query);

        // Promote the backup plan and its buffered results, then try again.
        _bestPlan.reset(_backupPlan);
        _backupPlan = NULL;
        _bestSolution.reset(_backupSolution);
        _backupSolution = NULL;
        _alreadyProduced = _backupAlreadyProduced;
        return getNext(objOut, dlOut);
    }

    if (holdingBackup && Runner::RUNNER_ADVANCED == planState) {
        QLOG() << "Best plan had a blocking sort, became unblocked, deleting backup plan\n";
        delete _backupSolution;
        delete _backupPlan;
        _backupSolution = NULL;
        _backupPlan = NULL;
        // TODO: free from WS?
        _backupAlreadyProduced.clear();
    }

    return planState;
}
/**
 * Creates a PlanExecutor for 'rawCanonicalQuery' over 'collection' and stores it in
 * '*out'.
 *
 * Tried in this order: an EOF stage when 'collection' is NULL, the idhack path when the
 * query supports it and an _id index exists, a plan built from a cached solution,
 * subplanning, and finally getExecutorAlwaysPlan().  A failed cache lookup, a failed
 * planFromCache() or a failed SubplanStage::make() is not an error; the next option is
 * tried instead.
 *
 * Returns BadValue when a tailable cursor is requested on a non-capped collection or
 * with a sort other than {$natural: 1}.
 *
 * 'rawCanonicalQuery' must not be NULL.  It is held by an auto_ptr for the duration of
 * the call and released to whichever executor or helper ends up handling the query.
 */
Status getExecutor(OperationContext* txn,
                   Collection* collection,
                   CanonicalQuery* rawCanonicalQuery,
                   PlanExecutor** out,
                   size_t plannerOptions) {
    invariant(rawCanonicalQuery);
    auto_ptr<CanonicalQuery> ownedQuery(rawCanonicalQuery);

    // Internal clients may call us for a collection that does not exist.
    if (!collection) {
        const string& nsName = ownedQuery->ns();
        LOG(2) << "Collection " << nsName << " does not exist."
               << " Using EOF runner: " << ownedQuery->toStringShort();
        EOFStage* eofRoot = new EOFStage();
        WorkingSet* eofWs = new WorkingSet();
        *out = new PlanExecutor(eofWs, eofRoot, ownedQuery.release(), collection);
        return Status::OK();
    }

    // The planner parameters are shared by the cached and the non-cached paths.
    QueryPlannerParams plannerParams;
    plannerParams.options = plannerOptions;
    fillOutPlannerParams(collection, ownedQuery.get(), &plannerParams);

    // The idhack path needs both a supported query shape and an _id index.
    const bool idHackUsable = IDHackStage::supportsQuery(*ownedQuery.get())
                              && collection->getIndexCatalog()->findIdIndex();
    if (idHackUsable) {
        return getExecutorIDHack(txn, collection, ownedQuery.release(), plannerParams, out);
    }

    // Tailable cursors: the collection must be capped and any sort must be {$natural: 1}.
    const bool tailableRequested =
        ownedQuery->getParsed().hasOption(QueryOption_CursorTailable);
    if (tailableRequested) {
        if (!collection->isCapped()) {
            return Status(ErrorCodes::BadValue,
                          "error processing query: " + ownedQuery->toString() +
                          " tailable cursor requested on non capped collection");
        }

        const BSONObj expectedSort = BSON("$natural" << 1);
        const BSONObj& actualSort = ownedQuery->getParsed().getSort();
        const bool sortAcceptable = actualSort.isEmpty() || (actualSort == expectedSort);
        if (!sortAcceptable) {
            return Status(ErrorCodes::BadValue,
                          "error processing query: " + ownedQuery->toString() +
                          " invalid sort specified for tailable cursor: " +
                          actualSort.toString());
        }
    }

    // Ask the plan cache for a solution to this query.
    CachedSolution* rawCS;
    if (PlanCache::shouldCacheQuery(*ownedQuery) &&
        collection->infoCache()->getPlanCache()->get(*ownedQuery.get(), &rawCS).isOK()) {
        // We have a CachedSolution; the planner turns it into a QuerySolution (plus an
        // optional backup).
        boost::scoped_ptr<CachedSolution> cachedSoln(rawCS);
        QuerySolution *qs, *backupQs;
        Status fromCacheStatus = QueryPlanner::planFromCache(*ownedQuery.get(),
                                                             plannerParams,
                                                             *cachedSoln,
                                                             &qs,
                                                             &backupQs);

        if (fromCacheStatus.isOK()) {
            // One working set is shared by the primary and the backup stage trees.
            WorkingSet* sharedWs = new WorkingSet();

            PlanStage* root;
            PlanStage* backupRoot = NULL;
            verify(StageBuilder::build(txn, collection, *qs, sharedWs, &root));

            const bool countRequested =
                (plannerParams.options & QueryPlannerParams::PRIVATE_IS_COUNT);
            if (countRequested && turnIxscanIntoCount(qs)) {
                LOG(2) << "Using fast count: " << ownedQuery->toStringShort()
                       << ", planSummary: " << getPlanSummary(*qs);

                // The backup solution is not used on the fast count path.
                delete backupQs;
            }
            else if (NULL != backupQs) {
                verify(StageBuilder::build(txn, collection, *backupQs, sharedWs, &backupRoot));
            }

            // Wrap the stage tree(s) in a CachedPlanStage.
            root = new CachedPlanStage(collection, ownedQuery.get(), root, backupRoot);

            *out = new PlanExecutor(sharedWs, root, qs, ownedQuery.release(), collection);
            return Status::OK();
        }
    }

    if (internalQueryPlanOrChildrenIndependently
        && SubplanStage::canUseSubplanning(*ownedQuery)) {

        QLOG() << "Running query as sub-queries: " << ownedQuery->toStringShort();

        auto_ptr<WorkingSet> subplanWs(new WorkingSet());

        SubplanStage* subplanRoot;
        Status makeStatus = SubplanStage::make(txn, collection, subplanWs.get(), plannerParams,
                                               ownedQuery.get(), &subplanRoot);
        if (!makeStatus.isOK()) {
            // Not fatal: log it and fall through to regular planning.
            QLOG() << "Subplanner: " << makeStatus.reason();
        }
        else {
            LOG(2) << "Running query as sub-queries: " << ownedQuery->toStringShort();
            *out = new PlanExecutor(subplanWs.release(), subplanRoot, ownedQuery.release(),
                                    collection);
            return Status::OK();
        }
    }

    return getExecutorAlwaysPlan(txn, collection, ownedQuery.release(), plannerParams, out);
}
/**
 * getMore entry point, invoked from db/instance.cpp.
 *
 * Looks up and pins the cursor 'cursorid', drains up to one batch of results from its
 * PlanExecutor into a reply buffer, fills in the reply header and returns a view of it.
 *
 * ntoreturn          - when non-zero, the batch stops after this many results.
 * cursorid           - cursor to advance.  The reply carries cursor id 0 when the cursor was
 *                      not found or is not kept alive after this call.
 * pass               - when QueryOption_AwaitData is in use, the caller makes repeated calls
 *                      while this method returns an empty result, incrementing pass on each
 *                      call.  pass == 0 therefore denotes the first "attempt", before any
 *                      'awaiting'.
 * exhaust            - out: cleared on entry; set from QueryOption_Exhaust if the cursor is
 *                      kept.
 * isCursorAuthorized - out: set to true once the cursor's namespace matched 'ns'.
 * fromDBDirectClient - when true, the RecoveryUnit is not swapped between the cursor and
 *                      the OperationContext.
 *
 * Returns a NULL view when a tailable awaitData cursor sits at EOF with zero results and
 * pass < 1000, so that the caller can wait and try again.
 */
QueryResult::View newGetMore(OperationContext* txn,
                             const char* ns,
                             int ntoreturn,
                             long long cursorid,
                             CurOp& curop,
                             int pass,
                             bool& exhaust,
                             bool* isCursorAuthorized,
                             bool fromDBDirectClient) {

    // Test hook: deliberately trip an invariant when the fail point is switched on.
    if (MONGO_FAIL_POINT(failReceivedGetmore)) {
        invariant(0);
    }

    exhaust = false;

    // Take the read lock.  Held through a scoped_ptr so it can be dropped early below.
    const NamespaceString nss(ns);
    scoped_ptr<AutoGetCollectionForRead> readCtx(new AutoGetCollectionForRead(txn, nss));
    Collection* collection = readCtx->getCollection();
    uassert(17356, "collection dropped between getMore calls", collection);

    QLOG() << "Running getMore, cursorid: " << cursorid << endl;

    // Verify that reads may be served by this node.  No query object is available here (it
    // would be needed to look at the SlaveOK option), so the only state in which reads are
    // allowed is PRIMARY (or master in master/slave).  uasserts if reads are not okay.
    Status readStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
            txn,
            nss,
            true);
    uassertStatusOK(readStatus);

    // The pin looks the ClientCursor up and, if one exists, bumps its pin value so that it
    // neither times out nor gets deleted while we are using it.
    ClientCursorPin pin(collection, cursorid);
    ClientCursor* cursor = pin.c();

    // Unless the call comes from DBDirectClient, the RecoveryUnit that created the
    // execution machinery inside the cursor is associated with our OperationContext.
    // RAII guarantees that, even if we leave in a disorderly fashion, later getMore calls
    // keep working and the OperationContext is left with a valid RecoveryUnit.
    //
    // Must be destroyed before the ClientCursor is destroyed.
    std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

    // Values for the QueryResult header.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    const int kInitialBufSize =
        512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

    BufBuilder replyBuf(kInitialBufSize);
    replyBuf.skip(sizeof(QueryResult::Value));

    if (NULL != cursor) {
        // Guard against a spoofed ns that differs from the one the cursor was opened on.
        uassert(17011, "auth error", str::equals(ns, cursor->ns().c_str()));
        *isCursorAuthorized = true;

        // Put the right RecoveryUnit in place.
        if (!fromDBDirectClient) {
            if (!cursor->hasRecoveryUnit()) {
                // The cursor has none stashed: hand it a brand new one.
                cursor->setOwnedRecoveryUnit(
                    getGlobalEnvironment()->getGlobalStorageEngine()->newRecoveryUnit(txn));
            }
            // Exchange RecoveryUnit(s) between the ClientCursor and the OperationContext.
            ruSwapper.reset(new ScopedRecoveryUnitSwapper(cursor, txn));
        }
        else if (cursor->hasRecoveryUnit()) {
            invariant(txn->recoveryUnit() == cursor->getUnownedRecoveryUnit());
        }

        // The cursor is in use, so restart its idle timer.
        cursor->setIdleTime(0);

        // TODO: fail point?

        // Apply whatever time budget the originating operation left on the cursor.
        curop.setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros());
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        if (0 == pass) {
            cursor->updateSlaveLocation(txn, curop);
        }

        if (cursor->isAggCursor) {
            // Agg cursors take care of their own locking.
            readCtx.reset(); // unlocks
        }

        CollectionMetadataPtr collMetadata = cursor->getCollMetadata();

        // Most recent oplog time read; only maintained for oplog replay.
        OpTime lastOplogTs;

        // Index of the first result of this batch, reported in the reply.
        startingResult = cursor->pos();

        // The source of our results.
        PlanExecutor* exec = cursor->getExecutor();
        const int queryOptions = cursor->queryOptions();

        // Pull results until the executor stops advancing or the batch is full.
        exec->restoreState(txn);
        BSONObj doc;
        PlanExecutor::ExecState execState;
        for (;;) {
            execState = exec->getNext(&doc, NULL);
            if (PlanExecutor::ADVANCED != execState) {
                break;
            }

            // Copy the document into the reply and count it.
            replyBuf.appendBuf((void*)doc.objdata(), doc.objsize());
            ++numResults;

            // Track the slave's position in the oplog, if applicable.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement tsElt = doc["ts"];
                if (Date == tsElt.type() || Timestamp == tsElt.type()) {
                    lastOplogTs = tsElt._opTime();
                }
            }

            const bool reachedCount = ntoreturn && numResults >= ntoreturn;
            if (reachedCount || replyBuf.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        // The client cursor is kept whenever more results might follow, i.e. whenever
        // another getMore may arrive.  On EOF, on error, or when 'exec' is dead nothing
        // more will be produced, which is signalled to the client with a cursor id of 0.
        //
        // If, on the other hand, the batch simply filled up, the cursor is kept and a valid
        // cursor id is sent back -- even though the very next getNext(...) might turn out
        // to return EOF.
        bool keepCursor = false;

        if (PlanExecutor::DEAD == execState || PlanExecutor::EXEC_ERROR == execState) {
            // Errors are propagated to the caller.
            if (PlanExecutor::EXEC_ERROR == execState) {
                scoped_ptr<PlanStageStats> stats(exec->getStats());
                error() << "Plan executor error, stats: "
                        << Explain::statsToBSON(*stats);
                uasserted(17406, "getMore executor error: " +
                          WorkingSetCommon::toStatusString(doc));
            }

            // A dead executor cannot yield anything more; 'keepCursor' stays false.
            //
            // The old system killed tailable capped cursors at the cursorid level, making
            // the cursorid vanish.  Here they die and get cleaned up later (or time out),
            // so this is the place where the cursorid is reported as gone.
            if (0 == numResults) {
                resultFlags = ResultFlag_CursorNotFound;
            }
        }
        else if (PlanExecutor::IS_EOF == execState) {
            // EOF ends the cursor too, except for tailable cursors.
            keepCursor = queryOptions & QueryOption_CursorTailable;
        }
        else {
            verify(PlanExecutor::ADVANCED == execState);
            keepCursor = true;
        }

        if (keepCursor) {
            // Keep the ClientCursor cached.
            cursor->incPos(numResults);
            exec->saveState();
            QLOG() << "getMore saving client cursor ended with state "
                   << PlanExecutor::statestr(execState)
                   << endl;

            if (PlanExecutor::IS_EOF == execState
                    && (queryOptions & QueryOption_CursorTailable)) {
                if (!fromDBDirectClient) {
                    // Do not stash the RU; the next getMore obtains a fresh one.
                    ruSwapper.reset();
                    delete cursor->releaseOwnedRecoveryUnit();
                }

                if ((queryOptions & QueryOption_AwaitData)
                        && (numResults == 0)
                        && (pass < 1000)) {
                    // Hand control back to the AwaitData handling in receivedGetMore,
                    // which will call us again.
                    return NULL;
                }
            }

            // Record the slave's position in the oplog, if applicable.
            if ((queryOptions & QueryOption_OplogReplay) && !lastOplogTs.isNull()) {
                cursor->slaveReadTill(lastOplogTs);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // Roll the unused part of the time limit over to the cursor for future getMores.
            cursor->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
        else {
            ruSwapper.reset();
            pin.deleteUnderlying();

            // From here on both the cursor and its executor are gone.
            cursorid = 0;
            cursor = NULL;
            QLOG() << "getMore NOT saving client cursor, ended with state "
                   << PlanExecutor::statestr(execState)
                   << endl;
        }
    }
    else {
        // No such cursor.
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }

    // Fill in the reply header at the front of the buffer and detach the buffer from the
    // builder so that it outlives this function.
    QueryResult::View reply = replyBuf.buf();
    reply.msgdata().setLen(replyBuf.len());
    reply.msgdata().setOperation(opReply);
    reply.setResultFlags(resultFlags);
    reply.setCursorId(cursorid);
    reply.setStartingFrom(startingResult);
    reply.setNReturned(numResults);
    replyBuf.decouple();
    QLOG() << "getMore returned " << numResults << " results\n";
    return reply;
}
/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * Pins the cursor 'cursorid', pulls up to one batch of results out of its Runner into a
 * reply buffer and returns that buffer as a QueryResult.  The buffer is decoupled from its
 * builder before returning.
 *
 * ntoreturn          - when non-zero, the batch stops after this many results.
 * cursorid           - cursor to advance.  The reply carries cursor id 0 when the cursor was
 *                      not found or is not kept alive after this call.
 * pass               - retry counter: pass == 0 triggers the slave location update, and
 *                      awaitData retries are only requested while pass < 1000.
 * exhaust            - out: cleared on entry; set from QueryOption_Exhaust if the cursor is
 *                      kept.
 * isCursorAuthorized - out: set to true once the cursor's namespace matched 'ns'.
 *
 * Returns 0 (no reply) when a tailable awaitData cursor is at EOF with zero results and
 * pass < 1000; the cursor stays cached so the caller can try again.
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;
    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

    BufBuilder bb(bufSize);
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    QLOG() << "running getMore in new system, cursorid " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are not
    // passing in a query object (necessary to check SlaveOK query option), the only state where
    // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
    // reads are not okay.
    replVerifyReadsOk();

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Quote: check for spoofing of the ns such that it does not match the one originally
        // there for the cursor
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // Only the first attempt of a getMore (pass == 0) updates the slave location; later
        // passes are awaitData retries of the same request.
        if (0 == pass) {
            cc->updateSlaveLocation(curop);
        }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        Runner* runner = cc->getRunner();
        const int queryOptions = cc->queryOptions();

        // Get results out of the runner.
        runner->restoreState();

        BSONObj obj;
        Runner::RunnerState state;
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        if (Runner::RUNNER_EOF == state && 0 == numResults
            && (queryOptions & QueryOption_CursorTailable)
            && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
            // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
            // data some # of times first.
            //
            // The cursor stays cached and the read context is released when we return, so
            // the runner has to be put back into its saved state first, exactly as on the
            // regular "keep the cursor" path below.  The next getMore calls restoreState()
            // again.
            runner->saveState();
            return 0;
        }

        bool saveClientCursor = false;

        if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) {
            // If we're dead there's no way to get more results.
            saveClientCursor = false;

            // In the old system tailable capped cursors would be killed off at the
            // cursorid level.  If a tailable capped cursor is nuked the cursorid
            // would vanish.
            //
            // In the new system they die and are cleaned up later (or time out).
            // So this is where we get to remove the cursorid.
            if (0 == numResults) {
                resultFlags = ResultFlag_CursorNotFound;
            }
        }
        else if (Runner::RUNNER_EOF == state) {
            // EOF is also end of the line unless it's tailable.
            saveClientCursor = queryOptions & QueryOption_CursorTailable;
        }
        else {
            verify(Runner::RUNNER_ADVANCED == state);
            saveClientCursor = true;
        }

        if (!saveClientCursor) {
            ccPin.deleteUnderlying();
            // cc is now invalid, as is the runner
            cursorid = 0;
            cc = NULL;
            QLOG() << "getMore NOT saving client cursor, ended w/state "
                   << Runner::statestr(state)
                   << endl;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            runner->saveState();
            QLOG() << "getMore saving client cursor ended w/state "
                   << Runner::statestr(state)
                   << endl;

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // The reply header lives in the space skipped at the front of the buffer.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    bb.decouple();
    QLOG() << "getMore returned " << numResults << " results\n";
    return qr;
}
/**
 * Entry point for answering a query; invoked from db/ops/query.cpp.
 *
 * Obtains a Runner for 'cq', pulls the first batch of results (or, for an explain, the
 * explain document) into 'result', and caches the runner in a ClientCursor if later
 * getMore calls may follow.  Progress and statistics are recorded on 'curop'.
 *
 * Returns the query's namespace if the cursor was cached with QueryOption_Exhaust set,
 * and the empty string otherwise.
 */
std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) {
    QLOG() << "Running query on new system: " << cq->toString();

    // Take the read lock.
    Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath);

    // Will receive the runner produced by parse / canonicalize / plan / transcribe.
    Runner* rawRunner = NULL;

    // Used all over the place below.
    const LiteParsedQuery& pq = cq->getParsed();

    // Render the query now: if getRunner fails, cq is not guaranteed to be left in a
    // consistent state.
    string queryDesc = cq->toString();

    // Select the runner that will execute the query.  In two situations the choice is
    // known upfront and the general selection is bypassed:
    //
    // (a) The collection does not exist: use the EOFRunner, a runner that returns no
    //     results.
    //
    // (b) The query is a replication initial-sync query: use a SingleSolutionRunner built
    //     on a purpose-made stage that skips extents faster (see exec/oplogstart.h).
    //
    // In every other case the runner best suited to the query and the run-time context is
    // selected.
    Status runnerStatus = Status::OK();
    if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) {
        rawRunner = new EOFRunner(cq, cq->ns());
    }
    else if (pq.hasOption(QueryOption_OplogReplay)) {
        runnerStatus = getOplogStartHack(cq, &rawRunner);
    }
    else {
        size_t plannerOptions = QueryPlannerParams::DEFAULT;
        if (shardingState.needCollectionMetadata(pq.ns())) {
            plannerOptions |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
        }
        // getRunner takes ownership of cq.
        runnerStatus = getRunner(cq, &rawRunner, plannerOptions);
    }

    if (!runnerStatus.isOK()) {
        uasserted(17007, "Couldn't get runner for query because: " + runnerStatus.reason()
                         + " query is " + queryDesc);
    }

    verify(NULL != rawRunner);
    auto_ptr<Runner> runner(rawRunner);

    // Remembered so that a change during the query can be detected afterwards.
    const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

    // Honour the $maxTimeMS query option (not used with commands).
    curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
    killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

    // uasserts unless we are a primary, or a secondary queried with SlaveOk.
    replVerifyReadsOk(&pq);

    // Non-empty only when the collection is sharded.  When sharded, we may come across data
    // that is inconsistent with our sharding state; such data must be ignored.
    CollectionMetadataPtr collMetadata;
    if (shardingState.needCollectionMetadata(pq.ns())) {
        collMetadata = shardingState.getCollectionMetadata(pq.ns());
    }

    // Run the query.
    // The reply buffer holds either the requested documents or the explain information,
    // never both.
    BufBuilder replyBuf(32768);
    replyBuf.skip(sizeof(QueryResult));

    // Number of results obtained from the runner so far.
    int numResults = 0;

    // Most recent oplog time read; only maintained for oplog replay.
    OpTime lastOplogTs;

    // Whether the Runner gets cached in a ClientCursor for later getMore calls.
    bool keepCursor = false;

    // Switch on auto-yielding.  The runner is registered with the active runners list in
    // ClientCursor; the guard below deregisters it even if the code underneath throws.
    ClientCursor::registerRunner(runner.get());
    runner->setYieldPolicy(Runner::YIELD_AUTO);
    auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> deregisterGuard(
        new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

    BSONObj doc;
    Runner::RunnerState runnerState;
    // uint64_t numMisplacedDocs = 0;

    // Determined once: needed inside the loop and again when filling in explain output.
    const bool isExplain = pq.isExplain();

    for (;;) {
        runnerState = runner->getNext(&doc, NULL);
        if (Runner::RUNNER_ADVANCED != runnerState) {
            break;
        }

        // Documents are only copied to the reply when this is not an explain.
        if (!isExplain) {
            replyBuf.appendBuf((void*)doc.objdata(), doc.objsize());
        }

        // Count the result either way.
        ++numResults;

        // Track the slave's position in the oplog, if applicable.
        if (pq.hasOption(QueryOption_OplogReplay)) {
            BSONElement tsElt = doc["ts"];
            if (Date == tsElt.type() || Timestamp == tsElt.type()) {
                lastOplogTs = tsElt._opTime();
            }
        }

        // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
        // of CanonicalQuery. :(
        const bool supportsGetMore = true;

        if (isExplain) {
            if (enoughForExplain(pq, numResults)) {
                break;
            }
            continue;
        }

        if (!supportsGetMore && (enough(pq, numResults)
                                 || replyBuf.len() >= MaxBytesToReturnToClientAtOnce)) {
            break;
        }

        if (enoughForFirstBatch(pq, numResults, replyBuf.len())) {
            QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                   << " numToReturn=" << pq.getNumToReturn()
                   << " numResults=" << numResults
                   << endl;
            // A single requested result is treated as a findOne(): no cursor is kept.
            if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                QLOG() << " runner EOF=" << runner->isEOF() << endl;
                keepCursor = !runner->isEOF();
            }
            break;
        }
    }

    // A runner that gets cached receives notifications by virtue of being cached; a runner
    // that does not get cached is about to be deleted.  Either way it has to be
    // deregistered now.
    deregisterGuard.reset();

    // The caller expects an exception in certain cases:
    // * in-memory sort using too much RAM.
    if (Runner::RUNNER_ERROR == runnerState) {
        uasserted(17144, "Runner error, memory limit for sort probably exceeded");
    }

    if (Runner::RUNNER_DEAD == runnerState) {
        // There is no point in keeping a dead runner.
        keepCursor = false;
    }
    else if (pq.hasOption(QueryOption_CursorTailable)) {
        // A tailable cursor on an empty capped collection is not worth keeping.  Otherwise
        // the client is expected to keep reading from it, so it is kept around.
        Collection* collection = ctx.ctx().db()->getCollection(cq->ns());
        if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) {
            keepCursor = true;
        }
    }

    // TODO(greg): This will go away soon.
    if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
        // The version changed while the query ran, so data may be missing.  It is safe to
        // report this since mongos can resend at this point.
        throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                       shardingVersionAtStart,
                                       shardingState.getVersion(pq.ns()));
    }

    // For an explain, the reply consists of the explain document produced by the runner.
    if (isExplain) {
        TypeExplain* bareExplain;
        Status explainStatus = runner->getExplainPlan(&bareExplain);

        if (explainStatus.isOK()) {
            boost::scoped_ptr<TypeExplain> explain(bareExplain);

            // Complete the run-time fields of the explain, beginning with properties of
            // the process running the query.
            std::string server = mongoutils::str::stream()
                << getHostNameCached() << ":" << serverGlobalParams.port;
            explain->setServer(server);

            // We might have skipped some results due to chunk migration etc. so our count is
            // correct.
            explain->setN(numResults);

            // Time for the whole operation.
            explain->setMillis(curop.elapsedMillis());

            BSONObj explainObj = explain->toBSON();
            replyBuf.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            // The explain document itself is the one and only result.
            numResults = 1;
        }
        else {
            error() << "could not produce explain of query '" << pq.getFilter()
                    << "', error: " << explainStatus.reason();
            // numResults has to agree with the data in the buffer, or we crash later when
            // rooting through the reply msg: send an empty object as the single result.
            BSONObj emptyObj;
            replyBuf.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());

            // The explain output is actually a result.
            numResults = 1;
            // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
        }
    }

    long long cursorId = 0;
    if (!keepCursor) {
        QLOG() << "not caching runner but returning " << numResults << " results\n";
    }
    else {
        // The runner stays idle until the next getMore.
        runner->saveState();

        // The ClientCursor constructor inserts the new object into a global map, so it
        // cannot leak.
        ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                            cq->getParsed().getFilter());
        cursorId = cc->cursorid();

        QLOG() << "caching runner with cursorid " << cursorId
               << " after returning " << numResults << " results" << endl;

        // The ClientCursor owns the runner now; let go of it so it is not deleted here.
        runner.release();

        // TODO document
        if (pq.hasOption(QueryOption_OplogReplay) && !lastOplogTs.isNull()) {
            cc->slaveReadTill(lastOplogTs);
        }

        // TODO document
        if (pq.hasOption(QueryOption_Exhaust)) {
            curop.debug().exhaust = true;
        }

        // State needed by getMore.
        cc->setCollMetadata(collMetadata);
        cc->setPos(numResults);

        // Roll the unused part of the time limit over to the cursor for future getMores.
        cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
    }

    // Move the results into the outgoing message.
    result.appendData(replyBuf.buf(), replyBuf.len());
    replyBuf.decouple();

    // Complete the header of the outgoing message.
    QueryResult* header = static_cast<QueryResult*>(result.header());
    header->cursorId = cursorId;
    if (0 == cursorId) {
        curop.debug().cursorid = -1;
    }
    else {
        curop.debug().cursorid = cursorId;
    }
    header->setResultFlagsToOk();
    header->setOperation(opReply);
    header->startingFrom = 0;
    header->nReturned = numResults;

    curop.debug().ntoskip = pq.getSkip();
    curop.debug().nreturned = numResults;

    // curop.debug().exhaust was set above, if at all.
    if (curop.debug().exhaust) {
        return pq.ns();
    }
    return "";
}
bool SubplanRunner::runSubplans() { // This is what we annotate with the index selections and then turn into a solution. auto_ptr<OrMatchExpression> theOr( static_cast<OrMatchExpression*>(_query->root()->shallowClone())); // This is the skeleton of index selections that is inserted into the cache. auto_ptr<PlanCacheIndexTree> cacheData(new PlanCacheIndexTree()); for (size_t i = 0; i < theOr->numChildren(); ++i) { MatchExpression* orChild = theOr->getChild(i); auto_ptr<CanonicalQuery> orChildCQ(_cqs.front()); _cqs.pop(); // 'solutions' is owned by the SubplanRunner instance until // it is popped from the queue. vector<QuerySolution*> solutions = _solutions.front(); _solutions.pop(); // We already checked for zero solutions in planSubqueries(...). invariant(!solutions.empty()); if (1 == solutions.size()) { // There is only one solution. Transfer ownership to an auto_ptr. auto_ptr<QuerySolution> autoSoln(solutions[0]); // We want a well-formed *indexed* solution. if (NULL == autoSoln->cacheData.get()) { // For example, we don't cache things for 2d indices. QLOG() << "Subplanner: No cache data for subchild " << orChild->toString(); return false; } if (SolutionCacheData::USE_INDEX_TAGS_SOLN != autoSoln->cacheData->solnType) { QLOG() << "Subplanner: No indexed cache data for subchild " << orChild->toString(); return false; } // Add the index assignments to our original query. Status tagStatus = QueryPlanner::tagAccordingToCache( orChild, autoSoln->cacheData->tree.get(), _indexMap); if (!tagStatus.isOK()) { QLOG() << "Subplanner: Failed to extract indices from subchild " << orChild->toString(); return false; } // Add the child's cache data to the cache data we're creating for the main query. cacheData->children.push_back(autoSoln->cacheData->tree->clone()); } else { // N solutions, rank them. Takes ownership of orChildCQ. 
// the working set will be shared by the candidate plans and owned by the runner WorkingSet* sharedWorkingSet = new WorkingSet(); MultiPlanStage* multiPlanStage = new MultiPlanStage(_collection, orChildCQ.get()); // Dump all the solutions into the MPR. for (size_t ix = 0; ix < solutions.size(); ++ix) { PlanStage* nextPlanRoot; verify(StageBuilder::build(_txn, _collection, *solutions[ix], sharedWorkingSet, &nextPlanRoot)); // Owns first two arguments multiPlanStage->addPlan(solutions[ix], nextPlanRoot, sharedWorkingSet); } multiPlanStage->pickBestPlan(); if (! multiPlanStage->bestPlanChosen()) { QLOG() << "Subplanner: Failed to pick best plan for subchild " << orChildCQ->toString(); return false; } Runner* mpr = new SingleSolutionRunner(_collection, orChildCQ.release(), multiPlanStage->bestSolution(), multiPlanStage, sharedWorkingSet); _underlyingRunner.reset(mpr); if (_killed) { QLOG() << "Subplanner: Killed while picking best plan for subchild " << orChild->toString(); return false; } QuerySolution* bestSoln = multiPlanStage->bestSolution(); if (SolutionCacheData::USE_INDEX_TAGS_SOLN != bestSoln->cacheData->solnType) { QLOG() << "Subplanner: No indexed cache data for subchild " << orChild->toString(); return false; } // Add the index assignments to our original query. Status tagStatus = QueryPlanner::tagAccordingToCache( orChild, bestSoln->cacheData->tree.get(), _indexMap); if (!tagStatus.isOK()) { QLOG() << "Subplanner: Failed to extract indices from subchild " << orChild->toString(); return false; } cacheData->children.push_back(bestSoln->cacheData->tree->clone()); } } // Must do this before using the planner functionality. sortUsingTags(theOr.get()); // Use the cached index assignments to build solnRoot. 
Takes ownership of 'theOr' QuerySolutionNode* solnRoot = QueryPlannerAccess::buildIndexedDataAccess( *_query, theOr.release(), false, _plannerParams.indices); if (NULL == solnRoot) { QLOG() << "Subplanner: Failed to build indexed data path for subplanned query\n"; return false; } QLOG() << "Subplanner: fully tagged tree is " << solnRoot->toString(); // Takes ownership of 'solnRoot' QuerySolution* soln = QueryPlannerAnalysis::analyzeDataAccess(*_query, _plannerParams, solnRoot); if (NULL == soln) { QLOG() << "Subplanner: Failed to analyze subplanned query"; return false; } // We want our franken-solution to be cached. SolutionCacheData* scd = new SolutionCacheData(); scd->tree.reset(cacheData.release()); soln->cacheData.reset(scd); QLOG() << "Subplanner: Composite solution is " << soln->toString() << endl; // We use one of these even if there is one plan. We do this so that the entry is cached // with stats obtained in the same fashion as a competitive ranking would have obtained // them. MultiPlanStage* multiPlanStage = new MultiPlanStage(_collection, _query.get()); WorkingSet* ws = new WorkingSet(); PlanStage* root; verify(StageBuilder::build(_txn, _collection, *soln, ws, &root)); multiPlanStage->addPlan(soln, root, ws); // Takes ownership first two arguments. multiPlanStage->pickBestPlan(); if (! multiPlanStage->bestPlanChosen()) { QLOG() << "Subplanner: Failed to pick best plan for subchild " << _query->toString(); return false; } Runner* mpr = new SingleSolutionRunner(_collection, _query.release(), multiPlanStage->bestSolution(), multiPlanStage, ws); _underlyingRunner.reset(mpr); return true; }
std::string newRunQuery(OperationContext* txn, Message& m, QueryMessage& q, CurOp& curop, Message &result, bool fromDBDirectClient) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder cmdResBuf; if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } const NamespaceString nss(q.ns); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize( q, &cq, WhereCallbackReal(txn, StringData(nss.db()))); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } QLOG() << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. PlanExecutor* rawExec = NULL; // We use this a lot below. 
const LiteParsedQuery& pq = cq->getParsed(); AutoGetCollectionForRead ctx(txn, nss); const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; Collection* collection = ctx.getCollection(); // We'll now try to get the query executor that will execute this query for us. There // are a few cases in which we know upfront which executor we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we use an EOFStage. // // (b) if the query is a replication's initial sync one, we use a specifically designed // stage that skips extents faster (see details in exec/oplogstart.h). // // Otherwise we go through the selection of which executor is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (NULL != collection && pq.getOptions().oplogReplay) { // Takes ownership of 'cq'. status = getOplogStartHack(txn, collection, cq, &rawExec); } else { size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } // Takes ownership of 'cq'. status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options); } if (!status.isOK()) { // NOTE: Do not access cq as getExecutor has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawExec); auto_ptr<PlanExecutor> exec(rawExec); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. 
BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref(); status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, NamespaceString(cq->ns()), slaveOK); uassertStatusOK(status); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? 
int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the PlanExecutor in a ClientCursor for getMore calls later? bool saveClientCursor = false; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. curop.debug().planSummary = Explain::getPlanSummary(exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.getOptions().oplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " executor EOF=" << exec->isEOF() << endl; saveClientCursor = !exec->isEOF(); } break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. 
if (PlanExecutor::EXEC_ERROR == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // Why save a dead executor? if (PlanExecutor::DEAD == state) { saveClientCursor = false; } else if (pq.getOptions().tailable) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery; const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1); PlanSummaryStats summaryStats; Explain::getSummaryStats(exec.get(), &summaryStats); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; curop.debug().scanAndOrder = summaryStats.hasSortStage; curop.debug().nscanned = summaryStats.totalKeysExamined; curop.debug().nscannedObjects = summaryStats.totalDocsExamined; curop.debug().idhack = summaryStats.isIdhack; // Set debug information for consumption by the profiler. if (dbProfilingLevel > 0 || curop.elapsedMillis() > serverGlobalParams.slowMS || logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) { // Get BSON stats. 
scoped_ptr<PlanStageStats> execStats(exec->getStats()); BSONObjBuilder statsBob; Explain::statsToBSON(*execStats, &statsBob); curop.debug().execStats.set(statsBob.obj()); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) { BSONObjBuilder bob; bob.append("summary", curop.debug().planSummary.toString()); curop.debug().execStats.set(bob.done()); } } long long ccId = 0; if (saveClientCursor) { // We won't use the executor until it's getMore'd. exec->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection, exec.get(), cq->getParsed().getOptions().toInt(), cq->getParsed().getFilter()); ccId = cc->cursorid(); if (fromDBDirectClient) { cc->setUnownedRecoveryUnit(txn->recoveryUnit()); } else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) { // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their // next getMore. } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn)); } QLOG() << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of executor. Release to make sure it's not deleted. exec.release(); // TODO document if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.getOptions().exhaust) { curop.debug().exhaust = true; } // Set attributes for getMore. 
cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "Not caching executor but returning " << numResults << " results.\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
void MultiPlanStage::pickBestPlan() { // Run each plan some number of times. This number is at least as great as // 'internalQueryPlanEvaluationWorks', but may be larger for big collections. size_t numWorks = internalQueryPlanEvaluationWorks; if (NULL != _collection) { // For large collections, the number of works is set to be this // fraction of the collection size. double fraction = internalQueryPlanEvaluationCollFraction; numWorks = std::max(size_t(internalQueryPlanEvaluationWorks), size_t(fraction * _collection->numRecords())); } // We treat ntoreturn as though it is a limit during plan ranking. // This means that ranking might not be great for sort + batchSize. // But it also means that we don't buffer too much data for sort + limit. // See SERVER-14174 for details. size_t numToReturn = _query->getParsed().getNumToReturn(); // Determine the number of results which we will produce during the plan // ranking phase before stopping. size_t numResults = (size_t)internalQueryPlanEvaluationMaxResults; if (numToReturn > 0) { numResults = std::min(numToReturn, numResults); } // Work the plans, stopping when a plan hits EOF or returns some // fixed number of results. for (size_t ix = 0; ix < numWorks; ++ix) { bool moreToDo = workAllPlans(numResults); if (!moreToDo) { break; } } if (_failure) { return; } // After picking best plan, ranking will own plan stats from // candidate solutions (winner and losers). std::auto_ptr<PlanRankingDecision> ranking(new PlanRankingDecision); _bestPlanIdx = PlanRanker::pickBestPlan(_candidates, ranking.get()); verify(_bestPlanIdx >= 0 && _bestPlanIdx < static_cast<int>(_candidates.size())); // Copy candidate order. We will need this to sort candidate stats for explain // after transferring ownership of 'ranking' to plan cache. 
std::vector<size_t> candidateOrder = ranking->candidateOrder; CandidatePlan& bestCandidate = _candidates[_bestPlanIdx]; std::list<WorkingSetID>& alreadyProduced = bestCandidate.results; QuerySolution* bestSolution = bestCandidate.solution; QLOG() << "Winning solution:\n" << bestSolution->toString() << endl; LOG(2) << "Winning plan: " << getPlanSummary(*bestSolution); _backupPlanIdx = kNoSuchPlan; if (bestSolution->hasBlockingStage && (0 == alreadyProduced.size())) { QLOG() << "Winner has blocking stage, looking for backup plan...\n"; for (size_t ix = 0; ix < _candidates.size(); ++ix) { if (!_candidates[ix].solution->hasBlockingStage) { QLOG() << "Candidate " << ix << " is backup child\n"; _backupPlanIdx = ix; break; } } } // Store the choice we just made in the cache. In order to do so, // 1) the query must be of a type that is safe to cache, and // 2) two or more plans cannot have tied for the win. Caching in the // case of ties can cause successive queries of the same shape to // use a bad index. if (PlanCache::shouldCacheQuery(*_query) && !ranking->tieForBest) { // Create list of candidate solutions for the cache with // the best solution at the front. std::vector<QuerySolution*> solutions; // Generate solutions and ranking decisions sorted by score. for (size_t orderingIndex = 0; orderingIndex < candidateOrder.size(); ++orderingIndex) { // index into candidates/ranking size_t ix = candidateOrder[orderingIndex]; solutions.push_back(_candidates[ix].solution); } // Check solution cache data. Do not add to cache if // we have any invalid SolutionCacheData data. 
// XXX: One known example is 2D queries bool validSolutions = true; for (size_t ix = 0; ix < solutions.size(); ++ix) { if (NULL == solutions[ix]->cacheData.get()) { QLOG() << "Not caching query because this solution has no cache data: " << solutions[ix]->toString(); validSolutions = false; break; } } if (validSolutions) { _collection->infoCache()->getPlanCache()->add(*_query, solutions, ranking.release()); } } }