/**
 * Services a getMore against the cluster cursor identified by 'request.cursorid'.
 *
 * The cursor is checked out of the cursor manager, the request's awaitData timeout is applied
 * when one is supplied, and documents are then pulled from the cursor until the batch is deemed
 * full, the next document does not fit, or the stream ends. On success the cursor is handed back
 * to the manager and the batch is returned; the reported cursor id is 0 when the cursor was
 * marked exhausted.
 *
 * Returns a non-OK status if the checkout, the timeout update, or fetching a document fails.
 */
StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                   const GetMoreRequest& request) {
    auto swPinned = grid.getCursorManager()->checkOutCursor(request.nss, request.cursorid);
    if (!swPinned.isOK()) {
        return swPinned.getStatus();
    }
    auto& cursor = swPinned.getValue();
    invariant(request.cursorid == cursor.getCursorId());

    // Spin for as long as the fail point stays enabled, keeping the cursor pinned meanwhile.
    while (MONGO_FAIL_POINT(keepCursorPinnedDuringGetMore)) {
    }

    if (request.awaitDataTimeout) {
        auto timeoutStatus = cursor.setAwaitDataTimeout(*request.awaitDataTimeout);
        if (!timeoutStatus.isOK()) {
            return timeoutStatus;
        }
    }

    std::vector<BSONObj> docs;
    int bufferedBytes = 0;
    long long requestedBatchSize = request.batchSize.value_or(0);
    long long firstPosition = cursor.getNumReturnedSoFar();
    auto finalState = ClusterCursorManager::CursorState::NotExhausted;

    for (;;) {
        if (FindCommon::enoughForGetMore(requestedBatchSize, docs.size())) {
            break;
        }

        auto swNext = cursor.next();
        if (!swNext.isOK()) {
            return swNext.getStatus();
        }

        auto& maybeDoc = swNext.getValue();
        if (!maybeDoc) {
            // End-of-stream: only a non-tailable cursor is considered exhausted by it.
            if (!cursor.isTailable()) {
                finalState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        if (!FindCommon::haveSpaceForNext(*maybeDoc, docs.size(), bufferedBytes)) {
            // No room left in this batch; stash the document back on the cursor.
            cursor.queueResult(*maybeDoc);
            break;
        }

        // Count the document plus the upper bound on the overhead of placing it in a BSON
        // array, then append it to the batch.
        bufferedBytes += (maybeDoc->objsize() + kPerDocumentOverheadBytesUpperBound);
        docs.push_back(std::move(*maybeDoc));
    }

    // Give the cursor back to the cursor manager together with its final state.
    cursor.returnCursor(finalState);

    CursorId idToReturn = request.cursorid;
    if (finalState == ClusterCursorManager::CursorState::Exhausted) {
        idToReturn = CursorId(0);
    }
    return CursorResponse(request.nss, idToReturn, std::move(docs), firstPosition);
}
/**
 * Runs 'query' against the cluster, appending the first batch to 'results'.
 *
 * Rejects projections that mention the reserved sort key field. If the database lookup reports
 * NamespaceNotFound, returns cursor id 0 without creating a cursor. Otherwise the query is sent
 * via runQueryWithoutRetrying() up to kMaxStaleConfigRetries times, re-resolving the routing
 * information after a SendStaleConfig, RecvStaleConfig or HostUnreachable failure. Any other
 * failure is returned immediately; running out of attempts yields StaleShardVersion.
 */
StatusWith<CursorId> ClusterFind::runQuery(OperationContext* txn,
                                           const CanonicalQuery& query,
                                           const ReadPreferenceSetting& readPref,
                                           std::vector<BSONObj>* results) {
    invariant(results);

    // mongos reserves the sort key field, so a projection may not name it.
    const auto& parsed = query.getParsed();
    if (parsed.getProj().hasField(ClusterClientCursorParams::kSortKeyField)) {
        return {ErrorCodes::BadValue,
                str::stream() << "Projection contains illegal field '"
                              << ClusterClientCursorParams::kSortKeyField
                              << "': " << parsed.getProj()};
    }

    auto dbConfig = grid.catalogCache()->getDatabase(txn, query.nss().db().toString());
    if (!dbConfig.isOK()) {
        if (dbConfig.getStatus() == ErrorCodes::NamespaceNotFound) {
            // A missing database is reported as a successful, empty result with no cursor.
            return CursorId(0);
        }
        return dbConfig.getStatus();
    }
    const auto& config = dbConfig.getValue();

    std::shared_ptr<ChunkManager> chunkManager;
    std::shared_ptr<Shard> primary;
    config->getChunkManagerOrPrimary(txn, query.nss().ns(), chunkManager, primary);

    // Keep re-targeting and re-sending the initial find until the shard version is established
    // or the attempts run out.
    for (size_t attempt = 1; attempt <= kMaxStaleConfigRetries; ++attempt) {
        auto swCursorId = runQueryWithoutRetrying(
            txn, query, readPref, chunkManager.get(), std::move(primary), results);
        if (swCursorId.isOK()) {
            return swCursorId;
        }

        auto failure = std::move(swCursorId.getStatus());
        const bool retriable = failure == ErrorCodes::SendStaleConfig ||
            failure == ErrorCodes::RecvStaleConfig || failure == ErrorCodes::HostUnreachable;
        if (!retriable) {
            // Only stale config messages and unreachable hosts are worth another attempt.
            return failure;
        }

        LOG(1) << "Received error status for query " << query.toStringShort() << " on attempt "
               << attempt << " of " << kMaxStaleConfigRetries << ": " << failure;

        chunkManager = config->getChunkManagerIfExists(txn, query.nss().ns(), true);
        if (!chunkManager) {
            config->getChunkManagerOrPrimary(txn, query.nss().ns(), chunkManager, primary);
        }
    }

    return {ErrorCodes::StaleShardVersion,
            str::stream() << "Retried " << kMaxStaleConfigRetries
                          << " times without establishing shard version on a reachable host."};
}
/**
 * Services a getMore against the cluster cursor identified by 'request.cursorid'.
 *
 * Documents are pulled from the checked-out cursor until FindCommon::enoughForGetMore() reports
 * the batch is complete, the stream ends, or adding the next document would push the estimated
 * response size past BSONObjMaxUserSize (in which case that document is queued back on the
 * cursor). The cursor is then handed back to the manager; the reported cursor id is 0 when the
 * cursor was marked exhausted.
 *
 * Returns a non-OK status if the checkout or fetching a document fails.
 */
StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                   const GetMoreRequest& request) {
    auto swPinned = grid.getCursorManager()->checkOutCursor(request.nss, request.cursorid);
    if (!swPinned.isOK()) {
        return swPinned.getStatus();
    }
    auto& cursor = swPinned.getValue();
    invariant(request.cursorid == cursor.getCursorId());

    std::vector<BSONObj> docs;
    int bufferedBytes = 0;
    long long requestedBatchSize = request.batchSize.value_or(0);
    long long firstPosition = cursor.getNumReturnedSoFar();
    auto finalState = ClusterCursorManager::CursorState::NotExhausted;

    for (;;) {
        if (FindCommon::enoughForGetMore(requestedBatchSize, docs.size(), bufferedBytes)) {
            break;
        }

        auto swNext = cursor.next();
        if (!swNext.isOK()) {
            return swNext.getStatus();
        }

        auto& maybeDoc = swNext.getValue();
        if (!maybeDoc) {
            // End-of-stream: only a non-tailable cursor is considered exhausted by it.
            if (!cursor.isTailable()) {
                finalState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        // Estimate the response size with this document included. Comparing against
        // BSONObjMaxUserSize leaves headroom for the response "envelope" (e.g. the "ns" and "id"
        // fields) below BSONObjMaxInternalSize. A batch always accepts its first document.
        const int docBytes = maybeDoc->objsize();
        int sizeEstimate = bufferedBytes + docBytes +
            ((docs.size() + 1U) * kPerDocumentOverheadBytesUpperBound);
        if (!docs.empty() && sizeEstimate > BSONObjMaxUserSize) {
            cursor.queueResult(*maybeDoc);
            break;
        }

        // The document fits: account for it and append it.
        bufferedBytes += docBytes;
        docs.push_back(std::move(*maybeDoc));
    }

    // Give the cursor back to the cursor manager together with its final state.
    cursor.returnCursor(finalState);

    CursorId idToReturn = request.cursorid;
    if (finalState == ClusterCursorManager::CursorState::Exhausted) {
        idToReturn = CursorId(0);
    }
    return CursorResponse(request.nss, idToReturn, std::move(docs), firstPosition);
}
/**
 * Runs 'query' against the cluster, appending the first batch to 'results'.
 *
 * If the database lookup reports DatabaseNotFound, returns cursor id 0 without creating a
 * cursor. Otherwise the query is sent via runQueryWithoutRetrying() up to kMaxStaleConfigRetries
 * times, reloading the chunk manager after each RecvStaleConfig failure. Any other failure is
 * returned immediately; running out of attempts yields StaleShardVersion.
 */
StatusWith<CursorId> ClusterFind::runQuery(OperationContext* txn,
                                           const CanonicalQuery& query,
                                           const ReadPreferenceSetting& readPref,
                                           std::vector<BSONObj>* results) {
    invariant(results);

    auto dbConfig = grid.catalogCache()->getDatabase(txn, query.nss().db().toString());
    if (!dbConfig.isOK()) {
        if (dbConfig.getStatus() == ErrorCodes::DatabaseNotFound) {
            // A missing database is reported as a successful, empty result with no cursor.
            return CursorId(0);
        }
        return dbConfig.getStatus();
    }

    std::shared_ptr<ChunkManager> chunkManager;
    std::shared_ptr<Shard> primary;
    dbConfig.getValue()->getChunkManagerOrPrimary(query.nss().ns(), chunkManager, primary);

    // Keep re-targeting and re-sending the initial find until the shard version is established
    // or the attempts run out.
    for (size_t attempt = 1; attempt <= kMaxStaleConfigRetries; ++attempt) {
        auto swCursorId = runQueryWithoutRetrying(
            txn, query, readPref, chunkManager.get(), std::move(primary), results);
        if (swCursorId.isOK()) {
            return swCursorId;
        }

        auto failure = std::move(swCursorId.getStatus());
        const bool staleConfig = (failure == ErrorCodes::RecvStaleConfig);
        if (!staleConfig) {
            // Only a stale config message from mongoD is worth another attempt.
            return failure;
        }

        LOG(1) << "Received stale config for query " << query.toStringShort() << " on attempt "
               << attempt << " of " << kMaxStaleConfigRetries << ": " << failure.reason();

        // NOTE(review): this trips if routing resolved to a primary only (no chunk manager)
        // and the shard still answered with a stale config error -- confirm that cannot happen.
        invariant(chunkManager);
        chunkManager = chunkManager->reload(txn);
    }

    return {ErrorCodes::StaleShardVersion,
            str::stream() << "Retried " << kMaxStaleConfigRetries
                          << " times without establishing shard version."};
}
/**
 * Services a getMore against the cluster cursor identified by 'request.cursorid'.
 *
 * Documents are pulled from the checked-out cursor until FindCommon::enoughForGetMore() reports
 * the batch is complete or the stream ends. The cursor is then handed back to the manager; the
 * reported cursor id is 0 when the cursor was marked exhausted.
 *
 * Returns a non-OK status if the checkout or fetching a document fails.
 */
StatusWith<GetMoreResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                    const GetMoreRequest& request) {
    auto swPinned = grid.getCursorManager()->checkOutCursor(request.nss, request.cursorid);
    if (!swPinned.isOK()) {
        return swPinned.getStatus();
    }
    auto& cursor = swPinned.getValue();
    invariant(request.cursorid == cursor.getCursorId());

    std::vector<BSONObj> docs;
    int bufferedBytes = 0;
    long long requestedBatchSize = request.batchSize.value_or(0);
    long long firstPosition = cursor.getNumReturnedSoFar();
    auto finalState = ClusterCursorManager::CursorState::NotExhausted;

    for (;;) {
        if (FindCommon::enoughForGetMore(requestedBatchSize, docs.size(), bufferedBytes)) {
            break;
        }

        auto swNext = cursor.next();
        if (!swNext.isOK()) {
            return swNext.getStatus();
        }

        auto& maybeDoc = swNext.getValue();
        if (!maybeDoc) {
            // End-of-stream: only a non-tailable cursor is considered exhausted by it.
            if (!cursor.isTailable()) {
                finalState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        // Account for the document and append it to the batch.
        bufferedBytes += maybeDoc->objsize();
        docs.push_back(std::move(*maybeDoc));
    }

    // Give the cursor back to the cursor manager together with its final state.
    cursor.returnCursor(finalState);

    CursorId idToReturn = request.cursorid;
    if (finalState == ClusterCursorManager::CursorState::Exhausted) {
        idToReturn = CursorId(0);
    }
    return GetMoreResponse(request.nss, idToReturn, std::move(docs), firstPosition);
}
/**
 * Inspects a command reply from 'server' and, if it carries an open cursor, registers a cluster
 * cursor for it and returns a rewritten reply that exposes the cluster cursor id instead.
 *
 * When 'useClusterClientCursor' is off, defers to storePossibleCursorLegacy() and returns
 * 'cmdResult' unchanged on success. The reply is also returned unchanged when it is not "ok",
 * has no "cursor" field, or reports cursor id 0. A reply whose cursor section fails to parse
 * produces that parse error.
 */
StatusWith<BSONObj> storePossibleCursor(const HostAndPort& server,
                                        const BSONObj& cmdResult,
                                        executor::TaskExecutor* executor,
                                        ClusterCursorManager* cursorManager) {
    if (!useClusterClientCursor) {
        Status legacyStatus = storePossibleCursorLegacy(server, cmdResult);
        if (legacyStatus.isOK()) {
            return StatusWith<BSONObj>(cmdResult);
        }
        return StatusWith<BSONObj>(legacyStatus);
    }

    const bool commandSucceeded = cmdResult["ok"].trueValue();
    if (!commandSucceeded || !cmdResult.hasField("cursor")) {
        return cmdResult;
    }

    auto swIncoming = CursorResponse::parseFromBSON(cmdResult);
    if (!swIncoming.isOK()) {
        return swIncoming.getStatus();
    }
    auto& incoming = swIncoming.getValue();

    if (incoming.getCursorId() == CursorId(0)) {
        // Nothing is left open on the remote side, so there is nothing to register.
        return cmdResult;
    }

    // Wrap the single remote cursor in a cluster cursor and register it with the manager.
    ClusterClientCursorParams params(incoming.getNSS());
    params.remotes.emplace_back(server, incoming.getCursorId());

    auto clusterCursor = stdx::make_unique<ClusterClientCursorImpl>(executor, std::move(params));
    auto pinned = cursorManager->registerCursor(std::move(clusterCursor),
                                                incoming.getNSS(),
                                                ClusterCursorManager::CursorType::NamespaceNotSharded,
                                                ClusterCursorManager::CursorLifetime::Mortal);

    // Record the new id, then hand the cursor straight back as not exhausted.
    CursorId clusterCursorId = pinned.getCursorId();
    pinned.returnCursor(ClusterCursorManager::CursorState::NotExhausted);

    CursorResponse outgoing(incoming.getNSS(), clusterCursorId, incoming.getBatch());
    return outgoing.toBSON(CursorResponse::ResponseType::InitialResponse);
}
/**
 * Handles an OP_QUERY style find arriving at mongos and writes the reply to 'request'.
 *
 * After the authorization check (which is also audited), the 'exhaust' option and command
 * namespaces are rejected. The query then takes one of two paths:
 *
 *  - When 'useClusterClientCursor' is set: resolve the read preference, canonicalize the query,
 *    and either run it as an explain or run it through ClusterFind::runQuery() and reply with the
 *    first batch and the resulting cursor id.
 *  - Otherwise (legacy path): build a ParallelSortClusteredCursor and either reply with its
 *    explain output, manage it through a ShardedClientCursor when it does not target exactly one
 *    shard, or forward the single shard's reply directly.
 *
 * Failures are reported by throwing (uassert / uassertStatusOK / UserException).
 */
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    // Authorize the find, audit the outcome, and only then fail if it was denied.
    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        // Determine the default read preference mode based on the value of the slaveOk flag.
        ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk)
            ? ReadPreference::SecondaryPreferred
            : ReadPreference::PrimaryOnly;
        ReadPreferenceSetting readPreference(readPreferenceOption, TagSet());

        // An explicit wrapped read preference in the query overrides the default; a missing
        // field is fine, any other extraction error is fatal.
        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // If the $explain flag was set, we must run the operation on the shards as an explain
        // command rather than a find command.
        if (canonicalQuery.getValue()->getParsed().isExplain()) {
            const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
            BSONObj findCommand = lpq.asFindCommand();

            // We default to allPlansExecution verbosity.
            auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

            const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
            rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

            BSONObjBuilder explainBuilder;
            uassertStatusOK(ClusterFind::runExplain(
                txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

            BSONObj explainObj = explainBuilder.done();
            replyToQuery(0,  // query result flags
                         request.p(),
                         request.m(),
                         static_cast<const void*>(explainObj.objdata()),
                         explainObj.objsize(),
                         1,  // numResults
                         0,  // startingFrom
                         CursorId(0));
            return;
        }

        // Do the work to generate the first batch of results. This blocks waiting to get responses
        // from the shard(s).
        std::vector<BSONObj> batch;

        // 0 means the cursor is exhausted and
        // otherwise we assume that a cursor with the returned id can be retrieved via the
        // ClusterCursorManager
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        // Fill out the response buffer.
        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec(string(q.ns), q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) {
        return;
    }

    // Own the cursor from the moment it is allocated so that every exit from this function,
    // including an exception thrown anywhere below, destroys it unless ownership has been
    // handed off explicitly.
    unique_ptr<ParallelSortClusteredCursor> cursor(
        new ParallelSortClusteredCursor(qSpec, CommandInfo()));

    // TODO:  Move out to Request itself, not strategy based
    cursor->init(txn);

    if (qSpec.isExplain()) {
        BSONObjBuilder explain_builder;
        cursor->explain(explain_builder);
        explain_builder.appendNumber("executionTimeMillis",
                                     static_cast<long long>(queryTimer.millis()));
        BSONObj b = explain_builder.obj();

        replyToQuery(0, request.p(), request.m(), b);
        return;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache

    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        // Hand the cursor over to the ShardedClientCursor.
        // NOTE(review): assumes ShardedClientCursor takes ownership of the pointer, as the
        // previous code never deleted it on this path -- confirm.
        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor.release()));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            // Cache the cursor with whatever is left of the time budget.
            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around: 'cursor' stays
        // owned by this function and is destroyed when it returns.

        ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId());
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        request.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
/**
 * Runs 'query' against the cluster, appending the first batch to 'results'. 'viewDefinition' is
 * passed through untouched to runQueryWithoutRetrying().
 *
 * Rejects projections that mention the reserved sort key field. If the database lookup reports
 * NamespaceNotFound, returns cursor id 0 without creating a cursor. Otherwise the query is sent
 * up to kMaxStaleConfigRetries times, refreshing the routing information after a stale sharding
 * error or ShardNotFound. On StaleEpoch the database entry itself is reloaded first, and a failed
 * reload is treated like a missing database. Any other failure is returned immediately; running
 * out of attempts yields StaleShardVersion.
 */
StatusWith<CursorId> ClusterFind::runQuery(OperationContext* txn,
                                           const CanonicalQuery& query,
                                           const ReadPreferenceSetting& readPref,
                                           std::vector<BSONObj>* results,
                                           BSONObj* viewDefinition) {
    invariant(results);

    // mongos reserves the sort key field, so a projection may not name it.
    const auto& findRequest = query.getQueryRequest();
    if (findRequest.getProj().hasField(ClusterClientCursorParams::kSortKeyField)) {
        return {ErrorCodes::BadValue,
                str::stream() << "Projection contains illegal field '"
                              << ClusterClientCursorParams::kSortKeyField
                              << "': " << findRequest.getProj()};
    }

    auto dbConfig = Grid::get(txn)->catalogCache()->getDatabase(txn, query.nss().db().toString());
    if (!dbConfig.isOK()) {
        if (dbConfig.getStatus() == ErrorCodes::NamespaceNotFound) {
            // A missing database is reported as a successful, empty result with no cursor.
            return CursorId(0);
        }
        return dbConfig.getStatus();
    }
    const auto& config = dbConfig.getValue();

    std::shared_ptr<ChunkManager> chunkManager;
    std::shared_ptr<Shard> primary;
    config->getChunkManagerOrPrimary(txn, query.nss().ns(), chunkManager, primary);

    // Keep re-targeting and re-sending the initial find until the shard version is established
    // or the attempts run out.
    for (size_t attempt = 1; attempt <= kMaxStaleConfigRetries; ++attempt) {
        auto swCursorId = runQueryWithoutRetrying(txn,
                                                  query,
                                                  readPref,
                                                  chunkManager.get(),
                                                  std::move(primary),
                                                  results,
                                                  viewDefinition);
        if (swCursorId.isOK()) {
            return swCursorId;
        }

        auto failure = std::move(swCursorId.getStatus());
        const bool retriable = ErrorCodes::isStaleShardingError(failure.code()) ||
            failure == ErrorCodes::ShardNotFound;
        if (!retriable) {
            // Only a non-existent shard or stale metadata reported by MongoD is worth another
            // attempt. Network errors and replication retries are dealt with at the level of
            // the AsyncResultsMerger.
            return failure;
        }

        LOG(1) << "Received error status for query " << redact(query.toStringShort())
               << " on attempt " << attempt << " of " << kMaxStaleConfigRetries << ": "
               << redact(failure);

        const bool staleEpoch = (failure == ErrorCodes::StaleEpoch);
        if (staleEpoch && !config->reload(txn)) {
            // A failed reload means the database was not found: report an empty result set
            // with no cursor.
            return CursorId(0);
        }

        chunkManager = config->getChunkManagerIfExists(txn, query.nss().ns(), true, staleEpoch);
        if (!chunkManager) {
            config->getChunkManagerOrPrimary(txn, query.nss().ns(), chunkManager, primary);
        }
    }

    return {ErrorCodes::StaleShardVersion,
            str::stream() << "Retried " << kMaxStaleConfigRetries
                          << " times without successfully establishing shard version."};
}