Exemple #1
0
StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                   const GetMoreRequest& request) {
    auto cursorManager = grid.getCursorManager();

    auto pinnedCursor = cursorManager->checkOutCursor(request.nss, request.cursorid);
    if (!pinnedCursor.isOK()) {
        return pinnedCursor.getStatus();
    }
    invariant(request.cursorid == pinnedCursor.getValue().getCursorId());

    // If the fail point is enabled, busy wait until it is disabled.
    while (MONGO_FAIL_POINT(keepCursorPinnedDuringGetMore)) {
    }

    if (request.awaitDataTimeout) {
        auto status = pinnedCursor.getValue().setAwaitDataTimeout(*request.awaitDataTimeout);
        if (!status.isOK()) {
            return status;
        }
    }

    std::vector<BSONObj> batch;
    int bytesBuffered = 0;
    long long batchSize = request.batchSize.value_or(0);
    long long startingFrom = pinnedCursor.getValue().getNumReturnedSoFar();
    auto cursorState = ClusterCursorManager::CursorState::NotExhausted;
    while (!FindCommon::enoughForGetMore(batchSize, batch.size())) {
        auto next = pinnedCursor.getValue().next();
        if (!next.isOK()) {
            return next.getStatus();
        }

        if (!next.getValue()) {
            // We reached end-of-stream.
            if (!pinnedCursor.getValue().isTailable()) {
                cursorState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        if (!FindCommon::haveSpaceForNext(*next.getValue(), batch.size(), bytesBuffered)) {
            pinnedCursor.getValue().queueResult(*next.getValue());
            break;
        }

        // Add doc to the batch. Account for the space overhead associated with returning this doc
        // inside a BSON array.
        bytesBuffered += (next.getValue()->objsize() + kPerDocumentOverheadBytesUpperBound);
        batch.push_back(std::move(*next.getValue()));
    }

    // Transfer ownership of the cursor back to the cursor manager.
    pinnedCursor.getValue().returnCursor(cursorState);

    CursorId idToReturn = (cursorState == ClusterCursorManager::CursorState::Exhausted)
        ? CursorId(0)
        : request.cursorid;
    return CursorResponse(request.nss, idToReturn, std::move(batch), startingFrom);
}
Exemple #2
0
StatusWith<CursorId> ClusterFind::runQuery(OperationContext* txn,
                                           const CanonicalQuery& query,
                                           const ReadPreferenceSetting& readPref,
                                           std::vector<BSONObj>* results) {
    invariant(results);

    // Projection on the reserved sort key field is illegal in mongos.
    if (query.getParsed().getProj().hasField(ClusterClientCursorParams::kSortKeyField)) {
        return {ErrorCodes::BadValue,
                str::stream() << "Projection contains illegal field '"
                              << ClusterClientCursorParams::kSortKeyField
                              << "': " << query.getParsed().getProj()};
    }

    auto dbConfig = grid.catalogCache()->getDatabase(txn, query.nss().db().toString());
    if (dbConfig.getStatus() == ErrorCodes::NamespaceNotFound) {
        // If the database doesn't exist, we successfully return an empty result set without
        // creating a cursor.
        return CursorId(0);
    } else if (!dbConfig.isOK()) {
        return dbConfig.getStatus();
    }

    std::shared_ptr<ChunkManager> chunkManager;
    std::shared_ptr<Shard> primary;
    dbConfig.getValue()->getChunkManagerOrPrimary(txn, query.nss().ns(), chunkManager, primary);

    // Re-target and re-send the initial find command to the shards until we have established the
    // shard version.
    for (size_t retries = 1; retries <= kMaxStaleConfigRetries; ++retries) {
        auto cursorId = runQueryWithoutRetrying(
            txn, query, readPref, chunkManager.get(), std::move(primary), results);
        if (cursorId.isOK()) {
            return cursorId;
        }
        auto status = std::move(cursorId.getStatus());

        if (status != ErrorCodes::SendStaleConfig && status != ErrorCodes::RecvStaleConfig &&
            status != ErrorCodes::HostUnreachable) {
            // Errors other than receiving a stale config message from mongoD or an unreachable host
            // are fatal to the operation.
            return status;
        }

        LOG(1) << "Received error status for query " << query.toStringShort() << " on attempt "
               << retries << " of " << kMaxStaleConfigRetries << ": " << status;

        chunkManager = dbConfig.getValue()->getChunkManagerIfExists(txn, query.nss().ns(), true);
        if (!chunkManager) {
            dbConfig.getValue()->getChunkManagerOrPrimary(
                txn, query.nss().ns(), chunkManager, primary);
        }
    }

    return {ErrorCodes::StaleShardVersion,
            str::stream() << "Retried " << kMaxStaleConfigRetries
                          << " times without establishing shard version on a reachable host."};
}
Exemple #3
0
StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                   const GetMoreRequest& request) {
    auto cursorManager = grid.getCursorManager();

    auto pinnedCursor = cursorManager->checkOutCursor(request.nss, request.cursorid);
    if (!pinnedCursor.isOK()) {
        return pinnedCursor.getStatus();
    }
    invariant(request.cursorid == pinnedCursor.getValue().getCursorId());

    std::vector<BSONObj> batch;
    int bytesBuffered = 0;
    long long batchSize = request.batchSize.value_or(0);
    long long startingFrom = pinnedCursor.getValue().getNumReturnedSoFar();
    auto cursorState = ClusterCursorManager::CursorState::NotExhausted;
    while (!FindCommon::enoughForGetMore(batchSize, batch.size(), bytesBuffered)) {
        auto next = pinnedCursor.getValue().next();
        if (!next.isOK()) {
            return next.getStatus();
        }

        if (!next.getValue()) {
            // We reached end-of-stream.
            if (!pinnedCursor.getValue().isTailable()) {
                cursorState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        // If adding this object will cause us to exceed the BSON size limit, then we stash it for
        // later. By using BSONObjMaxUserSize, we ensure that there is enough room for the
        // "envelope" (e.g. the "ns" and "id" fields included in the response) before exceeding
        // BSONObjMaxInternalSize.
        int sizeEstimate = bytesBuffered + next.getValue()->objsize() +
            ((batch.size() + 1U) * kPerDocumentOverheadBytesUpperBound);
        if (sizeEstimate > BSONObjMaxUserSize && !batch.empty()) {
            pinnedCursor.getValue().queueResult(*next.getValue());
            break;
        }

        // Add doc to the batch.
        bytesBuffered += next.getValue()->objsize();
        batch.push_back(std::move(*next.getValue()));
    }

    // Transfer ownership of the cursor back to the cursor manager.
    pinnedCursor.getValue().returnCursor(cursorState);

    CursorId idToReturn = (cursorState == ClusterCursorManager::CursorState::Exhausted)
        ? CursorId(0)
        : request.cursorid;
    return CursorResponse(request.nss, idToReturn, std::move(batch), startingFrom);
}
Exemple #4
0
StatusWith<CursorId> ClusterFind::runQuery(OperationContext* txn,
                                           const CanonicalQuery& query,
                                           const ReadPreferenceSetting& readPref,
                                           std::vector<BSONObj>* results) {
    invariant(results);

    auto dbConfig = grid.catalogCache()->getDatabase(txn, query.nss().db().toString());
    if (dbConfig.getStatus() == ErrorCodes::DatabaseNotFound) {
        // If the database doesn't exist, we successfully return an empty result set without
        // creating a cursor.
        return CursorId(0);
    } else if (!dbConfig.isOK()) {
        return dbConfig.getStatus();
    }

    std::shared_ptr<ChunkManager> chunkManager;
    std::shared_ptr<Shard> primary;
    dbConfig.getValue()->getChunkManagerOrPrimary(query.nss().ns(), chunkManager, primary);

    // Re-target and re-send the initial find command to the shards until we have established the
    // shard version.
    for (size_t retries = 1; retries <= kMaxStaleConfigRetries; ++retries) {
        auto cursorId = runQueryWithoutRetrying(
            txn, query, readPref, chunkManager.get(), std::move(primary), results);
        if (cursorId.isOK()) {
            return cursorId;
        }
        auto status = std::move(cursorId.getStatus());

        if (status != ErrorCodes::RecvStaleConfig) {
            // Errors other than receiving a stale config message from mongoD are fatal to the
            // operation.
            return status;
        }

        LOG(1) << "Received stale config for query " << query.toStringShort() << " on attempt "
               << retries << " of " << kMaxStaleConfigRetries << ": " << status.reason();

        invariant(chunkManager);
        chunkManager = chunkManager->reload(txn);
    }

    return {ErrorCodes::StaleShardVersion,
            str::stream() << "Retried " << kMaxStaleConfigRetries
                          << " times without establishing shard version."};
}
Exemple #5
0
StatusWith<GetMoreResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                    const GetMoreRequest& request) {
    auto cursorManager = grid.getCursorManager();

    auto pinnedCursor = cursorManager->checkOutCursor(request.nss, request.cursorid);
    if (!pinnedCursor.isOK()) {
        return pinnedCursor.getStatus();
    }
    invariant(request.cursorid == pinnedCursor.getValue().getCursorId());

    std::vector<BSONObj> batch;
    int bytesBuffered = 0;
    long long batchSize = request.batchSize.value_or(0);
    long long startingFrom = pinnedCursor.getValue().getNumReturnedSoFar();
    auto cursorState = ClusterCursorManager::CursorState::NotExhausted;
    while (!FindCommon::enoughForGetMore(batchSize, batch.size(), bytesBuffered)) {
        auto next = pinnedCursor.getValue().next();
        if (!next.isOK()) {
            return next.getStatus();
        }

        if (!next.getValue()) {
            // We reached end-of-stream.
            if (!pinnedCursor.getValue().isTailable()) {
                cursorState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        // Add doc to the batch.
        bytesBuffered += next.getValue()->objsize();
        batch.push_back(std::move(*next.getValue()));
    }

    // Transfer ownership of the cursor back to the cursor manager.
    pinnedCursor.getValue().returnCursor(cursorState);

    CursorId idToReturn = (cursorState == ClusterCursorManager::CursorState::Exhausted)
        ? CursorId(0)
        : request.cursorid;
    return GetMoreResponse(request.nss, idToReturn, std::move(batch), startingFrom);
}
StatusWith<BSONObj> storePossibleCursor(const HostAndPort& server,
                                        const BSONObj& cmdResult,
                                        executor::TaskExecutor* executor,
                                        ClusterCursorManager* cursorManager) {
    if (!useClusterClientCursor) {
        Status status = storePossibleCursorLegacy(server, cmdResult);
        return (status.isOK() ? StatusWith<BSONObj>(cmdResult) : StatusWith<BSONObj>(status));
    }

    if (!cmdResult["ok"].trueValue() || !cmdResult.hasField("cursor")) {
        return cmdResult;
    }

    auto incomingCursorResponse = CursorResponse::parseFromBSON(cmdResult);
    if (!incomingCursorResponse.isOK()) {
        return incomingCursorResponse.getStatus();
    }

    if (incomingCursorResponse.getValue().getCursorId() == CursorId(0)) {
        return cmdResult;
    }

    ClusterClientCursorParams params(incomingCursorResponse.getValue().getNSS());
    params.remotes.emplace_back(server, incomingCursorResponse.getValue().getCursorId());

    auto ccc = stdx::make_unique<ClusterClientCursorImpl>(executor, std::move(params));
    auto pinnedCursor =
        cursorManager->registerCursor(std::move(ccc),
                                      incomingCursorResponse.getValue().getNSS(),
                                      ClusterCursorManager::CursorType::NamespaceNotSharded,
                                      ClusterCursorManager::CursorLifetime::Mortal);
    CursorId clusterCursorId = pinnedCursor.getCursorId();
    pinnedCursor.returnCursor(ClusterCursorManager::CursorState::NotExhausted);

    CursorResponse outgoingCursorResponse(incomingCursorResponse.getValue().getNSS(),
                                          clusterCursorId,
                                          incomingCursorResponse.getValue().getBatch());
    return outgoingCursorResponse.toBSON(CursorResponse::ResponseType::InitialResponse);
}
Exemple #7
0
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        // Determine the default read preference mode based on the value of the slaveOk flag.
        ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk)
            ? ReadPreference::SecondaryPreferred
            : ReadPreference::PrimaryOnly;
        ReadPreferenceSetting readPreference(readPreferenceOption, TagSet());

        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // If the $explain flag was set, we must run the operation on the shards as an explain
        // command rather than a find command.
        if (canonicalQuery.getValue()->getParsed().isExplain()) {
            const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
            BSONObj findCommand = lpq.asFindCommand();

            // We default to allPlansExecution verbosity.
            auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

            const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
            rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

            BSONObjBuilder explainBuilder;
            uassertStatusOK(ClusterFind::runExplain(
                txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

            BSONObj explainObj = explainBuilder.done();
            replyToQuery(0,  // query result flags
                         request.p(),
                         request.m(),
                         static_cast<const void*>(explainObj.objdata()),
                         explainObj.objsize(),
                         1,  // numResults
                         0,  // startingFrom
                         CursorId(0));
            return;
        }

        // Do the work to generate the first batch of results. This blocks waiting to get responses
        // from the shard(s).
        std::vector<BSONObj> batch;

        // 0 means the cursor is exhausted and
        // otherwise we assume that a cursor with the returned id can be retrieved via the
        // ClusterCursorManager
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        // Fill out the response buffer.
        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) {
        return;
    }

    ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo());
    verify(cursor);

    // TODO:  Move out to Request itself, not strategy based
    try {
        cursor->init(txn);

        if (qSpec.isExplain()) {
            BSONObjBuilder explain_builder;
            cursor->explain(explain_builder);
            explain_builder.appendNumber("executionTimeMillis",
                                         static_cast<long long>(queryTimer.millis()));
            BSONObj b = explain_builder.obj();

            replyToQuery(0, request.p(), request.m(), b);
            delete (cursor);
            return;
        }
    } catch (...) {
        delete cursor;
        throw;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache

    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around.
        unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor);

        ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId());
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        request.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
Exemple #8
0
StatusWith<CursorId> ClusterFind::runQuery(OperationContext* txn,
                                           const CanonicalQuery& query,
                                           const ReadPreferenceSetting& readPref,
                                           std::vector<BSONObj>* results,
                                           BSONObj* viewDefinition) {
    invariant(results);

    // Projection on the reserved sort key field is illegal in mongos.
    if (query.getQueryRequest().getProj().hasField(ClusterClientCursorParams::kSortKeyField)) {
        return {ErrorCodes::BadValue,
                str::stream() << "Projection contains illegal field '"
                              << ClusterClientCursorParams::kSortKeyField
                              << "': "
                              << query.getQueryRequest().getProj()};
    }

    auto dbConfig = Grid::get(txn)->catalogCache()->getDatabase(txn, query.nss().db().toString());
    if (dbConfig.getStatus() == ErrorCodes::NamespaceNotFound) {
        // If the database doesn't exist, we successfully return an empty result set without
        // creating a cursor.
        return CursorId(0);
    } else if (!dbConfig.isOK()) {
        return dbConfig.getStatus();
    }

    std::shared_ptr<ChunkManager> chunkManager;
    std::shared_ptr<Shard> primary;
    dbConfig.getValue()->getChunkManagerOrPrimary(txn, query.nss().ns(), chunkManager, primary);

    // Re-target and re-send the initial find command to the shards until we have established the
    // shard version.
    for (size_t retries = 1; retries <= kMaxStaleConfigRetries; ++retries) {
        auto cursorId = runQueryWithoutRetrying(
            txn, query, readPref, chunkManager.get(), std::move(primary), results, viewDefinition);
        if (cursorId.isOK()) {
            return cursorId;
        }
        auto status = std::move(cursorId.getStatus());

        if (!ErrorCodes::isStaleShardingError(status.code()) &&
            status != ErrorCodes::ShardNotFound) {
            // Errors other than trying to reach a non existent shard or receiving a stale
            // metadata message from MongoD are fatal to the operation. Network errors and
            // replication retries happen at the level of the AsyncResultsMerger.
            return status;
        }

        LOG(1) << "Received error status for query " << redact(query.toStringShort())
               << " on attempt " << retries << " of " << kMaxStaleConfigRetries << ": "
               << redact(status);

        const bool staleEpoch = (status == ErrorCodes::StaleEpoch);
        if (staleEpoch) {
            if (!dbConfig.getValue()->reload(txn)) {
                // If the reload failed that means the database wasn't found, so successfully return
                // an empty result set without creating a cursor.
                return CursorId(0);
            }
        }
        chunkManager =
            dbConfig.getValue()->getChunkManagerIfExists(txn, query.nss().ns(), true, staleEpoch);
        if (!chunkManager) {
            dbConfig.getValue()->getChunkManagerOrPrimary(
                txn, query.nss().ns(), chunkManager, primary);
        }
    }

    return {ErrorCodes::StaleShardVersion,
            str::stream() << "Retried " << kMaxStaleConfigRetries
                          << " times without successfully establishing shard version."};
}