Esempio n. 1
0
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        // Determine the default read preference mode based on the value of the slaveOk flag.
        ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk)
            ? ReadPreference::SecondaryPreferred
            : ReadPreference::PrimaryOnly;
        ReadPreferenceSetting readPreference(readPreferenceOption, TagSet());

        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // If the $explain flag was set, we must run the operation on the shards as an explain
        // command rather than a find command.
        if (canonicalQuery.getValue()->getParsed().isExplain()) {
            const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
            BSONObj findCommand = lpq.asFindCommand();

            // We default to allPlansExecution verbosity.
            auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

            const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
            rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

            BSONObjBuilder explainBuilder;
            uassertStatusOK(ClusterFind::runExplain(
                txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

            BSONObj explainObj = explainBuilder.done();
            replyToQuery(0,  // query result flags
                         request.p(),
                         request.m(),
                         static_cast<const void*>(explainObj.objdata()),
                         explainObj.objsize(),
                         1,  // numResults
                         0,  // startingFrom
                         CursorId(0));
            return;
        }

        // Do the work to generate the first batch of results. This blocks waiting to get responses
        // from the shard(s).
        std::vector<BSONObj> batch;

        // 0 means the cursor is exhausted and
        // otherwise we assume that a cursor with the returned id can be retrieved via the
        // ClusterCursorManager
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        // Fill out the response buffer.
        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) {
        return;
    }

    ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo());
    verify(cursor);

    // TODO:  Move out to Request itself, not strategy based
    try {
        cursor->init(txn);

        if (qSpec.isExplain()) {
            BSONObjBuilder explain_builder;
            cursor->explain(explain_builder);
            explain_builder.appendNumber("executionTimeMillis",
                                         static_cast<long long>(queryTimer.millis()));
            BSONObj b = explain_builder.obj();

            replyToQuery(0, request.p(), request.m(), b);
            delete (cursor);
            return;
        }
    } catch (...) {
        delete cursor;
        throw;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache

    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around.
        unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor);

        ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId());
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        request.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
Esempio n. 2
0
/**
 * Updates the remote cached version on the remote shard host (primary, in the case of replica
 * sets) if needed with a fully-qualified shard version for the given namespace:
 *   config server(s) + shard name + shard version
 *
 * If no remote cached version has ever been set, an initial shard version is sent.
 *
 * If the namespace is empty and no version has ever been sent, the config server + shard name
 * is sent to the remote shard host to initialize the connection as coming from mongos.
 * NOTE: This initialization is *best-effort only*.  Operations which wish to correctly version
 * must send the namespace.
 *
 * Config servers are special and are not (unless otherwise a shard) kept up to date with this
 * protocol.  This is safe so long as config servers only contain unversioned collections.
 *
 * It is an error to call checkShardVersion with an unversionable connection (isVersionableCB).
 *
 * @return true if we contacted the remote host
 */
bool checkShardVersion(DBClientBase* conn_in,
                       const string& ns,
                       ChunkManagerPtr refManager,
                       bool authoritative,
                       int tryNumber) {
    // TODO: cache, optimize, etc...

    // Empty namespaces are special - we require initialization but not versioning
    if (ns.size() == 0) {
        return initShardVersionEmptyNS(conn_in);
    }

    auto status = grid.catalogCache()->getDatabase(nsToDatabase(ns));
    if (!status.isOK()) {
        return false;
    }

    shared_ptr<DBConfig> conf = status.getValue();

    DBClientBase* conn = getVersionable(conn_in);
    verify(conn);  // errors thrown above

    unsigned long long officialSequenceNumber = 0;

    ShardPtr primary;
    ChunkManagerPtr manager;
    if (authoritative)
        conf->getChunkManagerIfExists(ns, true);

    conf->getChunkManagerOrPrimary(ns, manager, primary);

    if (manager) {
        officialSequenceNumber = manager->getSequenceNumber();
    }

    const auto shard = grid.shardRegistry()->getShard(conn->getServerAddress());
    uassert(ErrorCodes::ShardNotFound,
            str::stream() << conn->getServerAddress() << " is not recognized as a shard",
            shard);

    // Check this manager against the reference manager
    if (manager) {
        if (refManager && !refManager->compatibleWith(*manager, shard->getId())) {
            const ChunkVersion refVersion(refManager->getVersion(shard->getId()));
            const ChunkVersion currentVersion(manager->getVersion(shard->getId()));

            string msg(str::stream()
                       << "manager (" << currentVersion.toString() << " : "
                       << manager->getSequenceNumber() << ") "
                       << "not compatible with reference manager (" << refVersion.toString()
                       << " : " << refManager->getSequenceNumber() << ") "
                       << "on shard " << shard->getId() << " (" << shard->getConnString().toString()
                       << ")");

            throw SendStaleConfigException(ns, msg, refVersion, currentVersion);
        }
    } else if (refManager) {
        string msg(str::stream() << "not sharded ("
                                 << ((manager.get() == 0) ? string("<none>") : str::stream()
                                             << manager->getSequenceNumber())
                                 << ") but has reference manager ("
                                 << refManager->getSequenceNumber() << ") "
                                 << "on conn " << conn->getServerAddress() << " ("
                                 << conn_in->getServerAddress() << ")");

        throw SendStaleConfigException(
            ns, msg, refManager->getVersion(shard->getId()), ChunkVersion::UNSHARDED());
    }

    // Do not send setShardVersion to collections on the config servers - this causes problems
    // when config servers are also shards and get SSV with conflicting names.
    // TODO: Make config servers regular shards
    if (primary && primary->getId() == "config") {
        return false;
    }

    // Has the ChunkManager been reloaded since the last time we updated the shard version over
    // this connection?  If we've never updated the shard version, do so now.
    unsigned long long sequenceNumber = 0;
    if (connectionShardStatus.getSequence(conn, ns, &sequenceNumber)) {
        if (sequenceNumber == officialSequenceNumber) {
            return false;
        }
    }

    ChunkVersion version = ChunkVersion(0, 0, OID());
    if (manager) {
        version = manager->getVersion(shard->getId());
    }

    LOG(1) << "setting shard version of " << version << " for " << ns << " on shard "
           << shard->toString();

    LOG(3) << "last version sent with chunk manager iteration " << sequenceNumber
           << ", current chunk manager iteration is " << officialSequenceNumber;

    BSONObj result;
    if (setShardVersion(*conn,
                        ns,
                        grid.catalogManager()->connectionString().toString(),
                        version,
                        manager.get(),
                        authoritative,
                        result)) {
        LOG(1) << "      setShardVersion success: " << result;
        connectionShardStatus.setSequence(conn, ns, officialSequenceNumber);
        return true;
    }

    LOG(1) << "       setShardVersion failed!\n" << result << endl;

    if (result["need_authoritative"].trueValue())
        massert(10428, "need_authoritative set but in authoritative mode already", !authoritative);

    if (!authoritative) {
        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1);
        return true;
    }

    if (result["reloadConfig"].trueValue()) {
        if (result["version"].timestampTime() == Date_t()) {
            warning() << "reloading full configuration for " << conf->name()
                      << ", connection state indicates significant version changes";

            // reload db
            conf->reload();
        } else {
            // reload config
            conf->getChunkManager(ns, true);
        }
    }

    const int maxNumTries = 7;
    if (tryNumber < maxNumTries) {
        LOG(tryNumber < (maxNumTries / 2) ? 1 : 0)
            << "going to retry checkShardVersion shard: " << shard->toString() << " " << result;
        sleepmillis(10 * tryNumber);
        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
        return true;
    }

    string errmsg = str::stream() << "setShardVersion failed shard: " << shard->toString() << " "
                                  << result;
    log() << "     " << errmsg << endl;
    massert(10429, errmsg, 0);
    return true;
}