StatusWith<OpTime> ReplicationCoordinatorExternalStateImpl::loadLastOpTime(
            OperationContext* txn) {

        // TODO: handle WriteConflictExceptions below
        try {
            BSONObj oplogEntry;
            if (!Helpers::getLast(txn, rsOplogName.c_str(), oplogEntry)) {
                return StatusWith<OpTime>(
                        ErrorCodes::NoMatchingDocument,
                        str::stream() << "Did not find any entries in " << rsOplogName);
            }
            BSONElement tsElement = oplogEntry[tsFieldName];
            if (tsElement.eoo()) {
                return StatusWith<OpTime>(
                        ErrorCodes::NoSuchKey,
                        str::stream() << "Most recent entry in " << rsOplogName << " missing \"" <<
                        tsFieldName << "\" field");
            }
            if (tsElement.type() != bsonTimestamp) {
                return StatusWith<OpTime>(
                        ErrorCodes::TypeMismatch,
                        str::stream() << "Expected type of \"" << tsFieldName <<
                        "\" in most recent " << rsOplogName <<
                        " entry to have type Timestamp, but found " << typeName(tsElement.type()));
            }
            // TODO(siyuan) add term
            return StatusWith<OpTime>(OpTime(tsElement.timestamp(), 0));
        }
        catch (const DBException& ex) {
            return StatusWith<OpTime>(ex.toStatus());
        }
    }
Example #2
Status bsonExtractTimestampField(const BSONObj& object, StringData fieldName, Timestamp* out) {
    BSONElement element;
    Status status = bsonExtractTypedField(object, fieldName, bsonTimestamp, &element);
    if (status.isOK())
        *out = element.timestamp();
    return status;
}
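For orientation, here is a minimal usage sketch of this helper; the wrapper name, the "ts" field name, and the fallback behavior are illustrative assumptions rather than part of the example above.

// Hypothetical usage sketch: read an optional "ts" Timestamp, falling back to a
// default when the field is absent.
Timestamp readTsOrDefault(const BSONObj& doc, Timestamp defaultTs) {
    Timestamp ts;
    Status status = bsonExtractTimestampField(doc, "ts", &ts);
    if (status == ErrorCodes::NoSuchKey) {
        return defaultTs;  // field not present
    }
    uassertStatusOK(status);  // e.g. TypeMismatch if "ts" is not a Timestamp
    return ts;
}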
StatusWith<ShardingMetadata> ShardingMetadata::readFromMetadata(const BSONObj& metadataObj) {
    BSONElement smElem;
    auto smExtractStatus =
        bsonExtractTypedField(metadataObj, kGLEStatsFieldName, mongo::Object, &smElem);
    if (!smExtractStatus.isOK()) {
        return smExtractStatus;
    }

    if (smElem.embeddedObject().nFields() != 2) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << "The $gleStats object can only have 2 fields, but got "
                                    << smElem.embeddedObject().toString());
    }

    BSONElement lastOpTimeElem;
    auto lastOpTimeExtractStatus = bsonExtractTypedField(smElem.embeddedObject(),
                                                         kGLEStatsLastOpTimeFieldName,
                                                         mongo::bsonTimestamp,
                                                         &lastOpTimeElem);
    if (!lastOpTimeExtractStatus.isOK()) {
        return lastOpTimeExtractStatus;
    }

    BSONElement lastElectionIdElem;
    auto lastElectionIdExtractStatus = bsonExtractTypedField(
        smElem.embeddedObject(), kGLEStatsElectionIdFieldName, mongo::jstOID, &lastElectionIdElem);
    if (!lastElectionIdExtractStatus.isOK()) {
        return lastElectionIdExtractStatus;
    }

    return ShardingMetadata(lastOpTimeElem.timestamp(), lastElectionIdElem.OID());
}
Example #4
/**
 * data and len must be the arguments from RecordStore::insert() on an oplog collection.
 */
StatusWith<RecordId> extractKey(const char* data, int len) {
    DEV invariant(validateBSON(data, len).isOK());

    const BSONObj obj(data);
    const BSONElement elem = obj["ts"];
    if (elem.eoo())
        return StatusWith<RecordId>(ErrorCodes::BadValue, "no ts field");
    if (elem.type() != bsonTimestamp)
        return StatusWith<RecordId>(ErrorCodes::BadValue, "ts must be a Timestamp");

    return keyForOptime(elem.timestamp());
}
/**
 * data and len must be the arguments from RecordStore::insert() on an oplog collection.
 */
StatusWith<RecordId> extractKey(const char* data, int len) {
    // Use the latest BSON validation version. Oplog entries are allowed to contain decimal data
    // even if decimal is disabled.
    DEV invariant(validateBSON(data, len, BSONVersion::kLatest).isOK());

    const BSONObj obj(data);
    const BSONElement elem = obj["ts"];
    if (elem.eoo())
        return StatusWith<RecordId>(ErrorCodes::BadValue, "no ts field");
    if (elem.type() != bsonTimestamp)
        return StatusWith<RecordId>(ErrorCodes::BadValue, "ts must be a Timestamp");

    return keyForOptime(elem.timestamp());
}
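As a hedged illustration of how extractKey is typically driven, the sketch below builds a small oplog-style document and asks for its record key; the document contents are made up, and the comment about key layout describes the usual optime packing rather than a guaranteed contract.

// Hypothetical usage sketch: derive the RecordId key for an oplog-style entry.
BSONObj entry = BSON("ts" << Timestamp(1510000000, 7) << "op" << "n");
StatusWith<RecordId> key = extractKey(entry.objdata(), entry.objsize());
if (key.isOK()) {
    // keyForOptime packs the timestamp's seconds and increment into the RecordId,
    // so oplog records sort in optime order.
}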
StatusWith<ShardingMetadata> ShardingMetadata::readFromMetadata(const BSONObj& metadataObj) {
    BSONElement smElem;
    auto smExtractStatus =
        bsonExtractTypedField(metadataObj, kGLEStatsFieldName, mongo::Object, &smElem);
    if (!smExtractStatus.isOK()) {
        return smExtractStatus;
    }

    if (smElem.embeddedObject().nFields() != 2) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << "The $gleStats object can only have 2 fields, but got "
                                    << smElem.embeddedObject().toString());
    }

    repl::OpTime opTime;
    const BSONElement opTimeElement = smElem.embeddedObject()[kGLEStatsLastOpTimeFieldName];
    if (opTimeElement.eoo()) {
        return Status(ErrorCodes::NoSuchKey, "lastOpTime field missing");
    } else if (opTimeElement.type() == bsonTimestamp) {
        opTime = repl::OpTime(opTimeElement.timestamp(), repl::OpTime::kUninitializedTerm);
    } else if (opTimeElement.type() == Date) {
        opTime = repl::OpTime(Timestamp(opTimeElement.date()), repl::OpTime::kUninitializedTerm);
    } else if (opTimeElement.type() == Object) {
        Status status =
            bsonExtractOpTimeField(smElem.embeddedObject(), kGLEStatsLastOpTimeFieldName, &opTime);
        if (!status.isOK()) {
            return status;
        }
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kGLEStatsLastOpTimeFieldName
                                    << "\" field in $gleStats metadata to have type "
                                       "Date, Timestamp, or Object, but found type "
                                    << typeName(opTimeElement.type()));
    }

    BSONElement lastElectionIdElem;
    auto lastElectionIdExtractStatus = bsonExtractTypedField(
        smElem.embeddedObject(), kGLEStatsElectionIdFieldName, mongo::jstOID, &lastElectionIdElem);
    if (!lastElectionIdExtractStatus.isOK()) {
        return lastElectionIdExtractStatus;
    }

    return ShardingMetadata(opTime, lastElectionIdElem.OID());
}
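The metadata shape this parser accepts can be sketched as below; the literal field names ($gleStats, lastOpTime, electionId) are assumptions inferred from the kGLEStats* constants used above, and the values are arbitrary.

// Hypothetical sketch of the metadata object this parser expects.
BSONObj metadataObj = BSON("$gleStats" << BSON("lastOpTime" << Timestamp(1510000000, 3)
                                               << "electionId" << OID::gen()));
auto swShardingMetadata = ShardingMetadata::readFromMetadata(metadataObj);
// Fails with InvalidOptions if $gleStats does not hold exactly two fields, and with
// NoSuchKey / TypeMismatch if lastOpTime or electionId is missing or mistyped.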
Example #7
StatusWith<ChunkVersion> ChunkVersion::parseFromBSONWithFieldForCommands(const BSONObj& obj,
                                                                         StringData field) {
    BSONElement versionElem;
    Status status = bsonExtractField(obj, field, &versionElem);
    if (!status.isOK())
        return status;

    if (versionElem.type() != Array) {
        return {ErrorCodes::TypeMismatch,
                str::stream() << "Invalid type " << versionElem.type()
                              << " for shardVersion element. Expected an array"};
    }

    BSONObjIterator it(versionElem.Obj());
    if (!it.more())
        return {ErrorCodes::BadValue, "Unexpected empty version"};

    ChunkVersion version;

    // Expect the timestamp
    {
        BSONElement tsPart = it.next();
        if (tsPart.type() != bsonTimestamp)
            return {ErrorCodes::TypeMismatch,
                    str::stream() << "Invalid type " << tsPart.type()
                                  << " for version timestamp part."};

        version._combined = tsPart.timestamp().asULL();
    }

    // Expect the epoch OID
    {
        BSONElement epochPart = it.next();
        if (epochPart.type() != jstOID)
            return {ErrorCodes::TypeMismatch,
                    str::stream() << "Invalid type " << epochPart.type()
                                  << " for version epoch part."};

        version._epoch = epochPart.OID();
    }

    return version;
}
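A short sketch of the [timestamp, epoch] array layout this parser expects follows; the "shardVersion" field name and the values are illustrative only.

// Hypothetical sketch: parsing a shard version expressed as [Timestamp, OID].
BSONObj cmdObj = BSON("shardVersion" << BSON_ARRAY(Timestamp(5, 1) << OID::gen()));
auto swVersion = ChunkVersion::parseFromBSONWithFieldForCommands(cmdObj, "shardVersion");
// On success the packed timestamp yields major version 5 and minor version 1,
// since the combined value stores (major << 32) | minor.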
Example #8
FieldParser::FieldState FieldParser::extract(BSONElement elem,
                                             const BSONField<Timestamp>& field,
                                             Timestamp* out,
                                             string* errMsg) {
    if (elem.eoo()) {
        if (field.hasDefault()) {
            *out = field.getDefault();
            return FIELD_DEFAULT;
        } else {
            return FIELD_NONE;
        }
    }

    if (elem.type() == bsonTimestamp) {
        *out = elem.timestamp();
        return FIELD_SET;
    }

    _genFieldErrMsg(elem, field, "timestamp", errMsg);
    return FIELD_INVALID;
}
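A brief usage sketch of this extractor follows; the field name, default value, and document are hypothetical.

// Hypothetical usage sketch: extract an optional Timestamp field with a default.
BSONObj doc = BSON("electionTime" << Timestamp(42, 1));
BSONField<Timestamp> electionTimeField("electionTime", Timestamp());
Timestamp electionTime;
std::string errMsg;
FieldParser::FieldState state =
    FieldParser::extract(doc["electionTime"], electionTimeField, &electionTime, &errMsg);
// FIELD_SET here; FIELD_DEFAULT if the field were absent, FIELD_INVALID (with errMsg
// populated) if it had a non-Timestamp type such as Date.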
Example #9
    std::string runQuery(OperationContext* txn,
                         QueryMessage& q,
                         const NamespaceString& nss,
                         CurOp& curop,
                         Message &result) {
        // Validate the namespace.
        uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid());
        invariant(!nss.isCommand());

        // Set curop information.
        beginQueryOp(nss, q.query, q.ntoreturn, q.ntoskip, &curop);

        // Parse the qm into a CanonicalQuery.
        std::auto_ptr<CanonicalQuery> cq;
        {
            CanonicalQuery* cqRaw;
            Status canonStatus = CanonicalQuery::canonicalize(q,
                                                              &cqRaw,
                                                              WhereCallbackReal(txn, nss.db()));
            if (!canonStatus.isOK()) {
                uasserted(17287, str::stream() << "Can't canonicalize query: "
                                               << canonStatus.toString());
            }
            cq.reset(cqRaw);
        }
        invariant(cq.get());

        LOG(5) << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a plan executor.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() :
                                                   serverGlobalParams.defaultProfile;

        // We have a parsed query. Time to get the execution plan for it.
        std::unique_ptr<PlanExecutor> exec;
        {
            PlanExecutor* rawExec;
            Status execStatus = getExecutorFind(txn,
                                                collection,
                                                nss,
                                                cq.release(),
                                                PlanExecutor::YIELD_AUTO,
                                                &rawExec);
            uassertStatusOK(execStatus);
            exec.reset(rawExec);
        }
        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // If it's actually an explain, do the explain and return rather than falling through
        // to the normal query execution loop.
        if (pq.isExplain()) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder explainBob;
            Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

            // Add the resulting object to the return buffer.
            BSONObj explainObj = explainBob.obj();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            // Set query result fields.
            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(nss.ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        bool slaveOK = pq.isSlaveOk() || pq.hasReadPref();
        Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                slaveOK);
        uassertStatusOK(serveReadsStatus);

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult::Value));

        // How many results have we obtained from the executor?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        Timestamp slaveReadTill;

        BSONObj obj;
        PlanExecutor::ExecState state;
        // uint64_t numMisplacedDocs = 0;

        // Get summary info about which plan the executor is using.
        curop.debug().planSummary = Explain::getPlanSummary(exec.get());

        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.isOplogReplay()) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || bsonTimestamp == e.type()) {
                    slaveReadTill = e.timestamp();
                }
            }

            if (enoughForFirstBatch(pq, numResults, bb.len())) {
                LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                break;
            }
        }

        // If we cache the executor later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the executor later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the executor.
        exec->deregisterExec();

        // Caller expects exceptions thrown in certain cases.
        if (PlanExecutor::FAILURE == state) {
            scoped_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error, stats: "
                    << Explain::statsToBSON(*stats);
            uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
        }

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the
        // chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState.getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // If the version changed during the query we might be missing some data, and it's
            // safe to send this since mongos can resend at this point.
            throw SendStaleConfigException(nss.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(nss.ns()));
        }

        // Fill out curop based on query results. If we have a cursorid, we will fill out curop with
        // this cursorid later.
        long long ccId = 0;

        if (shouldSaveCursor(txn, collection, state, exec.get())) {
            // We won't use the executor until it's getMore'd.
            exec->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection->getCursorManager(),
                                                exec.release(),
                                                nss.ns(),
                                                pq.getOptions(),
                                                pq.getFilter());
            ccId = cc->cursorid();

            if (txn->getClient()->isInDirectClient()) {
                cc->setUnownedRecoveryUnit(txn->recoveryUnit());
            }
            else if (state == PlanExecutor::IS_EOF && pq.isTailable()) {
                // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their
                // next getMore.
            }
            else {
                // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
                // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
                txn->recoveryUnit()->abandonSnapshot();
                cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
                StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
                invariant(txn->setRecoveryUnit(storageEngine->newRecoveryUnit(),
                                               OperationContext::kNotInUnitOfWork)
                          == OperationContext::kNotInUnitOfWork);
            }

            LOG(5) << "caching executor with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // TODO document
            if (pq.isOplogReplay() && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.isExhaust()) {
                curop.debug().exhaust = true;
            }

            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

            endQueryOp(cc->getExecutor(), dbProfilingLevel, numResults, ccId, &curop);
        }
        else {
            LOG(5) << "Not caching executor but returning " << numResults << " results.\n";
            endQueryOp(exec.get(), dbProfilingLevel, numResults, ccId, &curop);
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult::View qr = result.header().view2ptr();
        qr.setCursorId(ccId);
        qr.setResultFlagsToOk();
        qr.msgdata().setOperation(opReply);
        qr.setStartingFrom(0);
        qr.setNReturned(numResults);

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? nss.ns() : "";
    }
Example #10
    /**
     * Called by db/instance.cpp.  This is the getMore entry point.
     *
     * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls 
     *        when this method returns an empty result, incrementing pass on each call.  
     *        Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
     */
    QueryResult::View getMore(OperationContext* txn,
                              const char* ns,
                              int ntoreturn,
                              long long cursorid,
                              CurOp& curop,
                              int pass,
                              bool& exhaust,
                              bool* isCursorAuthorized) {

        // For testing, we may want to fail if we receive a getmore.
        if (MONGO_FAIL_POINT(failReceivedGetmore)) {
            invariant(0);
        }

        exhaust = false;

        const NamespaceString nss(ns);

        // Depending on the type of cursor being operated on, we hold locks for the whole getMore,
        // or none of the getMore, or part of the getMore.  The three cases in detail:
        //
        // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
        // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors don't
        //    own any collection state.
        // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
        //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
        //    release), but the pin and unpin of the cursor must occur under the collection lock.
        //    We don't use our AutoGetCollectionForRead "ctx" to relock, because
        //    AutoGetCollectionForRead checks the sharding version (and we want the relock for the
        //    unpin to succeed even if the sharding version has changed).
        //
        // Note that we declare our locks before our ClientCursorPin, in order to ensure that the
        // pin's destructor is called before the lock destructors (so that the unpin occurs under
        // the lock).
        boost::scoped_ptr<AutoGetCollectionForRead> ctx;
        boost::scoped_ptr<Lock::DBLock> unpinDBLock;
        boost::scoped_ptr<Lock::CollectionLock> unpinCollLock;

        CursorManager* cursorManager;
        CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
        if (globalCursorManager->ownsCursorId(cursorid)) {
            cursorManager = globalCursorManager;
        }
        else {
            ctx.reset(new AutoGetCollectionForRead(txn, nss));
            Collection* collection = ctx->getCollection();
            uassert( 17356, "collection dropped between getMore calls", collection );
            cursorManager = collection->getCursorManager();
        }

        LOG(5) << "Running getMore, cursorid: " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                true);
        uassertStatusOK(status);

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorManager, cursorid);
        ClientCursor* cc = ccPin.c();

        // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
        // used to create the execution machinery inside the cursor with our OperationContext.
        // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
        // that further calls to getMore won't fail, and that the provided OperationContext
        // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
        //
        // This must be destroyed before the ClientCursor is destroyed.
        std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        const int InitialBufSize =
            512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(InitialBufSize);
        bb.skip(sizeof(QueryResult::Value));

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Check for spoofing of the ns such that it does not match the one originally
            // there for the cursor.
            uassert(ErrorCodes::Unauthorized,
                    str::stream() << "Requested getMore on namespace " << ns << ", but cursor "
                                  << cursorid << " belongs to namespace " << cc->ns(),
                    ns == cc->ns());
            *isCursorAuthorized = true;

            // Restore the RecoveryUnit if we need to.
            if (txn->getClient()->isInDirectClient()) {
                if (cc->hasRecoveryUnit())
                    invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
            }
            else {
                if (!cc->hasRecoveryUnit()) {
                    // Start using a new RecoveryUnit
                    cc->setOwnedRecoveryUnit(
                        getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit());

                }
                // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
                ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
            }

            // Reset timeout timer on the cursor since the cursor is still in use.
            cc->setIdleTime(0);

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (0 == pass) { 
                cc->updateSlaveLocation(txn); 
            }

            if (cc->isAggCursor()) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            // If we're replaying the oplog, we save the last time that we read.
            Timestamp slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            PlanExecutor* exec = cc->getExecutor();
            const int queryOptions = cc->queryOptions();

            // Get results out of the executor.
            exec->restoreState(txn);

            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || bsonTimestamp == e.type()) {
                        slaveReadTill = e.timestamp();
                    }
                }

                if (enoughForGetMore(ntoreturn, numResults, bb.len())) {
                    break;
                }
            }

            if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
                // Propagate this error to caller.
                if (PlanExecutor::FAILURE == state) {
                    scoped_ptr<PlanStageStats> stats(exec->getStats());
                    error() << "Plan executor error, stats: "
                            << Explain::statsToBSON(*stats);
                    uasserted(17406, "getMore executor error: " +
                              WorkingSetCommon::toStatusString(obj));
                }

                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                //
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }

            const bool shouldSaveCursor =
                    shouldSaveCursorGetMore(state, exec, isCursorTailable(cc));

            // In order to deregister a cursor, we need to be holding the DB + collection lock and
            // if the cursor is aggregation, we release these locks.
            if (cc->isAggCursor()) {
                invariant(NULL == ctx.get());
                unpinDBLock.reset(new Lock::DBLock(txn->lockState(), nss.db(), MODE_IS));
                unpinCollLock.reset(new Lock::CollectionLock(txn->lockState(), nss.ns(), MODE_IS));
            }

            // Our two possible ClientCursorPin cleanup paths are:
            // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin.
            // 2) If the cursor is going to be saved, we simply let the pin go out of scope.  In
            //    this case, the pin's destructor will be invoked, which will call release() on the
            //    pin.  Because our ClientCursorPin is declared after our lock is declared, this
            //    will happen under the lock.
            if (!shouldSaveCursor) {
                ruSwapper.reset();
                ccPin.deleteUnderlying();

                // cc is now invalid, as is the executor
                cursorid = 0;
                cc = NULL;
                curop.debug().cursorExhausted = true;

                LOG(5) << "getMore NOT saving client cursor, ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                exec->saveState();
                LOG(5) << "getMore saving client cursor ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;

                if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                    if (!txn->getClient()->isInDirectClient()) {
                        // Don't stash the RU. Get a new one on the next getMore.
                        ruSwapper->dismiss();
                    }

                    if ((queryOptions & QueryOption_AwaitData)
                            && (numResults == 0)
                            && (pass < 1000)) {
                        // Bubble up to the AwaitData handling code in receivedGetMore which will
                        // try again.
                        return NULL;
                    }
                }

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult::View qr = bb.buf();
        qr.msgdata().setLen(bb.len());
        qr.msgdata().setOperation(opReply);
        qr.setResultFlags(resultFlags);
        qr.setCursorId(cursorid);
        qr.setStartingFrom(startingResult);
        qr.setNReturned(numResults);
        bb.decouple();
        LOG(5) << "getMore returned " << numResults << " results\n";
        return qr;
    }
template <typename T>
void BSONComparatorInterfaceBase<T>::hashCombineBSONElement(
    size_t& hash,
    BSONElement elemToHash,
    bool considerFieldName,
    const StringData::ComparatorInterface* stringComparator) {
    boost::hash_combine(hash, elemToHash.canonicalType());

    const StringData fieldName = elemToHash.fieldNameStringData();
    if (considerFieldName && !fieldName.empty()) {
        SimpleStringDataComparator::kInstance.hash_combine(hash, fieldName);
    }

    switch (elemToHash.type()) {
        // Order of types is the same as in compareElementValues().

        case mongo::EOO:
        case mongo::Undefined:
        case mongo::jstNULL:
        case mongo::MaxKey:
        case mongo::MinKey:
            // These are valueless types
            break;

        case mongo::Bool:
            boost::hash_combine(hash, elemToHash.boolean());
            break;

        case mongo::bsonTimestamp:
            boost::hash_combine(hash, elemToHash.timestamp().asULL());
            break;

        case mongo::Date:
            boost::hash_combine(hash, elemToHash.date().asInt64());
            break;

        case mongo::NumberDecimal: {
            const Decimal128 dcml = elemToHash.numberDecimal();
            if (dcml.toAbs().isGreater(Decimal128(std::numeric_limits<double>::max(),
                                                  Decimal128::kRoundTo34Digits,
                                                  Decimal128::kRoundTowardZero)) &&
                !dcml.isInfinite() && !dcml.isNaN()) {
                // Normalize our decimal to force equivalent decimals
                // in the same cohort to hash to the same value
                Decimal128 dcmlNorm(dcml.normalize());
                boost::hash_combine(hash, dcmlNorm.getValue().low64);
                boost::hash_combine(hash, dcmlNorm.getValue().high64);
                break;
            }
            // Else, fall through and convert the decimal to a double and hash.
            // At this point the decimal fits into the range of doubles, is infinity, or is NaN,
            // which doubles have a cheaper representation for.
        }
        case mongo::NumberDouble:
        case mongo::NumberLong:
        case mongo::NumberInt: {
            // This converts all numbers to doubles, which ignores the low-order bits of
            // NumberLongs > 2**53 and precise decimal numbers without double representations,
            // but that is ok since the hash will still be the same for equal numbers and is
            // still likely to be different for different numbers. (Note: this issue only
            // applies for decimals when they are outside of the valid double range. See
            // the above case.)
            // SERVER-16851
            const double dbl = elemToHash.numberDouble();
            if (std::isnan(dbl)) {
                boost::hash_combine(hash, std::numeric_limits<double>::quiet_NaN());
            } else {
                boost::hash_combine(hash, dbl);
            }
            break;
        }

        case mongo::jstOID:
            elemToHash.__oid().hash_combine(hash);
            break;

        case mongo::String: {
            if (stringComparator) {
                stringComparator->hash_combine(hash, elemToHash.valueStringData());
            } else {
                SimpleStringDataComparator::kInstance.hash_combine(hash,
                                                                   elemToHash.valueStringData());
            }
            break;
        }

        case mongo::Code:
        case mongo::Symbol:
            SimpleStringDataComparator::kInstance.hash_combine(hash, elemToHash.valueStringData());
            break;

        case mongo::Object:
        case mongo::Array:
            hashCombineBSONObj(hash,
                               elemToHash.embeddedObject(),
                               true,  // considerFieldName
                               stringComparator);
            break;

        case mongo::DBRef:
        case mongo::BinData:
            // All bytes of the value are required to be identical.
            SimpleStringDataComparator::kInstance.hash_combine(
                hash, StringData(elemToHash.value(), elemToHash.valuesize()));
            break;

        case mongo::RegEx:
            SimpleStringDataComparator::kInstance.hash_combine(hash, elemToHash.regex());
            SimpleStringDataComparator::kInstance.hash_combine(hash, elemToHash.regexFlags());
            break;

        case mongo::CodeWScope: {
            SimpleStringDataComparator::kInstance.hash_combine(
                hash, StringData(elemToHash.codeWScopeCode(), elemToHash.codeWScopeCodeLen()));
            hashCombineBSONObj(hash,
                               elemToHash.codeWScopeObject(),
                               true,  // considerFieldName
                               &SimpleStringDataComparator::kInstance);
            break;
        }
    }
}
Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc, long long term) {
    // Old versions set this even though they returned not "ok"
    _mismatch = doc[kMismatchFieldName].trueValue();
    if (_mismatch)
        return Status(ErrorCodes::InconsistentReplicaSetNames, "replica set name doesn't match.");

    // Old versions sometimes set the replica set name ("set") but ok:0
    const BSONElement replSetNameElement = doc[kReplSetFieldName];
    if (replSetNameElement.eoo()) {
        _setName.clear();
    } else if (replSetNameElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kReplSetFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found "
                                    << typeName(replSetNameElement.type()));
    } else {
        _setName = replSetNameElement.String();
    }

    if (_setName.empty() && !doc[kOkFieldName].trueValue()) {
        std::string errMsg = doc[kErrMsgFieldName].str();

        BSONElement errCodeElem = doc[kErrorCodeFieldName];
        if (errCodeElem.ok()) {
            if (!errCodeElem.isNumber())
                return Status(ErrorCodes::BadValue, "Error code is not a number!");

            int errorCode = errCodeElem.numberInt();
            return Status(ErrorCodes::Error(errorCode), errMsg);
        }
        return Status(ErrorCodes::UnknownError, errMsg);
    }

    const BSONElement hasDataElement = doc[kHasDataFieldName];
    _hasDataSet = !hasDataElement.eoo();
    _hasData = hasDataElement.trueValue();

    const BSONElement electionTimeElement = doc[kElectionTimeFieldName];
    if (electionTimeElement.eoo()) {
        _electionTimeSet = false;
    } else if (electionTimeElement.type() == bsonTimestamp) {
        _electionTimeSet = true;
        _electionTime = electionTimeElement.timestamp();
    } else if (electionTimeElement.type() == Date) {
        _electionTimeSet = true;
        _electionTime = Timestamp(electionTimeElement.date());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kElectionTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(electionTimeElement.type()));
    }

    const BSONElement timeElement = doc[kTimeFieldName];
    if (timeElement.eoo()) {
        _timeSet = false;
    } else if (timeElement.isNumber()) {
        _timeSet = true;
        _time = Seconds(timeElement.numberLong());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have a numeric type, but found type "
                                    << typeName(timeElement.type()));
    }

    _isReplSet = doc[kIsReplSetFieldName].trueValue();

    Status termStatus = bsonExtractIntegerField(doc, kTermFieldName, &_term);
    if (!termStatus.isOK() && termStatus != ErrorCodes::NoSuchKey) {
        return termStatus;
    }

    // In order to support both the 3.0(V0) and 3.2(V1) heartbeats we must parse the OpTime
    // field based on its type. If it is a Date, we parse it as the timestamp and use
    // initialize's term argument to complete the OpTime type. If it is an Object, then it's
    // V1 and we construct an OpTime out of its nested fields.
    const BSONElement opTimeElement = doc[kOpTimeFieldName];
    if (opTimeElement.eoo()) {
        _opTimeSet = false;
    } else if (opTimeElement.type() == bsonTimestamp) {
        _opTimeSet = true;
        _opTime = OpTime(opTimeElement.timestamp(), term);
    } else if (opTimeElement.type() == Date) {
        _opTimeSet = true;
        _opTime = OpTime(Timestamp(opTimeElement.date()), term);
    } else if (opTimeElement.type() == Object) {
        Status status = bsonExtractOpTimeField(doc, kOpTimeFieldName, &_opTime);
        if (!status.isOK()) {
            return status;
        }
        _opTimeSet = true;
        // Since a v1 OpTime was in the response, the member must be part of a replica set.
        _isReplSet = true;
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kOpTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(opTimeElement.type()));
    }

    const BSONElement electableElement = doc[kIsElectableFieldName];
    if (electableElement.eoo()) {
        _electableSet = false;
    } else {
        _electableSet = true;
        _electable = electableElement.trueValue();
    }

    const BSONElement memberStateElement = doc[kMemberStateFieldName];
    if (memberStateElement.eoo()) {
        _stateSet = false;
    } else if (memberStateElement.type() != NumberInt && memberStateElement.type() != NumberLong) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream()
                          << "Expected \"" << kMemberStateFieldName
                          << "\" field in response to replSetHeartbeat "
                             "command to have type NumberInt or NumberLong, but found type "
                          << typeName(memberStateElement.type()));
    } else {
        long long stateInt = memberStateElement.numberLong();
        if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
            return Status(ErrorCodes::BadValue,
                          str::stream()
                              << "Value for \"" << kMemberStateFieldName
                              << "\" in response to replSetHeartbeat is "
                                 "out of range; legal values are non-negative and no more than "
                              << MemberState::RS_MAX);
        }
        _stateSet = true;
        _state = MemberState(static_cast<int>(stateInt));
    }

    _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue();


    // Not required for the case of uninitialized members -- they have no config
    const BSONElement configVersionElement = doc[kConfigVersionFieldName];

    // If we have an optime then we must have a configVersion
    if (_opTimeSet && configVersionElement.eoo()) {
        return Status(ErrorCodes::NoSuchKey,
                      str::stream() << "Response to replSetHeartbeat missing required \""
                                    << kConfigVersionFieldName
                                    << "\" field even though initialized");
    }

    // If there is a "v" (config version) then it must be an int.
    if (!configVersionElement.eoo() && configVersionElement.type() != NumberInt) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigVersionFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type NumberInt, but found "
                                    << typeName(configVersionElement.type()));
    }
    _configVersion = configVersionElement.numberInt();

    const BSONElement hbMsgElement = doc[kHbMessageFieldName];
    if (hbMsgElement.eoo()) {
        _hbmsg.clear();
    } else if (hbMsgElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kHbMessageFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found " << typeName(hbMsgElement.type()));
    } else {
        _hbmsg = hbMsgElement.String();
    }

    const BSONElement syncingToElement = doc[kSyncSourceFieldName];
    if (syncingToElement.eoo()) {
        _syncingTo = HostAndPort();
    } else if (syncingToElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kSyncSourceFieldName
                                    << "\" field in response to replSetHeartbeat to "
                                       "have type String, but found "
                                    << typeName(syncingToElement.type()));
    } else {
        _syncingTo = HostAndPort(syncingToElement.String());
    }

    const BSONElement rsConfigElement = doc[kConfigFieldName];
    if (rsConfigElement.eoo()) {
        _configSet = false;
        _config = ReplicaSetConfig();
        return Status::OK();
    } else if (rsConfigElement.type() != Object) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigFieldName
                                    << "\" in response to replSetHeartbeat to have type "
                                       "Object, but found " << typeName(rsConfigElement.type()));
    }
    _configSet = true;

    return _config.initialize(rsConfigElement.Obj());
}
bool BatchedCommandResponse::parseBSON(const BSONObj& source, string* errMsg) {
    clear();

    std::string dummy;
    if (!errMsg)
        errMsg = &dummy;

    FieldParser::FieldState fieldState;
    fieldState = FieldParser::extractNumber(source, ok, &_ok, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID)
        return false;
    _isOkSet = fieldState == FieldParser::FIELD_SET;

    fieldState = FieldParser::extract(source, errCode, &_errCode, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID)
        return false;
    _isErrCodeSet = fieldState == FieldParser::FIELD_SET;

    fieldState = FieldParser::extract(source, errMessage, &_errMessage, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID)
        return false;
    _isErrMessageSet = fieldState == FieldParser::FIELD_SET;

    // We're using appendNumber on generation so we'll try a smaller type
    // (int) first and then fall back to the original type (long long).
    BSONField<int> fieldN(n());
    int tempN;
    fieldState = FieldParser::extract(source, fieldN, &tempN, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID) {
        // try falling back to a larger type
        fieldState = FieldParser::extract(source, n, &_n, errMsg);
        if (fieldState == FieldParser::FIELD_INVALID)
            return false;
        _isNSet = fieldState == FieldParser::FIELD_SET;
    } else if (fieldState == FieldParser::FIELD_SET) {
        _isNSet = true;
        _n = tempN;
    }

    // We're using appendNumber on generation so we'll try a smaller type
    // (int) first and then fall back to the original type (long long).
    BSONField<int> fieldNModified(nModified());
    int intNModified;
    fieldState = FieldParser::extract(source, fieldNModified, &intNModified, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID) {
        // try falling back to a larger type
        fieldState = FieldParser::extract(source, nModified, &_nModified, errMsg);
        if (fieldState == FieldParser::FIELD_INVALID)
            return false;
        _isNModifiedSet = fieldState == FieldParser::FIELD_SET;
    } else if (fieldState == FieldParser::FIELD_SET) {
        _isNModifiedSet = true;
        _nModified = intNModified;
    }

    std::vector<BatchedUpsertDetail*>* tempUpsertDetails = NULL;
    fieldState = FieldParser::extract(source, upsertDetails, &tempUpsertDetails, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID)
        return false;
    _upsertDetails.reset(tempUpsertDetails);

    const BSONElement opTimeElement = source["opTime"];
    _isLastOpSet = true;
    if (opTimeElement.eoo()) {
        _isLastOpSet = false;
    } else if (opTimeElement.type() == bsonTimestamp) {
        _lastOp = repl::OpTime(opTimeElement.timestamp(), repl::OpTime::kUninitializedTerm);
    } else if (opTimeElement.type() == Date) {
        _lastOp = repl::OpTime(Timestamp(opTimeElement.date()), repl::OpTime::kUninitializedTerm);
    } else if (opTimeElement.type() == Object) {
        Status status = bsonExtractOpTimeField(source, "opTime", &_lastOp);
        if (!status.isOK()) {
            return false;
        }
    } else {
        return false;
    }

    fieldState = FieldParser::extract(source, electionId, &_electionId, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID)
        return false;
    _isElectionIdSet = fieldState == FieldParser::FIELD_SET;

    std::vector<WriteErrorDetail*>* tempErrDetails = NULL;
    fieldState = FieldParser::extract(source, writeErrors, &tempErrDetails, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID)
        return false;
    _writeErrorDetails.reset(tempErrDetails);

    WCErrorDetail* wcError = NULL;
    fieldState = FieldParser::extract(source, writeConcernError, &wcError, errMsg);
    if (fieldState == FieldParser::FIELD_INVALID)
        return false;
    _wcErrDetails.reset(wcError);

    return true;
}
Example #14
std::string runQuery(OperationContext* txn,
                     QueryMessage& q,
                     const NamespaceString& nss,
                     Message& result) {
    CurOp& curop = *CurOp::get(txn);

    uassert(ErrorCodes::InvalidNamespace,
            str::stream() << "Invalid ns [" << nss.ns() << "]",
            nss.isValid());
    invariant(!nss.isCommand());

    // Set curop information.
    beginQueryOp(txn, nss, q.query, q.ntoreturn, q.ntoskip);

    // Parse the qm into a CanonicalQuery.

    auto statusWithCQ = CanonicalQuery::canonicalize(q, ExtensionsCallbackReal(txn, &nss));
    if (!statusWithCQ.isOK()) {
        uasserted(
            17287,
            str::stream() << "Can't canonicalize query: " << statusWithCQ.getStatus().toString());
    }
    unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
    invariant(cq.get());

    LOG(5) << "Running query:\n" << cq->toString();
    LOG(2) << "Running query: " << cq->toStringShort();

    // Parse, canonicalize, plan, transcribe, and get a plan executor.
    AutoGetCollectionForRead ctx(txn, nss);
    Collection* collection = ctx.getCollection();

    const int dbProfilingLevel =
        ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

    // We have a parsed query. Time to get the execution plan for it.
    std::unique_ptr<PlanExecutor> exec = uassertStatusOK(
        getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO));

    const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

    // If it's actually an explain, do the explain and return rather than falling through
    // to the normal query execution loop.
    if (pq.isExplain()) {
        BufBuilder bb;
        bb.skip(sizeof(QueryResult::Value));

        BSONObjBuilder explainBob;
        Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

        // Add the resulting object to the return buffer.
        BSONObj explainObj = explainBob.obj();
        bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

        // TODO: Does this get overwritten/do we really need to set this twice?
        curop.debug().query = q.query;

        // Set query result fields.
        QueryResult::View qr = bb.buf();
        bb.decouple();
        qr.setResultFlagsToOk();
        qr.msgdata().setLen(bb.len());
        curop.debug().responseLength = bb.len();
        qr.msgdata().setOperation(opReply);
        qr.setCursorId(0);
        qr.setStartingFrom(0);
        qr.setNReturned(1);
        result.setData(qr.view2ptr(), true);
        return "";
    }

    // Handle query option $maxTimeMS (not used with commands).
    curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
    txn->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

    // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
    bool slaveOK = pq.isSlaveOk() || pq.hasReadPref();
    Status serveReadsStatus =
        repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, slaveOK);
    uassertStatusOK(serveReadsStatus);

    // Run the query.
    // bb is used to hold query results
    // this buffer should contain either requested documents per query or
    // explain information, but not both
    BufBuilder bb(FindCommon::kInitReplyBufferSize);
    bb.skip(sizeof(QueryResult::Value));

    // How many results have we obtained from the executor?
    int numResults = 0;

    // If we're replaying the oplog, we save the last time that we read.
    Timestamp slaveReadTill;

    BSONObj obj;
    PlanExecutor::ExecState state;

    // Get summary info about which plan the executor is using.
    {
        stdx::lock_guard<Client> lk(*txn->getClient());
        curop.setPlanSummary_inlock(Explain::getPlanSummary(exec.get()));
    }

    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
        // If we can't fit this result inside the current batch, then we stash it for later.
        if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) {
            exec->enqueue(obj);
            break;
        }

        // Add result to output buffer.
        bb.appendBuf((void*)obj.objdata(), obj.objsize());

        // Count the result.
        ++numResults;

        // Possibly note slave's position in the oplog.
        if (pq.isOplogReplay()) {
            BSONElement e = obj["ts"];
            if (Date == e.type() || bsonTimestamp == e.type()) {
                slaveReadTill = e.timestamp();
            }
        }

        if (FindCommon::enoughForFirstBatch(pq, numResults)) {
            LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore()
                   << " ntoreturn=" << pq.getNToReturn().value_or(0) << " numResults=" << numResults
                   << endl;
            break;
        }
    }

    // If we cache the executor later, we want to deregister it as it receives notifications
    // anyway by virtue of being cached.
    //
    // If we don't cache the executor later, we are deleting it, so it must be deregistered.
    //
    // So, no matter what, deregister the executor.
    exec->deregisterExec();

    // Caller expects exceptions thrown in certain cases.
    if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
        error() << "Plan executor error during find: " << PlanExecutor::statestr(state)
                << ", stats: " << Explain::getWinningPlanStats(exec.get());
        uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
    }

    // Before saving the cursor, ensure that whatever plan we established happened with the expected
    // collection version
    auto css = CollectionShardingState::get(txn, nss);
    css->checkShardVersionOrThrow(txn);

    // Fill out curop based on query results. If we have a cursorid, we will fill out curop with
    // this cursorid later.
    long long ccId = 0;

    if (shouldSaveCursor(txn, collection, state, exec.get())) {
        // We won't use the executor until it's getMore'd.
        exec->saveState();
        exec->detachFromOperationContext();

        // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
        // inserted into a global map by its ctor.
        ClientCursor* cc =
            new ClientCursor(collection->getCursorManager(),
                             exec.release(),
                             nss.ns(),
                             txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                             pq.getOptions(),
                             pq.getFilter());
        ccId = cc->cursorid();

        LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults
               << " results" << endl;

        // For oplog replay queries, record on the cursor the last oplog timestamp this client
        // has read, so the server can track the client's replication progress.
        if (pq.isOplogReplay() && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        // Exhaust cursors stream additional batches without waiting for getMore requests;
        // note this on the current operation for logging and profiling.
        if (pq.isExhaust()) {
            curop.debug().exhaust = true;
        }

        cc->setPos(numResults);

        // If the query had a time limit, remaining time is "rolled over" to the cursor (for
        // use by future getmore ops).
        cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

        endQueryOp(txn, collection, *cc->getExecutor(), dbProfilingLevel, numResults, ccId);
    } else {
        LOG(5) << "Not caching executor but returning " << numResults << " results.\n";
        endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, ccId);
    }

    // Add the results from the query into the output buffer.
    result.appendData(bb.buf(), bb.len());
    bb.decouple();

    // Fill out the output buffer's header.
    QueryResult::View qr = result.header().view2ptr();
    qr.setCursorId(ccId);
    qr.setResultFlagsToOk();
    qr.msgdata().setOperation(opReply);
    qr.setStartingFrom(0);
    qr.setNReturned(numResults);

    // curop.debug().exhaust is set above.
    return curop.debug().exhaust ? nss.ns() : "";
}
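
The batch-filling loop above follows a simple pattern: keep appending documents until either the reply-buffer byte budget or the requested batch size is reached, and stash the first document that does not fit so a later getMore can return it. A minimal standalone sketch of that pattern, using hypothetical stand-in types rather than the real PlanExecutor/FindCommon interfaces, might look like this:

#include <cstddef>
#include <optional>
#include <string>
#include <vector>

// Hypothetical stand-in for a serialized document.
struct Doc {
    std::string bytes;
    std::size_t size() const { return bytes.size(); }
};

// Fill one reply batch: stop when the byte budget or the document limit is reached.
// The first document that does not fit is stashed (analogous to exec->enqueue(obj))
// so that it can lead the next batch instead of being lost.
std::vector<Doc> fillBatch(std::vector<Doc>::const_iterator& it,
                           std::vector<Doc>::const_iterator end,
                           std::size_t byteBudget,
                           std::size_t maxDocs,
                           std::optional<Doc>& stashed) {
    std::vector<Doc> batch;
    std::size_t bytesUsed = 0;
    while (it != end && batch.size() < maxDocs) {
        // Always allow at least one document, even if it alone exceeds the budget.
        if (!batch.empty() && bytesUsed + it->size() > byteBudget) {
            stashed = *it++;
            break;
        }
        bytesUsed += it->size();
        batch.push_back(*it++);
    }
    return batch;
}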
int BSONElement::compareElements(const BSONElement& l,
                                 const BSONElement& r,
                                 ComparisonRulesSet rules,
                                 const StringData::ComparatorInterface* comparator) {
    switch (l.type()) {
        case BSONType::EOO:
        case BSONType::Undefined:  // EOO and Undefined are same canonicalType
        case BSONType::jstNULL:
        case BSONType::MaxKey:
        case BSONType::MinKey: {
            auto f = l.canonicalType() - r.canonicalType();
            if (f < 0)
                return -1;
            return f == 0 ? 0 : 1;
        }
        case BSONType::Bool:
            return *l.value() - *r.value();
        case BSONType::bsonTimestamp:
            // unsigned compare for timestamps - note they are not really dates but (ordinal +
            // time_t)
            if (l.timestamp() < r.timestamp())
                return -1;
            return l.timestamp() == r.timestamp() ? 0 : 1;
        case BSONType::Date:
            // Signed comparisons for Dates.
            {
                const Date_t a = l.Date();
                const Date_t b = r.Date();
                if (a < b)
                    return -1;
                return a == b ? 0 : 1;
            }

        case BSONType::NumberInt: {
            // All types can precisely represent all NumberInts, so it is safe to simply convert to
            // whatever rhs's type is.
            switch (r.type()) {
                case NumberInt:
                    return compareInts(l._numberInt(), r._numberInt());
                case NumberLong:
                    return compareLongs(l._numberInt(), r._numberLong());
                case NumberDouble:
                    return compareDoubles(l._numberInt(), r._numberDouble());
                case NumberDecimal:
                    return compareIntToDecimal(l._numberInt(), r._numberDecimal());
                default:
                    MONGO_UNREACHABLE;
            }
        }

        case BSONType::NumberLong: {
            switch (r.type()) {
                case NumberLong:
                    return compareLongs(l._numberLong(), r._numberLong());
                case NumberInt:
                    return compareLongs(l._numberLong(), r._numberInt());
                case NumberDouble:
                    return compareLongToDouble(l._numberLong(), r._numberDouble());
                case NumberDecimal:
                    return compareLongToDecimal(l._numberLong(), r._numberDecimal());
                default:
                    MONGO_UNREACHABLE;
            }
        }

        case BSONType::NumberDouble: {
            switch (r.type()) {
                case NumberDouble:
                    return compareDoubles(l._numberDouble(), r._numberDouble());
                case NumberInt:
                    return compareDoubles(l._numberDouble(), r._numberInt());
                case NumberLong:
                    return compareDoubleToLong(l._numberDouble(), r._numberLong());
                case NumberDecimal:
                    return compareDoubleToDecimal(l._numberDouble(), r._numberDecimal());
                default:
                    MONGO_UNREACHABLE;
            }
        }

        case BSONType::NumberDecimal: {
            switch (r.type()) {
                case NumberDecimal:
                    return compareDecimals(l._numberDecimal(), r._numberDecimal());
                case NumberInt:
                    return compareDecimalToInt(l._numberDecimal(), r._numberInt());
                case NumberLong:
                    return compareDecimalToLong(l._numberDecimal(), r._numberLong());
                case NumberDouble:
                    return compareDecimalToDouble(l._numberDecimal(), r._numberDouble());
                default:
                    MONGO_UNREACHABLE;
            }
        }

        case BSONType::jstOID:
            return memcmp(l.value(), r.value(), OID::kOIDSize);
        case BSONType::Code:
            return compareElementStringValues(l, r);
        case BSONType::Symbol:
        case BSONType::String: {
            if (comparator) {
                return comparator->compare(l.valueStringData(), r.valueStringData());
            } else {
                return compareElementStringValues(l, r);
            }
        }
        case BSONType::Object:
        case BSONType::Array: {
            return l.embeddedObject().woCompare(
                r.embeddedObject(),
                BSONObj(),
                rules | BSONElement::ComparisonRules::kConsiderFieldName,
                comparator);
        }
        case BSONType::DBRef: {
            int lsz = l.valuesize();
            int rsz = r.valuesize();
            if (lsz - rsz != 0)
                return lsz - rsz;
            return memcmp(l.value(), r.value(), lsz);
        }
        case BSONType::BinData: {
            int lsz = l.objsize();  // our bin data size in bytes, not including the subtype byte
            int rsz = r.objsize();
            if (lsz - rsz != 0)
                return lsz - rsz;
            return memcmp(l.value() + 4, r.value() + 4, lsz + 1 /*+1 for subtype byte*/);
        }
        case BSONType::RegEx: {
            int c = strcmp(l.regex(), r.regex());
            if (c)
                return c;
            return strcmp(l.regexFlags(), r.regexFlags());
        }
        case BSONType::CodeWScope: {
            int cmp = StringData(l.codeWScopeCode(), l.codeWScopeCodeLen() - 1)
                          .compare(StringData(r.codeWScopeCode(), r.codeWScopeCodeLen() - 1));
            if (cmp)
                return cmp;

            // When comparing the scope object, we should consider field names. Special string
            // comparison semantics do not apply to strings nested inside the CodeWScope scope
            // object, so we do not pass through the string comparator.
            return l.codeWScopeObject().woCompare(
                r.codeWScopeObject(),
                BSONObj(),
                rules | BSONElement::ComparisonRules::kConsiderFieldName);
        }
    }

    MONGO_UNREACHABLE;
}
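
The mixed numeric cases above delegate to helpers such as compareLongToDouble because simply casting one operand to the other's type can lose precision: a 64-bit integer above 2^53 has no exact double representation. The following standalone sketch (a hypothetical helper, not the actual mongo::compareLongToDouble) shows one way such a comparison can stay exact:

#include <cmath>
#include <cstdint>
#include <iostream>

// Hypothetical three-way compare of an int64 against a double that avoids the
// precision loss of casting the integer to double. Not the actual MongoDB helper.
int compareLongToDoubleSketch(int64_t lhs, double rhs) {
    if (std::isnan(rhs))
        return 1;  // here NaN sorts before all numbers (a policy choice for this sketch)

    // 2^63 is exactly representable as a double; anything at or beyond it is outside the
    // int64 range, so the comparison is decided by sign alone.
    constexpr double kTwoPow63 = 9223372036854775808.0;
    if (rhs >= kTwoPow63)
        return -1;
    if (rhs < -kTwoPow63)
        return 1;

    // rhs now truncates safely to int64. Compare the integer parts first, then use the
    // fractional remainder of rhs to break ties.
    const int64_t rhsTrunc = static_cast<int64_t>(rhs);
    if (lhs != rhsTrunc)
        return lhs < rhsTrunc ? -1 : 1;
    const double frac = rhs - static_cast<double>(rhsTrunc);
    if (frac > 0)
        return -1;
    if (frac < 0)
        return 1;
    return 0;
}

int main() {
    // 2^53 + 1 cannot be represented exactly as a double, yet the comparison stays exact.
    std::cout << compareLongToDoubleSketch((int64_t(1) << 53) + 1, 9007199254740992.0) << "\n";  // 1
    std::cout << compareLongToDoubleSketch(42, 42.5) << "\n";                                    // -1
    std::cout << compareLongToDoubleSketch(-7, -7.0) << "\n";                                    // 0
}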
    Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc) {

        // Old versions set this even though they returned not "ok"
        _mismatch = doc[kMismatchFieldName].trueValue();
        if (_mismatch)
            return Status(ErrorCodes::InconsistentReplicaSetNames,
                          "replica set name doesn't match.");

        // Old versions sometimes set the replica set name ("set") but ok:0
        const BSONElement replSetNameElement = doc[kReplSetFieldName];
        if (replSetNameElement.eoo()) {
            _setName.clear();
        }
        else if (replSetNameElement.type() != String) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kReplSetFieldName << "\" field in response to replSetHeartbeat to have "
                          "type String, but found " << typeName(replSetNameElement.type()));
        }
        else {
            _setName = replSetNameElement.String();
        }

        if (_setName.empty() && !doc[kOkFieldName].trueValue()) {
            std::string errMsg = doc[kErrMsgFieldName].str();

            BSONElement errCodeElem = doc[kErrorCodeFieldName];
            if (errCodeElem.ok()) {
                if (!errCodeElem.isNumber())
                    return Status(ErrorCodes::BadValue, "Error code is not a number!");

                int errorCode = errCodeElem.numberInt();
                return Status(ErrorCodes::Error(errorCode), errMsg);
            }
            return Status(ErrorCodes::UnknownError, errMsg);
        }

        const BSONElement hasDataElement = doc[kHasDataFieldName];
        _hasDataSet = !hasDataElement.eoo();
        _hasData = hasDataElement.trueValue();

        const BSONElement electionTimeElement = doc[kElectionTimeFieldName];
        if (electionTimeElement.eoo()) {
            _electionTimeSet = false;
        }
        else if (electionTimeElement.type() == bsonTimestamp) {
            _electionTimeSet = true;
            _electionTime = electionTimeElement.timestamp();
        }
        else if (electionTimeElement.type() == Date) {
            _electionTimeSet = true;
            _electionTime = Timestamp(electionTimeElement.date());
        }
        else {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kElectionTimeFieldName << "\" field in response to replSetHeartbeat "
                          "command to have type Date or Timestamp, but found type " <<
                          typeName(electionTimeElement.type()));
        }

        const BSONElement timeElement = doc[kTimeFieldName];
        if (timeElement.eoo()) {
            _timeSet = false;
        }
        else if (timeElement.isNumber()) {
            _timeSet = true;
            _time = Seconds(timeElement.numberLong());
        }
        else {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kTimeFieldName << "\" field in response to replSetHeartbeat "
                          "command to have a numeric type, but found type " <<
                          typeName(timeElement.type()));
        }

        const BSONElement opTimeElement = doc[kOpTimeFieldName];
        if (opTimeElement.eoo()) {
            _opTimeSet = false;
        }
        else if (opTimeElement.type() == bsonTimestamp) {
            _opTimeSet = true;
            _opTime = opTimeElement.timestamp();
        }
        else if (opTimeElement.type() == Date) {
            _opTimeSet = true;
            _opTime = Timestamp(opTimeElement.date());
        }
        else {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kOpTimeFieldName << "\" field in response to replSetHeartbeat "
                          "command to have type Date or Timestamp, but found type " <<
                          typeName(opTimeElement.type()));
        }

        const BSONElement electableElement = doc[kIsElectableFieldName];
        if (electableElement.eoo()) {
            _electableSet = false;
        }
        else {
            _electableSet = true;
            _electable = electableElement.trueValue();
        }

        _isReplSet = doc[kIsReplSetFieldName].trueValue();

        const BSONElement memberStateElement = doc[kMemberStateFieldName];
        if (memberStateElement.eoo()) {
            _stateSet = false;
        }
        else if (memberStateElement.type() != NumberInt &&
                 memberStateElement.type() != NumberLong) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kMemberStateFieldName << "\" field in response to replSetHeartbeat "
                          "command to have type NumberInt or NumberLong, but found type " <<
                          typeName(memberStateElement.type()));
        }
        else {
            long long stateInt = memberStateElement.numberLong();
            if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
                return Status(ErrorCodes::BadValue, str::stream() << "Value for \"" <<
                              kMemberStateFieldName << "\" in response to replSetHeartbeat is "
                              "out of range; legal values are non-negative and no more than " <<
                              MemberState::RS_MAX);
            }
            _stateSet = true;
            _state = MemberState(static_cast<int>(stateInt));
        }

        _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue();


        // Not required for the case of uninitialized members -- they have no config
        const BSONElement versionElement = doc[kConfigVersionFieldName];

        // If we have an optime then we must have a version
        if (_opTimeSet && versionElement.eoo()) {
            return Status(ErrorCodes::NoSuchKey, str::stream() <<
                          "Response to replSetHeartbeat missing required \"" <<
                          kConfigVersionFieldName << "\" field even though initialized");
        }

        // If there is a "v" (config version) then it must be an int.
        if (!versionElement.eoo() && versionElement.type() != NumberInt) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kConfigVersionFieldName <<
                          "\" field in response to replSetHeartbeat to have "
                          "type NumberInt, but found " << typeName(versionElement.type()));
        }
        _version = versionElement.numberInt();

        const BSONElement hbMsgElement = doc[kHbMessageFieldName];
        if (hbMsgElement.eoo()) {
            _hbmsg.clear();
        }
        else if (hbMsgElement.type() != String) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kHbMessageFieldName << "\" field in response to replSetHeartbeat to have "
                          "type String, but found " << typeName(hbMsgElement.type()));
        }
        else {
            _hbmsg = hbMsgElement.String();
        }

        const BSONElement syncingToElement = doc[kSyncSourceFieldName];
        if (syncingToElement.eoo()) {
            _syncingTo.clear();
        }
        else if (syncingToElement.type() != String) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kSyncSourceFieldName << "\" field in response to replSetHeartbeat to "
                          "have type String, but found " << typeName(syncingToElement.type()));
        }
        else {
            _syncingTo = syncingToElement.String();
        }

        const BSONElement rsConfigElement = doc[kConfigFieldName];
        if (rsConfigElement.eoo()) {
            _configSet = false;
            _config = ReplicaSetConfig();
            return Status::OK();
        }
        else if (rsConfigElement.type() != Object) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          kConfigFieldName << "\" in response to replSetHeartbeat to have type "
                          "Object, but found " << typeName(rsConfigElement.type()));
        }
        _configSet = true;
        return _config.initialize(rsConfigElement.Obj());
    }
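
Each optional field in initialize() above is handled with the same three-way pattern: absent means leave the member unset, present with the wrong BSON type means return TypeMismatch, otherwise record the value. Below is a hedged sketch of how that pattern could be factored into a small helper; the helper name and signature are hypothetical and merely reuse the same types seen in the snippet (Status, BSONObj, BSONElement, str::stream), so this is a refactoring sketch rather than part of the actual ReplSetHeartbeatResponse code.

    namespace {

    // Hypothetical helper: extract an optional string field with type checking,
    // mirroring the pattern repeated above for kHbMessageFieldName, kSyncSourceFieldName, etc.
    Status extractOptionalString(const BSONObj& doc,
                                 const std::string& fieldName,
                                 std::string* out,
                                 bool* isSet) {
        const BSONElement elem = doc[fieldName];
        if (elem.eoo()) {
            *isSet = false;
            out->clear();
            return Status::OK();
        }
        if (elem.type() != String) {
            return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" <<
                          fieldName << "\" field to have type String, but found " <<
                          typeName(elem.type()));
        }
        *isSet = true;
        *out = elem.String();
        return Status::OK();
    }

    }  // namespace

With such a helper, the _hbmsg and _syncingTo blocks above would each reduce to a single call plus a status check.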