Example #1
0
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        ReadPreferenceSetting readPreference(ReadPreference::PrimaryOnly, TagSet::primaryOnly());

        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // If the $explain flag was set, we must run the operation on the shards as an explain
        // command rather than a find command.
        if (canonicalQuery.getValue()->getParsed().isExplain()) {
            const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
            BSONObj findCommand = lpq.asFindCommand();

            // We default to allPlansExecution verbosity.
            auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

            const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
            rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

            BSONObjBuilder explainBuilder;
            uassertStatusOK(ClusterFind::runExplain(
                txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

            BSONObj explainObj = explainBuilder.done();
            replyToQuery(0,  // query result flags
                         request.p(),
                         request.m(),
                         static_cast<const void*>(explainObj.objdata()),
                         explainObj.objsize(),
                         1,  // numResults
                         0,  // startingFrom
                         CursorId(0));
            return;
        }

        // Do the work to generate the first batch of results. This blocks waiting to get responses
        // from the shard(s).
        std::vector<BSONObj> batch;

        // A cursor id of 0 means the cursor is exhausted. Otherwise we assume that a cursor
        // with the returned id can be retrieved via the ClusterCursorManager.
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        // Fill out the response buffer.
        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) {
        return;
    }

    ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo());
    verify(cursor);

    // TODO:  Move out to Request itself, not strategy based
    try {
        cursor->init(txn);

        if (qSpec.isExplain()) {
            BSONObjBuilder explain_builder;
            cursor->explain(explain_builder);
            explain_builder.appendNumber("executionTimeMillis",
                                         static_cast<long long>(queryTimer.millis()));
            BSONObj b = explain_builder.obj();

            replyToQuery(0, request.p(), request.m(), b);
            delete (cursor);
            return;
        }
    } catch (...) {
        delete cursor;
        throw;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache

    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around.
        unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor);

        ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId());
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        request.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
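
The read-preference handling in the useClusterClientCursor branch above looks for a read preference wrapped inside the query object itself. As a minimal sketch (my own illustration, assuming LiteParsedQuery::kWrappedReadPrefField names the "$readPreference" element) of the shape that the bsonExtractTypedField() call would match:

    // Illustrative only: a wrapped OP_QUERY filter carrying a read preference. The
    // extraction above pulls out the "$readPreference" sub-object and hands it to
    // ReadPreferenceSetting::fromBSON(); a plain filter without it falls back to the
    // default read preference via the NoSuchKey branch.
    BSONObj wrapped = BSON("$query" << BSON("x" << 1) << "$readPreference"
                                    << BSON("mode"
                                            << "secondaryPreferred"));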
Example #2
0
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) {
        // This is a read lock.
        Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner;
        // Takes ownership of cq.
        Status status = getRunner(cq, &rawRunner);
        if (!status.isOK()) {
            uasserted(17007, "Couldn't process query " + cq->toString()
                             + " why: " + status.reason());
        }
        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        QLOG() << "Running query on new system: " << cq->toString();

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // TODO: Remove when impl'd
        if (pq.hasOption(QueryOption_OplogReplay)) {
            warning() << "haven't implemented findingstartcursor yet\n";
        }

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        ClientCursor::registerRunner(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);
        auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety(
            new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

        BSONObj obj;
        Runner::RunnerState state;
        uint64_t numMisplacedDocs = 0;

        // Set this outside the loop. We will need it both within the loop and when deciding
        // whether to fill in explain information.
        const bool isExplain = pq.isExplain();

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded, make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                // This information can change if we yield, so we must make sure to re-fetch
                // it after yielding.
                KeyPattern kp(collMetadata->getKeyPattern());
                // This performs excessive BSONObj creation but that's OK for now.
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                    ++numMisplacedDocs;
                    continue;
                }
            }

            // Add result to output buffer. This is unnecessary if explain info is requested
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                // If only one result is requested, assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    saveClientCursor = true;
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases:
        // * in-memory sort using too much RAM.
        if (Runner::RUNNER_ERROR == state) {
            uasserted(17144, "Runner error, memory limit for sort probably exceeded");
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable) && (1 != pq.getNumToReturn())) {
            // If pq.hasOption(tailable) the only plan the planner will output is a collscan with
            // tailable set.
            saveClientCursor = true;
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // If the version changed during the query, we might be missing some data, and it's
            // safe to send this as mongos can resend at this point.
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        // Append explain information to query results by asking the runner to produce them.
        if (isExplain) {
            TypeExplain* bareExplain;
            Status res = runner->getExplainPlan(&bareExplain);

            if (!res.isOK()) {
                error() << "could not produce explain of query '" << pq.getFilter()
                        << "', error: " << res.reason();
                // If numResults and the data in bb don't correspond, we'll crash later when rooting
                // through the reply msg.
                BSONObj emptyObj;
                bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
                // The explain output is actually a result.
                numResults = 1;
                // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
            }
            else {
                boost::scoped_ptr<TypeExplain> explain(bareExplain);

                // Fill in the missing run-time fields in explain, starting with properties of
                // the process running the query.
                std::string server = mongoutils::str::stream()
                    << getHostNameCached() << ":" << serverGlobalParams.port;
                explain->setServer(server);

                // Fill in the number of documents consumed that were involved in an ongoing
                // (or aborted) migration.
                explain->setNChunkSkips(numMisplacedDocs);

                // We might have skipped some results due to chunk migration etc. so our count is
                // correct and explain's is not.
                explain->setN(numResults);

                // Clock the whole operation.
                explain->setMillis(curop.elapsedMillis());

                BSONObj explainObj = explain->toBSON();
                bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

                // The explain output is actually a result.
                numResults = 1;
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            QLOG() << "caching runner with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;
        // TODO: nscanned is bogus.
        // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
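
The result loop above defers its first-batch cutoff to enoughForFirstBatch(pq, numResults, bb.len()). A hedged sketch of the kind of check such a helper performs; the 101-document and one-megabyte limits are assumptions for illustration, not values read from this code base:

    // Sketch only, not the real enoughForFirstBatch(). When the client did not request a
    // specific batch size, the first batch is capped by a small document count and a byte
    // budget so that a cursor can be handed back quickly; otherwise ntoreturn and the
    // overall reply-size limit govern.
    static bool enoughForFirstBatchSketch(int ntoreturn, int numDocs, int bytesBuffered) {
        if (0 == ntoreturn) {
            return numDocs >= 101 || bytesBuffered >= 1024 * 1024;  // assumed limits
        }
        return numDocs >= ntoreturn || bytesBuffered >= MaxBytesToReturnToClientAtOnce;
    }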
Example #3
0
    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
        bool ok = true;
        MSGID responseTo = m.header()->id;

        DbMessage d(m);
        QueryMessage q(d);
        auto_ptr< Message > resp( new Message() );

        CurOp& op = *(c.curop());

        shared_ptr<AssertionException> ex;

        try {
            dbresponse.exhaust = runQuery(m, q, op, *resp);
            assert( !resp->empty() );
        }
        catch ( SendStaleConfigException& e ){
            ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) );
            ok = false;
        }
        catch ( AssertionException& e ) {
            ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
            ok = false;
        }

        if( ex ){

            op.debug().exceptionInfo = ex->getInfo();
            LOGWITHRATELIMIT {
                log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" <<
                (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
                if( q.ntoskip || q.ntoreturn )
                    log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
            }

            SendStaleConfigException* scex = NULL;
            if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() );

            BSONObjBuilder err;
            ex->getInfo().append( err );
            if( scex ) err.append( "ns", scex->getns() );
            BSONObj errObj = err.done();

            log() << errObj << endl;

            BufBuilder b;
            b.skip(sizeof(QueryResult));
            b.appendBuf((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            QueryResult * msgdata = (QueryResult *) b.buf();
            b.decouple();
            QueryResult *qr = msgdata;
            qr->_resultFlags() = ResultFlag_ErrSet;
            if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            resp.reset( new Message() );
            resp->setData( msgdata, true );

        }

        op.debug().responseLength = resp->header()->dataLen();

        dbresponse.response = resp.release();
        dbresponse.responseTo = responseTo;

        return ok;
    }
Example #4
0
        void operator()( DBClientCursorBatchIterator &i ) {
            Lock::GlobalWrite lk;
            context.relocked();

            bool createdCollection = false;
            Collection* collection = NULL;

            while( i.moreInCurrentBatch() ) {
                if ( numSeen % 128 == 127 /*yield some*/ ) {
                    collection = NULL;
                    time_t now = time(0);
                    if( now - lastLog >= 60 ) {
                        // report progress
                        if( lastLog )
                            log() << "clone " << to_collection << ' ' << numSeen << endl;
                        lastLog = now;
                    }
                    mayInterrupt( _mayBeInterrupted );
                    dbtempreleaseif t( _mayYield );
                }

                if ( isindex == false && collection == NULL ) {
                    collection = context.db()->getCollection( to_collection );
                    if ( !collection ) {
                        massert( 17321,
                                 str::stream()
                                 << "collection dropped during clone ["
                                 << to_collection << "]",
                                 !createdCollection );
                        createdCollection = true;
                        collection = context.db()->createCollection( txn, to_collection );
                        verify( collection );
                    }
                }

                BSONObj tmp = i.nextSafe();

                /* assure object is valid.  note this will slow us down a little. */
                const Status status = validateBSON(tmp.objdata(), tmp.objsize());
                if (!status.isOK()) {
                    out() << "Cloner: skipping corrupt object from " << from_collection
                          << ": " << status.reason();
                    continue;
                }

                ++numSeen;

                BSONObj js = tmp;
                if ( isindex ) {
                    verify(nsToCollectionSubstring(from_collection) == "system.indexes");
                    js = fixindex(context.db()->name(), tmp);
                    indexesToBuild->push_back( js.getOwned() );
                    continue;
                }

                verify(nsToCollectionSubstring(from_collection) != "system.indexes");

                StatusWith<DiskLoc> loc = collection->insertDocument( txn, js, true );
                if ( !loc.isOK() ) {
                    error() << "error: exception cloning object in " << from_collection
                            << ' ' << loc.toString() << " obj:" << js;
                }
                uassertStatusOK( loc.getStatus() );
                if ( logForRepl )
                    logOp(txn, "i", to_collection, js);

                getDur().commitIfNeeded();

                RARELY if ( time( 0 ) - saveLast > 60 ) {
                    log() << numSeen << " objects cloned so far from collection " << from_collection;
                    saveLast = time( 0 );
                }
            }
        }
Example #5
0
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Determine the default read preference mode based on the value of the slaveOk flag.
    ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk)
        ? ReadPreference::SecondaryPreferred
        : ReadPreference::PrimaryOnly;
    ReadPreferenceSetting readPreference(readPreferenceOption, TagSet());

    BSONElement rpElem;
    auto readPrefExtractStatus = bsonExtractTypedField(
        q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

    if (readPrefExtractStatus.isOK()) {
        auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
        uassertStatusOK(parsedRps.getStatus());
        readPreference = parsedRps.getValue();
    } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
        uassertStatusOK(readPrefExtractStatus);
    }

    auto canonicalQuery = CanonicalQuery::canonicalize(q, ExtensionsCallbackNoop());
    uassertStatusOK(canonicalQuery.getStatus());

    // If the $explain flag was set, we must run the operation on the shards as an explain command
    // rather than a find command.
    if (canonicalQuery.getValue()->getParsed().isExplain()) {
        const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
        BSONObj findCommand = lpq.asFindCommand();

        // We default to allPlansExecution verbosity.
        auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

        const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
        rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

        BSONObjBuilder explainBuilder;
        uassertStatusOK(
            Strategy::explainFind(txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

        BSONObj explainObj = explainBuilder.done();
        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     static_cast<const void*>(explainObj.objdata()),
                     explainObj.objsize(),
                     1,  // numResults
                     0,  // startingFrom
                     CursorId(0));
        return;
    }

    // Do the work to generate the first batch of results. This blocks waiting to get responses from
    // the shard(s).
    std::vector<BSONObj> batch;

    // 0 means the cursor is exhausted. Otherwise we assume that a cursor with the returned id can
    // be retrieved via the ClusterCursorManager.
    auto cursorId = ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
    uassertStatusOK(cursorId.getStatus());

    // Fill out the response buffer.
    int numResults = 0;
    OpQueryReplyBuilder reply;
    for (auto&& obj : batch) {
        obj.appendSelfToBufBuilder(reply.bufBuilderForResults());
        numResults++;
    }
    reply.send(request.p(),
               0,  // query result flags
               request.m(),
               numResults,
               0,  // startingFrom
               cursorId.getValue());
}
Example #6
0
    /* note: this is only (as-is) called for

             - not multi
             - mods that are not indexed
             - not upsert
    */
    static UpdateResult _updateById(bool isOperatorUpdate,
                                    int idIdxNo,
                                    ModSet* mods,
                                    NamespaceDetails* d,
                                    NamespaceDetailsTransient *nsdt,
                                    bool su,
                                    const char* ns,
                                    const BSONObj& updateobj,
                                    BSONObj patternOrig,
                                    bool logop,
                                    OpDebug& debug,
                                    bool fromMigrate = false) {

        DiskLoc loc;
        {
            IndexDetails& i = d->idx(idIdxNo);
            BSONObj key = i.getKeyFromQuery( patternOrig );
            loc = QueryRunner::fastFindSingle(i, key);
            if( loc.isNull() ) {
                // no upsert support in _updateById yet, so we are done.
                return UpdateResult( 0 , 0 , 0 , BSONObj() );
            }
        }
        Record* r = loc.rec();

        if ( cc().allowedToThrowPageFaultException() && ! r->likelyInPhysicalMemory() ) {
            throw PageFaultException( r );
        }

        /* look for $inc etc.  note as listed here, all fields to inc must be this type, you can't set some
           regular ones at the moment. */
        BSONObj newObj;
        if ( isOperatorUpdate ) {
            const BSONObj& onDisk = loc.obj();
            auto_ptr<ModSetState> mss = mods->prepare( onDisk, false /* not an insertion */ );

            if( mss->canApplyInPlace() ) {
                mss->applyModsInPlace(true);
                debug.fastmod = true;
                DEBUGUPDATE( "\t\t\t updateById doing in place update" );

                newObj = onDisk;
            }
            else {
                newObj = mss->createNewFromMods();
                checkTooLarge(newObj);
                verify(nsdt);
                theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
            }

            if ( logop ) {
                DEV verify( mods->size() );
                BSONObj pattern = patternOrig;
                BSONObj logObj = mss->getOpLogRewrite();
                DEBUGUPDATE( "\t rewrite update: " << logObj );

                // It is possible that the entire mod set was a no-op over this document.  We
                // would have an empty log record in that case. If we call logOp, with an empty
                // record, that would be replicated as "clear this record", which is not what
                // we want. Therefore, to get a no-op in the replica, we simply don't log.
                if ( logObj.nFields() ) {
                    logOp("u", ns, logObj, &pattern, 0, fromMigrate, &newObj );
                }
            }
            return UpdateResult( 1 , 1 , 1 , BSONObj() );

        } // end $operator update

        // regular update
        BSONElementManipulator::lookForTimestamps( updateobj );
        checkNoMods( updateobj );
        verify(nsdt);
        theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug );
        if ( logop ) {
            logOp("u", ns, updateobj, &patternOrig, 0, fromMigrate, &updateobj );
        }
        return UpdateResult( 1 , 0 , 1 , BSONObj() );
    }
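
For context on the mss->canApplyInPlace() branch above, a small illustration (my own example documents, not from this code base) of a mod set that can be applied in place versus one that forces the rewrite path:

    // Illustrative only. A $inc on an existing numeric field keeps the record the same
    // size, so ModSetState::canApplyInPlace() can succeed and debug.fastmod is set; a
    // $set that grows a value cannot be applied in place and goes through
    // createNewFromMods() plus theDataFileMgr.updateRecord().
    BSONObj onDisk     = BSON( "_id" << 1 << "n" << 41 );
    BSONObj inPlaceMod = BSON( "$inc" << BSON( "n" << 1 ) );                      // same size
    BSONObj rewriteMod = BSON( "$set" << BSON( "note" << "grows the record" ) );  // larger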
Example #7
0
   // PD_TRACE_DECLARE_FUNCTION ( SDB__DMSROUNIT_INSRCD, "_dmsReorgUnit::insertRecord" )
   INT32 _dmsReorgUnit::insertRecord ( BSONObj &obj,
                                       _pmdEDUCB *cb, UINT32 attributes )
   {
      INT32 rc                     = SDB_OK ;
      PD_TRACE_ENTRY ( SDB__DMSROUNIT_INSRCD );
      UINT32 dmsrecordSize         = 0 ;
      ossValuePtr recordPtr        = 0 ;
      ossValuePtr prevPtr          = 0 ;
      dmsOffset offset             = DMS_INVALID_OFFSET ;
      dmsOffset recordOffset       = DMS_INVALID_OFFSET ;
      dmsExtent *currentExtent     = (dmsExtent*)_pCurrentExtent ;
      BOOLEAN isCompressed         = FALSE ;
      const CHAR *compressedData   = NULL ;
      INT32 compressedDataSize     = 0 ;

      if ( obj.objsize() + DMS_RECORD_METADATA_SZ >
           DMS_RECORD_MAX_SZ )
      {
         rc = SDB_CORRUPTED_RECORD ;
         goto error ;
      }

      if ( OSS_BIT_TEST ( attributes, DMS_MB_ATTR_COMPRESSED ) )
      {
         rc = dmsCompress ( cb, obj, NULL, 0, &compressedData,
                            &compressedDataSize ) ;
         PD_RC_CHECK ( rc, PDERROR,
                       "Failed to compress record, rc = %d: %s",
                       rc, obj.toString().c_str() ) ;
         dmsrecordSize = compressedDataSize + sizeof(INT32) ;
         if ( dmsrecordSize > (UINT32)(obj.objsize()) )
         {
            dmsrecordSize = obj.objsize() ;
         }
         else
         {
            isCompressed = TRUE ;
         }
      }
      else
      {
         dmsrecordSize = obj.objsize() ;
      }
      dmsrecordSize += DMS_RECORD_METADATA_SZ ;
      dmsrecordSize *= DMS_RECORD_OVERFLOW_RATIO ;
      dmsrecordSize = OSS_MIN(DMS_RECORD_MAX_SZ, ossAlignX(dmsrecordSize,4)) ;
   alloc:
      if ( !_pCurrentExtent )
      {
         rc = _allocateExtent ( dmsrecordSize <<
                                DMS_RECORDS_PER_EXTENT_SQUARE ) ;
         if ( rc )
         {
            PD_LOG ( PDERROR, "Failed to allocate new extent in reorg file, "
                     "rc = %d", rc ) ;
            goto error ;
         }
         currentExtent = (dmsExtent*)_pCurrentExtent ;
      }
      if ( dmsrecordSize > (UINT32)currentExtent->_freeSpace )
      {
         rc = _flushExtent () ;
         if ( rc )
         {
            PD_LOG ( PDERROR, "Failed to flush extent, rc = %d", rc ) ;
            goto error ;
         }
         goto alloc ;
      }
      recordOffset = _currentExtentSize - currentExtent->_freeSpace ;
      recordPtr = ((ossValuePtr)currentExtent) + recordOffset ;
      if ( currentExtent->_freeSpace - (INT32)dmsrecordSize <
           (INT32)DMS_MIN_RECORD_SZ &&
           currentExtent->_freeSpace <= (INT32)DMS_RECORD_MAX_SZ )
      {
         dmsrecordSize = (UINT32)currentExtent->_freeSpace ;
      }

      DMS_RECORD_SETSTATE ( recordPtr, DMS_RECORD_FLAG_NORMAL ) ;
      DMS_RECORD_RESETATTR ( recordPtr ) ;
      DMS_RECORD_SETMYOFFSET ( recordPtr, recordOffset ) ;
      DMS_RECORD_SETSIZE ( recordPtr, dmsrecordSize ) ;
      if ( isCompressed )
      {
         DMS_RECORD_SETATTR ( recordPtr, DMS_RECORD_FLAG_COMPRESSED ) ;
         DMS_RECORD_SETDATA ( recordPtr, compressedData, compressedDataSize ) ;
      }
      else
      {
         DMS_RECORD_SETDATA ( recordPtr, obj.objdata(), obj.objsize() ) ;
      }
      DMS_RECORD_SETNEXTOFFSET ( recordPtr, DMS_INVALID_OFFSET ) ;
      DMS_RECORD_SETPREVOFFSET ( recordPtr, DMS_INVALID_OFFSET ) ;
      currentExtent->_recCount ++ ;
      currentExtent->_freeSpace -= dmsrecordSize ;
      offset = currentExtent->_lastRecordOffset ;
      if ( DMS_INVALID_OFFSET != offset )
      {
         prevPtr = ((ossValuePtr)currentExtent) + offset ;
         DMS_RECORD_SETNEXTOFFSET ( prevPtr, recordOffset ) ;
         DMS_RECORD_SETPREVOFFSET ( recordPtr, offset ) ;
      }
      currentExtent->_lastRecordOffset = recordOffset ;
      offset = currentExtent->_firstRecordOffset ;
      if ( DMS_INVALID_OFFSET == offset )
      {
         currentExtent->_firstRecordOffset = recordOffset ;
      }
   done :
      PD_TRACE_EXITRC ( SDB__DMSROUNIT_INSRCD, rc );
      return rc ;
   error :
      goto done ;
   }
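
To make the sizing arithmetic in insertRecord concrete, a worked example with assumed constants (a 16-byte DMS_RECORD_METADATA_SZ and an overflow ratio of 1.2; the real values may differ): a 100-byte uncompressed object becomes 100 + 16 = 116 bytes, the overflow ratio grows that to 139 bytes, ossAlignX(139, 4) rounds it up to 140, and OSS_MIN caps the result at DMS_RECORD_MAX_SZ. The extra headroom leaves room for a record to grow later without immediately overflowing its slot.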
   INT32 catMainController::_processMsg( const NET_HANDLE &handle,
                                         MsgHeader *pMsg )
   {
      INT32 rc = SDB_OK ;

      switch ( pMsg->opCode )
      {
      case MSG_BS_QUERY_REQ:
         {
            rc = _processQueryMsg( handle, pMsg );
            break;
         }
      case MSG_BS_GETMORE_REQ :
         {
            rc = _processGetMoreMsg( handle, pMsg ) ;
            break ;
         }
      case MSG_BS_KILL_CONTEXT_REQ:
         {
            rc = _processKillContext( handle, pMsg ) ;
            break;
         }
      case MSG_BS_INTERRUPTE :
         {
            rc = _processInterruptMsg( handle, pMsg ) ;
            break ;
         }
      case MSG_BS_DISCONNECT :
         {
            rc = _processDisconnectMsg( handle, pMsg ) ;
            break ;
         }
      case MSG_CAT_QUERY_DATA_GRP_REQ :
         {
            rc = _processQueryDataGrp( handle, pMsg ) ;
            break ;
         }
      case MSG_CAT_QUERY_COLLECTIONS_REQ :
         {
            rc = _processQueryCollections( handle, pMsg ) ;
            break ;
         }
      case MSG_CAT_QUERY_COLLECTIONSPACES_REQ :
         {
            rc = _processQueryCollectionSpaces ( handle, pMsg ) ;
            break ;
         }
      case MSG_AUTH_VERIFY_REQ :
         {
            rc = _processAuthenticate( handle, pMsg ) ;
            break ;
         }
      case MSG_AUTH_CRTUSR_REQ :
         {
            _pCatCB->getCatDCMgr()->setImageCommand( TRUE ) ;
            rc = _processAuthCrt( handle, pMsg ) ;
            break ;
         }
      case MSG_AUTH_DELUSR_REQ :
         {
            _pCatCB->getCatDCMgr()->setImageCommand( TRUE ) ;
            rc = _processAuthDel( handle, pMsg ) ;
            break ;
         }
      case MSG_COOR_CHECK_ROUTEID_REQ :
         {
            rc = _processCheckRouteID( handle, pMsg ) ;
            break;
         }
      default :
         {
            PD_LOG( PDERROR, "Recieve unknow msg[opCode:(%d)%d, len: %d, "
                    "tid: %d, reqID: %lld, nodeID: %u.%u.%u]",
                    IS_REPLY_TYPE(pMsg->opCode), GET_REQUEST_TYPE(pMsg->opCode),
                    pMsg->messageLength, pMsg->TID, pMsg->requestID,
                    pMsg->routeID.columns.groupID, pMsg->routeID.columns.nodeID,
                    pMsg->routeID.columns.serviceID ) ;
            rc = SDB_UNKNOWN_MESSAGE ;

            BSONObj err = utilGetErrorBson( rc, _pEDUCB->getInfo(
                                            EDU_INFO_ERROR ) ) ;
            MsgOpReply reply ;
            reply.header.opCode = MAKE_REPLY_TYPE( pMsg->opCode ) ;
            reply.header.messageLength = sizeof( MsgOpReply ) + err.objsize() ;
            reply.header.requestID = pMsg->requestID ;
            reply.header.routeID.value = 0 ;
            reply.header.TID = pMsg->TID ;
            reply.flags = rc ;
            reply.contextID = -1 ;
            reply.numReturned = 1 ;
            reply.startFrom = 0 ;

            _pCatCB->netWork()->syncSend( handle, (MsgHeader*)&reply,
                                          (void*)err.objdata(),
                                          err.objsize() ) ;
            break ;
         }
      }

      if ( rc && SDB_UNKNOWN_MESSAGE != rc )
      {
         PD_LOG( PDWARNING, "Process msg[opCode:(%d)%d, len: %d, tid: %d, "
                 "reqID: %lld, nodeID: %u.%u.%u] failed, rc: %d",
                 IS_REPLY_TYPE(pMsg->opCode), GET_REQUEST_TYPE(pMsg->opCode),
                 pMsg->messageLength, pMsg->TID, pMsg->requestID,
                 pMsg->routeID.columns.groupID, pMsg->routeID.columns.nodeID,
                 pMsg->routeID.columns.serviceID, rc ) ;
      }

      return rc ;
   }
Example #9
0
std::string runQuery(OperationContext* opCtx,
                     QueryMessage& q,
                     const NamespaceString& nss,
                     Message& result) {
    CurOp& curOp = *CurOp::get(opCtx);
    curOp.ensureStarted();

    uassert(ErrorCodes::InvalidNamespace,
            str::stream() << "Invalid ns [" << nss.ns() << "]",
            nss.isValid());
    invariant(!nss.isCommand());

    // Set CurOp information.
    const auto upconvertedQuery = upconvertQueryEntry(q.query, nss, q.ntoreturn, q.ntoskip);
    beginQueryOp(opCtx, nss, upconvertedQuery, q.ntoreturn, q.ntoskip);

    // Parse the qm into a CanonicalQuery.
    const boost::intrusive_ptr<ExpressionContext> expCtx;
    auto cq = uassertStatusOKWithContext(
        CanonicalQuery::canonicalize(opCtx,
                                     q,
                                     expCtx,
                                     ExtensionsCallbackReal(opCtx, &nss),
                                     MatchExpressionParser::kAllowAllSpecialFeatures),
        "Can't canonicalize query");
    invariant(cq.get());

    LOG(5) << "Running query:\n" << redact(cq->toString());
    LOG(2) << "Running query: " << redact(cq->toStringShort());

    // Parse, canonicalize, plan, transcribe, and get a plan executor.
    AutoGetCollectionForReadCommand ctx(opCtx, nss, AutoGetCollection::ViewMode::kViewsForbidden);
    Collection* const collection = ctx.getCollection();

    {
        const QueryRequest& qr = cq->getQueryRequest();

        // Allow the query to run on secondaries if the read preference permits it. If no read
        // preference was specified, allow the query to run iff slaveOk has been set.
        const bool slaveOK = qr.hasReadPref()
            ? uassertStatusOK(ReadPreferenceSetting::fromContainingBSON(q.query))
                  .canRunOnSecondary()
            : qr.isSlaveOk();
        uassertStatusOK(
            repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, slaveOK));
    }

    // We have a parsed query. Time to get the execution plan for it.
    auto exec = uassertStatusOK(getExecutorLegacyFind(opCtx, collection, nss, std::move(cq)));

    const QueryRequest& qr = exec->getCanonicalQuery()->getQueryRequest();

    // If it's actually an explain, do the explain and return rather than falling through
    // to the normal query execution loop.
    if (qr.isExplain()) {
        BufBuilder bb;
        bb.skip(sizeof(QueryResult::Value));

        BSONObjBuilder explainBob;
        Explain::explainStages(
            exec.get(), collection, ExplainOptions::Verbosity::kExecAllPlans, &explainBob);

        // Add the resulting object to the return buffer.
        BSONObj explainObj = explainBob.obj();
        bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

        // Set query result fields.
        QueryResult::View qr = bb.buf();
        qr.setResultFlagsToOk();
        qr.msgdata().setLen(bb.len());
        curOp.debug().responseLength = bb.len();
        qr.msgdata().setOperation(opReply);
        qr.setCursorId(0);
        qr.setStartingFrom(0);
        qr.setNReturned(1);
        result.setData(bb.release());
        return "";
    }

    // Handle query option $maxTimeMS (not used with commands).
    if (qr.getMaxTimeMS() > 0) {
        uassert(40116,
                "Illegal attempt to set operation deadline within DBDirectClient",
                !opCtx->getClient()->isInDirectClient());
        opCtx->setDeadlineAfterNowBy(Milliseconds{qr.getMaxTimeMS()});
    }
    opCtx->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

    // Run the query.
    // bb is used to hold query results
    // this buffer should contain either requested documents per query or
    // explain information, but not both
    BufBuilder bb(FindCommon::kInitReplyBufferSize);
    bb.skip(sizeof(QueryResult::Value));

    // How many results have we obtained from the executor?
    int numResults = 0;

    BSONObj obj;
    PlanExecutor::ExecState state;

    // Get summary info about which plan the executor is using.
    {
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        curOp.setPlanSummary_inlock(Explain::getPlanSummary(exec.get()));
    }

    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
        // If we can't fit this result inside the current batch, then we stash it for later.
        if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) {
            exec->enqueue(obj);
            break;
        }

        // Add result to output buffer.
        bb.appendBuf((void*)obj.objdata(), obj.objsize());

        // Count the result.
        ++numResults;

        if (FindCommon::enoughForFirstBatch(qr, numResults)) {
            LOG(5) << "Enough for first batch, wantMore=" << qr.wantMore()
                   << " ntoreturn=" << qr.getNToReturn().value_or(0)
                   << " numResults=" << numResults;
            break;
        }
    }

    // Caller expects exceptions thrown in certain cases.
    if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
        error() << "Plan executor error during find: " << PlanExecutor::statestr(state)
                << ", stats: " << redact(Explain::getWinningPlanStats(exec.get()));
        uassertStatusOKWithContext(WorkingSetCommon::getMemberObjectStatus(obj),
                                   "Executor error during OP_QUERY find");
        MONGO_UNREACHABLE;
    }

    // Before saving the cursor, ensure that whatever plan we established happened with the expected
    // collection version
    auto css = CollectionShardingState::get(opCtx, nss);
    css->checkShardVersionOrThrow(opCtx);

    // Fill out CurOp based on query results. If we have a cursorid, we will fill out CurOp with
    // this cursorid later.
    long long ccId = 0;

    if (shouldSaveCursor(opCtx, collection, state, exec.get())) {
        // We won't use the executor until it's getMore'd.
        exec->saveState();
        exec->detachFromOperationContext();

        // Allocate a new ClientCursor and register it with the cursor manager.
        ClientCursorPin pinnedCursor = collection->getCursorManager()->registerCursor(
            opCtx,
            {std::move(exec),
             nss,
             AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(),
             opCtx->recoveryUnit()->getReadConcernLevel(),
             upconvertedQuery});
        ccId = pinnedCursor.getCursor()->cursorid();

        LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults
               << " results";

        // TODO document
        if (qr.isExhaust()) {
            curOp.debug().exhaust = true;
        }

        pinnedCursor.getCursor()->setPos(numResults);

        // We assume that cursors created through a DBDirectClient are always used from their
        // original OperationContext, so we do not need to move time to and from the cursor.
        if (!opCtx->getClient()->isInDirectClient()) {
            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            pinnedCursor.getCursor()->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros());
        }

        endQueryOp(opCtx, collection, *pinnedCursor.getCursor()->getExecutor(), numResults, ccId);
    } else {
        LOG(5) << "Not caching executor but returning " << numResults << " results.";
        endQueryOp(opCtx, collection, *exec, numResults, ccId);
    }

    // Fill out the output buffer's header.
    QueryResult::View queryResultView = bb.buf();
    queryResultView.setCursorId(ccId);
    queryResultView.setResultFlagsToOk();
    queryResultView.msgdata().setLen(bb.len());
    queryResultView.msgdata().setOperation(opReply);
    queryResultView.setStartingFrom(0);
    queryResultView.setNReturned(numResults);

    // Add the results from the query into the output buffer.
    result.setData(bb.release());

    // curOp.debug().exhaust is set above.
    return curOp.debug().exhaust ? nss.ns() : "";
}
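
The batching loop above relies on FindCommon::haveSpaceForNext() to decide whether one more document still fits in the reply. A minimal sketch of that kind of check; the real byte limit is not shown in this snippet, so the 4 MB cap below is an assumption for illustration:

    // Sketch only, not the real FindCommon helper: always admit the first document so a
    // single oversized result can still be returned, and otherwise keep the batch under a
    // fixed byte budget; anything that does not fit is enqueue()d for the next getMore.
    bool haveSpaceForNextSketch(const BSONObj& next, int numDocsSoFar, int bytesBuffered) {
        return numDocsSoFar == 0 || bytesBuffered + next.objsize() <= 4 * 1024 * 1024;
    }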
   // PD_TRACE_DECLARE_FUNCTION ( SDB__DMSSTORAGELOADEXT__IMPRTBLOCK, "dmsStorageLoadOp::pushToTempDataBlock" )
   INT32 dmsStorageLoadOp::pushToTempDataBlock ( dmsMBContext *mbContext,
                                                 pmdEDUCB *cb,
                                                 BSONObj &record,
                                                 BOOLEAN isLast,
                                                 BOOLEAN isAsynchr )
   {
      INT32 rc = SDB_OK ;
      PD_TRACE_ENTRY ( SDB__DMSSTORAGELOADEXT__IMPRTBLOCK );
      UINT32      dmsrecordSize   = 0 ;
      dmsRecord   *pRecord        = NULL ;
      dmsRecord   *pPreRecord     = NULL ;
      dmsOffset   offset          = DMS_INVALID_OFFSET ;
      dmsOffset   recordOffset    = DMS_INVALID_OFFSET ;

      _IDToInsert oid ;
      idToInsertEle oidEle((CHAR*)(&oid)) ;
      CHAR *pNewRecordData       = NULL ;
      dmsRecordData recordData ;

      dmsCompressorEntry *compressorEntry = NULL ;

      SDB_ASSERT( mbContext, "mb context can't be NULL" ) ;

      compressorEntry = _su->data()->getCompressorEntry( mbContext->mbID() ) ;
      /* For concurrency protection with drop CL and set compressor. */
      dmsCompressorGuard compGuard( compressorEntry, SHARED ) ;

      try
      {
         recordData.setData( record.objdata(), record.objsize(),
                             FALSE, TRUE ) ;
         /* (0) */
         BSONElement ele = record.getField ( DMS_ID_KEY_NAME ) ;
         const CHAR *pCheckErr = "" ;
         if ( !dmsIsRecordIDValid( ele, TRUE, &pCheckErr ) )
         {
            PD_LOG( PDERROR, "Record[%s] _id is error: %s",
                    record.toString().c_str(), pCheckErr ) ;
            rc = SDB_INVALIDARG ;
            goto error ;
         }

         if ( ele.eoo() )
         {
            oid._oid.init() ;
            rc = cb->allocBuff( oidEle.size() + record.objsize(),
                                &pNewRecordData ) ;
            if ( rc )
            {
               PD_LOG( PDERROR, "Alloc memory[size:%u] failed, rc: %d",
                       oidEle.size() + record.objsize(), rc ) ;
               goto error ;
            }
            *(UINT32*)pNewRecordData = oidEle.size() + record.objsize() ;
            ossMemcpy( pNewRecordData + sizeof(UINT32), oidEle.rawdata(),
                       oidEle.size() ) ;
            ossMemcpy( pNewRecordData + sizeof(UINT32) + oidEle.size(),
                       record.objdata() + sizeof(UINT32),
                       record.objsize() - sizeof(UINT32) ) ;
            recordData.setData( pNewRecordData,
                                oidEle.size() + record.objsize(),
                                FALSE, TRUE ) ;
            record = BSONObj( pNewRecordData ) ;
         }
         dmsrecordSize = recordData.len() ;

         if ( recordData.len() + DMS_RECORD_METADATA_SZ >
              DMS_RECORD_USER_MAX_SZ )
         {
            rc = SDB_DMS_RECORD_TOO_BIG ;
            goto error ;
         }

         if ( compressorEntry->ready() )
         {
            const CHAR *compressedData    = NULL ;
            INT32 compressedDataSize      = 0 ;
            UINT8 compressRatio           = 0 ;
            rc = dmsCompress( cb, compressorEntry,
                              recordData.data(), recordData.len(),
                              &compressedData, &compressedDataSize,
                              compressRatio ) ;
            if ( SDB_OK == rc &&
                 compressedDataSize + sizeof(UINT32) < recordData.orgLen() &&
                 compressRatio < DMS_COMPRESS_RATIO_THRESHOLD )
            {
               dmsrecordSize = compressedDataSize + sizeof(UINT32) ;
               recordData.setData( compressedData, compressedDataSize,
                                   TRUE, FALSE ) ;
            }
            else if ( rc )
            {
               if ( SDB_UTIL_COMPRESS_ABORT == rc )
               {
                  PD_LOG( PDINFO, "Record compression aborted. "
                          "Insert the original data. rc: %d", rc ) ;
               }
               else
               {
                  PD_LOG( PDWARNING, "Record compression failed. "
                          "Insert the original data. rc: %d", rc ) ;
               }
               rc = SDB_OK ;
            }
         }

         /*
          * Release the guard to avoid deadlock with truncate/drop collection.
          */
         compGuard.release() ;

         dmsrecordSize *= DMS_RECORD_OVERFLOW_RATIO ;
         dmsrecordSize += DMS_RECORD_METADATA_SZ ;
         dmsrecordSize = OSS_MIN( DMS_RECORD_MAX_SZ,
                                  ossAlignX ( dmsrecordSize, 4 ) ) ;

         INT32 expandSize = dmsrecordSize << DMS_RECORDS_PER_EXTENT_SQUARE ;
         if ( expandSize > DMS_BEST_UP_EXTENT_SZ )
         {
            expandSize = expandSize < DMS_BEST_UP_EXTENT_SZ ?
                         DMS_BEST_UP_EXTENT_SZ : expandSize ;
         }

         if ( !_pCurrentExtent )
         {
            rc = _allocateExtent ( expandSize ) ;
            if ( rc )
            {
               PD_LOG ( PDERROR, "Failed to allocate new extent in reorg file, "
                        "rc = %d", rc ) ;
               goto error ;
            }
            _currentExtent = (dmsExtent*)_pCurrentExtent ;
         }

         if ( dmsrecordSize > (UINT32)_currentExtent->_freeSpace || isLast )
         {
            rc = mbContext->mbLock( EXCLUSIVE ) ;
            if ( rc )
            {
               PD_LOG ( PDERROR, "Failed to lock collection, rc=%d", rc ) ;
               goto error ;
            }
            if ( !isAsynchr )
            {
               _currentExtent->_firstRecordOffset = DMS_INVALID_OFFSET ;
               _currentExtent->_lastRecordOffset = DMS_INVALID_OFFSET ;
            }

            rc = _su->loadExtentA( mbContext, _pCurrentExtent,
                                   _currentExtentSize / _pageSize,
                                   TRUE ) ;
            mbContext->mbUnlock() ;

            if ( rc )
            {
               PD_LOG ( PDERROR, "Failed to load extent, rc = %d", rc ) ;
               goto error ;
            }

            if ( isLast )
            {
               goto done ;
            }

            rc = _allocateExtent ( expandSize ) ;
            if ( rc )
            {
               PD_LOG ( PDERROR, "Failed to allocate new extent in reorg file, "
                        "rc = %d", rc ) ;
               goto error ;
            }
         }

         recordOffset = _currentExtentSize - _currentExtent->_freeSpace ;
         pRecord = ( dmsRecord* )( (const CHAR*)_currentExtent + recordOffset ) ;

         if ( _currentExtent->_freeSpace - (INT32)dmsrecordSize <
              (INT32)DMS_MIN_RECORD_SZ &&
              _currentExtent->_freeSpace <= (INT32)DMS_RECORD_MAX_SZ )
         {
            dmsrecordSize = _currentExtent->_freeSpace ;
         }

         pRecord->setNormal() ;
         pRecord->setMyOffset( recordOffset ) ;
         pRecord->setSize( dmsrecordSize ) ;
         pRecord->setData( recordData ) ;
         pRecord->setNextOffset( DMS_INVALID_OFFSET ) ;
         pRecord->setPrevOffset( DMS_INVALID_OFFSET ) ;

         if ( isAsynchr )
         {
            _currentExtent->_recCount++ ;
         }
         _currentExtent->_freeSpace -= dmsrecordSize ;
         offset = _currentExtent->_lastRecordOffset ;
         if ( DMS_INVALID_OFFSET != offset )
         {
            pPreRecord = (dmsRecord*)( (const CHAR*)_currentExtent + offset ) ;
            pPreRecord->setNextOffset( recordOffset ) ;
            pRecord->setPrevOffset( offset ) ;
         }
         _currentExtent->_lastRecordOffset = recordOffset ;

         offset = _currentExtent->_firstRecordOffset ;
         if ( DMS_INVALID_OFFSET == offset )
         {
            _currentExtent->_firstRecordOffset = recordOffset ;
         }
      }
      catch( std::exception &e )
      {
         PD_LOG( PDERROR, "Occur exception: %s", e.what() ) ;
         rc = SDB_SYS ;
         goto error ;
      }

   done:
      PD_TRACE_EXITRC ( SDB__DMSSTORAGELOADEXT__IMPRTBLOCK, rc );
      return rc ;
   error:
      goto done ;
   }
    TEST(RocksRecordStoreTest, OplogHack) {
        RocksRecordStoreHarnessHelper harnessHelper;
        scoped_ptr<RecordStore> rs(harnessHelper.newNonCappedRecordStore("local.oplog.foo"));
        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());

            // always illegal
            ASSERT_EQ(insertBSON(opCtx, rs, Timestamp(2,-1)).getStatus(),
                  ErrorCodes::BadValue);

            {
                BSONObj obj = BSON("not_ts" << Timestamp(2,1));
                ASSERT_EQ(rs->insertRecord(opCtx.get(), obj.objdata(), obj.objsize(),
                                           false ).getStatus(),
                          ErrorCodes::BadValue);

                obj = BSON( "ts" << "not a Timestamp" );
                ASSERT_EQ(rs->insertRecord(opCtx.get(), obj.objdata(), obj.objsize(),
                                           false ).getStatus(),
                          ErrorCodes::BadValue);
            }

            // currently dasserts
            // ASSERT_EQ(insertBSON(opCtx, rs, BSON("ts" << Timestamp(-2,1))).getStatus(),
            // ErrorCodes::BadValue);

            // success cases
            ASSERT_EQ(insertBSON(opCtx, rs, Timestamp(1,1)).getValue(),
                      RecordId(1,1));

            ASSERT_EQ(insertBSON(opCtx, rs, Timestamp(1,2)).getValue(),
                      RecordId(1,2));

            ASSERT_EQ(insertBSON(opCtx, rs, Timestamp(2,2)).getValue(),
                      RecordId(2,2));
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            // find start
            ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(0,1)), RecordId()); // nothing <=
            ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,1)), RecordId(1,2)); // between
            ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,2)), RecordId(2,2)); // ==
            ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId(2,2)); // > highest
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            rs->temp_cappedTruncateAfter(opCtx.get(), RecordId(2,2),  false); // no-op
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId(2,2));
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            rs->temp_cappedTruncateAfter(opCtx.get(), RecordId(1,2),  false); // deletes 2,2
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId(1,2));
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            rs->temp_cappedTruncateAfter(opCtx.get(), RecordId(1,2),  true); // deletes 1,2
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId(1,1));
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            WriteUnitOfWork wuow(opCtx.get());
            ASSERT_OK(rs->truncate(opCtx.get())); // deletes 1,1 and leaves collection empty
            wuow.commit();
        }

        {
            scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
            ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId());
        }
    }
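The test above leans on the oplog-hack convention that a document's "ts" field determines its RecordId, so seeking by timestamp is just an ordered key lookup. The sketch below shows the usual packing idea (seconds in the high 32 bits, increment in the low 32); SimpleTimestamp and recordIdForTimestamp are illustrative stand-ins, not the real Timestamp/RecordId classes.

#include <cstdint>
#include <cassert>

// Hypothetical stand-in for a timestamp: the record id is the 64-bit
// concatenation of (seconds, increment).
struct SimpleTimestamp {
    uint32_t secs;
    uint32_t inc;
};

int64_t recordIdForTimestamp(const SimpleTimestamp& ts) {
    return (static_cast<int64_t>(ts.secs) << 32) | ts.inc;
}

int main() {
    // Ordering by RecordId then matches ordering by (secs, inc), which is what
    // oplogStartHack exploits when it seeks to the last entry <= a given id.
    assert(recordIdForTimestamp({1, 2}) < recordIdForTimestamp({2, 1}));
    assert(recordIdForTimestamp({2, 2}) == ((int64_t(2) << 32) | 2));
    return 0;
}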
Example #12
0
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        log() << "Running query on new system: " << q.query.toString() << endl;

        // This is a read lock.
        Client::ReadContext ctx(q.ns, dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner;
        CanonicalQuery* cq;
        Status status = getRunner(q, &rawRunner, &cq);
        if (!status.isOK()) {
            uasserted(17007, "Couldn't process query " + q.query.toString()
                         + " why: " + status.reason());
        }
        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns);

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // TODO: Document why we do this.
        // TODO: do this when we can pass in our own parsed query
        //replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here, so we must register it with the active
        // runners list in ClientCursor.
        ClientCursor::registerRunner(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);

        BSONObj obj;
        Runner::RunnerState state;

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded, make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                // This information can change if we yield and as such we must make sure to re-fetch
                // it if we yield.
                KeyPattern kp(collMetadata->getKeyPattern());
                // This performs excessive BSONObj creation but that's OK for now.
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            const bool isExplain = pq.isExplain();
            if (isExplain && enoughForExplain(pq, numResults)) {
                break;
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                // If only one result is requested, assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    saveClientCursor = true;
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        ClientCursor::deregisterRunner(runner.get());

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; }

        // TODO: Stage creation can set tailable depending on what's in the parsed query.  We have
        // the full parsed query available during planning...set it there.
        //
        // TODO: If we're tailable we want to save the client cursor.  Make sure we do this later.
        //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data, and it's
            // safe to send this since mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            log() << "caching runner with cursorid " << ccId << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;
        // TODO: nscanned is bogus.
        // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
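The result loop above cuts off the first batch based on the requested ntoreturn, a default document count, and the size of the reply buffer. The sketch below isolates that cut-off rule; the helper name and the constants (101 documents, 1 MB) are illustrative, not the server's actual enoughForFirstBatch implementation.

// Illustrative limits; the real server uses its own tuned constants.
static const int kDefaultFirstBatchDocs = 101;
static const int kMaxBytesFirstBatch = 1024 * 1024;

// Returns true once the first batch should be cut off: either the client's
// ntoreturn has been satisfied, a default document count has been reached,
// or the reply buffer has grown past the byte budget.
bool enoughForFirstBatchSketch(int ntoreturn, int numDocs, int bytesBuffered) {
    if (bytesBuffered >= kMaxBytesFirstBatch) return true;
    if (ntoreturn > 0) return numDocs >= ntoreturn;
    return numDocs >= kDefaultFirstBatchDocs;
}

int main() {
    // With no explicit ntoreturn, 101 buffered documents end the first batch.
    bool cut = enoughForFirstBatchSketch(0 /*ntoreturn*/, 101, 4096);
    return cut ? 0 : 1;
}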
Example #13
0
/** @return number of skipped (invalid) documents */
unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                       int nidx, bool validate, double pf, int pb, bool useDefaultPadding) {

    log() << "compact begin extent #" << n << " for namespace " << ns << endl;
    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent *e = diskloc.ext();
    e->assertOk();
    verify( e->validates(diskloc) );
    unsigned skipped = 0;

    Database* db = cc().database();

    {
        // the next/prev pointers within the extent might not be in order so we first
        // page the whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        DataFile* mdf = db->getFile( diskloc.a() );
        HANDLE fd = mdf->getFd();
        int offset = diskloc.getOfs();
        Extent* ext = diskloc.ext();
        size_t length = ext->length;

        touch_pages(fd, offset, length, ext);
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/ms << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;
        long long nrecords = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                L = db->getExtentManager().getNextRecordInExtent(L);
                BSONObj objOld = BSONObj::make(recOld);

                if( !validate || objOld.valid() ) {
                    nrecords++;
                    unsigned sz = objOld.objsize();

                    oldObjSize += sz;
                    oldObjSizeWithPadding += recOld->netLength();

                    unsigned lenWHdr = sz + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;
                    // maintain UsePowerOf2Sizes if no padding values were passed in
                    if (d->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes)
                            && useDefaultPadding) {
                        lenWPadding = d->quantizePowerOf2AllocationSpace(lenWPadding);
                    }
                    // otherwise use the padding values (pf and pb) that were passed in
                    else {
                        lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                        lenWPadding += pb;
                        lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                    }
                    if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                        lenWPadding = lenWHdr;
                    }
                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    datasize += recNew->netLength();
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data(), objOld.objdata(), sz);
                }
                else {
                    if( ++skipped <= 10 )
                        log() << "compact skipping invalid object" << endl;
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent.  it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt(false);
                }
            }
        } // if !L.isNull()

        verify( d->firstExtent() == diskloc );
        verify( d->lastExtent() != diskloc );
        DiskLoc newFirst = e->xnext;
        d->firstExtent().writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        cc().database()->getExtentManager().freeExtents( diskloc, diskloc );

        // update datasize/record count for this namespace's extent
        d->incrementStats( datasize, nrecords );

        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << n << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
                  << endl;
        }
    }

    return skipped;
}
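Record sizing during compaction either snaps to a power-of-two bucket or applies the paddingFactor/paddingBytes pair followed by a quantization mask, falling back to the unpadded length when the result is out of range. A standalone sketch of that decision follows; roundUpPow2 and chooseAllocSize are simplified stand-ins for the NamespaceDetails quantization helpers.

#include <cstdio>

// Round up to the next power of two (simplified stand-in for the server's
// power-of-2 allocation buckets).
unsigned roundUpPow2(unsigned n) {
    unsigned p = 1;
    while (p < n) p <<= 1;
    return p;
}

// Choose the on-disk allocation size for a record of lenWHdr bytes.
unsigned chooseAllocSize(unsigned lenWHdr, bool usePowerOf2, double pf, int pb,
                         unsigned maxUserSize) {
    unsigned lenWPadding = lenWHdr;
    if (usePowerOf2) {
        lenWPadding = roundUpPow2(lenWPadding);
    } else {
        lenWPadding = static_cast<unsigned>(pf * lenWPadding) + pb;
        lenWPadding &= ~7u;  // illustrative 8-byte quantization mask
    }
    // Fall back to the unpadded length if the padded size is nonsensical.
    if (lenWPadding < lenWHdr || lenWPadding > maxUserSize / 2)
        lenWPadding = lenWHdr;
    return lenWPadding;
}

int main() {
    std::printf("%u\n", chooseAllocSize(100, true, 1.0, 0, 16 * 1024 * 1024));   // 128
    std::printf("%u\n", chooseAllocSize(100, false, 1.5, 16, 16 * 1024 * 1024)); // 160
    return 0;
}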
Example #14
0
bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate,
              BSONObjBuilder& result, double pf, int pb, bool useDefaultPadding) {
    // this is a big job, so might as well make things tidy before we start just to be nice.
    getDur().commitIfNeeded();

    list<DiskLoc> extents;
    for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext )
        extents.push_back(L);
    log() << "compact " << extents.size() << " extents" << endl;

    ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                           "Extent Compacting Progress",
                           extents.size()));

    // same data, but might perform a little differently after compact?
    Collection* collection = cc().database()->getCollection( ns );
    verify( collection );
    collection->infoCache()->addedIndex();

    verify( d->getCompletedIndexCount() == d->getTotalIndexCount() );
    int nidx = d->getCompletedIndexCount();
    scoped_array<BSONObj> indexSpecs( new BSONObj[nidx] );
    {
        NamespaceDetails::IndexIterator ii = d->ii();
        // For each existing index...
        for( int idxNo = 0; ii.more(); ++idxNo ) {
            // Build a new index spec based on the old index spec.
            BSONObjBuilder b;
            BSONObj::iterator i(ii.next().info.obj());
            while( i.more() ) {
                BSONElement e = i.next();
                if ( str::equals( e.fieldName(), "v" ) ) {
                    // Drop any preexisting index version spec.  The default index version will
                    // be used instead for the new index.
                    continue;
                }
                if ( str::equals( e.fieldName(), "background" ) ) {
                    // Create the new index in the foreground.
                    continue;
                }
                // Pass the element through to the new index spec.
                b.append(e);
            }
            indexSpecs[idxNo] = b.obj().getOwned();
        }
    }

    log() << "compact orphan deleted lists" << endl;
    d->orphanDeletedList();

    // Start over from scratch with our extent sizing and growth
    d->setLastExtentSize( 0 );

    // before dropping indexes, at least make sure we can allocate one extent!
    uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());

    // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
    log() << "compact dropping indexes" << endl;
    Status status = collection->getIndexCatalog()->dropAllIndexes( true );
    if ( !status.isOK() ) {
        errmsg = str::stream() << "compact drop indexes failed: " << status.toString();
        log() << status.toString() << endl;
        return false;
    }

    getDur().commitIfNeeded();

    long long skipped = 0;
    int n = 0;

    // reset data size and record counts to 0 for this namespace
    // as we're about to tally them up again for each new extent
    d->setStats( 0, 0 );

    for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
        skipped += compactExtent(ns, d, *i, n++, nidx, validate, pf, pb, useDefaultPadding);
        pm.hit();
    }

    if( skipped ) {
        result.append("invalidObjects", skipped);
    }

    verify( d->firstExtent().ext()->xprev.isNull() );

    // indexes will do their own progress meter?
    pm.finished();

    // build indexes
    NamespaceString s(ns);
    string si = s.db().toString() + ".system.indexes";
    for( int i = 0; i < nidx; i++ ) {
        killCurrentOp.checkForInterrupt(false);
        BSONObj info = indexSpecs[i];
        log() << "compact create index " << info["key"].Obj().toString() << endl;
        theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
    }

    return true;
}
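Before dropping the indexes, the compact command above snapshots each index spec while stripping the "v" and "background" fields so the rebuild uses the default index version and runs in the foreground. The sketch below shows the same filtering with a plain string map standing in for a BSON spec; rebuildSpec is a hypothetical helper.

#include <map>
#include <string>
#include <cassert>

// Copy an index spec, dropping fields that must not survive a compact rebuild.
std::map<std::string, std::string> rebuildSpec(const std::map<std::string, std::string>& spec) {
    std::map<std::string, std::string> out;
    for (const auto& kv : spec) {
        if (kv.first == "v" || kv.first == "background")
            continue;                 // rebuilt with the default version, in the foreground
        out.insert(kv);
    }
    return out;
}

int main() {
    std::map<std::string, std::string> spec = {
        {"key", "{a:1}"}, {"name", "a_1"}, {"v", "0"}, {"background", "true"}};
    auto out = rebuildSpec(spec);
    assert(out.count("v") == 0 && out.count("background") == 0 && out.size() == 2);
    return 0;
}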
Example #15
0
    UpdateResult _updateObjects( bool su,
                                 const char* ns,
                                 const BSONObj& updateobj,
                                 const BSONObj& patternOrig,
                                 bool upsert,
                                 bool multi,
                                 bool logop ,
                                 OpDebug& debug,
                                 RemoveSaver* rs,
                                 bool fromMigrate,
                                 const QueryPlanSelectionPolicy& planPolicy,
                                 bool forReplication ) {

        DEBUGUPDATE( "update: " << ns
                     << " update: " << updateobj
                     << " query: " << patternOrig
                     << " upsert: " << upsert << " multi: " << multi );

        Client& client = cc();

        debug.updateobj = updateobj;

        // The idea with these here is to make them loop invariant for
        // multi updates, and thus be a bit faster for that case.  The
        // pointers may be left invalid on a failed or terminal yield
        // recovery.
        NamespaceDetails* d = nsdetails(ns); // can be null if an upsert...
        NamespaceDetailsTransient* nsdt = &NamespaceDetailsTransient::get(ns);

        auto_ptr<ModSet> mods;
        bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$';
        int modsIsIndexed = 0; // really the # of indexes
        if ( isOperatorUpdate ) {
            mods.reset( new ModSet(updateobj, nsdt->indexKeys(), forReplication) );
            modsIsIndexed = mods->maxNumIndexUpdated();
        }

        if( planPolicy.permitOptimalIdPlan() && !multi && isSimpleIdQuery(patternOrig) && d &&
           !modsIsIndexed ) {
            int idxNo = d->findIdIndex();
            if( idxNo >= 0 ) {
                debug.idhack = true;

                UpdateResult result = _updateById( isOperatorUpdate,
                                                   idxNo,
                                                   mods.get(),
                                                   d,
                                                   nsdt,
                                                   su,
                                                   ns,
                                                   updateobj,
                                                   patternOrig,
                                                   logop,
                                                   debug,
                                                   fromMigrate);
                if ( result.existing || ! upsert ) {
                    return result;
                }
                else if ( upsert && ! isOperatorUpdate ) {
                    // this handles repl inserts
                    checkNoMods( updateobj );
                    debug.upsert = true;
                    BSONObj no = updateobj;
                    theDataFileMgr.insertWithObjMod(ns, no, false, su);
                    if ( logop )
                        logOp( "i", ns, no, 0, 0, fromMigrate, &no );

                    return UpdateResult( 0 , 0 , 1 , no );
                }
            }
        }

        int numModded = 0;
        debug.nscanned = 0;
        shared_ptr<Cursor> c = getOptimizedCursor( ns, patternOrig, BSONObj(), planPolicy );
        d = nsdetails(ns);
        nsdt = &NamespaceDetailsTransient::get(ns);
        bool autoDedup = c->autoDedup();

        if( c->ok() ) {
            set<DiskLoc> seenObjects;
            MatchDetails details;
            auto_ptr<ClientCursor> cc;
            do {

                if ( cc.get() == 0 &&
                     client.allowedToThrowPageFaultException() &&
                     ! c->currLoc().isNull() &&
                     ! c->currLoc().rec()->likelyInPhysicalMemory() ) {
                    throw PageFaultException( c->currLoc().rec() );
                }

                bool atomic = c->matcher() && c->matcher()->docMatcher().atomic();

                if ( ! atomic && debug.nscanned > 0 ) {
                    // we need to use a ClientCursor to yield
                    if ( cc.get() == 0 ) {
                        shared_ptr< Cursor > cPtr = c;
                        cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
                    }

                    bool didYield;
                    if ( ! cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) {
                        cc.release();
                        break;
                    }
                    if ( !c->ok() ) {
                        break;
                    }

                    if ( didYield ) {
                        d = nsdetails(ns);
                        if ( ! d )
                            break;
                        nsdt = &NamespaceDetailsTransient::get(ns);
                        if ( mods.get() ) {
                            mods->setIndexedStatus( nsdt->indexKeys() );
                            modsIsIndexed = mods->maxNumIndexUpdated();
                        }

                    }

                } // end yielding block

                debug.nscanned++;

                if ( mods.get() && mods->hasDynamicArray() ) {
                    details.requestElemMatchKey();
                }

                if ( !c->currentMatches( &details ) ) {
                    c->advance();
                    continue;
                }

                Record* r = c->_current();
                DiskLoc loc = c->currLoc();

                if ( c->getsetdup( loc ) && autoDedup ) {
                    c->advance();
                    continue;
                }

                BSONObj js = BSONObj::make(r);

                BSONObj pattern = patternOrig;

                if ( logop ) {
                    BSONObjBuilder idPattern;
                    BSONElement id;
                    // NOTE: If the matching object lacks an id, we'll log
                    // with the original pattern.  This isn't replay-safe.
                    // It might make sense to suppress the log instead
                    // if there's no id.
                    if ( js.getObjectID( id ) ) {
                        idPattern.append( id );
                        pattern = idPattern.obj();
                    }
                    else {
                        uassert( 10157 ,  "multi-update requires all modified objects to have an _id" , ! multi );
                    }
                }

                /* look for $inc etc.  note that, as written here, all modified fields must use
                    $ operators; you can't also set some regular fields at the moment. */
                if ( isOperatorUpdate ) {

                    if ( multi ) {
                        // go to next record in case this one moves
                        c->advance();

                        // Update operations are deduped for cursors that implement their own
                        // deduplication.  In particular, some geo cursors are excluded.
                        if ( autoDedup ) {

                            if ( seenObjects.count( loc ) ) {
                                continue;
                            }

                            // SERVER-5198 Advance past the document to be modified, provided
                            // deduplication is enabled, but see SERVER-5725.
                            while( c->ok() && loc == c->currLoc() ) {
                                c->advance();
                            }
                        }
                    }

                    const BSONObj& onDisk = loc.obj();

                    ModSet* useMods = mods.get();

                    auto_ptr<ModSet> mymodset;
                    if ( details.hasElemMatchKey() && mods->hasDynamicArray() ) {
                        useMods = mods->fixDynamicArray( details.elemMatchKey() );
                        mymodset.reset( useMods );
                    }

                    auto_ptr<ModSetState> mss = useMods->prepare( onDisk,
                                                                  false /* not an insertion */ );

                    bool willAdvanceCursor = multi && c->ok() && ( modsIsIndexed || ! mss->canApplyInPlace() );

                    if ( willAdvanceCursor ) {
                        if ( cc.get() ) {
                            cc->setDoingDeletes( true );
                        }
                        c->prepareToTouchEarlierIterate();
                    }

                    // If we've made it this far, "ns" must contain a valid collection name, and so
                    // is of the form "db.collection".  Therefore, the following expression must
                    // always be valid.  "system.users" updates must never be done in place, in
                    // order to ensure that they are validated inside DataFileMgr::updateRecord(.).
                    bool isSystemUsersMod = nsToCollectionSubstring(ns) == "system.users";

                    BSONObj newObj;
                    if ( !mss->isUpdateIndexed() && mss->canApplyInPlace() && !isSystemUsersMod ) {
                        mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );

                        DEBUGUPDATE( "\t\t\t doing in place update" );
                        if ( !multi )
                            debug.fastmod = true;

                        if ( modsIsIndexed ) {
                            seenObjects.insert( loc );
                        }
                        newObj = loc.obj();
                        d->paddingFits();
                    }
                    else {
                        newObj = mss->createNewFromMods();
                        checkTooLarge(newObj);
                        DiskLoc newLoc = theDataFileMgr.updateRecord(ns,
                                                                     d,
                                                                     nsdt,
                                                                     r,
                                                                     loc,
                                                                     newObj.objdata(),
                                                                     newObj.objsize(),
                                                                     debug);

                        if ( newLoc != loc || modsIsIndexed ){
                            // log() << "Moved obj " << newLoc.obj()["_id"] << " from " << loc << " to " << newLoc << endl;
                            // object moved, need to make sure we don't process it again
                            seenObjects.insert( newLoc );
                        }

                    }

                    if ( logop ) {
                        DEV verify( mods->size() );
                        BSONObj logObj = mss->getOpLogRewrite();
                        DEBUGUPDATE( "\t rewrite update: " << logObj );

                        // It is possible that the entire mod set was a no-op over this
                        // document.  We would have an empty log record in that case. If we
                        // call logOp, with an empty record, that would be replicated as "clear
                        // this record", which is not what we want. Therefore, to get a no-op
                        // in the replica, we simply don't log.
                        if ( logObj.nFields() ) {
                            logOp("u", ns, logObj , &pattern, 0, fromMigrate, &newObj );
                        }
                    }
                    numModded++;
                    if ( ! multi )
                        return UpdateResult( 1 , 1 , numModded , BSONObj() );
                    if ( willAdvanceCursor )
                        c->recoverFromTouchingEarlierIterate();

                    getDur().commitIfNeeded();

                    continue;
                }

                uassert( 10158 ,  "multi update only works with $ operators" , ! multi );

                BSONElementManipulator::lookForTimestamps( updateobj );
                checkNoMods( updateobj );
                theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, su);
                if ( logop ) {
                    DEV wassert( !su ); // superuser writes don't get logged; this would be bad.
                    logOp("u", ns, updateobj, &pattern, 0, fromMigrate, &updateobj );
                }
                return UpdateResult( 1 , 0 , 1 , BSONObj() );
            } while ( c->ok() );
        } // endif

        if ( numModded )
            return UpdateResult( 1 , 1 , numModded , BSONObj() );

        if ( upsert ) {
            if ( updateobj.firstElementFieldName()[0] == '$' ) {
                // upsert of an $operation. build a default object
                BSONObj newObj = mods->createNewFromQuery( patternOrig );
                checkNoMods( newObj );
                debug.fastmodinsert = true;
                theDataFileMgr.insertWithObjMod(ns, newObj, false, su);
                if ( logop )
                    logOp( "i", ns, newObj, 0, 0, fromMigrate, &newObj );

                return UpdateResult( 0 , 1 , 1 , newObj );
            }
            uassert( 10159 ,  "multi update only works with $ operators" , ! multi );
            checkNoMods( updateobj );
            debug.upsert = true;
            BSONObj no = updateobj;
            theDataFileMgr.insertWithObjMod(ns, no, false, su);
            if ( logop )
                logOp( "i", ns, no, 0, 0, fromMigrate, &no );
            return UpdateResult( 0 , 0 , 1 , no );
        }

        return UpdateResult( 0 , isOperatorUpdate , 0 , BSONObj() );
    }
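Much of the multi-update bookkeeping above exists to avoid updating the same document twice when it moves or when the cursor surfaces duplicate locations. The sketch below reduces that to its core dedup-by-location idea; multiUpdateSketch is a hypothetical helper and plain integers stand in for DiskLoc.

#include <set>
#include <vector>
#include <cstdint>
#include <cassert>

// Applies an "update" to each location produced by a cursor, skipping
// locations that were already rewritten (e.g. because the document moved and
// an index scan surfaced its new location again).
int multiUpdateSketch(const std::vector<int64_t>& cursorLocs) {
    std::set<int64_t> seen;
    int numModded = 0;
    for (int64_t loc : cursorLocs) {
        if (seen.count(loc)) continue;   // already updated at this location
        // ... apply mods here; if the document moved, record its new home ...
        seen.insert(loc);
        ++numModded;
    }
    return numModded;
}

int main() {
    // The second occurrence of location 7 is skipped.
    assert(multiUpdateSketch({3, 7, 7, 11}) == 3);
    return 0;
}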
Example #16
0
std::string DBHashCmd::hashCollection(OperationContext* opCtx,
                                      Database* db,
                                      const std::string& fullCollectionName,
                                      bool* fromCache) {
    stdx::unique_lock<stdx::mutex> cachedHashedLock(_cachedHashedMutex, stdx::defer_lock);

    if (isCachable(fullCollectionName)) {
        cachedHashedLock.lock();
        string hash = _cachedHashed[fullCollectionName];
        if (hash.size() > 0) {
            *fromCache = true;
            return hash;
        }
    }

    *fromCache = false;
    Collection* collection = db->getCollection(fullCollectionName);
    if (!collection)
        return "";

    IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex(opCtx);

    unique_ptr<PlanExecutor> exec;
    if (desc) {
        exec.reset(InternalPlanner::indexScan(opCtx,
                                              collection,
                                              desc,
                                              BSONObj(),
                                              BSONObj(),
                                              false,
                                              InternalPlanner::FORWARD,
                                              InternalPlanner::IXSCAN_FETCH));
    } else if (collection->isCapped()) {
        exec.reset(InternalPlanner::collectionScan(opCtx, fullCollectionName, collection));
    } else {
        log() << "can't find _id index for: " << fullCollectionName << endl;
        return "no _id index";
    }

    md5_state_t st;
    md5_init(&st);

    long long n = 0;
    PlanExecutor::ExecState state;
    BSONObj c;
    verify(NULL != exec.get());
    while (PlanExecutor::ADVANCED == (state = exec->getNext(&c, NULL))) {
        md5_append(&st, (const md5_byte_t*)c.objdata(), c.objsize());
        n++;
    }
    if (PlanExecutor::IS_EOF != state) {
        warning() << "error while hashing, db dropped? ns=" << fullCollectionName << endl;
    }
    md5digest d;
    md5_finish(&st, d);
    string hash = digestToString(d);

    if (cachedHashedLock.owns_lock()) {
        _cachedHashed[fullCollectionName] = hash;
    }

    return hash;
}
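The caching in hashCollection follows a small but useful pattern: take a deferred lock, consult the cache only for cacheable names, and store the computed value only if the lock was actually acquired. The sketch below reproduces that pattern with std:: primitives; isCacheableSketch, expensiveHash and hashCollectionSketch are illustrative stand-ins.

#include <map>
#include <mutex>
#include <string>
#include <iostream>

std::mutex cacheMutex;
std::map<std::string, std::string> cachedHashes;

bool isCacheableSketch(const std::string& name) {
    // Illustrative rule: only cache "config." collections.
    return name.compare(0, 7, "config.") == 0;
}

std::string expensiveHash(const std::string& name) {
    return "hash-of-" + name;  // stand-in for the md5-over-documents loop
}

std::string hashCollectionSketch(const std::string& name, bool* fromCache) {
    std::unique_lock<std::mutex> lk(cacheMutex, std::defer_lock);
    if (isCacheableSketch(name)) {
        lk.lock();
        auto it = cachedHashes.find(name);
        if (it != cachedHashes.end() && !it->second.empty()) {
            *fromCache = true;
            return it->second;
        }
    }
    *fromCache = false;
    std::string hash = expensiveHash(name);
    if (lk.owns_lock())              // only cache names we decided are cacheable
        cachedHashes[name] = hash;
    return hash;
}

int main() {
    bool fromCache = false;
    hashCollectionSketch("config.chunks", &fromCache);     // computes and caches
    hashCollectionSketch("config.chunks", &fromCache);     // served from the cache
    std::cout << std::boolalpha << fromCache << "\n";      // true
    return 0;
}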
Example #17
0
    UpdateResult _updateObjectsNEW( bool su,
                                    const char* ns,
                                    const BSONObj& updateobj,
                                    const BSONObj& patternOrig,
                                    bool upsert,
                                    bool multi,
                                    bool logop ,
                                    OpDebug& debug,
                                    RemoveSaver* rs,
                                    bool fromMigrate,
                                    const QueryPlanSelectionPolicy& planPolicy,
                                    bool forReplication ) {

        // TODO
        // + Separate UpdateParser from UpdateRunner (the latter should be "stage-y")
        //   + All the yield and deduplicate logic would move to the query stage
        //     portion of it
        //
        // + Replication related
        //   + fast path for update for query by _id
        //   + support for relaxing viable path constraint in replication
        //
        // + Field Management
        //   + Force all upsert to contain _id
        //   + Prevent changes to immutable fields (_id, and those mentioned by sharding)
        //
        //   + Yielding related
        //   + $atomic support (or better, support proper yielding if not)
        //   + page fault support

        debug.updateobj = updateobj;

        NamespaceDetails* d = nsdetails( ns );
        NamespaceDetailsTransient* nsdt = &NamespaceDetailsTransient::get( ns );

        UpdateDriver::Options opts;
        opts.multi = multi;
        opts.upsert = upsert;
        opts.logOp = logop;
        UpdateDriver driver( opts );
        Status status = driver.parse( nsdt->indexKeys(), updateobj );
        if ( !status.isOK() ) {
            uasserted( 16840, status.reason() );
        }

        shared_ptr<Cursor> cursor = getOptimizedCursor( ns, patternOrig, BSONObj(), planPolicy );

        // The 'cursor' the optimizer gave us may contain query plans that generate duplicate
        // diskloc's. Here we set up the mechanisms that will prevent us from processing those
        // twice if we see them. We also set up a 'ClientCursor' so that we can support
        // yielding.
        const bool dedupHere = cursor->autoDedup();
        shared_ptr<Cursor> cPtr = cursor;
        auto_ptr<ClientCursor> clientCursor( new ClientCursor( QueryOption_NoCursorTimeout,
                                                               cPtr,
                                                               ns ) );

        //
        // Upsert Logic
        //

        // We may or may not have documents for this update. If we don't, then try to upsert,
        // if allowed.
        if ( !cursor->ok() && upsert ) {

            // If this is a $mod-based update, we need to generate a document by examining the
            // query and the mods. Otherwise, we can use the object replacement sent by the user
            // update command that was parsed by the driver before.
            BSONObj oldObj;
            if ( *updateobj.firstElementFieldName() == '$' ) {
                if ( !driver.createFromQuery( patternOrig, &oldObj ) ) {
                    uasserted( 16835, "cannot create object to update" );
                }
                debug.fastmodinsert = true;
            }
            else {
                debug.upsert = true;
            }

            // Since this is an upsert, we will be oplogging it as an insert. We don't
            // need the driver's help to build the oplog record, then. We also set the
            // context of the update driver to an "upsert". Some mods may only work in that
            // context (e.g. $setOnInsert).
            driver.setLogOp( false );
            driver.setContext( ModifierInterface::ExecInfo::INSERT_CONTEXT );

            mutablebson::Document doc( oldObj, mutablebson::Document::kInPlaceDisabled );
            status = driver.update( StringData(), &doc, NULL /* no oplog record */);
            if ( !status.isOK() ) {
                uasserted( 16836, status.reason() );
            }
            BSONObj newObj = doc.getObject();

            theDataFileMgr.insertWithObjMod( ns, newObj, false, su );

            if ( logop ) {
                logOp( "i", ns, newObj, 0, 0, fromMigrate, &newObj );
            }

            return UpdateResult( false /* updated a non-existing document */,
                                 driver.dollarModMode() /* $mod or obj replacement? */,
                                 1 /* count of updated documents */,
                                 newObj /* object that was upserted */ );
        }

        //
        // We have one or more documents for this update.
        //

        // We record that this will not be an upsert, in case a mod doesn't want to be applied
        // when in strict update mode.
        driver.setContext( ModifierInterface::ExecInfo::UPDATE_CONTEXT );

        // Let's fetch each of them and pipe them through the update expression, making sure to
        // keep track of the necessary stats. Recall that we'll be pulling documents out of
        // cursors and some of them do not deduplicate the entries they generate. We have
        // deduping logic in here, too -- for now.
        unordered_set<DiskLoc, DiskLoc::Hasher> seenLocs;
        int numUpdated = 0;
        debug.nscanned = 0;
        while ( cursor->ok() ) {

            // Let's fetch the next candidate object for this update.
            Record* r = cursor->_current();
            DiskLoc loc = cursor->currLoc();
            const BSONObj oldObj = loc.obj();

            // We count how many documents we scanned even though we may skip those that are
            // deemed duplicated. The final 'numUpdated' and 'nscanned' numbers may differ for
            // that reason.
            debug.nscanned++;

            // Skips this document if it:
            // a) doesn't match the query portion of the update
            // b) was deemed duplicate by the underlying cursor machinery
            //
            // Now, if we are going to update the document,
            // c) we don't want to do so while the cursor is at it, as that may invalidate
            // the cursor. So, we advance to the next document before issuing the update.
            MatchDetails matchDetails;
            matchDetails.requestElemMatchKey();
            if ( !cursor->currentMatches( &matchDetails ) ) {
                // a)
                cursor->advance();
                continue;
            }
            else if ( cursor->getsetdup( loc ) && dedupHere ) {
                // b)
                cursor->advance();
                continue;
            }
            else if (driver.dollarModMode() && multi) {
                // c)
                cursor->advance();
                if ( dedupHere ) {
                    if ( seenLocs.count( loc ) ) {
                        continue;
                    }
                }

                // There are certain kinds of cursors that hold multiple pointers to data
                // underneath. $or cursors are one example. In a $or cursor, it may be the case
                // that when we did the last advance(), we finished consuming documents from
                // one of $or child and started consuming the next one. In that case, it is
                // possible that the last document of the previous child is the same as the
                // first document of the next (see SERVER-5198 and jstests/orp.js).
                //
                // So we advance the cursor here until we see a new diskloc.
                //
                // Note that we won't be yielding, and we may not do so for a while if we find
                // a particularly duplicated sequence of loc's. That is highly unlikely,
                // though.  (See SERVER-5725, if curious, but "stage" based $or will make that
                // ticket moot).
                while( cursor->ok() && loc == cursor->currLoc() ) {
                    cursor->advance();
                }
            }

            // For some (unfortunate) historical reasons, not all cursors would be valid after
            // a write simply because we advanced them to a document not affected by the write.
            // To protect against those cases, not only do we engage in the advance() logic above,
            // but we also tell the cursor we're about to write a document that we've just seen.
            // prepareToTouchEarlierIterate() requires calling later
            // recoverFromTouchingEarlierIterate(), so we make a note here to do so.
            bool touchPreviousDoc = multi && cursor->ok();
            if ( touchPreviousDoc  ) {
                clientCursor->setDoingDeletes( true );
                cursor->prepareToTouchEarlierIterate();
            }

            // Ask the driver to apply the mods. It may be that the driver can apply those "in
            // place", that is, some values of the old document just get adjusted without any
            // change to the binary layout on the bson layer. It may be that a whole new
            // document is needed to accommodate the new bson layout of the resulting document.
            mutablebson::Document doc( oldObj, mutablebson::Document::kInPlaceEnabled );
            BSONObj logObj;
            StringData matchedField = matchDetails.hasElemMatchKey() ?
                                                    matchDetails.elemMatchKey():
                                                    StringData();
            status = driver.update( matchedField, &doc, &logObj );
            if ( !status.isOK() ) {
                uasserted( 16837, status.reason() );
            }

            // If the driver applied the mods in place, we can ask the mutable for what
            // changed. We call those changes "damages". :) We use the damages to inform the
            // journal what was changed, and then apply them to the original document
            // ourselves. If, however, the driver applied the mods out of place, we ask it to
            // generate a new, modified document for us. In that case, the file manager will
            // take care of the journaling details for us.
            //
            // This code flow is admittedly odd. But, right now, journaling is baked into the
            // file manager. And if we aren't using the file manager, we have to do journaling
            // ourselves.
            BSONObj newObj;
            const char* source = NULL;
            mutablebson::DamageVector damages;
            bool inPlace = doc.getInPlaceUpdates(&damages, &source);
            if ( inPlace && !driver.modsAffectIndices() ) {

                // All updates were in place. Apply them via durability and writing pointer.
                mutablebson::DamageVector::const_iterator where = damages.begin();
                const mutablebson::DamageVector::const_iterator end = damages.end();
                for( ; where != end; ++where ) {
                    const char* sourcePtr = source + where->sourceOffset;
                    void* targetPtr = getDur().writingPtr(
                        const_cast<char*>(oldObj.objdata()) + where->targetOffset,
                        where->size);
                    std::memcpy(targetPtr, sourcePtr, where->size);
                }
                newObj = oldObj;
                debug.fastmod = true;
            }
            else {

                // The updates were not in place. Apply them through the file manager.
                newObj = doc.getObject();
                DiskLoc newLoc = theDataFileMgr.updateRecord(ns,
                                                             d,
                                                             nsdt,
                                                             r,
                                                             loc,
                                                             newObj.objdata(),
                                                             newObj.objsize(),
                                                             debug);

                // If we've moved this object to a new location, make sure we don't apply
                // that update again if our traversal picks the object up again.
                //
                // We also take note of the diskloc if the updates affect indices.
                // Chances are that we're traversing one of them, and they may be multikey and
                // therefore yield duplicate disklocs.
                if ( newLoc != loc || driver.modsAffectIndices()  ) {
                    seenLocs.insert( newLoc );
                }
            }

            // Log Obj
            if ( logop ) {
                if ( !logObj.isEmpty() ) {
                    BSONObj pattern = patternOrig;
                    logOp("u", ns, logObj , &pattern, 0, fromMigrate, &newObj );
                }
            }

            // One more document updated.
            numUpdated++;

            if (!multi) {
                break;
            }

            // If we used the cursor mechanism that prepares an earlier seen document for a
            // write we need to tell such mechanisms that the write is over.
            if ( touchPreviousDoc ) {
                cursor->recoverFromTouchingEarlierIterate();
            }

            getDur().commitIfNeeded();

        }

        return UpdateResult( true /* updated existing object(s) */,
                             driver.dollarModMode() /* $mod or obj replacement */,
                             numUpdated /* # of documents updated */,
                             BSONObj() );
    }
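When the driver applies mods in place, the code above receives a damage vector: a list of (sourceOffset, targetOffset, size) copies from a scratch buffer onto the live document, replayed through the journal's writing pointer. The sketch below applies such a vector directly; the Damage struct and applyDamages are simplified stand-ins for the mutablebson types.

#include <cstring>
#include <vector>
#include <string>
#include <cassert>

// One in-place edit: copy 'size' bytes from source+sourceOffset to
// target+targetOffset (conceptually mirrors a mutablebson damage event).
struct Damage {
    size_t sourceOffset;
    size_t targetOffset;
    size_t size;
};

void applyDamages(char* target, const char* source, const std::vector<Damage>& damages) {
    for (const Damage& d : damages)
        std::memcpy(target + d.targetOffset, source + d.sourceOffset, d.size);
}

int main() {
    char doc[] = "value:AAAA";          // pretend this is the on-disk document
    const char scratch[] = "BBBB";      // new bytes produced by the update driver
    applyDamages(doc, scratch, {{0, 6, 4}});
    assert(std::string(doc) == "value:BBBB");
    return 0;
}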
Example #18
0
    void applyOperation_inlock(const BSONObj& op , bool fromRepl ) {
        OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;

        if( logLevel >= 6 )
            log() << "applying op: " << op << endl;

        assertInWriteLock();

        OpDebug debug;
        BSONObj o = op.getObjectField("o");
        const char *ns = op.getStringField("ns");
        // operation type -- see logOp() comments for types
        const char *opType = op.getStringField("op");

        if ( *opType == 'i' ) {
            opCounters->gotInsert();

            const char *p = strchr(ns, '.');
            if ( p && strcmp(p, ".system.indexes") == 0 ) {
                // updates aren't allowed for indexes -- so we will do a regular insert. if index already
                // exists, that is ok.
                theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize());
            }
            else {
                // do upserts for inserts as we might get replayed more than once
                BSONElement _id;
                if( !o.getObjectID(_id) ) {
                    /* No _id.  This will be very slow. */
                    Timer t;
                    updateObjects(ns, o, o, true, false, false , debug );
                    if( t.millis() >= 2 ) {
                        RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                    }
                }
                else {
                    BSONObjBuilder b;
                    b.append(_id);

                    /* erh 10/16/2009 - this is probably not relevant any more since it's auto-created, but not worth removing */
                    RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow

                    /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
                              then.  very few upserts will not be inserts...
                              */
                    updateObjects(ns, o, b.done(), true, false, false , debug );
                }
            }
        }
        else if ( *opType == 'u' ) {
            opCounters->gotUpdate();

            RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow
            updateObjects(ns, o, op.getObjectField("o2"), /*upsert*/ op.getBoolField("b"), /*multi*/ false, /*logop*/ false , debug );
        }
        else if ( *opType == 'd' ) {
            opCounters->gotDelete();

            if ( opType[1] == 0 )
                deleteObjects(ns, o, op.getBoolField("b"));
            else
                assert( opType[1] == 'b' ); // "db" advertisement
        }
        else if ( *opType == 'n' ) {
            // no op
        }
        else if ( *opType == 'c' ) {
            opCounters->gotCommand();

            BufBuilder bb;
            BSONObjBuilder ob;
            _runCommands(ns, o, bb, ob, true, 0);
        }
        else {
            stringstream ss;
            ss << "unknown opType [" << opType << "]";
            throw MsgAssertionException( 13141 , ss.str() );
        }

    }
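applyOperation_inlock is essentially a dispatcher keyed on the single-character op type stored in each oplog entry ('i' insert, 'u' update, 'd' delete, 'n' no-op, 'c' command). A compact sketch of that dispatch is below; applyOpSketch is a hypothetical helper that returns a description instead of performing the write.

#include <stdexcept>
#include <string>
#include <iostream>

// Dispatch an oplog entry by its op type, mirroring the branches above.
std::string applyOpSketch(char opType) {
    switch (opType) {
        case 'i': return "insert (applied as an upsert so replays are idempotent)";
        case 'u': return "update";
        case 'd': return "delete";
        case 'n': return "no-op";
        case 'c': return "run command";
        default:  throw std::runtime_error(std::string("unknown opType [") + opType + "]");
    }
}

int main() {
    std::cout << applyOpSketch('i') << "\n";
    std::cout << applyOpSketch('n') << "\n";
    return 0;
}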
Example #19
0
bool ShardedClientCursor::sendNextBatch(int batchSize, BufBuilder& buffer, int& docCount) {
    uassert(10191, "cursor already done", !_done);

    int maxSize = 1024 * 1024;
    if (_totalSent > 0)
        maxSize *= 3;

    docCount = 0;

    // If batchSize is negative, it means that we should send up to -batchSize results
    // back to the client, and that we should only send a *single batch*. A batchSize of
    // 1 is also a special case which means "return up to 1 result in a single batch" (so
    // that +1 actually has the same meaning as -1). For all other values of batchSize, we
    // may have to return multiple batches.
    const bool sendMoreBatches = batchSize == 0 || batchSize > 1;
    batchSize = abs(batchSize);

    // Set the initial batch size to 101, just like mongoD.
    if (batchSize == 0 && _totalSent == 0)
        batchSize = 101;

    // Set batch size to batchSize requested by the current operation unconditionally.  This is
    // necessary because if the loop exited due to docCount == batchSize then setBatchSize(0) was
    // called, so the next _cursor->more() will be called with a batch size of 0 if the cursor
    // buffer was drained the previous run.  Unconditionally setting the batch size ensures that
    // we don't ask for a batch size of zero as a side effect.
    _cursor->setBatchSize(batchSize);

    bool cursorHasMore = true;
    while ((cursorHasMore = _cursor->more())) {
        BSONObj o = _cursor->next();
        buffer.appendBuf((void*)o.objdata(), o.objsize());
        ++docCount;

        // Ensure that the next batch will never wind up requesting more docs from the shard
        // than are remaining to satisfy the initial batchSize.
        if (batchSize != 0) {
            if (docCount == batchSize)
                break;
            _cursor->setBatchSize(batchSize - docCount);
        }

        if (buffer.len() > maxSize) {
            break;
        }
    }

    // We need to request another batch if the following two conditions hold:
    //
    //  1. batchSize is positive and not equal to 1 (see the comment above). This condition
    //  is stored in 'sendMoreBatches'.
    //
    //  2. The last call to _cursor->more() was true (i.e. we never explicitly got a false
    //  value from _cursor->more()). This condition is stored in 'cursorHasMore'. If the server
    //  hits EOF while executing a query or a getmore, it will pass a cursorId of 0 in the
    //  query response to indicate that there are no more results. In this case, _cursor->more()
    //  will be explicitly false, and we know for sure that we do not have to send more batches.
    //
    //  On the other hand, if _cursor->more() is true there may or may not be more results.
    //  Suppose that the mongod generates enough results to fill this batch. In this case it
    //  does not know whether or not there are more, because doing so would require requesting an
    //  extra result and seeing whether we get EOF. The mongod sends a valid cursorId to
    //  indicate that there may be more. We do the same here: we indicate that there may be
    //  more results to retrieve by setting 'hasMoreBatches' to true.
    bool hasMoreBatches = sendMoreBatches && cursorHasMore;

    LOG(5) << "\t hasMoreBatches: " << hasMoreBatches << " sendMoreBatches: " << sendMoreBatches
           << " cursorHasMore: " << cursorHasMore << " batchSize: " << batchSize
           << " num: " << docCount << " id:" << getId() << " totalSent: " << _totalSent << endl;

    _totalSent += docCount;
    _done = !hasMoreBatches;

    return hasMoreBatches;
}
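
To make the batchSize convention above explicit, here is a minimal standalone sketch in plain C++ (not MongoDB code; BatchPlan and planBatch are invented names for illustration) of the rules sendNextBatch applies before asking the shard for documents.

#include <cstdlib>
#include <iostream>

struct BatchPlan {
    int  effectiveBatchSize;  // how many documents to request next
    bool singleBatchOnly;     // true when the client asked for exactly one batch
};

BatchPlan planBatch(int requestedBatchSize, int totalSentSoFar) {
    // batchSize == 0 or > 1 may require more than one batch; -N and +1 do not.
    const bool singleBatchOnly = !(requestedBatchSize == 0 || requestedBatchSize > 1);
    int effective = std::abs(requestedBatchSize);
    // Mirror the "first batch defaults to 101" rule from sendNextBatch().
    if (effective == 0 && totalSentSoFar == 0)
        effective = 101;
    return {effective, singleBatchOnly};
}

int main() {
    BatchPlan p = planBatch(-5, 0);
    std::cout << p.effectiveBatchSize << " singleBatchOnly=" << p.singleBatchOnly << "\n";  // prints "5 singleBatchOnly=1"
    return 0;
}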
Beispiel #20
0
   // PD_TRACE_DECLARE_FUNCTION ( SDB_RTNINSERT2, "rtnInsert" )
   INT32 rtnInsert ( const CHAR *pCollectionName, BSONObj &objs, INT32 objNum,
                     INT32 flags, pmdEDUCB *cb, SDB_DMSCB *dmsCB,
                     SDB_DPSCB *dpsCB, INT16 w )
   {
      INT32 rc = SDB_OK ;
      PD_TRACE_ENTRY ( SDB_RTNINSERT2 ) ;
      SDB_ASSERT ( pCollectionName, "collection name can't be NULL" ) ;
      SDB_ASSERT ( cb, "educb can't be NULL" ) ;
      SDB_ASSERT ( dmsCB, "dmsCB can't be NULL" ) ;
      dmsStorageUnit *su = NULL ;
      dmsStorageUnitID suID = DMS_INVALID_CS ;
      const CHAR *pCollectionShortName = NULL ;
      UINT32 insertCount = 0 ;
      BOOLEAN writable = FALSE ;

      ossValuePtr pDataPos = 0 ;
      rc = dmsCB->writable( cb ) ;
      if ( rc )
      {
         PD_LOG ( PDERROR, "Database is not writable, rc = %d", rc ) ;
         goto error;
      }
      writable = TRUE;

      rc = rtnResolveCollectionNameAndLock ( pCollectionName, dmsCB, &su,
                                             &pCollectionShortName, suID ) ;
      if ( rc )
      {
         PD_LOG ( PDERROR, "Failed to resolve collection name %s",
                  pCollectionName ) ;
         goto error ;
      }

      if ( objs.isEmpty () )
      {
         PD_LOG ( PDERROR, "Insert record can't be empty" ) ;
         rc = SDB_INVALIDARG ;
         goto error ;
      }

      pDataPos = (ossValuePtr)objs.objdata() ;
      for ( INT32 i = 0 ; i < objNum ; ++i )
      {
         if ( ++insertCount > RTN_INSERT_ONCE_NUM )
         {
            insertCount = 0 ;
            if ( cb->isInterrupted() )
            {
               rc = SDB_APP_INTERRUPT ;
               goto error ;
            }
         }

         try
         {
            BSONObj record ( (const CHAR*)pDataPos ) ;
            rc = su->insertRecord ( pCollectionShortName, record, cb, dpsCB ) ;
            if ( rc )
            {
               if ( ( SDB_IXM_DUP_KEY == rc ) &&
                    ( FLG_INSERT_CONTONDUP & flags ) )
               {
                  rc = SDB_OK ;
               }
               else
               {
                  PD_LOG ( PDERROR, "Failed to insert record %s into "
                           "collection: %s, rc: %d", record.toString().c_str(),
                           pCollectionName, rc ) ;
                  goto error ;
               }
            }
            pDataPos += ossAlignX ( (ossValuePtr)record.objsize(), 4 ) ;
         }
         catch ( std::exception &e )
         {
            PD_LOG ( PDERROR, "Failed to convert to BSON and insert to "
                     "collection: %s", e.what() ) ;
            rc = SDB_INVALIDARG ;
            goto error ;
         }
      }

   done :
      if ( DMS_INVALID_CS != suID )
      {
         dmsCB->suUnlock ( suID ) ;
      }
      if ( writable )
      {
         dmsCB->writeDown( cb );
      }
      if ( cb )
      {
         if ( SDB_OK == rc && dpsCB )
         {
            rc = dpsCB->completeOpr( cb, w ) ;
         }
      }
      PD_TRACE_EXITRC ( SDB_RTNINSERT2, rc ) ;
      return rc ;
   error :
      goto done ;
   }
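
As a rough standalone illustration of the buffer walk in rtnInsert above (plain C++, not SequoiaDB code; alignUp and walkRecords are invented names), the sketch below advances through length-prefixed records packed back to back, rounding each record's size up to a 4-byte boundary much like ossAlignX does.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Round n up to the next multiple of boundary (analogous to ossAlignX(n, 4)).
static size_t alignUp(size_t n, size_t boundary) {
    return (n + boundary - 1) / boundary * boundary;
}

// Each record starts with a 4-byte little-endian total length, like a BSON object.
void walkRecords(const char* data, int recordCount) {
    const char* pos = data;
    for (int i = 0; i < recordCount; ++i) {
        int32_t len = 0;
        std::memcpy(&len, pos, sizeof(len));
        std::cout << "record " << i << " has " << len << " bytes\n";
        pos += alignUp(static_cast<size_t>(len), 4);  // step over the padded record
    }
}

int main() {
    // Two fake 5-byte records; each is padded to 8 bytes so the next one stays aligned.
    std::vector<char> buf(16, 0);
    int32_t len = 5;
    std::memcpy(buf.data(), &len, sizeof(len));
    std::memcpy(buf.data() + 8, &len, sizeof(len));
    walkRecords(buf.data(), 2);
    return 0;
}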
Beispiel #21
0
    bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
        DEV verify( shardingState.enabled() );

        int op = m.operation();
        if ( op < 2000
                || op >= 3000
                || op == dbGetMore  // cursors are weird
           )
            return false;

        DbMessage d(m);
        const char *ns = d.getns();
        string errmsg;
        // We don't care about the version here, since we're returning it later in the writeback
        ChunkVersion received, wanted;
        if ( shardVersionOk( ns , errmsg, received, wanted ) ) {
            return false;
        }

        bool getsAResponse = doesOpGetAResponse( op );

        LOG(1) << "connection sharding metadata does not match for collection " << ns
               << ", will retry (wanted : " << wanted << ", received : " << received << ")"
               << ( getsAResponse ? "" : " (queuing writeback)" ) << endl;

        if( getsAResponse ){
            verify( dbresponse );
            BufBuilder b( 32768 );
            b.skip( sizeof( QueryResult ) );
            {
                BSONObjBuilder bob;

                bob.append( "$err", errmsg );
                bob.append( "ns", ns );
                wanted.addToBSON( bob, "vWanted" );
                received.addToBSON( bob, "vReceived" );

                BSONObj obj = bob.obj();

                b.appendBuf( obj.objdata() , obj.objsize() );
            }

            QueryResult *qr = (QueryResult*)b.buf();
            qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation( opReply );
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            b.decouple();

            Message * resp = new Message();
            resp->setData( qr , true );

            dbresponse->response = resp;
            dbresponse->responseTo = m.header()->id;
            return true;
        }

        uassert(9517, "cannot queue a writeback operation to the writeback queue",
                (d.reservedField() & Reserved_FromWriteback) == 0);

        const OID& clientID = ShardedConnectionInfo::get(false)->getID();
        massert( 10422 ,  "write with bad shard config and no server id!" , clientID.isSet() );

        // We need to check this here, since otherwise we'll get errors wrapping the writeback -
        // not just here, but also when returning as a command result.
        // We choose 1/2 the overhead of the internal maximum so that we can still handle ops of
        // 16MB exactly.
        massert( 16437, "data size of operation is too large to queue for writeback",
                 m.dataSize() < BSONObjMaxInternalSize - (8 * 1024));

        LOG(1) << "writeback queued for " << m.toString() << endl;

        BSONObjBuilder b;
        b.appendBool( "writeBack" , true );
        b.append( "ns" , ns );
        b.append( "connectionId" , cc().getConnectionId() );
        b.append( "instanceIdent" , prettyHostName() );
        wanted.addToBSON( b );
        received.addToBSON( b, "yourVersion" );

        b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
        LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
        
        // We pass the builder to queueWriteBack so that it can select the writebackId.
        // This matters because the id must be guaranteed to be ascending: mongos assumes
        // that if it has seen a greater writeback, all earlier ones have been processed.
        OID writebackID = writeBackManager.queueWriteBack( clientID.str() , b );

        lastError.getSafe()->writeback( writebackID );

        return true;
    }
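
The opcode gate at the top of _handlePossibleShardedMessage can be read as a small predicate; the sketch below (plain C++, not MongoDB code) restates it, assuming the classic wire-protocol value of 2005 for OP_GET_MORE.

// Only data-bearing database ops in the 2000-2999 range are candidates for a shard
// version check; getMore is skipped because cursors carry their own routing state.
constexpr int kOpGetMore = 2005;  // assumed classic wire-protocol OP_GET_MORE value

bool mightNeedShardVersionCheck(int op) {
    if (op < 2000 || op >= 3000)  // not a database operation
        return false;
    if (op == kOpGetMore)         // "cursors are weird"
        return false;
    return true;
}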
Beispiel #22
0
    StatusWith<DiskLoc> Collection::updateDocument( OperationContext* txn,
                                                    const DiskLoc& oldLocation,
                                                    const BSONObj& objNew,
                                                    bool enforceQuota,
                                                    OpDebug* debug ) {

        BSONObj objOld = _recordStore->dataFor( txn, oldLocation ).toBson();

        if ( objOld.hasElement( "_id" ) ) {
            BSONElement oldId = objOld["_id"];
            BSONElement newId = objNew["_id"];
            if ( oldId != newId )
                return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                            "in Collection::updateDocument _id mismatch",
                                            13596 );
        }

        if ( ns().coll() == "system.users" ) {
            // XXX - andy and spencer think this should go away now
            V2UserDocumentParser parser;
            Status s = parser.checkValidUserDocument(objNew);
            if ( !s.isOK() )
                return StatusWith<DiskLoc>( s );
        }

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        OwnedPointerMap<IndexDescriptor*,UpdateTicket> updateTickets;
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator( txn, true );
        while ( ii.more() ) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique())
                || repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);
            UpdateTicket* updateTicket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = updateTicket;
            Status ret = iam->validateUpdate(txn, objOld, objNew, oldLocation, options, updateTicket );
            if ( !ret.isOK() ) {
                return StatusWith<DiskLoc>( ret );
            }
        }

        // This can call back into Collection::recordStoreGoingToMove.
        StatusWith<DiskLoc> newLocation = _recordStore->updateRecord( txn,
                                                                      oldLocation,
                                                                      objNew.objdata(),
                                                                      objNew.objsize(),
                                                                      _enforceQuota( enforceQuota ),
                                                                      this );

        if ( !newLocation.isOK() ) {
            return newLocation;
        }

        _infoCache.notifyOfWriteOp();

        if ( newLocation.getValue() != oldLocation ) {

            if ( debug ) {
                if (debug->nmoved == -1) // default of -1 rather than 0
                    debug->nmoved = 1;
                else
                    debug->nmoved += 1;
            }

            _indexCatalog.indexRecord(txn, objNew, newLocation.getValue());

            return newLocation;
        }

        if ( debug )
            debug->keyUpdates = 0;

        ii = _indexCatalog.getIndexIterator( txn, true );
        while ( ii.more() ) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

            int64_t updatedKeys;
            Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
            if ( !ret.isOK() )
                return StatusWith<DiskLoc>( ret );
            if ( debug )
                debug->keyUpdates += updatedKeys;
        }

        // Broadcast the mutation so that query results stay correct.
        _cursorCache.invalidateDocument(oldLocation, INVALIDATION_MUTATION);

        return newLocation;
    }
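
The dupsAllowed expression in the update-ticket loop above packs three conditions into one line; a small restatement (plain C++, not MongoDB code; the function name is invented) may help.

// Duplicates are tolerated unless the index is the _id index or declared unique,
// except when the replication coordinator says unique constraints should be
// ignored for this index.
bool computeDupsAllowed(bool isIdIndex, bool isUnique, bool replIgnoresUniqueIndexes) {
    return !(isIdIndex || isUnique) || replIgnoresUniqueIndexes;
}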
Beispiel #23
0
    bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result) { 
        //int les = d->lastExtentSize;

        // this is a big job, so might as well make things tidy before we start just to be nice.
        getDur().commitNow();

        list<DiskLoc> extents;
        for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext ) 
            extents.push_back(L);
        log() << "compact " << extents.size() << " extents" << endl;

        ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) );

        // same data, but might perform a little differently after compact?
        NamespaceDetailsTransient::get_w(ns).clearQueryCache();

        int nidx = d->nIndexes;
        scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] );
        scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] );
        {
            NamespaceDetails::IndexIterator ii = d->ii(); 
            int x = 0;
            while( ii.more() ) { 
                BSONObjBuilder b;
                BSONObj::iterator i(ii.next().info.obj());
                while( i.more() ) { 
                    BSONElement e = i.next();
                    if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) {
                        b.append(e);
                    }
                }
                BSONObj o = b.obj().getOwned();
                phase1[x].sorter.reset( new BSONObjExternalSorter( o.getObjectField("key") ) );
                phase1[x].sorter->hintNumObjects( d->stats.nrecords );
                indexSpecs[x++].reset(o);
            }
        }

        log() << "compact orphan deleted lists" << endl;
        for( int i = 0; i < Buckets; i++ ) { 
            d->deletedList[i].writing().Null();
        }

        // before dropping indexes, at least make sure we can allocate one extent!
        uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1).isNull());

        // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
        log() << "compact dropping indexes" << endl;
        BSONObjBuilder b;
        if( !dropIndexes(d, ns, "*", errmsg, b, true) ) { 
            errmsg = "compact drop indexes failed";
            log() << errmsg << endl;
            return false;
        }

        getDur().commitNow();

        long long skipped = 0;
        int n = 0;
        for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) { 
            skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate);
            pm.hit();
        }

        if( skipped ) {
            result.append("invalidObjects", skipped);
        }

        assert( d->firstExtent.ext()->xprev.isNull() );

        // indexes will do their own progress meter?
        pm.finished();

        // build indexes
        NamespaceString s(ns);
        string si = s.db + ".system.indexes";
        for( int i = 0; i < nidx; i++ ) {
            killCurrentOp.checkForInterrupt(false);
            BSONObj info = indexSpecs[i].info;
            log() << "compact create index " << info["key"].Obj().toString() << endl;
            try {
                precalced = &phase1[i];
                theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
            }
            catch(...) { 
                precalced = 0;
                throw;
            }
            precalced = 0;
        }

        return true;
    }
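
For the index-spec cleanup at the start of _compact, here is a rough standalone analogue (plain C++ with string pairs standing in for BSON elements; not MongoDB code) of dropping the "v" and "background" fields before the specs are stored for rebuilding.

#include <string>
#include <utility>
#include <vector>

using Field = std::pair<std::string, std::string>;  // (name, value) stand-in for a BSONElement

// Copy every field except "v" (index version) and "background", mirroring the
// filter loop in _compact() above.
std::vector<Field> stripCompactIgnoredFields(const std::vector<Field>& spec) {
    std::vector<Field> out;
    for (const Field& f : spec) {
        if (f.first == "v" || f.first == "background")
            continue;
        out.push_back(f);
    }
    return out;
}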
Beispiel #24
0
static void insert( const BSONObj &o, bool god = false ) {
    dblock lk;
    Client::Context ctx( ns() );
    theDataFileMgr.insert( ns(), o.objdata(), o.objsize(), god );
}
Beispiel #25
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        QLOG() << "running getMore in new system, cursorid " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        replVerifyReadsOk();

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            Runner* runner = cc->getRunner();
            const int queryOptions = cc->queryOptions();

            // Get results out of the runner.
            runner->restoreState();

            BSONObj obj;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
                // If we're sharded make sure that we don't return any data that hasn't been
                // migrated off of our shard yet.
                if (collMetadata) {
                    KeyPattern kp(collMetadata->getKeyPattern());
                    if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
                }

                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((numResults && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            if (Runner::RUNNER_EOF == state
                && 0 == numResults
                && (queryOptions & QueryOption_CursorTailable)
                && (queryOptions & QueryOption_AwaitData)
                && (pass < 1000)) {
                // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
                // data some # of times first.
                return 0;
            }
            else if (Runner::RUNNER_DEAD == state || Runner::RUNNER_EOF == state) {
                ccPin.free();
                // cc is now invalid, as is the runner
                cursorid = 0;
                cc = NULL;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                runner->saveState();

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        return qr;
    }
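
One branch in newGetMore that is easy to miss is the tailable/awaitData retry; the sketch below (plain C++, not MongoDB code) restates that decision, assuming the classic wire-protocol flag bits for QueryOption_CursorTailable and QueryOption_AwaitData.

// A tailable, awaitData cursor that reached EOF without producing any results is
// retried (the caller returns 0 and tries again later) rather than being reported
// as exhausted, up to a bounded number of passes.
bool shouldRetryTailableGetMore(bool reachedEof, int numResults, int queryOptions, int pass) {
    const int kCursorTailable = 1 << 1;  // assumed QueryOption_CursorTailable bit
    const int kAwaitData      = 1 << 5;  // assumed QueryOption_AwaitData bit
    return reachedEof && numResults == 0 &&
           (queryOptions & kCursorTailable) != 0 &&
           (queryOptions & kAwaitData) != 0 &&
           pass < 1000;
}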
Beispiel #26
0
    /** object cannot be represented in compact format.  so store in traditional bson format 
        with a leading sentinel byte IsBSON to indicate it's in that format.

        Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here 
        so that we don't have to do an extra malloc.
    */
    void KeyV1Owned::traditional(const BSONObj& obj) { 
        b.reset();
        b.appendUChar(IsBSON);
        b.appendBuf(obj.objdata(), obj.objsize());
        _keyData = (const unsigned char *) b.buf();
    }
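
The fallback encoding in KeyV1Owned::traditional is simply "sentinel byte, then raw BSON"; a standalone sketch (plain C++, not MongoDB code; the sentinel value is an illustrative placeholder, not the real IsBSON constant) looks like this.

#include <cstddef>
#include <cstdint>
#include <vector>

constexpr uint8_t kIsBSONSentinel = 0xFF;  // placeholder value for illustration only

// Prefix the raw object bytes with a one-byte marker so readers can tell this
// "traditional" encoding apart from the compact key format.
std::vector<uint8_t> wrapTraditional(const uint8_t* objData, std::size_t objSize) {
    std::vector<uint8_t> out;
    out.reserve(objSize + 1);
    out.push_back(kIsBSONSentinel);
    out.insert(out.end(), objData, objData + objSize);
    return out;
}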
Beispiel #27
0
std::string runQuery(OperationContext* txn,
                     QueryMessage& q,
                     const NamespaceString& nss,
                     Message& result) {
    CurOp& curop = *CurOp::get(txn);
    // Validate the namespace.
    uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid());
    invariant(!nss.isCommand());

    // Set curop information.
    beginQueryOp(txn, nss, q.query, q.ntoreturn, q.ntoskip);

    // Parse the qm into a CanonicalQuery.

    auto statusWithCQ = CanonicalQuery::canonicalize(q, ExtensionsCallbackReal(txn, &nss));
    if (!statusWithCQ.isOK()) {
        uasserted(
            17287,
            str::stream() << "Can't canonicalize query: " << statusWithCQ.getStatus().toString());
    }
    unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
    invariant(cq.get());

    LOG(5) << "Running query:\n" << cq->toString();
    LOG(2) << "Running query: " << cq->toStringShort();

    // Parse, canonicalize, plan, transcribe, and get a plan executor.
    AutoGetCollectionForRead ctx(txn, nss);
    Collection* collection = ctx.getCollection();

    const int dbProfilingLevel =
        ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

    // We have a parsed query. Time to get the execution plan for it.
    std::unique_ptr<PlanExecutor> exec = uassertStatusOK(
        getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO));

    const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

    // If it's actually an explain, do the explain and return rather than falling through
    // to the normal query execution loop.
    if (pq.isExplain()) {
        BufBuilder bb;
        bb.skip(sizeof(QueryResult::Value));

        BSONObjBuilder explainBob;
        Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

        // Add the resulting object to the return buffer.
        BSONObj explainObj = explainBob.obj();
        bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

        // TODO: Does this get overwritten/do we really need to set this twice?
        curop.debug().query = q.query;

        // Set query result fields.
        QueryResult::View qr = bb.buf();
        bb.decouple();
        qr.setResultFlagsToOk();
        qr.msgdata().setLen(bb.len());
        curop.debug().responseLength = bb.len();
        qr.msgdata().setOperation(opReply);
        qr.setCursorId(0);
        qr.setStartingFrom(0);
        qr.setNReturned(1);
        result.setData(qr.view2ptr(), true);
        return "";
    }

    ShardingState* const shardingState = ShardingState::get(txn);

    // We freak out later if this changes before we're done with the query.
    const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());

    // Handle query option $maxTimeMS (not used with commands).
    curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
    txn->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

    // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
    bool slaveOK = pq.isSlaveOk() || pq.hasReadPref();
    Status serveReadsStatus =
        repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, slaveOK);
    uassertStatusOK(serveReadsStatus);

    // Run the query.
    // bb is used to hold the query results. This buffer should contain either the
    // documents requested by the query or explain information, but not both.
    BufBuilder bb(FindCommon::kInitReplyBufferSize);
    bb.skip(sizeof(QueryResult::Value));

    // How many results have we obtained from the executor?
    int numResults = 0;

    // If we're replaying the oplog, we save the last time that we read.
    Timestamp slaveReadTill;

    BSONObj obj;
    PlanExecutor::ExecState state;
    // uint64_t numMisplacedDocs = 0;

    // Get summary info about which plan the executor is using.
    {
        stdx::lock_guard<Client> lk(*txn->getClient());
        curop.setPlanSummary_inlock(Explain::getPlanSummary(exec.get()));
    }

    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
        // If we can't fit this result inside the current batch, then we stash it for later.
        if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) {
            exec->enqueue(obj);
            break;
        }

        // Add result to output buffer.
        bb.appendBuf((void*)obj.objdata(), obj.objsize());

        // Count the result.
        ++numResults;

        // Possibly note slave's position in the oplog.
        if (pq.isOplogReplay()) {
            BSONElement e = obj["ts"];
            if (Date == e.type() || bsonTimestamp == e.type()) {
                slaveReadTill = e.timestamp();
            }
        }

        if (FindCommon::enoughForFirstBatch(pq, numResults)) {
            LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore()
                   << " ntoreturn=" << pq.getNToReturn().value_or(0) << " numResults=" << numResults
                   << endl;
            break;
        }
    }

    // If we cache the executor later, we want to deregister it as it receives notifications
    // anyway by virtue of being cached.
    //
    // If we don't cache the executor later, we are deleting it, so it must be deregistered.
    //
    // So, no matter what, deregister the executor.
    exec->deregisterExec();

    // Caller expects exceptions thrown in certain cases.
    if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
        const unique_ptr<PlanStageStats> stats(exec->getStats());
        error() << "Plan executor error during find: " << PlanExecutor::statestr(state)
                << ", stats: " << Explain::statsToBSON(*stats);
        uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
    }

    // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the
    // chunk belonged on this node are gone. Separating chunk lifetime management from
    // ClientCursor should allow this check to go away.
    if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
        // If the version changed during the query, we might be missing some data, and it's
        // safe to send this since mongos can resend at this point.
        throw SendStaleConfigException(nss.ns(),
                                       "version changed during initial query",
                                       shardingVersionAtStart,
                                       shardingState->getVersion(nss.ns()));
    }

    // Fill out curop based on query results. If we have a cursorid, we will fill out curop with
    // this cursorid later.
    long long ccId = 0;

    if (shouldSaveCursor(txn, collection, state, exec.get())) {
        // We won't use the executor until it's getMore'd.
        exec->saveState();
        exec->detachFromOperationContext();

        // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
        // inserted into a global map by its ctor.
        ClientCursor* cc =
            new ClientCursor(collection->getCursorManager(),
                             exec.release(),
                             nss.ns(),
                             txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                             pq.getOptions(),
                             pq.getFilter());
        ccId = cc->cursorid();

        LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults
               << " results" << endl;

        // TODO document
        if (pq.isOplogReplay() && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        // TODO document
        if (pq.isExhaust()) {
            curop.debug().exhaust = true;
        }

        cc->setPos(numResults);

        // If the query had a time limit, remaining time is "rolled over" to the cursor (for
        // use by future getmore ops).
        cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

        endQueryOp(txn, collection, *cc->getExecutor(), dbProfilingLevel, numResults, ccId);
    } else {
        LOG(5) << "Not caching executor but returning " << numResults << " results.\n";
        endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, ccId);
    }

    // Add the results from the query into the output buffer.
    result.appendData(bb.buf(), bb.len());
    bb.decouple();

    // Fill out the output buffer's header.
    QueryResult::View qr = result.header().view2ptr();
    qr.setCursorId(ccId);
    qr.setResultFlagsToOk();
    qr.msgdata().setOperation(opReply);
    qr.setStartingFrom(0);
    qr.setNReturned(numResults);

    // curop.debug().exhaust is set above.
    return curop.debug().exhaust ? nss.ns() : "";
}
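
The result loop in runQuery is governed by two checks, haveSpaceForNext and enoughForFirstBatch; the sketch below (plain C++, not MongoDB code) captures the shape of those decisions, with the 16 MB reply budget and the 101-document default used purely as illustrative stand-ins for the server's real constants.

// Can one more document of nextDocSize bytes be appended to a reply that already
// holds currentLen bytes? (Illustrative byte budget.)
bool haveSpaceFor(int nextDocSize, int currentLen, int maxLen = 16 * 1024 * 1024) {
    return currentLen + nextDocSize <= maxLen;
}

// Has the first batch reached the requested size? A request of 0 means "no explicit
// ntoreturn", for which a default first-batch size is assumed here.
bool enoughForFirstBatch(int numResults, int ntoreturn, int defaultFirstBatch = 101) {
    const int target = (ntoreturn == 0) ? defaultFirstBatch : ntoreturn;
    return numResults >= target;
}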
Beispiel #28
0
StatusWith<RecordId> Collection::updateDocument(OperationContext* txn,
                                                const RecordId& oldLocation,
                                                const Snapshotted<BSONObj>& oldDoc,
                                                const BSONObj& newDoc,
                                                bool enforceQuota,
                                                bool indexesAffected,
                                                OpDebug* debug,
                                                oplogUpdateEntryArgs& args) {
    {
        auto status = checkValidation(txn, newDoc);
        if (!status.isOK()) {
            if (_validationLevel == STRICT_V) {
                return status;
            }
            // moderate means we have to check the old doc
            auto oldDocStatus = checkValidation(txn, oldDoc.value());
            if (oldDocStatus.isOK()) {
                // transitioning from good -> bad is not ok
                return status;
            }
            // bad -> bad is ok in moderate mode
        }
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldDoc.snapshotId() == txn->recoveryUnit()->getSnapshotId());

    SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    BSONElement oldId = oldDoc.value()["_id"];
    if (!oldId.eoo() && (oldId != newDoc["_id"]))
        return StatusWith<RecordId>(
            ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596);

    // At the end of this step, we will have a map of UpdateTickets, one per index, which
    // represent the index updates needed to be done, based on the changes between oldDoc and
    // newDoc.
    OwnedPointerMap<IndexDescriptor*, UpdateTicket> updateTickets;
    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexCatalogEntry* entry = ii.catalogEntry(descriptor);
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) ||
                repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);
            UpdateTicket* updateTicket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = updateTicket;
            Status ret = iam->validateUpdate(txn,
                                             oldDoc.value(),
                                             newDoc,
                                             oldLocation,
                                             options,
                                             updateTicket,
                                             entry->getFilterExpression());
            if (!ret.isOK()) {
                return StatusWith<RecordId>(ret);
            }
        }
    }

    // This can call back into Collection::recordStoreGoingToMove.  If that happens, the old
    // object is removed from all indexes.
    StatusWith<RecordId> newLocation = _recordStore->updateRecord(
        txn, oldLocation, newDoc.objdata(), newDoc.objsize(), _enforceQuota(enforceQuota), this);

    if (!newLocation.isOK()) {
        return newLocation;
    }

    // At this point, the old object may or may not still be indexed, depending on if it was
    // moved. If the object did move, we need to add the new location to all indexes.
    if (newLocation.getValue() != oldLocation) {
        if (debug) {
            if (debug->nmoved == -1)  // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        Status s = _indexCatalog.indexRecord(txn, newDoc, newLocation.getValue());
        if (!s.isOK())
            return StatusWith<RecordId>(s);
        invariant(sid == txn->recoveryUnit()->getSnapshotId());
        args.ns = ns().ns();
        getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

        return newLocation;
    }

    // Object did not move.  We update each index with each respective UpdateTicket.

    if (debug)
        debug->keyUpdates = 0;

    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            int64_t updatedKeys;
            Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
            if (!ret.isOK())
                return StatusWith<RecordId>(ret);
            if (debug)
                debug->keyUpdates += updatedKeys;
        }
    }

    invariant(sid == txn->recoveryUnit()->getSnapshotId());
    args.ns = ns().ns();
    getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

    return newLocation;
}
Beispiel #29
0
Status FTDCFileWriter::writeMetadata(const BSONObj& metadata, Date_t date) {
    BSONObj wrapped = FTDCBSONUtil::createBSONMetadataDocument(metadata, date);

    return writeArchiveFileBuffer({wrapped.objdata(), static_cast<size_t>(wrapped.objsize())});
}
Beispiel #30
0
    bool ShardedClientCursor::sendNextBatch(int ntoreturn, BufBuilder& buffer, int& docCount) {
        uassert( 10191 ,  "cursor already done" , ! _done );

        int maxSize = 1024 * 1024;
        if ( _totalSent > 0 )
            maxSize *= 3;

        docCount = 0;

        // If ntoreturn is negative, it means that we should send up to -ntoreturn results
        // back to the client, and that we should only send a *single batch*. An ntoreturn of
        // 1 is also a special case which means "return up to 1 result in a single batch" (so
        // that +1 actually has the same meaning as -1). For all other values of ntoreturn, we
        // may have to return multiple batches.
        const bool sendMoreBatches = ntoreturn == 0 || ntoreturn > 1;
        ntoreturn = abs( ntoreturn );

        bool cursorHasMore = true;
        while ( ( cursorHasMore = _cursor->more() ) ) {
            BSONObj o = _cursor->next();

            buffer.appendBuf( (void*)o.objdata() , o.objsize() );
            docCount++;
            // Ensure that the next batch will never wind up requesting more docs from the shard
            // than are remaining to satisfy the initial ntoreturn.
            if (ntoreturn != 0) {
                _cursor->setBatchSize(ntoreturn - docCount);
            }

            if ( buffer.len() > maxSize ) {
                break;
            }

            if ( docCount == ntoreturn ) {
                // soft limit aka batch size
                break;
            }

            if ( ntoreturn == 0 && _totalSent == 0 && docCount >= 100 ) {
                // first batch should be max 100 unless batch size specified
                break;
            }
        }

        // We need to request another batch if the following two conditions hold:
        //
        //  1. ntoreturn is zero or greater than 1 (see the comment above). This condition
        //  is stored in 'sendMoreBatches'.
        //
        //  2. The last call to _cursor->more() was true (i.e. we never explicitly got a false
        //  value from _cursor->more()). This condition is stored in 'cursorHasMore'. If the server
        //  hits EOF while executing a query or a getmore, it will pass a cursorId of 0 in the
        //  query response to indicate that there are no more results. In this case, _cursor->more()
        //  will be explicitly false, and we know for sure that we do not have to send more batches.
        //
        //  On the other hand, if _cursor->more() is true there may or may not be more results.
        //  Suppose that the mongod generates enough results to fill this batch. In this case it
        //  does not know whether or not there are more, because determining that would require requesting an
        //  extra result and seeing whether we get EOF. The mongod sends a valid cursorId to
        //  indicate that there may be more. We do the same here: we indicate that there may be
        //  more results to retrieve by setting 'hasMoreBatches' to true.
        bool hasMoreBatches = sendMoreBatches && cursorHasMore;

        LOG(5) << "\t hasMoreBatches: " << hasMoreBatches
               << " sendMoreBatches: " << sendMoreBatches
               << " cursorHasMore: " << cursorHasMore
               << " ntoreturn: " << ntoreturn
               << " num: " << docCount
               << " id:" << getId()
               << " totalSent: " << _totalSent << endl;

        _totalSent += docCount;
        _done = ! hasMoreBatches;

        return hasMoreBatches;
    }