Example #1
Status FTDCFileWriter::writeMetadata(const BSONObj& metadata) {
    BSONObj wrapped = FTDCBSONUtil::createBSONMetadataDocument(metadata);

    return writeArchiveFileBuffer({wrapped.objdata(), static_cast<size_t>(wrapped.objsize())});
}
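
The pattern here is worth noting: the writer hands writeArchiveFileBuffer a raw (pointer, length) span over the wrapped object's serialized bytes, so the metadata goes out without an intermediate copy. Below is a minimal standalone sketch of the same span-write idea using only the standard library; SerializedObj is a hypothetical stand-in for BSONObj's objdata()/objsize() pair, and appendToArchive only approximates what writeArchiveFileBuffer is assumed to do.

    #include <cstddef>
    #include <fstream>

    // Hypothetical stand-in for a serialized BSON object: a contiguous
    // byte range, mirroring BSONObj::objdata()/objsize().
    struct SerializedObj {
        const char* data;
        std::size_t size;
    };

    // Append the object's raw bytes to an archive stream in one write;
    // error handling is reduced to a bool for the sketch.
    bool appendToArchive(std::ofstream& out, SerializedObj obj) {
        out.write(obj.data, static_cast<std::streamsize>(obj.size));
        return static_cast<bool>(out);
    }
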
Example #2
        void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) {
            uassert( 10074 ,  "need values" , tuples.size() );

            int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128;

            BSONObjBuilder reduceArgs( sizeEstimate );
            boost::scoped_ptr<BSONArrayBuilder>  valueBuilder;

            int sizeSoFar = 0;
            unsigned n = 0;
            for ( ; n<tuples.size(); n++ ) {
                BSONObjIterator j(tuples[n]);
                BSONElement keyE = j.next();
                if ( n == 0 ) {
                    reduceArgs.append( keyE );
                    key = keyE.wrap();
                    sizeSoFar = 5 + keyE.size();
                    valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) ));
                }

                BSONElement ee = j.next();

                uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) );

                if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) {
                    assert( n > 1 ); // if not, inf. loop
                    break;
                }

                valueBuilder->append( ee );
                sizeSoFar += ee.size();
            }
            assert(valueBuilder);
            valueBuilder->done();
            BSONObj args = reduceArgs.obj();

            Scope * s = _func.scope();

            s->invokeSafe( _func.func() , args );

            if ( s->type( "return" ) == Array ) {
                uasserted( 10075 , "reduce -> multiple not supported yet");
                return;
            }

            endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() );

            if ( n == tuples.size() )
                return;

            // the input list was too large

            BSONList x;
            for ( ; n < tuples.size(); n++ ) {
                x.push_back( tuples[n] );
            }
            BSONObjBuilder temp( endSizeEstimate );
            temp.append( key.firstElement() );
            s->append( temp , "1" , "return" );
            x.push_back( temp.obj() );
            _reduce( x , key , endSizeEstimate );
        }
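
The interesting part of _reduce is its overflow path: when the accumulated values would exceed BSONObjMaxUserSize, only a prefix of the tuples is reduced, and the partial result is appended to the leftovers and re-reduced recursively. A self-contained sketch of that chunked-fold shape follows, with plain integers standing in for BSON values and a fixed item budget per call; both are assumptions for illustration, not MongoDB code.

    #include <cstddef>
    #include <numeric>
    #include <utility>
    #include <vector>

    // Reduce a list while never folding more than `budget` items per call;
    // the partial result is pushed onto the remainder and re-reduced,
    // mirroring JSReducer::_reduce's size-limited recursion. budget must
    // be >= 2 (the role of the assert(n > 1) guard above), otherwise the
    // recursion never shrinks the input.
    int chunkedReduce(std::vector<int> values, std::size_t budget) {
        if (values.size() <= budget)
            return std::accumulate(values.begin(), values.end(), 0);
        int partial = std::accumulate(values.begin(),
                                      values.begin() + budget, 0);
        std::vector<int> rest(values.begin() + budget, values.end());
        rest.push_back(partial);  // fold the partial result back in
        return chunkedReduce(std::move(rest), budget);
    }
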
Example #3
 static void insert( const BSONObj &o, bool god = false ) {
     Lock::DBWrite lk(ns());
     Client::Context ctx( ns() );
     theDataFileMgr.insert( ns(), o.objdata(), o.objsize(), god );
 }
Example #4
static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) {
    Lock::DBWrite lk("local");
    static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor

    if ( strncmp(ns, "local.", 6) == 0 ) {
        if ( strncmp(ns, "local.slaves", 12) == 0 ) {
            resetSlaveCache();
        }
        return;
    }

    mutex::scoped_lock lk2(OpTime::m);

    const OpTime ts = OpTime::now(lk2);
    Client::Context context("",0,false);

    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */

    bufbuilder.reset();
    BSONObjBuilder b(bufbuilder);
    b.appendTimestamp("ts", ts.asDate());
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 )
        b.append("o2", *o2);
    BSONObj partial = b.done(); // partial is everything except the o:... part.

    int po_sz = partial.objsize();
    int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;

    Record *r;
    if( logNS == 0 ) {
        logNS = "local.oplog.$main";
        if ( localOplogMainDetails == 0 ) {
            Client::Context ctx( logNS , dbpath, false);
            localDB = ctx.db();
            verify( localDB );
            localOplogMainDetails = nsdetails(logNS);
            verify( localOplogMainDetails );
        }
        Client::Context ctx( logNS , localDB, false );
        r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
    }
    else {
        Client::Context ctx( logNS, dbpath, false );
        verify( nsdetails( logNS ) );
        // first we allocate the space, then we fill it below.
        r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
    }

    append_O_Obj(r->data(), partial, obj);

    context.getClient()->setLastOp( ts );

    LOG( 6 ) << "logging op:" << BSONObj::make(r) << endl;
}
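
For reference, the oplog entry assembled here has the shape sketched below (values illustrative). Everything except the o field lives in partial; o is spliced in afterwards by append_O_Obj, directly into the memory-mapped destination.

    { ts          : <Timestamp>,   // OpTime::now(lk2)
      op          : "i",           // opstr
      ns          : "test.foo",
      fromMigrate : true,          // only if fromMigrate
      b           : <bool>,        // only if bb was passed
      o2          : { ... },       // only if o2 was passed
      o           : { ... } }      // appended separately by append_O_Obj
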
Example #5
    /* Prepare to build an index.  Does not actually build it (except for a special _id case).
       - We validate that the params are good.
       - We check that the index does not already exist.
       - We create the source collection if it does not exist.

       example of 'io':
         { ns : 'test.foo', name : 'z', key : { z : 1 } }

       throws DBException

       @param sourceNS - source NS we are indexing
       @param sourceCollection - its details ptr
       @return true if ok to continue.  when false we stop/fail silently (index already exists)
    */
    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) {
        sourceCollection = 0;

        // logical name of the index.  todo: get rid of the name, we don't need it!
        const char *name = io.getStringField("name");
        uassert(12523, "no index name specified", *name);

        // the collection for which we are building an index
        sourceNS = io.getStringField("ns");
        uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos);
        uassert(10097, "bad table to index name on add index attempt",
                cc().database()->name == nsToDatabase(sourceNS.c_str()));


        BSONObj key = io.getObjectField("key");
        uassert(12524, "index key pattern too large", key.objsize() <= 2048);
        if( !validKeyPattern(key) ) {
            string s = string("bad index key pattern ") + key.toString();
            uasserted(10098 , s.c_str());
        }

        if ( sourceNS.empty() || key.isEmpty() ) {
            log(2) << "bad add index attempt name:" << (name?name:"") << "\n  ns:" <<
                   sourceNS << "\n  idxobj:" << io.toString() << endl;
            string s = "bad add index attempt " + sourceNS + " key:" + key.toString();
            uasserted(12504, s);
        }

        sourceCollection = nsdetails(sourceNS.c_str());
        if( sourceCollection == 0 ) {
            // try to create it
            string err;
            if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) {
                problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl;
                return false;
            }
            sourceCollection = nsdetails(sourceNS.c_str());
            tlog() << "info: creating collection " << sourceNS << " on add index" << endl;
            assert( sourceCollection );
        }

        if ( sourceCollection->findIndexByName(name) >= 0 ) {
            // index already exists.
            return false;
        }
        if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) {
            log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl;
            return false;
        }

        if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) {
            stringstream ss;
            ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString();
            string s = ss.str();
            log() << s << '\n';
            uasserted(12505,s);
        }

        /* we can't build a new index for the ns if a build is already in progress in the background -
           EVEN IF this is a foreground build.
           */
        uassert(12588, "cannot add index with a background operation in progress",
                !BackgroundOperation::inProgForNs(sourceNS.c_str()));

        /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to
           all be treated as the same pattern.
        */
        if ( IndexDetails::isIdIndexPattern(key) ) {
            if( !god ) {
                ensureHaveIdIndex( sourceNS.c_str() );
                return false;
            }
        }
        else {
            /* is buildIndexes:false set for this replica set member?
               if so we don't build any indexes except _id
            */
            if( theReplSet && !theReplSet->buildIndexes() )
                return false;
        }

        string pluginName = IndexPlugin::findPluginName( key );
        IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0;

        if ( plugin ) {
            fixedIndexObject = plugin->adjustIndexSpec( io );
        }
        else if ( io["v"].eoo() ) {
            // add "v" if it doesn't exist
            // if it does - leave whatever value was there
            // this is for testing and replication
            BSONObjBuilder b( io.objsize() + 32 );
            b.appendElements( io );
            b.append( "v" , 0 );
            fixedIndexObject = b.obj();
        }

        return true;
    }
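
Using the example 'io' from the comment above, and assuming no index plugin matches the key pattern, the final branch would produce a fixedIndexObject like:

    { ns : 'test.foo', name : 'z', key : { z : 1 }, v : 0 }
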
Example #6
        bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            if (!cmdObj["start"].eoo()) {
                errmsg = "using deprecated 'start' argument to geoNear";
                return false;
            }

            const NamespaceString nss(parseNs(dbname, cmdObj));
            AutoGetCollectionForRead ctx(txn, nss);

            Collection* collection = ctx.getCollection();
            if ( !collection ) {
                errmsg = "can't find ns";
                return false;
            }

            IndexCatalog* indexCatalog = collection->getIndexCatalog();

            // cout << "raw cmd " << cmdObj.toString() << endl;

            // We seek to populate this.
            string nearFieldName;
            bool using2DIndex = false;
            if (!getFieldName(txn, collection, indexCatalog, &nearFieldName, &errmsg, &using2DIndex)) {
                return false;
            }

            PointWithCRS point;
            uassert(17304, "'near' field must be point",
                    GeoParser::parseQueryPoint(cmdObj["near"], &point).isOK());

            bool isSpherical = cmdObj["spherical"].trueValue();
            if (!using2DIndex) {
                uassert(17301, "2dsphere index must have spherical: true", isSpherical);
            }

            // Build the $near expression for the query.
            BSONObjBuilder nearBob;
            if (isSpherical) {
                nearBob.append("$nearSphere", cmdObj["near"].Obj());
            }
            else {
                nearBob.append("$near", cmdObj["near"].Obj());
            }

            if (!cmdObj["maxDistance"].eoo()) {
                uassert(17299, "maxDistance must be a number",cmdObj["maxDistance"].isNumber());
                nearBob.append("$maxDistance", cmdObj["maxDistance"].number());
            }

            if (!cmdObj["minDistance"].eoo()) {
                uassert(17298, "minDistance doesn't work on 2d index", !using2DIndex);
                uassert(17300, "minDistance must be a number",cmdObj["minDistance"].isNumber());
                nearBob.append("$minDistance", cmdObj["minDistance"].number());
            }

            if (!cmdObj["uniqueDocs"].eoo()) {
                warning() << nss << ": ignoring deprecated uniqueDocs option in geoNear command";
            }

            // And, build the full query expression.
            BSONObjBuilder queryBob;
            queryBob.append(nearFieldName, nearBob.obj());
            if (!cmdObj["query"].eoo() && cmdObj["query"].isABSONObj()) {
                queryBob.appendElements(cmdObj["query"].Obj());
            }
            BSONObj rewritten = queryBob.obj();

            // cout << "rewritten query: " << rewritten.toString() << endl;

            long long numWanted = 100;
            const char* limitName = !cmdObj["num"].eoo() ? "num" : "limit";
            BSONElement eNumWanted = cmdObj[limitName];
            if (!eNumWanted.eoo()) {
                uassert(17303, "limit must be number", eNumWanted.isNumber());
                numWanted = eNumWanted.safeNumberLong();
                uassert(17302, "limit must be >=0", numWanted >= 0);
            }

            bool includeLocs = false;
            if (!cmdObj["includeLocs"].eoo()) {
                includeLocs = cmdObj["includeLocs"].trueValue();
            }

            double distanceMultiplier = 1.0;
            BSONElement eDistanceMultiplier = cmdObj["distanceMultiplier"];
            if (!eDistanceMultiplier.eoo()) {
                uassert(17296, "distanceMultiplier must be a number", eDistanceMultiplier.isNumber());
                distanceMultiplier = eDistanceMultiplier.number();
                uassert(17297, "distanceMultiplier must be non-negative", distanceMultiplier >= 0);
            }

            BSONObj projObj = BSON("$pt" << BSON("$meta" << LiteParsedQuery::metaGeoNearPoint) <<
                                   "$dis" << BSON("$meta" << LiteParsedQuery::metaGeoNearDistance));

            CanonicalQuery* cq;
            const WhereCallbackReal whereCallback(txn, nss.db());

            if (!CanonicalQuery::canonicalize(nss,
                                              rewritten,
                                              BSONObj(),
                                              projObj,
                                              0,
                                              numWanted,
                                              BSONObj(),
                                              &cq,
                                              whereCallback).isOK()) {
                errmsg = "Can't parse filter / create query";
                return false;
            }

            // Prevent chunks from being cleaned up during yields - this allows us to only check the
            // version on initial entry into geoNear.
            RangePreserver preserver(collection);

            PlanExecutor* rawExec;
            if (!getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, 0).isOK()) {
                errmsg = "can't get query executor";
                return false;
            }

            scoped_ptr<PlanExecutor> exec(rawExec);

            double totalDistance = 0;
            BSONObjBuilder resultBuilder(result.subarrayStart("results"));
            double farthestDist = 0;

            BSONObj currObj;
            long long results = 0;
            while ((results < numWanted) && PlanExecutor::ADVANCED == exec->getNext(&currObj, NULL)) {

                // Come up with the correct distance.
                double dist = currObj["$dis"].number() * distanceMultiplier;
                totalDistance += dist;
                if (dist > farthestDist) { farthestDist = dist; }

                // Strip out '$dis' and '$pt' from the result obj.  The rest gets added as 'obj'
                // in the command result.
                BSONObjIterator resIt(currObj);
                BSONObjBuilder resBob;
                while (resIt.more()) {
                    BSONElement elt = resIt.next();
                    if (!mongoutils::str::equals("$pt", elt.fieldName())
                        && !mongoutils::str::equals("$dis", elt.fieldName())) {
                        resBob.append(elt);
                    }
                }
                BSONObj resObj = resBob.obj();

                // Don't make a too-big result object.
                if (resultBuilder.len() + resObj.objsize()> BSONObjMaxUserSize) {
                    warning() << "Too many geoNear results for query " << rewritten.toString()
                              << ", truncating output.";
                    break;
                }

                // Add the next result to the result builder.
                BSONObjBuilder oneResultBuilder(
                    resultBuilder.subobjStart(BSONObjBuilder::numStr(results)));
                oneResultBuilder.append("dis", dist);
                if (includeLocs) {
                    oneResultBuilder.appendAs(currObj["$pt"], "loc");
                }
                oneResultBuilder.append("obj", resObj);
                oneResultBuilder.done();
                ++results;
            }

            resultBuilder.done();

            // Fill out the stats subobj.
            BSONObjBuilder stats(result.subobjStart("stats"));

            // Fill in nscanned from the explain.
            PlanSummaryStats summary;
            Explain::getSummaryStats(exec.get(), &summary);
            stats.appendNumber("nscanned", summary.totalKeysExamined);
            stats.appendNumber("objectsLoaded", summary.totalDocsExamined);

            stats.append("avgDistance", totalDistance / results);
            stats.append("maxDistance", farthestDist);
            stats.append("time", txn->getCurOp()->elapsedMillis());
            stats.done();

            return true;
        }
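
As an illustration (collection and field names hypothetical), a command such as { geoNear: 'places', near: <pt>, spherical: true, maxDistance: 5, query: { tag: 'food' } } would be rewritten by the two builders above into the query below, which is then canonicalized together with the $pt/$dis meta projection:

    { <nearFieldName> : { $nearSphere : <pt>, $maxDistance : 5 }, tag : 'food' }
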
Example #7
    bool ShardedClientCursor::sendNextBatch( Request& r , int ntoreturn ,
            BufBuilder& buffer, int& docCount ) {
        uassert( 10191 ,  "cursor already done" , ! _done );

        int maxSize = 1024 * 1024;
        if ( _totalSent > 0 )
            maxSize *= 3;

        docCount = 0;

        // If ntoreturn is negative, it means that we should send up to -ntoreturn results
        // back to the client, and that we should only send a *single batch*. An ntoreturn of
        // 1 is also a special case which means "return up to 1 result in a single batch" (so
        // that +1 actually has the same meaning as -1). For all other values of ntoreturn, we
        // may have to return multiple batches.
        const bool sendMoreBatches = ntoreturn == 0 || ntoreturn > 1;
        ntoreturn = abs( ntoreturn );

        bool cursorHasMore = true;
        while ( ( cursorHasMore = _cursor->more() ) ) {
            BSONObj o = _cursor->next();

            buffer.appendBuf( (void*)o.objdata() , o.objsize() );
            docCount++;
            // Ensure that the next batch will never wind up requesting more docs from the shard
            // than are remaining to satisfy the initial ntoreturn.
            if (ntoreturn != 0) {
                _cursor->setBatchSize(ntoreturn - docCount);
            }

            if ( buffer.len() > maxSize ) {
                break;
            }

            if ( docCount == ntoreturn ) {
                // soft limit aka batch size
                break;
            }

            if ( ntoreturn == 0 && _totalSent == 0 && docCount >= 100 ) {
                // first batch should be max 100 unless batch size specified
                break;
            }
        }

        // We need to request another batch if the following two conditions hold:
        //
        //  1. ntoreturn is zero or greater than one (see the comment above). This condition
        //  is stored in 'sendMoreBatches'.
        //
        //  2. The last call to _cursor->more() was true (i.e. we never explicitly got a false
        //  value from _cursor->more()). This condition is stored in 'cursorHasMore'. If the server
        //  hits EOF while executing a query or a getmore, it will pass a cursorId of 0 in the
        //  query response to indicate that there are no more results. In this case, _cursor->more()
        //  will be explicitly false, and we know for sure that we do not have to send more batches.
        //
        //  On the other hand, if _cursor->more() is true there may or may not be more results.
        //  Suppose that the mongod generates enough results to fill this batch. In this case it
        //  does not know whether or not there are more, because determining that would require
        //  requesting an extra result and seeing whether we get EOF. The mongod sends a valid cursorId to
        //  indicate that there may be more. We do the same here: we indicate that there may be
        //  more results to retrieve by setting 'hasMoreBatches' to true.
        bool hasMoreBatches = sendMoreBatches && cursorHasMore;

        LOG(5) << "\t hasMoreBatches: " << hasMoreBatches
               << " sendMoreBatches: " << sendMoreBatches
               << " cursorHasMore: " << cursorHasMore
               << " ntoreturn: " << ntoreturn
               << " num: " << docCount
               << " id:" << getId()
               << " totalSent: " << _totalSent << endl;

        _totalSent += docCount;
        _done = ! hasMoreBatches;

        return hasMoreBatches;
    }
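
The loop's exit conditions can be read in isolation. A standalone restatement, with the constants copied from sendNextBatch (maxSize starts at 1 MB and is tripled once something has been sent):

    // True when sendNextBatch should stop appending documents to the batch.
    bool batchFull(int bufLen, int maxSize, int docCount, int ntoreturn,
                   long long totalSent) {
        if (bufLen > maxSize)
            return true;  // hard cap on the message size
        if (docCount == ntoreturn)
            return true;  // soft limit aka batch size
        if (ntoreturn == 0 && totalSent == 0 && docCount >= 100)
            return true;  // default first batch is at most 100 docs
        return false;
    }
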
Example #8
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        // TODO: Document.
        replVerifyReadsOk();

        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Check for spoofing of the ns such that it does not match the one originally
            // used for the cursor.
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            startingResult = cc->pos();

            Runner* runner = cc->getRunner();
            const ParsedQuery& pq = runner->getQuery().getParsed();

            // Get results out of the runner.
            // TODO: There may be special handling required for tailable cursors?
            runner->restoreState();
            BSONObj obj;
            // TODO: Differentiate EOF from error.
            while (runner->getNext(&obj)) {
                // If we're sharded make sure that we don't return any data that hasn't been
                // migrated off of our shard yet.
                if (collMetadata) {
                    KeyPattern kp(collMetadata->getKeyPattern());
                    if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
                }

                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (pq.hasOption(QueryOption_OplogReplay)) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((numResults && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            cc->incPos(numResults);
            runner->saveState();

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = pq.hasOption(QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }

        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        return qr;
    }
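
Note the buffer discipline used here and again in the query paths below: skip sizeof(QueryResult) bytes up front, stream the results into the buffer, then backfill the header fields once the totals are known. A reduced standalone sketch of that reserve-then-backfill pattern (Header is a stand-in; the real QueryResult layout is not reproduced):

    #include <cstring>
    #include <vector>

    struct Header {       // stand-in for QueryResult's fixed-size header
        int len;
        int nReturned;
    };

    // Reserve space for the header, append the payload, then patch the
    // header in place once the totals are known.
    std::vector<char> buildReply(const std::vector<char>& payload, int nDocs) {
        std::vector<char> buf(sizeof(Header));  // skipped header bytes
        buf.insert(buf.end(), payload.begin(), payload.end());
        Header h{static_cast<int>(buf.size()), nDocs};
        std::memcpy(buf.data(), &h, sizeof h);  // backfill
        return buf;
    }
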
Example #9
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        // This is a read lock.
        Client::ReadContext ctx(q.ns, dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner;
        Status status = getRunner(q, &rawRunner);
        if (!status.isOK()) {
            uasserted(17007, "Couldn't process query " + q.query.toString()
                         + " why: " + status.reason());
        }
        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns);

        // We use this a lot below.
        const ParsedQuery& pq = runner->getQuery().getParsed();

        // TODO: Document why we do this.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        BSONObj obj;
        // TODO: Differentiate EOF from error.
        while (runner->getNext(&obj)) {
            // If we're sharded make sure that we don't return any data that hasn't been migrated
            // off of our shard yet.
            if (collMetadata) {
                // This information can change if we yield and as such we must make sure to re-fetch
                // it if we yield.
                KeyPattern kp(collMetadata->getKeyPattern());
                // This performs excessive BSONObj creation but that's OK for now.
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            const bool isExplain = pq.isExplain();
            if (isExplain && pq.enoughForExplain(numResults)) {
                break;
            }
            else if (!supportsGetMore && (pq.enough(numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (pq.enoughForFirstBatch(numResults, bb.len())) {
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    saveClientCursor = true;
                }
                break;
            }
        }

        // TODO: Stage creation can set tailable depending on what's in the parsed query.  We have
        // the full parsed query available during planning...set it there.
        //
        // TODO: If we're tailable we want to save the client cursor.  Make sure we do this later.
        //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // If the version changed during the query we might be missing some data, and it's safe
            // to send this as mongos can resend at this point.
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // Allocate a new ClientCursor.
            ClientCursorHolder ccHolder;
            ccHolder.reset(new ClientCursor(runner.get()));
            ccId = ccHolder->cursorid();

            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                ccHolder->slaveReadTill(slaveReadTill);
            }

            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            ccHolder->setCollMetadata(collMetadata);
            ccHolder->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            ccHolder->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

            // Give up our reference to the CC.
            ccHolder.release();
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;
        // TODO: nscanned is bogus.
        // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
Example #10
    int run() {

        if ( ! hasParam( "from" ) ) {
            log() << "need to specify --from" << endl;
            return -1;
        }

        Client::initThread( "oplogreplay" );

        log() << "going to connect" << endl;
        
        OplogReader r(false);
        r.setTailingQueryOptions( QueryOption_SlaveOk | QueryOption_AwaitData );
        r.connect( getParam( "from" ) );

        log() << "connected" << endl;

        OpTime start( time(0) - getParam( "seconds" , 86400 ) , 0 );
        log() << "starting from " << start.toStringPretty() << endl;

        string ns = getParam( "oplogns" );
        r.tailingQueryGTE( ns.c_str() , start );

        bool legacyApplyOps = (versionCmp(mongodVersion(), "2.2.0") < 0);

        int num = 0;
        while ( r.more() ) {
            BSONObj o = r.next();
            LOG(2) << o << endl;
            
            if ( o["$err"].type() ) {
                log() << "error getting oplog" << endl;
                log() << o << endl;
                return -1;
            }
                

            bool print = ++num % 100000 == 0;
            if ( print )
                cout << num << "\t" << o << endl;
            
            if ( o["op"].String() == "n" )
                continue;

            string dbname = legacyApplyOps? nsToDatabase(o["ns"].String()) : "admin";

            BSONObjBuilder b( o.objsize() + 32 );
            BSONArrayBuilder updates( b.subarrayStart( "applyOps" ) );
            updates.append( o );
            updates.done();

            BSONObj c = b.obj();
            
            BSONObj res;
            bool ok = conn().runCommand( dbname , c , res );
            if ( print || ! ok )
                log() << res << endl;
        }

        return 0;
    }
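
Each oplog entry is replayed by wrapping it in an applyOps command, so the command object c built above has the shape below (entry illustrative); for servers at 2.2.0 or newer it is run against the admin database, otherwise against the entry's own database.

    { applyOps : [ { ts : <Timestamp>, op : "i", ns : "test.foo", o : { ... } } ] }
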
Example #11
        void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) {
            const int flags = d.reservedField();
            bool keepGoing = flags & InsertOption_KeepGoing; // modified before assertion if should abort

            while ( d.moreJSObjs() ) {
                try {
                    BSONObj o = d.nextJsObj();
                    if ( ! manager->hasShardKey( o ) ) {

                        bool bad = true;

                        if ( manager->getShardKey().partOfShardKey( "_id" ) ) {
                            BSONObjBuilder b;
                            b.appendOID( "_id" , 0 , true );
                            b.appendElements( o );
                            o = b.obj();
                            bad = ! manager->hasShardKey( o );
                        }

                        if ( bad ) {
                            log() << "tried to insert object without shard key: " << r.getns() << "  " << o << endl;
                            uasserted( 8011 , "tried to insert object without shard key" );
                        }

                    }

                    // Many operations benefit from having the shard key early in the object
                    o = manager->getShardKey().moveToFront(o);

                    const int maxTries = 30;

                    bool gotThrough = false;
                    for ( int i=0; i<maxTries; i++ ) {
                        try {
                            ChunkPtr c = manager->findChunk( o );
                            log(4) << "  server:" << c->getShard().toString() << " " << o << endl;
                            insert( c->getShard() , r.getns() , o , flags);

                            r.gotInsert();
                            if ( r.getClientInfo()->autoSplitOk() )
                                c->splitIfShould( o.objsize() );
                            gotThrough = true;
                            break;
                        }
                        catch ( StaleConfigException& e ) {
                            int logLevel = i < ( maxTries / 2 ); // verbose-only LOG(1) for the first half of the retries, LOG(0) after that
                            LOG( logLevel ) << "retrying insert because of StaleConfigException: " << e << " object: " << o << endl;
                            r.reset();
                            
                            unsigned long long old = manager->getSequenceNumber();
                            manager = r.getChunkManager();
                            
                            LOG( logLevel ) << "  sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl;

                            if (!manager) {
                                keepGoing = false;
                                uasserted(14804, "collection no longer sharded");
                            }
                        }
                        sleepmillis( i * 20 );
                    }
                    
                    assert( inShutdown() || gotThrough ); // not caught below
                } catch (const UserException&){
                    if (!keepGoing || !d.moreJSObjs()){
                        throw;
                    }
                    // otherwise ignore and keep going
                }
            }
        }
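
The StaleConfigException handling above is a bounded retry with a linearly growing pause (sleepmillis(i * 20)). The same shape, lifted out as a self-contained sketch; the callable and its boolean success protocol are assumptions for illustration.

    #include <chrono>
    #include <thread>

    // Retry `op` up to maxTries times, pausing i * 20 ms between attempts
    // so stale routing info has time to refresh; `op` returns true on
    // success and false to request another attempt.
    template <typename Op>
    bool retryWithLinearBackoff(Op op, int maxTries = 30) {
        for (int i = 0; i < maxTries; ++i) {
            if (op())
                return true;
            std::this_thread::sleep_for(std::chrono::milliseconds(i * 20));
        }
        return false;
    }
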
Example #12
    void Collection::_compactExtent(const DiskLoc diskloc, int extentNumber,
                                    vector<IndexAccessMethod*>& indexesToInsertTo,
                                    const CompactOptions* compactOptions, CompactStats* stats ) {

        log() << "compact begin extent #" << extentNumber
              << " for namespace " << _ns << " " << diskloc;

        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = diskloc.ext();
        e->assertOk();
        verify( e->validates(diskloc) );

        {
            // the next/prev pointers within the extent might not be in order so we first
            // page the whole thing in sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            size_t length = e->length;

            touch_pages( reinterpret_cast<const char*>(e), length );
            int ms = t.millis();
            if( ms > 1000 )
                log() << "compact end paging in " << ms << "ms "
                      << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            long long datasize = 0;
            long long nrecords = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = getExtentManager()->getNextRecordInExtent(L);
                    BSONObj objOld = BSONObj::make(recOld);

                    if ( compactOptions->validateDocuments && !objOld.valid() ) {
                        // object is corrupt!
                        log() << "compact skipping corrupt document!";
                        stats->corruptDocuments++;
                    }
                    else {
                        unsigned docSize = objOld.objsize();

                        nrecords++;
                        oldObjSize += docSize;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = docSize + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;

                        switch( compactOptions->paddingMode ) {
                        case CompactOptions::NONE:
                            if ( details()->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) )
                                lenWPadding = details()->quantizePowerOf2AllocationSpace(lenWPadding);
                            break;
                        case CompactOptions::PRESERVE:
                            // if we are preserving the padding, the record should not change size
                            lenWPadding = recOld->lengthWithHeaders();
                            break;
                        case CompactOptions::MANUAL:
                            lenWPadding = compactOptions->computeRecordSize(lenWPadding);
                            if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                                lenWPadding = lenWHdr;
                            }
                            break;
                        }

                        CompactDocWriter writer( objOld, lenWPadding );
                        StatusWith<DiskLoc> status = _recordStore->insertRecord( &writer, 0 );
                        uassertStatusOK( status.getStatus() );
                        datasize += _recordStore->recordFor( status.getValue() )->netLength();

                        InsertDeleteOptions options;
                        options.logIfError = false;
                        options.dupsAllowed = true; // in compact we should be doing no checking

                        for ( size_t i = 0; i < indexesToInsertTo.size(); i++ ) {
                            Status idxStatus = indexesToInsertTo[i]->insert( objOld,
                                                                             status.getValue(),
                                                                             options,
                                                                             NULL );
                            uassertStatusOK( idxStatus );
                        }
                    }

                    if( L.isNull() ) {
                        // we just did the very last record from the old extent.  it's still pointed to
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt(false);
                    }
                }
            } // if !L.isNull()

            verify( details()->firstExtent() == diskloc );
            verify( details()->lastExtent() != diskloc );
            DiskLoc newFirst = e->xnext;
            details()->firstExtent().writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            getExtentManager()->freeExtents( diskloc, diskloc );

            getDur().commitIfNeeded();

            {
                double op = 1.0;
                if( oldObjSize )
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                      << " documents (" << datasize/1000000.0 << "MB)"
                      << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100;
            }
        }

    }
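
In the CompactOptions::NONE branch, record lengths are quantized to power-of-2 allocation sizes. The body of quantizePowerOf2AllocationSpace is not shown here; a plausible minimal reading (an assumption, the real function may bucket differently) is rounding up to the next power of two:

    #include <cstdint>

    // Hedged sketch only: round a requested record length up to the next
    // power of two so freed records land in reusable size buckets.
    std::uint32_t roundUpToPowerOfTwo(std::uint32_t n) {
        std::uint32_t p = 1;
        while (p < n)
            p <<= 1;
        return p;
    }
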
Example #13
    /**
     * Run a query -- includes checking for and running a Command.
     * @return points to ns if exhaust mode. 0=normal mode
     * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
     * @yields the db mutex periodically after acquiring it.
     * @asserts on scan and order memory exhaustion and other cases.
     */
    const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
        ParsedQuery& pq( *pq_shared );
        BSONObj jsobj = q.query;
        int queryOptions = q.queryOptions;
        const char *ns = q.ns;

        if( logLevel >= 2 )
            log() << "runQuery called " << ns << " " << jsobj << endl;

        curop.debug().ns = ns;
        curop.debug().ntoreturn = pq.getNumToReturn();
        curop.debug().query = jsobj;
        curop.setQuery(jsobj);

        // Run a command.
        
        if ( pq.couldBeCommand() ) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult));
            BSONObjBuilder cmdResBuf;
            if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
                curop.debug().iscommand = true;
                curop.debug().query = jsobj;
                curop.markCommand();

                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = 1;
                result.setData( qr.release(), true );
            }
            else {
                uasserted(13530, "bad or malformed command request?");
            }
            return 0;
        }

        bool explain = pq.isExplain();
        BSONObj order = pq.getOrder();
        BSONObj query = pq.getFilter();

        /* The ElemIter will not be happy if this isn't really an object, so throw an
           exception here when the query object is empty (which may indicate bad data
           from the client).
        */
        if ( query.objsize() == 0 ) {
            out() << "Bad query object?\n  jsobj:";
            out() << jsobj.toString() << "\n  query:";
            out() << query.toString() << endl;
            uassert( 10110 , "bad query object", false);
        }

        Client::ReadContext ctx( ns , dbpath ); // read locks
        const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );

        replVerifyReadsOk(&pq);

        if ( pq.hasOption( QueryOption_CursorTailable ) ) {
            NamespaceDetails *d = nsdetails( ns );
            uassert( 13051, "tailable cursor requested on non capped collection", d && d->isCapped() );
            const BSONObj nat1 = BSON( "$natural" << 1 );
            if ( order.isEmpty() ) {
                order = nat1;
            }
            else {
                uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
            }
        }

        // Run a simple id query.
        
        if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {

            int n = 0;
            bool nsFound = false;
            bool indexFound = false;

            BSONObj resObject;
            Client& c = cc();
            bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
            if ( nsFound == false || indexFound == true ) {
                
                if ( shardingState.needShardChunkManager( ns ) ) {
                    ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns );
                    if ( m && ! m->belongsToMe( resObject ) ) {
                        // I have something with this _id
                        // but it doesn't belong to me
                        // so return nothing
                        resObject = BSONObj();
                        found = false;
                    }
                }

                BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
                bb.skip(sizeof(QueryResult));
                
                curop.debug().idhack = true;
                if ( found ) {
                    n = 1;
                    fillQueryResultFromObj( bb , pq.getFields() , resObject );
                }
                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = n;
                result.setData( qr.release(), true );
                return NULL;
            }
        }
        
        // Run a regular query.
        
        BSONObj oldPlan;
        if ( explain && ! pq.hasIndexSpecifier() ) {
            MultiPlanScanner mps( ns, query, order );
            if ( mps.usingCachedPlan() ) {
                oldPlan = mps.oldExplain().firstElement().embeddedObject()
                              .firstElement().embeddedObject().getOwned();
            }
        }

        // In some cases the query may be retried if there is an in memory sort size assertion.
        for( int retry = 0; retry < 2; ++retry ) {
            try {
                return queryWithQueryOptimizer( m, queryOptions, ns, jsobj, curop, query, order,
                                               pq_shared, oldPlan, shardingVersionAtStart, result );
            } catch ( const QueryRetryException & ) {
                verify( retry == 0 );
            }
        }
        verify( false );
        return 0;
    }
Example #14
    Status cloneCollectionAsCapped( OperationContext* txn,
                                    Database* db,
                                    const string& shortFrom,
                                    const string& shortTo,
                                    double size,
                                    bool temp,
                                    bool logForReplication ) {

        string fromNs = db->name() + "." + shortFrom;
        string toNs = db->name() + "." + shortTo;

        Collection* fromCollection = db->getCollection( txn, fromNs );
        if ( !fromCollection )
            return Status( ErrorCodes::NamespaceNotFound,
                           str::stream() << "source collection " << fromNs <<  " does not exist" );

        if ( db->getCollection( txn, toNs ) )
            return Status( ErrorCodes::NamespaceExists, "to collection already exists" );

        // create new collection
        {
            Client::Context ctx( toNs );
            BSONObjBuilder spec;
            spec.appendBool( "capped", true );
            spec.append( "size", size );
            if ( temp )
                spec.appendBool( "temp", true );

            Status status = userCreateNS( txn, ctx.db(), toNs, spec.done(), logForReplication );
            if ( !status.isOK() )
                return status;
        }

        Collection* toCollection = db->getCollection( txn, toNs );
        invariant( toCollection ); // we created above

        // how much data to ignore because it won't fit anyway
        // datasize and extentSize can't be compared exactly, so add some padding to 'size'
        long long excessSize =
            static_cast<long long>( fromCollection->dataSize() -
                                    ( toCollection->getRecordStore()->storageSize() * 2 ) );

        scoped_ptr<Runner> runner( InternalPlanner::collectionScan(fromNs,
                                                                   fromCollection,
                                                                   InternalPlanner::FORWARD ) );


        while ( true ) {
            BSONObj obj;
            Runner::RunnerState state = runner->getNext(&obj, NULL);

            switch( state ) {
            case Runner::RUNNER_EOF:
                return Status::OK();
            case Runner::RUNNER_DEAD:
                db->dropCollection( txn, toNs );
                return Status( ErrorCodes::InternalError, "runner turned dead while iterating" );
            case Runner::RUNNER_ERROR:
                return Status( ErrorCodes::InternalError, "runner error while iterating" );
            case Runner::RUNNER_ADVANCED:
                if ( excessSize > 0 ) {
                    excessSize -= ( 4 * obj.objsize() ); // 4x is for padding, power of 2, etc...
                    continue;
                }

                toCollection->insertDocument( txn, obj, true );
                if ( logForReplication )
                    replset::logOp(txn, "i", toNs.c_str(), obj);
                txn->recoveryUnit()->commitIfNeeded();
            }
        }

        invariant( false ); // unreachable
    }
Example #15
    std::string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        // Validate the namespace.
        const char *ns = q.ns;
        uassert(16332, "can't have an empty ns", ns[0]);

        const NamespaceString nsString(ns);
        uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid());

        // Set curop information.
        curop.debug().ns = ns;
        curop.debug().ntoreturn = q.ntoreturn;
        curop.debug().query = q.query;
        curop.setQuery(q.query);

        // If the query is really a command, run it.
        if (nsString.isCommand()) {
            int nToReturn = q.ntoreturn;
            uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn
                                         << ") for $cmd type ns - can only be 1 or -1",
                    nToReturn == 1 || nToReturn == -1);

            curop.markCommand();

            BufBuilder bb;
            bb.skip(sizeof(QueryResult));

            BSONObjBuilder cmdResBuf;
            if (!runCommands(ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) {
                uasserted(13530, "bad or malformed command request?");
            }

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
            bb.decouple();
            qr->setResultFlagsToOk();
            qr->len = bb.len();
            curop.debug().responseLength = bb.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            result.setData(qr, true);
            return "";
        }

        // This is a read lock.  We require this because if we're parsing a $where, the
        // where-specific parsing code assumes we have a lock and creates execution machinery that
        // requires it.
        Client::ReadContext ctx(q.ns);

        // Parse the qm into a CanonicalQuery.
        CanonicalQuery* cq;
        Status canonStatus = CanonicalQuery::canonicalize(q, &cq);
        if (!canonStatus.isOK()) {
            uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString());
        }
        verify(cq);

        QLOG() << "Running query on new system: " << cq->toString();

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // We'll now try to get the query runner that will execute this query for us. There
        // are a few cases in which we know upfront which runner we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we get a special runner,
        // the EOFRunner, which doesn't return any results.
        //
        // (b) If the query is replication's initial-sync query, we get a SingleSolutionRunner
        // that uses a specifically designed stage that skips extents faster (see details in
        // exec/oplogstart.h).
        //
        // Otherwise we go through the selection of which runner is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) {
            rawRunner = new EOFRunner(cq, cq->ns());
        }
        else if (pq.hasOption(QueryOption_OplogReplay)) {
            status = getOplogStartHack(cq, &rawRunner);
        }
        else {
            // Takes ownership of cq.
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            status = getRunner(cq, &rawRunner, options);
        }

        if (!status.isOK()) {
            // NOTE: Do not access cq as getRunner has deleted it.
            uasserted(17007, "Unable to execute query: " + status.reason());
        }

        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        auto_ptr<ScopedRunnerRegistration> safety(new ScopedRunnerRegistration(runner.get()));
        runner->setYieldPolicy(Runner::YIELD_AUTO);

        BSONObj obj;
        Runner::RunnerState state;
        // uint64_t numMisplacedDocs = 0;

        // Set this outside the loop; we need it both within the loop and when deciding
        // whether to fill in explain information.
        const bool isExplain = pq.isExplain();

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer. This is unnecessary if explain info is requested
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " runner EOF=" << runner->isEOF() << endl;
                    saveClientCursor = !runner->isEOF();
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases:
        // * in-memory sort using too much RAM.
        if (Runner::RUNNER_ERROR == state) {
            uasserted(17144, "Runner error, memory limit for sort probably exceeded");
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable)) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor is that the
            // client will keep trying to read from it. So we'll keep it around.
            Collection* collection = ctx.ctx().db()->getCollection(cq->ns());
            if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // If the version changed during the query we might be missing some data, and it's
            // safe to send this as mongos can resend at this point.
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        // Append explain information to query results by asking the runner to produce them.
        if (isExplain) {
            TypeExplain* bareExplain;
            Status res = runner->getExplainPlan(&bareExplain);

            if (!res.isOK()) {
                error() << "could not produce explain of query '" << pq.getFilter()
                        << "', error: " << res.reason();
                // If numResults and the data in bb don't correspond, we'll crash later when rooting
                // through the reply msg.
                BSONObj emptyObj;
                bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
                // The explain output is actually a result.
                numResults = 1;
                // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
            }
            else {
                boost::scoped_ptr<TypeExplain> explain(bareExplain);

                // Fill in the missing run-time fields in explain, starting with properties of
                // the process running the query.
                std::string server = mongoutils::str::stream()
                    << getHostNameCached() << ":" << serverGlobalParams.port;
                explain->setServer(server);

                // We might have skipped some results due to chunk migration etc., so report
                // the number of results actually returned.
                explain->setN(numResults);

                // Clock the whole operation.
                explain->setMillis(curop.elapsedMillis());

                BSONObj explainObj = explain->toBSON();
                bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

                // The explain output is actually a result.
                numResults = 1;
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            QLOG() << "caching runner with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "not caching runner but returning " << numResults << " results\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
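Both reply paths in this function reserve space for the QueryResult header with bb.skip(), append the result payload, and only then fill the header fields in place. A minimal standalone sketch of that reserve-then-patch pattern, using only the standard library (ReplyHeader is a simplified stand-in, not the real wire layout):

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Simplified stand-in for the wire header; the real QueryResult has more fields.
struct ReplyHeader {
    int32_t len;          // total message length, patched after building
    int64_t cursorId;
    int32_t startingFrom;
    int32_t nReturned;
};

int main() {
    std::vector<char> buf(sizeof(ReplyHeader));   // the "bb.skip(...)" step

    // Append each result payload after the reserved header space.
    int nReturned = 0;
    for (const std::string& doc : {std::string("doc1"), std::string("doc2")}) {
        buf.insert(buf.end(), doc.begin(), doc.end());
        ++nReturned;
    }

    // Patch the header in place, as runQuery does once all results are appended.
    ReplyHeader hdr{};
    hdr.len = static_cast<int32_t>(buf.size());
    hdr.cursorId = 0;
    hdr.startingFrom = 0;
    hdr.nReturned = nReturned;
    std::memcpy(buf.data(), &hdr, sizeof(hdr));
    return 0;
}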
Exemple #16
0
std::string runQuery(OperationContext* opCtx,
                     QueryMessage& q,
                     const NamespaceString& nss,
                     Message& result) {
    CurOp& curOp = *CurOp::get(opCtx);
    curOp.ensureStarted();

    uassert(ErrorCodes::InvalidNamespace,
            str::stream() << "Invalid ns [" << nss.ns() << "]",
            nss.isValid());
    invariant(!nss.isCommand());

    // Set CurOp information.
    const auto upconvertedQuery = upconvertQueryEntry(q.query, nss, q.ntoreturn, q.ntoskip);
    beginQueryOp(opCtx, nss, upconvertedQuery, q.ntoreturn, q.ntoskip);

    // Parse the qm into a CanonicalQuery.
    const boost::intrusive_ptr<ExpressionContext> expCtx;
    auto statusWithCQ =
        CanonicalQuery::canonicalize(opCtx,
                                     q,
                                     expCtx,
                                     ExtensionsCallbackReal(opCtx, &nss),
                                     MatchExpressionParser::kAllowAllSpecialFeatures);
    if (!statusWithCQ.isOK()) {
        uasserted(17287,
                  str::stream() << "Can't canonicalize query: "
                                << statusWithCQ.getStatus().toString());
    }
    unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
    invariant(cq.get());

    LOG(5) << "Running query:\n" << redact(cq->toString());
    LOG(2) << "Running query: " << redact(cq->toStringShort());

    // Parse, canonicalize, plan, transcribe, and get a plan executor.
    AutoGetCollectionForReadCommand ctx(opCtx, nss, AutoGetCollection::ViewMode::kViewsForbidden);
    Collection* const collection = ctx.getCollection();

    {
        const QueryRequest& qr = cq->getQueryRequest();

        // Allow the query to run on secondaries if the read preference permits it. If no read
        // preference was specified, allow the query to run iff slaveOk has been set.
        const bool slaveOK = qr.hasReadPref()
            ? uassertStatusOK(ReadPreferenceSetting::fromContainingBSON(q.query))
                  .canRunOnSecondary()
            : qr.isSlaveOk();
        uassertStatusOK(
            repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, slaveOK));
    }

    // We have a parsed query. Time to get the execution plan for it.
    auto exec = uassertStatusOK(getExecutorLegacyFind(opCtx, collection, nss, std::move(cq)));

    const QueryRequest& qr = exec->getCanonicalQuery()->getQueryRequest();

    // If it's actually an explain, do the explain and return rather than falling through
    // to the normal query execution loop.
    if (qr.isExplain()) {
        BufBuilder bb;
        bb.skip(sizeof(QueryResult::Value));

        BSONObjBuilder explainBob;
        Explain::explainStages(
            exec.get(), collection, ExplainOptions::Verbosity::kExecAllPlans, &explainBob);

        // Add the resulting object to the return buffer.
        BSONObj explainObj = explainBob.obj();
        bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

        // Set query result fields.
        QueryResult::View qr = bb.buf();
        qr.setResultFlagsToOk();
        qr.msgdata().setLen(bb.len());
        curOp.debug().responseLength = bb.len();
        qr.msgdata().setOperation(opReply);
        qr.setCursorId(0);
        qr.setStartingFrom(0);
        qr.setNReturned(1);
        result.setData(bb.release());
        return "";
    }

    // Handle query option $maxTimeMS (not used with commands).
    if (qr.getMaxTimeMS() > 0) {
        uassert(40116,
                "Illegal attempt to set operation deadline within DBDirectClient",
                !opCtx->getClient()->isInDirectClient());
        opCtx->setDeadlineAfterNowBy(Milliseconds{qr.getMaxTimeMS()});
    }
    opCtx->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

    // Run the query.
    // bb is used to hold query results
    // this buffer should contain either requested documents per query or
    // explain information, but not both
    BufBuilder bb(FindCommon::kInitReplyBufferSize);
    bb.skip(sizeof(QueryResult::Value));

    // How many results have we obtained from the executor?
    int numResults = 0;

    BSONObj obj;
    PlanExecutor::ExecState state;

    // Get summary info about which plan the executor is using.
    {
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        curOp.setPlanSummary_inlock(Explain::getPlanSummary(exec.get()));
    }

    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
        // If we can't fit this result inside the current batch, then we stash it for later.
        if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) {
            exec->enqueue(obj);
            break;
        }

        // Add result to output buffer.
        bb.appendBuf((void*)obj.objdata(), obj.objsize());

        // Count the result.
        ++numResults;

        if (FindCommon::enoughForFirstBatch(qr, numResults)) {
            LOG(5) << "Enough for first batch, wantMore=" << qr.wantMore()
                   << " ntoreturn=" << qr.getNToReturn().value_or(0)
                   << " numResults=" << numResults;
            break;
        }
    }

    // Caller expects exceptions thrown in certain cases.
    if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
        error() << "Plan executor error during find: " << PlanExecutor::statestr(state)
                << ", stats: " << redact(Explain::getWinningPlanStats(exec.get()));
        uassertStatusOKWithContext(WorkingSetCommon::getMemberObjectStatus(obj),
                                   "Executor error during OP_QUERY find");
        MONGO_UNREACHABLE;
    }

    // Before saving the cursor, ensure that whatever plan we established happened with the expected
    // collection version
    auto css = CollectionShardingState::get(opCtx, nss);
    css->checkShardVersionOrThrow(opCtx);

    // Fill out CurOp based on query results. If we have a cursorid, we will fill out CurOp with
    // this cursorid later.
    long long ccId = 0;

    if (shouldSaveCursor(opCtx, collection, state, exec.get())) {
        // We won't use the executor until it's getMore'd.
        exec->saveState();
        exec->detachFromOperationContext();

        // Allocate a new ClientCursor and register it with the cursor manager.
        ClientCursorPin pinnedCursor = collection->getCursorManager()->registerCursor(
            opCtx,
            {std::move(exec),
             nss,
             AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(),
             opCtx->recoveryUnit()->getReadConcernLevel(),
             upconvertedQuery});
        ccId = pinnedCursor.getCursor()->cursorid();

        LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults
               << " results";

        // TODO document
        if (qr.isExhaust()) {
            curOp.debug().exhaust = true;
        }

        pinnedCursor.getCursor()->setPos(numResults);

        // We assume that cursors created through a DBDirectClient are always used from their
        // original OperationContext, so we do not need to move time to and from the cursor.
        if (!opCtx->getClient()->isInDirectClient()) {
            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            pinnedCursor.getCursor()->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros());
        }

        endQueryOp(opCtx, collection, *pinnedCursor.getCursor()->getExecutor(), numResults, ccId);
    } else {
        LOG(5) << "Not caching executor but returning " << numResults << " results.";
        endQueryOp(opCtx, collection, *exec, numResults, ccId);
    }

    // Fill out the output buffer's header.
    QueryResult::View queryResultView = bb.buf();
    queryResultView.setCursorId(ccId);
    queryResultView.setResultFlagsToOk();
    queryResultView.msgdata().setLen(bb.len());
    queryResultView.msgdata().setOperation(opReply);
    queryResultView.setStartingFrom(0);
    queryResultView.setNReturned(numResults);

    // Add the results from the query into the output buffer.
    result.setData(bb.release());

    // curOp.debug().exhaust is set above.
    return curOp.debug().exhaust ? nss.ns() : "";
}
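The batching loop above ends the first batch either because the next document would not fit (FindCommon::haveSpaceForNext fails and the document is stashed via exec->enqueue() for the next getMore) or because FindCommon::enoughForFirstBatch says the batch is full. A standalone sketch of that cutoff logic, with made-up limits standing in for FindCommon's constants:

#include <cstddef>
#include <optional>
#include <string>
#include <vector>

// Hypothetical limits; the real values live in FindCommon.
constexpr std::size_t kMaxBatchBytes = 16 * 1024;
constexpr int kMaxBatchDocs = 101;

// Consumes documents from 'source' into the first batch. A document that would
// overflow the byte budget is stashed so the next batch returns it first.
std::vector<std::string> firstBatch(std::vector<std::string>& source,
                                    std::optional<std::string>& stashed) {
    std::vector<std::string> batch;
    std::size_t bytes = 0;
    for (std::string& doc : source) {
        // Always admit at least one document, mirroring haveSpaceForNext().
        if (!batch.empty() && bytes + doc.size() > kMaxBatchBytes) {
            stashed = std::move(doc);   // analogous to exec->enqueue(obj)
            break;
        }
        bytes += doc.size();
        batch.push_back(std::move(doc));
        if (static_cast<int>(batch.size()) >= kMaxBatchDocs)
            break;                      // "enough for first batch"
    }
    return batch;
}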
Exemple #17
0
Status IndexBuildInterceptor::drainWritesIntoIndex(OperationContext* opCtx,
                                                   const InsertDeleteOptions& options,
                                                   RecoveryUnit::ReadSource readSource) {
    invariant(!opCtx->lockState()->inAWriteUnitOfWork());

    // Callers may request to read at a specific timestamp so that no drained writes are timestamped
    // earlier than their original write timestamp. Also ensure that leaving this function resets
    // the ReadSource to its original value.
    auto resetReadSourceGuard =
        makeGuard([ opCtx, prevReadSource = opCtx->recoveryUnit()->getTimestampReadSource() ] {
            opCtx->recoveryUnit()->abandonSnapshot();
            opCtx->recoveryUnit()->setTimestampReadSource(prevReadSource);
        });

    if (readSource != RecoveryUnit::ReadSource::kUnset) {
        opCtx->recoveryUnit()->abandonSnapshot();
        opCtx->recoveryUnit()->setTimestampReadSource(readSource);
    } else {
        resetReadSourceGuard.dismiss();
    }

    // These are used for logging only.
    int64_t totalDeleted = 0;
    int64_t totalInserted = 0;
    Timer timer;

    const int64_t appliedAtStart = _numApplied;

    // Set up the progress meter. This will never be completely accurate, because more writes can be
    // read from the side writes table than are observed before draining.
    static const char* curopMessage = "Index Build: draining writes received during build";
    ProgressMeterHolder progress;
    {
        stdx::unique_lock<Client> lk(*opCtx->getClient());
        progress.set(CurOp::get(opCtx)->setProgress_inlock(curopMessage));
    }

    // Force the progress meter to log at the end of every batch. By default, the progress meter
    // only logs after a large number of calls to hit(), but since we batch inserts by up to
    // 1000 records, progress would rarely be displayed.
    progress->reset(_sideWritesCounter.load() - appliedAtStart /* total */,
                    3 /* secondsBetween */,
                    1 /* checkInterval */);

    // Buffer operations into batches to insert per WriteUnitOfWork. Impose an upper limit on the
    // number of documents and the total size of the batch.
    const int32_t kBatchMaxSize = 1000;
    const int64_t kBatchMaxBytes = BSONObjMaxInternalSize;

    int64_t batchSizeBytes = 0;

    std::vector<SideWriteRecord> batch;
    batch.reserve(kBatchMaxSize);

    // Hold on to documents that would exceed the per-batch memory limit. Always insert this first
    // into the next batch.
    boost::optional<SideWriteRecord> stashed;

    auto cursor = _sideWritesTable->rs()->getCursor(opCtx);

    bool atEof = false;
    while (!atEof) {
        opCtx->checkForInterrupt();

        // Stashed records should be inserted into a batch first.
        if (stashed) {
            invariant(batch.empty());
            batch.push_back(std::move(stashed.get()));
            stashed.reset();
        }

        auto record = cursor->next();

        if (record) {
            RecordId currentRecordId = record->id;
            BSONObj docOut = record->data.toBson().getOwned();

            // If the total batch size in bytes would be too large, stash this document and let the
            // current batch insert.
            int objSize = docOut.objsize();
            if (batchSizeBytes + objSize > kBatchMaxBytes) {
                invariant(!stashed);

                // Stash this document to be inserted in the next batch.
                stashed.emplace(currentRecordId, std::move(docOut));
            } else {
                batchSizeBytes += objSize;
                batch.emplace_back(currentRecordId, std::move(docOut));

                // Continue if there is more room in the batch.
                if (batch.size() < kBatchMaxSize) {
                    continue;
                }
            }
        } else {
            atEof = true;
            if (batch.empty())
                break;
        }

        invariant(!batch.empty());

        cursor->save();

        // If we are here, either we have reached the end of the table or the batch is full, so
        // insert everything in one WriteUnitOfWork, and delete each inserted document from the side
        // writes table.
        auto status = writeConflictRetry(opCtx, "index build drain", _indexCatalogEntry->ns(), [&] {
            WriteUnitOfWork wuow(opCtx);
            for (auto& operation : batch) {
                auto status =
                    _applyWrite(opCtx, operation.second, options, &totalInserted, &totalDeleted);
                if (!status.isOK()) {
                    return status;
                }

                // Delete the document from the table as soon as it has been inserted into the
                // index. This ensures that no key is ever inserted twice and no keys are skipped.
                _sideWritesTable->rs()->deleteRecord(opCtx, operation.first);
            }

            // For rollback to work correctly, these writes need to be timestamped. The actual
            // time is not important, as long as it is not older than the most recent visible
            // side write.
            IndexTimestampHelper::setGhostCommitTimestampForWrite(
                opCtx, NamespaceString(_indexCatalogEntry->ns()));

            wuow.commit();
            return Status::OK();
        });
        if (!status.isOK()) {
            return status;
        }

        progress->hit(batch.size());

        // Lock yielding will only happen if we are holding intent locks.
        _tryYield(opCtx);
        cursor->restore();

        // Account for more writes coming in during a batch.
        progress->setTotalWhileRunning(_sideWritesCounter.loadRelaxed() - appliedAtStart);

        _numApplied += batch.size();
        batch.clear();
        batchSizeBytes = 0;
    }

    progress->finished();

    int logLevel = (_numApplied - appliedAtStart > 0) ? 0 : 1;
    LOG(logLevel) << "index build: drain applied " << (_numApplied - appliedAtStart)
                  << " side writes (inserted: " << totalInserted << ", deleted: " << totalDeleted
                  << ") for '" << _indexCatalogEntry->descriptor()->indexName() << "' in "
                  << timer.millis() << " ms";

    return Status::OK();
}
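The drain above batches side writes by document count and by total bytes, stashing a record that would overflow the byte budget so that it leads the next batch. A stripped-down sketch of just that control flow (SideWrite and the applyWrite callback are illustrative stand-ins for the side-writes table record and the index insert):

#include <cstddef>
#include <cstdint>
#include <deque>
#include <optional>
#include <string>
#include <vector>

struct SideWrite { int64_t id; std::string doc; };   // illustrative stand-in

constexpr std::size_t kBatchMaxSize  = 1000;
constexpr std::size_t kBatchMaxBytes = 16 * 1024 * 1024;

void drain(std::deque<SideWrite>& table, void (*applyWrite)(const SideWrite&)) {
    std::vector<SideWrite> batch;
    std::optional<SideWrite> stashed;
    std::size_t batchBytes = 0;

    bool atEof = false;
    while (!atEof) {
        if (stashed) {                        // a stashed record leads the batch
            batchBytes += stashed->doc.size();
            batch.push_back(std::move(*stashed));
            stashed.reset();
        }
        if (!table.empty()) {
            SideWrite rec = std::move(table.front());
            table.pop_front();
            if (batchBytes + rec.doc.size() > kBatchMaxBytes) {
                stashed = std::move(rec);     // too big: open the next batch with it
            } else {
                batchBytes += rec.doc.size();
                batch.push_back(std::move(rec));
                if (batch.size() < kBatchMaxSize)
                    continue;                 // room left: keep filling
            }
        } else {
            atEof = true;
            if (batch.empty())
                break;
        }
        for (const SideWrite& w : batch)      // one unit of work per batch
            applyWrite(w);
        batch.clear();
        batchBytes = 0;
    }
}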
Exemple #18
0
    /* Prepare to build an index.  Does not actually build it (except for a special _id case).
       - We validate that the params are good
       - That the index does not already exist
       - Creates the source collection if it DNE

       example of 'io':
         { ns : 'test.foo', name : 'z', key : { z : 1 } }

       throws DBException

       @param sourceNS - source NS we are indexing
       @param sourceCollection - its details ptr
       @return true if ok to continue.  when false we stop/fail silently (index already exists)
    */
    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) {
        sourceCollection = 0;

        // logical name of the index.  todo: get rid of the name, we don't need it!
        const char *name = io.getStringField("name");
        uassert(12523, "no index name specified", *name);

        // the collection for which we are building an index
        sourceNS = io.getStringField("ns");
        uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos);
        massert(10097, str::stream() << "bad table to index name on add index attempt current db: " << cc().database()->name << "  source: " << sourceNS ,
                cc().database()->name == nsToDatabase(sourceNS.c_str()));

        BSONObj key = io.getObjectField("key");
        uassert(12524, "index key pattern too large", key.objsize() <= 2048);
        if( !validKeyPattern(key) ) {
            string s = string("bad index key pattern ") + key.toString();
            uasserted(10098 , s.c_str());
        }

        if ( sourceNS.empty() || key.isEmpty() ) {
            log(2) << "bad add index attempt name:" << (name?name:"") << "\n  ns:" <<
                   sourceNS << "\n  idxobj:" << io.toString() << endl;
            string s = "bad add index attempt " + sourceNS + " key:" + key.toString();
            uasserted(12504, s);
        }

        sourceCollection = nsdetails(sourceNS.c_str());
        if( sourceCollection == 0 ) {
            // try to create it
            string err;
            if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) {
                problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl;
                return false;
            }
            sourceCollection = nsdetails(sourceNS.c_str());
            tlog() << "info: creating collection " << sourceNS << " on add index" << endl;
            verify( sourceCollection );
        }

        if ( sourceCollection->findIndexByName(name) >= 0 ) {
            // index already exists.
            return false;
        }
        if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) {
            log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl;
            return false;
        }

        if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) {
            stringstream ss;
            ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString();
            string s = ss.str();
            log() << s << '\n';
            uasserted(12505,s);
        }

        /* we can't build a new index for the ns if a build is already in progress in the background -
           EVEN IF this is a foreground build.
           */
        uassert(12588, "cannot add index with a background operation in progress",
                !BackgroundOperation::inProgForNs(sourceNS.c_str()));

        /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to
           all be treated as the same pattern.
        */
        if ( IndexDetails::isIdIndexPattern(key) ) {
            if( !god ) {
                ensureHaveIdIndex( sourceNS.c_str() );
                return false;
            }
        }
        else {
            /* is buildIndexes:false set for this replica set member?
               if so we don't build any indexes except _id
            */
            if( theReplSet && !theReplSet->buildIndexes() )
                return false;
        }

        string pluginName = IndexPlugin::findPluginName( key );
        IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0;


        { 
            BSONObj o = io;
            if ( plugin ) {
                o = plugin->adjustIndexSpec(o);
            }
            BSONObjBuilder b;
            int v = DefaultIndexVersionNumber;
            if( !o["v"].eoo() ) {
                double vv = o["v"].Number();
                // note: one day we may be able to fresh-build fewer versions than we can use;
                // isASupportedIndexVersionNumber() is what we can use
                uassert(14803, str::stream() << "this version of mongod cannot build new indexes of version number " << vv, 
                    vv == 0 || vv == 1);
                v = (int) vv;
            }
            // idea is to put things we use a lot earlier
            b.append("v", v);
            b.append(o["key"]);
            if( o["unique"].trueValue() )
                b.appendBool("unique", true); // normalize to bool true in case was int 1 or something...
            b.append(o["ns"]);

            {
                // stripping _id
                BSONObjIterator i(o);
                while ( i.more() ) {
                    BSONElement e = i.next();
                    string s = e.fieldName();
                    if( s != "_id" && s != "v" && s != "ns" && s != "unique" && s != "key" )
                        b.append(e);
                }
            }
        
            fixedIndexObject = b.obj();
        }

        return true;
    }
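The fixedIndexObject block above rebuilds the index spec so that the frequently-read fields (v, key, unique, ns) come first and _id is stripped. A rough standalone sketch of that normalization over a plain field list (Field and the string values are illustrative, not BSON):

#include <string>
#include <utility>
#include <vector>

using Field = std::pair<std::string, std::string>;   // name -> printable value

// Put the frequently-read fields first, then copy the rest except _id.
std::vector<Field> normalizeIndexSpec(const std::vector<Field>& spec,
                                      int defaultVersion) {
    auto find = [&](const std::string& name) -> const Field* {
        for (const Field& f : spec)
            if (f.first == name) return &f;
        return nullptr;
    };

    std::vector<Field> out;
    const Field* v = find("v");
    out.emplace_back("v", v ? v->second : std::to_string(defaultVersion));
    if (const Field* key = find("key")) out.push_back(*key);
    if (find("unique")) out.emplace_back("unique", "true");  // normalize to bool
    if (const Field* ns = find("ns")) out.push_back(*ns);

    for (const Field& f : spec)
        if (f.first != "_id" && f.first != "v" && f.first != "ns" &&
            f.first != "unique" && f.first != "key")
            out.push_back(f);
    return out;
}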
Exemple #19
0
 BSONObj Helpers::toKeyFormat( const BSONObj& o ) {
     BSONObjBuilder keyObj( o.objsize() );
     BSONForEach( e , o ) {
         keyObj.appendAs( e , "" );
     }
     return keyObj.obj();
 }
Exemple #20
0
        /*
         * Runs the command object cmdobj on the db with name dbname and puts result in result.
         * @param dbname, name of db
         * @param cmdobj, object that contains entire command
         * @param options
         * @param errmsg, reference to error message
         * @param result, reference to builder for result
         * @param fromRepl
         * @return true if successful, false otherwise
         */
        bool FTSCommand::_run(OperationContext* txn,
                              const string& dbname,
                              BSONObj& cmdObj,
                              int cmdOptions,
                              const string& ns,
                              const string& searchString,
                              string language, // "" for not-set
                              int limit,
                              BSONObj& filter,
                              BSONObj& projection,
                              string& errmsg,
                              BSONObjBuilder& result ) {

            Timer comm;

            // Rewrite the cmd as a normal query.
            BSONObjBuilder queryBob;
            queryBob.appendElements(filter);

            BSONObjBuilder textBob;
            textBob.append("$search", searchString);
            if (!language.empty()) {
                textBob.append("$language", language);
            }
            queryBob.append("$text", textBob.obj());

            // This is the query we exec.
            BSONObj queryObj = queryBob.obj();

            // We sort by the score.
            BSONObj sortSpec = BSON("$s" << BSON("$meta" << LiteParsedQuery::metaTextScore));

            // We also project the score into the document and strip it out later during the reformatting
            // of the results.
            BSONObjBuilder projBob;
            projBob.appendElements(projection);
            projBob.appendElements(sortSpec);
            BSONObj projObj = projBob.obj();

            Client::ReadContext ctx(txn, ns);

            CanonicalQuery* cq;
            Status canonicalizeStatus = 
                    CanonicalQuery::canonicalize(ns, 
                                                 queryObj,
                                                 sortSpec,
                                                 projObj, 
                                                 0,
                                                 limit,
                                                 BSONObj(),
                                                 &cq,
                                                 WhereCallbackReal(txn, StringData(dbname)));
            if (!canonicalizeStatus.isOK()) {
                errmsg = canonicalizeStatus.reason();
                return false;
            }

            Runner* rawRunner;
            Status getRunnerStatus = getRunner(txn, ctx.ctx().db()->getCollection(txn, ns), cq, &rawRunner);
            if (!getRunnerStatus.isOK()) {
                errmsg = getRunnerStatus.reason();
                return false;
            }

            auto_ptr<Runner> runner(rawRunner);

            BSONArrayBuilder resultBuilder(result.subarrayStart("results"));

            // Quoth: "leave a mb for other things"
            int resultSize = 1024 * 1024;

            int numReturned = 0;

            BSONObj obj;
            while (Runner::RUNNER_ADVANCED == runner->getNext(&obj, NULL)) {
                if ((resultSize + obj.objsize()) >= BSONObjMaxUserSize) {
                    break;
                }
                // We return an array of results.  Add another element.
                BSONObjBuilder oneResultBuilder(resultBuilder.subobjStart());
                oneResultBuilder.append("score", obj["$s"].number());

                // Strip out the score from the returned obj.
                BSONObjIterator resIt(obj);
                BSONObjBuilder resBob;
                while (resIt.more()) {
                    BSONElement elt = resIt.next();
                    if (!mongoutils::str::equals("$s", elt.fieldName())) {
                        resBob.append(elt);
                    }
                }
                oneResultBuilder.append("obj", resBob.obj());
                BSONObj addedArrayObj = oneResultBuilder.done();
                resultSize += addedArrayObj.objsize();
                numReturned++;
            }

            resultBuilder.done();

            // returns some stats to the user
            BSONObjBuilder stats(result.subobjStart("stats"));

            // Fill in nscanned from the explain.
            TypeExplain* bareExplain;
            Status res = runner->getInfo(&bareExplain, NULL);
            if (res.isOK()) {
                auto_ptr<TypeExplain> explain(bareExplain);
                stats.append("nscanned", explain->getNScanned());
                stats.append("nscannedObjects", explain->getNScannedObjects());
            }

            stats.appendNumber( "n" , numReturned );
            stats.append( "timeMicros", (int)comm.micros() );
            stats.done();

            return true;
        }
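The result loop above projects the text score into each document as $s, then strips it back out and reports it next to the document. A standalone sketch of that reformatting step over a simplified field-list document type (Doc is an illustrative stand-in for BSONObj):

#include <string>
#include <utility>
#include <vector>

using Doc = std::vector<std::pair<std::string, std::string>>;  // field -> value

// For each result, pull out the projected "$s" score and return
// {score, document-without-$s} pairs, as the loop above does.
std::vector<std::pair<std::string, Doc>> reformatResults(const std::vector<Doc>& results) {
    std::vector<std::pair<std::string, Doc>> out;
    for (const Doc& r : results) {
        std::string score;
        Doc stripped;
        for (const auto& field : r) {
            if (field.first == "$s")
                score = field.second;       // report separately
            else
                stripped.push_back(field);  // keep everything else
        }
        out.emplace_back(score, std::move(stripped));
    }
    return out;
}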
Exemple #21
0
static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) {
    Lock::DBWrite lk1("local");

    if ( strncmp(ns, "local.", 6) == 0 ) {
        if ( strncmp(ns, "local.slaves", 12) == 0 )
            resetSlaveCache();
        return;
    }

    mutex::scoped_lock lk2(OpTime::m);

    const OpTime ts = OpTime::now(lk2);
    long long hashNew;
    if( theReplSet ) {
        massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary());
        hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId();
    }
    else {
        // must be initiation
        verify( *ns == 0 );
        hashNew = 0;
    }

    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */

    logopbufbuilder.reset();
    BSONObjBuilder b(logopbufbuilder);
    b.appendTimestamp("ts", ts.asDate());
    b.append("h", hashNew);
    b.append("v", OPLOG_VERSION);
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 )
        b.append("o2", *o2);
    BSONObj partial = b.done();
    int posz = partial.objsize();
    int len = posz + obj.objsize() + 1 + 2 /*o:*/;

    Record *r;
    DEV verify( logNS == 0 );
    {
        const char *logns = rsoplog;
        if ( rsOplogDetails == 0 ) {
            Client::Context ctx( logns , dbpath, false);
            localDB = ctx.db();
            verify( localDB );
            rsOplogDetails = nsdetails(logns);
            massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
        }
        Client::Context ctx( logns , localDB, false );
        r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
        /* todo: now() has code to handle clock skew.  but if the skew server to server is large it will get unhappy.
                 this code (or code in now() maybe) should be improved.
                 */
        if( theReplSet ) {
            if( !(theReplSet->lastOpTimeWritten<ts) ) {
                log() << "replSet ERROR possible failover clock skew issue? " << theReplSet->lastOpTimeWritten << ' ' << ts << rsLog;
                log() << "replSet " << theReplSet->isPrimary() << rsLog;
            }
            theReplSet->lastOpTimeWritten = ts;
            theReplSet->lastH = hashNew;
            ctx.getClient()->setLastOp( ts );
        }
    }

    append_O_Obj(r->data(), partial, obj);

    if ( logLevel >= 6 ) {
        LOG( 6 ) << "logOp:" << BSONObj::make(r) << endl;
    }
}
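The length computed above, posz + obj.objsize() + 1 + 2, is exactly what it costs to splice obj into the partial document as a field named "o": the partial's trailing EOO byte is replaced by one type byte, the two-byte field name ("o" plus its NUL), the subobject's bytes, and a new EOO. A small sketch of that arithmetic:

#include <cstdint>
#include <iostream>

// Size of a BSON document after embedding a subobject as field "o".
// The partial loses its EOO (-1), gains a type byte (+1), the field
// name "o\0" (+2), the subobject (+objSize), and a new EOO (+1).
int32_t oplogEntrySize(int32_t partialSize, int32_t objSize) {
    const int32_t typeByte  = 1;   // BSON type tag (0x03 for an object)
    const int32_t fieldName = 2;   // "o" plus its NUL terminator
    return partialSize + objSize + typeByte + fieldName;
}

int main() {
    std::cout << oplogEntrySize(64, 128) << "\n";   // prints 195
    return 0;
}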
Exemple #22
0
    /** object cannot be represented in compact format.  so store in traditional bson format 
        with a leading sentinel byte IsBSON to indicate it's in that format.

        Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here 
        so that we don't have to do an extra malloc.
    */
    void KeyV1Owned::traditional(const BSONObj& obj) { 
        b.reset();
        b.appendUChar(IsBSON);
        b.appendBuf(obj.objdata(), obj.objsize());
        _keyData = (const unsigned char *) b.buf();
    }
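A standalone sketch of the same sentinel-byte idea: tag the buffer with a leading format marker so a reader knows whether the bytes that follow are the compact key encoding or plain BSON (the marker value here is made up; the real IsBSON constant lives in the KeyV1 implementation):

#include <cstddef>
#include <vector>

constexpr unsigned char kIsBSON = 0xFF;   // illustrative sentinel value

// Store a buffer in "traditional" form: one sentinel byte, then the raw bytes.
std::vector<unsigned char> storeTraditional(const void* data, std::size_t len) {
    std::vector<unsigned char> out;
    out.reserve(1 + len);
    out.push_back(kIsBSON);                // leading format marker
    const unsigned char* p = static_cast<const unsigned char*>(data);
    out.insert(out.end(), p, p + len);
    return out;
}

// A reader checks the sentinel first to decide how to decode the rest.
bool isTraditional(const std::vector<unsigned char>& key) {
    return !key.empty() && key[0] == kIsBSON;
}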
Exemple #23
0
/** @param fromRepl false if from ApplyOpsCmd
    @return true if an update should have happened and the document DNE; see replset initial sync code.
 */
bool applyOperation_inlock(const BSONObj& op, bool fromRepl, bool convertUpdateToUpsert) {
    LOG(3) << "applying op: " << op << endl;
    bool failedUpdate = false;

    OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;

    const char *names[] = { "o", "ns", "op", "b" };
    BSONElement fields[4];
    op.getFields(4, names, fields);

    BSONObj o;
    if( fields[0].isABSONObj() )
        o = fields[0].embeddedObject();

    const char *ns = fields[1].valuestrsafe();

    Lock::assertWriteLocked(ns);

    NamespaceDetails *nsd = nsdetails(ns);

    // operation type -- see logOp() comments for types
    const char *opType = fields[2].valuestrsafe();

    if ( *opType == 'i' ) {
        opCounters->gotInsert();

        const char *p = strchr(ns, '.');
        if ( p && strcmp(p, ".system.indexes") == 0 ) {
            // updates aren't allowed for indexes -- so we will do a regular insert. if index already
            // exists, that is ok.
            theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize());
        }
        else {
            // do upserts for inserts as we might get replayed more than once
            OpDebug debug;
            BSONElement _id;
            if( !o.getObjectID(_id) ) {
                /* No _id.  This will be very slow. */
                Timer t;
                updateObjectsForReplication(ns, o, o, true, false, false, debug, false,
                                            QueryPlanSelectionPolicy::idElseNatural() );
                if( t.millis() >= 2 ) {
                    RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                }
            }
            else {
                // probably don't need this since all replicated colls have _id indexes now
                // but keep it just in case
                RARELY if ( nsd && !nsd->isCapped() ) {
                    ensureHaveIdIndex(ns);
                }

                /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
                          then.  very few upserts will not be inserts...
                          */
                BSONObjBuilder b;
                b.append(_id);
                updateObjectsForReplication(ns, o, b.done(), true, false, false , debug, false,
                                            QueryPlanSelectionPolicy::idElseNatural() );
            }
        }
    }
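The insert path above deliberately applies replicated inserts as upserts so that replaying the same oplog entry more than once leaves exactly one document. A toy sketch of that idempotency property (the map stands in for a collection keyed by _id):

#include <cassert>
#include <map>
#include <string>

// Applying a replicated insert as an upsert keyed on _id: re-applying the
// same entry overwrites rather than duplicates.
void applyInsert(std::map<std::string, std::string>& coll,
                 const std::string& id, const std::string& doc) {
    coll[id] = doc;   // insert or overwrite
}

int main() {
    std::map<std::string, std::string> coll;
    applyInsert(coll, "42", "{ _id: 42, x: 1 }");
    applyInsert(coll, "42", "{ _id: 42, x: 1 }");   // replayed entry
    assert(coll.size() == 1);                       // still one document
    return 0;
}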
Exemple #24
0
        void operator()( DBClientCursorBatchIterator &i ) {
            Lock::GlobalWrite lk;
            context.relocked();

            bool createdCollection = false;
            Collection* collection = NULL;

            while( i.moreInCurrentBatch() ) {
                if ( numSeen % 128 == 127 /*yield some*/ ) {
                    collection = NULL;
                    time_t now = time(0);
                    if( now - lastLog >= 60 ) {
                        // report progress
                        if( lastLog )
                            log() << "clone " << to_collection << ' ' << numSeen << endl;
                        lastLog = now;
                    }
                    mayInterrupt( _mayBeInterrupted );
                    dbtempreleaseif t( _mayYield );
                }

                if ( isindex == false && collection == NULL ) {
                    collection = context.db()->getCollection( to_collection );
                    if ( !collection ) {
                        massert( 17321,
                                 str::stream()
                                 << "collection dropped during clone ["
                                 << to_collection << "]",
                                 !createdCollection );
                        createdCollection = true;
                        collection = context.db()->createCollection( txn, to_collection );
                        verify( collection );
                    }
                }

                BSONObj tmp = i.nextSafe();

                /* assure object is valid.  note this will slow us down a little. */
                const Status status = validateBSON(tmp.objdata(), tmp.objsize());
                if (!status.isOK()) {
                    out() << "Cloner: skipping corrupt object from " << from_collection
                          << ": " << status.reason();
                    continue;
                }

                ++numSeen;

                BSONObj js = tmp;
                if ( isindex ) {
                    verify(nsToCollectionSubstring(from_collection) == "system.indexes");
                    js = fixindex(context.db()->name(), tmp);
                    indexesToBuild->push_back( js.getOwned() );
                    continue;
                }

                verify(nsToCollectionSubstring(from_collection) != "system.indexes");

                StatusWith<DiskLoc> loc = collection->insertDocument( txn, js, true );
                if ( !loc.isOK() ) {
                    error() << "error: exception cloning object in " << from_collection
                            << ' ' << loc.toString() << " obj:" << js;
                }
                uassertStatusOK( loc.getStatus() );
                if ( logForRepl )
                    logOp(txn, "i", to_collection, js);

                txn->commitIfNeeded();

                RARELY if ( time( 0 ) - saveLast > 60 ) {
                    log() << numSeen << " objects cloned so far from collection " << from_collection;
                    saveLast = time( 0 );
                }
            }
        }
Exemple #25
0
    /**
     * Returns true if we need to keep a ClientCursor saved for this pipeline (for future getMore
     * requests).  Otherwise, returns false.
     */
    static bool handleCursorCommand(OperationContext* txn,
                                    const string& ns,
                                    ClientCursorPin* pin,
                                    PlanExecutor* exec,
                                    const BSONObj& cmdObj,
                                    BSONObjBuilder& result) {

        ClientCursor* cursor = pin ? pin->c() : NULL;
        if (pin) {
            invariant(cursor);
            invariant(cursor->getExecutor() == exec);
            invariant(cursor->isAggCursor());
        }

        const long long defaultBatchSize = 101; // Same as query.
        long long batchSize;
        uassertStatusOK(Command::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize));

        // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
        BSONArrayBuilder resultsArray;
        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        BSONObj next;
        for (int objCount = 0; objCount < batchSize; objCount++) {
            // The initial getNext() on a PipelineProxyStage may be very expensive so we don't
            // do it when batchSize is 0 since that indicates a desire for a fast return.
            if (exec->getNext(&next, NULL) != PlanExecutor::ADVANCED) {
                // make it an obvious error to use cursor or executor after this point
                cursor = NULL;
                exec = NULL;
                break;
            }

            if (resultsArray.len() + next.objsize() > byteLimit) {
                // Get the pipeline proxy stage wrapped by this PlanExecutor.
                PipelineProxyStage* proxy = static_cast<PipelineProxyStage*>(exec->getRootStage());
                // too big. next will be the first doc in the second batch
                proxy->pushBack(next);
                break;
            }

            resultsArray.append(next);
        }

        // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should
        // be relatively quick since if there was no pin then the input is empty. Also, this
        // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that
        // case. This is ok for now however, since you can't have a sharded collection that doesn't
        // exist.
        const bool canReturnMoreBatches = pin;
        if (!canReturnMoreBatches && exec && !exec->isEOF()) {
            // msgasserting since this shouldn't be possible to trigger from today's aggregation
            // language. The wording assumes that the only reason pin would be null is if the
            // collection doesn't exist.
            msgasserted(17391, str::stream()
                << "Aggregation has more results than fit in initial batch, but can't "
                << "create cursor since collection " << ns << " doesn't exist");
        }

        if (cursor) {
            // If a time limit was set on the pipeline, remaining time is "rolled over" to the
            // cursor (for use by future getmore ops).
            cursor->setLeftoverMaxTimeMicros( txn->getCurOp()->getRemainingMaxTimeMicros() );

            if (txn->getClient()->isInDirectClient()) {
                cursor->setUnownedRecoveryUnit(txn->recoveryUnit());
            }
            else {
                // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
                // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
                txn->recoveryUnit()->commitAndRestart();
                cursor->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
                StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
                txn->setRecoveryUnit(storageEngine->newRecoveryUnit());
            }

            // Cursor needs to be in a saved state while we yield locks for getmore. State
            // will be restored in getMore().
            exec->saveState();
        }

        const long long cursorId = cursor ? cursor->cursorid() : 0LL;
        appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result);

        return static_cast<bool>(cursor);
    }
Exemple #26
0
std::string DBHashCmd::hashCollection(OperationContext* opCtx,
                                      Database* db,
                                      const std::string& fullCollectionName,
                                      bool* fromCache) {
    boost::unique_lock<boost::mutex> cachedHashedLock(_cachedHashedMutex, boost::defer_lock);

    if ( isCachable( fullCollectionName ) ) {
        cachedHashedLock.lock();
        string hash = _cachedHashed[fullCollectionName];
        if ( hash.size() > 0 ) {
            *fromCache = true;
            return hash;
        }
    }

    *fromCache = false;
    Collection* collection = db->getCollection( fullCollectionName );
    if ( !collection )
        return "";

    IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex( opCtx );

    auto_ptr<PlanExecutor> exec;
    if ( desc ) {
        exec.reset(InternalPlanner::indexScan(opCtx,
                                              collection,
                                              desc,
                                              BSONObj(),
                                              BSONObj(),
                                              false,
                                              InternalPlanner::FORWARD,
                                              InternalPlanner::IXSCAN_FETCH));
    }
    else if ( collection->isCapped() ) {
        exec.reset(InternalPlanner::collectionScan(opCtx,
                   fullCollectionName,
                   collection));
    }
    else {
        log() << "can't find _id index for: " << fullCollectionName << endl;
        return "no _id _index";
    }

    md5_state_t st;
    md5_init(&st);

    long long n = 0;
    PlanExecutor::ExecState state;
    BSONObj c;
    verify(NULL != exec.get());
    while (PlanExecutor::ADVANCED == (state = exec->getNext(&c, NULL))) {
        md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() );
        n++;
    }
    if (PlanExecutor::IS_EOF != state) {
        warning() << "error while hashing, db dropped? ns=" << fullCollectionName << endl;
    }
    md5digest d;
    md5_finish(&st, d);
    string hash = digestToString( d );

    if (cachedHashedLock.owns_lock()) {
        _cachedHashed[fullCollectionName] = hash;
    }

    return hash;
}
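The hashing loop above streams each document's raw bytes, in scan order, into a single digest. A self-contained sketch of the same streaming pattern, with FNV-1a standing in for the internal md5_append/md5_finish API:

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// FNV-1a accumulator; one append() per document, read the digest at the end.
struct StreamingHash {
    uint64_t h = 1469598103934665603ull;            // FNV offset basis
    void append(const void* data, std::size_t len) {
        const unsigned char* p = static_cast<const unsigned char*>(data);
        for (std::size_t i = 0; i < len; ++i) {
            h ^= p[i];
            h *= 1099511628211ull;                  // FNV prime
        }
    }
};

uint64_t hashDocs(const std::vector<std::string>& docs) {
    StreamingHash st;
    for (const std::string& d : docs)               // scan order matters
        st.append(d.data(), d.size());
    return st.h;
}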
Exemple #27
0
/**
 * Returns true if we need to keep a ClientCursor saved for this pipeline (for future getMore
 * requests).  Otherwise, returns false.
 */
static bool handleCursorCommand(OperationContext* txn,
                                const string& ns,
                                ClientCursorPin* pin,
                                PlanExecutor* exec,
                                const BSONObj& cmdObj,
                                BSONObjBuilder& result) {
    ClientCursor* cursor = pin ? pin->c() : NULL;
    if (pin) {
        invariant(cursor);
        invariant(cursor->getExecutor() == exec);
        invariant(cursor->isAggCursor());
    }

    const long long defaultBatchSize = 101;  // Same as query.
    long long batchSize;
    uassertStatusOK(Command::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize));

    // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
    BSONArrayBuilder resultsArray;
    const int byteLimit = FindCommon::kMaxBytesToReturnToClientAtOnce;
    BSONObj next;
    for (int objCount = 0; objCount < batchSize; objCount++) {
        // The initial getNext() on a PipelineProxyStage may be very expensive so we don't
        // do it when batchSize is 0 since that indicates a desire for a fast return.
        if (exec->getNext(&next, NULL) != PlanExecutor::ADVANCED) {
            // make it an obvious error to use cursor or executor after this point
            cursor = NULL;
            exec = NULL;
            break;
        }

        // If adding this object will cause us to exceed the BSON size limit, then we stash it for
        // later.
        if (resultsArray.len() + next.objsize() > byteLimit) {
            exec->enqueue(next);
            break;
        }

        resultsArray.append(next);
    }

    // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should
    // be relatively quick since if there was no pin then the input is empty. Also, this
    // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that
    // case. This is ok for now however, since you can't have a sharded collection that doesn't
    // exist.
    const bool canReturnMoreBatches = pin;
    if (!canReturnMoreBatches && exec && !exec->isEOF()) {
        // msgasserting since this shouldn't be possible to trigger from today's aggregation
        // language. The wording assumes that the only reason pin would be null is if the
        // collection doesn't exist.
        msgasserted(
            17391,
            str::stream() << "Aggregation has more results than fit in initial batch, but can't "
                          << "create cursor since collection " << ns << " doesn't exist");
    }

    if (cursor) {
        // If a time limit was set on the pipeline, remaining time is "rolled over" to the
        // cursor (for use by future getmore ops).
        cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());

        CurOp::get(txn)->debug().cursorid = cursor->cursorid();

        // Cursor needs to be in a saved state while we yield locks for getmore. State
        // will be restored in getMore().
        exec->saveState();
        exec->detachFromOperationContext();
    }

    const long long cursorId = cursor ? cursor->cursorid() : 0LL;
    appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result);

    return static_cast<bool>(cursor);
}
Exemple #28
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.
        scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));

        QLOG() << "running getMore in new system, cursorid " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        replVerifyReadsOk();

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // Reset timeout timer on the cursor since the cursor is still in use.
            cc->setIdleTime(0);

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            if (cc->isAggCursor) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            Runner* runner = cc->getRunner();
            const int queryOptions = cc->queryOptions();

            // Get results out of the runner.
            runner->restoreState();

            BSONObj obj;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            if (Runner::RUNNER_EOF == state && 0 == numResults
                && (queryOptions & QueryOption_CursorTailable)
                && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
                // If the cursor is tailable we don't kill it when it hits EOF; we let it
                // retry for data some number of times first. Returning 0 lets the caller
                // try again on a later pass.
                return 0;
            }

            bool saveClientCursor = false;

            if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) {
                // If we're dead there's no way to get more results.
                saveClientCursor = false;
                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                // 
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }
            else if (Runner::RUNNER_EOF == state) {
                // EOF is also end of the line unless it's tailable.
                saveClientCursor = queryOptions & QueryOption_CursorTailable;
            }
            else {
                verify(Runner::RUNNER_ADVANCED == state);
                saveClientCursor = true;
            }

            if (!saveClientCursor) {
                ccPin.deleteUnderlying();
                // cc is now invalid, as is the runner
                cursorid = 0;
                cc = NULL;
                QLOG() << "getMore NOT saving client cursor, ended w/state "
                       << Runner::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                runner->saveState();
                QLOG() << "getMore saving client cursor ended w/state "
                       << Runner::statestr(state)
                       << endl;

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        QLOG() << "getMore returned " << numResults << " results\n";
        return qr;
    }
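The header fields filled in at the end of newGetMore follow the legacy OP_REPLY wire format. A minimal illustrative layout, assuming the usual MsgHeader prefix (the real QueryResult struct lives in MongoDB's message header code and may differ in detail):

    #include <cstdint>  // int32_t / int64_t for the sketch below

    // Illustrative only; integers are little-endian on the wire.
    #pragma pack(1)
    struct QueryResultSketch {
        int32_t messageLength;   // total bytes, set from bb.len()
        int32_t requestID;
        int32_t responseTo;
        int32_t opCode;          // opReply
        int32_t resultFlags;     // e.g. ResultFlag_CursorNotFound, ResultFlag_AwaitCapable
        int64_t cursorID;        // 0 once the cursor is exhausted or deleted
        int32_t startingFrom;    // cc->pos() before this batch
        int32_t numberReturned;  // numResults
        // ...followed by numberReturned BSON documents
    };
    #pragma pack()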
Example #29
    void insert() {
        Client::Context ctx( cappedNs() );
        BSONObj o = BSON(GENOID << "x" << 456);
        // Insert through the data file manager and verify the document landed at a
        // valid disk location.
        DiskLoc loc = theDataFileMgr.insert( cappedNs().c_str(), o.objdata(), o.objsize(), false );
        verify(!loc.isNull());
    }
Example #30
    static void handleCursorCommand(const string& ns,
                                    ClientCursorPin* pin,
                                    PipelineRunner* runner,
                                    const BSONObj& cmdObj,
                                    BSONObjBuilder& result) {

        ClientCursor* cursor = pin ? pin->c() : NULL;
        if (pin) {
            invariant(cursor);
            invariant(cursor->getRunner() == runner);
            invariant(cursor->isAggCursor);
        }

        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        // Can't use the result BSONObjBuilder directly since it won't handle exceptions
        // correctly.
        BSONArrayBuilder resultsArray;
        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        BSONObj next;
        for (int objCount = 0; objCount < batchSize; objCount++) {
            // The initial getNext() on a PipelineRunner may be very expensive, so we skip
            // it when batchSize is 0, since that indicates a desire for a fast return.
            if (runner->getNext(&next, NULL) != Runner::RUNNER_ADVANCED) {
                if (pin) pin->deleteUnderlying();
                // make it an obvious error to use cursor or runner after this point
                cursor = NULL;
                runner = NULL;
                break;
            }

            if (resultsArray.len() + next.objsize() > byteLimit) {
                // Too big: next will be the first doc in the second batch.
                runner->pushBack(next);
                break;
            }

            resultsArray.append(next);
        }

        // NOTE: runner->isEOF() can have side effects, such as writes by $out. However, it
        // should be relatively quick since, if there was no pin, the input is empty. Also,
        // this violates the contract for batchSize==0: sharding requires a cursor to be
        // returned in that case. This is OK for now, however, since you can't have a sharded
        // collection that doesn't exist.
        const bool canReturnMoreBatches = pin;
        if (!canReturnMoreBatches && runner && !runner->isEOF()) {
            // msgasserting since this shouldn't be possible to trigger from today's aggregation
            // language. The wording assumes that the only reason pin would be null is if the
            // collection doesn't exist.
            msgasserted(17391, str::stream()
                << "Aggregation has more results than fit in initial batch, but can't "
                << "create cursor since collection " << ns << " doesn't exist");
        }

        if (cursor) {
            // If a time limit was set on the pipeline, remaining time is "rolled over" to the
            // cursor (for use by future getmore ops).
            cursor->setLeftoverMaxTimeMicros( cc().curop()->getRemainingMaxTimeMicros() );
        }

        BSONObjBuilder cursorObj(result.subobjStart("cursor"));
        cursorObj.append("id", cursor ? cursor->cursorid() : 0LL);
        cursorObj.append("ns", ns);
        cursorObj.append("firstBatch", resultsArray.arr());
        cursorObj.done();
    }
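On the receiving side, a driver reads the sub-document built above and, while cursor.id is nonzero, issues getMore operations against cursor.ns. A minimal parsing sketch, assuming only the field names appended above (and that <string>, <vector>, and the mongo BSON headers such as mongo/db/jsobj.h are available):

    // Sketch: pulls apart the { cursor: { id, ns, firstBatch } } reply built by
    // handleCursorCommand above.
    void readCursorReply(const BSONObj& reply) {
        BSONObj cursor = reply["cursor"].Obj();
        long long cursorId = cursor["id"].numberLong();  // 0 => no more batches
        std::string ns = cursor["ns"].String();
        std::vector<BSONElement> firstBatch = cursor["firstBatch"].Array();
        // A nonzero id means follow-up getMore ops on ns can fetch more batches.
    }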