Example #1
    UpdateResult update(UpdateRequest& request, UpdateDriver* driver) {

        const NamespaceString& nsString = request.getNamespaceString();

        validateUpdate( nsString.ns().c_str(), request.getUpdates(), request.getQuery() );

        NamespaceDetails* nsDetails = nsdetails( nsString.ns() );
        NamespaceDetailsTransient* nsDetailsTransient =
            &NamespaceDetailsTransient::get( nsString.ns().c_str() );

        OpDebug& debug = request.getDebug();

        // TODO: This seems a bit circuitous.
        debug.updateobj = request.getUpdates();

        driver->refreshIndexKeys( nsDetailsTransient->indexKeys() );

        shared_ptr<Cursor> cursor = getOptimizedCursor(
            nsString.ns(), request.getQuery(), BSONObj(), request.getQueryPlanSelectionPolicy() );

        // If the update was marked with '$isolated' (a.k.a '$atomic'), we are not allowed to
        // yield while evaluating the update loop below.
        //
        // TODO: Old code checks this repeatedly within the update loop. Is that necessary? It
        // seems that once atomic, it should always be atomic.
        const bool isolated =
            cursor->ok() &&
            cursor->matcher() &&
            cursor->matcher()->docMatcher().atomic();

        // The 'cursor' the optimizer gave us may contain query plans that generate duplicate
        // disklocs. Here we set up the mechanisms that will prevent us from processing those
        // twice if we see them. We also set up a 'ClientCursor' so that we can support
        // yielding.
        //
        // TODO: Is it valid to call this on a non-ok cursor?
        const bool dedupHere = cursor->autoDedup();

        //
        // We'll start assuming we have one or more documents for this update. (Otherwise,
        // we'll fall back to upserting.)
        //

        // We record that this will not be an upsert, in case a mod doesn't want to be applied
        // when in strict update mode.
        driver->setContext( ModifierInterface::ExecInfo::UPDATE_CONTEXT );

        // Let's fetch each of them and pipe them through the update expression, making sure to
        // keep track of the necessary stats. Recall that we'll be pulling documents out of
        // cursors and some of them do not deduplicate the entries they generate. We have
        // deduping logic in here, too -- for now.
        unordered_set<DiskLoc, DiskLoc::Hasher> seenLocs;
        int numMatched = 0;
        debug.nscanned = 0;

        Client& client = cc();

        mutablebson::Document doc;

        // If we are going to be yielding, we will need a ClientCursor scoped to this loop. We
        // only loop as long as the underlying cursor is OK.
        for ( auto_ptr<ClientCursor> clientCursor; cursor->ok(); ) {

            // If we haven't constructed a ClientCursor, and if the client allows us to throw
            // page faults, and if we are referring to a location that is likely not in
            // physical memory, then throw a PageFaultException. The entire operation will be
            // restarted.
            if ( clientCursor.get() == NULL &&
                 client.allowedToThrowPageFaultException() &&
                 !cursor->currLoc().isNull() &&
                 !cursor->currLoc().rec()->likelyInPhysicalMemory() ) {
                // We should never throw a PFE if we have already updated items.
                dassert((numMatched == 0) || (numMatched == debug.nupdateNoops));
                throw PageFaultException( cursor->currLoc().rec() );
            }

            if ( !isolated && debug.nscanned != 0 ) {

                // We are permitted to yield. To do so we need a ClientCursor, so create one
                // now if we have not yet done so.
                if ( !clientCursor.get() )
                    clientCursor.reset(
                        new ClientCursor( QueryOption_NoCursorTimeout, cursor, nsString.ns() ) );

                // Ask the client cursor to yield. We get two bits of state back: whether or not
                // we yielded, and whether or not we correctly recovered from yielding.
                bool yielded = false;
                const bool recovered = clientCursor->yieldSometimes(
                    ClientCursor::WillNeed, &yielded );

                if ( !recovered ) {
                    // If we failed to recover from the yield, then the ClientCursor is already
                    // gone. Release it so we don't destroy it a second time.
                    clientCursor.release();
                    break;
                }

                if ( !cursor->ok() ) {
                    // If the cursor died while we were yielded, just get out of the update loop.
                    break;
                }

                if ( yielded ) {
                    // We yielded and recovered OK, and our cursor is still good. Details about
                    // our namespace may have changed while we were yielded, so we re-acquire
                    // them here. If we can't do so, escape the update loop. Otherwise, refresh
                    // the driver so that it knows about what is currently indexed.
                    nsDetails = nsdetails( nsString.ns() );
                    if ( !nsDetails )
                        break;
                    nsDetailsTransient = &NamespaceDetailsTransient::get( nsString.ns().c_str() );

                    // TODO: This copies the index keys, but it may not need to do so.
                    driver->refreshIndexKeys( nsDetailsTransient->indexKeys() );
                }

            }

            // Let's fetch the next candidate object for this update.
            Record* record = cursor->_current();
            DiskLoc loc = cursor->currLoc();
            const BSONObj oldObj = loc.obj();

            // We count how many documents we scanned even though we may skip those that are
            // deemed duplicated. The final 'numUpdated' and 'nscanned' numbers may differ for
            // that reason.
            debug.nscanned++;

            // Skip this document if it:
            // a) doesn't match the query portion of the update
            // b) was deemed a duplicate by the underlying cursor machinery
            //
            // Now, if we are going to update the document,
            // c) we don't want to do so while the cursor is pointing at it, as that may
            // invalidate the cursor. So we advance to the next document before issuing the
            // update.
            MatchDetails matchDetails;
            matchDetails.requestElemMatchKey();
            if ( !cursor->currentMatches( &matchDetails ) ) {
                // a)
                cursor->advance();
                continue;
            }
            else if ( cursor->getsetdup( loc ) && dedupHere ) {
                // b)
                cursor->advance();
                continue;
            }
            else if (!driver->isDocReplacement() && request.isMulti()) {
                // c)
                cursor->advance();
                if ( dedupHere ) {
                    if ( seenLocs.count( loc ) ) {
                        continue;
                    }
                }

                // There are certain kinds of cursors that hold multiple pointers to data
                // underneath. $or cursors are one example. In a $or cursor, it may be the case
                // that when we did the last advance(), we finished consuming documents from
                // one $or child and started consuming the next one. In that case, it is
                // possible that the last document of the previous child is the same as the
                // first document of the next (see SERVER-5198 and jstests/orp.js).
                //
                // So we advance the cursor here until we see a new diskloc.
                //
                // Note that we won't be yielding, and we may not do so for a while if we find
                // a particularly duplicated sequence of locs. That is highly unlikely,
                // though.  (See SERVER-5725, if curious, but "stage"-based $or will make that
                // ticket moot).
                while( cursor->ok() && loc == cursor->currLoc() ) {
                    cursor->advance();
                }
            }

            // For some (unfortunate) historical reasons, not all cursors would be valid after
            // a write simply because we advanced them to a document not affected by the write.
            // To protect against those cases, not only did we engage in the advance() logic above,
            // but we also tell the cursor we're about to write a document that we've just seen.
            // prepareToTouchEarlierIterate() requires calling later
            // recoverFromTouchingEarlierIterate(), so we make a note here to do so.
            bool touchPreviousDoc = request.isMulti() && cursor->ok();
            if ( touchPreviousDoc ) {
                if ( clientCursor.get() )
                    clientCursor->setDoingDeletes( true );
                cursor->prepareToTouchEarlierIterate();
            }

            // Found a matching document
            numMatched++;

            // Ask the driver to apply the mods. It may be that the driver can apply those "in
            // place", that is, some values of the old document just get adjusted without any
            // change to the binary layout on the bson layer. It may be that a whole new
            // document is needed to accommodate the new bson layout of the resulting document.
            doc.reset( oldObj, mutablebson::Document::kInPlaceEnabled );
            BSONObj logObj;

            // If there was a matched field, obtain it.
            string matchedField;
            if (matchDetails.hasElemMatchKey())
                matchedField = matchDetails.elemMatchKey();

            Status status = driver->update( matchedField, &doc, &logObj );
            if ( !status.isOK() ) {
                uasserted( 16837, status.reason() );
            }

            // If the driver applied the mods in place, we can ask the mutable for what
            // changed. We call those changes "damages". :) We use the damages to inform the
            // journal what was changed, and then apply them to the original document
            // ourselves. If, however, the driver applied the mods out of place, we ask it to
            // generate a new, modified document for us. In that case, the file manager will
            // take care of the journaling details for us.
            //
            // This code flow is admittedly odd. But, right now, journaling is baked into the
            // file manager. And if we aren't using the file manager, we have to do journaling
            // ourselves.
            bool objectWasChanged = false;
            BSONObj newObj;
            const char* source = NULL;
            mutablebson::DamageVector damages;
            bool inPlace = doc.getInPlaceUpdates(&damages, &source);
            if ( inPlace && !damages.empty() && !driver->modsAffectIndices() ) {
                nsDetails->paddingFits();

                // All updates were in place. Apply them through the durability writing pointer.
                mutablebson::DamageVector::const_iterator where = damages.begin();
                const mutablebson::DamageVector::const_iterator end = damages.end();
                for( ; where != end; ++where ) {
                    const char* sourcePtr = source + where->sourceOffset;
                    void* targetPtr = getDur().writingPtr(
                        const_cast<char*>(oldObj.objdata()) + where->targetOffset,
                        where->size);
                    std::memcpy(targetPtr, sourcePtr, where->size);
                }
                newObj = oldObj;
                debug.fastmod = true;

                objectWasChanged = true;
            }
            else {

                // The updates were not in place. Apply them through the file manager.
                newObj = doc.getObject();
                DiskLoc newLoc = theDataFileMgr.updateRecord(nsString.ns().c_str(),
                                                             nsDetails,
                                                             nsDetailsTransient,
                                                             record,
                                                             loc,
                                                             newObj.objdata(),
                                                             newObj.objsize(),
                                                             debug);

                // If we've moved this object to a new location, make sure we don't apply
                // that update again if our traversal picks up the object again.
                //
                // We also take note of the diskloc if the updates affect indices. Chances
                // are that we're traversing one of them and they may be multikey and
                // therefore produce duplicate disklocs.
                if ( newLoc != loc || driver->modsAffectIndices()  ) {
                    seenLocs.insert( newLoc );
                }

                objectWasChanged = true;
            }

            // Log Obj
            if ( request.shouldUpdateOpLog() ) {
                if ( driver->isDocReplacement() || !logObj.isEmpty() ) {
                    BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request.isMulti());
                    logOp("u", nsString.ns().c_str(), logObj , &idQuery,
                          NULL, request.isFromMigration(), &newObj);
                }
            }

            // If it was a no-op because the document didn't change, record that.
            if (!objectWasChanged)
                debug.nupdateNoops++;

            if (!request.isMulti()) {
                break;
            }

            // If we used the cursor mechanism that prepares an earlier-seen document for a
            // write, we need to tell that mechanism that the write is over.
            if ( touchPreviousDoc ) {
                cursor->recoverFromTouchingEarlierIterate();
            }

            getDur().commitIfNeeded();

        }

        // TODO: Can this be simplified?
        if ((numMatched > 0) || (numMatched == 0 && !request.isUpsert()) ) {
            debug.nupdated = numMatched;
            return UpdateResult( numMatched > 0 /* updated existing object(s) */,
                                 !driver->isDocReplacement() /* $mod or obj replacement */,
                                 numMatched /* # of documents updated, including no-ops */,
                                 BSONObj() );
        }

        //
        // We haven't found any existing document, so an insert is done
        // (upsert is true).
        //
        debug.upsert = true;

        // Since this is an insert (no docs found and upsert:true), we will be logging it
        // as an insert in the oplog. We don't need the driver's help to build the
        // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT.
        // Some mods may only work in that context (e.g. $setOnInsert).
        driver->setLogOp( false );
        driver->setContext( ModifierInterface::ExecInfo::INSERT_CONTEXT );

        BSONObj baseObj;

        // Reset the document we will be writing to
        doc.reset( baseObj, mutablebson::Document::kInPlaceDisabled );
        if ( request.getQuery().hasElement("_id") ) {
            uassertStatusOK(doc.root().appendElement(request.getQuery().getField("_id")));
        }


        // If this is a $mod-based update, we need to generate a document by examining the
        // query and the mods. Otherwise, we can use the object replacement sent by the user
        // update command that was parsed by the driver before.
        // In the following block we handle the query part, and then do the regular mods after.
        if ( *request.getUpdates().firstElementFieldName() == '$' ) {
            uassertStatusOK(UpdateDriver::createFromQuery(request.getQuery(), doc));
            debug.fastmodinsert = true;
        }

        // Apply the update modifications and then log the update as an insert manually.
        Status status = driver->update( StringData(), &doc, NULL /* no oplog record */);
        if ( !status.isOK() ) {
            uasserted( 16836, status.reason() );
        }

        BSONObj newObj = doc.getObject();
        theDataFileMgr.insertWithObjMod( nsString.ns().c_str(), newObj, false, request.isGod() );
        if ( request.shouldUpdateOpLog() ) {
            logOp( "i", nsString.ns().c_str(), newObj,
                   NULL, NULL, request.isFromMigration(), &newObj );
        }

        debug.nupdated = 1;
        return UpdateResult( false /* did not update an existing document */,
                             !driver->isDocReplacement() /* $mod or obj replacement? */,
                             1 /* count of updated documents */,
                             newObj /* object that was upserted */ );
    }
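
For reference, a minimal standalone sketch of the in-place "damages" application used in the update loop above: each damage entry is a byte-range copy from a scratch buffer onto the original object. The struct and function names here are illustrative stand-ins, not the actual mutablebson API, and the durability/journaling hook is omitted.

#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative stand-in for a damage entry (not the real mutablebson::DamageEvent).
struct Damage {
    std::uint32_t sourceOffset;  // where the new bytes live in the scratch buffer
    std::uint32_t targetOffset;  // where they go inside the original object
    std::uint32_t size;          // number of bytes to copy
};

// Apply each damage as a plain memcpy from 'source' into 'target', mirroring the
// in-place branch of the update loop above (minus getDur().writingPtr()).
inline void applyDamages(char* target, const char* source, const std::vector<Damage>& damages) {
    for (const Damage& d : damages) {
        std::memcpy(target + d.targetOffset, source + d.sourceOffset, d.size);
    }
}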
Example #2
    boost::shared_ptr<Runner> PipelineD::prepareCursorSource(
            OperationContext* txn,
            Collection* collection,
            const intrusive_ptr<Pipeline>& pPipeline,
            const intrusive_ptr<ExpressionContext>& pExpCtx) {
        // get the full "namespace" name
        const string& fullName = pExpCtx->ns.ns();
        pExpCtx->opCtx->lockState()->assertAtLeastReadLocked(fullName);

        // We will be modifying the source vector as we go
        Pipeline::SourceContainer& sources = pPipeline->sources;

        // Inject a MongodImplementation into the sources that need one.
        for (size_t i = 0; i < sources.size(); i++) {
            DocumentSourceNeedsMongod* needsMongod =
                dynamic_cast<DocumentSourceNeedsMongod*>(sources[i].get());
            if (needsMongod) {
                needsMongod->injectMongodInterface(
                    boost::make_shared<MongodImplementation>(pExpCtx));
            }
        }

        if (!sources.empty() && sources.front()->isValidInitialSource()) {
            if (dynamic_cast<DocumentSourceMergeCursors*>(sources.front().get())) {
                // Enable the hooks for setting up authentication on the subsequent internal
                // connections we are going to create. This would normally have been done
                // when SetShardVersion was called, but since SetShardVersion is never called
                // on secondaries, this is needed.
                ShardedConnectionInfo::addHook();
            }
            return boost::shared_ptr<Runner>(); // don't need a cursor
        }


        // Look for an initial match. This works whether we got an initial query or not.
        // If not, it results in a "{}" query, which will be what we want in that case.
        const BSONObj queryObj = pPipeline->getInitialQuery();
        if (!queryObj.isEmpty()) {
            // This will get built into the Cursor we'll create, so
            // remove the match from the pipeline.
            sources.pop_front();
        }

        // Find the set of fields in the source documents depended on by this pipeline.
        const DepsTracker deps = pPipeline->getDependencies(queryObj);

        // Pass the query an empty projection, since it is faster to use ParsedDeps::extractFields().
        // This will need to change to support covering indexes (SERVER-12015). There is an
        // exception for textScore since that can only be retrieved by a query projection.
        const BSONObj projectionForQuery = deps.needTextScore ? deps.toProjection() : BSONObj();

        /*
          Look for an initial sort; we'll try to add this to the
          Cursor we create.  If we're successful in doing that (further down),
          we'll remove the $sort from the pipeline, because the documents
          will already come sorted in the specified order as a result of the
          index scan.
        */
        intrusive_ptr<DocumentSourceSort> sortStage;
        BSONObj sortObj;
        if (!sources.empty()) {
            sortStage = dynamic_cast<DocumentSourceSort*>(sources.front().get());
            if (sortStage) {
                // build the sort key
                sortObj = sortStage->serializeSortKey(/*explain*/false).toBson();
            }
        }

        // Create the Runner.
        //
        // If we try to create a Runner that includes both the match and the
        // sort, and the two are incompatible wrt the available indexes, then
        // we don't get a Runner back.
        //
        // So we try to use both first.  If that fails, try again, without the
        // sort.
        //
        // If we don't have a sort, jump straight to just creating a Runner
        // without the sort.
        //
        // If we are able to incorporate the sort into the Runner, remove it
        // from the head of the pipeline.
        //
        // LATER - we should be able to find this out before we create the
        // cursor.  Either way, we can then apply other optimizations there
        // are tickets for, such as SERVER-4507.
        const size_t runnerOptions = QueryPlannerParams::DEFAULT
                                   | QueryPlannerParams::INCLUDE_SHARD_FILTER
                                   | QueryPlannerParams::NO_BLOCKING_SORT
                                   ;
        boost::shared_ptr<Runner> runner;
        bool sortInRunner = false;

        const WhereCallbackReal whereCallback(pExpCtx->opCtx, pExpCtx->ns.db());

        if (sortStage) {
            CanonicalQuery* cq;
            Status status =
                CanonicalQuery::canonicalize(pExpCtx->ns,
                                             queryObj,
                                             sortObj,
                                             projectionForQuery,
                                             &cq,
                                             whereCallback);
            Runner* rawRunner;
            if (status.isOK() && getRunner(txn, collection, cq, &rawRunner, runnerOptions).isOK()) {
                // success: The Runner will handle sorting for us using an index.
                runner.reset(rawRunner);
                sortInRunner = true;

                sources.pop_front();
                if (sortStage->getLimitSrc()) {
                    // need to reinsert coalesced $limit after removing $sort
                    sources.push_front(sortStage->getLimitSrc());
                }
            }
        }

        if (!runner.get()) {
            const BSONObj noSort;
            CanonicalQuery* cq;
            uassertStatusOK(
                CanonicalQuery::canonicalize(pExpCtx->ns,
                                             queryObj,
                                             noSort,
                                             projectionForQuery,
                                             &cq,
                                             whereCallback));

            Runner* rawRunner;
            uassertStatusOK(getRunner(txn, collection, cq, &rawRunner, runnerOptions));
            runner.reset(rawRunner);
        }


        // DocumentSourceCursor expects a yielding Runner that has had its state saved.
        runner->saveState();

        // Put the Runner into a DocumentSourceCursor and add it to the front of the pipeline.
        intrusive_ptr<DocumentSourceCursor> pSource =
            DocumentSourceCursor::create(fullName, runner, pExpCtx);

        // Note the query, sort, and projection for explain.
        pSource->setQuery(queryObj);
        if (sortInRunner)
            pSource->setSort(sortObj);

        pSource->setProjection(deps.toProjection(), deps.toParsedDeps());

        while (!sources.empty() && pSource->coalesce(sources.front())) {
            sources.pop_front();
        }

        pPipeline->addInitialSource(pSource);

        return runner;
    }
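
A distilled sketch of the "try the sort, then fall back" planning flow above. The planner callback and the Plan type are hypothetical; the real code goes through CanonicalQuery::canonicalize() and getRunner().

#include <functional>
#include <memory>

struct Plan {};  // stand-in for a Runner/query plan

// Try to build a plan that also satisfies the $sort; if that fails, build one
// without it and leave the $sort stage in the pipeline. 'build' is a hypothetical
// planner callback (true = include the sort key).
std::unique_ptr<Plan> planWithSortFallback(bool haveSort,
                                           const std::function<std::unique_ptr<Plan>(bool)>& build,
                                           bool* sortInPlan) {
    *sortInPlan = false;
    if (haveSort) {
        if (auto plan = build(/*includeSort=*/true)) {
            *sortInPlan = true;  // caller pops the $sort (re-adding any coalesced $limit)
            return plan;
        }
    }
    return build(/*includeSort=*/false);
}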
Example #3
void Strategy::getMore(OperationContext* txn, Request& r) {
    Timer getMoreTimer;

    const char* ns = r.getns();
    const int ntoreturn = r.d().pullInt();
    const long long id = r.d().pullInt64();

    // TODO:  Handle stale config exceptions here from coll being dropped or sharded during op
    // for now this has the same semantics as a legacy request
    const NamespaceString nss(ns);
    auto statusGetDb = grid.catalogCache()->getDatabase(txn, nss.db().toString());
    if (statusGetDb == ErrorCodes::DatabaseNotFound) {
        cursorCache.remove(id);
        replyToQuery(ResultFlag_CursorNotFound, r.p(), r.m(), 0, 0, 0);
        return;
    }

    uassertStatusOK(statusGetDb);

    shared_ptr<DBConfig> config = statusGetDb.getValue();

    ShardPtr primary;
    ChunkManagerPtr info;
    config->getChunkManagerOrPrimary(ns, info, primary);

    //
    // TODO: Cleanup cursor cache, consolidate into single codepath
    //
    const string host = cursorCache.getRef(id);
    ShardedClientCursorPtr cursor = cursorCache.get(id);
    int cursorMaxTimeMS = cursorCache.getMaxTimeMS(id);

    // Cursor ids should not overlap between sharded and unsharded cursors
    massert(17012,
            str::stream() << "duplicate sharded and unsharded cursor id " << id << " detected for "
                          << ns << ", duplicated on host " << host,
            NULL == cursorCache.get(id).get() || host.empty());

    ClientBasic* client = ClientBasic::getCurrent();
    NamespaceString nsString(ns);
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForGetMore(nsString, id);
    audit::logGetMoreAuthzCheck(client, nsString, id, status.code());
    uassertStatusOK(status);

    if (!host.empty()) {
        LOG(3) << "single getmore: " << ns;

        // We use ScopedDbConnection because we don't care about config versions;
        // not deleting data is handled elsewhere,
        // and we don't want to call setShardVersion.
        ScopedDbConnection conn(host);

        Message response;
        bool ok = conn->callRead(r.m(), response);
        uassert(10204, "dbgrid: getmore: error calling db", ok);

        bool hasMore = (response.singleData().getCursor() != 0);

        if (!hasMore) {
            cursorCache.removeRef(id);
        }

        r.reply(response, "" /*conn->getServerAddress() */);
        conn.done();
        return;
    } else if (cursor) {
        if (cursorMaxTimeMS == kMaxTimeCursorTimeLimitExpired) {
            cursorCache.remove(id);
            uasserted(ErrorCodes::ExceededTimeLimit, "operation exceeded time limit");
        }

        // TODO: Try to match logic of mongod, where on subsequent getMore() we pull lots more data?
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cursor->getTotalSent();
        bool hasMore = cursor->sendNextBatch(ntoreturn, buffer, docCount);

        if (hasMore) {
            // still more data
            cursor->accessed();

            if (cursorMaxTimeMS != kMaxTimeCursorNoTimeLimit) {
                // Update remaining amount of time in cursor cache.
                int cursorLeftoverMillis = cursorMaxTimeMS - getMoreTimer.millis();
                if (cursorLeftoverMillis <= 0) {
                    cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
                }
                cursorCache.updateMaxTimeMS(id, cursorLeftoverMillis);
            }
        } else {
            // we've exhausted the cursor
            cursorCache.remove(id);
        }

        replyToQuery(0,
                     r.p(),
                     r.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cursor->getId() : 0);
        return;
    } else {
        LOG(3) << "could not find cursor " << id << " in cache for " << ns;

        replyToQuery(ResultFlag_CursorNotFound, r.p(), r.m(), 0, 0, 0);
        return;
    }
}
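
A small sketch of the cursor time-budget bookkeeping in the sharded getMore above: subtract the time spent on this batch from the cursor's remaining maxTimeMS, and mark the cursor expired when the budget runs out so the next getMore fails fast. The sentinel values below are assumed stand-ins for kMaxTimeCursorNoTimeLimit and kMaxTimeCursorTimeLimitExpired.

// Hypothetical sentinels standing in for the kMaxTimeCursor* constants used above.
const int kNoTimeLimit = -1;
const int kTimeLimitExpired = -2;

int remainingCursorBudgetMillis(int maxTimeMS, int elapsedMillis) {
    if (maxTimeMS == kNoTimeLimit)
        return kNoTimeLimit;                              // no limit configured; nothing to track
    const int leftover = maxTimeMS - elapsedMillis;
    return leftover > 0 ? leftover : kTimeLimitExpired;   // an expired budget fails the next getMore
}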
Example #4
std::unique_ptr<RequestBuilderInterface> makeRequestBuilder(ProtocolSet clientProtos,
        ProtocolSet serverProtos) {
    return makeRequestBuilder(uassertStatusOK(negotiate(clientProtos, serverProtos)));
}
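
Example #4 just composes protocol negotiation with uassertStatusOK(). Below is a hedged sketch of what a negotiation like that typically does: intersect two bitmask protocol sets and prefer the newer protocol. This is an assumption about negotiate()'s behavior, not its actual implementation, and the real code reports failure through a Status rather than an exception.

#include <cstdint>
#include <stdexcept>

using ProtoSet = std::uint64_t;
enum : ProtoSet { kOpQuery = 1 << 0, kOpCommand = 1 << 1 };  // illustrative protocol bits

ProtoSet negotiateSketch(ProtoSet client, ProtoSet server) {
    const ProtoSet common = client & server;     // protocols both sides speak
    if (common & kOpCommand) return kOpCommand;  // prefer the newer protocol
    if (common & kOpQuery) return kOpQuery;
    throw std::runtime_error("no common wire protocol");
}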
Example #5
    void ensureIdIndexForNewNs( Collection* collection ) {
        if ( collection->ns().isSystem() && !legalClientSystemNS( collection->ns().ns(), false ) )
            return;

        uassertStatusOK( collection->getIndexCatalog()->ensureHaveIdIndex() );
    }
Example #6
Status ClusterAggregate::runAggregate(OperationContext* txn,
                                      const Namespaces& namespaces,
                                      BSONObj cmdObj,
                                      int options,
                                      BSONObjBuilder* result) {
    auto scopedShardDbStatus = ScopedShardDatabase::getExisting(txn, namespaces.executionNss.db());
    if (!scopedShardDbStatus.isOK()) {
        appendEmptyResultSet(
            *result, scopedShardDbStatus.getStatus(), namespaces.requestedNss.ns());
        return Status::OK();
    }

    auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj);
    if (!request.isOK()) {
        return request.getStatus();
    }

    const auto conf = scopedShardDbStatus.getValue().db();

    // Determine the appropriate collation and 'resolve' involved namespaces to make the
    // ExpressionContext.

    // We won't try to execute anything on a mongos, but we still have to populate this map so that
    // any $lookups, etc. will be able to have a resolved view definition. It's okay that this is
    // incorrect; we will repopulate the real resolved namespace map on the mongod. Note that we
    // need to check if any involved collections are sharded before forwarding an aggregation
    // command on an unsharded collection.
    StringMap<ExpressionContext::ResolvedNamespace> resolvedNamespaces;
    LiteParsedPipeline liteParsedPipeline(request.getValue());
    for (auto&& ns : liteParsedPipeline.getInvolvedNamespaces()) {
        uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns()));
        resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}};
    }

    if (!conf->isSharded(namespaces.executionNss.ns())) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns());

    std::unique_ptr<CollatorInterface> collation;
    if (!request.getValue().getCollation().isEmpty()) {
        collation = uassertStatusOK(CollatorFactoryInterface::get(txn->getServiceContext())
                                        ->makeFromBSON(request.getValue().getCollation()));
    } else if (chunkMgr->getDefaultCollator()) {
        collation = chunkMgr->getDefaultCollator()->clone();
    }

    boost::intrusive_ptr<ExpressionContext> mergeCtx = new ExpressionContext(
        txn, request.getValue(), std::move(collation), std::move(resolvedNamespaces));
    mergeCtx->inRouter = true;
    // explicitly *not* setting mergeCtx->tempDir

    // Parse and optimize the pipeline specification.
    auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();
    }

    pipeline.getValue()->optimizePipeline();

    // If the first $match stage is an exact match on the shard key (with a simple collation or
    // no string matching), we only have to send it to one shard, so send the command to that
    // shard.
    BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery();
    BSONObj shardKeyMatches;
    shardKeyMatches = uassertStatusOK(
        chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
    bool singleShard = false;
    if (!shardKeyMatches.isEmpty()) {
        auto chunk = chunkMgr->findIntersectingChunk(
            txn, shardKeyMatches, request.getValue().getCollation());
        if (chunk.isOK()) {
            singleShard = true;
        }
    }

    // We don't need to split the pipeline if the first $match is an exact match on the shard
    // key, unless there is a stage that needs to be run on the primary shard.
    const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger();
    const bool needSplit = !singleShard || needPrimaryShardMerger;

    // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true,
    // 'pipeline' will become the merger side.
    boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? pipeline.getValue()->splitForSharded()
                                                           : pipeline.getValue());

    // Create the command for the shards. The 'fromRouter' field tells the shards to produce
    // output to be merged.
    MutableDocument commandBuilder(request.getValue().serializeToCommandObj());
    commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize());
    if (needSplit) {
        commandBuilder[AggregationRequest::kFromRouterName] = Value(true);
        commandBuilder[AggregationRequest::kCursorName] =
            Value(DOC(AggregationRequest::kBatchSizeName << 0));
    }

    // These fields are not part of the AggregationRequest since they are not handled by the
    // aggregation subsystem, so we serialize them separately.
    const std::initializer_list<StringData> fieldsToPropagateToShards = {
        "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS,
    };
    for (auto&& field : fieldsToPropagateToShards) {
        commandBuilder[field] = Value(cmdObj[field]);
    }

    BSONObj shardedCommand = commandBuilder.freeze().toBson();
    BSONObj shardQuery = shardPipeline->getInitialQuery();

    // Run the command on the shards
    // TODO need to make sure cursors are killed if a retry is needed
    std::vector<Strategy::CommandResult> shardResults;
    Strategy::commandOp(txn,
                        namespaces.executionNss.db().toString(),
                        shardedCommand,
                        options,
                        namespaces.executionNss.ns(),
                        shardQuery,
                        request.getValue().getCollation(),
                        &shardResults);

    if (mergeCtx->isExplain) {
        // This must be checked before we start modifying result.
        uassertAllShardsSupportExplain(shardResults);

        if (needSplit) {
            *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline"
                    << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart"
                                        << pipeline.getValue()->writeExplainOps());
        } else {
            *result << "splitPipeline" << BSONNULL;
        }

        BSONObjBuilder shardExplains(result->subobjStart("shards"));
        for (size_t i = 0; i < shardResults.size(); i++) {
            shardExplains.append(shardResults[i].shardTargetId,
                                 BSON("host" << shardResults[i].target.toString() << "stages"
                                             << shardResults[i].result["stages"]));
        }

        return Status::OK();
    }

    if (!needSplit) {
        invariant(shardResults.size() == 1);
        invariant(shardResults[0].target.getServers().size() == 1);
        auto executorPool = Grid::get(txn)->getExecutorPool();
        const BSONObj reply =
            uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0],
                                                shardResults[0].result,
                                                namespaces.requestedNss,
                                                executorPool->getArbitraryExecutor(),
                                                Grid::get(txn)->getCursorManager()));
        result->appendElements(reply);
        return getStatusFromCommandResult(reply);
    }

    pipeline.getValue()->addInitialSource(
        DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx));

    MutableDocument mergeCmd(request.getValue().serializeToCommandObj());
    mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize());
    mergeCmd["cursor"] = Value(cmdObj["cursor"]);

    if (cmdObj.hasField("$queryOptions")) {
        mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]);
    }

    if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) {
        mergeCmd[QueryRequest::cmdOptionMaxTimeMS] =
            Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]);
    }

    mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"]));
    mergeCmd.setField("readConcern", Value(cmdObj["readConcern"]));

    // If the user didn't specify a collation already, make sure there's a collation attached to
    // the merge command, since the merging shard may not have the collection metadata.
    if (mergeCmd.peek()["collation"].missing()) {
        mergeCmd.setField("collation",
                          mergeCtx->getCollator()
                              ? Value(mergeCtx->getCollator()->getSpec().toBSON())
                              : Value(Document{CollationSpec::kSimpleSpec}));
    }

    std::string outputNsOrEmpty;
    if (DocumentSourceOut* out =
            dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) {
        outputNsOrEmpty = out->getOutputNs().ns();
    }

    // Run the merging command on a random shard, unless a stage needs the primary shard. We need
    // to use ShardConnection so that the merging mongod is sent the config servers on connection
    // init.
    auto& prng = txn->getClient()->getPrng();
    const auto& mergingShardId =
        (needPrimaryShardMerger || internalQueryAlwaysMergeOnPrimaryShard.load())
        ? conf->getPrimaryId()
        : shardResults[prng.nextInt32(shardResults.size())].shardTargetId;
    const auto mergingShard =
        uassertStatusOK(Grid::get(txn)->shardRegistry()->getShard(txn, mergingShardId));

    ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty);
    BSONObj mergedResults =
        aggRunCommand(txn, conn.get(), namespaces, mergeCmd.freeze().toBson(), options);
    conn.done();

    if (auto wcErrorElem = mergedResults["writeConcernError"]) {
        appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result);
    }

    // Copy the output from the merging (primary) shard to the output object from our command.
    // This also propagates errmsg and code if ok == false.
    result->appendElementsUnique(mergedResults);

    return getStatusFromCommandResult(result->asTempObj());
}
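
A distilled sketch of the merging-shard selection near the end of Example #6: merge on the primary shard when a stage requires it, otherwise on a random shard that returned results. std::rand() stands in for the client PRNG used in the real code, and the server parameter that forces primary-shard merging is omitted.

#include <cstdlib>
#include <string>
#include <vector>

std::string pickMergingShard(bool needPrimaryShardMerger,
                             const std::string& primaryShardId,
                             const std::vector<std::string>& resultShardIds) {
    if (needPrimaryShardMerger || resultShardIds.empty())
        return primaryShardId;
    return resultShardIds[std::rand() % resultShardIds.size()];
}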
Example #7
    UpdateResult update(
            OperationContext* txn,
            Database* db,
            const UpdateRequest& request,
            OpDebug* opDebug,
            UpdateDriver* driver,
            CanonicalQuery* cq) {

        LOG(3) << "processing update : " << request;

        std::auto_ptr<CanonicalQuery> cqHolder(cq);
        const NamespaceString& nsString = request.getNamespaceString();
        UpdateLifecycle* lifecycle = request.getLifecycle();

        Collection* collection = db->getCollection(nsString.ns());

        validateUpdate(nsString.ns().c_str(), request.getUpdates(), request.getQuery());


        // TODO: This seems a bit circuitous.
        opDebug->updateobj = request.getUpdates();

        if (lifecycle) {
            lifecycle->setCollection(collection);
            driver->refreshIndexKeys(lifecycle->getIndexKeys());
        }

        Runner* rawRunner;
        Status status = cq ?
            getRunner(collection, cqHolder.release(), &rawRunner) :
            getRunner(collection, nsString.ns(), request.getQuery(), &rawRunner, &cq);
        uassert(17243,
                "could not get runner " + request.getQuery().toString() + "; " + causedBy(status),
                status.isOK());

        // Create the runner and setup all deps.
        auto_ptr<Runner> runner(rawRunner);

        // Register Runner with ClientCursor
        const ScopedRunnerRegistration safety(runner.get());

        //
        // We'll start assuming we have one or more documents for this update. (Otherwise,
        // we'll fall back to the insert case (if upsert is true).)
        //

        // We are an update until we fall into the insert case below.
        driver->setContext(ModifierInterface::ExecInfo::UPDATE_CONTEXT);

        int numMatched = 0;

        // If the update was in-place, we may see it again.  This only matters if we're doing
        // a multi-update; if we're not doing a multi-update we stop after one update and we
        // won't see any more docs.
        //
        // For example: If we're scanning an index {x:1} and performing {$inc:{x:5}}, we'll keep
        // moving the document forward and it will continue to reappear in our index scan.
        // Unless the index is multikey, the underlying query machinery won't de-dup.
        //
        // If the update wasn't in-place we may see it again.  Our query may return the new
        // document and we wouldn't want to update that.
        //
        // So, no matter what, we keep track of where the doc wound up.
        typedef unordered_set<DiskLoc, DiskLoc::Hasher> DiskLocSet;
        const scoped_ptr<DiskLocSet> updatedLocs(request.isMulti() ? new DiskLocSet : NULL);

        // Reset these counters on each call. We might re-enter this function to retry this
        // update if we throw a page fault exception below, and we rely on these counters
        // reflecting only the actions taken locally. In particular, we must have the no-op
        // counter reset so that we can meaningfully compare it with numMatched above.
        opDebug->nscanned = 0;
        opDebug->nscannedObjects = 0;
        opDebug->nModified = 0;

        // Get the cached document from the update driver.
        mutablebson::Document& doc = driver->getDocument();
        mutablebson::DamageVector damages;

        // Used during iteration of docs
        BSONObj oldObj;

        // Get first doc, and location
        Runner::RunnerState state = Runner::RUNNER_ADVANCED;

        uassert(ErrorCodes::NotMaster,
                mongoutils::str::stream() << "Not primary while updating " << nsString.ns(),
                !request.shouldCallLogOp() || isMasterNs(nsString.ns().c_str()));

        while (true) {
            // Get next doc, and location
            DiskLoc loc;
            state = runner->getNext(&oldObj, &loc);

            if (state != Runner::RUNNER_ADVANCED) {
                if (state == Runner::RUNNER_EOF) {
                    // We have reached the logical end of the loop, so do yielding recovery
                    break;
                }
                else {
                    uassertStatusOK(Status(ErrorCodes::InternalError,
                                           str::stream() << " Update query failed -- "
                                                         << Runner::statestr(state)));
                }
            }

            // We fill this with the new locs of moved docs so we don't double-update.
            if (updatedLocs && updatedLocs->count(loc) > 0) {
                continue;
            }

            // We count how many documents we scanned even though we may skip those that are
            // deemed duplicated. The final 'numMatched' and 'nscanned' numbers may differ for
            // that reason.
            // TODO: Do we want to pull this out of the underlying query plan?
            opDebug->nscanned++;

            // Found a matching document
            opDebug->nscannedObjects++;
            numMatched++;

            // Ask the driver to apply the mods. It may be that the driver can apply those "in
            // place", that is, some values of the old document just get adjusted without any
            // change to the binary layout on the bson layer. It may be that a whole new
            // document is needed to accommodate the new bson layout of the resulting document.
            doc.reset(oldObj, mutablebson::Document::kInPlaceEnabled);
            BSONObj logObj;


            FieldRefSet updatedFields;

            Status status = Status::OK();
            if (!driver->needMatchDetails()) {
                // If we don't need match details, avoid doing the rematch
                status = driver->update(StringData(), &doc, &logObj, &updatedFields);
            }
            else {
                // If there was a matched field, obtain it.
                MatchDetails matchDetails;
                matchDetails.requestElemMatchKey();

                dassert(cq);
                verify(cq->root()->matchesBSON(oldObj, &matchDetails));

                string matchedField;
                if (matchDetails.hasElemMatchKey())
                    matchedField = matchDetails.elemMatchKey();

                // TODO: Right now, each mod checks in 'prepare' that if it needs positional
                // data, that a non-empty StringData() was provided. In principle, we could do
                // that check here in an else clause to the above conditional and remove the
                // checks from the mods.

                status = driver->update(matchedField, &doc, &logObj, &updatedFields);
            }

            if (!status.isOK()) {
                uasserted(16837, status.reason());
            }

            // Ensure _id exists and is first
            uassertStatusOK(ensureIdAndFirst(doc));

            // If the driver applied the mods in place, we can ask the mutable for what
            // changed. We call those changes "damages". :) We use the damages to inform the
            // journal what was changed, and then apply them to the original document
            // ourselves. If, however, the driver applied the mods out of place, we ask it to
            // generate a new, modified document for us. In that case, the file manager will
            // take care of the journaling details for us.
            //
            // This code flow is admittedly odd. But, right now, journaling is baked into the
            // file manager. And if we aren't using the file manager, we have to do journaling
            // ourselves.
            bool docWasModified = false;
            BSONObj newObj;
            const char* source = NULL;
            bool inPlace = doc.getInPlaceUpdates(&damages, &source);

            // If something changed in the document, verify that no immutable fields were changed
            // and data is valid for storage.
            if ((!inPlace || !damages.empty()) ) {
                if (!(request.isFromReplication() || request.isFromMigration())) {
                    const std::vector<FieldRef*>* immutableFields = NULL;
                    if (lifecycle)
                        immutableFields = lifecycle->getImmutableFields();

                    uassertStatusOK(validate(oldObj,
                                             updatedFields,
                                             doc,
                                             immutableFields,
                                             driver->modOptions()) );
                }
            }

            // Save state before making changes
            runner->saveState();

            if (inPlace && !driver->modsAffectIndices()) {

                // If a set of modifiers were all no-ops, we are still 'in place', but there is
                // no work to do, in which case we want to consider the object unchanged.
                if (!damages.empty() ) {
                    collection->updateDocumentWithDamages( txn, loc, source, damages );
                    docWasModified = true;
                    opDebug->fastmod = true;
                }

                newObj = oldObj;
            }
            else {

                // The updates were not in place. Apply them through the file manager.
                newObj = doc.getObject();
                uassert(17419,
                        str::stream() << "Resulting document after update is larger than "
                                      << BSONObjMaxUserSize,
                        newObj.objsize() <= BSONObjMaxUserSize);
                StatusWith<DiskLoc> res = collection->updateDocument(txn,
                                                                     loc,
                                                                     newObj,
                                                                     true,
                                                                     opDebug);
                uassertStatusOK(res.getStatus());
                DiskLoc newLoc = res.getValue();
                docWasModified = true;

                // If the document moved, we might see it again in a collection scan (maybe it's
                // a document after our current document).
                //
                // If the document is indexed and the mod changes an indexed value, we might see it
                // again.  For an example, see the comment above near declaration of updatedLocs.
                if (updatedLocs && (newLoc != loc || driver->modsAffectIndices())) {
                    updatedLocs->insert(newLoc);
                }
            }

            // Restore state after modification
            uassert(17278,
                    "Update could not restore runner state after updating a document.",
                    runner->restoreState());

            // Call logOp if requested.
            if (request.shouldCallLogOp() && !logObj.isEmpty()) {
                BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request.isMulti());
                logOp(txn, "u", nsString.ns().c_str(), logObj , &idQuery,
                      NULL, request.isFromMigration());
            }

            // Only record doc modifications if they wrote (exclude no-ops)
            if (docWasModified)
                opDebug->nModified++;

            if (!request.isMulti()) {
                break;
            }

            // Opportunity for journaling to write during the update.
            txn->recoveryUnit()->commitIfNeeded();
        }

        // TODO: Can this be simplified?
        if ((numMatched > 0) || (numMatched == 0 && !request.isUpsert()) ) {
            opDebug->nMatched = numMatched;
            return UpdateResult(numMatched > 0 /* updated existing object(s) */,
                                !driver->isDocReplacement() /* $mod or obj replacement */,
                                opDebug->nModified /* number of modified docs, excluding no-ops */,
                                numMatched /* # of docs matched/updated, including no-ops */,
                                BSONObj());
        }

        //
        // We haven't found any existing document, so an insert is done
        // (upsert is true).
        //
        opDebug->upsert = true;

        // Since this is an insert (no docs found and upsert:true), we will be logging it
        // as an insert in the oplog. We don't need the driver's help to build the
        // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT.
        // Some mods may only work in that context (e.g. $setOnInsert).
        driver->setLogOp(false);
        driver->setContext(ModifierInterface::ExecInfo::INSERT_CONTEXT);

        // Reset the document we will be writing to
        doc.reset();

        // This remains the empty object in the case of an object replacement, but in the case
        // of an upsert where we are creating a base object from the query and applying mods,
        // we capture the query as the original so that we can detect immutable field mutations.
        BSONObj original = BSONObj();

        // Calling createFromQuery will populate the 'doc' with fields from the query, which
        // creates the base of the update for the inserted doc (because upsert was true)
        if (cq) {
            uassertStatusOK(driver->populateDocumentWithQueryFields(cq, doc));
            // Validate the base doc, as taken from the query -- no fields means validate all.
            FieldRefSet noFields;
            uassertStatusOK(validate(BSONObj(), noFields, doc, NULL, driver->modOptions()));
            if (!driver->isDocReplacement()) {
                opDebug->fastmodinsert = true;
                // We need all the fields from the query to compare against for validation below.
                original = doc.getObject();
            }
            else {
                original = request.getQuery();
            }
        }
        else {
            fassert(17354, CanonicalQuery::isSimpleIdQuery(request.getQuery()));
            BSONElement idElt = request.getQuery()["_id"];
            original = idElt.wrap();
            fassert(17352, doc.root().appendElement(idElt));
        }

        // Apply the update modifications and then log the update as an insert manually.
        FieldRefSet updatedFields;
        status = driver->update(StringData(), &doc, NULL, &updatedFields);
        if (!status.isOK()) {
            uasserted(16836, status.reason());
        }

        // Ensure _id exists and is first
        uassertStatusOK(ensureIdAndFirst(doc));

        // Validate that the object replacement or modifiers resulted in a document
        // that contains all the immutable keys and can be stored.
        if (!(request.isFromReplication() || request.isFromMigration())){
            const std::vector<FieldRef*>* immutableFields = NULL;
            if (lifecycle)
                immutableFields = lifecycle->getImmutableFields();

            // This will only validate the modified fields if not a replacement.
            uassertStatusOK(validate(original,
                                     updatedFields,
                                     doc,
                                     immutableFields,
                                     driver->modOptions()) );
        }

        // Only create the collection if the doc will be inserted.
        if (!collection) {
            collection = db->getCollection(request.getNamespaceString().ns());
            if (!collection) {
                collection = db->createCollection(txn, request.getNamespaceString().ns());
            }
        }

        // Insert the doc
        BSONObj newObj = doc.getObject();
        uassert(17420,
                str::stream() << "Document to upsert is larger than " << BSONObjMaxUserSize,
                newObj.objsize() <= BSONObjMaxUserSize);

        StatusWith<DiskLoc> newLoc = collection->insertDocument(txn,
                                                                newObj,
                                                                !request.isGod() /*enforceQuota*/);
        uassertStatusOK(newLoc.getStatus());
        if (request.shouldCallLogOp()) {
            logOp(txn, "i", nsString.ns().c_str(), newObj,
                   NULL, NULL, request.isFromMigration());
        }

        opDebug->nMatched = 1;
        return UpdateResult(false /* did not update an existing document */,
                            !driver->isDocReplacement() /* $mod or obj replacement? */,
                            1 /* docs written*/,
                            1 /* count of updated documents */,
                            newObj /* object that was upserted */ );
    }
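
A minimal sketch of the multi-update dedup bookkeeping that both update() variants above rely on: remember the new location of every document whose update moved it (or touched an index being scanned), and skip it if the scan returns it again. DiskLoc is approximated by a 64-bit id here; the real code uses a hashed set of DiskLoc values.

#include <cstdint>
#include <unordered_set>

using LocId = std::uint64_t;  // stand-in for DiskLoc

struct MultiUpdateDedup {
    std::unordered_set<LocId> updatedLocs;

    // Skip documents we already updated and that the scan may return again.
    bool alreadyUpdated(LocId loc) const { return updatedLocs.count(loc) > 0; }

    // Record the (possibly new) location of a document whose update moved it
    // or modified an indexed field.
    void noteUpdated(LocId newLoc) { updatedLocs.insert(newLoc); }
};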
Example #8
void ServiceContextMongoD::initializeGlobalStorageEngine() {
    // This should be set once.
    invariant(!_storageEngine);

    // We should have a _lockFile or be in read-only mode. Confusingly, we can still have a lockFile
    // if we are in read-only mode. This can happen if the server is started in read-only mode on a
    // writable dbpath.
    invariant(_lockFile || storageGlobalParams.readOnly);

    const std::string dbpath = storageGlobalParams.dbpath;
    if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) {
        if (storageGlobalParams.engineSetByUser) {
            // Verify that the name of the user-supplied storage engine matches the contents of
            // the metadata file.
            const StorageEngine::Factory* factory =
                mapFindWithDefault(_storageFactories,
                                   storageGlobalParams.engine,
                                   static_cast<const StorageEngine::Factory*>(nullptr));

            if (factory) {
                uassert(28662,
                        str::stream()
                            << "Cannot start server. Detected data files in " << dbpath
                            << " created by"
                            << " the '" << *existingStorageEngine << "' storage engine, but the"
                            << " specified storage engine was '" << factory->getCanonicalName()
                            << "'.",
                        factory->getCanonicalName() == *existingStorageEngine);
            }
        } else {
            // Otherwise set the active storage engine as the contents of the metadata file.
            log() << "Detected data files in " << dbpath << " created by the '"
                  << *existingStorageEngine << "' storage engine, so setting the active"
                  << " storage engine to '" << *existingStorageEngine << "'.";
            storageGlobalParams.engine = *existingStorageEngine;
        }
    } else if (!storageGlobalParams.engineSetByUser) {
        // Ensure the default storage engine is available with this build of mongod.
        uassert(28663,
                str::stream()
                    << "Cannot start server. The default storage engine '"
                    << storageGlobalParams.engine
                    << "' is not available with this build of mongod. Please specify a different"
                    << " storage engine explicitly, e.g. --storageEngine=mmapv1.",
                isRegisteredStorageEngine(storageGlobalParams.engine));
    }

    const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine];

    uassert(18656,
            str::stream() << "Cannot start server with an unknown storage engine: "
                          << storageGlobalParams.engine,
            factory);

    if (storageGlobalParams.readOnly) {
        uassert(34368,
                str::stream()
                    << "Server was started in read-only mode, but the configured storage engine, "
                    << storageGlobalParams.engine << ", does not support read-only operation",
                factory->supportsReadOnly());
    }

    std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath);

    if (storageGlobalParams.readOnly) {
        uassert(34415,
                "Server was started in read-only mode, but the storage metadata file was not"
                " found.",
                metadata.get());
    }

    // Validate options in metadata against current startup options.
    if (metadata.get()) {
        uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));
    }

    ScopeGuard guard = MakeGuard([&] {
        if (_lockFile) {
            _lockFile->close();
        }
    });

    _storageEngine = factory->create(storageGlobalParams, _lockFile.get());
    _storageEngine->finishInit();

    if (_lockFile) {
        uassertStatusOK(_lockFile->writePid());
    }

    // Write a new metadata file if it is not present.
    if (!metadata.get()) {
        invariant(!storageGlobalParams.readOnly);
        metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));
        metadata->setStorageEngine(factory->getCanonicalName().toString());
        metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams));
        uassertStatusOK(metadata->write());
    }

    guard.Dismiss();

    _supportsDocLocking = _storageEngine->supportsDocLocking();
}
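The precedence enforced above (existing data files determine the engine, a user-supplied --storageEngine must agree with them, otherwise the built-in default is used) can be summarized in a self-contained sketch. This is an illustration in plain C++ only, not the real ServiceContextMongoD interface:

#include <optional>
#include <stdexcept>
#include <string>

// Simplified illustration of the precedence rules enforced in initializeGlobalStorageEngine().
std::string chooseStorageEngine(const std::optional<std::string>& engineFromMetadata,
                                const std::optional<std::string>& engineFromUser,
                                const std::string& defaultEngine) {
    if (engineFromMetadata) {
        // Existing data files win; a conflicting user choice is a startup error.
        if (engineFromUser && *engineFromUser != *engineFromMetadata)
            throw std::runtime_error("data files were created by '" + *engineFromMetadata +
                                     "' but '" + *engineFromUser + "' was requested");
        return *engineFromMetadata;
    }
    // Fresh dbpath: honor the user's choice, otherwise fall back to the default.
    return engineFromUser ? *engineFromUser : defaultEngine;
}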
Example #9
void readRequestMetadata(OperationContext* opCtx, const BSONObj& metadataObj) {
    BSONElement readPreferenceElem;
    BSONElement auditElem;
    BSONElement configSvrElem;
    BSONElement trackingElem;
    BSONElement clientElem;
    BSONElement logicalTimeElem;

    for (const auto& metadataElem : metadataObj) {
        auto fieldName = metadataElem.fieldNameStringData();
        if (fieldName == "$readPreference") {
            readPreferenceElem = metadataElem;
        } else if (fieldName == AuditMetadata::fieldName()) {
            auditElem = metadataElem;
        } else if (fieldName == ConfigServerMetadata::fieldName()) {
            configSvrElem = metadataElem;
        } else if (fieldName == ClientMetadata::fieldName()) {
            clientElem = metadataElem;
        } else if (fieldName == TrackingMetadata::fieldName()) {
            trackingElem = metadataElem;
        } else if (fieldName == LogicalTimeMetadata::fieldName()) {
            logicalTimeElem = metadataElem;
        }
    }

    if (readPreferenceElem) {
        ReadPreferenceSetting::get(opCtx) =
            uassertStatusOK(ReadPreferenceSetting::fromInnerBSON(readPreferenceElem));
    }

    AuditMetadata::get(opCtx) = uassertStatusOK(AuditMetadata::readFromMetadata(auditElem));

    uassertStatusOK(ClientMetadataIsMasterState::readFromMetadata(opCtx, clientElem));

    ConfigServerMetadata::get(opCtx) =
        uassertStatusOK(ConfigServerMetadata::readFromMetadata(configSvrElem));

    TrackingMetadata::get(opCtx) =
        uassertStatusOK(TrackingMetadata::readFromMetadata(trackingElem));

    auto logicalClock = LogicalClock::get(opCtx);
    if (logicalClock) {
        auto logicalTimeMetadata =
            uassertStatusOK(rpc::LogicalTimeMetadata::readFromMetadata(logicalTimeElem));

        auto& signedTime = logicalTimeMetadata.getSignedTime();
        // LogicalTimeMetadata is default constructed if no cluster time metadata was sent, so a
        // default constructed SignedLogicalTime should be ignored.
        if (signedTime.getTime() != LogicalTime::kUninitialized) {
            // Cluster times are only sent by sharding aware mongod servers, so this point is only
            // reached in sharded clusters.
            if (serverGlobalParams.featureCompatibility.version.load() !=
                ServerGlobalParams::FeatureCompatibility::Version::k34) {
                auto logicalTimeValidator = LogicalTimeValidator::get(opCtx);
                if (!LogicalTimeValidator::isAuthorizedToAdvanceClock(opCtx)) {
                    if (!logicalTimeValidator) {
                        uasserted(ErrorCodes::CannotVerifyAndSignLogicalTime,
                                  "Cannot accept logicalTime: " + signedTime.getTime().toString() +
                                      ". May not be a part of a sharded cluster");
                    } else {
                        uassertStatusOK(logicalTimeValidator->validate(opCtx, signedTime));
                    }
                }

                uassertStatusOK(logicalClock->advanceClusterTime(signedTime.getTime()));
            }
        }
    }
}
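A caller could exercise this parser with a hand-built metadata object. The sketch below uses the fromjson helper that also appears elsewhere in these examples; the opCtx would come from the enclosing command machinery, and the read preference value is made up:

// Hypothetical metadata document containing one of the fields the loop above dispatches on.
BSONObj metadataObj = fromjson("{$readPreference: {mode: 'secondaryPreferred'}}");

readRequestMetadata(opCtx, metadataObj);

// After the call the decorations are populated, e.g. ReadPreferenceSetting::get(opCtx)
// now reflects the parsed read preference.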
Example #10
void updateSessionEntry(OperationContext* opCtx, const UpdateRequest& updateRequest) {
    // Current code only supports replacement update.
    dassert(UpdateDriver::isDocReplacement(updateRequest.getUpdates()));

    AutoGetCollection autoColl(opCtx, NamespaceString::kSessionTransactionsTableNamespace, MODE_IX);

    uassert(40527,
            str::stream() << "Unable to persist transaction state because the session transaction "
                             "collection is missing. This indicates that the "
                          << NamespaceString::kSessionTransactionsTableNamespace.ns()
                          << " collection has been manually deleted.",
            autoColl.getCollection());

    WriteUnitOfWork wuow(opCtx);

    auto collection = autoColl.getCollection();
    auto idIndex = collection->getIndexCatalog()->findIdIndex(opCtx);

    uassert(40672,
            str::stream() << "Failed to fetch _id index for "
                          << NamespaceString::kSessionTransactionsTableNamespace.ns(),
            idIndex);

    auto indexAccess = collection->getIndexCatalog()->getIndex(idIndex);
    // Since we are looking up a key inside the _id index, create a key object consisting of only
    // the _id field.
    auto idToFetch = updateRequest.getQuery().firstElement();
    auto toUpdateIdDoc = idToFetch.wrap();
    dassert(idToFetch.fieldNameStringData() == "_id"_sd);
    auto recordId = indexAccess->findSingle(opCtx, toUpdateIdDoc);
    auto startingSnapshotId = opCtx->recoveryUnit()->getSnapshotId();

    if (recordId.isNull()) {
        // Upsert case.
        auto status = collection->insertDocument(
            opCtx, InsertStatement(updateRequest.getUpdates()), nullptr, true, false);

        if (status == ErrorCodes::DuplicateKey) {
            throw WriteConflictException();
        }

        uassertStatusOK(status);
        wuow.commit();
        return;
    }

    auto originalRecordData = collection->getRecordStore()->dataFor(opCtx, recordId);
    auto originalDoc = originalRecordData.toBson();

    invariant(collection->getDefaultCollator() == nullptr);
    boost::intrusive_ptr<ExpressionContext> expCtx(new ExpressionContext(opCtx, nullptr));

    auto matcher = fassertStatusOK(
        40673, MatchExpressionParser::parse(updateRequest.getQuery(), std::move(expCtx)));
    if (!matcher->matchesBSON(originalDoc)) {
        // Document no longer matches what we expect, so throw a WriteConflictException to make
        // the caller re-examine.
        throw WriteConflictException();
    }

    OplogUpdateEntryArgs args;
    args.nss = NamespaceString::kSessionTransactionsTableNamespace;
    args.uuid = collection->uuid();
    args.update = updateRequest.getUpdates();
    args.criteria = toUpdateIdDoc;
    args.fromMigrate = false;

    collection->updateDocument(opCtx,
                               recordId,
                               Snapshotted<BSONObj>(startingSnapshotId, originalDoc),
                               updateRequest.getUpdates(),
                               true,   // enforceQuota
                               false,  // indexesAffected = false because _id is the only index
                               nullptr,
                               &args);

    wuow.commit();
}
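Since the function dasserts that the update is a full-document replacement whose query is keyed on _id alone, a call site would look roughly like the sketch below. The UpdateRequest setter names are assumptions, and sessionTxnRecordBSON stands in for the replacement document:

// Hypothetical caller; setQuery/setUpdates are assumed setter names on UpdateRequest.
UpdateRequest updateRequest(NamespaceString::kSessionTransactionsTableNamespace);
updateRequest.setQuery(BSON("_id" << sessionId));   // '_id' must be the only query field
updateRequest.setUpdates(sessionTxnRecordBSON);     // full replacement document, no $-modifiers
updateSessionEntry(opCtx, updateRequest);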
Example #11
    auto mock = DocumentSourceMock::create(inputs);
    facetStage->setSource(mock.get());

    auto output = facetStage->getNext();
    ASSERT(output.isAdvanced());
    ASSERT_DOCUMENT_EQ(output.getDocument(), Document(fromjson("{subPipe: [{_id: 0}, {_id: 1}]}")));
}

// TODO: DocumentSourceFacet will have to propagate pauses if we ever allow nested $facets.
DEATH_TEST_F(DocumentSourceFacetTest,
             ShouldFailIfGivenPausedInput,
             "Invariant failure !input.isPaused()") {
    auto ctx = getExpCtx();

    auto firstDummy = DocumentSourcePassthrough::create();
    auto pipeline = uassertStatusOK(Pipeline::create({firstDummy}, ctx));

    auto facetStage = DocumentSourceFacet::create({{"subPipe", pipeline}}, ctx);

    auto mock = DocumentSourceMock::create(DocumentSource::GetNextResult::makePauseExecution());
    facetStage->setSource(mock.get());

    facetStage->getNext();  // This should cause a crash.
}

//
// Miscellaneous.
//

TEST_F(DocumentSourceFacetTest, ShouldBeAbleToReParseSerializedStage) {
    auto ctx = getExpCtx();
Example #12
boost::intrusive_ptr<DocumentSource> DocumentSourceMergeCursors::createFromBson(
    BSONElement elem, const boost::intrusive_ptr<ExpressionContext>& expCtx) {
    if (elem.type() == BSONType::Object) {
        // This is the modern serialization format. We de-serialize using the IDL.
        auto ownedObj = elem.embeddedObject().getOwned();
        auto armParams =
            AsyncResultsMergerParams::parse(IDLParserErrorContext(kStageName), ownedObj);
        return new DocumentSourceMergeCursors(
            Grid::get(expCtx->opCtx)->getExecutorPool()->getArbitraryExecutor(),
            std::move(armParams),
            expCtx,
            std::move(ownedObj));
    }

    // This is the old serialization format which can still be generated by mongos processes
    // older than 4.0.
    // TODO SERVER-34009 Remove support for this format.
    uassert(17026,
            "$mergeCursors stage expected either an array or an object as argument",
            elem.type() == BSONType::Array);
    const auto serializedRemotes = elem.Array();
    uassert(50729,
            "$mergeCursors stage expected array with at least one entry",
            serializedRemotes.size() > 0);

    boost::optional<NamespaceString> nss;
    std::vector<RemoteCursor> remotes;
    for (auto&& cursor : serializedRemotes) {
        BSONElement nsElem;
        BSONElement hostElem;
        BSONElement idElem;
        uassert(17027,
                "$mergeCursors stage requires each cursor in array to be an object",
                cursor.type() == BSONType::Object);
        for (auto&& cursorElem : cursor.Obj()) {
            const auto fieldName = cursorElem.fieldNameStringData();
            if (fieldName == "ns"_sd) {
                nsElem = cursorElem;
            } else if (fieldName == "host"_sd) {
                hostElem = cursorElem;
            } else if (fieldName == "id"_sd) {
                idElem = cursorElem;
            } else {
                uasserted(50730,
                          str::stream() << "Unrecognized option " << fieldName
                                        << " within cursor provided to $mergeCursors: "
                                        << cursor);
            }
        }
        uassert(
            50731,
            "$mergeCursors stage requires \'ns\' field with type string for each cursor in array",
            nsElem.type() == BSONType::String);

        // We require each cursor to have the same namespace. This isn't a fundamental limit of the
        // system, but needs to be true due to the implementation of AsyncResultsMerger, which
        // tracks one namespace for all cursors.
        uassert(50720,
                "$mergeCursors requires each cursor to have the same namespace",
                !nss || nss->ns() == nsElem.valueStringData());
        nss = NamespaceString(nsElem.String());

        uassert(
            50721,
            "$mergeCursors stage requires \'host\' field with type string for each cursor in array",
            hostElem.type() == BSONType::String);
        auto host = uassertStatusOK(HostAndPort::parse(hostElem.valueStringData()));

        uassert(50722,
                "$mergeCursors stage requires \'id\' field with type long for each cursor in array",
                idElem.type() == BSONType::NumberLong);
        auto cursorId = idElem.Long();

        // We are assuming that none of the cursors have been iterated at all, and so will not have
        // any data in the initial batch.
        // TODO SERVER-33323 We use a fake shard id because the AsyncResultsMerger won't use it for
        // anything, and finding the real one is non-trivial.
        RemoteCursor remoteCursor;
        remoteCursor.setShardId(ShardId("fakeShardIdForMergeCursors"));
        remoteCursor.setHostAndPort(std::move(host));
        std::vector<BSONObj> emptyBatch;
        remoteCursor.setCursorResponse(CursorResponse{*nss, cursorId, emptyBatch});
        remotes.push_back(std::move(remoteCursor));
    }
    invariant(nss);  // We know there is at least one cursor in 'serializedRemotes', and we require
                     // each cursor to have a 'ns' field.

    AsyncResultsMergerParams params;
    params.setRemotes(std::move(remotes));
    params.setNss(*nss);
    return new DocumentSourceMergeCursors(
        Grid::get(expCtx->opCtx)->getExecutorPool()->getArbitraryExecutor(),
        std::move(params),
        expCtx,
        elem.embeddedObject().getOwned());
}
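For reference, the legacy array form that the fallback branch still accepts looks like the sketch below, built with the fromjson helper; the namespace, hosts and cursor ids are made-up values:

// Legacy $mergeCursors specification: an array of {ns, host, id} documents.
// Every cursor must use the same 'ns'; unknown fields are rejected (code 50730).
BSONObj legacySpec = fromjson(
    "{$mergeCursors: ["
    " {ns: 'test.coll', host: 'shard01.example.net:27017', id: NumberLong(123)},"
    " {ns: 'test.coll', host: 'shard02.example.net:27017', id: NumberLong(456)}"
    "]}");

auto stage = DocumentSourceMergeCursors::createFromBson(legacySpec.firstElement(), expCtx);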
Example #13
    long long DeleteExecutor::execute(OperationContext* txn, Database* db) {
        uassertStatusOK(prepare());
        uassert(17417,
                mongoutils::str::stream() <<
                "DeleteExecutor::prepare() failed to parse query " << _request->getQuery(),
                _isQueryParsed);
        const bool logop = _request->shouldCallLogOp();
        const NamespaceString& ns(_request->getNamespaceString());
        if (!_request->isGod()) {
            if (ns.isSystem()) {
                uassert(12050,
                        "cannot delete from system namespace",
                        legalClientSystemNS(ns.ns(), true));
            }
            if (ns.ns().find('$') != string::npos) {
                log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
                uasserted( 10100, "cannot delete from collection with reserved $ in name" );
            }
        }

        Collection* collection = db->getCollection(ns.ns());
        if (NULL == collection) {
            return 0;
        }

        uassert(10101,
                str::stream() << "cannot remove from a capped collection: " << ns.ns(),
                !collection->isCapped());

        uassert(ErrorCodes::NotMaster,
                str::stream() << "Not primary while removing from " << ns.ns(),
                !logop || replset::isMasterNs(ns.ns().c_str()));

        long long nDeleted = 0;

        Runner* rawRunner;
        if (_canonicalQuery.get()) {
            uassertStatusOK(getRunner(collection, _canonicalQuery.release(), &rawRunner));
        }
        else {
            CanonicalQuery* ignored;
            uassertStatusOK(getRunner(collection,
                                      ns.ns(),
                                      _request->getQuery(),
                                      &rawRunner,
                                      &ignored));
        }

        auto_ptr<Runner> runner(rawRunner);
        ScopedRunnerRegistration safety(runner.get());

        DiskLoc rloc;
        Runner::RunnerState state;
        CurOp* curOp = cc().curop();
        int oldYieldCount = curOp->numYields();
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &rloc))) {
            if (oldYieldCount != curOp->numYields()) {
                uassert(ErrorCodes::NotMaster,
                        str::stream() << "No longer primary while removing from " << ns.ns(),
                        !logop || replset::isMasterNs(ns.ns().c_str()));
                oldYieldCount = curOp->numYields();
            }
            BSONObj toDelete;

            // TODO: do we want to buffer docs and delete them in a group rather than
            // saving/restoring state repeatedly?
            runner->saveState();
            collection->deleteDocument(txn, rloc, false, false, logop ? &toDelete : NULL );
            runner->restoreState(txn);

            nDeleted++;

            if (logop) {
                if ( toDelete.isEmpty() ) {
                    problem() << "Deleted object without id in collection " << collection->ns()
                              << ", not logging.";
                }
                else {
                    bool replJustOne = true;
                    replset::logOp(txn, "d", ns.ns().c_str(), toDelete, 0, &replJustOne);
                }
            }

            if (!_request->isMulti()) {
                break;
            }

            if (!_request->isGod()) {
                txn->recoveryUnit()->commitIfNeeded();
            }

            if (debug && _request->isGod() && nDeleted == 100) {
                log() << "warning high number of deletes with god=true "
                      << " which could use significant memory b/c we don't commit journal";
            }
        }

        return nDeleted;
    }
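A hedged sketch of a call site for this executor follows. The DeleteRequest/DeleteExecutor construction and setter names mirror the getters used above (getQuery, isMulti) but are assumptions about that era's API:

// Hypothetical caller; setter names on DeleteRequest are assumed.
DeleteRequest request(NamespaceString("test.logs"));
request.setQuery(BSON("expired" << true));
request.setMulti(true);                          // delete every matching document

DeleteExecutor executor(&request);
long long nDeleted = executor.execute(txn, db);  // txn and db supplied by the surrounding command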
Example #14
ChunkVersion forceShardFilteringMetadataRefresh(OperationContext* opCtx,
                                                const NamespaceString& nss,
                                                bool forceRefreshFromThisThread) {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto* const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    auto routingInfo =
        uassertStatusOK(Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh(
            opCtx, nss, forceRefreshFromThisThread));
    auto cm = routingInfo.cm();

    if (!cm) {
        // No chunk manager, so unsharded.

        // Exclusive collection lock needed since we're now changing the metadata
        AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X);
        CollectionShardingRuntime::get(opCtx, nss)
            ->setFilteringMetadata(opCtx, CollectionMetadata());

        return ChunkVersion::UNSHARDED();
    }

    // Optimistic check with only IS lock in order to avoid threads piling up on the collection X
    // lock below
    {
        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
        auto optMetadata = CollectionShardingState::get(opCtx, nss)->getCurrentMetadataIfKnown();

        // We already have a newer version
        if (optMetadata) {
            const auto& metadata = *optMetadata;
            if (metadata->isSharded() &&
                metadata->getCollVersion().epoch() == cm->getVersion().epoch() &&
                metadata->getCollVersion() >= cm->getVersion()) {
                LOG(1) << "Skipping refresh of metadata for " << nss << " "
                       << metadata->getCollVersion() << " with an older " << cm->getVersion();
                return metadata->getShardVersion();
            }
        }
    }

    // Exclusive collection lock needed since we're now changing the metadata
    AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X);
    auto* const css = CollectionShardingRuntime::get(opCtx, nss);

    {
        auto optMetadata = CollectionShardingState::get(opCtx, nss)->getCurrentMetadataIfKnown();

        // We already have a newer version
        if (optMetadata) {
            const auto& metadata = *optMetadata;
            if (metadata->isSharded() &&
                metadata->getCollVersion().epoch() == cm->getVersion().epoch() &&
                metadata->getCollVersion() >= cm->getVersion()) {
                LOG(1) << "Skipping refresh of metadata for " << nss << " "
                       << metadata->getCollVersion() << " with an older " << cm->getVersion();
                return metadata->getShardVersion();
            }
        }
    }

    CollectionMetadata metadata(std::move(cm), shardingState->shardId());
    const auto newShardVersion = metadata.getShardVersion();

    css->setFilteringMetadata(opCtx, std::move(metadata));
    return newShardVersion;
}
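The function uses a double-checked pattern: an optimistic read under a shared (IS) lock, then a re-check under the exclusive (X) lock before installing new metadata. The same shape, stripped of the sharding types, looks like this; it is purely illustrative, standard C++ only:

#include <mutex>
#include <shared_mutex>

struct CachedVersion {
    std::shared_mutex mutex;
    long long version = -1;   // -1 means "unknown"
};

// Install 'fresh' unless a concurrent refresher already installed something at least as new.
long long installIfNewer(CachedVersion& cache, long long fresh) {
    {
        std::shared_lock<std::shared_mutex> readLock(cache.mutex);   // optimistic check (IS-like)
        if (cache.version >= fresh)
            return cache.version;
    }
    std::unique_lock<std::shared_mutex> writeLock(cache.mutex);      // exclusive (X-like)
    if (cache.version >= fresh)                                      // another thread beat us to it
        return cache.version;
    cache.version = fresh;
    return cache.version;
}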
Example #15
void ChunkSplitter::_runAutosplit(const NamespaceString& nss,
                                  const BSONObj& min,
                                  const BSONObj& max,
                                  long dataWritten) {
    if (!_isPrimary) {
        return;
    }

    try {
        const auto opCtx = cc().makeOperationContext();
        const auto routingInfo = uassertStatusOK(
            Grid::get(opCtx.get())->catalogCache()->getCollectionRoutingInfo(opCtx.get(), nss));

        uassert(ErrorCodes::NamespaceNotSharded,
                "Could not split chunk. Collection is no longer sharded",
                routingInfo.cm());

        const auto cm = routingInfo.cm();
        const auto chunk = cm->findIntersectingChunkWithSimpleCollation(min);

        // Stop if chunk's range differs from the range we were expecting to split.
        if ((0 != chunk.getMin().woCompare(min)) || (0 != chunk.getMax().woCompare(max)) ||
            (chunk.getShardId() != ShardingState::get(opCtx.get())->getShardName())) {
            LOG(1) << "Cannot auto-split chunk with range '"
                   << redact(ChunkRange(min, max).toString()) << "' for nss '" << nss
                   << "' on shard '" << ShardingState::get(opCtx.get())->getShardName()
                   << "' because since scheduling auto-split the chunk has been changed to '"
                   << redact(chunk.toString()) << "'";
            return;
        }

        const ChunkRange chunkRange(chunk.getMin(), chunk.getMax());

        const auto balancerConfig = Grid::get(opCtx.get())->getBalancerConfiguration();
        // Ensure we have the most up-to-date balancer configuration
        uassertStatusOK(balancerConfig->refreshAndCheck(opCtx.get()));

        if (!balancerConfig->getShouldAutoSplit()) {
            return;
        }

        const uint64_t maxChunkSizeBytes = balancerConfig->getMaxChunkSizeBytes();

        LOG(1) << "about to initiate autosplit: " << redact(chunk.toString())
               << " dataWritten since last check: " << dataWritten
               << " maxChunkSizeBytes: " << maxChunkSizeBytes;

        auto splitPoints = uassertStatusOK(splitVector(opCtx.get(),
                                                       nss,
                                                       cm->getShardKeyPattern().toBSON(),
                                                       chunk.getMin(),
                                                       chunk.getMax(),
                                                       false,
                                                       boost::none,
                                                       boost::none,
                                                       boost::none,
                                                       maxChunkSizeBytes));

        if (splitPoints.size() <= 1) {
            // No split points means there isn't enough data to split on; one split point means the
            // chunk is between half and the full chunk size, so there is no need to split yet.
            return;
        }

        // We assume that if the chunk being split is the first (or last) one on the collection,
        // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the
        // very first (or last) key as a split point.
        //
        // This heuristic is skipped for "special" shard key patterns that are not likely to produce
        // monotonically increasing or decreasing values (e.g. hashed shard keys).

        // Keeps track of the minKey of the top chunk after the split so we can migrate the chunk.
        BSONObj topChunkMinKey;

        if (KeyPattern::isOrderedKeyPattern(cm->getShardKeyPattern().toBSON())) {
            if (0 ==
                cm->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk.getMin())) {
                // MinKey is infinity (This is the first chunk on the collection)
                BSONObj key =
                    findExtremeKeyForShard(opCtx.get(), nss, cm->getShardKeyPattern(), true);
                if (!key.isEmpty()) {
                    splitPoints.front() = key.getOwned();
                    topChunkMinKey = cm->getShardKeyPattern().getKeyPattern().globalMin();
                }
            } else if (0 ==
                       cm->getShardKeyPattern().getKeyPattern().globalMax().woCompare(
                           chunk.getMax())) {
                // MaxKey is infinity (This is the last chunk on the collection)
                BSONObj key =
                    findExtremeKeyForShard(opCtx.get(), nss, cm->getShardKeyPattern(), false);
                if (!key.isEmpty()) {
                    splitPoints.back() = key.getOwned();
                    topChunkMinKey = key.getOwned();
                }
            }
        }

        uassertStatusOK(splitChunkAtMultiplePoints(opCtx.get(),
                                                   chunk.getShardId(),
                                                   nss,
                                                   cm->getShardKeyPattern(),
                                                   cm->getVersion(),
                                                   chunkRange,
                                                   splitPoints));

        const bool shouldBalance = isAutoBalanceEnabled(opCtx.get(), nss, balancerConfig);

        log() << "autosplitted " << nss << " chunk: " << redact(chunk.toString()) << " into "
              << (splitPoints.size() + 1) << " parts (maxChunkSizeBytes " << maxChunkSizeBytes
              << ")"
              << (topChunkMinKey.isEmpty() ? "" : " (top chunk migration suggested" +
                          (std::string)(shouldBalance ? ")" : ", but no migrations allowed)"));

        // Balance the resulting chunks if the autobalance option is enabled and if we split at the
        // first or last chunk on the collection as part of top chunk optimization.

        if (!shouldBalance || topChunkMinKey.isEmpty()) {
            return;
        }

        // Tries to move the top chunk out of the shard to prevent the hot spot from staying on a
        // single shard. This is based on the assumption that succeeding inserts will fall on the
        // top chunk.
        moveChunk(opCtx.get(), nss, topChunkMinKey);
    } catch (const DBException& ex) {
        log() << "Unable to auto-split chunk " << redact(ChunkRange(min, max).toString())
              << " in nss " << nss << causedBy(redact(ex.toStatus()));
    } catch (const std::exception& e) {
        log() << "caught exception while splitting chunk: " << redact(e.what());
    }
}
Example #16
bool RunOnAllShardsCommand::run(OperationContext* txn,
                                const std::string& dbName,
                                BSONObj& cmdObj,
                                int options,
                                std::string& errmsg,
                                BSONObjBuilder& output) {
    LOG(1) << "RunOnAllShardsCommand db: " << dbName << " cmd:" << cmdObj;

    if (_implicitCreateDb) {
        uassertStatusOK(ScopedShardDatabase::getOrCreate(txn, dbName));
    }

    std::vector<ShardId> shardIds;
    getShardIds(txn, dbName, cmdObj, shardIds);

    std::list<std::shared_ptr<Future::CommandResult>> futures;
    for (const ShardId& shardId : shardIds) {
        const auto shard = grid.shardRegistry()->getShard(txn, shardId);
        if (!shard) {
            continue;
        }

        futures.push_back(Future::spawnCommand(
            shard->getConnString().toString(), dbName, cmdObj, 0, NULL, _useShardConn));
    }

    std::vector<ShardAndReply> results;
    BSONObjBuilder subobj(output.subobjStart("raw"));
    BSONObjBuilder errors;
    int commonErrCode = -1;

    std::list<std::shared_ptr<Future::CommandResult>>::iterator futuresit;
    std::vector<ShardId>::const_iterator shardIdsIt;

    BSONElement wcErrorElem;
    ShardId wcErrorShardId;
    bool hasWCError = false;

    // We iterate over the set of shard ids and their corresponding futures in parallel.
    // TODO: replace with zip iterator if we ever decide to use one from Boost or elsewhere
    for (futuresit = futures.begin(), shardIdsIt = shardIds.cbegin();
         futuresit != futures.end() && shardIdsIt != shardIds.end();
         ++futuresit, ++shardIdsIt) {
        std::shared_ptr<Future::CommandResult> res = *futuresit;

        if (res->join(txn)) {
            // success :)
            BSONObj result = res->result();
            results.emplace_back(shardIdsIt->toString(), result);
            subobj.append(res->getServer(), result);

            if (!hasWCError) {
                if ((wcErrorElem = result["writeConcernError"])) {
                    wcErrorShardId = *shardIdsIt;
                    hasWCError = true;
                }
            }
            continue;
        }

        BSONObj result = res->result();

        if (!hasWCError) {
            if ((wcErrorElem = result["writeConcernError"])) {
                wcErrorShardId = *shardIdsIt;
                hasWCError = true;
            }
        }

        if (result["errmsg"].type() || result["code"].numberInt() != 0) {
            result = specialErrorHandler(res->getServer(), dbName, cmdObj, result);

            BSONElement errmsgObj = result["errmsg"];
            if (errmsgObj.eoo() || errmsgObj.String().empty()) {
                // it was fixed!
                results.emplace_back(shardIdsIt->toString(), result);
                subobj.append(res->getServer(), result);
                continue;
            }
        }

        // Handle "errmsg".
        if (!result["errmsg"].eoo()) {
            errors.appendAs(result["errmsg"], res->getServer());
        } else {
            // Can happen if message is empty, for some reason
            errors.append(res->getServer(),
                          str::stream() << "result without error message returned : " << result);
        }

        // Handle "code".
        int errCode = result["code"].numberInt();
        if (commonErrCode == -1) {
            commonErrCode = errCode;
        } else if (commonErrCode != errCode) {
            commonErrCode = 0;
        }
        results.emplace_back(shardIdsIt->toString(), result);
        subobj.append(res->getServer(), result);
    }

    subobj.done();

    if (hasWCError) {
        appendWriteConcernErrorToCmdResponse(wcErrorShardId, wcErrorElem, output);
    }

    BSONObj errobj = errors.done();

    if (!errobj.isEmpty()) {
        errmsg = errobj.toString();

        // If every error has a code, and the code for all errors is the same, then add
        // a top-level field "code" with this value to the output object.
        if (commonErrCode > 0) {
            output.append("code", commonErrCode);
        }

        return false;
    }

    aggregateResults(results, output);
    return true;
}
Example #17
        void operator()( DBClientCursorBatchIterator &i ) {
            Lock::GlobalWrite lk;
            context.relocked();

            bool createdCollection = false;
            Collection* collection = NULL;

            while( i.moreInCurrentBatch() ) {
                if ( numSeen % 128 == 127 /*yield some*/ ) {
                    collection = NULL;
                    time_t now = time(0);
                    if( now - lastLog >= 60 ) {
                        // report progress
                        if( lastLog )
                            log() << "clone " << to_collection << ' ' << numSeen << endl;
                        lastLog = now;
                    }
                    mayInterrupt( _mayBeInterrupted );
                    dbtempreleaseif t( _mayYield );
                }

                if ( isindex == false && collection == NULL ) {
                    collection = context.db()->getCollection( to_collection );
                    if ( !collection ) {
                        massert( 17321,
                                 str::stream()
                                 << "collection dropped during clone ["
                                 << to_collection << "]",
                                 !createdCollection );
                        createdCollection = true;
                        collection = context.db()->createCollection( txn, to_collection );
                        verify( collection );
                    }
                }

                BSONObj tmp = i.nextSafe();

                /* assure object is valid.  note this will slow us down a little. */
                const Status status = validateBSON(tmp.objdata(), tmp.objsize());
                if (!status.isOK()) {
                    out() << "Cloner: skipping corrupt object from " << from_collection
                          << ": " << status.reason();
                    continue;
                }

                ++numSeen;

                BSONObj js = tmp;
                if ( isindex ) {
                    verify(nsToCollectionSubstring(from_collection) == "system.indexes");
                    js = fixindex(context.db()->name(), tmp);
                    indexesToBuild->push_back( js.getOwned() );
                    continue;
                }

                verify(nsToCollectionSubstring(from_collection) != "system.indexes");

                StatusWith<DiskLoc> loc = collection->insertDocument( txn, js, true );
                if ( !loc.isOK() ) {
                    error() << "error: exception cloning object in " << from_collection
                            << ' ' << loc.toString() << " obj:" << js;
                }
                uassertStatusOK( loc.getStatus() );
                if ( logForRepl )
                    logOp(txn, "i", to_collection, js);

                txn->commitIfNeeded();

                RARELY if ( time( 0 ) - saveLast > 60 ) {
                    log() << numSeen << " objects cloned so far from collection " << from_collection;
                    saveLast = time( 0 );
                }
            }
        }
Example #18
    DiskLoc DataFileMgr::insert(const char* ns,
                                const void* obuf,
                                int32_t len,
                                bool mayInterrupt,
                                bool god,
                                bool mayAddIndex,
                                bool* addedID) {

        Database* database = cc().database();

        bool wouldAddIndex = false;
        massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
        uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
        {
            const char *sys = strstr(ns, "system.");
            if ( sys ) {

                if ( !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
                    return DiskLoc();

                if ( mayAddIndex && wouldAddIndex ) {
                    // TODO: this should be handled above this function
                    BSONObj spec( static_cast<const char*>( obuf ) );
                    string collectionToIndex = spec.getStringField( "ns" );
                    uassert(10096, "invalid ns to index", collectionToIndex.find( '.' ) != string::npos);
                    massert(10097,
                            str::stream() << "trying to create index on wrong db "
                            << " db: " << database->name() << " collection: " << collectionToIndex,
                            database->ownsNS( collectionToIndex ) );

                    Collection* collection = database->getCollection( collectionToIndex );
                    if ( !collection ) {
                        collection = database->createCollection( collectionToIndex, false, NULL, true );
                        verify( collection );
                        if ( !god )
                            ensureIdIndexForNewNs( collection );
                    }

                    Status status = collection->getIndexCatalog()->createIndex( spec, mayInterrupt );
                    if ( status.code() == ErrorCodes::IndexAlreadyExists )
                        return DiskLoc();
                    uassertStatusOK( status );
                    return DiskLoc();
                }
            }
        }

        Collection* collection = database->getCollection( ns );
        if ( collection == NULL ) {
            collection = database->createCollection( ns, false, NULL, false );

            int ies = Extent::initialSize(len);
            if( str::contains(ns, '$') &&
                len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 &&
                len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
                // probably an index.  so we pick a value here for the first extent instead of using
                // initialExtentSize() which is more for user collections.
                // TODO: we could look at the # of records in the parent collection to be smarter here.
                ies = (32+4) * 1024;
            }
            collection->increaseStorageSize( ies, false);
            if ( !god )
                ensureIdIndexForNewNs( collection );
        }

        NamespaceDetails* d = collection->details();

        IDToInsert idToInsert; // only initialized if needed

        if( !god ) {
            /* Check if we have an _id field. If we don't, we'll add it.
               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
            */
            BSONObj io((const char *) obuf);
            BSONElement idField = io.getField( "_id" );
            uassert( 10099 ,  "_id cannot be an array", idField.type() != Array );
            // we don't add _id for capped collections in local as they don't have an _id index
            if( idField.eoo() &&
                !wouldAddIndex &&
                nsToDatabase( ns ) != "local" &&
                d->haveIdIndex() ) {

                if( addedID )
                    *addedID = true;

                idToInsert.init();
                len += idToInsert.size();
            }

            BSONElementManipulator::lookForTimestamps( io );
        }

        int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );
        fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) );

        // If the collection is capped, check if the new object will violate a unique index
        // constraint before allocating space.
        if ( d->isCapped() && !god) {
            BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) );
            Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp );
            uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() );
        }

        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);

        if ( loc.isNull() ) {
            string errmsg = str::stream() << "insert: couldn't alloc space for object ns:" << ns
                                          << " capped:" << d->isCapped();
            log() << errmsg;
            uasserted( 17248, errmsg );
        }

        Record *r = loc.rec();
        {
            verify( r->lengthWithHeaders() >= lenWHdr );
            r = (Record*) getDur().writingPtr(r, lenWHdr);
            if( idToInsert.needed() ) {
                /* a little effort was made here to avoid a double copy when we add an ID */
                int originalSize = *((int*) obuf);
                ((int&)*r->data()) = originalSize + idToInsert.size();
                memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size());
                memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4);
            }
            else {
                if( obuf ) // obuf can be null from internal callers
                    memcpy(r->data(), obuf, len);
            }
        }

        addRecordToRecListInExtent(r, loc);

        d->incrementStats( r->netLength(), 1 );

        // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
        if ( !god )
            collection->infoCache()->notifyOfWriteOp();

        /* add this record to our indexes */
        if ( d->getTotalIndexCount() > 0 ) {
            try {
                BSONObj obj(r->data());
                collection->getIndexCatalog()->indexRecord(obj, loc);
            }
            catch( AssertionException& e ) {
                // should be a dup key error on _id index
                if( d->isCapped() ) {
                    massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() );
                    string s = e.toString();
                    s += " : on addIndex/capped - collection and its index will not match";
                    setLastError(0, s.c_str());
                    error() << s << endl;
                }
                else {
                    // normal case -- we can roll back
                    _deleteRecord(d, ns, r, loc);
                    throw;
                }
            }
        }

        d->paddingFits();

        return loc;
    }
Example #19
    void _initAndListen(int listenPort ) {

        Client::initThread("initandlisten");

        bool is32bit = sizeof(int*) == 4;

        {
            ProcessId pid = ProcessId::getCurrent();
            LogstreamBuilder l = log();
            l << "MongoDB starting : pid=" << pid
              << " port=" << serverGlobalParams.port
              << " dbpath=" << storageGlobalParams.dbpath;
            if( replSettings.master ) l << " master=" << replSettings.master;
            if( replSettings.slave )  l << " slave=" << (int) replSettings.slave;
            l << ( is32bit ? " 32" : " 64" ) << "-bit host=" << getHostNameCached() << endl;
        }
        DEV log() << "_DEBUG build (which is slower)" << endl;
        logStartupWarnings();
#if defined(_WIN32)
        printTargetMinOS();
#endif
        logProcessDetails();
        {
            stringstream ss;
            ss << endl;
            ss << "*********************************************************************" << endl;
            ss << " ERROR: dbpath (" << storageGlobalParams.dbpath << ") does not exist." << endl;
            ss << " Create this directory or give existing directory in --dbpath." << endl;
            ss << " See http://dochub.mongodb.org/core/startingandstoppingmongo" << endl;
            ss << "*********************************************************************" << endl;
            uassert(10296,  ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.dbpath));
        }
        {
            stringstream ss;
            ss << "repairpath (" << storageGlobalParams.repairpath << ") does not exist";
            uassert(12590,  ss.str().c_str(),
                    boost::filesystem::exists(storageGlobalParams.repairpath));
        }

        // TODO check non-journal subdirs if using directory-per-db
        checkReadAhead(storageGlobalParams.dbpath);

        acquirePathLock(mongodGlobalParams.repair);
        boost::filesystem::remove_all(storageGlobalParams.dbpath + "/_tmp/");

        FileAllocator::get()->start();

        // TODO:  This should go into a MONGO_INITIALIZER once we have figured out the correct
        // dependencies.
        if (snmpInit) {
            snmpInit();
        }

        MONGO_ASSERT_ON_EXCEPTION_WITH_MSG( clearTmpFiles(), "clear tmp files" );

        dur::startup();

        if (storageGlobalParams.durOptions & StorageGlobalParams::DurRecoverOnly)
            return;

        unsigned long long missingRepl = checkIfReplMissingFromCommandLine();
        if (missingRepl) {
            log() << startupWarningsLog;
            log() << "** WARNING: mongod started without --replSet yet " << missingRepl
                  << " documents are present in local.system.replset" << startupWarningsLog;
            log() << "**          Restart with --replSet unless you are doing maintenance and no"
                  << " other clients are connected." << startupWarningsLog;
            log() << "**          The TTL collection monitor will not start because of this." << startupWarningsLog;
            log() << "**          For more info see http://dochub.mongodb.org/core/ttlcollections" << startupWarningsLog;
            log() << startupWarningsLog;
        }

        if (mongodGlobalParams.scriptingEnabled) {
            ScriptEngine::setup();
            globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback );
            globalScriptEngine->setGetCurrentOpIdCallback( jsGetCurrentOpIdCallback );
        }

        // On replica set members we only clear temp collections on DBs other than "local" during
        // promotion to primary. On pure slaves, they are only cleared when the oplog tells them to.
        // The local DB is special because it is not replicated.  See SERVER-10927 for more details.
        const bool shouldClearNonLocalTmpCollections = !(missingRepl
                                                         || replSettings.usingReplSets()
                                                         || replSettings.slave == SimpleSlave);
        repairDatabasesAndCheckVersion(shouldClearNonLocalTmpCollections);

        if (mongodGlobalParams.upgrade)
            return;

        uassertStatusOK(getGlobalAuthorizationManager()->initialize());

        /* this is for security on certain platforms (nonce generation) */
        srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros()));

        // The snapshot thread provides historical collection level and lock statistics for use
        // by the web interface. Only needed when HTTP is enabled.
        if (serverGlobalParams.isHttpInterfaceEnabled)
            snapshotThread.go();

        d.clientCursorMonitor.go();
        PeriodicTask::startRunningPeriodicTasks();
        if (missingRepl) {
            // a warning was logged earlier
        }
        else {
            startTTLBackgroundJob();
        }

#ifndef _WIN32
        mongo::signalForkSuccess();
#endif

        if(getGlobalAuthorizationManager()->isAuthEnabled()) {
            // open admin db in case we need to use it later. TODO this is not the right way to
            // resolve this.
            Client::WriteContext c("admin", storageGlobalParams.dbpath);
        }

        authindex::configureSystemIndexes("admin");

        getDeleter()->startWorkers();

        // Starts a background thread that rebuilds all incomplete indices. 
        indexRebuilder.go(); 

        listen(listenPort);

        // listen() will return when exit code closes its socket.
        exitCleanly(EXIT_NET_ERROR);
    }
Example #20
    void KVRecordStoreCapped::deleteAsNeeded(OperationContext *txn) {
        if (!needsDelete(txn)) {
            // nothing to do
            return;
        }

        // Only one thread should do deletes at a time, otherwise they'll conflict.
        boost::mutex::scoped_lock lock(_cappedDeleteMutex, boost::defer_lock);
        if (_cappedMaxDocs != -1) {
            lock.lock();
        } else {
            if (!lock.try_lock()) {
                // Someone else is deleting old records. Apply back-pressure if too far behind,
                // otherwise continue.
                if ((dataSize(txn) - _cappedMaxSize) < _cappedMaxSizeSlack)
                    return;

                lock.lock();

                // If we already waited, let someone else do cleanup unless we are significantly
                // over the limit.
                if ((dataSize(txn) - _cappedMaxSize) < (2 * _cappedMaxSizeSlack))
                    return;
            }
        }

        // We do this in a side transaction in case it aborts.
        TempRecoveryUnitSwap swap(txn);

        int64_t ds = dataSize(txn);
        int64_t nr = numRecords(txn);
        int64_t sizeOverCap = (ds > _cappedMaxSize) ? ds - _cappedMaxSize : 0;
        int64_t sizeSaved = 0;
        int64_t docsOverCap = (_cappedMaxDocs != -1 && nr > _cappedMaxDocs) ? nr - _cappedMaxDocs : 0;
        int64_t docsRemoved = 0;

        try {
            WriteUnitOfWork wuow(txn);

            // We're going to notify the underlying store that we've
            // deleted this range of ids.  In TokuFT, this will trigger an
            // optimize.
            RecordId firstDeleted, lastDeleted;

            Timer t;

            // Delete documents while we are over-full and the iterator has more.
            //
            // Note that the iterator we get has the _idTracker's logic
            // already built in, so we don't need to worry about deleting
            // records that are not yet committed, including the one we
            // just inserted
            for (boost::scoped_ptr<RecordIterator> iter(getIterator(txn));
                 ((sizeSaved < sizeOverCap || docsRemoved < docsOverCap) &&
                  !iter->isEOF());
                 ) {
                const RecordId oldest = iter->getNext();

                ++docsRemoved;
                sizeSaved += iter->dataFor(oldest).size();

                if (_cappedDeleteCallback) {
                    // need to notify higher layers that a RecordId is about to be deleted
                    uassertStatusOK(_cappedDeleteCallback->aboutToDeleteCapped(txn, oldest, iter->dataFor(oldest)));
                }
                deleteRecord(txn, oldest);

                if (firstDeleted.isNull()) {
                    firstDeleted = oldest;
                }
                dassert(oldest > lastDeleted);
                lastDeleted = oldest;

                // Now, decide whether to keep working, we want to balance
                // staying on top of the deletion workload with the
                // latency of the client that's doing the deletes for
                // everyone.
                if (sizeOverCap >= _cappedMaxSizeSlack) {
                    // If we're over the slack amount, everyone's going to
                    // block on us anyway, so we may as well keep working.
                    continue;
                }
                if (sizeOverCap < (_cappedMaxSizeSlack / 4) && docsRemoved >= 1000) {
                    // If we aren't too much over and we've done a fair
                    // amount of work, take a break.
                    break;
                } else if (docsRemoved % 1000 == 0 && t.seconds() >= 4) {
                    // If we're under the slack amount and we've already
                // spent a few seconds working on this, return and give
                    // someone else a chance to shoulder that latency.
                    break;
                }
            }

            if (docsRemoved > 0) {
                _db->justDeletedCappedRange(txn, Slice::of(KeyString(firstDeleted)), Slice::of(KeyString(lastDeleted)),
                                            sizeSaved, docsRemoved);
                wuow.commit();
                dassert(lastDeleted > _lastDeletedId);
                _lastDeletedId = lastDeleted;
            }
        } catch (const WriteConflictException&) {
            log() << "Got conflict truncating capped, ignoring.";
            return;
        }
    }
Example #21
    // throws DBException
    void buildAnIndex( OperationContext* txn,
                       Collection* collection,
                       IndexCatalogEntry* btreeState,
                       bool mayInterrupt ) {

        const string ns = collection->ns().ns(); // our copy
        verify(txn->lockState()->isWriteLocked(ns));

        const IndexDescriptor* idx = btreeState->descriptor();
        const BSONObj& idxInfo = idx->infoObj();

        LOG(0) << "build index on: " << ns
               << " properties: " << idx->toString() << endl;
        audit::logCreateIndex( currentClient.get(), &idxInfo, idx->indexName(), ns );

        Timer t;

        // this is so that people know there are more keys to look at when doing
        // things like in place updates, etc...
        collection->infoCache()->addedIndex();

        if ( collection->numRecords() == 0 ) {
            Status status = btreeState->accessMethod()->initializeAsEmpty(txn);
            massert( 17343,
                     str::stream() << "IndexAccessMethod::initializeAsEmpty failed" << status.toString(),
                     status.isOK() );
            LOG(0) << "\t added index to empty collection";
            return;
        }

        scoped_ptr<BackgroundOperation> backgroundOperation;
        bool doInBackground = false;

        if ( idxInfo["background"].trueValue() && !inDBRepair ) {
            doInBackground = true;
            backgroundOperation.reset( new BackgroundOperation(ns) );
            uassert( 13130,
                     "can't start bg index b/c in recursive lock (db.eval?)",
                     !txn->lockState()->isRecursive() );
            log() << "\t building index in background";
        }

        Status status = btreeState->accessMethod()->initializeAsEmpty(txn);
        massert( 17342,
                 str::stream()
                 << "IndexAccessMethod::initializeAsEmpty failed"
                 << status.toString(),
                 status.isOK() );

        IndexAccessMethod* bulk = doInBackground ?
            NULL : btreeState->accessMethod()->initiateBulk(txn);
        scoped_ptr<IndexAccessMethod> bulkHolder(bulk);
        IndexAccessMethod* iam = bulk ? bulk : btreeState->accessMethod();

        if ( bulk )
            log() << "\t building index using bulk method";

        unsigned long long n = addExistingToIndex( txn,
                                                   collection,
                                                   btreeState->descriptor(),
                                                   iam,
                                                   doInBackground );

        if ( bulk ) {
            LOG(1) << "\t bulk commit starting";
            std::set<DiskLoc> dupsToDrop;

            Status status = btreeState->accessMethod()->commitBulk( bulk,
                                                                    mayInterrupt,
                                                                    &dupsToDrop );

            // Code above us expects a uassert in case of dupkey errors.
            if (ErrorCodes::DuplicateKey == status.code()) {
                uassertStatusOK(status);
            }

            // Any other errors are probably bad and deserve a massert.
            massert( 17398,
                     str::stream() << "commitBulk failed: " << status.toString(),
                     status.isOK() );

            if ( dupsToDrop.size() )
                log() << "\t bulk dropping " << dupsToDrop.size() << " dups";

            for( set<DiskLoc>::const_iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); ++i ) {
                BSONObj toDelete;
                collection->deleteDocument( txn,
                                            *i,
                                            false /* cappedOk */,
                                            true /* noWarn */,
                                            &toDelete );
                if (repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(
                        collection->ns().db())) {
                    repl::logOp(txn, "d", ns.c_str(), toDelete);
                }
                
                txn->recoveryUnit()->commitIfNeeded();

                RARELY if ( mayInterrupt ) {
                    txn->checkForInterrupt();
                }
            }
        }

        verify( !btreeState->head().isNull() );
        LOG(0) << "build index done.  scanned " << n << " total records. "
               << t.millis() / 1000.0 << " secs" << endl;

        // Refresh the info cache once more so that readers know the index build is finished.
        collection->infoCache()->addedIndex();
    }
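For reference, a hedged sketch of the kind of index spec whose "background" field drives the branch above; the field names follow the usual index-spec layout and the values are illustrative only.

// Illustrative index spec (hypothetical values): with "background": true the build above
// takes the BackgroundOperation path instead of the bulk builder.
BSONObj idxSpec = BSON("v" << 1
                           << "key" << BSON("a" << 1)
                           << "name" << "a_1"
                           << "ns" << "test.coll"
                           << "background" << true);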
Example #22
ConnectionPool::ConnectionList::iterator ConnectionPool::acquireConnection(
    const HostAndPort& target, Date_t now, Milliseconds timeout) {
    stdx::unique_lock<stdx::mutex> lk(_mutex);

    // Clean up connections on stale/unused hosts
    _cleanUpStaleHosts_inlock(now);

    for (HostConnectionMap::iterator hostConns;
         (hostConns = _connections.find(target)) != _connections.end();) {
        // Clean up the requested host to remove stale/unused connections
        _cleanUpOlderThan_inlock(now, &hostConns->second);

        if (hostConns->second.empty()) {
            // prevent host from causing unnecessary cleanups
            _lastUsedHosts[hostConns->first] = kNeverTooStale;
            break;
        }

        _inUseConnections.splice(
            _inUseConnections.begin(), hostConns->second, hostConns->second.begin());

        const ConnectionList::iterator candidate = _inUseConnections.begin();
        lk.unlock();

        try {
            if (candidate->conn->isStillConnected()) {
                // setSoTimeout takes a double representing the number of seconds for send and
                // receive timeouts.  Thus, we must express 'timeout' in milliseconds and divide by
                // 1000.0 to get the number of seconds with a fractional part.
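                // For example, a hypothetical timeout of Milliseconds(1500) would be passed
                // to setSoTimeout as 1.5.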
                candidate->conn->setSoTimeout(durationCount<Milliseconds>(timeout) / 1000.0);
                return candidate;
            }
        } catch (...) {
            lk.lock();
            _destroyConnection_inlock(&_inUseConnections, candidate);
            throw;
        }

        lk.lock();
        _destroyConnection_inlock(&_inUseConnections, candidate);
    }

    // No idle connection in the pool; make a new one.
    lk.unlock();

    std::unique_ptr<DBClientConnection> conn;
    if (_hook) {
        conn.reset(new DBClientConnection(
            false,  // auto reconnect
            0,      // socket timeout
            {},     // MongoURI
            [this, target](const executor::RemoteCommandResponse& isMasterReply) {
                return _hook->validateHost(target, isMasterReply);
            }));
    } else {
        conn.reset(new DBClientConnection());
    }

    // setSoTimeout takes a double representing the number of seconds for send and receive
    // timeouts.  Thus, we must express 'timeout' in milliseconds and divide by 1000.0 to get
    // the number of seconds with a fractional part.
    conn->setSoTimeout(durationCount<Milliseconds>(timeout) / 1000.0);

    uassertStatusOK(conn->connect(target, StringData()));
    conn->port().setTag(conn->port().getTag() | _messagingPortTags);

    if (isInternalAuthSet()) {
        conn->auth(getInternalUserAuthParams());
    }

    if (_hook) {
        auto postConnectRequest = uassertStatusOK(_hook->makeRequest(target));

        // We might not have a postConnectRequest
        if (postConnectRequest != boost::none) {
            auto start = Date_t::now();
            auto reply =
                conn->runCommandWithMetadata(postConnectRequest->dbname,
                                             postConnectRequest->cmdObj.firstElementFieldName(),
                                             postConnectRequest->metadata,
                                             postConnectRequest->cmdObj);

            auto rcr = executor::RemoteCommandResponse(reply->getCommandReply().getOwned(),
                                                       reply->getMetadata().getOwned(),
                                                       Date_t::now() - start);

            uassertStatusOK(_hook->handleReply(target, std::move(rcr)));
        }
    }

    lk.lock();
    return _inUseConnections.insert(_inUseConnections.begin(), ConnectionInfo(conn.release(), now));
}
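As a rough caller-side illustration of the acquire/use/return flow above, here is a minimal sketch. The 'releaseConnection' and 'destroyConnection' calls are assumed counterparts on the pool and are not shown in this example.

// Minimal caller sketch, not taken from the source above. 'releaseConnection' and
// 'destroyConnection' are assumed counterparts for returning or discarding the entry.
void pingThroughPool(ConnectionPool& pool, const HostAndPort& target) {
    ConnectionPool::ConnectionList::iterator it =
        pool.acquireConnection(target, Date_t::now(), Milliseconds(5000));
    try {
        BSONObj info;
        it->conn->runCommand("admin", BSON("ping" << 1), info);  // use the raw DBClientConnection
        pool.releaseConnection(it, Date_t::now());                // assumed return path
    } catch (...) {
        pool.destroyConnection(it);                               // assumed cleanup path
        throw;
    }
}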
Example #23
shared_ptr<ChunkManager> ChunkManager::reload(OperationContext* txn, bool force) const {
    const NamespaceString nss(_ns);
    auto config = uassertStatusOK(grid.catalogCache()->getDatabase(txn, nss.db().toString()));

    return config->getChunkManagerIfExists(txn, getns(), force);
}
Example #24
    void Collection::_compactExtent(const DiskLoc diskloc, int extentNumber,
                                    vector<IndexAccessMethod*>& indexesToInsertTo,
                                    const CompactOptions* compactOptions, CompactStats* stats ) {

        log() << "compact begin extent #" << extentNumber
              << " for namespace " << _ns << " " << diskloc;

        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = diskloc.ext();
        e->assertOk();
        verify( e->validates(diskloc) );

        {
            // the next/prev pointers within the extent might not be in order so we first
            // page the whole thing in sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            size_t length = e->length;

            touch_pages( reinterpret_cast<const char*>(e), length );
            int ms = t.millis();
            if( ms > 1000 )
                log() << "compact end paging in " << ms << "ms "
                      << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            long long datasize = 0;
            long long nrecords = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = getExtentManager()->getNextRecordInExtent(L);
                    BSONObj objOld = BSONObj::make(recOld);

                    if ( compactOptions->validateDocuments && !objOld.valid() ) {
                        // object is corrupt!
                        log() << "compact skipping corrupt document!";
                        stats->corruptDocuments++;
                    }
                    else {
                        unsigned docSize = objOld.objsize();

                        nrecords++;
                        oldObjSize += docSize;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = docSize + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;

                        switch( compactOptions->paddingMode ) {
                        case CompactOptions::NONE:
                            if ( details()->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) )
                                lenWPadding = details()->quantizePowerOf2AllocationSpace(lenWPadding);
                            break;
                        case CompactOptions::PRESERVE:
                            // if we are preserving the padding, the record should not change size
                            lenWPadding = recOld->lengthWithHeaders();
                            break;
                        case CompactOptions::MANUAL:
                            lenWPadding = compactOptions->computeRecordSize(lenWPadding);
                            if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                                lenWPadding = lenWHdr;
                            }
                            break;
                        }

                        CompactDocWriter writer( objOld, lenWPadding );
                        StatusWith<DiskLoc> status = _recordStore->insertRecord( &writer, 0 );
                        uassertStatusOK( status.getStatus() );
                        datasize += _recordStore->recordFor( status.getValue() )->netLength();

                        InsertDeleteOptions options;
                        options.logIfError = false;
                        options.dupsAllowed = true; // in compact we should be doing no checking

                        for ( size_t i = 0; i < indexesToInsertTo.size(); i++ ) {
                            Status idxStatus = indexesToInsertTo[i]->insert( objOld,
                                                                             status.getValue(),
                                                                             options,
                                                                             NULL );
                            uassertStatusOK( idxStatus );
                        }
                    }

                    if( L.isNull() ) {
                        // We just processed the very last record in the old extent. The extent
                        // still points to it, but that is fixed up below, after this loop.
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt(false);
                    }
                }
            } // if !L.isNull()

            verify( details()->firstExtent() == diskloc );
            verify( details()->lastExtent() != diskloc );
            DiskLoc newFirst = e->xnext;
            details()->firstExtent().writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            getExtentManager()->freeExtents( diskloc, diskloc );

            getDur().commitIfNeeded();

            {
                double op = 1.0;
                if( oldObjSize )
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                      << " documents (" << datasize/1000000.0 << "MB)"
                      << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100;
            }
        }

    }
Example #25
StatusWith<ResolvedView> ViewCatalog::resolveView(OperationContext* opCtx,
                                                  const NamespaceString& nss) {
    stdx::unique_lock<stdx::mutex> lock(_mutex);

    // Keep looping until the resolution completes. If the catalog is invalidated during the
    // resolution, we start over from the beginning.
    while (true) {
        // Points to the name of the most resolved namespace.
        const NamespaceString* resolvedNss = &nss;

        // Holds the combination of all the resolved views.
        std::vector<BSONObj> resolvedPipeline;

        // If the catalog has not been tampered with, all views seen during the resolution will have
        // the same collation. As an optimization, we fill out the collation spec only once.
        boost::optional<BSONObj> collation;

        // The last seen view definition, which owns the NamespaceString pointed to by
        // 'resolvedNss'.
        std::shared_ptr<ViewDefinition> lastViewDefinition;

        int depth = 0;
        for (; depth < ViewGraph::kMaxViewDepth; depth++) {
            while (MONGO_FAIL_POINT(hangDuringViewResolution)) {
                log() << "Yielding mutex and hanging due to 'hangDuringViewResolution' failpoint";
                lock.unlock();
                sleepmillis(1000);
                lock.lock();
            }

            // If the catalog has been invalidated, bail and restart.
            if (!_valid.load()) {
                uassertStatusOK(_reloadIfNeeded_inlock(opCtx));
                break;
            }

            auto view = _lookup_inlock(opCtx, resolvedNss->ns());
            if (!view) {
                // Return error status if pipeline is too large.
                int pipelineSize = 0;
                for (auto obj : resolvedPipeline) {
                    pipelineSize += obj.objsize();
                }
                if (pipelineSize > ViewGraph::kMaxViewPipelineSizeBytes) {
                    return {ErrorCodes::ViewPipelineMaxSizeExceeded,
                            str::stream() << "View pipeline exceeds maximum size; maximum size is "
                                          << ViewGraph::kMaxViewPipelineSizeBytes};
                }
                return StatusWith<ResolvedView>(
                    {*resolvedNss, std::move(resolvedPipeline), std::move(collation.get())});
            }

            resolvedNss = &view->viewOn();
            if (!collation) {
                collation = view->defaultCollator() ? view->defaultCollator()->getSpec().toBSON()
                                                    : CollationSpec::kSimpleSpec;
            }

            // Prepend the underlying view's pipeline to the current working pipeline.
            const std::vector<BSONObj>& toPrepend = view->pipeline();
            resolvedPipeline.insert(resolvedPipeline.begin(), toPrepend.begin(), toPrepend.end());

            // If the first stage is a $collStats, then we return early with the viewOn namespace.
            if (toPrepend.size() > 0 && !toPrepend[0]["$collStats"].eoo()) {
                return StatusWith<ResolvedView>(
                    {*resolvedNss, std::move(resolvedPipeline), std::move(collation.get())});
            }
        }

        if (depth >= ViewGraph::kMaxViewDepth) {
            return {ErrorCodes::ViewDepthLimitExceeded,
                    str::stream() << "View depth too deep or view cycle detected; maximum depth is "
                                  << ViewGraph::kMaxViewDepth};
        }
    }
    MONGO_UNREACHABLE;
}
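A hedged sketch of how a caller might consume the result; the ResolvedView accessors used below ('getNamespace', 'getPipeline') are assumptions, not taken from the code above.

// Hedged usage sketch; the accessor names are assumptions.
StatusWith<ResolvedView> swResolved =
    viewCatalog->resolveView(opCtx, NamespaceString("db.myView"));
if (swResolved.isOK()) {
    const ResolvedView& rv = swResolved.getValue();
    // rv.getNamespace() names the backing collection; rv.getPipeline() is the combined
    // pipeline, with each underlying view's stages prepended.
}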
Example #26
    virtual bool run(OperationContext* opCtx,
                     const string&,
                     const BSONObj& cmdObj,
                     BSONObjBuilder& result) {
        /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not
           authenticated.
        */
        if (cmdObj["forShell"].trueValue()) {
            LastError::get(opCtx->getClient()).disable();
        }

        transport::Session::TagMask sessionTagsToSet = 0;
        transport::Session::TagMask sessionTagsToUnset = 0;

        // Tag connections to avoid closing them on stepdown.
        auto hangUpElement = cmdObj["hangUpOnStepDown"];
        if (!hangUpElement.eoo() && !hangUpElement.trueValue()) {
            sessionTagsToSet |= transport::Session::kKeepOpen;
        }

        auto& clientMetadataIsMasterState = ClientMetadataIsMasterState::get(opCtx->getClient());
        bool seenIsMaster = clientMetadataIsMasterState.hasSeenIsMaster();
        if (!seenIsMaster) {
            clientMetadataIsMasterState.setSeenIsMaster();
        }

        BSONElement element = cmdObj[kMetadataDocumentName];
        if (!element.eoo()) {
            if (seenIsMaster) {
                uasserted(ErrorCodes::ClientMetadataCannotBeMutated,
                          "The client metadata document may only be sent in the first isMaster");
            }

            auto swParseClientMetadata = ClientMetadata::parse(element);

            uassertStatusOK(swParseClientMetadata.getStatus());

            invariant(swParseClientMetadata.getValue());

            swParseClientMetadata.getValue().get().logClientMetadata(opCtx->getClient());

            clientMetadataIsMasterState.setClientMetadata(
                opCtx->getClient(), std::move(swParseClientMetadata.getValue()));
        }

        // Parse the optional 'internalClient' field. This is provided by incoming connections from
        // mongod and mongos.
        auto internalClientElement = cmdObj["internalClient"];
        if (internalClientElement) {
            sessionTagsToSet |= transport::Session::kInternalClient;

            uassert(ErrorCodes::TypeMismatch,
                    str::stream() << "'internalClient' must be of type Object, but was of type "
                                  << typeName(internalClientElement.type()),
                    internalClientElement.type() == BSONType::Object);

            bool foundMaxWireVersion = false;
            for (auto&& elem : internalClientElement.Obj()) {
                auto fieldName = elem.fieldNameStringData();
                if (fieldName == "minWireVersion") {
                    // We do not currently use 'internalClient.minWireVersion'.
                    continue;
                } else if (fieldName == "maxWireVersion") {
                    foundMaxWireVersion = true;

                    uassert(ErrorCodes::TypeMismatch,
                            str::stream() << "'maxWireVersion' field of 'internalClient' must be "
                                             "of type int, but was of type "
                                          << typeName(elem.type()),
                            elem.type() == BSONType::NumberInt);

                    // All incoming connections from mongod/mongos of earlier versions should be
                    // closed if the featureCompatibilityVersion is bumped to 3.6.
                    if (elem.numberInt() >=
                        WireSpec::instance().incomingInternalClient.maxWireVersion) {
                        sessionTagsToSet |=
                            transport::Session::kLatestVersionInternalClientKeepOpen;
                    } else {
                        sessionTagsToUnset |=
                            transport::Session::kLatestVersionInternalClientKeepOpen;
                    }
                } else {
                    uasserted(ErrorCodes::BadValue,
                              str::stream() << "Unrecognized field of 'internalClient': '"
                                            << fieldName
                                            << "'");
                }
            }

            uassert(ErrorCodes::BadValue,
                    "Missing required field 'maxWireVersion' of 'internalClient'",
                    foundMaxWireVersion);
        } else {
            sessionTagsToUnset |= (transport::Session::kInternalClient |
                                   transport::Session::kLatestVersionInternalClientKeepOpen);
            sessionTagsToSet |= transport::Session::kExternalClientKeepOpen;
        }

        auto session = opCtx->getClient()->session();
        if (session) {
            session->mutateTags(
                [sessionTagsToSet, sessionTagsToUnset](transport::Session::TagMask originalTags) {
                    // After a mongos sends the initial "isMaster" command with its mongos client
                    // information, it sometimes sends another "isMaster" command that is forwarded
                    // from its client. Once kInternalClient has been set, we assume that any future
                    // "isMaster" commands are forwarded in this manner, and we do not update the
                    // session tags.
                    if ((originalTags & transport::Session::kInternalClient) == 0) {
                        return (originalTags | sessionTagsToSet) & ~sessionTagsToUnset;
                    } else {
                        return originalTags;
                    }
                });
        }

        appendReplicationInfo(opCtx, result, 0);

        if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
            const int configServerModeNumber = 2;
            result.append("configsvr", configServerModeNumber);
        }

        result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
        result.appendNumber("maxMessageSizeBytes", MaxMessageSizeBytes);
        result.appendNumber("maxWriteBatchSize", write_ops::kMaxWriteBatchSize);
        result.appendDate("localTime", jsTime());
        result.append("logicalSessionTimeoutMinutes", localLogicalSessionTimeoutMinutes);
        result.appendNumber("connectionId", opCtx->getClient()->getConnectionId());

        if (internalClientElement) {
            result.append("minWireVersion",
                          WireSpec::instance().incomingInternalClient.minWireVersion);
            result.append("maxWireVersion",
                          WireSpec::instance().incomingInternalClient.maxWireVersion);
        } else {
            result.append("minWireVersion",
                          WireSpec::instance().incomingExternalClient.minWireVersion);
            result.append("maxWireVersion",
                          WireSpec::instance().incomingExternalClient.maxWireVersion);
        }

        result.append("readOnly", storageGlobalParams.readOnly);

        const auto parameter = mapFindWithDefault(ServerParameterSet::getGlobal()->getMap(),
                                                  "automationServiceDescriptor",
                                                  static_cast<ServerParameter*>(nullptr));
        if (parameter)
            parameter->append(opCtx, result, "automationServiceDescriptor");

        if (opCtx->getClient()->session()) {
            MessageCompressorManager::forSession(opCtx->getClient()->session())
                .serverNegotiate(cmdObj, &result);
        }

        auto& saslMechanismRegistry = SASLServerMechanismRegistry::get(opCtx->getServiceContext());
        saslMechanismRegistry.advertiseMechanismNamesForUser(opCtx, cmdObj, &result);

        return true;
    }
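For clarity, a hedged sketch of the 'internalClient' sub-document shape that the parsing loop above accepts; the wire version numbers are illustrative only.

// Illustrative only: an isMaster command as an internal mongod/mongos might send it.
// 'maxWireVersion' must be a NumberInt, as enforced above.
BSONObj internalIsMaster = BSON("isMaster" << 1
                                           << "internalClient"
                                           << BSON("minWireVersion" << 6 << "maxWireVersion" << 6));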
Example #27
void Strategy::queryOp(OperationContext* txn, Request& r) {
    verify(!NamespaceString(r.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(r.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForQuery(ns, q.query);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {

        ReadPreferenceSetting readPreference(ReadPreference::PrimaryOnly, TagSet::primaryOnly());

        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kFindCommandReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // Do the work to generate the first batch of results. This blocks waiting to get responses
        // from the shard(s).
        std::vector<BSONObj> batch;

        // A cursor id of 0 means the cursor is exhausted; otherwise a cursor with the returned
        // id can be retrieved later via the ClusterCursorManager.
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // Build the response document.
        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     r.p(),
                     r.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, r, qSpec)) {
        return;
    }

    ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo());
    verify(cursor);

    // TODO:  Move out to Request itself, not strategy based
    try {
        cursor->init(txn);

        if (qSpec.isExplain()) {
            BSONObjBuilder explain_builder;
            cursor->explain(explain_builder);
            explain_builder.appendNumber("executionTimeMillis",
                                         static_cast<long long>(queryTimer.millis()));
            BSONObj b = explain_builder.obj();

            replyToQuery(0, r.p(), r.m(), b);
            delete (cursor);
            return;
        }
    } catch (...) {
        delete cursor;
        throw;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache

    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     r.p(),
                     r.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely; we shouldn't need this cursor object afterwards.
        unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor);

        ShardPtr shard = cursor->getQueryShard();
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        r.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
Example #28
    BSONObj rewriteCommandForListingOwnCollections(OperationContext* opCtx,
                                                   const std::string& dbName,
                                                   const BSONObj& cmdObj) {
        mutablebson::Document rewrittenCmdObj(cmdObj);
        mutablebson::Element ownCollections =
            mutablebson::findFirstChildNamed(rewrittenCmdObj.root(), "authorizedCollections");

        AuthorizationSession* authzSession = AuthorizationSession::get(opCtx->getClient());

        // We must strip $ownCollections from the delegated command.
        uassertStatusOK(ownCollections.remove());

        BSONObj collectionFilter;

        // Extract and retain any previous filter
        mutablebson::Element oldFilter =
            mutablebson::findFirstChildNamed(rewrittenCmdObj.root(), "filter");

        // Make a new filter, containing a $and array.
        mutablebson::Element newFilter = rewrittenCmdObj.makeElementObject("filter");
        mutablebson::Element newFilterAnd = rewrittenCmdObj.makeElementArray("$and");
        uassertStatusOK(newFilter.pushBack(newFilterAnd));

        // Append a rule to the $and, which rejects system collections.
        mutablebson::Element systemCollectionsFilter = rewrittenCmdObj.makeElementObject(
            "", BSON("name" << BSON("$regex" << BSONRegEx("^(?!system\\.)"))));
        uassertStatusOK(newFilterAnd.pushBack(systemCollectionsFilter));

        if (!authzSession->isAuthorizedForAnyActionOnResource(
                ResourcePattern::forDatabaseName(dbName))) {
            // We passed an auth check which said we might be able to render some collections,
            // but it doesn't seem like we should render all of them. We must filter.

            // Compute the set of collection names which would be permissible to return.
            std::set<std::string> collectionNames;
            for (UserNameIterator nameIter = authzSession->getAuthenticatedUserNames();
                 nameIter.more();
                 nameIter.next()) {
                User* authUser = authzSession->lookupUser(*nameIter);
                const User::ResourcePrivilegeMap& resourcePrivilegeMap = authUser->getPrivileges();
                for (const std::pair<ResourcePattern, Privilege>& resourcePrivilege :
                     resourcePrivilegeMap) {
                    const auto& resource = resourcePrivilege.first;
                    if (resource.isCollectionPattern() || (resource.isExactNamespacePattern() &&
                                                           resource.databaseToMatch() == dbName)) {
                        collectionNames.emplace(resource.collectionToMatch().toString());
                    }
                }
            }

            // Construct a new filter predicate which returns only collections we were found to
            // have privileges for.
            BSONObjBuilder predicateBuilder;
            BSONObjBuilder nameBuilder(predicateBuilder.subobjStart("name"));
            BSONArrayBuilder setBuilder(nameBuilder.subarrayStart("$in"));

            // Load the de-duplicated set into a BSON array
            for (StringData collectionName : collectionNames) {
                setBuilder << collectionName;
            }
            setBuilder.done();
            nameBuilder.done();

            collectionFilter = predicateBuilder.obj();

            // Filter the results by our collection names.
            mutablebson::Element newFilterAndIn =
                rewrittenCmdObj.makeElementObject("", collectionFilter);
            uassertStatusOK(newFilterAnd.pushBack(newFilterAndIn));
        }

        // If there was a pre-existing filter, compose it with our new one.
        if (oldFilter.ok()) {
            uassertStatusOK(oldFilter.remove());
            uassertStatusOK(newFilterAnd.pushBack(oldFilter));
        }

        // Attach our new composite filter back onto the listCollections command object.
        uassertStatusOK(rewrittenCmdObj.root().pushBack(newFilter));

        return rewrittenCmdObj.getObject();
    }
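To make the shape of the rewritten filter concrete, a hedged illustration with hypothetical collection names; the actual filter is assembled element by element above.

// Hedged illustration (hypothetical names "accounts" and "orders"): the composite filter
// pairs the system-collection exclusion with the permitted-name set, roughly:
BSONObj exampleFilter = BSON(
    "$and" << BSON_ARRAY(BSON("name" << BSON("$regex" << BSONRegEx("^(?!system\\.)")))
                         << BSON("name" << BSON("$in" << BSON_ARRAY("accounts" << "orders")))));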
Example #29
bool ChunkManager::_load(OperationContext* txn,
                         ChunkMap& chunkMap,
                         set<ShardId>& shardIds,
                         ShardVersionMap* shardVersions,
                         const ChunkManager* oldManager) {
    // Reset the max version, but not the epoch, when we aren't loading from the oldManager
    _version = ChunkVersion(0, 0, _version.epoch());

    // If we have a previous version of the ChunkManager to work from, use that info to reduce
    // our config query
    if (oldManager && oldManager->getVersion().isSet()) {
        // Get the old max version
        _version = oldManager->getVersion();

        // Load a copy of the old versions
        *shardVersions = oldManager->_shardVersions;

        // Load a copy of the chunk map, replacing the chunk manager with our own
        const ChunkMap& oldChunkMap = oldManager->getChunkMap();

        // Could be very expensive
        // TODO: If chunks were immutable and didn't reference the manager, we could do more
        // interesting things here
        for (const auto& oldChunkMapEntry : oldChunkMap) {
            shared_ptr<Chunk> oldC = oldChunkMapEntry.second;
            shared_ptr<Chunk> newC(new Chunk(
                this, oldC->getMin(), oldC->getMax(), oldC->getShardId(), oldC->getLastmod()));

            newC->setBytesWritten(oldC->getBytesWritten());

            chunkMap.insert(make_pair(oldC->getMax(), newC));
        }

        LOG(2) << "loading chunk manager for collection " << _ns
               << " using old chunk manager w/ version " << _version.toString() << " and "
               << oldChunkMap.size() << " chunks";
    }

    // Attach a diff tracker for the versioned chunk data
    CMConfigDiffTracker differ(this);
    differ.attach(_ns, chunkMap, _version, *shardVersions);

    // Diff tracker should *always* find at least one chunk if collection exists
    // Get the diff query required
    auto diffQuery = differ.configDiffQuery();

    repl::OpTime opTime;
    std::vector<ChunkType> chunks;
    uassertStatusOK(grid.catalogManager(txn)->getChunks(
        txn, diffQuery.query, diffQuery.sort, boost::none, &chunks, &opTime));

    invariant(opTime >= _configOpTime);
    _configOpTime = opTime;

    int diffsApplied = differ.calculateConfigDiff(txn, chunks);
    if (diffsApplied > 0) {
        LOG(2) << "loaded " << diffsApplied << " chunks into new chunk manager for " << _ns
               << " with version " << _version;

        // Add all existing shards we find to the shards set
        for (ShardVersionMap::iterator it = shardVersions->begin(); it != shardVersions->end();) {
            shared_ptr<Shard> shard = grid.shardRegistry()->getShard(txn, it->first);
            if (shard) {
                shardIds.insert(it->first);
                ++it;
            } else {
                shardVersions->erase(it++);
            }
        }

        _configOpTime = opTime;

        return true;
    } else if (diffsApplied == 0) {
        // No chunks were found for the ns
        warning() << "no chunks found when reloading " << _ns << ", previous version was "
                  << _version;

        // Set all our data to empty
        chunkMap.clear();
        shardVersions->clear();

        _version = ChunkVersion(0, 0, OID());
        _configOpTime = opTime;

        return true;
    } else {  // diffsApplied < 0

        bool allInconsistent = (differ.numValidDiffs() == 0);
        if (allInconsistent) {
            // All versions are different, this can be normal
            warning() << "major change in chunk information found when reloading " << _ns
                      << ", previous version was " << _version;
        } else {
            // Inconsistent load halfway through (due to yielding cursor during load)
            // should be rare
            warning() << "inconsistent chunks found when reloading " << _ns
                      << ", previous version was " << _version << ", this should be rare";
        }

        // Set all our data to empty to be extra safe
        chunkMap.clear();
        shardVersions->clear();

        _version = ChunkVersion(0, 0, OID());

        return allInconsistent;
    }
}
Example #30
    long long DeleteExecutor::execute() {
        uassertStatusOK(prepare());
        uassert(17417,
                mongoutils::str::stream() <<
                "DeleteExecutor::prepare() failed to parse query " << _request->getQuery(),
                _isQueryParsed);
        const bool logop = _request->shouldCallLogOp();
        const NamespaceString& ns(_request->getNamespaceString());
        if (!_request->isGod()) {
            if (ns.isSystem()) {
                uassert(12050,
                        "cannot delete from system namespace",
                        legalClientSystemNS(ns.ns(), true));
            }
            if (ns.ns().find('$') != string::npos) {
                log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
                uasserted( 10100, "cannot delete from collection with reserved $ in name" );
            }
        }

        massert(17418,
                mongoutils::str::stream() <<
                "dbname = " << currentClient.get()->database()->name() <<
                "; ns = " << ns.ns(),
                currentClient.get()->database()->name() == nsToDatabaseSubstring(ns.ns()));
        Collection* collection = currentClient.get()->database()->getCollection(ns.ns());
        if (NULL == collection) {
            return 0;
        }

        uassert(10101,
                str::stream() << "cannot remove from a capped collection: " << ns.ns(),
                !collection->isCapped());

        uassert(ErrorCodes::NotMaster,
                str::stream() << "Not primary while removing from " << ns.ns(),
                !logop || isMasterNs(ns.ns().c_str()));

        long long nDeleted = 0;

        const bool canYield = !_request->isGod() && (
                _canonicalQuery.get() ?
                !QueryPlannerCommon::hasNode(_canonicalQuery->root(), MatchExpression::ATOMIC) :
                LiteParsedQuery::isQueryIsolated(_request->getQuery()));

        Runner* rawRunner;
        if (_canonicalQuery.get()) {
            uassertStatusOK(getRunner(collection, _canonicalQuery.release(), &rawRunner));
        }
        else {
            CanonicalQuery* ignored;
            uassertStatusOK(getRunner(collection,
                                      ns.ns(),
                                      _request->getQuery(),
                                      &rawRunner,
                                      &ignored));
        }

        auto_ptr<Runner> runner(rawRunner);
        auto_ptr<ScopedRunnerRegistration> safety;

        if (canYield) {
            safety.reset(new ScopedRunnerRegistration(runner.get()));
            runner->setYieldPolicy(Runner::YIELD_AUTO);
        }

        DiskLoc rloc;
        Runner::RunnerState state;
        CurOp* curOp = cc().curop();
        int oldYieldCount = curOp->numYields();
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &rloc))) {
            if (oldYieldCount != curOp->numYields()) {
                uassert(ErrorCodes::NotMaster,
                        str::stream() << "No longer primary while removing from " << ns.ns(),
                        !logop || isMasterNs(ns.ns().c_str()));
                oldYieldCount = curOp->numYields();
            }
            BSONObj toDelete;

            // TODO: do we want to buffer docs and delete them in a group rather than
            // saving/restoring state repeatedly?
            runner->saveState();
            collection->deleteDocument(rloc, false, false, logop ? &toDelete : NULL );
            runner->restoreState();

            nDeleted++;

            if (logop) {
                if ( toDelete.isEmpty() ) {
                    problem() << "deleted object without id, not logging" << endl;
                }
                else {
                    bool replJustOne = true;
                    logOp("d", ns.ns().c_str(), toDelete, 0, &replJustOne);
                }
            }

            if (!_request->isMulti()) {
                break;
            }

            if (!_request->isGod()) {
                getDur().commitIfNeeded();
            }

            if (debug && _request->isGod() && nDeleted == 100) {
                log() << "warning high number of deletes with god=true "
                      << " which could use significant memory b/c we don't commit journal";
            }
        }

        return nDeleted;
    }