Example #1
    // Attempts to migrate each candidate chunk to its target shard.
    //
    // For every entry in 'candidateChunks' this looks up the database config and the chunk
    // manager for chunkInfo.ns, finds the chunk containing chunkInfo.chunk.min and calls
    // moveAndCommit() on it with Shard::make( chunkInfo.to ). 'secondaryThrottle' is forwarded
    // unchanged to moveAndCommit(). Returns 'movedCount'.
    //
    // NOTE(review): this excerpt looks elided -- several closing braces are missing and no
    // visible statement increments 'movedCount' -- so confirm control flow against the full file.
    int Balancer::_moveChunks( const vector<CandidateChunkPtr>* candidateChunks , bool secondaryThrottle ) {
        int movedCount = 0;

        for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
            const CandidateChunk& chunkInfo = *it->get();

            DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
            verify( cfg );

            ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
            verify( cm );

            // Find the chunk that currently contains the candidate's min key. A non-zero
            // woCompare() on either bound is treated as a mismatch with the candidate's bounds.
            ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
            if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                // likely a split happened somewhere
                cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
                verify( cm );

                // Same bounds check again, this time against the reloaded chunk manager.
                c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                    log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;

            // 'res' is handed to moveAndCommit() and is logged below when the move fails.
            BSONObj res;
            if ( c->moveAndCommit( Shard::make( chunkInfo.to ) , Chunk::MaxChunkSize , secondaryThrottle , res ) ) {

            // the move requires acquiring the collection metadata's lock, which can fail
            log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
                  << " chunk: " << chunkInfo.chunk << endl;

            // A truthy "chunkTooBig" field in the failure result triggers a split attempt.
            if ( res["chunkTooBig"].trueValue() ) {
                // reload just to be safe
                // NOTE(review): unlike the mismatch path above, no reload flag is passed here --
                // confirm whether a real reload is intended.
                cm = cfg->getChunkManager( chunkInfo.ns );
                verify( cm );
                c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                log() << "forcing a split because migrate failed for size reasons" << endl;
                // Reset 'res' so the logged split result does not contain the move failure.
                res = BSONObj();
                c->singleSplit( true , res );
                log() << "forced split results: " << res << endl;
                if ( ! res["ok"].trueValue() ) {
                    log() << "marking chunk as jumbo: " << c->toString() << endl;
                    // we increment moveCount so we do another round right away


        return movedCount;
Example #2
     * Splits the chunks touched, based on the targeter stats, if needed.
    // For each entry of stats.chunkSizeDelta, finds the chunk of 'ns' that contains the entry's
    // key and calls splitIfShould() on it with the entry's value.
    //
    // ns:    namespace whose database config and chunk manager are looked up.
    // stats: only stats.chunkSizeDelta (a map<BSONObj, int>) is read here.
    //
    // NOTE(review): the bodies of the guards below (auto-split disabled, config lookup failure,
    // no chunk manager, chunk lookup failure) and their closing braces are not visible in this
    // excerpt; presumably each bails out early -- TODO confirm against the full source.
    static void splitIfNeeded( const string& ns, const TargeterStats& stats ) {
        if ( !Chunk::ShouldAutoSplit ) {

        DBConfigPtr config;

        try {
            config = grid.getDBConfig( ns );
        catch ( const DBException& ex ) {
            warning() << "failed to get database config for " << ns
                      << " while checking for auto-split: " << causedBy( ex ) << endl;

        // getChunkManagerOrPrimary() takes both out-arguments; only 'chunkManager' is used
        // afterwards, 'dummyShard' is never read in the visible code.
        ChunkManagerPtr chunkManager;
        ShardPtr dummyShard;
        config->getChunkManagerOrPrimary( ns, chunkManager, dummyShard );

        if ( !chunkManager ) {

        for ( map<BSONObj, int>::const_iterator it = stats.chunkSizeDelta.begin();
            it != stats.chunkSizeDelta.end(); ++it ) {

            // it->first is the key used to locate the chunk, it->second the value handed to
            // splitIfShould().
            ChunkPtr chunk;
            try {
                chunk = chunkManager->findIntersectingChunk( it->first );
            catch ( const AssertionException& ex ) {
                warning() << "could not find chunk while checking for auto-split: "
                          << causedBy( ex ) << endl;

            chunk->splitIfShould( it->second );
Example #3
    // Attempts to migrate each candidate chunk to its target shard, handling failures per chunk.
    //
    // Each iteration runs inside a try block; a DBException is logged as a warning that says the
    // balancing round continues. Returns 'movedCount'.
    //
    // NOTE(review): this excerpt looks elided -- the function's opening brace, several closing
    // braces and any increment of 'movedCount' are missing, and 'secondaryThrottle' and
    // 'waitForDelete' are not referenced in the visible lines (the moveAndCommit() argument list
    // appears truncated). Confirm against the full source.
    int Balancer::_moveChunks(const vector<CandidateChunkPtr>* candidateChunks,
                              bool secondaryThrottle,
                              bool waitForDelete)
        int movedCount = 0;

        for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
            const CandidateChunk& chunkInfo = *it->get();

            // Changes to metadata, borked metadata, and connectivity problems should cause us to
            // abort this chunk move, but shouldn't cause us to abort the entire round of chunks.
            // TODO: Handle all these things more cleanly, since they're expected problems
            try {

                DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
                verify( cfg );

                // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
                // tried to do so once.
                ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
                verify( cm );

                // Find the chunk that currently contains the candidate's min key. A non-zero
                // woCompare() on either bound is treated as a mismatch with the candidate.
                ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                    // likely a split happened somewhere
                    cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
                    verify( cm );

                    // Same bounds check again, this time against the reloaded chunk manager.
                    c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                    if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
                        log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;

                // 'res' is handed to moveAndCommit() and is logged below when the move fails.
                BSONObj res;
                if (c->moveAndCommit(Shard::make(chunkInfo.to),
                                     0, /* maxTimeMS */
                                     res)) {

                // the move requires acquiring the collection metadata's lock, which can fail
                log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
                      << " chunk: " << chunkInfo.chunk << endl;

                // A truthy "chunkTooBig" field in the failure result triggers a split attempt.
                if ( res["chunkTooBig"].trueValue() ) {
                    // reload just to be safe
                    // NOTE(review): no reload flag is passed here, unlike the mismatch path
                    // above -- confirm whether a real reload is intended.
                    cm = cfg->getChunkManager( chunkInfo.ns );
                    verify( cm );
                    c = cm->findIntersectingChunk( chunkInfo.chunk.min );

                    log() << "forcing a split because migrate failed for size reasons" << endl;

                    // Reset 'res' so the logged split result does not contain the move failure.
                    res = BSONObj();
                    c->singleSplit( true , res );
                    log() << "forced split results: " << res << endl;

                    if ( ! res["ok"].trueValue() ) {
                        log() << "marking chunk as jumbo: " << c->toString() << endl;
                        // we increment moveCount so we do another round right away

            catch( const DBException& ex ) {
                warning() << "could not move chunk " << chunkInfo.chunk.toString()
                          << ", continuing balancing round" << causedBy( ex ) << endl;

        return movedCount;
Example #4
    // Runs one balancing round and appends the chunks that should be moved to 'candidateChunks'.
    //
    // conn:            connection used for the CollectionType, ChunkType and TagsType queries.
    // candidateChunks: output vector; must be non-null (checked with verify()). One entry is
    //                  pushed per collection for which _policy->balance() returns non-null.
    //
    // The work is done in the three numbered steps commented below.
    //
    // NOTE(review): this excerpt looks elided -- many closing braces, early returns/continues
    // and the tails of several statements are missing. Comments describe only what is visible.
    void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
        verify( candidateChunks );

        // 1. Check whether there is any sharded collection to be balanced by querying
        // the CollectionType::ConfigNS collection

        auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());

        // A null cursor is reported as a failed query.
        if ( NULL == cursor.get() ) {
            warning() << "could not query " << CollectionType::ConfigNS
                      << " while trying to balance" << endl;

        vector< string > collections;
        while ( cursor->more() ) {
            BSONObj col = cursor->nextSafe();

            // sharded collections will have a shard "key".
            // Collections with a truthy noBalance field are not added; they are only logged.
            if ( ! col[CollectionType::keyPattern()].eoo() &&
                 ! col[CollectionType::noBalance()].trueValue() ){
                collections.push_back( col[CollectionType::ns()].String() );
            else if( col[CollectionType::noBalance()].trueValue() ){
                LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                       << ", explicitly disabled" << endl;


        if ( collections.empty() ) {
            LOG(1) << "no collections to balance" << endl;

        // 2. Get a list of all the shards that are participating in this balance round
        // along with any maximum allowed quotas and current utilization. We get the
        // latter by issuing db.serverStatus() (mem.mapped) to all shards.
        // TODO: skip unresponsive shards and mark information as stale.

        ShardInfoMap shardInfo;
        Status loadStatus = DistributionStatus::populateShardInfoMap(&shardInfo);

        if (!loadStatus.isOK()) {
            warning() << "failed to load shard metadata" << causedBy(loadStatus) << endl;

        // Fewer than two entries in 'shardInfo' is logged as not balanceable.
        if (shardInfo.size() < 2) {
            LOG(1) << "can't balance without more active shards" << endl;

        OCCASIONALLY warnOnMultiVersion( shardInfo );

        // 3. For each collection, check if the balancing policy recommends moving anything around.

        for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
            const string& ns = *it;

            // NOTE(review): the query arguments after ChunkType::ConfigNS are cut off in this
            // excerpt.
            OwnedPointerMap<string, OwnedPointerVector<ChunkType> > shardToChunksMap;
            cursor = conn.query(ChunkType::ConfigNS,

            set<BSONObj> allChunkMinimums;

            while ( cursor->more() ) {
                BSONObj chunkDoc = cursor->nextSafe().getOwned();

                // Parse each chunk document into a ChunkType; parse failures are logged.
                auto_ptr<ChunkType> chunk(new ChunkType());
                string errmsg;
                if (!chunk->parseBSON(chunkDoc, &errmsg)) {
                    error() << "bad chunk format for " << chunkDoc
                            << ": " << errmsg << endl;

                // NOTE(review): the initializer of 'chunkList' is cut off in this excerpt.
                OwnedPointerVector<ChunkType>*& chunkList =

                if (chunkList == NULL) {
                    chunkList = new OwnedPointerVector<ChunkType>();


            if (shardToChunksMap.map().empty()) {
                LOG(1) << "skipping empty collection (" << ns << ")";

            for (ShardInfoMap::const_iterator i = shardInfo.begin(); i != shardInfo.end(); ++i) {
                // this just makes sure there is an entry in shardToChunksMap for every shard
                OwnedPointerVector<ChunkType>*& chunkList =

                if (chunkList == NULL) {
                    chunkList = new OwnedPointerVector<ChunkType>();

            DistributionStatus status(shardInfo, shardToChunksMap.map());

            // load tags
            // NOTE(review): the remaining clusterCreateIndex() arguments are cut off here.
            Status result = clusterCreateIndex(TagsType::ConfigNS,
                                               BSON(TagsType::ns() << 1 << TagsType::min() << 1),
                                               true, // unique

            if ( !result.isOK() ) {
                warning() << "could not create index tags_1_min_1: " << result.reason() << endl;

            cursor = conn.query(TagsType::ConfigNS,

            vector<TagRange> ranges;

            // NOTE(review): the TagRange construction and the check around status.addTagRange()
            // are partially cut off below; nothing visible appends to 'ranges'.
            while ( cursor->more() ) {
                BSONObj tag = cursor->nextSafe();
                TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                        str::stream() << "tag ranges not valid for: " << ns,
                        status.addTagRange(tr) );


            DBConfigPtr cfg = grid.getDBConfig( ns );
            if ( !cfg ) {
                warning() << "could not load db config to balance " << ns << " collection" << endl;

            // This line reloads the chunk manager once if this process doesn't know the collection
            // is sharded yet.
            ChunkManagerPtr cm = cfg->getChunkManagerIfExists( ns, true );
            if ( !cm ) {
                warning() << "could not load chunks to balance " << ns << " collection" << endl;

            // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
            bool didAnySplits = false;
            for ( unsigned i = 0; i < ranges.size(); i++ ) {
                BSONObj min = ranges[i].min;

                min = cm->getShardKey().extendRangeBound( min, false );

                // NOTE(review): the statement guarded by this 'if' is missing from the excerpt
                // (presumably a 'continue' when a chunk already starts at 'min' -- TODO confirm).
                if ( allChunkMinimums.count( min ) > 0 )

                didAnySplits = true;

                log() << "ns: " << ns << " need to split on "
                      << min << " because there is a range there" << endl;

                ChunkPtr c = cm->findIntersectingChunk( min );

                // Split the intersecting chunk at the single point 'min'.
                vector<BSONObj> splitPoints;
                splitPoints.push_back( min );

                BSONObj res;
                if ( !c->multiSplit( splitPoints, res ) ) {
                    error() << "split failed: " << res << endl;
                else {
                    LOG(1) << "split worked: " << res << endl;

            if ( didAnySplits ) {
                // state change, just wait till next round

            // Ask the policy for a chunk to move; a null result means nothing is queued for 'ns'.
            CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime );
            if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
Example #5
    // Runs one balancing round and appends the chunks that should be moved to 'candidateChunks'.
    //
    // conn:            connection used for the CollectionType, ChunkType and TagsType queries.
    // candidateChunks: output vector; must be non-null (checked with verify()). One entry is
    //                  pushed per collection for which _policy->balance() returns non-null.
    //
    // The work is done in the three numbered steps commented below.
    //
    // NOTE(review): this excerpt looks elided -- many closing braces, early returns/continues
    // and the tails of several statements are missing. Comments describe only what is visible.
    void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
        verify( candidateChunks );

        // 1. Check whether there is any sharded collection to be balanced by querying
        // the CollectionType::ConfigNS collection

        auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());
        vector< string > collections;
        while ( cursor->more() ) {
            BSONObj col = cursor->nextSafe();

            // sharded collections will have a shard "key".
            // Collections with a truthy noBalance field are not added; they are only logged.
            if ( ! col[CollectionType::keyPattern()].eoo() &&
                 ! col[CollectionType::noBalance()].trueValue() ){
                collections.push_back( col[CollectionType::ns()].String() );
            else if( col[CollectionType::noBalance()].trueValue() ){
                LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                       << ", explicitly disabled" << endl;


        if ( collections.empty() ) {
            LOG(1) << "no collections to balance" << endl;

        // 2. Get a list of all the shards that are participating in this balance round
        // along with any maximum allowed quotas and current utilization. We get the
        // latter by issuing db.serverStatus() (mem.mapped) to all shards.
        // TODO: skip unresponsive shards and mark information as stale.

        vector<Shard> allShards;
        Shard::getAllShards( allShards );
        // Fewer than two shards is logged as not balanceable.
        if ( allShards.size() < 2) {
            LOG(1) << "can't balance without more active shards" << endl;
        // Build one ShardInfo per shard, keyed by shard name.
        // NOTE(review): the ShardInfo constructor arguments after getMaxSize() are cut off, so
        // how 'status' is used is not visible here.
        ShardInfoMap shardInfo;
        for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) {
            const Shard& s = *it;
            ShardStatus status = s.getStatus();
            shardInfo[ s.getName() ] = ShardInfo( s.getMaxSize(),

        // 3. For each collection, check if the balancing policy recommends moving anything around.

        for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
            const string& ns = *it;

            // NOTE(review): the query arguments after ChunkType::ConfigNS are cut off in this
            // excerpt.
            map< string,vector<BSONObj> > shardToChunksMap;
            cursor = conn.query(ChunkType::ConfigNS,

            set<BSONObj> allChunkMinimums;

            // Group the owned chunk documents by their shard field and record every chunk's min
            // key in 'allChunkMinimums'.
            while ( cursor->more() ) {
                BSONObj chunk = cursor->nextSafe().getOwned();
                vector<BSONObj>& chunks = shardToChunksMap[chunk[ChunkType::shard()].String()];
                allChunkMinimums.insert( chunk[ChunkType::min()].Obj() );
                chunks.push_back( chunk );

            if (shardToChunksMap.empty()) {
                LOG(1) << "skipping empty collection (" << ns << ")";

            for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) {
                // this just makes sure there is an entry in shardToChunksMap for every shard
                Shard s = *i;

            DistributionStatus status( shardInfo, shardToChunksMap );

            // load tags
            // NOTE(review): the call that the following argument line belongs to is missing from
            // this excerpt.
                             BSON(TagsType::ns() << 1 << TagsType::min() << 1),

            cursor = conn.query(TagsType::ConfigNS,

            vector<TagRange> ranges;

            // NOTE(review): the TagRange construction and the check around status.addTagRange()
            // are partially cut off below; nothing visible appends to 'ranges'.
            while ( cursor->more() ) {
                BSONObj tag = cursor->nextSafe();
                TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                        str::stream() << "tag ranges not valid for: " << ns,
                        status.addTagRange(tr) );


            DBConfigPtr cfg = grid.getDBConfig( ns );
            verify( cfg );
            ChunkManagerPtr cm = cfg->getChunkManager( ns );
            verify( cm );

            // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
            bool didAnySplits = false;
            for ( unsigned i = 0; i < ranges.size(); i++ ) {
                BSONObj min = ranges[i].min;

                min = cm->getShardKey().extendRangeBound( min, false );

                // NOTE(review): the statement guarded by this 'if' is missing from the excerpt
                // (presumably a 'continue' when a chunk already starts at 'min' -- TODO confirm).
                if ( allChunkMinimums.count( min ) > 0 )

                didAnySplits = true;

                log() << "ns: " << ns << " need to split on "
                      << min << " because there is a range there" << endl;

                ChunkPtr c = cm->findIntersectingChunk( min );

                // Split the intersecting chunk at the single point 'min'.
                vector<BSONObj> splitPoints;
                splitPoints.push_back( min );

                BSONObj res;
                if ( !c->multiSplit( splitPoints, res ) ) {
                    error() << "split failed: " << res << endl;
                else {
                    LOG(1) << "split worked: " << res << endl;

            if ( didAnySplits ) {
                // state change, just wait till next round

            // Ask the policy for a chunk to move; a null result means nothing is queued for 'ns'.
            CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime );
            if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
Example #6
// Attempts to migrate each candidate chunk to its target shard, re-checking the balancer
// settings before every chunk.
//
// Returns 'movedCount'. The function returns early when the balancer settings cannot be read,
// when a settings document exists and grid.shouldBalance() rejects it, or when the
// skipBalanceRound fail point is active. A DBException raised while handling one chunk is
// logged as a warning that says the balancing round continues.
//
// NOTE(review): this excerpt looks elided -- several closing braces and any increment of
// 'movedCount' are missing, and 'writeConcern' and 'waitForDelete' are not referenced in the
// visible lines (the moveAndCommit() argument list appears truncated). Confirm against the full
// source.
int Balancer::_moveChunks(const vector<CandidateChunkPtr>* candidateChunks,
                          const WriteConcernOptions* writeConcern,
                          bool waitForDelete) {
    int movedCount = 0;

    for (vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin();
         it != candidateChunks->end();
         ++it) {
        // If the balancer was disabled since we started this round, don't start new
        // chunks moves.
        SettingsType balancerConfig;
        std::string errMsg;

        if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) {
            warning() << errMsg;
            // No point in continuing the round if the config servers are unreachable.
            return movedCount;

        if ((balancerConfig.isKeySet() &&  // balancer config doc exists
             !grid.shouldBalance(balancerConfig)) ||
            MONGO_FAIL_POINT(skipBalanceRound)) {
            LOG(1) << "Stopping balancing round early as balancing was disabled";
            return movedCount;

        // Changes to metadata, borked metadata, and connectivity problems between shards should
        // cause us to abort this chunk move, but shouldn't cause us to abort the entire round
        // of chunks.
        // TODO(spencer): We probably *should* abort the whole round on issues communicating
        // with the config servers, but its impossible to distinguish those types of failures
        // at the moment.
        // TODO: Handle all these things more cleanly, since they're expected problems
        const CandidateChunk& chunkInfo = *it->get();
        try {
            DBConfigPtr cfg = grid.getDBConfig(chunkInfo.ns);

            // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
            // tried to do so once.
            ChunkManagerPtr cm = cfg->getChunkManager(chunkInfo.ns);

            // Find the chunk that currently contains the candidate's min key. A non-zero
            // woCompare() on either bound is treated as a mismatch with the candidate.
            ChunkPtr c = cm->findIntersectingChunk(chunkInfo.chunk.min);
            if (c->getMin().woCompare(chunkInfo.chunk.min) ||
                c->getMax().woCompare(chunkInfo.chunk.max)) {
                // likely a split happened somewhere
                cm = cfg->getChunkManager(chunkInfo.ns, true /* reload */);

                // Same bounds check again, this time against the reloaded chunk manager.
                c = cm->findIntersectingChunk(chunkInfo.chunk.min);
                if (c->getMin().woCompare(chunkInfo.chunk.min) ||
                    c->getMax().woCompare(chunkInfo.chunk.max)) {
                    log() << "chunk mismatch after reload, ignoring will retry issue "
                          << chunkInfo.chunk.toString() << endl;

            // 'res' is handed to moveAndCommit() and is logged below when the move fails.
            BSONObj res;
            if (c->moveAndCommit(Shard::make(chunkInfo.to),
                                 0, /* maxTimeMS */
                                 res)) {

            // the move requires acquiring the collection metadata's lock, which can fail
            log() << "balancer move failed: " << res << " from: " << chunkInfo.from
                  << " to: " << chunkInfo.to << " chunk: " << chunkInfo.chunk << endl;

            // A truthy "chunkTooBig" field in the failure result triggers a split attempt.
            if (res["chunkTooBig"].trueValue()) {
                // reload just to be safe
                // NOTE(review): no reload flag is passed here, unlike the mismatch path above --
                // confirm whether a real reload is intended.
                cm = cfg->getChunkManager(chunkInfo.ns);
                c = cm->findIntersectingChunk(chunkInfo.chunk.min);

                log() << "performing a split because migrate failed for size reasons";

                Status status = c->split(Chunk::normal, NULL, NULL);
                log() << "split results: " << status << endl;

                // A failed split is logged as the chunk being marked jumbo.
                if (!status.isOK()) {
                    log() << "marking chunk as jumbo: " << c->toString() << endl;
                    // we increment moveCount so we do another round right away
        } catch (const DBException& ex) {
            warning() << "could not move chunk " << chunkInfo.chunk.toString()
                      << ", continuing balancing round" << causedBy(ex) << endl;

    return movedCount;
// Runs an aggregation command on behalf of the router (mergeCtx->inRouter is set to true).
//
// txn:        operation context passed through to catalog, chunk-manager and shard lookups.
// namespaces: executionNss selects the database/collection to run against; requestedNss is
//             used when an empty result set is appended.
// cmdObj:     the original command; parsed into an AggregationRequest, and selected fields are
//             copied from it into the shard and merge commands.
// options:    forwarded unchanged to aggPassthrough() and aggRunCommand().
// result:     builder that receives the command output.
//
// Visible flow: a failed database lookup appends an empty result set and returns OK; a database
// without sharding enabled, or an unsharded execution namespace, is handed to aggPassthrough();
// request or pipeline parse failures return their status. Otherwise the pipeline is split when
// needed, a command is built for the shards, and a merge command is run on a merging shard
// through a ShardConnection. The final status comes from
// getStatusFromCommandResult(result->asTempObj()).
//
// NOTE(review): this excerpt looks elided -- closing braces and several statements (for example
// whatever populates 'shardResults') are missing. Comments describe only what is visible.
Status ClusterAggregate::runAggregate(OperationContext* txn,
                                      const Namespaces& namespaces,
                                      BSONObj cmdObj,
                                      int options,
                                      BSONObjBuilder* result) {
    auto dbname = namespaces.executionNss.db().toString();
    auto status = grid.catalogCache()->getDatabase(txn, dbname);
    // A failed database lookup is reported as an empty result set, not as an error status.
    if (!status.isOK()) {
        appendEmptyResultSet(*result, status.getStatus(), namespaces.requestedNss.ns());
        return Status::OK();

    std::shared_ptr<DBConfig> conf = status.getValue();

    if (!conf->isShardingEnabled()) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);

    auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj);
    if (!request.isOK()) {
        return request.getStatus();

    boost::intrusive_ptr<ExpressionContext> mergeCtx =
        new ExpressionContext(txn, request.getValue());
    mergeCtx->inRouter = true;
    // explicitly *not* setting mergeCtx->tempDir

    // Parse and optimize the pipeline specification.
    auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();

    // Every collection involved in the pipeline must be unsharded (uassert 28769 otherwise).
    for (auto&& ns : pipeline.getValue()->getInvolvedCollections()) {
        uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns()));
        // We won't try to execute anything on a mongos, but we still have to populate this map
        // so that any $lookups etc will be able to have a resolved view definition. It's okay
        // that this is incorrect, we will repopulate the real resolved namespace map on the
        // mongod.
        // TODO SERVER-25038 This should become unnecessary once we can get the involved
        // namespaces before parsing.
        mergeCtx->resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}};

    if (!conf->isSharded(namespaces.executionNss.ns())) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);

    ChunkManagerPtr chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns());

    // If there was no collation specified, but there is a default collation for the collection,
    // use that.
    if (request.getValue().getCollation().isEmpty() && chunkMgr->getDefaultCollator()) {

    // Now that we know the collation we'll be using, inject the ExpressionContext and optimize.
    // TODO SERVER-25038: this must happen before we parse the pipeline, since we can make
    // string comparisons during parse time.

    // If the first $match stage is an exact match on the shard key (with a simple collation or
    // no string matching), we only have to send it to one shard, so send the command to that
    // shard.
    BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery();
    BSONObj shardKeyMatches;
    shardKeyMatches = uassertStatusOK(
        chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
    // 'singleShard' is only set when a shard key was extracted and the chunk lookup succeeded.
    bool singleShard = false;
    if (!shardKeyMatches.isEmpty()) {
        auto chunk = chunkMgr->findIntersectingChunk(
            txn, shardKeyMatches, request.getValue().getCollation());
        if (chunk.isOK()) {
            singleShard = true;

    // Don't need to split pipeline if the first $match is an exact match on shard key, unless
    // there is a stage that needs to be run on the primary shard.
    const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger();
    const bool needSplit = !singleShard || needPrimaryShardMerger;

    // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true,
    // 'pipeline' will become the merger side.
    boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? pipeline.getValue()->splitForSharded()
                                                           : pipeline.getValue());

    // Create the command for the shards. The 'fromRouter' field means produce output to be
    // merged.
    MutableDocument commandBuilder(request.getValue().serializeToCommandObj());
    commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize());
    // When splitting, the shard command is flagged as coming from the router and its cursor
    // option is set to a batch size of 0.
    if (needSplit) {
        commandBuilder[AggregationRequest::kFromRouterName] = Value(true);
        commandBuilder[AggregationRequest::kCursorName] =
            Value(DOC(AggregationRequest::kBatchSizeName << 0));

    // These fields are not part of the AggregationRequest since they are not handled by the
    // aggregation subsystem, so we serialize them separately.
    // NOTE(review): the end of this initializer list is cut off in the excerpt.
    const std::initializer_list<StringData> fieldsToPropagateToShards = {
        "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS,
    for (auto&& field : fieldsToPropagateToShards) {
        commandBuilder[field] = Value(cmdObj[field]);

    BSONObj shardedCommand = commandBuilder.freeze().toBson();
    BSONObj shardQuery = shardPipeline->getInitialQuery();

    // Run the command on the shards
    // TODO need to make sure cursors are killed if a retry is needed
    // NOTE(review): the call that fills 'shardResults' is not visible in this excerpt.
    std::vector<Strategy::CommandResult> shardResults;

    // Explain requests report the split pipeline and per-shard stages, then return OK.
    if (mergeCtx->isExplain) {
        // This must be checked before we start modifying result.

        if (needSplit) {
            *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline"
                    << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart"
                                        << pipeline.getValue()->writeExplainOps());
        } else {
            *result << "splitPipeline" << BSONNULL;

        BSONObjBuilder shardExplains(result->subobjStart("shards"));
        for (size_t i = 0; i < shardResults.size(); i++) {
                                 BSON("host" << shardResults[i].target.toString() << "stages"
                                             << shardResults[i].result["stages"]));

        return Status::OK();

    // Without a split there must be exactly one shard result targeting exactly one server.
    // NOTE(review): the expression assigned to 'reply' is cut off in this excerpt.
    if (!needSplit) {
        invariant(shardResults.size() == 1);
        invariant(shardResults[0].target.getServers().size() == 1);
        auto executorPool = grid.getExecutorPool();
        const BSONObj reply =
        return getStatusFromCommandResult(reply);

        DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx));

    // Build the merge command from the request plus selected fields of the original command.
    MutableDocument mergeCmd(request.getValue().serializeToCommandObj());
    mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize());
    mergeCmd["cursor"] = Value(cmdObj["cursor"]);

    if (cmdObj.hasField("$queryOptions")) {
        mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]);

    if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) {
        mergeCmd[QueryRequest::cmdOptionMaxTimeMS] =

    mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"]));

    // Not propagating readConcern to merger since it doesn't do local reads.

    // If the user didn't specify a collation already, make sure there's a collation attached to
    // the merge command, since the merging shard may not have the collection metadata.
    if (mergeCmd.peek()["collation"].missing()) {
                              ? Value(mergeCtx->getCollator()->getSpec().toBSON())
                              : Value(Document{CollationSpec::kSimpleSpec}));

    // 'outputNsOrEmpty' stays empty unless the last pipeline stage is a DocumentSourceOut.
    std::string outputNsOrEmpty;
    if (DocumentSourceOut* out =
            dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) {
        outputNsOrEmpty = out->getOutputNs().ns();

    // Run merging command on random shard, unless a stage needs the primary shard. Need to use
    // ShardConnection so that the merging mongod is sent the config servers on connection init.
    auto& prng = txn->getClient()->getPrng();
    const auto& mergingShardId = needPrimaryShardMerger
        ? conf->getPrimaryId()
        : shardResults[prng.nextInt32(shardResults.size())].shardTargetId;
    const auto mergingShard = uassertStatusOK(grid.shardRegistry()->getShard(txn, mergingShardId));

    ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty);
    BSONObj mergedResults =
        aggRunCommand(conn.get(), namespaces, mergeCmd.freeze().toBson(), options);

    // A "writeConcernError" element in the merge reply is copied into the response.
    if (auto wcErrorElem = mergedResults["writeConcernError"]) {
        appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result);

    // Copy output from merging (primary) shard to the output object from our command.
    // Also, propagates errmsg and code if ok == false.

    return getStatusFromCommandResult(result->asTempObj());