Example #1
0
/**
 * Runs a single balancing round:
 *   1. Queries config.collections for sharded collections eligible for
 *      balancing (skipping those with balancing explicitly disabled).
 *   2. Loads per-shard metadata (quotas/utilization) via
 *      DistributionStatus::populateShardInfoMap.
 *   3. For each collection, loads its chunks and tag ranges, splits any chunk
 *      that straddles a tag-range boundary, and otherwise asks the balancing
 *      policy for a chunk worth migrating.
 *
 * Chunks the policy recommends moving are appended to 'candidateChunks'
 * (which must be non-NULL). 'conn' must be a connection to the config server.
 * Returns early (without populating candidates) on any metadata load failure.
 */
void Balancer::_doBalanceRound(DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks) {
    verify(candidateChunks);

    //
    // 1. Check whether there is any sharded collection to be balanced by querying
    // the ShardsNS::collections collection
    //

    auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());

    if (NULL == cursor.get()) {
        warning() << "could not query " << CollectionType::ConfigNS << " while trying to balance"
                  << endl;
        return;
    }

    vector<string> collections;
    while (cursor->more()) {
        BSONObj col = cursor->nextSafe();

        // sharded collections will have a shard "key".
        if (!col[CollectionType::keyPattern()].eoo() &&
            !col[CollectionType::noBalance()].trueValue()) {
            collections.push_back(col[CollectionType::ns()].String());
        } else if (col[CollectionType::noBalance()].trueValue()) {
            LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                   << ", explicitly disabled" << endl;
        }
    }
    cursor.reset();

    if (collections.empty()) {
        LOG(1) << "no collections to balance" << endl;
        return;
    }

    //
    // 2. Get a list of all the shards that are participating in this balance round
    // along with any maximum allowed quotas and current utilization. We get the
    // latter by issuing db.serverStatus() (mem.mapped) to all shards.
    //
    // TODO: skip unresponsive shards and mark information as stale.
    //

    ShardInfoMap shardInfo;
    Status loadStatus = DistributionStatus::populateShardInfoMap(&shardInfo);

    if (!loadStatus.isOK()) {
        warning() << "failed to load shard metadata" << causedBy(loadStatus);
        return;
    }

    // Balancing is only meaningful when there are at least two shards to
    // move chunks between.
    if (shardInfo.size() < 2) {
        LOG(1) << "can't balance without more active shards";
        return;
    }

    OCCASIONALLY warnOnMultiVersion(shardInfo);

    //
    // 3. For each collection, check if the balancing policy recommends moving anything around.
    //

    for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it) {
        const string& ns = *it;

        // Owns both the per-shard vectors and the ChunkType objects inside them.
        OwnedPointerMap<string, OwnedPointerVector<ChunkType>> shardToChunksMap;
        cursor = conn.query(ChunkType::ConfigNS, QUERY(ChunkType::ns(ns)).sort(ChunkType::min()));

        set<BSONObj> allChunkMinimums;

        while (cursor->more()) {
            BSONObj chunkDoc = cursor->nextSafe().getOwned();

            auto_ptr<ChunkType> chunk(new ChunkType());
            string errmsg;
            if (!chunk->parseBSON(chunkDoc, &errmsg)) {
                error() << "bad chunk format for " << chunkDoc << ": " << errmsg << endl;
                return;
            }

            allChunkMinimums.insert(chunk->getMin().getOwned());
            OwnedPointerVector<ChunkType>*& chunkList =
                shardToChunksMap.mutableMap()[chunk->getShard()];

            if (chunkList == NULL) {
                chunkList = new OwnedPointerVector<ChunkType>();
            }

            chunkList->mutableVector().push_back(chunk.release());
        }
        cursor.reset();

        if (shardToChunksMap.map().empty()) {
            LOG(1) << "skipping empty collection (" << ns << ")";
            continue;
        }

        for (ShardInfoMap::const_iterator i = shardInfo.begin(); i != shardInfo.end(); ++i) {
            // this just makes sure there is an entry in shardToChunksMap for every shard
            OwnedPointerVector<ChunkType>*& chunkList = shardToChunksMap.mutableMap()[i->first];

            if (chunkList == NULL) {
                chunkList = new OwnedPointerVector<ChunkType>();
            }
        }

        DistributionStatus status(shardInfo, shardToChunksMap.map());

        // load tags
        cursor = conn.query(TagsType::ConfigNS, QUERY(TagsType::ns(ns)).sort(TagsType::min()));

        vector<TagRange> ranges;

        while (cursor->more()) {
            BSONObj tag = cursor->nextSafe();
            TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                        tag[TagsType::max()].Obj().getOwned(),
                        tag[TagsType::tag()].String());
            ranges.push_back(tr);
            uassert(
                16356, str::stream() << "tag ranges not valid for: " << ns, status.addTagRange(tr));
        }
        cursor.reset();

        DBConfigPtr cfg = grid.getDBConfig(ns);
        if (!cfg) {
            warning() << "could not load db config to balance " << ns << " collection" << endl;
            continue;
        }

        // This line reloads the chunk manager once if this process doesn't know the collection
        // is sharded yet.
        ChunkManagerPtr cm = cfg->getChunkManagerIfExists(ns, true);
        if (!cm) {
            warning() << "could not load chunks to balance " << ns << " collection" << endl;
            continue;
        }

        // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
        bool didAnySplits = false;
        for (unsigned i = 0; i < ranges.size(); i++) {
            BSONObj min = ranges[i].min;

            min = cm->getShardKeyPattern().getKeyPattern().extendRangeBound(min, false);

            // A chunk already starts exactly at this tag boundary; no split needed.
            if (allChunkMinimums.count(min) > 0)
                continue;

            didAnySplits = true;

            log() << "ns: " << ns << " need to split on " << min
                  << " because there is a range there" << endl;

            ChunkPtr c = cm->findIntersectingChunk(min);

            vector<BSONObj> splitPoints;
            splitPoints.push_back(min);

            // Renamed from 'status' to avoid shadowing the DistributionStatus
            // declared above, which is still used after this loop.
            Status splitStatus = c->multiSplit(splitPoints, NULL);
            if (!splitStatus.isOK()) {
                error() << "split failed: " << splitStatus << endl;
            } else {
                LOG(1) << "split worked" << endl;
            }
            // At most one split per collection per round; re-evaluate next round.
            break;
        }

        if (didAnySplits) {
            // state change, just wait till next round
            continue;
        }

        // Ask the policy whether anything in this collection is worth moving.
        CandidateChunk* p = _policy->balance(ns, status, _balancedLastTime);
        if (p)
            candidateChunks->push_back(CandidateChunkPtr(p));
    }
}
/**
 * Entry point for running an aggregation through mongos.
 *
 * High-level flow:
 *   - If the database or target collection is unsharded, forward the whole
 *     command to the primary shard (aggPassthrough).
 *   - Otherwise parse the pipeline, possibly split it into a shards part and
 *     a merger part, run the shards part on the targeted shards, and either
 *     return a single shard's cursor directly or run the merger part on a
 *     merging shard and return its result.
 *
 * 'result' receives the command response; the returned Status reflects
 * command-level failure (note: some error paths report the error inside
 * 'result' and still return Status::OK(), e.g. the database-lookup failure
 * below).
 */
Status ClusterAggregate::runAggregate(OperationContext* txn,
                                      const Namespaces& namespaces,
                                      BSONObj cmdObj,
                                      int options,
                                      BSONObjBuilder* result) {
    auto dbname = namespaces.executionNss.db().toString();
    auto status = grid.catalogCache()->getDatabase(txn, dbname);
    if (!status.isOK()) {
        // Database does not exist (or lookup failed): report an empty result
        // set rather than a command error.
        appendEmptyResultSet(*result, status.getStatus(), namespaces.requestedNss.ns());
        return Status::OK();
    }

    std::shared_ptr<DBConfig> conf = status.getValue();

    if (!conf->isShardingEnabled()) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj);
    if (!request.isOK()) {
        return request.getStatus();
    }

    boost::intrusive_ptr<ExpressionContext> mergeCtx =
        new ExpressionContext(txn, request.getValue());
    mergeCtx->inRouter = true;
    // explicitly *not* setting mergeCtx->tempDir

    // Parse and optimize the pipeline specification.
    auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();
    }

    for (auto&& ns : pipeline.getValue()->getInvolvedCollections()) {
        // Secondary namespaces (e.g. $lookup/$graphLookup targets) must be
        // unsharded at this point.
        uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns()));
        // We won't try to execute anything on a mongos, but we still have to populate this map
        // so that any $lookups etc will be able to have a resolved view definition. It's okay
        // that this is incorrect, we will repopulate the real resolved namespace map on the
        // mongod.
        // TODO SERVER-25038 This should become unnecessary once we can get the involved
        // namespaces before parsing.
        mergeCtx->resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}};
    }

    if (!conf->isSharded(namespaces.executionNss.ns())) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    ChunkManagerPtr chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns());

    // If there was no collation specified, but there is a default collation for the collection,
    // use that.
    if (request.getValue().getCollation().isEmpty() && chunkMgr->getDefaultCollator()) {
        mergeCtx->setCollator(chunkMgr->getDefaultCollator()->clone());
    }

    // Now that we know the collation we'll be using, inject the ExpressionContext and optimize.
    // TODO SERVER-25038: this must happen before we parse the pipeline, since we can make
    // string comparisons during parse time.
    pipeline.getValue()->injectExpressionContext(mergeCtx);
    pipeline.getValue()->optimizePipeline();

    // If the first $match stage is an exact match on the shard key (with a simple collation or
    // no string matching), we only have to send it to one shard, so send the command to that
    // shard.
    BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery();
    BSONObj shardKeyMatches;
    shardKeyMatches = uassertStatusOK(
        chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
    bool singleShard = false;
    if (!shardKeyMatches.isEmpty()) {
        auto chunk = chunkMgr->findIntersectingChunk(
            txn, shardKeyMatches, request.getValue().getCollation());
        if (chunk.isOK()) {
            singleShard = true;
        }
    }

    // Don't need to split pipeline if the first $match is an exact match on shard key, unless
    // there is a stage that needs to be run on the primary shard.
    const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger();
    const bool needSplit = !singleShard || needPrimaryShardMerger;

    // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true,
    // 'pipeline' will become the merger side.
    boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? pipeline.getValue()->splitForSharded()
                                                           : pipeline.getValue());

    // Create the command for the shards. The 'fromRouter' field means produce output to be
    // merged.
    MutableDocument commandBuilder(request.getValue().serializeToCommandObj());
    commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize());
    if (needSplit) {
        commandBuilder[AggregationRequest::kFromRouterName] = Value(true);
        // Request an initial batch size of 0; the merger will drive the
        // shard cursors itself.
        commandBuilder[AggregationRequest::kCursorName] =
            Value(DOC(AggregationRequest::kBatchSizeName << 0));
    }

    // These fields are not part of the AggregationRequest since they are not handled by the
    // aggregation subsystem, so we serialize them separately.
    const std::initializer_list<StringData> fieldsToPropagateToShards = {
        "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS,
    };
    for (auto&& field : fieldsToPropagateToShards) {
        commandBuilder[field] = Value(cmdObj[field]);
    }

    BSONObj shardedCommand = commandBuilder.freeze().toBson();
    BSONObj shardQuery = shardPipeline->getInitialQuery();

    // Run the command on the shards
    // TODO need to make sure cursors are killed if a retry is needed
    std::vector<Strategy::CommandResult> shardResults;
    Strategy::commandOp(txn,
                        dbname,
                        shardedCommand,
                        options,
                        namespaces.executionNss.ns(),
                        shardQuery,
                        request.getValue().getCollation(),
                        &shardResults);

    if (mergeCtx->isExplain) {
        // This must be checked before we start modifying result.
        uassertAllShardsSupportExplain(shardResults);

        if (needSplit) {
            *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline"
                    << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart"
                                        << pipeline.getValue()->writeExplainOps());
        } else {
            *result << "splitPipeline" << BSONNULL;
        }

        // One sub-document per shard with its host and explain stages.
        BSONObjBuilder shardExplains(result->subobjStart("shards"));
        for (size_t i = 0; i < shardResults.size(); i++) {
            shardExplains.append(shardResults[i].shardTargetId,
                                 BSON("host" << shardResults[i].target.toString() << "stages"
                                             << shardResults[i].result["stages"]));
        }

        return Status::OK();
    }

    if (!needSplit) {
        // Single-shard case: hand the one shard's cursor straight back to the
        // client (registering it with the cluster cursor manager).
        invariant(shardResults.size() == 1);
        invariant(shardResults[0].target.getServers().size() == 1);
        auto executorPool = grid.getExecutorPool();
        const BSONObj reply =
            uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0],
                                                shardResults[0].result,
                                                namespaces.requestedNss,
                                                executorPool->getArbitraryExecutor(),
                                                grid.getCursorManager()));
        result->appendElements(reply);
        return getStatusFromCommandResult(reply);
    }

    // Merger path: prepend a $mergeCursors stage that reads from every
    // shard's cursor, then send the merger pipeline to a single shard.
    pipeline.getValue()->addInitialSource(
        DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx));

    MutableDocument mergeCmd(request.getValue().serializeToCommandObj());
    mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize());
    mergeCmd["cursor"] = Value(cmdObj["cursor"]);

    if (cmdObj.hasField("$queryOptions")) {
        mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]);
    }

    if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) {
        mergeCmd[QueryRequest::cmdOptionMaxTimeMS] =
            Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]);
    }

    mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"]));

    // Not propagating readConcern to merger since it doesn't do local reads.

    // If the user didn't specify a collation already, make sure there's a collation attached to
    // the merge command, since the merging shard may not have the collection metadata.
    if (mergeCmd.peek()["collation"].missing()) {
        mergeCmd.setField("collation",
                          mergeCtx->getCollator()
                              ? Value(mergeCtx->getCollator()->getSpec().toBSON())
                              : Value(Document{CollationSpec::kSimpleSpec}));
    }

    // If the pipeline ends in $out, remember the output namespace so the
    // merging connection can be versioned against it.
    std::string outputNsOrEmpty;
    if (DocumentSourceOut* out =
            dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) {
        outputNsOrEmpty = out->getOutputNs().ns();
    }

    // Run merging command on random shard, unless a stage needs the primary shard. Need to use
    // ShardConnection so that the merging mongod is sent the config servers on connection init.
    auto& prng = txn->getClient()->getPrng();
    const auto& mergingShardId = needPrimaryShardMerger
        ? conf->getPrimaryId()
        : shardResults[prng.nextInt32(shardResults.size())].shardTargetId;
    const auto mergingShard = uassertStatusOK(grid.shardRegistry()->getShard(txn, mergingShardId));

    ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty);
    BSONObj mergedResults =
        aggRunCommand(conn.get(), namespaces, mergeCmd.freeze().toBson(), options);
    conn.done();

    // Surface any write concern error from the merging shard in our response.
    if (auto wcErrorElem = mergedResults["writeConcernError"]) {
        appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result);
    }

    // Copy output from merging (primary) shard to the output object from our command.
    // Also, propagates errmsg and code if ok == false.
    result->appendElementsUnique(mergedResults);

    return getStatusFromCommandResult(result->asTempObj());
}