void Balancer::_doBalanceRound(DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks) {
    verify(candidateChunks);

    //
    // 1. Check whether there is any sharded collection to be balanced by querying
    // the ShardsNS::collections collection
    //

    auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());

    if (NULL == cursor.get()) {
        warning() << "could not query " << CollectionType::ConfigNS << " while trying to balance"
                  << endl;
        return;
    }

    vector<string> collections;
    while (cursor->more()) {
        BSONObj col = cursor->nextSafe();

        // sharded collections will have a shard "key".
        if (!col[CollectionType::keyPattern()].eoo() &&
            !col[CollectionType::noBalance()].trueValue()) {
            collections.push_back(col[CollectionType::ns()].String());
        } else if (col[CollectionType::noBalance()].trueValue()) {
            LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                   << ", explicitly disabled" << endl;
        }
    }
    cursor.reset();

    if (collections.empty()) {
        LOG(1) << "no collections to balance" << endl;
        return;
    }

    //
    // 2. Get a list of all the shards that are participating in this balance round
    // along with any maximum allowed quotas and current utilization. We get the
    // latter by issuing db.serverStatus() (mem.mapped) to all shards.
    //
    // TODO: skip unresponsive shards and mark information as stale.
    //

    ShardInfoMap shardInfo;
    Status loadStatus = DistributionStatus::populateShardInfoMap(&shardInfo);

    if (!loadStatus.isOK()) {
        warning() << "failed to load shard metadata" << causedBy(loadStatus);
        return;
    }

    if (shardInfo.size() < 2) {
        LOG(1) << "can't balance without more active shards";
        return;
    }

    OCCASIONALLY warnOnMultiVersion(shardInfo);

    //
    // 3. For each collection, check if the balancing policy recommends moving anything around.
    //

    for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it) {
        const string& ns = *it;

        OwnedPointerMap<string, OwnedPointerVector<ChunkType>> shardToChunksMap;
        cursor = conn.query(ChunkType::ConfigNS, QUERY(ChunkType::ns(ns)).sort(ChunkType::min()));

        set<BSONObj> allChunkMinimums;

        while (cursor->more()) {
            BSONObj chunkDoc = cursor->nextSafe().getOwned();

            auto_ptr<ChunkType> chunk(new ChunkType());
            string errmsg;
            if (!chunk->parseBSON(chunkDoc, &errmsg)) {
                error() << "bad chunk format for " << chunkDoc << ": " << errmsg << endl;
                return;
            }

            allChunkMinimums.insert(chunk->getMin().getOwned());

            OwnedPointerVector<ChunkType>*& chunkList =
                shardToChunksMap.mutableMap()[chunk->getShard()];

            if (chunkList == NULL) {
                chunkList = new OwnedPointerVector<ChunkType>();
            }

            chunkList->mutableVector().push_back(chunk.release());
        }
        cursor.reset();

        if (shardToChunksMap.map().empty()) {
            LOG(1) << "skipping empty collection (" << ns << ")";
            continue;
        }

        for (ShardInfoMap::const_iterator i = shardInfo.begin(); i != shardInfo.end(); ++i) {
            // this just makes sure there is an entry in shardToChunksMap for every shard
            OwnedPointerVector<ChunkType>*& chunkList = shardToChunksMap.mutableMap()[i->first];

            if (chunkList == NULL) {
                chunkList = new OwnedPointerVector<ChunkType>();
            }
        }

        DistributionStatus status(shardInfo, shardToChunksMap.map());

        // load tags
        cursor = conn.query(TagsType::ConfigNS, QUERY(TagsType::ns(ns)).sort(TagsType::min()));

        vector<TagRange> ranges;

        while (cursor->more()) {
            BSONObj tag = cursor->nextSafe();
            TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                        tag[TagsType::max()].Obj().getOwned(),
                        tag[TagsType::tag()].String());
            ranges.push_back(tr);

            uassert(16356,
                    str::stream() << "tag ranges not valid for: " << ns,
                    status.addTagRange(tr));
        }
        cursor.reset();

        DBConfigPtr cfg = grid.getDBConfig(ns);
        if (!cfg) {
            warning() << "could not load db config to balance " << ns << " collection" << endl;
            continue;
        }

        // This line reloads the chunk manager once if this process doesn't know the collection
        // is sharded yet.
        ChunkManagerPtr cm = cfg->getChunkManagerIfExists(ns, true);
        if (!cm) {
            warning() << "could not load chunks to balance " << ns << " collection" << endl;
            continue;
        }

        // loop through tags to make sure no chunk spans tags; splits on tag min for all chunks
        bool didAnySplits = false;
        for (unsigned i = 0; i < ranges.size(); i++) {
            BSONObj min = ranges[i].min;

            min = cm->getShardKeyPattern().getKeyPattern().extendRangeBound(min, false);

            if (allChunkMinimums.count(min) > 0)
                continue;

            didAnySplits = true;

            log() << "ns: " << ns << " need to split on " << min
                  << " because there is a range there" << endl;

            ChunkPtr c = cm->findIntersectingChunk(min);

            vector<BSONObj> splitPoints;
            splitPoints.push_back(min);

            Status status = c->multiSplit(splitPoints, NULL);
            if (!status.isOK()) {
                error() << "split failed: " << status << endl;
            } else {
                LOG(1) << "split worked" << endl;
            }

            break;
        }

        if (didAnySplits) {
            // state change, just wait till next round
            continue;
        }

        CandidateChunk* p = _policy->balance(ns, status, _balancedLastTime);
        if (p)
            candidateChunks->push_back(CandidateChunkPtr(p));
    }
}
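
// Summary comment (restates the behavior of the function below; nothing beyond what the code
// itself does): runAggregate drives a cluster-level aggregation from mongos. It passes the
// command through when the database or target collection is unsharded; otherwise it parses and
// optimizes the pipeline, optionally splits it into a shards part and a merger part, dispatches
// the shards part to the targeted shards, and then either returns the single shard's cursor or
// runs the merger pipeline on a merging shard and relays that shard's result.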
Status ClusterAggregate::runAggregate(OperationContext* txn,
                                      const Namespaces& namespaces,
                                      BSONObj cmdObj,
                                      int options,
                                      BSONObjBuilder* result) {
    auto dbname = namespaces.executionNss.db().toString();
    auto status = grid.catalogCache()->getDatabase(txn, dbname);
    if (!status.isOK()) {
        appendEmptyResultSet(*result, status.getStatus(), namespaces.requestedNss.ns());
        return Status::OK();
    }

    std::shared_ptr<DBConfig> conf = status.getValue();

    if (!conf->isShardingEnabled()) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj);
    if (!request.isOK()) {
        return request.getStatus();
    }

    boost::intrusive_ptr<ExpressionContext> mergeCtx =
        new ExpressionContext(txn, request.getValue());
    mergeCtx->inRouter = true;
    // explicitly *not* setting mergeCtx->tempDir

    // Parse and optimize the pipeline specification.
    auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();
    }

    for (auto&& ns : pipeline.getValue()->getInvolvedCollections()) {
        uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns()));
        // We won't try to execute anything on a mongos, but we still have to populate this map
        // so that any $lookups etc will be able to have a resolved view definition. It's okay
        // that this is incorrect, we will repopulate the real resolved namespace map on the
        // mongod.
        // TODO SERVER-25038 This should become unnecessary once we can get the involved
        // namespaces before parsing.
        mergeCtx->resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}};
    }

    if (!conf->isSharded(namespaces.executionNss.ns())) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    ChunkManagerPtr chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns());

    // If there was no collation specified, but there is a default collation for the collection,
    // use that.
    if (request.getValue().getCollation().isEmpty() && chunkMgr->getDefaultCollator()) {
        mergeCtx->setCollator(chunkMgr->getDefaultCollator()->clone());
    }

    // Now that we know the collation we'll be using, inject the ExpressionContext and optimize.
    // TODO SERVER-25038: this must happen before we parse the pipeline, since we can make
    // string comparisons during parse time.
    pipeline.getValue()->injectExpressionContext(mergeCtx);
    pipeline.getValue()->optimizePipeline();

    // If the first $match stage is an exact match on the shard key (with a simple collation or
    // no string matching), we only have to send it to one shard, so send the command to that
    // shard.
    BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery();
    BSONObj shardKeyMatches;
    shardKeyMatches = uassertStatusOK(
        chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
    bool singleShard = false;
    if (!shardKeyMatches.isEmpty()) {
        auto chunk = chunkMgr->findIntersectingChunk(
            txn, shardKeyMatches, request.getValue().getCollation());
        if (chunk.isOK()) {
            singleShard = true;
        }
    }

    // We don't need to split the pipeline if the first $match is an exact match on the shard key,
    // unless there is a stage that needs to be run on the primary shard.
    const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger();
    const bool needSplit = !singleShard || needPrimaryShardMerger;

    // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true,
    // 'pipeline' will become the merger side.
    boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? pipeline.getValue()->splitForSharded()
                                                           : pipeline.getValue());

    // Create the command for the shards. The 'fromRouter' field means produce output to be
    // merged.
    MutableDocument commandBuilder(request.getValue().serializeToCommandObj());
    commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize());
    if (needSplit) {
        commandBuilder[AggregationRequest::kFromRouterName] = Value(true);
        commandBuilder[AggregationRequest::kCursorName] =
            Value(DOC(AggregationRequest::kBatchSizeName << 0));
    }

    // These fields are not part of the AggregationRequest since they are not handled by the
    // aggregation subsystem, so we serialize them separately.
    const std::initializer_list<StringData> fieldsToPropagateToShards = {
        "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS,
    };
    for (auto&& field : fieldsToPropagateToShards) {
        commandBuilder[field] = Value(cmdObj[field]);
    }

    BSONObj shardedCommand = commandBuilder.freeze().toBson();
    BSONObj shardQuery = shardPipeline->getInitialQuery();

    // Run the command on the shards
    // TODO need to make sure cursors are killed if a retry is needed
    std::vector<Strategy::CommandResult> shardResults;
    Strategy::commandOp(txn,
                        dbname,
                        shardedCommand,
                        options,
                        namespaces.executionNss.ns(),
                        shardQuery,
                        request.getValue().getCollation(),
                        &shardResults);

    if (mergeCtx->isExplain) {
        // This must be checked before we start modifying result.
        uassertAllShardsSupportExplain(shardResults);

        if (needSplit) {
            *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline"
                    << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart"
                                        << pipeline.getValue()->writeExplainOps());
        } else {
            *result << "splitPipeline" << BSONNULL;
        }

        BSONObjBuilder shardExplains(result->subobjStart("shards"));
        for (size_t i = 0; i < shardResults.size(); i++) {
            shardExplains.append(shardResults[i].shardTargetId,
                                 BSON("host" << shardResults[i].target.toString() << "stages"
                                             << shardResults[i].result["stages"]));
        }

        return Status::OK();
    }

    if (!needSplit) {
        invariant(shardResults.size() == 1);
        invariant(shardResults[0].target.getServers().size() == 1);
        auto executorPool = grid.getExecutorPool();
        const BSONObj reply =
            uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0],
                                                shardResults[0].result,
                                                namespaces.requestedNss,
                                                executorPool->getArbitraryExecutor(),
                                                grid.getCursorManager()));
        result->appendElements(reply);
        return getStatusFromCommandResult(reply);
    }

    pipeline.getValue()->addInitialSource(
        DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx));

    MutableDocument mergeCmd(request.getValue().serializeToCommandObj());
    mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize());
    mergeCmd["cursor"] = Value(cmdObj["cursor"]);

    if (cmdObj.hasField("$queryOptions")) {
        mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]);
    }

    if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) {
        mergeCmd[QueryRequest::cmdOptionMaxTimeMS] =
            Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]);
    }

    mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"]));

    // Not propagating readConcern to merger since it doesn't do local reads.

    // If the user didn't specify a collation already, make sure there's a collation attached to
    // the merge command, since the merging shard may not have the collection metadata.
    if (mergeCmd.peek()["collation"].missing()) {
        mergeCmd.setField("collation",
                          mergeCtx->getCollator()
                              ? Value(mergeCtx->getCollator()->getSpec().toBSON())
                              : Value(Document{CollationSpec::kSimpleSpec}));
    }

    std::string outputNsOrEmpty;
    if (DocumentSourceOut* out =
            dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) {
        outputNsOrEmpty = out->getOutputNs().ns();
    }

    // Run merging command on a random shard, unless a stage needs the primary shard. Need to use
    // ShardConnection so that the merging mongod is sent the config servers on connection init.
    auto& prng = txn->getClient()->getPrng();
    const auto& mergingShardId = needPrimaryShardMerger
        ? conf->getPrimaryId()
        : shardResults[prng.nextInt32(shardResults.size())].shardTargetId;
    const auto mergingShard =
        uassertStatusOK(grid.shardRegistry()->getShard(txn, mergingShardId));
    ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty);
    BSONObj mergedResults =
        aggRunCommand(conn.get(), namespaces, mergeCmd.freeze().toBson(), options);
    conn.done();

    if (auto wcErrorElem = mergedResults["writeConcernError"]) {
        appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result);
    }

    // Copy output from the merging (primary) shard to the output object for our command.
    // Also propagates errmsg and code if ok == false.
    result->appendElementsUnique(mergedResults);

    return getStatusFromCommandResult(result->asTempObj());
}