Status Strategy::commandOpUnsharded(OperationContext* txn,
                                    const std::string& db,
                                    const BSONObj& command,
                                    int options,
                                    const std::string& versionedNS,
                                    CommandResult* cmdResult) {
    // Note that this implementation will not handle targeting retries and fails when the
    // sharding metadata is too stale
    auto status = grid.catalogCache()->getDatabase(txn, db);
    if (!status.isOK()) {
        mongoutils::str::stream ss;
        ss << "Passthrough command failed: " << command.toString() << " on ns " << versionedNS
           << ". Caused by " << causedBy(status.getStatus());
        return Status(ErrorCodes::IllegalOperation, ss);
    }

    shared_ptr<DBConfig> conf = status.getValue();
    if (conf->isSharded(versionedNS)) {
        mongoutils::str::stream ss;
        ss << "Passthrough command failed: " << command.toString() << " on ns " << versionedNS
           << ". Cannot run on sharded namespace.";
        return Status(ErrorCodes::IllegalOperation, ss);
    }

    const auto primaryShard = grid.shardRegistry()->getShard(txn, conf->getPrimaryId());

    BSONObj shardResult;
    try {
        ShardConnection conn(primaryShard->getConnString(), "");

        // TODO: this can throw a stale config when mongos is not up-to-date -- fix.
        if (!conn->runCommand(db, command, shardResult, options)) {
            conn.done();
            return Status(ErrorCodes::OperationFailed,
                          str::stream() << "Passthrough command failed: " << command << " on ns "
                                        << versionedNS << "; result: " << shardResult);
        }
        conn.done();
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    // Fill out the command result.
    cmdResult->shardTargetId = conf->getPrimaryId();
    cmdResult->result = shardResult;
    cmdResult->target = primaryShard->getConnString();

    return Status::OK();
}
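A convention worth noting above: failure messages are built incrementally with a string stream and returned as a non-OK Status rather than thrown. A minimal self-contained sketch of that shape in standard C++ (the Status struct below is an illustrative stand-in, not mongo::Status):

#include <iostream>
#include <sstream>
#include <string>

// Illustrative stand-in for mongo::Status: an error code plus a reason string.
struct Status {
    int code;  // 0 means OK
    std::string reason;
    bool isOK() const { return code == 0; }
    static Status OK() { return {0, ""}; }
};

// Hypothetical check mirroring the shape of commandOpUnsharded: accumulate the
// message with a stream, then return it wrapped in a non-OK Status.
Status rejectIfSharded(const std::string& command, const std::string& ns, bool isSharded) {
    if (isSharded) {
        std::ostringstream ss;
        ss << "Passthrough command failed: " << command << " on ns " << ns
           << ". Cannot run on sharded namespace.";
        return {1, ss.str()};  // non-zero code, akin to ErrorCodes::IllegalOperation
    }
    return Status::OK();
}

int main() {
    Status s = rejectIfSharded("{count: 'foo'}", "test.foo", true);
    if (!s.isOK())
        std::cout << s.reason << '\n';
}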
StatusWith<boost::optional<executor::RemoteCommandRequest>>
ShardingNetworkConnectionHook::makeRequest(const HostAndPort& remoteHost) {
    if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
        // TODO: SERVER-23973 Temporary crutch until we decide where to get the config server
        // connection string.
        return {boost::none};
    }

    auto shard = grid.shardRegistry()->getShardForHostNoReload(remoteHost);
    if (!shard) {
        return {ErrorCodes::ShardNotFound,
                str::stream() << "No shard found for host: " << remoteHost.toString()};
    }

    if (shard->isConfig()) {
        // No need to initialize sharding metadata if talking to a config server
        return {boost::none};
    }

    SetShardVersionRequest ssv = SetShardVersionRequest::makeForInitNoPersist(
        grid.shardRegistry()->getConfigServerConnectionString(),
        shard->getId(),
        shard->getConnString());

    executor::RemoteCommandRequest request;
    request.dbname = "admin";
    request.target = remoteHost;
    request.timeout = stdx::chrono::seconds{30};
    request.cmdObj = ssv.toBSON();

    return {request};
}
void ParallelSortClusteredCursor::setupVersionAndHandleSlaveOk(
    OperationContext* txn,
    PCStatePtr state,
    const ShardId& shardId,
    std::shared_ptr<Shard> primary,
    const NamespaceString& ns,
    const string& vinfo,
    std::shared_ptr<ChunkManager> manager) {
    if (manager) {
        state->manager = manager;
    } else if (primary) {
        state->primary = primary;
    }

    verify(!primary || shardId == primary->getId());

    // Setup conn
    if (!state->conn) {
        const auto shard = grid.shardRegistry()->getShard(txn, shardId);
        state->conn.reset(new ShardConnection(shard->getConnString(), ns.ns(), manager));
    }

    const DBClientBase* rawConn = state->conn->getRawConn();
    bool allowShardVersionFailure = rawConn->type() == ConnectionString::SET &&
        DBClientReplicaSet::isSecondaryQuery(_qSpec.ns(), _qSpec.query(), _qSpec.options());
    bool connIsDown = rawConn->isFailed();
    if (allowShardVersionFailure && !connIsDown) {
        // If the replica set connection believes that it has a valid primary that is up,
        // confirm that the replica set monitor agrees that the suspected primary is indeed up.
        const DBClientReplicaSet* replConn = dynamic_cast<const DBClientReplicaSet*>(rawConn);
        invariant(replConn);
        ReplicaSetMonitorPtr rsMonitor = ReplicaSetMonitor::get(replConn->getSetName());
        if (!rsMonitor->isHostUp(replConn->getSuspectedPrimaryHostAndPort())) {
            connIsDown = true;
        }
    }

    if (allowShardVersionFailure && connIsDown) {
        // If we're doing a secondary-allowed query and the primary is down, don't attempt to
        // set the shard version.

        state->conn->donotCheckVersion();

        // A side effect of this short-circuiting is that this mongos will not be able to
        // figure out on its own that the primary is now up, and has to rely on other threads
        // to refresh the node states.

        OCCASIONALLY {
            const DBClientReplicaSet* repl = dynamic_cast<const DBClientReplicaSet*>(rawConn);
            dassert(repl);
            warning() << "Primary for " << repl->getServerAddress()
                      << " was down before, bypassing setShardVersion."
                      << " The local replica set view and targeting may be stale.";
        }
    } else {
BSONObj Shard::runCommand( const string& db , const BSONObj& cmd , bool internal ) const {
    scoped_ptr<ScopedDbConnection> conn;
    if ( internal ) {
        conn.reset( ScopedDbConnection::getInternalScopedDbConnection( getConnString() ) );
    } else {
        conn.reset( ScopedDbConnection::getScopedDbConnection( getConnString() ) );
    }

    BSONObj res;
    bool ok = conn->get()->runCommand( db , cmd , res );
    if ( ! ok ) {
        stringstream ss;
        ss << "runCommand (" << cmd << ") on shard (" << _name << ") failed : " << res;
        conn->done();
        throw UserException( 13136 , ss.str() );
    }

    res = res.getOwned();
    conn->done();
    return res;
}
void ParallelSortClusteredCursor::setupVersionAndHandleSlaveOk(
    OperationContext* txn,
    PCStatePtr state,
    const ShardId& shardId,
    std::shared_ptr<Shard> primary,
    const NamespaceString& ns,
    const string& vinfo,
    std::shared_ptr<ChunkManager> manager) {
    if (manager) {
        state->manager = manager;
    } else if (primary) {
        state->primary = primary;
    }

    verify(!primary || shardId == primary->getId());

    // Setup conn
    if (!state->conn) {
        const auto shard =
            uassertStatusOK(Grid::get(txn)->shardRegistry()->getShard(txn, shardId));
        state->conn.reset(new ShardConnection(shard->getConnString(), ns.ns(), manager));
    }

    const DBClientBase* rawConn = state->conn->getRawConn();
    bool allowShardVersionFailure = rawConn->type() == ConnectionString::SET &&
        DBClientReplicaSet::isSecondaryQuery(_qSpec.ns(), _qSpec.query(), _qSpec.options());

    // Skip shard version checking if primary is known to be down.
    if (allowShardVersionFailure) {
        const DBClientReplicaSet* replConn = dynamic_cast<const DBClientReplicaSet*>(rawConn);
        invariant(replConn);
        ReplicaSetMonitorPtr rsMonitor = ReplicaSetMonitor::get(replConn->getSetName());
        uassert(16388,
                str::stream() << "cannot access unknown replica set: " << replConn->getSetName(),
                rsMonitor != nullptr);
        if (!rsMonitor->isKnownToHaveGoodPrimary()) {
            state->conn->donotCheckVersion();

            // A side effect of this short-circuiting is that this mongos will not be able to
            // figure out on its own that the primary is now up, and has to rely on other
            // threads to refresh the node states.

            OCCASIONALLY {
                const DBClientReplicaSet* repl = dynamic_cast<const DBClientReplicaSet*>(rawConn);
                dassert(repl);
                warning() << "Primary for " << repl->getServerAddress()
                          << " was down before, bypassing setShardVersion."
                          << " The local replica set view and targeting may be stale.";
            }

            return;
        }
    }
Status MigrationChunkClonerSourceLegacy::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    auto scopedGuard = MakeGuard([&] { cancelClone(txn); });

    // Resolve the donor and recipient shards and their connection string
    {
        auto donorShard = grid.shardRegistry()->getShard(txn, _args.getFromShardId());
        _donorCS = donorShard->getConnString();
    }

    {
        auto recipientShard = grid.shardRegistry()->getShard(txn, _args.getToShardId());
        auto shardHostStatus = recipientShard->getTargeter()->findHost(
            ReadPreferenceSetting{ReadPreference::PrimaryOnly});
        if (!shardHostStatus.isOK()) {
            return shardHostStatus.getStatus();
        }

        _recipientHost = std::move(shardHostStatus.getValue());
    }

    // Prepare the currently available documents
    Status status = _storeCurrentLocs(txn);
    if (!status.isOK()) {
        return status;
    }

    // Tell the recipient shard to start cloning
    BSONObjBuilder cmdBuilder;
    StartChunkCloneRequest::appendAsCommand(&cmdBuilder,
                                            _args.getNss(),
                                            _sessionId,
                                            _args.getConfigServerCS(),
                                            _donorCS,
                                            _args.getToShardId(),
                                            _args.getMinKey(),
                                            _args.getMaxKey(),
                                            _shardKeyPattern.toBSON(),
                                            _args.getSecondaryThrottle());

    auto responseStatus = _callRecipient(cmdBuilder.obj());
    if (!responseStatus.isOK()) {
        return responseStatus.getStatus();
    }

    scopedGuard.Dismiss();
    return Status::OK();
}
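The MakeGuard/Dismiss pair above is the scope-guard idiom: the rollback (cancelClone) runs automatically on every early-return path and is disarmed only once the whole sequence has succeeded. A minimal self-contained sketch of the same idiom (the ScopeGuard class below is illustrative, not MongoDB's util/scopeguard.h):

#include <functional>
#include <iostream>

// Illustrative scope guard: runs the stored callable on destruction unless dismissed.
class ScopeGuard {
public:
    explicit ScopeGuard(std::function<void()> onExit) : _onExit(std::move(onExit)) {}
    ~ScopeGuard() {
        if (_armed)
            _onExit();
    }
    void dismiss() { _armed = false; }

private:
    std::function<void()> _onExit;
    bool _armed = true;
};

bool startCloneSketch(bool stepFails) {
    ScopeGuard guard([] { std::cout << "cancelClone: rolling back\n"; });

    if (stepFails)
        return false;  // guard fires here and performs the cleanup

    guard.dismiss();  // success: disarm the rollback
    return true;
}

int main() {
    startCloneSketch(true);   // prints the rollback message
    startCloneSketch(false);  // prints nothing
}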
void ChunkManager::calcInitSplitsAndShards(const ShardId& primaryShardId,
                                           const vector<BSONObj>* initPoints,
                                           const set<ShardId>* initShardIds,
                                           vector<BSONObj>* splitPoints,
                                           vector<ShardId>* shardIds) const {
    verify(_chunkMap.size() == 0);

    unsigned long long numObjects = 0;
    Chunk c(this,
            _keyPattern.getKeyPattern().globalMin(),
            _keyPattern.getKeyPattern().globalMax(),
            primaryShardId);

    if (!initPoints || !initPoints->size()) {
        // discover split points
        {
            const auto primaryShard = grid.shardRegistry()->getShard(primaryShardId);
            // get stats to see if there is any data
            ScopedDbConnection shardConn(primaryShard->getConnString());

            numObjects = shardConn->count(getns());
            shardConn.done();
        }

        if (numObjects > 0)
            c.pickSplitVector(*splitPoints, Chunk::MaxChunkSize);

        // since docs already exist, must use primary shard
        shardIds->push_back(primaryShardId);
    } else {
        // make sure points are unique and ordered
        set<BSONObj> orderedPts;
        for (unsigned i = 0; i < initPoints->size(); ++i) {
            BSONObj pt = (*initPoints)[i];
            orderedPts.insert(pt);
        }
        for (set<BSONObj>::iterator it = orderedPts.begin(); it != orderedPts.end(); ++it) {
            splitPoints->push_back(*it);
        }

        if (!initShardIds || !initShardIds->size()) {
            // If not specified, only use the primary shard (note that it's not safe for mongos
            // to put initial chunks on other shards without the primary mongod knowing).
            shardIds->push_back(primaryShardId);
        } else {
            std::copy(initShardIds->begin(), initShardIds->end(), std::back_inserter(*shardIds));
        }
    }
}
StatusWith<boost::optional<executor::RemoteCommandRequest>>
ShardingNetworkConnectionHook::makeRequest(const HostAndPort& remoteHost) {
    auto shard = grid.shardRegistry()->getShardForHostNoReload(remoteHost);
    if (!shard) {
        return {ErrorCodes::ShardNotFound,
                str::stream() << "No shard found for host: " << remoteHost.toString()};
    }

    if (shard->isConfig()) {
        // No need to initialize sharding metadata if talking to a config server
        return {boost::none};
    }

    SetShardVersionRequest ssv = SetShardVersionRequest::makeForInitNoPersist(
        grid.shardRegistry()->getConfigServerConnectionString(),
        shard->getId(),
        shard->getConnString());

    executor::RemoteCommandRequest request;
    request.dbname = "admin";
    request.target = remoteHost;
    request.timeout = Seconds{30};
    request.cmdObj = ssv.toBSON();

    return {request};
}
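makeRequest has a three-way outcome: an error (unknown host), an engaged-empty result (no setup needed, proceed), or a request to send before the connection is used. A self-contained sketch of that shape using std::variant and std::optional as stand-ins for StatusWith and boost::optional (all names below are illustrative):

#include <iostream>
#include <optional>
#include <string>
#include <variant>

struct Request { std::string target; };  // illustrative stand-in for RemoteCommandRequest

// Error string, or an optional request: nullopt means "nothing to send".
using HookResult = std::variant<std::string, std::optional<Request>>;

HookResult makeRequestSketch(const std::string& host, bool hostKnown, bool isConfig) {
    if (!hostKnown)
        return std::string("No shard found for host: ") + host;
    if (isConfig)
        return std::optional<Request>{};  // config servers need no init request
    return std::optional<Request>{Request{host}};
}

int main() {
    auto res = makeRequestSketch("shard1:27018", true, false);
    if (auto* err = std::get_if<std::string>(&res)) {
        std::cout << "error: " << *err << '\n';
    } else if (auto& req = std::get<std::optional<Request>>(res)) {
        std::cout << "send init request to " << req->target << '\n';
    } else {
        std::cout << "no request needed\n";
    }
}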
/**
 * Updates the remote cached version on the remote shard host (primary, in the case of replica
 * sets) if needed with a fully-qualified shard version for the given namespace:
 *   config server(s) + shard name + shard version
 *
 * If no remote cached version has ever been set, an initial shard version is sent.
 *
 * If the namespace is empty and no version has ever been sent, the config server + shard name
 * is sent to the remote shard host to initialize the connection as coming from mongos.
 * NOTE: This initialization is *best-effort only*. Operations which wish to correctly version
 * must send the namespace.
 *
 * Config servers are special and are not (unless otherwise a shard) kept up to date with this
 * protocol. This is safe so long as config servers only contain unversioned collections.
 *
 * It is an error to call checkShardVersion with an unversionable connection (isVersionableCB).
 *
 * @return true if we contacted the remote host
 */
bool checkShardVersion(DBClientBase* conn_in,
                       const string& ns,
                       ChunkManagerPtr refManager,
                       bool authoritative,
                       int tryNumber) {
    // TODO: cache, optimize, etc...

    // Empty namespaces are special - we require initialization but not versioning
    if (ns.size() == 0) {
        return initShardVersionEmptyNS(conn_in);
    }

    auto status = grid.catalogCache()->getDatabase(nsToDatabase(ns));
    if (!status.isOK()) {
        return false;
    }

    shared_ptr<DBConfig> conf = status.getValue();

    DBClientBase* conn = getVersionable(conn_in);
    verify(conn);  // errors thrown above

    unsigned long long officialSequenceNumber = 0;

    ShardPtr primary;
    ChunkManagerPtr manager;
    if (authoritative)
        conf->getChunkManagerIfExists(ns, true);

    conf->getChunkManagerOrPrimary(ns, manager, primary);

    if (manager) {
        officialSequenceNumber = manager->getSequenceNumber();
    }

    const auto shard = grid.shardRegistry()->getShard(conn->getServerAddress());
    uassert(ErrorCodes::ShardNotFound,
            str::stream() << conn->getServerAddress() << " is not recognized as a shard",
            shard);

    // Check this manager against the reference manager
    if (manager) {
        if (refManager && !refManager->compatibleWith(*manager, shard->getId())) {
            const ChunkVersion refVersion(refManager->getVersion(shard->getId()));
            const ChunkVersion currentVersion(manager->getVersion(shard->getId()));

            string msg(str::stream()
                       << "manager (" << currentVersion.toString() << " : "
                       << manager->getSequenceNumber() << ") "
                       << "not compatible with reference manager (" << refVersion.toString()
                       << " : " << refManager->getSequenceNumber() << ") "
                       << "on shard " << shard->getId() << " ("
                       << shard->getConnString().toString() << ")");

            throw SendStaleConfigException(ns, msg, refVersion, currentVersion);
        }
    } else if (refManager) {
        string msg(str::stream() << "not sharded ("
                                 << ((manager.get() == 0) ? string("<none>")
                                                          : str::stream()
                                                                << manager->getSequenceNumber())
                                 << ") but has reference manager ("
                                 << refManager->getSequenceNumber() << ") "
                                 << "on conn " << conn->getServerAddress() << " ("
                                 << conn_in->getServerAddress() << ")");

        throw SendStaleConfigException(
            ns, msg, refManager->getVersion(shard->getId()), ChunkVersion::UNSHARDED());
    }

    // Do not send setShardVersion to collections on the config servers - this causes problems
    // when config servers are also shards and get SSV with conflicting names.
    // TODO: Make config servers regular shards
    if (primary && primary->getId() == "config") {
        return false;
    }

    // Has the ChunkManager been reloaded since the last time we updated the shard version over
    // this connection? If we've never updated the shard version, do so now.
    unsigned long long sequenceNumber = 0;
    if (connectionShardStatus.getSequence(conn, ns, &sequenceNumber)) {
        if (sequenceNumber == officialSequenceNumber) {
            return false;
        }
    }

    ChunkVersion version = ChunkVersion(0, 0, OID());
    if (manager) {
        version = manager->getVersion(shard->getId());
    }

    LOG(1) << "setting shard version of " << version << " for " << ns << " on shard "
           << shard->toString();

    LOG(3) << "last version sent with chunk manager iteration " << sequenceNumber
           << ", current chunk manager iteration is " << officialSequenceNumber;

    BSONObj result;
    if (setShardVersion(*conn,
                        ns,
                        grid.catalogManager()->connectionString().toString(),
                        version,
                        manager.get(),
                        authoritative,
                        result)) {
        LOG(1) << " setShardVersion success: " << result;
        connectionShardStatus.setSequence(conn, ns, officialSequenceNumber);
        return true;
    }

    LOG(1) << " setShardVersion failed!\n" << result << endl;

    if (result["need_authoritative"].trueValue())
        massert(10428, "need_authoritative set but in authoritative mode already", !authoritative);

    if (!authoritative) {
        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1);
        return true;
    }

    if (result["reloadConfig"].trueValue()) {
        if (result["version"].timestampTime() == Date_t()) {
            warning() << "reloading full configuration for " << conf->name()
                      << ", connection state indicates significant version changes";

            // reload db
            conf->reload();
        } else {
            // reload config
            conf->getChunkManager(ns, true);
        }
    }

    const int maxNumTries = 7;
    if (tryNumber < maxNumTries) {
        LOG(tryNumber < (maxNumTries / 2) ? 1 : 0)
            << "going to retry checkShardVersion shard: " << shard->toString() << " " << result;

        sleepmillis(10 * tryNumber);

        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
        return true;
    }

    string errmsg = str::stream() << "setShardVersion failed shard: " << shard->toString() << " "
                                  << result;
    log() << " " << errmsg << endl;
    massert(10429, errmsg, 0);
    return true;
}
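The tail of checkShardVersion retries with a linearly growing sleep (10 ms times the attempt number), capped at seven tries before giving up with massert. A self-contained sketch of that retry shape in standard C++ (the failing operation and its success condition are illustrative):

#include <chrono>
#include <iostream>
#include <thread>

// Illustrative stand-in for an operation that may fail transiently.
bool trySetShardVersion(int attempt) {
    return attempt >= 3;  // pretend it succeeds on the third try
}

bool retryWithLinearBackoff() {
    const int maxNumTries = 7;  // same cap as checkShardVersion
    for (int tryNumber = 1; tryNumber <= maxNumTries; ++tryNumber) {
        if (trySetShardVersion(tryNumber))
            return true;
        std::cout << "retrying, attempt " << tryNumber << '\n';
        // Linear backoff: 10 ms times the attempt number, as in sleepmillis(10 * tryNumber).
        std::this_thread::sleep_for(std::chrono::milliseconds(10 * tryNumber));
    }
    return false;  // out of retries; the real code masserts here
}

int main() {
    std::cout << (retryWithLinearBackoff() ? "succeeded" : "gave up") << '\n';
}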
Status ClusterAggregate::runAggregate(OperationContext* txn,
                                      const Namespaces& namespaces,
                                      BSONObj cmdObj,
                                      int options,
                                      BSONObjBuilder* result) {
    auto dbname = namespaces.executionNss.db().toString();
    auto status = grid.catalogCache()->getDatabase(txn, dbname);
    if (!status.isOK()) {
        appendEmptyResultSet(*result, status.getStatus(), namespaces.requestedNss.ns());
        return Status::OK();
    }

    std::shared_ptr<DBConfig> conf = status.getValue();

    if (!conf->isShardingEnabled()) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj);
    if (!request.isOK()) {
        return request.getStatus();
    }

    // Determine the appropriate collation and 'resolve' involved namespaces to make the
    // ExpressionContext.
    //
    // We won't try to execute anything on a mongos, but we still have to populate this map so
    // that any $lookups, etc. will be able to have a resolved view definition. It's okay that
    // this is incorrect, we will repopulate the real resolved namespace map on the mongod.
    // Note that we need to check if any involved collections are sharded before forwarding an
    // aggregation command on an unsharded collection.
    StringMap<ExpressionContext::ResolvedNamespace> resolvedNamespaces;
    LiteParsedPipeline liteParsedPipeline(request.getValue());
    for (auto&& ns : liteParsedPipeline.getInvolvedNamespaces()) {
        uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns()));
        resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}};
    }

    if (!conf->isSharded(namespaces.executionNss.ns())) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns());

    std::unique_ptr<CollatorInterface> collation;
    if (!request.getValue().getCollation().isEmpty()) {
        collation = uassertStatusOK(CollatorFactoryInterface::get(txn->getServiceContext())
                                        ->makeFromBSON(request.getValue().getCollation()));
    } else if (chunkMgr->getDefaultCollator()) {
        collation = chunkMgr->getDefaultCollator()->clone();
    }

    boost::intrusive_ptr<ExpressionContext> mergeCtx = new ExpressionContext(
        txn, request.getValue(), std::move(collation), std::move(resolvedNamespaces));
    mergeCtx->inRouter = true;
    // explicitly *not* setting mergeCtx->tempDir

    // Parse and optimize the pipeline specification.
    auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();
    }

    pipeline.getValue()->optimizePipeline();

    // If the first $match stage is an exact match on the shard key (with a simple collation or
    // no string matching), we only have to send it to one shard, so send the command to that
    // shard.
    BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery();
    BSONObj shardKeyMatches;
    shardKeyMatches = uassertStatusOK(
        chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
    bool singleShard = false;
    if (!shardKeyMatches.isEmpty()) {
        auto chunk = chunkMgr->findIntersectingChunk(
            txn, shardKeyMatches, request.getValue().getCollation());
        if (chunk.isOK()) {
            singleShard = true;
        }
    }

    // Don't need to split pipeline if the first $match is an exact match on shard key, unless
    // there is a stage that needs to be run on the primary shard.
    const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger();
    const bool needSplit = !singleShard || needPrimaryShardMerger;

    // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true,
    // 'pipeline' will become the merger side.
    boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? pipeline.getValue()->splitForSharded()
                                                           : pipeline.getValue());

    // Create the command for the shards. The 'fromRouter' field means produce output to be
    // merged.
    MutableDocument commandBuilder(request.getValue().serializeToCommandObj());
    commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize());
    if (needSplit) {
        commandBuilder[AggregationRequest::kFromRouterName] = Value(true);
        commandBuilder[AggregationRequest::kCursorName] =
            Value(DOC(AggregationRequest::kBatchSizeName << 0));
    }

    // These fields are not part of the AggregationRequest since they are not handled by the
    // aggregation subsystem, so we serialize them separately.
    const std::initializer_list<StringData> fieldsToPropagateToShards = {
        "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS,
    };
    for (auto&& field : fieldsToPropagateToShards) {
        commandBuilder[field] = Value(cmdObj[field]);
    }

    BSONObj shardedCommand = commandBuilder.freeze().toBson();
    BSONObj shardQuery = shardPipeline->getInitialQuery();

    // Run the command on the shards
    // TODO need to make sure cursors are killed if a retry is needed
    std::vector<Strategy::CommandResult> shardResults;
    Strategy::commandOp(txn,
                        dbname,
                        shardedCommand,
                        options,
                        namespaces.executionNss.ns(),
                        shardQuery,
                        request.getValue().getCollation(),
                        &shardResults);

    if (mergeCtx->isExplain) {
        // This must be checked before we start modifying result.
        uassertAllShardsSupportExplain(shardResults);

        if (needSplit) {
            *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline"
                    << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart"
                                        << pipeline.getValue()->writeExplainOps());
        } else {
            *result << "splitPipeline" << BSONNULL;
        }

        BSONObjBuilder shardExplains(result->subobjStart("shards"));
        for (size_t i = 0; i < shardResults.size(); i++) {
            shardExplains.append(shardResults[i].shardTargetId,
                                 BSON("host" << shardResults[i].target.toString() << "stages"
                                             << shardResults[i].result["stages"]));
        }

        return Status::OK();
    }

    if (!needSplit) {
        invariant(shardResults.size() == 1);
        invariant(shardResults[0].target.getServers().size() == 1);
        auto executorPool = grid.getExecutorPool();
        const BSONObj reply =
            uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0],
                                                shardResults[0].result,
                                                namespaces.requestedNss,
                                                executorPool->getArbitraryExecutor(),
                                                grid.getCursorManager()));
        result->appendElements(reply);
        return getStatusFromCommandResult(reply);
    }

    pipeline.getValue()->addInitialSource(
        DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx));

    MutableDocument mergeCmd(request.getValue().serializeToCommandObj());
    mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize());
    mergeCmd["cursor"] = Value(cmdObj["cursor"]);

    if (cmdObj.hasField("$queryOptions")) {
        mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]);
    }

    if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) {
        mergeCmd[QueryRequest::cmdOptionMaxTimeMS] =
            Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]);
    }

    mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"]));
    mergeCmd.setField("readConcern", Value(cmdObj["readConcern"]));

    // If the user didn't specify a collation already, make sure there's a collation attached to
    // the merge command, since the merging shard may not have the collection metadata.
    if (mergeCmd.peek()["collation"].missing()) {
        mergeCmd.setField("collation",
                          mergeCtx->getCollator()
                              ? Value(mergeCtx->getCollator()->getSpec().toBSON())
                              : Value(Document{CollationSpec::kSimpleSpec}));
    }

    std::string outputNsOrEmpty;
    if (DocumentSourceOut* out =
            dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) {
        outputNsOrEmpty = out->getOutputNs().ns();
    }

    // Run merging command on random shard, unless a stage needs the primary shard. Need to use
    // ShardConnection so that the merging mongod is sent the config servers on connection init.
    auto& prng = txn->getClient()->getPrng();
    const auto& mergingShardId = (needPrimaryShardMerger || internalQueryAlwaysMergeOnPrimaryShard)
        ? conf->getPrimaryId()
        : shardResults[prng.nextInt32(shardResults.size())].shardTargetId;
    const auto mergingShard = uassertStatusOK(grid.shardRegistry()->getShard(txn, mergingShardId));

    ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty);
    BSONObj mergedResults =
        aggRunCommand(conn.get(), namespaces, mergeCmd.freeze().toBson(), options);
    conn.done();

    if (auto wcErrorElem = mergedResults["writeConcernError"]) {
        appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result);
    }

    // Copy output from merging (primary) shard to the output object from our command.
    // Also, propagates errmsg and code if ok == false.
    result->appendElementsUnique(mergedResults);
    return getStatusFromCommandResult(result->asTempObj());
}
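One detail from the merge step above: when no stage requires the primary shard, the merging shard is picked at random from the shards that returned results, which spreads merge load across the cluster. A self-contained sketch of that selection (the shard list is illustrative, and std::mt19937 stands in for txn->getClient()->getPrng()):

#include <iostream>
#include <random>
#include <string>
#include <vector>

int main() {
    // Illustrative stand-ins for shardResults[i].shardTargetId.
    std::vector<std::string> shardIds = {"shard0000", "shard0001", "shard0002"};

    bool needPrimaryShardMerger = false;  // true when a stage (e.g. $out) needs the primary
    std::string primaryShardId = "shard0000";

    std::mt19937 prng{std::random_device{}()};
    std::uniform_int_distribution<size_t> pick(0, shardIds.size() - 1);

    // Same shape as the mergingShardId selection in runAggregate.
    const std::string mergingShardId =
        needPrimaryShardMerger ? primaryShardId : shardIds[pick(prng)];

    std::cout << "merging on " << mergingShardId << '\n';
}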
bool appendRawResponses(OperationContext* opCtx,
                        std::string* errmsg,
                        BSONObjBuilder* output,
                        std::vector<AsyncRequestsSender::Response> shardResponses,
                        std::set<ErrorCodes::Error> ignoredErrors) {
    // Always include ShardNotFound as an ignored error, since this node may not have realized a
    // shard has been removed.
    ignoredErrors.insert(ErrorCodes::ShardNotFound);

    BSONObjBuilder subobj;  // Stores raw responses by ConnectionString

    // Stores all errors; we will remove ignoredErrors later if some shard returned success.
    std::vector<std::pair<std::string, Status>> errors;  // Stores errors by ConnectionString

    BSONElement wcErrorElem;  // Stores the first writeConcern error we encounter
    ShardId wcErrorShardId;   // Stores the shardId for the first writeConcern error we encounter
    bool hasWCError = false;  // Whether we have encountered a writeConcern error yet

    for (const auto& shardResponse : shardResponses) {
        // Get the Shard object in order to get the shard's ConnectionString.
        const auto swShard =
            Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardResponse.shardId);
        if (ErrorCodes::ShardNotFound == swShard.getStatus().code()) {
            // Store the error by ShardId, since we cannot know the shard connection string,
            // and it is only used for reporting.
            errors.push_back(
                std::make_pair(shardResponse.shardId.toString(), swShard.getStatus()));
            continue;
        }
        const auto shard = uassertStatusOK(swShard);
        const auto shardConnStr = shard->getConnString().toString();

        Status sendStatus = shardResponse.swResponse.getStatus();
        if (!sendStatus.isOK()) {
            // Convert the error status back into the form of a command result and append it as
            // the raw response.
            BSONObjBuilder statusObjBob;
            CommandHelpers::appendCommandStatusNoThrow(statusObjBob, sendStatus);
            subobj.append(shardConnStr, statusObjBob.obj());

            errors.push_back(std::make_pair(shardConnStr, sendStatus));
            continue;
        }

        // Got a response from the shard.

        auto& resObj = shardResponse.swResponse.getValue().data;

        // Append the shard's raw response.
        subobj.append(shardConnStr, CommandHelpers::filterCommandReplyForPassthrough(resObj));

        auto commandStatus = getStatusFromCommandResult(resObj);
        if (!commandStatus.isOK()) {
            errors.push_back(std::make_pair(shardConnStr, std::move(commandStatus)));
        }

        // Report the first writeConcern error we see.
        if (!hasWCError && (wcErrorElem = resObj["writeConcernError"])) {
            wcErrorShardId = shardResponse.shardId;
            hasWCError = true;
        }
    }

    output->append("raw", subobj.done());

    if (hasWCError) {
        appendWriteConcernErrorToCmdResponse(wcErrorShardId, wcErrorElem, *output);
    }

    // If any shard returned success, filter out ignored errors
    bool someShardReturnedOK = (errors.size() != shardResponses.size());

    BSONObjBuilder errorBob;
    int commonErrCode = -1;
    auto it = errors.begin();
    while (it != errors.end()) {
        if (someShardReturnedOK && ignoredErrors.find(it->second.code()) != ignoredErrors.end()) {
            // Ignore the error.
            it = errors.erase(it);
        } else {
            errorBob.append(it->first, it->second.reason());
            if (commonErrCode == -1) {
                commonErrCode = it->second.code();
            } else if (commonErrCode != it->second.code()) {
                commonErrCode = 0;
            }
            ++it;
        }
    }
    BSONObj errobj = errorBob.obj();

    if (!errobj.isEmpty()) {
        *errmsg = errobj.toString();

        // If every error has a code, and the code for all errors is the same, then add
        // a top-level field "code" with this value to the output object.
        if (commonErrCode > 0) {
            output->append("code", commonErrCode);
            output->append("codeName", ErrorCodes::errorString(ErrorCodes::Error(commonErrCode)));
            if (errors.size() == 1) {
                // Only propagate extra info if there was a single error object.
                if (auto extraInfo = errors.begin()->second.extraInfo()) {
                    extraInfo->serialize(output);
                }
            }
        }
        return false;
    }
    return true;
}
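The commonErrCode bookkeeping above collapses per-shard error codes into a single value: -1 while no error has been seen, the shared code as long as every error agrees, and 0 the moment two codes differ. A self-contained sketch of that reduction (the error codes in the list are illustrative):

#include <iostream>
#include <vector>

int main() {
    // Illustrative per-shard error codes gathered from responses.
    std::vector<int> errorCodes = {13436, 13436, 13436};

    // -1: no errors seen yet; >0: every error shares this code; 0: codes disagree.
    int commonErrCode = -1;
    for (int code : errorCodes) {
        if (commonErrCode == -1) {
            commonErrCode = code;
        } else if (commonErrCode != code) {
            commonErrCode = 0;
        }
    }

    // Only a strictly positive value is reported as the top-level "code" field.
    if (commonErrCode > 0)
        std::cout << "code: " << commonErrCode << '\n';
    else
        std::cout << "no single common error code\n";
}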
bool RunOnAllShardsCommand::run(OperationContext* txn,
                                const std::string& dbName,
                                BSONObj& cmdObj,
                                int options,
                                std::string& errmsg,
                                BSONObjBuilder& output) {
    LOG(1) << "RunOnAllShardsCommand db: " << dbName << " cmd:" << redact(cmdObj);

    if (_implicitCreateDb) {
        uassertStatusOK(ScopedShardDatabase::getOrCreate(txn, dbName));
    }

    std::vector<ShardId> shardIds;
    getShardIds(txn, dbName, cmdObj, shardIds);

    std::list<std::shared_ptr<Future::CommandResult>> futures;
    for (const ShardId& shardId : shardIds) {
        const auto shard = grid.shardRegistry()->getShard(txn, shardId);
        if (!shard) {
            continue;
        }

        futures.push_back(Future::spawnCommand(
            shard->getConnString().toString(), dbName, cmdObj, 0, NULL, _useShardConn));
    }

    std::vector<ShardAndReply> results;
    BSONObjBuilder subobj(output.subobjStart("raw"));
    BSONObjBuilder errors;
    int commonErrCode = -1;

    std::list<std::shared_ptr<Future::CommandResult>>::iterator futuresit;
    std::vector<ShardId>::const_iterator shardIdsIt;

    BSONElement wcErrorElem;
    ShardId wcErrorShardId;
    bool hasWCError = false;

    // We iterate over the set of shard ids and their corresponding futures in parallel.
    // TODO: replace with zip iterator if we ever decide to use one from Boost or elsewhere
    for (futuresit = futures.begin(), shardIdsIt = shardIds.cbegin();
         futuresit != futures.end() && shardIdsIt != shardIds.end();
         ++futuresit, ++shardIdsIt) {
        std::shared_ptr<Future::CommandResult> res = *futuresit;

        if (res->join(txn)) {
            // success :)
            BSONObj result = res->result();
            results.emplace_back(shardIdsIt->toString(), result);
            subobj.append(res->getServer(), result);

            if (!hasWCError) {
                if ((wcErrorElem = result["writeConcernError"])) {
                    wcErrorShardId = *shardIdsIt;
                    hasWCError = true;
                }
            }
            continue;
        }

        BSONObj result = res->result();

        if (!hasWCError) {
            if ((wcErrorElem = result["writeConcernError"])) {
                wcErrorShardId = *shardIdsIt;
                hasWCError = true;
            }
        }

        if (result["errmsg"].type() || result["code"].numberInt() != 0) {
            result = specialErrorHandler(res->getServer(), dbName, cmdObj, result);

            BSONElement errmsgObj = result["errmsg"];
            if (errmsgObj.eoo() || errmsgObj.String().empty()) {
                // it was fixed!
                results.emplace_back(shardIdsIt->toString(), result);
                subobj.append(res->getServer(), result);
                continue;
            }
        }

        // Handle "errmsg".
        if (!result["errmsg"].eoo()) {
            errors.appendAs(result["errmsg"], res->getServer());
        } else {
            // Can happen if message is empty, for some reason
            errors.append(res->getServer(),
                          str::stream() << "result without error message returned : " << result);
        }

        // Handle "code".
        int errCode = result["code"].numberInt();
        if (commonErrCode == -1) {
            commonErrCode = errCode;
        } else if (commonErrCode != errCode) {
            commonErrCode = 0;
        }

        results.emplace_back(shardIdsIt->toString(), result);
        subobj.append(res->getServer(), result);
    }

    subobj.done();

    if (hasWCError) {
        appendWriteConcernErrorToCmdResponse(wcErrorShardId, wcErrorElem, output);
    }

    BSONObj errobj = errors.done();

    if (!errobj.isEmpty()) {
        errmsg = errobj.toString();

        // If every error has a code, and the code for all errors is the same, then add
        // a top-level field "code" with this value to the output object.
        if (commonErrCode > 0) {
            output.append("code", commonErrCode);
        }

        return false;
    }

    aggregateResults(results, output);
    return true;
}
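RunOnAllShardsCommand fans the command out first (spawnCommand for every shard) and only then joins each future in turn, so the shards do their work concurrently. A self-contained sketch of that scatter-then-gather shape using std::async (shard names and the per-shard work function are illustrative):

#include <future>
#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-in for running one command on one shard.
std::string runOnShard(const std::string& shardId) {
    return shardId + ": ok";
}

int main() {
    std::vector<std::string> shardIds = {"shard0000", "shard0001", "shard0002"};

    // Scatter: launch every request before waiting on any of them.
    std::vector<std::future<std::string>> futures;
    for (const auto& shardId : shardIds)
        futures.push_back(std::async(std::launch::async, runOnShard, shardId));

    // Gather: join the futures in the same order the shards were targeted.
    for (size_t i = 0; i < futures.size(); ++i)
        std::cout << shardIds[i] << " -> " << futures[i].get() << '\n';
}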
Status CatalogManagerReplicaSet::shardCollection(OperationContext* txn,
                                                 const string& ns,
                                                 const ShardKeyPattern& fieldsAndOrder,
                                                 bool unique,
                                                 const vector<BSONObj>& initPoints,
                                                 const set<ShardId>& initShardIds) {
    // Lock the collection globally so that no other mongos can try to shard or drop the
    // collection at the same time.
    auto scopedDistLock = getDistLockManager()->lock(ns, "shardCollection");
    if (!scopedDistLock.isOK()) {
        return scopedDistLock.getStatus();
    }

    auto status = getDatabase(txn, nsToDatabase(ns));
    if (!status.isOK()) {
        return status.getStatus();
    }

    ShardId dbPrimaryShardId = status.getValue().value.getPrimary();
    const auto primaryShard = grid.shardRegistry()->getShard(txn, dbPrimaryShardId);

    {
        // In 3.0 and prior we include this extra safety check that the collection is not getting
        // sharded concurrently by two different mongos instances. It is not 100%-proof, but it
        // reduces the chance that two invocations of shard collection will step on each other's
        // toes. Now we take the distributed lock so going forward this check won't be necessary
        // but we leave it around for compatibility with other mongoses from 3.0.
        // TODO(spencer): Remove this after 3.2 ships.
        auto countStatus = _runCountCommandOnConfig(
            txn, NamespaceString(ChunkType::ConfigNS), BSON(ChunkType::ns(ns)));
        if (!countStatus.isOK()) {
            return countStatus.getStatus();
        }
        if (countStatus.getValue() > 0) {
            return Status(ErrorCodes::AlreadyInitialized,
                          str::stream() << "collection " << ns << " already sharded with "
                                        << countStatus.getValue() << " chunks.");
        }
    }

    // Record start in changelog
    {
        BSONObjBuilder collectionDetail;
        collectionDetail.append("shardKey", fieldsAndOrder.toBSON());
        collectionDetail.append("collection", ns);
        collectionDetail.append("primary", primaryShard->toString());

        {
            BSONArrayBuilder initialShards(collectionDetail.subarrayStart("initShards"));
            for (const ShardId& shardId : initShardIds) {
                initialShards.append(shardId);
            }
        }

        collectionDetail.append("numChunks", static_cast<int>(initPoints.size() + 1));

        logChange(txn,
                  txn->getClient()->clientAddress(true),
                  "shardCollection.start",
                  ns,
                  collectionDetail.obj());
    }

    shared_ptr<ChunkManager> manager(new ChunkManager(ns, fieldsAndOrder, unique));
    manager->createFirstChunks(txn, dbPrimaryShardId, &initPoints, &initShardIds);
    manager->loadExistingRanges(txn, nullptr);

    CollectionInfo collInfo;
    collInfo.useChunkManager(manager);
    collInfo.save(txn, ns);
    manager->reload(txn, true);

    // Tell the primary mongod to refresh its data
    // TODO: Think the real fix here is for mongos to just
    //       assume that all collections are sharded, when we get there
    SetShardVersionRequest ssv = SetShardVersionRequest::makeForVersioningNoPersist(
        grid.shardRegistry()->getConfigServerConnectionString(),
        dbPrimaryShardId,
        primaryShard->getConnString(),
        NamespaceString(ns),
        manager->getVersion(),
        true);

    auto ssvStatus = grid.shardRegistry()->runCommandWithNotMasterRetries(
        txn, dbPrimaryShardId, "admin", ssv.toBSON());
    if (!ssvStatus.isOK()) {
        warning() << "could not update initial version of " << ns << " on shard primary "
                  << dbPrimaryShardId << ssvStatus.getStatus();
    }

    logChange(txn,
              txn->getClient()->clientAddress(true),
              "shardCollection",
              ns,
              BSON("version" << manager->getVersion().toString()));

    return Status::OK();
}