void DBConfig::CollectionInfo::shard(const string& ns, const ShardKeyPattern& key, bool unique) {
    _cm.reset(new ChunkManager(ns, key, unique));
    _key = key.key().getOwned();
    _unique = unique;
    _dirty = true;
    _dropped = false;
}
ChunkManager::ChunkManager(const string& ns, const ShardKeyPattern& pattern, bool unique)
    : _ns(ns),
      _keyPattern(pattern.getKeyPattern()),
      _unique(unique),
      _sequenceNumber(NextSequenceNumber.addAndFetch(1)),
      _chunkRanges() {}
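// Hedged sketch (illustrative, not from the original source): how the two
// pieces above fit together. CollectionInfo::shard() builds a fresh
// ChunkManager for the namespace and caches an owned copy of the key pattern.
// `collInfo` is an assumed variable of type DBConfig::CollectionInfo.
//
//   ShardKeyPattern keyPattern(BSON("x" << 1));
//   collInfo.shard("test.coll", keyPattern, /*unique=*/false);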
bool CollectionShardingState::_shouldSplitChunk(OperationContext* opCtx,
                                                const ShardKeyPattern& shardKeyPattern,
                                                const Chunk& chunk) {
    const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration();
    invariant(balancerConfig);

    const KeyPattern keyPattern = shardKeyPattern.getKeyPattern();
    const bool minIsInf = (0 == keyPattern.globalMin().woCompare(chunk.getMin()));
    const bool maxIsInf = (0 == keyPattern.globalMax().woCompare(chunk.getMax()));

    return chunk.shouldSplit(balancerConfig->getMaxChunkSizeBytes(), minIsInf, maxIsInf);
}
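// Worked illustration (assumed key pattern {x: 1}): for a chunk covering
// [{x: MinKey}, {x: 0}), keyPattern.globalMin() compares equal to the chunk's
// min, so minIsInf is true while maxIsInf is false. Chunks touching the
// extremes of the key space are flagged this way because monotonically
// increasing or decreasing shard keys concentrate inserts there, letting
// Chunk::shouldSplit() treat such boundary chunks more aggressively.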
StatusWith<std::vector<BSONObj>> selectChunkSplitPoints(OperationContext* txn,
                                                        const ShardId& shardId,
                                                        const NamespaceString& nss,
                                                        const ShardKeyPattern& shardKeyPattern,
                                                        const BSONObj& minKey,
                                                        const BSONObj& maxKey,
                                                        long long chunkSizeBytes,
                                                        int maxPoints,
                                                        int maxObjs) {
    // Assemble the splitVector command, which asks the shard owning the range
    // to compute size-based split points.
    BSONObjBuilder cmd;
    cmd.append("splitVector", nss.ns());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    cmd.append(kMinKey, minKey);
    cmd.append(kMaxKey, maxKey);
    cmd.append("maxChunkSizeBytes", chunkSizeBytes);
    cmd.append("maxSplitPoints", maxPoints);
    cmd.append("maxChunkObjects", maxObjs);

    auto shard = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (!shard) {
        return Status(ErrorCodes::ShardNotFound,
                      str::stream() << "shard " << shardId << " not found");
    }

    auto cmdStatus = shard->runCommand(txn,
                                       ReadPreferenceSetting{ReadPreference::PrimaryPreferred},
                                       "admin",
                                       cmd.obj(),
                                       Shard::RetryPolicy::kIdempotent);
    if (!cmdStatus.isOK()) {
        return std::move(cmdStatus.getStatus());
    }
    if (!cmdStatus.getValue().commandStatus.isOK()) {
        return std::move(cmdStatus.getValue().commandStatus);
    }

    const auto response = std::move(cmdStatus.getValue().response);

    std::vector<BSONObj> splitPoints;

    BSONObjIterator it(response.getObjectField("splitKeys"));
    while (it.more()) {
        splitPoints.push_back(it.next().Obj().getOwned());
    }

    return std::move(splitPoints);
}
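// For reference, the splitVector command assembled above has roughly this
// shape (field values illustrative; kMinKey and kMaxKey are assumed to
// serialize as "min" and "max"):
//
//   { splitVector: "db.coll",
//     keyPattern: { x: 1 },
//     min: { x: MinKey },
//     max: { x: MaxKey },
//     maxChunkSizeBytes: 67108864,
//     maxSplitPoints: 8192,
//     maxChunkObjects: 250000 }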
StatusWith<std::vector<BSONObj>> selectChunkSplitPoints(OperationContext* txn,
                                                        const ShardId& shardId,
                                                        const NamespaceString& nss,
                                                        const ShardKeyPattern& shardKeyPattern,
                                                        const ChunkRange& chunkRange,
                                                        long long chunkSizeBytes,
                                                        boost::optional<int> maxObjs) {
    // Assemble the splitVector command; the chunk range supplies the min/max bounds.
    BSONObjBuilder cmd;
    cmd.append("splitVector", nss.ns());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    chunkRange.append(&cmd);
    cmd.append("maxChunkSizeBytes", chunkSizeBytes);
    if (maxObjs) {
        cmd.append("maxChunkObjects", *maxObjs);
    }

    auto shardStatus = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (!shardStatus.isOK()) {
        return shardStatus.getStatus();
    }

    auto cmdStatus = shardStatus.getValue()->runCommandWithFixedRetryAttempts(
        txn,
        ReadPreferenceSetting{ReadPreference::PrimaryPreferred},
        "admin",
        cmd.obj(),
        Shard::RetryPolicy::kIdempotent);
    if (!cmdStatus.isOK()) {
        return std::move(cmdStatus.getStatus());
    }
    if (!cmdStatus.getValue().commandStatus.isOK()) {
        return std::move(cmdStatus.getValue().commandStatus);
    }

    const auto response = std::move(cmdStatus.getValue().response);

    std::vector<BSONObj> splitPoints;

    BSONObjIterator it(response.getObjectField("splitKeys"));
    while (it.more()) {
        splitPoints.push_back(it.next().Obj().getOwned());
    }

    return std::move(splitPoints);
}
StatusWith<BSONObj> selectMedianKey(OperationContext* txn,
                                    const ShardId& shardId,
                                    const NamespaceString& nss,
                                    const ShardKeyPattern& shardKeyPattern,
                                    const BSONObj& minKey,
                                    const BSONObj& maxKey) {
    BSONObjBuilder cmd;
    cmd.append("splitVector", nss.ns());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    cmd.append(kMinKey, minKey);
    cmd.append(kMaxKey, maxKey);
    // force:true asks splitVector for the single median key of the range.
    cmd.appendBool("force", true);

    auto shard = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (!shard) {
        return Status(ErrorCodes::ShardNotFound,
                      str::stream() << "shard " << shardId << " not found");
    }

    auto cmdStatus = shard->runCommand(txn,
                                       ReadPreferenceSetting{ReadPreference::PrimaryPreferred},
                                       "admin",
                                       cmd.obj(),
                                       Shard::RetryPolicy::kIdempotent);
    if (!cmdStatus.isOK()) {
        return std::move(cmdStatus.getStatus());
    }
    if (!cmdStatus.getValue().commandStatus.isOK()) {
        return std::move(cmdStatus.getValue().commandStatus);
    }

    const auto response = std::move(cmdStatus.getValue().response);

    BSONObjIterator it(response.getObjectField("splitKeys"));
    if (it.more()) {
        return it.next().Obj().getOwned();
    }

    return BSONObj();
}
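// Hedged usage sketch (not from the original source): using selectMedianKey()
// above to pick a middle split point for a chunk spanning the entire key
// space, as a manual "split in half" would. exampleMedianSplitPoint is a
// hypothetical helper; all parameters are assumed to come from the caller.
StatusWith<BSONObj> exampleMedianSplitPoint(OperationContext* txn,
                                            const ShardId& shardId,
                                            const NamespaceString& nss,
                                            const ShardKeyPattern& shardKeyPattern) {
    const KeyPattern keyPattern = shardKeyPattern.getKeyPattern();
    // Because selectMedianKey sets force:true, splitVector returns at most one
    // key: the median of the requested range (or an empty BSONObj if none).
    return selectMedianKey(
        txn, shardId, nss, shardKeyPattern, keyPattern.globalMin(), keyPattern.globalMax());
}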
Status CatalogManagerReplicaSet::shardCollection(OperationContext* txn,
                                                 const string& ns,
                                                 const ShardKeyPattern& fieldsAndOrder,
                                                 bool unique,
                                                 const vector<BSONObj>& initPoints,
                                                 const set<ShardId>& initShardIds) {
    // Lock the collection globally so that no other mongos can try to shard or drop the collection
    // at the same time.
    auto scopedDistLock = getDistLockManager()->lock(ns, "shardCollection");
    if (!scopedDistLock.isOK()) {
        return scopedDistLock.getStatus();
    }

    StatusWith<DatabaseType> status = getDatabase(nsToDatabase(ns));
    if (!status.isOK()) {
        return status.getStatus();
    }

    DatabaseType dbt = status.getValue();
    ShardId dbPrimaryShardId = dbt.getPrimary();
    const auto primaryShard = grid.shardRegistry()->getShard(dbPrimaryShardId);

    {
        // In 3.0 and prior we include this extra safety check that the collection is not getting
        // sharded concurrently by two different mongos instances. It is not 100%-proof, but it
        // reduces the chance that two invocations of shard collection will step on each other's
        // toes. Now we take the distributed lock so going forward this check won't be necessary
        // but we leave it around for compatibility with other mongoses from 3.0.
        // TODO(spencer): Remove this after 3.2 ships.
        const auto configShard = grid.shardRegistry()->getShard("config");
        const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
        if (!readHost.isOK()) {
            return readHost.getStatus();
        }

        auto countStatus = _runCountCommand(
            readHost.getValue(), NamespaceString(ChunkType::ConfigNS), BSON(ChunkType::ns(ns)));
        if (!countStatus.isOK()) {
            return countStatus.getStatus();
        }

        if (countStatus.getValue() > 0) {
            return Status(ErrorCodes::AlreadyInitialized,
                          str::stream() << "collection " << ns << " already sharded with "
                                        << countStatus.getValue() << " chunks.");
        }
    }

    // Record start in changelog
    {
        BSONObjBuilder collectionDetail;
        collectionDetail.append("shardKey", fieldsAndOrder.toBSON());
        collectionDetail.append("collection", ns);
        collectionDetail.append("primary", primaryShard->toString());

        {
            BSONArrayBuilder initialShards(collectionDetail.subarrayStart("initShards"));
            for (const ShardId& shardId : initShardIds) {
                initialShards.append(shardId);
            }
        }

        collectionDetail.append("numChunks", static_cast<int>(initPoints.size() + 1));

        logChange(txn->getClient()->clientAddress(true),
                  "shardCollection.start",
                  ns,
                  collectionDetail.obj());
    }

    ChunkManagerPtr manager(new ChunkManager(ns, fieldsAndOrder, unique));
    manager->createFirstChunks(dbPrimaryShardId, &initPoints, &initShardIds);
    manager->loadExistingRanges(nullptr);

    CollectionInfo collInfo;
    collInfo.useChunkManager(manager);
    collInfo.save(ns);
    manager->reload(true);

    // TODO(spencer) SERVER-19319: Send setShardVersion to primary shard so it knows to start
    // rejecting unversioned writes.

    BSONObj finishDetail = BSON("version"
                                << "");  // TODO(spencer) SERVER-19319 Report actual version used
    logChange(txn->getClient()->clientAddress(true), "shardCollection", ns, finishDetail);

    return Status::OK();
}
StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints(
    OperationContext* txn,
    const ShardId& shardId,
    const NamespaceString& nss,
    const ShardKeyPattern& shardKeyPattern,
    ChunkVersion collectionVersion,
    const BSONObj& minKey,
    const BSONObj& maxKey,
    const std::vector<BSONObj>& splitPoints) {
    invariant(!splitPoints.empty());
    invariant(minKey.woCompare(maxKey) < 0);

    const size_t kMaxSplitPoints = 8192;

    if (splitPoints.size() > kMaxSplitPoints) {
        return {ErrorCodes::BadValue,
                str::stream() << "Cannot split chunk in more than " << kMaxSplitPoints
                              << " parts at a time."};
    }

    BSONObjBuilder cmd;
    cmd.append("splitChunk", nss.ns());
    cmd.append("configdb",
               Grid::get(txn)->shardRegistry()->getConfigServerConnectionString().toString());
    cmd.append("from", shardId.toString());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    collectionVersion.appendForCommands(&cmd);
    cmd.append(kMinKey, minKey);
    cmd.append(kMaxKey, maxKey);
    cmd.append("splitKeys", splitPoints);

    BSONObj cmdObj = cmd.obj();

    Status status{ErrorCodes::InternalError, "Uninitialized value"};
    BSONObj cmdResponse;

    auto shard = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (!shard) {
        status = Status(ErrorCodes::ShardNotFound,
                        str::stream() << "shard " << shardId << " not found");
    } else {
        auto cmdStatus = shard->runCommand(txn,
                                           ReadPreferenceSetting{ReadPreference::PrimaryOnly},
                                           "admin",
                                           cmdObj,
                                           Shard::RetryPolicy::kNotIdempotent);
        if (!cmdStatus.isOK()) {
            status = std::move(cmdStatus.getStatus());
        } else {
            status = std::move(cmdStatus.getValue().commandStatus);
            cmdResponse = std::move(cmdStatus.getValue().response);
        }
    }

    if (!status.isOK()) {
        log() << "Split chunk " << redact(cmdObj) << " failed" << causedBy(redact(status));
        return {status.code(), str::stream() << "split failed due to " << status.toString()};
    }

    // If the response carries a kShouldMigrate range, return it so the caller
    // can consider a migration.
    BSONElement shouldMigrateElement;
    status = bsonExtractTypedField(cmdResponse, kShouldMigrate, Object, &shouldMigrateElement);
    if (status.isOK()) {
        auto chunkRangeStatus = ChunkRange::fromBSON(shouldMigrateElement.embeddedObject());
        if (!chunkRangeStatus.isOK()) {
            return chunkRangeStatus.getStatus();
        }

        return boost::optional<ChunkRange>(std::move(chunkRangeStatus.getValue()));
    } else if (status != ErrorCodes::NoSuchKey) {
        warning()
            << "Chunk migration will be skipped because splitChunk returned invalid response: "
            << redact(cmdResponse) << ". Extracting " << kShouldMigrate << " field failed"
            << causedBy(redact(status));
    }

    return boost::optional<ChunkRange>();
}
StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints(
    OperationContext* txn,
    const ShardId& shardId,
    const NamespaceString& nss,
    const ShardKeyPattern& shardKeyPattern,
    ChunkVersion collectionVersion,
    const ChunkRange& chunkRange,
    const std::vector<BSONObj>& splitPoints) {
    invariant(!splitPoints.empty());

    const size_t kMaxSplitPoints = 8192;

    if (splitPoints.size() > kMaxSplitPoints) {
        return {ErrorCodes::BadValue,
                str::stream() << "Cannot split chunk in more than " << kMaxSplitPoints
                              << " parts at a time."};
    }

    // Sanity check that we are not attempting to split at the boundaries of the chunk. This check
    // is already performed at chunk split commit time, but we are performing it here for parity
    // with old auto-split code, which might rely on it.
    if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMin() == splitPoints.front())) {
        const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString()
                                            << ", split point " << splitPoints.front()
                                            << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, msg};
    }

    if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMax() == splitPoints.back())) {
        const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString()
                                            << ", split point " << splitPoints.back()
                                            << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, msg};
    }

    BSONObjBuilder cmd;
    cmd.append("splitChunk", nss.ns());
    cmd.append("configdb",
               Grid::get(txn)->shardRegistry()->getConfigServerConnectionString().toString());
    cmd.append("from", shardId.toString());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    collectionVersion.appendForCommands(&cmd);
    chunkRange.append(&cmd);
    cmd.append("splitKeys", splitPoints);

    BSONObj cmdObj = cmd.obj();

    Status status{ErrorCodes::InternalError, "Uninitialized value"};
    BSONObj cmdResponse;

    auto shardStatus = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (!shardStatus.isOK()) {
        status = shardStatus.getStatus();
    } else {
        auto cmdStatus = shardStatus.getValue()->runCommandWithFixedRetryAttempts(
            txn,
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            "admin",
            cmdObj,
            Shard::RetryPolicy::kNotIdempotent);
        if (!cmdStatus.isOK()) {
            status = std::move(cmdStatus.getStatus());
        } else {
            status = std::move(cmdStatus.getValue().commandStatus);
            cmdResponse = std::move(cmdStatus.getValue().response);
        }
    }

    if (!status.isOK()) {
        log() << "Split chunk " << redact(cmdObj) << " failed" << causedBy(redact(status));
        return {status.code(), str::stream() << "split failed due to " << status.toString()};
    }

    // If the response carries a kShouldMigrate range, return it so the caller
    // can consider a migration.
    BSONElement shouldMigrateElement;
    status = bsonExtractTypedField(cmdResponse, kShouldMigrate, Object, &shouldMigrateElement);
    if (status.isOK()) {
        auto chunkRangeStatus = ChunkRange::fromBSON(shouldMigrateElement.embeddedObject());
        if (!chunkRangeStatus.isOK()) {
            return chunkRangeStatus.getStatus();
        }

        return boost::optional<ChunkRange>(std::move(chunkRangeStatus.getValue()));
    } else if (status != ErrorCodes::NoSuchKey) {
        warning()
            << "Chunk migration will be skipped because splitChunk returned invalid response: "
            << redact(cmdResponse) << ". Extracting " << kShouldMigrate << " field failed"
            << causedBy(redact(status));
    }

    return boost::optional<ChunkRange>();
}
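// Hedged end-to-end sketch (not from the original source): how a caller might
// chain the selectChunkSplitPoints() and splitChunkAtMultiplePoints() overloads
// above. exampleAutoSplit is a hypothetical helper and kMaxChunkSizeBytes an
// illustrative constant; everything else uses only signatures shown above.
StatusWith<boost::optional<ChunkRange>> exampleAutoSplit(OperationContext* txn,
                                                         const ShardId& shardId,
                                                         const NamespaceString& nss,
                                                         const ShardKeyPattern& shardKeyPattern,
                                                         ChunkVersion collectionVersion,
                                                         const ChunkRange& chunkRange) {
    const long long kMaxChunkSizeBytes = 64 * 1024 * 1024;  // illustrative 64 MB limit

    // Ask the shard that owns the chunk for candidate split points.
    auto splitPointsStatus = selectChunkSplitPoints(
        txn, shardId, nss, shardKeyPattern, chunkRange, kMaxChunkSizeBytes, boost::none);
    if (!splitPointsStatus.isOK()) {
        return splitPointsStatus.getStatus();
    }
    if (splitPointsStatus.getValue().empty()) {
        return boost::optional<ChunkRange>();  // chunk is still small enough; nothing to do
    }

    // Commit the split; a returned range, if any, is a candidate for migration.
    return splitChunkAtMultiplePoints(txn,
                                      shardId,
                                      nss,
                                      shardKeyPattern,
                                      collectionVersion,
                                      chunkRange,
                                      splitPointsStatus.getValue());
}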
void ShardingCatalogManager::shardCollection(OperationContext* opCtx,
                                             const NamespaceString& nss,
                                             const boost::optional<UUID> uuid,
                                             const ShardKeyPattern& fieldsAndOrder,
                                             const BSONObj& defaultCollation,
                                             bool unique,
                                             const vector<BSONObj>& splitPoints,
                                             bool isFromMapReduce,
                                             const ShardId& dbPrimaryShardId) {
    const auto shardRegistry = Grid::get(opCtx)->shardRegistry();

    const auto primaryShard = uassertStatusOK(shardRegistry->getShard(opCtx, dbPrimaryShardId));

    // Fail if there are partially written chunks from a previous failed shardCollection.
    checkForExistingChunks(opCtx, nss);

    // Prior to 4.0.5, zones cannot be taken into account at collection sharding time, so ignore
    // them and let the balancer apply them later
    const std::vector<TagsType> treatAsNoZonesDefined;

    // Map/reduce with output to sharded collection ignores consistency checks and requires the
    // initial chunks to be spread across shards unconditionally
    const bool treatAsEmpty = isFromMapReduce;

    // Record start in changelog
    {
        BSONObjBuilder collectionDetail;
        collectionDetail.append("shardKey", fieldsAndOrder.toBSON());
        collectionDetail.append("collection", nss.ns());
        if (uuid)
            uuid->appendToBuilder(&collectionDetail, "uuid");
        collectionDetail.append("empty", treatAsEmpty);
        collectionDetail.append("fromMapReduce", isFromMapReduce);
        collectionDetail.append("primary", primaryShard->toString());
        collectionDetail.append("numChunks", static_cast<int>(splitPoints.size() + 1));
        uassertStatusOK(ShardingLogging::get(opCtx)->logChangeChecked(
            opCtx,
            "shardCollection.start",
            nss.ns(),
            collectionDetail.obj(),
            ShardingCatalogClient::kMajorityWriteConcern));
    }

    // Construct the collection default collator.
    std::unique_ptr<CollatorInterface> defaultCollator;
    if (!defaultCollation.isEmpty()) {
        defaultCollator = uassertStatusOK(CollatorFactoryInterface::get(opCtx->getServiceContext())
                                              ->makeFromBSON(defaultCollation));
    }

    const auto initialChunks = InitialSplitPolicy::createFirstChunks(opCtx,
                                                                     nss,
                                                                     fieldsAndOrder,
                                                                     dbPrimaryShardId,
                                                                     splitPoints,
                                                                     treatAsNoZonesDefined,
                                                                     treatAsEmpty);

    InitialSplitPolicy::writeFirstChunksToConfig(opCtx, initialChunks);

    {
        CollectionType coll;
        coll.setNs(nss);
        if (uuid)
            coll.setUUID(*uuid);
        coll.setEpoch(initialChunks.collVersion().epoch());
        coll.setUpdatedAt(Date_t::fromMillisSinceEpoch(initialChunks.collVersion().toLong()));
        coll.setKeyPattern(fieldsAndOrder.toBSON());
        coll.setDefaultCollation(defaultCollator ? defaultCollator->getSpec().toBSON() : BSONObj());
        coll.setUnique(unique);

        uassertStatusOK(ShardingCatalogClientImpl::updateShardingCatalogEntryForCollection(
            opCtx, nss, coll, true /*upsert*/));
    }

    auto shard = uassertStatusOK(shardRegistry->getShard(opCtx, dbPrimaryShardId));
    invariant(!shard->isConfig());

    // Tell the primary mongod to refresh its data
    SetShardVersionRequest ssv = SetShardVersionRequest::makeForVersioningNoPersist(
        shardRegistry->getConfigServerConnectionString(),
        dbPrimaryShardId,
        primaryShard->getConnString(),
        nss,
        initialChunks.collVersion(),
        true /* isAuthoritative */,
        true /* forceRefresh */);

    auto ssvResponse =
        shard->runCommandWithFixedRetryAttempts(opCtx,
                                                ReadPreferenceSetting{ReadPreference::PrimaryOnly},
                                                "admin",
                                                ssv.toBSON(),
                                                Shard::RetryPolicy::kIdempotent);
    auto status = ssvResponse.isOK() ? std::move(ssvResponse.getValue().commandStatus)
                                     : std::move(ssvResponse.getStatus());
    if (!status.isOK()) {
        warning() << "could not update initial version of " << nss.ns() << " on shard primary "
                  << dbPrimaryShardId << causedBy(redact(status));
    }

    ShardingLogging::get(opCtx)->logChange(
        opCtx,
        "shardCollection.end",
        nss.ns(),
        BSON("version" << initialChunks.collVersion().toString()),
        ShardingCatalogClient::kMajorityWriteConcern);
}
Status CatalogManagerReplicaSet::shardCollection(OperationContext* txn,
                                                 const string& ns,
                                                 const ShardKeyPattern& fieldsAndOrder,
                                                 bool unique,
                                                 const vector<BSONObj>& initPoints,
                                                 const set<ShardId>& initShardIds) {
    // Lock the collection globally so that no other mongos can try to shard or drop the collection
    // at the same time.
    auto scopedDistLock = getDistLockManager()->lock(ns, "shardCollection");
    if (!scopedDistLock.isOK()) {
        return scopedDistLock.getStatus();
    }

    auto status = getDatabase(txn, nsToDatabase(ns));
    if (!status.isOK()) {
        return status.getStatus();
    }

    ShardId dbPrimaryShardId = status.getValue().value.getPrimary();
    const auto primaryShard = grid.shardRegistry()->getShard(txn, dbPrimaryShardId);

    {
        // In 3.0 and prior we include this extra safety check that the collection is not getting
        // sharded concurrently by two different mongos instances. It is not 100%-proof, but it
        // reduces the chance that two invocations of shard collection will step on each other's
        // toes. Now we take the distributed lock so going forward this check won't be necessary
        // but we leave it around for compatibility with other mongoses from 3.0.
        // TODO(spencer): Remove this after 3.2 ships.
        auto countStatus = _runCountCommandOnConfig(
            txn, NamespaceString(ChunkType::ConfigNS), BSON(ChunkType::ns(ns)));
        if (!countStatus.isOK()) {
            return countStatus.getStatus();
        }
        if (countStatus.getValue() > 0) {
            return Status(ErrorCodes::AlreadyInitialized,
                          str::stream() << "collection " << ns << " already sharded with "
                                        << countStatus.getValue() << " chunks.");
        }
    }

    // Record start in changelog
    {
        BSONObjBuilder collectionDetail;
        collectionDetail.append("shardKey", fieldsAndOrder.toBSON());
        collectionDetail.append("collection", ns);
        collectionDetail.append("primary", primaryShard->toString());

        {
            BSONArrayBuilder initialShards(collectionDetail.subarrayStart("initShards"));
            for (const ShardId& shardId : initShardIds) {
                initialShards.append(shardId);
            }
        }

        collectionDetail.append("numChunks", static_cast<int>(initPoints.size() + 1));

        logChange(txn,
                  txn->getClient()->clientAddress(true),
                  "shardCollection.start",
                  ns,
                  collectionDetail.obj());
    }

    shared_ptr<ChunkManager> manager(new ChunkManager(ns, fieldsAndOrder, unique));
    manager->createFirstChunks(txn, dbPrimaryShardId, &initPoints, &initShardIds);
    manager->loadExistingRanges(txn, nullptr);

    CollectionInfo collInfo;
    collInfo.useChunkManager(manager);
    collInfo.save(txn, ns);
    manager->reload(txn, true);

    // Tell the primary mongod to refresh its data
    // TODO: Think the real fix here is for mongos to just
    //       assume that all collections are sharded, when we get there
    SetShardVersionRequest ssv = SetShardVersionRequest::makeForVersioningNoPersist(
        grid.shardRegistry()->getConfigServerConnectionString(),
        dbPrimaryShardId,
        primaryShard->getConnString(),
        NamespaceString(ns),
        manager->getVersion(),
        true);

    auto ssvStatus = grid.shardRegistry()->runCommandWithNotMasterRetries(
        txn, dbPrimaryShardId, "admin", ssv.toBSON());
    if (!ssvStatus.isOK()) {
        warning() << "could not update initial version of " << ns << " on shard primary "
                  << dbPrimaryShardId << ssvStatus.getStatus();
    }

    logChange(txn,
              txn->getClient()->clientAddress(true),
              "shardCollection",
              ns,
              BSON("version" << manager->getVersion().toString()));

    return Status::OK();
}
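// Hedged invocation sketch (names assumed, not from the original source):
// sharding "test.coll" on {x: 1} with no preset split points, so
// createFirstChunks() produces a single chunk on the database's primary shard.
//
//   ShardKeyPattern keyPattern(BSON("x" << 1));
//   Status s = catalogManager->shardCollection(txn,
//                                              "test.coll",
//                                              keyPattern,
//                                              /*unique=*/false,
//                                              {} /*initPoints*/,
//                                              {} /*initShardIds*/);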