/**
 * Looks up the config metadata document for the collection 'nss' and returns it parsed as a
 * CollectionType, together with the opTime carried by the find result.
 *
 * Errors from the find or from parsing are returned as-is. NamespaceNotFound is returned when no
 * document comes back or when the parsed entry is flagged as dropped.
 *
 * NOTE(review): in this copy of the file the argument list of the _exhaustiveFindOnConfig call
 * and the closing braces of the blocks below are absent; they must be restored before this
 * compiles.
 */
StatusWith<repl::OpTimeWith<CollectionType>> ShardingCatalogClientImpl::getCollection(
    OperationContext* opCtx, const NamespaceString& nss, repl::ReadConcernLevel readConcernLevel) {
    auto statusFind = _exhaustiveFindOnConfig(opCtx,
    if (!statusFind.isOK()) {
        return statusFind.getStatus();

    const auto& retOpTimePair = statusFind.getValue();
    const auto& retVal = retOpTimePair.value;
    if (retVal.empty()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      stream() << "collection " << nss.ns() << " not found");

    // A non-empty result is required to hold exactly one document.
    invariant(retVal.size() == 1);

    auto parseStatus = CollectionType::fromBSON(retVal.front());
    if (!parseStatus.isOK()) {
        return parseStatus.getStatus();

    auto collType = parseStatus.getValue();
    // An entry flagged as dropped is reported with the same error code as a missing one.
    if (collType.getDropped()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      stream() << "collection " << nss.ns() << " was dropped");

    return repl::OpTimeWith<CollectionType>(collType, retOpTimePair.opTime);
/**
 * Reads shard documents from the config server (no filter, no sort, no limit), parsing and
 * validating each one.
 *
 * A failed find is returned as-is. A document that fails to parse or to validate aborts the call
 * with an error whose context includes the offending document.
 *
 * NOTE(review): this copy of the file is missing lines - the leading arguments of the
 * _exhaustiveFindOnConfig call, any statement that appends to 'shards', the tail of the final
 * return expression and the closing braces are not present.
 */
StatusWith<repl::OpTimeWith<std::vector<ShardType>>> ShardingCatalogClientImpl::getAllShards(
    OperationContext* opCtx, repl::ReadConcernLevel readConcern) {
    std::vector<ShardType> shards;
    auto findStatus = _exhaustiveFindOnConfig(opCtx,
                                              BSONObj(),     // no query filter
                                              BSONObj(),     // no sort
                                              boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();

    for (const BSONObj& doc : findStatus.getValue().value) {
        auto shardRes = ShardType::fromBSON(doc);
        if (!shardRes.isOK()) {
            return shardRes.getStatus().withContext(stream() << "Failed to parse shard document "
                                                             << doc);

        // Parsing succeeded; additionally run the parsed shard's own validation.
        Status validateStatus = shardRes.getValue().validate();
        if (!validateStatus.isOK()) {
            return validateStatus.withContext(stream() << "Failed to validate shard document "
                                                       << doc);


    return repl::OpTimeWith<std::vector<ShardType>>{std::move(shards),
Example #3
/**
 * Looks up (or creates) the list of active migrations for the migration's namespace and builds
 * the remote request that carries the migration's moveChunk command object to 'targetHost',
 * addressed to the admin database.
 *
 * When the namespace has no entry in _activeMigrations yet, a collection distributed lock is
 * acquired first and an empty MigrationsList is inserted for it. The completion callback takes
 * _mutex and forwards the remote response to _complete_inlock.
 *
 * NOTE(review): presumably the caller must already hold _mutex (the "_inlock" suffix) - confirm.
 * Several lines are absent from this copy of the file: the distributed-lock call, the start of
 * the error handling for a failed lock, the statement that adds 'migration' to the list, the
 * executor scheduling call that receives the lambda, and the closing braces.
 */
void MigrationManager::_schedule_inlock(OperationContext* txn,
                                        const HostAndPort& targetHost,
                                        Migration migration) {
    executor::TaskExecutor* const executor = Grid::get(txn)->getExecutorPool()->getFixedExecutor();

    const NamespaceString nss(migration.nss);

    auto it = _activeMigrations.find(nss);
    if (it == _activeMigrations.end()) {
        const std::string whyMessage(stream() << "Migrating chunk(s) in collection " << nss.ns());

        // Acquire the collection distributed lock (blocking call)
        auto statusWithDistLockHandle =

        if (!statusWithDistLockHandle.isOK()) {
                       stream() << "Could not acquire collection lock for " << nss.ns()
                                << " to migrate chunks, due to "
                                << statusWithDistLockHandle.getStatus().reason()));

        // First migration for this namespace: start with an empty list.
        it = _activeMigrations.insert(std::make_pair(nss, MigrationsList())).first;

    auto migrations = &it->second;

    // Add ourselves to the list of migrations on this collection
    auto itMigration = migrations->begin();

    const RemoteCommandRequest remoteRequest(
        targetHost, NamespaceString::kAdminDb.toString(), itMigration->moveChunkCmdObj, txn);

    StatusWith<executor::TaskExecutor::CallbackHandle> callbackHandleWithStatus =
            [this, itMigration](const executor::TaskExecutor::RemoteCommandCallbackArgs& args) {
                // The callback creates its own operation context and destroys the Client on exit.
                ON_BLOCK_EXIT([&] { Client::destroy(); });
                auto txn = cc().makeOperationContext();

                stdx::lock_guard<stdx::mutex> lock(_mutex);
                _complete_inlock(txn.get(), itMigration, args.response);

    if (callbackHandleWithStatus.isOK()) {
        itMigration->callbackHandle = std::move(callbackHandleWithStatus.getValue());

    // NOTE(review): presumably only meant to run when scheduling failed; an early return in the
    // success branch above looks to be missing from this copy.
    _complete_inlock(txn, itMigration, std::move(callbackHandleWithStatus.getStatus()));
Example #4
/**
 * Populates this operation state from the fields of 'cmdObj': "fromShard", "toShard", "epoch",
 * "min" and "max", then resolves both shard names through the shard registry and stores their
 * connection strings.
 *
 * Returns InvalidOptions when a shard name or a bound is empty, the extraction error when the
 * "epoch" field cannot be read as an OID, ShardNotFound when either shard is unknown to the
 * registry, and OK otherwise.
 *
 * NOTE(review): this copy of the file has lost the closing braces of the blocks below and the
 * opening line of the nested scope that holds the shard lookups.
 */
Status ChunkMoveOperationState::initialize(const BSONObj& cmdObj) {
    // Make sure we're as up-to-date as possible with shard information. This catches the case where
    // we might have changed a shard's host by removing/adding a shard with the same name.
    // NOTE(review): the statement the comment above describes is not present in this copy.

    _fromShard = cmdObj["fromShard"].str();
    if (_fromShard.empty()) {
        return {ErrorCodes::InvalidOptions, "need to specify shard to move chunk from"};

    _toShard = cmdObj["toShard"].str();
    if (_toShard.empty()) {
        return {ErrorCodes::InvalidOptions, "need to specify shard to move chunk to"};

    Status epochStatus = bsonExtractOIDField(cmdObj, "epoch", &_collectionEpoch);
    if (!epochStatus.isOK()) {
        return epochStatus;

    _minKey = cmdObj["min"].Obj();
    if (_minKey.isEmpty()) {
        return {ErrorCodes::InvalidOptions, "need to specify a min"};

    _maxKey = cmdObj["max"].Obj();
    if (_maxKey.isEmpty()) {
        return {ErrorCodes::InvalidOptions, "need to specify a max"};

        // Resolve the source shard and remember its connection string.
        std::shared_ptr<Shard> fromShard = grid.shardRegistry()->getShard(_txn, _fromShard);
        if (!fromShard) {
            return {ErrorCodes::ShardNotFound,
                    stream() << "Source shard " << _fromShard
                             << " is missing. This indicates metadata corruption."};

        _fromShardCS = fromShard->getConnString();

        // Same for the destination shard.
        std::shared_ptr<Shard> toShard = grid.shardRegistry()->getShard(_txn, _toShard);
        if (!toShard) {
            return {ErrorCodes::ShardNotFound,
                    stream() << "Destination shard " << _toShard
                             << " is missing. This indicates metadata corruption."};

        _toShardCS = toShard->getConnString();

    return Status::OK();
Example #5
/**
 * Returns the config version of the cluster pointed at by the connection string.
 *
 * @return OK if version found successfully, error status if something bad happened.
 */
// Reads the single document of config.version into '*versionInfo'. A DBException thrown while
// talking to the config server is converted into the returned Status.
//
// NOTE(review): 'catalogManager' is not referenced by the visible body. This copy of the file has
// also lost lines: both branches of the hasConfigData check are empty and the closing braces of
// the blocks are absent.
Status getConfigVersion(CatalogManager* catalogManager, VersionType* versionInfo) {
    try {

        // NOTE(review): the 30 is presumably a socket timeout in seconds - confirm against
        // ScopedDbConnection.
        ScopedDbConnection conn(grid.shardRegistry()->getConfigServerConnectionString(), 30);

        unique_ptr<DBClientCursor> cursor(_safeCursor(conn->query("config.version", BSONObj())));

        // True when any of the shard, database or collection config namespaces has a non-zero
        // document count.
        bool hasConfigData = conn->count(ShardType::ConfigNS) ||
            conn->count(DatabaseType::ConfigNS) || conn->count(CollectionType::ConfigNS);

        if (!cursor->more()) {
            // Version is 1 if we have data, 0 if we're completely empty
            if (hasConfigData) {
            } else {

            return Status::OK();

        BSONObj versionDoc = cursor->next();
        auto versionInfoResult = VersionType::fromBSON(versionDoc);
        if (!versionInfoResult.isOK()) {

            return Status(ErrorCodes::UnsupportedFormat,
                          stream() << "invalid config version document " << versionDoc
                                   << versionInfoResult.getStatus().toString());
        *versionInfo = versionInfoResult.getValue();

        // Anything left on the cursor means config.version holds more than one document.
        if (cursor->more()) {

            return Status(ErrorCodes::RemoteValidationError,
                          stream() << "should only have 1 document "
                                   << "in config.version collection");
    } catch (const DBException& e) {
        return e.toStatus();

    return Status::OK();
/**
 * Returns the metadata entry for database 'dbName'.
 *
 * - An invalid database name yields InvalidNamespace.
 * - The admin and config databases are answered locally, without contacting the config server:
 *   both are reported with ShardRegistry::kConfigServerShardId as primary and a fixed database
 *   version; admin as unsharded, config as sharded. The returned OpTimeWith is constructed
 *   without an explicit opTime in these two cases.
 * - Any other name is fetched with kConfigReadSelector; if that reports NamespaceNotFound the
 *   fetch is retried against the primary, since the database may have been created recently.
 *   A retry failure other than NamespaceNotFound is returned with added context.
 */
StatusWith<repl::OpTimeWith<DatabaseType>> ShardingCatalogClientImpl::getDatabase(
    OperationContext* opCtx, const std::string& dbName, repl::ReadConcernLevel readConcernLevel) {
    if (!NamespaceString::validDBName(dbName, NamespaceString::DollarInDbNameBehavior::Allow)) {
        return {ErrorCodes::InvalidNamespace, stream() << dbName << " is not a valid db name"};
    }

    // The admin database is always hosted on the config server.
    if (dbName == NamespaceString::kAdminDb) {
        DatabaseType dbt(
            dbName, ShardRegistry::kConfigServerShardId, false, databaseVersion::makeFixed());
        return repl::OpTimeWith<DatabaseType>(dbt);
    }

    // The config database's primary shard is always config, and it is always sharded.
    if (dbName == NamespaceString::kConfigDb) {
        DatabaseType dbt(
            dbName, ShardRegistry::kConfigServerShardId, true, databaseVersion::makeFixed());
        return repl::OpTimeWith<DatabaseType>(dbt);
    }

    auto result = _fetchDatabaseMetadata(opCtx, dbName, kConfigReadSelector, readConcernLevel);
    if (result == ErrorCodes::NamespaceNotFound) {
        // If we failed to find the database metadata on the 'nearest' config server, try again
        // against the primary, in case the database was recently created.
        result = _fetchDatabaseMetadata(
            opCtx, dbName, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, readConcernLevel);
        if (!result.isOK() && (result != ErrorCodes::NamespaceNotFound)) {
            return result.getStatus().withContext(
                str::stream() << "Could not confirm non-existence of database " << dbName);
        }
    }

    // Either the successful lookup, or NamespaceNotFound confirmed by the primary.
    return result;
}
/**
 * Returns the tag of the tag range in 'collectionNs' that fully contains 'chunk' (range min
 * <= chunk min and range max >= chunk max), or an empty string when no such range exists.
 *
 * Errors from host targeting or from the find are returned as-is; a tag document that cannot be
 * parsed yields FailedToParse.
 */
StatusWith<string> CatalogManagerReplicaSet::getTagForChunk(const std::string& collectionNs,
                                                            const ChunkType& chunk) {
    auto configShard = grid.shardRegistry()->getShard("config");
    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    BSONObj query =
        BSON(TagsType::ns(collectionNs) << TagsType::min() << BSON("$lte" << chunk.getMin())
                                        << TagsType::max() << BSON("$gte" << chunk.getMax()));
    // The find is limited to one document, so at most one tag document can come back.
    auto findStatus = grid.shardRegistry()->exhaustiveFind(
        readHostStatus.getValue(), NamespaceString(TagsType::ConfigNS), query, BSONObj(), 1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docs = findStatus.getValue();
    if (docs.empty()) {
        // No tag range covers the chunk.
        return string{};
    }

    invariant(docs.size() == 1);
    BSONObj tagsDoc = docs.front();

    const auto tagsResult = TagsType::fromBSON(tagsDoc);
    if (!tagsResult.isOK()) {
        return {ErrorCodes::FailedToParse,
                stream() << "error while parsing " << TagsType::ConfigNS << " document: " << tagsDoc
                         << " : " << tagsResult.getStatus().toString()};
    }
    return tagsResult.getValue().getTag();
}
/**
 * Returns the config metadata for the collection 'collNs', parsed as a CollectionType.
 *
 * Errors from host targeting or from the find are returned as-is; NamespaceNotFound is returned
 * when the find yields no document. The result of CollectionType::fromBSON is returned directly,
 * so a parse failure surfaces as that call's error.
 *
 * NOTE(review): in this copy of the file the initializer of 'statusFind' and the closing braces
 * of the blocks below are absent.
 */
StatusWith<CollectionType> CatalogManagerReplicaSet::getCollection(const std::string& collNs) {
    auto configShard = grid.shardRegistry()->getShard("config");

    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();

    auto statusFind =

    if (!statusFind.isOK()) {
        return statusFind.getStatus();

    const auto& retVal = statusFind.getValue();
    if (retVal.empty()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      stream() << "collection " << collNs << " not found");

    // A non-empty result is required to hold exactly one document.
    invariant(retVal.size() == 1);

    return CollectionType::fromBSON(retVal.front());
/**
 * Runs a find through the shard registry against a config host chosen with kConfigReadSelector
 * and parses every returned document as a ChunkType.
 *
 * Errors from host targeting or from the find are returned as-is; a document that fails to
 * parse yields FailedToParse, with the chunk's name field in the message.
 *
 * NOTE(review): 'query', 'nToReturn' and 'chunks' are not referenced by the visible body. This
 * copy of the file has lost the middle arguments of the exhaustiveFind call, presumably the
 * statement that appends each parsed chunk to '*chunks', and the closing braces.
 */
Status CatalogManagerReplicaSet::getChunks(const Query& query,
                                           int nToReturn,
                                           vector<ChunkType>* chunks) {

    auto configShard = grid.shardRegistry()->getShard("config");
    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHostStatus.getValue(),
                                                           boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();

    for (const BSONObj& obj : findStatus.getValue()) {
        auto chunkRes = ChunkType::fromBSON(obj);
        if (!chunkRes.isOK()) {
            return {ErrorCodes::FailedToParse,
                    stream() << "Failed to parse chunk with id ("
                             << obj[ChunkType::name()].toString()
                             << "): " << chunkRes.getStatus().toString()};


    return Status::OK();
/**
 * Runs an unfiltered, unlimited find through the shard registry against a config host chosen
 * with kConfigReadSelector and parses every returned document as a ShardType.
 *
 * Errors from host targeting or from the find are returned as-is; a document that fails to
 * parse yields FailedToParse, with the shard's name field in the message.
 *
 * NOTE(review): 'shards' is not referenced by the visible body. This copy of the file has lost
 * some arguments of the exhaustiveFind call, presumably the statement that appends each parsed
 * shard to '*shards', and the closing braces.
 */
Status CatalogManagerReplicaSet::getAllShards(vector<ShardType>* shards) {
    const auto configShard = grid.shardRegistry()->getShard("config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHost.getValue(),
                                                           BSONObj(),     // no query filter
                                                           boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();

    for (const BSONObj& doc : findStatus.getValue()) {
        auto shardRes = ShardType::fromBSON(doc);
        if (!shardRes.isOK()) {
            return {ErrorCodes::FailedToParse,
                    stream() << "Failed to parse shard with id ("
                             << doc[ShardType::name()].toString()
                             << "): " << shardRes.getStatus().toString()};


    return Status::OK();
/**
 * Fetches the config metadata document for database 'dbName' and returns it parsed as a
 * DatabaseType, together with the opTime carried by the find result.
 *
 * Must not be called for the admin or config databases (enforced by an invariant). Errors from
 * the find or from parsing are returned as-is; NamespaceNotFound is returned when no document
 * comes back.
 *
 * NOTE(review): 'readPref' and 'readConcernLevel' are presumably forwarded to the find, but in
 * this copy of the file the argument list of the _exhaustiveFindOnConfig call is cut off after
 * 'opCtx' and the closing braces of the blocks below are absent.
 */
StatusWith<repl::OpTimeWith<DatabaseType>> ShardingCatalogClientImpl::_fetchDatabaseMetadata(
    OperationContext* opCtx,
    const std::string& dbName,
    const ReadPreferenceSetting& readPref,
    repl::ReadConcernLevel readConcernLevel) {
    invariant(dbName != NamespaceString::kAdminDb && dbName != NamespaceString::kConfigDb);

    auto findStatus = _exhaustiveFindOnConfig(opCtx,
    if (!findStatus.isOK()) {
        return findStatus.getStatus();

    const auto& docsWithOpTime = findStatus.getValue();
    if (docsWithOpTime.value.empty()) {
        return {ErrorCodes::NamespaceNotFound, stream() << "database " << dbName << " not found"};

    // A non-empty result is required to hold exactly one document.
    invariant(docsWithOpTime.value.size() == 1);

    auto parseStatus = DatabaseType::fromBSON(docsWithOpTime.value.front());
    if (!parseStatus.isOK()) {
        return parseStatus.getStatus();

    return repl::OpTimeWith<DatabaseType>(parseStatus.getValue(), docsWithOpTime.opTime);
Example #12
/**
 * Returns a newly allocated copy of this metadata with 'chunk' added to its chunk map and the
 * shard version set to 'newShardVersion'. The copy's collection version is the larger of
 * 'newShardVersion' and this object's collection version.
 *
 * Returns NULL, after logging a warning and writing the reason to '*errMsg', when
 * 'newShardVersion' is not set or when 'chunk' overlaps a chunk already in _chunksMap.
 * 'errMsg' may be NULL. The returned pointer comes from unique_ptr::release(), so the caller
 * takes ownership of it.
 *
 * NOTE(review): this copy of the file has lost the closing braces of the blocks below. Unlike
 * clonePlusPending, no statement here copies or rebuilds _rangesMap - confirm whether further
 * statements are missing.
 */
CollectionMetadata* CollectionMetadata::clonePlusChunk(const ChunkType& chunk,
                                                       const ChunkVersion& newShardVersion,
                                                       string* errMsg) const {
    // The error message string is optional.
    string dummy;
    if (errMsg == NULL) {
        errMsg = &dummy;

    // It is acceptable to move version backwards (e.g., undoing a migration that went bad
    // during commit) but only cloning away the last chunk may reset the version to 0.
    if (!newShardVersion.isSet()) {
        *errMsg = stream() << "cannot add chunk " << rangeToString(chunk.getMin(), chunk.getMax())
                           << " with zero shard version";

        warning() << *errMsg;
        return NULL;

    // The chunk's min key must compare strictly below its max key.
    invariant(chunk.getMin().woCompare(chunk.getMax()) < 0);

    // Check that there isn't any chunk on the interval to be added.
    if (rangeMapOverlaps(_chunksMap, chunk.getMin(), chunk.getMax())) {
        RangeVector overlap;
        getRangeMapOverlap(_chunksMap, chunk.getMin(), chunk.getMax(), &overlap);

        *errMsg = stream() << "cannot add chunk " << rangeToString(chunk.getMin(), chunk.getMax())
                           << " because the chunk overlaps " << overlapToString(overlap);

        warning() << *errMsg;
        return NULL;

    unique_ptr<CollectionMetadata> metadata(new CollectionMetadata);
    metadata->_keyPattern = this->_keyPattern;
    metadata->_pendingMap = this->_pendingMap;
    metadata->_chunksMap = this->_chunksMap;
    // The inserted bounds are owned copies, independent of the buffers backing 'chunk'.
    metadata->_chunksMap.insert(make_pair(chunk.getMin().getOwned(), chunk.getMax().getOwned()));
    metadata->_shardVersion = newShardVersion;
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion;

    return metadata.release();
Example #13
/**
 * Turns the response of a remote moveChunk command into the Status to report for the migration
 * and then tries to remove the migration document through 'scopedMigrationRequest'. _mutex is
 * held for the whole call.
 *
 * Returns BalancerInterrupted when the response status is attributed to a config stepdown.
 * Otherwise the command status is taken from the response; a status that
 * Shard::shouldErrorBePropagated rejects is wrapped as OperationFailed. If removing the
 * migration document fails, the command status is replaced by a "balancer is stopping" status.
 *
 * NOTE(review): this copy of the file has lost lines - the argument list of
 * extractMigrationStatusFromCommandResponse, the error code of the final commandStatus
 * assignment, and the closing braces of the blocks below.
 */
Status MigrationManager::_processRemoteCommandResponse(
    const RemoteCommandResponse& remoteCommandResponse,
    ScopedMigrationRequest* scopedMigrationRequest) {

    stdx::lock_guard<stdx::mutex> lock(_mutex);
    // Placeholder value; overwritten by one of the branches below.
    Status commandStatus(ErrorCodes::InternalError, "Uninitialized value.");

    // Check for local errors sending the remote command caused by stepdown.
    if (isErrorDueToConfigStepdown(remoteCommandResponse.status,
                                   _state != State::kEnabled && _state != State::kRecovering)) {
        return {ErrorCodes::BalancerInterrupted,
                stream() << "Migration interrupted because the balancer is stopping."
                         << " Command status: "
                         << remoteCommandResponse.status.toString()};

    if (!remoteCommandResponse.isOK()) {
        commandStatus = remoteCommandResponse.status;
    } else {
        // TODO: delete in 3.8
        commandStatus = extractMigrationStatusFromCommandResponse(;

    // Errors that should not be propagated are replaced by a generic OperationFailed that keeps
    // the original status as its cause.
    if (!Shard::shouldErrorBePropagated(commandStatus.code())) {
        commandStatus = {ErrorCodes::OperationFailed,
                         stream() << "moveChunk command failed on source shard."
                                  << causedBy(commandStatus)};

    // Any failure to remove the migration document should be because the config server is
    // stepping/shutting down. In this case we must fail the moveChunk command with a retryable
    // error so that the caller does not move on to other distlock requiring operations that could
    // fail when the balancer recovers and takes distlocks for migration recovery.
    Status status = scopedMigrationRequest->tryToRemoveMigration();
    if (!status.isOK()) {
        // The message carries the removal error when the command itself succeeded, and the
        // command's own error otherwise.
        commandStatus = {
            stream() << "Migration interrupted because the balancer is stopping"
                     << " and failed to remove the config.migrations document."
                     << " Command status: "
                     << (commandStatus.isOK() ? status.toString() : commandStatus.toString())};

    return commandStatus;
Example #14
/**
 * Returns a newly allocated copy of this metadata with the range of 'pending' inserted into its
 * pending map. Versions, key pattern, chunk map and ranges map are copied unchanged.
 *
 * Returns NULL, after logging a warning and writing the reason to '*errMsg', when 'pending'
 * overlaps a chunk already in _chunksMap. Overlap with existing pending ranges is only warned
 * about. 'errMsg' may be NULL. The returned pointer comes from unique_ptr::release(), so the
 * caller takes ownership of it.
 *
 * NOTE(review): this copy of the file has lost the closing braces of the blocks below and the
 * body of the loop over 'pendingOverlap' (per the comment above it, presumably the removal of
 * each overlapped pending range from the clone).
 */
CollectionMetadata* CollectionMetadata::clonePlusPending(const ChunkType& pending,
                                                         string* errMsg) const {
    // The error message string is optional.
    string dummy;
    if (errMsg == NULL) {
        errMsg = &dummy;

    // A pending range may not overlap a chunk this metadata already holds.
    if (rangeMapOverlaps(_chunksMap, pending.getMin(), pending.getMax())) {
        RangeVector overlap;
        getRangeMapOverlap(_chunksMap, pending.getMin(), pending.getMax(), &overlap);

        *errMsg = stream() << "cannot add pending chunk "
                           << rangeToString(pending.getMin(), pending.getMax())
                           << " because the chunk overlaps " << overlapToString(overlap);

        warning() << *errMsg;
        return NULL;

    unique_ptr<CollectionMetadata> metadata(new CollectionMetadata);
    metadata->_keyPattern = this->_keyPattern;
    metadata->_pendingMap = this->_pendingMap;
    metadata->_chunksMap = this->_chunksMap;
    metadata->_rangesMap = this->_rangesMap;
    metadata->_shardVersion = _shardVersion;
    metadata->_collVersion = _collVersion;

    // If there are any pending chunks on the interval to be added this is ok, since pending
    // chunks aren't officially tracked yet and something may have changed on servers we do not
    // see yet.
    // We remove any chunks we overlap, the remote request starting a chunk migration must have
    // been authoritative.

    if (rangeMapOverlaps(_pendingMap, pending.getMin(), pending.getMax())) {
        RangeVector pendingOverlap;
        getRangeMapOverlap(_pendingMap, pending.getMin(), pending.getMax(), &pendingOverlap);

        warning() << "new pending chunk " << rangeToString(pending.getMin(), pending.getMax())
                  << " overlaps existing pending chunks " << overlapToString(pendingOverlap)
                  << ", a migration may not have completed";

        for (RangeVector::iterator it = pendingOverlap.begin(); it != pendingOverlap.end(); ++it) {

    metadata->_pendingMap.insert(make_pair(pending.getMin(), pending.getMax()));

    return metadata.release();
Example #15
/**
 * Returns a newly allocated copy of this metadata, intended to be without the pending range
 * described by 'pending'.
 *
 * Returns NULL, after logging a warning and writing the reason to '*errMsg', when _pendingMap
 * does not contain exactly that range; the message additionally lists overlapping pending
 * ranges if there are any. 'errMsg' may be NULL. The returned pointer comes from
 * unique_ptr::release(), so the caller takes ownership of it.
 *
 * NOTE(review): this copy of the file has lost the closing braces of the blocks below, and no
 * visible statement removes the range from the clone's _pendingMap - that statement is
 * presumably missing too.
 */
CollectionMetadata* CollectionMetadata::cloneMinusPending(const ChunkType& pending,
                                                          string* errMsg) const {
    // The error message string is optional.
    string dummy;
    if (errMsg == NULL) {
        errMsg = &dummy;

    // Check that we have the exact chunk that will be subtracted.
    if (!rangeMapContains(_pendingMap, pending.getMin(), pending.getMax())) {
        *errMsg = stream() << "cannot remove pending chunk "
                           << rangeToString(pending.getMin(), pending.getMax())
                           << ", this shard does not contain the chunk";

        // Extend the message with the overlapping pending ranges, when there are any.
        if (rangeMapOverlaps(_pendingMap, pending.getMin(), pending.getMax())) {
            RangeVector overlap;
            getRangeMapOverlap(_pendingMap, pending.getMin(), pending.getMax(), &overlap);

            *errMsg += stream() << " and it overlaps " << overlapToString(overlap);

        warning() << *errMsg;
        return NULL;

    unique_ptr<CollectionMetadata> metadata(new CollectionMetadata);
    metadata->_keyPattern = this->_keyPattern;
    metadata->_pendingMap = this->_pendingMap;
    metadata->_chunksMap = this->_chunksMap;
    metadata->_rangesMap = this->_rangesMap;
    metadata->_shardVersion = _shardVersion;
    metadata->_collVersion = _collVersion;

    return metadata.release();
/**
 * Runs a find on ChunkType::ConfigNS with the given 'query', 'sort' and optional 'limit', and
 * parses each returned document as a ChunkType.
 *
 * Unless this node is a config server, 'readConcern' must be kMajorityReadConcern (enforced by
 * an invariant). When 'opTime' is non-null it receives the opTime carried by the find result.
 * A failed find is returned with the context "Failed to load chunks"; a document that fails to
 * parse aborts the call with context naming the chunk's name field.
 *
 * NOTE(review): this copy of the file has lost the closing braces of the blocks below and,
 * presumably, the statement that appends each parsed chunk to 'chunks' - as visible, the
 * returned vector is never filled.
 */
StatusWith<std::vector<ChunkType>> ShardingCatalogClientImpl::getChunks(
    OperationContext* opCtx,
    const BSONObj& query,
    const BSONObj& sort,
    boost::optional<int> limit,
    OpTime* opTime,
    repl::ReadConcernLevel readConcern) {
    invariant(serverGlobalParams.clusterRole == ClusterRole::ConfigServer ||
              readConcern == repl::ReadConcernLevel::kMajorityReadConcern);

    // Convert boost::optional<int> to boost::optional<long long>.
    auto longLimit = limit ? boost::optional<long long>(*limit) : boost::none;
    auto findStatus = _exhaustiveFindOnConfig(
        opCtx, kConfigReadSelector, readConcern, ChunkType::ConfigNS, query, sort, longLimit);
    if (!findStatus.isOK()) {
        return findStatus.getStatus().withContext("Failed to load chunks");

    const auto& chunkDocsOpTimePair = findStatus.getValue();

    std::vector<ChunkType> chunks;
    for (const BSONObj& obj : chunkDocsOpTimePair.value) {
        auto chunkRes = ChunkType::fromConfigBSON(obj);
        if (!chunkRes.isOK()) {
            return chunkRes.getStatus().withContext(stream() << "Failed to parse chunk with id "
                                                             << obj[ChunkType::name()]);


    // The opTime out-parameter is optional.
    if (opTime) {
        *opTime = chunkDocsOpTimePair.opTime;

    return chunks;
/**
 * Returns the metadata entry for database 'dbName'.
 *
 * "config" and "admin" are answered locally with a DatabaseType built in place. Any other name
 * is looked up through the shard registry on a config host chosen with kConfigReadSelector:
 * errors from host targeting or from the find are returned as-is, NamespaceNotFound is returned
 * when no document comes back, and otherwise the result of DatabaseType::fromBSON is returned.
 *
 * NOTE(review): this copy of the file has lost lines - whatever populated 'dbt' before it is
 * returned, the remaining arguments of the exhaustiveFind call, and the closing braces of the
 * blocks below.
 */
StatusWith<DatabaseType> CatalogManagerReplicaSet::getDatabase(const std::string& dbName) {

    // The two databases that are hosted on the config server are config and admin
    if (dbName == "config" || dbName == "admin") {
        DatabaseType dbt;

        return dbt;

    const auto configShard = grid.shardRegistry()->getShard("config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHost.getValue(),

    if (!findStatus.isOK()) {
        return findStatus.getStatus();

    const auto& docs = findStatus.getValue();
    if (docs.empty()) {
        return {ErrorCodes::NamespaceNotFound, stream() << "database " << dbName << " not found"};

    // A non-empty result is required to hold exactly one document.
    invariant(docs.size() == 1);

    return DatabaseType::fromBSON(docs.front());
Example #18
/**
 * Validates the chunk described by 'migrateInfo' against the existing chunk manager for its
 * namespace, builds a Migration for it and hands it to _scheduleWithoutDistLock (when
 * 'shardTakesCollectionDistLock' is true) or _scheduleWithDistLock (otherwise).
 *
 * Returns the migration's completion notification. On the early exits below (no chunk manager,
 * chunk bounds not matching, chunk already in place, no host found for the shard) a fresh
 * notification constructed from the corresponding status is returned instead.
 *
 * NOTE(review): this copy of the file has lost many lines - the argument of the first
 * make_shared, the error code of the "does not exist" status, the right-hand side of the shard
 * id comparison, the arguments of findHost, the call that fills 'builder' (presumably where
 * 'maxChunkSizeBytes', 'secondaryThrottle' and 'waitForDelete' are used), and the closing
 * braces of the blocks below.
 */
shared_ptr<Notification<Status>> MigrationManager::_schedule(
    OperationContext* txn,
    const MigrateInfo& migrateInfo,
    bool shardTakesCollectionDistLock,
    uint64_t maxChunkSizeBytes,
    const MigrationSecondaryThrottleOptions& secondaryThrottle,
    bool waitForDelete) {
    const NamespaceString nss(migrateInfo.ns);

    // Sanity checks that the chunk being migrated is actually valid. These will be repeated at the
    // shard as well, but doing them here saves an extra network call, which might otherwise fail.
    auto statusWithScopedChunkManager = ScopedChunkManager::getExisting(txn, nss);
    if (!statusWithScopedChunkManager.isOK()) {
        return std::make_shared<Notification<Status>>(

    ChunkManager* const chunkManager = statusWithScopedChunkManager.getValue().cm();

    auto chunk = chunkManager->findIntersectingChunkWithSimpleCollation(txn, migrateInfo.minKey);

    // If the chunk is not found exactly as requested, the caller must have stale data
    if (chunk->getMin() != migrateInfo.minKey || chunk->getMax() != migrateInfo.maxKey) {
        return std::make_shared<Notification<Status>>(Status(
            stream() << "Chunk " << ChunkRange(migrateInfo.minKey, migrateInfo.maxKey).toString()
                     << " does not exist."));

    // If chunk is already on the correct shard, just treat the operation as success
    if (chunk->getShardId() == {
        return std::make_shared<Notification<Status>>(Status::OK());

    // NOTE(review): the variable is named 'recipientShard' but is looked up with
    // migrateInfo.from - confirm which shard the command is meant to be sent to.
    const auto recipientShard = Grid::get(txn)->shardRegistry()->getShard(txn, migrateInfo.from);
    auto hostStatus = recipientShard->getTargeter()->findHost(
    if (!hostStatus.isOK()) {
        return std::make_shared<Notification<Status>>(std::move(hostStatus.getStatus()));

    BSONObjBuilder builder;
        ChunkRange(migrateInfo.minKey, migrateInfo.maxKey),

    Migration migration(nss, builder.obj());

    // Copy the notification out before 'migration' is moved into the scheduling call.
    auto retVal = migration.completionNotification;

    if (shardTakesCollectionDistLock) {
        _scheduleWithoutDistLock(txn, hostStatus.getValue(), std::move(migration));
    } else {
        _scheduleWithDistLock(txn, hostStatus.getValue(), std::move(migration));

    return retVal;
/**
 * Returns a copy of this metadata in which a single (minKey, maxKey) entry is inserted into the
 * chunk map, with the shard version set to 'newShardVersion' and the collection version set to
 * the larger of 'newShardVersion' and the current collection version.
 *
 * 'newShardVersion' must have the same epoch as, and be greater than, the current shard version
 * (enforced by invariants). Returns IllegalOperation when fewer than two existing chunks
 * overlap the range, when the overlapping chunks do not start exactly at 'minKey' and end
 * exactly at 'maxKey', or when they are not all adjacent.
 *
 * NOTE(review): this copy of the file has lost the closing braces of the blocks below and the
 * body of the final loop over 'overlap' (presumably the removal of each merged chunk from the
 * clone's chunk map).
 */
StatusWith<std::unique_ptr<CollectionMetadata>> CollectionMetadata::cloneMerge(
    const BSONObj& minKey, const BSONObj& maxKey, const ChunkVersion& newShardVersion) const {
    invariant(newShardVersion.epoch() == _shardVersion.epoch());
    invariant(newShardVersion > _shardVersion);

    RangeVector overlap;
    getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap);

    // A merge needs at least two chunks in the range.
    if (overlap.empty() || overlap.size() == 1) {
        return {ErrorCodes::IllegalOperation,
                stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                         << (overlap.empty() ? ", no chunks found in this range"
                                             : ", only one chunk found in this range")};

    bool validStartEnd = true;
    bool validNoHoles = true;

    // The hole check only runs when both ends of the overlap line up with the requested range.
    if (overlap.begin()->first.woCompare(minKey) != 0) {
        // First chunk doesn't start with minKey
        validStartEnd = false;
    } else if (overlap.rbegin()->second.woCompare(maxKey) != 0) {
        // Last chunk doesn't end with maxKey
        validStartEnd = false;
    } else {
        // Check that there are no holes
        BSONObj prevMaxKey = minKey;
        for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) {
            if (it->first.woCompare(prevMaxKey) != 0) {
                validNoHoles = false;
            prevMaxKey = it->second;

    if (!validStartEnd || !validNoHoles) {
        return {ErrorCodes::IllegalOperation,
                stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                         << ", overlapping chunks " << overlapToString(overlap)
                         << (!validStartEnd ? " do not have the same min and max key"
                                            : " are not all adjacent")};

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_rangesMap = _rangesMap;
    metadata->_shardVersion = newShardVersion;
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion;

    for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) {

    metadata->_chunksMap.insert(make_pair(minKey, maxKey));

    return std::move(metadata);
/**
 * Returns a copy of this metadata in which the chunk (minKey, maxKey) is split at each key of
 * 'splitKeys', with the shard version starting from 'newShardVersion' and the collection
 * version set to the larger of the clone's shard version and the current collection version.
 *
 * 'newShardVersion' must have the same epoch as, and be greater than, the current shard version
 * (enforced by invariants). Returns IllegalOperation when this metadata does not contain
 * exactly the chunk (minKey, maxKey), when a split key lies outside that range, or when the
 * split keys are not in ascending order.
 *
 * NOTE(review): this copy of the file has lost the closing braces of the blocks below. The
 * comment on the _shardVersion assignment promises per-chunk version increments that are not
 * visible in the loop, and _rangesMap is neither copied nor rebuilt - further statements are
 * presumably missing.
 */
StatusWith<std::unique_ptr<CollectionMetadata>> CollectionMetadata::cloneSplit(
    const BSONObj& minKey,
    const BSONObj& maxKey,
    const std::vector<BSONObj>& splitKeys,
    const ChunkVersion& newShardVersion) const {
    invariant(newShardVersion.epoch() == _shardVersion.epoch());
    invariant(newShardVersion > _shardVersion);

    // The version required in both resulting chunks could be simply an increment in the
    // minor portion of the current version.  However, we are enforcing uniqueness over the
    // attributes <ns, version> of the configdb collection 'chunks'.  So in practice, a
    // migrate somewhere may force this split to pick up a version that has the major
    // portion higher than the one that this shard has been using.
    // TODO drop the uniqueness constraint and tighten the check below so that only the
    // minor portion of version changes

    // Check that we have the exact chunk that will be subtracted.
    if (!rangeMapContains(_chunksMap, minKey, maxKey)) {
        stream errMsg;
        errMsg << "cannot split chunk " << rangeToString(minKey, maxKey)
               << ", this shard does not contain the chunk";

        // Extend the message with the overlapping chunks, when there are any.
        if (rangeMapOverlaps(_chunksMap, minKey, maxKey)) {
            RangeVector overlap;
            getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap);

            errMsg << " and it overlaps " << overlapToString(overlap);

        return {ErrorCodes::IllegalOperation, errMsg};

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_shardVersion = newShardVersion;  // will increment 2nd, 3rd,... chunks below

    // 'startKey' tracks the lower bound of the piece currently being cut off.
    BSONObj startKey = minKey;
    for (const auto& split : splitKeys) {
        // Check that the split key is valid
        if (!rangeContains(minKey, maxKey, split)) {
            return {ErrorCodes::IllegalOperation,
                    stream() << "cannot split chunk " << rangeToString(minKey, maxKey) << " at key "
                             << split};

        // Check that the split keys are in order
        if (split.woCompare(startKey) <= 0) {
            // The split keys came in out of order, this probably indicates a bug, so fail the
            // operation. Re-iterate splitKeys to build a useful error message including the array
            // of splitKeys in the order received.
            str::stream errMsg;
            errMsg << "Invalid input to splitChunk, split keys must be in order, got: [";
            for (auto it2 = splitKeys.cbegin(); it2 != splitKeys.cend(); ++it2) {
                if (it2 != splitKeys.begin()) {
                    errMsg << ", ";
                errMsg << it2->toString();
            errMsg << "]";
            return {ErrorCodes::IllegalOperation, errMsg};

        // The piece starting at 'startKey' now ends at 'split'; the remainder runs from 'split'
        // to 'maxKey' until a later split key shortens it.
        metadata->_chunksMap[startKey] = split.getOwned();
        metadata->_chunksMap.insert(make_pair(split.getOwned(), maxKey.getOwned()));
        startKey = split;

    metadata->_collVersion =
        metadata->_shardVersion > _collVersion ? metadata->_shardVersion : _collVersion;

    return std::move(metadata);
Example #21
/**
 * Returns a one-line description of this metadata containing only its collection version and
 * its shard version.
 */
string CollectionMetadata::toStringBasic() const {
    return stream() << "collection version: " << _collVersion.toString()
                    << ", shard version: " << _shardVersion.toString();
}
Example #22
// Acquires the collection distributed lock for this chunk migration and snapshots the sharding
// metadata the migration will work against.
//
// Visible steps, in order: take the dist lock on _nss.ns(); refresh the collection metadata
// through ShardingState (which fills in _shardVersion); reject a zero major shard version; reject
// an epoch that differs from _collectionEpoch; load _collMetadata; and confirm that the chunk
// found at getMinKey() has exactly the bounds requested for this operation.
//
// Returns a pointer to the ScopedDistLock held inside _distLockStatus on success. Every failure
// path visible here logs a warning and returns a non-OK Status carrying the same message.
StatusWith<ForwardingCatalogManager::ScopedDistLock*> ChunkMoveOperationState::acquireMoveMetadata(
    OperationContext* txn) {
    // Get the distributed lock
    const string whyMessage(stream() << "migrating chunk [" << minKey << ", " << maxKey << ") in "
                                     << _nss.ns());
    _distLockStatus = grid.forwardingCatalogManager()->distLock(txn, _nss.ns(), whyMessage);

    if (!_distLockStatus->isOK()) {
        const string msg = stream() << "could not acquire collection lock for " << _nss.ns()
                                    << " to migrate chunk [" << minKey << "," << maxKey << ")"
                                    << causedBy(_distLockStatus->getStatus());
        warning() << msg;
        // Keep the lock attempt's error code, but return the more descriptive message.
        return Status(_distLockStatus->getStatus().code(), msg);

    ShardingState* const shardingState = ShardingState::get(txn);

    // Snapshot the metadata
    // The refresh writes the resulting shard version into _shardVersion (passed by address).
    Status refreshStatus = shardingState->refreshMetadataNow(txn, _nss.ns(), &_shardVersion);
    if (!refreshStatus.isOK()) {
        const string msg = stream() << "moveChunk cannot start migrate of chunk "
                                    << "[" << minKey << "," << maxKey << ")"
                                    << causedBy(refreshStatus.reason());
        warning() << msg;
        return Status(refreshStatus.code(), msg);

    if (_shardVersion.majorVersion() == 0) {
        // It makes no sense to migrate if our version is zero and we have no chunks
        const string msg = stream() << "moveChunk cannot start migrate of chunk "
                                    << "[" << minKey << "," << maxKey << ")"
                                    << " with zero shard version";
        warning() << msg;
        return Status(ErrorCodes::IncompatibleShardingMetadata, msg);

    // The epoch the command was issued with must match the epoch of the refreshed shard version.
    if (_collectionEpoch != _shardVersion.epoch()) {
        const string msg = stream() << "moveChunk cannot move chunk "
                                    << "[" << minKey << "," << maxKey << "), "
                                    << "collection may have been dropped. "
                                    << "current epoch: " << _shardVersion.epoch()
                                    << ", cmd epoch: " << _collectionEpoch;
        warning() << msg;
        return Status(ErrorCodes::IncompatibleShardingMetadata, msg);

    _collMetadata = shardingState->getCollectionMetadata(_nss.ns());

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(_collMetadata->getCollVersion() >= _shardVersion);

    // With nonzero shard version, we must have a shard key
    // NOTE(review): no statement follows the comment above in this excerpt - confirm whether a
    // shard key check is missing here.

    ChunkType origChunk;
    // Fail if there is no chunk at getMinKey(), or if the chunk found there does not have exactly
    // the requested min/max (woCompare() is non-zero when the two keys differ).
    if (!_collMetadata->getNextChunk(getMinKey(), &origChunk) ||
        origChunk.getMin().woCompare(getMinKey()) || origChunk.getMax().woCompare(getMaxKey())) {
        // Our boundaries are different from those passed in
        const string msg = stream() << "moveChunk cannot find chunk "
                                    << "[" << minKey << "," << maxKey << ")"
                                    << " to migrate, the chunk boundaries may be stale";
        warning() << msg;
        return Status(ErrorCodes::IncompatibleShardingMetadata, msg);

    // Success: hand back the lock object that stays owned by _distLockStatus.
    return &_distLockStatus->getValue();
// Inserts 'doc' into the collection 'nss' through the config shard, making up to kMaxWriteRetry
// attempts.
//
// 'nss' must live in the admin or config database (enforced with an invariant).
//
// A DuplicateKey error on the first attempt is returned as-is. On a later attempt it is ambiguous
// (the earlier attempt may already have inserted the document), so the existing document is
// fetched and compared with 'doc': an identical document is reported as success.
//
// NOTE(review): 'writeConcern' and 'idField' are not referenced in the lines visible here, and
// several statements look truncated (see the notes below) - confirm against the full source.
Status ShardingCatalogClientImpl::insertConfigDocument(OperationContext* opCtx,
                                                       const NamespaceString& nss,
                                                       const BSONObj& doc,
                                                       const WriteConcernOptions& writeConcern) {
    invariant(nss.db() == NamespaceString::kAdminDb || nss.db() == NamespaceString::kConfigDb);

    const BSONElement idField = doc.getField("_id");

    // The batched request is built from an immediately-defined lambda producing the insert op.
    // NOTE(review): in this excerpt the lambda neither attaches 'doc' to the op nor is it closed
    // and invoked - confirm the missing lines.
    BatchedCommandRequest request([&] {
        write_ops::Insert insertOp(nss);
        return insertOp;

    auto configShard = Grid::get(opCtx)->shardRegistry()->getConfigShard();
    for (int retry = 1; retry <= kMaxWriteRetry; retry++) {
        // The shard-level call is made with kNoRetry; retrying is driven by this loop instead.
        auto response = configShard->runBatchWriteCommand(
            opCtx, Shard::kDefaultConfigCommandTimeout, request, Shard::RetryPolicy::kNoRetry);

        Status status = response.toStatus();

        if (retry < kMaxWriteRetry &&
            configShard->isRetriableError(status.code(), Shard::RetryPolicy::kIdempotent)) {
            // Pretend like the operation is idempotent because we're handling DuplicateKey errors
            // specially
            // NOTE(review): the body of this branch is not visible in this excerpt; presumably it
            // moves on to the next attempt - confirm.

        // If we get DuplicateKey error on the first attempt to insert, this definitively means that
        // we are trying to insert the same entry a second time, so error out. If it happens on a
        // retry attempt though, it is not clear whether we are actually inserting a duplicate key
        // or it is because we failed to wait for write concern on the first attempt. In order to
        // differentiate, fetch the entry and check.
        if (retry > 1 && status == ErrorCodes::DuplicateKey) {
            LOG(1) << "Insert retry failed because of duplicate key error, rechecking.";

            // NOTE(review): the right-hand side of this assignment is missing from the excerpt.
            auto fetchDuplicate =
            if (!fetchDuplicate.isOK()) {
                return fetchDuplicate.getStatus();

            auto existingDocs = fetchDuplicate.getValue().value;
            if (existingDocs.empty()) {
                // Nothing is there now, so the duplicate came from a concurrent change; surface
                // the original DuplicateKey status with extra context.
                return {status.withContext(
                    stream() << "DuplicateKey error was returned after a retry attempt, but no "
                                "documents were found. This means a concurrent change occurred "
                                "together with the retries.")};

            invariant(existingDocs.size() == 1);

            BSONObj existing = std::move(existingDocs.front());
            if (existing.woCompare(doc) == 0) {
                // Documents match, so treat the operation as success
                return Status::OK();

        // Any other outcome (including success) is returned as-is.
        return status;

Example #24
// Builds a new CollectionMetadata, derived from this one, for the state after 'chunk' is migrated
// off this shard, stamped with 'newShardVersion'.
//
// Returns NULL, with a reason in '*errMsg' (also logged as a warning), when:
//   - this shard's _chunksMap does not contain exactly the range of 'chunk';
//   - 'chunk' is the only chunk left and 'newShardVersion' is set (non-zero);
//   - otherwise, 'newShardVersion' is not greater than the current _shardVersion.
//
// 'errMsg' may be NULL, in which case the message is only logged. On success the returned raw
// pointer has been released from its unique_ptr, so ownership passes to the caller.
//
// NOTE(review): the visible body copies _chunksMap without removing 'chunk' from the copy -
// confirm against the full source whether lines are missing from this excerpt.
CollectionMetadata* CollectionMetadata::cloneMigrate(const ChunkType& chunk,
                                                     const ChunkVersion& newShardVersion,
                                                     string* errMsg) const {
    // The error message string is optional.
    string dummy;
    if (errMsg == NULL) {
        errMsg = &dummy;

    // Check that we have the exact chunk that will be subtracted.
    if (!rangeMapContains(_chunksMap, chunk.getMin(), chunk.getMax())) {
        *errMsg = stream() << "cannot remove chunk "
                           << rangeToString(chunk.getMin(), chunk.getMax())
                           << ", this shard does not contain the chunk";

        // If the range at least overlaps chunks we own, list them to make the error actionable.
        if (rangeMapOverlaps(_chunksMap, chunk.getMin(), chunk.getMax())) {
            RangeVector overlap;
            getRangeMapOverlap(_chunksMap, chunk.getMin(), chunk.getMax(), &overlap);

            *errMsg += stream() << " and it overlaps " << overlapToString(overlap);

        warning() << *errMsg;
        return NULL;

    // If left with no chunks, check that the version is zero.
    if (_chunksMap.size() == 1) {
        if (newShardVersion.isSet()) {
            *errMsg = stream() << "cannot set shard version to non-zero value "
                               << newShardVersion.toString() << " when removing last chunk "
                               << rangeToString(chunk.getMin(), chunk.getMax());

            warning() << *errMsg;
            return NULL;
    // Can't move version backwards when subtracting chunks.  This is what guarantees that
    // no read or write would be taken once we subtract data from the current shard.
    else if (newShardVersion <= _shardVersion) {
        *errMsg = stream() << "cannot remove chunk "
                           << rangeToString(chunk.getMin(), chunk.getMax())
                           << " because the new shard version " << newShardVersion.toString()
                           << " is not greater than the current shard version "
                           << _shardVersion.toString();

        warning() << *errMsg;
        return NULL;

    // Copy the current state into the clone and stamp it with the new shard version.
    unique_ptr<CollectionMetadata> metadata(new CollectionMetadata);
    metadata->_keyPattern = this->_keyPattern;
    metadata->_pendingMap = this->_pendingMap;
    metadata->_chunksMap = this->_chunksMap;
    metadata->_shardVersion = newShardVersion;
    // The collection version never moves backwards: keep the larger of the two.
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion;

    return metadata.release();
Example #25
// Checks the config metadata version and, when it is older than CURRENT_CONFIG_VERSION, upgrades
// it step by step while holding the "configUpgrade" distributed lock.
//
// Parameters:
//   catalogManager     - used to read the version, read the balancer settings and take the lock.
//   upgrade            - must be true to upgrade from a non-empty version.
//   initialVersionInfo - NOTE(review): not referenced in the lines visible here.
//   versionInfo        - receives the loaded version; updated after every upgrade step.
//   errMsg             - optional; receives the failure reason.
//
// Returns true when the version is compatible (already, or after a successful upgrade) and false
// otherwise.
bool checkAndUpgradeConfigVersion(CatalogManager* catalogManager,
                                  bool upgrade,
                                  VersionType* initialVersionInfo,
                                  VersionType* versionInfo,
                                  string* errMsg) {
    // The error message string is optional; fall back to a local so writes below are safe.
    string dummy;
    if (!errMsg) {
        errMsg = &dummy;

    Status getConfigStatus = getConfigVersion(catalogManager, versionInfo);
    if (!getConfigStatus.isOK()) {
        *errMsg = stream() << "could not load config version for upgrade"
                           << causedBy(getConfigStatus);
        return false;


    VersionStatus comp = isConfigVersionCompatible(*versionInfo, errMsg);

    if (comp == VersionStatus_Incompatible)
        return false;
    if (comp == VersionStatus_Compatible)
        return true;

    invariant(comp == VersionStatus_NeedUpgrade);

    // Our current config version is now greater than the current version, so we should upgrade
    // if possible.

    // The first empty version is technically an upgrade, but has special semantics
    bool isEmptyVersion = versionInfo->getCurrentVersion() == UpgradeHistory_EmptyVersion;

    // First check for the upgrade flag (but no flag is needed if we're upgrading from empty)
    if (!isEmptyVersion && !upgrade) {
        *errMsg = stream() << "newer version " << CURRENT_CONFIG_VERSION
                           << " of mongo config metadata is required, "
                           << "current version is " << versionInfo->getCurrentVersion() << ", "
                           << "need to run mongos with --upgrade";

        return false;

    // Contact the config servers to make sure all are online - otherwise we wait a long time
    // for locks.
    if (!_checkConfigServersAlive(catalogManager->connectionString(), errMsg)) {
        // Wrap the underlying reason with a message that says which operation needed the servers.
        if (isEmptyVersion) {
            *errMsg = stream() << "all config servers must be reachable for initial"
                               << " config database creation" << causedBy(errMsg);
        } else {
            *errMsg = stream() << "all config servers must be reachable for config upgrade"
                               << causedBy(errMsg);

        return false;

    // Check whether or not the balancer is online, if it is online we will not upgrade
    // (but we will initialize the config server)
    if (!isEmptyVersion) {
        auto balSettingsResult = catalogManager->getGlobalSettings(SettingsType::BalancerDocKey);
        if (balSettingsResult.isOK()) {
            SettingsType balSettings = balSettingsResult.getValue();
            if (!balSettings.getBalancerStopped()) {
                *errMsg = stream() << "balancer must be stopped for config upgrade"
                                   << causedBy(errMsg);
                // NOTE(review): no return is visible after this message is set - confirm whether
                // a line is missing from this excerpt.

    // Acquire a lock for the upgrade process.
    // We want to ensure that only a single mongo process is upgrading the config server at a
    // time.

    string whyMessage(stream() << "upgrading config database to new format v"
                               << CURRENT_CONFIG_VERSION);
    // 20 * 60 * 1000 ms = 20 minutes.
    auto lockTimeout = stdx::chrono::milliseconds(20 * 60 * 1000);
    auto scopedDistLock =
        catalogManager->getDistLockManager()->lock("configUpgrade", whyMessage, lockTimeout);
    if (!scopedDistLock.isOK()) {
        *errMsg = scopedDistLock.getStatus().toString();
        return false;

    // Double-check compatibility inside the upgrade lock
    // Another process may have won the lock earlier and done the upgrade for us, check
    // if this is the case.

    getConfigStatus = getConfigVersion(catalogManager, versionInfo);
    if (!getConfigStatus.isOK()) {
        *errMsg = stream() << "could not reload config version for upgrade"
                           << causedBy(getConfigStatus);
        return false;


    comp = isConfigVersionCompatible(*versionInfo, errMsg);

    if (comp == VersionStatus_Incompatible)
        return false;
    if (comp == VersionStatus_Compatible)
        return true;

    invariant(comp == VersionStatus_NeedUpgrade);

    // Run through the upgrade steps necessary to bring our config version to the current
    // version

    log() << "starting upgrade of config server from v" << versionInfo->getCurrentVersion()
          << " to v" << CURRENT_CONFIG_VERSION;

    ConfigUpgradeRegistry registry(createRegistry());

    while (versionInfo->getCurrentVersion() < CURRENT_CONFIG_VERSION) {
        // Remember where this step started so a step that does not advance can be detected.
        int fromVersion = versionInfo->getCurrentVersion();

        // Run the next upgrade process and replace versionInfo with the result of the
        // upgrade.

        if (!_nextUpgrade(catalogManager, registry, *versionInfo, versionInfo, errMsg)) {
            return false;

        // Ensure we're making progress here
        if (versionInfo->getCurrentVersion() <= fromVersion) {
            *errMsg = stream() << "bad v" << fromVersion << " config version upgrade, "
                               << "version did not increment and is now "
                               << versionInfo->getCurrentVersion();

            return false;

    invariant(versionInfo->getCurrentVersion() == CURRENT_CONFIG_VERSION);

    log() << "upgrade of config server to v" << versionInfo->getCurrentVersion() << " successful";

    return true;
Example #26
// Acquires the collection distributed lock for this chunk migration and snapshots the sharding
// metadata, using the operation context and key bounds stored on this object (_txn, _minKey,
// _maxKey).
//
// Lock, refresh and zero-version failures are logged and returned as a non-OK Status. An epoch
// mismatch or a chunk-boundary mismatch is logged and thrown as SendStaleConfigException carrying
// _collectionVersion and _shardVersion. On success a pointer to the ScopedDistLock held by
// _distLockStatus is returned.
//
// NOTE(review): the return type is not visible on the signature line in this excerpt.
ChunkMoveOperationState::acquireMoveMetadata() {
    // Get the distributed lock
    const string whyMessage(stream() << "migrating chunk [" << _minKey << ", " << _maxKey << ") in "
                                     << _nss.ns());
    _distLockStatus = grid.forwardingCatalogManager()->distLock(_txn, _nss.ns(), whyMessage);

    if (!_distLockStatus->isOK()) {
        const string msg = stream() << "could not acquire collection lock for " << _nss.ns()
                                    << " to migrate chunk [" << _minKey << "," << _maxKey << ")"
                                    << causedBy(_distLockStatus->getStatus());
        warning() << msg;
        // Keep the lock attempt's error code, but return the more descriptive message.
        return Status(_distLockStatus->getStatus().code(), msg);

    ShardingState* const shardingState = ShardingState::get(_txn);

    // Snapshot the metadata
    // The refresh writes the resulting shard version into _shardVersion (passed by address).
    Status refreshStatus = shardingState->refreshMetadataNow(_txn, _nss.ns(), &_shardVersion);
    if (!refreshStatus.isOK()) {
        const string msg = stream() << "moveChunk cannot start migrate of chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << causedBy(refreshStatus.reason());
        warning() << msg;
        return Status(refreshStatus.code(), msg);

    if (_shardVersion.majorVersion() == 0) {
        // It makes no sense to migrate if our version is zero and we have no chunks
        const string msg = stream() << "moveChunk cannot start migrate of chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << " with zero shard version";
        warning() << msg;
        return Status(ErrorCodes::IncompatibleShardingMetadata, msg);

        // NOTE(review): the lines below are indented one level deeper than their neighbours; the
        // construct that originally enclosed them is not visible in this excerpt.

        // Mongos >= v3.2 sends the full version, v3.0 only sends the epoch.
        // TODO(SERVER-20742): Stop parsing epoch separately after 3.2.
        auto& operationVersion = OperationShardVersion::get(_txn);
        if (operationVersion.hasShardVersion()) {
            // Prefer the version attached to the operation and derive the epoch from it.
            _collectionVersion = operationVersion.getShardVersion(_nss);
            _collectionEpoch = _collectionVersion.epoch();
        }  // else the epoch will already be set from the parsing of the ChunkMoveOperationState

        if (_collectionEpoch != _shardVersion.epoch()) {
            const string msg = stream() << "moveChunk cannot move chunk "
                                        << "[" << _minKey << "," << _maxKey << "), "
                                        << "collection may have been dropped. "
                                        << "current epoch: " << _shardVersion.epoch()
                                        << ", cmd epoch: " << _collectionEpoch;
            warning() << msg;
            throw SendStaleConfigException(_nss.toString(), msg, _collectionVersion, _shardVersion);

    _collMetadata = shardingState->getCollectionMetadata(_nss.ns());

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(_collMetadata->getCollVersion() >= _shardVersion);

    // With nonzero shard version, we must have a shard key
    // NOTE(review): no statement follows the comment above in this excerpt - confirm whether a
    // shard key check is missing here.

    ChunkType origChunk;
    // Fail if there is no chunk at _minKey, or if the chunk found there does not have exactly the
    // requested min/max (woCompare() is non-zero when the two keys differ).
    if (!_collMetadata->getNextChunk(_minKey, &origChunk) ||
        origChunk.getMin().woCompare(_minKey) || origChunk.getMax().woCompare(_maxKey)) {
        // Our boundaries are different from those passed in
        const string msg = stream() << "moveChunk cannot find chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << " to migrate, the chunk boundaries may be stale";
        warning() << msg;
        throw SendStaleConfigException(_nss.toString(), msg, _collectionVersion, _shardVersion);

    // Success: hand back the lock object that stays owned by _distLockStatus.
    return &_distLockStatus->getValue();
Example #27
// Runs the commit phase of a chunk migration on the donor (FROM) shard.
//
// Sequence visible in this excerpt:
//   1. Start a metadata operation through ShardingStateRecovery and, under the collection X lock,
//      "donate" the chunk via ShardingState::donateChunk() at version 'myVersion'.
//   2. Send _recvChunkCommit to the recipient (_toShardCS). If that fails, or the
//      failMigrationCommit fail point is active, undo the donation and return an error.
//   3. Build the config 'updates' entries: one for the moved chunk (now owned by _toShard) and,
//      when the donor still owns chunks, one that bumps another chunk to 'nextVersion'. Also build
//      a precondition whose expected result carries 'originalCollVersion' as the lastmod.
//   4. Apply them with applyChunkOpsDeprecated(). PrepareConfigsFailed undoes the donation and
//      returns an error; any other failure triggers a check of the newest chunk version against
//      'nextVersion' to find out whether the commit actually happened.
//   5. Log a "moveChunk.commit" change entry, clear _isRunning and return OK.
//
// NOTE(review): a number of statements are truncated or missing in this excerpt (flagged
// inline); confirm the details against the full source before relying on this description.
Status ChunkMoveOperationState::commitMigration() {

    log() << "About to enter migrate critical section";

    // We're under the collection distributed lock here, so no other migrate can change maxVersion
    // or CollectionMetadata state.
    ShardingState* const shardingState = ShardingState::get(_txn);

    Status startStatus = ShardingStateRecovery::startMetadataOp(_txn);
    if (!startStatus.isOK())
        return startStatus;


    // Remember the collection version from before the donation; it is used in the precondition.
    const ChunkVersion originalCollVersion = getCollMetadata()->getCollVersion();

    ChunkVersion myVersion = originalCollVersion;

        // Donate the chunk while holding the collection lock in exclusive mode.
        ScopedTransaction transaction(_txn, MODE_IX);
        Lock::DBLock lk(_txn->lockState(), _nss.db(), MODE_IX);
        Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

        invariant(myVersion > shardingState->getVersion(_nss.ns()));

        // Bump the metadata's version up and "forget" about the chunk being moved. This is
        // not the commit point, but in practice the state in this shard won't change until
        // the commit it done.
        shardingState->donateChunk(_txn, _nss.ns(), _minKey, _maxKey, myVersion);

    log() << "moveChunk setting version to: " << myVersion << migrateLog;

    // We're under the collection lock here, too, so we can undo the chunk donation because
    // no other state change could be ongoing
    BSONObj res;
    // Starts as an error so that a path which never assigns it cannot be mistaken for success.
    Status recvChunkCommitStatus{ErrorCodes::InternalError, "status not set"};

    try {
        ScopedDbConnection connTo(_toShardCS, 35.0);
        connTo->runCommand("admin", BSON("_recvChunkCommit" << 1), res);
        recvChunkCommitStatus = getStatusFromCommandResult(res);
    } catch (const DBException& e) {
        const string msg = stream() << "moveChunk could not contact to shard " << _toShard
                                    << " to commit transfer" << causedBy(e);
        warning() << msg;
        recvChunkCommitStatus = Status(e.toStatus().code(), msg);

    // Test hook: turn a successful recipient commit into a failure.
    if (MONGO_FAIL_POINT(failMigrationCommit) && recvChunkCommitStatus.isOK()) {
        recvChunkCommitStatus =
            Status(ErrorCodes::InternalError, "Failing _recvChunkCommit due to failpoint.");

    if (!recvChunkCommitStatus.isOK()) {
        log() << "moveChunk migrate commit not accepted by TO-shard: " << res
              << " resetting shard version to: " << getShardVersion() << migrateLog;

            ScopedTransaction transaction(_txn, MODE_IX);
            Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX);
            Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

            log() << "moveChunk collection lock acquired to reset shard version from "
                     "failed migration";

            // Revert the chunk manager back to the state before "forgetting" about the chunk
            shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata());

        log() << "Shard version successfully reset to clean up failed migration";

        return Status(recvChunkCommitStatus.code(),
                      stream() << "_recvChunkCommit failed: " << causedBy(recvChunkCommitStatus));

    log() << "moveChunk migrate commit accepted by TO-shard: " << res << migrateLog;

    BSONArrayBuilder updates;

        // Update for the chunk being moved
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        // "o": the new chunk document - same bounds, version 'myVersion', owned by _toShard.
        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey));
        myVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _nss.ns());
        n.append(ChunkType::min(), _minKey);
        n.append(ChunkType::max(), _maxKey);
        n.append(ChunkType::shard(), _toShard);

        // "o2": selects the chunk document by its generated id.
        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey));
        // NOTE(review): nothing visible here finishes the sub-objects or adds 'op' to 'updates'.


    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = myVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    // NOTE(review): the initializer of 'bumpedCollMetadata' is cut off in this excerpt.
    const std::shared_ptr<CollectionMetadata> bumpedCollMetadata(
    if (bumpedCollMetadata->getNumChunks() > 0) {
        // get another chunk on that shard
        ChunkType bumpChunk;
        invariant(bumpedCollMetadata->getNextChunk(bumpedCollMetadata->getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();

        // The chunk being bumped must not be the chunk that is being moved.
        dassert(bumpMin.woCompare(_minKey) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        nextVersion.incMinor();  // same as used on donateChunk

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _nss.ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _fromShard);

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin));


        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _nss.ns() << "'" << migrateLog;
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _nss.ns() << "'"
              << migrateLog;

    // Precondition: the query sorts this collection's chunks by lastmod descending, and the
    // expected result carries 'originalCollVersion' as the lastmod.
    // NOTE(review): the call that consumes the BSON("query" ...) expression is cut off here.
    BSONArrayBuilder preCond;
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
                 BSON("query" << BSON(ChunkType::ns(_nss.ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(), originalCollVersion.toLong());


    Status applyOpsStatus{Status::OK()};
    try {
        // For testing migration failures
        // NOTE(review): the argument lists of both throw expressions in this try block are cut
        // off in this excerpt.
        if (MONGO_FAIL_POINT(failMigrationConfigWritePrepare)) {
            throw DBException("mock migration failure before config write",

        applyOpsStatus =
            grid.catalogManager(_txn)->applyChunkOpsDeprecated(_txn, updates.arr(), preCond.arr());

        if (MONGO_FAIL_POINT(failMigrationApplyOps)) {
            throw SocketException(SocketException::RECV_ERROR,
    } catch (const DBException& ex) {
        // Exceptions are folded into applyOpsStatus so both failure styles share the handling
        // below.
        warning() << ex << migrateLog;
        applyOpsStatus = ex.toStatus();

    if (applyOpsStatus == ErrorCodes::PrepareConfigsFailed) {
        // In the process of issuing the migrate commit, the SyncClusterConnection checks that
        // the config servers are reachable. If they are not, we are sure that the applyOps
        // command was not sent to any of the configs, so we can safely back out of the
        // migration here, by resetting the shard version that we bumped up to in the
        // donateChunk() call above.
        log() << "About to acquire moveChunk coll lock to reset shard version from "
              << "failed migration";

            ScopedTransaction transaction(_txn, MODE_IX);
            Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX);
            Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

            // Revert the metadata back to the state before "forgetting" about the chunk
            shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata());

        log() << "Shard version successfully reset to clean up failed migration";

        const string msg = stream() << "Failed to send migrate commit to configs "
                                    << causedBy(applyOpsStatus);
        return Status(applyOpsStatus.code(), msg);
    } else if (!applyOpsStatus.isOK()) {
        // This could be a blip in the connectivity. Wait out a few seconds and check if the
        // commit request made it.
        // If the commit made it to the config, we'll see the chunk in the new shard and
        // there's no further action to be done.
        // If the commit did not make it, currently the only way to fix this state is to
        // bounce the mongod so that the old state (before migrating) is brought in.

        warning() << "moveChunk commit outcome ongoing" << migrateLog;

        // Look for the chunk in this shard whose version got bumped. We assume that if that
        // mod made it to the config server, then applyOps was successful.
        try {
            std::vector<ChunkType> newestChunk;
            // NOTE(review): the call that fills 'newestChunk' is cut off in this excerpt; only
            // its lastmod-descending sort argument is visible.
            Status status =
                                                     BSON(ChunkType::DEPRECATED_lastmod() << -1),

            // Left default-constructed when no chunk was returned.
            ChunkVersion checkVersion;
            if (!newestChunk.empty()) {
                invariant(newestChunk.size() == 1);
                checkVersion = newestChunk[0].getVersion();

            if (checkVersion.equals(nextVersion)) {
                log() << "moveChunk commit confirmed" << migrateLog;
            } else {
                error() << "moveChunk commit failed: version is at " << checkVersion
                        << " instead of " << nextVersion << migrateLog;
                error() << "TERMINATING" << migrateLog;
                // NOTE(review): whatever follows the "TERMINATING" log is not visible here.

        } catch (...) {
            error() << "moveChunk failed to get confirmation of commit" << migrateLog;
            error() << "TERMINATING" << migrateLog;




    // Migration is done, just log some diagnostics information
    BSONObj chunkInfo =
        BSON("min" << _minKey << "max" << _maxKey << "from" << _fromShard << "to" << _toShard);

    BSONObjBuilder commitInfo;
    // NOTE(review): the body of this 'if', and any use of 'chunkInfo', are missing from this
    // excerpt.
    if (res["counts"].type() == Object) {

    grid.catalogManager(_txn)->logChange(_txn, "moveChunk.commit", _nss.ns(), commitInfo.obj());

    _isRunning = false;

    return Status::OK();
Example #28
// Builds a new CollectionMetadata, derived from this one, in which 'chunk' has been split at each
// of 'splitKeys', stamped with 'newShardVersion'.
//
// Returns NULL, with a reason in '*errMsg' (also logged as a warning), when:
//   - 'newShardVersion' is not greater than the current _shardVersion;
//   - this shard's _chunksMap does not contain exactly the range of 'chunk';
//   - any split key is rejected by rangeContains() for the range of 'chunk'.
//
// The split keys must be strictly increasing; this is enforced with an invariant rather than an
// error return. 'errMsg' may be NULL. On success the returned raw pointer has been released from
// its unique_ptr, so ownership passes to the caller.
CollectionMetadata* CollectionMetadata::cloneSplit(const ChunkType& chunk,
                                                   const vector<BSONObj>& splitKeys,
                                                   const ChunkVersion& newShardVersion,
                                                   string* errMsg) const {
    // The error message string is optional.
    string dummy;
    if (errMsg == NULL) {
        errMsg = &dummy;

    // The version required in both resulting chunks could be simply an increment in the
    // minor portion of the current version.  However, we are enforcing uniqueness over the
    // attributes <ns, version> of the configdb collection 'chunks'.  So in practice, a
    // migrate somewhere may force this split to pick up a version that has the major
    // portion higher than the one that this shard has been using.
    // TODO drop the uniqueness constraint and tighten the check below so that only the
    // minor portion of version changes
    if (newShardVersion <= _shardVersion) {
        *errMsg = stream() << "cannot split chunk " << rangeToString(chunk.getMin(), chunk.getMax())
                           << ", new shard version " << newShardVersion.toString()
                           << " is not greater than current version " << _shardVersion.toString();

        warning() << *errMsg;
        return NULL;

    // Check that we have the exact chunk that will be subtracted.
    if (!rangeMapContains(_chunksMap, chunk.getMin(), chunk.getMax())) {
        *errMsg = stream() << "cannot split chunk " << rangeToString(chunk.getMin(), chunk.getMax())
                           << ", this shard does not contain the chunk";

        // If the range at least overlaps chunks we own, list them to make the error actionable.
        if (rangeMapOverlaps(_chunksMap, chunk.getMin(), chunk.getMax())) {
            RangeVector overlap;
            getRangeMapOverlap(_chunksMap, chunk.getMin(), chunk.getMax(), &overlap);

            *errMsg += stream() << " and it overlaps " << overlapToString(overlap);

        warning() << *errMsg;
        return NULL;

    // Check that the split key is valid
    for (vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it) {
        if (!rangeContains(chunk.getMin(), chunk.getMax(), *it)) {
            *errMsg = stream() << "cannot split chunk "
                               << rangeToString(chunk.getMin(), chunk.getMax()) << " at key "
                               << *it;

            warning() << *errMsg;
            return NULL;

    // Copy the current state into the clone and stamp it with the new shard version.
    unique_ptr<CollectionMetadata> metadata(new CollectionMetadata);
    metadata->_keyPattern = this->_keyPattern;
    metadata->_pendingMap = this->_pendingMap;
    metadata->_chunksMap = this->_chunksMap;
    metadata->_shardVersion = newShardVersion;  // will increment 2nd, 3rd,... chunks below
    // NOTE(review): no increment of the version is visible in the loop below - confirm whether a
    // line is missing from this excerpt.

    // Walk the split keys in order. Each key ends the chunk that starts at 'startKey' and begins
    // a new chunk that, for now, runs to the original chunk's max.
    BSONObj startKey = chunk.getMin();
    for (vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it) {
        BSONObj split = *it;
        // Keys must be strictly increasing.
        invariant(split.woCompare(startKey) > 0);
        metadata->_chunksMap[startKey] = split.getOwned();
        metadata->_chunksMap.insert(make_pair(split.getOwned(), chunk.getMax().getOwned()));
        startKey = split;

    // The collection version never moves backwards: keep the larger of the two.
    metadata->_collVersion =
        metadata->_shardVersion > _collVersion ? metadata->_shardVersion : _collVersion;

    return metadata.release();
Example #29
// Verifies that the config metadata version is compatible with this binary and, when the config
// database is still at the empty version, initializes it to CURRENT_CONFIG_VERSION.
//
// Parameters:
//   txn             - operation context, passed to the distributed lock and to the version write.
//   catalogManager  - used to read the current version and to write the version document.
//   distLockManager - used to take the "configUpgrade" distributed lock.
//
// Returns Status::OK() when the version is already compatible or was initialized here.
// Returns IncompatibleShardingMetadata when the version is incompatible, or when an upgrade from
// a non-empty version would be required. Any failure from reading the version, checking that the
// config servers are alive, taking the lock or writing the version document is returned as-is.
Status checkAndInitConfigVersion(OperationContext* txn,
                                 CatalogManager* catalogManager,
                                 DistLockManager* distLockManager) {
    VersionType versionInfo;
    Status status = getConfigVersion(catalogManager, &versionInfo);
    if (!status.isOK()) {
        return status;
    }

    string errMsg;
    VersionStatus comp = isConfigVersionCompatible(versionInfo, &errMsg);

    if (comp == VersionStatus_Incompatible) {
        return {ErrorCodes::IncompatibleShardingMetadata, errMsg};
    }
    if (comp == VersionStatus_Compatible) {
        return Status::OK();
    }

    invariant(comp == VersionStatus_NeedUpgrade);

    // Only the empty version may be initialized here; any other old version is rejected.
    if (versionInfo.getCurrentVersion() != UpgradeHistory_EmptyVersion) {
        return {ErrorCodes::IncompatibleShardingMetadata,
                stream() << "newer version " << CURRENT_CONFIG_VERSION
                         << " of mongo config metadata is required, "
                         << "current version is " << versionInfo.getCurrentVersion()};
    }

    // Contact the config servers to make sure all are online - otherwise we wait a long time
    // for locks.
    status = _checkConfigServersAlive(grid.shardRegistry()->getConfigServerConnectionString());
    if (!status.isOK()) {
        return status;
    }

    // Acquire a lock for the upgrade process.
    // We want to ensure that only a single mongo process is upgrading the config server at a
    // time.

    string whyMessage(stream() << "initializing config database to new format v"
                               << CURRENT_CONFIG_VERSION);
    auto lockTimeout = stdx::chrono::minutes(20);
    auto scopedDistLock = distLockManager->lock(txn, "configUpgrade", whyMessage, lockTimeout);
    if (!scopedDistLock.isOK()) {
        return scopedDistLock.getStatus();
    }

    // Double-check compatibility inside the upgrade lock
    // Another process may have won the lock earlier and done the upgrade for us, check
    // if this is the case.

    status = getConfigVersion(catalogManager, &versionInfo);
    if (!status.isOK()) {
        return status;
    }

    comp = isConfigVersionCompatible(versionInfo, &errMsg);

    if (comp == VersionStatus_Incompatible) {
        return {ErrorCodes::IncompatibleShardingMetadata, errMsg};
    }
    if (comp == VersionStatus_Compatible) {
        return Status::OK();
    }

    invariant(comp == VersionStatus_NeedUpgrade);

    // Run through the upgrade steps necessary to bring our config version to the current
    // version

    log() << "initializing config server version to " << CURRENT_CONFIG_VERSION;

    status = makeConfigVersionDocument(txn, catalogManager);
    if (!status.isOK()) {
        return status;
    }

    log() << "initialization of config server to v" << CURRENT_CONFIG_VERSION << " successful";

    return Status::OK();
}
Example #30
// Builds a copy of this metadata in which the chunks covering the range minKey -> maxKey are
// replaced by a single chunk with those bounds, stamped with 'newShardVersion'.
//
// The merge is refused, with the reason written to *errMsg and logged as a warning, when:
//   - 'newShardVersion' is not greater than the current shard version;
//   - fewer than two chunks overlap the requested range;
//   - the first overlapping chunk does not start at 'minKey' or the last one does not end at
//     'maxKey';
//   - the overlapping chunks are not contiguous (there is a hole between two of them).
//
// Returns a heap-allocated CollectionMetadata released to the caller on success, or nullptr on
// any of the failures above. 'errMsg' is dereferenced unconditionally on the failure paths.
CollectionMetadata* CollectionMetadata::cloneMerge(const BSONObj& minKey,
                                                   const BSONObj& maxKey,
                                                   const ChunkVersion& newShardVersion,
                                                   string* errMsg) const {
    if (newShardVersion <= _shardVersion) {
        *errMsg = stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                           << ", new shard version " << newShardVersion.toString()
                           << " is not greater than current version " << _shardVersion.toString();

        warning() << *errMsg;
        return nullptr;
    }

    RangeVector overlap;
    getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap);

    if (overlap.empty() || overlap.size() == 1) {
        *errMsg = stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                           << (overlap.empty() ? ", no chunks found in this range"
                                               : ", only one chunk found in this range");

        warning() << *errMsg;
        return nullptr;
    }

    bool validStartEnd = true;
    bool validNoHoles = true;
    if (overlap.begin()->first.woCompare(minKey) != 0) {
        // First chunk doesn't start with minKey
        validStartEnd = false;
    } else if (overlap.rbegin()->second.woCompare(maxKey) != 0) {
        // Last chunk doesn't end with maxKey
        validStartEnd = false;
    } else {
        // Check that there are no holes: each chunk must begin where the previous one ended.
        BSONObj prevMaxKey = minKey;
        for (const auto& range : overlap) {
            if (range.first.woCompare(prevMaxKey) != 0) {
                validNoHoles = false;
                break;  // one hole is enough to reject the merge
            }
            prevMaxKey = range.second;
        }
    }

    if (!validStartEnd || !validNoHoles) {
        *errMsg = stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                           << ", overlapping chunks " << overlapToString(overlap)
                           << (!validStartEnd ? " do not have the same min and max key"
                                              : " are not all adjacent");

        warning() << *errMsg;
        return nullptr;
    }

    unique_ptr<CollectionMetadata> metadata(new CollectionMetadata);
    metadata->_keyPattern = this->_keyPattern;
    metadata->_pendingMap = this->_pendingMap;
    metadata->_chunksMap = this->_chunksMap;
    metadata->_rangesMap = this->_rangesMap;
    metadata->_shardVersion = newShardVersion;
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion;

    // Drop every chunk being merged before adding the combined one. The first overlapping chunk
    // is keyed by minKey, so (assuming _chunksMap is a unique-key map) inserting the merged range
    // without erasing first would leave the map unchanged.
    for (const auto& range : overlap) {
        metadata->_chunksMap.erase(range.first);
    }

    metadata->_chunksMap.insert(make_pair(minKey, maxKey));

    return metadata.release();
}