// Commits the donation of the chunk described by _args. Must be called while _state is
// kCriticalSection (enforced by the invariant below).
//
// Steps visible in this function:
//   1. Ask the clone driver to commit the clone on the recipient.
//   2. Build the preconditions and chunk updates and submit them to the config server through
//      applyChunkOpsDeprecated.
//   3. If that succeeds, replace _committedMetadata with a clone produced by cloneMigrate.
//      Otherwise log a change, refresh the collection metadata and inspect it to determine whether
//      the commit actually took effect.
//
// Returns Status::OK() once the end of the function is reached. The visible error returns carry
// the code of the status that failed (commitCloneStatus or applyOpsStatus).
Status MigrationSourceManager::commitDonateChunk(OperationContext* txn) {
    invariant(_state == kCriticalSection);
    // Guard whose callback invokes cleanupOnError(txn).
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Tell the recipient shard to fetch the latest changes
    Status commitCloneStatus = _cloneDriver->commitClone(txn);

    // Test hook: when the failMigrationCommit failpoint is active, an otherwise successful
    // commitClone is replaced with an InternalError.
    if (MONGO_FAIL_POINT(failMigrationCommit) && commitCloneStatus.isOK()) {
        commitCloneStatus = {ErrorCodes::InternalError,
                             "Failing _recvChunkCommit due to failpoint."};

    // A failed clone commit is reported to the caller with its original error code.
    if (!commitCloneStatus.isOK()) {
        return {commitCloneStatus.code(),
                str::stream() << "commit clone failed due to " << commitCloneStatus.toString()};

    // Generate the next collection version.
    ChunkVersion uncommittedCollVersion = _committedMetadata->getCollVersion();

    // applyOps preparation for reflecting the uncommitted metadata on the config server

    // Preconditions
    BSONArrayBuilder preCond;
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
                 BSON("query" << BSON(ChunkType::ns(_args.getNss().ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here


    // Update for the chunk which is being donated
    BSONArrayBuilder updates;
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        // "o": the chunk document to write, stamped with uncommittedCollVersion and the
        // recipient shard id.
        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        uncommittedCollVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), _args.getMinKey());
        n.append(ChunkType::max(), _args.getMaxKey());
        n.append(ChunkType::shard(), _args.getToShardId());

        // "o2": identifies the chunk document by the id generated from the namespace and min key.
        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));


    // Update for the chunk being moved

    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = uncommittedCollVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    if (_committedMetadata->getNumChunks() > 1) {
        ChunkType bumpChunk;
        invariant(_committedMetadata->getDifferentChunk(_args.getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();

        // The chunk chosen for the bump must not be the chunk being donated.
        dassert(bumpMin.woCompare(_args.getMinKey()) != 0);

        // Same non-upserting update shape as above, but for the bumped chunk, which stays on the
        // donor (FROM) shard and is stamped with nextVersion.
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _args.getFromShardId());

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));


        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _args.getNss().ns() << "'";
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _args.getNss().ns() << "'";


    // Send the accumulated updates and preconditions to the config server via the catalog client.
    Status applyOpsStatus = grid.catalogClient(txn)->applyChunkOpsDeprecated(
        txn, updates.arr(), preCond.arr(), _args.getNss().ns(), nextVersion);

    // Test hook: the failCommitMigrationCommand failpoint overrides the applyOps result with an
    // InternalError, which forces the verification path below.
    if (MONGO_FAIL_POINT(failCommitMigrationCommand)) {
        applyOpsStatus = Status(ErrorCodes::InternalError,
                                "Failpoint 'failCommitMigrationCommand' generated error");

    if (applyOpsStatus.isOK()) {
        // Now that applyOps succeeded and the new collection version is committed, update the
        // collection metadata to the new collection version and forget the migrated chunk.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        ChunkType migratingChunkToForget;
        _committedMetadata =
            _committedMetadata->cloneMigrate(migratingChunkToForget, uncommittedCollVersion);
        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
    } else {
        // This could be an unrelated error (e.g. network error). Check whether the metadata update
        // succeeded by refreshing the collection metadata from the config server and checking that
        // the original chunks no longer exist.

        warning() << "Migration metadata commit may have failed: refreshing metadata to check"
                  << causedBy(applyOpsStatus);

        // Need to get the latest optime in case the refresh request goes to a secondary --
        // otherwise the read won't wait for the write that applyChunkOpsDeprecated may have done.
        Status status = grid.catalogClient(txn)->logChange(
            BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey() << "from"
                       << _args.getFromShardId()
                       << "to"
                       << _args.getToShardId()));
        if (!status.isOK()) {
                 str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey() << ","
                               << _args.getMaxKey()
                               << ") due to "
                               << causedBy(applyOpsStatus)
                               << ", and updating the optime with a write before refreshing the "
                               << "metadata also failed: "
                               << causedBy(status)});

        // Refresh this shard's view of the collection metadata; shardVersion receives the
        // output of refreshMetadataNow.
        ShardingState* const shardingState = ShardingState::get(txn);
        ChunkVersion shardVersion;
        Status refreshStatus =
            shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
                         str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey()
                                       << ","
                                       << _args.getMaxKey()
                                       << ") due to "
                                       << causedBy(applyOpsStatus)
                                       << ", and refreshing collection metadata failed: "
                                       << causedBy(refreshStatus)});

            // Inspect the refreshed metadata under intent-shared locks.
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

            auto css = CollectionShardingState::get(txn, _args.getNss());
            std::shared_ptr<CollectionMetadata> refreshedMetadata = css->getMetadata();

            if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
                invariant(refreshedMetadata->getCollVersion() ==

                // After refresh, the collection metadata indicates that the donor shard still owns
                // the chunk, so no migration changes were written to the config server metadata.
                return {applyOpsStatus.code(),
                        str::stream() << "Migration was not committed, applyOps failed: "
                                      << causedBy(applyOpsStatus)};

            ChunkVersion refreshedCollectionVersion = refreshedMetadata->getCollVersion();
            if (!refreshedCollectionVersion.equals(nextVersion)) {
                // The refreshed collection metadata's collection version does not match the control
                // chunk's updated collection version, which should now be the highest. The control
                // chunk was not committed, but the migrated chunk was. This state is not
                // recoverable.
                                 str::stream() << "Migration was partially committed, state is "
                                               << "unrecoverable. applyOps error: "
                                               << causedBy(applyOpsStatus)});



                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    return Status::OK();
// Commits the migration of the chunk [_minKey, _maxKey) of _nss from _fromShard to _toShard.
//
// Steps visible in this function:
//   1. Start a metadata operation through ShardingStateRecovery and donate the chunk locally at
//      myVersion.
//   2. Send _recvChunkCommit to the TO-shard. If that fails, undo the local donation and return
//      the failure.
//   3. Build the chunk updates and preconditions and send them to the config server with
//      applyChunkOpsDeprecated.
//   4. On PrepareConfigsFailed, undo the local donation and return the error. On any other
//      failure, query for the newest chunk version and compare it with nextVersion.
//   5. Log a "moveChunk.commit" change entry and clear _isRunning.
//
// Returns Status::OK() once the end of the function is reached. The visible error returns carry
// the code of the status that failed.
Status ChunkMoveOperationState::commitMigration() {

    log() << "About to enter migrate critical section";

    // We're under the collection distributed lock here, so no other migrate can change maxVersion
    // or CollectionMetadata state.
    ShardingState* const shardingState = ShardingState::get(_txn);

    // Nothing has been changed yet, so a failure here is returned to the caller directly.
    Status startStatus = ShardingStateRecovery::startMetadataOp(_txn);
    if (!startStatus.isOK())
        return startStatus;


    // The collection version before the donation; it is reused for the applyOps precondition
    // further below.
    const ChunkVersion originalCollVersion = getCollMetadata()->getCollVersion();

    ChunkVersion myVersion = originalCollVersion;

        ScopedTransaction transaction(_txn, MODE_IX);
        Lock::DBLock lk(_txn->lockState(), _nss.db(), MODE_IX);
        Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

        invariant(myVersion > shardingState->getVersion(_nss.ns()));

        // Bump the metadata's version up and "forget" about the chunk being moved. This is
        // not the commit point, but in practice the state in this shard won't change until
        // the commit is done.
        shardingState->donateChunk(_txn, _nss.ns(), _minKey, _maxKey, myVersion);

    log() << "moveChunk setting version to: " << myVersion << migrateLog;

    // We're under the collection lock here, too, so we can undo the chunk donation because
    // no other state change could be ongoing
    BSONObj res;
    Status recvChunkCommitStatus{ErrorCodes::InternalError, "status not set"};

    // Ask the TO-shard to commit. A DBException raised while contacting it is converted into a
    // Status carrying the exception's code.
    try {
        ScopedDbConnection connTo(_toShardCS, 35.0);
        connTo->runCommand("admin", BSON("_recvChunkCommit" << 1), res);
        recvChunkCommitStatus = getStatusFromCommandResult(res);
    } catch (const DBException& e) {
        const string msg = stream() << "moveChunk could not contact to shard " << _toShard
                                    << " to commit transfer" << causedBy(e);
        warning() << msg;
        recvChunkCommitStatus = Status(e.toStatus().code(), msg);

    // Test hook: when the failMigrationCommit failpoint is active, an otherwise successful
    // _recvChunkCommit is replaced with an InternalError.
    if (MONGO_FAIL_POINT(failMigrationCommit) && recvChunkCommitStatus.isOK()) {
        recvChunkCommitStatus =
            Status(ErrorCodes::InternalError, "Failing _recvChunkCommit due to failpoint.");

    if (!recvChunkCommitStatus.isOK()) {
        log() << "moveChunk migrate commit not accepted by TO-shard: " << res
              << " resetting shard version to: " << getShardVersion() << migrateLog;

            ScopedTransaction transaction(_txn, MODE_IX);
            Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX);
            Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

            log() << "moveChunk collection lock acquired to reset shard version from "
                     "failed migration";

            // Revert the chunk manager back to the state before "forgetting" about the chunk
            shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata());

        log() << "Shard version successfully reset to clean up failed migration";

        return Status(recvChunkCommitStatus.code(),
                      stream() << "_recvChunkCommit failed: " << causedBy(recvChunkCommitStatus));

    log() << "moveChunk migrate commit accepted by TO-shard: " << res << migrateLog;

    BSONArrayBuilder updates;

        // Update for the chunk being moved
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        // "o": the chunk document to write, stamped with myVersion and the TO-shard.
        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey));
        myVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _nss.ns());
        n.append(ChunkType::min(), _minKey);
        n.append(ChunkType::max(), _maxKey);
        n.append(ChunkType::shard(), _toShard);

        // "o2": identifies the chunk document by the id generated from the namespace and min key.
        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey));


    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = myVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    const std::shared_ptr<CollectionMetadata> bumpedCollMetadata(
    if (bumpedCollMetadata->getNumChunks() > 0) {
        // get another chunk on that shard
        ChunkType bumpChunk;
        invariant(bumpedCollMetadata->getNextChunk(bumpedCollMetadata->getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();

        // The chunk chosen for the bump must not be the chunk being moved.
        dassert(bumpMin.woCompare(_minKey) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        nextVersion.incMinor();  // same as used on donateChunk

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _nss.ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _fromShard);

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin));


        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _nss.ns() << "'" << migrateLog;
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _nss.ns() << "'"
              << migrateLog;

    // Precondition for the applyOps: built from a query over this namespace's chunks ordered by
    // descending lastmod, with an expected lastmod of originalCollVersion.
    BSONArrayBuilder preCond;
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
                 BSON("query" << BSON(ChunkType::ns(_nss.ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(), originalCollVersion.toLong());


    // Send the updates to the config server. Any DBException raised in this try block,
    // including those thrown by the failpoints, is converted into applyOpsStatus.
    Status applyOpsStatus{Status::OK()};
    try {
        // For testing migration failures
        if (MONGO_FAIL_POINT(failMigrationConfigWritePrepare)) {
            throw DBException("mock migration failure before config write",

        applyOpsStatus =
            grid.catalogManager(_txn)->applyChunkOpsDeprecated(_txn, updates.arr(), preCond.arr());

        if (MONGO_FAIL_POINT(failMigrationApplyOps)) {
            throw SocketException(SocketException::RECV_ERROR,
    } catch (const DBException& ex) {
        warning() << ex << migrateLog;
        applyOpsStatus = ex.toStatus();

    if (applyOpsStatus == ErrorCodes::PrepareConfigsFailed) {
        // In the process of issuing the migrate commit, the SyncClusterConnection checks that
        // the config servers are reachable. If they are not, we are sure that the applyOps
        // command was not sent to any of the configs, so we can safely back out of the
        // migration here, by resetting the shard version that we bumped up to in the
        // donateChunk() call above.
        log() << "About to acquire moveChunk coll lock to reset shard version from "
              << "failed migration";

            ScopedTransaction transaction(_txn, MODE_IX);
            Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX);
            Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

            // Revert the metadata back to the state before "forgetting" about the chunk
            shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata());

        log() << "Shard version successfully reset to clean up failed migration";

        const string msg = stream() << "Failed to send migrate commit to configs "
                                    << causedBy(applyOpsStatus);
        return Status(applyOpsStatus.code(), msg);
    } else if (!applyOpsStatus.isOK()) {
        // This could be a blip in the connectivity. Wait out a few seconds and check if the
        // commit request made it.
        // If the commit made it to the config, we'll see the chunk in the new shard and
        // there's no further action to be done.
        // If the commit did not make it, currently the only way to fix this state is to
        // bounce the mongod so that the old state (before migrating) is brought in.

        warning() << "moveChunk commit outcome ongoing" << migrateLog;

        // Look for the chunk in this shard whose version got bumped. We assume that if that
        // mod made it to the config server, then applyOps was successful.
        try {
            std::vector<ChunkType> newestChunk;
            Status status =
                                                     BSON(ChunkType::DEPRECATED_lastmod() << -1),

            // checkVersion keeps its default-constructed value when no chunk was returned.
            ChunkVersion checkVersion;
            if (!newestChunk.empty()) {
                invariant(newestChunk.size() == 1);
                checkVersion = newestChunk[0].getVersion();

            if (checkVersion.equals(nextVersion)) {
                log() << "moveChunk commit confirmed" << migrateLog;
            } else {
                error() << "moveChunk commit failed: version is at " << checkVersion
                        << " instead of " << nextVersion << migrateLog;
                error() << "TERMINATING" << migrateLog;

        } catch (...) {
            error() << "moveChunk failed to get confirmation of commit" << migrateLog;
            error() << "TERMINATING" << migrateLog;




    // Migration is done, just log some diagnostics information
    BSONObj chunkInfo =
        BSON("min" << _minKey << "max" << _maxKey << "from" << _fromShard << "to" << _toShard);

    BSONObjBuilder commitInfo;
    if (res["counts"].type() == Object) {

    grid.catalogManager(_txn)->logChange(_txn, "moveChunk.commit", _nss.ns(), commitInfo.obj());

    _isRunning = false;

    return Status::OK();
// Returns the ChunkManager held in _collections for the namespace `ns`, reloading it first when
// requested.
//
//   txn          - operation context forwarded to the load/reload calls.
//   ns           - namespace of the collection to look up.
//   shouldReload - request a reload; a cheap newest-chunk version check may short-circuit it.
//   forceReload  - request a reload that bypasses the version-check short-circuits guarded by
//                  !forceReload.
//
// Raises uassert 10181 when the collection is not sharded, and 14822 when it stopped being sharded
// while a new chunk manager was being loaded.
std::shared_ptr<ChunkManager> DBConfig::getChunkManager(OperationContext* txn,
                                                        const string& ns,
                                                        bool shouldReload,
                                                        bool forceReload) {
    BSONObj key;
    ChunkVersion oldVersion;
    std::shared_ptr<ChunkManager> oldManager;

        stdx::lock_guard<stdx::mutex> lk(_lock);

        // earlyReload: a reload was requested for a collection that is not currently recorded
        // as sharded.
        bool earlyReload = !_collections[ns].isSharded() && (shouldReload || forceReload);
        if (earlyReload) {
            // This is to catch cases where this is a new sharded collection.
            // Note: read the _reloadCount inside the _lock mutex, so _loadIfNeeded will always
            // be forced to perform a reload.
            const auto currentReloadIteration = _reloadCount.load();
            _loadIfNeeded(txn, currentReloadIteration);

        CollectionInfo& ci = _collections[ns];
        uassert(10181, str::stream() << "not sharded:" << ns, ci.isSharded());


        // No reload was requested, or the early reload above already ran: return the chunk
        // manager currently held for this collection.
        if (!(shouldReload || forceReload) || earlyReload) {
            return ci.getCM();

        key = ci.key().copy();

        // Remember the current manager and its version for the comparisons further below.
        if (ci.getCM()) {
            oldManager = ci.getCM();
            oldVersion = ci.getCM()->getVersion();


    // TODO: We need to keep this first one-chunk check in until we have a more efficient way of
    // creating/reusing a chunk manager, as doing so requires copying the full set of chunks
    // currently
    vector<ChunkType> newestChunk;
    if (oldVersion.isSet() && !forceReload) {
                                               BSON(ChunkType::DEPRECATED_lastmod() << -1),

        if (!newestChunk.empty()) {
            invariant(newestChunk.size() == 1);
            ChunkVersion v = newestChunk[0].getVersion();
            // The newest chunk version matches the one we already hold.
            if (v.equals(oldVersion)) {
                stdx::lock_guard<stdx::mutex> lk(_lock);
                const CollectionInfo& ci = _collections[ns];
                        str::stream() << "not sharded after reloading from chunks : " << ns,
                return ci.getCM();

    } else if (!oldVersion.isSet()) {
        warning() << "version 0 found when " << (forceReload ? "reloading" : "checking")
                  << " chunk manager; collection '" << ns << "' initially detected as sharded";

    // we are not locked now, and want to load a new ChunkManager

    unique_ptr<ChunkManager> tempChunkManager;

        stdx::lock_guard<stdx::mutex> lll(_hitConfigServerLock);

        if (!newestChunk.empty() && !forceReload) {
            // If we have a target we're going for see if we've hit already
            stdx::lock_guard<stdx::mutex> lk(_lock);

            CollectionInfo& ci = _collections[ns];

            if (ci.isSharded() && ci.getCM()) {
                ChunkVersion currentVersion = newestChunk[0].getVersion();

                // Only reload if the version we found is newer than our own in the same epoch
                if (currentVersion <= ci.getCM()->getVersion() &&
                    ci.getCM()->getVersion().hasEqualEpoch(currentVersion)) {
                    return ci.getCM();

        tempChunkManager.reset(new ChunkManager(oldManager->getns(),
        tempChunkManager->loadExistingRanges(txn, oldManager.get());

        if (tempChunkManager->numChunks() == 0) {
            // Maybe we're not sharded any more, so do a full reload

            // Retry the lookup with shouldReload = false.
            return getChunkManager(txn, ns, false);

    stdx::lock_guard<stdx::mutex> lk(_lock);

    CollectionInfo& ci = _collections[ns];
    uassert(14822, (string) "state changed in the middle: " + ns, ci.isSharded());

    // Reset if our versions aren't the same
    bool shouldReset = !tempChunkManager->getVersion().equals(ci.getCM()->getVersion());

    // Also reset if we're forced to do so
    if (!shouldReset && forceReload) {
        shouldReset = true;
        warning() << "chunk manager reload forced for collection '" << ns << "', config version is "
                  << tempChunkManager->getVersion();

    // It's possible to get into a state when dropping collections when our new version is
    // less than our prev version. Behave identically to legacy mongos, for now, and warn to
    // draw attention to the problem.
    // TODO: Assert in next version, to allow smooth upgrades

    if (shouldReset && tempChunkManager->getVersion() < ci.getCM()->getVersion()) {
        shouldReset = false;

        warning() << "not resetting chunk manager for collection '" << ns << "', config version is "
                  << tempChunkManager->getVersion() << " and "
                  << "old version is " << ci.getCM()->getVersion();

    // end legacy behavior

    if (shouldReset) {
        const auto cmOpTime = tempChunkManager->getConfigOpTime();

        // The existing ChunkManager could have been updated since we last checked, so
        // replace the existing chunk manager only if it is strictly newer.
        // The condition should be (>) rather than (>=), but we use (>=) since legacy non-repl
        // config servers will always have an opTime of zero.
        if (cmOpTime >= ci.getCM()->getConfigOpTime()) {

        15883, str::stream() << "not sharded after chunk manager reset : " << ns, ci.isSharded());

    return ci.getCM();