// Runs the storage-side steps for a node that is becoming primary and returns the op time
// obtained from loadLastOpTime().
//
// txn                  - operation context used for all storage calls below.
// isV1ElectionProtocol - when true, the extra block that logs the transition to the oplog
//                        ("new primary") is entered.
//
// NOTE(review): this excerpt appears to be missing lines -- the start of the write-conflict retry
// loop, the beginning of the oplog message expression, and several closing braces are not
// visible. Confirm against the upstream source before relying on the exact control flow.
OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* txn,
                                                                      bool isV1ElectionProtocol) {

    // Clear the appliedThrough marker so on startup we'll use the top of the oplog. This must be
    // done before we add anything to our oplog.
    _storageInterface->setAppliedThrough(txn, {});

    if (isV1ElectionProtocol) {
            // The scoped transaction is opened in MODE_X before the write unit of work is started.
            ScopedTransaction scopedXact(txn, MODE_X);

            WriteUnitOfWork wuow(txn);
                     << "new primary"));
            txn, "logging transition to primary to oplog", "local.oplog.rs");
    // presumably fassertStatusOK aborts (assertion id 28665) if the last op time cannot be
    // loaded -- verify against its definition.
    const auto opTimeToReturn = fassertStatusOK(28665, loadLastOpTime(txn));


    return opTimeToReturn;
// Moves the migration from the kCreated state to kCloning: creates the legacy chunk cloner,
// registers this manager on the collection's sharding state, and starts the clone.
//
// Returns the failing status from the clone driver's startClone() if it is not OK, otherwise
// Status::OK().
//
// NOTE(review): this excerpt appears to be missing lines (the call that the BSON(...) argument
// below belongs to, scope braces around the registration section, and closing braces). It is
// also not visible where scopedGuard is dismissed on success -- confirm against upstream.
Status MigrationSourceManager::startClone(OperationContext* txn) {
    invariant(_state == kCreated);
    // On scope exit the guard invokes cleanupOnError(txn) unless it is dismissed.
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    _cloneDriver = stdx::make_unique<MigrationChunkClonerSourceLegacy>(
        _args, _committedMetadata->getKeyPattern());

        // Register for notifications from the replication subsystem
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMigrationSourceManager(txn, this);

    Status startCloneStatus = _cloneDriver->startClone(txn);
    if (!startCloneStatus.isOK()) {
        return startCloneStatus;

    _state = kCloning;
    return Status::OK();
// Example #3
    // Checks that the users collection does not still carry the old v2.4-style unique index.
    //
    // Returns Status::OK() when the database or the collection does not exist, or when no index
    // with the v1SystemUsersKeyPattern key pattern is found; returns AuthSchemaIncompatible when
    // such an index is present.
    //
    // NOTE(review): closing braces of the if-blocks and of the function are not visible in this
    // excerpt -- confirm against upstream.
    Status verifySystemIndexes(OperationContext* txn) {
        const NamespaceString systemUsers = AuthorizationManager::usersCollectionNamespace;

        // Make sure the old unique index from v2.4 on system.users doesn't exist.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetDb autoDb(txn, systemUsers.db(), MODE_X);
        if (!autoDb.getDb()) {
            // No database, hence nothing to verify.
            return Status::OK();

        Collection* collection = autoDb.getDb()->getCollection(NamespaceString(systemUsers));
        if (!collection) {
            // No users collection, hence nothing to verify.
            return Status::OK();

        IndexCatalog* indexCatalog = collection->getIndexCatalog();
        IndexDescriptor* oldIndex = NULL;

        // oldIndex is assigned inside the condition; a non-null result means the legacy index
        // still exists.
        if (indexCatalog &&
            (oldIndex = indexCatalog->findIndexByKeyPattern(txn, v1SystemUsersKeyPattern))) {
            return Status(ErrorCodes::AuthSchemaIncompatible,
                          "Old 2.4 style user index identified. "
                          "The authentication schema needs to be updated by "
                          "running authSchemaUpgrade on a 2.6 server.");

        return Status::OK();
        // Command entry point that appends a "collections" array to 'result', one element per
        // namespace in database 'dbname', each holding the collection's "name" and "options".
        //
        // jsobj  - command object; if jsobj["filter"] is an object it is parsed into a match
        //          expression that is tested against each candidate entry.
        // result - receives the "collections" array, or the parse error status if the filter
        //          cannot be parsed.
        //
        // NOTE(review): several lines appear to be missing from this excerpt (closing braces and
        // the bodies of the "system.namespaces" and matcher checks, which presumably 'continue').
        // Confirm against upstream.
        bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& jsobj,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool /*fromRepl*/) {

            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetDb autoDb(txn, dbname, MODE_S);

            const Database* d = autoDb.getDb();
            const DatabaseCatalogEntry* dbEntry = NULL;

            // 'names' stays empty when the database does not exist.
            // NOTE(review): dbEntry is dereferenced in the loop below; that looks safe only
            // because 'names' is filled exclusively in this branch -- confirm.
            list<string> names;
            if ( d ) {
                dbEntry = d->getDatabaseCatalogEntry();
                dbEntry->getCollectionNamespaces( &names );

            scoped_ptr<MatchExpression> matcher;
            if ( jsobj["filter"].isABSONObj() ) {
                StatusWithMatchExpression parsed =
                    MatchExpressionParser::parse( jsobj["filter"].Obj() );
                if ( !parsed.isOK() ) {
                    return appendCommandStatus( result, parsed.getStatus() );
                matcher.reset( parsed.getValue() );

            BSONArrayBuilder arr;

            for ( list<string>::const_iterator i = names.begin(); i != names.end(); ++i ) {
                string ns = *i;

                StringData collection = nsToCollectionSubstring( ns );
                if ( collection == "system.namespaces" ) {

                BSONObjBuilder b;
                b.append( "name", collection );

                CollectionOptions options =
                    dbEntry->getCollectionCatalogEntry( txn, ns )->getCollectionOptions(txn);
                b.append( "options", options.toBSON() );

                // The filter, when present, is evaluated against the fully built entry.
                BSONObj maybe = b.obj();
                if ( matcher && !matcher->matchesBSON( maybe ) ) {

                arr.append( maybe );

            result.append( "collections", arr.arr() );

            return true;
// Example #5
    // Lists every database known to the global storage engine, opens each one under a shared
    // lock, and then calls checkNS() with the accumulated collection names. A DBException is
    // caught and logged rather than propagated.
    //
    // NOTE(review): the part of the loop that fills 'collNames' from 'db', plus several closing
    // braces, is not visible in this excerpt -- confirm against upstream.
    void restartInProgressIndexesFromLastShutdown(OperationContext* txn) {

        std::vector<std::string> dbNames;

        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        storageEngine->listDatabases( &dbNames );

        try {
            std::list<std::string> collNames;
            // NOTE(review): the loop condition compares iterators with '<'; this works for
            // std::vector iterators, though '!=' is the more common form.
            for (std::vector<std::string>::const_iterator dbName = dbNames.begin();
                 dbName < dbNames.end();
                 ++dbName) {

                ScopedTransaction scopedXact(txn, MODE_IS);
                AutoGetDb autoDb(txn, *dbName, MODE_S);

                Database* db = autoDb.getDb();
            checkNS(txn, collNames);
        catch (const DBException& e) {
            error() << "Index verification did not complete: " << e.toString();
        LOG(1) << "checking complete" << endl;
// Writes the initial replica set configuration document, logs an "initiating set" message through
// the op observer, initializes the minValid document, and sets the feature compatibility version
// on clean startup.
//
// Returns Status::OK() on success; a DBException thrown by any step is converted to a Status.
//
// NOTE(review): the opening of the write-conflict retry loop, the commit of the write unit of
// work, and closing braces are not visible in this excerpt -- confirm against upstream.
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config) {
    try {

            // The global write lock is held while the config document and oplog message are
            // written.
            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto msgObj = BSON("msg"
                                     << "initiating set");
            getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, msgObj);
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");

        // This initializes the minvalid document with a null "ts" because older versions (<=3.2)
        // get angry if the minValid document is present but doesn't have a "ts" field.
        // Consider removing this once we no longer need to support downgrading to 3.2.
        _storageInterface->setMinValidToAtLeast(txn, {});

        FeatureCompatibilityVersion::setIfCleanStartup(txn, _storageInterface);
    } catch (const DBException& ex) {
        return ex.toStatus();
    return Status::OK();
// Tears down the migration source manager and moves it to the kDone state. Must not be called
// when the state is already kDone (enforced by the invariant).
//
// NOTE(review): most statement bodies are missing from this excerpt (the unregistration call, the
// critical-section exit, the metadata op counter decrement, and the clone driver cleanup); only
// the surrounding conditions and comments are visible. Confirm against upstream.
void MigrationSourceManager::_cleanup(OperationContext* txn) {
    invariant(_state != kDone);

        // Unregister from the collection's sharding state
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());

        // The migration source manager is not visible anymore after it is unregistered from the
        // collection

        // Leave the critical section.
        if (_state == kCriticalSection) {

    // Decrement the metadata op counter outside of the collection lock in order to hold it for as
    // short as possible.
    if (_state == kCriticalSection) {

    if (_cloneDriver) {

    _state = kDone;
    // Logs an "initiating set" message through the global op observer while holding the global
    // write lock inside a write unit of work.
    //
    // NOTE(review): the commit of the write unit of work and the closing brace are not visible in
    // this excerpt, and a second definition with the same signature follows in this file --
    // confirm against upstream.
    void ReplicationCoordinatorExternalStateImpl::initiateOplog(OperationContext* txn) {

        ScopedTransaction scopedXact(txn, MODE_X);
        Lock::GlobalWrite globalWrite(txn->lockState());

        WriteUnitOfWork wuow(txn);
        getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, BSON("msg" << "initiating set"));
    // Logs an "initiating set" message through the global op observer while holding the global
    // write lock; the write is wrapped in a write-conflict retry loop labelled
    // "initiate oplog entry" on "local.oplog.rs".
    //
    // NOTE(review): the opening of the retry loop, the commit of the write unit of work, and the
    // closing brace are not visible in this excerpt, and another definition with the same
    // signature precedes this one in the file -- confirm against upstream.
    void ReplicationCoordinatorExternalStateImpl::initiateOplog(OperationContext* txn) {

            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, BSON("msg" << "initiating set"));
        } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
// Marks the clone as completed (under _mutex) and then, with the collection opened in MODE_IS,
// handles the delete-notification executor if one exists.
//
// NOTE(review): the scope braces around the mutex section and the body of the
// 'if (_deleteNotifyExec)' branch are not visible in this excerpt -- confirm against upstream.
void MigrationChunkClonerSourceLegacy::_cleanup(OperationContext* txn) {
        stdx::lock_guard<stdx::mutex> sl(_mutex);
        _cloneCompleted = true;

    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

    if (_deleteNotifyExec) {
// Creates the oplog, writes the initial replica set configuration document, and records an
// "initiating set" message.
//
// updateReplOpTime - when true the message is logged through the op observer; when false the
//                    alternative path in the else-branch is taken (see the comment there).
//
// Returns Status::OK() on success; a DBException thrown by any step is converted to a Status.
//
// NOTE(review): the opening of the write-conflict retry loop, the commit of the write unit of
// work, and closing braces are not visible in this excerpt -- confirm against upstream.
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config,
                                                                         bool updateReplOpTime) {
    try {
        createOplog(txn, rsOplogName, true);

            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto msgObj = BSON("msg"
                                     << "initiating set");
            if (updateReplOpTime) {
                getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, msgObj);
            } else {
                // 'updateReplOpTime' is false when called from the replSetInitiate command when the
                // server is running with replication disabled. We bypass onOpMessage to invoke
                // _logOp directly so that we can override the replication mode and keep _logOp from
                // updating the replication coordinator's op time (illegal operation when
                // replication is not enabled).
                // NOTE(review): the two identical calls below are back to back and no _logOp call
                // is visible; a line between them appears to be missing -- confirm against
                // upstream.
                repl::oplogCheckCloseDatabase(txn, nullptr);
                repl::oplogCheckCloseDatabase(txn, nullptr);
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
    } catch (const DBException& ex) {
        return ex.toStatus();
    return Status::OK();
// Writes the initial replica set configuration document and logs an "initiating set" message
// through the op observer, all under the global write lock.
//
// Returns Status::OK() on success; a DBException thrown by any step is converted to a Status.
//
// NOTE(review): the opening of the write-conflict retry loop, the commit of the write unit of
// work, and closing braces are not visible in this excerpt -- confirm against upstream.
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config) {
    try {

            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            // The config document is stored via putSingleton in the config collection.
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto msgObj = BSON("msg"
                                     << "initiating set");
            getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, msgObj);
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
    } catch (const DBException& ex) {
        return ex.toStatus();
    return Status::OK();
// Example #13
    // Truncates the collection 'collectionName' and notifies the op observer via onEmptyCapped().
    //
    // Returns NotMaster when the write is replicated and this node cannot accept it, the failing
    // status from Collection::truncate() if truncation fails, and Status::OK() otherwise. Raises
    // massert 13429 / 28584 when the database or collection does not exist.
    //
    // NOTE(review): this excerpt appears to be missing lines -- the second operand of the
    // userInitiatedWritesAndNotPrimary expression, the commit of the write unit of work, and
    // closing braces. Confirm against upstream.
    Status emptyCapped(OperationContext* txn,
                       const NamespaceString& collectionName) {
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetDb autoDb(txn, collectionName.db(), MODE_X);

        bool userInitiatedWritesAndNotPrimary = txn->writesAreReplicated() &&

        if (userInitiatedWritesAndNotPrimary) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while truncating collection "
                                        << collectionName.ns());

        Database* db = autoDb.getDb();
        massert(13429, "no such database", db);

        Collection* collection = db->getCollection(collectionName);
        massert(28584, "no such collection", collection);

        // The index specs returned here are handed to restoreIndexes() after the truncate.
        std::vector<BSONObj> indexes = stopIndexBuildsEmptyCapped(txn, db, collectionName);

        WriteUnitOfWork wuow(txn);

        Status status = collection->truncate(txn);
        if (!status.isOK()) {
            return status;

        IndexBuilder::restoreIndexes(txn, indexes);

        getGlobalServiceContext()->getOpObserver()->onEmptyCapped(txn, collection->ns());


        return Status::OK();
// Moves the migration from kCloneCaughtUp to kCriticalSection after verifying that the
// collection's sharding metadata still matches the metadata snapshot taken earlier.
//
// Returns the failing status from ShardingStateRecovery::startMetadataOp(), an
// IncompatibleShardingMetadata error when the collection version changed, or Status::OK().
//
// NOTE(review): this excerpt appears to be missing lines (the start of the error-message stream,
// scope braces around the locked section, and closing braces); it is also not visible where
// scopedGuard is dismissed on success. Confirm against upstream.
Status MigrationSourceManager::enterCriticalSection(OperationContext* txn) {
    invariant(_state == kCloneCaughtUp);
    // On scope exit the guard invokes cleanupOnError(txn) unless it is dismissed.
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Mark the shard as running critical operation, which requires recovery on crash
    Status status = ShardingStateRecovery::startMetadataOp(txn);
    if (!status.isOK()) {
        return status;

        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        // NOTE(review): this branch is also taken when css->getMetadata() is null, yet the
        // message below calls css->getMetadata()->getCollVersion(); that looks like a possible
        // null dereference -- confirm.
        if (!css->getMetadata() ||
            !css->getMetadata()->getCollVersion().equals(_committedMetadata->getCollVersion())) {
            return {ErrorCodes::IncompatibleShardingMetadata,
                        << "Sharding metadata changed while holding distributed lock. Expected: "
                        << _committedMetadata->getCollVersion().toString()
                        << ", actual: "
                        << css->getMetadata()->getCollVersion().toString()};

        // IMPORTANT: After this line, the critical section is in place and needs to be rolled back
        // if anything fails, which would prevent commit to the config servers.
        _critSecSignal = std::make_shared<Notification<void>>();

    log() << "Successfully entered critical section.";

    _state = kCriticalSection;
    return Status::OK();
// Example #15
    // Test worker body: runs N iterations, and on each one acquires a different combination of
    // global and database locks, selected by 'i % 7' (and by 'i % 11' within the 'i % 7 == 6'
    // case).
    //
    // tnumber - index of the calling thread; only thread 1 takes the 'i % 7 == 4' branch.
    //
    // NOTE(review): many closing braces and inner scope braces are missing from this excerpt, so
    // the lock lifetimes shown here are not exactly as written upstream -- confirm before relying
    // on them.
    virtual void subthread(int tnumber) {

        const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext();
        OperationContext& txn = *txnPtr;

        for (int i = 0; i < N; i++) {
            int x = std::rand();
            // True for roughly one in fifteen random draws; gates the optional TempRelease calls.
            bool sometimes = (x % 15 == 0);
            if (i % 7 == 0) {
                Lock::GlobalRead r(txn.lockState());  // nested test
                Lock::GlobalRead r2(txn.lockState());
            } else if (i % 7 == 1) {
                Lock::GlobalRead r(txn.lockState());
            } else if (i % 7 == 4 && tnumber == 1 /*only one upgrader legal*/) {
                Lock::GlobalWrite w(txn.lockState());
                // NOTE(review): inside the 'i % 7 == 4' branch this condition can never be true,
                // so the TempRelease below looks unreachable -- confirm intent.
                if (i % 7 == 2) {
                    Lock::TempRelease t(txn.lockState());
            } else if (i % 7 == 2) {
                Lock::GlobalWrite w(txn.lockState());
                if (sometimes) {
                    Lock::TempRelease t(txn.lockState());
            } else if (i % 7 == 3) {
                Lock::GlobalWrite w(txn.lockState());
                { Lock::TempRelease t(txn.lockState()); }
                Lock::GlobalRead r(txn.lockState());
                if (sometimes) {
                    Lock::TempRelease t(txn.lockState());
            } else if (i % 7 == 5) {
                    ScopedTransaction scopedXact(&txn, MODE_IS);
                    Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                    ScopedTransaction scopedXact(&txn, MODE_IS);
                    Lock::DBLock r(txn.lockState(), "bar", MODE_S);
            } else if (i % 7 == 6) {
                if (i > N / 2) {
                    int q = i % 11;
                    if (q == 0) {
                        ScopedTransaction scopedXact(&txn, MODE_IS);

                        Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                        ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));

                        Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                        ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));

                        Lock::DBLock r3(txn.lockState(), "local", MODE_S);
                        ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));
                        ASSERT(txn.lockState()->isDbLockedForMode("local", MODE_S));
                    } else if (q == 1) {
                        // test locking local only -- with no preceding lock
                            ScopedTransaction scopedXact(&txn, MODE_IS);
                            Lock::DBLock x(txn.lockState(), "local", MODE_S);
                            ScopedTransaction scopedXact(&txn, MODE_IX);
                            Lock::DBLock x(txn.lockState(), "local", MODE_X);

                            //  No actual writing here, so no WriteUnitOfWork
                            if (sometimes) {
                                Lock::TempRelease t(txn.lockState());
                    // NOTE(review): this repeats the 'q == 1' test of the previous branch, so the
                    // "admin" locking case below can never run; 'q == 2' may have been intended
                    // -- confirm.
                    } else if (q == 1) {
                            ScopedTransaction scopedXact(&txn, MODE_IS);
                            Lock::DBLock x(txn.lockState(), "admin", MODE_S);

                            ScopedTransaction scopedXact(&txn, MODE_IX);
                            Lock::DBLock x(txn.lockState(), "admin", MODE_X);
                    } else if (q == 3) {
                        ScopedTransaction scopedXact(&txn, MODE_IX);

                        Lock::DBLock x(txn.lockState(), "foo", MODE_X);
                        Lock::DBLock y(txn.lockState(), "admin", MODE_S);
                    } else if (q == 4) {
                        ScopedTransaction scopedXact(&txn, MODE_IS);

                        Lock::DBLock x(txn.lockState(), "foo2", MODE_S);
                        Lock::DBLock y(txn.lockState(), "admin", MODE_S);
                    } else {
                        ScopedTransaction scopedXact(&txn, MODE_IX);

                        Lock::DBLock w(txn.lockState(), "foo", MODE_X);

                        { Lock::TempRelease t(txn.lockState()); }

                        Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                        Lock::DBLock r3(txn.lockState(), "local", MODE_S);
                } else {
                    ScopedTransaction scopedXact(&txn, MODE_IS);

                    Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                    Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                    Lock::DBLock r3(txn.lockState(), "local", MODE_S);
// Scans the chunk's key range over the shard-key-prefixed index, counting the documents found,
// and rejects the migration when the count exceeds the estimated maximum for a full chunk.
//
// Returns NamespaceNotFound if the collection does not exist, IndexNotFound if no suitable index
// is found, InternalError if the executor dies or fails, an error describing the oversized chunk
// when too many documents are found, and Status::OK() otherwise. On success
// _averageObjectSizeForCloneLocs is set from the collection's average object size plus 12.
//
// NOTE(review): this excerpt appears to be missing many lines (the index lookup call, the
// plan-executor construction, most indexScan arguments, the statement that records each
// recordId, the error code of the large-chunk status, and closing braces). Confirm against
// upstream.
Status MigrationChunkClonerSourceLegacy::_storeCurrentLocs(OperationContext* txn) {
    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

    Collection* const collection = autoColl.getCollection();
    if (!collection) {
        return {ErrorCodes::NamespaceNotFound,
                str::stream() << "Collection " << _args.getNss().ns() << " does not exist."};

    // Allow multiKey based on the invariant that shard keys must be single-valued. Therefore, any
    // multi-key index prefixed by shard key cannot be multikey over the shard key fields.
    IndexDescriptor* idx =
                                                                 false);  // requireSingleKey
    if (!idx) {
        return {ErrorCodes::IndexNotFound,
                str::stream() << "can't find index with prefix " << _shardKeyPattern.toBSON()
                              << " in storeCurrentLocs for "
                              << _args.getNss().ns()};

    // Install the stage, which will listen for notifications on the collection
        stdx::lock_guard<stdx::mutex> sl(_mutex);


        // Takes ownership of 'ws' and 'dns'.
        auto statusWithPlanExecutor =
                               stdx::make_unique<DeleteNotificationStage>(this, txn),

        _deleteNotifyExec = std::move(statusWithPlanExecutor.getValue());

    // Assume both min and max non-empty, append MinKey's to make them fit chosen index
    const KeyPattern kp(idx->keyPattern());

    BSONObj min = Helpers::toKeyFormat(kp.extendRangeBound(_args.getMinKey(), false));
    BSONObj max = Helpers::toKeyFormat(kp.extendRangeBound(_args.getMaxKey(), false));

    std::unique_ptr<PlanExecutor> exec(InternalPlanner::indexScan(txn,
                                                                  false,  // endKeyInclusive

    // We can afford to yield here because any change to the base data that we might miss is already
    // being queued and will migrate in the 'transferMods' stage.
    exec->setYieldPolicy(PlanExecutor::YIELD_AUTO, collection);

    // Use the average object size to estimate how many objects a full chunk would carry so that
    // while traversing the chunk's range using the sharding index, below there's a fair amount of
    // slack before we determine a chunk is too large because object sizes will vary.
    unsigned long long maxRecsWhenFull;
    long long avgRecSize;

    const long long totalRecs = collection->numRecords(txn);
    if (totalRecs > 0) {
        avgRecSize = collection->dataSize(txn) / totalRecs;
        // NOTE(review): integer division above yields 0 when dataSize() < totalRecs, which would
        // make the next division divide by zero -- confirm whether that can occur in practice.
        maxRecsWhenFull = _args.getMaxChunkSizeBytes() / avgRecSize;
        // Allow 30% slack, capped at one more than the per-chunk object limit.
        maxRecsWhenFull = std::min((unsigned long long)(Chunk::MaxObjectPerChunk + 1),
                                   130 * maxRecsWhenFull / 100 /* slack */);
    } else {
        avgRecSize = 0;
        maxRecsWhenFull = Chunk::MaxObjectPerChunk + 1;

    // Do a full traversal of the chunk and don't stop even if we think it is a large chunk we want
    // the number of records to better report, in that case.
    bool isLargeChunk = false;
    unsigned long long recCount = 0;

    BSONObj obj;
    RecordId recordId;
    PlanExecutor::ExecState state;
    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, &recordId))) {
        if (!isLargeChunk) {
            stdx::lock_guard<stdx::mutex> lk(_mutex);

        if (++recCount > maxRecsWhenFull) {
            isLargeChunk = true;
            // Continue on despite knowing that it will fail, just to get the correct value for
            // recCount

    if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
        return {ErrorCodes::InternalError,
                str::stream() << "Executor error while scanning for documents belonging to chunk: "
                              << WorkingSetCommon::toStatusString(obj)};


    if (isLargeChunk) {
        return {
            str::stream() << "Cannot move chunk: the maximum number of documents for a chunk is "
                          << maxRecsWhenFull
                          << ", the maximum chunk size is "
                          << _args.getMaxChunkSizeBytes()
                          << ", average document size is "
                          << avgRecSize
                          << ". Found "
                          << recCount
                          << " documents in chunk "
                          << " ns: "
                          << _args.getNss().ns()
                          << " "
                          << _args.getMinKey()
                          << " -> "
                          << _args.getMaxKey()};

    // NOTE(review): the meaning of the constant 12 added below is not visible here -- confirm.
    _averageObjectSizeForCloneLocs = static_cast<uint64_t>(collection->averageObjectSize(txn) + 12);

    return Status::OK();
// Constructs the migration source manager for a moveChunk request: refreshes the sharding
// metadata, snapshots the committed metadata for the collection, and validates that the
// requested chunk exists with the exact requested bounds.
//
// Throws (uasserted) with InvalidOptions when the operation carries no shard version, and
// SendStaleConfigException when the epoch sent with the command differs from the collection's
// current epoch.
//
// NOTE(review): this excerpt appears to be missing lines (the uassert/uasserted calls that the
// orphaned str::stream() arguments belong to, the remaining SendStaleConfigException arguments,
// scope braces, and closing braces). Confirm against upstream.
MigrationSourceManager::MigrationSourceManager(OperationContext* txn, MoveChunkRequest request)
    : _args(std::move(request)), _startTime() {

    const auto& oss = OperationShardingState::get(txn);
    if (!oss.hasShardVersion()) {
        uasserted(ErrorCodes::InvalidOptions, "collection version is missing");

    // Even though the moveChunk command transmits a value in the operation's shardVersion field,
    // this value does not actually contain the shard version, but the global collection version.
    const ChunkVersion expectedCollectionVersion = oss.getShardVersion(_args.getNss());

    log() << "Starting chunk migration for "
          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
          << " with expected collection version " << expectedCollectionVersion;

    // Now that the collection is locked, snapshot the metadata and fetch the latest versions
    ShardingState* const shardingState = ShardingState::get(txn);

    ChunkVersion shardVersion;

    // refreshMetadataNow() fills 'shardVersion' through its out-parameter.
    Status refreshStatus =
        shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
    if (!refreshStatus.isOK()) {
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " due to "
                                << refreshStatus.toString());

    if (shardVersion.majorVersion() == 0) {
        // If the major version is zero, this means we do not have any chunks locally to migrate in
        // the first place
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " with zero shard version");

    // Snapshot the committed metadata from the time the migration starts
        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

        auto css = CollectionShardingState::get(txn, _args.getNss());
        _committedMetadata = css->getMetadata();

    const ChunkVersion collectionVersion = _committedMetadata->getCollVersion();

    if (expectedCollectionVersion.epoch() != collectionVersion.epoch()) {
        throw SendStaleConfigException(
            str::stream() << "cannot move chunk "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " because collection may have been dropped. "
                          << "current epoch: "
                          << collectionVersion.epoch()
                          << ", cmd epoch: "
                          << expectedCollectionVersion.epoch(),

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(collectionVersion >= shardVersion);

    // With nonzero shard version, we must have a shard key

    ChunkType origChunk;
    if (!_committedMetadata->getNextChunk(_args.getMinKey(), &origChunk)) {
        // If this assertion is hit, it means that whoever called the shard moveChunk command
        // (mongos or the CSRS balancer) did not check whether the chunk actually belongs to this
        // shard. It is a benign error and does not indicate data corruption.
                  str::stream() << "Chunk with bounds "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " is not owned by this shard.");

            // The chunk found must match the requested min and max keys exactly.
            str::stream() << "Unable to find chunk with the exact bounds "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " at collection version "
                          << collectionVersion.toString()
                          << ". This indicates corrupted metadata.",
            origChunk.getMin().woCompare(_args.getMinKey()) == 0 &&
                origChunk.getMax().woCompare(_args.getMaxKey()) == 0);
// Example #18
        virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {

            if ( cmdObj.firstElement().type() != Array ) {
                errmsg = "ops has to be an array";
                return false;

            BSONObj ops = cmdObj.firstElement().Obj();

                // check input
                BSONObjIterator i( ops );
                while ( i.more() ) {
                    BSONElement e = i.next();
                    if (!_checkOperation(e, errmsg)) {
                        return false;

            // SERVER-4328 todo : is global ok or does this take a long time? i believe multiple 
            // ns used so locking individually requires more analysis
            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWriteLock(txn->lockState());

            if (!fromRepl &&
                !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
                return appendCommandStatus(result, Status(ErrorCodes::NotMaster, str::stream()
                    << "Not primary while applying ops to database " << dbname));

            // Preconditions check reads the database state, so needs to be done locked
            if ( cmdObj["preCondition"].type() == Array ) {
                BSONObjIterator i( cmdObj["preCondition"].Obj() );
                while ( i.more() ) {
                    BSONObj f = i.next().Obj();

                    DBDirectClient db( txn );
                    BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() );

                    // Apply-ops would never have a $where matcher, so use the default callback,
                    // which will throw an error if $where is found.
                    Matcher m(f["res"].Obj());
                    if ( ! m.matches( realres ) ) {
                        result.append( "got" , realres );
                        result.append( "whatFailed" , f );
                        errmsg = "pre-condition failed";
                        return false;

            // apply
            int num = 0;
            int errors = 0;
            BSONObjIterator i( ops );
            BSONArrayBuilder ab;
            const bool alwaysUpsert = cmdObj.hasField("alwaysUpsert") ?
                    cmdObj["alwaysUpsert"].trueValue() : true;
            while ( i.more() ) {
                BSONElement e = i.next();
                const BSONObj& temp = e.Obj();

                // Ignore 'n' operations.
                const char *opType = temp["op"].valuestrsafe();
                if (*opType == 'n') continue;

                const string ns = temp["ns"].String();

                // Run operations under a nested lock as a hack to prevent yielding.
                // The list of operations is supposed to be applied atomically; yielding
                // would break atomicity by allowing an interruption or a shutdown to occur
                // after only some operations are applied.  We are already locked globally
                // at this point, so taking a DBLock on the namespace creates a nested lock,
                // and yields are disallowed for operations that hold a nested lock.
                // We do not have a wrapping WriteUnitOfWork so it is possible for a journal
                // commit to happen with a subset of ops applied.
                // TODO figure out what to do about this.
                Lock::GlobalWrite globalWriteLockDisallowTempRelease(txn->lockState());

                // Ensures that yielding will not happen (see the comment above).
                DEV {
                    Locker::LockSnapshot lockSnapshot;

                OldClientContext ctx(txn, ns);

                Status status(ErrorCodes::InternalError, "");
                while (true) {
                    try {
                        // We assume that in the WriteConflict retry case, either the op rolls back
                        // any changes it makes or is otherwise safe to rerun.
                        status =
                            repl::applyOperation_inlock(txn, ctx.db(), temp, false, alwaysUpsert);
                    catch (const WriteConflictException& wce) {
                        LOG(2) << "WriteConflictException in applyOps command, retrying.";

                if (!status.isOK()) {


                WriteUnitOfWork wuow(txn);
                logOpForDbHash(txn, ns.c_str());

            result.append( "applied" , num );
            result.append( "results" , ab.arr() );

            if ( ! fromRepl ) {
                // We want this applied atomically on slaves
                // so we re-wrap without the pre-condition for speed

                string tempNS = str::stream() << dbname << ".$cmd";

                // TODO: possibly use mutable BSON to remove preCondition field
                // once it is available
                BSONObjIterator iter(cmdObj);
                BSONObjBuilder cmdBuilder;

                while (iter.more()) {
                    BSONElement elem(iter.next());
                    if (strcmp(elem.fieldName(), "preCondition") != 0) {

                const BSONObj cmdRewritten = cmdBuilder.done();

                // We currently always logOp the command regardless of whether the individual ops
                // succeeded and rely on any failures to also happen on secondaries. This isn't
                // perfect, but it's what the command has always done and is part of its "correct"
                // behavior.
                while (true) {
                    try {
                        WriteUnitOfWork wunit(txn);
                    catch (const WriteConflictException& wce) {
                        LOG(2) <<
                            "WriteConflictException while logging applyOps command, retrying.";

            if (errors != 0) {
                return false;

            return true;
/**
 * Commits the donation of the chunk described by _args.
 *
 * Must be called while the manager is in the kCriticalSection state. The visible steps are:
 *   1. Ask the clone driver to commit the clone on the recipient.
 *   2. Build the applyOps preconditions and chunk-document updates for the config server.
 *   3. Submit them through the catalog client's applyChunkOpsDeprecated.
 *   4. On success, fold the migration into the locally cached collection metadata; on failure,
 *      refresh the metadata from the config server to find out whether the commit took effect.
 *
 * Returns Status::OK() at the end of the function; earlier returns carry the failing status code
 * together with a descriptive message.
 *
 * NOTE(review): this listing appears to have lost lines (closing braces, the heads of several
 * multi-line statements). The comments below only describe what is still visible.
 */
Status MigrationSourceManager::commitDonateChunk(OperationContext* txn) {
    invariant(_state == kCriticalSection);
    // Guard wired to cleanupOnError(txn). Presumably fires on scope exit unless dismissed; no
    // dismissal is visible in this listing -- TODO confirm.
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Tell the recipient shard to fetch the latest changes
    Status commitCloneStatus = _cloneDriver->commitClone(txn);

    // Fault injection: when the failMigrationCommit failpoint is active, a successful clone
    // commit is replaced with an InternalError.
    if (MONGO_FAIL_POINT(failMigrationCommit) && commitCloneStatus.isOK()) {
        commitCloneStatus = {ErrorCodes::InternalError,
                             "Failing _recvChunkCommit due to failpoint."};

    // Propagate the clone-commit failure, keeping its error code and adding context.
    if (!commitCloneStatus.isOK()) {
        return {commitCloneStatus.code(),
                str::stream() << "commit clone failed due to " << commitCloneStatus.toString()};

    // Generate the next collection version.
    // NOTE(review): only the copy of the committed version is visible here; the step that
    // advances it is not shown in this listing -- TODO confirm.
    ChunkVersion uncommittedCollVersion = _committedMetadata->getCollVersion();

    // applyOps preparation for reflecting the uncommitted metadata on the config server

    // Preconditions
    BSONArrayBuilder preCond;
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
                 BSON("query" << BSON(ChunkType::ns(_args.getNss().ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here


    // Update for the chunk which is being donated
    BSONArrayBuilder updates;
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        // "o" holds the new chunk document: same bounds, stamped with the uncommitted collection
        // version and assigned to the recipient (TO) shard.
        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        uncommittedCollVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), _args.getMinKey());
        n.append(ChunkType::max(), _args.getMaxKey());
        n.append(ChunkType::shard(), _args.getToShardId());

        // "o2" identifies the chunk document to update by its generated ID.
        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));


    // Update for the chunk being moved

    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = uncommittedCollVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    if (_committedMetadata->getNumChunks() > 1) {
        ChunkType bumpChunk;
        invariant(_committedMetadata->getDifferentChunk(_args.getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();

        // The chunk chosen for the version bump must not be the one being migrated.
        dassert(bumpMin.woCompare(_args.getMinKey()) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        // New document for the bumped chunk: unchanged bounds, stamped with nextVersion, and
        // still assigned to the donor (FROM) shard.
        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _args.getFromShardId());

        // Selector for the bumped chunk's document, by generated ID.
        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));


        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _args.getNss().ns() << "'";
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _args.getNss().ns() << "'";


    // Submit the updates and preconditions to the config server through the catalog client.
    Status applyOpsStatus = grid.catalogClient(txn)->applyChunkOpsDeprecated(
        txn, updates.arr(), preCond.arr(), _args.getNss().ns(), nextVersion);

    // Fault injection: when this failpoint is active the applyOps result is overwritten with an
    // InternalError regardless of the real outcome, which drives the recovery path below.
    if (MONGO_FAIL_POINT(failCommitMigrationCommand)) {
        applyOpsStatus = Status(ErrorCodes::InternalError,
                                "Failpoint 'failCommitMigrationCommand' generated error");

    if (applyOpsStatus.isOK()) {
        // Now that applyOps succeeded and the new collection version is committed, update the
        // collection metadata to the new collection version and forget the migrated chunk.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        // NOTE(review): migratingChunkToForget is default-constructed and no bounds are assigned
        // to it in the visible lines before it is passed to cloneMigrate -- TODO confirm.
        ChunkType migratingChunkToForget;
        _committedMetadata =
            _committedMetadata->cloneMigrate(migratingChunkToForget, uncommittedCollVersion);
        // NOTE(review): css is fetched here but not used in the visible lines -- TODO confirm.
        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
    } else {
        // This could be an unrelated error (e.g. network error). Check whether the metadata update
        // succeeded by refreshing the collection metadata from the config server and checking that
        // the original chunks no longer exist.

        warning() << "Migration metadata commit may have failed: refreshing metadata to check"
                  << causedBy(applyOpsStatus);

        // Need to get the latest optime in case the refresh request goes to a secondary --
        // otherwise the read won't wait for the write that applyChunkOpsDeprecated may have done.
        Status status = grid.catalogClient(txn)->logChange(
            BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey() << "from"
                       << _args.getFromShardId()
                       << "to"
                       << _args.getToShardId()));
        if (!status.isOK()) {
                 str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey() << ","
                               << _args.getMaxKey()
                               << ") due to "
                               << causedBy(applyOpsStatus)
                               << ", and updating the optime with a write before refreshing the "
                               << "metadata also failed: "
                               << causedBy(status)});

        // Refresh this shard's view of the collection metadata from the config server.
        ShardingState* const shardingState = ShardingState::get(txn);
        ChunkVersion shardVersion;
        Status refreshStatus =
            shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
                         str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey()
                                       << ","
                                       << _args.getMaxKey()
                                       << ") due to "
                                       << causedBy(applyOpsStatus)
                                       << ", and refreshing collection metadata failed: "
                                       << causedBy(refreshStatus)});

            // Read the refreshed metadata under intent-shared locks.
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

            auto css = CollectionShardingState::get(txn, _args.getNss());
            std::shared_ptr<CollectionMetadata> refreshedMetadata = css->getMetadata();

            // If the migrated chunk's min key still belongs to this shard, the commit did not
            // take effect.
            if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
                invariant(refreshedMetadata->getCollVersion() ==

                // After refresh, the collection metadata indicates that the donor shard still owns
                // the chunk, so no migration changes were written to the config server metadata.
                return {applyOpsStatus.code(),
                        str::stream() << "Migration was not committed, applyOps failed: "
                                      << causedBy(applyOpsStatus)};

            // The chunk is no longer owned here; verify that the collection version also advanced
            // to the expected nextVersion.
            ChunkVersion refreshedCollectionVersion = refreshedMetadata->getCollVersion();
            if (!refreshedCollectionVersion.equals(nextVersion)) {
                // The refreshed collection metadata's collection version does not match the control
                // chunk's updated collection version, which should now be the highest. The control
                // chunk was not committed, but the migrated chunk was. This state is not
                // recoverable.
                                 str::stream() << "Migration was partially committed, state is "
                                               << "unrecoverable. applyOps error: "
                                               << causedBy(applyOpsStatus)});



                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    return Status::OK();