Exemple #1
    // Runs one batched write command end to end: validates the request (namespace, write
    // permission via userAllowedWriteNS(), index-insert shape, write concern, batch size),
    // applies the write ops through bulkExecute(), optionally waits for the write concern,
    // reacts to a trailing StaleShardVersion error, and finally fills in 'response'.
    //
    // request  - the batch to execute; only read here.
    // response - out-parameter; receives either a batch-level error (toBatchError) or the
    //            per-op results, write concern error, lastOp and stats.
    //
    // NOTE(review): this listing looks truncated -- closing braces are absent throughout and
    // a few statements are cut off (see the notes near the end). Presumably every
    // toBatchError() call in the validation section was followed by an early return; confirm
    // against the upstream source before relying on the control flow shown here.
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // Validate namespace
        const NamespaceString nss = NamespaceString( request.getNS() );
        if ( !nss.isValid() ) {
            toBatchError( Status( ErrorCodes::InvalidNamespace,
                                  nss.ns() + " is not a valid namespace" ),
                          response );

        // Make sure we can write to the namespace
        Status allowedStatus = userAllowedWriteNS( nss );
        if ( !allowedStatus.isOK() ) {
            toBatchError( allowedStatus, response );

        // Validate insert index requests
        // TODO: Push insert index requests through createIndex once all upgrade paths support it
        string errMsg;
        if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
            toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );

        // Validate write concern
        // TODO: Lift write concern parsing out of this entirely
        WriteConcernOptions writeConcern;

        // wcDoc stays empty unless the request carries an explicit write concern.
        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();

        Status wcStatus = Status::OK();
        if ( wcDoc.isEmpty() ) {

            // The default write concern if empty is w : 1
            // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1

            // Fall back to the executor's default, or to Acknowledged when that is empty too.
            wcStatus = writeConcern.parse(
                _defaultWriteConcern.isEmpty() ?
                    WriteConcernOptions::Acknowledged : _defaultWriteConcern );

            // Promote a numeric w of 0 (with no mode string) to 1, per the comment above.
            if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
                writeConcern.wNumNodes = 1;
        else {
            wcStatus = writeConcern.parse( wcDoc );

        // Only validate options that parsed successfully.
        if ( wcStatus.isOK() ) {
            wcStatus = validateWriteConcern( writeConcern );

        if ( !wcStatus.isOK() ) {
            toBatchError( wcStatus, response );

        // An empty batch is rejected outright.
        if ( request.sizeWriteOps() == 0u ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  "no write ops were included in the batch" ),
                          response );

        // Validate batch size
        if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  stream() << "exceeded maximum write batch size of "
                                           << BatchedCommandRequest::kMaxWriteBatchSize ),
                          response );

        // End validation

        // "Silent" write concern: no mode string, w == 0 and no sync requirement. When set,
        // the detailed response fields further down are not populated.
        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        // NOTE(review): commandTimer is not read anywhere in the visible listing.
        Timer commandTimer;

        // The Owned* wrappers hold the result pointers; bulkExecute() appends through the
        // mutable vector references.
        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.

        bulkExecute( request, &upserted, &writeErrors );

        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

            // A failed wait is reported as a write concern error on the response rather than
            // as a batch-level failure.
            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );

        // Refresh metadata if needed

        // Only the last recorded write error is inspected for staleness.
        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                // First, we refresh metadata if we need to based on the requested version.

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );

                if ( queueForMigrationCommit ) {

                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        // Re-check the critical-section flag after each bounded wait.
                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );

        // Construct response

        // ok is set to true even when individual ops failed; those failures travel in the
        // error details below.
        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );

            // release() hands the pointer to the response.
            // NOTE(review): presumably the response takes ownership -- confirm.
            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );

            // NOTE(review): the initializer expression for replMode is missing from this
            // listing, as is the body of the modeReplSet branch below.
            const repl::ReplicationCoordinator::Mode replMode =
            if (replMode != repl::ReplicationCoordinator::modeNone) {
                response->setLastOp( _client->getLastOp() );
                if (replMode == repl::ReplicationCoordinator::modeReplSet) {

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            // nModified is only reported for update batches.
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );

        dassert( response->isValid( NULL ) );
Exemple #2
        // Command entry point that reports the previous operation's error state and then
        // enforces a write concern. Presumably this is the getLastError command's run(); the
        // enclosing class is not visible in this listing.
        //
        // txn      - operation context, forwarded to waitForWriteConcern().
        // dbname   - not referenced in the visible code.
        // cmdObj   - the command document; parsed for "wOpTime" / "wElectionId" and, unless
        //            defaults apply, used as the write concern document.
        // errmsg   - receives a message on field-extraction or election-id failures.
        // result   - builder that receives lastOp, "connectionId", error fields and write
        //            concern results.
        // fromRepl - not referenced in the visible code.
        //
        // Returns false on field-extraction failure or election-id mismatch; true when a
        // previous error was reported or when the write concern wait timed out; otherwise the
        // value of appendCommandStatus( result, status ).
        //
        // NOTE(review): closing braces are absent throughout this listing, so the block
        // nesting below has to be inferred from indentation.
        bool run(OperationContext* txn, const string& dbname,
                  BSONObj& cmdObj,
                  string& errmsg,
                  BSONObjBuilder& result,
                  bool fromRepl ) {

            // Correct behavior here is very finicky.
            // 1.  The first step is to append the error that occurred on the previous operation.
            // This adds an "err" field to the command, which is *not* the command failing.
            // 2.  Next we parse and validate write concern options.  If these options are invalid
            // the command fails no matter what, even if we actually had an error earlier.  The
            // reason for checking here is to match legacy behavior on these kind of failures -
            // we'll still get an "err" field for the write error.
            // 3.  If we had an error on the previous operation, we then return immediately.
            // 4.  Finally, we actually enforce the write concern.  All errors *except* timeout are
            // reported with ok : 0.0, to match legacy behavior.
            // There is a special case when "wOpTime" and "wElectionId" are explicitly provided by
            // the client (mongos) - in this case we *only* enforce the write concern if it is
            // valid.
            // We always need to either report "err" (if ok : 1) or "errmsg" (if ok : 0), even if
            // err is null.

            LastError *le = lastError.disableForCommand();

            // Always append lastOp and connectionId
            Client& c = cc();
            c.appendLastOp( result );

            // for sharding; also useful in general for debugging
            result.appendNumber( "connectionId" , c.getConnectionId() );

            // Optional "wOpTime": a falsy extraction result is treated as a parse failure,
            // while FIELD_NONE means the field was simply absent.
            OpTime lastOpTime;
            BSONField<OpTime> wOpTimeField("wOpTime");
            FieldParser::FieldState extracted = FieldParser::extract(cmdObj, wOpTimeField, 
                                                                     &lastOpTime, &errmsg);
            if (!extracted) {
                result.append("badGLE", cmdObj);
                appendCommandStatus(result, false, errmsg);
                return false;
            bool lastOpTimePresent = extracted != FieldParser::FIELD_NONE;
            if (!lastOpTimePresent) {
                // Use the client opTime if no wOpTime is specified
                lastOpTime = cc().getLastOp();
            // Optional "wElectionId", handled the same way as "wOpTime" above.
            OID electionId;
            BSONField<OID> wElectionIdField("wElectionId");
            extracted = FieldParser::extract(cmdObj, wElectionIdField, 
                                             &electionId, &errmsg);
            if (!extracted) {
                result.append("badGLE", cmdObj);
                appendCommandStatus(result, false, errmsg);
                return false;

            bool electionIdPresent = extracted != FieldParser::FIELD_NONE;
            bool errorOccurred = false;

            // Errors aren't reported when wOpTime is used
            if ( !lastOpTimePresent ) {
                // When nPrev != 1 the generic "no error" record is appended instead of this
                // connection's own record.
                // NOTE(review): presumably nPrev != 1 means the previous operation was not the
                // one immediately before this command -- confirm against LastError.
                if ( le->nPrev != 1 ) {
                    errorOccurred = LastError::noError.appendSelf( result, false );
                    le->appendSelfStatus( result );
                else {
                    errorOccurred = le->appendSelf( result, false );

            BSONObj writeConcernDoc = cmdObj;
            // Use the default options if we have no gle options aside from wOpTime/wElectionId
            const int nFields = cmdObj.nFields();
            bool useDefaultGLEOptions = (nFields == 1) || 
                (nFields == 2 && lastOpTimePresent) ||
                (nFields == 3 && lastOpTimePresent && electionIdPresent);

            // getLastErrorDefault is dereferenced only after the null check in the condition.
            if ( useDefaultGLEOptions && getLastErrorDefault ) {
                writeConcernDoc = *getLastErrorDefault;

            // Validate write concern no matter what, this matches 2.4 behavior

            WriteConcernOptions writeConcern;
            Status status = writeConcern.parse( writeConcernDoc );

            if ( status.isOK() ) {
                // Ensure options are valid for this host
                status = validateWriteConcern( writeConcern );

            // Invalid options fail the command and echo the offending document as "badGLE".
            if ( !status.isOK() ) {
                result.append( "badGLE", writeConcernDoc );
                return appendCommandStatus( result, status );

            // Don't wait for replication if there was an error reported - this matches 2.4 behavior
            if ( errorOccurred ) {
                dassert( !lastOpTimePresent );
                return true;

            // No error occurred, so we won't duplicate these fields with write concern errors
            dassert( result.asTempObj()["err"].eoo() );
            dassert( result.asTempObj()["code"].eoo() );

            // If we got an electionId, make sure it matches
            if (electionIdPresent) {
                if (!theReplSet) {
                    // Ignore electionIds of 0 from mongos.
                    if (electionId != OID()) {
                        errmsg = "wElectionId passed but no replication active";
                        result.append("code", ErrorCodes::BadValue);
                        return false;
                else {
                    if (electionId != theReplSet->getElectionId()) {
                        LOG(3) << "oid passed in is " << electionId
                               << ", but our id is " << theReplSet->getElectionId();
                        errmsg = "election occurred after write";
                        result.append("code", ErrorCodes::WriteConcernFailed);
                        return false;

            cc().curop()->setMessage( "waiting for write concern" );

            // The wait result is appended to 'result' whether or not the wait succeeded.
            WriteConcernResult wcResult;
            status = waitForWriteConcern( txn, writeConcern, lastOpTime, &wcResult );
            wcResult.appendTo( writeConcern, &result );

            // For backward compatibility with 2.4, wtimeout returns ok : 1.0
            if ( wcResult.wTimedOut ) {
                dassert( !wcResult.err.empty() ); // so we always report err
                dassert( !status.isOK() );
                result.append( "errmsg", "timed out waiting for slaves" );
                result.append( "code", status.code() );
                return true;

            return appendCommandStatus( result, status );
Exemple #3
    // Blocks until the given write concern is satisfied for 'replOpTime': first the
    // disk-sync portion (switch on syncMode), then the replication portion.
    //
    // txn          - operation context; not referenced in the visible lines (presumably it
    //                is used by the replication-wait call that is missing from this listing).
    // writeConcern - options to enforce; assumed to be already validated (see dassert).
    // replOpTime   - op time to wait for; a null value means there is nothing to wait for.
    // result       - out-parameter; receives fsyncFiles, syncMillis, err, wTimedOut,
    //                writtenTo and wTime as applicable.
    //
    // Returns Status::OK() when no replication wait is needed, otherwise the status of the
    // replication wait, with ExceededTimeLimit rewritten to WriteConcernFailed.
    //
    // NOTE(review): this listing is truncated -- closing braces, the bodies of the switch
    // cases and the right-hand side of the 'replStatus' initializer are absent.
    Status waitForWriteConcern( OperationContext* txn,
                                const WriteConcernOptions& writeConcern,
                                const OpTime& replOpTime,
                                WriteConcernResult* result ) {

        // We assume all options have been validated earlier, if not, programming error
        dassert( validateWriteConcern( writeConcern ).isOK() );

        // Next handle blocking on disk

        // Measures how long the disk-sync step takes; read into result->syncMillis below.
        Timer syncTimer;

        // NOTE(review): as listed, NONE appears to fall through into FSYNC, and the JOURNAL
        // case and the FSYNC else-branch have no body; 'break' statements and the journal
        // commit call are presumably missing from this listing -- confirm upstream.
        switch( writeConcern.syncMode ) {
        case WriteConcernOptions::NONE:
        case WriteConcernOptions::FSYNC:
            // Without durability, FSYNC flushes all files through the storage engine.
            if ( !getDur().isDurable() ) {
                result->fsyncFiles = globalStorageEngine->flushAllFiles( true );
            else {
                // We only need to commit the journal if we're durable
        case WriteConcernOptions::JOURNAL:

        result->syncMillis = syncTimer.millis();

        // Now wait for replication

        if (replOpTime.isNull()) {
            // no write happened for this client yet
            return Status::OK();

        // needed to avoid incrementing gleWtimeStats SERVER-9005
        if (writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty()) {
            // no desired replication check
            return Status::OK();

        // Now we wait for replication
        // Note that replica set stepdowns and gle mode changes are thrown as errors
        // NOTE(review): the call that produces replStatus is missing from this listing.
        repl::ReplicationCoordinator::StatusAndDuration replStatus =
        // A time-limit expiry is surfaced to callers as WriteConcernFailed, with the result
        // flagged as timed out.
        if (replStatus.status == ErrorCodes::ExceededTimeLimit) {
            replStatus.status = Status(ErrorCodes::WriteConcernFailed,
                                       "waiting for replication timed out");
            result->err = "timeout";
            result->wTimedOut = true;
        // Add stats
        result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(replOpTime);
        result->wTime = replStatus.duration.total_milliseconds();

        return replStatus.status;
Exemple #4
    // Runs one batched write command: parses and validates the write concern, applies the
    // write ops through bulkExecute(), optionally waits for the write concern, reacts to a
    // trailing StaleShardVersion error, and fills in 'response'.
    //
    // request  - the batch to execute; only read here.
    // response - out-parameter; receives either a write-concern validation failure
    //            (errCode / errMessage / ok=false) or the per-op results and stats.
    //
    // NOTE(review): closing braces are absent throughout this listing. Presumably the
    // invalid-write-concern branch ended with an early return that is not shown; confirm
    // against the upstream source.
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // TODO: Lift write concern parsing out of this entirely.
        WriteConcernOptions writeConcern;
        Status status = Status::OK();

        // wcDoc stays empty unless the request carries an explicit write concern.
        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();

        // With no explicit write concern, the executor's default is parsed instead.
        if ( wcDoc.isEmpty() ) {
            status = writeConcern.parse( _defaultWriteConcern );
        else {
            status = writeConcern.parse( wcDoc );

        // Only validate options that parsed successfully.
        if ( status.isOK() ) {
            status = validateWriteConcern( writeConcern );

        // Invalid write concern: report the failure directly on the response.
        if ( !status.isOK() ) {
            response->setErrCode( status.code() );
            response->setErrMessage( status.reason() );
            response->setOk( false );
            dassert( response->isValid(NULL) );

        // "Silent" write concern: no mode string, w == 0 and no sync requirement. When set,
        // the detailed response fields further down are not populated.
        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        // NOTE(review): commandTimer is not read anywhere in the visible listing.
        Timer commandTimer;

        // The Owned* wrappers hold the result pointers; bulkExecute() appends through the
        // mutable vector references.
        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.

        bulkExecute( request, &upserted, &writeErrors );

        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            status = waitForWriteConcern( writeConcern, _client->getLastOp(), &res );

            // A failed wait is reported as a write concern error on the response rather than
            // as a batch-level failure.
            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );

        // Refresh metadata if needed

        // Only the last recorded write error is inspected for staleness.
        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                // First, we refresh metadata if we need to based on the requested version.

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );

                if ( queueForMigrationCommit ) {

                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        // Re-check the critical-section flag after each bounded wait.
                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );

        // Construct response

        // ok is set to true even when individual ops failed; those failures travel in the
        // error details below.
        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );

            // release() hands the pointer to the response.
            // NOTE(review): presumably the response takes ownership -- confirm.
            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );

            // lastOp is reported whenever replication is enabled; the election id only when
            // theReplSet is non-null.
            if ( anyReplEnabled() ) {
                response->setLastOp( _client->getLastOp() );
                if (theReplSet) {
                    response->setElectionId( theReplSet->getElectionId() );

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            // nModified is only reported for update batches.
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );

        dassert( response->isValid( NULL ) );
Exemple #5
// Blocks until the given write concern is satisfied for 'replOpTime': first the disk-sync
// portion (switch on syncMode), then a loop that checks replset::opReplicatedEnough()
// subject to writeConcern.wTimeout.
//
// txn          - operation context; not referenced in the visible lines.
// writeConcern - options to enforce; assumed to be already validated (see dassert).
// replOpTime   - op time to wait for; a null value means there is nothing to wait for.
// result       - out-parameter; receives fsyncFiles, syncMillis, err, wTimedOut, writtenTo
//                and wTime as applicable.
//
// Returns Status::OK() when no replication wait is needed; WriteConcernFailed on timeout;
// or the status converted from an AssertionException caught during the wait.
//
// NOTE(review): this listing is truncated -- closing braces, the bodies of the switch cases
// and the statements that leave the polling loop are absent. Confirm details against the
// upstream source.
Status waitForWriteConcern( OperationContext* txn,
                            const WriteConcernOptions& writeConcern,
                            const OpTime& replOpTime,
                            WriteConcernResult* result ) {

    // We assume all options have been validated earlier, if not, programming error
    dassert( validateWriteConcern( writeConcern ).isOK() );

    // Next handle blocking on disk

    // Measures how long the disk-sync step takes; read into result->syncMillis below.
    Timer syncTimer;

    // NOTE(review): as listed, NONE appears to fall through into FSYNC, and the JOURNAL case
    // and the FSYNC else-branch have no body; 'break' statements and the journal commit call
    // are presumably missing from this listing.
    switch( writeConcern.syncMode ) {
    case WriteConcernOptions::NONE:
    case WriteConcernOptions::FSYNC:
        // Without durability, FSYNC flushes all memory-mapped files.
        if ( !getDur().isDurable() ) {
            result->fsyncFiles = MemoryMappedFile::flushAll( true );
        else {
            // We only need to commit the journal if we're durable
    case WriteConcernOptions::JOURNAL:

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if ( replOpTime.isNull() ) {
        // no write happened for this client yet
        return Status::OK();

    if ( writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty() ) {
        // no desired replication check
        return Status::OK();

    if (!replset::anyReplEnabled() || serverGlobalParams.configsvr) {
        // no replication check needed (validated above)
        return Status::OK();

    // Replication enabled but no replica set object: treated as a master/slave node.
    const bool isMasterSlaveNode = replset::anyReplEnabled() && !replset::theReplSet;
    if ( writeConcern.wMode == "majority" && isMasterSlaveNode ) {
        // with master/slave, majority is equivalent to w=1
        return Status::OK();

    // We're sure that replication is enabled and that we have more than one node or a wMode
    // The holder is constructed only after the early returns above; it supplies the elapsed
    // time for the timeout check and for result->wTime.
    TimerHolder gleTimerHolder( &gleWtimeStats );

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    // TODO: Make this cleaner
    Status replStatus = Status::OK();
    try {
        while ( 1 ) {

            // A positive wNumNodes selects the numeric check; otherwise the wMode string is
            // used.
            // NOTE(review): the bodies of both branches (presumably 'break') are missing
            // from this listing, as is whatever pause occurs between checks.
            if ( writeConcern.wNumNodes > 0 ) {
                if (replset::opReplicatedEnough(replOpTime, writeConcern.wNumNodes)) {
            else if (replset::opReplicatedEnough(replOpTime, writeConcern.wMode)) {

            // A wTimeout of zero or less disables the timeout check.
            if ( writeConcern.wTimeout > 0 &&
                    gleTimerHolder.millis() >= writeConcern.wTimeout ) {
                result->err = "timeout";
                result->wTimedOut = true;
                replStatus = Status( ErrorCodes::WriteConcernFailed,
                                     "waiting for replication timed out" );

    catch( const AssertionException& ex ) {
        // Our replication state changed while enforcing write concern
        replStatus = ex.toStatus();

    // Add stats
    result->writtenTo = replset::getHostsWrittenTo(replOpTime);
    result->wTime = gleTimerHolder.recordMillis();

    return replStatus;