Пример #1
    /**
     * Executes every write op contained in 'request', dispatching on the batch type:
     *  - insert batches are handed to execInserts() as a whole;
     *  - update batches call execUpdate() once per item;
     *  - anything else is asserted (debug-only) to be a delete batch and calls execRemove()
     *    once per item.
     *
     * @param request     the batch to execute.
     * @param upsertedIds out: one heap-allocated BatchedUpsertDetail is appended for every
     *                    update item that reported a non-empty upserted id.
     * @param errors      out: every per-item error is appended; for unordered batches whose
     *                    last error is StaleShardVersion, cloned errors are appended as well.
     *
     * NOTE(review): the raw pointers pushed into both vectors are allocated with 'new' here;
     * presumably the caller owns and frees them -- confirm against the call site.
     *
     * NOTE(review): this excerpt has lost its closing braces and at least the statement guarded
     * by 'if ( request.getOrdered() )' in both loops (presumably an early exit from the loop for
     * ordered batches -- confirm against the complete source).
     */
    void WriteBatchExecutor::bulkExecute( const BatchedCommandRequest& request,
                                          std::vector<BatchedUpsertDetail*>* upsertedIds,
                                          std::vector<WriteErrorDetail*>* errors ) {

        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert ) {
            // Inserts are executed as a group; only errors are reported back.
            execInserts( request, errors );
        else if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) {
            for ( size_t i = 0; i < request.sizeWriteOps(); i++ ) {

                WriteErrorDetail* error = NULL;
                BSONObj upsertedId;
                execUpdate( BatchItemRef( &request, i ), &upsertedId, &error );

                // A non-empty upsertedId means this item produced an upsert; record it together
                // with the index of the item inside the batch.
                if ( !upsertedId.isEmpty() ) {
                    BatchedUpsertDetail* batchUpsertedId = new BatchedUpsertDetail;
                    batchUpsertedId->setIndex( i );
                    batchUpsertedId->setUpsertedID( upsertedId );
                    upsertedIds->push_back( batchUpsertedId );

                if ( error ) {
                    errors->push_back( error );
                    // NOTE(review): the statement controlled by this 'if' is missing here.
                    if ( request.getOrdered() )
        else {
            dassert( request.getBatchType() == BatchedCommandRequest::BatchType_Delete );
            for ( size_t i = 0; i < request.sizeWriteOps(); i++ ) {

                WriteErrorDetail* error = NULL;
                execRemove( BatchItemRef( &request, i ), &error );

                if ( error ) {
                    errors->push_back( error );
                    // NOTE(review): the statement controlled by this 'if' is missing here.
                    if ( request.getOrdered() )

        // Fill in stale version errors for unordered batches (update/delete can't do this on own)
        if ( !errors->empty() && !request.getOrdered() ) {

            const WriteErrorDetail* finalError = errors->back();

            if ( finalError->getErrCode() == ErrorCodes::StaleShardVersion ) {
                // Append one clone of the final stale error for every op positioned after it.
                // NOTE(review): no index is assigned to the clone in the lines visible here, so
                // each clone carries whatever cloneTo() copies -- confirm that is intended.
                for ( size_t i = finalError->getIndex() + 1; i < request.sizeWriteOps(); i++ ) {
                    WriteErrorDetail* dupStaleError = new WriteErrorDetail;
                    finalError->cloneTo( dupStaleError );
                    errors->push_back( dupStaleError );
Пример #2
/**
 * Reports whether 'request' contains an upsert whose query does not specify "_id".
 *
 * @param request the batch to inspect.
 * @return false immediately if the batch is not an update batch; otherwise true as soon as
 *         one update document has its upsert flag set and the "_id" lookup on its query
 *         yields an eoo() element, and false if no such update document exists.
 */
bool BatchedCommandRequest::containsNoIDUpsert(const BatchedCommandRequest& request) {
    if (request.getBatchType() != BatchedCommandRequest::BatchType_Update)
        return false;

    const vector<BatchedUpdateDocument*>& updates = request.getUpdateRequest()->getUpdates();

    for (vector<BatchedUpdateDocument*>::const_iterator it = updates.begin(); it != updates.end();
         ++it) {
        const BatchedUpdateDocument* updateDoc = *it;
        // Presumably eoo() is true when the query has no "_id" element -- see BSONElement docs.
        if (updateDoc->getUpsert() && updateDoc->getQuery()["_id"].eoo())
            return true;
    }

    return false;
}
Пример #3
    /**
     * Validates 'request', executes it through bulkExecute(), optionally waits for the write
     * concern, reacts to stale shard versions and finally fills in 'response'.
     *
     * Phases, in order:
     *  1. Validation: namespace validity, user write permission on the namespace, insert-index
     *     request validity, write concern parsing/validation, non-empty batch, batch size limit.
     *     Each failure is reported through toBatchError().
     *  2. Execution via bulkExecute(), collecting write errors and upsert details in
     *     OwnedPointerVector containers.
     *  3. Write concern enforcement when nothing failed, or when the batch is unordered and
     *     fewer errors than write ops were recorded.
     *  4. Metadata refresh when the last write error is StaleShardVersion.
     *  5. Response construction; detail fields are only filled in when the write concern is
     *     not "silent" (see silentWC below).
     *
     * @param request  the batch to execute.
     * @param response out: populated with the outcome of the batch.
     *
     * NOTE(review): this excerpt has lost its closing braces and several statements: nothing
     * follows the toBatchError() calls in the validation phase (presumably an early return),
     * the initializer of 'replMode' is absent, and the body of the modeReplSet branch is empty.
     * Confirm control flow against the complete source.
     */
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // Validate namespace
        const NamespaceString nss = NamespaceString( request.getNS() );
        if ( !nss.isValid() ) {
            toBatchError( Status( ErrorCodes::InvalidNamespace,
                                  nss.ns() + " is not a valid namespace" ),
                          response );

        // Make sure we can write to the namespace
        Status allowedStatus = userAllowedWriteNS( nss );
        if ( !allowedStatus.isOK() ) {
            toBatchError( allowedStatus, response );

        // Validate insert index requests
        // TODO: Push insert index requests through createIndex once all upgrade paths support it
        string errMsg;
        if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
            toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );

        // Validate write concern
        // TODO: Lift write concern parsing out of this entirely
        WriteConcernOptions writeConcern;

        // wcDoc stays empty unless the request explicitly carries a write concern.
        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();

        Status wcStatus = Status::OK();
        if ( wcDoc.isEmpty() ) {

            // The default write concern if empty is w : 1
            // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1

            // Fall back to the executor-level default, or to Acknowledged if that is empty too.
            wcStatus = writeConcern.parse(
                _defaultWriteConcern.isEmpty() ?
                    WriteConcernOptions::Acknowledged : _defaultWriteConcern );

            if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
                writeConcern.wNumNodes = 1;
        else {
            wcStatus = writeConcern.parse( wcDoc );

        // Only run semantic validation if parsing succeeded.
        if ( wcStatus.isOK() ) {
            wcStatus = validateWriteConcern( writeConcern );

        if ( !wcStatus.isOK() ) {
            toBatchError( wcStatus, response );

        // Reject empty batches
        if ( request.sizeWriteOps() == 0u ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  "no write ops were included in the batch" ),
                          response );

        // Validate batch size
        if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  stream() << "exceeded maximum write batch size of "
                                           << BatchedCommandRequest::kMaxWriteBatchSize ),
                          response );

        // End validation

        // "Silent" write concern: no mode, zero nodes and no sync mode requested. When true,
        // the response built below carries no per-item details or stats.
        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        Timer commandTimer;

        // The Owned* wrappers hold the vectors that bulkExecute() fills with heap-allocated
        // detail objects; the plain references alias the wrapped vectors.
        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.

        bulkExecute( request, &upserted, &writeErrors );

        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

            // A failed wait is reported as a write concern error, not as a write error.
            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );

        // Refresh metadata if needed

        // The batch is considered stale when the last recorded write error is StaleShardVersion.
        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                // First, we refresh metadata if we need to based on the requested version.

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );

                if ( queueForMigrationCommit ) {

                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        // Re-check after every wait; a log line is emitted on each iteration.
                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );

        // Construct response

        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );

            // release() passes the raw WCErrorDetail pointer on to the response.
            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );

            // NOTE(review): the initializer expression of 'replMode' is missing in this excerpt.
            const repl::ReplicationCoordinator::Mode replMode =
            if (replMode != repl::ReplicationCoordinator::modeNone) {
                response->setLastOp( _client->getLastOp() );
                // NOTE(review): the body of this branch is missing in this excerpt.
                if (replMode == repl::ReplicationCoordinator::modeReplSet) {

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );

        dassert( response->isValid( NULL ) );
Пример #4
    /**
     * Drives a client write batch to completion: while batchOp is not finished, it targets the
     * remaining ops into child batches, dispatches them to the resolved hosts, feeds the
     * responses (or dispatch errors) back into batchOp, refreshes the targeter and checks that
     * progress is being made. The aggregated result is written to 'clientResponse'.
     *
     * @param clientRequest  the batch received from the client.
     * @param clientResponse out: built from batchOp once the loop ends.
     *
     * NOTE(review): this excerpt has lost its closing braces and several statements. Visible
     * symptoms: 'recordTargetErrors' and 'rounds' are never used or updated in the lines shown,
     * the arguments of chooseWriteHost() are not separated by a comma, 'if ( batchOp.isFinished() )'
     * has no controlled statement, and the "no progress" branch is empty. Confirm against the
     * complete source before relying on control flow.
     */
    void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                       BatchedCommandResponse* clientResponse ) {

        BatchWriteOp batchOp;
        batchOp.initClientRequest( &clientRequest );

        // Current batch status
        bool refreshedTargeter = false;
        int rounds = 0;
        int numCompletedOps = 0;
        int numRoundsWithoutProgress = 0;

        while ( !batchOp.isFinished() ) {

            // Get child batches to send using the targeter
            // Targeting errors can be caused by remote metadata changing (the collection could have
            // been dropped and recreated, for example with a new shard key).  If a remote metadata
            // change occurs *before* a client sends us a batch, we need to make sure that we don't
            // error out just because we're staler than the client - otherwise mongos will have
            // unpredictable behavior.
            // (If a metadata change happens *during* or *after* a client sends us a batch, however,
            // we make no guarantees about delivery.)
            // For this reason, we don't record targeting errors until we've refreshed our targeting
            // metadata at least once *after* receiving the client batch - at that point, we know:
            // 1) our new metadata is the same as the metadata when the client sent a batch, and so
            //    targeting errors are real.
            // OR
            // 2) our new metadata is a newer version than when the client sent a batch, and so
            //    the metadata must have changed after the client batch was sent.  We don't need to
            //    deliver in this case, since for all the client knows we may have gotten the batch
            //    exactly when the metadata changed.

            vector<TargetedWriteBatch*> childBatches;

            // If we've already had a targeting error, we've refreshed the metadata once and can
            // record target errors definitively.
            // NOTE(review): 'recordTargetErrors' is not passed to targetBatch() in the lines
            // visible here -- an argument line looks to have been dropped from the excerpt.
            bool recordTargetErrors = refreshedTargeter;
            Status targetStatus = batchOp.targetBatch( *_targeter,
                                                       &childBatches );
            if ( !targetStatus.isOK() ) {
                // Don't do anything until a targeter refresh
                refreshedTargeter = true;
                dassert( childBatches.size() == 0u );

            // Send all child batches

            size_t numSent = 0;
            size_t numToSend = childBatches.size();
            bool remoteMetadataChanging = false;
            // Each pass sends at most one batch per host, then drains all pending responses.
            while ( numSent != numToSend ) {

                // Collect batches out on the network, mapped by endpoint
                HostBatchMap pendingBatches;

                // Send side

                // Get as many batches as we can at once
                for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                    it != childBatches.end(); ++it ) {

                    // Collect the info needed to dispatch our targeted batch

                    TargetedWriteBatch* nextBatch = *it;
                    // If the batch is NULL, we sent it previously, so skip
                    if ( nextBatch == NULL ) continue;

                    // Figure out what host we need to dispatch our targeted batch
                    // NOTE(review): the two arguments below are not separated by a comma in
                    // this excerpt, so the call does not compile as shown.
                    ConnectionString shardHost;
                    Status resolveStatus = _resolver->chooseWriteHost( nextBatch->getEndpoint()
                                                                       &shardHost );
                    if ( !resolveStatus.isOK() ) {


                        // Record a resolve failure
                        // TODO: It may be necessary to refresh the cache if stale, or maybe just
                        // cancel and retarget the batch
                        WriteErrorDetail error;
                        buildErrorFrom( resolveStatus, &error );
                        batchOp.noteBatchError( *nextBatch, error );

                        // We're done with this batch
                        *it = NULL;

                    // If we already have a batch for this host, wait until the next time
                    HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost );
                    if ( pendingIt != pendingBatches.end() ) continue;

                    // We now have all the info needed to dispatch the batch

                    BatchedCommandRequest request( clientRequest.getBatchType() );
                    batchOp.buildBatchRequest( *nextBatch, &request );

                    // Internally we use full namespaces for request/response, but we send the
                    // command to a database with the collection name in the request.
                    NamespaceString nss( request.getNS() );
                    request.setNS( nss.coll() );

                    _dispatcher->addCommand( shardHost, nss.db(), request );

                    // Indicate we're done by setting the batch to NULL
                    // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                    // endpoints for the same host, so this should be pretty efficient without
                    // moving stuff around.
                    *it = NULL;

                    // Recv-side is responsible for cleaning up the nextBatch when used
                    pendingBatches.insert( make_pair( shardHost, nextBatch ) );

                // Send them all out
                numSent += pendingBatches.size();

                // Recv side

                while ( _dispatcher->numPending() > 0 ) {

                    // Get the response
                    ConnectionString shardHost;
                    BatchedCommandResponse response;
                    Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response );

                    // Get the TargetedWriteBatch to find where to put the response
                    // The scoped_ptr takes over the raw batch pointer for the rest of this
                    // iteration.
                    dassert( pendingBatches.find( shardHost ) != pendingBatches.end() );
                    TargetedWriteBatch* batchRaw = pendingBatches.find( shardHost )->second;
                    scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                    if ( dispatchStatus.isOK() ) {

                        TrackedErrors trackedErrors;
                        trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                        // Dispatch was ok, note response
                        batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                        // Note if anything was stale
                        const vector<ShardError*>& staleErrors =
                            trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                        if ( staleErrors.size() > 0 ) {
                            noteStaleResponses( staleErrors, _targeter );

                        // Remember if the shard is actively changing metadata right now
                        if ( isShardMetadataChanging( staleErrors ) ) {
                            remoteMetadataChanging = true;

                        // Remember that we successfully wrote to this shard
                        // NOTE: This will record lastOps for shards where we actually didn't update
                        // or delete any documents, which preserves old behavior but is conservative
                        // Default-constructed OpTime/OID are passed when the response lacks them.
                        _stats->noteWriteAt( shardHost,
                                             response.isLastOpSet() ? 
                                             response.getLastOp() : OpTime(),
                                             response.isElectionIdSet() ?
                                             response.getElectionId() : OID());
                    else {

                        // Error occurred dispatching, note it
                        WriteErrorDetail error;
                        buildErrorFrom( dispatchStatus, &error );
                        batchOp.noteBatchError( *batch, error );


            // If we're done, get out
            // NOTE(review): the statement controlled by this 'if' is missing in this excerpt.
            if ( batchOp.isFinished() )

            // MORE WORK TO DO

            // Refresh the targeter if we need to (no-op if nothing stale)

            bool targeterChanged = false;
            Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged );

            if ( !refreshStatus.isOK() ) {

                // It's okay if we can't refresh, we'll just record errors for the ops if
                // needed.
                warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                          << endl;

            // Ensure progress is being made toward completing the batch op

            // No progress means: same number of completed ops as last round, the targeter did
            // not change, and no shard reported that its metadata is changing.
            // NOTE(review): the body of the "no progress" branch is missing in this excerpt
            // (presumably it bumps numRoundsWithoutProgress -- confirm).
            int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed );
            if ( currCompletedOps == numCompletedOps && !targeterChanged
                 && !remoteMetadataChanging ) {
            else {
                numRoundsWithoutProgress = 0;
            numCompletedOps = currCompletedOps;

            // Give up with a batch-level NoProgressMade error once too many rounds have stalled.
            if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) {

                stringstream msg;
                msg << "no progress was made executing batch write op in " << clientRequest.getNS()
                    << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                    << " ops completed in " << rounds << " rounds total)";

                WriteErrorDetail error;
                buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error );
                batchOp.setBatchError( error );

        batchOp.buildClientResponse( clientResponse );
Пример #5
    /**
     * Drives a client write batch to completion (variant that keys pending batches by endpoint
     * pointer): on every round it refreshes the targeter, targets the remaining ops into child
     * batches, dispatches them, and feeds responses or dispatch errors back into batchOp. The
     * aggregated result is written to 'clientResponse' once batchOp reports it is finished.
     *
     * @param clientRequest  the batch received from the client.
     * @param clientResponse out: built from batchOp once the loop ends.
     *
     * NOTE(review): this excerpt has lost its closing braces and several statements. Visible
     * symptoms: 'numTargetErrors' and 'numStaleBatches' are never incremented,
     * 'recordTargetErrors' is never used, and the 'if ( !targetStatus.isOK() )' branch is empty.
     * Confirm against the complete source before relying on control flow.
     */
    void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                       BatchedCommandResponse* clientResponse ) {

        BatchWriteOp batchOp;
        batchOp.initClientRequest( &clientRequest );

        int numTargetErrors = 0;
        int numStaleBatches = 0;

        for ( int rounds = 0; !batchOp.isFinished(); rounds++ ) {

            // Refresh the targeter if we need to (no-op if nothing stale)

            Status refreshStatus = _targeter->refreshIfNeeded();

            if ( !refreshStatus.isOK() ) {

                // It's okay if we can't refresh, we'll just record errors for the ops if
                // needed.
                warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                          << endl;

            // Get child batches to send

            vector<TargetedWriteBatch*> childBatches;

            // Targeting errors can be caused by remote metadata changing (the collection could have
            // been dropped and recreated, for example with a new shard key).  If a remote metadata
            // change occurs *before* a client sends us a batch, we need to make sure that we don't
            // error out just because we're staler than the client - otherwise mongos will have
            // unpredictable behavior.
            // (If a metadata change happens *during* or *after* a client sends us a batch, however,
            // we make no guarantees about delivery.)
            // For this reason, we don't record targeting errors until we've refreshed our targeting
            // metadata at least once *after* receiving the client batch - at that point, we know:
            // 1) our new metadata is the same as the metadata when the client sent a batch, and so
            //    targeting errors are real.
            // OR
            // 2) our new metadata is a newer version than when the client sent a batch, and so
            //    the metadata must have changed after the client batch was sent.  We don't need to
            //    deliver in this case, since for all the client knows we may have gotten the batch
            //    exactly when the metadata changed.
            // If we've had a targeting error or stale error, we've refreshed the metadata once and
            // can record target errors.
            // NOTE(review): 'recordTargetErrors' is not passed to targetBatch() in the lines
            // visible here -- an argument line looks to have been dropped from the excerpt.
            bool recordTargetErrors = numTargetErrors > 0 || numStaleBatches > 0;

            Status targetStatus = batchOp.targetBatch( *_targeter,
                                                       &childBatches );
            // NOTE(review): the body of this branch is missing in this excerpt.
            if ( !targetStatus.isOK() ) {

            // Send all child batches

            size_t numSent = 0;
            // Each pass sends at most one batch per endpoint, then drains all pending responses.
            while ( numSent != childBatches.size() ) {

                // Collect batches out on the network, mapped by endpoint
                EndpointBatchMap pendingBatches;

                // Send side

                // Get as many batches as we can at once
                for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                    it != childBatches.end(); ++it ) {

                    TargetedWriteBatch* nextBatch = *it;
                    // If the batch is NULL, we sent it previously, so skip
                    if ( nextBatch == NULL ) continue;
                    const ConnectionString& hostEndpoint = nextBatch->getEndpoint().shardHost;

                    // The map is keyed by pointer to the ConnectionString.
                    // NOTE(review): presumably EndpointBatchMap compares the pointed-to values
                    // rather than the addresses -- confirm against its definition.
                    EndpointBatchMap::iterator pendingIt = pendingBatches.find( &hostEndpoint );

                    // If we already have a batch for this endpoint, continue
                    if ( pendingIt != pendingBatches.end() ) continue;

                    // Otherwise send it out to the endpoint via a command to a database

                    BatchedCommandRequest request( clientRequest.getBatchType() );
                    batchOp.buildBatchRequest( *nextBatch, &request );

                    // Internally we use full namespaces for request/response, but we send the
                    // command to a database with the collection name in the request.
                    NamespaceString nss( request.getNS() );
                    request.setNS( nss.coll() );

                    _dispatcher->addCommand( hostEndpoint, nss.db(), request );

                    // Indicate we're done by setting the batch to NULL
                    // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                    // endpoints for the same host, so this should be pretty efficient without
                    // moving stuff around.
                    *it = NULL;

                    // Recv-side is responsible for cleaning up the nextBatch when used
                    pendingBatches.insert( make_pair( &hostEndpoint, nextBatch ) );

                // Send them all out
                numSent += pendingBatches.size();

                // Recv side

                while ( _dispatcher->numPending() > 0 ) {

                    // Get the response
                    ConnectionString endpoint;
                    BatchedCommandResponse response;
                    Status dispatchStatus = _dispatcher->recvAny( &endpoint, &response );

                    // Get the TargetedWriteBatch to find where to put the response
                    // NOTE(review): the result of find() is dereferenced without being compared
                    // to end() in the lines visible here.
                    TargetedWriteBatch* batchRaw = pendingBatches.find( &endpoint )->second;
                    scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                    if ( dispatchStatus.isOK() ) {

                        TrackedErrors trackedErrors;
                        trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                        // Dispatch was ok, note response
                        batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                        // Note if anything was stale
                        const vector<ShardError*>& staleErrors =
                            trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                        if ( staleErrors.size() > 0 ) {
                            noteStaleResponses( staleErrors, _targeter );
                    else {

                        // Error occurred dispatching, note it
                        BatchedErrorDetail error;
                        buildErrorFrom( dispatchStatus, &error );
                        batchOp.noteBatchError( *batch, error );

        batchOp.buildClientResponse( clientResponse );
Пример #6
/**
 * Maps the result of a batched write command onto the supplied LastError object.
 *
 * @param request  the batch that was executed; its batch type and write-op count are consulted.
 * @param response the response produced for that batch.
 * @param error    output object on which update statistics are recorded.
 * @return true if a batch error was selected for reporting, false otherwise.
 *
 * NOTE(review): this excerpt appears to have lost several lines (closing braces, the start of
 * the call that ends on the "see code for details" line, and the body of the delete branch),
 * so the comments below describe only what is still visible.
 */
bool batchErrorToLastError(const BatchedCommandRequest& request,
                           const BatchedCommandResponse& response,
                           LastError* error) {
    // commandError owns an error synthesized from a failed command; lastBatchError is a
    // non-owning pointer to whichever error (if any) ends up being reported.
    unique_ptr<WriteErrorDetail> commandError;
    WriteErrorDetail* lastBatchError = NULL;

    if (!response.getOk()) {
        // Command-level error, all writes failed

        commandError.reset(new WriteErrorDetail);
        buildErrorFromResponse(response, commandError.get());
        lastBatchError = commandError.get();
    } else if (response.isErrDetailsSet()) {
        // The last error in the batch is always reported - this matches expected COE
        // semantics for insert batches. For updates and deletes, error is only reported
        // if the error was on the last item.

        const bool lastOpErrored = response.getErrDetails().back()->getIndex() ==
            static_cast<int>(request.sizeWriteOps() - 1);
        if (request.getBatchType() == BatchedCommandRequest::BatchType_Insert || lastOpErrored) {
            lastBatchError = response.getErrDetails().back();
    } else {
        // We don't care about write concern errors, these happen in legacy mode in GLE.

    // Record an error if one exists
    if (lastBatchError) {
        string errMsg = lastBatchError->getErrMessage();
        // NOTE(review): the beginning of the following statement is missing from this excerpt;
        // only its message argument, which falls back to a fixed string when errMsg is empty,
        // is visible.
                            errMsg.empty() ? "see code for details" : errMsg.c_str());
        return true;

    // Record write stats otherwise
    // NOTE: For multi-write batches, our semantics change a little because we don't have
    // un-aggregated "n" stats.
    if (request.getBatchType() == BatchedCommandRequest::BatchType_Update) {
        BSONObj upsertedId;
        if (response.isUpsertDetailsSet()) {
            // Only report the very last item's upserted id if applicable
            if (response.getUpsertDetails().back()->getIndex() + 1 ==
                static_cast<int>(request.sizeWriteOps())) {
                upsertedId = response.getUpsertDetails().back()->getUpsertedID();

        // Count every upsert listed in the response, not just the last one.
        int numUpserted = 0;
        if (response.isUpsertDetailsSet())
            numUpserted = response.sizeUpsertDetails();

        // Whatever part of n was not an upsert is treated as a match.
        int numMatched = response.getN() - numUpserted;
        dassert(numMatched >= 0);

        // Wrap upserted id in "upserted" field
        BSONObj leUpsertedId;
        if (!upsertedId.isEmpty())
            leUpsertedId = upsertedId.firstElement().wrap(kUpsertedFieldName);

        error->recordUpdate(numMatched > 0, response.getN(), leUpsertedId);
    } else if (request.getBatchType() == BatchedCommandRequest::BatchType_Delete) {
        // NOTE(review): the body of the delete branch is not present in this excerpt.

    return false;
Пример #7
    /**
     * Runs a batched write request and populates the response.
     *
     * Visible steps, in order: pick and parse a write concern (the request's, or
     * _defaultWriteConcern when the request's is unset or empty), validate it, apply the
     * write items via bulkExecute(), wait for the write concern when appropriate, react to a
     * trailing StaleShardVersion error, and finally fill in the response fields.
     *
     * @param request  the batch to execute.
     * @param response output; receives ok/err fields, upsert and error details, the write
     *                 concern error, last op / election id and the n / nModified counters.
     *
     * NOTE(review): closing braces (and possibly some statements, e.g. whatever followed the
     * invalid-write-concern handling) appear to be missing from this excerpt.
     */
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // TODO: Lift write concern parsing out of this entirely.
        WriteConcernOptions writeConcern;
        Status status = Status::OK();

        // wcDoc stays empty unless the request carries its own write concern.
        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();

        if ( wcDoc.isEmpty() ) {
            status = writeConcern.parse( _defaultWriteConcern );
        else {
            status = writeConcern.parse( wcDoc );

        if ( status.isOK() ) {
            status = validateWriteConcern( writeConcern );

        // A write concern that fails to parse or validate is reported as a command-level error.
        if ( !status.isOK() ) {
            response->setErrCode( status.code() );
            response->setErrMessage( status.reason() );
            response->setOk( false );
            dassert( response->isValid(NULL) );

        // "Silent" here means no w mode, zero nodes and no sync mode were requested; the flag
        // gates the detailed response construction further down.
        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        Timer commandTimer;

        // bulkExecute() fills the raw vectors; they are reached through OwnedPointerVector
        // wrappers (presumably responsible for deleting the elements - TODO confirm).
        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.

        bulkExecute( request, &upserted, &writeErrors );

        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            status = waitForWriteConcern( writeConcern, _client->getLastOp(), &res );

            // A failed wait is kept as a write concern error for the response.
            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );

        // Refresh metadata if needed

        // Only the last recorded write error is inspected for staleness.
        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                // First, we refresh metadata if we need to based on the requested version.

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );

                // NOTE(review): queueForMigrationCommit is not declared in this excerpt.
                if ( queueForMigrationCommit ) {

                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        // Re-check and log on every pass; each wait is bounded at 10 seconds.
                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );

        // Construct response

        // The command itself is reported as ok here; per-item failures travel in errDetails.
        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );

            // release() hands the write concern error over to the response.
            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );

            if ( anyReplEnabled() ) {
                response->setLastOp( _client->getLastOp() );
                if (theReplSet) {
                    response->setElectionId( theReplSet->getElectionId() );

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );

        dassert( response->isValid( NULL ) );
Пример #8
    /**
     * The core config write functionality.
     * Config writes run in two passes - the first is a quick check to ensure the config servers
     * are all reachable, the second runs the actual write.
     * TODO: Upgrade and move this logic to the config servers, a state machine implementation
     * is probably the next step.
     */
    // Sends a single-op config write to every host in _configHosts and merges the replies into
    // clientResponse.
    //
    // clientRequest  - the write to perform; asserted (debug) to target the "config" or "admin"
    //                  database and to contain exactly one write op.
    // clientResponse - output; receives the combined fsync errors or the combined write
    //                  responses.
    // fsyncCheck     - when true, an FsyncRequest is first dispatched to each config host
    //                  against "admin" and every reply is collected before the write.
    //
    // NOTE(review): closing braces and some statements (e.g. whatever followed the fsync error
    // handling) appear to be missing from this excerpt.
    void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest,
                                          BatchedCommandResponse* clientResponse,
                                          bool fsyncCheck ) {

        NamespaceString nss( clientRequest.getNS() );
        dassert( nss.db() == "config" || nss.db() == "admin" );
        dassert( clientRequest.sizeWriteOps() == 1u );

        if ( fsyncCheck ) {

            // Sanity check that all configs are still reachable using fsync, preserving legacy
            // behavior

            OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned;
            vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector();

            // Send side

            for ( vector<ConnectionString>::iterator it = _configHosts.begin();
                it != _configHosts.end(); ++it ) {
                ConnectionString& configHost = *it;
                FsyncRequest fsyncRequest;
                _dispatcher->addCommand( configHost, "admin", fsyncRequest );


            // Recv side

            bool fsyncError = false;
            while ( _dispatcher->numPending() > 0 ) {

                // Allocate the slot first so recvAny() can write the host and reply into it.
                fsyncResponses.push_back( new ConfigFsyncResponse() );
                ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back();
                Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost,
                                                              &fsyncResponse.response );

                // We've got to recv everything, no matter what
                if ( !dispatchStatus.isOK() ) {
                    fsyncError = true;
                    buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response );
                else if ( !fsyncResponse.response.getOk() ) {
                    fsyncError = true;

            if ( fsyncError ) {
                combineFsyncErrors( fsyncResponses, clientResponse );
            else {

        // Do the actual writes

        // The forwarded request is a clone of the client's with its namespace replaced by
        // nss.coll(); the database is passed separately to addCommand() below.
        BatchedCommandRequest configRequest( clientRequest.getBatchType() );
        clientRequest.cloneTo( &configRequest );
        configRequest.setNS( nss.coll() );

        OwnedPointerVector<ConfigResponse> responsesOwned;
        vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

        // Send the actual config writes

        // Get as many batches as we can at once
        for ( vector<ConnectionString>::iterator it = _configHosts.begin();
            it != _configHosts.end(); ++it ) {
            ConnectionString& configHost = *it;
            _dispatcher->addCommand( configHost, nss.db(), configRequest );

        // Send them all out

        // Recv side

        while ( _dispatcher->numPending() > 0 ) {

            // Get the response
            responses.push_back( new ConfigResponse() );
            ConfigResponse& configResponse = *responses.back();
            Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost,
                                                          &configResponse.response );

            // A dispatch failure is converted into an error response for that host.
            if ( !dispatchStatus.isOK() ) {
                buildErrorFrom( dispatchStatus, &configResponse.response );

        combineResponses( responses, clientResponse );
Пример #9
    /**
     * Applies the items of a batch one at a time through _safeWriter->safeWrite() and builds
     * a batch response from the GLE result returned for each item.
     *
     * The request's write concern is only passed along with the last item of the batch; for
     * the "config" database it is first rewritten by fixWCForConfig().
     *
     * @param conn     connection handed to the safe writer.
     * @param request  the batch to apply.
     * @param response output; accumulates n, upsert details, last op, write errors and any
     *                 write concern error.
     *
     * NOTE(review): several lines appear to be missing from this excerpt - closing braces, the
     * statement after the ordered-batch check, the first operand of needToEnforceWC and some
     * arguments of the enforceWriteConcern() call.
     */
    void BatchSafeWriter::safeWriteBatch( DBClientBase* conn,
                                          const BatchedCommandRequest& request,
                                          BatchedCommandResponse* response ) {

        const NamespaceString nss( request.getNS() );

        // N starts at zero, and we add to it for each item
        response->setN( 0 );

        for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) {

            // Break on first error if we're ordered
            // NOTE(review): the statement controlled by this condition is missing here.
            if ( request.getOrdered() && response->isErrDetailsSet() )

            BatchItemRef itemRef( &request, static_cast<int>( i ) );
            bool isLastItem = ( i == request.sizeWriteOps() - 1 );

            // Left empty for every item except the last one.
            BSONObj writeConcern;
            if ( isLastItem && request.isWriteConcernSet() ) {
                writeConcern = request.getWriteConcern();
                // Pre-2.4.2 mongods react badly to 'w' being set on config servers
                if ( nss.db() == "config" )
                    writeConcern = fixWCForConfig( writeConcern );

            BSONObj gleResult;
            GLEErrors errors;
            Status status = _safeWriter->safeWrite( conn, itemRef, writeConcern, &gleResult );
            if ( status.isOK() ) {
                status = extractGLEErrors( gleResult, &errors );

            // Failure to write or to parse the GLE result becomes a command-level error.
            if ( !status.isOK() ) {
                response->setOk( false );
                response->setErrCode( status.code() );
                response->setErrMessage( status.reason() );

            // STATS HANDLING

            GLEStats stats;
            extractGLEStats( gleResult, &stats );

            // Special case for making legacy "n" field result for insert match the write
            // command result.
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert
                 && !errors.writeError.get() ) {
                // n is always 0 for legacy inserts.
                dassert( stats.n == 0 );
                stats.n = 1;

            response->setN( response->getN() + stats.n );

            if ( !stats.upsertedId.isEmpty() ) {
                BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail;
                upsertedId->setIndex( i );
                upsertedId->setUpsertedID( stats.upsertedId );
                response->addToUpsertDetails( upsertedId );

            response->setLastOp( stats.lastOp );


            // If any error occurs (except stale config) the previous GLE was not enforced
            bool enforcedWC = !errors.writeError.get()
                              || errors.writeError->getErrCode() == ErrorCodes::StaleShardVersion;

            // Save write error
            // release() empties errors.writeError, which the dassert further down relies on.
            if ( errors.writeError.get() ) {
                errors.writeError->setIndex( i );
                response->addToErrDetails( errors.writeError.release() );


            // The last write is weird, since we enforce write concern and check the error through
            // the same GLE if possible.  If the last GLE was an error, the write concern may not
            // have been enforced in that same GLE, so we need to send another after resetting the
            // error.
            if ( isLastItem ) {

                // Try to enforce the write concern if everything succeeded (unordered or ordered)
                // OR if something succeeded and we're unordered.
                // NOTE(review): the first operand of this expression is missing here.
                bool needToEnforceWC =
                    || ( !request.getOrdered()
                         && response->sizeErrDetails() < request.sizeWriteOps() );

                if ( !enforcedWC && needToEnforceWC ) {
                    dassert( !errors.writeError.get() ); // emptied above

                    // Might have gotten a write concern validity error earlier, these are
                    // enforced even if the wc isn't applied, so we ignore.

                    // NOTE(review): the argument list of this call looks truncated.
                    Status status = _safeWriter->enforceWriteConcern( conn,
                                                                      &gleResult );

                    if ( status.isOK() ) {
                        status = extractGLEErrors( gleResult, &errors );

                    if ( !status.isOK() ) {
                        response->setOk( false );
                        response->setErrCode( status.code() );
                        response->setErrMessage( status.reason() );
                // END Write concern retry

                if ( errors.wcError.get() ) {
                    response->setWriteConcernError( errors.wcError.release() );

        response->setOk( true );
        dassert( response->isValid( NULL ) );
Пример #10
    /**
     * Maps the result of a batched write command onto the supplied LastError object: raises
     * the selected batch error, if any, and records update / delete statistics.
     *
     * @param request  the batch that was executed; its batch type and write-op count are used.
     * @param response the response produced for that batch.
     * @param error    output object that receives the raised error and the write stats.
     *
     * NOTE(review): closing braces and at least one statement (the one controlled by the first
     * isSingleUpsertedSet() check in the numUpserted computation) appear to be missing from
     * this excerpt.
     */
    void batchErrorToLastError( const BatchedCommandRequest& request,
                                const BatchedCommandResponse& response,
                                LastError* error ) {

        // topLevelError owns an error synthesized from a failed response; lastBatchError is a
        // non-owning pointer to whichever error (if any) is raised below.
        scoped_ptr<BatchedErrorDetail> topLevelError;
        BatchedErrorDetail* lastBatchError = NULL;

        if ( !response.getOk() ) {

            int code = response.getErrCode();

            // Check for batch error
            // We don't care about write concern errors, these happen in legacy mode in GLE
            if ( code != ErrorCodes::WriteConcernFailed && !response.isErrDetailsSet() ) {
                // Top-level error, all writes failed
                topLevelError.reset( new BatchedErrorDetail );
                buildErrorFromResponse( response, topLevelError.get() );
                lastBatchError = topLevelError.get();
            else if ( response.isErrDetailsSet() ) {
                // The last error in the batch is always reported - this matches expected COE
                // semantics for insert batches and works for single writes
                lastBatchError = response.getErrDetails().back();

        // Record an error if one exists
        if ( lastBatchError ) {
            error->raiseError( lastBatchError->getErrCode(),
                               lastBatchError->getErrMessage().c_str() );

        // Record write stats otherwise
        // NOTE: For multi-write batches, our semantics change a little because we don't have
        // un-aggregated "n" stats.
        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) {

            // A single upserted id takes precedence over the per-item upsert details.
            BSONObj upsertedId;
            if ( response.isSingleUpsertedSet() ) upsertedId = response.getSingleUpserted();
            else if( response.isUpsertDetailsSet() ) {
                // Only report the very last item's upserted id if applicable
                if ( response.getUpsertDetails().back()->getIndex() + 1
                     == static_cast<int>( request.sizeWriteOps() ) ) {
                    upsertedId = response.getUpsertDetails().back()->getUpsertedID();

            int numUpserted = 0;
            // NOTE(review): the statement controlled by this condition is missing here.
            if ( response.isSingleUpsertedSet() )
            else if ( response.isUpsertDetailsSet() )
                numUpserted += response.sizeUpsertDetails();

            // Whatever part of n was not an upsert is treated as an update of existing data.
            int numUpdated = response.getN() - numUpserted;
            dassert( numUpdated >= 0 );
            error->recordUpdate( numUpdated > 0, response.getN(), upsertedId );
        else if ( request.getBatchType() == BatchedCommandRequest::BatchType_Delete ) {
            error->recordDelete( response.getN() );
Пример #11
    /**
     * The core config write functionality.
     * Config writes run in two passes - the first is a quick check to ensure the config servers
     * are all reachable, the second runs the actual write.
     * TODO: Upgrade and move this logic to the config servers, a state machine implementation
     * is probably the next step.
     */
    // Sends a single-op config write to every host in _configHosts and merges the replies into
    // clientResponse, after a health check of the config servers.
    //
    // clientRequest  - the write to perform; asserted (debug) to target the "config" or "admin"
    //                  database and to contain exactly one write op.
    // clientResponse - output; receives either the health-check error message or the combined
    //                  write responses.
    //
    // NOTE(review): this excerpt appears to be missing closing braces, the start of the
    // health-check dispatch call, the body after the _checkConfigString() test and the tail of
    // the final recvAny() call.
    void ConfigCoordinator::executeBatch(const BatchedCommandRequest& clientRequest,
                                         BatchedCommandResponse* clientResponse) {

        const NamespaceString nss(clientRequest.getNS());

        // Should never use it for anything other than DBs residing on the config server
        dassert(nss.db() == "config" || nss.db() == "admin");
        dassert(clientRequest.sizeWriteOps() == 1u);

        // This is an opportunistic check that all config servers look healthy by calling
        // getLastError on each one of them. If there was some form of write/journaling error, get
        // last error would fail.
            for (vector<ConnectionString>::iterator it = _configHosts.begin();
                 it != _configHosts.end();
                 ++it) {

                // NOTE(review): the start of this statement is missing; only the command
                // document { getLastError: true, fsync: true } is visible.
                                        RawBSONSerializable(BSON("getLastError" << true <<
                                                                 "fsync" << true)));


            // Drain every pending reply, remembering whether any host failed the check.
            bool error = false;
            while (_dispatcher->numPending()) {
                ConnectionString host;
                RawBSONSerializable response;

                Status status = _dispatcher->recvAny(&host, &response);
                if (status.isOK()) {
                    BSONObj obj = response.toBSON();

                    LOG(3) << "Response " << obj.toString();

                    // If the ok field is anything other than 1, count it as error
                    if (!obj["ok"].trueValue()) {
                        error = true;
                        log() << "Config server check for host " << host
                              << " returned error: " << response;
                else {
                    error = true;
                    log() << "Config server check for host " << host
                          << " failed with status: " << status;

            // All responses should have been gathered by this point
            if (error) {
                clientResponse->setErrMessage("Could not verify that config servers were active"
                                              " and reachable before write");

        // NOTE(review): the body of this check is not present in this excerpt.
        if (!_checkConfigString(clientResponse)) {

        // Do the actual writes

        // The forwarded request is a clone of the client's with its namespace replaced by
        // nss.coll(); the database is passed separately to addCommand() below.
        BatchedCommandRequest configRequest( clientRequest.getBatchType() );
        clientRequest.cloneTo( &configRequest );
        configRequest.setNS( nss.coll() );

        OwnedPointerVector<ConfigResponse> responsesOwned;
        vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

        // Send the actual config writes

        // Get as many batches as we can at once
        for (vector<ConnectionString>::const_iterator it = _configHosts.begin();
             it != _configHosts.end();
             ++it) {

            const ConnectionString& configHost = *it;
            _dispatcher->addCommand(configHost, nss.db(), configRequest);

        // Send them all out

        // Recv side

        while (_dispatcher->numPending() > 0) {
            // Get the response
            responses.push_back(new ConfigResponse());

            ConfigResponse& configResponse = *responses.back();
            // NOTE(review): the remaining argument(s) of this call are missing here.
            Status dispatchStatus = _dispatcher->recvAny(&configResponse.configHost,

            // A dispatch failure is converted into an error response for that host.
            if (!dispatchStatus.isOK()) {
                buildErrorFrom(dispatchStatus, &configResponse.response);

        combineResponses(responses, clientResponse);
Пример #12
/**
 * Applies the single write item at position `index` of the batch, dispatching on the batch
 * type to applyInsert(), applyUpdate() or applyDelete(), and then handles slow-op logging
 * and profiling for that item.
 *
 * @param request the batch the item belongs to.
 * @param index   position of the item within the batch.
 * @param stats   write statistics object (not touched in the visible part of this excerpt).
 * @param error   output passed to the apply*() helper to describe a failure.
 * @return the success flag reported by the apply*() helper.
 *
 * NOTE(review): this excerpt appears to have lost closing braces, the case terminators /
 * default label of the switch and the body of the PageFaultException handler.
 */
bool WriteBatchExecutor::applyWriteItem( const BatchedCommandRequest& request,
        int index,
        WriteStats* stats,
        BatchedErrorDetail* error ) {
    const string& ns = request.getNS();

    // Clear operation's LastError before starting.
    _le->reset( true );

    //uint64_t itemTimeMicros = 0;
    bool opSuccess = true;

    // Each write operation executes in its own PageFaultRetryableSection.  This means that
    // a single batch can throw multiple PageFaultException's, which is not the case for
    // other operations.
    PageFaultRetryableSection s;
    while ( true ) {
        try {
            // Execute the write item as a child operation of the current operation.
            CurOp childOp( _client, _client->curop() );

            // TODO Modify CurOp "wrapped" constructor to take an opcode, so calling .reset()
            // is unneeded
            childOp.reset( _client->getRemote(), getOpCode( request.getBatchType() ) );

            OpDebug& opDebug = childOp.debug();
            opDebug.ns = ns;
                // A WriteContext for the namespace is constructed before the item is applied.
                Client::WriteContext ctx( ns );

                switch ( request.getBatchType() ) {
                case BatchedCommandRequest::BatchType_Insert:
                    opSuccess =
                        applyInsert( ns,
                                     request.getInsertRequest()->getDocumentsAt( index ),
                                     error );
                case BatchedCommandRequest::BatchType_Update:
                    opSuccess = applyUpdate( ns,
                                             *request.getUpdateRequest()->getUpdatesAt( index ),
                                             error );
                    // The remaining batch type is asserted (debug) to be a delete.
                    dassert( request.getBatchType() ==
                             BatchedCommandRequest::BatchType_Delete );
                    opSuccess = applyDelete( ns,
                                             *request.getDeleteRequest()->getDeletesAt( index ),
                                             error );
            //itemTimeMicros = childOp.totalTimeMicros();

            opDebug.executionTime = childOp.totalTimeMillis();

            // Log operation if running with at least "-v", or if exceeds slow threshold.
            if ( logger::globalLogDomain()->shouldLog( logger::LogSeverity::Debug( 1 ) )
                    || opDebug.executionTime > cmdLine.slowMS + childOp.getExpectedLatencyMs() ) {

                MONGO_TLOG(1) << opDebug.report( childOp ) << endl;

            // TODO Log operation if logLevel >= 3 and assertion thrown (as assembleResponse()
            // does).

            // Save operation to system.profile if shouldDBProfile().
            if ( childOp.shouldDBProfile( opDebug.executionTime ) ) {
                profile( *_client, getOpCode( request.getBatchType() ), childOp );
        // NOTE(review): the handler body is missing here; given the enclosing while ( true ),
        // presumably the item is retried after a page fault - confirm.
        catch ( PageFaultException& e ) {

    return opSuccess;
Пример #13
    /**
     * Applies the items of a batch one at a time through _safeWriter->safeWrite(), builds a
     * batch response from the GLE result of each item, and then enforces the request's write
     * concern once, after the loop.
     *
     * @param conn     connection handed to the safe writer.
     * @param request  the batch to apply.
     * @param response output; accumulates n, upsert details, last op, write errors and any
     *                 write concern error.
     *
     * NOTE(review): several lines appear to be missing from this excerpt - closing braces, the
     * statement after the ordered-batch check, the statement announced by the nModified
     * comment, and some arguments of the safeWrite() / enforceWriteConcern() calls.
     */
    void BatchSafeWriter::safeWriteBatch( DBClientBase* conn,
                                          const BatchedCommandRequest& request,
                                          BatchedCommandResponse* response ) {

        const NamespaceString nss( request.getNS() );

        // N starts at zero, and we add to it for each item
        response->setN( 0 );

        // GLE path always sets nModified to -1 (sentinel) to indicate we should omit it later.

        for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) {

            // Break on first error if we're ordered
            // NOTE(review): the statement controlled by this condition is missing here.
            if ( request.getOrdered() && response->isErrDetailsSet() )

            BatchItemRef itemRef( &request, static_cast<int>( i ) );

            BSONObj gleResult;
            GLEErrors errors;
            // NOTE(review): the argument list of this call looks truncated.
            Status status = _safeWriter->safeWrite( conn,
                                                    &gleResult );

            if ( status.isOK() ) {
                status = extractGLEErrors( gleResult, &errors );

            // Any failure here is reported as RemoteResultsUnavailable, with the original
            // status appended to the message.
            if ( !status.isOK() ) {
                response->setOk( false );
                response->setErrCode( ErrorCodes::RemoteResultsUnavailable );
                StringBuilder builder;
                builder << "could not get write error from safe write";
                builder << causedBy( status.toString() );
                response->setErrMessage( builder.str() );

            if ( errors.wcError.get() ) {
                response->setWriteConcernError( errors.wcError.release() );

            // STATS HANDLING

            GLEStats stats;
            extractGLEStats( gleResult, &stats );

            // Special case for making legacy "n" field result for insert match the write
            // command result.
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert
                 && !errors.writeError.get() ) {
                // n is always 0 for legacy inserts.
                dassert( stats.n == 0 );
                stats.n = 1;

            response->setN( response->getN() + stats.n );

            if ( !stats.upsertedId.isEmpty() ) {
                BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail;
                upsertedId->setIndex( i );
                upsertedId->setUpsertedID( stats.upsertedId );
                response->addToUpsertDetails( upsertedId );

            response->setLastOp( stats.lastOp );

            // Save write error
            if ( errors.writeError.get() ) {
                errors.writeError->setIndex( i );
                response->addToErrDetails( errors.writeError.release() );


        // The last write is weird, since we enforce write concern and check the error through
        // the same GLE if possible.  If the last GLE was an error, the write concern may not
        // have been enforced in that same GLE, so we need to send another after resetting the
        // error.

        BSONObj writeConcern;
        if ( request.isWriteConcernSet() ) {
            writeConcern = request.getWriteConcern();
            // Pre-2.4.2 mongods react badly to 'w' being set on config servers
            if ( nss.db() == "config" )
                writeConcern = fixWCForConfig( writeConcern );

        // Enforcement is skipped when the write concern compares equal to either the
        // Acknowledged or the Unacknowledged constant.
        bool needToEnforceWC = WriteConcernOptions::Acknowledged.woCompare(writeConcern) != 0 &&
                WriteConcernOptions::Unacknowledged.woCompare(writeConcern) != 0;

        if ( needToEnforceWC &&
                ( !response->isErrDetailsSet() ||
                        ( !request.getOrdered() &&
                                // Not all errored. Note: implicit response->isErrDetailsSet().
                                response->sizeErrDetails() < request.sizeWriteOps() ))) {

            // Might have gotten a write concern validity error earlier, these are
            // enforced even if the wc isn't applied, so we ignore.

            const string dbName( nss.db().toString() );

            Status status( Status::OK() );

            if ( response->isErrDetailsSet() ) {
                const WriteErrorDetail* lastError = response->getErrDetails().back();

                // If last write op was an error.
                if ( lastError->getIndex() == static_cast<int>( request.sizeWriteOps() - 1 )) {
                    // Reset previous errors so we can apply the write concern no matter what
                    // as long as it is valid.
                    status = _safeWriter->clearErrors( conn, dbName );

            BSONObj gleResult;
            if ( status.isOK() ) {
                // NOTE(review): the argument list of this call looks truncated.
                status = _safeWriter->enforceWriteConcern( conn,
                                                           &gleResult );

            GLEErrors errors;
            if ( status.isOK() ) {
                status = extractGLEErrors( gleResult, &errors );
            // A failure anywhere in the enforcement sequence is reported as a write concern
            // error built from the failing status.
            if ( !status.isOK() ) {
                auto_ptr<WCErrorDetail> wcError( new WCErrorDetail );
                wcError->setErrCode( status.code() );
                wcError->setErrMessage( status.reason() );
                response->setWriteConcernError( wcError.release() ); 
            else if ( errors.wcError.get() ) {
                response->setWriteConcernError( errors.wcError.release() );

        response->setOk( true );
        dassert( response->isValid( NULL ) );