示例#1
0
    void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                       BatchedCommandResponse* clientResponse ) {

        BatchWriteOp batchOp;
        batchOp.initClientRequest( &clientRequest );

        // Current batch status
        bool refreshedTargeter = false;
        int rounds = 0;
        int numCompletedOps = 0;
        int numRoundsWithoutProgress = 0;

        while ( !batchOp.isFinished() ) {

            //
            // Get child batches to send using the targeter
            //
            // Targeting errors can be caused by remote metadata changing (the collection could have
            // been dropped and recreated, for example with a new shard key).  If a remote metadata
            // change occurs *before* a client sends us a batch, we need to make sure that we don't
            // error out just because we're staler than the client - otherwise mongos will be have
            // unpredictable behavior.
            //
            // (If a metadata change happens *during* or *after* a client sends us a batch, however,
            // we make no guarantees about delivery.)
            //
            // For this reason, we don't record targeting errors until we've refreshed our targeting
            // metadata at least once *after* receiving the client batch - at that point, we know:
            //
            // 1) our new metadata is the same as the metadata when the client sent a batch, and so
            //    targeting errors are real.
            // OR
            // 2) our new metadata is a newer version than when the client sent a batch, and so
            //    the metadata must have changed after the client batch was sent.  We don't need to
            //    deliver in this case, since for all the client knows we may have gotten the batch
            //    exactly when the metadata changed.
            //

            vector<TargetedWriteBatch*> childBatches;

            // If we've already had a targeting error, we've refreshed the metadata once and can
            // record target errors definitively.
            bool recordTargetErrors = refreshedTargeter;
            Status targetStatus = batchOp.targetBatch( *_targeter,
                                                       recordTargetErrors,
                                                       &childBatches );
            if ( !targetStatus.isOK() ) {
                // Don't do anything until a targeter refresh
                _targeter->noteCouldNotTarget();
                refreshedTargeter = true;
                ++_stats->numTargetErrors;
                dassert( childBatches.size() == 0u );
            }

            //
            // Send all child batches
            //

            size_t numSent = 0;
            size_t numToSend = childBatches.size();
            bool remoteMetadataChanging = false;
            while ( numSent != numToSend ) {

                // Collect batches out on the network, mapped by endpoint
                HostBatchMap pendingBatches;

                //
                // Send side
                //

                // Get as many batches as we can at once
                for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                    it != childBatches.end(); ++it ) {

                    //
                    // Collect the info needed to dispatch our targeted batch
                    //

                    TargetedWriteBatch* nextBatch = *it;
                    // If the batch is NULL, we sent it previously, so skip
                    if ( nextBatch == NULL ) continue;

                    // Figure out what host we need to dispatch our targeted batch
                    ConnectionString shardHost;
                    Status resolveStatus = _resolver->chooseWriteHost( nextBatch->getEndpoint()
                                                                           .shardName,
                                                                       &shardHost );
                    if ( !resolveStatus.isOK() ) {

                        ++_stats->numResolveErrors;

                        // Record a resolve failure
                        // TODO: It may be necessary to refresh the cache if stale, or maybe just
                        // cancel and retarget the batch
                        WriteErrorDetail error;
                        buildErrorFrom( resolveStatus, &error );
                        batchOp.noteBatchError( *nextBatch, error );

                        // We're done with this batch
                        *it = NULL;
                        --numToSend;
                        continue;
                    }

                    // If we already have a batch for this host, wait until the next time
                    HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost );
                    if ( pendingIt != pendingBatches.end() ) continue;

                    //
                    // We now have all the info needed to dispatch the batch
                    //

                    BatchedCommandRequest request( clientRequest.getBatchType() );
                    batchOp.buildBatchRequest( *nextBatch, &request );

                    // Internally we use full namespaces for request/response, but we send the
                    // command to a database with the collection name in the request.
                    NamespaceString nss( request.getNS() );
                    request.setNS( nss.coll() );

                    _dispatcher->addCommand( shardHost, nss.db(), request );

                    // Indicate we're done by setting the batch to NULL
                    // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                    // endpoints for the same host, so this should be pretty efficient without
                    // moving stuff around.
                    *it = NULL;

                    // Recv-side is responsible for cleaning up the nextBatch when used
                    pendingBatches.insert( make_pair( shardHost, nextBatch ) );
                }

                // Send them all out
                _dispatcher->sendAll();
                numSent += pendingBatches.size();

                //
                // Recv side
                //

                while ( _dispatcher->numPending() > 0 ) {

                    // Get the response
                    ConnectionString shardHost;
                    BatchedCommandResponse response;
                    Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response );

                    // Get the TargetedWriteBatch to find where to put the response
                    dassert( pendingBatches.find( shardHost ) != pendingBatches.end() );
                    TargetedWriteBatch* batchRaw = pendingBatches.find( shardHost )->second;
                    scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                    if ( dispatchStatus.isOK() ) {

                        TrackedErrors trackedErrors;
                        trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                        // Dispatch was ok, note response
                        batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                        // Note if anything was stale
                        const vector<ShardError*>& staleErrors =
                            trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                        if ( staleErrors.size() > 0 ) {
                            noteStaleResponses( staleErrors, _targeter );
                            ++_stats->numStaleBatches;
                        }

                        // Remember if the shard is actively changing metadata right now
                        if ( isShardMetadataChanging( staleErrors ) ) {
                            remoteMetadataChanging = true;
                        }

                        // Remember that we successfully wrote to this shard
                        // NOTE: This will record lastOps for shards where we actually didn't update
                        // or delete any documents, which preserves old behavior but is conservative
                        _stats->noteWriteAt( shardHost,
                                             response.isLastOpSet() ? 
                                             response.getLastOp() : OpTime(),
                                             response.isElectionIdSet() ?
                                             response.getElectionId() : OID());
                    }
                    else {

                        // Error occurred dispatching, note it
                        WriteErrorDetail error;
                        buildErrorFrom( dispatchStatus, &error );
                        batchOp.noteBatchError( *batch, error );
                    }
                }
            }

            ++rounds;
            ++_stats->numRounds;

            // If we're done, get out
            if ( batchOp.isFinished() )
                break;

            // MORE WORK TO DO

            //
            // Refresh the targeter if we need to (no-op if nothing stale)
            //

            bool targeterChanged = false;
            Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged );

            if ( !refreshStatus.isOK() ) {

                // It's okay if we can't refresh, we'll just record errors for the ops if
                // needed.
                warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                          << endl;
            }

            //
            // Ensure progress is being made toward completing the batch op
            //

            int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed );
            if ( currCompletedOps == numCompletedOps && !targeterChanged
                 && !remoteMetadataChanging ) {
                ++numRoundsWithoutProgress;
            }
            else {
                numRoundsWithoutProgress = 0;
            }
            numCompletedOps = currCompletedOps;

            if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) {

                stringstream msg;
                msg << "no progress was made executing batch write op in " << clientRequest.getNS()
                    << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                    << " ops completed in " << rounds << " rounds total)";

                WriteErrorDetail error;
                buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error );
                batchOp.setBatchError( error );
                break;
            }
        }

        batchOp.buildClientResponse( clientResponse );
    }
示例#2
0
    void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                       BatchedCommandResponse* clientResponse ) {

        BatchWriteOp batchOp;
        batchOp.initClientRequest( &clientRequest );

        int numTargetErrors = 0;
        int numStaleBatches = 0;

        for ( int rounds = 0; !batchOp.isFinished(); rounds++ ) {

            //
            // Refresh the targeter if we need to (no-op if nothing stale)
            //

            Status refreshStatus = _targeter->refreshIfNeeded();

            if ( !refreshStatus.isOK() ) {

                // It's okay if we can't refresh, we'll just record errors for the ops if
                // needed.
                warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                          << endl;
            }

            //
            // Get child batches to send
            //

            vector<TargetedWriteBatch*> childBatches;

            //
            // Targeting errors can be caused by remote metadata changing (the collection could have
            // been dropped and recreated, for example with a new shard key).  If a remote metadata
            // change occurs *before* a client sends us a batch, we need to make sure that we don't
            // error out just because we're staler than the client - otherwise mongos will be have
            // unpredictable behavior.
            //
            // (If a metadata change happens *during* or *after* a client sends us a batch, however,
            // we make no guarantees about delivery.)
            //
            // For this reason, we don't record targeting errors until we've refreshed our targeting
            // metadata at least once *after* receiving the client batch - at that point, we know:
            //
            // 1) our new metadata is the same as the metadata when the client sent a batch, and so
            //    targeting errors are real.
            // OR
            // 2) our new metadata is a newer version than when the client sent a batch, and so
            //    the metadata must have changed after the client batch was sent.  We don't need to
            //    deliver in this case, since for all the client knows we may have gotten the batch
            //    exactly when the metadata changed.
            //
            // If we've had a targeting error or stale error, we've refreshed the metadata once and
            // can record target errors.
            bool recordTargetErrors = numTargetErrors > 0 || numStaleBatches > 0;

            Status targetStatus = batchOp.targetBatch( *_targeter,
                                                       recordTargetErrors,
                                                       &childBatches );
            if ( !targetStatus.isOK() ) {
                _targeter->noteCouldNotTarget();
                ++numTargetErrors;
                continue;
            }

            //
            // Send all child batches
            //

            size_t numSent = 0;
            while ( numSent != childBatches.size() ) {

                // Collect batches out on the network, mapped by endpoint
                EndpointBatchMap pendingBatches;

                //
                // Send side
                //

                // Get as many batches as we can at once
                for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                    it != childBatches.end(); ++it ) {

                    TargetedWriteBatch* nextBatch = *it;
                    // If the batch is NULL, we sent it previously, so skip
                    if ( nextBatch == NULL ) continue;
                    const ConnectionString& hostEndpoint = nextBatch->getEndpoint().shardHost;

                    EndpointBatchMap::iterator pendingIt = pendingBatches.find( &hostEndpoint );

                    // If we already have a batch for this endpoint, continue
                    if ( pendingIt != pendingBatches.end() ) continue;

                    // Otherwise send it out to the endpoint via a command to a database

                    BatchedCommandRequest request( clientRequest.getBatchType() );
                    batchOp.buildBatchRequest( *nextBatch, &request );

                    // Internally we use full namespaces for request/response, but we send the
                    // command to a database with the collection name in the request.
                    NamespaceString nss( request.getNS() );
                    request.setNS( nss.coll() );

                    _dispatcher->addCommand( hostEndpoint, nss.db(), request );

                    // Indicate we're done by setting the batch to NULL
                    // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                    // endpoints for the same host, so this should be pretty efficient without
                    // moving stuff around.
                    *it = NULL;

                    // Recv-side is responsible for cleaning up the nextBatch when used
                    pendingBatches.insert( make_pair( &hostEndpoint, nextBatch ) );
                }

                // Send them all out
                _dispatcher->sendAll();
                numSent += pendingBatches.size();

                //
                // Recv side
                //

                while ( _dispatcher->numPending() > 0 ) {

                    // Get the response
                    ConnectionString endpoint;
                    BatchedCommandResponse response;
                    Status dispatchStatus = _dispatcher->recvAny( &endpoint, &response );

                    // Get the TargetedWriteBatch to find where to put the response
                    TargetedWriteBatch* batchRaw = pendingBatches.find( &endpoint )->second;
                    scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                    if ( dispatchStatus.isOK() ) {

                        TrackedErrors trackedErrors;
                        trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                        // Dispatch was ok, note response
                        batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                        // Note if anything was stale
                        const vector<ShardError*>& staleErrors =
                            trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                        if ( staleErrors.size() > 0 ) {
                            noteStaleResponses( staleErrors, _targeter );
                            ++numStaleBatches;
                        }
                    }
                    else {

                        // Error occurred dispatching, note it
                        BatchedErrorDetail error;
                        buildErrorFrom( dispatchStatus, &error );
                        batchOp.noteBatchError( *batch, error );
                    }
                }
            }
        }

        batchOp.buildClientResponse( clientResponse );
    }