Example #1
void BatchWriteOp::buildBatchRequest(const TargetedWriteBatch& targetedBatch,
                                     BatchedCommandRequest* request) const {
    request->setNS(_clientRequest->getNS());
    request->setShouldBypassValidation(_clientRequest->shouldBypassValidation());

    const vector<TargetedWrite*>& targetedWrites = targetedBatch.getWrites();

    for (vector<TargetedWrite*>::const_iterator it = targetedWrites.begin();
         it != targetedWrites.end();
         ++it) {
        const WriteOpRef& writeOpRef = (*it)->writeOpRef;
        BatchedCommandRequest::BatchType batchType = _clientRequest->getBatchType();

        // NOTE:  We copy the batch items themselves here from the client request
        // TODO: This could be inefficient, maybe we want to just reference in the future
        if (batchType == BatchedCommandRequest::BatchType_Insert) {
            BatchedInsertRequest* clientInsertRequest = _clientRequest->getInsertRequest();
            BSONObj insertDoc = clientInsertRequest->getDocumentsAt(writeOpRef.first);
            request->getInsertRequest()->addToDocuments(insertDoc);
        } else if (batchType == BatchedCommandRequest::BatchType_Update) {
            BatchedUpdateRequest* clientUpdateRequest = _clientRequest->getUpdateRequest();
            BatchedUpdateDocument* updateDoc = new BatchedUpdateDocument;
            clientUpdateRequest->getUpdatesAt(writeOpRef.first)->cloneTo(updateDoc);
            request->getUpdateRequest()->addToUpdates(updateDoc);
        } else {
            dassert(batchType == BatchedCommandRequest::BatchType_Delete);
            BatchedDeleteRequest* clientDeleteRequest = _clientRequest->getDeleteRequest();
            BatchedDeleteDocument* deleteDoc = new BatchedDeleteDocument;
            clientDeleteRequest->getDeletesAt(writeOpRef.first)->cloneTo(deleteDoc);
            request->getDeleteRequest()->addToDeletes(deleteDoc);
        }

        // TODO: We can add logic here to allow aborting individual ops
        // if ( NULL == response ) {
        //    ->responses.erase( it++ );
        //    continue;
        //}
    }

    if (_clientRequest->isWriteConcernSet()) {
        if (_clientRequest->isVerboseWC()) {
            request->setWriteConcern(_clientRequest->getWriteConcern());
        } else {
            // Mongos needs to send to the shard with w > 0 so it will be able to
            // see the writeErrors.
            request->setWriteConcern(upgradeWriteConcern(_clientRequest->getWriteConcern()));
        }
    }

    if (!request->isOrderedSet()) {
        request->setOrdered(_clientRequest->getOrdered());
    }

    unique_ptr<BatchedRequestMetadata> requestMetadata(new BatchedRequestMetadata());
    requestMetadata->setShardVersion(
        ChunkVersionAndOpTime(targetedBatch.getEndpoint().shardVersion));
    requestMetadata->setSession(0);

    request->setMetadata(requestMetadata.release());
}
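
Example #1's non-verbose write-concern branch forwards an upgraded write concern via upgradeWriteConcern(), which is not part of this listing. A minimal sketch of what such an upgrade could look like, assuming the write concern is a plain BSONObj whose "w" field may be 0 (hypothetical, not the listed source):

static BSONObj upgradeWriteConcern(const BSONObj& origWriteConcern) {
    BSONObjBuilder upgradedWriteConcern;

    for (BSONObjIterator it(origWriteConcern); it.more();) {
        BSONElement elem = it.next();

        if (strcmp(elem.fieldName(), "w") == 0 && elem.isNumber() && elem.numberInt() == 0) {
            // Bump w:0 to w:1 so the shard acknowledges the write and reports writeErrors
            upgradedWriteConcern.append("w", 1);
        } else {
            upgradedWriteConcern.append(elem);
        }
    }

    return upgradedWriteConcern.obj();
}

Bumping w:0 to w:1 makes the shard acknowledge the write, so mongos can still see the per-item writeErrors even when the client requested an unacknowledged write.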
Example #2
// Helper function to cancel all the write ops of targeted batches in a map
static void cancelBatches(const WriteErrorDetail& why,
                          WriteOp* writeOps,
                          TargetedBatchMap* batchMap) {
    // Collect all the writeOps that are currently targeted
    for (TargetedBatchMap::iterator it = batchMap->begin(); it != batchMap->end();) {
        TargetedWriteBatch* batch = it->second;
        const vector<TargetedWrite*>& writes = batch->getWrites();

        for (vector<TargetedWrite*>::const_iterator writeIt = writes.begin();
             writeIt != writes.end();
             ++writeIt) {
            TargetedWrite* write = *writeIt;

            // NOTE: We may repeatedly cancel a write op here, but that's fast and we want to
            // cancel before erasing the TargetedWrite* (which owns the cancelled targeting
            // info) for reporting reasons.
            writeOps[write->writeOpRef.first].cancelWrites(&why);
        }

        // Note that we need to *erase* first, *then* delete, since the map keys are ptrs from
        // the values
        batchMap->erase(it++);
        delete batch;
    }
    batchMap->clear();
}
Example #3
    // Helper function to cancel all the write ops of targeted batches in a map
    static void cancelBatches( const BatchedErrorDetail& why,
                               WriteOp* writeOps,
                               TargetedBatchMap* batchMap ) {

        set<WriteOp*> targetedWriteOps;

        // Collect all the writeOps that are currently targeted
        for ( TargetedBatchMap::iterator it = batchMap->begin(); it != batchMap->end(); ) {

            TargetedWriteBatch* batch = it->second;
            const vector<TargetedWrite*>& writes = batch->getWrites();

            for ( vector<TargetedWrite*>::const_iterator writeIt = writes.begin();
                writeIt != writes.end(); ++writeIt ) {

                TargetedWrite* write = *writeIt;
                targetedWriteOps.insert( &writeOps[write->writeOpRef.first] );
            }

            // Note that we need to *erase* first, *then* delete, since the map keys are ptrs from
            // the values
            batchMap->erase( it++ );
            delete batch;
        }
        batchMap->clear();

        // Cancel all the write ops we found above
        for ( set<WriteOp*>::iterator it = targetedWriteOps.begin(); it != targetedWriteOps.end();
            ++it ) {
            WriteOp* writeOp = *it;
            writeOp->cancelWrites( &why );
        }
    }
Example #4
    void BatchWriteOp::buildBatchRequest( const TargetedWriteBatch& targetedBatch,
                                          BatchedCommandRequest* request ) const {

        request->setNS( _clientRequest->getNS() );
        request->setShardVersion( targetedBatch.getEndpoint().shardVersion );

        const vector<TargetedWrite*>& targetedWrites = targetedBatch.getWrites();

        for ( vector<TargetedWrite*>::const_iterator it = targetedWrites.begin();
            it != targetedWrites.end(); ++it ) {

            const WriteOpRef& writeOpRef = ( *it )->writeOpRef;
            BatchedCommandRequest::BatchType batchType = _clientRequest->getBatchType();

            // NOTE:  We copy the batch items themselves here from the client request
            // TODO: This could be inefficient, maybe we want to just reference in the future
            if ( batchType == BatchedCommandRequest::BatchType_Insert ) {
                BatchedInsertRequest* clientInsertRequest = _clientRequest->getInsertRequest();
                BSONObj insertDoc = clientInsertRequest->getDocumentsAt( writeOpRef.first );
                request->getInsertRequest()->addToDocuments( insertDoc );
            }
            else if ( batchType == BatchedCommandRequest::BatchType_Update ) {
                BatchedUpdateRequest* clientUpdateRequest = _clientRequest->getUpdateRequest();
                BatchedUpdateDocument* updateDoc = new BatchedUpdateDocument;
                clientUpdateRequest->getUpdatesAt( writeOpRef.first )->cloneTo( updateDoc );
                request->getUpdateRequest()->addToUpdates( updateDoc );
            }
            else {
                dassert( batchType == BatchedCommandRequest::BatchType_Delete );
                BatchedDeleteRequest* clientDeleteRequest = _clientRequest->getDeleteRequest();
                BatchedDeleteDocument* deleteDoc = new BatchedDeleteDocument;
                clientDeleteRequest->getDeletesAt( writeOpRef.first )->cloneTo( deleteDoc );
                request->getDeleteRequest()->addToDeletes( deleteDoc );
            }

            // TODO: We can add logic here to allow aborting individual ops
            //if ( NULL == response ) {
            //    ->responses.erase( it++ );
            //    continue;
            //}
        }

        if ( _clientRequest->isWriteConcernSet() ) {
            request->setWriteConcern( _clientRequest->getWriteConcern() );
        }
        if ( _clientRequest->isContinueOnErrorSet() ) {
            request->setContinueOnError( _clientRequest->getContinueOnError() );
        }
        request->setSession( 0 );
    }
Example #5
void BatchWriteOp::noteBatchError(const TargetedWriteBatch& targetedBatch,
                                  const WriteErrorDetail& error) {
    // Treat errors to get a batch response as failures of the contained writes
    BatchedCommandResponse emulatedResponse;
    toWriteErrorResponse(
        error, _clientRequest->getOrdered(), targetedBatch.getWrites().size(), &emulatedResponse);

    noteBatchResponse(targetedBatch, emulatedResponse, NULL);
}
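
noteBatchError() relies on toWriteErrorResponse() to turn a single batch-level error into an emulated shard response. That helper is not shown in this listing; a plausible sketch consistent with how it is used above (assuming WriteErrorDetail supports cloneTo() like the other batch types here):

static void toWriteErrorResponse(const WriteErrorDetail& error,
                                 bool ordered,
                                 int numWrites,
                                 BatchedCommandResponse* writeErrorResponse) {
    writeErrorResponse->setOk(true);
    writeErrorResponse->setN(0);

    // For an ordered batch, nothing after the first failed write would have been attempted
    int numErrors = ordered ? 1 : numWrites;
    for (int i = 0; i < numErrors; i++) {
        WriteErrorDetail* errorClone = new WriteErrorDetail;
        error.cloneTo(errorClone);
        errorClone->setIndex(i);
        writeErrorResponse->addToErrDetails(errorClone);  // the response owns the clone
    }
}

noteBatchResponse() then processes the emulated response exactly as if every contained write had failed on the shard.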
Example #6
    // Helper function to cancel all the write ops of targeted batch.
    static void cancelBatch( const TargetedWriteBatch& targetedBatch,
                             WriteOp* writeOps,
                             const WriteErrorDetail& why ) {
        const vector<TargetedWrite*>& writes = targetedBatch.getWrites();

        for ( vector<TargetedWrite*>::const_iterator writeIt = writes.begin();
            writeIt != writes.end(); ++writeIt ) {

            TargetedWrite* write = *writeIt;
            // NOTE: We may repeatedly cancel a write op here, but that's fast.
            writeOps[write->writeOpRef.first].cancelWrites( &why );
        }
    }
Example #7
void BatchWriteOp::noteBatchResponse(const TargetedWriteBatch& targetedBatch,
                                     const BatchedCommandResponse& response,
                                     TrackedErrors* trackedErrors) {
    if (!response.getOk()) {
        WriteErrorDetail error;
        cloneCommandErrorTo(response, &error);

        // Treat command errors exactly like other failures of the batch
        // Note that no errors will be tracked from these failures - as-designed
        noteBatchError(targetedBatch, error);
        return;
    }

    dassert(response.getOk());

    // Stop tracking targeted batch
    _targeted.erase(&targetedBatch);

    // Increment stats for this batch
    incBatchStats(_clientRequest->getBatchType(), response, _stats.get());

    //
    // Assign errors to particular items.
    // Write Concern errors are stored and handled later.
    //

    // Special handling for write concern errors, save for later
    if (response.isWriteConcernErrorSet()) {
        unique_ptr<ShardWCError> wcError(
            new ShardWCError(targetedBatch.getEndpoint(), *response.getWriteConcernError()));
        _wcErrors.mutableVector().push_back(wcError.release());
    }

    vector<WriteErrorDetail*> itemErrors;

    // Handle batch and per-item errors
    if (response.isErrDetailsSet()) {
        // Per-item errors were set
        itemErrors.insert(
            itemErrors.begin(), response.getErrDetails().begin(), response.getErrDetails().end());

        // Sort per-item errors by index
        std::sort(itemErrors.begin(), itemErrors.end(), WriteErrorDetailComp());
    }

    //
    // Go through all pending responses of the op and the sorted remote responses, and populate
    // errors. This will either set all errors to the batch error or apply per-item errors as
    // needed.
    //
    // If the batch is ordered, cancel all writes after the first error for retargeting.
    //

    bool ordered = _clientRequest->getOrdered();

    vector<WriteErrorDetail*>::iterator itemErrorIt = itemErrors.begin();
    int index = 0;
    WriteErrorDetail* lastError = NULL;
    for (vector<TargetedWrite*>::const_iterator it = targetedBatch.getWrites().begin();
         it != targetedBatch.getWrites().end();
         ++it, ++index) {
        const TargetedWrite* write = *it;
        WriteOp& writeOp = _writeOps[write->writeOpRef.first];

        dassert(writeOp.getWriteState() == WriteOpState_Pending);

        // See if we have an error for the write
        WriteErrorDetail* writeError = NULL;

        if (itemErrorIt != itemErrors.end() && (*itemErrorIt)->getIndex() == index) {
            // We have a per-item error for this write op's index
            writeError = *itemErrorIt;
            ++itemErrorIt;
        }

        // Finish the response (with error, if needed)
        if (NULL == writeError) {
            if (!ordered || !lastError) {
                writeOp.noteWriteComplete(*write);
            } else {
                // We didn't actually apply this write - cancel so we can retarget
                dassert(writeOp.getNumTargeted() == 1u);
                writeOp.cancelWrites(lastError);
            }
        } else {
            writeOp.noteWriteError(*write, *writeError);
            lastError = writeError;
        }
    }

    // Track errors we care about, whether batch or individual errors
    if (NULL != trackedErrors) {
        trackErrors(targetedBatch.getEndpoint(), itemErrors, trackedErrors);
    }

    // Track upserted ids if we need to
    if (response.isUpsertDetailsSet()) {
        const vector<BatchedUpsertDetail*>& upsertedIds = response.getUpsertDetails();
        for (vector<BatchedUpsertDetail*>::const_iterator it = upsertedIds.begin();
             it != upsertedIds.end();
             ++it) {
            // The child upserted details don't have the correct index for the full batch
            const BatchedUpsertDetail* childUpsertedId = *it;

            // Work backward from the child batch item index to the batch item index
            int childBatchIndex = childUpsertedId->getIndex();
            int batchIndex = targetedBatch.getWrites()[childBatchIndex]->writeOpRef.first;

            // Push the upserted id with the correct index into the batch upserted ids
            BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail;
            upsertedId->setIndex(batchIndex);
            upsertedId->setUpsertedID(childUpsertedId->getUpsertedID());
            _upsertedIds.mutableVector().push_back(upsertedId);
        }
    }
}
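
The per-item errors above are sorted with WriteErrorDetailComp before they are matched against the batch. The comparator is not part of the listing, but given the single forward pass over targetedBatch.getWrites(), it presumably just orders error details by item index, roughly:

struct WriteErrorDetailComp {
    bool operator()(const WriteErrorDetail* errorA, const WriteErrorDetail* errorB) const {
        // Order per-item errors by the index they refer to within the child batch
        return errorA->getIndex() < errorB->getIndex();
    }
};

With both the writes and the errors in index order, a single itemErrorIt cursor is enough to pair each error with its write op.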
Example #8
Status BatchWriteOp::targetBatch(OperationContext* txn,
                                 const NSTargeter& targeter,
                                 bool recordTargetErrors,
                                 vector<TargetedWriteBatch*>* targetedBatches) {
    //
    // Targeting of unordered batches is fairly simple - each remaining write op is targeted,
    // and each of those targeted writes are grouped into a batch for a particular shard
    // endpoint.
    //
    // Targeting of ordered batches is a bit more complex - to respect the ordering of the
    // batch, we can only send:
    // A) a single targeted batch to one shard endpoint
    // B) multiple targeted batches, but only containing targeted writes for a single write op
    //
    // This means that any multi-shard write operation must be targeted and sent one-by-one.
    // Subsequent single-shard write operations can be batched together if they go to the same
    // place.
    //
    // Ex: ShardA : { skey : a->k }, ShardB : { skey : k->z }
    //
    // Ordered insert batch of: [{ skey : a }, { skey : b }, { skey : x }]
    // broken into:
    //  [{ skey : a }, { skey : b }],
    //  [{ skey : x }]
    //
    // Ordered update Batch of :
    //  [{ skey : a }{ $push },
    //   { skey : b }{ $push },
    //   { skey : [c, x] }{ $push },
    //   { skey : y }{ $push },
    //   { skey : z }{ $push }]
    // broken into:
    //  [{ skey : a }, { skey : b }],
    //  [{ skey : [c,x] }],
    //  [{ skey : y }, { skey : z }]
    //

    const bool ordered = _clientRequest->getOrdered();

    TargetedBatchMap batchMap;
    TargetedBatchSizeMap batchSizes;

    int numTargetErrors = 0;

    size_t numWriteOps = _clientRequest->sizeWriteOps();
    for (size_t i = 0; i < numWriteOps; ++i) {
        WriteOp& writeOp = _writeOps[i];

        // Only target _Ready ops
        if (writeOp.getWriteState() != WriteOpState_Ready)
            continue;

        //
        // Get TargetedWrites from the targeter for the write operation
        //

        // TargetedWrites need to be owned once returned
        OwnedPointerVector<TargetedWrite> writesOwned;
        vector<TargetedWrite*>& writes = writesOwned.mutableVector();

        Status targetStatus = writeOp.targetWrites(txn, targeter, &writes);

        if (!targetStatus.isOK()) {
            WriteErrorDetail targetError;
            buildTargetError(targetStatus, &targetError);

            if (!recordTargetErrors) {
                // Cancel current batch state with an error

                cancelBatches(targetError, _writeOps, &batchMap);
                dassert(batchMap.empty());
                return targetStatus;
            } else if (!ordered || batchMap.empty()) {
                // Record an error for this batch

                writeOp.setOpError(targetError);
                ++numTargetErrors;

                if (ordered)
                    return Status::OK();

                continue;
            } else {
                dassert(ordered && !batchMap.empty());

                // Send out what we have, but don't record an error yet, since there may be an
                // error in the writes before this point.

                writeOp.cancelWrites(&targetError);
                break;
            }
        }

        //
        // If ordered and we have a previous endpoint, make sure we don't need to send these
        // targeted writes to any other endpoints.
        //

        if (ordered && !batchMap.empty()) {
            dassert(batchMap.size() == 1u);
            if (isNewBatchRequired(writes, batchMap)) {
                writeOp.cancelWrites(NULL);
                break;
            }
        }

        //
        // If this write will push us over some sort of size limit, stop targeting
        //

        int writeSizeBytes = getWriteSizeBytes(writeOp);
        if (wouldMakeBatchesTooBig(writes, writeSizeBytes, batchSizes)) {
            invariant(!batchMap.empty());
            writeOp.cancelWrites(NULL);
            break;
        }

        //
        // Targeting went ok, add to appropriate TargetedBatch
        //

        for (vector<TargetedWrite*>::iterator it = writes.begin(); it != writes.end(); ++it) {
            TargetedWrite* write = *it;

            TargetedBatchMap::iterator batchIt = batchMap.find(&write->endpoint);
            TargetedBatchSizeMap::iterator batchSizeIt = batchSizes.find(&write->endpoint);

            if (batchIt == batchMap.end()) {
                TargetedWriteBatch* newBatch = new TargetedWriteBatch(write->endpoint);
                batchIt = batchMap.insert(make_pair(&newBatch->getEndpoint(), newBatch)).first;
                batchSizeIt =
                    batchSizes.insert(make_pair(&newBatch->getEndpoint(), BatchSize())).first;
            }

            TargetedWriteBatch* batch = batchIt->second;
            BatchSize& batchSize = batchSizeIt->second;

            ++batchSize.numOps;
            batchSize.sizeBytes += writeSizeBytes;
            batch->addWrite(write);
        }

        // Relinquish ownership of TargetedWrites, now the TargetedBatches own them
        writesOwned.mutableVector().clear();

        //
        // Break if we're ordered and we have more than one endpoint - later writes cannot be
        // enforced as ordered across multiple shard endpoints.
        //

        if (ordered && batchMap.size() > 1u)
            break;
    }

    //
    // Send back our targeted batches
    //

    for (TargetedBatchMap::iterator it = batchMap.begin(); it != batchMap.end(); ++it) {
        TargetedWriteBatch* batch = it->second;

        if (batch->getWrites().empty())
            continue;

        // Remember targeted batch for reporting
        _targeted.insert(batch);
        // Send the handle back to caller
        targetedBatches->push_back(batch);
    }

    return Status::OK();
}
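
For ordered batches, Example #8 stops growing the current batch as soon as a write op would have to go to an additional shard endpoint; that decision is made by isNewBatchRequired(), which is not shown here. A sketch of the check, assuming TargetedBatchMap is keyed by endpoint pointers as in the insert calls above:

static bool isNewBatchRequired(const vector<TargetedWrite*>& writes,
                               const TargetedBatchMap& batchMap) {
    for (vector<TargetedWrite*>::const_iterator it = writes.begin(); it != writes.end(); ++it) {
        TargetedWrite* write = *it;

        // Any targeted write aimed at an endpoint we are not already batching for means
        // the caller must cancel this write op and start a new round of batches
        if (batchMap.find(&write->endpoint) == batchMap.end()) {
            return true;
        }
    }

    return false;
}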
Example #9
    void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                       BatchedCommandResponse* clientResponse ) {

        BatchWriteOp batchOp;
        batchOp.initClientRequest( &clientRequest );

        // Current batch status
        bool refreshedTargeter = false;
        int rounds = 0;
        int numCompletedOps = 0;
        int numRoundsWithoutProgress = 0;

        while ( !batchOp.isFinished() ) {

            //
            // Get child batches to send using the targeter
            //
            // Targeting errors can be caused by remote metadata changing (the collection could have
            // been dropped and recreated, for example with a new shard key).  If a remote metadata
            // change occurs *before* a client sends us a batch, we need to make sure that we don't
            // error out just because we're staler than the client - otherwise mongos will have
            // unpredictable behavior.
            //
            // (If a metadata change happens *during* or *after* a client sends us a batch, however,
            // we make no guarantees about delivery.)
            //
            // For this reason, we don't record targeting errors until we've refreshed our targeting
            // metadata at least once *after* receiving the client batch - at that point, we know:
            //
            // 1) our new metadata is the same as the metadata when the client sent a batch, and so
            //    targeting errors are real.
            // OR
            // 2) our new metadata is a newer version than when the client sent a batch, and so
            //    the metadata must have changed after the client batch was sent.  We don't need to
            //    deliver in this case, since for all the client knows we may have gotten the batch
            //    exactly when the metadata changed.
            //

            vector<TargetedWriteBatch*> childBatches;

            // If we've already had a targeting error, we've refreshed the metadata once and can
            // record target errors definitively.
            bool recordTargetErrors = refreshedTargeter;
            Status targetStatus = batchOp.targetBatch( *_targeter,
                                                       recordTargetErrors,
                                                       &childBatches );
            if ( !targetStatus.isOK() ) {
                // Don't do anything until a targeter refresh
                _targeter->noteCouldNotTarget();
                refreshedTargeter = true;
                ++_stats->numTargetErrors;
                dassert( childBatches.size() == 0u );
            }

            //
            // Send all child batches
            //

            size_t numSent = 0;
            size_t numToSend = childBatches.size();
            bool remoteMetadataChanging = false;
            while ( numSent != numToSend ) {

                // Collect batches out on the network, mapped by endpoint
                HostBatchMap pendingBatches;

                //
                // Send side
                //

                // Get as many batches as we can at once
                for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                    it != childBatches.end(); ++it ) {

                    //
                    // Collect the info needed to dispatch our targeted batch
                    //

                    TargetedWriteBatch* nextBatch = *it;
                    // If the batch is NULL, we sent it previously, so skip
                    if ( nextBatch == NULL ) continue;

                    // Figure out what host we need to dispatch our targeted batch
                    ConnectionString shardHost;
                    Status resolveStatus = _resolver->chooseWriteHost( nextBatch->getEndpoint()
                                                                           .shardName,
                                                                       &shardHost );
                    if ( !resolveStatus.isOK() ) {

                        ++_stats->numResolveErrors;

                        // Record a resolve failure
                        // TODO: It may be necessary to refresh the cache if stale, or maybe just
                        // cancel and retarget the batch
                        WriteErrorDetail error;
                        buildErrorFrom( resolveStatus, &error );
                        batchOp.noteBatchError( *nextBatch, error );

                        // We're done with this batch
                        *it = NULL;
                        --numToSend;
                        continue;
                    }

                    // If we already have a batch for this host, wait until the next time
                    HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost );
                    if ( pendingIt != pendingBatches.end() ) continue;

                    //
                    // We now have all the info needed to dispatch the batch
                    //

                    BatchedCommandRequest request( clientRequest.getBatchType() );
                    batchOp.buildBatchRequest( *nextBatch, &request );

                    // Internally we use full namespaces for request/response, but we send the
                    // command to a database with the collection name in the request.
                    NamespaceString nss( request.getNS() );
                    request.setNS( nss.coll() );

                    _dispatcher->addCommand( shardHost, nss.db(), request );

                    // Indicate we're done by setting the batch to NULL
                    // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                    // endpoints for the same host, so this should be pretty efficient without
                    // moving stuff around.
                    *it = NULL;

                    // Recv-side is responsible for cleaning up the nextBatch when used
                    pendingBatches.insert( make_pair( shardHost, nextBatch ) );
                }

                // Send them all out
                _dispatcher->sendAll();
                numSent += pendingBatches.size();

                //
                // Recv side
                //

                while ( _dispatcher->numPending() > 0 ) {

                    // Get the response
                    ConnectionString shardHost;
                    BatchedCommandResponse response;
                    Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response );

                    // Get the TargetedWriteBatch to find where to put the response
                    dassert( pendingBatches.find( shardHost ) != pendingBatches.end() );
                    TargetedWriteBatch* batchRaw = pendingBatches.find( shardHost )->second;
                    scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                    if ( dispatchStatus.isOK() ) {

                        TrackedErrors trackedErrors;
                        trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                        // Dispatch was ok, note response
                        batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                        // Note if anything was stale
                        const vector<ShardError*>& staleErrors =
                            trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                        if ( staleErrors.size() > 0 ) {
                            noteStaleResponses( staleErrors, _targeter );
                            ++_stats->numStaleBatches;
                        }

                        // Remember if the shard is actively changing metadata right now
                        if ( isShardMetadataChanging( staleErrors ) ) {
                            remoteMetadataChanging = true;
                        }

                        // Remember that we successfully wrote to this shard
                        // NOTE: This will record lastOps for shards where we actually didn't update
                        // or delete any documents, which preserves old behavior but is conservative
                        _stats->noteWriteAt( shardHost,
                                             response.isLastOpSet() ? response.getLastOp() : OpTime(),
                                             response.isElectionIdSet() ? response.getElectionId() : OID() );
                    }
                    else {

                        // Error occurred dispatching, note it
                        WriteErrorDetail error;
                        buildErrorFrom( dispatchStatus, &error );
                        batchOp.noteBatchError( *batch, error );
                    }
                }
            }

            ++rounds;
            ++_stats->numRounds;

            // If we're done, get out
            if ( batchOp.isFinished() )
                break;

            // MORE WORK TO DO

            //
            // Refresh the targeter if we need to (no-op if nothing stale)
            //

            bool targeterChanged = false;
            Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged );

            if ( !refreshStatus.isOK() ) {

                // It's okay if we can't refresh, we'll just record errors for the ops if
                // needed.
                warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                          << endl;
            }

            //
            // Ensure progress is being made toward completing the batch op
            //

            int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed );
            if ( currCompletedOps == numCompletedOps && !targeterChanged
                 && !remoteMetadataChanging ) {
                ++numRoundsWithoutProgress;
            }
            else {
                numRoundsWithoutProgress = 0;
            }
            numCompletedOps = currCompletedOps;

            if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) {

                stringstream msg;
                msg << "no progress was made executing batch write op in " << clientRequest.getNS()
                    << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                    << " ops completed in " << rounds << " rounds total)";

                WriteErrorDetail error;
                buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error );
                batchOp.setBatchError( error );
                break;
            }
        }

        batchOp.buildClientResponse( clientResponse );
    }
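
Stale responses from a shard are fed back to the targeter through noteStaleResponses() so that the next refreshIfNeeded() call actually has something to refresh. The helper is not listed; a hedged sketch, assuming ShardError keeps the originating endpoint and the error detail (member names here are assumptions, not taken from this listing):

static void noteStaleResponses(const vector<ShardError*>& staleErrors, NSTargeter* targeter) {
    for (vector<ShardError*>::const_iterator it = staleErrors.begin(); it != staleErrors.end();
         ++it) {
        const ShardError* error = *it;

        // Pass the stale-version info back so the targeter knows its metadata is out of date
        // ('endpoint' and 'error' are assumed member names)
        targeter->noteStaleResponse(error->endpoint,
                                    error->error.isErrInfoSet() ? error->error.getErrInfo()
                                                                : BSONObj());
    }
}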
Example #10
    void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                       BatchedCommandResponse* clientResponse ) {

        BatchWriteOp batchOp;
        batchOp.initClientRequest( &clientRequest );

        int numTargetErrors = 0;
        int numStaleBatches = 0;

        for ( int rounds = 0; !batchOp.isFinished(); rounds++ ) {

            //
            // Refresh the targeter if we need to (no-op if nothing stale)
            //

            Status refreshStatus = _targeter->refreshIfNeeded();

            if ( !refreshStatus.isOK() ) {

                // It's okay if we can't refresh, we'll just record errors for the ops if
                // needed.
                warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                          << endl;
            }

            //
            // Get child batches to send
            //

            vector<TargetedWriteBatch*> childBatches;

            //
            // Targeting errors can be caused by remote metadata changing (the collection could have
            // been dropped and recreated, for example with a new shard key).  If a remote metadata
            // change occurs *before* a client sends us a batch, we need to make sure that we don't
            // error out just because we're staler than the client - otherwise mongos will have
            // unpredictable behavior.
            //
            // (If a metadata change happens *during* or *after* a client sends us a batch, however,
            // we make no guarantees about delivery.)
            //
            // For this reason, we don't record targeting errors until we've refreshed our targeting
            // metadata at least once *after* receiving the client batch - at that point, we know:
            //
            // 1) our new metadata is the same as the metadata when the client sent a batch, and so
            //    targeting errors are real.
            // OR
            // 2) our new metadata is a newer version than when the client sent a batch, and so
            //    the metadata must have changed after the client batch was sent.  We don't need to
            //    deliver in this case, since for all the client knows we may have gotten the batch
            //    exactly when the metadata changed.
            //
            // If we've had a targeting error or stale error, we've refreshed the metadata once and
            // can record target errors.
            bool recordTargetErrors = numTargetErrors > 0 || numStaleBatches > 0;

            Status targetStatus = batchOp.targetBatch( *_targeter,
                                                       recordTargetErrors,
                                                       &childBatches );
            if ( !targetStatus.isOK() ) {
                _targeter->noteCouldNotTarget();
                ++numTargetErrors;
                continue;
            }

            //
            // Send all child batches
            //

            size_t numSent = 0;
            while ( numSent != childBatches.size() ) {

                // Collect batches out on the network, mapped by endpoint
                EndpointBatchMap pendingBatches;

                //
                // Send side
                //

                // Get as many batches as we can at once
                for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                    it != childBatches.end(); ++it ) {

                    TargetedWriteBatch* nextBatch = *it;
                    // If the batch is NULL, we sent it previously, so skip
                    if ( nextBatch == NULL ) continue;
                    const ConnectionString& hostEndpoint = nextBatch->getEndpoint().shardHost;

                    EndpointBatchMap::iterator pendingIt = pendingBatches.find( &hostEndpoint );

                    // If we already have a batch for this endpoint, continue
                    if ( pendingIt != pendingBatches.end() ) continue;

                    // Otherwise send it out to the endpoint via a command to a database

                    BatchedCommandRequest request( clientRequest.getBatchType() );
                    batchOp.buildBatchRequest( *nextBatch, &request );

                    // Internally we use full namespaces for request/response, but we send the
                    // command to a database with the collection name in the request.
                    NamespaceString nss( request.getNS() );
                    request.setNS( nss.coll() );

                    _dispatcher->addCommand( hostEndpoint, nss.db(), request );

                    // Indicate we're done by setting the batch to NULL
                    // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                    // endpoints for the same host, so this should be pretty efficient without
                    // moving stuff around.
                    *it = NULL;

                    // Recv-side is responsible for cleaning up the nextBatch when used
                    pendingBatches.insert( make_pair( &hostEndpoint, nextBatch ) );
                }

                // Send them all out
                _dispatcher->sendAll();
                numSent += pendingBatches.size();

                //
                // Recv side
                //

                while ( _dispatcher->numPending() > 0 ) {

                    // Get the response
                    ConnectionString endpoint;
                    BatchedCommandResponse response;
                    Status dispatchStatus = _dispatcher->recvAny( &endpoint, &response );

                    // Get the TargetedWriteBatch to find where to put the response
                    TargetedWriteBatch* batchRaw = pendingBatches.find( &endpoint )->second;
                    scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                    if ( dispatchStatus.isOK() ) {

                        TrackedErrors trackedErrors;
                        trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                        // Dispatch was ok, note response
                        batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                        // Note if anything was stale
                        const vector<ShardError*>& staleErrors =
                            trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                        if ( staleErrors.size() > 0 ) {
                            noteStaleResponses( staleErrors, _targeter );
                            ++numStaleBatches;
                        }
                    }
                    else {

                        // Error occurred dispatching, note it
                        BatchedErrorDetail error;
                        buildErrorFrom( dispatchStatus, &error );
                        batchOp.noteBatchError( *batch, error );
                    }
                }
            }
        }

        batchOp.buildClientResponse( clientResponse );
    }
Example #11
    void BatchWriteOp::noteBatchResponse( const TargetedWriteBatch& targetedBatch,
                                          const BatchedCommandResponse& response,
                                          TrackedErrors* trackedErrors ) {

        //
        // Organize errors based on error code.
        // We may have *either* a batch error or errors per-item.
        // (Write Concern errors are stored and handled later.)
        //

        vector<BatchedErrorDetail*> itemErrors;
        scoped_ptr<BatchedErrorDetail> batchError;

        if ( !response.getOk() ) {

            int errCode = response.getErrCode();
            bool isWCError = isWCErrCode( errCode );

            // Special handling for write concern errors, save for later
            if ( isWCError ) {
                BatchedErrorDetail error;
                cloneBatchErrorTo( response, &error );
                ShardError* wcError = new ShardError( targetedBatch.getEndpoint(), error );
                _wcErrors.mutableVector().push_back( wcError );
            }

            // Handle batch and per-item errors
            if ( response.isErrDetailsSet() ) {

                // Per-item errors were set
                itemErrors.insert( itemErrors.begin(),
                                   response.getErrDetails().begin(),
                                   response.getErrDetails().end() );

                // Sort per-item errors by index
                std::sort( itemErrors.begin(), itemErrors.end(), BatchedErrorDetailComp() );
            }
            else if ( !isWCError ) {

                // Per-item errors were not set and this error is not a WC error
                // => this is a full-batch error
                batchError.reset( new BatchedErrorDetail );
                cloneBatchErrorTo( response, batchError.get() );
            }
        }

        // We can't have both a batch error and per-item errors
        dassert( !( batchError && !itemErrors.empty() ) );

        //
        // Go through all pending responses of the op and the sorted remote responses, and populate
        // errors. This will either set all errors to the batch error or apply per-item errors as
        // needed.
        //

        vector<BatchedErrorDetail*>::iterator itemErrorIt = itemErrors.begin();
        int index = 0;
        for ( vector<TargetedWrite*>::const_iterator it = targetedBatch.getWrites().begin();
            it != targetedBatch.getWrites().end(); ++it, ++index ) {

            const TargetedWrite* write = *it;
            WriteOp& writeOp = _writeOps[write->writeOpRef.first];

            dassert( writeOp.getWriteState() == WriteOpState_Pending );

            // See if we have an error for the write
            BatchedErrorDetail* writeError = NULL;

            if ( batchError ) {
                // Default to batch error, if it exists
                writeError = batchError.get();
            }
            else if ( itemErrorIt != itemErrors.end() && ( *itemErrorIt )->getIndex() == index ) {
                // We have a per-item error for this write op's index
                writeError = *itemErrorIt;
                ++itemErrorIt;
            }

            // Finish the response (with error, if needed)
            if ( NULL == writeError ) {
                writeOp.noteWriteComplete( *write );
            }
            else {
                writeOp.noteWriteError( *write, *writeError );
            }
        }

        // Track errors we care about, whether batch or individual errors
        if ( NULL != trackedErrors ) {
            trackErrors( targetedBatch.getEndpoint(), batchError.get(), itemErrors, trackedErrors );
        }

        // Stop tracking targeted batch
        _targeted.erase( &targetedBatch );
    }
Example #12
    Status BatchWriteOp::targetBatch( const NSTargeter& targeter,
                                      bool recordTargetErrors,
                                      vector<TargetedWriteBatch*>* targetedBatches ) {

        TargetedBatchMap batchMap;

        size_t numWriteOps = _clientRequest->sizeWriteOps();
        for ( size_t i = 0; i < numWriteOps; ++i ) {

            // Only do one-at-a-time ops if COE is false
            if ( !_clientRequest->getContinueOnError() && !batchMap.empty() ) break;

            WriteOp& writeOp = _writeOps[i];

            // Only target _Ready ops
            if ( writeOp.getWriteState() != WriteOpState_Ready ) continue;

            //
            // Get TargetedWrites from the targeter for the write operation
            //

            // TargetedWrites need to be owned once returned
            OwnedPointerVector<TargetedWrite> writesOwned;
            vector<TargetedWrite*>& writes = writesOwned.mutableVector();

            Status targetStatus = writeOp.targetWrites( targeter, &writes );

            if ( !targetStatus.isOK() ) {

                //
                // We're not sure how to target here, so either record the error or cancel the
                // current batches.
                //

                BatchedErrorDetail targetError;
                buildTargetError( targetStatus, &targetError );

                if ( recordTargetErrors ) {
                    writeOp.setOpError( targetError );
                    continue;
                }
                else {
                    // Cancel current batch state with an error
                    cancelBatches( targetError, _writeOps, &batchMap );
                    dassert( batchMap.empty() );
                    return targetStatus;
                }
            }

            //
            // Targeting went ok, add to appropriate TargetedBatch
            //

            for ( vector<TargetedWrite*>::iterator it = writes.begin(); it != writes.end(); ++it ) {

                TargetedWrite* write = *it;

                TargetedBatchMap::iterator seenIt = batchMap.find( &write->endpoint );
                if ( seenIt == batchMap.end() ) {
                    TargetedWriteBatch* newBatch = new TargetedWriteBatch( write->endpoint );
                    seenIt = batchMap.insert( make_pair( &newBatch->getEndpoint(), //
                                                         newBatch ) ).first;
                }

                TargetedWriteBatch* batch = seenIt->second;
                batch->addWrite( write );
            }

            // Relinquish ownership of TargetedWrites, now the TargetedBatches own them
            writesOwned.mutableVector().clear();
        }

        //
        // Send back our targeted batches
        //

        for ( TargetedBatchMap::iterator it = batchMap.begin(); it != batchMap.end(); ++it ) {

            TargetedWriteBatch* batch = it->second;

            // Remember targeted batch for reporting
            _targeted.insert( batch );
            // Send the handle back to caller
            targetedBatches->push_back( batch );
        }

        return Status::OK();
    }