void BatchWriteOp::buildBatchRequest(const TargetedWriteBatch& targetedBatch,
                                     BatchedCommandRequest* request) const {
    request->setNS(_clientRequest->getNS());
    request->setShouldBypassValidation(_clientRequest->shouldBypassValidation());

    const vector<TargetedWrite*>& targetedWrites = targetedBatch.getWrites();

    for (vector<TargetedWrite*>::const_iterator it = targetedWrites.begin();
         it != targetedWrites.end();
         ++it) {
        const WriteOpRef& writeOpRef = (*it)->writeOpRef;
        BatchedCommandRequest::BatchType batchType = _clientRequest->getBatchType();

        // NOTE: We copy the batch items themselves here from the client request
        // TODO: This could be inefficient, maybe we want to just reference in the future
        if (batchType == BatchedCommandRequest::BatchType_Insert) {
            BatchedInsertRequest* clientInsertRequest = _clientRequest->getInsertRequest();
            BSONObj insertDoc = clientInsertRequest->getDocumentsAt(writeOpRef.first);
            request->getInsertRequest()->addToDocuments(insertDoc);
        } else if (batchType == BatchedCommandRequest::BatchType_Update) {
            BatchedUpdateRequest* clientUpdateRequest = _clientRequest->getUpdateRequest();
            BatchedUpdateDocument* updateDoc = new BatchedUpdateDocument;
            clientUpdateRequest->getUpdatesAt(writeOpRef.first)->cloneTo(updateDoc);
            request->getUpdateRequest()->addToUpdates(updateDoc);
        } else {
            dassert(batchType == BatchedCommandRequest::BatchType_Delete);
            BatchedDeleteRequest* clientDeleteRequest = _clientRequest->getDeleteRequest();
            BatchedDeleteDocument* deleteDoc = new BatchedDeleteDocument;
            clientDeleteRequest->getDeletesAt(writeOpRef.first)->cloneTo(deleteDoc);
            request->getDeleteRequest()->addToDeletes(deleteDoc);
        }

        // TODO: We can add logic here to allow aborting individual ops
        // if ( NULL == response ) {
        //    ->responses.erase( it++ );
        //    continue;
        // }
    }

    if (_clientRequest->isWriteConcernSet()) {
        if (_clientRequest->isVerboseWC()) {
            request->setWriteConcern(_clientRequest->getWriteConcern());
        } else {
            // Mongos needs to send to the shard with w > 0 so it will be able to
            // see the writeErrors.
            request->setWriteConcern(upgradeWriteConcern(_clientRequest->getWriteConcern()));
        }
    }

    if (!request->isOrderedSet()) {
        request->setOrdered(_clientRequest->getOrdered());
    }

    unique_ptr<BatchedRequestMetadata> requestMetadata(new BatchedRequestMetadata());
    requestMetadata->setShardVersion(
        ChunkVersionAndOpTime(targetedBatch.getEndpoint().shardVersion));
    requestMetadata->setSession(0);

    request->setMetadata(requestMetadata.release());
}
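As a rough illustration of what buildBatchRequest does above — copying into the per-shard request only the client batch items whose indices appear in the targeted batch — here is a minimal, self-contained sketch. The types (plain strings for documents, a vector of indices for the targeted writes) are hypothetical stand-ins for BSONObj and WriteOpRef, not the real classes.

// Hypothetical, simplified sketch of the per-shard copy done by buildBatchRequest:
// a targeted batch holds indices into the client batch, and the per-shard request
// receives copies of just those items.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Stand-in for the client's full insert batch (BSONObj documents in the real code).
    std::vector<std::string> clientDocs = {"{a:1}", "{b:2}", "{x:3}"};

    // Stand-in for each TargetedWrite's writeOpRef.first: indices targeted at one shard.
    std::vector<std::size_t> targetedIndices = {0, 2};

    // Build the per-shard request by copying only the referenced client items.
    std::vector<std::string> shardRequestDocs;
    for (std::size_t idx : targetedIndices) {
        shardRequestDocs.push_back(clientDocs[idx]);
    }

    for (const std::string& doc : shardRequestDocs) {
        std::cout << doc << "\n";  // prints {a:1} then {x:3}
    }
    return 0;
}

The real code clones BatchedUpdateDocument/BatchedDeleteDocument objects instead of copying strings, but the index indirection through writeOpRef.first is the same.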
// Helper function to cancel all the write ops of targeted batches in a map
static void cancelBatches(const WriteErrorDetail& why,
                          WriteOp* writeOps,
                          TargetedBatchMap* batchMap) {
    set<WriteOp*> targetedWriteOps;

    // Collect all the writeOps that are currently targeted
    for (TargetedBatchMap::iterator it = batchMap->begin(); it != batchMap->end();) {
        TargetedWriteBatch* batch = it->second;
        const vector<TargetedWrite*>& writes = batch->getWrites();

        for (vector<TargetedWrite*>::const_iterator writeIt = writes.begin();
             writeIt != writes.end();
             ++writeIt) {
            TargetedWrite* write = *writeIt;

            // NOTE: We may repeatedly cancel a write op here, but that's fast and we want to
            // cancel before erasing the TargetedWrite* (which owns the cancelled targeting
            // info) for reporting reasons.
            writeOps[write->writeOpRef.first].cancelWrites(&why);
        }

        // Note that we need to *erase* first, *then* delete, since the map keys are ptrs from
        // the values
        batchMap->erase(it++);
        delete batch;
    }

    batchMap->clear();
}
// Helper function to cancel all the write ops of targeted batches in a map
static void cancelBatches( const BatchedErrorDetail& why,
                           WriteOp* writeOps,
                           TargetedBatchMap* batchMap ) {

    set<WriteOp*> targetedWriteOps;

    // Collect all the writeOps that are currently targeted
    for ( TargetedBatchMap::iterator it = batchMap->begin(); it != batchMap->end(); ) {

        TargetedWriteBatch* batch = it->second;
        const vector<TargetedWrite*>& writes = batch->getWrites();

        for ( vector<TargetedWrite*>::const_iterator writeIt = writes.begin();
              writeIt != writes.end(); ++writeIt ) {

            TargetedWrite* write = *writeIt;
            targetedWriteOps.insert( &writeOps[write->writeOpRef.first] );
        }

        // Note that we need to *erase* first, *then* delete, since the map keys are ptrs from
        // the values
        batchMap->erase( it++ );
        delete batch;
    }
    batchMap->clear();

    // Cancel all the write ops we found above
    for ( set<WriteOp*>::iterator it = targetedWriteOps.begin();
          it != targetedWriteOps.end(); ++it ) {

        WriteOp* writeOp = *it;
        writeOp->cancelWrites( &why );
    }
}
void BatchWriteOp::buildBatchRequest( const TargetedWriteBatch& targetedBatch,
                                      BatchedCommandRequest* request ) const {

    request->setNS( _clientRequest->getNS() );
    request->setShardVersion( targetedBatch.getEndpoint().shardVersion );

    const vector<TargetedWrite*>& targetedWrites = targetedBatch.getWrites();

    for ( vector<TargetedWrite*>::const_iterator it = targetedWrites.begin();
          it != targetedWrites.end(); ++it ) {

        const WriteOpRef& writeOpRef = ( *it )->writeOpRef;
        BatchedCommandRequest::BatchType batchType = _clientRequest->getBatchType();

        // NOTE: We copy the batch items themselves here from the client request
        // TODO: This could be inefficient, maybe we want to just reference in the future

        if ( batchType == BatchedCommandRequest::BatchType_Insert ) {
            BatchedInsertRequest* clientInsertRequest = _clientRequest->getInsertRequest();
            BSONObj insertDoc = clientInsertRequest->getDocumentsAt( writeOpRef.first );
            request->getInsertRequest()->addToDocuments( insertDoc );
        }
        else if ( batchType == BatchedCommandRequest::BatchType_Update ) {
            BatchedUpdateRequest* clientUpdateRequest = _clientRequest->getUpdateRequest();
            BatchedUpdateDocument* updateDoc = new BatchedUpdateDocument;
            clientUpdateRequest->getUpdatesAt( writeOpRef.first )->cloneTo( updateDoc );
            request->getUpdateRequest()->addToUpdates( updateDoc );
        }
        else {
            dassert( batchType == BatchedCommandRequest::BatchType_Delete );
            BatchedDeleteRequest* clientDeleteRequest = _clientRequest->getDeleteRequest();
            BatchedDeleteDocument* deleteDoc = new BatchedDeleteDocument;
            clientDeleteRequest->getDeletesAt( writeOpRef.first )->cloneTo( deleteDoc );
            request->getDeleteRequest()->addToDeletes( deleteDoc );
        }

        // TODO: We can add logic here to allow aborting individual ops
        //if ( NULL == response ) {
        //    ->responses.erase( it++ );
        //    continue;
        //}
    }

    if ( _clientRequest->isWriteConcernSet() ) {
        request->setWriteConcern( _clientRequest->getWriteConcern() );
    }
    if ( _clientRequest->isContinueOnErrorSet() ) {
        request->setContinueOnError( _clientRequest->getContinueOnError() );
    }
    request->setSession( 0 );
}
void BatchWriteOp::noteBatchError(const TargetedWriteBatch& targetedBatch,
                                  const WriteErrorDetail& error) {
    // Treat errors to get a batch response as failures of the contained writes
    BatchedCommandResponse emulatedResponse;
    toWriteErrorResponse(
        error, _clientRequest->getOrdered(), targetedBatch.getWrites().size(), &emulatedResponse);

    noteBatchResponse(targetedBatch, emulatedResponse, NULL);
}
// Helper function to cancel all the write ops of a targeted batch.
static void cancelBatch( const TargetedWriteBatch& targetedBatch,
                         WriteOp* writeOps,
                         const WriteErrorDetail& why ) {

    const vector<TargetedWrite*>& writes = targetedBatch.getWrites();

    for ( vector<TargetedWrite*>::const_iterator writeIt = writes.begin();
          writeIt != writes.end(); ++writeIt ) {

        TargetedWrite* write = *writeIt;

        // NOTE: We may repeatedly cancel a write op here, but that's fast.
        writeOps[write->writeOpRef.first].cancelWrites( &why );
    }
}
void BatchWriteOp::noteBatchResponse(const TargetedWriteBatch& targetedBatch,
                                     const BatchedCommandResponse& response,
                                     TrackedErrors* trackedErrors) {
    if (!response.getOk()) {
        WriteErrorDetail error;
        cloneCommandErrorTo(response, &error);

        // Treat command errors exactly like other failures of the batch
        // Note that no errors will be tracked from these failures - as-designed
        noteBatchError(targetedBatch, error);
        return;
    }

    dassert(response.getOk());

    // Stop tracking targeted batch
    _targeted.erase(&targetedBatch);

    // Increment stats for this batch
    incBatchStats(_clientRequest->getBatchType(), response, _stats.get());

    //
    // Assign errors to particular items.
    // Write Concern errors are stored and handled later.
    //

    // Special handling for write concern errors, save for later
    if (response.isWriteConcernErrorSet()) {
        unique_ptr<ShardWCError> wcError(
            new ShardWCError(targetedBatch.getEndpoint(), *response.getWriteConcernError()));
        _wcErrors.mutableVector().push_back(wcError.release());
    }

    vector<WriteErrorDetail*> itemErrors;

    // Handle batch and per-item errors
    if (response.isErrDetailsSet()) {
        // Per-item errors were set
        itemErrors.insert(
            itemErrors.begin(), response.getErrDetails().begin(), response.getErrDetails().end());

        // Sort per-item errors by index
        std::sort(itemErrors.begin(), itemErrors.end(), WriteErrorDetailComp());
    }

    //
    // Go through all pending responses of the op and sorted remote responses, populate errors
    // This will either set all errors to the batch error or apply per-item errors as-needed
    //
    // If the batch is ordered, cancel all writes after the first error for retargeting.
    //

    bool ordered = _clientRequest->getOrdered();

    vector<WriteErrorDetail*>::iterator itemErrorIt = itemErrors.begin();
    int index = 0;
    WriteErrorDetail* lastError = NULL;
    for (vector<TargetedWrite*>::const_iterator it = targetedBatch.getWrites().begin();
         it != targetedBatch.getWrites().end();
         ++it, ++index) {
        const TargetedWrite* write = *it;
        WriteOp& writeOp = _writeOps[write->writeOpRef.first];

        dassert(writeOp.getWriteState() == WriteOpState_Pending);

        // See if we have an error for the write
        WriteErrorDetail* writeError = NULL;

        if (itemErrorIt != itemErrors.end() && (*itemErrorIt)->getIndex() == index) {
            // We have a per-item error for this write op's index
            writeError = *itemErrorIt;
            ++itemErrorIt;
        }

        // Finish the response (with error, if needed)
        if (NULL == writeError) {
            if (!ordered || !lastError) {
                writeOp.noteWriteComplete(*write);
            } else {
                // We didn't actually apply this write - cancel so we can retarget
                dassert(writeOp.getNumTargeted() == 1u);
                writeOp.cancelWrites(lastError);
            }
        } else {
            writeOp.noteWriteError(*write, *writeError);
            lastError = writeError;
        }
    }

    // Track errors we care about, whether batch or individual errors
    if (NULL != trackedErrors) {
        trackErrors(targetedBatch.getEndpoint(), itemErrors, trackedErrors);
    }

    // Track upserted ids if we need to
    if (response.isUpsertDetailsSet()) {
        const vector<BatchedUpsertDetail*>& upsertedIds = response.getUpsertDetails();

        for (vector<BatchedUpsertDetail*>::const_iterator it = upsertedIds.begin();
             it != upsertedIds.end();
             ++it) {
            // The child upserted details don't have the correct index for the full batch
            const BatchedUpsertDetail* childUpsertedId = *it;

            // Work backward from the child batch item index to the batch item index
            int childBatchIndex = childUpsertedId->getIndex();
            int batchIndex = targetedBatch.getWrites()[childBatchIndex]->writeOpRef.first;

            // Push the upserted id with the correct index into the batch upserted ids
            BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail;
            upsertedId->setIndex(batchIndex);
            upsertedId->setUpsertedID(childUpsertedId->getUpsertedID());
            _upsertedIds.mutableVector().push_back(upsertedId);
        }
    }
}
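To make the error-merging loop above easier to follow, here is a minimal standalone sketch of the same idea: per-item errors arrive keyed by index, get sorted, and a single pass over the targeted writes advances the error iterator whenever the indices line up. The types here (ItemError, plain ints for writes) are hypothetical simplifications, not the real WriteErrorDetail/WriteOp classes.

// Hypothetical, simplified sketch of matching sorted per-item errors to writes.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct ItemError {
    int index;        // position of the failed write inside the child batch
    std::string msg;  // error description
};

int main() {
    const int numWrites = 5;  // stand-in for targetedBatch.getWrites().size()

    // Errors as a shard might report them, possibly out of index order.
    std::vector<ItemError> itemErrors = {{3, "duplicate key"}, {1, "document too large"}};
    std::sort(itemErrors.begin(), itemErrors.end(), [](const ItemError& a, const ItemError& b) {
        return a.index < b.index;
    });

    // Single pass over the writes; advance the error iterator only when its index matches.
    std::vector<ItemError>::const_iterator errIt = itemErrors.begin();
    for (int index = 0; index < numWrites; ++index) {
        if (errIt != itemErrors.end() && errIt->index == index) {
            std::cout << "write " << index << " failed: " << errIt->msg << "\n";
            ++errIt;
        } else {
            std::cout << "write " << index << " completed\n";
        }
    }
    return 0;
}

Because both the errors and the writes are walked in increasing index order, the merge is a single O(n + e) pass, which is why the real code sorts itemErrors first.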
Status BatchWriteOp::targetBatch(OperationContext* txn,
                                 const NSTargeter& targeter,
                                 bool recordTargetErrors,
                                 vector<TargetedWriteBatch*>* targetedBatches) {
    //
    // Targeting of unordered batches is fairly simple - each remaining write op is targeted,
    // and each of those targeted writes are grouped into a batch for a particular shard
    // endpoint.
    //
    // Targeting of ordered batches is a bit more complex - to respect the ordering of the
    // batch, we can only send:
    // A) a single targeted batch to one shard endpoint
    // B) multiple targeted batches, but only containing targeted writes for a single write op
    //
    // This means that any multi-shard write operation must be targeted and sent one-by-one.
    // Subsequent single-shard write operations can be batched together if they go to the same
    // place.
    //
    // Ex: ShardA : { skey : a->k }, ShardB : { skey : k->z }
    //
    // Ordered insert batch of: [{ skey : a }, { skey : b }, { skey : x }]
    // broken into:
    //  [{ skey : a }, { skey : b }],
    //  [{ skey : x }]
    //
    // Ordered update Batch of :
    //  [{ skey : a }{ $push },
    //   { skey : b }{ $push },
    //   { skey : [c, x] }{ $push },
    //   { skey : y }{ $push },
    //   { skey : z }{ $push }]
    // broken into:
    //  [{ skey : a }, { skey : b }],
    //  [{ skey : [c,x] }],
    //  [{ skey : y }, { skey : z }]
    //

    const bool ordered = _clientRequest->getOrdered();

    TargetedBatchMap batchMap;
    TargetedBatchSizeMap batchSizes;

    int numTargetErrors = 0;

    size_t numWriteOps = _clientRequest->sizeWriteOps();
    for (size_t i = 0; i < numWriteOps; ++i) {
        WriteOp& writeOp = _writeOps[i];

        // Only target _Ready ops
        if (writeOp.getWriteState() != WriteOpState_Ready)
            continue;

        //
        // Get TargetedWrites from the targeter for the write operation
        //

        // TargetedWrites need to be owned once returned
        OwnedPointerVector<TargetedWrite> writesOwned;
        vector<TargetedWrite*>& writes = writesOwned.mutableVector();

        Status targetStatus = writeOp.targetWrites(txn, targeter, &writes);

        if (!targetStatus.isOK()) {
            WriteErrorDetail targetError;
            buildTargetError(targetStatus, &targetError);

            if (!recordTargetErrors) {
                // Cancel current batch state with an error
                cancelBatches(targetError, _writeOps, &batchMap);
                dassert(batchMap.empty());
                return targetStatus;
            } else if (!ordered || batchMap.empty()) {
                // Record an error for this batch
                writeOp.setOpError(targetError);
                ++numTargetErrors;

                if (ordered)
                    return Status::OK();

                continue;
            } else {
                dassert(ordered && !batchMap.empty());

                // Send out what we have, but don't record an error yet, since there may be an
                // error in the writes before this point.
                writeOp.cancelWrites(&targetError);
                break;
            }
        }

        //
        // If ordered and we have a previous endpoint, make sure we don't need to send these
        // targeted writes to any other endpoints.
        //

        if (ordered && !batchMap.empty()) {
            dassert(batchMap.size() == 1u);
            if (isNewBatchRequired(writes, batchMap)) {
                writeOp.cancelWrites(NULL);
                break;
            }
        }

        //
        // If this write will push us over some sort of size limit, stop targeting
        //

        int writeSizeBytes = getWriteSizeBytes(writeOp);
        if (wouldMakeBatchesTooBig(writes, writeSizeBytes, batchSizes)) {
            invariant(!batchMap.empty());
            writeOp.cancelWrites(NULL);
            break;
        }

        //
        // Targeting went ok, add to appropriate TargetedBatch
        //

        for (vector<TargetedWrite*>::iterator it = writes.begin(); it != writes.end(); ++it) {
            TargetedWrite* write = *it;

            TargetedBatchMap::iterator batchIt = batchMap.find(&write->endpoint);
            TargetedBatchSizeMap::iterator batchSizeIt = batchSizes.find(&write->endpoint);

            if (batchIt == batchMap.end()) {
                TargetedWriteBatch* newBatch = new TargetedWriteBatch(write->endpoint);
                batchIt = batchMap.insert(make_pair(&newBatch->getEndpoint(), newBatch)).first;
                batchSizeIt =
                    batchSizes.insert(make_pair(&newBatch->getEndpoint(), BatchSize())).first;
            }

            TargetedWriteBatch* batch = batchIt->second;
            BatchSize& batchSize = batchSizeIt->second;

            ++batchSize.numOps;
            batchSize.sizeBytes += writeSizeBytes;
            batch->addWrite(write);
        }

        // Relinquish ownership of TargetedWrites, now the TargetedBatches own them
        writesOwned.mutableVector().clear();

        //
        // Break if we're ordered and we have more than one endpoint - later writes cannot be
        // enforced as ordered across multiple shard endpoints.
        //

        if (ordered && batchMap.size() > 1u)
            break;
    }

    //
    // Send back our targeted batches
    //

    for (TargetedBatchMap::iterator it = batchMap.begin(); it != batchMap.end(); ++it) {
        TargetedWriteBatch* batch = it->second;

        if (batch->getWrites().empty())
            continue;

        // Remember targeted batch for reporting
        _targeted.insert(batch);

        // Send the handle back to caller
        targetedBatches->push_back(batch);
    }

    return Status::OK();
}
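The ordered-batch splitting described in the comment at the top of targetBatch can be illustrated with a small standalone sketch: walking the writes in client order and starting a new child batch whenever the target shard changes reproduces the [a, b] / [x] grouping from the example. The shard-key-to-shard mapping below is a hypothetical stand-in for the real NSTargeter, and this only shows the consecutive-same-endpoint case, not multi-shard single writes or size limits.

// Hypothetical sketch of splitting an ordered batch by target shard while
// preserving client order: a new child batch starts whenever the endpoint changes.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Stand-in for the targeter in the example: ShardA owns skey a->k, ShardB owns k->z.
static std::string targetShard(char skey) {
    return (skey < 'k') ? "ShardA" : "ShardB";
}

int main() {
    // Ordered insert batch of shard keys: [{ skey : a }, { skey : b }, { skey : x }]
    std::vector<char> orderedBatch = {'a', 'b', 'x'};

    // Each child batch is (shard endpoint, shard keys routed to it), in client order.
    std::vector<std::pair<std::string, std::vector<char> > > childBatches;
    for (char skey : orderedBatch) {
        std::string shard = targetShard(skey);
        // Start a new child batch whenever the target endpoint changes.
        if (childBatches.empty() || childBatches.back().first != shard) {
            childBatches.push_back(std::make_pair(shard, std::vector<char>()));
        }
        childBatches.back().second.push_back(skey);
    }

    // Prints "ShardA: [ a b ]" then "ShardB: [ x ]", matching the split in the comment.
    for (const auto& childBatch : childBatches) {
        std::cout << childBatch.first << ": [";
        for (char skey : childBatch.second) {
            std::cout << " " << skey;
        }
        std::cout << " ]\n";
    }
    return 0;
}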
void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                   BatchedCommandResponse* clientResponse ) {

    BatchWriteOp batchOp;
    batchOp.initClientRequest( &clientRequest );

    // Current batch status
    bool refreshedTargeter = false;
    int rounds = 0;
    int numCompletedOps = 0;
    int numRoundsWithoutProgress = 0;

    while ( !batchOp.isFinished() ) {

        //
        // Get child batches to send using the targeter
        //
        // Targeting errors can be caused by remote metadata changing (the collection could have
        // been dropped and recreated, for example with a new shard key). If a remote metadata
        // change occurs *before* a client sends us a batch, we need to make sure that we don't
        // error out just because we're staler than the client - otherwise mongos will have
        // unpredictable behavior.
        //
        // (If a metadata change happens *during* or *after* a client sends us a batch, however,
        // we make no guarantees about delivery.)
        //
        // For this reason, we don't record targeting errors until we've refreshed our targeting
        // metadata at least once *after* receiving the client batch - at that point, we know:
        //
        // 1) our new metadata is the same as the metadata when the client sent a batch, and so
        //    targeting errors are real.
        // OR
        // 2) our new metadata is a newer version than when the client sent a batch, and so
        //    the metadata must have changed after the client batch was sent. We don't need to
        //    deliver in this case, since for all the client knows we may have gotten the batch
        //    exactly when the metadata changed.
        //

        vector<TargetedWriteBatch*> childBatches;

        // If we've already had a targeting error, we've refreshed the metadata once and can
        // record target errors definitively.
        bool recordTargetErrors = refreshedTargeter;
        Status targetStatus = batchOp.targetBatch( *_targeter,
                                                   recordTargetErrors,
                                                   &childBatches );
        if ( !targetStatus.isOK() ) {
            // Don't do anything until a targeter refresh
            _targeter->noteCouldNotTarget();
            refreshedTargeter = true;
            ++_stats->numTargetErrors;
            dassert( childBatches.size() == 0u );
        }

        //
        // Send all child batches
        //

        size_t numSent = 0;
        size_t numToSend = childBatches.size();
        bool remoteMetadataChanging = false;
        while ( numSent != numToSend ) {

            // Collect batches out on the network, mapped by endpoint
            HostBatchMap pendingBatches;

            //
            // Send side
            //

            // Get as many batches as we can at once
            for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                  it != childBatches.end(); ++it ) {

                //
                // Collect the info needed to dispatch our targeted batch
                //

                TargetedWriteBatch* nextBatch = *it;
                // If the batch is NULL, we sent it previously, so skip
                if ( nextBatch == NULL ) continue;

                // Figure out what host we need to dispatch our targeted batch
                ConnectionString shardHost;
                Status resolveStatus =
                    _resolver->chooseWriteHost( nextBatch->getEndpoint().shardName,
                                                &shardHost );
                if ( !resolveStatus.isOK() ) {

                    ++_stats->numResolveErrors;

                    // Record a resolve failure
                    // TODO: It may be necessary to refresh the cache if stale, or maybe just
                    // cancel and retarget the batch
                    WriteErrorDetail error;
                    buildErrorFrom( resolveStatus, &error );
                    batchOp.noteBatchError( *nextBatch, error );

                    // We're done with this batch
                    *it = NULL;
                    --numToSend;
                    continue;
                }

                // If we already have a batch for this host, wait until the next time
                HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost );
                if ( pendingIt != pendingBatches.end() ) continue;

                //
                // We now have all the info needed to dispatch the batch
                //

                BatchedCommandRequest request( clientRequest.getBatchType() );
                batchOp.buildBatchRequest( *nextBatch, &request );

                // Internally we use full namespaces for request/response, but we send the
                // command to a database with the collection name in the request.
                NamespaceString nss( request.getNS() );
                request.setNS( nss.coll() );

                _dispatcher->addCommand( shardHost, nss.db(), request );

                // Indicate we're done by setting the batch to NULL
                // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                // endpoints for the same host, so this should be pretty efficient without
                // moving stuff around.
                *it = NULL;

                // Recv-side is responsible for cleaning up the nextBatch when used
                pendingBatches.insert( make_pair( shardHost, nextBatch ) );
            }

            // Send them all out
            _dispatcher->sendAll();
            numSent += pendingBatches.size();

            //
            // Recv side
            //

            while ( _dispatcher->numPending() > 0 ) {

                // Get the response
                ConnectionString shardHost;
                BatchedCommandResponse response;
                Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response );

                // Get the TargetedWriteBatch to find where to put the response
                dassert( pendingBatches.find( shardHost ) != pendingBatches.end() );
                TargetedWriteBatch* batchRaw = pendingBatches.find( shardHost )->second;
                scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                if ( dispatchStatus.isOK() ) {

                    TrackedErrors trackedErrors;
                    trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                    // Dispatch was ok, note response
                    batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                    // Note if anything was stale
                    const vector<ShardError*>& staleErrors =
                        trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                    if ( staleErrors.size() > 0 ) {
                        noteStaleResponses( staleErrors, _targeter );
                        ++_stats->numStaleBatches;
                    }

                    // Remember if the shard is actively changing metadata right now
                    if ( isShardMetadataChanging( staleErrors ) ) {
                        remoteMetadataChanging = true;
                    }

                    // Remember that we successfully wrote to this shard
                    // NOTE: This will record lastOps for shards where we actually didn't update
                    // or delete any documents, which preserves old behavior but is conservative
                    _stats->noteWriteAt( shardHost,
                                         response.isLastOpSet() ? response.getLastOp() : OpTime(),
                                         response.isElectionIdSet() ? response.getElectionId()
                                                                    : OID() );
                }
                else {

                    // Error occurred dispatching, note it
                    WriteErrorDetail error;
                    buildErrorFrom( dispatchStatus, &error );
                    batchOp.noteBatchError( *batch, error );
                }
            }
        }

        ++rounds;
        ++_stats->numRounds;

        // If we're done, get out
        if ( batchOp.isFinished() ) break;

        // MORE WORK TO DO

        //
        // Refresh the targeter if we need to (no-op if nothing stale)
        //

        bool targeterChanged = false;
        Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged );

        if ( !refreshStatus.isOK() ) {

            // It's okay if we can't refresh, we'll just record errors for the ops if needed.
            warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                      << endl;
        }

        //
        // Ensure progress is being made toward completing the batch op
        //

        int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed );
        if ( currCompletedOps == numCompletedOps && !targeterChanged
             && !remoteMetadataChanging ) {
            ++numRoundsWithoutProgress;
        }
        else {
            numRoundsWithoutProgress = 0;
        }
        numCompletedOps = currCompletedOps;

        if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) {

            stringstream msg;
            msg << "no progress was made executing batch write op in " << clientRequest.getNS()
                << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                << " ops completed in " << rounds << " rounds total)";

            WriteErrorDetail error;
            buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error );
            batchOp.setBatchError( error );
            break;
        }
    }

    batchOp.buildClientResponse( clientResponse );
}
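The no-progress guard at the end of the loop above is a general pattern: track how many ops completed last round, and give up after a fixed number of consecutive rounds in which nothing moved forward. Here is a minimal, self-contained sketch of that pattern with hypothetical names; kMaxRoundsWithoutProgress mirrors the constant used above, but its value here is chosen only for illustration.

// Hypothetical sketch of the rounds-without-progress guard used in executeBatch.
#include <iostream>

static const int kMaxRoundsWithoutProgress = 5;  // illustrative value, not the real constant

// Stand-in for one round of targeting + dispatch; returns total ops completed so far.
static int runOneRound(int completedSoFar) {
    // Simulate a stuck batch: no additional ops ever complete.
    return completedSoFar;
}

int main() {
    int rounds = 0;
    int numCompletedOps = 0;
    int numRoundsWithoutProgress = 0;

    while (true) {
        int currCompletedOps = runOneRound(numCompletedOps);
        ++rounds;

        // Reset the counter whenever something moved forward, otherwise count the stall.
        if (currCompletedOps == numCompletedOps) {
            ++numRoundsWithoutProgress;
        } else {
            numRoundsWithoutProgress = 0;
        }
        numCompletedOps = currCompletedOps;

        if (numRoundsWithoutProgress > kMaxRoundsWithoutProgress) {
            std::cout << "no progress after " << kMaxRoundsWithoutProgress << " rounds ("
                      << numCompletedOps << " ops completed in " << rounds << " rounds total)\n";
            break;  // the real code records a NoProgressMade error and stops
        }
    }
    return 0;
}

In the real loop, a targeter change or actively-changing remote metadata also resets the counter, which this sketch omits.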
void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                   BatchedCommandResponse* clientResponse ) {

    BatchWriteOp batchOp;
    batchOp.initClientRequest( &clientRequest );

    int numTargetErrors = 0;
    int numStaleBatches = 0;

    for ( int rounds = 0; !batchOp.isFinished(); rounds++ ) {

        //
        // Refresh the targeter if we need to (no-op if nothing stale)
        //

        Status refreshStatus = _targeter->refreshIfNeeded();

        if ( !refreshStatus.isOK() ) {

            // It's okay if we can't refresh, we'll just record errors for the ops if needed.
            warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                      << endl;
        }

        //
        // Get child batches to send
        //

        vector<TargetedWriteBatch*> childBatches;

        //
        // Targeting errors can be caused by remote metadata changing (the collection could have
        // been dropped and recreated, for example with a new shard key). If a remote metadata
        // change occurs *before* a client sends us a batch, we need to make sure that we don't
        // error out just because we're staler than the client - otherwise mongos will have
        // unpredictable behavior.
        //
        // (If a metadata change happens *during* or *after* a client sends us a batch, however,
        // we make no guarantees about delivery.)
        //
        // For this reason, we don't record targeting errors until we've refreshed our targeting
        // metadata at least once *after* receiving the client batch - at that point, we know:
        //
        // 1) our new metadata is the same as the metadata when the client sent a batch, and so
        //    targeting errors are real.
        // OR
        // 2) our new metadata is a newer version than when the client sent a batch, and so
        //    the metadata must have changed after the client batch was sent. We don't need to
        //    deliver in this case, since for all the client knows we may have gotten the batch
        //    exactly when the metadata changed.
        //

        // If we've had a targeting error or stale error, we've refreshed the metadata once and
        // can record target errors.
        bool recordTargetErrors = numTargetErrors > 0 || numStaleBatches > 0;

        Status targetStatus = batchOp.targetBatch( *_targeter,
                                                   recordTargetErrors,
                                                   &childBatches );
        if ( !targetStatus.isOK() ) {
            _targeter->noteCouldNotTarget();
            ++numTargetErrors;
            continue;
        }

        //
        // Send all child batches
        //

        size_t numSent = 0;
        while ( numSent != childBatches.size() ) {

            // Collect batches out on the network, mapped by endpoint
            EndpointBatchMap pendingBatches;

            //
            // Send side
            //

            // Get as many batches as we can at once
            for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                  it != childBatches.end(); ++it ) {

                TargetedWriteBatch* nextBatch = *it;
                // If the batch is NULL, we sent it previously, so skip
                if ( nextBatch == NULL ) continue;

                const ConnectionString& hostEndpoint = nextBatch->getEndpoint().shardHost;
                EndpointBatchMap::iterator pendingIt = pendingBatches.find( &hostEndpoint );

                // If we already have a batch for this endpoint, continue
                if ( pendingIt != pendingBatches.end() ) continue;

                // Otherwise send it out to the endpoint via a command to a database

                BatchedCommandRequest request( clientRequest.getBatchType() );
                batchOp.buildBatchRequest( *nextBatch, &request );

                // Internally we use full namespaces for request/response, but we send the
                // command to a database with the collection name in the request.
                NamespaceString nss( request.getNS() );
                request.setNS( nss.coll() );

                _dispatcher->addCommand( hostEndpoint, nss.db(), request );

                // Indicate we're done by setting the batch to NULL
                // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                // endpoints for the same host, so this should be pretty efficient without
                // moving stuff around.
                *it = NULL;

                // Recv-side is responsible for cleaning up the nextBatch when used
                pendingBatches.insert( make_pair( &hostEndpoint, nextBatch ) );
            }

            // Send them all out
            _dispatcher->sendAll();
            numSent += pendingBatches.size();

            //
            // Recv side
            //

            while ( _dispatcher->numPending() > 0 ) {

                // Get the response
                ConnectionString endpoint;
                BatchedCommandResponse response;
                Status dispatchStatus = _dispatcher->recvAny( &endpoint, &response );

                // Get the TargetedWriteBatch to find where to put the response
                TargetedWriteBatch* batchRaw = pendingBatches.find( &endpoint )->second;
                scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                if ( dispatchStatus.isOK() ) {

                    TrackedErrors trackedErrors;
                    trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                    // Dispatch was ok, note response
                    batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                    // Note if anything was stale
                    const vector<ShardError*>& staleErrors =
                        trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                    if ( staleErrors.size() > 0 ) {
                        noteStaleResponses( staleErrors, _targeter );
                        ++numStaleBatches;
                    }
                }
                else {

                    // Error occurred dispatching, note it
                    BatchedErrorDetail error;
                    buildErrorFrom( dispatchStatus, &error );
                    batchOp.noteBatchError( *batch, error );
                }
            }
        }
    }

    batchOp.buildClientResponse( clientResponse );
}
void BatchWriteOp::noteBatchResponse( const TargetedWriteBatch& targetedBatch,
                                      const BatchedCommandResponse& response,
                                      TrackedErrors* trackedErrors ) {

    //
    // Organize errors based on error code.
    // We may have *either* a batch error or errors per-item.
    // (Write Concern errors are stored and handled later.)
    //

    vector<BatchedErrorDetail*> itemErrors;
    scoped_ptr<BatchedErrorDetail> batchError;

    if ( !response.getOk() ) {

        int errCode = response.getErrCode();
        bool isWCError = isWCErrCode( errCode );

        // Special handling for write concern errors, save for later
        if ( isWCError ) {
            BatchedErrorDetail error;
            cloneBatchErrorTo( response, &error );
            ShardError* wcError = new ShardError( targetedBatch.getEndpoint(), error );
            _wcErrors.mutableVector().push_back( wcError );
        }

        // Handle batch and per-item errors
        if ( response.isErrDetailsSet() ) {

            // Per-item errors were set
            itemErrors.insert( itemErrors.begin(),
                               response.getErrDetails().begin(),
                               response.getErrDetails().end() );

            // Sort per-item errors by index
            std::sort( itemErrors.begin(), itemErrors.end(), BatchedErrorDetailComp() );
        }
        else if ( !isWCError ) {

            // Per-item errors were not set and this error is not a WC error
            // => this is a full-batch error
            batchError.reset( new BatchedErrorDetail );
            cloneBatchErrorTo( response, batchError.get() );
        }
    }

    // We can't have both a batch error and per-item errors
    dassert( !( batchError && !itemErrors.empty() ) );

    //
    // Go through all pending responses of the op and sorted remote responses, populate errors
    // This will either set all errors to the batch error or apply per-item errors as-needed
    //

    vector<BatchedErrorDetail*>::iterator itemErrorIt = itemErrors.begin();
    int index = 0;
    for ( vector<TargetedWrite*>::const_iterator it = targetedBatch.getWrites().begin();
          it != targetedBatch.getWrites().end(); ++it, ++index ) {

        const TargetedWrite* write = *it;
        WriteOp& writeOp = _writeOps[write->writeOpRef.first];

        dassert( writeOp.getWriteState() == WriteOpState_Pending );

        // See if we have an error for the write
        BatchedErrorDetail* writeError = NULL;

        if ( batchError ) {
            // Default to batch error, if it exists
            writeError = batchError.get();
        }
        else if ( itemErrorIt != itemErrors.end() && ( *itemErrorIt )->getIndex() == index ) {
            // We have a per-item error for this write op's index
            writeError = *itemErrorIt;
            ++itemErrorIt;
        }

        // Finish the response (with error, if needed)
        if ( NULL == writeError ) {
            writeOp.noteWriteComplete( *write );
        }
        else {
            writeOp.noteWriteError( *write, *writeError );
        }
    }

    // Track errors we care about, whether batch or individual errors
    if ( NULL != trackedErrors ) {
        trackErrors( targetedBatch.getEndpoint(), batchError.get(), itemErrors, trackedErrors );
    }

    // Stop tracking targeted batch
    _targeted.erase( &targetedBatch );
}
Status BatchWriteOp::targetBatch( const NSTargeter& targeter,
                                  bool recordTargetErrors,
                                  vector<TargetedWriteBatch*>* targetedBatches ) {

    TargetedBatchMap batchMap;

    size_t numWriteOps = _clientRequest->sizeWriteOps();
    for ( size_t i = 0; i < numWriteOps; ++i ) {

        // Only do one-at-a-time ops if COE is false
        if ( !_clientRequest->getContinueOnError() && !batchMap.empty() ) break;

        WriteOp& writeOp = _writeOps[i];

        // Only target _Ready ops
        if ( writeOp.getWriteState() != WriteOpState_Ready ) continue;

        //
        // Get TargetedWrites from the targeter for the write operation
        //

        // TargetedWrites need to be owned once returned
        OwnedPointerVector<TargetedWrite> writesOwned;
        vector<TargetedWrite*>& writes = writesOwned.mutableVector();

        Status targetStatus = writeOp.targetWrites( targeter, &writes );

        if ( !targetStatus.isOK() ) {

            //
            // We're not sure how to target here, so either record the error or cancel the
            // current batches.
            //

            BatchedErrorDetail targetError;
            buildTargetError( targetStatus, &targetError );

            if ( recordTargetErrors ) {
                writeOp.setOpError( targetError );
                continue;
            }
            else {
                // Cancel current batch state with an error
                cancelBatches( targetError, _writeOps, &batchMap );
                dassert( batchMap.empty() );
                return targetStatus;
            }
        }

        //
        // Targeting went ok, add to appropriate TargetedBatch
        //

        for ( vector<TargetedWrite*>::iterator it = writes.begin(); it != writes.end(); ++it ) {

            TargetedWrite* write = *it;

            TargetedBatchMap::iterator seenIt = batchMap.find( &write->endpoint );
            if ( seenIt == batchMap.end() ) {
                TargetedWriteBatch* newBatch = new TargetedWriteBatch( write->endpoint );
                seenIt = batchMap.insert( make_pair( &newBatch->getEndpoint(),
                                                     newBatch ) ).first;
            }

            TargetedWriteBatch* batch = seenIt->second;
            batch->addWrite( write );
        }

        // Relinquish ownership of TargetedWrites, now the TargetedBatches own them
        writesOwned.mutableVector().clear();
    }

    //
    // Send back our targeted batches
    //

    for ( TargetedBatchMap::iterator it = batchMap.begin(); it != batchMap.end(); ++it ) {

        TargetedWriteBatch* batch = it->second;

        // Remember targeted batch for reporting
        _targeted.insert( batch );

        // Send the handle back to caller
        targetedBatches->push_back( batch );
    }

    return Status::OK();
}