void WriteBatchExecutor::bulkExecute( const BatchedCommandRequest& request, std::vector<BatchedUpsertDetail*>* upsertedIds, std::vector<WriteErrorDetail*>* errors ) { if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert ) { execInserts( request, errors ); } else if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) { for ( size_t i = 0; i < request.sizeWriteOps(); i++ ) { WriteErrorDetail* error = NULL; BSONObj upsertedId; execUpdate( BatchItemRef( &request, i ), &upsertedId, &error ); if ( !upsertedId.isEmpty() ) { BatchedUpsertDetail* batchUpsertedId = new BatchedUpsertDetail; batchUpsertedId->setIndex( i ); batchUpsertedId->setUpsertedID( upsertedId ); upsertedIds->push_back( batchUpsertedId ); } if ( error ) { errors->push_back( error ); if ( request.getOrdered() ) break; } } } else { dassert( request.getBatchType() == BatchedCommandRequest::BatchType_Delete ); for ( size_t i = 0; i < request.sizeWriteOps(); i++ ) { WriteErrorDetail* error = NULL; execRemove( BatchItemRef( &request, i ), &error ); if ( error ) { errors->push_back( error ); if ( request.getOrdered() ) break; } } } // Fill in stale version errors for unordered batches (update/delete can't do this on own) if ( !errors->empty() && !request.getOrdered() ) { const WriteErrorDetail* finalError = errors->back(); if ( finalError->getErrCode() == ErrorCodes::StaleShardVersion ) { for ( size_t i = finalError->getIndex() + 1; i < request.sizeWriteOps(); i++ ) { WriteErrorDetail* dupStaleError = new WriteErrorDetail; finalError->cloneTo( dupStaleError ); errors->push_back( dupStaleError ); } } } }
bool BatchedCommandRequest::containsNoIDUpsert(const BatchedCommandRequest& request) { if (request.getBatchType() != BatchedCommandRequest::BatchType_Update) return false; const vector<BatchedUpdateDocument*>& updates = request.getUpdateRequest()->getUpdates(); for (vector<BatchedUpdateDocument*>::const_iterator it = updates.begin(); it != updates.end(); ++it) { const BatchedUpdateDocument* updateDoc = *it; if (updateDoc->getUpsert() && updateDoc->getQuery()["_id"].eoo()) return true; } return false; }
void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request, BatchedCommandResponse* response ) { // Validate namespace const NamespaceString nss = NamespaceString( request.getNS() ); if ( !nss.isValid() ) { toBatchError( Status( ErrorCodes::InvalidNamespace, nss.ns() + " is not a valid namespace" ), response ); return; } // Make sure we can write to the namespace Status allowedStatus = userAllowedWriteNS( nss ); if ( !allowedStatus.isOK() ) { toBatchError( allowedStatus, response ); return; } // Validate insert index requests // TODO: Push insert index requests through createIndex once all upgrade paths support it string errMsg; if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) { toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response ); return; } // Validate write concern // TODO: Lift write concern parsing out of this entirely WriteConcernOptions writeConcern; BSONObj wcDoc; if ( request.isWriteConcernSet() ) { wcDoc = request.getWriteConcern(); } Status wcStatus = Status::OK(); if ( wcDoc.isEmpty() ) { // The default write concern if empty is w : 1 // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1 wcStatus = writeConcern.parse( _defaultWriteConcern.isEmpty() ? WriteConcernOptions::Acknowledged : _defaultWriteConcern ); if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) { writeConcern.wNumNodes = 1; } } else { wcStatus = writeConcern.parse( wcDoc ); } if ( wcStatus.isOK() ) { wcStatus = validateWriteConcern( writeConcern ); } if ( !wcStatus.isOK() ) { toBatchError( wcStatus, response ); return; } if ( request.sizeWriteOps() == 0u ) { toBatchError( Status( ErrorCodes::InvalidLength, "no write ops were included in the batch" ), response ); return; } // Validate batch size if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) { toBatchError( Status( ErrorCodes::InvalidLength, stream() << "exceeded maximum write batch size of " << BatchedCommandRequest::kMaxWriteBatchSize ), response ); return; } // // End validation // bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0 && writeConcern.syncMode == WriteConcernOptions::NONE; Timer commandTimer; OwnedPointerVector<WriteErrorDetail> writeErrorsOwned; vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector(); OwnedPointerVector<BatchedUpsertDetail> upsertedOwned; vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector(); // // Apply each batch item, possibly bulking some items together in the write lock. // Stops on error if batch is ordered. // bulkExecute( request, &upserted, &writeErrors ); // // Try to enforce the write concern if everything succeeded (unordered or ordered) // OR if something succeeded and we're unordered. // auto_ptr<WCErrorDetail> wcError; bool needToEnforceWC = writeErrors.empty() || ( !request.getOrdered() && writeErrors.size() < request.sizeWriteOps() ); if ( needToEnforceWC ) { _client->curop()->setMessage( "waiting for write concern" ); WriteConcernResult res; Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res ); if ( !status.isOK() ) { wcError.reset( toWriteConcernError( status, res ) ); } } // // Refresh metadata if needed // bool staleBatch = !writeErrors.empty() && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion; if ( staleBatch ) { const BatchedRequestMetadata* requestMetadata = request.getMetadata(); dassert( requestMetadata ); // Make sure our shard name is set or is the same as what was set previously if ( shardingState.setShardName( requestMetadata->getShardName() ) ) { // // First, we refresh metadata if we need to based on the requested version. // ChunkVersion latestShardVersion; shardingState.refreshMetadataIfNeeded( request.getTargetingNS(), requestMetadata->getShardVersion(), &latestShardVersion ); // Report if we're still changing our metadata // TODO: Better reporting per-collection if ( shardingState.inCriticalMigrateSection() ) { noteInCriticalSection( writeErrors.back() ); } if ( queueForMigrationCommit ) { // // Queue up for migration to end - this allows us to be sure that clients will // not repeatedly try to refresh metadata that is not yet written to the config // server. Not necessary for correctness. // Exposed as optional parameter to allow testing of queuing behavior with // different network timings. // const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion(); // // Only wait if we're an older version (in the current collection epoch) and // we're not write compatible, implying that the current migration is affecting // writes. // if ( requestShardVersion.isOlderThan( latestShardVersion ) && !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) { while ( shardingState.inCriticalMigrateSection() ) { log() << "write request to old shard version " << requestMetadata->getShardVersion().toString() << " waiting for migration commit" << endl; shardingState.waitTillNotInCriticalSection( 10 /* secs */); } } } } else { // If our shard name is stale, our version must have been stale as well dassert( writeErrors.size() == request.sizeWriteOps() ); } } // // Construct response // response->setOk( true ); if ( !silentWC ) { if ( upserted.size() ) { response->setUpsertDetails( upserted ); } if ( writeErrors.size() ) { response->setErrDetails( writeErrors ); } if ( wcError.get() ) { response->setWriteConcernError( wcError.release() ); } const repl::ReplicationCoordinator::Mode replMode = repl::getGlobalReplicationCoordinator()->getReplicationMode(); if (replMode != repl::ReplicationCoordinator::modeNone) { response->setLastOp( _client->getLastOp() ); if (replMode == repl::ReplicationCoordinator::modeReplSet) { response->setElectionId(repl::theReplSet->getElectionId()); } } // Set the stats for the response response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched + _stats->numDeleted ); if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) response->setNModified( _stats->numModified ); } dassert( response->isValid( NULL ) ); }
void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest, BatchedCommandResponse* clientResponse ) { BatchWriteOp batchOp; batchOp.initClientRequest( &clientRequest ); // Current batch status bool refreshedTargeter = false; int rounds = 0; int numCompletedOps = 0; int numRoundsWithoutProgress = 0; while ( !batchOp.isFinished() ) { // // Get child batches to send using the targeter // // Targeting errors can be caused by remote metadata changing (the collection could have // been dropped and recreated, for example with a new shard key). If a remote metadata // change occurs *before* a client sends us a batch, we need to make sure that we don't // error out just because we're staler than the client - otherwise mongos will be have // unpredictable behavior. // // (If a metadata change happens *during* or *after* a client sends us a batch, however, // we make no guarantees about delivery.) // // For this reason, we don't record targeting errors until we've refreshed our targeting // metadata at least once *after* receiving the client batch - at that point, we know: // // 1) our new metadata is the same as the metadata when the client sent a batch, and so // targeting errors are real. // OR // 2) our new metadata is a newer version than when the client sent a batch, and so // the metadata must have changed after the client batch was sent. We don't need to // deliver in this case, since for all the client knows we may have gotten the batch // exactly when the metadata changed. // vector<TargetedWriteBatch*> childBatches; // If we've already had a targeting error, we've refreshed the metadata once and can // record target errors definitively. bool recordTargetErrors = refreshedTargeter; Status targetStatus = batchOp.targetBatch( *_targeter, recordTargetErrors, &childBatches ); if ( !targetStatus.isOK() ) { // Don't do anything until a targeter refresh _targeter->noteCouldNotTarget(); refreshedTargeter = true; ++_stats->numTargetErrors; dassert( childBatches.size() == 0u ); } // // Send all child batches // size_t numSent = 0; size_t numToSend = childBatches.size(); bool remoteMetadataChanging = false; while ( numSent != numToSend ) { // Collect batches out on the network, mapped by endpoint HostBatchMap pendingBatches; // // Send side // // Get as many batches as we can at once for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin(); it != childBatches.end(); ++it ) { // // Collect the info needed to dispatch our targeted batch // TargetedWriteBatch* nextBatch = *it; // If the batch is NULL, we sent it previously, so skip if ( nextBatch == NULL ) continue; // Figure out what host we need to dispatch our targeted batch ConnectionString shardHost; Status resolveStatus = _resolver->chooseWriteHost( nextBatch->getEndpoint() .shardName, &shardHost ); if ( !resolveStatus.isOK() ) { ++_stats->numResolveErrors; // Record a resolve failure // TODO: It may be necessary to refresh the cache if stale, or maybe just // cancel and retarget the batch WriteErrorDetail error; buildErrorFrom( resolveStatus, &error ); batchOp.noteBatchError( *nextBatch, error ); // We're done with this batch *it = NULL; --numToSend; continue; } // If we already have a batch for this host, wait until the next time HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost ); if ( pendingIt != pendingBatches.end() ) continue; // // We now have all the info needed to dispatch the batch // BatchedCommandRequest request( clientRequest.getBatchType() ); batchOp.buildBatchRequest( *nextBatch, &request ); // Internally we use full namespaces for request/response, but we send the // command to a database with the collection name in the request. NamespaceString nss( request.getNS() ); request.setNS( nss.coll() ); _dispatcher->addCommand( shardHost, nss.db(), request ); // Indicate we're done by setting the batch to NULL // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast // endpoints for the same host, so this should be pretty efficient without // moving stuff around. *it = NULL; // Recv-side is responsible for cleaning up the nextBatch when used pendingBatches.insert( make_pair( shardHost, nextBatch ) ); } // Send them all out _dispatcher->sendAll(); numSent += pendingBatches.size(); // // Recv side // while ( _dispatcher->numPending() > 0 ) { // Get the response ConnectionString shardHost; BatchedCommandResponse response; Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response ); // Get the TargetedWriteBatch to find where to put the response dassert( pendingBatches.find( shardHost ) != pendingBatches.end() ); TargetedWriteBatch* batchRaw = pendingBatches.find( shardHost )->second; scoped_ptr<TargetedWriteBatch> batch( batchRaw ); if ( dispatchStatus.isOK() ) { TrackedErrors trackedErrors; trackedErrors.startTracking( ErrorCodes::StaleShardVersion ); // Dispatch was ok, note response batchOp.noteBatchResponse( *batch, response, &trackedErrors ); // Note if anything was stale const vector<ShardError*>& staleErrors = trackedErrors.getErrors( ErrorCodes::StaleShardVersion ); if ( staleErrors.size() > 0 ) { noteStaleResponses( staleErrors, _targeter ); ++_stats->numStaleBatches; } // Remember if the shard is actively changing metadata right now if ( isShardMetadataChanging( staleErrors ) ) { remoteMetadataChanging = true; } // Remember that we successfully wrote to this shard // NOTE: This will record lastOps for shards where we actually didn't update // or delete any documents, which preserves old behavior but is conservative _stats->noteWriteAt( shardHost, response.isLastOpSet() ? response.getLastOp() : OpTime(), response.isElectionIdSet() ? response.getElectionId() : OID()); } else { // Error occurred dispatching, note it WriteErrorDetail error; buildErrorFrom( dispatchStatus, &error ); batchOp.noteBatchError( *batch, error ); } } } ++rounds; ++_stats->numRounds; // If we're done, get out if ( batchOp.isFinished() ) break; // MORE WORK TO DO // // Refresh the targeter if we need to (no-op if nothing stale) // bool targeterChanged = false; Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged ); if ( !refreshStatus.isOK() ) { // It's okay if we can't refresh, we'll just record errors for the ops if // needed. warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() ) << endl; } // // Ensure progress is being made toward completing the batch op // int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed ); if ( currCompletedOps == numCompletedOps && !targeterChanged && !remoteMetadataChanging ) { ++numRoundsWithoutProgress; } else { numRoundsWithoutProgress = 0; } numCompletedOps = currCompletedOps; if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) { stringstream msg; msg << "no progress was made executing batch write op in " << clientRequest.getNS() << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps << " ops completed in " << rounds << " rounds total)"; WriteErrorDetail error; buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error ); batchOp.setBatchError( error ); break; } } batchOp.buildClientResponse( clientResponse ); }
void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest, BatchedCommandResponse* clientResponse ) { BatchWriteOp batchOp; batchOp.initClientRequest( &clientRequest ); int numTargetErrors = 0; int numStaleBatches = 0; for ( int rounds = 0; !batchOp.isFinished(); rounds++ ) { // // Refresh the targeter if we need to (no-op if nothing stale) // Status refreshStatus = _targeter->refreshIfNeeded(); if ( !refreshStatus.isOK() ) { // It's okay if we can't refresh, we'll just record errors for the ops if // needed. warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() ) << endl; } // // Get child batches to send // vector<TargetedWriteBatch*> childBatches; // // Targeting errors can be caused by remote metadata changing (the collection could have // been dropped and recreated, for example with a new shard key). If a remote metadata // change occurs *before* a client sends us a batch, we need to make sure that we don't // error out just because we're staler than the client - otherwise mongos will be have // unpredictable behavior. // // (If a metadata change happens *during* or *after* a client sends us a batch, however, // we make no guarantees about delivery.) // // For this reason, we don't record targeting errors until we've refreshed our targeting // metadata at least once *after* receiving the client batch - at that point, we know: // // 1) our new metadata is the same as the metadata when the client sent a batch, and so // targeting errors are real. // OR // 2) our new metadata is a newer version than when the client sent a batch, and so // the metadata must have changed after the client batch was sent. We don't need to // deliver in this case, since for all the client knows we may have gotten the batch // exactly when the metadata changed. // // If we've had a targeting error or stale error, we've refreshed the metadata once and // can record target errors. bool recordTargetErrors = numTargetErrors > 0 || numStaleBatches > 0; Status targetStatus = batchOp.targetBatch( *_targeter, recordTargetErrors, &childBatches ); if ( !targetStatus.isOK() ) { _targeter->noteCouldNotTarget(); ++numTargetErrors; continue; } // // Send all child batches // size_t numSent = 0; while ( numSent != childBatches.size() ) { // Collect batches out on the network, mapped by endpoint EndpointBatchMap pendingBatches; // // Send side // // Get as many batches as we can at once for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin(); it != childBatches.end(); ++it ) { TargetedWriteBatch* nextBatch = *it; // If the batch is NULL, we sent it previously, so skip if ( nextBatch == NULL ) continue; const ConnectionString& hostEndpoint = nextBatch->getEndpoint().shardHost; EndpointBatchMap::iterator pendingIt = pendingBatches.find( &hostEndpoint ); // If we already have a batch for this endpoint, continue if ( pendingIt != pendingBatches.end() ) continue; // Otherwise send it out to the endpoint via a command to a database BatchedCommandRequest request( clientRequest.getBatchType() ); batchOp.buildBatchRequest( *nextBatch, &request ); // Internally we use full namespaces for request/response, but we send the // command to a database with the collection name in the request. NamespaceString nss( request.getNS() ); request.setNS( nss.coll() ); _dispatcher->addCommand( hostEndpoint, nss.db(), request ); // Indicate we're done by setting the batch to NULL // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast // endpoints for the same host, so this should be pretty efficient without // moving stuff around. *it = NULL; // Recv-side is responsible for cleaning up the nextBatch when used pendingBatches.insert( make_pair( &hostEndpoint, nextBatch ) ); } // Send them all out _dispatcher->sendAll(); numSent += pendingBatches.size(); // // Recv side // while ( _dispatcher->numPending() > 0 ) { // Get the response ConnectionString endpoint; BatchedCommandResponse response; Status dispatchStatus = _dispatcher->recvAny( &endpoint, &response ); // Get the TargetedWriteBatch to find where to put the response TargetedWriteBatch* batchRaw = pendingBatches.find( &endpoint )->second; scoped_ptr<TargetedWriteBatch> batch( batchRaw ); if ( dispatchStatus.isOK() ) { TrackedErrors trackedErrors; trackedErrors.startTracking( ErrorCodes::StaleShardVersion ); // Dispatch was ok, note response batchOp.noteBatchResponse( *batch, response, &trackedErrors ); // Note if anything was stale const vector<ShardError*>& staleErrors = trackedErrors.getErrors( ErrorCodes::StaleShardVersion ); if ( staleErrors.size() > 0 ) { noteStaleResponses( staleErrors, _targeter ); ++numStaleBatches; } } else { // Error occurred dispatching, note it BatchedErrorDetail error; buildErrorFrom( dispatchStatus, &error ); batchOp.noteBatchError( *batch, error ); } } } } batchOp.buildClientResponse( clientResponse ); }
bool batchErrorToLastError(const BatchedCommandRequest& request, const BatchedCommandResponse& response, LastError* error) { unique_ptr<WriteErrorDetail> commandError; WriteErrorDetail* lastBatchError = NULL; if (!response.getOk()) { // Command-level error, all writes failed commandError.reset(new WriteErrorDetail); buildErrorFromResponse(response, commandError.get()); lastBatchError = commandError.get(); } else if (response.isErrDetailsSet()) { // The last error in the batch is always reported - this matches expected COE // semantics for insert batches. For updates and deletes, error is only reported // if the error was on the last item. const bool lastOpErrored = response.getErrDetails().back()->getIndex() == static_cast<int>(request.sizeWriteOps() - 1); if (request.getBatchType() == BatchedCommandRequest::BatchType_Insert || lastOpErrored) { lastBatchError = response.getErrDetails().back(); } } else { // We don't care about write concern errors, these happen in legacy mode in GLE. } // Record an error if one exists if (lastBatchError) { string errMsg = lastBatchError->getErrMessage(); error->setLastError(lastBatchError->getErrCode(), errMsg.empty() ? "see code for details" : errMsg.c_str()); return true; } // Record write stats otherwise // NOTE: For multi-write batches, our semantics change a little because we don't have // un-aggregated "n" stats. if (request.getBatchType() == BatchedCommandRequest::BatchType_Update) { BSONObj upsertedId; if (response.isUpsertDetailsSet()) { // Only report the very last item's upserted id if applicable if (response.getUpsertDetails().back()->getIndex() + 1 == static_cast<int>(request.sizeWriteOps())) { upsertedId = response.getUpsertDetails().back()->getUpsertedID(); } } int numUpserted = 0; if (response.isUpsertDetailsSet()) numUpserted = response.sizeUpsertDetails(); int numMatched = response.getN() - numUpserted; dassert(numMatched >= 0); // Wrap upserted id in "upserted" field BSONObj leUpsertedId; if (!upsertedId.isEmpty()) leUpsertedId = upsertedId.firstElement().wrap(kUpsertedFieldName); error->recordUpdate(numMatched > 0, response.getN(), leUpsertedId); } else if (request.getBatchType() == BatchedCommandRequest::BatchType_Delete) { error->recordDelete(response.getN()); } return false; }
void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request, BatchedCommandResponse* response ) { // TODO: Lift write concern parsing out of this entirely. WriteConcernOptions writeConcern; Status status = Status::OK(); BSONObj wcDoc; if ( request.isWriteConcernSet() ) { wcDoc = request.getWriteConcern(); } if ( wcDoc.isEmpty() ) { status = writeConcern.parse( _defaultWriteConcern ); } else { status = writeConcern.parse( wcDoc ); } if ( status.isOK() ) { status = validateWriteConcern( writeConcern ); } if ( !status.isOK() ) { response->setErrCode( status.code() ); response->setErrMessage( status.reason() ); response->setOk( false ); dassert( response->isValid(NULL) ); return; } bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0 && writeConcern.syncMode == WriteConcernOptions::NONE; Timer commandTimer; OwnedPointerVector<WriteErrorDetail> writeErrorsOwned; vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector(); OwnedPointerVector<BatchedUpsertDetail> upsertedOwned; vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector(); // // Apply each batch item, possibly bulking some items together in the write lock. // Stops on error if batch is ordered. // bulkExecute( request, &upserted, &writeErrors ); // // Try to enforce the write concern if everything succeeded (unordered or ordered) // OR if something succeeded and we're unordered. // auto_ptr<WCErrorDetail> wcError; bool needToEnforceWC = writeErrors.empty() || ( !request.getOrdered() && writeErrors.size() < request.sizeWriteOps() ); if ( needToEnforceWC ) { _client->curop()->setMessage( "waiting for write concern" ); WriteConcernResult res; status = waitForWriteConcern( writeConcern, _client->getLastOp(), &res ); if ( !status.isOK() ) { wcError.reset( toWriteConcernError( status, res ) ); } } // // Refresh metadata if needed // bool staleBatch = !writeErrors.empty() && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion; if ( staleBatch ) { const BatchedRequestMetadata* requestMetadata = request.getMetadata(); dassert( requestMetadata ); // Make sure our shard name is set or is the same as what was set previously if ( shardingState.setShardName( requestMetadata->getShardName() ) ) { // // First, we refresh metadata if we need to based on the requested version. // ChunkVersion latestShardVersion; shardingState.refreshMetadataIfNeeded( request.getTargetingNS(), requestMetadata->getShardVersion(), &latestShardVersion ); // Report if we're still changing our metadata // TODO: Better reporting per-collection if ( shardingState.inCriticalMigrateSection() ) { noteInCriticalSection( writeErrors.back() ); } if ( queueForMigrationCommit ) { // // Queue up for migration to end - this allows us to be sure that clients will // not repeatedly try to refresh metadata that is not yet written to the config // server. Not necessary for correctness. // Exposed as optional parameter to allow testing of queuing behavior with // different network timings. // const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion(); // // Only wait if we're an older version (in the current collection epoch) and // we're not write compatible, implying that the current migration is affecting // writes. // if ( requestShardVersion.isOlderThan( latestShardVersion ) && !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) { while ( shardingState.inCriticalMigrateSection() ) { log() << "write request to old shard version " << requestMetadata->getShardVersion().toString() << " waiting for migration commit" << endl; shardingState.waitTillNotInCriticalSection( 10 /* secs */); } } } } else { // If our shard name is stale, our version must have been stale as well dassert( writeErrors.size() == request.sizeWriteOps() ); } } // // Construct response // response->setOk( true ); if ( !silentWC ) { if ( upserted.size() ) { response->setUpsertDetails( upserted ); upserted.clear(); } if ( writeErrors.size() ) { response->setErrDetails( writeErrors ); writeErrors.clear(); } if ( wcError.get() ) { response->setWriteConcernError( wcError.release() ); } if ( anyReplEnabled() ) { response->setLastOp( _client->getLastOp() ); if (theReplSet) { response->setElectionId( theReplSet->getElectionId() ); } } // Set the stats for the response response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched + _stats->numDeleted ); if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) response->setNModified( _stats->numModified ); } dassert( response->isValid( NULL ) ); }
/** * The core config write functionality. * * Config writes run in two passes - the first is a quick check to ensure the config servers * are all reachable, the second runs the actual write. * * TODO: Upgrade and move this logic to the config servers, a state machine implementation * is probably the next step. */ void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest, BatchedCommandResponse* clientResponse, bool fsyncCheck ) { NamespaceString nss( clientRequest.getNS() ); dassert( nss.db() == "config" || nss.db() == "admin" ); dassert( clientRequest.sizeWriteOps() == 1u ); if ( fsyncCheck ) { // // Sanity check that all configs are still reachable using fsync, preserving legacy // behavior // OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned; vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector(); // // Send side // for ( vector<ConnectionString>::iterator it = _configHosts.begin(); it != _configHosts.end(); ++it ) { ConnectionString& configHost = *it; FsyncRequest fsyncRequest; _dispatcher->addCommand( configHost, "admin", fsyncRequest ); } _dispatcher->sendAll(); // // Recv side // bool fsyncError = false; while ( _dispatcher->numPending() > 0 ) { fsyncResponses.push_back( new ConfigFsyncResponse() ); ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back(); Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost, &fsyncResponse.response ); // We've got to recv everything, no matter what if ( !dispatchStatus.isOK() ) { fsyncError = true; buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response ); } else if ( !fsyncResponse.response.getOk() ) { fsyncError = true; } } if ( fsyncError ) { combineFsyncErrors( fsyncResponses, clientResponse ); return; } else { fsyncResponsesOwned.clear(); } } // // Do the actual writes // BatchedCommandRequest configRequest( clientRequest.getBatchType() ); clientRequest.cloneTo( &configRequest ); configRequest.setNS( nss.coll() ); OwnedPointerVector<ConfigResponse> responsesOwned; vector<ConfigResponse*>& responses = responsesOwned.mutableVector(); // // Send the actual config writes // // Get as many batches as we can at once for ( vector<ConnectionString>::iterator it = _configHosts.begin(); it != _configHosts.end(); ++it ) { ConnectionString& configHost = *it; _dispatcher->addCommand( configHost, nss.db(), configRequest ); } // Send them all out _dispatcher->sendAll(); // // Recv side // while ( _dispatcher->numPending() > 0 ) { // Get the response responses.push_back( new ConfigResponse() ); ConfigResponse& configResponse = *responses.back(); Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost, &configResponse.response ); if ( !dispatchStatus.isOK() ) { buildErrorFrom( dispatchStatus, &configResponse.response ); } } combineResponses( responses, clientResponse ); }
void BatchSafeWriter::safeWriteBatch( DBClientBase* conn, const BatchedCommandRequest& request, BatchedCommandResponse* response ) { const NamespaceString nss( request.getNS() ); // N starts at zero, and we add to it for each item response->setN( 0 ); for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) { // Break on first error if we're ordered if ( request.getOrdered() && response->isErrDetailsSet() ) break; BatchItemRef itemRef( &request, static_cast<int>( i ) ); bool isLastItem = ( i == request.sizeWriteOps() - 1 ); BSONObj writeConcern; if ( isLastItem && request.isWriteConcernSet() ) { writeConcern = request.getWriteConcern(); // Pre-2.4.2 mongods react badly to 'w' being set on config servers if ( nss.db() == "config" ) writeConcern = fixWCForConfig( writeConcern ); } BSONObj gleResult; GLEErrors errors; Status status = _safeWriter->safeWrite( conn, itemRef, writeConcern, &gleResult ); if ( status.isOK() ) { status = extractGLEErrors( gleResult, &errors ); } if ( !status.isOK() ) { response->clear(); response->setOk( false ); response->setErrCode( status.code() ); response->setErrMessage( status.reason() ); return; } // // STATS HANDLING // GLEStats stats; extractGLEStats( gleResult, &stats ); // Special case for making legacy "n" field result for insert match the write // command result. if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert && !errors.writeError.get() ) { // n is always 0 for legacy inserts. dassert( stats.n == 0 ); stats.n = 1; } response->setN( response->getN() + stats.n ); if ( !stats.upsertedId.isEmpty() ) { BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail; upsertedId->setIndex( i ); upsertedId->setUpsertedID( stats.upsertedId ); response->addToUpsertDetails( upsertedId ); } response->setLastOp( stats.lastOp ); // // WRITE ERROR HANDLING // // If any error occurs (except stale config) the previous GLE was not enforced bool enforcedWC = !errors.writeError.get() || errors.writeError->getErrCode() == ErrorCodes::StaleShardVersion; // Save write error if ( errors.writeError.get() ) { errors.writeError->setIndex( i ); response->addToErrDetails( errors.writeError.release() ); } // // WRITE CONCERN ERROR HANDLING // // The last write is weird, since we enforce write concern and check the error through // the same GLE if possible. If the last GLE was an error, the write concern may not // have been enforced in that same GLE, so we need to send another after resetting the // error. if ( isLastItem ) { // Try to enforce the write concern if everything succeeded (unordered or ordered) // OR if something succeeded and we're unordered. bool needToEnforceWC = !response->isErrDetailsSet() || ( !request.getOrdered() && response->sizeErrDetails() < request.sizeWriteOps() ); if ( !enforcedWC && needToEnforceWC ) { dassert( !errors.writeError.get() ); // emptied above // Might have gotten a write concern validity error earlier, these are // enforced even if the wc isn't applied, so we ignore. errors.wcError.reset(); Status status = _safeWriter->enforceWriteConcern( conn, nss.db().toString(), writeConcern, &gleResult ); if ( status.isOK() ) { status = extractGLEErrors( gleResult, &errors ); } if ( !status.isOK() ) { response->clear(); response->setOk( false ); response->setErrCode( status.code() ); response->setErrMessage( status.reason() ); return; } } // END Write concern retry if ( errors.wcError.get() ) { response->setWriteConcernError( errors.wcError.release() ); } } } response->setOk( true ); dassert( response->isValid( NULL ) ); }
void batchErrorToLastError( const BatchedCommandRequest& request, const BatchedCommandResponse& response, LastError* error ) { scoped_ptr<BatchedErrorDetail> topLevelError; BatchedErrorDetail* lastBatchError = NULL; if ( !response.getOk() ) { int code = response.getErrCode(); // Check for batch error // We don't care about write concern errors, these happen in legacy mode in GLE if ( code != ErrorCodes::WriteConcernFailed && !response.isErrDetailsSet() ) { // Top-level error, all writes failed topLevelError.reset( new BatchedErrorDetail ); buildErrorFromResponse( response, topLevelError.get() ); lastBatchError = topLevelError.get(); } else if ( response.isErrDetailsSet() ) { // The last error in the batch is always reported - this matches expected COE // semantics for insert batches and works for single writes lastBatchError = response.getErrDetails().back(); } } // Record an error if one exists if ( lastBatchError ) { error->raiseError( lastBatchError->getErrCode(), lastBatchError->getErrMessage().c_str() ); return; } // Record write stats otherwise // NOTE: For multi-write batches, our semantics change a little because we don't have // un-aggregated "n" stats. if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) { BSONObj upsertedId; if ( response.isSingleUpsertedSet() ) upsertedId = response.getSingleUpserted(); else if( response.isUpsertDetailsSet() ) { // Only report the very last item's upserted id if applicable if ( response.getUpsertDetails().back()->getIndex() + 1 == static_cast<int>( request.sizeWriteOps() ) ) { upsertedId = response.getUpsertDetails().back()->getUpsertedID(); } } int numUpserted = 0; if ( response.isSingleUpsertedSet() ) ++numUpserted; else if ( response.isUpsertDetailsSet() ) numUpserted += response.sizeUpsertDetails(); int numUpdated = response.getN() - numUpserted; dassert( numUpdated >= 0 ); error->recordUpdate( numUpdated > 0, response.getN(), upsertedId ); } else if ( request.getBatchType() == BatchedCommandRequest::BatchType_Delete ) { error->recordDelete( response.getN() ); } }
/** * The core config write functionality. * * Config writes run in two passes - the first is a quick check to ensure the config servers * are all reachable, the second runs the actual write. * * TODO: Upgrade and move this logic to the config servers, a state machine implementation * is probably the next step. */ void ConfigCoordinator::executeBatch(const BatchedCommandRequest& clientRequest, BatchedCommandResponse* clientResponse) { const NamespaceString nss(clientRequest.getNS()); // Should never use it for anything other than DBs residing on the config server dassert(nss.db() == "config" || nss.db() == "admin"); dassert(clientRequest.sizeWriteOps() == 1u); // This is an opportunistic check that all config servers look healthy by calling // getLastError on each one of them. If there was some form of write/journaling error, get // last error would fail. { for (vector<ConnectionString>::iterator it = _configHosts.begin(); it != _configHosts.end(); ++it) { _dispatcher->addCommand(*it, "admin", RawBSONSerializable(BSON("getLastError" << true << "fsync" << true))); } _dispatcher->sendAll(); bool error = false; while (_dispatcher->numPending()) { ConnectionString host; RawBSONSerializable response; Status status = _dispatcher->recvAny(&host, &response); if (status.isOK()) { BSONObj obj = response.toBSON(); LOG(3) << "Response " << obj.toString(); // If the ok field is anything other than 1, count it as error if (!obj["ok"].trueValue()) { error = true; log() << "Config server check for host " << host << " returned error: " << response; } } else { error = true; log() << "Config server check for host " << host << " failed with status: " << status; } } // All responses should have been gathered by this point if (error) { clientResponse->setOk(false); clientResponse->setErrCode(ErrorCodes::RemoteValidationError); clientResponse->setErrMessage("Could not verify that config servers were active" " and reachable before write"); return; } } if (!_checkConfigString(clientResponse)) { return; } // // Do the actual writes // BatchedCommandRequest configRequest( clientRequest.getBatchType() ); clientRequest.cloneTo( &configRequest ); configRequest.setNS( nss.coll() ); OwnedPointerVector<ConfigResponse> responsesOwned; vector<ConfigResponse*>& responses = responsesOwned.mutableVector(); // // Send the actual config writes // // Get as many batches as we can at once for (vector<ConnectionString>::const_iterator it = _configHosts.begin(); it != _configHosts.end(); ++it) { const ConnectionString& configHost = *it; _dispatcher->addCommand(configHost, nss.db(), configRequest); } // Send them all out _dispatcher->sendAll(); // // Recv side // while (_dispatcher->numPending() > 0) { // Get the response responses.push_back(new ConfigResponse()); ConfigResponse& configResponse = *responses.back(); Status dispatchStatus = _dispatcher->recvAny(&configResponse.configHost, &configResponse.response); if (!dispatchStatus.isOK()) { buildErrorFrom(dispatchStatus, &configResponse.response); } } combineResponses(responses, clientResponse); }
bool WriteBatchExecutor::applyWriteItem( const BatchedCommandRequest& request, int index, WriteStats* stats, BatchedErrorDetail* error ) { const string& ns = request.getNS(); // Clear operation's LastError before starting. _le->reset( true ); //uint64_t itemTimeMicros = 0; bool opSuccess = true; // Each write operation executes in its own PageFaultRetryableSection. This means that // a single batch can throw multiple PageFaultException's, which is not the case for // other operations. PageFaultRetryableSection s; while ( true ) { try { // Execute the write item as a child operation of the current operation. CurOp childOp( _client, _client->curop() ); // TODO Modify CurOp "wrapped" constructor to take an opcode, so calling .reset() // is unneeded childOp.reset( _client->getRemote(), getOpCode( request.getBatchType() ) ); childOp.ensureStarted(); OpDebug& opDebug = childOp.debug(); opDebug.ns = ns; { Client::WriteContext ctx( ns ); switch ( request.getBatchType() ) { case BatchedCommandRequest::BatchType_Insert: opSuccess = applyInsert( ns, request.getInsertRequest()->getDocumentsAt( index ), &childOp, stats, error ); break; case BatchedCommandRequest::BatchType_Update: opSuccess = applyUpdate( ns, *request.getUpdateRequest()->getUpdatesAt( index ), &childOp, stats, error ); break; default: dassert( request.getBatchType() == BatchedCommandRequest::BatchType_Delete ); opSuccess = applyDelete( ns, *request.getDeleteRequest()->getDeletesAt( index ), &childOp, stats, error ); break; } } childOp.done(); //itemTimeMicros = childOp.totalTimeMicros(); opDebug.executionTime = childOp.totalTimeMillis(); opDebug.recordStats(); // Log operation if running with at least "-v", or if exceeds slow threshold. if ( logger::globalLogDomain()->shouldLog( logger::LogSeverity::Debug( 1 ) ) || opDebug.executionTime > cmdLine.slowMS + childOp.getExpectedLatencyMs() ) { MONGO_TLOG(1) << opDebug.report( childOp ) << endl; } // TODO Log operation if logLevel >= 3 and assertion thrown (as assembleResponse() // does). // Save operation to system.profile if shouldDBProfile(). if ( childOp.shouldDBProfile( opDebug.executionTime ) ) { profile( *_client, getOpCode( request.getBatchType() ), childOp ); } break; } catch ( PageFaultException& e ) { e.touch(); } } return opSuccess; }
void BatchSafeWriter::safeWriteBatch( DBClientBase* conn, const BatchedCommandRequest& request, BatchedCommandResponse* response ) { const NamespaceString nss( request.getNS() ); // N starts at zero, and we add to it for each item response->setN( 0 ); // GLE path always sets nModified to -1 (sentinel) to indicate we should omit it later. response->setNModified(-1); for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) { // Break on first error if we're ordered if ( request.getOrdered() && response->isErrDetailsSet() ) break; BatchItemRef itemRef( &request, static_cast<int>( i ) ); BSONObj gleResult; GLEErrors errors; Status status = _safeWriter->safeWrite( conn, itemRef, WriteConcernOptions::Acknowledged, &gleResult ); if ( status.isOK() ) { status = extractGLEErrors( gleResult, &errors ); } if ( !status.isOK() ) { response->clear(); response->setOk( false ); response->setErrCode( ErrorCodes::RemoteResultsUnavailable ); StringBuilder builder; builder << "could not get write error from safe write"; builder << causedBy( status.toString() ); response->setErrMessage( builder.str() ); return; } if ( errors.wcError.get() ) { response->setWriteConcernError( errors.wcError.release() ); } // // STATS HANDLING // GLEStats stats; extractGLEStats( gleResult, &stats ); // Special case for making legacy "n" field result for insert match the write // command result. if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert && !errors.writeError.get() ) { // n is always 0 for legacy inserts. dassert( stats.n == 0 ); stats.n = 1; } response->setN( response->getN() + stats.n ); if ( !stats.upsertedId.isEmpty() ) { BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail; upsertedId->setIndex( i ); upsertedId->setUpsertedID( stats.upsertedId ); response->addToUpsertDetails( upsertedId ); } response->setLastOp( stats.lastOp ); // Save write error if ( errors.writeError.get() ) { errors.writeError->setIndex( i ); response->addToErrDetails( errors.writeError.release() ); } } // // WRITE CONCERN ERROR HANDLING // // The last write is weird, since we enforce write concern and check the error through // the same GLE if possible. If the last GLE was an error, the write concern may not // have been enforced in that same GLE, so we need to send another after resetting the // error. BSONObj writeConcern; if ( request.isWriteConcernSet() ) { writeConcern = request.getWriteConcern(); // Pre-2.4.2 mongods react badly to 'w' being set on config servers if ( nss.db() == "config" ) writeConcern = fixWCForConfig( writeConcern ); } bool needToEnforceWC = WriteConcernOptions::Acknowledged.woCompare(writeConcern) != 0 && WriteConcernOptions::Unacknowledged.woCompare(writeConcern) != 0; if ( needToEnforceWC && ( !response->isErrDetailsSet() || ( !request.getOrdered() && // Not all errored. Note: implicit response->isErrDetailsSet(). response->sizeErrDetails() < request.sizeWriteOps() ))) { // Might have gotten a write concern validity error earlier, these are // enforced even if the wc isn't applied, so we ignore. response->unsetWriteConcernError(); const string dbName( nss.db().toString() ); Status status( Status::OK() ); if ( response->isErrDetailsSet() ) { const WriteErrorDetail* lastError = response->getErrDetails().back(); // If last write op was an error. if ( lastError->getIndex() == static_cast<int>( request.sizeWriteOps() - 1 )) { // Reset previous errors so we can apply the write concern no matter what // as long as it is valid. status = _safeWriter->clearErrors( conn, dbName ); } } BSONObj gleResult; if ( status.isOK() ) { status = _safeWriter->enforceWriteConcern( conn, dbName, writeConcern, &gleResult ); } GLEErrors errors; if ( status.isOK() ) { status = extractGLEErrors( gleResult, &errors ); } if ( !status.isOK() ) { auto_ptr<WCErrorDetail> wcError( new WCErrorDetail ); wcError->setErrCode( status.code() ); wcError->setErrMessage( status.reason() ); response->setWriteConcernError( wcError.release() ); } else if ( errors.wcError.get() ) { response->setWriteConcernError( errors.wcError.release() ); } } response->setOk( true ); dassert( response->isValid( NULL ) ); }