void ClusterWriter::write(OperationContext* opCtx,
                          const BatchedCommandRequest& request,
                          BatchWriteExecStats* stats,
                          BatchedCommandResponse* response) {
    const NamespaceString& nss = request.getNS();

    LastError::Disabled disableLastError(&LastError::get(opCtx->getClient()));

    // Config writes and shard writes are done differently
    if (nss.db() == NamespaceString::kConfigDb || nss.db() == NamespaceString::kAdminDb) {
        Grid::get(opCtx)->catalogClient()->writeConfigServerDirect(opCtx, request, response);
    } else {
        TargeterStats targeterStats;

        {
            ChunkManagerTargeter targeter(request.getTargetingNS(), &targeterStats);

            Status targetInitStatus = targeter.init(opCtx);
            if (!targetInitStatus.isOK()) {
                toBatchError({targetInitStatus.code(),
                              str::stream() << "unable to target"
                                            << (request.isInsertIndexRequest() ? " index" : "")
                                            << " write op for collection "
                                            << request.getTargetingNS().ns()
                                            << causedBy(targetInitStatus)},
                             response);
                return;
            }

            BatchWriteExec::executeBatch(opCtx, targeter, request, response, stats);
        }

        splitIfNeeded(opCtx, request.getNS(), targeterStats);
    }
}
void ClusterWriter::shardWrite( const BatchedCommandRequest& request,
                                BatchedCommandResponse* response ) {

    ChunkManagerTargeter targeter;
    Status targetInitStatus = targeter.init( NamespaceString( request.getTargetingNS() ) );

    if ( !targetInitStatus.isOK() ) {
        warning() << "could not initialize targeter for"
                  << ( request.isInsertIndexRequest() ? " index" : "" )
                  << " write op in collection " << request.getTargetingNS() << endl;
        // Errors will be reported in response if we are unable to target
    }

    DBClientShardResolver resolver;
    DBClientMultiCommand dispatcher;
    BatchWriteExec exec( &targeter, &resolver, &dispatcher );
    exec.executeBatch( request, response );

    if ( _autoSplit )
        splitIfNeeded( request.getNS(), *targeter.getStats() );

    _stats->setShardStats( exec.releaseStats() );
}
BatchedCommandResponse Shard::runBatchWriteCommandOnConfig(
    OperationContext* txn, const BatchedCommandRequest& batchRequest, RetryPolicy retryPolicy) {
    invariant(isConfig());

    const std::string dbname = batchRequest.getNS().db().toString();
    invariant(batchRequest.sizeWriteOps() == 1);

    const BSONObj cmdObj = batchRequest.toBSON();

    for (int retry = 1; retry <= kOnErrorNumRetries; ++retry) {
        auto response = _runCommand(txn,
                                    ReadPreferenceSetting{ReadPreference::PrimaryOnly},
                                    dbname,
                                    kDefaultConfigCommandTimeout,
                                    cmdObj);

        BatchedCommandResponse batchResponse;
        Status writeStatus =
            CommandResponse::processBatchWriteResponse(response.commandResponse, &batchResponse);

        if (!writeStatus.isOK() && response.host) {
            updateReplSetMonitor(response.host.get(), writeStatus);
        }

        if (retry < kOnErrorNumRetries && isRetriableError(writeStatus.code(), retryPolicy)) {
            LOG(2) << "Batch write command failed with retriable error and will be retried"
                   << causedBy(redact(writeStatus));
            continue;
        }

        return batchResponse;
    }

    MONGO_UNREACHABLE;
}
BatchedCommandResponse Shard::runBatchWriteCommand(OperationContext* opCtx,
                                                   const Milliseconds maxTimeMS,
                                                   const BatchedCommandRequest& batchRequest,
                                                   RetryPolicy retryPolicy) {
    const std::string dbname = batchRequest.getNS().db().toString();
    const BSONObj cmdObj = batchRequest.toBSON();

    for (int retry = 1; retry <= kOnErrorNumRetries; ++retry) {
        // Note: write commands can only be issued against a primary.
        auto swResponse = _runCommand(
            opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, dbname, maxTimeMS, cmdObj);

        BatchedCommandResponse batchResponse;
        auto writeStatus = CommandResponse::processBatchWriteResponse(swResponse, &batchResponse);

        if (retry < kOnErrorNumRetries && isRetriableError(writeStatus.code(), retryPolicy)) {
            LOG(2) << "Batch write command to " << getId()
                   << " failed with retriable error and will be retried"
                   << causedBy(redact(writeStatus));
            continue;
        }

        return batchResponse;
    }

    MONGO_UNREACHABLE;
}
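// Illustrative sketch (not part of the original source): the retry loops above always
// return on their final iteration, because a retry is only taken while
// retry < kOnErrorNumRetries; MONGO_UNREACHABLE documents that invariant. The same
// bounded-retry shape in generic form, with hypothetical names standing in for the
// Shard/CommandResponse machinery:
template <typename Attempt, typename IsRetriable>
auto runWithBoundedRetries(int maxAttempts, Attempt attempt, IsRetriable isRetriable)
    -> decltype(attempt()) {
    for (int retry = 1; retry <= maxAttempts; ++retry) {
        auto result = attempt();
        // Retry transient failures only while attempts remain.
        if (retry < maxAttempts && isRetriable(result)) {
            continue;
        }
        // The final attempt (or any non-retriable result) is returned as-is.
        return result;
    }
    MONGO_UNREACHABLE;  // the last iteration always returns
}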
void Strategy::writeOp(OperationContext* txn, int op, Request& request) {
    // make sure we have a last error
    dassert(&LastError::get(cc()));

    OwnedPointerVector<BatchedCommandRequest> commandRequestsOwned;
    vector<BatchedCommandRequest*>& commandRequests = commandRequestsOwned.mutableVector();

    msgToBatchRequests(request.m(), &commandRequests);

    for (vector<BatchedCommandRequest*>::iterator it = commandRequests.begin();
         it != commandRequests.end();
         ++it) {
        // Multiple commands registered to last error as multiple requests
        if (it != commandRequests.begin())
            LastError::get(cc()).startRequest();

        BatchedCommandRequest* commandRequest = *it;

        // Adjust namespaces for command
        NamespaceString fullNS(commandRequest->getNS());
        string cmdNS = fullNS.getCommandNS();
        // We only pass in collection name to command
        commandRequest->setNS(fullNS);

        BSONObjBuilder builder;
        BSONObj requestBSON = commandRequest->toBSON();

        {
            // Disable the last error object for the duration of the write cmd
            LastError::Disabled disableLastError(&LastError::get(cc()));
            Command::runAgainstRegistered(txn, cmdNS.c_str(), requestBSON, builder, 0);
        }

        BatchedCommandResponse commandResponse;
        bool parsed = commandResponse.parseBSON(builder.done(), NULL);
        (void)parsed;  // for compile
        dassert(parsed && commandResponse.isValid(NULL));

        // Populate the lastError object based on the write response
        LastError::get(cc()).reset();
        bool hadError =
            batchErrorToLastError(*commandRequest, commandResponse, &LastError::get(cc()));

        // Check if this is an ordered batch and we had an error which should stop processing
        if (commandRequest->getOrdered() && hadError)
            break;
    }
}
void clusterWrite( const BatchedCommandRequest& request,
                   BatchedCommandResponse* response,
                   bool autoSplit ) {

    // App-level validation of a create index insert
    if ( request.isInsertIndexRequest() ) {
        if ( request.sizeWriteOps() != 1 || request.isWriteConcernSet() ) {
            // Invalid request to create index
            response->setOk( false );
            response->setErrCode( ErrorCodes::InvalidOptions );
            response->setErrMessage( "invalid batch request for index creation" );
            dassert( response->isValid( NULL ) );
            return;
        }
    }

    // Config writes and shard writes are done differently
    string dbName = NamespaceString( request.getNS() ).db().toString();
    if ( dbName == "config" || dbName == "admin" ) {

        bool verboseWC = request.isVerboseWC();

        // We only support batch sizes of one and {w:0} write concern for config writes
        if ( request.sizeWriteOps() != 1 || ( verboseWC && request.isWriteConcernSet() ) ) {
            // Invalid config server write
            response->setOk( false );
            response->setErrCode( ErrorCodes::InvalidOptions );
            response->setErrMessage( "invalid batch request for config write" );
            dassert( response->isValid( NULL ) );
            return;
        }

        // We need to support "best-effort" writes for pings to the config server.
        // {w:0} (!verbose) writes are interpreted as best-effort in this case - they may still
        // error, but do not do the initial fsync check.
        configWrite( request, response, verboseWC );
    }
    else {
        shardWrite( request, response, autoSplit );
    }
}
void CatalogManagerReplicaSet::writeConfigServerDirect(const BatchedCommandRequest& batchRequest,
                                                       BatchedCommandResponse* batchResponse) {
    std::string dbname = batchRequest.getNS().db().toString();
    invariant(dbname == "config" || dbname == "admin");
    const BSONObj cmdObj = batchRequest.toBSON();

    auto response = _runConfigServerCommandWithNotMasterRetries(dbname, cmdObj);
    if (!response.isOK()) {
        _toBatchError(response.getStatus(), batchResponse);
        return;
    }

    string errmsg;
    if (!batchResponse->parseBSON(response.getValue(), &errmsg)) {
        _toBatchError(Status(ErrorCodes::FailedToParse,
                             str::stream() << "Failed to parse config server response: "
                                           << errmsg),
                      batchResponse);
    }
}
// static
Status WriteBatchExecutor::validateBatch( const BatchedCommandRequest& request ) {

    // Validate namespace
    const NamespaceString nss = NamespaceString( request.getNS() );
    if ( !nss.isValid() ) {
        return Status( ErrorCodes::InvalidNamespace, nss.ns() + " is not a valid namespace" );
    }

    // Make sure we can write to the namespace
    Status allowedStatus = userAllowedWriteNS( nss );
    if ( !allowedStatus.isOK() ) {
        return allowedStatus;
    }

    // Validate insert index requests
    // TODO: Push insert index requests through createIndex once all upgrade paths support it
    string errMsg;
    if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
        return Status( ErrorCodes::InvalidOptions, errMsg );
    }

    return Status::OK();
}
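// Illustrative sketch (not from the original source): a caller would typically run
// validateBatch() up front and convert a failing Status into a batch-level error via
// toBatchError(), exactly as the executeBatch() variants below do inline. The wrapper
// name handleWrite is hypothetical.
void handleWrite( WriteBatchExecutor* executor,
                  const BatchedCommandRequest& request,
                  BatchedCommandResponse* response ) {
    Status validStatus = WriteBatchExecutor::validateBatch( request );
    if ( !validStatus.isOK() ) {
        // Surface the validation failure as a top-level batch error and stop.
        toBatchError( validStatus, response );
        return;
    }
    executor->executeBatch( request, response );
}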
void BatchSafeWriter::safeWriteBatch( DBClientBase* conn,
                                      const BatchedCommandRequest& request,
                                      BatchedCommandResponse* response ) {

    const NamespaceString nss( request.getNS() );

    // N starts at zero, and we add to it for each item
    response->setN( 0 );

    // GLE path always sets nModified to -1 (sentinel) to indicate we should omit it later.
    response->setNModified(-1);

    for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) {

        // Break on first error if we're ordered
        if ( request.getOrdered() && response->isErrDetailsSet() )
            break;

        BatchItemRef itemRef( &request, static_cast<int>( i ) );

        BSONObj gleResult;
        GLEErrors errors;
        Status status = _safeWriter->safeWrite( conn,
                                                itemRef,
                                                WriteConcernOptions::Acknowledged,
                                                &gleResult );
        if ( status.isOK() ) {
            status = extractGLEErrors( gleResult, &errors );
        }

        if ( !status.isOK() ) {
            response->clear();
            response->setOk( false );
            response->setErrCode( ErrorCodes::RemoteResultsUnavailable );

            StringBuilder builder;
            builder << "could not get write error from safe write";
            builder << causedBy( status.toString() );
            response->setErrMessage( builder.str() );
            return;
        }

        if ( errors.wcError.get() ) {
            response->setWriteConcernError( errors.wcError.release() );
        }

        //
        // STATS HANDLING
        //

        GLEStats stats;
        extractGLEStats( gleResult, &stats );

        // Special case for making legacy "n" field result for insert match the write
        // command result.
        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert
             && !errors.writeError.get() ) {
            // n is always 0 for legacy inserts.
            dassert( stats.n == 0 );
            stats.n = 1;
        }

        response->setN( response->getN() + stats.n );

        if ( !stats.upsertedId.isEmpty() ) {
            BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail;
            upsertedId->setIndex( i );
            upsertedId->setUpsertedID( stats.upsertedId );
            response->addToUpsertDetails( upsertedId );
        }

        response->setLastOp( stats.lastOp );

        // Save write error
        if ( errors.writeError.get() ) {
            errors.writeError->setIndex( i );
            response->addToErrDetails( errors.writeError.release() );
        }
    }

    //
    // WRITE CONCERN ERROR HANDLING
    //

    // The last write is weird, since we enforce write concern and check the error through
    // the same GLE if possible.  If the last GLE was an error, the write concern may not
    // have been enforced in that same GLE, so we need to send another after resetting the
    // error.

    BSONObj writeConcern;
    if ( request.isWriteConcernSet() ) {
        writeConcern = request.getWriteConcern();
        // Pre-2.4.2 mongods react badly to 'w' being set on config servers
        if ( nss.db() == "config" )
            writeConcern = fixWCForConfig( writeConcern );
    }

    bool needToEnforceWC =
        WriteConcernOptions::Acknowledged.woCompare(writeConcern) != 0 &&
        WriteConcernOptions::Unacknowledged.woCompare(writeConcern) != 0;

    if ( needToEnforceWC &&
         ( !response->isErrDetailsSet() ||
           ( !request.getOrdered() &&
             // Not all errored. Note: implicit response->isErrDetailsSet().
             response->sizeErrDetails() < request.sizeWriteOps() ) ) ) {

        // Might have gotten a write concern validity error earlier, these are
        // enforced even if the wc isn't applied, so we ignore.
        response->unsetWriteConcernError();

        const string dbName( nss.db().toString() );

        Status status( Status::OK() );
        if ( response->isErrDetailsSet() ) {
            const WriteErrorDetail* lastError = response->getErrDetails().back();

            // If last write op was an error.
            if ( lastError->getIndex() == static_cast<int>( request.sizeWriteOps() - 1 ) ) {
                // Reset previous errors so we can apply the write concern no matter what
                // as long as it is valid.
                status = _safeWriter->clearErrors( conn, dbName );
            }
        }

        BSONObj gleResult;
        if ( status.isOK() ) {
            status = _safeWriter->enforceWriteConcern( conn, dbName, writeConcern, &gleResult );
        }

        GLEErrors errors;
        if ( status.isOK() ) {
            status = extractGLEErrors( gleResult, &errors );
        }

        if ( !status.isOK() ) {
            auto_ptr<WCErrorDetail> wcError( new WCErrorDetail );
            wcError->setErrCode( status.code() );
            wcError->setErrMessage( status.reason() );
            response->setWriteConcernError( wcError.release() );
        }
        else if ( errors.wcError.get() ) {
            response->setWriteConcernError( errors.wcError.release() );
        }
    }

    response->setOk( true );
    dassert( response->isValid( NULL ) );
}
void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                       BatchedCommandResponse* response ) {

    // Validate namespace
    const NamespaceString nss = NamespaceString( request.getNS() );
    if ( !nss.isValid() ) {
        toBatchError( Status( ErrorCodes::InvalidNamespace,
                              nss.ns() + " is not a valid namespace" ),
                      response );
        return;
    }

    // Make sure we can write to the namespace
    Status allowedStatus = userAllowedWriteNS( nss );
    if ( !allowedStatus.isOK() ) {
        toBatchError( allowedStatus, response );
        return;
    }

    // Validate insert index requests
    // TODO: Push insert index requests through createIndex once all upgrade paths support it
    string errMsg;
    if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
        toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
        return;
    }

    // Validate write concern
    // TODO: Lift write concern parsing out of this entirely
    WriteConcernOptions writeConcern;

    BSONObj wcDoc;
    if ( request.isWriteConcernSet() ) {
        wcDoc = request.getWriteConcern();
    }

    Status wcStatus = Status::OK();
    if ( wcDoc.isEmpty() ) {

        // The default write concern if empty is w : 1
        // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1
        wcStatus = writeConcern.parse(
            _defaultWriteConcern.isEmpty() ?
                WriteConcernOptions::Acknowledged : _defaultWriteConcern );

        if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
            writeConcern.wNumNodes = 1;
        }
    }
    else {
        wcStatus = writeConcern.parse( wcDoc );
    }

    if ( wcStatus.isOK() ) {
        wcStatus = validateWriteConcern( writeConcern );
    }

    if ( !wcStatus.isOK() ) {
        toBatchError( wcStatus, response );
        return;
    }

    if ( request.sizeWriteOps() == 0u ) {
        toBatchError( Status( ErrorCodes::InvalidLength,
                              "no write ops were included in the batch" ),
                      response );
        return;
    }

    // Validate batch size
    if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
        toBatchError( Status( ErrorCodes::InvalidLength,
                              stream() << "exceeded maximum write batch size of "
                                       << BatchedCommandRequest::kMaxWriteBatchSize ),
                      response );
        return;
    }

    //
    // End validation
    //

    bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                    && writeConcern.syncMode == WriteConcernOptions::NONE;

    Timer commandTimer;

    OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
    vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

    OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
    vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

    //
    // Apply each batch item, possibly bulking some items together in the write lock.
    // Stops on error if batch is ordered.
    //

    bulkExecute( request, &upserted, &writeErrors );

    //
    // Try to enforce the write concern if everything succeeded (unordered or ordered)
    // OR if something succeeded and we're unordered.
    //

    auto_ptr<WCErrorDetail> wcError;
    bool needToEnforceWC = writeErrors.empty()
                           || ( !request.getOrdered()
                                && writeErrors.size() < request.sizeWriteOps() );

    if ( needToEnforceWC ) {

        _client->curop()->setMessage( "waiting for write concern" );

        WriteConcernResult res;
        Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

        if ( !status.isOK() ) {
            wcError.reset( toWriteConcernError( status, res ) );
        }
    }

    //
    // Refresh metadata if needed
    //

    bool staleBatch = !writeErrors.empty()
                      && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

    if ( staleBatch ) {

        const BatchedRequestMetadata* requestMetadata = request.getMetadata();
        dassert( requestMetadata );

        // Make sure our shard name is set or is the same as what was set previously
        if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

            //
            // First, we refresh metadata if we need to based on the requested version.
            //

            ChunkVersion latestShardVersion;
            shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                   requestMetadata->getShardVersion(),
                                                   &latestShardVersion );

            // Report if we're still changing our metadata
            // TODO: Better reporting per-collection
            if ( shardingState.inCriticalMigrateSection() ) {
                noteInCriticalSection( writeErrors.back() );
            }

            if ( queueForMigrationCommit ) {

                //
                // Queue up for migration to end - this allows us to be sure that clients will
                // not repeatedly try to refresh metadata that is not yet written to the config
                // server.  Not necessary for correctness.
                // Exposed as optional parameter to allow testing of queuing behavior with
                // different network timings.
                //

                const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                //
                // Only wait if we're an older version (in the current collection epoch) and
                // we're not write compatible, implying that the current migration is affecting
                // writes.
                //

                if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                     !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                    while ( shardingState.inCriticalMigrateSection() ) {

                        log() << "write request to old shard version "
                              << requestMetadata->getShardVersion().toString()
                              << " waiting for migration commit" << endl;

                        shardingState.waitTillNotInCriticalSection( 10 /* secs */);
                    }
                }
            }
        }
        else {
            // If our shard name is stale, our version must have been stale as well
            dassert( writeErrors.size() == request.sizeWriteOps() );
        }
    }

    //
    // Construct response
    //

    response->setOk( true );

    if ( !silentWC ) {

        if ( upserted.size() ) {
            response->setUpsertDetails( upserted );
        }

        if ( writeErrors.size() ) {
            response->setErrDetails( writeErrors );
        }

        if ( wcError.get() ) {
            response->setWriteConcernError( wcError.release() );
        }

        const repl::ReplicationCoordinator::Mode replMode =
            repl::getGlobalReplicationCoordinator()->getReplicationMode();
        if (replMode != repl::ReplicationCoordinator::modeNone) {
            response->setLastOp( _client->getLastOp() );
            if (replMode == repl::ReplicationCoordinator::modeReplSet) {
                response->setElectionId(repl::theReplSet->getElectionId());
            }
        }

        // Set the stats for the response
        response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                        + _stats->numDeleted );
        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
            response->setNModified( _stats->numModified );
    }

    dassert( response->isValid( NULL ) );
}
void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                   BatchedCommandResponse* clientResponse ) {

    BatchWriteOp batchOp;
    batchOp.initClientRequest( &clientRequest );

    // Current batch status
    bool refreshedTargeter = false;
    int rounds = 0;
    int numCompletedOps = 0;
    int numRoundsWithoutProgress = 0;

    while ( !batchOp.isFinished() ) {

        //
        // Get child batches to send using the targeter
        //
        // Targeting errors can be caused by remote metadata changing (the collection could have
        // been dropped and recreated, for example with a new shard key).  If a remote metadata
        // change occurs *before* a client sends us a batch, we need to make sure that we don't
        // error out just because we're staler than the client - otherwise mongos will behave
        // unpredictably.
        //
        // (If a metadata change happens *during* or *after* a client sends us a batch, however,
        // we make no guarantees about delivery.)
        //
        // For this reason, we don't record targeting errors until we've refreshed our targeting
        // metadata at least once *after* receiving the client batch - at that point, we know:
        //
        // 1) our new metadata is the same as the metadata when the client sent a batch, and so
        //    targeting errors are real.
        // OR
        // 2) our new metadata is a newer version than when the client sent a batch, and so
        //    the metadata must have changed after the client batch was sent.  We don't need to
        //    deliver in this case, since for all the client knows we may have gotten the batch
        //    exactly when the metadata changed.
        //

        vector<TargetedWriteBatch*> childBatches;

        // If we've already had a targeting error, we've refreshed the metadata once and can
        // record target errors definitively.
        bool recordTargetErrors = refreshedTargeter;
        Status targetStatus = batchOp.targetBatch( *_targeter,
                                                   recordTargetErrors,
                                                   &childBatches );
        if ( !targetStatus.isOK() ) {
            // Don't do anything until a targeter refresh
            _targeter->noteCouldNotTarget();
            refreshedTargeter = true;
            ++_stats->numTargetErrors;
            dassert( childBatches.size() == 0u );
        }

        //
        // Send all child batches
        //

        size_t numSent = 0;
        size_t numToSend = childBatches.size();
        bool remoteMetadataChanging = false;
        while ( numSent != numToSend ) {

            // Collect batches out on the network, mapped by endpoint
            HostBatchMap pendingBatches;

            //
            // Send side
            //

            // Get as many batches as we can at once
            for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                  it != childBatches.end();
                  ++it ) {

                //
                // Collect the info needed to dispatch our targeted batch
                //

                TargetedWriteBatch* nextBatch = *it;
                // If the batch is NULL, we sent it previously, so skip
                if ( nextBatch == NULL )
                    continue;

                // Figure out what host we need to dispatch our targeted batch
                ConnectionString shardHost;
                Status resolveStatus =
                    _resolver->chooseWriteHost( nextBatch->getEndpoint().shardName,
                                                &shardHost );
                if ( !resolveStatus.isOK() ) {

                    ++_stats->numResolveErrors;

                    // Record a resolve failure
                    // TODO: It may be necessary to refresh the cache if stale, or maybe just
                    // cancel and retarget the batch
                    WriteErrorDetail error;
                    buildErrorFrom( resolveStatus, &error );
                    batchOp.noteBatchError( *nextBatch, error );

                    // We're done with this batch
                    *it = NULL;
                    --numToSend;
                    continue;
                }

                // If we already have a batch for this host, wait until the next time
                HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost );
                if ( pendingIt != pendingBatches.end() )
                    continue;

                //
                // We now have all the info needed to dispatch the batch
                //

                BatchedCommandRequest request( clientRequest.getBatchType() );
                batchOp.buildBatchRequest( *nextBatch, &request );

                // Internally we use full namespaces for request/response, but we send the
                // command to a database with the collection name in the request.
                NamespaceString nss( request.getNS() );
                request.setNS( nss.coll() );

                _dispatcher->addCommand( shardHost, nss.db(), request );

                // Indicate we're done by setting the batch to NULL
                // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                // endpoints for the same host, so this should be pretty efficient without
                // moving stuff around.
                *it = NULL;

                // Recv-side is responsible for cleaning up the nextBatch when used
                pendingBatches.insert( make_pair( shardHost, nextBatch ) );
            }

            // Send them all out
            _dispatcher->sendAll();
            numSent += pendingBatches.size();

            //
            // Recv side
            //

            while ( _dispatcher->numPending() > 0 ) {

                // Get the response
                ConnectionString shardHost;
                BatchedCommandResponse response;
                Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response );

                // Get the TargetedWriteBatch to find where to put the response
                dassert( pendingBatches.find( shardHost ) != pendingBatches.end() );
                TargetedWriteBatch* batchRaw = pendingBatches.find( shardHost )->second;
                scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                if ( dispatchStatus.isOK() ) {

                    TrackedErrors trackedErrors;
                    trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                    // Dispatch was ok, note response
                    batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                    // Note if anything was stale
                    const vector<ShardError*>& staleErrors =
                        trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                    if ( staleErrors.size() > 0 ) {
                        noteStaleResponses( staleErrors, _targeter );
                        ++_stats->numStaleBatches;
                    }

                    // Remember if the shard is actively changing metadata right now
                    if ( isShardMetadataChanging( staleErrors ) ) {
                        remoteMetadataChanging = true;
                    }

                    // Remember that we successfully wrote to this shard
                    // NOTE: This will record lastOps for shards where we actually didn't update
                    // or delete any documents, which preserves old behavior but is conservative
                    _stats->noteWriteAt( shardHost,
                                         response.isLastOpSet() ? response.getLastOp() : OpTime(),
                                         response.isElectionIdSet() ? response.getElectionId()
                                                                    : OID() );
                }
                else {
                    // Error occurred dispatching, note it
                    WriteErrorDetail error;
                    buildErrorFrom( dispatchStatus, &error );
                    batchOp.noteBatchError( *batch, error );
                }
            }
        }

        ++rounds;
        ++_stats->numRounds;

        // If we're done, get out
        if ( batchOp.isFinished() )
            break;

        // MORE WORK TO DO

        //
        // Refresh the targeter if we need to (no-op if nothing stale)
        //

        bool targeterChanged = false;
        Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged );

        if ( !refreshStatus.isOK() ) {
            // It's okay if we can't refresh, we'll just record errors for the ops if
            // needed.
            warning() << "could not refresh targeter"
                      << causedBy( refreshStatus.reason() ) << endl;
        }

        //
        // Ensure progress is being made toward completing the batch op
        //

        int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed );
        if ( currCompletedOps == numCompletedOps && !targeterChanged
             && !remoteMetadataChanging ) {
            ++numRoundsWithoutProgress;
        }
        else {
            numRoundsWithoutProgress = 0;
        }
        numCompletedOps = currCompletedOps;

        if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) {

            stringstream msg;
            msg << "no progress was made executing batch write op in " << clientRequest.getNS()
                << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                << " ops completed in " << rounds << " rounds total)";

            WriteErrorDetail error;
            buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error );
            batchOp.setBatchError( error );
            break;
        }
    }

    batchOp.buildClientResponse( clientResponse );
}
void WriteBatchExecutor::execInserts( const BatchedCommandRequest& request,
                                      std::vector<WriteErrorDetail*>* errors ) {

    // Bulk insert is a bit different from other bulk operations in that multiple request docs
    // can be processed at once inside the write lock.

    const NamespaceString nss( request.getTargetingNS() );
    scoped_ptr<BatchItemRef> currInsertItem( new BatchItemRef( &request, 0 ) );

    // Go through our request and do some preprocessing on insert documents outside the lock to
    // validate and put them in a normalized form - i.e. put _id in front and fill in
    // timestamps.  The insert document may also be invalid.
    // TODO: Might be more efficient to do in batches.
    vector<StatusWith<BSONObj> > normalInserts;
    normalizeInserts( request, &normalInserts );

    while ( currInsertItem->getItemIndex() < static_cast<int>( request.sizeWriteOps() ) ) {

        WriteOpResult currResult;

        // Don't (re-)acquire locks and create database until it's necessary
        if ( !normalInserts[currInsertItem->getItemIndex()].isOK() ) {
            currResult.error =
                toWriteError( normalInserts[currInsertItem->getItemIndex()].getStatus() );
        }
        else {

            PageFaultRetryableSection pFaultSection;

            ////////////////////////////////////
            Lock::DBWrite writeLock( nss.ns() );
            ////////////////////////////////////

            // Check version inside of write lock
            if ( checkIsMasterForCollection( nss, &currResult.error )
                 && checkShardVersion( &shardingState, request, &currResult.error )
                 && checkIndexConstraints( &shardingState, request, &currResult.error ) ) {

                //
                // Get the collection for the insert
                //

                scoped_ptr<Client::Context> writeContext;
                Collection* collection = NULL;

                try {
                    // Context once we're locked, to set more details in currentOp()
                    // TODO: better constructor?
                    writeContext.reset( new Client::Context( request.getNS(),
                                                             storageGlobalParams.dbpath,
                                                             false /* don't check version */) );

                    Database* database = writeContext->db();
                    dassert( database );
                    collection = database->getCollection( nss.ns() );

                    if ( !collection ) {
                        // Implicitly create if it doesn't exist
                        collection = database->createCollection( nss.ns() );
                        if ( !collection ) {
                            currResult.error =
                                toWriteError( Status( ErrorCodes::InternalError,
                                                      "could not create collection" ) );
                        }
                    }
                }
                catch ( const DBException& ex ) {
                    Status status(ex.toStatus());
                    if (ErrorCodes::isInterruption(status.code())) {
                        throw;
                    }
                    currResult.error = toWriteError(status);
                }

                //
                // Perform writes inside write lock
                //

                while ( collection
                        && currInsertItem->getItemIndex()
                           < static_cast<int>( request.sizeWriteOps() ) ) {

                    //
                    // BEGIN CURRENT OP
                    //

                    scoped_ptr<CurOp> currentOp( beginCurrentOp( _client, *currInsertItem ) );
                    incOpStats( *currInsertItem );

                    // Get the actual document we want to write, assuming it's valid
                    const StatusWith<BSONObj>& normalInsert = //
                        normalInserts[currInsertItem->getItemIndex()];

                    const BSONObj& normalInsertDoc =
                        normalInsert.getValue().isEmpty() ?
                            currInsertItem->getDocument() : normalInsert.getValue();

                    if ( !normalInsert.isOK() ) {
                        // This insert failed on preprocessing
                        currResult.error = toWriteError( normalInsert.getStatus() );
                    }
                    else if ( !request.isInsertIndexRequest() ) {
                        // Try the insert
                        singleInsert( *currInsertItem,
                                      normalInsertDoc,
                                      collection,
                                      &currResult );
                    }
                    else {
                        // Try the create index
                        singleCreateIndex( *currInsertItem,
                                           normalInsertDoc,
                                           collection,
                                           &currResult );
                    }

                    //
                    // END CURRENT OP
                    //

                    finishCurrentOp( _client, currentOp.get(), currResult.error );

                    // Faults release the write lock
                    if ( currResult.fault )
                        break;

                    // In general, we might have stats and errors
                    incWriteStats( *currInsertItem,
                                   currResult.stats,
                                   currResult.error,
                                   currentOp.get() );

                    // Errors release the write lock
                    if ( currResult.error )
                        break;

                    // Increment in the write lock and reset the stats for next time
                    currInsertItem.reset( new BatchItemRef( &request,
                                                            currInsertItem->getItemIndex() + 1 ) );
                    currResult.reset();

                    // Destruct curop so that our parent curop is restored, so that we
                    // record the yield count in the parent.
                    currentOp.reset(NULL);

                    // yield sometimes
                    int micros = ClientCursor::suggestYieldMicros();
                    if (micros > 0) {
                        ClientCursor::staticYield(micros, "", NULL);
                    }
                }
            }

        } // END WRITE LOCK

        //
        // Store the current error if it exists
        //

        if ( currResult.error ) {

            errors->push_back( currResult.releaseError() );
            errors->back()->setIndex( currInsertItem->getItemIndex() );

            // Break early for ordered batches
            if ( request.getOrdered() )
                break;
        }

        //
        // Fault or increment
        //

        if ( currResult.fault ) {
            // Check page fault out of lock
            currResult.fault->touch();
        }
        else {
            // Increment if not a fault
            currInsertItem.reset( new BatchItemRef( &request,
                                                    currInsertItem->getItemIndex() + 1 ) );
        }
    }
}
/**
 * The core config write functionality.
 *
 * Config writes run in two passes - the first is a quick check to ensure the config servers
 * are all reachable, the second runs the actual write.
 *
 * TODO: Upgrade and move this logic to the config servers, a state machine implementation
 * is probably the next step.
 */
void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest,
                                      BatchedCommandResponse* clientResponse,
                                      bool fsyncCheck ) {

    NamespaceString nss( clientRequest.getNS() );
    dassert( nss.db() == "config" || nss.db() == "admin" );
    dassert( clientRequest.sizeWriteOps() == 1u );

    if ( fsyncCheck ) {

        //
        // Sanity check that all configs are still reachable using fsync, preserving legacy
        // behavior
        //

        OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned;
        vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector();

        //
        // Send side
        //

        for ( vector<ConnectionString>::iterator it = _configHosts.begin();
              it != _configHosts.end();
              ++it ) {
            ConnectionString& configHost = *it;
            FsyncRequest fsyncRequest;
            _dispatcher->addCommand( configHost, "admin", fsyncRequest );
        }

        _dispatcher->sendAll();

        //
        // Recv side
        //

        bool fsyncError = false;
        while ( _dispatcher->numPending() > 0 ) {

            fsyncResponses.push_back( new ConfigFsyncResponse() );
            ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back();
            Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost,
                                                          &fsyncResponse.response );

            // We've got to recv everything, no matter what
            if ( !dispatchStatus.isOK() ) {
                fsyncError = true;
                buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response );
            }
            else if ( !fsyncResponse.response.getOk() ) {
                fsyncError = true;
            }
        }

        if ( fsyncError ) {
            combineFsyncErrors( fsyncResponses, clientResponse );
            return;
        }
        else {
            fsyncResponsesOwned.clear();
        }
    }

    //
    // Do the actual writes
    //

    BatchedCommandRequest configRequest( clientRequest.getBatchType() );
    clientRequest.cloneTo( &configRequest );
    configRequest.setNS( nss.coll() );

    OwnedPointerVector<ConfigResponse> responsesOwned;
    vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

    //
    // Send the actual config writes
    //

    // Get as many batches as we can at once
    for ( vector<ConnectionString>::iterator it = _configHosts.begin();
          it != _configHosts.end();
          ++it ) {
        ConnectionString& configHost = *it;
        _dispatcher->addCommand( configHost, nss.db(), configRequest );
    }

    // Send them all out
    _dispatcher->sendAll();

    //
    // Recv side
    //

    while ( _dispatcher->numPending() > 0 ) {

        // Get the response
        responses.push_back( new ConfigResponse() );
        ConfigResponse& configResponse = *responses.back();
        Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost,
                                                      &configResponse.response );

        if ( !dispatchStatus.isOK() ) {
            buildErrorFrom( dispatchStatus, &configResponse.response );
        }
    }

    combineResponses( responses, clientResponse );
}
bool WriteBatchExecutor::applyWriteItem( const BatchedCommandRequest& request,
                                         int index,
                                         WriteStats* stats,
                                         BatchedErrorDetail* error ) {
    const string& ns = request.getNS();

    // Clear operation's LastError before starting.
    _le->reset( true );

    //uint64_t itemTimeMicros = 0;
    bool opSuccess = true;

    // Each write operation executes in its own PageFaultRetryableSection.  This means that
    // a single batch can throw multiple PageFaultException's, which is not the case for
    // other operations.
    PageFaultRetryableSection s;
    while ( true ) {
        try {
            // Execute the write item as a child operation of the current operation.
            CurOp childOp( _client, _client->curop() );

            // TODO Modify CurOp "wrapped" constructor to take an opcode, so calling .reset()
            // is unneeded
            childOp.reset( _client->getRemote(), getOpCode( request.getBatchType() ) );

            childOp.ensureStarted();
            OpDebug& opDebug = childOp.debug();
            opDebug.ns = ns;
            {
                Client::WriteContext ctx( ns );

                switch ( request.getBatchType() ) {
                case BatchedCommandRequest::BatchType_Insert:
                    opSuccess = applyInsert( ns,
                                             request.getInsertRequest()->getDocumentsAt( index ),
                                             &childOp,
                                             stats,
                                             error );
                    break;
                case BatchedCommandRequest::BatchType_Update:
                    opSuccess = applyUpdate( ns,
                                             *request.getUpdateRequest()->getUpdatesAt( index ),
                                             &childOp,
                                             stats,
                                             error );
                    break;
                default:
                    dassert( request.getBatchType() == BatchedCommandRequest::BatchType_Delete );
                    opSuccess = applyDelete( ns,
                                             *request.getDeleteRequest()->getDeletesAt( index ),
                                             &childOp,
                                             stats,
                                             error );
                    break;
                }
            }
            childOp.done();
            //itemTimeMicros = childOp.totalTimeMicros();

            opDebug.executionTime = childOp.totalTimeMillis();
            opDebug.recordStats();

            // Log operation if running with at least "-v", or if exceeds slow threshold.
            if ( logger::globalLogDomain()->shouldLog( logger::LogSeverity::Debug( 1 ) )
                 || opDebug.executionTime > cmdLine.slowMS + childOp.getExpectedLatencyMs() ) {
                MONGO_TLOG(1) << opDebug.report( childOp ) << endl;
            }

            // TODO Log operation if logLevel >= 3 and assertion thrown (as assembleResponse()
            // does).

            // Save operation to system.profile if shouldDBProfile().
            if ( childOp.shouldDBProfile( opDebug.executionTime ) ) {
                profile( *_client, getOpCode( request.getBatchType() ), childOp );
            }
            break;
        }
        catch ( PageFaultException& e ) {
            e.touch();
        }
    }

    return opSuccess;
}
/**
 * The core config write functionality.
 *
 * Config writes run in two passes - the first is a quick check to ensure the config servers
 * are all reachable, the second runs the actual write.
 *
 * TODO: Upgrade and move this logic to the config servers, a state machine implementation
 * is probably the next step.
 */
void ConfigCoordinator::executeBatch(const BatchedCommandRequest& clientRequest,
                                     BatchedCommandResponse* clientResponse) {
    const NamespaceString nss(clientRequest.getNS());

    // Should never use it for anything other than DBs residing on the config server
    dassert(nss.db() == "config" || nss.db() == "admin");
    dassert(clientRequest.sizeWriteOps() == 1u);

    // This is an opportunistic check that all config servers look healthy by calling
    // getLastError on each one of them. If there was some form of write/journaling error, get
    // last error would fail.
    {
        for (vector<ConnectionString>::iterator it = _configHosts.begin();
             it != _configHosts.end();
             ++it) {
            _dispatcher->addCommand(*it,
                                    "admin",
                                    RawBSONSerializable(BSON("getLastError" << true <<
                                                             "fsync" << true)));
        }

        _dispatcher->sendAll();

        bool error = false;
        while (_dispatcher->numPending()) {
            ConnectionString host;
            RawBSONSerializable response;

            Status status = _dispatcher->recvAny(&host, &response);
            if (status.isOK()) {
                BSONObj obj = response.toBSON();

                LOG(3) << "Response " << obj.toString();

                // If the ok field is anything other than 1, count it as error
                if (!obj["ok"].trueValue()) {
                    error = true;
                    log() << "Config server check for host " << host
                          << " returned error: " << response;
                }
            }
            else {
                error = true;
                log() << "Config server check for host " << host
                      << " failed with status: " << status;
            }
        }

        // All responses should have been gathered by this point
        if (error) {
            clientResponse->setOk(false);
            clientResponse->setErrCode(ErrorCodes::RemoteValidationError);
            clientResponse->setErrMessage("Could not verify that config servers were active"
                                          " and reachable before write");
            return;
        }
    }

    if (!_checkConfigString(clientResponse)) {
        return;
    }

    //
    // Do the actual writes
    //

    BatchedCommandRequest configRequest( clientRequest.getBatchType() );
    clientRequest.cloneTo( &configRequest );
    configRequest.setNS( nss.coll() );

    OwnedPointerVector<ConfigResponse> responsesOwned;
    vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

    //
    // Send the actual config writes
    //

    // Get as many batches as we can at once
    for (vector<ConnectionString>::const_iterator it = _configHosts.begin();
         it != _configHosts.end();
         ++it) {
        const ConnectionString& configHost = *it;
        _dispatcher->addCommand(configHost, nss.db(), configRequest);
    }

    // Send them all out
    _dispatcher->sendAll();

    //
    // Recv side
    //

    while (_dispatcher->numPending() > 0) {
        // Get the response
        responses.push_back(new ConfigResponse());

        ConfigResponse& configResponse = *responses.back();
        Status dispatchStatus = _dispatcher->recvAny(&configResponse.configHost,
                                                     &configResponse.response);

        if (!dispatchStatus.isOK()) {
            buildErrorFrom(dispatchStatus, &configResponse.response);
        }
    }

    combineResponses(responses, clientResponse);
}
void ClusterWriter::write( const BatchedCommandRequest& request,
                           BatchedCommandResponse* response ) {

    const NamespaceString nss = NamespaceString( request.getNS() );
    if ( !nss.isValid() ) {
        toBatchError( Status( ErrorCodes::InvalidNamespace,
                              nss.ns() + " is not a valid namespace" ),
                      response );
        return;
    }

    if ( !NamespaceString::validCollectionName( nss.coll() ) ) {
        toBatchError( Status( ErrorCodes::BadValue,
                              str::stream() << "invalid collection name " << nss.coll() ),
                      response );
        return;
    }

    if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
        toBatchError( Status( ErrorCodes::FailedToParse,
                              str::stream() << "exceeded maximum write batch size of "
                                            << BatchedCommandRequest::kMaxWriteBatchSize ),
                      response );
        return;
    }

    string errMsg;
    if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
        toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
        return;
    }

    // Config writes and shard writes are done differently
    string dbName = nss.db().toString();
    if ( dbName == "config" || dbName == "admin" ) {

        bool verboseWC = request.isVerboseWC();

        // We only support batch sizes of one for config writes
        if ( request.sizeWriteOps() != 1 ) {
            toBatchError( Status( ErrorCodes::InvalidOptions,
                                  mongoutils::str::stream()
                                      << "Writes to config servers must "
                                         "have batch size of 1, found "
                                      << request.sizeWriteOps() ),
                          response );
            return;
        }

        // We only support {w: 0}, {w: 1}, and {w: 'majority'} write concern for config writes
        if ( request.isWriteConcernSet() && !validConfigWC( request.getWriteConcern() ) ) {
            toBatchError( Status( ErrorCodes::InvalidOptions,
                                  mongoutils::str::stream()
                                      << "Invalid write concern for write"
                                         " to config servers: "
                                      << request.getWriteConcern() ),
                          response );
            return;
        }

        // We need to support "best-effort" writes for pings to the config server.
        // {w:0} (!verbose) writes are interpreted as best-effort in this case - they may still
        // error, but do not do the initial fsync check.
        configWrite( request, response, verboseWC );
    }
    else {
        shardWrite( request, response );
    }
}
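// Illustrative sketch (not the original implementation): the comment above says only
// {w: 0}, {w: 1}, and {w: 'majority'} are accepted for config writes, so a
// validConfigWC-style predicate could look roughly like this. The body is an assumption
// offered only to show the shape of the check; the real function may differ.
static bool validConfigWCSketch( const BSONObj& writeConcern ) {
    // An empty write concern means "use the default", which is allowed.
    if ( writeConcern.isEmpty() )
        return true;

    BSONElement w = writeConcern["w"];
    if ( w.isNumber() ) {
        // Numeric w must be 0 or 1; a config write cannot meaningfully target more nodes.
        return w.numberInt() == 0 || w.numberInt() == 1;
    }
    if ( w.type() == String ) {
        // The only accepted string form per the comment above is 'majority'.
        return w.str() == "majority";
    }

    // Anything else is rejected.
    return false;
}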
void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                       BatchedCommandResponse* response ) {
    Timer commandTimer;

    WriteStats stats;
    std::auto_ptr<BatchedErrorDetail> error( new BatchedErrorDetail );
    BSONObj upsertedID = BSONObj();
    bool batchSuccess = true;
    bool staleBatch = false;

    // Apply each batch item, stopping on an error if we were asked to apply the batch
    // sequentially.
    size_t numBatchOps = request.sizeWriteOps();
    bool verbose = verboseResponse( request );
    for ( size_t i = 0; i < numBatchOps; i++ ) {
        if ( applyWriteItem( BatchItemRef( &request, i ),
                             &stats,
                             &upsertedID,
                             error.get() ) ) {

            // In case updates turned out to be upserts, the callers may be interested
            // in learning what _id was used for that document.
            if ( !upsertedID.isEmpty() ) {
                if ( numBatchOps == 1 ) {
                    response->setSingleUpserted(upsertedID);
                }
                else if ( verbose ) {
                    std::auto_ptr<BatchedUpsertDetail> upsertDetail(new BatchedUpsertDetail);
                    upsertDetail->setIndex(i);
                    upsertDetail->setUpsertedID(upsertedID);
                    response->addToUpsertDetails(upsertDetail.release());
                }
                upsertedID = BSONObj();
            }
        }
        else {
            // The applyWriteItem call did not go through.

            // If the error is sharding related, we'll have to investigate whether we
            // have a stale view of sharding state.
            if ( error->getErrCode() == ErrorCodes::StaleShardVersion )
                staleBatch = true;

            // Don't bother recording if the user doesn't want a verbose answer. We want to
            // keep the error if this is a one-item batch, since we already compact the
            // response for those.
            if (verbose || numBatchOps == 1) {
                error->setIndex( static_cast<int>( i ) );
                response->addToErrDetails( error.release() );
            }

            batchSuccess = false;

            if ( request.getOrdered() )
                break;

            error.reset( new BatchedErrorDetail );
        }
    }

    // So far, we may have failed some of the batch's items, so we record that here.
    // Regardless, we still need to apply the write concern. If that generates a more
    // specific error, we'd replace the intermediate error here. Note that we "compact"
    // the error message if this is a one-item batch. (See rationale later in this file.)
    if ( !batchSuccess ) {

        if (numBatchOps > 1) {
            // TODO
            // Define the final error code here.
            // Might be used as a final error, depending on write concern success.
            response->setErrCode( 99999 );
            response->setErrMessage( "batch op errors occurred" );
        }
        else {
            // Promote the single error.
            const BatchedErrorDetail* error = response->getErrDetailsAt( 0 );
            response->setErrCode( error->getErrCode() );
            if ( error->isErrInfoSet() )
                response->setErrInfo( error->getErrInfo() );
            response->setErrMessage( error->getErrMessage() );
            response->unsetErrDetails();
            error = NULL;
        }
    }

    // Apply write concern. Note, again, that we're only assembling a full response if the
    // user is interested in it.
    BSONObj writeConcern;
    if ( request.isWriteConcernSet() ) {
        writeConcern = request.getWriteConcern();
    }
    else {
        writeConcern = _defaultWriteConcern;
    }

    string errMsg;
    BSONObjBuilder wcResultsB;
    if ( !waitForWriteConcern( writeConcern, !batchSuccess, &wcResultsB, &errMsg ) ) {

        // TODO Revisit when user visible family error codes are set
        response->setErrCode( ErrorCodes::WriteConcernFailed );
        response->setErrMessage( errMsg );
        if ( verbose ) {
            response->setErrInfo( wcResultsB.obj() );
        }
    }

    // TODO: Audit where we want to queue here
    if ( staleBatch ) {
        ChunkVersion latestShardVersion;
        shardingState.refreshMetadataIfNeeded( request.getNS(),
                                               request.getShardVersion(),
                                               &latestShardVersion );
    }

    // Set the main body of the response. We assume that, if there was an error, the error
    // code would already be set.
    response->setOk( !response->isErrCodeSet() );
    response->setN( stats.numUpdated + stats.numInserted + stats.numDeleted );
}
void BatchSafeWriter::safeWriteBatch( DBClientBase* conn,
                                      const BatchedCommandRequest& request,
                                      BatchedCommandResponse* response ) {

    const NamespaceString nss( request.getNS() );

    // N starts at zero, and we add to it for each item
    response->setN( 0 );

    for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) {

        // Break on first error if we're ordered
        if ( request.getOrdered() && response->isErrDetailsSet() )
            break;

        BatchItemRef itemRef( &request, static_cast<int>( i ) );
        bool isLastItem = ( i == request.sizeWriteOps() - 1 );

        BSONObj writeConcern;
        if ( isLastItem && request.isWriteConcernSet() ) {
            writeConcern = request.getWriteConcern();
            // Pre-2.4.2 mongods react badly to 'w' being set on config servers
            if ( nss.db() == "config" )
                writeConcern = fixWCForConfig( writeConcern );
        }

        BSONObj gleResult;
        GLEErrors errors;
        Status status = _safeWriter->safeWrite( conn, itemRef, writeConcern, &gleResult );
        if ( status.isOK() ) {
            status = extractGLEErrors( gleResult, &errors );
        }

        if ( !status.isOK() ) {
            response->clear();
            response->setOk( false );
            response->setErrCode( status.code() );
            response->setErrMessage( status.reason() );
            return;
        }

        //
        // STATS HANDLING
        //

        GLEStats stats;
        extractGLEStats( gleResult, &stats );

        // Special case for making legacy "n" field result for insert match the write
        // command result.
        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert
             && !errors.writeError.get() ) {
            // n is always 0 for legacy inserts.
            dassert( stats.n == 0 );
            stats.n = 1;
        }

        response->setN( response->getN() + stats.n );

        if ( !stats.upsertedId.isEmpty() ) {
            BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail;
            upsertedId->setIndex( i );
            upsertedId->setUpsertedID( stats.upsertedId );
            response->addToUpsertDetails( upsertedId );
        }

        response->setLastOp( stats.lastOp );

        //
        // WRITE ERROR HANDLING
        //

        // If any error occurs (except stale config) the previous GLE was not enforced
        bool enforcedWC = !errors.writeError.get()
                          || errors.writeError->getErrCode() == ErrorCodes::StaleShardVersion;

        // Save write error
        if ( errors.writeError.get() ) {
            errors.writeError->setIndex( i );
            response->addToErrDetails( errors.writeError.release() );
        }

        //
        // WRITE CONCERN ERROR HANDLING
        //

        // The last write is weird, since we enforce write concern and check the error through
        // the same GLE if possible.  If the last GLE was an error, the write concern may not
        // have been enforced in that same GLE, so we need to send another after resetting the
        // error.
        if ( isLastItem ) {

            // Try to enforce the write concern if everything succeeded (unordered or ordered)
            // OR if something succeeded and we're unordered.
            bool needToEnforceWC =
                !response->isErrDetailsSet()
                || ( !request.getOrdered()
                     && response->sizeErrDetails() < request.sizeWriteOps() );

            if ( !enforcedWC && needToEnforceWC ) {
                dassert( !errors.writeError.get() ); // emptied above

                // Might have gotten a write concern validity error earlier, these are
                // enforced even if the wc isn't applied, so we ignore.
                errors.wcError.reset();

                Status status = _safeWriter->enforceWriteConcern( conn,
                                                                  nss.db().toString(),
                                                                  writeConcern,
                                                                  &gleResult );
                if ( status.isOK() ) {
                    status = extractGLEErrors( gleResult, &errors );
                }

                if ( !status.isOK() ) {
                    response->clear();
                    response->setOk( false );
                    response->setErrCode( status.code() );
                    response->setErrMessage( status.reason() );
                    return;
                }
            } // END Write concern retry

            if ( errors.wcError.get() ) {
                response->setWriteConcernError( errors.wcError.release() );
            }
        }
    }

    response->setOk( true );
    dassert( response->isValid( NULL ) );
}
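// Illustrative sketch (not the original implementation): the "Pre-2.4.2 mongods react
// badly to 'w' being set on config servers" comments above suggest that a
// fixWCForConfig-style helper strips the 'w' field from the write concern before the GLE
// is sent to a config server. The body below is an assumption showing one way to do that.
static BSONObj fixWCForConfigSketch( const BSONObj& writeConcern ) {
    BSONObjBuilder fixedB;
    BSONObjIterator it( writeConcern );
    while ( it.more() ) {
        BSONElement el = it.next();
        // Copy every write concern field except 'w', which config servers should not see.
        if ( strcmp( el.fieldName(), "w" ) != 0 ) {
            fixedB.append( el );
        }
    }
    return fixedB.obj();
}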