void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                       BatchedCommandResponse* response ) {

    // Validate namespace
    const NamespaceString nss = NamespaceString( request.getNS() );
    if ( !nss.isValid() ) {
        toBatchError( Status( ErrorCodes::InvalidNamespace,
                              nss.ns() + " is not a valid namespace" ),
                      response );
        return;
    }

    // Make sure we can write to the namespace
    Status allowedStatus = userAllowedWriteNS( nss );
    if ( !allowedStatus.isOK() ) {
        toBatchError( allowedStatus, response );
        return;
    }

    // Validate insert index requests
    // TODO: Push insert index requests through createIndex once all upgrade paths support it
    string errMsg;
    if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
        toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
        return;
    }

    // Validate write concern
    // TODO: Lift write concern parsing out of this entirely
    WriteConcernOptions writeConcern;

    BSONObj wcDoc;
    if ( request.isWriteConcernSet() ) {
        wcDoc = request.getWriteConcern();
    }

    Status wcStatus = Status::OK();
    if ( wcDoc.isEmpty() ) {

        // The default write concern if empty is w : 1
        // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1
        wcStatus = writeConcern.parse(
            _defaultWriteConcern.isEmpty() ?
                WriteConcernOptions::Acknowledged : _defaultWriteConcern );

        if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
            writeConcern.wNumNodes = 1;
        }
    }
    else {
        wcStatus = writeConcern.parse( wcDoc );
    }

    if ( wcStatus.isOK() ) {
        wcStatus = validateWriteConcern( writeConcern );
    }

    if ( !wcStatus.isOK() ) {
        toBatchError( wcStatus, response );
        return;
    }

    if ( request.sizeWriteOps() == 0u ) {
        toBatchError( Status( ErrorCodes::InvalidLength,
                              "no write ops were included in the batch" ),
                      response );
        return;
    }

    // Validate batch size
    if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
        toBatchError( Status( ErrorCodes::InvalidLength,
                              stream() << "exceeded maximum write batch size of "
                                       << BatchedCommandRequest::kMaxWriteBatchSize ),
                      response );
        return;
    }

    //
    // End validation
    //

    bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                    && writeConcern.syncMode == WriteConcernOptions::NONE;

    Timer commandTimer;

    OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
    vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

    OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
    vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

    //
    // Apply each batch item, possibly bulking some items together in the write lock.
    // Stops on error if batch is ordered.
    //

    bulkExecute( request, &upserted, &writeErrors );

    //
    // Try to enforce the write concern if everything succeeded (unordered or ordered)
    // OR if something succeeded and we're unordered.
    //

    auto_ptr<WCErrorDetail> wcError;
    bool needToEnforceWC = writeErrors.empty()
                           || ( !request.getOrdered()
                                && writeErrors.size() < request.sizeWriteOps() );

    if ( needToEnforceWC ) {

        _client->curop()->setMessage( "waiting for write concern" );

        WriteConcernResult res;
        Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

        if ( !status.isOK() ) {
            wcError.reset( toWriteConcernError( status, res ) );
        }
    }

    //
    // Refresh metadata if needed
    //

    bool staleBatch = !writeErrors.empty()
                      && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

    if ( staleBatch ) {

        const BatchedRequestMetadata* requestMetadata = request.getMetadata();
        dassert( requestMetadata );

        // Make sure our shard name is set or is the same as what was set previously
        if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

            //
            // First, we refresh metadata if we need to based on the requested version.
            //

            ChunkVersion latestShardVersion;
            shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                   requestMetadata->getShardVersion(),
                                                   &latestShardVersion );

            // Report if we're still changing our metadata
            // TODO: Better reporting per-collection
            if ( shardingState.inCriticalMigrateSection() ) {
                noteInCriticalSection( writeErrors.back() );
            }

            if ( queueForMigrationCommit ) {

                //
                // Queue up for migration to end - this allows us to be sure that clients will
                // not repeatedly try to refresh metadata that is not yet written to the config
                // server.  Not necessary for correctness.
                // Exposed as optional parameter to allow testing of queuing behavior with
                // different network timings.
                //

                const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                //
                // Only wait if we're an older version (in the current collection epoch) and
                // we're not write compatible, implying that the current migration is affecting
                // writes.
                //

                if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                     !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                    while ( shardingState.inCriticalMigrateSection() ) {

                        log() << "write request to old shard version "
                              << requestMetadata->getShardVersion().toString()
                              << " waiting for migration commit" << endl;

                        shardingState.waitTillNotInCriticalSection( 10 /* secs */ );
                    }
                }
            }
        }
        else {
            // If our shard name is stale, our version must have been stale as well
            dassert( writeErrors.size() == request.sizeWriteOps() );
        }
    }

    //
    // Construct response
    //

    response->setOk( true );

    if ( !silentWC ) {

        if ( upserted.size() ) {
            response->setUpsertDetails( upserted );
        }

        if ( writeErrors.size() ) {
            response->setErrDetails( writeErrors );
        }

        if ( wcError.get() ) {
            response->setWriteConcernError( wcError.release() );
        }

        const repl::ReplicationCoordinator::Mode replMode =
                repl::getGlobalReplicationCoordinator()->getReplicationMode();
        if ( replMode != repl::ReplicationCoordinator::modeNone ) {
            response->setLastOp( _client->getLastOp() );
            if ( replMode == repl::ReplicationCoordinator::modeReplSet ) {
                response->setElectionId( repl::theReplSet->getElectionId() );
            }
        }

        // Set the stats for the response
        response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                        + _stats->numDeleted );
        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
            response->setNModified( _stats->numModified );
    }

    dassert( response->isValid( NULL ) );
}
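// A minimal standalone sketch of the write concern defaulting rule above, under the
// assumption that an empty client document falls back to the server default (or to
// Acknowledged, i.e. w:1), and that the w:0 promotion applies only on that defaulted
// path. WcOptions and resolveWriteConcern are hypothetical stand-ins, not the server's
// types.

#include <iostream>
#include <string>

struct WcOptions {
    int wNumNodes;           // numeric w value
    std::string wMode;       // named w mode, e.g. "majority"
    WcOptions() : wNumNodes( 0 ) {}
    WcOptions( int w, const std::string& mode ) : wNumNodes( w ), wMode( mode ) {}
};

WcOptions resolveWriteConcern( const WcOptions* clientWc, const WcOptions* serverDefault ) {
    if ( clientWc ) {
        return *clientWc;        // a client-specified concern is used as-is
    }
    WcOptions wc;
    if ( serverDefault ) {
        wc = *serverDefault;     // otherwise fall back to the server default
    }
    // A defaulted concern of w:0 with no named mode is promoted to w:1 (Acknowledged),
    // mirroring the empty-document branch above; an explicit client w:0 is not.
    if ( wc.wNumNodes == 0 && wc.wMode.empty() ) {
        wc.wNumNodes = 1;
    }
    return wc;
}

int main() {
    WcOptions majority( 0, "majority" );
    std::cout << resolveWriteConcern( nullptr, nullptr ).wNumNodes << "\n";   // 1
    std::cout << resolveWriteConcern( nullptr, &majority ).wMode << "\n";     // majority
}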
bool run(OperationContext* txn, const string& dbname,
         BSONObj& cmdObj,
         int, string& errmsg,
         BSONObjBuilder& result,
         bool fromRepl ) {

    //
    // Correct behavior here is very finicky.
    //
    // 1.  The first step is to append the error that occurred on the previous operation.
    // This adds an "err" field to the command, which is *not* the command failing.
    //
    // 2.  Next we parse and validate write concern options.  If these options are invalid
    // the command fails no matter what, even if we actually had an error earlier.  The
    // reason for checking here is to match legacy behavior on these kinds of failures -
    // we'll still get an "err" field for the write error.
    //
    // 3.  If we had an error on the previous operation, we then return immediately.
    //
    // 4.  Finally, we actually enforce the write concern.  All errors *except* timeout are
    // reported with ok : 0.0, to match legacy behavior.
    //
    // There is a special case when "wOpTime" and "wElectionId" are explicitly provided by
    // the client (mongos) - in this case we *only* enforce the write concern if it is
    // valid.
    //
    // We always need to either report "err" (if ok : 1) or "errmsg" (if ok : 0), even if
    // err is null.
    //

    LastError *le = lastError.disableForCommand();

    // Always append lastOp and connectionId
    Client& c = cc();
    c.appendLastOp( result );

    // for sharding; also useful in general for debugging
    result.appendNumber( "connectionId" , c.getConnectionId() );

    OpTime lastOpTime;
    BSONField<OpTime> wOpTimeField("wOpTime");
    FieldParser::FieldState extracted = FieldParser::extract(cmdObj, wOpTimeField,
                                                             &lastOpTime, &errmsg);
    if (!extracted) {
        result.append("badGLE", cmdObj);
        appendCommandStatus(result, false, errmsg);
        return false;
    }
    bool lastOpTimePresent = extracted != FieldParser::FIELD_NONE;
    if (!lastOpTimePresent) {
        // Use the client opTime if no wOpTime is specified
        lastOpTime = cc().getLastOp();
    }

    OID electionId;
    BSONField<OID> wElectionIdField("wElectionId");
    extracted = FieldParser::extract(cmdObj, wElectionIdField,
                                     &electionId, &errmsg);
    if (!extracted) {
        result.append("badGLE", cmdObj);
        appendCommandStatus(result, false, errmsg);
        return false;
    }
    bool electionIdPresent = extracted != FieldParser::FIELD_NONE;

    bool errorOccurred = false;

    // Errors aren't reported when wOpTime is used
    if ( !lastOpTimePresent ) {
        if ( le->nPrev != 1 ) {
            errorOccurred = LastError::noError.appendSelf( result, false );
            le->appendSelfStatus( result );
        }
        else {
            errorOccurred = le->appendSelf( result, false );
        }
    }

    BSONObj writeConcernDoc = cmdObj;
    // Use the default options if we have no gle options aside from wOpTime/wElectionId
    const int nFields = cmdObj.nFields();
    bool useDefaultGLEOptions = (nFields == 1) ||
                                (nFields == 2 && lastOpTimePresent) ||
                                (nFields == 3 && lastOpTimePresent && electionIdPresent);

    if ( useDefaultGLEOptions && getLastErrorDefault ) {
        writeConcernDoc = *getLastErrorDefault;
    }

    //
    // Validate write concern no matter what, this matches 2.4 behavior
    //

    WriteConcernOptions writeConcern;
    Status status = writeConcern.parse( writeConcernDoc );

    if ( status.isOK() ) {
        // Ensure options are valid for this host
        status = validateWriteConcern( writeConcern );
    }

    if ( !status.isOK() ) {
        result.append( "badGLE", writeConcernDoc );
        return appendCommandStatus( result, status );
    }

    // Don't wait for replication if there was an error reported - this matches 2.4 behavior
    if ( errorOccurred ) {
        dassert( !lastOpTimePresent );
        return true;
    }

    // No error occurred, so we won't duplicate these fields with write concern errors
    dassert( result.asTempObj()["err"].eoo() );
    dassert( result.asTempObj()["code"].eoo() );

    // If we got an electionId, make sure it matches
    if (electionIdPresent) {
        if (!theReplSet) {
            // Ignore electionIds of 0 from mongos.
            if (electionId != OID()) {
                errmsg = "wElectionId passed but no replication active";
                result.append("code", ErrorCodes::BadValue);
                return false;
            }
        }
        else {
            if (electionId != theReplSet->getElectionId()) {
                LOG(3) << "oid passed in is " << electionId
                       << ", but our id is " << theReplSet->getElectionId();
                errmsg = "election occurred after write";
                result.append("code", ErrorCodes::WriteConcernFailed);
                return false;
            }
        }
    }

    cc().curop()->setMessage( "waiting for write concern" );

    WriteConcernResult wcResult;
    status = waitForWriteConcern( txn, writeConcern, lastOpTime, &wcResult );
    wcResult.appendTo( writeConcern, &result );

    // For backward compatibility with 2.4, wtimeout returns ok : 1.0
    if ( wcResult.wTimedOut ) {
        dassert( !wcResult.err.empty() ); // so we always report err
        dassert( !status.isOK() );
        result.append( "errmsg", "timed out waiting for slaves" );
        result.append( "code", status.code() );
        return true;
    }

    return appendCommandStatus( result, status );
}
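// A small standalone sketch of the field-counting rule above that decides whether the
// getLastError defaults apply: the command document may carry only the command name
// itself, plus optionally wOpTime and wElectionId, and nothing else. The helper name is
// hypothetical; it just restates the boolean expression from run().

#include <cassert>

bool useDefaultGLEOptions( int nFields, bool lastOpTimePresent, bool electionIdPresent ) {
    return (nFields == 1) ||                                          // { getLastError: 1 }
           (nFields == 2 && lastOpTimePresent) ||                     // + wOpTime
           (nFields == 3 && lastOpTimePresent && electionIdPresent);  // + wOpTime + wElectionId
}

int main() {
    assert( useDefaultGLEOptions( 1, false, false ) );   // bare gle: defaults apply
    assert( useDefaultGLEOptions( 3, true,  true  ) );   // mongos-style gle: defaults apply
    assert( !useDefaultGLEOptions( 2, false, false ) );  // e.g. { getLastError: 1, w: 2 }
    return 0;
}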
Status waitForWriteConcern( OperationContext* txn,
                            const WriteConcernOptions& writeConcern,
                            const OpTime& replOpTime,
                            WriteConcernResult* result ) {

    // We assume all options have been validated earlier, if not, programming error
    dassert( validateWriteConcern( writeConcern ).isOK() );

    // Next handle blocking on disk

    Timer syncTimer;

    switch( writeConcern.syncMode ) {
    case WriteConcernOptions::NONE:
        break;
    case WriteConcernOptions::FSYNC:
        if ( !getDur().isDurable() ) {
            result->fsyncFiles = globalStorageEngine->flushAllFiles( true );
        }
        else {
            // We only need to commit the journal if we're durable
            txn->recoveryUnit()->awaitCommit();
        }
        break;
    case WriteConcernOptions::JOURNAL:
        txn->recoveryUnit()->awaitCommit();
        break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
            repl::getGlobalReplicationCoordinator()->awaitReplication(txn,
                                                                      replOpTime,
                                                                      writeConcern);
    if (replStatus.status == ErrorCodes::ExceededTimeLimit) {
        gleWtimeouts.increment();
        replStatus.status = Status(ErrorCodes::WriteConcernFailed,
                                   "waiting for replication timed out");
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(replOpTime);
    gleWtimeStats.recordMillis(replStatus.duration.total_milliseconds());
    result->wTime = replStatus.duration.total_milliseconds();

    return replStatus.status;
}
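// A hedged sketch of the error remapping above: a replication wait that exceeds wtimeout
// comes back as ExceededTimeLimit and is rewritten to WriteConcernFailed, with the legacy
// "timeout" marker set on the result so getLastError can report ok:1 with err:"timeout".
// The Code enum and WaitResult struct are stand-ins for the server's Status and
// WriteConcernResult types, not the real API.

#include <iostream>
#include <string>

enum class Code { OK, ExceededTimeLimit, WriteConcernFailed };

struct WaitResult {
    std::string err;
    bool wTimedOut = false;
};

Code remapTimeout( Code replicationStatus, WaitResult* result ) {
    if ( replicationStatus == Code::ExceededTimeLimit ) {
        result->err = "timeout";          // legacy GLE error marker
        result->wTimedOut = true;         // lets getLastError return ok:1 for wtimeout
        return Code::WriteConcernFailed;  // callers see a write concern failure
    }
    return replicationStatus;
}

int main() {
    WaitResult res;
    Code c = remapTimeout( Code::ExceededTimeLimit, &res );
    std::cout << ( c == Code::WriteConcernFailed ) << " " << res.err << "\n";  // 1 timeout
}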
void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                       BatchedCommandResponse* response ) {

    // TODO: Lift write concern parsing out of this entirely.
    WriteConcernOptions writeConcern;
    Status status = Status::OK();

    BSONObj wcDoc;
    if ( request.isWriteConcernSet() ) {
        wcDoc = request.getWriteConcern();
    }

    if ( wcDoc.isEmpty() ) {
        status = writeConcern.parse( _defaultWriteConcern );
    }
    else {
        status = writeConcern.parse( wcDoc );
    }

    if ( status.isOK() ) {
        status = validateWriteConcern( writeConcern );
    }

    if ( !status.isOK() ) {
        response->setErrCode( status.code() );
        response->setErrMessage( status.reason() );
        response->setOk( false );
        dassert( response->isValid(NULL) );
        return;
    }

    bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                    && writeConcern.syncMode == WriteConcernOptions::NONE;

    Timer commandTimer;

    OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
    vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

    OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
    vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

    //
    // Apply each batch item, possibly bulking some items together in the write lock.
    // Stops on error if batch is ordered.
    //

    bulkExecute( request, &upserted, &writeErrors );

    //
    // Try to enforce the write concern if everything succeeded (unordered or ordered)
    // OR if something succeeded and we're unordered.
    //

    auto_ptr<WCErrorDetail> wcError;
    bool needToEnforceWC = writeErrors.empty()
                           || ( !request.getOrdered()
                                && writeErrors.size() < request.sizeWriteOps() );

    if ( needToEnforceWC ) {

        _client->curop()->setMessage( "waiting for write concern" );

        WriteConcernResult res;
        status = waitForWriteConcern( writeConcern, _client->getLastOp(), &res );

        if ( !status.isOK() ) {
            wcError.reset( toWriteConcernError( status, res ) );
        }
    }

    //
    // Refresh metadata if needed
    //

    bool staleBatch = !writeErrors.empty()
                      && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

    if ( staleBatch ) {

        const BatchedRequestMetadata* requestMetadata = request.getMetadata();
        dassert( requestMetadata );

        // Make sure our shard name is set or is the same as what was set previously
        if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

            //
            // First, we refresh metadata if we need to based on the requested version.
            //

            ChunkVersion latestShardVersion;
            shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                   requestMetadata->getShardVersion(),
                                                   &latestShardVersion );

            // Report if we're still changing our metadata
            // TODO: Better reporting per-collection
            if ( shardingState.inCriticalMigrateSection() ) {
                noteInCriticalSection( writeErrors.back() );
            }

            if ( queueForMigrationCommit ) {

                //
                // Queue up for migration to end - this allows us to be sure that clients will
                // not repeatedly try to refresh metadata that is not yet written to the config
                // server.  Not necessary for correctness.
                // Exposed as optional parameter to allow testing of queuing behavior with
                // different network timings.
                //

                const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                //
                // Only wait if we're an older version (in the current collection epoch) and
                // we're not write compatible, implying that the current migration is affecting
                // writes.
                //

                if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                     !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                    while ( shardingState.inCriticalMigrateSection() ) {

                        log() << "write request to old shard version "
                              << requestMetadata->getShardVersion().toString()
                              << " waiting for migration commit" << endl;

                        shardingState.waitTillNotInCriticalSection( 10 /* secs */ );
                    }
                }
            }
        }
        else {
            // If our shard name is stale, our version must have been stale as well
            dassert( writeErrors.size() == request.sizeWriteOps() );
        }
    }

    //
    // Construct response
    //

    response->setOk( true );

    if ( !silentWC ) {

        if ( upserted.size() ) {
            response->setUpsertDetails( upserted );
            upserted.clear();
        }

        if ( writeErrors.size() ) {
            response->setErrDetails( writeErrors );
            writeErrors.clear();
        }

        if ( wcError.get() ) {
            response->setWriteConcernError( wcError.release() );
        }

        if ( anyReplEnabled() ) {
            response->setLastOp( _client->getLastOp() );
            if ( theReplSet ) {
                response->setElectionId( theReplSet->getElectionId() );
            }
        }

        // Set the stats for the response
        response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                        + _stats->numDeleted );
        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
            response->setNModified( _stats->numModified );
    }

    dassert( response->isValid( NULL ) );
}
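// A minimal sketch of the wait condition above: a stale writer only queues behind the
// migration commit when its shard version is older than the latest *and* no longer
// write-compatible with it, i.e. the in-flight migration actually affects its writes.
// Version is a hypothetical stand-in for ChunkVersion, assuming its usual shape of an
// epoch plus major/minor components, where migrations bump the major version and splits
// bump only the minor version.

#include <iostream>

struct Version {
    int epoch;     // collection epoch; versions from different epochs never wait here
    int major_;    // changes when chunk ownership moves between shards (migrations)
    int minor_;    // changes on splits within a shard

    bool isOlderThan( const Version& other ) const {
        return epoch == other.epoch &&
               ( major_ < other.major_ ||
                 ( major_ == other.major_ && minor_ < other.minor_ ) );
    }
    // Writes stay compatible while only minor versions (splits) differ.
    bool isWriteCompatibleWith( const Version& other ) const {
        return epoch == other.epoch && major_ == other.major_;
    }
};

bool shouldWaitForMigrationCommit( const Version& requested, const Version& latest ) {
    return requested.isOlderThan( latest ) && !requested.isWriteCompatibleWith( latest );
}

int main() {
    Version requested{ 1, 3, 2 };
    Version latest{ 1, 4, 0 };    // a migration bumped the major version
    std::cout << shouldWaitForMigrationCommit( requested, latest ) << "\n";  // 1

    Version splitOnly{ 1, 3, 5 }; // only a split happened: still write compatible
    std::cout << shouldWaitForMigrationCommit( requested, splitOnly ) << "\n";  // 0
}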
Status waitForWriteConcern( OperationContext* txn,
                            const WriteConcernOptions& writeConcern,
                            const OpTime& replOpTime,
                            WriteConcernResult* result ) {

    // We assume all options have been validated earlier, if not, programming error
    dassert( validateWriteConcern( writeConcern ).isOK() );

    // Next handle blocking on disk

    Timer syncTimer;

    switch( writeConcern.syncMode ) {
    case WriteConcernOptions::NONE:
        break;
    case WriteConcernOptions::FSYNC:
        if ( !getDur().isDurable() ) {
            result->fsyncFiles = MemoryMappedFile::flushAll( true );
        }
        else {
            // We only need to commit the journal if we're durable
            txn->recoveryUnit()->awaitCommit();
        }
        break;
    case WriteConcernOptions::JOURNAL:
        txn->recoveryUnit()->awaitCommit();
        break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if ( replOpTime.isNull() ) {
        // no write happened for this client yet
        return Status::OK();
    }

    if ( writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty() ) {
        // no desired replication check
        return Status::OK();
    }

    if ( !replset::anyReplEnabled() || serverGlobalParams.configsvr ) {
        // no replication check needed (validated above)
        return Status::OK();
    }

    const bool isMasterSlaveNode = replset::anyReplEnabled() && !replset::theReplSet;
    if ( writeConcern.wMode == "majority" && isMasterSlaveNode ) {
        // with master/slave, majority is equivalent to w=1
        return Status::OK();
    }

    // We're sure that replication is enabled and that we have more than one node or a wMode
    TimerHolder gleTimerHolder( &gleWtimeStats );

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    // TODO: Make this cleaner
    Status replStatus = Status::OK();
    try {
        while ( 1 ) {

            if ( writeConcern.wNumNodes > 0 ) {
                if ( replset::opReplicatedEnough( replOpTime, writeConcern.wNumNodes ) ) {
                    break;
                }
            }
            else if ( replset::opReplicatedEnough( replOpTime, writeConcern.wMode ) ) {
                break;
            }

            if ( writeConcern.wTimeout > 0 &&
                 gleTimerHolder.millis() >= writeConcern.wTimeout ) {
                gleWtimeouts.increment();
                result->err = "timeout";
                result->wTimedOut = true;
                replStatus = Status( ErrorCodes::WriteConcernFailed,
                                     "waiting for replication timed out" );
                break;
            }

            sleepmillis(1);
            txn->checkForInterrupt();
        }
    }
    catch( const AssertionException& ex ) {
        // Our replication state changed while enforcing write concern
        replStatus = ex.toStatus();
    }

    // Add stats
    result->writtenTo = replset::getHostsWrittenTo( replOpTime );
    result->wTime = gleTimerHolder.recordMillis();

    return replStatus;
}
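// A self-contained sketch of the poll-with-timeout pattern used above, with std::chrono
// in place of the server's Timer/TimerHolder and a caller-supplied predicate in place of
// opReplicatedEnough. Purely illustrative; the real loop also checks for interrupts and
// translates replication-state changes (thrown as assertions) into error statuses.

#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

// Returns true if the predicate became true before the timeout, false on timeout.
// A timeout of zero or less means wait forever, matching wTimeout semantics above.
bool pollUntil( const std::function<bool()>& replicatedEnough, int timeoutMillis ) {
    const auto start = std::chrono::steady_clock::now();
    while ( true ) {
        if ( replicatedEnough() ) {
            return true;
        }
        const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - start ).count();
        if ( timeoutMillis > 0 && elapsed >= timeoutMillis ) {
            return false;  // corresponds to err:"timeout" / WriteConcernFailed above
        }
        std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) );  // sleepmillis(1)
    }
}

int main() {
    int calls = 0;
    bool ok = pollUntil( [&] { return ++calls > 5; }, 1000 );
    std::cout << ( ok ? "replicated" : "timed out" ) << "\n";  // replicated
}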