void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                       BatchedCommandResponse* response ) {

    // Validate namespace
    const NamespaceString nss = NamespaceString( request.getNS() );
    if ( !nss.isValid() ) {
        toBatchError( Status( ErrorCodes::InvalidNamespace,
                              nss.ns() + " is not a valid namespace" ),
                      response );
        return;
    }

    // Make sure we can write to the namespace
    Status allowedStatus = userAllowedWriteNS( nss );
    if ( !allowedStatus.isOK() ) {
        toBatchError( allowedStatus, response );
        return;
    }

    // Validate insert index requests
    // TODO: Push insert index requests through createIndex once all upgrade paths support it
    string errMsg;
    if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
        toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
        return;
    }

    // Validate write concern
    // TODO: Lift write concern parsing out of this entirely
    WriteConcernOptions writeConcern;

    BSONObj wcDoc;
    if ( request.isWriteConcernSet() ) {
        wcDoc = request.getWriteConcern();
    }

    Status wcStatus = Status::OK();
    if ( wcDoc.isEmpty() ) {

        // The default write concern if empty is w : 1
        // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1

        wcStatus = writeConcern.parse(
            _defaultWriteConcern.isEmpty() ?
                WriteConcernOptions::Acknowledged : _defaultWriteConcern );

        if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
            writeConcern.wNumNodes = 1;
        }
    }
    else {
        wcStatus = writeConcern.parse( wcDoc );
    }

    if ( wcStatus.isOK() ) {
        wcStatus = validateWriteConcern( writeConcern );
    }

    if ( !wcStatus.isOK() ) {
        toBatchError( wcStatus, response );
        return;
    }

    if ( request.sizeWriteOps() == 0u ) {
        toBatchError( Status( ErrorCodes::InvalidLength,
                              "no write ops were included in the batch" ),
                      response );
        return;
    }

    // Validate batch size
    if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
        toBatchError( Status( ErrorCodes::InvalidLength,
                              stream() << "exceeded maximum write batch size of "
                                       << BatchedCommandRequest::kMaxWriteBatchSize ),
                      response );
        return;
    }

    //
    // End validation
    //

    bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                    && writeConcern.syncMode == WriteConcernOptions::NONE;

    Timer commandTimer;

    OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
    vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

    OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
    vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

    //
    // Apply each batch item, possibly bulking some items together in the write lock.
    // Stops on error if batch is ordered.
    //

    bulkExecute( request, &upserted, &writeErrors );

    //
    // Try to enforce the write concern if everything succeeded (unordered or ordered)
    // OR if something succeeded and we're unordered.
    //

    auto_ptr<WCErrorDetail> wcError;
    bool needToEnforceWC = writeErrors.empty()
                           || ( !request.getOrdered()
                                && writeErrors.size() < request.sizeWriteOps() );

    if ( needToEnforceWC ) {
        _client->curop()->setMessage( "waiting for write concern" );

        WriteConcernResult res;
        Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

        if ( !status.isOK() ) {
            wcError.reset( toWriteConcernError( status, res ) );
        }
    }

    //
    // Refresh metadata if needed
    //

    bool staleBatch = !writeErrors.empty()
                      && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

    if ( staleBatch ) {

        const BatchedRequestMetadata* requestMetadata = request.getMetadata();
        dassert( requestMetadata );

        // Make sure our shard name is set or is the same as what was set previously
        if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

            //
            // First, we refresh metadata if we need to based on the requested version.
            //

            ChunkVersion latestShardVersion;
            shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                   requestMetadata->getShardVersion(),
                                                   &latestShardVersion );

            // Report if we're still changing our metadata
            // TODO: Better reporting per-collection
            if ( shardingState.inCriticalMigrateSection() ) {
                noteInCriticalSection( writeErrors.back() );
            }

            if ( queueForMigrationCommit ) {

                //
                // Queue up for migration to end - this allows us to be sure that clients will
                // not repeatedly try to refresh metadata that is not yet written to the config
                // server.  Not necessary for correctness.
                // Exposed as optional parameter to allow testing of queuing behavior with
                // different network timings.
                //

                const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                //
                // Only wait if we're an older version (in the current collection epoch) and
                // we're not write compatible, implying that the current migration is affecting
                // writes.
                //

                if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                     !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                    while ( shardingState.inCriticalMigrateSection() ) {

                        log() << "write request to old shard version "
                              << requestMetadata->getShardVersion().toString()
                              << " waiting for migration commit" << endl;

                        shardingState.waitTillNotInCriticalSection( 10 /* secs */ );
                    }
                }
            }
        }
        else {
            // If our shard name is stale, our version must have been stale as well
            dassert( writeErrors.size() == request.sizeWriteOps() );
        }
    }

    //
    // Construct response
    //

    response->setOk( true );

    if ( !silentWC ) {

        if ( upserted.size() ) {
            response->setUpsertDetails( upserted );
        }

        if ( writeErrors.size() ) {
            response->setErrDetails( writeErrors );
        }

        if ( wcError.get() ) {
            response->setWriteConcernError( wcError.release() );
        }

        const repl::ReplicationCoordinator::Mode replMode =
            repl::getGlobalReplicationCoordinator()->getReplicationMode();
        if (replMode != repl::ReplicationCoordinator::modeNone) {
            response->setLastOp( _client->getLastOp() );
            if (replMode == repl::ReplicationCoordinator::modeReplSet) {
                response->setElectionId(repl::theReplSet->getElectionId());
            }
        }

        // Set the stats for the response
        response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                        + _stats->numDeleted );
        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
            response->setNModified( _stats->numModified );
    }

    dassert( response->isValid( NULL ) );
}
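// Illustrative sketch (not part of the original source): the write concern
// resolution logic from the validation section above, factored into a
// standalone helper for clarity. The helper name resolveBatchWriteConcern is
// hypothetical; WriteConcernOptions, validateWriteConcern, and
// WriteConcernOptions::Acknowledged are the same names used in executeBatch().
namespace {

StatusWith<WriteConcernOptions> resolveBatchWriteConcern( const BSONObj& wcDoc,
                                                          const BSONObj& defaultWC ) {
    WriteConcernOptions writeConcern;
    Status status = Status::OK();

    if ( wcDoc.isEmpty() ) {
        // No write concern in the request: fall back to the configured default,
        // or to acknowledged { w : 1 } when no default is set.
        status = writeConcern.parse(
            defaultWC.isEmpty() ? WriteConcernOptions::Acknowledged : defaultWC );
        if ( !status.isOK() )
            return status;

        // A parsed w : 0 is promoted to w : 1, matching the comment in
        // executeBatch(): "Specifying w : 0 is/was allowed, but is interpreted
        // identically to w : 1".
        if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
            writeConcern.wNumNodes = 1;
        }
    }
    else {
        status = writeConcern.parse( wcDoc );
        if ( !status.isOK() )
            return status;
    }

    // Reject options this host cannot honor, as executeBatch() does.
    status = validateWriteConcern( writeConcern );
    if ( !status.isOK() )
        return status;

    return writeConcern;
}

} // namespace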
bool run(OperationContext* txn, const string& dbname,
         BSONObj& cmdObj,
         int,
         string& errmsg,
         BSONObjBuilder& result,
         bool fromRepl ) {

    //
    // Correct behavior here is very finicky.
    //
    // 1. The first step is to append the error that occurred on the previous operation.
    //    This adds an "err" field to the command, which is *not* the command failing.
    //
    // 2. Next we parse and validate write concern options. If these options are invalid
    //    the command fails no matter what, even if we actually had an error earlier. The
    //    reason for checking here is to match legacy behavior on these kinds of failures -
    //    we'll still get an "err" field for the write error.
    //
    // 3. If we had an error on the previous operation, we then return immediately.
    //
    // 4. Finally, we actually enforce the write concern. All errors *except* timeout are
    //    reported with ok : 0.0, to match legacy behavior.
    //
    // There is a special case when "wOpTime" and "wElectionId" are explicitly provided by
    // the client (mongos) - in this case we *only* enforce the write concern if it is
    // valid.
    //
    // We always need to either report "err" (if ok : 1) or "errmsg" (if ok : 0), even if
    // err is null.
    //

    LastError *le = lastError.disableForCommand();

    // Always append lastOp and connectionId
    Client& c = cc();
    c.appendLastOp( result );

    // for sharding; also useful in general for debugging
    result.appendNumber( "connectionId" , c.getConnectionId() );

    OpTime lastOpTime;
    BSONField<OpTime> wOpTimeField("wOpTime");
    FieldParser::FieldState extracted = FieldParser::extract(cmdObj, wOpTimeField,
                                                             &lastOpTime, &errmsg);
    if (!extracted) {
        result.append("badGLE", cmdObj);
        appendCommandStatus(result, false, errmsg);
        return false;
    }

    bool lastOpTimePresent = extracted != FieldParser::FIELD_NONE;
    if (!lastOpTimePresent) {
        // Use the client opTime if no wOpTime is specified
        lastOpTime = cc().getLastOp();
    }

    OID electionId;
    BSONField<OID> wElectionIdField("wElectionId");
    extracted = FieldParser::extract(cmdObj, wElectionIdField,
                                     &electionId, &errmsg);
    if (!extracted) {
        result.append("badGLE", cmdObj);
        appendCommandStatus(result, false, errmsg);
        return false;
    }

    bool electionIdPresent = extracted != FieldParser::FIELD_NONE;

    bool errorOccurred = false;

    // Errors aren't reported when wOpTime is used
    if ( !lastOpTimePresent ) {
        if ( le->nPrev != 1 ) {
            errorOccurred = LastError::noError.appendSelf( result, false );
            le->appendSelfStatus( result );
        }
        else {
            errorOccurred = le->appendSelf( result, false );
        }
    }

    BSONObj writeConcernDoc = cmdObj;
    // Use the default options if we have no gle options aside from wOpTime/wElectionId
    const int nFields = cmdObj.nFields();
    bool useDefaultGLEOptions = (nFields == 1) ||
                                (nFields == 2 && lastOpTimePresent) ||
                                (nFields == 3 && lastOpTimePresent && electionIdPresent);

    if (useDefaultGLEOptions) {
        BSONObj getLastErrorDefault =
            repl::getGlobalReplicationCoordinator()->getGetLastErrorDefault();
        if (!getLastErrorDefault.isEmpty()) {
            writeConcernDoc = getLastErrorDefault;
        }
    }

    //
    // Validate write concern no matter what, this matches 2.4 behavior
    //

    WriteConcernOptions writeConcern;
    Status status = writeConcern.parse( writeConcernDoc );

    if ( status.isOK() ) {
        // Ensure options are valid for this host
        status = validateWriteConcern( writeConcern );
    }

    if ( !status.isOK() ) {
        result.append( "badGLE", writeConcernDoc );
        return appendCommandStatus( result, status );
    }

    // Don't wait for replication if there was an error reported - this matches 2.4 behavior
    if ( errorOccurred ) {
        dassert( !lastOpTimePresent );
        return true;
    }

    // No error occurred, so we won't duplicate these fields with write concern errors
    dassert( result.asTempObj()["err"].eoo() );
    dassert( result.asTempObj()["code"].eoo() );

    // If we got an electionId, make sure it matches
    if (electionIdPresent) {
        if (repl::getGlobalReplicationCoordinator()->getReplicationMode() !=
                repl::ReplicationCoordinator::modeReplSet) {
            // Ignore electionIds of 0 from mongos.
            if (electionId != OID()) {
                errmsg = "wElectionId passed but no replication active";
                result.append("code", ErrorCodes::BadValue);
                return false;
            }
        }
        else {
            if (electionId != repl::getGlobalReplicationCoordinator()->getElectionId()) {
                LOG(3) << "oid passed in is " << electionId
                       << ", but our id is "
                       << repl::getGlobalReplicationCoordinator()->getElectionId();
                errmsg = "election occurred after write";
                result.append("code", ErrorCodes::WriteConcernFailed);
                return false;
            }
        }
    }

    txn->setMessage( "waiting for write concern" );

    WriteConcernResult wcResult;
    status = waitForWriteConcern( txn, writeConcern, lastOpTime, &wcResult );
    wcResult.appendTo( writeConcern, &result );

    // For backward compatibility with 2.4, wtimeout returns ok : 1.0
    if ( wcResult.wTimedOut ) {
        dassert( !wcResult.err.empty() ); // so we always report err
        dassert( !status.isOK() );
        result.append( "errmsg", "timed out waiting for slaves" );
        result.append( "code", status.code() );
        return true;
    }

    return appendCommandStatus( result, status );
}
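// Illustrative examples (not from the original source): command documents as a
// client might send them, showing how the nFields check above decides whether
// the replica set's getLastErrorDefaults apply. Field values are hypothetical.
//
//   { getLastError: 1 }                                 // 1 field  -> defaults apply
//   { getLastError: 1, wOpTime: <opTime> }              // 2 fields, wOpTime present
//                                                       //          -> defaults apply
//   { getLastError: 1, wOpTime: <opTime>,
//     wElectionId: <oid> }                              // 3 fields, both present
//                                                       //          -> defaults apply
//   { getLastError: 1, w: "majority", wtimeout: 5000 }  // explicit gle options
//                                                       //          -> no defaults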
boost::optional<Date_t> CollectionRangeDeleter::cleanUpNextRange(
    OperationContext* opCtx,
    NamespaceString const& nss,
    OID const& epoch,
    int maxToDelete,
    CollectionRangeDeleter* forTestOnly) {

    if (maxToDelete <= 0) {
        maxToDelete = rangeDeleterBatchSize.load();
        if (maxToDelete <= 0) {
            maxToDelete = std::max(int(internalQueryExecYieldIterations.load()), 1);
        }
    }

    StatusWith<int> wrote = 0;

    auto range = boost::optional<ChunkRange>(boost::none);
    auto notification = DeleteNotification();

    {
        UninterruptibleLockGuard noInterrupt(opCtx->lockState());
        AutoGetCollection autoColl(opCtx, nss, MODE_IX);

        auto* const collection = autoColl.getCollection();
        auto* const csr = CollectionShardingRuntime::get(opCtx, nss);
        auto& metadataManager = csr->_metadataManager;

        if (!_checkCollectionMetadataStillValid(
                opCtx, nss, epoch, forTestOnly, collection, metadataManager)) {
            return boost::none;
        }

        auto* const self = forTestOnly ? forTestOnly : &metadataManager->_rangesToClean;

        bool writeOpLog = false;

        {
            stdx::lock_guard<stdx::mutex> scopedLock(csr->_metadataManager->_managerLock);

            if (self->isEmpty()) {
                LOG(1) << "No further range deletions scheduled on " << nss.ns();
                return boost::none;
            }

            auto& orphans = self->_orphans;
            if (orphans.empty()) {
                // We have delayed deletions; see if any are ready.
                auto& df = self->_delayedOrphans.front();
                if (df.whenToDelete > Date_t::now()) {
                    LOG(0) << "Deferring deletion of " << nss.ns() << " range "
                           << redact(df.range.toString()) << " until " << df.whenToDelete;
                    return df.whenToDelete;
                }

                // Move a single range from _delayedOrphans to _orphans
                orphans.splice(orphans.end(),
                               self->_delayedOrphans,
                               self->_delayedOrphans.begin());
                LOG(1) << "Proceeding with deferred deletion of " << nss.ns() << " range "
                       << redact(orphans.front().range.toString());

                writeOpLog = true;
            }

            invariant(!orphans.empty());
            const auto& frontRange = orphans.front().range;
            range.emplace(frontRange.getMin().getOwned(), frontRange.getMax().getOwned());
            notification = orphans.front().notification;
        }

        invariant(range);

        if (writeOpLog) {
            // Secondaries will watch for this update, and kill any queries that may depend on
            // documents in the range -- excepting any queries with a read-concern option
            // 'ignoreChunkMigration'
            try {
                AutoGetCollection autoAdmin(
                    opCtx, NamespaceString::kServerConfigurationNamespace, MODE_IX);

                Helpers::upsert(opCtx,
                                NamespaceString::kServerConfigurationNamespace.ns(),
                                BSON("_id"
                                     << "startRangeDeletion"
                                     << "ns"
                                     << nss.ns()
                                     << "epoch"
                                     << epoch
                                     << "min"
                                     << range->getMin()
                                     << "max"
                                     << range->getMax()));
            } catch (const DBException& e) {
                stdx::lock_guard<stdx::mutex> scopedLock(csr->_metadataManager->_managerLock);
                csr->_metadataManager->_clearAllCleanups(
                    scopedLock,
                    e.toStatus("cannot push startRangeDeletion record to Op Log,"
                               " abandoning scheduled range deletions"));
                return boost::none;
            }
        }

        const auto scopedCollectionMetadata =
            metadataManager->getActiveMetadata(metadataManager, boost::none);
        const auto& metadata = *scopedCollectionMetadata;

        try {
            wrote = self->_doDeletion(
                opCtx, collection, metadata->getKeyPattern(), *range, maxToDelete);
        } catch (const DBException& e) {
            wrote = e.toStatus();
            warning() << e.what();
        }
    }  // drop autoColl

    if (!wrote.isOK() || wrote.getValue() == 0) {
        if (wrote.isOK()) {
            LOG(0) << "No documents remain to delete in " << nss << " range "
                   << redact(range->toString());
        }

        // Wait for majority replication even when wrote isn't OK or == 0, because it might have
        // been OK and/or > 0 previously, and the deletions must be persistent before notifying
        // clients in _pop().

        LOG(0) << "Waiting for majority replication of local deletions in " << nss.ns()
               << " range " << redact(range->toString());

        repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx);
        const auto clientOpTime = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();

        // Wait for replication outside the lock
        const auto replicationStatus = [&] {
            try {
                WriteConcernResult unusedWCResult;
                return waitForWriteConcern(
                    opCtx, clientOpTime, kMajorityWriteConcern, &unusedWCResult);
            } catch (const DBException& e) {
                return e.toStatus();
            }
        }();

        // Get the lock again to finish off this range (including notifying, if necessary).
        // Don't allow lock interrupts while cleaning up.
        UninterruptibleLockGuard noInterrupt(opCtx->lockState());
        AutoGetCollection autoColl(opCtx, nss, MODE_IX);

        auto* const collection = autoColl.getCollection();
        auto* const csr = CollectionShardingRuntime::get(opCtx, nss);
        auto& metadataManager = csr->_metadataManager;

        if (!_checkCollectionMetadataStillValid(
                opCtx, nss, epoch, forTestOnly, collection, metadataManager)) {
            return boost::none;
        }

        auto* const self = forTestOnly ? forTestOnly : &metadataManager->_rangesToClean;

        stdx::lock_guard<stdx::mutex> scopedLock(csr->_metadataManager->_managerLock);

        if (!replicationStatus.isOK()) {
            LOG(0) << "Error when waiting for write concern after removing " << nss << " range "
                   << redact(range->toString()) << " : " << redact(replicationStatus.reason());

            // If the range was already popped (e.g. by dropping nss during the
            // waitForWriteConcern above) its notification would have been triggered, so this
            // check suffices to ensure that it is safe to pop the range here
            if (!notification.ready()) {
                invariant(!self->isEmpty() &&
                          self->_orphans.front().notification == notification);
                LOG(0) << "Abandoning deletion of latest range in " << nss.ns() << " after local "
                       << "deletions because of replication failure";
                self->_pop(replicationStatus);
            }
        } else {
            LOG(0) << "Finished deleting documents in " << nss.ns() << " range "
                   << redact(range->toString());

            self->_pop(wrote.getStatus());
        }

        if (!self->_orphans.empty()) {
            LOG(1) << "Deleting " << nss.ns() << " range "
                   << redact(self->_orphans.front().range.toString()) << " next.";
        }

        return Date_t::now() + Milliseconds(rangeDeleterBatchDelayMS.load());
    }

    invariant(range);
    invariant(wrote.getStatus());
    invariant(wrote.getValue() > 0);

    notification.abandon();

    return Date_t::now() + Milliseconds(rangeDeleterBatchDelayMS.load());
}
/**
 * Outline:
 *
 * 1. Get oplog with session info from the source shard.
 * 2. For each oplog entry, convert to type 'n' if not yet type 'n' while preserving all info
 *    needed for retryable writes.
 * 3. Also update the sessionCatalog for every oplog entry.
 * 4. Once the source shard returned an empty oplog buffer, it means that this should enter
 *    ReadyToCommit state and wait for the commit signal (by calling finish()).
 * 5. Once finish() is called, keep on trying to get more oplog from the source shard until it
 *    returns an empty result again.
 * 6. Wait for writes to be committed to majority of the replica set.
 */
void SessionCatalogMigrationDestination::_retrieveSessionStateFromSource(ServiceContext* service) {
    Client::initThread(
        "sessionCatalogMigration-" + _migrationSessionId.toString(), service, nullptr);

    auto uniqueCtx = cc().makeOperationContext();
    auto opCtx = uniqueCtx.get();

    bool oplogDrainedAfterCommiting = false;
    ProcessOplogResult lastResult;
    repl::OpTime lastOpTimeWaited;

    while (true) {
        {
            stdx::lock_guard<stdx::mutex> lk(_mutex);
            if (_state == State::ErrorOccurred) {
                return;
            }
        }

        try {
            auto nextBatch = getNextSessionOplogBatch(opCtx, _fromShard, _migrationSessionId);
            BSONArray oplogArray(nextBatch[kOplogField].Obj());
            BSONArrayIteratorSorted oplogIter(oplogArray);

            if (!oplogIter.more()) {
                {
                    stdx::lock_guard<stdx::mutex> lk(_mutex);
                    if (_state == State::Committing) {
                        // The migration is considered done only when it gets an empty result
                        // from the source shard while this is in state committing. This is to
                        // make sure that it doesn't miss any new oplog created between the
                        // time window where this depleted the buffer from the source shard
                        // and receiving the commit command.
                        if (oplogDrainedAfterCommiting) {
                            break;
                        }
                        oplogDrainedAfterCommiting = true;
                    }
                }

                WriteConcernResult wcResult;
                auto wcStatus =
                    waitForWriteConcern(opCtx, lastResult.oplogTime, kMajorityWC, &wcResult);
                if (!wcStatus.isOK()) {
                    _errorOccurred(wcStatus.toString());
                    return;
                }

                // We depleted the buffer at least once, transition to ready for commit.
                {
                    stdx::lock_guard<stdx::mutex> lk(_mutex);
                    // Note: only transition to "ready to commit" if state is not
                    // error/force stop.
                    if (_state == State::Migrating) {
                        _state = State::ReadyToCommit;
                        _isStateChanged.notify_all();
                    }
                }

                if (lastOpTimeWaited == lastResult.oplogTime) {
                    // We got an empty result at least twice in a row from the source shard so
                    // space it out a little bit so we don't hammer the shard.
                    opCtx->sleepFor(Milliseconds(200));
                }

                lastOpTimeWaited = lastResult.oplogTime;
            }

            while (oplogIter.more()) {
                lastResult = processSessionOplog(opCtx, oplogIter.next().Obj(), lastResult);
            }
        } catch (const DBException& excep) {
            if (excep.code() == ErrorCodes::ConflictingOperationInProgress ||
                excep.code() == ErrorCodes::TransactionTooOld) {
                // This means that the server has a newer txnNumber than the oplog being
                // migrated, so just skip it.
                continue;
            }

            if (excep.code() == ErrorCodes::CommandNotFound) {
                // TODO: remove this after v3.7
                //
                // This means that the donor shard is running at an older version so it is
                // safe to just end this because there is no session information to transfer.
                break;
            }

            _errorOccurred(excep.toString());
            return;
        }
    }

    WriteConcernResult wcResult;
    auto wcStatus = waitForWriteConcern(opCtx, lastResult.oplogTime, kMajorityWC, &wcResult);
    if (!wcStatus.isOK()) {
        _errorOccurred(wcStatus.toString());
        return;
    }

    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);
        _state = State::Done;
        _isStateChanged.notify_all();
    }
}
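// Illustrative sketch (not part of the original source): a plausible definition
// for the kMajorityWC constant used above, assuming the WriteConcernOptions
// constructor taking (wMode, syncMode, timeout). The UNSET sync mode and the
// zero timeout are assumptions; the real constant may differ.
const WriteConcernOptions kMajorityWC(WriteConcernOptions::kMajority,
                                      WriteConcernOptions::SyncMode::UNSET,
                                      Milliseconds(0));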