/**
 * Installs a cursor's read concern on the given operation.
 *
 * When this node runs as a replica set member and 'rcArgs' asks for majority read concern, the
 * recovery unit's timestamp read source is chosen according to the majority read mechanism
 * carried by 'rcArgs'. In every case 'rcArgs' is then copied onto the OperationContext while
 * holding the Client lock.
 */
void applyCursorReadConcern(OperationContext* opCtx, repl::ReadConcernArgs rcArgs) {
    using MajorityReadMechanism = repl::ReadConcernArgs::MajorityReadMechanism;

    const auto replMode = repl::ReplicationCoordinator::get(opCtx)->getReplicationMode();
    const bool isReplSetMajorityRead = replMode == repl::ReplicationCoordinator::modeReplSet &&
        rcArgs.getLevel() == repl::ReadConcernLevel::kMajorityReadConcern;

    // Pick the read source that corresponds to the requested majority read mechanism.
    if (isReplSetMajorityRead) {
        const auto mechanism = rcArgs.getMajorityReadMechanism();

        if (mechanism == MajorityReadMechanism::kMajoritySnapshot) {
            // Read from the majority committed snapshot; fail the operation if it cannot be
            // obtained.
            opCtx->recoveryUnit()->setTimestampReadSource(
                RecoveryUnit::ReadSource::kMajorityCommitted);
            uassertStatusOK(opCtx->recoveryUnit()->obtainMajorityCommittedSnapshot());
        } else if (mechanism == MajorityReadMechanism::kSpeculative) {
            // Flag the operation as a speculative majority read, then switch the read source.
            repl::SpeculativeMajorityReadInfo::get(opCtx).setIsSpeculativeRead();
            opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
        }
    }

    // Cursor commands that acquire locks internally may have their timestamp read source chosen
    // by the storage engine based on the read concern found on the OperationContext. Publishing
    // the cursor's read concern there makes those lock acquisitions honor it.
    stdx::lock_guard<Client> clientLock(*opCtx->getClient());
    repl::ReadConcernArgs::get(opCtx) = rcArgs;
}
/**
 * Runs a find command against the config shard and accumulates the documents of every batch the
 * fetcher delivers.
 *
 * The command is sent with majority read concern at the grid's current config opTime (any other
 * 'readConcernLevel' trips an invariant), filtered by 'query', ordered by 'sort' and bounded by
 * 'limit'. On success the response carries owned copies of the documents and, when the reply
 * metadata includes replica set metadata, the last committed opTime it reported. On failure the
 * error status is returned and no documents are handed back.
 */
StatusWith<Shard::QueryResponse> ShardRemote::_exhaustiveFindOnConfig(
    OperationContext* txn,
    const ReadPreferenceSetting& readPref,
    const repl::ReadConcernLevel& readConcernLevel,
    const NamespaceString& nss,
    const BSONObj& query,
    const BSONObj& sort,
    boost::optional<long long> limit) {
    invariant(getId() == "config");

    // The read preference used for targeting and for the command metadata additionally carries
    // the grid's current config opTime as its minOpTime.
    ReadPreferenceSetting targetingReadPref(readPref);
    targetingReadPref.minOpTime = grid.configOpTime();

    const auto swHost = _targeter->findHost(txn, targetingReadPref);
    if (!swHost.isOK()) {
        return swHost.getStatus();
    }

    QueryResponse result;

    // This is what the caller sees if the fetcher never invokes the callback below.
    Status outcome(ErrorCodes::InternalError, "Internal error running find command");

    auto onBatch = [this, &outcome, &result](const Fetcher::QueryResponseStatus& batchStatus,
                                             Fetcher::NextAction* nextAction,
                                             BSONObjBuilder* getMoreBob) {
        if (!batchStatus.isOK()) {
            // An error discards everything accumulated so far.
            outcome = batchStatus.getStatus();
            result.docs.clear();
            return;
        }

        const auto& batch = batchStatus.getValue();
        const auto& metadata = batch.otherFields.metadata;

        if (metadata.hasField(rpc::kReplSetMetadataFieldName)) {
            auto swReplMetadata = rpc::ReplSetMetadata::readFromMetadata(metadata);
            if (!swReplMetadata.isOK()) {
                outcome = swReplMetadata.getStatus();
                result.docs.clear();
                return;
            }

            // Hand back the config opTime reported for this specific request. As a safeguard,
            // verify that the global configOpTime is not behind it.
            result.opTime = swReplMetadata.getValue().getLastOpCommitted();
            invariant(grid.configOpTime() >= result.opTime);
        }

        for (const BSONObj& fetched : batch.documents) {
            result.docs.push_back(fetched.getOwned());
        }
        outcome = Status::OK();

        // Fill in the follow-up getMore only when the fetcher supplied a builder for it.
        if (getMoreBob) {
            getMoreBob->append("getMore", batch.cursorId);
            getMoreBob->append("collection", batch.nss.coll());
        }
    };

    // Serialize the read concern and keep an owned copy of just the read concern sub-object.
    const BSONObj readConcernBson = [&] {
        invariant(readConcernLevel == repl::ReadConcernLevel::kMajorityReadConcern);
        const repl::ReadConcernArgs majorityAtConfigOpTime{grid.configOpTime(), readConcernLevel};

        BSONObjBuilder rcBuilder;
        majorityAtConfigOpTime.appendInfo(&rcBuilder);
        return rcBuilder.done()
            .getObjectField(repl::ReadConcernArgs::kReadConcernFieldName)
            .getOwned();
    }();

    // Never wait longer than the operation's own remaining time or the default config timeout.
    const Milliseconds timeLimit =
        std::min(txn->getRemainingMaxTimeMillis(), kDefaultConfigCommandTimeout);

    BSONObjBuilder cmdBuilder;
    [&] {
        QueryRequest findRequest(nss);
        findRequest.setFilter(query);
        findRequest.setSort(sort);
        findRequest.setReadConcern(readConcernBson);
        findRequest.setLimit(limit);

        // Milliseconds::max() is left off the command entirely.
        if (timeLimit < Milliseconds::max()) {
            findRequest.setMaxTimeMS(durationCount<Milliseconds>(timeLimit));
        }

        findRequest.asFindCommand(&cmdBuilder);
    }();

    Fetcher fetcher(Grid::get(txn)->getExecutorPool()->getFixedExecutor(),
                    swHost.getValue(),
                    nss.db().toString(),
                    cmdBuilder.done(),
                    onBatch,
                    _appendMetadataForCommand(txn, targetingReadPref),
                    timeLimit);

    const Status scheduled = fetcher.schedule();
    if (!scheduled.isOK()) {
        return scheduled;
    }

    fetcher.join();

    updateReplSetMonitor(swHost.getValue(), outcome);

    if (outcome.isOK()) {
        return result;
    }

    if (outcome.compareCode(ErrorCodes::ExceededTimeLimit)) {
        LOG(0) << "Operation timed out " << causedBy(outcome);
    }
    return outcome;
}
/**
 * Runs a find command against the config shard and accumulates the documents of every batch the
 * fetcher delivers.
 *
 * The command is sent with majority read concern at the grid's current config opTime (any other
 * 'readConcernLevel' trips an invariant), filtered by 'query', ordered by 'sort' and bounded by
 * 'limit'. Its maxTimeMS is the smaller of kConfigCommandTimeout and the operation's remaining
 * time, but never below one millisecond. On success the response carries owned copies of the
 * documents and, when the reply metadata includes replica set metadata, the last committed
 * opTime it reported. On failure the error status is returned and no documents are handed back.
 */
StatusWith<Shard::QueryResponse> ShardRemote::_exhaustiveFindOnConfig(
    OperationContext* txn,
    const ReadPreferenceSetting& readPref,
    const repl::ReadConcernLevel& readConcernLevel,
    const NamespaceString& nss,
    const BSONObj& query,
    const BSONObj& sort,
    boost::optional<long long> limit) {
    // Exhaustive finds must never be run against regular shards.
    invariant(getId() == "config");

    const auto swHost =
        _targeter->findHost(readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(txn));
    if (!swHost.isOK()) {
        return swHost.getStatus();
    }

    QueryResponse result;

    // This is what the caller sees if the fetcher never invokes the callback below.
    Status outcome(ErrorCodes::InternalError, "Internal error running find command");

    auto onBatch = [this, &outcome, &result](const Fetcher::QueryResponseStatus& batchStatus,
                                             Fetcher::NextAction* nextAction,
                                             BSONObjBuilder* getMoreBob) {
        if (!batchStatus.isOK()) {
            // An error discards everything accumulated so far.
            outcome = batchStatus.getStatus();
            result.docs.clear();
            return;
        }

        const auto& batch = batchStatus.getValue();
        const auto& metadata = batch.otherFields.metadata;

        if (metadata.hasField(rpc::kReplSetMetadataFieldName)) {
            auto swReplMetadata = rpc::ReplSetMetadata::readFromMetadata(metadata);
            if (!swReplMetadata.isOK()) {
                outcome = swReplMetadata.getStatus();
                result.docs.clear();
                return;
            }

            // Hand back the config opTime reported for this specific request. As a safeguard,
            // verify that the global configOpTime is not behind it.
            result.opTime = swReplMetadata.getValue().getLastOpCommitted();
            invariant(grid.configOpTime() >= result.opTime);
        }

        for (const BSONObj& fetched : batch.documents) {
            result.docs.push_back(fetched.getOwned());
        }
        outcome = Status::OK();

        // Fill in the follow-up getMore only when the fetcher supplied a builder for it.
        if (getMoreBob) {
            getMoreBob->append("getMore", batch.cursorId);
            getMoreBob->append("collection", batch.nss.coll());
        }
    };

    // Serialize the read concern and keep an owned copy of just the read concern sub-object.
    const BSONObj readConcernBson = [&] {
        invariant(readConcernLevel == repl::ReadConcernLevel::kMajorityReadConcern);
        const repl::ReadConcernArgs majorityAtConfigOpTime{grid.configOpTime(), readConcernLevel};

        BSONObjBuilder rcBuilder;
        majorityAtConfigOpTime.appendInfo(&rcBuilder);
        return rcBuilder.done()
            .getObjectField(repl::ReadConcernArgs::kReadConcernFieldName)
            .getOwned();
    }();

    auto findRequest = stdx::make_unique<QueryRequest>(nss);
    findRequest->setFilter(query);
    findRequest->setSort(sort);
    findRequest->setReadConcern(readConcernBson);
    findRequest->setLimit(limit);

    BSONObjBuilder cmdBuilder;
    findRequest->asFindCommand(&cmdBuilder);

    Microseconds timeLimit = std::min(duration_cast<Microseconds>(kConfigCommandTimeout),
                                      txn->getRemainingMaxTimeMicros());
    if (timeLimit < Milliseconds{1}) {
        // With less than 1ms left before the deadline, round the limit up to 1ms: a maxTimeMS of
        // 0 in a find command means "no max time", which is the opposite of what is wanted here.
        timeLimit = Milliseconds{1};
    }

    cmdBuilder.append(QueryRequest::cmdOptionMaxTimeMS, durationCount<Milliseconds>(timeLimit));

    Fetcher fetcher(Grid::get(txn)->getExecutorPool()->getFixedExecutor(),
                    swHost.getValue(),
                    nss.db().toString(),
                    cmdBuilder.done(),
                    onBatch,
                    _getMetadataForCommand(readPref),
                    duration_cast<Milliseconds>(timeLimit));

    const Status scheduled = fetcher.schedule();
    if (!scheduled.isOK()) {
        return scheduled;
    }

    fetcher.wait();

    updateReplSetMonitor(swHost.getValue(), outcome);

    if (outcome.isOK()) {
        return result;
    }

    if (outcome.compareCode(ErrorCodes::ExceededTimeLimit)) {
        LOG(0) << "Operation timed out with status " << outcome;
    }
    return outcome;
}
/**
 * Validates 'readConcernArgs' against this node's replication state and blocks until the
 * requested read can be served.
 *
 * Linearizable reads are rejected unless the node is a replica set primary running election
 * protocol version 1 and no afterOpTime was supplied. Unless snapshot behavior is being tested
 * in isolation, a non-empty read concern then waits via the replication coordinator's
 * waitUntilOpTimeForRead(). For majority reads the recovery unit is switched to the majority
 * committed snapshot, waiting for one to become available when necessary.
 *
 * Returns Status::OK() on success, otherwise the first error encountered.
 */
Status waitForReadConcern(OperationContext* txn, const repl::ReadConcernArgs& readConcernArgs) {
    repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(txn);

    const bool isLinearizable =
        readConcernArgs.getLevel() == repl::ReadConcernLevel::kLinearizableReadConcern;
    if (isLinearizable) {
        // Master/slave and standalone nodes cannot serve linearizable reads.
        if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::modeReplSet) {
            return Status(ErrorCodes::NotAReplicaSet,
                          "node needs to be a replica set member to use read concern");
        }

        // Linearizable read concern stays disabled on pv0 replica sets until further testing is
        // completed (SERVER-27025).
        if (!replCoord->isV1ElectionProtocol()) {
            return Status(
                ErrorCodes::IncompatibleElectionProtocol,
                "Replica sets running protocol version 0 do not support readConcern: linearizable");
        }

        if (!readConcernArgs.getOpTime().isNull()) {
            return Status(ErrorCodes::FailedToParse,
                          "afterOpTime not compatible with linearizable read concern");
        }

        if (!replCoord->getMemberState().primary()) {
            return Status(ErrorCodes::NotMaster,
                          "cannot satisfy linearizable read concern on non-primary node");
        }
    }

    // The OpTime wait is bypassed while snapshot behavior is being tested in isolation.
    if (!testingSnapshotBehaviorInIsolation && !readConcernArgs.isEmpty()) {
        const Status opTimeWait = replCoord->waitUntilOpTimeForRead(txn, readConcernArgs);
        if (!opTimeWait.isOK()) {
            return opTimeWait;
        }
    }

    const bool needsCommittedSnapshot =
        (replCoord->getReplicationMode() == repl::ReplicationCoordinator::Mode::modeReplSet ||
         testingSnapshotBehaviorInIsolation) &&
        readConcernArgs.getLevel() == repl::ReadConcernLevel::kMajorityReadConcern;
    if (!needsCommittedSnapshot) {
        return Status::OK();
    }

    // Protocol version 0 has no support for majority read concern.
    if (!testingSnapshotBehaviorInIsolation && !replCoord->isV1ElectionProtocol()) {
        return Status(ErrorCodes::ReadConcernMajorityNotEnabled,
                      str::stream() << "Replica sets running protocol version 0 do not support "
                                       "readConcern: majority");
    }

    // Config servers log these messages at debug level 1, every other role at level 2.
    const bool isConfigServer = serverGlobalParams.clusterRole == ClusterRole::ConfigServer;
    const int debugLevel = isConfigServer ? 1 : 2;

    LOG(debugLevel) << "Waiting for 'committed' snapshot to be available for reading: "
                    << readConcernArgs;

    // Retry selecting the committed snapshot for as long as none is available yet.
    Status snapshotStatus = txn->recoveryUnit()->setReadFromMajorityCommittedSnapshot();
    for (; snapshotStatus == ErrorCodes::ReadConcernMajorityNotAvailableYet;
         snapshotStatus = txn->recoveryUnit()->setReadFromMajorityCommittedSnapshot()) {
        LOG(debugLevel) << "Snapshot not available yet.";
        replCoord->waitUntilSnapshotCommitted(txn, SnapshotName::min());
    }

    if (!snapshotStatus.isOK()) {
        return snapshotStatus;
    }

    LOG(debugLevel) << "Using 'committed' snapshot: " << CurOp::get(txn)->query();

    return Status::OK();
}