bool isMMAPV1() {
    StorageEngine* globalStorageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
    invariant(globalStorageEngine);
    return globalStorageEngine->isMmapV1();
}
static void handleCursorCommand(OperationContext* txn,
                                const string& ns,
                                ClientCursorPin* pin,
                                PlanExecutor* exec,
                                const BSONObj& cmdObj,
                                BSONObjBuilder& result) {
    ClientCursor* cursor = pin ? pin->c() : NULL;
    if (pin) {
        invariant(cursor);
        invariant(cursor->getExecutor() == exec);
        invariant(cursor->isAggCursor());
    }

    BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
    const long long batchSize = batchSizeElem.isNumber() ? batchSizeElem.numberLong()
                                                         : 101;  // same as query

    // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
    BSONArrayBuilder resultsArray;
    const int byteLimit = MaxBytesToReturnToClientAtOnce;
    BSONObj next;
    for (int objCount = 0; objCount < batchSize; objCount++) {
        // The initial getNext() on a PipelineProxyStage may be very expensive so we don't
        // do it when batchSize is 0 since that indicates a desire for a fast return.
        if (exec->getNext(&next, NULL) != PlanExecutor::ADVANCED) {
            if (pin)
                pin->deleteUnderlying();
            // make it an obvious error to use cursor or executor after this point
            cursor = NULL;
            exec = NULL;
            break;
        }

        if (resultsArray.len() + next.objsize() > byteLimit) {
            // Get the pipeline proxy stage wrapped by this PlanExecutor.
            PipelineProxyStage* proxy = static_cast<PipelineProxyStage*>(exec->getRootStage());
            // too big. next will be the first doc in the second batch
            proxy->pushBack(next);
            break;
        }

        resultsArray.append(next);
    }

    // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should
    // be relatively quick since if there was no pin then the input is empty. Also, this
    // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that
    // case. This is ok for now however, since you can't have a sharded collection that doesn't
    // exist.
    const bool canReturnMoreBatches = pin;
    if (!canReturnMoreBatches && exec && !exec->isEOF()) {
        // msgasserting since this shouldn't be possible to trigger from today's aggregation
        // language. The wording assumes that the only reason pin would be null is if the
        // collection doesn't exist.
        msgasserted(17391,
                    str::stream()
                        << "Aggregation has more results than fit in initial batch, but can't "
                        << "create cursor since collection " << ns << " doesn't exist");
    }

    if (cursor) {
        // If a time limit was set on the pipeline, remaining time is "rolled over" to the
        // cursor (for use by future getmore ops).
        cursor->setLeftoverMaxTimeMicros(txn->getCurOp()->getRemainingMaxTimeMicros());

        // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent
        // getMore requests. The calling OpCtx gets a fresh RecoveryUnit.
        cursor->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        txn->setRecoveryUnit(storageEngine->newRecoveryUnit());

        // Cursor needs to be in a saved state while we yield locks for getmore. State
        // will be restored in getMore().
        exec->saveState();
    }

    const long long cursorId = cursor ? cursor->cursorid() : 0LL;
    Command::appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result);
}
static void repairDatabasesAndCheckVersion(OperationContext* txn) {
    LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite lk(txn->lockState());

    vector<string> dbNames;
    StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
    storageEngine->listDatabases(&dbNames);

    // Repair all databases first, so that we do not try to open them if they are in bad shape
    if (storageGlobalParams.repair) {
        for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
            const string dbName = *i;
            LOG(1) << " Repairing database: " << dbName << endl;

            fassert(18506, repairDatabase(txn, storageEngine, dbName));
        }
    }

    const repl::ReplSettings& replSettings =
        repl::getGlobalReplicationCoordinator()->getSettings();

    // On replica set members we only clear temp collections on DBs other than "local" during
    // promotion to primary. On pure slaves, they are only cleared when the oplog tells them
    // to. The local DB is special because it is not replicated. See SERVER-10927 for more
    // details.
    const bool shouldClearNonLocalTmpCollections =
        !(checkIfReplMissingFromCommandLine(txn) || replSettings.usingReplSets() ||
          replSettings.slave == repl::SimpleSlave);

    for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
        const string dbName = *i;
        LOG(1) << " Recovering database: " << dbName << endl;

        Database* db = dbHolder().openDb(txn, dbName);
        invariant(db);

        // First thing after opening the database is to check for file compatibility,
        // otherwise we might crash if this is a deprecated format.
        if (!db->getDatabaseCatalogEntry()->currentFilesCompatible(txn)) {
            log() << "****";
            log() << "cannot do this upgrade without an upgrade in the middle";
            log() << "please do a --repair with 2.6 and then start this version";
            dbexit(EXIT_NEED_UPGRADE);
            return;
        }

        // Major versions match, check indexes
        const string systemIndexes = db->name() + ".system.indexes";

        Collection* coll = db->getCollection(systemIndexes);
        unique_ptr<PlanExecutor> exec(
            InternalPlanner::collectionScan(txn, systemIndexes, coll, PlanExecutor::YIELD_MANUAL));

        BSONObj index;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) {
            const BSONObj key = index.getObjectField("key");
            const string plugin = IndexNames::findPluginName(key);

            if (db->getDatabaseCatalogEntry()->isOlderThan24(txn)) {
                if (IndexNames::existedBefore24(plugin)) {
                    continue;
                }

                log() << "Index " << index << " claims to be of type '" << plugin << "', "
                      << "which is either invalid or did not exist before v2.4. "
                      << "See the upgrade section: "
                      << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog;
            }

            const Status keyStatus = validateKeyPattern(key);
            if (!keyStatus.isOK()) {
                log() << "Problem with index " << index << ": " << keyStatus.reason()
                      << " This index can still be used however it cannot be rebuilt."
                      << " For more info see"
                      << " http://dochub.mongodb.org/core/index-validation" << startupWarningsLog;
            }

            if (index["v"].isNumber() && index["v"].numberInt() == 0) {
                log() << "WARNING: The index: " << index << " was created with the deprecated"
                      << " v:0 format. This format will not be supported in a future release."
                      << startupWarningsLog;
                log() << "\t To fix this, you need to rebuild this index."
                      << " For instructions, see http://dochub.mongodb.org/core/rebuild-v0-indexes"
                      << startupWarningsLog;
            }
        }

        if (PlanExecutor::IS_EOF != state) {
            warning() << "Internal error while reading collection " << systemIndexes;
        }

        if (replSettings.usingReplSets()) {
            // We only care about the _id index if we are in a replset
            checkForIdIndexes(txn, db);
        }

        if (shouldClearNonLocalTmpCollections || dbName == "local") {
            db->clearTmpCollections(txn);
        }
    }

    LOG(1) << "done repairDatabases" << endl;
}
Database* DatabaseHolder::getOrCreate(OperationContext* txn,
                                      const StringData& ns,
                                      bool& justCreated) {
    const StringData dbname = _todb( ns );
    invariant(txn->lockState()->isAtLeastReadLocked(dbname));

    if (txn->lockState()->isWriteLocked() && FileAllocator::get()->hasFailed()) {
        uassert(17507, "Can't take a write lock while out of disk space", false);
    }

    {
        SimpleMutex::scoped_lock lk(_m);
        {
            DBs::const_iterator i = _dbs.find(dbname);
            if( i != _dbs.end() ) {
                justCreated = false;
                return i->second;
            }
        }

        // todo: protect against getting sprayed with requests for different db names that DNE -
        //       that would make the DBs map very large. not clear what to do to handle though,
        //       perhaps just log it, which is what we do here with the "> 40" :
        bool cant = !txn->lockState()->isWriteLocked(ns);
        if( logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1)) ||
            _dbs.size() > 40 || cant || DEBUG_BUILD ) {
            log() << "opening db: " << dbname;
        }

        massert(15927,
                "can't open database in a read lock. if db was just closed, consider retrying "
                "the query. might otherwise indicate an internal error",
                !cant);
    }

    // we know we have a db exclusive lock here
    { // check casing
        string duplicate = Database::duplicateUncasedName(dbname.toString());
        if ( !duplicate.empty() ) {
            stringstream ss;
            ss << "db already exists with different case already have: [" << duplicate
               << "] trying to create [" << dbname.toString() << "]";
            uasserted( DatabaseDifferCaseCode , ss.str() );
        }
    }

    // we mark our thread as having done writes now as we do not want any exceptions
    // once we start creating a new database
    cc().writeHappened();

    // this locks _m for defensive checks, so we don't want to be locked right here :
    StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
    invariant(storageEngine);

    Database *db;
    {
        WriteUnitOfWork wunit(txn);
        DatabaseCatalogEntry* entry = storageEngine->getDatabaseCatalogEntry(txn, dbname);
        invariant(entry);
        justCreated = !entry->exists();
        db = new Database(txn, dbname, entry);
        wunit.commit();
    }

    {
        SimpleMutex::scoped_lock lk(_m);
        _dbs[dbname] = db;
    }

    return db;
}
static void repairDatabasesAndCheckVersion(OperationContext* txn) {
    LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite lk(txn->lockState());

    vector<string> dbNames;

    StorageEngine* storageEngine = txn->getServiceContext()->getGlobalStorageEngine();
    storageEngine->listDatabases(&dbNames);

    // Repair all databases first, so that we do not try to open them if they are in bad shape
    if (storageGlobalParams.repair) {
        invariant(!storageGlobalParams.readOnly);
        for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
            const string dbName = *i;
            LOG(1) << " Repairing database: " << dbName << endl;

            fassert(18506, repairDatabase(txn, storageEngine, dbName));
        }
    }

    const repl::ReplSettings& replSettings =
        repl::getGlobalReplicationCoordinator()->getSettings();

    // On replica set members we only clear temp collections on DBs other than "local" during
    // promotion to primary. On pure slaves, they are only cleared when the oplog tells them
    // to. The local DB is special because it is not replicated. See SERVER-10927 for more
    // details.
    const bool shouldClearNonLocalTmpCollections =
        !(checkIfReplMissingFromCommandLine(txn) || replSettings.usingReplSets() ||
          replSettings.isSlave());

    const bool shouldDoCleanupForSERVER23299 = isSubjectToSERVER23299(txn);

    for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
        const string dbName = *i;
        LOG(1) << " Recovering database: " << dbName << endl;

        Database* db = dbHolder().openDb(txn, dbName);
        invariant(db);

        // First thing after opening the database is to check for file compatibility,
        // otherwise we might crash if this is a deprecated format.
        auto status = db->getDatabaseCatalogEntry()->currentFilesCompatible(txn);
        if (!status.isOK()) {
            if (status.code() == ErrorCodes::CanRepairToDowngrade) {
                // Convert CanRepairToDowngrade statuses to MustUpgrade statuses to avoid logging a
                // potentially confusing and inaccurate message.
                //
                // TODO SERVER-24097: Log a message informing the user that they can start the
                // current version of mongod with --repair and then proceed with normal startup.
                status = {ErrorCodes::MustUpgrade, status.reason()};
            }
            severe() << "Unable to start mongod due to an incompatibility with the data files and"
                        " this version of mongod: " << status;
            severe() << "Please consult our documentation when trying to downgrade to a previous"
                        " major release";
            quickExit(EXIT_NEED_UPGRADE);
            return;
        }

        // Major versions match, check indexes
        const string systemIndexes = db->name() + ".system.indexes";

        Collection* coll = db->getCollection(systemIndexes);
        unique_ptr<PlanExecutor> exec(
            InternalPlanner::collectionScan(txn, systemIndexes, coll, PlanExecutor::YIELD_MANUAL));

        BSONObj index;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) {
            const BSONObj key = index.getObjectField("key");
            const string plugin = IndexNames::findPluginName(key);

            if (db->getDatabaseCatalogEntry()->isOlderThan24(txn)) {
                if (IndexNames::existedBefore24(plugin)) {
                    continue;
                }

                log() << "Index " << index << " claims to be of type '" << plugin << "', "
                      << "which is either invalid or did not exist before v2.4. "
                      << "See the upgrade section: "
                      << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog;
            }

            const Status keyStatus = validateKeyPattern(key);
            if (!keyStatus.isOK()) {
                log() << "Problem with index " << index << ": " << keyStatus.reason()
                      << " This index can still be used however it cannot be rebuilt."
                      << " For more info see"
                      << " http://dochub.mongodb.org/core/index-validation" << startupWarningsLog;
            }

            if (index["v"].isNumber() && index["v"].numberInt() == 0) {
                log() << "WARNING: The index: " << index << " was created with the deprecated"
                      << " v:0 format. This format will not be supported in a future release."
                      << startupWarningsLog;
                log() << "\t To fix this, you need to rebuild this index."
                      << " For instructions, see http://dochub.mongodb.org/core/rebuild-v0-indexes"
                      << startupWarningsLog;
            }
        }

        // Non-yielding collection scans from InternalPlanner will never error.
        invariant(PlanExecutor::IS_EOF == state);

        if (replSettings.usingReplSets()) {
            // We only care about the _id index if we are in a replset
            checkForIdIndexes(txn, db);
            // Ensure oplog is capped (mmap does not guarantee order of inserts on noncapped
            // collections)
            repl::checkForCappedOplog(txn);
        }

        if (shouldDoCleanupForSERVER23299) {
            handleSERVER23299ForDb(txn, db);
        }

        if (!storageGlobalParams.readOnly &&
            (shouldClearNonLocalTmpCollections || dbName == "local")) {
            db->clearTmpCollections(txn);
        }
    }

    LOG(1) << "done repairDatabases" << endl;
}
void IndexCatalogEntry::setMultikey(OperationContext* txn, const MultikeyPaths& multikeyPaths) {
    if (!_indexTracksPathLevelMultikeyInfo && isMultikey()) {
        // If the index is already set as multikey and we don't have any path-level information to
        // update, then there's nothing more for us to do.
        return;
    }

    if (_indexTracksPathLevelMultikeyInfo) {
        stdx::lock_guard<stdx::mutex> lk(_indexMultikeyPathsMutex);
        invariant(multikeyPaths.size() == _indexMultikeyPaths.size());

        bool newPathIsMultikey = false;
        for (size_t i = 0; i < multikeyPaths.size(); ++i) {
            if (!std::includes(_indexMultikeyPaths[i].begin(),
                               _indexMultikeyPaths[i].end(),
                               multikeyPaths[i].begin(),
                               multikeyPaths[i].end())) {
                // If 'multikeyPaths' contains a new path component that causes this index to be
                // multikey, then we must update the index metadata in the CollectionCatalogEntry.
                newPathIsMultikey = true;
                break;
            }
        }

        if (!newPathIsMultikey) {
            // Otherwise, if all the path components in 'multikeyPaths' are already tracked in
            // '_indexMultikeyPaths', then there's nothing more for us to do.
            return;
        }
    }

    {
        // Only one thread should set the multi-key value per collection, because the metadata for
        // a collection is one large document.
        Lock::ResourceLock collMDLock(txn->lockState(), ResourceId(RESOURCE_METADATA, _ns), MODE_X);

        if (!_indexTracksPathLevelMultikeyInfo && isMultikey()) {
            // It's possible that we raced with another thread when acquiring the MD lock. If the
            // index is already set as multikey and we don't have any path-level information to
            // update, then there's nothing more for us to do.
            return;
        }

        // This effectively emulates a side-transaction off the main transaction, which invoked
        // setMultikey. The reason we need it is to avoid artificial WriteConflicts, which happen
        // with snapshot isolation.
        {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            RecoveryUnitSwap ruSwap(txn, storageEngine->newRecoveryUnit());

            WriteUnitOfWork wuow(txn);

            // It's possible that the index type (e.g. ascending/descending index) supports
            // tracking path-level multikey information, but this particular index doesn't.
            // CollectionCatalogEntry::setIndexIsMultikey() requires that we discard the path-level
            // multikey information in order to avoid unintentionally setting path-level multikey
            // information on an index created before 3.4.
            if (_collection->setIndexIsMultikey(
                    txn,
                    _descriptor->indexName(),
                    _indexTracksPathLevelMultikeyInfo ? multikeyPaths : MultikeyPaths{})) {
                if (_infoCache) {
                    LOG(1) << _ns << ": clearing plan cache - index " << _descriptor->keyPattern()
                           << " set to multi key.";
                    _infoCache->clearQueryCache();
                }
            }

            wuow.commit();
        }
    }

    _isMultikey.store(true);

    if (_indexTracksPathLevelMultikeyInfo) {
        stdx::lock_guard<stdx::mutex> lk(_indexMultikeyPathsMutex);
        for (size_t i = 0; i < multikeyPaths.size(); ++i) {
            _indexMultikeyPaths[i].insert(multikeyPaths[i].begin(), multikeyPaths[i].end());
        }
    }
}
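// --- Illustrative sketch (not MongoDB source) ----------------------------------------------
// The std::includes() loop in IndexCatalogEntry::setMultikey() above only forces a catalog
// update when the incoming multikey paths carry a path component that is not already tracked.
// Below is a minimal, self-contained restatement of that check, assuming path-level multikey
// information is a vector of ordered sets of field-path component indexes (one set per indexed
// field); the type alias and function name here are hypothetical, not MongoDB identifiers.
#include <algorithm>
#include <cstddef>
#include <set>
#include <vector>

using SketchMultikeyPaths = std::vector<std::set<std::size_t>>;

// Returns true when 'incoming' contains at least one path component, for any indexed field,
// that is absent from 'tracked' -- i.e. the persisted metadata would need to be rewritten.
// Both vectors are expected to have the same size (one entry per indexed field), mirroring
// the invariant asserted in the function above.
bool sketchHasNewMultikeyComponent(const SketchMultikeyPaths& tracked,
                                   const SketchMultikeyPaths& incoming) {
    for (std::size_t i = 0; i < incoming.size(); ++i) {
        // std::includes() returns true only when every element of the second (sorted) range
        // already appears in the first; a false result means a new component was seen.
        if (!std::includes(tracked[i].begin(), tracked[i].end(),
                           incoming[i].begin(), incoming[i].end())) {
            return true;
        }
    }
    return false;  // everything in 'incoming' is already tracked; nothing to update
}
// --------------------------------------------------------------------------------------------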
void createOplog(OperationContext* txn) {
    Lock::GlobalWrite lk(txn->lockState());

    const char * ns = "local.oplog.$main";

    const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings();
    bool rs = !replSettings.replSet.empty();
    if( rs )
        ns = rsoplog;

    Client::Context ctx(txn, ns);
    Collection* collection = ctx.db()->getCollection(txn, ns );

    if ( collection ) {
        if (replSettings.oplogSize != 0) {
            int o = (int)(collection->getRecordStore()->storageSize(txn) / ( 1024 * 1024 ) );
            int n = (int)(replSettings.oplogSize / (1024 * 1024));
            if ( n != o ) {
                stringstream ss;
                ss << "cmdline oplogsize (" << n << ") different than existing (" << o
                   << ") see: http://dochub.mongodb.org/core/increase-oplog";
                log() << ss.str() << endl;
                throw UserException( 13257 , ss.str() );
            }
        }

        if( rs )
            return;

        initOpTimeFromOplog(txn, ns);
        return;
    }

    /* create an oplog collection, if it doesn't yet exist. */
    long long sz = 0;
    if ( replSettings.oplogSize != 0 ) {
        sz = replSettings.oplogSize;
    }
    else {
        /* not specified. pick a default size */
        sz = 50LL * 1024LL * 1024LL;
        if ( sizeof(int *) >= 8 ) {
#if defined(__APPLE__)
            // typically these are desktops (dev machines), so keep it smallish
            sz = (256-64) * 1024 * 1024;
#else
            sz = 990LL * 1024 * 1024;
            double free = File::freeSpace(storageGlobalParams.dbpath); //-1 if call not supported.
            long long fivePct = static_cast<long long>( free * 0.05 );
            if ( fivePct > sz )
                sz = fivePct;
            // we use 5% of free space up to 50GB (1TB free)
            static long long upperBound = 50LL * 1024 * 1024 * 1024;
            if (fivePct > upperBound)
                sz = upperBound;
#endif
        }
    }

    log() << "******" << endl;
    log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..."
          << endl;

    CollectionOptions options;
    options.capped = true;
    options.cappedSize = sz;
    options.autoIndexId = CollectionOptions::NO;

    WriteUnitOfWork wunit(txn);
    invariant(ctx.db()->createCollection(txn, ns, options));
    if( !rs )
        logOp(txn, "n", "", BSONObj() );
    wunit.commit();

    /* sync here so we don't get any surprising lag later when we try to sync */
    StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
    storageEngine->flushAllFiles(true);

    log() << "******" << endl;
}
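// --- Illustrative sketch (not MongoDB source) ----------------------------------------------
// createOplog() above picks a default oplog size when none is configured: 50MB on 32-bit
// builds, 192MB on 64-bit macOS, and otherwise the larger of 990MB and 5% of free disk space,
// capped at 50GB (reached at roughly 1TB free). A standalone restatement of that arithmetic
// for the 64-bit, non-macOS branch; the function name and the explicit freeBytes parameter are
// hypothetical and exist only to make the sketch self-contained.
#include <algorithm>
#include <cstdint>

std::int64_t sketchDefaultOplogSizeBytes(std::int64_t freeBytes) {
    const std::int64_t minSize = 990LL * 1024 * 1024;        // 990MB floor
    const std::int64_t maxSize = 50LL * 1024 * 1024 * 1024;  // 50GB ceiling
    // 5% of free space; a negative freeBytes (free-space query unsupported) falls back to the
    // floor via std::max below, matching the -1 sentinel handled in the code above.
    const std::int64_t fivePct = static_cast<std::int64_t>(freeBytes * 0.05);
    return std::min(maxSize, std::max(minSize, fivePct));
}
// --------------------------------------------------------------------------------------------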
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    LOG(2) << "Waiting for write concern. OpTime: " << replOpTime
           << ", write concern: " << writeConcern.toBSON();
    auto replCoord = repl::ReplicationCoordinator::get(txn);

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeWaitingForWriteConcern);

    // Next handle blocking on disk
    Timer syncTimer;
    WriteConcernOptions writeConcernWithPopulatedSyncMode =
        replCoord->populateUnsetWriteConcernOptionsSyncMode(writeConcern);

    switch (writeConcernWithPopulatedSyncMode.syncMode) {
        case WriteConcernOptions::SyncMode::UNSET:
            severe() << "Attempting to wait on a WriteConcern with an unset sync option";
            fassertFailed(34410);
        case WriteConcernOptions::SyncMode::NONE:
            break;
        case WriteConcernOptions::SyncMode::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::SyncMode::JOURNAL:
            if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::Mode::modeNone) {
                // Wait for ops to become durable then update replication system's
                // knowledge of this.
                OpTime appliedOpTime = replCoord->getMyLastAppliedOpTime();
                txn->recoveryUnit()->waitUntilDurable();
                replCoord->setMyLastDurableOpTimeForward(appliedOpTime);
            } else {
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcernWithPopulatedSyncMode.wNumNodes <= 1 &&
        writeConcernWithPopulatedSyncMode.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        replCoord->awaitReplication(txn, replOpTime, writeConcernWithPopulatedSyncMode);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(
        replOpTime,
        writeConcernWithPopulatedSyncMode.syncMode == WriteConcernOptions::SyncMode::JOURNAL);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
Status AuthzManagerExternalStateMongod::getAllDatabaseNames(OperationContext* txn,
                                                            std::vector<std::string>* dbnames) {
    StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
    storageEngine->listDatabases(dbnames);
    return Status::OK();
}
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    // We assume all options have been validated earlier, if not, programming error.
    // Passing the localDB name is a hack to avoid the more rigorous check performed for
    // non-local DBs.
    dassert(validateWriteConcern(txn, writeConcern, kLocalDB).isOK());

    // We should never be waiting for write concern while holding any sort of lock, because this
    // may lead to situations where the replication heartbeats are stalled.
    //
    // This check does not hold for writes done through dbeval because it runs with a global X
    // lock.
    dassert(!txn->lockState()->isLocked() || txn->getClient()->isInDirectClient());

    // Next handle blocking on disk
    Timer syncTimer;
    auto replCoord = repl::getGlobalReplicationCoordinator();
    WriteConcernOptions writeConcernWithPopulatedSyncMode =
        replCoord->populateUnsetWriteConcernOptionsSyncMode(writeConcern);

    switch (writeConcernWithPopulatedSyncMode.syncMode) {
        case WriteConcernOptions::SyncMode::UNSET:
            severe() << "Attempting to wait on a WriteConcern with an unset sync option";
            fassertFailed(34410);
        case WriteConcernOptions::SyncMode::NONE:
            break;
        case WriteConcernOptions::SyncMode::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::SyncMode::JOURNAL:
            if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::Mode::modeNone) {
                // Wait for ops to become durable then update replication system's
                // knowledge of this.
                OpTime appliedOpTime = replCoord->getMyLastAppliedOpTime();
                txn->recoveryUnit()->waitUntilDurable();
                replCoord->setMyLastDurableOpTimeForward(appliedOpTime);
            } else {
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcernWithPopulatedSyncMode.wNumNodes <= 1 &&
        writeConcernWithPopulatedSyncMode.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        repl::getGlobalReplicationCoordinator()->awaitReplication(
            txn, replOpTime, writeConcernWithPopulatedSyncMode);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(
        replOpTime,
        writeConcernWithPopulatedSyncMode.syncMode == WriteConcernOptions::SyncMode::JOURNAL);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
bool run(OperationContext* opCtx,
         const string& dbname,
         const BSONObj& cmdObj,
         BSONObjBuilder& result) final {
    CommandHelpers::handleMarkKillOnClientDisconnect(opCtx);
    IDLParserErrorContext ctx("listDatabases");
    auto cmd = ListDatabasesCommand::parse(ctx, cmdObj);
    auto* as = AuthorizationSession::get(opCtx->getClient());

    // {nameOnly: bool} - default false.
    const bool nameOnly = cmd.getNameOnly();

    // {authorizedDatabases: bool} - Dynamic default based on permissions.
    const bool authorizedDatabases = ([as](const boost::optional<bool>& authDB) {
        const bool mayListAllDatabases = as->isAuthorizedForActionsOnResource(
            ResourcePattern::forClusterResource(), ActionType::listDatabases);

        if (authDB) {
            uassert(ErrorCodes::Unauthorized,
                    "Insufficient permissions to list all databases",
                    authDB.get() || mayListAllDatabases);
            return authDB.get();
        }

        // By default, list all databases if we can, otherwise
        // only those we're allowed to find on.
        return !mayListAllDatabases;
    })(cmd.getAuthorizedDatabases());

    // {filter: matchExpression}.
    std::unique_ptr<MatchExpression> filter;
    if (auto filterObj = cmd.getFilter()) {
        // The collator is null because database metadata objects are compared using simple
        // binary comparison.
        const CollatorInterface* collator = nullptr;
        boost::intrusive_ptr<ExpressionContext> expCtx(new ExpressionContext(opCtx, collator));
        auto matcher =
            uassertStatusOK(MatchExpressionParser::parse(filterObj.get(), std::move(expCtx)));
        filter = std::move(matcher);
    }

    vector<string> dbNames;
    StorageEngine* storageEngine = getGlobalServiceContext()->getStorageEngine();
    {
        Lock::GlobalLock lk(opCtx, MODE_IS);
        CurOpFailpointHelpers::waitWhileFailPointEnabled(
            &hangBeforeListDatabases, opCtx, "hangBeforeListDatabases", []() {});
        dbNames = storageEngine->listDatabases();
    }

    vector<BSONObj> dbInfos;

    const bool filterNameOnly = filter &&
        filter->getCategory() == MatchExpression::MatchCategory::kLeaf &&
        filter->path() == kNameField;
    intmax_t totalSize = 0;
    for (const auto& dbname : dbNames) {
        if (authorizedDatabases && !as->isAuthorizedForAnyActionOnAnyResourceInDB(dbname)) {
            // We don't have listDatabases on the cluster or find on this database.
            continue;
        }

        BSONObjBuilder b;
        b.append("name", dbname);

        int64_t size = 0;
        if (!nameOnly) {
            // Filtering on name only should not require taking locks on filtered-out names.
            if (filterNameOnly && !filter->matchesBSON(b.asTempObj()))
                continue;

            AutoGetDb autoDb(opCtx, dbname, MODE_IS);
            Database* const db = autoDb.getDb();
            if (!db)
                continue;

            writeConflictRetry(opCtx, "sizeOnDisk", dbname, [&] {
                size = storageEngine->sizeOnDiskForDb(opCtx, dbname);
            });
            b.append("sizeOnDisk", static_cast<double>(size));

            b.appendBool(
                "empty",
                CollectionCatalog::get(opCtx).getAllCollectionUUIDsFromDb(dbname).empty());
        }
        BSONObj curDbObj = b.obj();

        if (!filter || filter->matchesBSON(curDbObj)) {
            totalSize += size;
            dbInfos.push_back(curDbObj);
        }
    }

    result.append("databases", dbInfos);

    if (!nameOnly) {
        result.append("totalSize", double(totalSize));
    }

    return true;
}
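// --- Illustrative sketch (not MongoDB source) ----------------------------------------------
// The lambda above computes the effective 'authorizedDatabases' flag: an explicit 'false'
// (list every database) is only honored when the caller holds the cluster-wide listDatabases
// privilege, and when the field is omitted the command lists everything only if that privilege
// is present. A standalone restatement using std::optional; the names are hypothetical and the
// real code throws via uassert rather than returning an "unauthorized" sentinel.
#include <optional>

// Returns the effective "restrict to authorized databases" flag, or std::nullopt when the
// request is unauthorized (caller explicitly asked for all databases without the privilege).
std::optional<bool> sketchEffectiveAuthorizedDatabases(std::optional<bool> requested,
                                                       bool mayListAllDatabases) {
    if (requested) {
        if (!*requested && !mayListAllDatabases) {
            return std::nullopt;  // asked for every database but lacks the privilege
        }
        return *requested;
    }
    // Field omitted: list everything when privileged, otherwise restrict to authorized DBs.
    return !mayListAllDatabases;
}
// --------------------------------------------------------------------------------------------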
// ran at startup.
static void repairDatabasesAndCheckVersion(bool shouldClearNonLocalTmpCollections) {
    LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

    OperationContextImpl txn;
    Lock::GlobalWrite lk(txn.lockState());

    vector< string > dbNames;

    StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
    storageEngine->listDatabases( &dbNames );

    for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
        string dbName = *i;
        LOG(1) << "\t" << dbName << endl;

        Client::Context ctx(&txn, dbName );

        if (repl::getGlobalReplicationCoordinator()->getSettings().usingReplSets()) {
            // we only care about the _id index if we are in a replset
            checkForIdIndexes(&txn, ctx.db());
        }

        if (shouldClearNonLocalTmpCollections || dbName == "local")
            ctx.db()->clearTmpCollections(&txn);

        if ( storageGlobalParams.repair ) {
            fassert(18506, storageEngine->repairDatabase(&txn, dbName));
        }
        else if (!ctx.db()->getDatabaseCatalogEntry()->currentFilesCompatible(&txn)) {
            log() << "****";
            log() << "cannot do this upgrade without an upgrade in the middle";
            log() << "please do a --repair with 2.6 and then start this version";
            dbexit( EXIT_NEED_UPGRADE );
            invariant( false );
            return;
        }
        else {
            // major versions match, check indexes
            const string systemIndexes = ctx.db()->name() + ".system.indexes";

            Collection* coll = ctx.db()->getCollection( &txn, systemIndexes );
            auto_ptr<PlanExecutor> exec(
                InternalPlanner::collectionScan(&txn, systemIndexes, coll));

            BSONObj index;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) {
                const BSONObj key = index.getObjectField("key");
                const string plugin = IndexNames::findPluginName(key);

                if (ctx.db()->getDatabaseCatalogEntry()->isOlderThan24(&txn)) {
                    if (IndexNames::existedBefore24(plugin))
                        continue;

                    log() << "Index " << index << " claims to be of type '" << plugin << "', "
                          << "which is either invalid or did not exist before v2.4. "
                          << "See the upgrade section: "
                          << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog;
                }

                const Status keyStatus = validateKeyPattern(key);
                if (!keyStatus.isOK()) {
                    log() << "Problem with index " << index << ": " << keyStatus.reason()
                          << " This index can still be used however it cannot be rebuilt."
                          << " For more info see"
                          << " http://dochub.mongodb.org/core/index-validation"
                          << startupWarningsLog;
                }
            }

            if (PlanExecutor::IS_EOF != state) {
                warning() << "Internal error while reading collection " << systemIndexes;
            }

            dbHolder().close( &txn, dbName );
        }
    }

    LOG(1) << "done repairDatabases" << endl;
}
void FSyncLockThread::run() {
    ThreadClient tc("fsyncLockWorker", getGlobalServiceContext());
    stdx::lock_guard<SimpleMutex> lkf(filesLockedFsync);
    stdx::unique_lock<stdx::mutex> lk(fsyncCmd.lockStateMutex);

    invariant(fsyncCmd.getLockCount_inLock() == 1);

    try {
        const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext();
        OperationContext& opCtx = *opCtxPtr;
        Lock::GlobalRead global(&opCtx);  // Block any writes in order to flush the files.

        StorageEngine* storageEngine = getGlobalServiceContext()->getStorageEngine();

        try {
            storageEngine->flushAllFiles(&opCtx, true);
        } catch (const std::exception& e) {
            error() << "error doing flushAll: " << e.what();
            fsyncCmd.threadStatus = Status(ErrorCodes::CommandFailed, e.what());
            fsyncCmd.acquireFsyncLockSyncCV.notify_one();
            return;
        }

        bool successfulFsyncLock = false;
        auto backupCursorHooks = BackupCursorHooks::get(opCtx.getServiceContext());
        try {
            writeConflictRetry(&opCtx,
                               "beginBackup",
                               "global",
                               [&opCtx, backupCursorHooks, &successfulFsyncLock, storageEngine] {
                                   if (backupCursorHooks->enabled()) {
                                       backupCursorHooks->fsyncLock(&opCtx);
                                       successfulFsyncLock = true;
                                   } else {
                                       // Have the uassert be caught by the DBException
                                       // block. Maintain "allowFsyncFailure" compatibility in
                                       // community.
                                       uassertStatusOK(storageEngine->beginBackup(&opCtx));
                                       successfulFsyncLock = true;
                                   }
                               });
        } catch (const DBException& e) {
            if (_allowFsyncFailure) {
                warning() << "Locking despite storage engine being unable to begin backup : "
                          << e.toString();
                opCtx.recoveryUnit()->waitUntilDurable();
            } else {
                error() << "storage engine unable to begin backup : " << e.toString();
                fsyncCmd.threadStatus = e.toStatus();
                fsyncCmd.acquireFsyncLockSyncCV.notify_one();
                return;
            }
        }

        fsyncCmd.threadStarted = true;
        fsyncCmd.acquireFsyncLockSyncCV.notify_one();

        while (fsyncCmd.getLockCount_inLock() > 0) {
            fsyncCmd.releaseFsyncLockSyncCV.wait(lk);
        }

        if (successfulFsyncLock) {
            if (backupCursorHooks->enabled()) {
                backupCursorHooks->fsyncUnlock(&opCtx);
            } else {
                storageEngine->endBackup(&opCtx);
            }
        }
    } catch (const std::exception& e) {
        severe() << "FSyncLockThread exception: " << e.what();
        fassertFailed(40350);
    }
}
virtual bool errmsgRun(OperationContext* opCtx,
                       const string& dbname,
                       const BSONObj& cmdObj,
                       string& errmsg,
                       BSONObjBuilder& result) {
    if (opCtx->lockState()->isLocked()) {
        errmsg = "fsync: Cannot execute fsync command from contexts that hold a data lock";
        return false;
    }

    const bool sync =
        !cmdObj["async"].trueValue();  // async means do an fsync, but return immediately
    const bool lock = cmdObj["lock"].trueValue();
    log() << "CMD fsync: sync:" << sync << " lock:" << lock;

    // fsync + lock is sometimes used to block writes out of the system and does not care if
    // the `BackupCursorService::fsyncLock` call succeeds.
    const bool allowFsyncFailure =
        getTestCommandsEnabled() && cmdObj["allowFsyncFailure"].trueValue();

    if (!lock) {
        // Take a global IS lock to ensure the storage engine is not shutdown
        Lock::GlobalLock global(opCtx, MODE_IS);
        StorageEngine* storageEngine = getGlobalServiceContext()->getStorageEngine();
        result.append("numFiles", storageEngine->flushAllFiles(opCtx, sync));
        return true;
    }

    Lock::ExclusiveLock lk(opCtx->lockState(), commandMutex);

    if (!sync) {
        errmsg = "fsync: sync option must be true when using lock";
        return false;
    }

    const auto lockCountAtStart = getLockCount();
    invariant(lockCountAtStart > 0 || !_lockThread);

    acquireLock();

    if (lockCountAtStart == 0) {
        Status status = Status::OK();
        {
            stdx::unique_lock<stdx::mutex> lk(lockStateMutex);
            threadStatus = Status::OK();
            threadStarted = false;
            _lockThread = stdx::make_unique<FSyncLockThread>(allowFsyncFailure);
            _lockThread->go();

            while (!threadStarted && threadStatus.isOK()) {
                acquireFsyncLockSyncCV.wait(lk);
            }

            // 'threadStatus' must be copied while 'lockStateMutex' is held.
            status = threadStatus;
        }

        if (!status.isOK()) {
            releaseLock();
            warning() << "fsyncLock failed. Lock count reset to 0. Status: " << status;
            uassertStatusOK(status);
        }
    }

    log() << "mongod is locked and no writes are allowed. db.fsyncUnlock() to unlock";
    log() << "Lock count is " << getLockCount();
    log() << " For more info see " << FSyncCommand::url();
    result.append("info", "now locked against writes, use db.fsyncUnlock() to unlock");
    result.append("lockCount", getLockCount());
    result.append("seeAlso", FSyncCommand::url());

    return true;
}