/**
 * Opens the RocksDB database at `path` and rebuilds the in-memory prefix bookkeeping.
 *
 * Steps, in order:
 *   1. Create the LRU block cache: half of the memory size reported by ProcessInfo, truncated
 *      to whole GB, with a floor of 1GB.
 *   2. Open the database with the options returned by _options().
 *   3. Initialise _maxPrefix from the last key in the database (0 if the database is empty).
 *   4. Load every metadata record (keys starting with kMetadataPrefix) into _identPrefixMap.
 *
 * @param path     directory of the RocksDB database; also stored in _path.
 * @param durable  stored in _durable.
 *
 * Malformed metadata trips an invariant. Constructing the BSONObj from a stored value may
 * additionally throw DBException, which is allowed to propagate to the caller.
 */
RocksEngine::RocksEngine(const std::string& path, bool durable)
    : _path(path), _durable(durable) {
    {  // create block cache
        uint64_t cacheSizeGB = 0;
        ProcessInfo pi;
        unsigned long long memSizeMB = pi.getMemSizeMB();
        if (memSizeMB > 0) {
            double cacheMB = memSizeMB / 2;
            cacheSizeGB = static_cast<uint64_t>(cacheMB / 1024);
        }
        // Never go below 1GB, including when the memory size could not be determined.
        if (cacheSizeGB < 1) {
            cacheSizeGB = 1;
        }
        _block_cache = rocksdb::NewLRUCache(cacheSizeGB * 1024 * 1024 * 1024LL);
    }

    // open DB
    rocksdb::DB* db;
    auto s = rocksdb::DB::Open(_options(), path, &db);
    ROCKS_STATUS_OK(s);
    _db.reset(db);

    // open iterator (a plain local; named without the member-style underscore prefix)
    boost::scoped_ptr<rocksdb::Iterator> iter(_db->NewIterator(rocksdb::ReadOptions()));

    // find maxPrefix
    _maxPrefix = 0;
    iter->SeekToLast();
    if (iter->Valid()) {
        // otherwise the DB is empty, so we just keep it at 0
        bool ok = extractPrefix(iter->key(), &_maxPrefix);
        // this is DB corruption here
        invariant(ok);
    }

    // load ident to prefix map
    {
        boost::mutex::scoped_lock lk(_identPrefixMapMutex);
        for (iter->Seek(kMetadataPrefix);
             iter->Valid() && iter->key().starts_with(kMetadataPrefix);
             iter->Next()) {
            // The ident is the key with the metadata prefix stripped off.
            rocksdb::Slice ident(iter->key());
            ident.remove_prefix(kMetadataPrefix.size());
            // this could throw DBException, which then means DB corruption. We just let it fly
            // to the caller
            BSONObj identConfig(iter->value().data());
            BSONElement element = identConfig.getField("prefix");
            // TODO: SERVER-16979 Correctly handle errors returned by RocksDB
            // This is DB corruption: the "prefix" field must exist AND be numeric. (The previous
            // condition used '||' with a negated isNumber(), which could never fail.)
            invariant(!element.eoo() && element.isNumber());
            uint32_t identPrefix = static_cast<uint32_t>(element.numberInt());
            _identPrefixMap[StringData(ident.data(), ident.size())] = identPrefix;
        }
    }
}
size_t WiredTigerUtil::getCacheSizeMB(double requestedCacheSizeGB) { double cacheSizeMB; const double kMaxSizeCacheMB = 10 * 1000 * 1000; if (requestedCacheSizeGB == 0) { // Choose a reasonable amount of cache when not explicitly specified by user. // Set a minimum of 256MB, otherwise use 50% of available memory over 1GB. ProcessInfo pi; double memSizeMB = pi.getMemSizeMB(); cacheSizeMB = std::max((memSizeMB - 1024) * 0.5, 256.0); } else { cacheSizeMB = 1024 * requestedCacheSizeGB; } if (cacheSizeMB > kMaxSizeCacheMB) { log() << "Requested cache size: " << cacheSizeMB << "MB exceeds max; setting to " << kMaxSizeCacheMB << "MB"; cacheSizeMB = kMaxSizeCacheMB; } return static_cast<size_t>(cacheSizeMB); }
bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { ProcessInfo p; BSONObjBuilder bSys, bOs; bSys.appendDate( "currentTime" , jsTime() ); bSys.append( "hostname" , prettyHostName() ); bSys.append( "cpuAddrSize", p.getAddrSize() ); bSys.append( "memSizeMB", static_cast <unsigned>( p.getMemSizeMB() ) ); bSys.append( "numCores", p.getNumCores() ); bSys.append( "cpuArch", p.getArch() ); bSys.append( "numaEnabled", p.hasNumaEnabled() ); bOs.append( "type", p.getOsType() ); bOs.append( "name", p.getOsName() ); bOs.append( "version", p.getOsVersion() ); result.append( StringData( "system" ), bSys.obj() ); result.append( StringData( "os" ), bOs.obj() ); p.appendSystemDetails( result ); return true; }
/**
 * Opens the WiredTiger connection for the database directory `path` and sets up the session
 * cache, the journal flusher (durable mode only) and the size storer.
 *
 * @param path              database directory handed to wiredtiger_open().
 * @param extraOpenOptions  extra configuration appended verbatim to the generated config string.
 * @param durable           whether journaling is used; when false the WiredTiger log is disabled
 *                          after an optional recovery pass over an existing journal directory.
 * @param repair            when true and the size-storer table exists, it is salvaged if needed
 *                          before being loaded.
 *
 * Cache size: wiredTigerGlobalOptions.cacheSizeGB if non-zero; otherwise 60% of the memory size
 * reported by ProcessInfo beyond the first 1024MB, truncated to whole GB, and never less than 1GB.
 *
 * wiredtiger_open() failures are fatal: EINVAL goes through fassertFailedNoTrace(), any other
 * non-zero code through msgassertedNoTrace(). A failure to create the journal directory is
 * logged and rethrown.
 */
WiredTigerKVEngine::WiredTigerKVEngine(const std::string& path,
                                       const std::string& extraOpenOptions,
                                       bool durable,
                                       bool repair)
    : _eventHandler(WiredTigerUtil::defaultEventHandlers()),
      _path(path),
      _durable(durable),
      _sizeStorerSyncTracker(100000, 60 * 1000) {
    size_t cacheSizeGB = wiredTigerGlobalOptions.cacheSizeGB;
    if (cacheSizeGB == 0) {
        // Since the user didn't provide a cache size, choose a reasonable default value.
        // We want to reserve 1GB for the system and binaries, but it's not bad to
        // leave a fair amount left over for pagecache since that's compressed storage.
        ProcessInfo pi;
        double memSizeMB = pi.getMemSizeMB();
        if (memSizeMB > 0) {
            double cacheMB = (memSizeMB - 1024) * 0.6;
            cacheSizeGB = static_cast<size_t>(cacheMB / 1024);
        }
        // Apply the 1GB floor even when the memory size is reported as 0; otherwise the
        // config string below would carry "cache_size=0G".
        if (cacheSizeGB < 1)
            cacheSizeGB = 1;
    }

    boost::filesystem::path journalPath = path;
    journalPath /= "journal";
    if (_durable) {
        if (!boost::filesystem::exists(journalPath)) {
            try {
                boost::filesystem::create_directory(journalPath);
            } catch (std::exception& e) {
                log() << "error creating journal dir " << journalPath.string() << ' ' << e.what();
                throw;
            }
        }
    }

    _previousCheckedDropsQueued = Date_t::now();

    std::stringstream ss;
    ss << "create,";
    ss << "cache_size=" << cacheSizeGB << "G,";
    ss << "session_max=20000,";
    ss << "eviction=(threads_max=4),";
    ss << "config_base=false,";
    ss << "statistics=(fast),";
    // The setting may have a later setting override it if not using the journal.  We make it
    // unconditional here because even nojournal may need this setting if it is a transition
    // from using the journal.
    ss << "log=(enabled=true,archive=true,path=journal,compressor=";
    ss << wiredTigerGlobalOptions.journalCompressor << "),";
    ss << "file_manager=(close_idle_time=100000),";  //~28 hours, will put better fix in 3.1.x
    ss << "checkpoint=(wait=" << wiredTigerGlobalOptions.checkpointDelaySecs;
    ss << ",log_size=2GB),";
    ss << "statistics_log=(wait=" << wiredTigerGlobalOptions.statisticsLogDelaySecs << "),";
    ss << WiredTigerCustomizationHooks::get(getGlobalServiceContext())->getOpenConfig("system");
    ss << extraOpenOptions;
    if (!_durable) {
        // If we started without the journal, but previously used the journal then open with the
        // WT log enabled to perform any unclean shutdown recovery and then close and reopen in
        // the normal path without the journal.
        if (boost::filesystem::exists(journalPath)) {
            string config = ss.str();
            log() << "Detected WT journal files. Running recovery from last checkpoint.";
            log() << "journal to nojournal transition config: " << config;
            int ret = wiredtiger_open(path.c_str(), &_eventHandler, config.c_str(), &_conn);
            if (ret == EINVAL) {
                fassertFailedNoTrace(28717);
            } else if (ret != 0) {
                Status s(wtRCToStatus(ret));
                msgassertedNoTrace(28718, s.reason());
            }
            // Recovery is done; close so the connection can be reopened without the log.
            invariantWTOK(_conn->close(_conn, NULL));
        }
        // This setting overrides the earlier setting because it is later in the config string.
        ss << ",log=(enabled=false),";
    }
    string config = ss.str();
    log() << "wiredtiger_open config: " << config;
    int ret = wiredtiger_open(path.c_str(), &_eventHandler, config.c_str(), &_conn);
    // Invalid argument (EINVAL) is usually caused by invalid configuration string.
    // We still fassert() but without a stack trace.
    if (ret == EINVAL) {
        fassertFailedNoTrace(28561);
    } else if (ret != 0) {
        Status s(wtRCToStatus(ret));
        msgassertedNoTrace(28595, s.reason());
    }

    _sessionCache.reset(new WiredTigerSessionCache(this));

    // The journal flusher is only started in durable mode.
    if (_durable) {
        _journalFlusher = stdx::make_unique<WiredTigerJournalFlusher>(_sessionCache.get());
        _journalFlusher->go();
    }

    _sizeStorerUri = "table:sizeStorer";
    {
        WiredTigerSession session(_conn);
        if (repair && _hasUri(session.getSession(), _sizeStorerUri)) {
            log() << "Repairing size cache";
            fassertNoTrace(28577, _salvageIfNeeded(_sizeStorerUri.c_str()));
        }
        _sizeStorer.reset(new WiredTigerSizeStorer(_conn, _sizeStorerUri));
        _sizeStorer->fillCache();
    }
}
/**
 * Opens the RocksDB database at `path` and rebuilds the engine's in-memory state.
 *
 * Steps, in order:
 *   1. Create the LRU block cache: rocksGlobalOptions.cacheSizeGB if non-zero, otherwise half of
 *      the memory size reported by ProcessInfo truncated to whole GB, with a floor of 1GB.
 *   2. Create the write rate limiter from rocksGlobalOptions.maxWriteMBPerSec.
 *   3. Open the database with the options returned by _options(), then create the counter
 *      manager and the compaction scheduler on top of it.
 *   4. Compute _maxPrefix from the last key and from every metadata record, loading the
 *      ident -> prefix map along the way, then bump it by one.
 *   5. Walk the dropped-prefix records: prefixes that still have data are added to
 *      _droppedPrefixes; records for prefixes with no remaining data are deleted.
 *
 * @param path     directory of the RocksDB database; also stored in _path.
 * @param durable  stored in _durable.
 *
 * RocksDB errors and malformed metadata trip invariants. Constructing the BSONObj from a stored
 * value may additionally throw DBException, which is allowed to propagate to the caller.
 */
RocksEngine::RocksEngine(const std::string& path, bool durable)
    : _path(path), _durable(durable), _maxPrefix(0) {
    {  // create block cache
        uint64_t cacheSizeGB = rocksGlobalOptions.cacheSizeGB;
        if (cacheSizeGB == 0) {
            ProcessInfo pi;
            unsigned long long memSizeMB = pi.getMemSizeMB();
            if (memSizeMB > 0) {
                double cacheMB = memSizeMB / 2;
                cacheSizeGB = static_cast<uint64_t>(cacheMB / 1024);
            }
            if (cacheSizeGB < 1) {
                cacheSizeGB = 1;
            }
        }
        _block_cache = rocksdb::NewLRUCache(cacheSizeGB * 1024 * 1024 * 1024LL, 6);
    }
    _maxWriteMBPerSec = rocksGlobalOptions.maxWriteMBPerSec;
    _rateLimiter.reset(
        rocksdb::NewGenericRateLimiter(static_cast<int64_t>(_maxWriteMBPerSec) * 1024 * 1024));

    // open DB
    rocksdb::DB* db;
    auto s = rocksdb::DB::Open(_options(), path, &db);
    invariantRocksOK(s);
    _db.reset(db);

    _counterManager.reset(
        new RocksCounterManager(_db.get(), rocksGlobalOptions.crashSafeCounters));
    _compactionScheduler.reset(new RocksCompactionScheduler(_db.get()));

    // open iterator
    boost::scoped_ptr<rocksdb::Iterator> iter(_db->NewIterator(rocksdb::ReadOptions()));

    // find maxPrefix
    iter->SeekToLast();
    if (iter->Valid()) {
        // otherwise the DB is empty, so we just keep it at 0
        bool ok = extractPrefix(iter->key(), &_maxPrefix);
        // this is DB corruption here
        invariant(ok);
    }

    // load ident to prefix map. also update _maxPrefix if there's any prefix bigger than
    // current _maxPrefix
    {
        boost::lock_guard<boost::mutex> lk(_identPrefixMapMutex);
        for (iter->Seek(kMetadataPrefix);
             iter->Valid() && iter->key().starts_with(kMetadataPrefix);
             iter->Next()) {
            invariantRocksOK(iter->status());
            // The ident is the key with the metadata prefix stripped off.
            rocksdb::Slice ident(iter->key());
            ident.remove_prefix(kMetadataPrefix.size());
            // this could throw DBException, which then means DB corruption. We just let it fly
            // to the caller
            BSONObj identConfig(iter->value().data());
            BSONElement element = identConfig.getField("prefix");

            if (element.eoo() || !element.isNumber()) {
                log() << "Mongo metadata in RocksDB database is corrupted.";
                invariant(false);
            }

            uint32_t identPrefix = static_cast<uint32_t>(element.numberInt());
            _identPrefixMap[StringData(ident.data(), ident.size())] = identPrefix;

            _maxPrefix = std::max(_maxPrefix, identPrefix);
        }
    }

    // just to be extra sure. we need this if last collection is oplog -- in that case we
    // reserve prefix+1 for oplog key tracker
    ++_maxPrefix;

    // load dropped prefixes
    {
        rocksdb::WriteBatch wb;
        // we will use this iter to check if prefixes are still alive
        boost::scoped_ptr<rocksdb::Iterator> prefixIter(
            _db->NewIterator(rocksdb::ReadOptions()));
        for (iter->Seek(kDroppedPrefix);
             iter->Valid() && iter->key().starts_with(kDroppedPrefix);
             iter->Next()) {
            invariantRocksOK(iter->status());
            rocksdb::Slice prefix(iter->key());
            prefix.remove_prefix(kDroppedPrefix.size());
            prefixIter->Seek(prefix);
            // Check the iterator that was just repositioned. A failed seek would otherwise look
            // like "no data left for this prefix" and the dropped-prefix record would be
            // deleted below.
            invariantRocksOK(prefixIter->status());
            if (prefixIter->Valid() && prefixIter->key().starts_with(prefix)) {
                // prefix is still alive, let's instruct the compaction filter to clear it up
                uint32_t int_prefix;
                bool ok = extractPrefix(prefix, &int_prefix);
                invariant(ok);
                {
                    boost::lock_guard<boost::mutex> lk(_droppedPrefixesMutex);
                    _droppedPrefixes.insert(int_prefix);
                }
            } else {
                // prefix is no longer alive. let's remove the prefix from our dropped prefixes
                // list
                wb.Delete(iter->key());
            }
        }
        // Only issue a write when there is something to delete.
        if (wb.Count() > 0) {
            auto s = _db->Write(rocksdb::WriteOptions(), &wb);
            invariantRocksOK(s);
        }
    }
}
/**
 * Opens the WiredTiger connection for the database directory `path`, then creates the session
 * cache and loads the size storer.
 *
 * @param path              database directory handed to wiredtiger_open().
 * @param extraOpenOptions  extra configuration appended verbatim to the generated config string.
 * @param durable           when true, the "journal" sub-directory is created if missing and the
 *                          WiredTiger log is enabled in the config.
 *
 * The cache size is 1/50th of the memory size reported by ProcessInfo, truncated to whole GB,
 * and never less than 1GB. A failure to create the journal directory is logged and rethrown.
 */
WiredTigerKVEngine::WiredTigerKVEngine( const std::string& path,
                                        const std::string& extraOpenOptions,
                                        bool durable )
    : _durable( durable ),
      _epoch( 0 ),
      _sizeStorerSyncTracker( 100000, 60 * 1000 ) {

    // Route WiredTiger's callbacks to our handlers.
    _eventHandler.handle_error = mdb_handle_error;
    _eventHandler.handle_message = mdb_handle_message;
    _eventHandler.handle_progress = mdb_handle_progress;
    _eventHandler.handle_close = mdb_handle_close;

    // Pick the cache size; stays at 1GB when the memory size is reported as 0.
    int cacheSizeGB = 1;
    {
        ProcessInfo procInfo;
        const unsigned long long totalMemMB = procInfo.getMemSizeMB();
        if ( totalMemMB > 0 ) {
            const double budgetMB = totalMemMB / 50;
            const int wholeGB = static_cast<int>( budgetMB / 1024 );
            cacheSizeGB = ( wholeGB < 1 ) ? 1 : wholeGB;
        }
    }

    // Journaling needs its directory to exist before the connection is opened.
    if ( _durable ) {
        boost::filesystem::path journalDir = path;
        journalDir /= "journal";
        const bool journalDirMissing = !boost::filesystem::exists( journalDir );
        if ( journalDirMissing ) {
            try {
                boost::filesystem::create_directory( journalDir );
            }
            catch ( std::exception& ex ) {
                log() << "error creating journal dir " << journalDir.string() << ' ' << ex.what();
                throw;
            }
        }
    }

    // Assemble the wiredtiger_open() configuration string.
    std::stringstream configStream;
    configStream << "create,"
                 << "cache_size=" << cacheSizeGB << "G,"
                 << "session_max=20000,"
                 << "extensions=[local=(entry=index_collator_extension)],"
                 << "statistics=(all),";
    if ( _durable ) {
        configStream << "log=(enabled=true,archive=true,path=journal),";
    }
    configStream << "checkpoint=(wait=60,log_size=2GB),"
                 << extraOpenOptions;

    string config = configStream.str();
    log() << "wiredtiger_open config: " << config;
    invariantWTOK( wiredtiger_open( path.c_str(), &_eventHandler, config.c_str(), &_conn ) );

    _sessionCache.reset( new WiredTigerSessionCache( this ) );

    // Load the size storer through a temporary session on the new connection.
    _sizeStorerUri = "table:sizeStorer";
    {
        WiredTigerSession session( _conn, -1 );
        WiredTigerSizeStorer* storer = new WiredTigerSizeStorer();
        storer->loadFrom( &session, _sizeStorerUri );
        _sizeStorer.reset( storer );
    }
}