// Background thread body for the "fsync with lock" command: flushes the
// journal and data files while holding the global lock, then parks with
// fsyncCmd.locked == true until an unlock request arrives.
void FSyncLockThread::doRealWork() {
    SimpleMutex::scoped_lock lkf(filesLockedFsync);

    OperationContextImpl txn;
    ScopedTransaction transaction(&txn, MODE_X);
    Lock::GlobalWrite global(txn.lockState()); // No WriteUnitOfWork needed

    SimpleMutex::scoped_lock lk(fsyncCmd.m);

    invariant(!fsyncCmd.locked); // impossible to get here if locked is true

    try {
        getDur().syncDataAndTruncateJournal(&txn);
    }
    catch( std::exception& e ) {
        // Report the failure to the waiting command thread and bail out.
        error() << "error doing syncDataAndTruncateJournal: " << e.what() << endl;
        fsyncCmd.err = e.what();
        fsyncCmd._threadSync.notify_one();
        fsyncCmd.locked = false;
        return;
    }

    // Journal is durable now; a shared global lock suffices for the data-file flush.
    txn.lockState()->downgradeGlobalXtoSForMMAPV1();

    try {
        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        storageEngine->flushAllFiles(true);
    }
    catch( std::exception& e ) {
        error() << "error doing flushAll: " << e.what() << endl;
        fsyncCmd.err = e.what();
        fsyncCmd._threadSync.notify_one();
        fsyncCmd.locked = false;
        return;
    }

    invariant(!fsyncCmd.locked);
    fsyncCmd.locked = true;

    // Wake the command thread: the server is now locked for writes.
    fsyncCmd._threadSync.notify_one();

    // Hold the global lock here until fsyncUnlock signals us.
    while ( ! fsyncCmd.pendingUnlock ) {
        fsyncCmd._unlockSync.wait(fsyncCmd.m);
    }
    fsyncCmd.pendingUnlock = false;

    fsyncCmd.locked = false;
    fsyncCmd.err = "unlocked";

    fsyncCmd._unlockSync.notify_one();
}
// Reads the initial-sync flag from the local minvalid singleton document.
// Returns false when the document is missing or the flag is unset; retries
// the read on write conflicts via the surrounding macro loop.
bool getInitialSyncFlag() {
    OperationContextImpl txn;
    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        ScopedTransaction transaction(&txn, MODE_IX);
        Lock::DBLock lk(txn.lockState(), "local", MODE_X);
        BSONObj mv;
        // Singleton lookup of the minvalid document.
        bool found = Helpers::getSingleton(&txn, minvalidNS, mv);

        if (found) {
            return mv[initialSyncFlagString].trueValue();
        }
        return false;
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(&txn, "getInitialSyncFlags", minvalidNS);
}
// Checks that Helpers::getLocsInRange returns disk locations for exactly the
// documents in the requested _id range, and that each returned location
// resolves back to one of the inserted (tagged) documents.
TEST(DBHelperTests, FindDiskLocs) {
    OperationContextImpl txn;
    DBDirectClient client(&txn);

    // Some unique tag we can use to make sure we're pulling back the right data
    OID tag = OID::gen();
    client.remove( ns, BSONObj() );

    int numDocsInserted = 10;
    for ( int i = 0; i < numDocsInserted; ++i ) {
        client.insert( ns, BSON( "_id" << i << "tag" << tag ) );
    }

    // Large enough that the size cap cannot trip for this data set.
    long long maxSizeBytes = 1024 * 1024 * 1024;

    set<DiskLoc> locs;
    long long numDocsFound;
    long long estSizeBytes;
    {
        // search _id range (0, 10)
        Lock::DBRead lk(txn.lockState(), ns);

        KeyRange range( ns,
                        BSON( "_id" << 0 ),
                        BSON( "_id" << numDocsInserted ),
                        BSON( "_id" << 1 ) );

        Status result = Helpers::getLocsInRange( &txn,
                                                 range,
                                                 maxSizeBytes,
                                                 &locs,
                                                 &numDocsFound,
                                                 &estSizeBytes );
        // All inserted docs must be found, with a sane non-zero size estimate.
        ASSERT_EQUALS( result, Status::OK() );
        ASSERT_EQUALS( numDocsFound, numDocsInserted );
        ASSERT_NOT_EQUALS( estSizeBytes, 0 );
        ASSERT_LESS_THAN( estSizeBytes, maxSizeBytes );

        Database* db = dbHolder().get( &txn, nsToDatabase(range.ns) );
        const Collection* collection = db->getCollection(&txn, ns);

        // Make sure all the disklocs actually correspond to the right info
        for ( set<DiskLoc>::const_iterator it = locs.begin(); it != locs.end(); ++it ) {
            const BSONObj obj = collection->docFor(&txn, *it);
            ASSERT_EQUALS(obj["tag"].OID(), tag);
        }
    }
}
// Legacy (MMAPv1-era) fsync-and-lock thread body: journal flush under the
// global write lock, data-file flush after downgrading, then park until an
// unlock request arrives (fsyncCmd.pendingUnlock).
void FSyncLockThread::doRealWork() {
    SimpleMutex::scoped_lock lkf(filesLockedFsync);

    OperationContextImpl txn; // XXX?
    Lock::GlobalWrite global(txn.lockState());

    SimpleMutex::scoped_lock lk(fsyncCmd.m);

    verify( ! fsyncCmd.locked ); // impossible to get here if locked is true

    try {
        getDur().syncDataAndTruncateJournal();
    }
    catch( std::exception& e ) {
        // Surface the error to the waiting command thread and give up.
        error() << "error doing syncDataAndTruncateJournal: " << e.what() << endl;
        fsyncCmd.err = e.what();
        fsyncCmd._threadSync.notify_one();
        fsyncCmd.locked = false;
        return;
    }

    // Journal is durable; a shared lock is enough for the data-file flush.
    global.downgrade();

    try {
        MemoryMappedFile::flushAll(true);
    }
    catch( std::exception& e ) {
        error() << "error doing flushAll: " << e.what() << endl;
        fsyncCmd.err = e.what();
        fsyncCmd._threadSync.notify_one();
        fsyncCmd.locked = false;
        return;
    }

    verify( ! fsyncCmd.locked );
    fsyncCmd.locked = true;

    // Tell the command thread the server is now locked for writes.
    fsyncCmd._threadSync.notify_one();

    // Hold the lock here until an unlock request arrives.
    while ( ! fsyncCmd.pendingUnlock ) {
        fsyncCmd._unlockSync.wait(fsyncCmd.m);
    }
    fsyncCmd.pendingUnlock = false;

    fsyncCmd.locked = false;
    fsyncCmd.err = "unlocked";

    fsyncCmd._unlockSync.notify_one();
}
// Test fixture cleanup: drops every database, including "local", so each
// test starts from a clean storage state.
void ServiceContextMongoDTest::_dropAllDBs() {
    OperationContextImpl txn;
    // Drops everything except "local" (takes its own locks internally).
    dropAllDatabasesExceptLocal(&txn);

    ScopedTransaction transaction(&txn, MODE_X);
    Lock::GlobalWrite lk(txn.lockState());
    AutoGetDb autoDBLocal(&txn, "local", MODE_X);
    const auto localDB = autoDBLocal.getDb();
    if (localDB) {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            // Do not wrap in a WriteUnitOfWork until SERVER-17103 is addressed.
            autoDBLocal.getDb()->dropDatabase(&txn, localDB);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(&txn, "_dropAllDBs", "local");
    }
}
/**
 * Write an op to the oplog that is already built.
 * todo : make _logOpRS() call this so we don't repeat ourself?
 *
 * Inserts the pre-built entry into local.oplog.rs, updates the replica set's
 * in-memory last-optime bookkeeping, and finally publishes the new optime.
 */
void _logOpObjRS(const BSONObj& op) {
    OperationContextImpl txn;
    Lock::DBWrite lk(txn.lockState(), "local");

    const OpTime ts = op["ts"]._opTime();
    long long h = op["h"].numberLong();

    {
        // Lazily resolve and cache the local.oplog.rs collection handle.
        if ( localOplogRSCollection == 0 ) {
            Client::Context ctx(rsoplog, storageGlobalParams.dbpath);
            localDB = ctx.db();
            verify( localDB );
            localOplogRSCollection = localDB->getCollection( &txn, rsoplog );
            massert(13389,
                    "local.oplog.rs missing. did you drop it? if so restart server",
                    localOplogRSCollection);
        }
        Client::Context ctx(rsoplog, localDB);
        checkOplogInsert( localOplogRSCollection->insertDocument( &txn, op, false ) );

        /* todo: now() has code to handle clock skew.  but if the skew server to server is
                 large it will get unhappy.  this code (or code in now() maybe) should be
                 improved. */
        if( theReplSet ) {
            // A non-increasing timestamp means the stream went backwards; try
            // to resync directly from the primary to recover.
            if( !(theReplSet->lastOpTimeWritten<ts) ) {
                log() << "replication oplog stream went back in time. previous timestamp: "
                      << theReplSet->lastOpTimeWritten << " newest timestamp: " << ts
                      << ". attempting to sync directly from primary." << endl;
                std::string errmsg;
                BSONObjBuilder result;
                if (!theReplSet->forceSyncFrom(theReplSet->box.getPrimary()->fullName(),
                                               errmsg, result)) {
                    log() << "Can't sync from primary: " << errmsg << endl;
                }
            }
            theReplSet->lastOpTimeWritten = ts;
            theReplSet->lastH = h;
            ctx.getClient()->setLastOp( ts );
            // Wake the producer so secondaries see the new op promptly.
            BackgroundSync::notify();
        }
    }

    setNewOptime(ts);
}
void run() { const string dbName = "rollback_drop_collection"; const string droppedName = dbName + ".dropped"; const string rolledBackName = dbName + ".rolled_back"; OperationContextImpl txn; ScopedTransaction transaction(&txn, MODE_IX); Lock::DBLock lk(txn.lockState(), dbName, MODE_X); bool justCreated; Database* db = dbHolder().openDb(&txn, dbName, &justCreated); ASSERT(justCreated); { WriteUnitOfWork wunit(&txn); ASSERT_FALSE(db->getCollection(droppedName)); Collection* droppedColl; droppedColl = db->createCollection(&txn, droppedName); ASSERT_EQUALS(db->getCollection(droppedName), droppedColl); db->dropCollection(&txn, droppedName); wunit.commit(); } // Should have been really dropped ASSERT_FALSE(db->getCollection(droppedName)); { WriteUnitOfWork wunit(&txn); ASSERT_FALSE(db->getCollection(rolledBackName)); Collection* rolledBackColl = db->createCollection(&txn, rolledBackName); wunit.commit(); ASSERT_EQUALS(db->getCollection(rolledBackName), rolledBackColl); db->dropCollection(&txn, rolledBackName); // not committing so dropping should be rolled back } // The rolledBackCollection dropping should have been rolled back. // Original Collection pointers are no longer valid. ASSERT(db->getCollection(rolledBackName)); // The droppedCollection should not have been restored by the rollback. ASSERT_FALSE(db->getCollection(droppedName)); }
// Records a one-time startup document (host, time, command line, build info)
// in the capped local.startup_log collection, creating it on first startup.
static void logStartup() {
    BSONObjBuilder toLog;
    stringstream id;
    // _id is "<hostname>-<startup millis>", unique per process start.
    id << getHostNameCached() << "-" << jsTime().asInt64();
    toLog.append("_id", id.str());
    toLog.append("hostname", getHostNameCached());

    toLog.appendTimeT("startTime", time(0));
    toLog.append("startTimeLocal", dateToCtimeString(Date_t::now()));

    toLog.append("cmdLine", serverGlobalParams.parsedOpts);
    toLog.append("pid", ProcessId::getCurrent().asLongLong());

    BSONObjBuilder buildinfo(toLog.subobjStart("buildinfo"));
    appendBuildInfo(buildinfo);
    appendStorageEngineList(&buildinfo);
    buildinfo.doneFast();

    BSONObj o = toLog.obj();

    OperationContextImpl txn;
    ScopedTransaction transaction(&txn, MODE_X);
    Lock::GlobalWrite lk(txn.lockState());
    AutoGetOrCreateDb autoDb(&txn, "local", mongo::MODE_X);
    Database* db = autoDb.getDb();
    const std::string ns = "local.startup_log";
    Collection* collection = db->getCollection(ns);
    WriteUnitOfWork wunit(&txn);
    if (!collection) {
        // First startup on this dbpath: create a 10MB capped collection with
        // replication of the write suppressed (this is local-only data); the
        // scope guard restores the previous replication setting.
        BSONObj options = BSON("capped" << true << "size" << 10 * 1024 * 1024);
        bool shouldReplicateWrites = txn.writesAreReplicated();
        txn.setReplicatedWrites(false);
        ON_BLOCK_EXIT(&OperationContext::setReplicatedWrites, &txn, shouldReplicateWrites);
        uassertStatusOK(userCreateNS(&txn, db, ns, options));
        collection = db->getCollection(ns);
    }
    invariant(collection);
    uassertStatusOK(collection->insertDocument(&txn, o, false).getStatus());
    wunit.commit();
}
// Writes the batched ops into the local oplog inside one unit of work and
// returns the optime of the last entry written.
OpTime SyncTail::applyOpsToOplog(std::deque<BSONObj>* ops) {
    OpTime lastApplied;
    {
        OperationContextImpl txn;  // XXX?
        Lock::DBLock oplogLock(txn.lockState(), "local", MODE_X);
        WriteUnitOfWork wunit(&txn);

        // Drain the queue front-to-back; each write advances lastOpTimeApplied.
        for (; !ops->empty(); ops->pop_front()) {
            lastApplied = _logOpObjRS(&txn, ops->front());
        }

        wunit.commit();
    }

    // Wake waiters so write concern can be re-evaluated on the primary.
    BackgroundSync::get()->notify();

    return lastApplied;
}
void SyncTail::applyOpsToOplog(std::deque<BSONObj>* ops) { { OperationContextImpl txn; // XXX? Lock::DBWrite lk(txn.lockState(), "local"); while (!ops->empty()) { const BSONObj& op = ops->front(); // this updates theReplSet->lastOpTimeWritten _logOpObjRS(op); ops->pop_front(); } } if (BackgroundSync::get()->isAssumingPrimary()) { LOG(1) << "notifying BackgroundSync"; } // Update write concern on primary BackgroundSync::notify(); }
// Pre-faults the documents referenced by oplog entries v[a..b] (inclusive)
// by looking them up by _id under a global read lock, warming the cache
// before the writer thread applies the ops.
void pretouchN(vector<BSONObj>& v, unsigned a, unsigned b) {
    Client* c = currentClient.get();
    if (c == 0) {
        // Worker threads may not have a Client attached yet.
        Client::initThread("pretouchN");
        c = &cc();
    }

    OperationContextImpl txn; // XXX
    ScopedTransaction transaction(&txn, MODE_S);
    Lock::GlobalRead lk(txn.lockState());

    for (unsigned i = a; i <= b; i++) {
        const BSONObj& op = v[i];
        // Inserts carry the doc in "o"; updates carry the query in "o2".
        const char* which = "o";
        const char* opType = op.getStringField("op");
        if (*opType == 'i')
            ;
        else if (*opType == 'u')
            which = "o2";
        else
            continue;
        /* todo : other operations */

        try {
            BSONObj o = op.getObjectField(which);
            BSONElement _id;
            if (o.getObjectID(_id)) {
                const char* ns = op.getStringField("ns");
                BSONObjBuilder b;
                b.append(_id);
                BSONObj result;
                Client::Context ctx(&txn, ns);
                if (Helpers::findById(&txn, ctx.db(), ns, b.done(), result))
                    _dummy_z += result.objsize(); // touch
            }
        }
        catch (DBException& e) {
            // Best-effort warming: lookup failures are logged and ignored.
            log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' '
                  << e.toString() << endl;
        }
    }
}
void run() { for ( int i = 0; i < 10; ++i ) { client.insert( ns, BSON( "_id" << i ) ); } { // Remove _id range [_min, _max). OperationContextImpl txn; Lock::DBWrite lk(txn.lockState(), ns); Client::Context ctx( ns ); KeyRange range( ns, BSON( "_id" << _min ), BSON( "_id" << _max ), BSON( "_id" << 1 ) ); Helpers::removeRange( &txn, range ); } // Check that the expected documents remain. ASSERT_EQUALS( expected(), docs() ); }
// Thread entry point: builds the index described by _index on a background
// thread, holding the database X lock for the duration of the build.
void IndexBuilder::run() {
    Client::initThread(name().c_str());
    LOG(2) << "IndexBuilder building index " << _index;

    OperationContextImpl txn;
    txn.lockState()->setIsBatchWriter(true);

    // Runs as an internal user; bypasses normal authorization checks.
    txn.getClient()->getAuthorizationSession()->grantInternalAuthorization();

    txn.getCurOp()->reset(HostAndPort(), dbInsert);
    NamespaceString ns(_index["ns"].String());

    ScopedTransaction transaction(&txn, MODE_IX);
    Lock::DBLock dlk(txn.lockState(), ns.db(), MODE_X);
    Client::Context ctx(&txn, ns.getSystemIndexesCollection());

    Database* db = dbHolder().get(&txn, ns.db().toString());

    Status status = _build(&txn, db, true, &dlk);
    if ( !status.isOK() ) {
        // Only interruptions are tolerated; any other failure is fatal.
        error() << "IndexBuilder could not build index: " << status.toString();
        fassert(28555, ErrorCodes::isInterruption(status.code()));
    }

    txn.getClient()->shutdown();
}
void run() { const string dbName = "rollback_create_collection"; const string committedName = dbName + ".committed"; const string rolledBackName = dbName + ".rolled_back"; OperationContextImpl txn; ScopedTransaction transaction(&txn, MODE_IX); Lock::DBLock lk(txn.lockState(), dbName, MODE_X); bool justCreated; Database* db = dbHolder().openDb(&txn, dbName, &justCreated); ASSERT(justCreated); Collection* committedColl; { WriteUnitOfWork wunit(&txn); ASSERT_FALSE(db->getCollection(committedName)); committedColl = db->createCollection(&txn, committedName); ASSERT_EQUALS(db->getCollection(committedName), committedColl); wunit.commit(); } ASSERT_EQUALS(db->getCollection(committedName), committedColl); { WriteUnitOfWork wunit(&txn); ASSERT_FALSE(db->getCollection(rolledBackName)); Collection* rolledBackColl = db->createCollection(&txn, rolledBackName); ASSERT_EQUALS(db->getCollection(rolledBackName), rolledBackColl); // not committing so creation should be rolled back } // The rolledBackCollection creation should have been rolled back ASSERT_FALSE(db->getCollection(rolledBackName)); // The committedCollection should not have been affected by the rollback. Holders // of the original Collection pointer should still be valid. ASSERT_EQUALS(db->getCollection(committedName), committedColl); }
// This free function is used by the initial sync writer threads to apply each op void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); OperationContextImpl txn; txn.setReplicatedWrites(false); DisableDocumentValidation validationDisabler(&txn); // allow us to get through the magic barrier txn.lockState()->setIsBatchWriter(true); bool convertUpdatesToUpserts = false; for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { const Status s = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts); if (!s.isOK()) { if (st->shouldRetry(&txn, *it)) { const Status s2 = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts); if (!s2.isOK()) { severe() << "Error applying operation (" << it->toString() << "): " << s2; fassertFailedNoTrace(15915); } } // If shouldRetry() returns false, fall through. // This can happen if the document that was moved and missed by Cloner // subsequently got deleted and no longer exists on the Sync Target at all } } catch (const DBException& e) { severe() << "writer worker caught exception: " << causedBy(e) << " on: " << it->toString(); if (inShutdown()) { return; } fassertFailedNoTrace(16361); } } }
// Verifies that getLocsInRange reports InvalidLength when the scanned range
// exceeds the caller-supplied size cap, while still filling in the counts.
TEST(DBHelperTests, FindDiskLocsTooBig) {
    DBDirectClient client;
    OperationContextImpl txn;

    client.remove(ns, BSONObj());

    const int kNumDocs = 10;
    for (int docId = 0; docId < kNumDocs; ++docId) {
        client.insert(ns, BSON("_id" << docId));
    }

    // Deliberately tiny size cap so the scan must overflow it.
    const long long kMaxSizeBytes = 10;

    set<DiskLoc> locs;
    long long numDocsFound;
    long long estSizeBytes;
    {
        Lock::DBRead readLock(txn.lockState(), ns);
        Client::Context ctx(ns);
        KeyRange range(ns,
                       BSON("_id" << 0),
                       BSON("_id" << kNumDocs),
                       BSON("_id" << 1));

        Status result = Helpers::getLocsInRange(
            &txn, range, kMaxSizeBytes, &locs, &numDocsFound, &estSizeBytes);

        // Expect the overflow error, but doc count and (over)estimate are
        // still reported back.
        ASSERT_EQUALS(result.code(), ErrorCodes::InvalidLength);
        ASSERT_EQUALS(numDocsFound, kNumDocs);
        ASSERT_GREATER_THAN(estSizeBytes, kMaxSizeBytes);
    }
}
static void insert( const BSONObj &o, bool god = false ) { OperationContextImpl txn; Lock::DBWrite lk(txn.lockState(), ns()); Client::Context ctx(ns()); Database* db = ctx.db(); Collection* coll = db->getCollection(&txn, ns()); if (!coll) { coll = db->createCollection(&txn, ns()); } if (o.hasField("_id")) { coll->insertDocument(&txn, o, true); return; } class BSONObjBuilder b; OID id; id.init(); b.appendOID("_id", &id); b.appendElements(o); coll->insertDocument(&txn, b.obj(), true); }
// Verifies that getLocsInRange reports IndexNotFound for a key pattern with
// no backing index, and that all output parameters stay zeroed.
TEST(DBHelperTests, FindDiskLocsNoIndex) {
    DBDirectClient client;
    OperationContextImpl txn;

    client.remove(ns, BSONObj());
    client.insert(ns, BSON("_id" << OID::gen()));

    const long long kMaxSizeBytes = 1024 * 1024 * 1024;

    set<DiskLoc> locs;
    long long numDocsFound;
    long long estSizeBytes;
    {
        Lock::DBRead readLock(txn.lockState(), ns);
        Client::Context ctx(ns);

        // search invalid index range
        KeyRange range(ns,
                       BSON("badIndex" << 0),
                       BSON("badIndex" << 10),
                       BSON("badIndex" << 1));

        Status result = Helpers::getLocsInRange(
            &txn, range, kMaxSizeBytes, &locs, &numDocsFound, &estSizeBytes);

        // Make sure we get the right error code and that nothing was returned.
        ASSERT_EQUALS(result.code(), ErrorCodes::IndexNotFound);
        ASSERT_EQUALS(static_cast<long long>(locs.size()), 0);
        ASSERT_EQUALS(numDocsFound, 0);
        ASSERT_EQUALS(estSizeBytes, 0);
    }
}
// Legacy thread entry point: builds the index described by _index on a
// background thread under a write context on system.indexes.
void IndexBuilder::run() {
    LOG(2) << "IndexBuilder building index " << _index;

    OperationContextImpl txn;

    Client::initThread(name().c_str());
    // Batch-writer status lets this thread pass the parallel batch writer barrier.
    Lock::ParallelBatchWriterMode::iAmABatchParticipant(txn.lockState());

    // Runs as an internal user; bypasses normal authorization checks.
    cc().getAuthorizationSession()->grantInternalAuthorization();

    txn.getCurOp()->reset(HostAndPort(), dbInsert);
    NamespaceString ns(_index["ns"].String());
    Client::WriteContext ctx(&txn, ns.getSystemIndexesCollection());

    Database* db = dbHolder().get(&txn, ns.db().toString());

    Status status = build(&txn, db, true);
    if ( !status.isOK() ) {
        // Build failure is logged but not fatal here.
        log() << "IndexBuilder could not build index: " << status.toString();
    }

    ctx.commit();

    txn.getClient()->shutdown();
}
void run() { OperationContextImpl txn; DBDirectClient client(&txn); for ( int i = 0; i < 10; ++i ) { client.insert( ns, BSON( "_id" << i ) ); } { // Remove _id range [_min, _max). Lock::DBLock lk(txn.lockState(), nsToDatabaseSubstring(ns), MODE_X); Client::Context ctx(&txn, ns ); KeyRange range( ns, BSON( "_id" << _min ), BSON( "_id" << _max ), BSON( "_id" << 1 ) ); mongo::WriteConcernOptions dummyWriteConcern; Helpers::removeRange(&txn, range, false, dummyWriteConcern); } // Check that the expected documents remain. ASSERT_EQUALS( expected(), docs(&txn) ); }
// This free function is used by the writer threads to apply each op void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); OperationContextImpl txn; // allow us to get through the magic barrier Lock::ParallelBatchWriterMode::iAmABatchParticipant(txn.lockState()); bool convertUpdatesToUpserts = true; for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) { fassertFailedNoTrace(16359); } } catch (const DBException& e) { error() << "writer worker caught exception: " << causedBy(e) << " on: " << it->toString() << endl; fassertFailedNoTrace(16360); } } }
// One pass of the durability thread's group commit. Tries the limited-locks
// fast path first when eligible; otherwise takes (then downgrades) the
// global write lock and runs a full group commit.
static void durThreadGroupCommit() {
    OperationContextImpl txn;

    SimpleMutex::scoped_lock flk(filesLockedFsync);

    const int N = 10;
    static int n;
    if (privateMapBytes < UncommittedBytesLimit && ++n % N &&
        (storageGlobalParams.durOptions & StorageGlobalParams::DurAlwaysRemap) == 0) {
        // limited locks version doesn't do any remapprivateview at all, so only try this if
        // privateMapBytes is in an acceptable range.  also every Nth commit, we do everything
        // so we can do some remapping; remapping a lot all at once could cause jitter from a
        // large amount of copy-on-writes all at once.
        if( groupCommitWithLimitedLocks(&txn) )
            return;
    }

    // we get a write lock, downgrade, do work, upgrade, finish work.
    // getting a write lock is helpful also as we need to be greedy and not be starved here
    // note our "stopgreed" parm -- to stop greed by others while we are working. you can't
    // write anytime soon anyway if we are journaling for a while, that was the idea.
    Lock::GlobalWrite w(txn.lockState());
    w.downgrade();
    groupCommit(&txn, &w);
}
// This free function is used by the initial sync writer threads to apply each op void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); OperationContextImpl txn; // allow us to get through the magic barrier Lock::ParallelBatchWriterMode::iAmABatchParticipant(txn.lockState()); for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { if (!st->syncApply(&txn, *it)) { bool status; { Lock::GlobalWrite lk(txn.lockState()); status = st->shouldRetry(&txn, *it); } if (status) { // retry if (!st->syncApply(&txn, *it)) { fassertFailedNoTrace(15915); } } // If shouldRetry() returns false, fall through. // This can happen if the document that was moved and missed by Cloner // subsequently got deleted and no longer exists on the Sync Target at all } } catch (const DBException& e) { error() << "exception: " << causedBy(e) << " on: " << it->toString() << endl; fassertFailedNoTrace(16361); } } }
// Fixture constructor: acquires the DB write lock on the capped namespace for
// the fixture's lifetime (via the _lk member), then recreates the capped
// collection in a known-empty state.
CappedInitialSync() :
        _cappedNs("unittests.foo.bar"),
        _lk(_txn.lockState(), _cappedNs) {
    dropCapped();
    create();
}
// Runs at startup under the global X lock: repairs databases when --repair
// was requested, then opens every database, validating on-disk format and
// index definitions, and clears temp collections where appropriate.
static void repairDatabasesAndCheckVersion() {
    LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

    OperationContextImpl txn;
    ScopedTransaction transaction(&txn, MODE_X);
    Lock::GlobalWrite lk(txn.lockState());

    vector<string> dbNames;

    StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
    storageEngine->listDatabases(&dbNames);

    // Repair all databases first, so that we do not try to open them if they are in bad shape
    if (storageGlobalParams.repair) {
        for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
            const string dbName = *i;
            LOG(1) << " Repairing database: " << dbName << endl;

            fassert(18506, repairDatabase(&txn, storageEngine, dbName));
        }
    }

    const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings();

    // On replica set members we only clear temp collections on DBs other than "local" during
    // promotion to primary. On pure slaves, they are only cleared when the oplog tells them
    // to. The local DB is special because it is not replicated. See SERVER-10927 for more
    // details.
    const bool shouldClearNonLocalTmpCollections = !(checkIfReplMissingFromCommandLine(&txn)
                                                     || replSettings.usingReplSets()
                                                     || replSettings.slave == repl::SimpleSlave);

    for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
        const string dbName = *i;
        LOG(1) << " Recovering database: " << dbName << endl;

        Database* db = dbHolder().openDb(&txn, dbName);
        invariant(db);

        // First thing after opening the database is to check for file compatibility,
        // otherwise we might crash if this is a deprecated format.
        if (!db->getDatabaseCatalogEntry()->currentFilesCompatible(&txn)) {
            log() << "****";
            log() << "cannot do this upgrade without an upgrade in the middle";
            log() << "please do a --repair with 2.6 and then start this version";
            dbexit(EXIT_NEED_UPGRADE);
            return;
        }

        // Major versions match, check indexes
        const string systemIndexes = db->name() + ".system.indexes";

        Collection* coll = db->getCollection(systemIndexes);
        unique_ptr<PlanExecutor> exec(InternalPlanner::collectionScan(&txn, systemIndexes, coll));

        BSONObj index;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) {
            const BSONObj key = index.getObjectField("key");
            const string plugin = IndexNames::findPluginName(key);

            if (db->getDatabaseCatalogEntry()->isOlderThan24(&txn)) {
                // Pre-2.4 data files may reference index plugins that did not
                // exist then; warn about anything unrecognized.
                if (IndexNames::existedBefore24(plugin)) {
                    continue;
                }

                log() << "Index " << index << " claims to be of type '" << plugin << "', "
                      << "which is either invalid or did not exist before v2.4. "
                      << "See the upgrade section: "
                      << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog;
            }

            const Status keyStatus = validateKeyPattern(key);
            if (!keyStatus.isOK()) {
                log() << "Problem with index " << index << ": " << keyStatus.reason()
                      << " This index can still be used however it cannot be rebuilt."
                      << " For more info see"
                      << " http://dochub.mongodb.org/core/index-validation" << startupWarningsLog;
            }
        }

        if (PlanExecutor::IS_EOF != state) {
            warning() << "Internal error while reading collection " << systemIndexes;
        }

        if (replSettings.usingReplSets()) {
            // We only care about the _id index if we are in a replset
            checkForIdIndexes(&txn, db);
        }

        if (shouldClearNonLocalTmpCollections || dbName == "local") {
            db->clearTmpCollections(&txn);
        }
    }

    LOG(1) << "done repairDatabases" << endl;
}
// ran at startup.
// Legacy (pdfile-era) variant: walks every database checking the data file
// header version, _id indexes, and index key patterns; performs or demands
// an upgrade when the on-disk format is old.
static void repairDatabasesAndCheckVersion(bool shouldClearNonLocalTmpCollections) {
    LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

    OperationContextImpl txn;
    Lock::GlobalWrite lk(txn.lockState());

    vector< string > dbNames;
    getDatabaseNames( dbNames );
    for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
        string dbName = *i;
        LOG(1) << "\t" << dbName << endl;

        Client::Context ctx( dbName );
        DataFile *p = ctx.db()->getExtentManager()->getFile(&txn, 0);
        DataFileHeader *h = p->getHeader();

        if (repl::replSettings.usingReplSets()) {
            // we only care about the _id index if we are in a replset
            checkForIdIndexes(&txn, ctx.db());
        }

        if (shouldClearNonLocalTmpCollections || dbName == "local")
            ctx.db()->clearTmpCollections(&txn);

        if (!h->isCurrentVersion() || mongodGlobalParams.repair) {

            if( h->version <= 0 ) {
                uasserted(14026,
                          str::stream() << "db " << dbName
                                        << " appears corrupt pdfile version: " << h->version
                                        << " info: " << h->versionMinor << ' '
                                        << h->fileLength);
            }

            if ( !h->isCurrentVersion() ) {
                log() << "****" << endl;
                log() << "****" << endl;
                log() << "need to upgrade database " << dbName << " "
                      << "with pdfile version " << h->version << "." << h->versionMinor << ", "
                      << "new version: "
                      << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR_22_AND_OLDER << endl;
            }

            if (mongodGlobalParams.upgrade) {
                // QUESTION: Repair even if file format is higher version than code?
                doDBUpgrade( dbName, h );
            }
            else {
                // Old format and no --upgrade requested: refuse to start.
                log() << "\t Not upgrading, exiting" << endl;
                log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
                log() << "****" << endl;
                dbexit( EXIT_NEED_UPGRADE );
                mongodGlobalParams.upgrade = 1;
                return;
            }
        }
        else {
            // Format is current; just validate the index definitions.
            const string systemIndexes = ctx.db()->name() + ".system.indexes";
            Collection* coll = ctx.db()->getCollection( &txn, systemIndexes );
            auto_ptr<Runner> runner(InternalPlanner::collectionScan(systemIndexes,coll));
            BSONObj index;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&index, NULL))) {
                const BSONObj key = index.getObjectField("key");
                const string plugin = IndexNames::findPluginName(key);

                if (h->versionMinor == PDFILE_VERSION_MINOR_22_AND_OLDER) {
                    if (IndexNames::existedBefore24(plugin))
                        continue;

                    log() << "Index " << index << " claims to be of type '" << plugin << "', "
                          << "which is either invalid or did not exist before v2.4. "
                          << "See the upgrade section: "
                          << "http://dochub.mongodb.org/core/upgrade-2.4"
                          << startupWarningsLog;
                }

                const Status keyStatus = validateKeyPattern(key);
                if (!keyStatus.isOK()) {
                    log() << "Problem with index " << index << ": " << keyStatus.reason()
                          << " This index can still be used however it cannot be rebuilt."
                          << " For more info see"
                          << " http://dochub.mongodb.org/core/index-validation"
                          << startupWarningsLog;
                }
            }

            if (Runner::RUNNER_EOF != state) {
                warning() << "Internal error while reading collection " << systemIndexes;
            }

            Database::closeDatabase(&txn, dbName.c_str(), storageGlobalParams.dbpath);
        }
    }

    LOG(1) << "done repairDatabases" << endl;

    if (mongodGlobalParams.upgrade) {
        // --upgrade only checks/upgrades; it does not continue into normal startup.
        log() << "finished checking dbs" << endl;
        cc().shutdown();
        dbexit( EXIT_CLEAN );
    }
}
/**
 * The main durability thread loop. There is a single instance of this function running.
 *
 * Each iteration waits for a flush trigger (or one commit interval), snapshots
 * the pending write intents under the flush lock, hands them to the journal
 * writer, and periodically remaps the private view to bound its memory use.
 */
static void durThread() {
    Client::initThread("durability");

    log() << "Durability thread started";

    bool samePartition = true;
    try {
        const std::string dbpathDir = boost::filesystem::path(storageGlobalParams.dbpath).string();
        samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
    }
    catch (...) {
        // Cannot tell; keep the conservative same-partition default.
    }

    // Spawn the journal writer thread
    JournalWriter journalWriter(&commitNotify, &applyToDataFilesNotify, NumAsyncJournalWrites);
    journalWriter.start();

    // Used as an estimate of how much / how fast to remap
    uint64_t commitCounter(0);
    uint64_t estimatedPrivateMapSize(0);
    uint64_t remapLastTimestamp(0);

    while (shutdownRequested.loadRelaxed() == 0) {
        unsigned ms = storageGlobalParams.journalCommitIntervalMs;
        if (ms == 0) {
            ms = samePartition ? 100 : 30;
        }

        // +1 so it never goes down to zero
        const unsigned oneThird = (ms / 3) + 1;

        // Reset the stats based on the reset interval
        if (stats.curr()->getCurrentDurationMillis() > DurStatsResetIntervalMillis) {
            stats.reset();
        }

        try {
            stdx::unique_lock<stdx::mutex> lock(flushMutex);

            // Poll up to three times (one third of the interval each) for an
            // early-commit trigger before committing on schedule.
            for (unsigned i = 0; i <= 2; i++) {
                if (stdx::cv_status::no_timeout ==
                        flushRequested.wait_for(lock, Milliseconds(oneThird))) {
                    // Someone forced a flush
                    break;
                }

                if (commitNotify.nWaiting()) {
                    // One or more getLastError j:true is pending
                    break;
                }

                if (commitJob.bytes() > UncommittedBytesLimit / 2) {
                    // The number of written bytes is growing
                    break;
                }
            }

            // The commit logic itself
            LOG(4) << "groupCommit begin";

            Timer t;

            OperationContextImpl txn;
            AutoAcquireFlushLockForMMAPV1Commit autoFlushLock(txn.lockState());

            // We need to snapshot the commitNumber after the flush lock has been obtained,
            // because at this point we know that we have a stable snapshot of the data.
            const NotifyAll::When commitNumber(commitNotify.now());

            LOG(4) << "Processing commit number " << commitNumber;

            if (!commitJob.hasWritten()) {
                // We do not need the journal lock anymore. Free it here, for the really
                // unlikely possibility that the writeBuffer command below blocks.
                autoFlushLock.release();

                // getlasterror request could have came after the data was already committed.
                // No need to call committingReset though, because we have not done any
                // writes (hasWritten == false).
                JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
                buffer->setNoop();

                journalWriter.writeBuffer(buffer, commitNumber);
            }
            else {
                // This copies all the in-memory changes into the journal writer's buffer.
                JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
                PREPLOGBUFFER(buffer->getHeader(), buffer->getBuilder());

                estimatedPrivateMapSize += commitJob.bytes();
                commitCounter++;

                // Now that the write intents have been copied to the buffer, the commit job is
                // free to be reused. We need to reset the commit job's contents while under
                // the S flush lock, because otherwise someone might have done a write and this
                // would wipe out their changes without ever being committed.
                commitJob.committingReset();

                double systemMemoryPressurePercentage =
                    ProcessInfo::getSystemMemoryPressurePercentage();

                // Now that the in-memory modifications have been collected, we can potentially
                // release the flush lock if remap is not necessary.
                // When we remap due to memory pressure, we look at two criteria
                // 1. If the amount of 4k pages touched exceeds 512 MB,
                //    a reasonable estimate of memory pressure on Linux.
                // 2. Check if the amount of free memory on the machine is running low,
                //    since #1 is underestimates the memory pressure on Windows since
                //    commits in 64MB chunks.
                const bool shouldRemap = (estimatedPrivateMapSize >= UncommittedBytesLimit) ||
                    (systemMemoryPressurePercentage > 0.0) ||
                    (commitCounter % NumCommitsBeforeRemap == 0) ||
                    (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap);

                double remapFraction = 0.0;

                if (shouldRemap) {
                    // We want to remap all private views about every 2 seconds. There could be
                    // ~1000 views so we do a little each pass. There will be copy on write
                    // faults after remapping, so doing a little bit at a time will avoid big
                    // load spikes when the pages are touched.
                    //
                    // TODO: Instead of the time-based logic above, consider using ProcessInfo
                    //       and watching for getResidentSize to drop, which is more precise.
                    remapFraction = (curTimeMicros64() - remapLastTimestamp) / 2000000.0;

                    if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap) {
                        remapFraction = 1;
                    }
                    else {
                        // We don't want to get close to the UncommittedBytesLimit
                        const double remapMemFraction =
                            estimatedPrivateMapSize / ((double)UncommittedBytesLimit);

                        remapFraction = std::max(remapMemFraction, remapFraction);
                        remapFraction = std::max(systemMemoryPressurePercentage, remapFraction);
                    }
                }
                else {
                    LOG(4) << "Early release flush lock";

                    // We will not be doing a remap so drop the flush lock. That way we will be
                    // doing the journal I/O outside of lock, so other threads can proceed.
                    invariant(!shouldRemap);
                    autoFlushLock.release();
                }

                // Request async I/O to the journal. This may block.
                journalWriter.writeBuffer(buffer, commitNumber);

                // Data has now been written to the shared view. If remap was requested, we
                // would still be holding the S flush lock here, so just upgrade it and
                // perform the remap.
                if (shouldRemap) {
                    // Need to wait for the previously scheduled journal writes to complete
                    // before any remap is attempted.
                    journalWriter.flush();
                    journalWriter.assertIdle();

                    // Upgrading the journal lock to flush stops all activity on the system,
                    // because we will be remapping memory and we don't want readers to be
                    // accessing it. Technically this step could be avoided on systems, which
                    // support atomic remap.
                    autoFlushLock.upgradeFlushLockToExclusive();
                    remapPrivateView(remapFraction);

                    autoFlushLock.release();

                    // Reset the private map estimate outside of the lock
                    estimatedPrivateMapSize = 0;
                    remapLastTimestamp = curTimeMicros64();

                    stats.curr()->_commitsInWriteLock++;
                    stats.curr()->_commitsInWriteLockMicros += t.micros();
                }
            }

            stats.curr()->_commits++;
            stats.curr()->_commitsMicros += t.micros();

            LOG(4) << "groupCommit end";
        }
        catch (DBException& e) {
            severe() << "dbexception in durThread causing immediate shutdown: " << e.toString();
            invariant(false);
        }
        catch (std::ios_base::failure& e) {
            severe() << "ios_base exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        }
        catch (std::bad_alloc& e) {
            severe() << "bad_alloc exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        }
        catch (std::exception& e) {
            severe() << "exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        }
        catch (...) {
            severe() << "unhandled exception in durThread causing immediate shutdown";
            invariant(false);
        }
    }

    // Stops the journal thread and ensures everything was written
    invariant(!commitJob.hasWritten());

    journalWriter.flush();
    journalWriter.shutdown();

    log() << "Durability thread stopped";
}
// Background MMAPv1 durability ("journal") thread. Loops until shutdownRequested
// is set, sleeping up to the configured journal commit interval (woken early by
// flush requests or pending j:true waiters), then performs a group commit and
// remaps the private view under the flush lock. Any escaping exception aborts
// the server, since a failed journal pass means durability can no longer be
// guaranteed.
static void durThread() {
    Client::initThread("journal");

    bool samePartition = true;
    try {
        const std::string dbpathDir = boost::filesystem::path(storageGlobalParams.dbpath).string();
        // Used below to pick the default commit interval; journal I/O on the
        // same partition as the data files gets a longer default interval.
        samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
    }
    catch(...) {
        // Best-effort probe only: if it fails we keep the default (true).
    }

    while (shutdownRequested.loadRelaxed() == 0) {
        unsigned ms = storageGlobalParams.journalCommitInterval;
        if( ms == 0 ) {
            // No interval configured: 100ms when journal shares the data
            // partition, 30ms otherwise.
            ms = samePartition ? 100 : 30;
        }

        // The wait below is done in three slices so pending-commit conditions
        // are re-checked part-way through the interval.
        unsigned oneThird = (ms / 3) + 1; // +1 so never zero

        try {
            stats.rotate();

            boost::mutex::scoped_lock lock(flushMutex);

            // commit sooner if one or more getLastError j:true is pending
            for (unsigned i = 0; i <= 2; i++) {
                if (flushRequested.timed_wait(lock, Milliseconds(oneThird))) {
                    // Someone forced a flush
                    break;
                }

                // Waiters blocked on journal acknowledgement, or a large
                // backlog of uncommitted bytes, also end the wait early.
                if (commitJob._notify.nWaiting())
                    break;
                if (commitJob.bytes() > UncommittedBytesLimit / 2)
                    break;
            }

            OperationContextImpl txn;

            // Waits for all active operations to drain and won't let new ones start. This
            // should be optimized to allow readers in (see SERVER-15262).
            AutoAcquireFlushLockForMMAPV1Commit flushLock(txn.lockState());

            groupCommit();
            remapPrivateView();
        }
        catch(std::exception& e) {
            log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
            mongoAbort("exception in durThread");
        }
        catch (...) {
            log() << "unhandled exception in durThread causing immediate shutdown" << endl;
            mongoAbort("unhandled exception in durThread");
        }
    }

    cc().shutdown();
}
void createOplog() { OperationContextImpl txn; Lock::GlobalWrite lk(txn.lockState()); const char * ns = "local.oplog.$main"; bool rs = !replSettings.replSet.empty(); if( rs ) ns = rsoplog; Client::Context ctx(ns); Collection* collection = ctx.db()->getCollection( &txn, ns ); if ( collection ) { if (replSettings.oplogSize != 0) { int o = (int)(collection->getRecordStore()->storageSize() / ( 1024 * 1024 ) ); int n = (int)(replSettings.oplogSize / (1024 * 1024)); if ( n != o ) { stringstream ss; ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog"; log() << ss.str() << endl; throw UserException( 13257 , ss.str() ); } } if( rs ) return; initOpTimeFromOplog(&txn, ns); return; } /* create an oplog collection, if it doesn't yet exist. */ long long sz = 0; if ( replSettings.oplogSize != 0 ) { sz = replSettings.oplogSize; } else { /* not specified. pick a default size */ sz = 50LL * 1024LL * 1024LL; if ( sizeof(int *) >= 8 ) { #if defined(__APPLE__) // typically these are desktops (dev machines), so keep it smallish sz = (256-64) * 1024 * 1024; #else sz = 990LL * 1024 * 1024; double free = File::freeSpace(storageGlobalParams.dbpath); //-1 if call not supported. long long fivePct = static_cast<long long>( free * 0.05 ); if ( fivePct > sz ) sz = fivePct; // we use 5% of free space up to 50GB (1TB free) static long long upperBound = 50LL * 1024 * 1024 * 1024; if (fivePct > upperBound) sz = upperBound; #endif } } log() << "******" << endl; log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." << endl; CollectionOptions options; options.capped = true; options.cappedSize = sz; options.autoIndexId = CollectionOptions::NO; invariant( ctx.db()->createCollection( &txn, ns, options ) ); if( !rs ) logOp( &txn, "n", "", BSONObj() ); /* sync here so we don't get any surprising lag later when we try to sync */ MemoryMappedFile::flushAll(true); log() << "******" << endl; }
/* tail an oplog. ok to return, will be re-called. */
// Main secondary apply loop: repeatedly gathers a batch of oplog entries
// (bounded by time, op count, and byte size), honors slaveDelay, records
// minValid before applying so a crash mid-batch leaves the node in
// RECOVERING, applies the batch, and then writes the batch to this node's
// own oplog. Returns (to be re-called) on resync request, on single-node
// primary promotion, or when a test run exhausts the background queue.
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    while(!inShutdown()) {
        OpQueue ops;
        OperationContextImpl txn;

        Timer batchTimer;
        int lastTimeChecked = 0;

        // Batch-gathering loop: keep popping ops until a limit is hit or a
        // state change requires ending the batch.
        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    // Drop our oplog inside one unit of work; initial sync will
                    // rebuild it after this function returns.
                    Lock::DBLock lk(txn.lockState(), "local", MODE_X);
                    WriteUnitOfWork wunit(&txn);
                    Client::Context ctx(&txn, "local");
                    ctx.db()->dropCollection(&txn, "local.oplog.rs");

                    // Note: the following order is important.
                    // The bgsync thread uses an empty optime as a sentinel to know to wait
                    // for initial sync (done in this thread after we return); thus, we must
                    // ensure the lastAppliedOptime is empty before pausing the bgsync thread
                    // via stop().
                    // We must clear the sync source blacklist after calling stop()
                    // because the bgsync thread, while running, may update the blacklist.
                    replCoord->setMyLastOptime(&txn, OpTime());
                    bgsync->stop();
                    replCoord->clearSyncSourceBlacklist();

                    wunit.commit();

                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);

                // TODO(emilkie): This can be removed once we switch over from legacy;
                // this code is what moves 1-node sets to PRIMARY state.
                // normally msgCheckNewState gets called periodically, but in a single node
                // replset there are no heartbeat threads, so we do it here to be sure.
                // This is relevant if the singleton member has done a stepDown() and needs
                // to come back up.
                if (theReplSet &&
                    theReplSet->config().members.size() == 1 &&
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null? During replsettest'ing, in which case we should
                    // fall through and actually apply ops as if we were a real secondary.
                    if (mgr) {
                        mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                        sleepsecs(1);
                        // There should never be ops to sync in a 1-member set, anyway
                        return;
                    }
                }
            }

            const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&ops, replCoord) && // tryPopAndWaitForMore returns true
                                                           // when we need to end a batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj& lastOp = ops.getDeque().back();
        // Sleeps as needed so the batch is not applied before slaveDelay allows.
        handleSlaveDelay(lastOp);

        // Applying ops while primary would fork history; fassert rather than
        // corrupt the set. (Draining mode is the one sanctioned exception.)
        if (replCoord->getCurrentMemberState().primary() &&
            !replCoord->isWaitingForApplierToDrain()) {
            severe() << "attempting to replicate ops while primary";
            fassertFailed(28527);
        }

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        OpTime minValid = lastOp["ts"]._opTime();
        setMinValid(&txn, minValid);

        // Apply the batch first, then record it in our own oplog.
        multiApply(ops.getDeque());

        applyOpsToOplog(&ops.getDeque());

        // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
        // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication
        // coordinator
        if (theReplSet && !theReplSet->mgr) {
            BSONObj op;
            if (!peek(&op)) {
                return;
            }
        }
    }
}