// This free function is used by the initial sync writer threads to apply each op void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { OperationContextImpl txn; if (!st->syncApply(&txn, *it)) { bool status; { Lock::GlobalWrite lk(txn.lockState()); status = st->shouldRetry(&txn, *it); } if (status) { // retry if (!st->syncApply(&txn, *it)) { fassertFailedNoTrace(15915); } } // If shouldRetry() returns false, fall through. // This can happen if the document that was moved and missed by Cloner // subsequently got deleted and no longer exists on the Sync Target at all } } catch (const DBException& e) { error() << "exception: " << causedBy(e) << " on: " << it->toString() << endl; fassertFailedNoTrace(16361); } } }
// This free function is used by the writer threads to apply each op void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); OperationContextImpl txn; txn.setReplicatedWrites(false); DisableDocumentValidation validationDisabler(&txn); // allow us to get through the magic barrier txn.lockState()->setIsBatchWriter(true); bool convertUpdatesToUpserts = true; for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { const Status s = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts); if (!s.isOK()) { severe() << "Error applying operation (" << it->toString() << "): " << s; fassertFailedNoTrace(16359); } } catch (const DBException& e) { severe() << "writer worker caught exception: " << causedBy(e) << " on: " << it->toString(); if (inShutdown()) { return; } fassertFailedNoTrace(16360); } } }
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx, Timestamp oplogApplicationStartPoint, Timestamp topOfOplog) { invariant(!oplogApplicationStartPoint.isNull()); invariant(!topOfOplog.isNull()); // Check if we have any unapplied ops in our oplog. It is important that this is done after // deleting the ragged end of the oplog. if (oplogApplicationStartPoint == topOfOplog) { log() << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog."; return; // We've applied all the valid oplog we have. } else if (oplogApplicationStartPoint > topOfOplog) { severe() << "Applied op " << oplogApplicationStartPoint.toBSON() << " not found. Top of oplog is " << topOfOplog.toBSON() << '.'; fassertFailedNoTrace(40313); } log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON() << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive)."; DBDirectClient db(opCtx); auto cursor = db.query(NamespaceString::kRsOplogNamespace.ns(), QUERY("ts" << BSON("$gte" << oplogApplicationStartPoint)), /*batchSize*/ 0, /*skip*/ 0, /*projection*/ nullptr, QueryOption_OplogReplay); // Check that the first document matches our appliedThrough point then skip it since it's // already been applied. if (!cursor->more()) { // This should really be impossible because we check above that the top of the oplog is // strictly > appliedThrough. If this fails it represents a serious bug in either the // storage engine or query's implementation of OplogReplay. severe() << "Couldn't find any entries in the oplog >= " << oplogApplicationStartPoint.toBSON() << " which should be impossible."; fassertFailedNoTrace(40293); } auto firstTimestampFound = fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe())).getTimestamp(); if (firstTimestampFound != oplogApplicationStartPoint) { severe() << "Oplog entry at " << oplogApplicationStartPoint.toBSON() << " is missing; actual entry found is " << firstTimestampFound.toBSON(); fassertFailedNoTrace(40292); } // Apply remaining ops one at at time, but don't log them because they are already logged. UnreplicatedWritesBlock uwb(opCtx); while (cursor->more()) { auto entry = cursor->nextSafe(); fassertStatusOK(40294, SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering)); _consistencyMarkers->setAppliedThrough( opCtx, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry))); } }
/* applies oplog from "now" until endOpTime using the applier threads for initial sync*/ void SyncTail::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) { unsigned long long bytesApplied = 0; unsigned long long entriesApplied = 0; while (true) { OpQueue ops; OperationContextImpl ctx; while (!tryPopAndWaitForMore(&ops, getGlobalReplicationCoordinator())) { // nothing came back last time, so go again if (ops.empty()) continue; // Check if we reached the end const BSONObj currentOp = ops.back(); const OpTime currentOpTime = currentOp["ts"]._opTime(); // When we reach the end return this batch if (currentOpTime == endOpTime) { break; } else if (currentOpTime > endOpTime) { severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime << " without seeing it. Rollback?" << rsLog; fassertFailedNoTrace(18693); } // apply replication batch limits if (ops.getSize() > replBatchLimitBytes) break; if (ops.getDeque().size() > replBatchLimitOperations) break; }; if (ops.empty()) { severe() << "got no ops for batch..."; fassertFailedNoTrace(18692); } const BSONObj lastOp = ops.back().getOwned(); // Tally operation information bytesApplied += ops.getSize(); entriesApplied += ops.getDeque().size(); multiApply(ops.getDeque()); OpTime lastOpTime = applyOpsToOplog(&ops.getDeque()); // if the last op applied was our end, return if (lastOpTime == endOpTime) { LOG(1) << "SyncTail applied " << entriesApplied << " entries (" << bytesApplied << " bytes)" << " and finished at opTime " << endOpTime.toStringPretty(); return; } } // end of while (true) }
void restartInProgressIndexesFromLastShutdown(OperationContext* txn) { txn->getClient()->getAuthorizationSession()->grantInternalAuthorization(); std::vector<std::string> dbNames; StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); storageEngine->listDatabases( &dbNames ); try { std::list<std::string> collNames; for (std::vector<std::string>::const_iterator dbName = dbNames.begin(); dbName < dbNames.end(); ++dbName) { ScopedTransaction scopedXact(txn, MODE_IS); AutoGetDb autoDb(txn, *dbName, MODE_S); Database* db = autoDb.getDb(); db->getDatabaseCatalogEntry()->getCollectionNamespaces(&collNames); } checkNS(txn, collNames); } catch (const DBException& e) { error() << "Index verification did not complete: " << e.toString(); fassertFailedNoTrace(18643); } LOG(1) << "checking complete" << endl; }
void ServiceContextMongoD::createLockFile() { try { _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath)); } catch (const std::exception& ex) { uassert(28596, str::stream() << "Unable to determine status of lock file in the data directory " << storageGlobalParams.dbpath << ": " << ex.what(), false); } bool wasUnclean = _lockFile->createdByUncleanShutdown(); auto openStatus = _lockFile->open(); if (storageGlobalParams.readOnly && openStatus == ErrorCodes::IllegalOperation) { _lockFile.reset(); } else { uassertStatusOK(openStatus); } if (wasUnclean) { if (storageGlobalParams.readOnly) { severe() << "Attempted to open dbpath in readOnly mode, but the server was " "previously not shut down cleanly."; fassertFailedNoTrace(34416); } warning() << "Detected unclean shutdown - " << _lockFile->getFilespec() << " is not empty."; } }
BSONObj SyncTail::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) { OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query? const char* ns = o.getStringField("ns"); // capped collections Collection* collection = db->getCollection(ns); if (collection && collection->isCapped()) { log() << "missing doc, but this is okay for a capped collection (" << ns << ")"; return BSONObj(); } const int retryMax = 3; for (int retryCount = 1; retryCount <= retryMax; ++retryCount) { if (retryCount != 1) { // if we are retrying, sleep a bit to let the network possibly recover sleepsecs(retryCount * retryCount); } try { bool ok = missingObjReader.connect(HostAndPort(_hostname)); if (!ok) { warning() << "network problem detected while connecting to the " << "sync source, attempt " << retryCount << " of " << retryMax << endl; continue; // try again } } catch (const SocketException&) { warning() << "network problem detected while connecting to the " << "sync source, attempt " << retryCount << " of " << retryMax << endl; continue; // try again } // get _id from oplog entry to create query to fetch document. const BSONElement opElem = o.getField("op"); const bool isUpdate = !opElem.eoo() && opElem.str() == "u"; const BSONElement idElem = o.getObjectField(isUpdate ? "o2" : "o")["_id"]; if (idElem.eoo()) { severe() << "cannot fetch missing document without _id field: " << o.toString(); fassertFailedNoTrace(28742); } BSONObj query = BSONObjBuilder().append(idElem).obj(); BSONObj missingObj; try { missingObj = missingObjReader.findOne(ns, query); } catch (const SocketException&) { warning() << "network problem detected while fetching a missing document from the " << "sync source, attempt " << retryCount << " of " << retryMax << endl; continue; // try again } catch (DBException& e) { error() << "assertion fetching missing object: " << e.what() << endl; throw; } // success! return missingObj; } // retry count exceeded msgasserted(15916, str::stream() << "Can no longer connect to initial sync source: " << _hostname); }
void ReplSet::haveNewConfig(OperationContext* txn, ReplSetConfig& newConfig, bool addComment) { bo comment; if( addComment ) comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version ); newConfig.saveConfigLocally(txn, comment); try { BSONObj oldConfForAudit = config().asBson(); BSONObj newConfForAudit = newConfig.asBson(); audit::logReplSetReconfig(ClientBasic::getCurrent(), &oldConfForAudit, &newConfForAudit); if (initFromConfig(txn, newConfig, true)) { log() << "replSet replSetReconfig new config saved locally" << rsLog; } } catch (const DBException& e) { log() << "replSet error unexpected exception in haveNewConfig() : " << e.toString() << rsLog; fassertFailedNoTrace(18755); } catch (...) { std::terminate(); } }
void restartInProgressIndexesFromLastShutdown() { OperationContextImpl txn; cc().getAuthorizationSession()->grantInternalAuthorization(); std::vector<std::string> dbNames; StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); storageEngine->listDatabases( &dbNames ); try { std::list<std::string> collNames; for (std::vector<std::string>::const_iterator dbName = dbNames.begin(); dbName < dbNames.end(); ++dbName) { Client::ReadContext ctx(&txn, *dbName); Database* db = ctx.ctx().db(); db->getDatabaseCatalogEntry()->getCollectionNamespaces(&collNames); } checkNS(&txn, collNames); } catch (const DBException& e) { error() << "Index rebuilding did not complete: " << e.toString(); log() << "note: restart the server with --noIndexBuildRetry to skip index rebuilds"; fassertFailedNoTrace(18643); } LOG(1) << "checking complete" << endl; }
// Copies ops out of the bgsync queue into the deque passed in as a parameter. // Returns true if the batch should be ended early. // Batch should end early if we encounter a command, or if // there are no further ops in the bgsync queue to read. // This function also blocks 1 second waiting for new ops to appear in the bgsync // queue. We can't block forever because there are maintenance things we need // to periodically check in the loop. bool SyncTail::tryPopAndWaitForMore(SyncTail::OpQueue* ops, ReplicationCoordinator* replCoord) { BSONObj op; // Check to see if there are ops waiting in the bgsync queue bool peek_success = peek(&op); if (!peek_success) { // if we don't have anything in the queue, wait a bit for something to appear if (ops->empty()) { replCoord->signalDrainComplete(); // block up to 1 second _networkQueue->waitForMore(); return false; } // otherwise, apply what we have return true; } const char* ns = op["ns"].valuestrsafe(); // check for commands if ((op["op"].valuestrsafe()[0] == 'c') || // Index builds are acheived through the use of an insert op, not a command op. // The following line is the same as what the insert code uses to detect an index build. ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) { if (ops->empty()) { // apply commands one-at-a-time ops->push_back(op); _networkQueue->consume(); } // otherwise, apply what we have so far and come back for the command return true; } // check for oplog version change BSONElement elemVersion = op["v"]; int curVersion = 0; if (elemVersion.eoo()) // missing version means version 1 curVersion = 1; else curVersion = elemVersion.Int(); if (curVersion != OPLOG_VERSION) { severe() << "expected oplog version " << OPLOG_VERSION << " but found version " << curVersion << " in oplog entry: " << op; fassertFailedNoTrace(18820); } // Copy the op to the deque and remove it from the bgsync queue. ops->push_back(op); _networkQueue->consume(); // Go back for more ops return false; }
// Copies ops out of the bgsync queue into the deque passed in as a parameter. // Returns true if the batch should be ended early. // Batch should end early if we encounter a command, or if // there are no further ops in the bgsync queue to read. // This function also blocks 1 second waiting for new ops to appear in the bgsync // queue. We can't block forever because there are maintenance things we need // to periodically check in the loop. bool SyncTail::tryPopAndWaitForMore(OperationContext* txn, SyncTail::OpQueue* ops) { BSONObj op; // Check to see if there are ops waiting in the bgsync queue bool peek_success = peek(&op); if (!peek_success) { // if we don't have anything in the queue, wait a bit for something to appear if (ops->empty()) { // block up to 1 second _networkQueue->waitForMore(); return false; } // otherwise, apply what we have return true; } auto entry = OplogEntry(op); // Check for ops that must be processed one at a time. if (entry.raw.isEmpty() || // sentinel that network queue is drained. (entry.opType[0] == 'c') || // commands. // Index builds are acheived through the use of an insert op, not a command op. // The following line is the same as what the insert code uses to detect an index build. (!entry.ns.empty() && nsToCollectionSubstring(entry.ns) == "system.indexes")) { if (ops->empty()) { // apply commands one-at-a-time ops->push_back(std::move(entry)); _networkQueue->consume(); } // otherwise, apply what we have so far and come back for the command return true; } // check for oplog version change int curVersion = 0; if (entry.version.eoo()) // missing version means version 1 curVersion = 1; else curVersion = entry.version.Int(); if (curVersion != OPLOG_VERSION) { severe() << "expected oplog version " << OPLOG_VERSION << " but found version " << curVersion << " in oplog entry: " << op; fassertFailedNoTrace(18820); } // Copy the op to the deque and remove it from the bgsync queue. ops->push_back(std::move(entry)); _networkQueue->consume(); // Go back for more ops return false; }
void syncDoInitialSync(ReplicationCoordinatorExternalState* replicationCoordinatorExternalState) { stdx::unique_lock<stdx::mutex> lk(_initialSyncMutex, stdx::defer_lock); if (!lk.try_lock()) { uasserted(34474, "Initial Sync Already Active."); } std::unique_ptr<BackgroundSync> bgsync; { log() << "Starting replication fetcher thread for initial sync"; auto txn = cc().makeOperationContext(); bgsync = stdx::make_unique<BackgroundSync>( replicationCoordinatorExternalState, replicationCoordinatorExternalState->makeInitialSyncOplogBuffer(txn.get())); bgsync->startup(txn.get()); createOplog(txn.get()); } ON_BLOCK_EXIT([&bgsync]() { log() << "Stopping replication fetcher thread for initial sync"; auto txn = cc().makeOperationContext(); bgsync->shutdown(txn.get()); bgsync->join(txn.get()); }); int failedAttempts = 0; while (failedAttempts < kMaxFailedAttempts) { try { // leave loop when successful Status status = _initialSync(bgsync.get()); if (status.isOK()) { break; } else { error() << status; } } catch (const DBException& e) { error() << e; // Return if in shutdown if (inShutdown()) { return; } } if (inShutdown()) { return; } error() << "initial sync attempt failed, " << (kMaxFailedAttempts - ++failedAttempts) << " attempts remaining"; sleepmillis(durationCount<Milliseconds>(kInitialSyncRetrySleepDuration)); } // No need to print a stack if (failedAttempts >= kMaxFailedAttempts) { severe() << "The maximum number of retries have been exhausted for initial sync."; fassertFailedNoTrace(16233); } }
void BackgroundSync::_rollback(OperationContext* txn, const HostAndPort& source, stdx::function<DBClientBase*()> getConnection) { // Abort only when syncRollback detects we are in a unrecoverable state. // In other cases, we log the message contained in the error status and retry later. auto status = syncRollback(txn, OplogInterfaceLocal(txn, rsOplogName), RollbackSourceImpl(getConnection, source, rsOplogName), _replCoord); if (status.isOK()) { // When the syncTail thread sees there is no new data by adding something to the buffer. _signalNoNewDataForApplier(txn); // Wait until the buffer is empty. // This is an indication that syncTail has removed the sentinal marker from the buffer // and reset its local lastAppliedOpTime via the replCoord. while (!_oplogBuffer->isEmpty()) { sleepmillis(10); if (inShutdown()) { return; } } // At this point we are about to leave rollback. Before we do, wait for any writes done // as part of rollback to be durable, and then do any necessary checks that we didn't // wind up rolling back something illegal. We must wait for the rollback to be durable // so that if we wind up shutting down uncleanly in response to something we rolled back // we know that we won't wind up right back in the same situation when we start back up // because the rollback wasn't durable. txn->recoveryUnit()->waitUntilDurable(); // If we detected that we rolled back the shardIdentity document as part of this rollback // then we must shut down to clear the in-memory ShardingState associated with the // shardIdentity document. if (ShardIdentityRollbackNotifier::get(txn)->didRollbackHappen()) { severe() << "shardIdentity document rollback detected. Shutting down to clear " "in-memory sharding state. Restarting this process should safely return it " "to a healthy state"; fassertFailedNoTrace(40276); } // It is now safe to clear the ROLLBACK state, which may result in the applier thread // transitioning to SECONDARY. This is safe because the applier thread has now reloaded // the new rollback minValid from the database. if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK) << " but found self in " << _replCoord->getMemberState(); } return; } if (ErrorCodes::UnrecoverableRollbackError == status.code()) { fassertNoTrace(28723, status); } warning() << "rollback cannot proceed at this time (retrying later): " << redact(status); }
/** * Check that the oplog is capped, and abort the process if it is not. * Caller must lock DB before calling this function. */ void checkForCappedOplog(OperationContext* txn, Database* db) { const NamespaceString oplogNss(repl::rsOplogName); invariant(txn->lockState()->isDbLockedForMode(oplogNss.db(), MODE_IS)); Collection* oplogCollection = db->getCollection(oplogNss); if (oplogCollection && !oplogCollection->isCapped()) { severe() << "The oplog collection " << oplogNss << " is not capped; a capped oplog is a requirement for replication to function."; fassertFailedNoTrace(40115); } }
void markThreadIdle() { if (!threadIdleCallback) { return; } try { threadIdleCallback(); } catch (...) { severe() << "Exception escaped from threadIdleCallback"; fassertFailedNoTrace(28603); } }
// This free function is used by the initial sync writer threads to apply each op void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); OperationContextImpl txn; txn.setReplicatedWrites(false); DisableDocumentValidation validationDisabler(&txn); // allow us to get through the magic barrier txn.lockState()->setIsBatchWriter(true); bool convertUpdatesToUpserts = false; for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { const Status s = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts); if (!s.isOK()) { if (st->shouldRetry(&txn, *it)) { const Status s2 = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts); if (!s2.isOK()) { severe() << "Error applying operation (" << it->toString() << "): " << s2; fassertFailedNoTrace(15915); } } // If shouldRetry() returns false, fall through. // This can happen if the document that was moved and missed by Cloner // subsequently got deleted and no longer exists on the Sync Target at all } } catch (const DBException& e) { severe() << "writer worker caught exception: " << causedBy(e) << " on: " << it->toString(); if (inShutdown()) { return; } fassertFailedNoTrace(16361); } } }
// This free function is used by the writer threads to apply each op void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); // convert update operations only for 2.2.1 or greater, because we need guaranteed // idempotent operations for this to work. See SERVER-6825 bool convertUpdatesToUpserts = theReplSet->oplogVersion > 1 ? true : false; for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { OperationContextImpl txn; if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) { fassertFailedNoTrace(16359); } } catch (const DBException& e) { error() << "writer worker caught exception: " << causedBy(e) << " on: " << it->toString() << endl; fassertFailedNoTrace(16360); } } }
void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx, Timestamp truncateTimestamp) { Timer timer; const NamespaceString oplogNss(NamespaceString::kRsOplogNamespace); AutoGetDb autoDb(opCtx, oplogNss.db(), MODE_IX); Lock::CollectionLock oplogCollectionLoc(opCtx->lockState(), oplogNss.ns(), MODE_X); Collection* oplogCollection = autoDb.getDb()->getCollection(opCtx, oplogNss); if (!oplogCollection) { fassertFailedWithStatusNoTrace( 34418, Status(ErrorCodes::NamespaceNotFound, str::stream() << "Can't find " << NamespaceString::kRsOplogNamespace.ns())); } // Scan through oplog in reverse, from latest entry to first, to find the truncateTimestamp. RecordId oldestIDToDelete; // Non-null if there is something to delete. auto oplogRs = oplogCollection->getRecordStore(); auto oplogReverseCursor = oplogRs->getCursor(opCtx, /*forward=*/false); size_t count = 0; while (auto next = oplogReverseCursor->next()) { const BSONObj entry = next->data.releaseToBson(); const RecordId id = next->id; count++; const auto tsElem = entry["ts"]; if (count == 1) { if (tsElem.eoo()) LOG(2) << "Oplog tail entry: " << redact(entry); else LOG(2) << "Oplog tail entry ts field: " << tsElem; } if (tsElem.timestamp() < truncateTimestamp) { // If count == 1, that means that we have nothing to delete because everything in the // oplog is < truncateTimestamp. if (count != 1) { invariant(!oldestIDToDelete.isNull()); oplogCollection->cappedTruncateAfter(opCtx, oldestIDToDelete, /*inclusive=*/true); } log() << "Replication recovery oplog truncation finished in: " << timer.millis() << "ms"; return; } oldestIDToDelete = id; } severe() << "Reached end of oplog looking for oplog entry before " << truncateTimestamp.toBSON() << " but couldn't find any after looking through " << count << " entries."; fassertFailedNoTrace(40296); }
// This free function is used by the writer threads to apply each op void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { initializeWriterThread(); OperationContextImpl txn; // allow us to get through the magic barrier Lock::ParallelBatchWriterMode::iAmABatchParticipant(txn.lockState()); bool convertUpdatesToUpserts = true; for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { try { if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) { fassertFailedNoTrace(16359); } } catch (const DBException& e) { error() << "writer worker caught exception: " << causedBy(e) << " on: " << it->toString() << endl; fassertFailedNoTrace(16360); } } }
/** write an op to the oplog that is already built. todo : make _logOpRS() call this so we don't repeat ourself? */ OpTime _logOpObjRS(OperationContext* txn, const BSONObj& op) { Lock::DBLock lk(txn->lockState(), "local", newlm::MODE_X); // XXX soon this needs to be part of an outer WUOW not its own. // We can't do this yet due to locking limitations. WriteUnitOfWork wunit(txn); const OpTime ts = op["ts"]._opTime(); long long hash = op["h"].numberLong(); { if ( localOplogRSCollection == 0 ) { Client::Context ctx(txn, rsoplog); localDB = ctx.db(); verify( localDB ); localOplogRSCollection = localDB->getCollection(txn, rsoplog); massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", localOplogRSCollection); } Client::Context ctx(txn, rsoplog, localDB); checkOplogInsert(localOplogRSCollection->insertDocument(txn, op, false)); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); OpTime myLastOptime = replCoord->getMyLastOptime(); if (!(myLastOptime < ts)) { severe() << "replication oplog stream went back in time. previous timestamp: " << myLastOptime << " newest timestamp: " << ts; fassertFailedNoTrace(18905); } BackgroundSync* bgsync = BackgroundSync::get(); // Keep this up-to-date, in case we step up to primary. bgsync->setLastAppliedHash(hash); ctx.getClient()->setLastOp( ts ); replCoord->setMyLastOptime(txn, ts); bgsync->notify(); } setNewOptime(ts); wunit.commit(); return ts; }
void syncDoInitialSync(BackgroundSync* bgsync) { stdx::unique_lock<stdx::mutex> lk(_initialSyncMutex, stdx::defer_lock); if (!lk.try_lock()) { uasserted(34474, "Initial Sync Already Active."); } { const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext(); OperationContext& txn = *txnPtr; createOplog(&txn); } int failedAttempts = 0; while (failedAttempts < kMaxFailedAttempts) { try { // leave loop when successful Status status = _initialSync(bgsync); if (status.isOK()) { break; } else { error() << status; } } catch (const DBException& e) { error() << e; // Return if in shutdown if (inShutdown()) { return; } } if (inShutdown()) { return; } error() << "initial sync attempt failed, " << (kMaxFailedAttempts - ++failedAttempts) << " attempts remaining"; sleepmillis(durationCount<Milliseconds>(kInitialSyncRetrySleepDuration)); } // No need to print a stack if (failedAttempts >= kMaxFailedAttempts) { severe() << "The maximum number of retries have been exhausted for initial sync."; fassertFailedNoTrace(16233); } }
void DropPendingCollectionReaper::addDropPendingNamespace( const OpTime& dropOpTime, const NamespaceString& dropPendingNamespace) { invariant(dropPendingNamespace.isDropPendingNamespace()); stdx::lock_guard<stdx::mutex> lock(_mutex); const auto equalRange = _dropPendingNamespaces.equal_range(dropOpTime); const auto& lowerBound = equalRange.first; const auto& upperBound = equalRange.second; auto matcher = [&dropPendingNamespace](const auto& pair) { return pair.second == dropPendingNamespace; }; if (std::find_if(lowerBound, upperBound, matcher) == upperBound) { _dropPendingNamespaces.insert(std::make_pair(dropOpTime, dropPendingNamespace)); } else { severe() << "Failed to add drop-pending collection " << dropPendingNamespace << " with drop optime " << dropOpTime << ": duplicate optime and namespace pair."; fassertFailedNoTrace(40448); } }
void syncDoInitialSync() { static const int maxFailedAttempts = 10; { OperationContextImpl txn; createOplog(&txn); } int failedAttempts = 0; while ( failedAttempts < maxFailedAttempts ) { try { // leave loop when successful Status status = _initialSync(); if (status.isOK()) { break; } if (status == ErrorCodes::InitialSyncOplogSourceMissing) { sleepsecs(1); return; } } catch(const DBException& e) { error() << e ; // Return if in shutdown if (inShutdown()) { return; } } if (inShutdown()) { return; } error() << "initial sync attempt failed, " << (maxFailedAttempts - ++failedAttempts) << " attempts remaining"; sleepsecs(5); } // No need to print a stack if (failedAttempts >= maxFailedAttempts) { severe() << "The maximum number of retries have been exhausted for initial sync."; fassertFailedNoTrace(16233); } }
Status WiredTigerSnapshotManager::setTransactionReadTimestamp(Timestamp pointInTime, WT_SESSION* session, bool roundToOldest) const { char readTSConfigString[15 /* read_timestamp= */ + 16 /* 16 hexadecimal digits */ + 17 /* ,round_to_oldest= */ + 5 /* false */ + 1 /* trailing null */]; auto size = std::snprintf(readTSConfigString, sizeof(readTSConfigString), "read_timestamp=%llx,round_to_oldest=%s", pointInTime.asULL(), (roundToOldest) ? "true" : "false"); if (size < 0) { int e = errno; error() << "error snprintf " << errnoWithDescription(e); fassertFailedNoTrace(40664); } invariant(static_cast<std::size_t>(size) < sizeof(readTSConfigString)); return wtRCToStatus(session->timestamp_transaction(session, readTSConfigString)); }
/* forked as a thread during startup it can run quite a while looking for config. but once found, a separate thread takes over as ReplSetImpl::Manager, and this thread terminates. */ void startReplSets(ReplSetSeedList *replSetSeedList) { Client::initThread("rsStart"); OperationContextImpl txn; try { verify( theReplSet == 0 ); if( replSetSeedList == 0 ) { return; } replLocalAuth(); (theReplSet = ReplSet::make(&txn, *replSetSeedList))->go(); } catch(std::exception& e) { log() << "replSet caught exception in startReplSets thread: " << e.what() << rsLog; if( theReplSet ) fassertFailedNoTrace(18756); } cc().shutdown(); }
// Uses the collator factory to convert the BSON representation of a collator to a // CollatorInterface. Returns null if the BSONObj is empty. We expect the stored collation to be // valid, since it gets validated on collection create. std::unique_ptr<CollatorInterface> parseCollation(OperationContext* opCtx, const NamespaceString& nss, BSONObj collationSpec) { if (collationSpec.isEmpty()) { return {nullptr}; } auto collator = CollatorFactoryInterface::get(opCtx->getServiceContext())->makeFromBSON(collationSpec); // If the collection's default collator has a version not currently supported by our ICU // integration, shut down the server. Errors other than IncompatibleCollationVersion should not // be possible, so these are an invariant rather than fassert. if (collator == ErrorCodes::IncompatibleCollationVersion) { log() << "Collection " << nss << " has a default collation which is incompatible with this version: " << collationSpec; fassertFailedNoTrace(40144); } invariantOK(collator.getStatus()); return std::move(collator.getValue()); }
void repairDatabasesAndCheckVersion(OperationContext* txn) { LOG(1) << "enter repairDatabases (to check pdfile version #)"; ScopedTransaction transaction(txn, MODE_X); Lock::GlobalWrite lk(txn->lockState()); vector<string> dbNames; StorageEngine* storageEngine = txn->getServiceContext()->getGlobalStorageEngine(); storageEngine->listDatabases(&dbNames); // Repair all databases first, so that we do not try to open them if they are in bad shape if (storageGlobalParams.repair) { invariant(!storageGlobalParams.readOnly); for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) { const string dbName = *i; LOG(1) << " Repairing database: " << dbName; fassert(18506, repairDatabase(txn, storageEngine, dbName)); } } const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings(); // On replica set members we only clear temp collections on DBs other than "local" during // promotion to primary. On pure slaves, they are only cleared when the oplog tells them // to. The local DB is special because it is not replicated. See SERVER-10927 for more // details. const bool shouldClearNonLocalTmpCollections = !(checkIfReplMissingFromCommandLine(txn) || replSettings.usingReplSets() || replSettings.isSlave()); const bool shouldDoCleanupForSERVER23299 = isSubjectToSERVER23299(txn); for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) { const string dbName = *i; LOG(1) << " Recovering database: " << dbName; Database* db = dbHolder().openDb(txn, dbName); invariant(db); // First thing after opening the database is to check for file compatibility, // otherwise we might crash if this is a deprecated format. auto status = db->getDatabaseCatalogEntry()->currentFilesCompatible(txn); if (!status.isOK()) { if (status.code() == ErrorCodes::CanRepairToDowngrade) { // Convert CanRepairToDowngrade statuses to MustUpgrade statuses to avoid logging a // potentially confusing and inaccurate message. // // TODO SERVER-24097: Log a message informing the user that they can start the // current version of mongod with --repair and then proceed with normal startup. status = {ErrorCodes::MustUpgrade, status.reason()}; } severe() << "Unable to start mongod due to an incompatibility with the data files and" " this version of mongod: " << redact(status); severe() << "Please consult our documentation when trying to downgrade to a previous" " major release"; quickExit(EXIT_NEED_UPGRADE); return; } // Check if admin.system.version contains an invalid featureCompatibilityVersion. // If a valid featureCompatibilityVersion is present, cache it as a server parameter. if (dbName == "admin") { if (Collection* versionColl = db->getCollection(FeatureCompatibilityVersion::kCollection)) { BSONObj featureCompatibilityVersion; if (Helpers::findOne(txn, versionColl, BSON("_id" << FeatureCompatibilityVersion::kParameterName), featureCompatibilityVersion)) { auto version = FeatureCompatibilityVersion::parse(featureCompatibilityVersion); if (!version.isOK()) { severe() << version.getStatus(); fassertFailedNoTrace(40283); } serverGlobalParams.featureCompatibilityVersion.store(version.getValue()); } } } // Major versions match, check indexes const string systemIndexes = db->name() + ".system.indexes"; Collection* coll = db->getCollection(systemIndexes); unique_ptr<PlanExecutor> exec( InternalPlanner::collectionScan(txn, systemIndexes, coll, PlanExecutor::YIELD_MANUAL)); BSONObj index; PlanExecutor::ExecState state; while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) { const BSONObj key = index.getObjectField("key"); const string plugin = IndexNames::findPluginName(key); if (db->getDatabaseCatalogEntry()->isOlderThan24(txn)) { if (IndexNames::existedBefore24(plugin)) { continue; } log() << "Index " << index << " claims to be of type '" << plugin << "', " << "which is either invalid or did not exist before v2.4. " << "See the upgrade section: " << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog; } const Status keyStatus = validateKeyPattern(key); if (!keyStatus.isOK()) { log() << "Problem with index " << index << ": " << redact(keyStatus) << " This index can still be used however it cannot be rebuilt." << " For more info see" << " http://dochub.mongodb.org/core/index-validation" << startupWarningsLog; } if (index["v"].isNumber() && index["v"].numberInt() == 0) { log() << "WARNING: The index: " << index << " was created with the deprecated" << " v:0 format. This format will not be supported in a future release." << startupWarningsLog; log() << "\t To fix this, you need to rebuild this index." << " For instructions, see http://dochub.mongodb.org/core/rebuild-v0-indexes" << startupWarningsLog; } } // Non-yielding collection scans from InternalPlanner will never error. invariant(PlanExecutor::IS_EOF == state); if (replSettings.usingReplSets()) { // We only care about the _id index if we are in a replset checkForIdIndexes(txn, db); // Ensure oplog is capped (mmap does not guarantee order of inserts on noncapped // collections) if (db->name() == "local") { checkForCappedOplog(txn, db); } } if (shouldDoCleanupForSERVER23299) { handleSERVER23299ForDb(txn, db); } if (!storageGlobalParams.readOnly && (shouldClearNonLocalTmpCollections || dbName == "local")) { db->clearTmpCollections(txn); } } LOG(1) << "done repairDatabases"; }
void ThreadPool::_consumeTasks() { stdx::unique_lock<stdx::mutex> lk(_mutex); while (_state == running) { if (_pendingTasks.empty()) { if (_threads.size() > _options.minThreads) { // Since there are more than minThreads threads, this thread may be eligible for // retirement. If it isn't now, it may be later, so it must put a time limit on how // long it waits on _workAvailable. const auto now = Date_t::now(); const auto nextThreadRetirementDate = _lastFullUtilizationDate + _options.maxIdleThreadAge; if (now >= nextThreadRetirementDate) { _lastFullUtilizationDate = now; LOG(1) << "Reaping this thread; next thread reaped no earlier than " << _lastFullUtilizationDate + _options.maxIdleThreadAge; break; } LOG(3) << "Not reaping because the earliest retirement date is " << nextThreadRetirementDate; _workAvailable.wait_until(lk, nextThreadRetirementDate.toSystemTimePoint()); } else { // Since the number of threads is not more than minThreads, this thread is not // eligible for retirement. It is OK to sleep until _workAvailable is signaled, // because any new threads that put the number of total threads above minThreads // would be eligible for retirement once they had no work left to do. LOG(3) << "waiting for work; I am one of " << _threads.size() << " thread(s);" << " the minimum number of threads is " << _options.minThreads; _workAvailable.wait(lk); } continue; } _doOneTask(&lk); } // We still hold the lock, but this thread is retiring. If the whole pool is shutting down, this // thread lends a hand in draining the work pool and returns so it can be joined. Otherwise, it // falls through to the detach code, below. if (_state == joinRequired || _state == joining) { // Drain the leftover pending tasks. while (!_pendingTasks.empty()) { _doOneTask(&lk); } --_numIdleThreads; return; } --_numIdleThreads; if (_state != running) { severe() << "State of pool " << _options.poolName << " is " << static_cast<int32_t>(_state) << ", but expected " << static_cast<int32_t>(running); fassertFailedNoTrace(28701); } // This thread is ending because it was idle for too long. Find self in _threads, remove self // from _threads, detach self. for (size_t i = 0; i < _threads.size(); ++i) { auto& t = _threads[i]; if (t.get_id() != stdx::this_thread::get_id()) { continue; } t.detach(); t.swap(_threads.back()); _threads.pop_back(); return; } severe().stream() << "Could not find this thread, with id " << stdx::this_thread::get_id() << " in pool " << _options.poolName; fassertFailedNoTrace(28703); }
void ReplSetImpl::loadConfig(OperationContext* txn) { startupStatus = LOADINGCONFIG; startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)"); LOG(1) << "loadConfig() " << rsConfigNs << endl; while (1) { try { OwnedPointerVector<ReplSetConfig> configs; try { configs.mutableVector().push_back(ReplSetConfig::makeDirect(txn)); } catch (DBException& e) { log() << "replSet exception loading our local replset configuration object : " << e.toString() << rsLog; } for (vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++) { try { configs.mutableVector().push_back(ReplSetConfig::make(txn, *i)); } catch (DBException& e) { log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog; } } ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); { scoped_lock lck(replSettings.discoveredSeeds_mx); if (replSettings.discoveredSeeds.size() > 0) { for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); i != replSettings.discoveredSeeds.end(); i++) { try { configs.mutableVector().push_back( ReplSetConfig::make(txn, HostAndPort(*i))); } catch (DBException&) { LOG(1) << "replSet exception trying to load config from discovered " "seed " << *i << rsLog; replSettings.discoveredSeeds.erase(*i); } } } } if (!replSettings.reconfig.isEmpty()) { try { configs.mutableVector().push_back(ReplSetConfig::make(txn, replSettings.reconfig, true)); } catch (DBException& re) { log() << "replSet couldn't load reconfig: " << re.what() << rsLog; replSettings.reconfig = BSONObj(); } } int nok = 0; int nempty = 0; for (vector<ReplSetConfig*>::iterator i = configs.mutableVector().begin(); i != configs.mutableVector().end(); i++) { if ((*i)->ok()) nok++; if ((*i)->empty()) nempty++; } if (nok == 0) { if (nempty == (int) configs.mutableVector().size()) { startupStatus = EMPTYCONFIG; startupStatusMsg.set("can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)"); log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog; static unsigned once; if (++once == 1) { log() << "replSet info you may need to run replSetInitiate -- rs.initia" "te() in the shell -- if that is not already done" << rsLog; } if (_seeds->size() == 0) { LOG(1) << "replSet info no seed hosts were specified on the --replSet " "command line" << rsLog; } } else { startupStatus = EMPTYUNREACHABLE; startupStatusMsg.set("can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)"); log() << "replSet can't get " << rsConfigNs << " config from self or any seed (yet)" << rsLog; } sleepsecs(1); continue; } if (!_loadConfigFinish(txn, configs.mutableVector())) { log() << "replSet info Couldn't load config yet. Sleeping 3 sec and will try " "again." << rsLog; sleepsecs(3); continue; } } catch (DBException& e) { startupStatus = BADCONFIG; startupStatusMsg.set("replSet error loading set config (BADCONFIG)"); log() << "replSet error loading configurations " << e.toString() << rsLog; log() << "replSet error replication will not start" << rsLog; sethbmsg("error loading set config"); fassertFailedNoTrace(18754); throw; } break; } startupStatusMsg.set("? started"); startupStatus = STARTED; }
void ReplicationCoordinatorExternalStateImpl::cleanUpLastApplyBatch(OperationContext* txn) { if (_storageInterface->getInitialSyncFlag(txn)) { return; // Initial Sync will take over so no cleanup is needed. } const auto deleteFromPoint = _storageInterface->getOplogDeleteFromPoint(txn); const auto appliedThrough = _storageInterface->getAppliedThrough(txn); const bool needToDeleteEndOfOplog = !deleteFromPoint.isNull() && // This version should never have a non-null deleteFromPoint with a null appliedThrough. // This scenario means that we downgraded after unclean shutdown, then the downgraded node // deleted the ragged end of our oplog, then did a clean shutdown. !appliedThrough.isNull() && // Similarly we should never have an appliedThrough higher than the deleteFromPoint. This // means that the downgraded node deleted our ragged end then applied ahead of our // deleteFromPoint and then had an unclean shutdown before upgrading. We are ok with // applying these ops because older versions wrote to the oplog from a single thread so we // know they are in order. !(appliedThrough.getTimestamp() >= deleteFromPoint); if (needToDeleteEndOfOplog) { log() << "Removing unapplied entries starting at: " << deleteFromPoint; truncateOplogTo(txn, deleteFromPoint); } _storageInterface->setOplogDeleteFromPoint(txn, {}); // clear the deleteFromPoint if (appliedThrough.isNull()) { // No follow-up work to do. return; } // Check if we have any unapplied ops in our oplog. It is important that this is done after // deleting the ragged end of the oplog. const auto topOfOplog = fassertStatusOK(40290, loadLastOpTime(txn)); if (topOfOplog >= appliedThrough) { return; // We've applied all the valid oplog we have. } log() << "Replaying stored operations from " << appliedThrough << " (exclusive) to " << topOfOplog << " (inclusive)."; DBDirectClient db(txn); auto cursor = db.query(rsOplogName, QUERY("ts" << BSON("$gte" << appliedThrough.getTimestamp())), /*batchSize*/ 0, /*skip*/ 0, /*projection*/ nullptr, QueryOption_OplogReplay); // Check that the first document matches our appliedThrough point then skip it since it's // already been applied. if (!cursor->more()) { // This should really be impossible because we check above that the top of the oplog is // strictly > appliedThrough. If this fails it represents a serious bug in either the // storage engine or query's implementation of OplogReplay. severe() << "Couldn't find any entries in the oplog >= " << appliedThrough << " which should be impossible."; fassertFailedNoTrace(40293); } auto firstOpTimeFound = fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe())); if (firstOpTimeFound != appliedThrough) { severe() << "Oplog entry at " << appliedThrough << " is missing; actual entry found is " << firstOpTimeFound; fassertFailedNoTrace(40292); } // Apply remaining ops one at at time, but don't log them because they are already logged. const bool wereWritesReplicated = txn->writesAreReplicated(); ON_BLOCK_EXIT([&] { txn->setReplicatedWrites(wereWritesReplicated); }); txn->setReplicatedWrites(false); while (cursor->more()) { auto entry = cursor->nextSafe(); fassertStatusOK(40294, SyncTail::syncApply(txn, entry, true)); _storageInterface->setAppliedThrough( txn, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry))); } }