// Opts this recovery unit out of concurrency-ticket acquisition.
// Precondition: no ticket may already be held (the invariant enforces this);
// callers flip _noTicketNeeded so later work skips ticket acquisition.
void WiredTigerRecoveryUnit::markNoTicketRequired() {
    invariant(!_ticket.hasTicket());
    _noTicketNeeded = true;
}
// Performs the in-lock phase of preparing a delete: validates the target
// namespace, rejects capped collections and non-primary writes, then builds
// the PlanExecutor (stored in _exec). Idempotent: returns OK immediately if
// _exec was already created. Note that _canonicalQuery is released to
// getExecutorDelete() on the non-idhack path, so it is consumed here.
Status DeleteExecutor::prepareInLock(Database* db) {
    // If we have a non-NULL PlanExecutor, then we've already done the in-lock preparation.
    if (_exec.get()) {
        return Status::OK();
    }

    // The query must have been parsed (parseQuery) before entering the lock.
    uassert(17417,
            mongoutils::str::stream() << "DeleteExecutor::prepare() failed to parse query "
                                      << _request->getQuery(),
            _isQueryParsed);

    const NamespaceString& ns(_request->getNamespaceString());
    // God-mode (internal) requests bypass the namespace restrictions below.
    if (!_request->isGod()) {
        if (ns.isSystem()) {
            uassert(12050,
                    "cannot delete from system namespace",
                    legalClientSystemNS(ns.ns(), true));
        }
        if (ns.ns().find('$') != string::npos) {
            log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
            uasserted(10100, "cannot delete from collection with reserved $ in name");
        }
    }

    // Note that 'collection' may be NULL in the case that the collection we are trying to
    // delete from does not exist. NULL 'collection' is handled by getExecutorDelete(); we
    // expect to get back a plan executor whose plan is a DeleteStage on top of an EOFStage.
    Collection* collection = db->getCollection(_request->getOpCtx(), ns.ns());
    if (collection && collection->isCapped()) {
        return Status(ErrorCodes::IllegalOperation,
                      str::stream() << "cannot remove from a capped collection: " << ns.ns());
    }

    // Replicated deletes may only run on a node that can accept writes for this DB.
    if (_request->shouldCallLogOp() &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(ns.db())) {
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while removing from " << ns.ns());
    }

    // If yielding is allowed for this plan, then set an auto yield policy. Otherwise set
    // a manual yield policy. $atomic / $isolated queries must not yield mid-delete.
    const bool canYield = !_request->isGod() &&
        PlanExecutor::YIELD_AUTO == _request->getYieldPolicy() &&
        (_canonicalQuery.get()
             ? !QueryPlannerCommon::hasNode(_canonicalQuery->root(), MatchExpression::ATOMIC)
             : !LiteParsedQuery::isQueryIsolated(_request->getQuery()));

    PlanExecutor::YieldPolicy policy = canYield ? PlanExecutor::YIELD_AUTO
                                                : PlanExecutor::YIELD_MANUAL;

    PlanExecutor* rawExec;
    Status getExecStatus = Status::OK();
    if (_canonicalQuery.get()) {
        // This is the non-idhack branch.
        getExecStatus = getExecutorDelete(_request->getOpCtx(),
                                          collection,
                                          _canonicalQuery.release(),
                                          _request->isMulti(),
                                          _request->shouldCallLogOp(),
                                          _request->isFromMigrate(),
                                          _request->isExplain(),
                                          policy,
                                          &rawExec);
    } else {
        // This is the idhack branch.
        getExecStatus = getExecutorDelete(_request->getOpCtx(),
                                          collection,
                                          ns.ns(),
                                          _request->getQuery(),
                                          _request->isMulti(),
                                          _request->shouldCallLogOp(),
                                          _request->isFromMigrate(),
                                          _request->isExplain(),
                                          policy,
                                          &rawExec);
    }

    if (!getExecStatus.isOK()) {
        return getExecStatus;
    }

    invariant(rawExec);
    _exec.reset(rawExec);

    return Status::OK();
}
// Registers a Change to be notified on commit/rollback of the current unit of
// work. Only legal while a unit of work is active; ownership/lifetime of
// 'change' follows the _changes vector's handling at commit/abort time.
void WiredTigerRecoveryUnit::registerChange(Change* change) {
    invariant(_inUnitOfWork);
    _changes.push_back(change);
}
// Convenience accessor: fetches the operation's RecoveryUnit and downcasts it
// to the WiredTiger implementation. 'txn' must be non-null; checked_cast
// asserts the dynamic type in debug builds.
WiredTigerRecoveryUnit* WiredTigerRecoveryUnit::get(OperationContext* txn) {
    invariant(txn);
    RecoveryUnit* const baseUnit = txn->recoveryUnit();
    return checked_cast<WiredTigerRecoveryUnit*>(baseUnit);
}
// Commits the active unit of work. The _inUnitOfWork flag is cleared before
// delegating to _commit() so the unit is considered closed even while the
// commit work runs.
void WiredTigerRecoveryUnit::commitUnitOfWork() {
    invariant(_inUnitOfWork);
    _inUnitOfWork = false;
    _commit();
}
// Aborts the active unit of work, mirroring commitUnitOfWork(): the flag is
// cleared first, then _abort() rolls back pending changes.
void WiredTigerRecoveryUnit::abortUnitOfWork() {
    invariant(_inUnitOfWork);
    _inUnitOfWork = false;
    _abort();
}
// Rollback handler: re-inserts _coll into the database's collection map under
// its namespace, undoing a removal performed by the corresponding operation.
// The map slot must be empty at this point (invariant), i.e. nothing else
// re-registered the namespace in the meantime. Holds _db->_collectionLock for
// the map mutation. NOTE(review): appears to be part of a RecoveryUnit::Change
// subclass whose declaration is outside this view — confirm against the
// enclosing class.
virtual void rollback() {
    mongo::mutex::scoped_lock lk(_db->_collectionLock);
    Collection*& inMap = _db->_collections[_coll->ns().ns()];
    invariant(!inMap);
    inMap = _coll;
}
// Persists a replica set configuration received via heartbeat, then schedules
// _heartbeatReconfigFinish to install it. Lock discipline is deliberate:
// _mutex is only held for config-state reads/writes (lk is constructed
// deferred and locked/unlocked around each critical section), never across
// disk I/O (storeLocalConfigDocument) or scheduling.
void ReplicationCoordinatorImpl::_heartbeatReconfigStore(
        const ReplicationExecutor::CallbackData& cbd,
        const ReplicaSetConfig& newConfig) {
    // Executor shutdown/cancellation: nothing was persisted; just report and bail.
    if (cbd.status.code() == ErrorCodes::CallbackCanceled) {
        log() << "The callback to persist the replica set configuration was canceled - "
              << "the configuration was not persisted but was used: " << newConfig.toBSON();
        return;
    }

    // Deferred: locked only around the state inspections/mutations below.
    boost::unique_lock<boost::mutex> lk(_mutex, boost::defer_lock_t());

    const StatusWith<int> myIndex = validateConfigForHeartbeatReconfig(
        _externalState.get(),
        newConfig);

    if (myIndex.getStatus() == ErrorCodes::NodeNotFound) {
        lk.lock();
        // If this node absent in newConfig, and this node was not previously initialized,
        // return to kConfigUninitialized immediately, rather than storing the config and
        // transitioning into the RS_REMOVED state. See SERVER-15740.
        if (!_rsConfig.isInitialized()) {
            invariant(_rsConfigState == kConfigHBReconfiguring);
            LOG(1) << "Ignoring new configuration in heartbeat response because we are "
                      "uninitialized and not a member of the new configuration";
            _setConfigState_inlock(kConfigUninitialized);
            return;
        }
        lk.unlock();
    }

    if (!myIndex.getStatus().isOK() && myIndex.getStatus() != ErrorCodes::NodeNotFound) {
        // Invalid config (other than NodeNotFound): use it in memory but do not persist.
        warning() << "Not persisting new configuration in heartbeat response to disk because "
                     "it is invalid: " << myIndex.getStatus();
    } else {
        // Persist before taking the mutex — storage I/O must not happen under _mutex.
        Status status = _externalState->storeLocalConfigDocument(cbd.txn, newConfig.toBSON());

        lk.lock();
        if (!status.isOK()) {
            error() << "Ignoring new configuration in heartbeat response because we failed to"
                       " write it to stable storage; " << status;
            invariant(_rsConfigState == kConfigHBReconfiguring);
            // Fall back to the prior state depending on whether a config was ever installed.
            if (_rsConfig.isInitialized()) {
                _setConfigState_inlock(kConfigSteady);
            } else {
                _setConfigState_inlock(kConfigUninitialized);
            }
            return;
        }
        lk.unlock();

        _externalState->startThreads();
    }

    const stdx::function<void (const ReplicationExecutor::CallbackData&)> reconfigFinishFn(
        stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigFinish,
                   this,
                   stdx::placeholders::_1,
                   newConfig,
                   myIndex));

    // Make sure that the reconfigFinishFn doesn't finish until we've reset
    // _heartbeatReconfigThread.
    lk.lock();
    if (_memberState.primary()) {
        // If the primary is receiving a heartbeat reconfig, that strongly suggests
        // that there has been a force reconfiguration. In any event, it might lead
        // to this node stepping down as primary, so we'd better do it with the global
        // lock.
        _replExecutor.scheduleWorkWithGlobalExclusiveLock(reconfigFinishFn);
    } else {
        _replExecutor.scheduleWork(reconfigFinishFn);
    }
}
// Startup pass over every database: optionally repairs them (--repair), checks
// on-disk file compatibility, caches featureCompatibilityVersion from
// admin.system.version, warns about pre-2.4 / v:0 indexes, and performs
// replica-set and temp-collection housekeeping. Runs under the global
// exclusive lock, so no other operations can interleave.
void repairDatabasesAndCheckVersion(OperationContext* txn) {
    LOG(1) << "enter repairDatabases (to check pdfile version #)";

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite lk(txn->lockState());

    vector<string> dbNames;

    StorageEngine* storageEngine = txn->getServiceContext()->getGlobalStorageEngine();
    storageEngine->listDatabases(&dbNames);

    // Repair all databases first, so that we do not try to open them if they are in bad shape
    if (storageGlobalParams.repair) {
        invariant(!storageGlobalParams.readOnly);
        for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
            const string dbName = *i;
            LOG(1) << " Repairing database: " << dbName;

            fassert(18506, repairDatabase(txn, storageEngine, dbName));
        }
    }

    const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings();

    // On replica set members we only clear temp collections on DBs other than "local" during
    // promotion to primary. On pure slaves, they are only cleared when the oplog tells them
    // to. The local DB is special because it is not replicated. See SERVER-10927 for more
    // details.
    const bool shouldClearNonLocalTmpCollections =
        !(checkIfReplMissingFromCommandLine(txn) || replSettings.usingReplSets() ||
          replSettings.isSlave());

    const bool shouldDoCleanupForSERVER23299 = isSubjectToSERVER23299(txn);

    for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
        const string dbName = *i;
        LOG(1) << " Recovering database: " << dbName;

        Database* db = dbHolder().openDb(txn, dbName);
        invariant(db);

        // First thing after opening the database is to check for file compatibility,
        // otherwise we might crash if this is a deprecated format.
        auto status = db->getDatabaseCatalogEntry()->currentFilesCompatible(txn);
        if (!status.isOK()) {
            if (status.code() == ErrorCodes::CanRepairToDowngrade) {
                // Convert CanRepairToDowngrade statuses to MustUpgrade statuses to avoid logging a
                // potentially confusing and inaccurate message.
                //
                // TODO SERVER-24097: Log a message informing the user that they can start the
                // current version of mongod with --repair and then proceed with normal startup.
                status = {ErrorCodes::MustUpgrade, status.reason()};
            }
            severe() << "Unable to start mongod due to an incompatibility with the data files and"
                        " this version of mongod: " << redact(status);
            severe() << "Please consult our documentation when trying to downgrade to a previous"
                        " major release";
            quickExit(EXIT_NEED_UPGRADE);
            return;
        }

        // Check if admin.system.version contains an invalid featureCompatibilityVersion.
        // If a valid featureCompatibilityVersion is present, cache it as a server parameter.
        if (dbName == "admin") {
            if (Collection* versionColl =
                    db->getCollection(FeatureCompatibilityVersion::kCollection)) {
                BSONObj featureCompatibilityVersion;
                if (Helpers::findOne(txn,
                                     versionColl,
                                     BSON("_id" << FeatureCompatibilityVersion::kParameterName),
                                     featureCompatibilityVersion)) {
                    auto version = FeatureCompatibilityVersion::parse(featureCompatibilityVersion);
                    if (!version.isOK()) {
                        // A malformed FCV document is unrecoverable at startup.
                        severe() << version.getStatus();
                        fassertFailedNoTrace(40283);
                    }
                    serverGlobalParams.featureCompatibility.version.store(version.getValue());
                }
            }
        }

        // Major versions match, check indexes
        const string systemIndexes = db->name() + ".system.indexes";

        Collection* coll = db->getCollection(systemIndexes);
        unique_ptr<PlanExecutor> exec(
            InternalPlanner::collectionScan(txn, systemIndexes, coll, PlanExecutor::YIELD_MANUAL));

        BSONObj index;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) {
            const BSONObj key = index.getObjectField("key");
            const string plugin = IndexNames::findPluginName(key);

            if (db->getDatabaseCatalogEntry()->isOlderThan24(txn)) {
                if (IndexNames::existedBefore24(plugin)) {
                    continue;
                }

                log() << "Index " << index << " claims to be of type '" << plugin << "', "
                      << "which is either invalid or did not exist before v2.4. "
                      << "See the upgrade section: "
                      << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog;
            }

            if (index["v"].isNumber() && index["v"].numberInt() == 0) {
                log() << "WARNING: The index: " << index << " was created with the deprecated"
                      << " v:0 format. This format will not be supported in a future release."
                      << startupWarningsLog;
                log() << "\t To fix this, you need to rebuild this index."
                      << " For instructions, see http://dochub.mongodb.org/core/rebuild-v0-indexes"
                      << startupWarningsLog;
            }
        }

        // Non-yielding collection scans from InternalPlanner will never error.
        invariant(PlanExecutor::IS_EOF == state);

        if (replSettings.usingReplSets()) {
            // We only care about the _id index if we are in a replset
            checkForIdIndexes(txn, db);
            // Ensure oplog is capped (mmap does not guarantee order of inserts on noncapped
            // collections)
            if (db->name() == "local") {
                checkForCappedOplog(txn, db);
            }
        }

        if (shouldDoCleanupForSERVER23299) {
            handleSERVER23299ForDb(txn, db);
        }

        if (!storageGlobalParams.readOnly &&
            (shouldClearNonLocalTmpCollections || dbName == "local")) {
            db->clearTmpCollections(txn);
        }
    }

    LOG(1) << "done repairDatabases";
}
// Main mongod startup sequence: wires up the service context, transport layer,
// optional HTTP interface, storage engine, repairs/validates databases,
// verifies the auth schema, starts replication/sharding/background services,
// and finally begins listening. Returns the process exit code (or blocks in
// waitForShutdown()). Validation failures exit via exitCleanly()/uassert.
ExitCode _initAndListen(int listenPort) {
    Client::initThread("initandlisten");

    _initWireSpec();
    auto globalServiceContext = getGlobalServiceContext();

    globalServiceContext->setFastClockSource(FastClockSourceFactory::create(Milliseconds(10)));
    globalServiceContext->setOpObserver(stdx::make_unique<OpObserver>());

    DBDirectClientFactory::get(globalServiceContext)
        .registerImplementation([](OperationContext* txn) {
            return std::unique_ptr<DBClientBase>(new DBDirectClient(txn));
        });

    const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings();

    // Log the startup banner: pid, port, dbpath, master/slave mode, bitness.
    {
        ProcessId pid = ProcessId::getCurrent();
        LogstreamBuilder l = log(LogComponent::kControl);
        l << "MongoDB starting : pid=" << pid << " port=" << serverGlobalParams.port
          << " dbpath=" << storageGlobalParams.dbpath;
        if (replSettings.isMaster())
            l << " master=" << replSettings.isMaster();
        if (replSettings.isSlave())
            l << " slave=" << (int)replSettings.isSlave();

        const bool is32bit = sizeof(int*) == 4;
        l << (is32bit ? " 32" : " 64") << "-bit host=" << getHostNameCached() << endl;
    }

    DEV log(LogComponent::kControl) << "DEBUG build (which is slower)" << endl;

#if defined(_WIN32)
    VersionInfoInterface::instance().logTargetMinOS();
#endif

    logProcessDetails();

    checked_cast<ServiceContextMongoD*>(getGlobalServiceContext())->createLockFile();

    transport::TransportLayerLegacy::Options options;
    options.port = listenPort;
    options.ipList = serverGlobalParams.bind_ip;

    // The service entry point must be registered before the transport layer is built,
    // since the TL dispatches incoming sessions to it.
    auto sep =
        stdx::make_unique<ServiceEntryPointMongod>(getGlobalServiceContext()->getTransportLayer());
    auto sepPtr = sep.get();

    getGlobalServiceContext()->setServiceEntryPoint(std::move(sep));

    // Create, start, and attach the TL
    auto transportLayer = stdx::make_unique<transport::TransportLayerLegacy>(options, sepPtr);
    auto res = transportLayer->setup();
    if (!res.isOK()) {
        error() << "Failed to set up listener: " << res;
        return EXIT_NET_ERROR;
    }

    // The HTTP status interface listens on port + 1000 when enabled.
    std::shared_ptr<DbWebServer> dbWebServer;
    if (serverGlobalParams.isHttpInterfaceEnabled) {
        dbWebServer.reset(new DbWebServer(serverGlobalParams.bind_ip,
                                          serverGlobalParams.port + 1000,
                                          getGlobalServiceContext(),
                                          new RestAdminAccess()));
        if (!dbWebServer->setupSockets()) {
            error() << "Failed to set up sockets for HTTP interface during startup.";
            return EXIT_NET_ERROR;
        }
    }

    getGlobalServiceContext()->initializeGlobalStorageEngine();

#ifdef MONGO_CONFIG_WIREDTIGER_ENABLED
    if (WiredTigerCustomizationHooks::get(getGlobalServiceContext())->restartRequired()) {
        exitCleanly(EXIT_CLEAN);
    }
#endif

    // Warn if we detect configurations for multiple registered storage engines in
    // the same configuration file/environment.
    if (serverGlobalParams.parsedOpts.hasField("storage")) {
        BSONElement storageElement = serverGlobalParams.parsedOpts.getField("storage");
        invariant(storageElement.isABSONObj());
        BSONObj storageParamsObj = storageElement.Obj();
        BSONObjIterator i = storageParamsObj.begin();
        while (i.more()) {
            BSONElement e = i.next();
            // Ignore if field name under "storage" matches current storage engine.
            if (storageGlobalParams.engine == e.fieldName()) {
                continue;
            }

            // Warn if field name matches non-active registered storage engine.
            if (getGlobalServiceContext()->isRegisteredStorageEngine(e.fieldName())) {
                warning() << "Detected configuration for non-active storage engine "
                          << e.fieldName() << " when current storage engine is "
                          << storageGlobalParams.engine;
            }
        }
    }

    // Majority read concern requires snapshot support from the storage engine.
    if (!getGlobalServiceContext()->getGlobalStorageEngine()->getSnapshotManager()) {
        if (moe::startupOptionsParsed.count("replication.enableMajorityReadConcern") &&
            moe::startupOptionsParsed["replication.enableMajorityReadConcern"].as<bool>()) {
            // Note: we are intentionally only erroring if the user explicitly requested that we
            // enable majority read concern. We do not error if the they are implicitly enabled for
            // CSRS because a required step in the upgrade procedure can involve an mmapv1 node in
            // the CSRS in the REMOVED state. This is handled by the TopologyCoordinator.
            invariant(replSettings.isMajorityReadConcernEnabled());
            severe() << "Majority read concern requires a storage engine that supports"
                     << " snapshots, such as wiredTiger. " << storageGlobalParams.engine
                     << " does not support snapshots.";
            exitCleanly(EXIT_BADOPTIONS);
        }
    }

    logMongodStartupWarnings(storageGlobalParams, serverGlobalParams);

    // Verify dbpath exists; the message is built unconditionally but only shown on failure.
    {
        stringstream ss;
        ss << endl;
        ss << "*********************************************************************" << endl;
        ss << " ERROR: dbpath (" << storageGlobalParams.dbpath << ") does not exist." << endl;
        ss << " Create this directory or give existing directory in --dbpath." << endl;
        ss << " See http://dochub.mongodb.org/core/startingandstoppingmongo" << endl;
        ss << "*********************************************************************" << endl;
        uassert(10296, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.dbpath));
    }

    {
        stringstream ss;
        ss << "repairpath (" << storageGlobalParams.repairpath << ") does not exist";
        uassert(12590, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.repairpath));
    }

    // TODO: This should go into a MONGO_INITIALIZER once we have figured out the correct
    // dependencies.
    if (snmpInit) {
        snmpInit();
    }

    if (!storageGlobalParams.readOnly) {
        boost::filesystem::remove_all(storageGlobalParams.dbpath + "/_tmp/");
    }

    if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalRecoverOnly)
        return EXIT_NET_ERROR;

    if (mongodGlobalParams.scriptingEnabled) {
        ScriptEngine::setup();
    }

    auto startupOpCtx = getGlobalServiceContext()->makeOperationContext(&cc());

    repairDatabasesAndCheckVersion(startupOpCtx.get());

    if (storageGlobalParams.upgrade) {
        log() << "finished checking dbs";
        exitCleanly(EXIT_CLEAN);
    }

    uassertStatusOK(getGlobalAuthorizationManager()->initialize(startupOpCtx.get()));

    /* this is for security on certain platforms (nonce generation) */
    srand((unsigned)(curTimeMicros64() ^ startupSrandTimer.micros()));

    // The snapshot thread provides historical collection level and lock statistics for use
    // by the web interface. Only needed when HTTP is enabled.
    if (serverGlobalParams.isHttpInterfaceEnabled) {
        statsSnapshotThread.go();

        invariant(dbWebServer);
        stdx::thread web(stdx::bind(&webServerListenThread, dbWebServer));
        web.detach();
    }

    AuthorizationManager* globalAuthzManager = getGlobalAuthorizationManager();
    if (globalAuthzManager->shouldValidateAuthSchemaOnStartup()) {
        Status status = authindex::verifySystemIndexes(startupOpCtx.get());
        if (!status.isOK()) {
            log() << redact(status);
            exitCleanly(EXIT_NEED_UPGRADE);
        }

        // SERVER-14090: Verify that auth schema version is schemaVersion26Final.
        int foundSchemaVersion;
        status =
            globalAuthzManager->getAuthorizationVersion(startupOpCtx.get(), &foundSchemaVersion);
        if (!status.isOK()) {
            log() << "Auth schema version is incompatible: "
                  << "User and role management commands require auth data to have "
                  << "at least schema version " << AuthorizationManager::schemaVersion26Final
                  << " but startup could not verify schema version: " << status;
            exitCleanly(EXIT_NEED_UPGRADE);
        }
        if (foundSchemaVersion < AuthorizationManager::schemaVersion26Final) {
            log() << "Auth schema version is incompatible: "
                  << "User and role management commands require auth data to have "
                  << "at least schema version " << AuthorizationManager::schemaVersion26Final
                  << " but found " << foundSchemaVersion << ". In order to upgrade "
                  << "the auth schema, first downgrade MongoDB binaries to version "
                  << "2.6 and then run the authSchemaUpgrade command.";
            exitCleanly(EXIT_NEED_UPGRADE);
        }
    } else if (globalAuthzManager->isAuthEnabled()) {
        error() << "Auth must be disabled when starting without auth schema validation";
        exitCleanly(EXIT_BADOPTIONS);
    } else {
        // If authSchemaValidation is disabled and server is running without auth,
        // warn the user and continue startup without authSchema metadata checks.
        log() << startupWarningsLog;
        log() << "** WARNING: Startup auth schema validation checks are disabled for the "
                 "database." << startupWarningsLog;
        log() << "** This mode should only be used to manually repair corrupted auth "
                 "data." << startupWarningsLog;
    }

    auto shardingInitialized =
        uassertStatusOK(ShardingState::get(startupOpCtx.get())
                            ->initializeShardingAwarenessIfNeeded(startupOpCtx.get()));
    if (shardingInitialized) {
        reloadShardRegistryUntilSuccess(startupOpCtx.get());
    }

    // Everything in this branch mutates on-disk state and so is skipped in read-only mode.
    if (!storageGlobalParams.readOnly) {
        logStartup(startupOpCtx.get());

        startFTDC();

        getDeleter()->startWorkers();

        restartInProgressIndexesFromLastShutdown(startupOpCtx.get());

        if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) {
            // Note: For replica sets, ShardingStateRecovery happens on transition to primary.
            if (!repl::getGlobalReplicationCoordinator()->isReplEnabled()) {
                uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get()));
            }
        } else if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
            uassertStatusOK(
                initializeGlobalShardingStateForMongod(startupOpCtx.get(),
                                                       ConnectionString::forLocal(),
                                                       kDistLockProcessIdForConfigServer));
            Balancer::create(startupOpCtx->getServiceContext());
        }

        repl::getGlobalReplicationCoordinator()->startup(startupOpCtx.get());

        const unsigned long long missingRepl =
            checkIfReplMissingFromCommandLine(startupOpCtx.get());
        if (missingRepl) {
            log() << startupWarningsLog;
            log() << "** WARNING: mongod started without --replSet yet " << missingRepl
                  << " documents are present in local.system.replset" << startupWarningsLog;
            log() << "** Restart with --replSet unless you are doing maintenance and "
                  << " no other clients are connected." << startupWarningsLog;
            log() << "** The TTL collection monitor will not start because of this."
                  << startupWarningsLog;
            log() << "** ";
            log() << " For more info see http://dochub.mongodb.org/core/ttlcollections";
            log() << startupWarningsLog;
        } else {
            startTTLBackgroundJob();
        }

        if (!replSettings.usingReplSets() && !replSettings.isSlave() &&
            storageGlobalParams.engine != "devnull") {
            ScopedTransaction transaction(startupOpCtx.get(), MODE_X);
            Lock::GlobalWrite lk(startupOpCtx.get()->lockState());
            FeatureCompatibilityVersion::setIfCleanStartup(
                startupOpCtx.get(), repl::StorageInterface::get(getGlobalServiceContext()));
        }

        if (replSettings.usingReplSets() || (!replSettings.isMaster() && replSettings.isSlave()) ||
            !internalValidateFeaturesAsMaster) {
            serverGlobalParams.featureCompatibility.validateFeaturesAsMaster.store(false);
        }
    }

    startClientCursorMonitor();

    PeriodicTask::startRunningPeriodicTasks();

    // MessageServer::run will return when exit code closes its socket and we don't need the
    // operation context anymore
    startupOpCtx.reset();

    auto start = getGlobalServiceContext()->addAndStartTransportLayer(std::move(transportLayer));
    if (!start.isOK()) {
        error() << "Failed to start the listener: " << start.toString();
        return EXIT_NET_ERROR;
    }

#ifndef _WIN32
    mongo::signalForkSuccess();
#else
    if (ntservice::shouldStartService()) {
        ntservice::reportStatus(SERVICE_RUNNING);
        log() << "Service running";
    }
#endif

    return waitForShutdown();
}
// Routes an aggregation from mongos: passes through to a single shard when the
// collection is unsharded, otherwise splits the pipeline into a shards part
// and a merger part, fans the shards part out, and either explains, returns a
// single shard's cursor, or runs the merger pipeline on a chosen shard.
// Errors from individual steps are returned as a non-OK Status or thrown via
// uassert.
Status ClusterAggregate::runAggregate(OperationContext* txn,
                                      const Namespaces& namespaces,
                                      BSONObj cmdObj,
                                      int options,
                                      BSONObjBuilder* result) {
    auto dbname = namespaces.executionNss.db().toString();
    auto status = grid.catalogCache()->getDatabase(txn, dbname);
    if (!status.isOK()) {
        // Unknown database: report an empty result set rather than an error.
        appendEmptyResultSet(*result, status.getStatus(), namespaces.requestedNss.ns());
        return Status::OK();
    }

    std::shared_ptr<DBConfig> conf = status.getValue();

    if (!conf->isShardingEnabled()) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj);
    if (!request.isOK()) {
        return request.getStatus();
    }

    boost::intrusive_ptr<ExpressionContext> mergeCtx =
        new ExpressionContext(txn, request.getValue());
    mergeCtx->inRouter = true;
    // explicitly *not* setting mergeCtx->tempDir

    LiteParsedPipeline liteParsedPipeline(request.getValue());
    for (auto&& ns : liteParsedPipeline.getInvolvedNamespaces()) {
        uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns()));
        // We won't try to execute anything on a mongos, but we still have to populate this map
        // so that any $lookups etc will be able to have a resolved view definition. It's okay
        // that this is incorrect, we will repopulate the real resolved namespace map on the
        // mongod.
        mergeCtx->resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}};
    }

    if (!conf->isSharded(namespaces.executionNss.ns())) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns());

    // If there was no collation specified, but there is a default collation for the collation,
    // use that.
    if (request.getValue().getCollation().isEmpty() && chunkMgr->getDefaultCollator()) {
        mergeCtx->setCollator(chunkMgr->getDefaultCollator()->clone());
    }

    // Parse and optimize the pipeline specification.
    auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();
    }

    pipeline.getValue()->injectExpressionContext(mergeCtx);
    pipeline.getValue()->optimizePipeline();

    // If the first $match stage is an exact match on the shard key (with a simple collation or
    // no string matching), we only have to send it to one shard, so send the command to that
    // shard.
    BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery();
    BSONObj shardKeyMatches;
    shardKeyMatches = uassertStatusOK(
        chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
    bool singleShard = false;
    if (!shardKeyMatches.isEmpty()) {
        auto chunk = chunkMgr->findIntersectingChunk(
            txn, shardKeyMatches, request.getValue().getCollation());
        if (chunk.isOK()) {
            singleShard = true;
        }
    }

    // Don't need to split pipeline if the first $match is an exact match on shard key, unless
    // there is a stage that needs to be run on the primary shard.
    const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger();
    const bool needSplit = !singleShard || needPrimaryShardMerger;

    // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true,
    // 'pipeline' will become the merger side.
    boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? pipeline.getValue()->splitForSharded()
                                                           : pipeline.getValue());

    // Create the command for the shards. The 'fromRouter' field means produce output to be
    // merged.
    MutableDocument commandBuilder(request.getValue().serializeToCommandObj());
    commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize());
    if (needSplit) {
        commandBuilder[AggregationRequest::kFromRouterName] = Value(true);
        commandBuilder[AggregationRequest::kCursorName] =
            Value(DOC(AggregationRequest::kBatchSizeName << 0));
    }

    // These fields are not part of the AggregationRequest since they are not handled by the
    // aggregation subsystem, so we serialize them separately.
    const std::initializer_list<StringData> fieldsToPropagateToShards = {
        "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS,
    };
    for (auto&& field : fieldsToPropagateToShards) {
        commandBuilder[field] = Value(cmdObj[field]);
    }

    BSONObj shardedCommand = commandBuilder.freeze().toBson();
    BSONObj shardQuery = shardPipeline->getInitialQuery();

    // Run the command on the shards
    // TODO need to make sure cursors are killed if a retry is needed
    std::vector<Strategy::CommandResult> shardResults;
    Strategy::commandOp(txn,
                        dbname,
                        shardedCommand,
                        options,
                        namespaces.executionNss.ns(),
                        shardQuery,
                        request.getValue().getCollation(),
                        &shardResults);

    if (mergeCtx->isExplain) {
        // This must be checked before we start modifying result.
        uassertAllShardsSupportExplain(shardResults);

        if (needSplit) {
            *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline"
                    << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart"
                                        << pipeline.getValue()->writeExplainOps());
        } else {
            *result << "splitPipeline" << BSONNULL;
        }

        BSONObjBuilder shardExplains(result->subobjStart("shards"));
        for (size_t i = 0; i < shardResults.size(); i++) {
            shardExplains.append(shardResults[i].shardTargetId,
                                 BSON("host" << shardResults[i].target.toString() << "stages"
                                             << shardResults[i].result["stages"]));
        }

        return Status::OK();
    }

    if (!needSplit) {
        // Single-shard case: hand the shard's cursor straight back to the client.
        invariant(shardResults.size() == 1);
        invariant(shardResults[0].target.getServers().size() == 1);
        auto executorPool = grid.getExecutorPool();
        const BSONObj reply =
            uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0],
                                                shardResults[0].result,
                                                namespaces.requestedNss,
                                                executorPool->getArbitraryExecutor(),
                                                grid.getCursorManager()));
        result->appendElements(reply);
        return getStatusFromCommandResult(reply);
    }

    // Merger side: consume the shard cursors via $mergeCursors at the head of the pipeline.
    pipeline.getValue()->addInitialSource(
        DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx));

    MutableDocument mergeCmd(request.getValue().serializeToCommandObj());
    mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize());
    mergeCmd["cursor"] = Value(cmdObj["cursor"]);

    if (cmdObj.hasField("$queryOptions")) {
        mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]);
    }

    if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) {
        mergeCmd[QueryRequest::cmdOptionMaxTimeMS] =
            Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]);
    }

    mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"]));
    mergeCmd.setField("readConcern", Value(cmdObj["readConcern"]));

    // If the user didn't specify a collation already, make sure there's a collation attached to
    // the merge command, since the merging shard may not have the collection metadata.
    if (mergeCmd.peek()["collation"].missing()) {
        mergeCmd.setField("collation",
                          mergeCtx->getCollator()
                              ? Value(mergeCtx->getCollator()->getSpec().toBSON())
                              : Value(Document{CollationSpec::kSimpleSpec}));
    }

    std::string outputNsOrEmpty;
    if (DocumentSourceOut* out =
            dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) {
        outputNsOrEmpty = out->getOutputNs().ns();
    }

    // Run merging command on random shard, unless a stage needs the primary shard. Need to use
    // ShardConnection so that the merging mongod is sent the config servers on connection init.
    auto& prng = txn->getClient()->getPrng();
    const auto& mergingShardId = needPrimaryShardMerger
        ? conf->getPrimaryId()
        : shardResults[prng.nextInt32(shardResults.size())].shardTargetId;
    const auto mergingShard = uassertStatusOK(grid.shardRegistry()->getShard(txn, mergingShardId));
    ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty);
    BSONObj mergedResults =
        aggRunCommand(conn.get(), namespaces, mergeCmd.freeze().toBson(), options);
    conn.done();

    if (auto wcErrorElem = mergedResults["writeConcernError"]) {
        appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result);
    }

    // Copy output from merging (primary) shard to the output object from our command.
    // Also, propagates errmsg and code if ok == false.
    result->appendElementsUnique(mergedResults);
    return getStatusFromCommandResult(result->asTempObj());
}
// Attempts to satisfy an allocation of 'lenToAlloc' bytes from the deleted-
// record freelist. Searches the size-bucketed freelists for a best fit
// (looking at up to a few extra candidates for a tighter match), unlinks the
// chosen deleted record, and — when the leftover space is worth keeping —
// splits it and returns the remainder to the freelist. Returns DiskLoc() when
// no suitable deleted record exists (caller must allocate a new extent).
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn,
                                                        int lenToAlloc ) {
    // align size up to a multiple of 4
    lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1);

    freelistAllocs.increment();
    DiskLoc loc;
    {
        DiskLoc *prev = 0;
        DiskLoc *bestprev = 0;
        DiskLoc bestmatch;
        int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough
        int b = bucket(lenToAlloc);
        DiskLoc cur = _details->deletedListEntry(b);

        int extra = 5; // look for a better fit, a little.
        int chain = 0;
        while ( 1 ) {
            { // defensive check
                // Sanity-check the link before following it; a corrupted freelist
                // is a fatal condition.
                int fileNumber = cur.a();
                int fileOffset = cur.getOfs();
                if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) {
                    StringBuilder sb;
                    sb << "Deleted record list corrupted in collection " << _ns
                       << ", bucket " << b
                       << ", link number " << chain
                       << ", invalid link is " << cur.toString()
                       << ", throwing Fatal Assertion";
                    log() << sb.str() << endl;
                    fassertFailed(16469);
                }
            }
            if ( cur.isNull() ) {
                // move to next bucket. if we were doing "extra", just break
                if ( bestmatchlen < INT_MAX )
                    break;

                if ( chain > 0 ) {
                    // if we looked at things in the right bucket, but they were not suitable
                    freelistBucketExhausted.increment();
                }

                b++;
                if ( b > MaxBucket ) {
                    // out of space. alloc a new extent.
                    freelistIterations.increment( 1 + chain );
                    return DiskLoc();
                }
                cur = _details->deletedListEntry(b);
                prev = 0;
                continue;
            }
            DeletedRecord *r = drec(cur);
            if ( r->lengthWithHeaders() >= lenToAlloc &&
                 r->lengthWithHeaders() < bestmatchlen ) {
                // New best fit: remember the record and its predecessor link so we
                // can unlink it later.
                bestmatchlen = r->lengthWithHeaders();
                bestmatch = cur;
                bestprev = prev;
                if (r->lengthWithHeaders() == lenToAlloc)
                    // exact match, stop searching
                    break;
            }
            if ( bestmatchlen < INT_MAX && --extra <= 0 )
                break;
            if ( ++chain > 30 && b <= MaxBucket ) {
                // too slow, force move to next bucket to grab a big chunk
                //b++;
                freelistIterations.increment( chain );
                chain = 0;
                cur.Null();
            }
            else {
                cur = r->nextDeleted();
                prev = &r->nextDeleted();
            }
        }

        // unlink ourself from the deleted list
        DeletedRecord *bmr = drec(bestmatch);
        if ( bestprev ) {
            // Mid-list: patch the predecessor's next pointer (journaled write).
            *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted();
        }
        else {
            // should be the front of a free-list
            int myBucket = bucket(bmr->lengthWithHeaders());
            invariant( _details->deletedListEntry(myBucket) == bestmatch );
            _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted());
        }
        *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive.
        invariant(bmr->extentOfs() < bestmatch.getOfs());

        freelistIterations.increment( 1 + chain );
        loc = bestmatch;
    }

    if ( loc.isNull() )
        return loc;

    // determine if we should chop up
    DeletedRecord *r = drec(loc);

    /* note we want to grab from the front so our next pointers on disk tend
       to go in a forward direction which is important for performance. */
    int regionlen = r->lengthWithHeaders();
    invariant( r->extentOfs() < loc.getOfs() );

    int left = regionlen - lenToAlloc;
    if ( left < 24 || left < (lenToAlloc / 8) ) {
        // you get the whole thing.
        return loc;
    }

    // don't quantize:
    // - $ collections (indexes) as we already have those aligned the way we want SERVER-8425
    if ( _normalCollection ) {
        // we quantize here so that it only impacts newly sized records
        // this prevents oddities with older records and space re-use SERVER-8435
        lenToAlloc = std::min( r->lengthWithHeaders(),
                               quantizeAllocationSpace( lenToAlloc ) );
        left = regionlen - lenToAlloc;

        if ( left < 24 ) {
            // you get the whole thing.
            return loc;
        }
    }

    /* split off some for further use. */
    txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
    DiskLoc newDelLoc = loc;
    newDelLoc.inc(lenToAlloc);
    DeletedRecord* newDel = drec(newDelLoc);
    DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
    newDelW->extentOfs() = r->extentOfs();
    newDelW->lengthWithHeaders() = left;
    newDelW->nextDeleted().Null();

    addDeletedRec( txn, newDelLoc );

    return loc;
}
/**
 * Compacts a single extent of this record store: every live record in the extent at
 * 'diskloc' is re-inserted (re-padded per 'compactOptions') at the end of the collection,
 * after which the now-empty extent is unlinked and freed.
 *
 * @param txn            operation context; all on-disk mutations go through its recovery unit
 * @param diskloc        location of the extent to compact — must be the collection's first
 *                       extent (enforced by invariant below) and not its last
 * @param extentNumber   ordinal of this extent, used only for log messages
 * @param adaptor        validates old records and is notified of each re-inserted record
 * @param compactOptions controls document validation and the padding mode for new records
 * @param stats          corrupt-document counter is incremented here
 *
 * NOTE(review): progress is committed incrementally inside the copy loop, so an
 * interruption mid-extent leaves the extent partially copied — presumably callers
 * tolerate duplicate/partial state here; confirm against the compact command's contract.
 */
void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
                                         const DiskLoc diskloc,
                                         int extentNumber,
                                         RecordStoreCompactAdaptor* adaptor,
                                         const CompactOptions* compactOptions,
                                         CompactStats* stats ) {

    log() << "compact begin extent #" << extentNumber
          << " for namespace " << _ns << " " << diskloc;

    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent *e = _extentManager->getExtent( diskloc );
    e->assertOk();
    // fassert 17437: the extent header must agree with the DiskLoc we were handed.
    fassert( 17437, e->validates(diskloc) );

    {
        // the next/prev pointers within the extent might not be in order so we first
        // page the whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        size_t length = e->length;

        touch_pages( reinterpret_cast<const char*>(e), length );
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;   // total netLength of the records re-inserted from this extent
        long long nrecords = 0;   // count of records copied (corrupt ones are skipped)
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = recordFor(L);
                RecordData oldData = recOld->toRecordData();
                // Advance before copying: after the insert below, 'recOld' may be the
                // last reference we have into the old extent.
                L = getNextRecordInExtent(txn, L);

                if ( compactOptions->validateDocuments && !adaptor->isDataValid( oldData ) ) {
                    // object is corrupt!
                    log() << "compact skipping corrupt document!";
                    stats->corruptDocuments++;
                }
                else {
                    unsigned dataSize = adaptor->dataSize( oldData );
                    unsigned docSize = dataSize;

                    nrecords++;
                    oldObjSize += docSize;
                    oldObjSizeWithPadding += recOld->netLength();

                    unsigned lenWHdr = docSize + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;

                    // Decide the on-disk size of the re-inserted record per the padding mode.
                    switch( compactOptions->paddingMode ) {
                    case CompactOptions::NONE:
                        if ( _details->isUserFlagSet(Flag_UsePowerOf2Sizes) )
                            lenWPadding = quantizePowerOf2AllocationSpace(lenWPadding);
                        break;
                    case CompactOptions::PRESERVE:
                        // if we are preserving the padding, the record should not change size
                        lenWPadding = recOld->lengthWithHeaders();
                        break;
                    case CompactOptions::MANUAL:
                        lenWPadding = compactOptions->computeRecordSize(lenWPadding);
                        // Reject nonsensical user-computed sizes: smaller than the record
                        // itself, or implausibly large.
                        if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                        break;
                    }

                    // Re-insert the record (copied out of the old extent) at the target size
                    // and tell the adaptor where it landed.
                    CompactDocWriter writer( recOld, dataSize, lenWPadding );
                    StatusWith<DiskLoc> status = insertRecord( txn, &writer, false );
                    uassertStatusOK( status.getStatus() );
                    datasize += recordFor( status.getValue() )->netLength();
                    adaptor->inserted( dataFor( txn, status.getValue() ), status.getValue() );
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent.  it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block
                // doesn't get too large
                bool stopping = false;
                RARELY stopping = !txn->checkForInterruptNoAssert().isOK();
                if( stopping || txn->recoveryUnit()->isCommitNeeded() ) {
                    // Detach everything already copied: make 'L' the extent's first record
                    // and clear its prev pointer, then commit the batch.
                    *txn->recoveryUnit()->writing(&e->firstRecord) = L;
                    Record *r = recordFor(L);
                    txn->recoveryUnit()->writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    txn->recoveryUnit()->commitIfNeeded();
                    txn->checkForInterrupt();
                }
            }
        } // if !L.isNull()

        // Unlink the (now fully copied) extent from the front of the extent chain and
        // return it to the extent manager.  We only ever compact the first extent, and
        // never the last one, so 'xnext' is a valid new head.
        invariant( _details->firstExtent(txn) == diskloc );
        invariant( _details->lastExtent(txn) != diskloc );
        DiskLoc newFirst = e->xnext;
        _details->setFirstExtent( txn, newFirst );
        *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc();
        _extentManager->freeExtent( txn, diskloc );

        txn->recoveryUnit()->commitIfNeeded();

        {
            // Report the old padding factor: bytes-with-padding / raw object bytes.
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                  << " documents (" << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' '
                  << static_cast<unsigned>(op*100.0)/100;
        }
    }
}
Pipeline::SourceContainer::iterator DocumentSourceSequentialDocumentCache::doOptimizeAt(
    Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container) {
    // Before this optimization runs, the cache stage must sit at the very end of the
    // pipeline; all earlier stages have already settled into the positions they would
    // occupy if no cache stage existed.
    invariant(_hasOptimizedPos || std::next(itr) == container->end());
    invariant((*itr).get() == this);

    // A previously-positioned cache stage stays put.
    if (_hasOptimizedPos) {
        return std::next(itr);
    }

    // Record that positioning has now been performed.
    _hasOptimizedPos = true;

    // Nothing to reposition around if the cache is the sole pipeline stage.
    if (itr == container->begin()) {
        return container->end();
    }

    // Detach this stage from the tail of the pipeline; it will be re-inserted at the
    // boundary between the non-correlated prefix and the correlated suffix.
    auto detachedCache = std::move(*itr);
    container->erase(itr);

    // Variables defined in the current scope; a stage that refers to any of these is
    // "correlated" and cannot sit in front of the cache.
    auto definedVars = pExpCtx->variablesParseState.getDefinedVariableIDs();

    // We only care about which external variables the pipeline references. We are not
    // validating metadata availability here (that happens elsewhere), so claim that all
    // metadata is available to keep dependency analysis from asserting.
    DepsTracker depsTracker(DepsTracker::kAllMetadataAvailable);

    // Walk forward until a stage references one of the scope's variables; that stage
    // begins the correlated suffix.
    auto splitPoint = container->begin();
    while (splitPoint != container->end()) {
        (*splitPoint)->getDependencies(&depsTracker);
        if (depsTracker.hasVariableReferenceTo(definedVars)) {
            break;
        }
        ++splitPoint;
    }

    // A split at the very first stage means the whole pipeline is correlated: caching
    // cannot help, so give up on the cache entirely.
    if (splitPoint == container->begin()) {
        _cache->abandon();
        return container->end();
    }

    // Once the cache is serving stored results, the non-correlated prefix is redundant
    // and can be dropped from the pipeline.
    if (_cache->isServing()) {
        container->erase(container->begin(), splitPoint);
    }

    // Re-insert the cache stage immediately before the correlated suffix.
    container->insert(splitPoint, std::move(detachedCache));

    return container->end();
}
/**
 * Clones collection 'shortFrom' in 'db' into a brand-new capped collection 'shortTo'
 * of capacity 'size' bytes, copying documents in natural (forward) order.
 *
 * Fails with CommandNotSupportedOnView / NamespaceNotFound / NamespaceExists before any
 * write occurs if the source is a view, missing, or drop-pending, or if the destination
 * already exists.  Document validation is disabled for the inserts.  The destination is
 * created with the source's options but a fresh UUID, since the conversion is not
 * reversible and therefore cannot be rolled back.
 *
 * @param opCtx  operation context (caller is expected to hold the necessary locks —
 *               the comment in the FAILURE branch below assumes at least a read lock
 *               on the source)
 * @param db     database containing both source and destination
 * @param size   capped size in bytes for the new collection
 * @param temp   if true, the new collection is created as temporary
 * @return Status::OK() on success, or the first error encountered
 */
mongo::Status mongo::cloneCollectionAsCapped(OperationContext* opCtx,
                                             Database* db,
                                             const std::string& shortFrom,
                                             const std::string& shortTo,
                                             long long size,
                                             bool temp) {
    NamespaceString fromNss(db->name(), shortFrom);
    NamespaceString toNss(db->name(), shortTo);

    Collection* fromCollection = db->getCollection(opCtx, fromNss);
    if (!fromCollection) {
        // Distinguish "it's a view" from "it doesn't exist" for a clearer error.
        if (db->getViewCatalog()->lookup(opCtx, fromNss.ns())) {
            return Status(ErrorCodes::CommandNotSupportedOnView,
                          str::stream() << "cloneCollectionAsCapped not supported for views: "
                                        << fromNss.ns());
        }
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns()
                                    << " does not exist");
    }

    if (fromNss.isDropPendingNamespace()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "source collection " << fromNss.ns()
                                    << " is currently in a drop-pending state.");
    }

    if (db->getCollection(opCtx, toNss)) {
        return Status(ErrorCodes::NamespaceExists,
                      str::stream() << "cloneCollectionAsCapped failed - destination collection "
                                    << toNss.ns() << " already exists. source collection: "
                                    << fromNss.ns());
    }

    // create new collection
    {
        auto options = fromCollection->getCatalogEntry()->getCollectionOptions(opCtx);
        // The capped collection will get its own new unique id, as the conversion isn't
        // reversible, so it can't be rolled back.
        options.uuid.reset();
        options.capped = true;
        options.cappedSize = size;
        if (temp)
            options.temp = true;

        // Build and run a "create" command so the destination inherits the (adjusted)
        // source options.
        BSONObjBuilder cmd;
        cmd.append("create", toNss.coll());
        cmd.appendElements(options.toBSON());
        Status status = createCollection(opCtx, toNss.db().toString(), cmd.done());
        if (!status.isOK())
            return status;
    }

    Collection* toCollection = db->getCollection(opCtx, toNss);
    invariant(toCollection);  // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'
    long long allocatedSpaceGuess =
        std::max(static_cast<long long>(size * 2),
                 static_cast<long long>(toCollection->getRecordStore()->storageSize(opCtx) * 2));

    // Bytes of leading source data to skip: they would be evicted from the capped
    // destination anyway.  Decremented per skipped document below.
    long long excessSize = fromCollection->dataSize(opCtx) - allocatedSpaceGuess;

    auto exec = InternalPlanner::collectionScan(opCtx,
                                                fromNss.ns(),
                                                fromCollection,
                                                PlanExecutor::WRITE_CONFLICT_RETRY_ONLY,
                                                InternalPlanner::FORWARD);

    Snapshotted<BSONObj> objToClone;
    RecordId loc;
    PlanExecutor::ExecState state = PlanExecutor::FAILURE;  // suppress uninitialized warnings

    DisableDocumentValidation validationDisabler(opCtx);

    int retries = 0;  // non-zero when retrying our last document.
    while (true) {
        // On a retry we re-attempt the same document, so don't advance the scan.
        if (!retries) {
            state = exec->getNextSnapshotted(&objToClone, &loc);
        }

        switch (state) {
            case PlanExecutor::IS_EOF:
                return Status::OK();
            case PlanExecutor::ADVANCED: {
                if (excessSize > 0) {
                    // 4x is for padding, power of 2, etc...
                    excessSize -= (4 * objToClone.value().objsize());
                    continue;
                }
                break;
            }
            default:
                // Unreachable as:
                // 1) We require a read lock (at a minimum) on the "from" collection
                //    and won't yield, preventing collection drop and PlanExecutor::DEAD
                // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The
                //    CollectionScan PlanStage does not have a FAILURE scenario.
                // 3) All other PlanExecutor states are handled above
                MONGO_UNREACHABLE;
        }

        try {
            // Make sure we are working with the latest version of the document.
            if (objToClone.snapshotId() != opCtx->recoveryUnit()->getSnapshotId() &&
                !fromCollection->findDoc(opCtx, loc, &objToClone)) {
                // doc was deleted so don't clone it.
                retries = 0;
                continue;
            }

            WriteUnitOfWork wunit(opCtx);
            OpDebug* const nullOpDebug = nullptr;
            uassertStatusOK(toCollection->insertDocument(
                opCtx, InsertStatement(objToClone.value()), nullOpDebug, true));
            wunit.commit();

            // Go to the next document
            retries = 0;
        } catch (const WriteConflictException&) {
            CurOp::get(opCtx)->debug().writeConflicts++;
            retries++;  // logAndBackoff expects this to be 1 on first call.
            WriteConflictException::logAndBackoff(retries, "cloneCollectionAsCapped",
                                                  fromNss.ns());

            // Can't use writeConflictRetry since we need to save/restore exec around call to
            // abandonSnapshot.
            exec->saveState();
            opCtx->recoveryUnit()->abandonSnapshot();
            auto restoreStatus = exec->restoreState();  // Handles any WCEs internally.
            if (!restoreStatus.isOK()) {
                return restoreStatus;
            }
        }
    }

    MONGO_UNREACHABLE;
}