bool run(OperationContext* txn, const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { Client::ReadContext ctx( txn, dbname ); const Database* d = ctx.ctx().db(); const DatabaseCatalogEntry* dbEntry = d->getDatabaseCatalogEntry(); list<string> names; dbEntry->getCollectionNamespaces( &names ); BSONArrayBuilder arr; for ( list<string>::const_iterator i = names.begin(); i != names.end(); ++i ) { string ns = *i; BSONObjBuilder b; b.append( "name", nsToCollectionSubstring( ns ) ); CollectionOptions options = dbEntry->getCollectionCatalogEntry( txn, ns )->getCollectionOptions(txn); b.append( "options", options.toBSON() ); arr.append( b.obj() ); } result.append( "collections", arr.arr() ); return true; }
// Drops every collection in this database whose catalog options carry
// { temp: true }. Each drop runs in its own WriteUnitOfWork; a failed drop
// is logged and skipped so the sweep continues with remaining collections.
void Database::clearTmpCollections(OperationContext* txn) {
    // Caller must already hold the database write lock.
    txn->lockState()->assertWriteLocked( _name );

    list<string> collections;
    _dbEntry->getCollectionNamespaces( &collections );

    for ( list<string>::iterator i = collections.begin(); i != collections.end(); ++i ) {
        string ns = *i;
        invariant( NamespaceString::normal( ns ) );

        CollectionCatalogEntry* coll = _dbEntry->getCollectionCatalogEntry( txn, ns );

        CollectionOptions options = coll->getCollectionOptions( txn );
        // Only collections created with { temp: true } are swept.
        if ( !options.temp )
            continue;

        WriteUnitOfWork wunit(txn);
        Status status = dropCollection( txn, ns );
        if ( !status.isOK() ) {
            // Best effort: warn and move on; the unit of work is abandoned uncommitted.
            warning() << "could not drop temp collection '" << ns << "': " << status;
            continue;
        }

        string cmdNs = _name + ".$cmd";
        // Replicate the drop so secondaries remove the temp collection too.
        repl::logOp( txn, "c", cmdNs.c_str(), BSON( "drop" << nsToCollectionSubstring( ns ) ) );
        wunit.commit();
    }
}
// Returns the catalog options recorded for "ns" by scanning the
// system.namespaces record store. Returns default-constructed options when
// no entry exists — including for system.namespaces itself, which is never
// listed in the catalog.
CollectionOptions MMAPV1DatabaseCatalogEntry::getCollectionOptions( OperationContext* txn,
                                                                    const StringData& ns ) const {
    // The catalog collection has no entry of its own.
    if ( nsToCollectionSubstring( ns ) == "system.namespaces" ) {
        return CollectionOptions();
    }

    RecordStoreV1Base* nsRecordStore = _getNamespaceRecordStore();
    invariant( nsRecordStore );

    scoped_ptr<RecordIterator> iter( nsRecordStore->getIterator(txn) );
    for ( ; !iter->isEOF(); ) {
        const DiskLoc recordLoc = iter->getNext();
        const BSONObj catalogEntry = iter->dataFor( recordLoc ).toBson();

        BSONElement nameElt = catalogEntry["name"];
        if ( nameElt.type() != String || nameElt.String() != ns )
            continue;

        // Found the entry for "ns": parse its options subdocument, if any.
        CollectionOptions parsedOptions;
        if ( catalogEntry["options"].isABSONObj() ) {
            Status parseStatus = parsedOptions.parse( catalogEntry["options"].Obj() );
            fassert( 18523, parseStatus );
        }
        return parsedOptions;
    }

    return CollectionOptions();
}
void insertObjects(const char *ns, const vector<BSONObj> &objs, bool keepGoing, uint64_t flags, bool logop ) { StringData _ns(ns); if (NamespaceString::isSystem(_ns)) { massert(16748, "need transaction to run insertObjects", cc().txnStackSize() > 0); uassert(10095, "attempt to insert in reserved database name 'system'", nsToDatabaseSubstring(_ns) != "system"); massert(16750, "attempted to insert multiple objects into a system namspace at once", objs.size() == 1); // Trying to insert into a system collection. Fancy side-effects go here: if (nsToCollectionSubstring(ns) == "system.indexes") { BSONObj obj = stripDropDups(objs[0]); NamespaceDetails *d = getAndMaybeCreateNS(obj["ns"].Stringdata(), logop); bool ok = d->ensureIndex(obj); if (!ok) { // Already had that index return; } // Now we have to actually insert that document into system.indexes, we may have // modified it with stripDropDups. vector<BSONObj> newObjs; newObjs.push_back(obj); _insertObjects(ns, newObjs, keepGoing, flags, logop); return; } else if (!legalClientSystemNS(ns, true)) { uasserted(16459, str::stream() << "attempt to insert in system namespace '" << ns << "'"); } } _insertObjects(ns, objs, keepGoing, flags, logop); }
// listCollections-style command: snapshots the database's namespaces under
// an IS transaction + S database lock, optionally applies the user-supplied
// "filter" match expression to each { name, options } document, and returns
// the surviving entries in a "collections" array. system.namespaces itself
// is never reported. Lock order (ScopedTransaction before AutoGetDb) is
// significant.
bool run(OperationContext* txn, const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetDb autoDb(txn, dbname, MODE_S);

    const Database* d = autoDb.getDb();
    const DatabaseCatalogEntry* dbEntry = NULL;

    // A missing database simply yields an empty (but valid) result array.
    list<string> names;
    if ( d ) {
        dbEntry = d->getDatabaseCatalogEntry();
        dbEntry->getCollectionNamespaces( &names );
        names.sort();
    }

    scoped_ptr<MatchExpression> matcher;
    if ( jsobj["filter"].isABSONObj() ) {
        StatusWithMatchExpression parsed = MatchExpressionParser::parse( jsobj["filter"].Obj() );
        if ( !parsed.isOK() ) {
            // Malformed filter: surface the parse error to the client.
            return appendCommandStatus( result, parsed.getStatus() );
        }
        // Take ownership of the parsed expression.
        matcher.reset( parsed.getValue() );
    }

    BSONArrayBuilder arr;
    for ( list<string>::const_iterator i = names.begin(); i != names.end(); ++i ) {
        string ns = *i;

        StringData collection = nsToCollectionSubstring( ns );
        if ( collection == "system.namespaces" ) {
            continue;
        }

        BSONObjBuilder b;
        b.append( "name", collection );

        CollectionOptions options =
            dbEntry->getCollectionCatalogEntry( txn, ns )->getCollectionOptions(txn);
        b.append( "options", options.toBSON() );

        // The filter is matched against the fully built { name, options } doc.
        BSONObj maybe = b.obj();
        if ( matcher && !matcher->matchesBSON( maybe ) ) {
            continue;
        }

        arr.append( maybe );
    }

    result.append( "collections", arr.arr() );
    return true;
}
// returns true if the operation should run in an alternate // transaction stack instead of the possible multi statement // transaction stack that it is a part of. Several operations/statements, // such as authentication, should not run static bool opNeedsAltTxn(const StringData &ns) { // for now, the only operations that need to run in an // alternate transaction stack are authentication related // operations. We do not want them to be part of multi statement // transactions. return nsToCollectionSubstring(ns) == "system.users"; }
// Read-scoped RAII helper: takes the intent-shared transaction resource,
// then the IS database lock, then the IS collection lock, and finally
// resolves the Collection* via _init. The locks are acquired by the member
// initializers, so acquisition order follows member declaration order —
// keep _transaction/_db/_collLock declared coarsest-first.
AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* txn, const std::string& ns)
    : _txn(txn),
      _transaction(txn, MODE_IS),
      _db(_txn, nsToDatabaseSubstring(ns), MODE_IS),
      _collLock(_txn->lockState(), ns, MODE_IS),
      _coll(NULL) {
    _init(ns, nsToCollectionSubstring(ns));
}
// Drops every collection belonging to this database, then tears down the
// map's own metadata dictionary. Order matters: ordinary collections first,
// then system.namespaces, then system.indexes entries last (because drop()
// deletes from system.indexes while it runs).
void CollectionMap::drop() {
    Lock::assertWriteLocked(_database);
    init();
    if (!allocated()) {
        // Nothing on disk for this database; nothing to do.
        return;
    }

    string errmsg;
    BSONObjBuilder result;

    // This implementation is inefficient and slightly messy, but it was easy.
    // Feel free to improve it as necessary:
    // - The getCursor call will grab a table lock on .system.namespaces.
    // - We'll look at the entire system.namespaces collection just for one database.
    // - Code is duplicated to handle dropping system system collections in stages.
    vector<string> sysIndexesEntries;
    const string systemNamespacesNs = getSisterNS(_database, "system.namespaces");
    Collection *sysCl = getCollection(systemNamespacesNs);
    for (shared_ptr<Cursor> c(Cursor::make(sysCl)); c->ok(); c->advance()) {
        const BSONObj nsObj = c->current();
        const StringData ns = nsObj["name"].Stringdata();
        if (nsToDatabaseSubstring(ns) != _database) {
            // Not part of this database, skip.
            continue;
        }
        if (nsToCollectionSubstring(ns) == "system.indexes") {
            // Save .system.indexes collection for last, because drop() deletes from it.
            sysIndexesEntries.push_back(ns.toString());
        } else {
            Collection *cl = getCollection(ns);
            if (cl != NULL) {
                cl->drop(errmsg, result, true);
            }
        }
    }
    if (sysCl != NULL) {
        // The .system.namespaces collection does not include itself.
        sysCl->drop(errmsg, result, true);
    }

    // Now drop the system.indexes entries.
    for (vector<string>::const_iterator it = sysIndexesEntries.begin(); it != sysIndexesEntries.end(); it++) {
        // Need to close any existing handle before drop.
        Collection *cl = getCollection(*it);
        if (cl != NULL) {
            cl->drop(errmsg, result, true);
        }
    }

    // Everything that was open should have been closed due to drop.
    verify(_collections.empty());

    // Finally, close and remove the metadata dictionary itself.
    shared_ptr<storage::Dictionary> metadb = _metadb;
    _metadb.reset();
    const int r = metadb->close();
    if (r != 0) {
        storage::handle_ydb_error(r);
    }
    storage::db_remove(_metadname);
}
// Copies ops out of the bgsync queue into the deque passed in as a parameter. // Returns true if the batch should be ended early. // Batch should end early if we encounter a command, or if // there are no further ops in the bgsync queue to read. // This function also blocks 1 second waiting for new ops to appear in the bgsync // queue. We can't block forever because there are maintenance things we need // to periodically check in the loop. bool SyncTail::tryPopAndWaitForMore(SyncTail::OpQueue* ops, ReplicationCoordinator* replCoord) { BSONObj op; // Check to see if there are ops waiting in the bgsync queue bool peek_success = peek(&op); if (!peek_success) { // if we don't have anything in the queue, wait a bit for something to appear if (ops->empty()) { replCoord->signalDrainComplete(); // block up to 1 second _networkQueue->waitForMore(); return false; } // otherwise, apply what we have return true; } const char* ns = op["ns"].valuestrsafe(); // check for commands if ((op["op"].valuestrsafe()[0] == 'c') || // Index builds are acheived through the use of an insert op, not a command op. // The following line is the same as what the insert code uses to detect an index build. ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) { if (ops->empty()) { // apply commands one-at-a-time ops->push_back(op); _networkQueue->consume(); } // otherwise, apply what we have so far and come back for the command return true; } // check for oplog version change BSONElement elemVersion = op["v"]; int curVersion = 0; if (elemVersion.eoo()) // missing version means version 1 curVersion = 1; else curVersion = elemVersion.Int(); if (curVersion != OPLOG_VERSION) { severe() << "expected oplog version " << OPLOG_VERSION << " but found version " << curVersion << " in oplog entry: " << op; fassertFailedNoTrace(18820); } // Copy the op to the deque and remove it from the bgsync queue. ops->push_back(op); _networkQueue->consume(); // Go back for more ops return false; }
// Copies ops out of the bgsync queue into the deque passed in as a parameter.
// Returns true if the batch should be ended early.
// Batch should end early if we encounter a command, or if
// there are no further ops in the bgsync queue to read.
// This function also blocks 1 second waiting for new ops to appear in the bgsync
// queue. We can't block forever because there are maintenance things we need
// to periodically check in the loop.
bool SyncTail::tryPopAndWaitForMore(OperationContext* txn, SyncTail::OpQueue* ops) {
    BSONObj op;
    // Check to see if there are ops waiting in the bgsync queue
    bool peek_success = peek(&op);

    if (!peek_success) {
        // if we don't have anything in the queue, wait a bit for something to appear
        if (ops->empty()) {
            // block up to 1 second
            _networkQueue->waitForMore();
            return false;
        }

        // otherwise, apply what we have
        return true;
    }

    auto entry = OplogEntry(op);

    // Check for ops that must be processed one at a time.
    if (entry.raw.isEmpty() ||       // sentinel that network queue is drained.
        (entry.opType[0] == 'c') ||  // commands.
        // Index builds are achieved through the use of an insert op, not a command op.
        // The following line is the same as what the insert code uses to detect an index build.
        (!entry.ns.empty() && nsToCollectionSubstring(entry.ns) == "system.indexes")) {
        if (ops->empty()) {
            // apply commands one-at-a-time
            ops->push_back(std::move(entry));
            _networkQueue->consume();
        }

        // otherwise, apply what we have so far and come back for the command
        return true;
    }

    // check for oplog version change
    int curVersion = 0;
    if (entry.version.eoo())
        // missing version means version 1
        curVersion = 1;
    else
        curVersion = entry.version.Int();
    if (curVersion != OPLOG_VERSION) {
        severe() << "expected oplog version " << OPLOG_VERSION << " but found version "
                 << curVersion << " in oplog entry: " << op;
        fassertFailedNoTrace(18820);
    }

    // Copy the op to the deque and remove it from the bgsync queue.
    ops->push_back(std::move(entry));
    _networkQueue->consume();

    // Go back for more ops
    return false;
}
// Handles an OP_INSERT wire message: authorizes the write (index writes are
// auth-checked later), collects the batch of documents, dispatches
// background ("hot") index builds, runs sharding checks for ordinary
// collections, and finally performs the insert under a DB read lock —
// retrying once under a write lock if the locked path requests it.
void receivedInsert(Message& m, CurOp& op) {
    DbMessage d(m);
    const char *ns = d.getns();
    op.debug().ns = ns;
    StringData coll = nsToCollectionSubstring(ns);

    // Auth checking for index writes happens later.
    if (coll != "system.indexes") {
        Status status = cc().getAuthorizationManager()->checkAuthForInsert(ns);
        uassert(16544, status.reason(), status.isOK());
    }

    if (!d.moreJSObjs()) {
        // strange.  should we complain?
        return;
    }

    vector<BSONObj> objs;
    while (d.moreJSObjs()) {
        objs.push_back(d.nextJsObj());
    }

    const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError;

    OpSettings settings;
    settings.setQueryCursorMode(WRITE_LOCK_CURSOR);
    cc().setOpSettings(settings);

    if (coll == "system.indexes" &&
        // Can only build non-unique indexes in the background, because the
        // hot indexer does not know how to perform unique checks.
        objs[0]["background"].trueValue() && !objs[0]["unique"].trueValue()) {
        _buildHotIndex(ns, m, objs);
        return;
    }

    // Index writes skip the sharded-message check; everything else goes
    // through it (and may bail out if the message was handled there).
    scoped_ptr<Client::ShardedOperationScope> scp;
    if (coll != "system.indexes") {
        scp.reset(new Client::ShardedOperationScope);
        if (scp->handlePossibleShardedMessage(m, 0)) {
            return;
        }
    }

    LOCK_REASON(lockReason, "insert");
    try {
        Lock::DBRead lk(ns, lockReason);
        lockedReceivedInsert(ns, m, objs, op, keepGoing);
    }
    catch (RetryWithWriteLock &e) {
        // The read-locked path asked for a write lock; retry the whole insert.
        Lock::DBWrite lk(ns, lockReason);
        lockedReceivedInsert(ns, m, objs, op, keepGoing);
    }
}
void IndexDetails::kill_idx() { const string ns = indexNamespace(); const string parentns = parentNS(); close(); storage::db_remove(ns); // Removing this index's ns from the system.indexes/namespaces catalog. removeNamespaceFromCatalog(ns); if (nsToCollectionSubstring(parentns) != "system.indexes") { removeFromSysIndexes(parentns, indexName()); } }
/* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
   caller must check if capped
*/
void DataFileMgr::_deleteRecord(NamespaceDetails *d, const StringData& ns, Record *todelete, const DiskLoc& dl) {
    /* remove ourself from the record next/prev chain */
    {
        // All mutations below go through getDur().writing*() before being applied.
        if ( todelete->prevOfs() != DiskLoc::NullOfs )
            getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs() ) = todelete->nextOfs();
        if ( todelete->nextOfs() != DiskLoc::NullOfs )
            getDur().writingInt( todelete->getNext(dl).rec()->prevOfs() ) = todelete->prevOfs();
    }

    /* remove ourself from extent pointers */
    {
        Extent *e = getDur().writing( todelete->myExtent(dl) );
        // If we were the extent's first/last record, advance that pointer to
        // our neighbor (or null it when we were the only record).
        if ( e->firstRecord == dl ) {
            if ( todelete->nextOfs() == DiskLoc::NullOfs )
                e->firstRecord.Null();
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs() );
        }
        if ( e->lastRecord == dl ) {
            if ( todelete->prevOfs() == DiskLoc::NullOfs )
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs() );
        }
    }

    /* add to the free list */
    {
        d->incrementStats( -1 * todelete->netLength(), -1 );

        if ( nsToCollectionSubstring(ns) == "system.indexes") {
            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
               careful until validated more, as IndexDetails has pointers to this
               disk location.  so an incorrectly done remove would cause a lot of problems.
            */
            memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders() ), 0, todelete->lengthWithHeaders() );
        }
        else {
            DEV {
                // Debug builds: scribble on the first 8 bytes to catch invalid reuse.
                unsigned long long *p = reinterpret_cast<unsigned long long *>( todelete->data() );
                *getDur().writing(p) = 0;
                //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
            }
            d->addDeletedRec((DeletedRecord*)todelete, dl);
        }
    }
}
void SyncClusterConnection::insert( const string &ns, BSONObj obj , int flags) { uassert(13119, str::stream() << "SyncClusterConnection::insert obj has to have an _id: " << obj, nsToCollectionSubstring(ns) == "system.indexes" || obj["_id"].type()); string errmsg; if ( ! prepare( errmsg ) ) throw UserException( 8003 , (string)"SyncClusterConnection::insert prepare failed: " + errmsg ); for ( size_t i=0; i<_conns.size(); i++ ) { _conns[i]->insert( ns , obj , flags); } _checkLast(); }
/** Creates collection "ns" in "db" with creation options such as
 *  { ..., capped: true, size: ..., max: ... }.
 * @param createDefaultIndexes - if false, defers id (and other) index creation.
 * @return Status::OK() on success; InvalidNamespace, NamespaceExists, or an
 *         options/storage-engine validation error otherwise.
 */
Status userCreateNS( OperationContext* txn,
                     Database* db,
                     StringData ns,
                     BSONObj options,
                     bool logForReplication,
                     bool createDefaultIndexes ) {
    invariant( db );

    LOG(1) << "create collection " << ns << ' ' << options;

    if ( !NamespaceString::validCollectionComponent(ns) )
        return Status( ErrorCodes::InvalidNamespace,
                       str::stream() << "invalid ns: " << ns );

    Collection* collection = db->getCollection( ns );

    if ( collection )
        return Status( ErrorCodes::NamespaceExists,
                       "collection already exists" );

    CollectionOptions collectionOptions;
    Status status = collectionOptions.parse(options);
    if ( !status.isOK() )
        return status;

    // Give the storage engine a chance to veto engine-specific options
    // before anything is created.
    status = validateStorageOptions(collectionOptions.storageEngine,
                                    &StorageEngine::Factory::validateCollectionStorageOptions);
    if ( !status.isOK() )
        return status;

    invariant( db->createCollection( txn, ns, collectionOptions, true, createDefaultIndexes ) );

    if ( logForReplication ) {
        // Normalize the oplog entry to a { create: <coll>, ...options } command
        // document if the caller did not already pass one.
        if ( options.getField( "create" ).eoo() ) {
            BSONObjBuilder b;
            b << "create" << nsToCollectionSubstring( ns );
            b.appendElements( options );
            options = b.obj();
        }

        string logNs = nsToDatabase(ns) + ".$cmd";
        repl::logOp(txn, "c", logNs.c_str(), options);
    }

    return Status::OK();
}
// Acquires the exclusive DB lock and Client::Context needed to run a batch
// of inserts, then runs the master/shard-version/index-constraint checks in
// order. Implicitly creates the target collection (and logs the create op
// for replication) when it does not exist. Returns false — with *result
// populated — when any check or the implicit create fails.
bool WriteBatchExecutor::ExecInsertsState::_lockAndCheckImpl(WriteOpResult* result) {
    if (hasLock()) {
        // Already locked from a previous call: just re-enter the context.
        txn->getCurOp()->enter(_context.get());
        return true;
    }

    invariant(!_context.get());
    _writeLock.reset(new Lock::DBLock(txn->lockState(),
                                      nsToDatabase(request->getNS()),
                                      newlm::MODE_X));
    if (!checkIsMasterForDatabase(request->getNS(), result)) {
        return false;
    }
    if (!checkShardVersion(txn, &shardingState, *request, result)) {
        return false;
    }
    if (!checkIndexConstraints(txn, &shardingState, *request, result)) {
        return false;
    }

    _context.reset(new Client::Context(txn, request->getNS(), false));

    Database* database = _context->db();
    dassert(database);
    _collection = database->getCollection(txn, request->getTargetingNS());
    if (!_collection) {
        WriteUnitOfWork wunit (txn);
        // Implicitly create if it doesn't exist
        _collection = database->createCollection(txn, request->getTargetingNS());
        if (!_collection) {
            result->setError(
                toWriteError(Status(ErrorCodes::InternalError,
                                    "could not create collection " +
                                    request->getTargetingNS())));
            return false;
        }
        // Log the implicit create so secondaries build the collection too.
        repl::logOp(txn,
                    "c",
                    (database->name() + ".$cmd").c_str(),
                    BSON("create" << nsToCollectionSubstring(request->getTargetingNS())));
        wunit.commit();
    }
    return true;
}
bool run(OperationContext* txn, const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { Lock::DBRead lk( txn->lockState(), dbname ); const Database* d = dbHolder().get( txn, dbname ); const DatabaseCatalogEntry* dbEntry = NULL; list<string> names; if ( d ) { dbEntry = d->getDatabaseCatalogEntry(); dbEntry->getCollectionNamespaces( &names ); names.sort(); } BSONArrayBuilder arr; for ( list<string>::const_iterator i = names.begin(); i != names.end(); ++i ) { string ns = *i; StringData collection = nsToCollectionSubstring( ns ); if ( collection == "system.namespaces" ) { continue; } BSONObjBuilder b; b.append( "name", collection ); CollectionOptions options = dbEntry->getCollectionCatalogEntry( txn, ns )->getCollectionOptions(txn); b.append( "options", options.toBSON() ); arr.append( b.obj() ); } result.append( "collections", arr.arr() ); return true; }
// Records namespace "ns" (and, when present and non-empty, its creation
// options) as a document in the system.namespaces record store.
// system.namespaces itself is never listed there.
void MMAPV1DatabaseCatalogEntry::_addNamespaceToNamespaceCollection( OperationContext* txn,
                                                                     const StringData& ns,
                                                                     const BSONObj* options ) {
    if ( nsToCollectionSubstring( ns ) == "system.namespaces" ) {
        // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
        return;
    }

    BSONObjBuilder entryBuilder;
    entryBuilder.append("name", ns);
    if ( options && !options->isEmpty() ) {
        entryBuilder.append("options", *options);
    }
    BSONObj entry = entryBuilder.done();

    RecordStoreV1Base* nsRecordStore = _getNamespaceRecordStore( txn, ns );
    invariant( nsRecordStore );

    StatusWith<DiskLoc> insertResult =
        nsRecordStore->insertRecord( txn, entry.objdata(), entry.objsize(), -1 );
    massertStatusOK( insertResult.getStatus() );
}
// Deletes the catalog record for "ns" from the system.namespaces record
// store, if one exists. system.namespaces itself has no record, so it is
// left untouched.
void MMAPV1DatabaseCatalogEntry::_removeNamespaceFromNamespaceCollection( OperationContext* txn,
                                                                          const StringData& ns ) {
    if ( nsToCollectionSubstring( ns ) == "system.namespaces" ) {
        // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
        return;
    }

    RecordStoreV1Base* nsRecordStore = _getNamespaceRecordStore();
    invariant( nsRecordStore );

    scoped_ptr<RecordIterator> iter( nsRecordStore->getIterator(txn) );
    for ( ; !iter->isEOF(); ) {
        const DiskLoc recordLoc = iter->getNext();
        const BSONObj catalogEntry = iter->dataFor( recordLoc ).toBson();

        BSONElement nameElt = catalogEntry["name"];
        if ( nameElt.type() == String && nameElt.String() == ns ) {
            // Found the matching entry; remove it and stop scanning.
            nsRecordStore->deleteRecord( txn, recordLoc );
            break;
        }
    }
}
// Validates an insert that may target a system collection: flags
// system.indexes inserts (wouldAddIndex), sanity-checks user documents, and
// rejects writes to other system namespaces unless god mode is set.
// "sys" points at the ".system." substring within ns (caller-computed).
bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) {
    // sys == ns would mean the namespace *starts* with ".system." — i.e. the
    // reserved database name "system".
    uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
    if ( strstr(ns, ".system.") ) {
        // later:check for dba-type permissions here if have that at some point separate
        if (nsToCollectionSubstring(ns) == "system.indexes")
            wouldAddIndex = true;
        else if ( legalClientSystemNS( ns , true ) ) {
            // NOTE(review): this compares the FULL namespace against the
            // literal ".system.users", which can never equal a
            // "<db>.system.users" namespace — the user-document validation
            // below looks unreachable. Presumably a suffix/collection check
            // was intended; confirm against the original catalog code.
            if ( obuf && StringData(ns) == StringData(".system.users", StringData::LiteralTag()) ) {
                BSONObj t( reinterpret_cast<const char *>( obuf ) );
                V2UserDocumentParser parser;
                uassertStatusOK(parser.checkValidUserDocument(t));
            }
        }
        else if ( !god ) {
            uasserted(16459, str::stream() << "attempt to insert in system namespace '" << ns << "'");
        }
    }
    return true;
}
// Drops all collections in this database whose catalog options carry
// { temp: true }. Each drop runs in its own WriteUnitOfWork; a
// WriteConflictException aborts only that collection's drop (logged, then
// the recovery unit is reset) rather than the whole sweep.
void Database::clearTmpCollections(OperationContext* txn) {
    // Caller must hold the database exclusively.
    invariant(txn->lockState()->isDbLockedForMode(name(), MODE_X));

    list<string> collections;
    _dbEntry->getCollectionNamespaces( &collections );

    for ( list<string>::iterator i = collections.begin(); i != collections.end(); ++i ) {
        string ns = *i;
        invariant( NamespaceString::normal( ns ) );

        CollectionCatalogEntry* coll = _dbEntry->getCollectionCatalogEntry( ns );

        CollectionOptions options = coll->getCollectionOptions( txn );
        // Only collections created with { temp: true } are swept.
        if ( !options.temp )
            continue;
        try {
            WriteUnitOfWork wunit(txn);
            Status status = dropCollection( txn, ns );
            if ( !status.isOK() ) {
                // Best effort: warn and continue with the next collection.
                warning() << "could not drop temp collection '" << ns << "': " << status;
                continue;
            }

            string cmdNs = _name + ".$cmd";
            // Replicate the drop so secondaries remove the temp collection too.
            repl::logOp( txn, "c", cmdNs.c_str(), BSON( "drop" << nsToCollectionSubstring( ns ) ) );
            wunit.commit();
        } catch (const WriteConflictException& exp) {
            warning() << "could not drop temp collection '" << ns << "' due to "
                         "WriteConflictException";
            // Abandon the failed unit of work and reset the recovery unit so
            // the loop can proceed with the remaining collections.
            txn->recoveryUnit()->commitAndRestart();
        }
    }
}
// Constructs a client-side cursor over "ns". Notes on the less obvious
// initializers:
//  - _isCommand: a full namespace whose collection part is "$cmd" marks
//    this cursor as a command cursor.
//  - haveLimit: a positive nToReturn acts as a hard limit except for
//    tailable cursors, where it is treated as a batch-size hint.
//  - batchSize: 1 is bumped to 2 — on the wire, nToReturn == 1 presumably
//    means "return one doc and close the cursor"; confirm against the
//    legacy query protocol.
DBClientCursor::DBClientCursor(DBClientBase* client,
                               const std::string& ns,
                               const BSONObj& query,
                               long long cursorId,
                               int nToReturn,
                               int nToSkip,
                               const BSONObj* fieldsToReturn,
                               int queryOptions,
                               int batchSize)
    : _client(client),
      _originalHost(_client->getServerAddress()),
      ns(ns),
      _isCommand(nsIsFull(ns) ? nsToCollectionSubstring(ns) == "$cmd" : false),
      query(query),
      nToReturn(nToReturn),
      haveLimit(nToReturn > 0 && !(queryOptions & QueryOption_CursorTailable)),
      nToSkip(nToSkip),
      fieldsToReturn(fieldsToReturn),
      opts(queryOptions),
      batchSize(batchSize == 1 ? 2 : batchSize),
      resultFlags(0),
      cursorId(cursorId),
      _ownCursor(true),
      wasError(false) {}
// Producer half of background sync: chooses a sync source, schedules a
// Fetcher to tail that source's oplog into the buffer, and reacts to fetch
// failures — entering maintenance mode when every source is too stale,
// blacklisting a source that returns invalid BSON, or performing rollback
// when the remote oplog no longer contains our last fetched op.
void BackgroundSync::_produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol();
    // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election
    // timeout. This enables the sync source to communicate liveness of the primary to secondaries.
    // Under protocol version 0, use a default timeout of 2 seconds for awaitData.
    const Milliseconds fetcherMaxTimeMS(
        isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2));

    Status fetcherReturnStatus = Status::OK();
    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      fetcherMaxTimeMS,
                                      &fetcherReturnStatus);

    // Build a tailable, awaitData "find" on the oplog starting at the last
    // fetched timestamp.
    BSONObjBuilder cmdBob;
    cmdBob.append("find", nsToCollectionSubstring(rsOplogName));
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(Minutes(1)));  // 1 min initial find.

    BSONObjBuilder metadataBob;
    if (isV1ElectionProtocol) {
        cmdBob.append("term", _replCoord->getTerm());
        metadataBob.append(rpc::kReplSetMetadataFieldName, 1);
    }

    auto dbName = nsToDatabase(rsOplogName);
    auto cmdObj = cmdBob.obj();
    auto metadataObj = metadataBob.obj();
    Fetcher fetcher(&_threadPoolTaskExecutor,
                    source,
                    dbName,
                    cmdObj,
                    fetcherCallback,
                    metadataObj,
                    _replCoord->getConfig().getElectionTimeoutPeriod());

    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << cmdObj["filter"];
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    // Block until the fetcher finishes; fetcherReturnStatus is set by the callback.
    fetcher.wait();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.
        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        // Lazily opens a connection to the sync source for rollback.
        // NOTE(review): oplogSocketTimeout is used but not captured/declared
        // here — presumably a file-scope constant; confirm.
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            };
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream()
                                      << "need to rollback, but in inconsistent state. "
                                      << "minvalid: " << boundaries.end.toString()
                                      << " > our last optime: " << lastApplied.toString()));
        }
        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
// Producer half of background sync (OplogReader variant): connects to a
// sync source, schedules a Fetcher to tail its oplog, and runs a
// synchronous rollback when the remote oplog no longer contains our last
// fetched op.
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);
        // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();
    // no more references to oplog reader from here on.

    // If this status is not OK after the fetcher returns from wait(),
    // proceed to execute rollback
    Status remoteOplogStartStatus = Status::OK();

    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      &remoteOplogStartStatus);

    // Tailable, awaitData "find" on the oplog starting at the last fetched
    // timestamp.
    // NOTE(review): fetcherMaxTimeMS is used but not declared in this
    // function — presumably a file-scope constant; confirm.
    auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName)
                              << "filter" << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))
                              << "tailable" << true
                              << "oplogReplay" << true
                              << "awaitData" << true
                              << "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    cmdObj,
                    fetcherCallback,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    // Block until the fetcher finishes; remoteOplogStartStatus is set by the callback.
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Execute rollback if necessary.
    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    if (!remoteOplogStartStatus.isOK()) {
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        // Lazily opens a connection to the sync source for rollback.
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            };
            return connection->get();
        };

        log() << "starting rollback: " << remoteOplogStartStatus;
        _rollback(txn, source, getConnection);
        stop();
    }
}
// static
// Applies a single oplog entry on a secondary, dispatching by op type.
// Commands are applied under the global write lock; no-ops and index builds
// (inserts into system.indexes) are applied under a database-exclusive lock.
// NOTE(review): this excerpt is truncated — handling for the remaining op
// types continues beyond this fragment.
//
// txn                      - operation context.
// op                       - the oplog entry to apply.
// convertUpdateToUpsert    - replay updates as upserts to tolerate re-application.
// applyOperationInLock     - callback that applies a CRUD op while locked.
// applyCommandInLock       - callback that applies a command op while locked.
// incrementOpsAppliedStats - callback bumping the applied-ops counter.
Status SyncTail::syncApply(OperationContext* txn,
                           const BSONObj& op,
                           bool convertUpdateToUpsert,
                           ApplyOperationInLockFn applyOperationInLock,
                           ApplyCommandInLockFn applyCommandInLock,
                           IncrementOpsAppliedStatsFn incrementOpsAppliedStats) {
    // During shutdown, pretend success so the applier can wind down quietly.
    if (inShutdown()) {
        return Status::OK();
    }

    // Count each log op application as a separate operation, for reporting purposes
    CurOp individualOp(txn);

    const char* ns = op.getStringField("ns");
    verify(ns);

    const char* opType = op["op"].valuestrsafe();

    bool isCommand(opType[0] == 'c');
    bool isNoOp(opType[0] == 'n');

    // Entries with an empty or malformed namespace are skipped; only warn if
    // the entry wasn't a no-op (no-ops legitimately carry no namespace).
    if ((*ns == '\0') || (*ns == '.')) {
        // this is ugly
        // this is often a no-op
        // but can't be 100% sure
        if (!isNoOp) {
            error() << "skipping bad op in oplog: " << op.toString();
        }
        return Status::OK();
    }

    if (isCommand) {
        // Retries the body if a WriteConflictException is thrown.
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            // a command may need a global write lock. so we will conservatively go
            // ahead and grab one here. suboptimal. :-(
            Lock::GlobalWrite globalWriteLock(txn->lockState());

            // special case apply for commands to avoid implicit database creation
            Status status = applyCommandInLock(txn, op);
            incrementOpsAppliedStats();
            return status;
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "syncApply_command", ns);
    }

    // Shared application body used by the lock-mode-specific branches below.
    auto applyOp = [&](Database* db) {
        // For non-initial-sync, we convert updates to upserts
        // to suppress errors when replaying oplog entries.
        txn->setReplicatedWrites(false);
        // Document validation is disabled while replaying oplog entries.
        DisableDocumentValidation validationDisabler(txn);

        Status status =
            applyOperationInLock(txn, db, op, convertUpdateToUpsert, incrementOpsAppliedStats);
        // Re-throw write conflicts so the surrounding retry loop can retry.
        if (!status.isOK() && status.code() == ErrorCodes::WriteConflict) {
            throw WriteConflictException();
        }
        return status;
    };

    // No-ops and index builds take a MODE_X database lock (index builds are
    // inserts into the special system.indexes collection).
    if (isNoOp || (opType[0] == 'i' && nsToCollectionSubstring(ns) == "system.indexes")) {
        auto opStr = isNoOp ? "syncApply_noop" : "syncApply_indexBuild";
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_X);
            OldClientContext ctx(txn, ns);
            return applyOp(ctx.db());
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, opStr, ns);
    }
void operator()( DBClientCursorBatchIterator &i ) { Lock::GlobalWrite lk; if ( context ) { context->relocked(); } while( i.moreInCurrentBatch() ) { if ( n % 128 == 127 /*yield some*/ ) { time_t now = time(0); if( now - lastLog >= 60 ) { // report progress if( lastLog ) log() << "clone " << to_collection << ' ' << n << endl; lastLog = now; } mayInterrupt( _mayBeInterrupted ); dbtempreleaseif t( _mayYield ); } BSONObj tmp = i.nextSafe(); /* assure object is valid. note this will slow us down a little. */ if ( !tmp.valid() ) { stringstream ss; ss << "Cloner: skipping corrupt object from " << from_collection; BSONElement e = tmp.firstElement(); try { e.validate(); ss << " firstElement: " << e; } catch( ... ) { ss << " firstElement corrupt"; } out() << ss.str() << endl; continue; } ++n; BSONObj js = tmp; if ( isindex ) { verify(nsToCollectionSubstring(from_collection) == "system.indexes"); js = fixindex(tmp); storedForLater->push_back( js.getOwned() ); continue; } try { DiskLoc loc = theDataFileMgr.insertWithObjMod(to_collection, js); loc.assertOk(); if ( logForRepl ) logOp("i", to_collection, js); getDur().commitIfNeeded(); } catch( UserException& e ) { error() << "error: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n'; throw; } RARELY if ( time( 0 ) - saveLast > 60 ) { log() << n << " objects cloned so far from collection " << from_collection << endl; saveLast = time( 0 ); } } }
void operator()( DBClientCursorBatchIterator &i ) { Lock::GlobalWrite lk; context.relocked(); bool createdCollection = false; Collection* collection = NULL; while( i.moreInCurrentBatch() ) { if ( numSeen % 128 == 127 /*yield some*/ ) { collection = NULL; time_t now = time(0); if( now - lastLog >= 60 ) { // report progress if( lastLog ) log() << "clone " << to_collection << ' ' << numSeen << endl; lastLog = now; } mayInterrupt( _mayBeInterrupted ); dbtempreleaseif t( _mayYield ); } if ( isindex == false && collection == NULL ) { collection = context.db()->getCollection( to_collection ); if ( !collection ) { massert( 17321, str::stream() << "collection dropped during clone [" << to_collection << "]", !createdCollection ); createdCollection = true; collection = context.db()->createCollection( txn, to_collection ); verify( collection ); } } BSONObj tmp = i.nextSafe(); /* assure object is valid. note this will slow us down a little. */ const Status status = validateBSON(tmp.objdata(), tmp.objsize()); if (!status.isOK()) { out() << "Cloner: skipping corrupt object from " << from_collection << ": " << status.reason(); continue; } ++numSeen; BSONObj js = tmp; if ( isindex ) { verify(nsToCollectionSubstring(from_collection) == "system.indexes"); js = fixindex(context.db()->name(), tmp); indexesToBuild->push_back( js.getOwned() ); continue; } verify(nsToCollectionSubstring(from_collection) != "system.indexes"); StatusWith<DiskLoc> loc = collection->insertDocument( txn, js, true ); if ( !loc.isOK() ) { error() << "error: exception cloning object in " << from_collection << ' ' << loc.toString() << " obj:" << js; } uassertStatusOK( loc.getStatus() ); if ( logForRepl ) logOp(txn, "i", to_collection, js); getDur().commitIfNeeded(); RARELY if ( time( 0 ) - saveLast > 60 ) { log() << numSeen << " objects cloned so far from collection " << from_collection; saveLast = time( 0 ); } } }
/** Applies a single oplog operation to the given database. The caller must
    already hold a write lock covering the namespace (asserted below).
    NOTE(review): this excerpt is truncated — only the insert ('i') branch is
    visible; handling for the other op types continues beyond this fragment,
    and failedUpdate is presumably used there.

    @param fromRepl false if from ApplyOpsCmd
    @return true if was and update should have happened and the document DNE.
    see replset initial sync code.
 */
bool applyOperation_inlock(OperationContext* txn,
                           Database* db,
                           const BSONObj& op,
                           bool fromRepl,
                           bool convertUpdateToUpsert) {
    LOG(3) << "applying op: " << op << endl;
    bool failedUpdate = false;

    // Attribute the op to repl counters when replaying, global ones otherwise.
    OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;

    // Pull the fields we care about out of the oplog entry in one pass.
    const char *names[] = { "o", "ns", "op", "b", "o2" };
    BSONElement fields[5];
    op.getFields(5, names, fields);
    BSONElement& fieldO = fields[0];   // the operation's document/spec
    BSONElement& fieldNs = fields[1];  // target namespace
    BSONElement& fieldOp = fields[2];  // op type character
    BSONElement& fieldB = fields[3];   // boolean flag (semantics op-dependent)
    BSONElement& fieldO2 = fields[4];  // secondary object (e.g. update query)

    BSONObj o;
    if( fieldO.isABSONObj() )
        o = fieldO.embeddedObject();

    const char *ns = fieldNs.valuestrsafe();

    BSONObj o2;
    if (fieldO2.isABSONObj())
        o2 = fieldO2.Obj();

    bool valueB = fieldB.booleanSafe();

    // Caller contract: the write lock for this namespace is already held.
    txn->lockState()->assertWriteLocked(ns);

    Collection* collection = db->getCollection( txn, ns );
    IndexCatalog* indexCatalog = collection == NULL ? NULL : collection->getIndexCatalog();

    // operation type -- see logOp() comments for types
    const char *opType = fieldOp.valuestrsafe();

    if ( *opType == 'i' ) {
        opCounters->gotInsert();

        // Inserts into <db>.system.indexes are index builds, not data inserts.
        const char *p = strchr(ns, '.');
        if ( p && nsToCollectionSubstring( p ) == "system.indexes" ) {
            if (o["background"].trueValue()) {
                IndexBuilder* builder = new IndexBuilder(o);
                // This spawns a new thread and returns immediately.
                builder->go();
            }
            else {
                IndexBuilder builder(o);
                Status status = builder.buildInForeground(txn, db);
                if ( status.isOK() ) {
                    // yay
                }
                else if ( status.code() == ErrorCodes::IndexOptionsConflict ||
                          status.code() == ErrorCodes::IndexKeySpecsConflict ) {
                    // SERVER-13206, SERVER-13496
                    // 2.4 (and earlier) will add an ensureIndex to an oplog if its ok or not
                    // so in 2.6+ where we do stricter validation, it will fail
                    // but we shouldn't care as the primary is responsible
                    warning() << "index creation attempted on secondary that conflicts, "
                              << "skipping: " << status;
                }
                else {
                    uassertStatusOK( status );
                }
            }
        }
        else {
            // do upserts for inserts as we might get replayed more than once
            OpDebug debug;
            BSONElement _id;
            if( !o.getObjectID(_id) ) {
                /* No _id.  This will be very slow. */
                // Upsert keyed on the whole document since there is no _id.
                Timer t;

                const NamespaceString requestNs(ns);
                UpdateRequest request(txn, requestNs);

                request.setQuery(o);
                request.setUpdates(o);
                request.setUpsert();
                request.setFromReplication();
                UpdateLifecycleImpl updateLifecycle(true, requestNs);
                request.setLifecycle(&updateLifecycle);

                update(db, request, &debug);

                if( t.millis() >= 2 ) {
                    RARELY OCCASIONALLY log()
                        << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                }
            }
            else {
                // probably don't need this since all replicated colls have _id indexes now
                // but keep it just in case
                RARELY if ( indexCatalog
                            && !collection->isCapped()
                            && !indexCatalog->haveIdIndex(txn) ) {
                    try {
                        Helpers::ensureIndex(txn, collection, BSON("_id" << 1), true, "_id_");
                    }
                    catch (const DBException& e) {
                        warning() << "Ignoring error building id index on " << collection->ns()
                                  << ": " << e.toString();
                    }
                }

                /* todo : it may be better to do an insert here, and then catch the dup key
                   exception and do update then.  very few upserts will not be inserts... */
                // Upsert keyed on _id so replays are idempotent.
                BSONObjBuilder b;
                b.append(_id);

                const NamespaceString requestNs(ns);
                UpdateRequest request(txn, requestNs);

                request.setQuery(b.done());
                request.setUpdates(o);
                request.setUpsert();
                request.setFromReplication();
                UpdateLifecycleImpl updateLifecycle(true, requestNs);
                request.setLifecycle(&updateLifecycle);

                update(db, request, &debug);
            }
        }
    }
// Copies ops out of the bgsync queue into the deque passed in as a parameter. // Returns true if the batch should be ended early. // Batch should end early if we encounter a command, or if // there are no further ops in the bgsync queue to read. // This function also blocks 1 second waiting for new ops to appear in the bgsync // queue. We can't block forever because there are maintenance things we need // to periodically check in the loop. bool SyncTail::tryPopAndWaitForMore(SyncTail::OpQueue* ops) { BSONObj op; // Check to see if there are ops waiting in the bgsync queue bool peek_success = peek(&op); if (!peek_success) { // if we don't have anything in the queue, wait a bit for something to appear if (ops->empty()) { // block up to 1 second _networkQueue->waitForMore(); return false; } // otherwise, apply what we have return true; } const char* ns = op["ns"].valuestrsafe(); // check for commands if ((op["op"].valuestrsafe()[0] == 'c') || // Index builds are acheived through the use of an insert op, not a command op. // The following line is the same as what the insert code uses to detect an index build. ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) { if (ops->empty()) { // apply commands one-at-a-time ops->push_back(op); _networkQueue->consume(); } // otherwise, apply what we have so far and come back for the command return true; } // check for oplog version change BSONElement elemVersion = op["v"]; int curVersion = 0; if (elemVersion.eoo()) // missing version means version 1 curVersion = 1; else curVersion = elemVersion.Int(); if (curVersion != oplogVersion) { // Version changes cause us to end a batch. // If we are starting a new batch, reset version number // and continue. if (ops->empty()) { oplogVersion = curVersion; } else { // End batch early return true; } } // Copy the op to the deque and remove it from the bgsync queue. ops->push_back(op); _networkQueue->consume(); // Go back for more ops return false; }
virtual bool run(OperationContext* txn, const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result) { const std::string ns = parseNs(db, cmdObj); if (nsToCollectionSubstring(ns).empty()) { errmsg = "missing collection name"; return false; } NamespaceString nss(ns); // Parse the options for this request. auto request = AggregationRequest::parseFromBSON(nss, cmdObj); if (!request.isOK()) { return appendCommandStatus(result, request.getStatus()); } // Set up the ExpressionContext. intrusive_ptr<ExpressionContext> expCtx = new ExpressionContext(txn, request.getValue()); expCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; // Parse the pipeline. auto statusWithPipeline = Pipeline::parse(request.getValue().getPipeline(), expCtx); if (!statusWithPipeline.isOK()) { return appendCommandStatus(result, statusWithPipeline.getStatus()); } auto pipeline = std::move(statusWithPipeline.getValue()); auto resolvedNamespaces = resolveInvolvedNamespaces(txn, pipeline, expCtx); if (!resolvedNamespaces.isOK()) { return appendCommandStatus(result, resolvedNamespaces.getStatus()); } expCtx->resolvedNamespaces = std::move(resolvedNamespaces.getValue()); unique_ptr<ClientCursorPin> pin; // either this OR the exec will be non-null unique_ptr<PlanExecutor> exec; auto curOp = CurOp::get(txn); { // This will throw if the sharding version for this connection is out of date. If the // namespace is a view, the lock will be released before re-running the aggregation. // Otherwise, the lock must be held continuously from now until we have we created both // the output ClientCursor and the input executor. This ensures that both are using the // same sharding version that we synchronize on here. This is also why we always need to // create a ClientCursor even when we aren't outputting to a cursor. See the comment on // ShardFilterStage for more details. 
AutoGetCollectionOrViewForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); // If running $collStats on a view, we do not resolve the view since we want stats // on this view namespace. auto startsWithCollStats = [&pipeline]() { const Pipeline::SourceContainer& sources = pipeline->getSources(); return !sources.empty() && dynamic_cast<DocumentSourceCollStats*>(sources.front().get()); }; // If this is a view, resolve it by finding the underlying collection and stitching view // pipelines and this request's pipeline together. We then release our locks before // recursively calling run, which will re-acquire locks on the underlying collection. // (The lock must be released because recursively acquiring locks on the database will // prohibit yielding.) auto view = ctx.getView(); if (view && !startsWithCollStats()) { auto viewDefinition = ViewShardingCheck::getResolvedViewIfSharded(txn, ctx.getDb(), view); if (!viewDefinition.isOK()) { return appendCommandStatus(result, viewDefinition.getStatus()); } if (!viewDefinition.getValue().isEmpty()) { ViewShardingCheck::appendShardedViewStatus(viewDefinition.getValue(), &result); return false; } auto resolvedView = ctx.getDb()->getViewCatalog()->resolveView(txn, nss); if (!resolvedView.isOK()) { return appendCommandStatus(result, resolvedView.getStatus()); } // With the view resolved, we can relinquish locks. ctx.releaseLocksForView(); // Parse the resolved view into a new aggregation request. auto viewCmd = resolvedView.getValue().asExpandedViewAggregation(request.getValue()); if (!viewCmd.isOK()) { return appendCommandStatus(result, viewCmd.getStatus()); } bool status = this->run(txn, db, viewCmd.getValue(), options, errmsg, result); { // Set the namespace of the curop back to the view namespace so ctx records // stats on this view namespace on destruction. 
stdx::lock_guard<Client>(*txn->getClient()); curOp->setNS_inlock(nss.ns()); } return status; } // If the pipeline does not have a user-specified collation, set it from the collection // default. if (request.getValue().getCollation().isEmpty() && collection && collection->getDefaultCollator()) { invariant(!expCtx->getCollator()); expCtx->setCollator(collection->getDefaultCollator()->clone()); } // Propagate the ExpressionContext throughout all of the pipeline's stages and // expressions. pipeline->injectExpressionContext(expCtx); // The pipeline must be optimized after the correct collator has been set on it (by // injecting the ExpressionContext containing the collator). This is necessary because // optimization may make string comparisons, e.g. optimizing {$eq: [<str1>, <str2>]} to // a constant. pipeline->optimizePipeline(); if (kDebugBuild && !expCtx->isExplain && !expCtx->inShard) { // Make sure all operations round-trip through Pipeline::serialize() correctly by // re-parsing every command in debug builds. This is important because sharded // aggregations rely on this ability. Skipping when inShard because this has // already been through the transformation (and this un-sets expCtx->inShard). pipeline = reparsePipeline(pipeline, request.getValue(), expCtx); } // This does mongod-specific stuff like creating the input PlanExecutor and adding // it to the front of the pipeline if needed. PipelineD::prepareCursorSource(collection, pipeline); // Create the PlanExecutor which returns results from the pipeline. The WorkingSet // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created // PlanExecutor. auto ws = make_unique<WorkingSet>(); auto proxy = make_unique<PipelineProxyStage>(txn, pipeline, ws.get()); auto statusWithPlanExecutor = (NULL == collection) ? 
PlanExecutor::make( txn, std::move(ws), std::move(proxy), nss.ns(), PlanExecutor::YIELD_MANUAL) : PlanExecutor::make( txn, std::move(ws), std::move(proxy), collection, PlanExecutor::YIELD_MANUAL); invariant(statusWithPlanExecutor.isOK()); exec = std::move(statusWithPlanExecutor.getValue()); { auto planSummary = Explain::getPlanSummary(exec.get()); stdx::lock_guard<Client>(*txn->getClient()); curOp->setPlanSummary_inlock(std::move(planSummary)); } if (collection) { PlanSummaryStats stats; Explain::getSummaryStats(*exec, &stats); collection->infoCache()->notifyOfQuery(txn, stats.indexesUsed); } if (collection) { const bool isAggCursor = true; // enable special locking behavior ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), 0, cmdObj.getOwned(), isAggCursor); pin.reset(new ClientCursorPin(collection->getCursorManager(), cursor->cursorid())); // Don't add any code between here and the start of the try block. } // At this point, it is safe to release the collection lock. // - In the case where we have a collection: we will need to reacquire the // collection lock later when cleaning up our ClientCursorPin. // - In the case where we don't have a collection: our PlanExecutor won't be // registered, so it will be safe to clean it up outside the lock. invariant(!exec || !collection); } try { // Unless set to true, the ClientCursor created above will be deleted on block exit. bool keepCursor = false; // Use of the aggregate command without specifying to use a cursor is deprecated. // Applications should migrate to using cursors. Cursors are strictly more useful than // outputting the results as a single document, since results that fit inside a single // BSONObj will also fit inside a single batch. // // We occasionally log a deprecation warning. 
if (!request.getValue().isCursorCommand()) { RARELY { warning() << "Use of the aggregate command without the 'cursor' " "option is deprecated. See " "http://dochub.mongodb.org/core/aggregate-without-cursor-deprecation."; } } // If both explain and cursor are specified, explain wins. if (expCtx->isExplain) { result << "stages" << Value(pipeline->writeExplainOps()); } else if (request.getValue().isCursorCommand()) { keepCursor = handleCursorCommand(txn, nss.ns(), pin.get(), pin ? pin->c()->getExecutor() : exec.get(), request.getValue(), result); } else { pipeline->run(result); } if (!expCtx->isExplain) { PlanSummaryStats stats; Explain::getSummaryStats(pin ? *pin->c()->getExecutor() : *exec.get(), &stats); curOp->debug().setPlanSummaryMetrics(stats); curOp->debug().nreturned = stats.nReturned; } // Clean up our ClientCursorPin, if needed. We must reacquire the collection lock // in order to do so. if (pin) { // We acquire locks here with DBLock and CollectionLock instead of using // AutoGetCollectionForRead. AutoGetCollectionForRead will throw if the // sharding version is out of date, and we don't care if the sharding version // has changed. Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS); if (keepCursor) { pin->release(); } else { pin->deleteUnderlying(); } } } catch (...) {