Example #1
        /**
         * Handles a newly accepted socket: wraps it in a MessagingPortWithHandler and
         * starts a thread that runs handleIncomingMsg on that port.
         *
         * psocket      - the accepted socket, handed to the MessagingPortWithHandler.
         * connectionId - identifier passed through to the MessagingPortWithHandler.
         *
         * NOTE(review): braces and preprocessor conditionals are unbalanced in this
         * excerpt, so statements such as early returns or the release of
         * portWithHandler may be missing from view -- confirm against the full source.
         */
        virtual void accepted(boost::shared_ptr<Socket> psocket, long long connectionId ) {
            // Guard built from sleepmillis(2); presumably fires on scope exit unless dismissed.
            ScopeGuard sleepAfterClosingPort = MakeGuard(sleepmillis, 2);
            std::auto_ptr<MessagingPortWithHandler> portWithHandler(
                new MessagingPortWithHandler(psocket, _handler, connectionId));

            // Log a refusal when no ticket can be acquired from the global ticket holder.
            if ( ! Listener::globalTicketHolder.tryAcquire() ) {
                log() << "connection refused because too many open connections: " << Listener::globalTicketHolder.used() << endl;

            try {
#ifndef __linux__  // TODO: consider making this ifdef _WIN32
                    // Non-Linux path: run the handler on a boost::thread.
                    boost::thread thr(stdx::bind(&handleIncomingMsg, portWithHandler.get()));
                // pthread path: create a detached thread with an explicit stack size.
                // NOTE(review): no #else and no pthread_attr_init(&attrs) are visible here;
                // attrs is used uninitialized as shown -- confirm against the full source.
                pthread_attr_t attrs;
                pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);

                static const size_t STACK_SIZE = 1024*1024; // if we change this we need to update the warning

                struct rlimit limits;
                verify(getrlimit(RLIMIT_STACK, &limits) == 0);
                // Only set a smaller stack when the current soft limit exceeds STACK_SIZE.
                if (limits.rlim_cur > STACK_SIZE) {
                    size_t stackSizeToSet = STACK_SIZE;
                    // NOTE(review): the #endif matching the #if below is not visible.
#if !__has_feature(address_sanitizer)
                    if (kDebugBuild)
                        stackSizeToSet /= 2;
                    pthread_attr_setstacksize(&attrs, stackSizeToSet);
                } else if (limits.rlim_cur < 1024*1024) {
                    // The soft limit is already below 1MB: warn rather than change it.
                    warning() << "Stack size set to " << (limits.rlim_cur/1024) << "KB. We suggest 1MB" << endl;

                pthread_t thread;
                int failed =
                    pthread_create(&thread, &attrs, &handleIncomingMsg, portWithHandler.get());


                // A nonzero result is treated as failure; it is logged and converted into the
                // same exception type the boost::thread path would raise.
                if (failed) {
                    log() << "pthread_create failed: " << errnoWithDescription(failed) << endl;
                    throw boost::thread_resource_error(); // for consistency with boost::thread
#endif  // __linux__

            // Thread creation failed: log it.
            catch ( boost::thread_resource_error& ) {
                log() << "can't create new thread, closing connection" << endl;
            // Any other exception: log it.
            catch ( ... ) {
                log() << "unknown error accepting new socket" << endl;
/**
 * Creates the completion event on _executor and schedules every request returned by
 * _algorithm->getRequests() as a remote command, using processResponseCB as the callback.
 *
 * Returns the event handle on success, the failed makeEvent() result if the event cannot
 * be created, or a ShutdownInProgress status if scheduling reports shutdown.
 *
 * NOTE(review): braces are unbalanced in this excerpt; the bookkeeping of _callbacks and
 * the dismissal of earlyReturnGuard are not visible -- confirm against the full source.
 */
StatusWith<EventHandle> ScatterGatherRunner::RunnerImpl::start(
    const RemoteCommandCallbackFn processResponseCB) {
    LockGuard lk(_mutex);

    _started = true;
    StatusWith<EventHandle> evh = _executor->makeEvent();
    if (!evh.isOK()) {
        return evh;
    _sufficientResponsesReceived = evh.getValue();
    // Guard bound to _signalSufficientResponsesReceived on this object; presumably it fires
    // on early return unless dismissed.
    ScopeGuard earlyReturnGuard = MakeGuard(&RunnerImpl::_signalSufficientResponsesReceived, this);

    std::vector<RemoteCommandRequest> requests = _algorithm->getRequests();
    for (size_t i = 0; i < requests.size(); ++i) {
        log() << "Scheduling remote command request for " << _logMessage << ": "
              << requests[i].toString();
        const StatusWith<CallbackHandle> cbh =
            _executor->scheduleRemoteCommand(requests[i], processResponseCB);
        // Shutdown is returned to the caller; any other status goes through fassert.
        if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
            return StatusWith<EventHandle>(cbh.getStatus());
        fassert(18743, cbh.getStatus());

    // NOTE(review): the body of this condition is not visible in this excerpt.
    if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) {

    return evh;
Example #3
    /**
     * Performs the server side of the SSL handshake on 'socket' and returns the newly
     * allocated SSLConnection.
     *
     * SSL_accept is retried until _doneWithSSLOp reports completion; a result other than 1
     * is passed to _handleSSLError.
     *
     * NOTE(review): neither guard is dismissed before 'return sslConn' in this excerpt. If
     * they fire on scope exit, the returned connection's ssl and networkBIO would already
     * be freed -- confirm against the full source.
     * NOTE(review): SSL_get_error is called with sslConn rather than sslConn->ssl; verify
     * that this is intended.
     */
    SSLConnection* SSLManager::accept(Socket* socket) {
        SSLConnection* sslConn = new SSLConnection(_serverContext, socket);
        // Guards that call SSL_free / BIO_free on the connection's handles.
        ScopeGuard sslGuard = MakeGuard(::SSL_free, sslConn->ssl);
        ScopeGuard bioGuard = MakeGuard(::BIO_free, sslConn->networkBIO);
        int ret;
        do {
            ret = ::SSL_accept(sslConn->ssl);
        } while(!_doneWithSSLOp(sslConn, ret));
        if (ret != 1)
            _handleSSLError(SSL_get_error(sslConn, ret));
        return sslConn;
Example #4
 /**
  * Obtains an SSL object for 'fd' via _secure() and runs SSL_accept on it. A result
  * other than 1 is passed to _handleSSLError. Returns the SSL object.
  *
  * NOTE(review): the guard is not dismissed before 'return ssl' in this excerpt. If it
  * fires on scope exit, the returned SSL object would already be freed -- confirm
  * against the full source.
  */
 SSL* SSLManager::accept(int fd) {
     SSL* ssl = _secure(fd);
     // Guard that calls SSL_free on the handle.
     ScopeGuard guard = MakeGuard(::SSL_free, ssl);
     int ret = SSL_accept(ssl);
     if (ret != 1)
         _handleSSLError(SSL_get_error(ssl, ret));
     return ssl;
Example #5
    /**
     * Selects and creates the storage engine registered under 'name'.
     *
     * Visible steps: look up the factory in _storageFactories, validate any existing
     * storage engine metadata in the dbpath against the factory's canonical name and the
     * startup options, open the data directory lock file, create the engine, and record
     * whether it supports document locking.
     *
     * NOTE(review): several statements are truncated in this excerpt (both uassert calls
     * lack their final argument, and braces are unbalanced) -- confirm against the full
     * source before relying on these notes.
     */
    void GlobalEnvironmentMongoD::setGlobalStorageEngine(const std::string& name) {
        // This should be set once.

        // NOTE(review): if _storageFactories is a std::map, operator[] inserts a null entry
        // for an unknown name -- confirm the container type.
        const StorageEngine::Factory* factory = _storageFactories[name];

        uassert(18656, str::stream()
            << "Cannot start server with an unknown storage engine: " << name,

        std::string canonicalName = factory->getCanonicalName().toString();

        // Do not proceed if data directory has been used by a different storage engine previously.
        std::auto_ptr<StorageEngineMetadata> metadata =
            StorageEngineMetadata::validate(storageGlobalParams.dbpath, canonicalName);

        // Validate options in metadata against current startup options.
        if (metadata.get()) {
            uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));

        // Construct the lock file object for the data directory; a std::exception is
        // reported through uassert 28596 with the exception's message.
        try {
            _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath));
        catch (const std::exception& ex) {
            uassert(28596, str::stream()
                << "Unable to determine status of lock file in the data directory "
                << storageGlobalParams.dbpath << ": " << ex.what(),
        if (_lockFile->createdByUncleanShutdown()) {
            warning() << "Detected unclean shutdown - "
                      << _lockFile->getFilespec() << " is not empty.";

        // Guard that calls close() on the lock file; its dismissal is not visible in this
        // excerpt.
        ScopeGuard guard = MakeGuard(&StorageEngineLockFile::close, _lockFile.get());
        _storageEngine = factory->create(storageGlobalParams, *_lockFile);

        // Write a new metadata file if it is not present.
        if (!metadata.get()) {
            metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));


        _supportsDocLocking = _storageEngine->supportsDocLocking();
Example #6
        /**
         * Starts a two-step insert of '_key' -> '_recordLoc' into the btree rooted at
         * _idx.head and returns the heap-allocated continuation object passed to
         * twoStepInsert.
         *
         * idxNo is not used by the visible statements.
         *
         * NOTE(review): the MakeGuard call is truncated in this excerpt and no dismissal
         * of allocGuard is visible before the return -- confirm against the full source.
         */
        IndexInsertionContinuation *beginInsertIntoIndex(
                int idxNo, IndexDetails &_idx,
                DiskLoc _recordLoc, const BSONObj &_key,
                const Ordering& _order, bool dupsAllowed) {

            IndexInsertionContinuationImpl<V> *continuation = new IndexInsertionContinuationImpl<V>(
                    _idx.head, _recordLoc, _key, _order, _idx);
            // Guard built on boost::checked_delete; presumably deletes the continuation if
            // twoStepInsert throws.
            ScopeGuard allocGuard = MakeGuard(boost::checked_delete<IndexInsertionContinuation>,
            _idx.head.btree<V>()->twoStepInsert(_idx.head, *continuation, dupsAllowed);
            return continuation;
// Runs 'task' under an AlternativeClientRegion for _client and returns its status.
// A guard bound to _releaseResources is armed before the task runs.
//
// NOTE(review): F is presumably a template parameter whose declaration is not visible.
// The lambda's closing lines, the body of the isOK() branch (presumably where the guard
// is dismissed) and the catch handler body are missing from this excerpt -- confirm
// against the full source.
Status CollectionBulkLoaderImpl::_runTaskReleaseResourcesOnFailure(F task) noexcept {

    AlternativeClientRegion acr(_client);
    ScopeGuard guard = MakeGuard(&CollectionBulkLoaderImpl::_releaseResources, this);
    try {
        // The task is invoked inside a noexcept lambda.
        const auto status = [&task]() noexcept {
            return task();
        if (status.isOK()) {
        return status;
    } catch (...) {
    /**
     * Creates the completion event on 'executor', stores 'onCompletion', and schedules
     * every request returned by _algorithm->getRequests() as a remote command.
     *
     * Returns the event handle on success, the failed makeEvent() result if the event
     * cannot be created, or a ShutdownInProgress status if scheduling reports shutdown.
     *
     * NOTE(review): the MakeGuard and stdx::bind expressions are truncated and braces are
     * unbalanced in this excerpt -- confirm against the full source.
     */
    StatusWith<ReplicationExecutor::EventHandle> ScatterGatherRunner::start(
            ReplicationExecutor* executor,
            const stdx::function<void ()>& onCompletion) {

        // Reset per-run state before scheduling anything.
        _started = true;
        _actualResponses = 0;
        _onCompletion = onCompletion;
        StatusWith<ReplicationExecutor::EventHandle> evh = executor->makeEvent();
        if (!evh.isOK()) {
            return evh;
        _sufficientResponsesReceived = evh.getValue();
        ScopeGuard earlyReturnGuard = MakeGuard(

        const ReplicationExecutor::RemoteCommandCallbackFn cb = stdx::bind(

        std::vector<RemoteCommandRequest> requests = _algorithm->getRequests();
        for (size_t i = 0; i < requests.size(); ++i) {
            const StatusWith<ReplicationExecutor::CallbackHandle> cbh =
                executor->scheduleRemoteCommand(requests[i], cb);
            // Shutdown is returned to the caller; any other status goes through fassert.
            if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
                return StatusWith<ReplicationExecutor::EventHandle>(cbh.getStatus());
            fassert(18743, cbh.getStatus());

        // NOTE(review): the body of this condition is not visible in this excerpt.
        if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) {

        return evh;
Example #9
/**
 * Performs one unit of work for the delete stage.
 *
 * Visible behaviour: returns IS_EOF when isEOF() is true; returns a previously saved
 * document when _idReturning is set; otherwise takes a working set member (either the
 * one saved in _idRetrying or a new one from the child stage) and, when the child
 * ADVANCED, deletes the referenced document unless this is an explain. Child states other
 * than ADVANCED are passed through, with '*out' set for FAILURE, DEAD and NEED_YIELD.
 *
 * out - receives a WorkingSetID on ADVANCED or failure, or WorkingSet::INVALID_ID when
 *       yielding after a WriteConflictException.
 *
 * NOTE(review): many statements and closing braces are missing from this excerpt
 * (several 'if' and 'try' bodies are empty as shown) -- confirm details against the
 * full source.
 */
PlanStage::StageState DeleteStage::work(WorkingSetID* out) {

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        // Hand the saved member back and clear the pending-return marker.
        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;

    if (PlanStage::ADVANCED == status) {
        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasLoc()) {
            // We expect to be here because of an invalidation causing a force-fetch, and
            // doc-locking storage engines do not issue invalidations.
            return PlanStage::NEED_TIME;
        RecordId rloc = member->loc;
        // Deletes can't have projections. This means that covering analysis will always add
        // a fetch. We should always get fetched data, and never just key data.

        try {
            // If the snapshot changed, then we have to make sure we have the latest copy of the
            // doc and that it still matches.
            std::unique_ptr<RecordCursor> cursor;
            if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
                cursor = _collection->getCursor(getOpCtx());
                if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                    // Doc is already deleted. Nothing more to do.
                    return PlanStage::NEED_TIME;

                // Make sure the re-fetched doc still matches the predicate.
                if (_params.canonicalQuery &&
                    !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                    // Doesn't match.
                    return PlanStage::NEED_TIME;

            // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
            // is allowed to free the memory.
            if (_params.returnDeleted) {

            // TODO: Do we want to buffer docs and delete them in a group rather than
            // saving/restoring state repeatedly?

            try {
                if (supportsDocLocking()) {
                    // Doc-locking engines require this before saveState() since they don't use
                    // invalidations.
            } catch (const WriteConflictException& wce) {

            if (_params.returnDeleted) {
                // Save a copy of the document that is about to get deleted.
                BSONObj deletedDoc = member->obj.value();
                member->loc = RecordId();

            // Do the write, unless this is an explain.
            if (!_params.isExplain) {
                WriteUnitOfWork wunit(getOpCtx());
                _collection->deleteDocument(getOpCtx(), rloc);

        } catch (const WriteConflictException& wce) {
            // Ensure that the BSONObj underlying the WorkingSetMember is owned because it may be
            // freed when we yield.
            _idRetrying = id;
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            *out = WorkingSet::INVALID_ID;
            return NEED_YIELD;

        //  As restoreState may restore (recreate) cursors, cursors are tied to the
        //  transaction in which they are created, and a WriteUnitOfWork is a
        //  transaction, make sure to restore the state outside of the WriteUnitOfWork.
        try {
        } catch (const WriteConflictException& wce) {
            // Note we don't need to retry anything in this case since the delete already
            // was committed. However, we still need to return the deleted document
            // (if it was requested).
            if (_params.returnDeleted) {
                // member->obj should refer to the deleted document.
                invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

                _idReturning = id;
                // Keep this member around so that we can return it on the next work() call.
            *out = WorkingSet::INVALID_ID;
            return NEED_YIELD;

        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            memberFreer.Dismiss();  // Keep this member around so we can return it.
            *out = id;
            return PlanStage::ADVANCED;

        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which case
        // 'id' is valid.  If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "delete stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
        return status;
    } else if (PlanStage::NEED_TIME == status) {
    } else if (PlanStage::NEED_YIELD == status) {
        // Pass the child's id through so the caller can see what it yielded on.
        *out = id;

    return status;
Example #10
/**
 * Renames collection 'source' to 'target' under the global write lock.
 *
 * Visible behaviour: fails with NotMaster, NamespaceNotFound, InvalidLength or
 * NamespaceExists on the corresponding checks. When source and target are in the same
 * database it delegates to Database::renameCollection. Otherwise it creates the target
 * collection, copies index descriptions and documents across, then drops the source.
 *
 * txn        - operation context used for locking and all storage operations.
 * source     - namespace of the collection to rename.
 * target     - namespace to rename it to.
 * dropTarget - when true, an existing target collection is dropped first.
 * stayTemp   - passed through to Database::renameCollection.
 *
 * Returns Status::OK() on success, otherwise the first failing status.
 *
 * NOTE(review): many statements are truncated and braces are unbalanced in this excerpt
 * (for example the NotMaster condition, the longestAllowed computation and the
 * createCollection arguments) -- confirm details against the full source.
 */
Status renameCollection(OperationContext* txn,
                        const NamespaceString& source,
                        const NamespaceString& target,
                        bool dropTarget,
                        bool stayTemp) {
    DisableDocumentValidation validationDisabler(txn);

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite globalWriteLock(txn->lockState());
    // We stay in source context the whole time. This is mostly to set the CurOp namespace.
    OldClientContext ctx(txn, source.ns());

    bool userInitiatedWritesAndNotPrimary = txn->writesAreReplicated() &&

    if (userInitiatedWritesAndNotPrimary) {
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while renaming collection " << source.ns()
                                    << " to "
                                    << target.ns());

    // The source database may not exist; in that case sourceColl is null as well.
    Database* const sourceDB = dbHolder().get(txn, source.db());
    Collection* const sourceColl = sourceDB ? sourceDB->getCollection(source.ns()) : nullptr;
    if (!sourceColl) {
        return Status(ErrorCodes::NamespaceNotFound, "source namespace does not exist");

        // Ensure that collection name does not exceed maximum length.
        // Ensure that index names do not push the length over the max.
        // Iterator includes unfinished indexes.
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator(txn, true);
        int longestIndexNameLength = 0;
        while (sourceIndIt.more()) {
            int thisLength = sourceIndIt.next()->indexName().length();
            if (thisLength > longestIndexNameLength)
                longestIndexNameLength = thisLength;

        unsigned int longestAllowed =
                     int(NamespaceString::MaxNsLen) - 2 /*strlen(".$")*/ - longestIndexNameLength);
        if (target.size() > longestAllowed) {
            StringBuilder sb;
            sb << "collection name length of " << target.size() << " exceeds maximum length of "
               << longestAllowed << ", allowing for index names";
            return Status(ErrorCodes::InvalidLength, sb.str());


    Database* const targetDB = dbHolder().openDb(txn, target.db());

        WriteUnitOfWork wunit(txn);

        // Check if the target namespace exists and if dropTarget is true.
        // If target exists and dropTarget is not true, return false.
        if (targetDB->getCollection(target)) {
            if (!dropTarget) {
                return Status(ErrorCodes::NamespaceExists, "target namespace exists");

            Status s = targetDB->dropCollection(txn, target.ns());
            if (!s.isOK()) {
                return s;

        // If we are renaming in the same database, just
        // rename the namespace and we're done.
        if (sourceDB == targetDB) {
            Status s = targetDB->renameCollection(txn, source.ns(), target.ns(), stayTemp);
            if (!s.isOK()) {
                return s;

                txn, NamespaceString(source), NamespaceString(target), dropTarget, stayTemp);

            return Status::OK();


    // If we get here, we are renaming across databases, so we must copy all the data and
    // indexes, then remove the source collection.

    // Create the target collection. It will be removed if we fail to copy the collection.
    // TODO use a temp collection and unset the temp flag on success.
    Collection* targetColl = nullptr;
        CollectionOptions options = sourceColl->getCatalogEntry()->getCollectionOptions(txn);

        WriteUnitOfWork wunit(txn);

        // No logOp necessary because the entire renameCollection command is one logOp.
        bool shouldReplicateWrites = txn->writesAreReplicated();
        targetColl = targetDB->createCollection(txn,
                                                false);  // _id index build with others later.
        if (!targetColl) {
            return Status(ErrorCodes::OutOfDiskSpace, "Failed to create target collection.");


    // Dismissed on success
    ScopeGuard targetCollectionDropper = MakeGuard(dropCollection, txn, targetDB, target.ns());

    MultiIndexBlock indexer(txn, targetColl);

    // Copy the index descriptions from the source collection, adjusting the ns field.
        std::vector<BSONObj> indexesToCopy;
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator(txn, true);
        while (sourceIndIt.more()) {
            const BSONObj currIndex = sourceIndIt.next()->infoObj();

            // Process the source index.
            BSONObjBuilder newIndex;
            newIndex.append("ns", target.ns());

        // Copy over all the data from source collection to target collection.
        auto cursor = sourceColl->getCursor(txn);
        while (auto record = cursor->next()) {

            const auto obj = record->data.releaseToBson();

            // Each document is inserted in its own WriteUnitOfWork.
            WriteUnitOfWork wunit(txn);
            // No logOp necessary because the entire renameCollection command is one logOp.
            bool shouldReplicateWrites = txn->writesAreReplicated();
            Status status = targetColl->insertDocument(txn, obj, &indexer, true);
            if (!status.isOK())
                return status;

    Status status = indexer.doneInserting();
    if (!status.isOK())
        return status;

        // Getting here means we successfully built the target copy. We now remove the
        // source collection and finalize the rename.
        WriteUnitOfWork wunit(txn);

        bool shouldReplicateWrites = txn->writesAreReplicated();
        Status status = sourceDB->dropCollection(txn, source.ns());
        if (!status.isOK())
            return status;


            txn, NamespaceString(source), NamespaceString(target), dropTarget, stayTemp);


    return Status::OK();
Example #11
/**
 * Writes cached per-URI size information (numRecords, dataSize) through _cursor inside
 * a WiredTiger transaction, then clears the dirty flag on every entry in _entries.
 *
 * syncToDisk - when true the transaction is begun with "sync=true", otherwise with an
 *              empty configuration string.
 *
 * Returns early without starting a transaction when nothing was collected to write.
 *
 * NOTE(review): braces are unbalanced in this excerpt. The statement under
 * 'if (!entry.dirty)', the cursor insert call and the dismissal of 'rollbacker' are not
 * visible -- confirm against the full source.
 */
void WiredTigerSizeStorer::syncCache(bool syncToDisk) {
    stdx::lock_guard<stdx::mutex> cursorLock(_cursorMutex);

    // Snapshot the entries to write while holding _entriesMutex.
    Map myMap;
        stdx::lock_guard<stdx::mutex> lk(_entriesMutex);
        for (Map::iterator it = _entries.begin(); it != _entries.end(); ++it) {
            std::string uriKey = it->first;
            Entry& entry = it->second;
            // Refresh cached values from the live record store and mark changes as dirty.
            if (entry.rs) {
                if (entry.dataSize != entry.rs->dataSize(NULL)) {
                    entry.dataSize = entry.rs->dataSize(NULL);
                    entry.dirty = true;
                if (entry.numRecords != entry.rs->numRecords(NULL)) {
                    entry.numRecords = entry.rs->numRecords(NULL);
                    entry.dirty = true;

            // NOTE(review): as shown, this 'if' governs the assignment below, which would
            // copy only non-dirty entries; a 'continue;' appears to be missing from this
            // excerpt -- confirm.
            if (!entry.dirty)
            myMap[uriKey] = entry;

    if (myMap.empty())
        return;  // Nothing to do.

    WT_SESSION* session = _session.getSession();
    invariantWTOK(session->begin_transaction(session, syncToDisk ? "sync=true" : ""));
    // Guard that calls rollback_transaction on the session.
    ScopeGuard rollbacker = MakeGuard(session->rollback_transaction, session, "");

    for (Map::iterator it = myMap.begin(); it != myMap.end(); ++it) {
        string uriKey = it->first;
        Entry& entry = it->second;

        // Encode the entry as a BSON document with "numRecords" and "dataSize" fields.
        BSONObj data;
            BSONObjBuilder b;
            b.append("numRecords", entry.numRecords);
            b.append("dataSize", entry.dataSize);
            data = b.obj();

        LOG(2) << "WiredTigerSizeStorer::storeInto " << uriKey << " -> " << redact(data);

        // Key is the URI string; value is the raw BSON bytes.
        WiredTigerItem key(uriKey.c_str(), uriKey.size());
        WiredTigerItem value(data.objdata(), data.objsize());
        _cursor->set_key(_cursor, key.Get());
        _cursor->set_value(_cursor, value.Get());


    invariantWTOK(session->commit_transaction(session, NULL));

        // After the commit, mark every cached entry clean.
        stdx::lock_guard<stdx::mutex> lk(_entriesMutex);
        for (Map::iterator it = _entries.begin(); it != _entries.end(); ++it) {
            it->second.dirty = false;
         * Generates the next batch of results for a ClientCursor.
         * TODO: Do we need to support some equivalent of OP_REPLY responseFlags?
         * TODO: Is it possible to support awaitData?
        // Executes the getMore command: parses the request from cmdObj, locates and pins
        // the cursor, pulls results from its PlanExecutor into a batch, and appends the
        // response to 'result'.
        //
        // Visible failure paths go through appendCommandStatus: direct-client use, parse
        // errors, a dropped collection, an unknown cursor id, a namespace mismatch, and
        // executor FAILURE. Returns true once the response object has been appended.
        //
        // 'options' and 'errmsg' are not used by the visible statements.
        //
        // NOTE(review): many statements are truncated and braces are unbalanced in this
        // excerpt (several 'if' bodies and Status constructions are incomplete) -- confirm
        // details against the full source.
        bool run(OperationContext* txn,
                 const std::string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 std::string& errmsg,
                 BSONObjBuilder& result) override {
            // Counted as a getMore, not as a command.

            if (txn->getClient()->isInDirectClient()) {
                return appendCommandStatus(result,
                                                  "Cannot run getMore command from eval()"));

            StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj);
            if (!parseStatus.isOK()) {
                return appendCommandStatus(result, parseStatus.getStatus());
            const GetMoreRequest& request = parseStatus.getValue();

            // Depending on the type of cursor being operated on, we hold locks for the whole
            // getMore, or none of the getMore, or part of the getMore.  The three cases in detail:
            // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
            // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors
            //    don't own any collection state.
            // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
            //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
            //    release), but the pin and unpin of the cursor must occur under the collection
            //    lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because
            //    AutoGetCollectionForRead checks the sharding version (and we want the relock for
            //    the unpin to succeed even if the sharding version has changed).
            // Note that we declare our locks before our ClientCursorPin, in order to ensure that
            // the pin's destructor is called before the lock destructors (so that the unpin occurs
            // under the lock).
            std::unique_ptr<AutoGetCollectionForRead> ctx;
            std::unique_ptr<Lock::DBLock> unpinDBLock;
            std::unique_ptr<Lock::CollectionLock> unpinCollLock;

            // Pick the cursor manager: the global one if it owns this cursor id, otherwise
            // the manager of the collection named in the request.
            CursorManager* cursorManager;
            CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
            if (globalCursorManager->ownsCursorId(request.cursorid)) {
                cursorManager = globalCursorManager;
            else {
                ctx.reset(new AutoGetCollectionForRead(txn, request.nss));
                Collection* collection = ctx->getCollection();
                if (!collection) {
                    return appendCommandStatus(result,
                                                      "collection dropped between getMore calls"));
                cursorManager = collection->getCursorManager();

            ClientCursorPin ccPin(cursorManager, request.cursorid);
            ClientCursor* cursor = ccPin.c();
            if (!cursor) {
                // We didn't find the cursor.
                return appendCommandStatus(result, Status(ErrorCodes::CursorNotFound, str::stream()
                    << "Cursor not found, cursor id: " << request.cursorid));

            // Reject a getMore whose namespace differs from the cursor's own namespace.
            if (request.nss.ns() != cursor->ns()) {
                return appendCommandStatus(result, Status(ErrorCodes::Unauthorized, str::stream()
                    << "Requested getMore on namespace '" << request.nss.ns()
                    << "', but cursor belongs to a different namespace"));

            // On early return, get rid of the cursor.
            ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin);

            if (!cursor->hasRecoveryUnit()) {
                // Start using a new RecoveryUnit.

            // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
            ScopedRecoveryUnitSwapper ruSwapper(cursor, txn);

            // Reset timeout timer on the cursor since the cursor is still in use.

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (cursor->isAggCursor()) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks

            PlanExecutor* exec = cursor->getExecutor();

            // TODO: Handle result sets larger than 16MB.
            BSONArrayBuilder nextBatch;
            BSONObj obj;
            PlanExecutor::ExecState state;
            int numResults = 0;
            // Pull documents until the executor stops advancing; the batch-size check
            // below presumably ends the loop early.
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.

                if (enoughForGetMore(request.batchSize, numResults, nextBatch.len())) {

            // If we are operating on an aggregation cursor, then we dropped our collection lock
            // earlier and need to reacquire it in order to clean up our ClientCursorPin.
            // TODO: We need to ensure that this relock happens if we release the pin above in
            // response to PlanExecutor::getNext() throwing an exception.
            if (cursor->isAggCursor()) {
                invariant(NULL == ctx.get());
                unpinDBLock.reset(new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS));
                    new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS));

            // Fail the command if the PlanExecutor reports execution failure.
            if (PlanExecutor::FAILURE == state) {
                const std::unique_ptr<PlanStageStats> stats(exec->getStats());
                error() << "GetMore executor error, stats: " << Explain::statsToBSON(*stats);
                return appendCommandStatus(result,
                                                  str::stream() << "GetMore executor error: "
                                                  << WorkingSetCommon::toStatusString(obj)));

            // A response id of 0 is used when the cursor is not kept; otherwise the
            // request's cursor id is echoed back.
            CursorId respondWithId = 0;
            if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) {
                respondWithId = request.cursorid;



                if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) {
                    // Rather than swapping their existing RU into the client cursor, tailable
                    // cursors should get a new recovery unit.
            else {
                txn->getCurOp()->debug().cursorExhausted = true;

            appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result);
            if (respondWithId) {
            return true;
Example #13
        virtual bool run(OperationContext* txn,
                         const string& dbname,
                         BSONObj& cmdObj,
                         string& errmsg,
                         BSONObjBuilder& result,
                         bool fromRepl) {
            Lock::GlobalWrite globalWriteLock(txn->lockState());
            string source = cmdObj.getStringField( name.c_str() );
            string target = cmdObj.getStringField( "to" );

            // We stay in source context the whole time. This is mostly to set the CurOp namespace.
            Client::Context ctx(txn, source);

            if ( !NamespaceString::validCollectionComponent(target.c_str()) ) {
                errmsg = "invalid collection name: " + target;
                return false;
            if ( source.empty() || target.empty() ) {
                errmsg = "invalid command syntax";
                return false;

            if (!fromRepl) { // If it got through on the master, need to allow it here too
                Status sourceStatus = userAllowedWriteNS(source);
                if (!sourceStatus.isOK()) {
                    errmsg = "error with source namespace: " + sourceStatus.reason();
                    return false;

                Status targetStatus = userAllowedWriteNS(target);
                if (!targetStatus.isOK()) {
                    errmsg = "error with target namespace: " + targetStatus.reason();
                    return false;

            if (NamespaceString(source).coll() == "system.indexes"
                || NamespaceString(target).coll() == "system.indexes") {
                errmsg = "renaming system.indexes is not allowed";
                return false;

            Database* const sourceDB = dbHolder().get(txn, nsToDatabase(source));
            Collection* const sourceColl = sourceDB ? sourceDB->getCollection(txn, source)
                                                    : NULL;
            if (!sourceColl) {
                errmsg = "source namespace does not exist";
                return false;

                // Ensure that collection name does not exceed maximum length.
                // Ensure that index names do not push the length over the max.
                // Iterator includes unfinished indexes.
                IndexCatalog::IndexIterator sourceIndIt =
                    sourceColl->getIndexCatalog()->getIndexIterator( txn, true );
                int longestIndexNameLength = 0;
                while ( sourceIndIt.more() ) {
                    int thisLength = sourceIndIt.next()->indexName().length();
                    if ( thisLength > longestIndexNameLength )
                        longestIndexNameLength = thisLength;

                unsigned int longestAllowed =
                        int(NamespaceString::MaxNsLen) - 2/*strlen(".$")*/ - longestIndexNameLength);
                if (target.size() > longestAllowed) {
                    StringBuilder sb;
                    sb << "collection name length of " << target.size()
                       << " exceeds maximum length of " << longestAllowed
                       << ", allowing for index names";
                    errmsg = sb.str();
                    return false;

            const std::vector<BSONObj> indexesInProg = stopIndexBuilds(txn, sourceDB, cmdObj);
            // Dismissed on success
            ScopeGuard indexBuildRestorer = MakeGuard(IndexBuilder::restoreIndexes, indexesInProg);

            Database* const targetDB = dbHolder().openDb(txn, nsToDatabase(target));

                WriteUnitOfWork wunit(txn);

                // Check if the target namespace exists and if dropTarget is true.
                // If target exists and dropTarget is not true, return false.
                if (targetDB->getCollection(txn, target)) {
                    if (!cmdObj["dropTarget"].trueValue()) {
                        errmsg = "target namespace exists";
                        return false;

                    Status s = targetDB->dropCollection(txn, target);
                    if ( !s.isOK() ) {
                        errmsg = s.toString();
                        return false;

                // If we are renaming in the same database, just
                // rename the namespace and we're done.
                if (sourceDB == targetDB) {
                    Status s = targetDB->renameCollection(txn,
                                                          cmdObj["stayTemp"].trueValue() );
                    if (!s.isOK()) {
                        return appendCommandStatus(result, s);

                    if (!fromRepl) {
                        repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);

                    return true;


            // If we get here, we are renaming across databases, so we must copy all the data and
            // indexes, then remove the source collection.

            // Create the target collection. It will be removed if we fail to copy the collection.
            // TODO use a temp collection and unset the temp flag on success.
            Collection* targetColl = NULL;
                CollectionOptions options;

                if (sourceColl->isCapped()) {
                    const CollectionOptions sourceOpts =

                    options.capped = true;
                    options.cappedSize = sourceOpts.cappedSize;
                    options.cappedMaxDocs = sourceOpts.cappedMaxDocs;

                WriteUnitOfWork wunit(txn);

                // No logOp necessary because the entire renameCollection command is one logOp.
                targetColl = targetDB->createCollection(txn, target, options);
                if (!targetColl) {
                    errmsg = "Failed to create target collection.";
                    return false;


            // Dismissed on success
            ScopeGuard targetCollectionDropper = MakeGuard(dropCollection, txn, targetDB, target);

            MultiIndexBlock indexer(txn, targetColl);

            // Copy the index descriptions from the source collection, adjusting the ns field.
                std::vector<BSONObj> indexesToCopy;
                IndexCatalog::IndexIterator sourceIndIt =
                    sourceColl->getIndexCatalog()->getIndexIterator( txn, true );
                while (sourceIndIt.more()) {
                    const BSONObj currIndex = sourceIndIt.next()->infoObj();

                    // Process the source index.
                    BSONObjBuilder newIndex;
                    newIndex.append("ns", target);

                // Copy over all the data from source collection to target collection.
                boost::scoped_ptr<RecordIterator> sourceIt(sourceColl->getIterator(txn));
                while (!sourceIt->isEOF()) {

                    const BSONObj obj = sourceColl->docFor(txn, sourceIt->getNext());

                    WriteUnitOfWork wunit(txn);
                    // No logOp necessary because the entire renameCollection command is one logOp.
                    Status status = targetColl->insertDocument(txn, obj, &indexer, true).getStatus();
                    if (!status.isOK())
                        return appendCommandStatus(result, status);

            Status status = indexer.doneInserting();
            if (!status.isOK())
                return appendCommandStatus(result, status);

                // Getting here means we successfully built the target copy. We now remove the
                // source collection and finalize the rename.
                WriteUnitOfWork wunit(txn);

                Status status = sourceDB->dropCollection(txn, source);
                if (!status.isOK())
                    return appendCommandStatus(result, status);


                if (!fromRepl) {
                    repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);


            return true;
Example #14
// Processes the response to a batch request that was sent to the remote at 'remoteIndex'.
//
// '_mutex' is held for the whole call. The visible steps are:
//   1. Clear the remote's callback handle to record that no response is outstanding.
//   2. If the merger is no longer alive, do shutdown bookkeeping instead of reading the batch.
//   3. Otherwise turn the callback data into a StatusWith<CursorResponse>; on error record the
//      status on the remote (or swallow it when partial results are allowed).
//   4. On success store the cursor id, buffer the batch and, for non-tailable cursors with
//      nothing buffered, ask for the next batch.
//
// NOTE(review): this excerpt appears to have statements and closing braces elided; the comments
// here describe only the statements that are visible.
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData,
    OperationContext* opCtx,
    size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {

            _lifecycleState = kKillComplete;


    // Early returns from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    // Holds either the parsed cursor response or, when the callback itself reported a failure,
    // that failure status.
    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.data, remote)
                               : cbData.response.status);

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            // No shard could be obtained for this remote: keep the original error code but
            // replace the message with one naming the target host.
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            // Report the failure for this host to the shard before recording it on the remote.
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());
            remote.status = cursorResponseStatus.getStatus();

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params->isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<ClusterQueryResult> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;


    // Response successfully received.

    auto cursorResponse = std::move(cursorResponseStatus.getValue());

    // Update the cursorId; it is sent as '0' when the cursor has been exhausted on the shard.
    remote.cursorId = cursorResponse.getCursorId();

    // Save the batch in the remote's buffer.
    if (!addBatchToBuffer(remoteIndex, cursorResponse.getBatch())) {

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    // (Note: tailable cursors are only valid on unsharded collections, so the end of the batch from
    // one shard means the end of the overall batch).
    if (_params->isTailable && !remote.hasNext()) {
        _eofNext = true;

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params->isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(opCtx, remoteIndex);
        if (!remote.status.isOK()) {

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
Example #15
    // Performs one unit of work for the delete stage.
    //
    // Picks the working set member to operate on (the one saved in '_idRetrying' if set, otherwise
    // whatever the child stage produces) and, when that member ADVANCED, deletes the record it
    // refers to unless '_params.isExplain' is set.
    //
    // 'out' is written on the WriteConflictException paths (set to INVALID_ID) and on the
    // FAILURE / NEED_YIELD paths (set to the member id or a newly allocated status member).
    //
    // Visible return values: IS_EOF when isEOF() is true, NEED_TIME after handling an advanced
    // member, NEED_YIELD after a WriteConflictException, otherwise the child's status.
    //
    // NOTE(review): this excerpt appears to have statements and closing braces elided; the
    // comments here describe only the statements that are visible.
    PlanStage::StageState DeleteStage::work(WorkingSetID* out) {

        // Adds the amount of time taken by work() to executionTimeMillis.
        ScopedTimer timer(&_commonStats.executionTimeMillis);

        if (isEOF()) { return PlanStage::IS_EOF; }
        invariant(_collection); // If isEOF() returns false, we must have a collection.

        // Either retry the last WSM we worked on or get a new one from our child.
        WorkingSetID id;
        StageState status;
        if (_idRetrying == WorkingSet::INVALID_ID) {
            status = _child->work(&id);
        else {
            // Resume the member saved by an earlier write conflict and clear the saved id so it
            // is only retried once per save.
            status = ADVANCED;
            id = _idRetrying;
            _idRetrying = WorkingSet::INVALID_ID;

        if (PlanStage::ADVANCED == status) {
            WorkingSetMember* member = _ws->get(id);

            // We want to free this member when we return, unless we need to retry it.
            ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

            if (!member->hasLoc()) {
                // We expect to be here because of an invalidation causing a force-fetch, and
                // doc-locking storage engines do not issue invalidations.
                return PlanStage::NEED_TIME;
            RecordId rloc = member->loc;

            try {
                // If the snapshot changed, then we have to make sure we have the latest copy of the
                // doc and that it still matches.
                if (_txn->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
                    if (!WorkingSetCommon::fetch(_txn, member, _collection)) {
                        // Doc is already deleted. Nothing more to do.
                        return PlanStage::NEED_TIME;

                    // Make sure the re-fetched doc still matches the predicate.
                    if (_params.canonicalQuery &&
                        !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                        // Doesn't match.
                        return PlanStage::NEED_TIME;

                // TODO: Do we want to buffer docs and delete them in a group rather than
                // saving/restoring state repeatedly?

                try {
                    if (supportsDocLocking()) {
                        // Doc-locking engines require this after saveState() since they don't use
                        // invalidations.
                catch ( const WriteConflictException& wce ) {

                // Do the write, unless this is an explain.
                if (!_params.isExplain) {
                    WriteUnitOfWork wunit(_txn);

                    const bool deleteCappedOK = false;
                    const bool deleteNoWarn = false;
                    BSONObj deletedDoc;

                    // The deleted document is only captured when it will be needed for logOp.
                    _collection->deleteDocument(_txn, rloc, deleteCappedOK, deleteNoWarn,
                                                _params.shouldCallLogOp ? &deletedDoc : NULL);

                    if (_params.shouldCallLogOp) {
                        if (deletedDoc.isEmpty()) {
                            log() << "Deleted object without id in collection " << _collection->ns()
                            << ", not logging.";
                        else {


            catch ( const WriteConflictException& wce ) {
                // Remember the member so the next call retries it instead of asking the child.
                _idRetrying = id;
                memberFreer.Dismiss(); // Keep this member around so we can retry deleting it.
                *out = WorkingSet::INVALID_ID;
                return NEED_YIELD;

            //  As restoreState may restore (recreate) cursors, cursors are tied to the
            //  transaction in which they are created, and a WriteUnitOfWork is a
            //  transaction, make sure to restore the state outside of the WriteUnitOfWork.
            try {
            catch ( const WriteConflictException& wce ) {
                // Note we don't need to retry anything in this case since the delete already
                // was committed.
                *out = WorkingSet::INVALID_ID;
                return NEED_YIELD;

            return PlanStage::NEED_TIME;
        else if (PlanStage::FAILURE == status) {
            *out = id;
            // If a stage fails, it may create a status WSM to indicate why it failed, in which case
            // 'id' is valid.  If ID is invalid, we create our own error message.
            if (WorkingSet::INVALID_ID == id) {
                const std::string errmsg = "delete stage failed to read in results from child";
                *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError,
                return PlanStage::FAILURE;
            return status;
        else if (PlanStage::NEED_TIME == status) {
        else if (PlanStage::NEED_YIELD == status) {
            *out = id;

        return status;
Example #16
// Processes the response to a batch request that was sent to the remote at 'remoteIndex'.
//
// '_mutex' is held for the whole call. The visible steps are:
//   1. Clear the remote's callback handle to record that no response is outstanding.
//   2. If the merger is no longer alive, make a best-effort attempt to learn the remote's cursor
//      id and do shutdown bookkeeping instead of reading the batch.
//   3. Otherwise turn the callback data into a StatusWith<CursorResponse>; on error either retry
//      initial cursor establishment, record the status on the remote, or swallow it when partial
//      results are allowed.
//   4. On success store the cursor id, clear 'initialCmdObj', check each document for a sort key
//      when a sort is in effect and, for non-tailable cursors with nothing buffered, ask for the
//      next batch.
//
// NOTE(review): this excerpt appears to have statements and closing braces elided; the comments
// here describe only the statements that are visible.
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.

        // Make a best effort to parse the response and retrieve the cursor id. We need the cursor
        // id in order to issue a killCursors command against it.
        if (cbData.response.isOK()) {
            auto cursorResponse = parseCursorResponse(cbData.response.getValue().data, remote);
            if (cursorResponse.isOK()) {
                remote.cursorId = cursorResponse.getValue().getCursorId();

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {

            _lifecycleState = kKillComplete;


    // Early returns from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    // Holds either the parsed cursor response or, when the callback itself reported a failure,
    // that failure status.
    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.getValue().data, remote)
                               : cbData.response.getStatus());

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            // No shard could be obtained for this remote: keep the original error code but
            // replace the message with one naming the shard id and target host.
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard " << *remote.shardId
                                                 << " containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            // Report the failure for this host to the shard before deciding whether to retry.
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());

            // Retry initial cursor establishment if possible.  Never retry getMores to avoid
            // accidentally skipping results.
            if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
                                        Shard::RetryPolicy::kIdempotent)) {
                LOG(1) << "Initial cursor establishment failed with retriable error and will be "
                       << causedBy(redact(cursorResponseStatus.getStatus()));


                // Since we potentially updated the targeter that the last host it chose might be
                // faulty, the call below may end up getting a different host.
                remote.status = askForNextBatch_inlock(remoteIndex);
                if (remote.status.isOK()) {

                // If we end up here, it means we failed to schedule the retry request, which is a
                // more severe error that should not be retried. Just pass through to the error
                // handling logic below.
            } else {
                remote.status = cursorResponseStatus.getStatus();

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params.isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<BSONObj> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;


    // Cursor id successfully established.
    auto cursorResponse = std::move(cursorResponseStatus.getValue());
    remote.cursorId = cursorResponse.getCursorId();
    remote.initialCmdObj = boost::none;

    for (const auto& obj : cursorResponse.getBatch()) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: "
                                                 << obj);


    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.getBatch().empty()) {

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(remoteIndex);
        if (!remote.status.isOK()) {

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
Example #17
// Runs the aggregation described by 'request' and appends its output to 'result'.
//
// Parameters:
//   opCtx   - operation context used for locking, parsing and cursor registration.
//   origNss - namespace the command was originally issued against; it is passed unchanged to the
//             recursive call made for views and to handleCursorCommand().
//   request - the parsed aggregation request.
//   cmdObj  - the command object (not referenced by any statement visible in this excerpt).
//   result  - builder that receives either explain output or the cursor response.
//
// Returns Status::OK() on the visible success path; returns a non-OK Status when a view's
// default collation is overridden, when view resolution fails, or when the pipeline's collation
// check fails. Several calls are wrapped in uassertStatusOK(), so failures there are presumably
// raised as exceptions rather than returned (confirm against uassertStatusOK's definition).
//
// NOTE(review): this excerpt appears to have statements and closing braces elided; the comments
// here describe only the statements that are visible.
Status runAggregate(OperationContext* opCtx,
                    const NamespaceString& origNss,
                    const AggregationRequest& request,
                    const BSONObj& cmdObj,
                    BSONObjBuilder& result) {
    // For operations on views, this will be the underlying namespace.
    NamespaceString nss = request.getNamespaceString();

    // The collation to use for this aggregation. boost::optional to distinguish between the case
    // where the collation has not yet been resolved, and where it has been resolved to nullptr.
    boost::optional<std::unique_ptr<CollatorInterface>> collatorToUse;

    unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec;
    boost::intrusive_ptr<ExpressionContext> expCtx;
    Pipeline* unownedPipeline;
    auto curOp = CurOp::get(opCtx);
        const LiteParsedPipeline liteParsedPipeline(request);

        // Check whether the parsed pipeline supports the given read concern.
        liteParsedPipeline.assertSupportsReadConcern(opCtx, request.getExplain());

        if (liteParsedPipeline.hasChangeStream()) {
            // Change stream pipelines are run against the oplog namespace instead of the
            // namespace named in the request.
            nss = NamespaceString::kRsOplogNamespace;

            // If the read concern is not specified, upgrade to 'majority' and wait to make sure we
            // have a snapshot available.
            if (!repl::ReadConcernArgs::get(opCtx).hasLevel()) {
                const repl::ReadConcernArgs readConcern(
                uassertStatusOK(waitForReadConcern(opCtx, readConcern, true));

            if (!origNss.isCollectionlessAggregateNS()) {
                // AutoGetCollectionForReadCommand will raise an error if 'origNss' is a view.
                AutoGetCollectionForReadCommand origNssCtx(opCtx, origNss);

                // Resolve the collator to either the user-specified collation or the default
                // collation of the collection on which $changeStream was invoked, so that we do not
                // end up resolving the collation on the oplog.
                Collection* origColl = origNssCtx.getCollection();
                collatorToUse.emplace(resolveCollator(opCtx, request, origColl));

        const auto& pipelineInvolvedNamespaces = liteParsedPipeline.getInvolvedNamespaces();

        // If emplaced, AutoGetCollectionForReadCommand will throw if the sharding version for this
        // connection is out of date. If the namespace is a view, the lock will be released before
        // re-running the expanded aggregation.
        boost::optional<AutoGetCollectionForReadCommand> ctx;

        // If this is a collectionless aggregation, we won't create 'ctx' but will still need an
        // AutoStatsTracker to record CurOp and Top entries.
        boost::optional<AutoStatsTracker> statsTracker;

        // If this is a collectionless aggregation with no foreign namespaces, we don't want to
        // acquire any locks. Otherwise, lock the collection or view.
        if (nss.isCollectionlessAggregateNS() && pipelineInvolvedNamespaces.empty()) {
            statsTracker.emplace(opCtx, nss, Top::LockType::NotLocked, 0);
        } else {
            ctx.emplace(opCtx, nss, AutoGetCollection::ViewMode::kViewsPermitted);

        // 'collection' is null whenever 'ctx' was not emplaced above.
        Collection* collection = ctx ? ctx->getCollection() : nullptr;

        // The collator may already have been set if this is a $changeStream pipeline. If not,
        // resolve the collator to either the user-specified collation or the collection default.
        if (!collatorToUse) {
            collatorToUse.emplace(resolveCollator(opCtx, request, collection));

        // If this is a view, resolve it by finding the underlying collection and stitching view
        // pipelines and this request's pipeline together. We then release our locks before
        // recursively calling runAggregate(), which will re-acquire locks on the underlying
        // collection.  (The lock must be released because recursively acquiring locks on the
        // database will prohibit yielding.)
        if (ctx && ctx->getView() && !liteParsedPipeline.startsWithCollStats()) {
            invariant(nss != NamespaceString::kRsOplogNamespace);
            // Check that the default collation of 'view' is compatible with the operation's
            // collation. The check is skipped if the request did not specify a collation.
            if (!request.getCollation().isEmpty()) {
                invariant(collatorToUse);  // Should already be resolved at this point.
                if (!CollatorInterface::collatorsMatch(ctx->getView()->defaultCollator(),
                                                       collatorToUse->get())) {
                    return {ErrorCodes::OptionNotSupportedOnView,
                            "Cannot override a view's default collation"};

            ViewShardingCheck::throwResolvedViewIfSharded(opCtx, ctx->getDb(), ctx->getView());

            auto resolvedView = ctx->getDb()->getViewCatalog()->resolveView(opCtx, nss);
            if (!resolvedView.isOK()) {
                return resolvedView.getStatus();

            // With the view & collation resolved, we can relinquish locks.

            // Parse the resolved view into a new aggregation request.
            auto newRequest = resolvedView.getValue().asExpandedViewAggregation(request);
            auto newCmd = newRequest.serializeToCommandObj().toBson();

            // Re-run against the expanded request; 'origNss' is deliberately passed through
            // unchanged.
            auto status = runAggregate(opCtx, origNss, newRequest, newCmd, result);
                // Set the namespace of the curop back to the view namespace so ctx records
                // stats on this view namespace on destruction.
                stdx::lock_guard<Client> lk(*opCtx->getClient());
            return status;

            new ExpressionContext(opCtx,
                                  uassertStatusOK(resolveInvolvedNamespaces(opCtx, request))));
        expCtx->tempDir = storageGlobalParams.dbpath + "/_tmp";
        auto session = OperationContextSession::get(opCtx);
        // False when there is no session attached to this operation.
        expCtx->inSnapshotReadOrMultiDocumentTransaction =
            session && session->inSnapshotReadOrMultiDocumentTransaction();

        auto pipeline = uassertStatusOK(Pipeline::parse(request.getPipeline(), expCtx));

        // Check that the view's collation matches the collation of any views involved in the
        // pipeline.
        if (!pipelineInvolvedNamespaces.empty()) {
            auto pipelineCollationStatus = collatorCompatibleWithPipeline(
                opCtx, ctx->getDb(), expCtx->getCollator(), pipeline.get());
            if (!pipelineCollationStatus.isOK()) {
                return pipelineCollationStatus;


        if (kDebugBuild && !expCtx->explain && !expCtx->fromMongos) {
            // Make sure all operations round-trip through Pipeline::serialize() correctly by
            // re-parsing every command in debug builds. This is important because sharded
            // aggregations rely on this ability.  Skipping when fromMongos because this has
            // already been through the transformation (and this un-sets expCtx->fromMongos).
            pipeline = reparsePipeline(pipeline.get(), request, expCtx);

        // Prepare a PlanExecutor to provide input into the pipeline, if needed.
        if (liteParsedPipeline.hasChangeStream()) {
            // If we are using a change stream, the cursor stage should have a simple collation,
            // regardless of what the user's collation was.
            std::unique_ptr<CollatorInterface> collatorForCursor = nullptr;
            auto collatorStash = expCtx->temporarilyChangeCollator(std::move(collatorForCursor));
            PipelineD::prepareCursorSource(collection, nss, &request, pipeline.get());
        } else {
            PipelineD::prepareCursorSource(collection, nss, &request, pipeline.get());
        // Optimize again, since there may be additional optimizations that can be done after adding
        // the initial cursor stage. Note this has to be done outside the above blocks to ensure
        // this process uses the correct collation if it does any string comparisons.

        // Transfer ownership of the Pipeline to the PipelineProxyStage.
        // The raw pointer is captured first because 'pipeline' is moved from just below.
        unownedPipeline = pipeline.get();
        auto ws = make_unique<WorkingSet>();
        auto proxy = make_unique<PipelineProxyStage>(opCtx, std::move(pipeline), ws.get());

        // This PlanExecutor will simply forward requests to the Pipeline, so does not need to
        // yield or to be registered with any collection's CursorManager to receive invalidations.
        // The Pipeline may contain PlanExecutors which *are* yielding PlanExecutors and which *are*
        // registered with their respective collection's CursorManager
        auto statusWithPlanExecutor =
            PlanExecutor::make(opCtx, std::move(ws), std::move(proxy), nss, PlanExecutor::NO_YIELD);
        exec = std::move(statusWithPlanExecutor.getValue());

            auto planSummary = Explain::getPlanSummary(exec.get());
            stdx::lock_guard<Client> lk(*opCtx->getClient());

    // Having released the collection lock, we can now create a cursor that returns results from the
    // pipeline. This cursor owns no collection state, and thus we register it with the global
    // cursor manager. The global cursor manager does not deliver invalidations or kill
    // notifications; the underlying PlanExecutor(s) used by the pipeline will be receiving
    // invalidations and kill notifications themselves, not the cursor we create here.
    ClientCursorParams cursorParams(
    if (expCtx->tailableMode == TailableModeEnum::kTailableAndAwaitData) {

    auto pin =
        CursorManager::getGlobalCursorManager()->registerCursor(opCtx, std::move(cursorParams));

    // Deletes the cursor underlying 'pin' on scope exit unless the guard is dismissed.
    ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, &pin);

    // If both explain and cursor are specified, explain wins.
    if (expCtx->explain) {
            pin.getCursor()->getExecutor(), *(expCtx->explain), &result);
    } else {
        // Cursor must be specified, if explain is not.
        const bool keepCursor =
            handleCursorCommand(opCtx, origNss, pin.getCursor(), request, result);
        if (keepCursor) {

    // For non-explain runs, copy the executor's summary statistics into the CurOp debug info.
    if (!expCtx->explain) {
        PlanSummaryStats stats;
        Explain::getSummaryStats(*(pin.getCursor()->getExecutor()), &stats);
        curOp->debug().nreturned = stats.nReturned;

    // Any code that needs the cursor pinned must be inside the try block, above.
    return Status::OK();
Example #18
// Performs one unit of work for the delete stage.
//
// Visible flow:
//   1. Return IS_EOF if the stage is exhausted.
//   2. If a previously deleted document is waiting in '_idReturning', hand it back as ADVANCED.
//   3. Pick the working set member to operate on: the one saved in '_idRetrying', or the next
//      one produced by the child stage (non-ADVANCED child states are returned to the caller).
//   4. Re-check that the document still exists and matches the query; if not, return NEED_TIME
//      or throw WriteConflictException when shouldRestartDeleteIfNoLongerMatches() says so.
//   5. Unless this is an explain, perform the delete inside a WriteUnitOfWork.
//   6. Return ADVANCED with the member in 'out' when '_params.returnDeleted' is set, otherwise
//      NEED_TIME.
//
// A WriteConflictException caught around the match check or the delete causes the member to be
// retried via prepareToRetryWSM(); one caught after the delete yields NEED_YIELD with 'out' set
// to INVALID_ID.
//
// NOTE(review): this excerpt appears to have statements and closing braces elided; the comments
// here describe only the statements that are visible.
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        // Hand the saved member to the caller and clear the saved id so it is returned once.
        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);

        switch (status) {
            case PlanStage::ADVANCED:

            case PlanStage::FAILURE:
            case PlanStage::DEAD:
                // The stage which produces a failure is responsible for allocating a working set
                // member with error details.
                invariant(WorkingSet::INVALID_ID != id);
                *out = id;
                return status;

            case PlanStage::NEED_TIME:
                return status;

            case PlanStage::NEED_YIELD:
                *out = id;
                return status;

            case PlanStage::IS_EOF:
                return status;


    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry deleting or returning it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    RecordId recordId = member->recordId;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.

    // Ensure the document still exists and matches the predicate.
    bool docStillMatches;
    try {
        docStillMatches = write_stage_common::ensureStillMatches(
            _collection, getOpCtx(), _ws, id, _params.canonicalQuery);
    } catch (const WriteConflictException&) {
        // There was a problem trying to detect if the document still exists, so retry.
        return prepareToRetryWSM(id, out);

    if (!docStillMatches) {
        // Either the document has already been deleted, or it has been updated such that it no
        // longer matches the predicate.
        if (shouldRestartDeleteIfNoLongerMatches(_params)) {
            throw WriteConflictException();
        return PlanStage::NEED_TIME;

    // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState() is
    // allowed to free the memory.
    if (_params.returnDeleted) {
        // Save a copy of the document that is about to get deleted, but keep it in the RID_AND_OBJ
        // state in case we need to retry deleting it.
        BSONObj deletedDoc = member->obj.value();

    // TODO: Do we want to buffer docs and delete them in a group rather than saving/restoring state
    // repeatedly?

    try {
    } catch (const WriteConflictException&) {

    // Do the write, unless this is an explain.
    if (!_params.isExplain) {
        try {
            WriteUnitOfWork wunit(getOpCtx());
                                        _params.returnDeleted ? Collection::StoreDeletedDoc::On
                                                              : Collection::StoreDeletedDoc::Off);
        } catch (const WriteConflictException&) {
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            return prepareToRetryWSM(id, out);

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'recordId' from the WorkingSetMember before returning it.
        member->recordId = RecordId();

    // As restoreState may restore (recreate) cursors, cursors are tied to the transaction in which
    // they are created, and a WriteUnitOfWork is a transaction, make sure to restore the state
    // outside of the WriteUnitOfWork.
    try {
    } catch (const WriteConflictException&) {
        // Note we don't need to retry anything in this case since the delete already was committed.
        // However, we still need to return the deleted document (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            // Saved so that the '_idReturning' check at the top returns it on a later call.
            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        return PlanStage::ADVANCED;

    return PlanStage::NEED_TIME;
Example #19
/**
 * Callback that processes the response ('cbData') to a batch request previously sent to the
 * remote stored at '_remotes[remoteIndex]'. The whole body runs while holding '_mutex'.
 *
 * The visible steps are: clear the remote's callback handle, handle the case where the merger is
 * no longer alive, record any failure in 'remote.status', and otherwise adopt the cursor id from
 * the parsed CursorResponse and decide whether another batch must be requested.
 *
 * NOTE(review): this excerpt has unbalanced braces and dangling expressions, so some of the
 * original lines appear to be missing; confirm against the full source before relying on it.
 */
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        // The only non-alive state expected at this point is kKillStarted.
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {

            _lifecycleState = kKillComplete;

    // Early returns from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    // A failed response is recorded on the remote so that it can be observed later.
    if (!cbData.response.isOK()) {
        remote.status = cbData.response.getStatus();

        // If we failed to retrieve the batch because we couldn't contact the remote, we notify
        // the targeter that the host is unreachable. The caller can then retry on a new host.
        if (remote.status == ErrorCodes::HostUnreachable && remote.shardId) {
            auto shard = _params.shardRegistry->getShard(_params.txn, *remote.shardId);
            if (!shard) {
                remote.status =
                           str::stream() << "Could not find shard " << *remote.shardId
                                         << " containing host " << remote.hostAndPort.toString());
            } else {


    // Parse the raw response payload into a CursorResponse; a parse failure is stored on the
    // remote in the same way as a transport failure.
    auto getMoreParseStatus = CursorResponse::parseFromBSON(cbData.response.getValue().data);
    if (!getMoreParseStatus.isOK()) {
        remote.status = getMoreParseStatus.getStatus();

    auto cursorResponse = getMoreParseStatus.getValue();

    // If we have a cursor established, and we get a non-zero cursorid that is not equal to the
    // established cursorid, we will fail the operation.
    if (remote.cursorId && cursorResponse.cursorId != 0 &&
        *remote.cursorId != cursorResponse.cursorId) {
        remote.status = Status(ErrorCodes::BadValue,
                               str::stream() << "Expected cursorid " << *remote.cursorId
                                             << " but received " << cursorResponse.cursorId);

    // Adopt the cursor id reported by the remote and drop the stored command object.
    remote.cursorId = cursorResponse.cursorId;
    remote.cmdObj = boost::none;

    for (const auto& obj : cursorResponse.batch) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: " << obj);


    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.batch.empty()) {

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        auto nextBatchStatus = askForNextBatch_inlock(remoteIndex);
        if (!nextBatchStatus.isOK()) {
            remote.status = nextBatchStatus;

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
Example #20
    /**
     * Resolves the addresses to listen on into '_mine' and, for each address, creates a stream
     * socket, applies address-family-specific setup (unlinking a stale UNIX socket file,
     * IPV6_V6ONLY, SO_REUSEADDR) and binds it. '_setupSocketsSuccessful' is set at the end of
     * the visible body.
     *
     * NOTE(review): this excerpt has unbalanced braces and unmatched '#if' directives, so some
     * of the original lines appear to be missing; confirm against the full source.
     */
    void Listener::setupSockets() {

#if !defined(_WIN32)
        _mine = ipToAddrs(_ip.c_str(), _port, (!serverGlobalParams.noUnixSocket &&
        _mine = ipToAddrs(_ip.c_str(), _port, false);

        for (std::vector<SockAddr>::const_iterator it=_mine.begin(), end=_mine.end();
             it != end;
             ++it) {

            const SockAddr& me = *it;

            if (!me.isValid()) {
                error() << "listen(): socket is invalid." << endl;

            SOCKET sock = ::socket(me.getType(), SOCK_STREAM, 0);
            // Guard bound to closesocket(sock); presumably it closes the socket on any exit
            // path where the guard is not dismissed -- confirm against the full source.
            ScopeGuard socketGuard = MakeGuard(&closesocket, sock);
            massert( 15863 , str::stream() << "listen(): invalid socket? " << errnoWithDescription() , sock >= 0 );

            if (me.getType() == AF_UNIX) {
#if !defined(_WIN32)
                // Remove any existing socket file at this path; a missing file (ENOENT) is
                // not treated as an error.
                if (unlink(me.getAddr().c_str()) == -1) {
                    if (errno != ENOENT) {
                        error() << "Failed to unlink socket file " << me << " "
                                << errnoWithDescription(errno);
            else if (me.getType() == AF_INET6) {
                // IPv6 can also accept IPv4 connections as mapped addresses
                // (::ffff:<IPv4 address>). That causes a conflict if we don't set the socket
                // to IPV6_V6ONLY.
                const int one = 1;
                setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*) &one, sizeof(one));

#if !defined(_WIN32)
                const int one = 1;
                // A failure to set SO_REUSEADDR is logged here.
                if ( setsockopt( sock , SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0 )
                    log() << "Failed to set socket opt, SO_REUSEADDR" << endl;

            if ( ::bind(sock, me.raw(), me.addressSize) != 0 ) {
                // Capture errno immediately so later calls cannot overwrite it.
                int x = errno;
                error() << "listen(): bind() failed " << errnoWithDescription(x) << " for socket: " << me.toString() << endl;
                if ( x == EADDRINUSE )
                    error() << "  addr already in use" << endl;

#if !defined(_WIN32)
            if (me.getType() == AF_UNIX) {
                // Apply the configured permissions to the socket file and register its path
                // with ListeningSockets.
                if (chmod(me.getAddr().c_str(), serverGlobalParams.unixSocketPermissions) == -1) {
                    error() << "Failed to chmod socket file " << me << " "
                            << errnoWithDescription(errno);
                ListeningSockets::get()->addPath( me.getAddr() );

        _setupSocketsSuccessful = true;
Example #21
        /**
         * Executes a getMore command: parses a GetMoreRequest from 'dbname' and 'cmdObj',
         * locates and pins the cursor identified by 'request.cursorid', generates the next
         * batch of results and appends the response to 'result'.
         *
         * Error paths return whatever appendCommandStatus() returns; the visible success path
         * returns true.
         *
         * NOTE(review): this excerpt has unbalanced braces and dangling expressions, so some
         * of the original lines appear to be missing; confirm against the full source.
         */
        bool run(OperationContext* txn,
                 const std::string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 std::string& errmsg,
                 BSONObjBuilder& result) override {
            // Counted as a getMore, not as a command.

            if (txn->getClient()->isInDirectClient()) {
                return appendCommandStatus(result,
                                                  "Cannot run getMore command from eval()"));

            StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj);
            if (!parseStatus.isOK()) {
                return appendCommandStatus(result, parseStatus.getStatus());
            const GetMoreRequest& request = parseStatus.getValue();

            // Depending on the type of cursor being operated on, we hold locks for the whole
            // getMore, or none of the getMore, or part of the getMore.  The three cases in detail:
            // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
            // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors
            //    don't own any collection state.
            // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
            //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
            //    release), but the pin and unpin of the cursor must occur under the collection
            //    lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because
            //    AutoGetCollectionForRead checks the sharding version (and we want the relock for
            //    the unpin to succeed even if the sharding version has changed).
            // Note that we declare our locks before our ClientCursorPin, in order to ensure that
            // the pin's destructor is called before the lock destructors (so that the unpin occurs
            // under the lock).
            std::unique_ptr<AutoGetCollectionForRead> ctx;
            std::unique_ptr<Lock::DBLock> unpinDBLock;
            std::unique_ptr<Lock::CollectionLock> unpinCollLock;

            // Choose the cursor manager: the global one if it owns this cursor id, otherwise
            // the one belonging to the collection named in the request.
            CursorManager* cursorManager;
            CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
            if (globalCursorManager->ownsCursorId(request.cursorid)) {
                cursorManager = globalCursorManager;
            else {
                ctx.reset(new AutoGetCollectionForRead(txn, request.nss));
                Collection* collection = ctx->getCollection();
                if (!collection) {
                    return appendCommandStatus(result,
                                                      "collection dropped between getMore calls"));
                cursorManager = collection->getCursorManager();

            ClientCursorPin ccPin(cursorManager, request.cursorid);
            ClientCursor* cursor = ccPin.c();
            if (!cursor) {
                // We didn't find the cursor.
                return appendCommandStatus(result, Status(ErrorCodes::CursorNotFound, str::stream()
                    << "Cursor not found, cursor id: " << request.cursorid));

            // The namespace in the request must match the namespace the cursor was opened on.
            if (request.nss.ns() != cursor->ns()) {
                return appendCommandStatus(result, Status(ErrorCodes::Unauthorized, str::stream()
                    << "Requested getMore on namespace '" << request.nss.ns()
                    << "', but cursor belongs to a different namespace"));

            const bool hasOwnMaxTime = CurOp::get(txn)->isMaxTimeSet();

            // Validation related to awaitData.
            if (isCursorAwaitData(cursor)) {

                if (!hasOwnMaxTime) {
                    Status status(ErrorCodes::BadValue,
                                  str::stream() << "Must set maxTimeMS on a getMore if the initial "
                                                << "query had 'awaitData' set: " << cmdObj);
                    return appendCommandStatus(result, status);

                if (cursor->isAggCursor()) {
                    Status status(ErrorCodes::BadValue,
                                  "awaitData cannot be set on an aggregation cursor");
                    return appendCommandStatus(result, status);

            // On early return, get rid of the cursor.
            ScopeGuard cursorFreer = MakeGuard(&GetMoreCmd::cleanupCursor, txn, &ccPin, request);

            if (!cursor->hasRecoveryUnit()) {
                // Start using a new RecoveryUnit.

            // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
            ScopedRecoveryUnitSwapper ruSwapper(cursor, txn);

            // Reset timeout timer on the cursor since the cursor is still in use.

            // If there is no time limit set directly on this getMore command, but the operation
            // that spawned this cursor had a time limit set, then we have to apply any leftover
            // time to this getMore.
            if (!hasOwnMaxTime) {
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (cursor->isAggCursor()) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks

            PlanExecutor* exec = cursor->getExecutor();

            // If we're tailing a capped collection, retrieve a monotonically increasing insert
            // counter.
            uint64_t lastInsertCount = 0;
            if (isCursorAwaitData(cursor)) {
                lastInsertCount = ctx->getCollection()->getCappedInsertNotifier()->getCount();

            // 'respondWithId' stays 0 unless the cursor is kept open for a later getMore.
            CursorId respondWithId = 0;
            BSONArrayBuilder nextBatch;
            BSONObj obj;
            PlanExecutor::ExecState state;
            int numResults = 0;
            Status batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults);
            if (!batchStatus.isOK()) {
                return appendCommandStatus(result, batchStatus);

            // If this is an await data cursor, and we hit EOF without generating any results, then
            // we block waiting for new oplog data to arrive.
            if (isCursorAwaitData(cursor) && state == PlanExecutor::IS_EOF && numResults == 0) {
                // Retrieve the notifier which we will wait on until new data arrives. We make sure
                // to do this in the lock because once we drop the lock it is possible for the
                // collection to become invalid. The notifier itself will outlive the collection if
                // the collection is dropped, as we keep a shared_ptr to it.
                auto notifier = ctx->getCollection()->getCappedInsertNotifier();

                // Save the PlanExecutor and drop our locks.

                // Block waiting for data.
                Microseconds timeout(CurOp::get(txn)->getRemainingMaxTimeMicros());
                notifier->waitForInsert(lastInsertCount, timeout);

                // Reacquire the collection for reading before generating more results.
                ctx.reset(new AutoGetCollectionForRead(txn, request.nss));

                // We woke up because either the timed_wait expired, or there was more data. Either
                // way, attempt to generate another batch of results.
                batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults);
                if (!batchStatus.isOK()) {
                    return appendCommandStatus(result, batchStatus);

            if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) {
                respondWithId = request.cursorid;


                // If maxTimeMS was set directly on the getMore rather than being rolled over
                // from a previous find, then don't roll remaining micros over to the next
                // getMore.
                if (!hasOwnMaxTime) {


                if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) {
                    // Rather than swapping their existing RU into the client cursor, tailable
                    // cursors should get a new recovery unit.
            else {
                CurOp::get(txn)->debug().cursorExhausted = true;

            appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result);

            if (respondWithId) {

                // If we are operating on an aggregation cursor, then we dropped our collection lock
                // earlier and need to reacquire it in order to clean up our ClientCursorPin.
                if (cursor->isAggCursor()) {
                    invariant(NULL == ctx.get());
                        new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS));
                        new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS));

            return true;
Example #22
/**
 * Selects and creates the global storage engine.
 *
 * The engine name in 'storageGlobalParams.engine' is replaced by the engine recorded for the
 * dbpath when data files exist and the user did not choose an engine. The matching factory is
 * looked up in '_storageFactories', any existing metadata is validated against the startup
 * options, '_storageEngine' is created from the factory, and '_supportsDocLocking' is cached
 * from the new engine.
 *
 * NOTE(review): this excerpt has unbalanced braces and several assertion calls reduced to
 * their message arguments, so some of the original lines appear to be missing; confirm
 * against the full source.
 */
void ServiceContextMongoD::initializeGlobalStorageEngine() {
    // This should be set once.

    // We should have a _lockFile or be in read-only mode. Confusingly, we can still have a lockFile
    // if we are in read-only mode. This can happen if the server is started in read-only mode on a
    // writable dbpath.
    invariant(_lockFile || storageGlobalParams.readOnly);

    const std::string dbpath = storageGlobalParams.dbpath;
    if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) {
        if (storageGlobalParams.engineSetByUser) {
            // Verify that the name of the user-supplied storage engine matches the contents of
            // the metadata file.
            const StorageEngine::Factory* factory =
                                   static_cast<const StorageEngine::Factory*>(nullptr));

            if (factory) {
                            << "Cannot start server. Detected data files in " << dbpath
                            << " created by"
                            << " the '" << *existingStorageEngine << "' storage engine, but the"
                            << " specified storage engine was '" << factory->getCanonicalName()
                            << "'.",
                        factory->getCanonicalName() == *existingStorageEngine);
        } else {
            // Otherwise set the active storage engine as the contents of the metadata file.
            log() << "Detected data files in " << dbpath << " created by the '"
                  << *existingStorageEngine << "' storage engine, so setting the active"
                  << " storage engine to '" << *existingStorageEngine << "'.";
            storageGlobalParams.engine = *existingStorageEngine;
    } else if (!storageGlobalParams.engineSetByUser) {
        // Ensure the default storage engine is available with this build of mongod.
                    << "Cannot start server. The default storage engine '"
                    << storageGlobalParams.engine
                    << "' is not available with this build of mongod. Please specify a different"
                    << " storage engine explicitly, e.g. --storageEngine=mmapv1.",

    // Look up the factory registered under the (possibly updated) engine name.
    const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine];

            str::stream() << "Cannot start server with an unknown storage engine: "
                          << storageGlobalParams.engine,

    if (storageGlobalParams.readOnly) {
                    << "Server was started in read-only mode, but the configured storage engine, "
                    << storageGlobalParams.engine << ", does not support read-only operation",

    std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath);

    if (storageGlobalParams.readOnly) {
                "Server was started in read-only mode, but the storage metadata file was not"
                " found.",

    // Validate options in metadata against current startup options.
    if (metadata.get()) {
        uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));

    ScopeGuard guard = MakeGuard([&] {
        if (_lockFile) {

    // '_lockFile' may be null here (see the invariant above), so a raw and possibly null
    // pointer is handed to the factory.
    _storageEngine = factory->create(storageGlobalParams, _lockFile.get());

    if (_lockFile) {

    // Write a new metadata file if it is not present.
    if (!metadata.get()) {
        metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));


    _supportsDocLocking = _storageEngine->supportsDocLocking();
Example #23
/**
 * Drains the sorted keys held by 'bulk' into a bulk builder obtained from '_newInterface'.
 *
 * Each key from the sorter is added with builder->addKey(). In the visible code a non-OK
 * status from addKey() is returned to the caller, subject to the KeyTooLong and DuplicateKey
 * checks whose bodies are not fully shown here; otherwise Status::OK() is returned at the end.
 *
 * The roles of 'mayInterrupt', 'dupsToDrop' and 'assignTimestamp' are only partly visible:
 * each guards a branch whose body is missing from this excerpt.
 *
 * NOTE(review): this excerpt has unbalanced braces and truncated calls, so some of the
 * original lines appear to be missing; confirm against the full source.
 */
Status IndexAccessMethod::commitBulk(OperationContext* opCtx,
                                     std::unique_ptr<BulkBuilder> bulk,
                                     bool mayInterrupt,
                                     bool dupsAllowed,
                                     set<RecordId>* dupsToDrop,
                                     bool assignTimestamp) {
    // Do not track multikey path info for index builds.
    // The guard below calls startTrackingMultikeyPathInfo() when it fires.
    ScopeGuard restartTracker =
        MakeGuard([opCtx] { MultikeyPathTracker::get(opCtx).startTrackingMultikeyPathInfo(); });
    if (!MultikeyPathTracker::get(opCtx).isTrackingMultikeyPathInfo()) {
    Timer timer;

    // Finish the sorter and obtain an iterator over its output.
    std::unique_ptr<BulkBuilder::Sorter::Iterator> i(bulk->_sorter->done());

    stdx::unique_lock<Client> lk(*opCtx->getClient());
    ProgressMeterHolder pm(
        CurOp::get(opCtx)->setMessage_inlock("Index Bulk Build: (2/3) btree bottom up",
                                             "Index: (2/3) BTree Bottom Up Progress",

    std::unique_ptr<SortedDataBuilderInterface> builder;

    writeConflictRetry(opCtx, "setting index multikey flag", "", [&] {
        WriteUnitOfWork wunit(opCtx);

        // Mark the index multikey if any document generated multiple keys or the recorded
        // multikey paths say so.
        if (bulk->_everGeneratedMultipleKeys || isMultikeyFromPaths(bulk->_indexMultikeyPaths)) {
            _btreeState->setMultikey(opCtx, bulk->_indexMultikeyPaths);

        builder.reset(_newInterface->getBulkBuilder(opCtx, dupsAllowed));
        if (assignTimestamp) {

    while (i->more()) {
        if (mayInterrupt) {

        WriteUnitOfWork wunit(opCtx);
        // Improve performance in the btree-building phase by disabling rollback tracking.
        // This avoids copying all the written bytes to a buffer that is only used to roll back.
        // Note that this is safe to do, as this entire index-build-in-progress will be cleaned
        // up by the index system.

        // Get the next datum and add it to the builder.
        BulkBuilder::Sorter::Data d = i->next();
        Status status = builder->addKey(d.first, d.second);

        if (!status.isOK()) {
            // Overlong key that's OK to skip?
            if (status.code() == ErrorCodes::KeyTooLong && ignoreKeyTooLong(opCtx)) {

            // Check if this is a duplicate that's OK to skip
            if (status.code() == ErrorCodes::DuplicateKey) {
                invariant(!dupsAllowed);  // shouldn't be getting DupKey errors if dupsAllowed.

                if (dupsToDrop) {

            return status;

        // If we're here either it's a dup and we're cool with it or the addKey went just
        // fine.
        if (assignTimestamp) {


        stdx::lock_guard<Client> lk(*opCtx->getClient());
        CurOp::get(opCtx)->setMessage_inlock("Index Bulk Build: (3/3) btree-middle",
                                             "Index: (3/3) BTree Middle Progress");

    // Log at level 0 when the bottom layer took more than 10 seconds, otherwise at level 1.
    LOG(timer.seconds() > 10 ? 0 : 1) << "\t done building bottom layer, going to commit";

    std::unique_ptr<TimestampBlock> tsBlock;
    if (assignTimestamp) {
        tsBlock = stdx::make_unique<TimestampBlock>(
            opCtx, LogicalClock::get(opCtx)->getClusterTime().asTimestamp());
    return Status::OK();
Example #24
/**
 * Selects and creates the global storage engine, creating the dbpath lock file on the way.
 *
 * The engine name in 'storageGlobalParams.engine' is replaced by the engine recorded for the
 * dbpath when data files exist and the user did not choose an engine. The matching factory is
 * looked up in '_storageFactories', any existing metadata is validated against the startup
 * options, a StorageEngineLockFile for the dbpath is stored in '_lockFile', '_storageEngine'
 * is created from the factory, and '_supportsDocLocking' is cached from the new engine.
 *
 * NOTE(review): this excerpt has unbalanced braces and several assertion calls reduced to
 * their message arguments, so some of the original lines appear to be missing; confirm
 * against the full source.
 */
void ServiceContextMongoD::initializeGlobalStorageEngine() {
    // This should be set once.

    const std::string dbpath = storageGlobalParams.dbpath;
    if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) {
        if (storageGlobalParams.engineSetByUser) {
            // Verify that the name of the user-supplied storage engine matches the contents of
            // the metadata file.
            const StorageEngine::Factory* factory =
                                   static_cast<const StorageEngine::Factory*>(nullptr));

            if (factory) {
                            << "Cannot start server. Detected data files in " << dbpath
                            << " created by"
                            << " the '" << *existingStorageEngine << "' storage engine, but the"
                            << " specified storage engine was '" << factory->getCanonicalName()
                            << "'.",
                        factory->getCanonicalName() == *existingStorageEngine);
        } else {
            // Otherwise set the active storage engine as the contents of the metadata file.
            log() << "Detected data files in " << dbpath << " created by the '"
                  << *existingStorageEngine << "' storage engine, so setting the active"
                  << " storage engine to '" << *existingStorageEngine << "'.";
            storageGlobalParams.engine = *existingStorageEngine;
    } else if (!storageGlobalParams.engineSetByUser) {
        // Ensure the default storage engine is available with this build of mongod.
                    << "Cannot start server. The default storage engine '"
                    << storageGlobalParams.engine
                    << "' is not available with this build of mongod. Please specify a different"
                    << " storage engine explicitly, e.g. --storageEngine=mmapv1.",

    // Look up the factory registered under the (possibly updated) engine name.
    const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine];

            str::stream() << "Cannot start server with an unknown storage engine: "
                          << storageGlobalParams.engine,

    std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath);

    // Validate options in metadata against current startup options.
    if (metadata.get()) {
        uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));

    // Create the lock file object for the dbpath; a std::exception thrown while doing so is
    // caught below and reported with the dbpath and the exception text.
    try {
        _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath));
    } catch (const std::exception& ex) {
                str::stream() << "Unable to determine status of lock file in the data directory "
                              << storageGlobalParams.dbpath << ": " << ex.what(),
    if (_lockFile->createdByUncleanShutdown()) {
        warning() << "Detected unclean shutdown - " << _lockFile->getFilespec() << " is not empty.";

    // Guard bound to StorageEngineLockFile::close() on the lock file created above.
    ScopeGuard guard = MakeGuard(&StorageEngineLockFile::close, _lockFile.get());
    _storageEngine = factory->create(storageGlobalParams, *_lockFile);

    // Write a new metadata file if it is not present.
    if (!metadata.get()) {
        metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));


    _supportsDocLocking = _storageEngine->supportsDocLocking();
Example #25
// Receives a single message from 'psock' into 'm'.
//
// Returns true once a complete message has been read and handed to 'm'. Returns false when the
// peer sent an HTTP GET, when the advertised message length is out of range, or when a
// SocketException is caught.
//
// NOTE(review): this excerpt appears to have lines elided (closing braces, some statements,
// preprocessor directives, and the 'again' label targeted by the goto below). The comments here
// describe only what is visible.
bool MessagingPort::recv(Message& m) {
    try {
        // mmm( log() << "*  recv() sock:" << this->sock << endl; )
        MSGHEADER::Value header;
        int headerLen = sizeof(MSGHEADER::Value);
        // Read the fixed-size header first; the total message length is taken from it.
        psock->recv((char*)&header, headerLen);
        int len = header.constView().getMessageLength();

        // 542393671 is 0x20544547, i.e. the ASCII bytes "GET " interpreted as a little-endian
        // 32-bit integer.
        if (len == 542393671) {
            // an http GET
            string msg =
                "It looks like you are trying to access MongoDB over HTTP on the native driver "
            LOG(psock->getLogLevel()) << msg;
            // Answer with a minimal plain-text HTTP response carrying the same text.
            std::stringstream ss;
            ss << "HTTP/1.0 200 OK\r\nConnection: close\r\nContent-Type: "
                  "text/plain\r\nContent-Length: " << msg.size() << "\r\n\r\n" << msg;
            string s = ss.str();
            send(s.c_str(), s.size(), "http");
            return false;
        // If responseTo is not 0 or -1 for first packet assume SSL
        else if (psock->isAwaitingHandshake()) {
            // NOTE(review): the responseTo check appears twice below; presumably these are the
            // non-SSL and SSL build variants with the #ifdef lines elided - confirm.
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                          "SSL handshake requested, SSL feature not available in this build");
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                        "SSL handshake received but server is started without SSL support",
                        sslGlobalParams.sslMode.load() != SSLParams::SSLMode_disabled);
                    psock->doSSLHandshake(reinterpret_cast<const char*>(&header), sizeof(header)));
                goto again;
                    "The server is configured to only allow SSL connections",
                    sslGlobalParams.sslMode.load() != SSLParams::SSLMode_requireSSL);
        // Reject lengths smaller than a header or larger than MaxMessageSizeBytes.
        if (static_cast<size_t>(len) < sizeof(MSGHEADER::Value) ||
            static_cast<size_t>(len) > MaxMessageSizeBytes) {
            LOG(0) << "recv(): message len " << len << " is invalid. "
                   << "Min " << sizeof(MSGHEADER::Value) << " Max: " << MaxMessageSizeBytes;
            return false;

        // Round the allocation size up to the next multiple of 1024 bytes.
        int z = (len + 1023) & 0xfffffc00;
        // Sanity check that the rounding did not yield a size smaller than the message.
        verify(z >= len);
        MsgData::View md = reinterpret_cast<char*>(mongolMalloc(z));
        // Frees the buffer when this scope is left, unless the guard is dismissed.
        // NOTE(review): no guard.Dismiss() is visible before the successful return below -
        // presumably elided; confirm.
        ScopeGuard guard = MakeGuard(free, md.view2ptr());

        // Place the header that was already read at the front of the buffer.
        memcpy(md.view2ptr(), &header, headerLen);
        int left = len - headerLen;

        // Read the remaining 'left' bytes of the message.
        psock->recv(md.data(), left);

        m.setData(md.view2ptr(), true);
        return true;

    } catch (const SocketException& e) {
        // Log at the socket's level, or one step less severe when the exception says it
        // should not be printed.
        logger::LogSeverity severity = psock->getLogLevel();
        if (!e.shouldPrint())
            severity = severity.lessSevere();
        LOG(severity) << "SocketException: remote: " << remote() << " error: " << e;
        return false;
Example #26
// Performs one unit of work for the update stage and returns the resulting stage state.
//
// 'out' receives a WorkingSetID when a document is returned (ADVANCED), when the child's id is
// forwarded (FAILURE / NEED_YIELD), or WorkingSet::INVALID_ID when this stage itself asks for a
// yield after a WriteConflictException during state restoration.
//
// Two members carry state between calls: '_idRetrying' (a member whose update must be retried)
// and '_idReturning' (a member already updated but not yet returned).
//
// NOTE(review): this excerpt appears to have lines elided (closing braces and several
// statements, including the bodies of two try blocks). The comments describe only what is
// visible.
PlanStage::StageState UpdateStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;

    if (doneUpdating()) {
        // Even if we're done updating, we may have some inserting left to do.
        if (needInsert()) {
            // TODO we may want to handle WriteConflictException here. Currently we bounce it
            // out to a higher level since if this WCEs it is likely that we raced with another
            // upsert that may have matched our query, and therefore this may need to perform an
            // update rather than an insert. Bouncing to the higher level allows restarting the
            // query in this case.

            if (_params.request->shouldReturnNewDocs()) {
                // Want to return the document we just inserted, create it as a WorkingSetMember
                // so that we can return it.
                BSONObj newObj = _specificStats.objInserted;
                *out = _ws->allocate();
                WorkingSetMember* member = _ws->get(*out);
                member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(),
                return PlanStage::ADVANCED;

        // At this point either we're done updating and there was no insert to do,
        // or we're done updating and we're done inserting. Either way, we're EOF.
        return PlanStage::IS_EOF;

    // If we're here, then we still have to ask for results from the child and apply
    // updates to them. We should only get here if the collection exists.

    // It is possible that after an update was applied, a WriteConflictException
    // occurred and prevented us from returning ADVANCED with the requested version
    // of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        // Hand the pending member to the caller and clear the pending marker.
        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        // Treat the member being retried as if the child had just advanced with it.
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;

    if (PlanStage::ADVANCED == status) {
        // Need to get these things from the result returned by the child.
        RecordId recordId;

        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry updating or returning
        // it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasRecordId()) {
            // We expect to be here because of an invalidation causing a force-fetch.
            return PlanStage::NEED_TIME;
        recordId = member->recordId;

        // Updates can't have projections. This means that covering analysis will always add
        // a fetch. We should always get fetched data, and never just key data.

        // We fill this with the new RecordIds of moved doc so we don't double-update.
        if (_updatedRecordIds && _updatedRecordIds->count(recordId) > 0) {
            // Found a RecordId that refers to a document we had already updated. Note that
            // we can never remove from _updatedRecordIds because updates by other clients
            // could cause us to encounter a document again later.
            return PlanStage::NEED_TIME;

        bool docStillMatches;
        try {
            docStillMatches = write_stage_common::ensureStillMatches(
                _collection, getOpCtx(), _ws, id, _params.canonicalQuery);
        } catch (const WriteConflictException&) {
            // There was a problem trying to detect if the document still exists, so retry.
            return prepareToRetryWSM(id, out);

        if (!docStillMatches) {
            // Either the document has been deleted, or it has been updated such that it no longer
            // matches the predicate.
            if (shouldRestartUpdateIfNoLongerMatches(_params)) {
                throw WriteConflictException();
            return PlanStage::NEED_TIME;

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.

        // Save state before making changes
        // NOTE(review): the body of this try block and of its handler are not visible here.
        try {
        } catch (const WriteConflictException&) {

        // If we care about the pre-updated version of the doc, save it out here.
        BSONObj oldObj;
        if (_params.request->shouldReturnOldDocs()) {
            oldObj = member->obj.value().getOwned();

        BSONObj newObj;
        try {
            // Do the update, get us the new version of the doc.
            newObj = transformAndUpdate(member->obj, recordId);
        } catch (const WriteConflictException&) {
            memberFreer.Dismiss();  // Keep this member around so we can retry updating it.
            return prepareToRetryWSM(id, out);

        // Set member's obj to be the doc we want to return.
        if (_params.request->shouldReturnAnyDocs()) {
            if (_params.request->shouldReturnNewDocs()) {
                member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(),
            } else {
            // Clear the member's RecordId by replacing it with a default-constructed one.
            member->recordId = RecordId();

        // This should be after transformAndUpdate to make sure we actually updated this doc.

        // Restore state after modification

        // As restoreState may restore (recreate) cursors, make sure to restore the
        // state outside of the WriteUnitOfWork.
        try {
        } catch (const WriteConflictException&) {
            // Note we don't need to retry updating anything in this case since the update
            // already was committed. However, we still need to return the updated document
            // (if it was requested).
            if (_params.request->shouldReturnAnyDocs()) {
                // member->obj should refer to the document we want to return.
                invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

                _idReturning = id;
                // Keep this member around so that we can return it on the next work() call.
            *out = WorkingSet::INVALID_ID;
            return NEED_YIELD;

        if (_params.request->shouldReturnAnyDocs()) {
            // member->obj should refer to the document we want to return.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            memberFreer.Dismiss();  // Keep this member around so we can return it.
            *out = id;
            return PlanStage::ADVANCED;

        // Nothing to return to the caller for this member.
        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == status) {
        // The child is out of results, but we might not be done yet because we still might
        // have to do an insert.
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which case
        // 'id' is valid.  If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "update stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
            return PlanStage::FAILURE;
        return status;
    } else if (PlanStage::NEED_YIELD == status) {
        // Forward the child's id along with its NEED_YIELD status.
        *out = id;

    return status;
Example #27
    // Command entry point that builds the indexes listed in cmdObj["indexes"] on the namespace
    // parsed from 'dbname' and 'cmdObj'.
    //
    // Visible behaviour: validates that "indexes" is an array of objects and that each spec's
    // "ns" (filled in when absent) is a non-empty string equal to the target namespace; takes an
    // exclusive database lock; creates the collection when it does not exist; and appends
    // "createdCollectionAutomatically", "numIndexesBefore" and "numIndexesAfter" to 'result'.
    //
    // Returns false with 'errmsg' set on validation failure, the value of appendCommandStatus()
    // on status-based failures, and true on success.
    //
    // NOTE(review): this excerpt appears to have lines elided (closing braces, the openings of
    // the write-conflict retry loops, the index build calls, and parts of several statements).
    // The comments describe only what is visible.
    virtual bool run(OperationContext* txn,
                     const string& dbname,
                     BSONObj& cmdObj,
                     int options,
                     string& errmsg,
                     BSONObjBuilder& result) {
        const NamespaceString ns(parseNs(dbname, cmdObj));

        Status status = userAllowedWriteNS(ns);
        if (!status.isOK())
            return appendCommandStatus(result, status);

        if (cmdObj["indexes"].type() != Array) {
            errmsg = "indexes has to be an array";
            result.append("cmdObj", cmdObj);
            return false;

        // Walk the "indexes" array; every element must be an Object.
        std::vector<BSONObj> specs;
            BSONObjIterator i(cmdObj["indexes"].Obj());
            while (i.more()) {
                BSONElement e = i.next();
                if (e.type() != Object) {
                    errmsg = "everything in indexes has to be an Object";
                    result.append("cmdObj", cmdObj);
                    return false;

        if (specs.size() == 0) {
            errmsg = "no indexes to add";
            return false;

        // check specs
        for (size_t i = 0; i < specs.size(); i++) {
            BSONObj spec = specs[i];
            // A spec without "ns" gets the target namespace added and is written back.
            if (spec["ns"].eoo()) {
                spec = _addNsToSpec(ns, spec);
                specs[i] = spec;

            if (spec["ns"].type() != String) {
                errmsg = "ns field must be a string";
                result.append("spec", spec);
                return false;

            std::string nsFromUser = spec["ns"].String();
            if (nsFromUser.empty()) {
                errmsg = "ns field cannot be an empty string";
                result.append("spec", spec);
                return false;

            if (ns != nsFromUser) {
                errmsg = str::stream() << "value of ns field '" << nsFromUser
                                       << "' doesn't match namespace " << ns.ns();
                result.append("spec", spec);
                return false;

        // now we know we have to create index(es)
        // Note: createIndexes command does not currently respect shard versioning.
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock dbLock(txn->lockState(), ns.db(), MODE_X);
        if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
            return appendCommandStatus(
                       str::stream() << "Not primary while creating indexes in " << ns.ns()));

        // Open the database if the holder does not already have it.
        Database* db = dbHolder().get(txn, ns.db());
        if (!db) {
            db = dbHolder().openDb(txn, ns.db());

        Collection* collection = db->getCollection(ns.ns());
        if (collection) {
            result.appendBool("createdCollectionAutomatically", false);
        } else {
                WriteUnitOfWork wunit(txn);
                collection = db->createCollection(txn, ns.ns(), CollectionOptions());
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());
            result.appendBool("createdCollectionAutomatically", true);

        const int numIndexesBefore = collection->getIndexCatalog()->numIndexesTotal(txn);
        result.append("numIndexesBefore", numIndexesBefore);

        auto client = txn->getClient();
        ScopeGuard lastOpSetterGuard =

        MultiIndexBlock indexer(txn, collection);

        // Remember the spec count so a later change in specs.size() can be detected.
        // NOTE(review): the statement that modifies 'specs' in between is not visible here.
        const size_t origSpecsSize = specs.size();

        if (specs.size() == 0) {
            result.append("numIndexesAfter", numIndexesBefore);
            result.append("note", "all indexes already exist");
            return true;

        if (specs.size() != origSpecsSize) {
            result.append("note", "index already exists");

        for (size_t i = 0; i < specs.size(); i++) {
            const BSONObj& spec = specs[i];
            if (spec["unique"].trueValue()) {
                status = checkUniqueIndexConstraints(txn, ns.ns(), spec["key"].Obj());

                if (!status.isOK()) {
                    return appendCommandStatus(result, status);
            // A numeric "v" equal to 0 is rejected.
            if (spec["v"].isNumber() && spec["v"].numberInt() == 0) {
                return appendCommandStatus(
                           str::stream() << "illegal index specification: " << spec << ". "
                                         << "The option v:0 cannot be passed explicitly"));

        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());

        // If we're a background index, replace exclusive db lock with an intent lock, so that
        // other readers and writers can proceed during this phase.
        if (indexer.getBuildInBackground()) {
            if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
                return appendCommandStatus(
                           str::stream() << "Not primary while creating background indexes in "
                                         << ns.ns()));

        try {
            Lock::CollectionLock colLock(txn->lockState(), ns.ns(), MODE_IX);
        } catch (const DBException& e) {
            invariant(e.getCode() != ErrorCodes::WriteConflict);
            // Must have exclusive DB lock before we clean up the index build via the
            // destructor of 'indexer'.
            if (indexer.getBuildInBackground()) {
                try {
                    // This function cannot throw today, but we will preemptively prepare for
                    // that day, to avoid data corruption due to lack of index cleanup.
                    if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
                        return appendCommandStatus(
                                       << "Not primary while creating background indexes in "
                                       << ns.ns() << ": cleaning up index build failure due to "
                                       << e.toString()));
                } catch (...) {
        // Need to return db lock back to exclusive, to complete the index build.
        if (indexer.getBuildInBackground()) {
                    str::stream() << "Not primary while completing index build in " << dbname,

            // Re-check that the database and collection still exist.
            Database* db = dbHolder().get(txn, ns.db());
            uassert(28551, "database dropped during index build", db);
            uassert(28552, "collection dropped during index build", db->getCollection(ns.ns()));

            WriteUnitOfWork wunit(txn);


            for (size_t i = 0; i < specs.size(); i++) {
                std::string systemIndexes = ns.getSystemIndexesCollection();
                    txn, systemIndexes, specs[i]);

        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());

        result.append("numIndexesAfter", collection->getIndexCatalog()->numIndexesTotal(txn));


        return true;
// Callback run when a batch response arrives for the remote at 'remoteIndex'.
//
// 'cbData' carries the remote command response; 'remoteIndex' indexes into '_remotes'. The whole
// function runs with '_mutex' held. A failed response, a response that fails to parse, or a
// cursor id that differs from the established one is recorded in the remote's 'status'.
//
// NOTE(review): this excerpt appears to have lines elided (closing braces, early returns, and
// the bodies of several branches and of the batch loop). The comments describe only what is
// visible.
void AsyncClusterClientCursor::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {

            _lifecycleState = kKillComplete;

    // Early returns from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncClusterClientCursor::signalCurrentEvent_inlock, this);

    // Record a failed response on the remote.
    if (!cbData.response.isOK()) {
        _remotes[remoteIndex].status = cbData.response.getStatus();

    // Record a parse failure on the remote.
    auto getMoreParseStatus = GetMoreResponse::parseFromBSON(cbData.response.getValue().data);
    if (!getMoreParseStatus.isOK()) {
        _remotes[remoteIndex].status = getMoreParseStatus.getStatus();

    auto getMoreResponse = getMoreParseStatus.getValue();

    // If we have a cursor established, and we get a non-zero cursorid that is not equal to the
    // established cursorid, we will fail the operation.
    if (remote.cursorId && getMoreResponse.cursorId != 0 &&
        *remote.cursorId != getMoreResponse.cursorId) {
        _remotes[remoteIndex].status =
                   str::stream() << "Expected cursorid " << *remote.cursorId << " but received "
                                 << getMoreResponse.cursorId);

    remote.cursorId = getMoreResponse.cursorId;

    // NOTE(review): the body of this loop over the returned batch is not visible here.
    for (const auto& obj : getMoreResponse.batch) {

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !getMoreResponse.batch.empty()) {

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
// Performs one unit of work for the delete stage and returns the resulting stage state.
//
// 'out' receives a WorkingSetID when a deleted document is returned (ADVANCED), when the child's
// id is forwarded (FAILURE / DEAD / NEED_YIELD), or WorkingSet::INVALID_ID when this stage asks
// for a yield after a WriteConflictException.
//
// Two members carry state between calls: '_idRetrying' (a member whose delete must be retried)
// and '_idReturning' (a member already deleted but not yet returned).
//
// NOTE(review): this excerpt appears to have lines elided (closing braces and several
// statements, including the bodies of some try blocks and of the ADVANCED case). The comments
// describe only what is visible.
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        // Hand the pending member to the caller and clear the pending marker.
        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);

        // Every state other than ADVANCED is returned to the caller from this switch.
        switch (status) {
        case PlanStage::ADVANCED:

        case PlanStage::FAILURE:
        case PlanStage::DEAD:
            *out = id;

            // If a stage fails, it may create a status WSM to indicate why it failed, in which
            // case 'id' is valid.  If ID is invalid, we create our own error message.
            if (WorkingSet::INVALID_ID == id) {
                const std::string errmsg = "delete stage failed to read in results from child";
                *out = WorkingSetCommon::allocateStatusMember(
                           _ws, Status(ErrorCodes::InternalError, errmsg));
            return status;

        case PlanStage::NEED_TIME:
            return status;

        case PlanStage::NEED_YIELD:
            *out = id;
            return status;

        case PlanStage::IS_EOF:
            return status;


    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    if (!member->hasRecordId()) {
        // We expect to be here because of an invalidation causing a force-fetch.
        return PlanStage::NEED_TIME;
    RecordId recordId = member->recordId;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.

    try {
        // If the snapshot changed, then we have to make sure we have the latest copy of the
        // doc and that it still matches.
        std::unique_ptr<SeekableRecordCursor> cursor;
        if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
            cursor = _collection->getCursor(getOpCtx());
            if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                // Doc is already deleted. Nothing more to do.
                return PlanStage::NEED_TIME;

            // Make sure the re-fetched doc still matches the predicate.
            if (_params.canonicalQuery &&
                    !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                // Doesn't match.
                return PlanStage::NEED_TIME;

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        if (_params.returnDeleted) {
            // Save a copy of the document that is about to get deleted, but keep it in the
            // RID_AND_OBJ state in case we need to retry deleting it.
            BSONObj deletedDoc = member->obj.value();

        // TODO: Do we want to buffer docs and delete them in a group rather than
        // saving/restoring state repeatedly?

        // NOTE(review): the body of this try block and of its handler are not visible here.
        try {
        } catch (const WriteConflictException& wce) {

        // Do the write, unless this is an explain.
        if (!_params.isExplain) {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(), recordId, _params.fromMigrate);

    } catch (const WriteConflictException& wce) {
        // When we're doing a findAndModify with a sort, the sort will have a limit of 1, so will
        // not produce any more results even if there is another matching document. Re-throw the WCE
        // here so that these operations get another chance to find a matching document. The
        // findAndModify command should automatically retry if it gets a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
        // Remember this member so the next call retries deleting it.
        _idRetrying = id;
        memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'recordId' from the WorkingSetMember before returning it.
        member->recordId = RecordId();

    //  As restoreState may restore (recreate) cursors, cursors are tied to the
    //  transaction in which they are created, and a WriteUnitOfWork is a
    //  transaction, make sure to restore the state outside of the WriteUnitOfWork.
    try {
    } catch (const WriteConflictException& wce) {
        // Note we don't need to retry anything in this case since the delete already
        // was committed. However, we still need to return the deleted document
        // (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        return PlanStage::ADVANCED;

    // Nothing to return to the caller for this member.
    return PlanStage::NEED_TIME;
Example #30
    /**
     * Runs a query using the following steps:
     *   1) Parsing.
     *   2) Acquire locks.
     *   3) Plan query, obtaining an executor that can run it.
     *   4) Setup a cursor for the query, which may be used on subsequent getMores.
     *   5) Generate the first batch.
     *   6) Save state for getMore.
     *   7) Generate response to send to the client.
     *
     * TODO: Rather than using the sharding version available in thread-local storage (i.e. the
     *       call to ShardingState::needCollectionMetadata() below), shard version information
     *       should be passed as part of the command parameter.
     */
    // Command entry point for the find command on the namespace parsed from 'dbname' and
    // 'cmdObj'.
    //
    // Returns the value of appendCommandStatus() on the visible failure paths (invalid
    // namespace, call from a direct client, parse / canonicalize / planning failures, executor
    // error) and true after the cursor response object has been appended to 'result'. Throws
    // SendStaleConfigException when the sharding version changes while the plan executor is
    // being obtained.
    //
    // NOTE(review): this excerpt appears to have lines elided (closing braces and parts of
    // several statements, including the batch-building loop body and the cursor save logic).
    // The comments describe only what is visible.
    bool run(OperationContext* txn,
             const std::string& dbname,
             BSONObj& cmdObj,
             int options,
             std::string& errmsg,
             BSONObjBuilder& result) override {
        const std::string fullns = parseNs(dbname, cmdObj);
        const NamespaceString nss(fullns);
        if (!nss.isValid()) {
            return appendCommandStatus(result,
                                        str::stream() << "Invalid collection name: " << nss.ns()});

        // Although it is a command, a find command gets counted as a query.

        if (txn->getClient()->isInDirectClient()) {
            return appendCommandStatus(
                Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()"));

        // 1a) Parse the command BSON to a LiteParsedQuery.
        const bool isExplain = false;
        auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain);
        if (!lpqStatus.isOK()) {
            return appendCommandStatus(result, lpqStatus.getStatus());

        auto& lpq = lpqStatus.getValue();

        // Validate term, if provided.
        if (auto term = lpq->getReplicationTerm()) {
            auto replCoord = repl::ReplicationCoordinator::get(txn);
            Status status = replCoord->updateTerm(*term);
            // Note: updateTerm returns ok if term stayed the same.
            if (!status.isOK()) {
                return appendCommandStatus(result, status);

        // Fill out curop information.
        long long ntoreturn = lpq->getBatchSize().value_or(0);
        beginQueryOp(txn, nss, cmdObj, ntoreturn, lpq->getSkip());

        // 1b) Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery.
        WhereCallbackReal whereCallback(txn, nss.db());
        auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), whereCallback);
        if (!statusWithCQ.isOK()) {
            return appendCommandStatus(result, statusWithCQ.getStatus());
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // 2) Acquire locks.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        // Use the database's profiling level when the database exists, otherwise the server
        // default.
        const int dbProfilingLevel =
            ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

        ShardingState* const shardingState = ShardingState::get(txn);

        // It is possible that the sharding version will change during yield while we are
        // retrieving a plan executor. If this happens we will throw an error and mongos will
        // retry.
        const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());

        // 3) Get the execution plan for the query.
        auto statusWithPlanExecutor =
            getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());

        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while
        // the chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // Version changed while retrieving a PlanExecutor. Terminate the operation,
            // signaling that mongos should retry.
            throw SendStaleConfigException(nss.ns(),
                                           "version changed during find command",

        if (!collection) {
            // No collection. Just fill out curop indicating that there were zero results and
            // there is no ClientCursor id, and then return.
            const long long numResults = 0;
            const CursorId cursorId = 0;
            endQueryOp(txn, *exec, dbProfilingLevel, numResults, cursorId);
            appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result);
            return true;

        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // 4) If possible, register the execution plan inside a ClientCursor, and pin that
        // cursor. In this case, ownership of the PlanExecutor is transferred to the
        // ClientCursor, and 'exec' becomes null.
        // First unregister the PlanExecutor so it can be re-registered with ClientCursor.

        // Create a ClientCursor containing this plan executor. We don't have to worry
        // about leaking it as it's inserted into a global map by its ctor.
        ClientCursor* cursor =
            new ClientCursor(collection->getCursorManager(),
        CursorId cursorId = cursor->cursorid();
        ClientCursorPin ccPin(collection->getCursorManager(), cursorId);

        // On early return, get rid of the cursor.
        ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin);

        PlanExecutor* cursorExec = cursor->getExecutor();

        // 5) Stream query results, adding them to a BSONArray as we go.
        BSONArrayBuilder firstBatch;
        BSONObj obj;
        PlanExecutor::ExecState state;
        long long numResults = 0;
        // Keep pulling documents while the first batch is not yet full and the executor
        // keeps advancing.
        while (!enoughForFirstBatch(pq, numResults, firstBatch.len()) &&
               PlanExecutor::ADVANCED == (state = cursorExec->getNext(&obj, NULL))) {
            // If adding this object will cause us to exceed the BSON size limit, then we stash
            // it for later.
            if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) {

            // Add result to output buffer.

        // Throw an assertion if query execution fails for any reason.
        if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
            const std::unique_ptr<PlanStageStats> stats(cursorExec->getStats());
            error() << "Plan executor error during find command: " << PlanExecutor::statestr(state)
                    << ", stats: " << Explain::statsToBSON(*stats);

            return appendCommandStatus(result,
                                                  << "Executor error during find command: "
                                                  << WorkingSetCommon::toStatusString(obj)));

        // 6) Set up the cursor for getMore.
        if (shouldSaveCursor(txn, collection, state, cursorExec)) {
            // State will be restored on getMore.

        } else {
            // The cursor is not being saved, so report a cursor id of zero.
            cursorId = 0;

        // Fill out curop based on the results.
        endQueryOp(txn, *cursorExec, dbProfilingLevel, numResults, cursorId);

        // 7) Generate the response object to send to the client.
        appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result);
        if (cursorId) {
        return true;