Example #1
        virtual void accepted(boost::shared_ptr<Socket> psocket, long long connectionId ) {
            ScopeGuard sleepAfterClosingPort = MakeGuard(sleepmillis, 2);
            std::auto_ptr<MessagingPortWithHandler> portWithHandler(
                new MessagingPortWithHandler(psocket, _handler, connectionId));

            if ( ! Listener::globalTicketHolder.tryAcquire() ) {
                log() << "connection refused because too many open connections: " << Listener::globalTicketHolder.used() << endl;
                return;
            }

            try {
#ifndef __linux__  // TODO: consider making this ifdef _WIN32
                {
                    boost::thread thr(stdx::bind(&handleIncomingMsg, portWithHandler.get()));
                }
#else
                pthread_attr_t attrs;
                pthread_attr_init(&attrs);
                pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);

                static const size_t STACK_SIZE = 1024*1024; // if we change this we need to update the warning

                struct rlimit limits;
                verify(getrlimit(RLIMIT_STACK, &limits) == 0);
                if (limits.rlim_cur > STACK_SIZE) {
                    size_t stackSizeToSet = STACK_SIZE;
#if !__has_feature(address_sanitizer)
                    if (kDebugBuild)
                        stackSizeToSet /= 2;
#endif
                    pthread_attr_setstacksize(&attrs, stackSizeToSet);
                } else if (limits.rlim_cur < 1024*1024) {
                    warning() << "Stack size set to " << (limits.rlim_cur/1024) << "KB. We suggest 1MB" << endl;
                }


                pthread_t thread;
                int failed =
                    pthread_create(&thread, &attrs, &handleIncomingMsg, portWithHandler.get());

                pthread_attr_destroy(&attrs);

                if (failed) {
                    log() << "pthread_create failed: " << errnoWithDescription(failed) << endl;
                    throw boost::thread_resource_error(); // for consistency with boost::thread
                }
#endif  // __linux__

                portWithHandler.release();
                sleepAfterClosingPort.Dismiss();
            }
            catch ( boost::thread_resource_error& ) {
                Listener::globalTicketHolder.release();
                log() << "can't create new thread, closing connection" << endl;
            }
            catch ( ... ) {
                Listener::globalTicketHolder.release();
                log() << "unknown error accepting new socket" << endl;
            }
        }
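
The common thread in every example here is the same idiom: construct a guard whose destructor runs a cleanup action when the scope is left, and call Dismiss() once the happy path has succeeded so the cleanup is skipped. The sketch below is a minimal, illustrative re-implementation of that idea only; it is not the actual Loki/MongoDB ScopeGuard, which also offers object guards (MakeObjGuard), argument binding, and exception policies.

#include <utility>

// Minimal sketch of a dismissable scope guard (illustrative only).
template <typename F>
class SimpleScopeGuard {
public:
    explicit SimpleScopeGuard(F fn) : _fn(std::move(fn)), _active(true) {}
    SimpleScopeGuard(SimpleScopeGuard&& other)
        : _fn(std::move(other._fn)), _active(other._active) {
        other._active = false;
    }
    SimpleScopeGuard(const SimpleScopeGuard&) = delete;
    SimpleScopeGuard& operator=(const SimpleScopeGuard&) = delete;
    ~SimpleScopeGuard() {
        if (_active)
            _fn();  // cleanup runs on scope exit unless dismissed
    }
    void Dismiss() {
        _active = false;  // success path: cancel the cleanup
    }

private:
    F _fn;
    bool _active;
};

template <typename F>
SimpleScopeGuard<F> makeSimpleGuard(F fn) {
    return SimpleScopeGuard<F>(std::move(fn));
}

// Usage mirrors the pattern above, e.g.:
//     auto guard = makeSimpleGuard([&] { Listener::globalTicketHolder.release(); });
//     ... work that may throw or return early ...
//     guard.Dismiss();  // keep the ticket on success
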
Example #2
StatusWith<EventHandle> ScatterGatherRunner::RunnerImpl::start(
    const RemoteCommandCallbackFn processResponseCB) {
    LockGuard lk(_mutex);

    invariant(!_started);
    _started = true;
    StatusWith<EventHandle> evh = _executor->makeEvent();
    if (!evh.isOK()) {
        return evh;
    }
    _sufficientResponsesReceived = evh.getValue();
    ScopeGuard earlyReturnGuard = MakeGuard(&RunnerImpl::_signalSufficientResponsesReceived, this);

    std::vector<RemoteCommandRequest> requests = _algorithm->getRequests();
    for (size_t i = 0; i < requests.size(); ++i) {
        log() << "Scheduling remote command request for " << _logMessage << ": "
              << requests[i].toString();
        const StatusWith<CallbackHandle> cbh =
            _executor->scheduleRemoteCommand(requests[i], processResponseCB);
        if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
            return StatusWith<EventHandle>(cbh.getStatus());
        }
        fassert(18743, cbh.getStatus());
        _callbacks.push_back(cbh.getValue());
    }

    if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) {
        invariant(_algorithm->hasReceivedSufficientResponses());
        _signalSufficientResponsesReceived();
    }

    earlyReturnGuard.Dismiss();
    return evh;
}
Example #3
 SSL* SSLManager::accept(int fd) {
     SSL* ssl = _secure(fd);
     ScopeGuard guard = MakeGuard(::SSL_free, ssl);
     int ret = SSL_accept(ssl);
     if (ret != 1)
         _handleSSLError(SSL_get_error(ssl, ret));
     guard.Dismiss();
     return ssl;
 }
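
For comparison, the same free-on-failure behavior can be expressed without a guard by parking the SSL* in a std::unique_ptr whose deleter is ::SSL_free and releasing it on success. A rough sketch, assuming the same _secure() and _handleSSLError() helpers used above and assuming _handleSSLError() throws on fatal errors:

#include <memory>

SSL* SSLManager::accept(int fd) {
    // The unique_ptr owns the SSL* during the handshake; SSL_free runs on any
    // early exit (for example, if _handleSSLError throws).
    std::unique_ptr<SSL, decltype(&::SSL_free)> ssl(_secure(fd), ::SSL_free);
    int ret = SSL_accept(ssl.get());
    if (ret != 1)
        _handleSSLError(SSL_get_error(ssl.get(), ret));
    return ssl.release();  // success: hand ownership back to the caller
}
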
Example #4
    void GlobalEnvironmentMongoD::setGlobalStorageEngine(const std::string& name) {
        // This should be set once.
        invariant(!_storageEngine);

        const StorageEngine::Factory* factory = _storageFactories[name];

        uassert(18656, str::stream()
            << "Cannot start server with an unknown storage engine: " << name,
            factory);

        std::string canonicalName = factory->getCanonicalName().toString();

        // Do not proceed if data directory has been used by a different storage engine previously.
        std::auto_ptr<StorageEngineMetadata> metadata =
            StorageEngineMetadata::validate(storageGlobalParams.dbpath, canonicalName);

        // Validate options in metadata against current startup options.
        if (metadata.get()) {
            uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));
        }

        try {
            _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath));
        }
        catch (const std::exception& ex) {
            uassert(28596, str::stream()
                << "Unable to determine status of lock file in the data directory "
                << storageGlobalParams.dbpath << ": " << ex.what(),
                false);
        }
        if (_lockFile->createdByUncleanShutdown()) {
            warning() << "Detected unclean shutdown - "
                      << _lockFile->getFilespec() << " is not empty.";
        }
        uassertStatusOK(_lockFile->open());

        ScopeGuard guard = MakeGuard(&StorageEngineLockFile::close, _lockFile.get());
        _storageEngine = factory->create(storageGlobalParams, *_lockFile);
        _storageEngine->finishInit();
        uassertStatusOK(_lockFile->writePid());

        // Write a new metadata file if it is not present.
        if (!metadata.get()) {
            metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));
            metadata->setStorageEngine(canonicalName);
            metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams));
            uassertStatusOK(metadata->write());
        }

        guard.Dismiss();

        _supportsDocLocking = _storageEngine->supportsDocLocking();
    }
Example #5
        IndexInsertionContinuation *beginInsertIntoIndex(
                int idxNo, IndexDetails &_idx,
                DiskLoc _recordLoc, const BSONObj &_key,
                const Ordering& _order, bool dupsAllowed) {

            IndexInsertionContinuationImpl<V> *continuation = new IndexInsertionContinuationImpl<V>(
                    _idx.head, _recordLoc, _key, _order, _idx);
            ScopeGuard allocGuard = MakeGuard(boost::checked_delete<IndexInsertionContinuation>,
                                              continuation);
            _idx.head.btree<V>()->twoStepInsert(_idx.head, *continuation, dupsAllowed);
            allocGuard.Dismiss();
            return continuation;
        }
Example #6
template <typename F>
Status CollectionBulkLoaderImpl::_runTaskReleaseResourcesOnFailure(F task) noexcept {

    AlternativeClientRegion acr(_client);
    ScopeGuard guard = MakeGuard(&CollectionBulkLoaderImpl::_releaseResources, this);
    try {
        const auto status = [&task]() noexcept {
            return task();
        }();
        if (status.isOK()) {
            guard.Dismiss();
        }
        return status;
    } catch (...) {
        std::terminate();
    }
}
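
A call site for a helper like this typically wraps one step of the bulk load in a lambda that returns Status; any non-OK result leaves the guard armed so that _releaseResources() runs. The sketch below is hypothetical: insertDocuments() and _insertOneDocument() are invented names used only to illustrate the calling convention.

Status CollectionBulkLoaderImpl::insertDocuments(const std::vector<BSONObj>& docs) {
    return _runTaskReleaseResourcesOnFailure([&] {
        for (const auto& doc : docs) {
            // _insertOneDocument is a hypothetical per-document helper.
            const Status s = _insertOneDocument(doc);
            if (!s.isOK())
                return s;  // non-OK status: the guard stays armed and releases resources
        }
        return Status::OK();  // OK status: the helper calls guard.Dismiss()
    });
}
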
Example #7
    SSLConnection* SSLManager::accept(Socket* socket) {
        SSLConnection* sslConn = new SSLConnection(_serverContext, socket);
        ScopeGuard sslGuard = MakeGuard(::SSL_free, sslConn->ssl);
        ScopeGuard bioGuard = MakeGuard(::BIO_free, sslConn->networkBIO);
 
        int ret;
        do {
            ret = ::SSL_accept(sslConn->ssl);
        } while(!_doneWithSSLOp(sslConn, ret));
 
        if (ret != 1)
            _handleSSLError(SSL_get_error(sslConn, ret));
 
        sslGuard.Dismiss();
        bioGuard.Dismiss();
        return sslConn;
    }
Example #8
    StatusWith<ReplicationExecutor::EventHandle> ScatterGatherRunner::start(
            ReplicationExecutor* executor,
            const stdx::function<void ()>& onCompletion) {

        invariant(!_started);
        _started = true;
        _actualResponses = 0;
        _onCompletion = onCompletion;
        StatusWith<ReplicationExecutor::EventHandle> evh = executor->makeEvent();
        if (!evh.isOK()) {
            return evh;
        }
        _sufficientResponsesReceived = evh.getValue();
        ScopeGuard earlyReturnGuard = MakeGuard(
                &ScatterGatherRunner::_signalSufficientResponsesReceived,
                this,
                executor);

        const ReplicationExecutor::RemoteCommandCallbackFn cb = stdx::bind(
                &ScatterGatherRunner::_processResponse,
                stdx::placeholders::_1,
                this);

        std::vector<RemoteCommandRequest> requests = _algorithm->getRequests();
        for (size_t i = 0; i < requests.size(); ++i) {
            const StatusWith<ReplicationExecutor::CallbackHandle> cbh =
                executor->scheduleRemoteCommand(requests[i], cb);
            if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
                return StatusWith<ReplicationExecutor::EventHandle>(cbh.getStatus());
            }
            fassert(18743, cbh.getStatus());
            _callbacks.push_back(cbh.getValue());
        }

        if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) {
            invariant(_algorithm->hasReceivedSufficientResponses());
            _signalSufficientResponsesReceived(executor);
        }

        earlyReturnGuard.Dismiss();
        return evh;
    }
Example #9
Junk * Attic::RemoveUselessJunk( void )
{
    ScopeGuard callGuard = MakeObjGuard( *this, &Attic::CallObservers );
    ScopeGuard testGuard = MakeObjGuard( *this, &Attic::CheckInvariants, __FUNCTION__, __LINE__ );
    ScopeGuard fixGuard = MakeObjGuard( *this, &Attic::EliminateHoles );
    callGuard.SetExceptionPolicy( ScopeGuardImplBase::CallIfNoException );
    (void)testGuard;
    (void)fixGuard;

    for ( JunkPileIter it( m_storage.begin() );
          it != m_storage.end();
          ++it )
    {
        Junk * junk = *it;
        if ( junk->IsUgly() )
            return junk;
        if ( junk->IsUseful() )
            continue;
        delete junk;
        *it = NULL;
    }

    return NULL;
}
Example #10
void AsyncClusterClientCursor::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEvent_inlock();

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }
        return;
    }

    // From this point on, early returns should signal anyone waiting on an event, if ready()
    // is true.
    ScopeGuard signaller = MakeGuard(&AsyncClusterClientCursor::signalCurrentEvent_inlock, this);

    if (!cbData.response.isOK()) {
        _remotes[remoteIndex].status = cbData.response.getStatus();
        return;
    }

    auto getMoreParseStatus = GetMoreResponse::parseFromBSON(cbData.response.getValue().data);
    if (!getMoreParseStatus.isOK()) {
        _remotes[remoteIndex].status = getMoreParseStatus.getStatus();
        return;
    }

    auto getMoreResponse = getMoreParseStatus.getValue();

    // If we have a cursor established, and we get a non-zero cursorid that is not equal to the
    // established cursorid, we will fail the operation.
    if (remote.cursorId && getMoreResponse.cursorId != 0 &&
        *remote.cursorId != getMoreResponse.cursorId) {
        _remotes[remoteIndex].status =
            Status(ErrorCodes::BadValue,
                   str::stream() << "Expected cursorid " << *remote.cursorId << " but received "
                                 << getMoreResponse.cursorId);
        return;
    }

    remote.cursorId = getMoreResponse.cursorId;

    for (const auto& obj : getMoreResponse.batch) {
        remote.docBuffer.push(obj);
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !getMoreResponse.batch.empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // ScopeGuard requires Dismiss() on success, but we want waiters to be signalled on success as
    // well as on failure.
    signaller.Dismiss();
    signalCurrentEvent_inlock();
}
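
The final two lines above show a recurring wrinkle: the guard exists so that every early return signals waiters, but the success path wants the same signal, so the code dismisses the guard and then calls the function explicitly (dismissing first prevents a second signal from the guard's destructor). Reduced to its essentials, with notifyWaiters() as a hypothetical stand-in for signalCurrentEvent_inlock():

void processResponseSketch(bool parseFailed) {
    // Signals waiters on any early return out of this function.
    ScopeGuard signaller = MakeGuard(&notifyWaiters);

    if (parseFailed)
        return;  // the guard fires and signals the waiters

    // ... handle the successful response ...

    signaller.Dismiss();  // avoid a duplicate signal from the destructor
    notifyWaiters();      // signal exactly once on the success path as well
}
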
Example #11
void ServiceContextMongoD::initializeGlobalStorageEngine() {
    // This should be set once.
    invariant(!_storageEngine);

    // We should have a _lockFile or be in read-only mode. Confusingly, we can still have a lockFile
    // if we are in read-only mode. This can happen if the server is started in read-only mode on a
    // writable dbpath.
    invariant(_lockFile || storageGlobalParams.readOnly);

    const std::string dbpath = storageGlobalParams.dbpath;
    if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) {
        if (storageGlobalParams.engineSetByUser) {
            // Verify that the name of the user-supplied storage engine matches the contents of
            // the metadata file.
            const StorageEngine::Factory* factory =
                mapFindWithDefault(_storageFactories,
                                   storageGlobalParams.engine,
                                   static_cast<const StorageEngine::Factory*>(nullptr));

            if (factory) {
                uassert(28662,
                        str::stream()
                            << "Cannot start server. Detected data files in " << dbpath
                            << " created by"
                            << " the '" << *existingStorageEngine << "' storage engine, but the"
                            << " specified storage engine was '" << factory->getCanonicalName()
                            << "'.",
                        factory->getCanonicalName() == *existingStorageEngine);
            }
        } else {
            // Otherwise set the active storage engine as the contents of the metadata file.
            log() << "Detected data files in " << dbpath << " created by the '"
                  << *existingStorageEngine << "' storage engine, so setting the active"
                  << " storage engine to '" << *existingStorageEngine << "'.";
            storageGlobalParams.engine = *existingStorageEngine;
        }
    } else if (!storageGlobalParams.engineSetByUser) {
        // Ensure the default storage engine is available with this build of mongod.
        uassert(28663,
                str::stream()
                    << "Cannot start server. The default storage engine '"
                    << storageGlobalParams.engine
                    << "' is not available with this build of mongod. Please specify a different"
                    << " storage engine explicitly, e.g. --storageEngine=mmapv1.",
                isRegisteredStorageEngine(storageGlobalParams.engine));
    }

    const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine];

    uassert(18656,
            str::stream() << "Cannot start server with an unknown storage engine: "
                          << storageGlobalParams.engine,
            factory);

    if (storageGlobalParams.readOnly) {
        uassert(34368,
                str::stream()
                    << "Server was started in read-only mode, but the configured storage engine, "
                    << storageGlobalParams.engine << ", does not support read-only operation",
                factory->supportsReadOnly());
    }

    std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath);

    if (storageGlobalParams.readOnly) {
        uassert(34415,
                "Server was started in read-only mode, but the storage metadata file was not"
                " found.",
                metadata.get());
    }

    // Validate options in metadata against current startup options.
    if (metadata.get()) {
        uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));
    }

    ScopeGuard guard = MakeGuard([&] {
        if (_lockFile) {
            _lockFile->close();
        }
    });

    _storageEngine = factory->create(storageGlobalParams, _lockFile.get());
    _storageEngine->finishInit();

    if (_lockFile) {
        uassertStatusOK(_lockFile->writePid());
    }

    // Write a new metadata file if it is not present.
    if (!metadata.get()) {
        invariant(!storageGlobalParams.readOnly);
        metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));
        metadata->setStorageEngine(factory->getCanonicalName().toString());
        metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams));
        uassertStatusOK(metadata->write());
    }

    guard.Dismiss();

    _supportsDocLocking = _storageEngine->supportsDocLocking();
}
Example #12
PlanStage::StageState DeleteStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasLoc()) {
            // We expect to be here because of an invalidation causing a force-fetch, and
            // doc-locking storage engines do not issue invalidations.
            ++_specificStats.nInvalidateSkips;
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
        RecordId rloc = member->loc;
        // Deletes can't have projections. This means that covering analysis will always add
        // a fetch. We should always get fetched data, and never just key data.
        invariant(member->hasObj());

        try {
            // If the snapshot changed, then we have to make sure we have the latest copy of the
            // doc and that it still matches.
            std::unique_ptr<RecordCursor> cursor;
            if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
                cursor = _collection->getCursor(getOpCtx());
                if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                    // Doc is already deleted. Nothing more to do.
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }

                // Make sure the re-fetched doc still matches the predicate.
                if (_params.canonicalQuery &&
                    !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                    // Doesn't match.
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }
            }

            // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
            // is allowed to free the memory.
            if (_params.returnDeleted) {
                member->makeObjOwnedIfNeeded();
            }

            // TODO: Do we want to buffer docs and delete them in a group rather than
            // saving/restoring state repeatedly?

            try {
                if (supportsDocLocking()) {
                    // Doc-locking engines require this before saveState() since they don't use
                    // invalidations.
                    WorkingSetCommon::prepareForSnapshotChange(_ws);
                }
                child()->saveState();
            } catch (const WriteConflictException& wce) {
                std::terminate();
            }

            if (_params.returnDeleted) {
                // Save a copy of the document that is about to get deleted.
                BSONObj deletedDoc = member->obj.value();
                member->obj.setValue(deletedDoc.getOwned());
                member->loc = RecordId();
                member->transitionToOwnedObj();
            }

            // Do the write, unless this is an explain.
            if (!_params.isExplain) {
                WriteUnitOfWork wunit(getOpCtx());
                _collection->deleteDocument(getOpCtx(), rloc);
                wunit.commit();
            }

            ++_specificStats.docsDeleted;
        } catch (const WriteConflictException& wce) {
            // Ensure that the BSONObj underlying the WorkingSetMember is owned because it may be
            // freed when we yield.
            member->makeObjOwnedIfNeeded();
            _idRetrying = id;
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            *out = WorkingSet::INVALID_ID;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        //  As restoreState() may restore (recreate) cursors, and cursors are tied to the
        //  transaction in which they were created, and a WriteUnitOfWork is a transaction,
        //  make sure to restore state outside of the WriteUnitOfWork.
        try {
            child()->restoreState();
        } catch (const WriteConflictException& wce) {
            // Note we don't need to retry anything in this case since the delete already
            // was committed. However, we still need to return the deleted document
            // (if it was requested).
            if (_params.returnDeleted) {
                // member->obj should refer to the deleted document.
                invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

                _idReturning = id;
                // Keep this member around so that we can return it on the next work() call.
                memberFreer.Dismiss();
            }
            *out = WorkingSet::INVALID_ID;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            memberFreer.Dismiss();  // Keep this member around so we can return it.
            *out = id;
            ++_commonStats.advanced;
            return PlanStage::ADVANCED;
        }

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which case
        // 'id' is valid.  If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "delete stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
        }
        return status;
    } else if (PlanStage::NEED_TIME == status) {
        ++_commonStats.needTime;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
        ++_commonStats.needYield;
    }

    return status;
}
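
DeleteStage uses the guard with the opposite default: the WorkingSetMember is freed on every exit from work() unless a branch explicitly calls Dismiss() to keep it, either to retry it after a WriteConflictException or to return the deleted document. A condensed, illustrative view of that control flow, where needRetry() and shouldReturnDoc() are hypothetical stand-ins for the real conditions:

PlanStage::StageState sketchOfWork(WorkingSet* ws, WorkingSetID id) {
    // Default: free the member on every path out of this scope.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, ws, id);

    if (needRetry(id)) {          // e.g. a WriteConflictException occurred
        memberFreer.Dismiss();    // keep the member so it can be retried
        return PlanStage::NEED_YIELD;
    }
    if (shouldReturnDoc(id)) {    // e.g. _params.returnDeleted is set
        memberFreer.Dismiss();    // keep the member so the caller can consume it
        return PlanStage::ADVANCED;
    }
    return PlanStage::NEED_TIME;  // the guard frees the member here
}
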
Example #13
void ServiceContextMongoD::initializeGlobalStorageEngine() {
    // This should be set once.
    invariant(!_storageEngine);

    const std::string dbpath = storageGlobalParams.dbpath;
    if (auto existingStorageEngine = StorageEngineMetadata::getStorageEngineForPath(dbpath)) {
        if (storageGlobalParams.engineSetByUser) {
            // Verify that the name of the user-supplied storage engine matches the contents of
            // the metadata file.
            const StorageEngine::Factory* factory =
                mapFindWithDefault(_storageFactories,
                                   storageGlobalParams.engine,
                                   static_cast<const StorageEngine::Factory*>(nullptr));

            if (factory) {
                uassert(28662,
                        str::stream()
                            << "Cannot start server. Detected data files in " << dbpath
                            << " created by"
                            << " the '" << *existingStorageEngine << "' storage engine, but the"
                            << " specified storage engine was '" << factory->getCanonicalName()
                            << "'.",
                        factory->getCanonicalName() == *existingStorageEngine);
            }
        } else {
            // Otherwise set the active storage engine as the contents of the metadata file.
            log() << "Detected data files in " << dbpath << " created by the '"
                  << *existingStorageEngine << "' storage engine, so setting the active"
                  << " storage engine to '" << *existingStorageEngine << "'.";
            storageGlobalParams.engine = *existingStorageEngine;
        }
    } else if (!storageGlobalParams.engineSetByUser) {
        // Ensure the default storage engine is available with this build of mongod.
        uassert(28663,
                str::stream()
                    << "Cannot start server. The default storage engine '"
                    << storageGlobalParams.engine
                    << "' is not available with this build of mongod. Please specify a different"
                    << " storage engine explicitly, e.g. --storageEngine=mmapv1.",
                isRegisteredStorageEngine(storageGlobalParams.engine));
    }

    const StorageEngine::Factory* factory = _storageFactories[storageGlobalParams.engine];

    uassert(18656,
            str::stream() << "Cannot start server with an unknown storage engine: "
                          << storageGlobalParams.engine,
            factory);

    std::unique_ptr<StorageEngineMetadata> metadata = StorageEngineMetadata::forPath(dbpath);

    // Validate options in metadata against current startup options.
    if (metadata.get()) {
        uassertStatusOK(factory->validateMetadata(*metadata, storageGlobalParams));
    }

    try {
        _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath));
    } catch (const std::exception& ex) {
        uassert(28596,
                str::stream() << "Unable to determine status of lock file in the data directory "
                              << storageGlobalParams.dbpath << ": " << ex.what(),
                false);
    }
    if (_lockFile->createdByUncleanShutdown()) {
        warning() << "Detected unclean shutdown - " << _lockFile->getFilespec() << " is not empty.";
    }
    uassertStatusOK(_lockFile->open());

    ScopeGuard guard = MakeGuard(&StorageEngineLockFile::close, _lockFile.get());
    _storageEngine = factory->create(storageGlobalParams, *_lockFile);
    _storageEngine->finishInit();
    uassertStatusOK(_lockFile->writePid());

    // Write a new metadata file if it is not present.
    if (!metadata.get()) {
        metadata.reset(new StorageEngineMetadata(storageGlobalParams.dbpath));
        metadata->setStorageEngine(factory->getCanonicalName().toString());
        metadata->setStorageEngineOptions(factory->createMetadataOptions(storageGlobalParams));
        uassertStatusOK(metadata->write());
    }

    guard.Dismiss();

    _supportsDocLocking = _storageEngine->supportsDocLocking();
}
Example #14
void WiredTigerSizeStorer::syncCache(bool syncToDisk) {
    stdx::lock_guard<stdx::mutex> cursorLock(_cursorMutex);
    _checkMagic();

    Map myMap;
    {
        stdx::lock_guard<stdx::mutex> lk(_entriesMutex);
        for (Map::iterator it = _entries.begin(); it != _entries.end(); ++it) {
            std::string uriKey = it->first;
            Entry& entry = it->second;
            if (entry.rs) {
                if (entry.dataSize != entry.rs->dataSize(NULL)) {
                    entry.dataSize = entry.rs->dataSize(NULL);
                    entry.dirty = true;
                }
                if (entry.numRecords != entry.rs->numRecords(NULL)) {
                    entry.numRecords = entry.rs->numRecords(NULL);
                    entry.dirty = true;
                }
            }

            if (!entry.dirty)
                continue;
            myMap[uriKey] = entry;
        }
    }

    if (myMap.empty())
        return;  // Nothing to do.

    WT_SESSION* session = _session.getSession();
    invariantWTOK(session->begin_transaction(session, syncToDisk ? "sync=true" : ""));
    ScopeGuard rollbacker = MakeGuard(session->rollback_transaction, session, "");

    for (Map::iterator it = myMap.begin(); it != myMap.end(); ++it) {
        string uriKey = it->first;
        Entry& entry = it->second;

        BSONObj data;
        {
            BSONObjBuilder b;
            b.append("numRecords", entry.numRecords);
            b.append("dataSize", entry.dataSize);
            data = b.obj();
        }

        LOG(2) << "WiredTigerSizeStorer::storeInto " << uriKey << " -> " << redact(data);

        WiredTigerItem key(uriKey.c_str(), uriKey.size());
        WiredTigerItem value(data.objdata(), data.objsize());
        _cursor->set_key(_cursor, key.Get());
        _cursor->set_value(_cursor, value.Get());
        invariantWTOK(_cursor->insert(_cursor));
    }

    invariantWTOK(_cursor->reset(_cursor));

    rollbacker.Dismiss();
    invariantWTOK(session->commit_transaction(session, NULL));

    {
        stdx::lock_guard<stdx::mutex> lk(_entriesMutex);
        for (Map::iterator it = _entries.begin(); it != _entries.end(); ++it) {
            it->second.dirty = false;
        }
    }
}
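
Example #14 is the classic transactional use of a guard: arm a rollback immediately after begin_transaction, and dismiss it just before commit, so that any exception, invariant failure, or early return in between rolls the transaction back. The same shape in skeleton form, where beginTxn(), rollbackTxn(), commitTxn(), and writeEntries() are hypothetical helpers:

void flushAtomically(WT_SESSION* session) {
    beginTxn(session);
    // Any exception or early return between here and Dismiss() rolls back.
    ScopeGuard rollbacker = MakeGuard(&rollbackTxn, session);

    writeEntries(session);  // may throw or trigger an invariant failure

    rollbacker.Dismiss();
    commitTxn(session);
}
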
Example #15
    PlanStage::StageState DeleteStage::work(WorkingSetID* out) {
        ++_commonStats.works;

        // Adds the amount of time taken by work() to executionTimeMillis.
        ScopedTimer timer(&_commonStats.executionTimeMillis);

        if (isEOF()) { return PlanStage::IS_EOF; }
        invariant(_collection); // If isEOF() returns false, we must have a collection.

        // Either retry the last WSM we worked on or get a new one from our child.
        WorkingSetID id;
        StageState status;
        if (_idRetrying == WorkingSet::INVALID_ID) {
            status = _child->work(&id);
        }
        else {
            status = ADVANCED;
            id = _idRetrying;
            _idRetrying = WorkingSet::INVALID_ID;
        }

        if (PlanStage::ADVANCED == status) {
            WorkingSetMember* member = _ws->get(id);

            // We want to free this member when we return, unless we need to retry it.
            ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

            if (!member->hasLoc()) {
                // We expect to be here because of an invalidation causing a force-fetch, and
                // doc-locking storage engines do not issue invalidations.
                dassert(!supportsDocLocking());
                ++_specificStats.nInvalidateSkips;
                ++_commonStats.needTime;
                return PlanStage::NEED_TIME;
            }
            RecordId rloc = member->loc;

            try {
                // If the snapshot changed, then we have to make sure we have the latest copy of the
                // doc and that it still matches.
                if (_txn->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
                    if (!WorkingSetCommon::fetch(_txn, member, _collection)) {
                        // Doc is already deleted. Nothing more to do.
                        ++_commonStats.needTime;
                        return PlanStage::NEED_TIME;
                    }

                    // Make sure the re-fetched doc still matches the predicate.
                    if (_params.canonicalQuery &&
                        !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                        // Doesn't match.
                        ++_commonStats.needTime;
                        return PlanStage::NEED_TIME;
                    }
                }

                // TODO: Do we want to buffer docs and delete them in a group rather than
                // saving/restoring state repeatedly?

                try {
                    _child->saveState();
                    if (supportsDocLocking()) {
                        // Doc-locking engines require this after saveState() since they don't use
                        // invalidations.
                        WorkingSetCommon::prepareForSnapshotChange(_ws);
                    }
                }
                catch ( const WriteConflictException& wce ) {
                    std::terminate();
                }

                // Do the write, unless this is an explain.
                if (!_params.isExplain) {
                    WriteUnitOfWork wunit(_txn);

                    const bool deleteCappedOK = false;
                    const bool deleteNoWarn = false;
                    BSONObj deletedDoc;

                    _collection->deleteDocument(_txn, rloc, deleteCappedOK, deleteNoWarn,
                                                _params.shouldCallLogOp ? &deletedDoc : NULL);

                    if (_params.shouldCallLogOp) {
                        if (deletedDoc.isEmpty()) {
                            log() << "Deleted object without id in collection " << _collection->ns()
                            << ", not logging.";
                        }
                        else {
                            getGlobalServiceContext()->getOpObserver()->onDelete(
                                    _txn,
                                    _collection->ns().ns(),
                                    deletedDoc,
                                    _params.fromMigrate);
                        }
                    }

                    wunit.commit();
                }

                ++_specificStats.docsDeleted;
            }
            catch ( const WriteConflictException& wce ) {
                _idRetrying = id;
                memberFreer.Dismiss(); // Keep this member around so we can retry deleting it.
                *out = WorkingSet::INVALID_ID;
                _commonStats.needYield++;
                return NEED_YIELD;
            }

            //  As restoreState() may restore (recreate) cursors, and cursors are tied to the
            //  transaction in which they were created, and a WriteUnitOfWork is a transaction,
            //  make sure to restore state outside of the WriteUnitOfWork.
            try {
                _child->restoreState(_txn);
            }
            catch ( const WriteConflictException& wce ) {
                // Note we don't need to retry anything in this case since the delete already
                // was committed.
                *out = WorkingSet::INVALID_ID;
                _commonStats.needYield++;
                return NEED_YIELD;
            }

            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
        else if (PlanStage::FAILURE == status) {
            *out = id;
            // If a stage fails, it may create a status WSM to indicate why it failed, in which case
            // 'id' is valid.  If ID is invalid, we create our own error message.
            if (WorkingSet::INVALID_ID == id) {
                const std::string errmsg = "delete stage failed to read in results from child";
                *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError,
                                                                          errmsg));
                return PlanStage::FAILURE;
            }
            return status;
        }
        else if (PlanStage::NEED_TIME == status) {
            ++_commonStats.needTime;
        }
        else if (PlanStage::NEED_YIELD == status) {
            *out = id;
            ++_commonStats.needYield;
        }

        return status;
    }
Example #16
Status runAggregate(OperationContext* opCtx,
                    const NamespaceString& origNss,
                    const AggregationRequest& request,
                    const BSONObj& cmdObj,
                    BSONObjBuilder& result) {
    // For operations on views, this will be the underlying namespace.
    NamespaceString nss = request.getNamespaceString();

    // The collation to use for this aggregation. boost::optional to distinguish between the case
    // where the collation has not yet been resolved, and where it has been resolved to nullptr.
    boost::optional<std::unique_ptr<CollatorInterface>> collatorToUse;

    unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec;
    boost::intrusive_ptr<ExpressionContext> expCtx;
    Pipeline* unownedPipeline;
    auto curOp = CurOp::get(opCtx);
    {
        const LiteParsedPipeline liteParsedPipeline(request);

        // Check whether the parsed pipeline supports the given read concern.
        liteParsedPipeline.assertSupportsReadConcern(opCtx, request.getExplain());

        if (liteParsedPipeline.hasChangeStream()) {
            nss = NamespaceString::kRsOplogNamespace;

            // If the read concern is not specified, upgrade to 'majority' and wait to make sure we
            // have a snapshot available.
            if (!repl::ReadConcernArgs::get(opCtx).hasLevel()) {
                const repl::ReadConcernArgs readConcern(
                    repl::ReadConcernLevel::kMajorityReadConcern);
                uassertStatusOK(waitForReadConcern(opCtx, readConcern, true));
            }

            if (!origNss.isCollectionlessAggregateNS()) {
                // AutoGetCollectionForReadCommand will raise an error if 'origNss' is a view.
                AutoGetCollectionForReadCommand origNssCtx(opCtx, origNss);

                // Resolve the collator to either the user-specified collation or the default
                // collation of the collection on which $changeStream was invoked, so that we do not
                // end up resolving the collation on the oplog.
                invariant(!collatorToUse);
                Collection* origColl = origNssCtx.getCollection();
                collatorToUse.emplace(resolveCollator(opCtx, request, origColl));
            }
        }

        const auto& pipelineInvolvedNamespaces = liteParsedPipeline.getInvolvedNamespaces();

        // If emplaced, AutoGetCollectionForReadCommand will throw if the sharding version for this
        // connection is out of date. If the namespace is a view, the lock will be released before
        // re-running the expanded aggregation.
        boost::optional<AutoGetCollectionForReadCommand> ctx;

        // If this is a collectionless aggregation, we won't create 'ctx' but will still need an
        // AutoStatsTracker to record CurOp and Top entries.
        boost::optional<AutoStatsTracker> statsTracker;

        // If this is a collectionless aggregation with no foreign namespaces, we don't want to
        // acquire any locks. Otherwise, lock the collection or view.
        if (nss.isCollectionlessAggregateNS() && pipelineInvolvedNamespaces.empty()) {
            statsTracker.emplace(opCtx, nss, Top::LockType::NotLocked, 0);
        } else {
            ctx.emplace(opCtx, nss, AutoGetCollection::ViewMode::kViewsPermitted);
        }

        Collection* collection = ctx ? ctx->getCollection() : nullptr;

        // The collator may already have been set if this is a $changeStream pipeline. If not,
        // resolve the collator to either the user-specified collation or the collection default.
        if (!collatorToUse) {
            collatorToUse.emplace(resolveCollator(opCtx, request, collection));
        }

        // If this is a view, resolve it by finding the underlying collection and stitching view
        // pipelines and this request's pipeline together. We then release our locks before
        // recursively calling runAggregate(), which will re-acquire locks on the underlying
        // collection.  (The lock must be released because recursively acquiring locks on the
        // database will prohibit yielding.)
        if (ctx && ctx->getView() && !liteParsedPipeline.startsWithCollStats()) {
            invariant(nss != NamespaceString::kRsOplogNamespace);
            invariant(!nss.isCollectionlessAggregateNS());
            // Check that the default collation of 'view' is compatible with the operation's
            // collation. The check is skipped if the request did not specify a collation.
            if (!request.getCollation().isEmpty()) {
                invariant(collatorToUse);  // Should already be resolved at this point.
                if (!CollatorInterface::collatorsMatch(ctx->getView()->defaultCollator(),
                                                       collatorToUse->get())) {
                    return {ErrorCodes::OptionNotSupportedOnView,
                            "Cannot override a view's default collation"};
                }
            }

            ViewShardingCheck::throwResolvedViewIfSharded(opCtx, ctx->getDb(), ctx->getView());

            auto resolvedView = ctx->getDb()->getViewCatalog()->resolveView(opCtx, nss);
            if (!resolvedView.isOK()) {
                return resolvedView.getStatus();
            }

            // With the view & collation resolved, we can relinquish locks.
            ctx.reset();

            // Parse the resolved view into a new aggregation request.
            auto newRequest = resolvedView.getValue().asExpandedViewAggregation(request);
            auto newCmd = newRequest.serializeToCommandObj().toBson();

            auto status = runAggregate(opCtx, origNss, newRequest, newCmd, result);
            {
                // Set the namespace of the curop back to the view namespace so ctx records
                // stats on this view namespace on destruction.
                stdx::lock_guard<Client> lk(*opCtx->getClient());
                curOp->setNS_inlock(nss.ns());
            }
            return status;
        }

        invariant(collatorToUse);
        expCtx.reset(
            new ExpressionContext(opCtx,
                                  request,
                                  std::move(*collatorToUse),
                                  std::make_shared<PipelineD::MongoDInterface>(opCtx),
                                  uassertStatusOK(resolveInvolvedNamespaces(opCtx, request))));
        expCtx->tempDir = storageGlobalParams.dbpath + "/_tmp";
        auto session = OperationContextSession::get(opCtx);
        expCtx->inSnapshotReadOrMultiDocumentTransaction =
            session && session->inSnapshotReadOrMultiDocumentTransaction();

        auto pipeline = uassertStatusOK(Pipeline::parse(request.getPipeline(), expCtx));

        // Check that the view's collation matches the collation of any views involved in the
        // pipeline.
        if (!pipelineInvolvedNamespaces.empty()) {
            invariant(ctx);
            auto pipelineCollationStatus = collatorCompatibleWithPipeline(
                opCtx, ctx->getDb(), expCtx->getCollator(), pipeline.get());
            if (!pipelineCollationStatus.isOK()) {
                return pipelineCollationStatus;
            }
        }

        pipeline->optimizePipeline();

        if (kDebugBuild && !expCtx->explain && !expCtx->fromMongos) {
            // Make sure all operations round-trip through Pipeline::serialize() correctly by
            // re-parsing every command in debug builds. This is important because sharded
            // aggregations rely on this ability.  Skipping when fromMongos because this has
            // already been through the transformation (and this un-sets expCtx->fromMongos).
            pipeline = reparsePipeline(pipeline.get(), request, expCtx);
        }

        // Prepare a PlanExecutor to provide input into the pipeline, if needed.
        if (liteParsedPipeline.hasChangeStream()) {
            // If we are using a change stream, the cursor stage should have a simple collation,
            // regardless of what the user's collation was.
            std::unique_ptr<CollatorInterface> collatorForCursor = nullptr;
            auto collatorStash = expCtx->temporarilyChangeCollator(std::move(collatorForCursor));
            PipelineD::prepareCursorSource(collection, nss, &request, pipeline.get());
        } else {
            PipelineD::prepareCursorSource(collection, nss, &request, pipeline.get());
        }
        // Optimize again, since there may be additional optimizations that can be done after adding
        // the initial cursor stage. Note this has to be done outside the above blocks to ensure
        // this process uses the correct collation if it does any string comparisons.
        pipeline->optimizePipeline();

        // Transfer ownership of the Pipeline to the PipelineProxyStage.
        unownedPipeline = pipeline.get();
        auto ws = make_unique<WorkingSet>();
        auto proxy = make_unique<PipelineProxyStage>(opCtx, std::move(pipeline), ws.get());

        // This PlanExecutor will simply forward requests to the Pipeline, so does not need to
        // yield or to be registered with any collection's CursorManager to receive invalidations.
        // The Pipeline may contain PlanExecutors which *are* yielding PlanExecutors and which
        // *are* registered with their respective collection's CursorManager.
        auto statusWithPlanExecutor =
            PlanExecutor::make(opCtx, std::move(ws), std::move(proxy), nss, PlanExecutor::NO_YIELD);
        invariant(statusWithPlanExecutor.isOK());
        exec = std::move(statusWithPlanExecutor.getValue());

        {
            auto planSummary = Explain::getPlanSummary(exec.get());
            stdx::lock_guard<Client> lk(*opCtx->getClient());
            curOp->setPlanSummary_inlock(std::move(planSummary));
        }
    }

    // Having released the collection lock, we can now create a cursor that returns results from the
    // pipeline. This cursor owns no collection state, and thus we register it with the global
    // cursor manager. The global cursor manager does not deliver invalidations or kill
    // notifications; the underlying PlanExecutor(s) used by the pipeline will be receiving
    // invalidations and kill notifications themselves, not the cursor we create here.
    ClientCursorParams cursorParams(
        std::move(exec),
        origNss,
        AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(),
        opCtx->recoveryUnit()->getReadConcernLevel(),
        cmdObj);
    if (expCtx->tailableMode == TailableModeEnum::kTailableAndAwaitData) {
        cursorParams.setTailable(true);
        cursorParams.setAwaitData(true);
    }

    auto pin =
        CursorManager::getGlobalCursorManager()->registerCursor(opCtx, std::move(cursorParams));

    ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, &pin);

    // If both explain and cursor are specified, explain wins.
    if (expCtx->explain) {
        Explain::explainPipelineExecutor(
            pin.getCursor()->getExecutor(), *(expCtx->explain), &result);
    } else {
        // Cursor must be specified, if explain is not.
        const bool keepCursor =
            handleCursorCommand(opCtx, origNss, pin.getCursor(), request, result);
        if (keepCursor) {
            cursorFreer.Dismiss();
        }
    }

    if (!expCtx->explain) {
        PlanSummaryStats stats;
        Explain::getSummaryStats(*(pin.getCursor()->getExecutor()), &stats);
        curOp->debug().setPlanSummaryMetrics(stats);
        curOp->debug().nreturned = stats.nReturned;
    }

    // Any code that needs the cursor pinned must be inside the try block, above.
    return Status::OK();
}
Example #17
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // Make a best effort to parse the response and retrieve the cursor id. We need the cursor
        // id in order to issue a killCursors command against it.
        if (cbData.response.isOK()) {
            auto cursorResponse = parseCursorResponse(cbData.response.getValue().data, remote);
            if (cursorResponse.isOK()) {
                remote.cursorId = cursorResponse.getValue().getCursorId();
            }
        }

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // From this point on, early returns should signal anyone waiting on an event, if ready()
    // is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.getValue().data, remote)
                               : cbData.response.getStatus());

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard " << *remote.shardId
                                                 << " containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());

            // Retry initial cursor establishment if possible.  Never retry getMores to avoid
            // accidentally skipping results.
            if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
                shard->isRetriableError(cursorResponseStatus.getStatus().code(),
                                        Shard::RetryPolicy::kIdempotent)) {
                invariant(remote.shardId);
                LOG(1) << "Initial cursor establishment failed with retriable error and will be "
                          "retried"
                       << causedBy(redact(cursorResponseStatus.getStatus()));

                ++remote.retryCount;

                // Since we potentially updated the targeter that the last host it chose might be
                // faulty, the call below may end up getting a different host.
                remote.status = askForNextBatch_inlock(remoteIndex);
                if (remote.status.isOK()) {
                    return;
                }

                // If we end up here, it means we failed to schedule the retry request, which is a
                // more severe error that should not be retried. Just pass through to the error
                // handling logic below.
            } else {
                remote.status = cursorResponseStatus.getStatus();
            }
        }

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params.isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<BSONObj> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;
        }

        return;
    }

    // Cursor id successfully established.
    auto cursorResponse = std::move(cursorResponseStatus.getValue());
    remote.cursorId = cursorResponse.getCursorId();
    remote.initialCmdObj = boost::none;

    for (const auto& obj : cursorResponse.getBatch()) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: "
                                                 << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.getBatch().empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(remoteIndex);
        if (!remote.status.isOK()) {
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want the waiter to be signalled on success
    // as well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
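The guard shape used here is worth calling out: the cleanup callback must run on every exit path, including success, but exactly once. Below is a minimal, self-contained sketch of that pattern. SimpleGuard, signalWaiters, and processBatch are hypothetical names written for illustration; they are not the ScopeGuard or AsyncResultsMerger types from the example.

#include <functional>
#include <iostream>
#include <utility>

// Tiny stand-in for a Loki-style ScopeGuard: runs a callback on destruction
// unless Dismiss() was called first.
class SimpleGuard {
public:
    explicit SimpleGuard(std::function<void()> fn) : _fn(std::move(fn)) {}
    ~SimpleGuard() {
        if (_active)
            _fn();
    }
    void Dismiss() {
        _active = false;
    }

private:
    std::function<void()> _fn;
    bool _active = true;
};

void signalWaiters() {
    std::cout << "waiters signalled" << std::endl;
}

bool processBatch(bool fail) {
    // Fires on any early return, so error paths never forget to signal.
    SimpleGuard signaller(&signalWaiters);
    if (fail) {
        return false;  // the guard signals the waiters for us
    }
    // Success path: we still want to signal, but only once, so dismiss the guard
    // first and then call the function explicitly, mirroring signaller.Dismiss()
    // followed by signalCurrentEventIfReady_inlock() in the example above.
    signaller.Dismiss();
    signalWaiters();
    return true;
}

int main() {
    processBatch(true);
    processBatch(false);
    return 0;
}

An alternative would be to let the guard fire on the success path too and drop the explicit call, but dismissing first keeps the success path explicit, which appears to be the intent of the example.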
Example No. 18
        bool run(OperationContext* txn,
                 const std::string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 std::string& errmsg,
                 BSONObjBuilder& result) override {
            // Counted as a getMore, not as a command.
            globalOpCounters.gotGetMore();

            if (txn->getClient()->isInDirectClient()) {
                return appendCommandStatus(result,
                                           Status(ErrorCodes::IllegalOperation,
                                                  "Cannot run getMore command from eval()"));
            }

            StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj);
            if (!parseStatus.isOK()) {
                return appendCommandStatus(result, parseStatus.getStatus());
            }
            const GetMoreRequest& request = parseStatus.getValue();

            // Depending on the type of cursor being operated on, we hold locks for the whole
            // getMore, or none of the getMore, or part of the getMore.  The three cases in detail:
            //
            // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
            // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors
            //    don't own any collection state.
            // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
            //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
            //    release), but the pin and unpin of the cursor must occur under the collection
            //    lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because
            //    AutoGetCollectionForRead checks the sharding version (and we want the relock for
            //    the unpin to succeed even if the sharding version has changed).
            //
            // Note that we declare our locks before our ClientCursorPin, in order to ensure that
            // the pin's destructor is called before the lock destructors (so that the unpin occurs
            // under the lock).
            std::unique_ptr<AutoGetCollectionForRead> ctx;
            std::unique_ptr<Lock::DBLock> unpinDBLock;
            std::unique_ptr<Lock::CollectionLock> unpinCollLock;

            CursorManager* cursorManager;
            CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
            if (globalCursorManager->ownsCursorId(request.cursorid)) {
                cursorManager = globalCursorManager;
            }
            else {
                ctx.reset(new AutoGetCollectionForRead(txn, request.nss));
                Collection* collection = ctx->getCollection();
                if (!collection) {
                    return appendCommandStatus(result,
                                               Status(ErrorCodes::OperationFailed,
                                                      "collection dropped between getMore calls"));
                }
                cursorManager = collection->getCursorManager();
            }

            ClientCursorPin ccPin(cursorManager, request.cursorid);
            ClientCursor* cursor = ccPin.c();
            if (!cursor) {
                // We didn't find the cursor.
                return appendCommandStatus(result, Status(ErrorCodes::CursorNotFound, str::stream()
                    << "Cursor not found, cursor id: " << request.cursorid));
            }

            if (request.nss.ns() != cursor->ns()) {
                return appendCommandStatus(result, Status(ErrorCodes::Unauthorized, str::stream()
                    << "Requested getMore on namespace '" << request.nss.ns()
                    << "', but cursor belongs to a different namespace"));
            }

            const bool hasOwnMaxTime = CurOp::get(txn)->isMaxTimeSet();

            // Validation related to awaitData.
            if (isCursorAwaitData(cursor)) {
                invariant(isCursorTailable(cursor));

                if (!hasOwnMaxTime) {
                    Status status(ErrorCodes::BadValue,
                                  str::stream() << "Must set maxTimeMS on a getMore if the initial "
                                                << "query had 'awaitData' set: " << cmdObj);
                    return appendCommandStatus(result, status);
                }

                if (cursor->isAggCursor()) {
                    Status status(ErrorCodes::BadValue,
                                  "awaitData cannot be set on an aggregation cursor");
                    return appendCommandStatus(result, status);
                }
            }

            // On early return, get rid of the cursor.
            ScopeGuard cursorFreer = MakeGuard(&GetMoreCmd::cleanupCursor, txn, &ccPin, request);

            if (!cursor->hasRecoveryUnit()) {
                // Start using a new RecoveryUnit.
                cursor->setOwnedRecoveryUnit(
                    getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit());
            }

            // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
            ScopedRecoveryUnitSwapper ruSwapper(cursor, txn);

            // Reset timeout timer on the cursor since the cursor is still in use.
            cursor->setIdleTime(0);

            // If there is no time limit set directly on this getMore command, but the operation
            // that spawned this cursor had a time limit set, then we have to apply any leftover
            // time to this getMore.
            if (!hasOwnMaxTime) {
                CurOp::get(txn)->setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros());
            }
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (cursor->isAggCursor()) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            PlanExecutor* exec = cursor->getExecutor();
            exec->restoreState(txn);

            // If we're tailing a capped collection, retrieve a monotonically increasing insert
            // counter.
            uint64_t lastInsertCount = 0;
            if (isCursorAwaitData(cursor)) {
                invariant(ctx->getCollection()->isCapped());
                lastInsertCount = ctx->getCollection()->getCappedInsertNotifier()->getCount();
            }

            CursorId respondWithId = 0;
            BSONArrayBuilder nextBatch;
            BSONObj obj;
            PlanExecutor::ExecState state;
            int numResults = 0;
            Status batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults);
            if (!batchStatus.isOK()) {
                return appendCommandStatus(result, batchStatus);
            }

            // If this is an await data cursor, and we hit EOF without generating any results, then
            // we block waiting for new oplog data to arrive.
            if (isCursorAwaitData(cursor) && state == PlanExecutor::IS_EOF && numResults == 0) {
                // Retrieve the notifier which we will wait on until new data arrives. We make sure
                // to do this in the lock because once we drop the lock it is possible for the
                // collection to become invalid. The notifier itself will outlive the collection if
                // the collection is dropped, as we keep a shared_ptr to it.
                auto notifier = ctx->getCollection()->getCappedInsertNotifier();

                // Save the PlanExecutor and drop our locks.
                exec->saveState();
                ctx.reset();

                // Block waiting for data.
                Microseconds timeout(CurOp::get(txn)->getRemainingMaxTimeMicros());
                notifier->waitForInsert(lastInsertCount, timeout);
                notifier.reset();

                ctx.reset(new AutoGetCollectionForRead(txn, request.nss));
                exec->restoreState(txn);

                // We woke up because either the timed_wait expired, or there was more data. Either
                // way, attempt to generate another batch of results.
                batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults);
                if (!batchStatus.isOK()) {
                    return appendCommandStatus(result, batchStatus);
                }
            }

            if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) {
                respondWithId = request.cursorid;

                exec->saveState();

                // If maxTimeMS was set directly on the getMore rather than being rolled over
                // from a previous find, then don't roll remaining micros over to the next
                // getMore.
                if (!hasOwnMaxTime) {
                    cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
                }

                cursor->incPos(numResults);

                if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) {
                    // Rather than swapping their existing RU into the client cursor, tailable
                    // cursors should get a new recovery unit.
                    ruSwapper.dismiss();
                }
            }
            else {
                CurOp::get(txn)->debug().cursorExhausted = true;
            }

            appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result);

            if (respondWithId) {
                cursorFreer.Dismiss();

                // If we are operating on an aggregation cursor, then we dropped our collection lock
                // earlier and need to reacquire it in order to clean up our ClientCursorPin.
                if (cursor->isAggCursor()) {
                    invariant(NULL == ctx.get());
                    unpinDBLock.reset(
                        new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS));
                    unpinCollLock.reset(
                        new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS));
                }
            }

            return true;
        }
Example No. 19
    void Listener::setupSockets() {
        checkTicketNumbers();

#if !defined(_WIN32)
        _mine = ipToAddrs(_ip.c_str(), _port, (!serverGlobalParams.noUnixSocket &&
                                               useUnixSockets()));
#else
        _mine = ipToAddrs(_ip.c_str(), _port, false);
#endif

        for (std::vector<SockAddr>::const_iterator it=_mine.begin(), end=_mine.end();
             it != end;
             ++it) {

            const SockAddr& me = *it;

            if (!me.isValid()) {
                error() << "listen(): socket is invalid." << endl;
                return;
            }

            SOCKET sock = ::socket(me.getType(), SOCK_STREAM, 0);
            ScopeGuard socketGuard = MakeGuard(&closesocket, sock);
            massert( 15863 , str::stream() << "listen(): invalid socket? " << errnoWithDescription() , sock >= 0 );

            if (me.getType() == AF_UNIX) {
#if !defined(_WIN32)
                if (unlink(me.getAddr().c_str()) == -1) {
                    if (errno != ENOENT) {
                        error() << "Failed to unlink socket file " << me << " "
                                << errnoWithDescription(errno);
                        fassertFailedNoTrace(28578);
                    }
                }
#endif
            }
            else if (me.getType() == AF_INET6) {
                // IPv6 can also accept IPv4 connections as mapped addresses (::ffff:127.0.0.1)
                // That causes a conflict if we don't set it to IPV6_ONLY
                const int one = 1;
                setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*) &one, sizeof(one));
            }

#if !defined(_WIN32)
            {
                const int one = 1;
                if ( setsockopt( sock , SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0 )
                    log() << "Failed to set socket opt, SO_REUSEADDR" << endl;
            }
#endif

            if ( ::bind(sock, me.raw(), me.addressSize) != 0 ) {
                int x = errno;
                error() << "listen(): bind() failed " << errnoWithDescription(x) << " for socket: " << me.toString() << endl;
                if ( x == EADDRINUSE )
                    error() << "  addr already in use" << endl;
                return;
            }

#if !defined(_WIN32)
            if (me.getType() == AF_UNIX) {
                if (chmod(me.getAddr().c_str(), serverGlobalParams.unixSocketPermissions) == -1) {
                    error() << "Failed to chmod socket file " << me << " "
                            << errnoWithDescription(errno);
                    fassertFailedNoTrace(28582);
                }
                ListeningSockets::get()->addPath( me.getAddr() );
            }
#endif

            _socks.push_back(sock);
            socketGuard.Dismiss();
        }
        
        _setupSocketsSuccessful = true;
    }
Example No. 20
PlanStage::StageState UpdateStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (doneUpdating()) {
        // Even if we're done updating, we may have some inserting left to do.
        if (needInsert()) {
            // TODO we may want to handle WriteConflictException here. Currently we bounce it
            // out to a higher level since if this WCEs it is likely that we raced with another
            // upsert that may have matched our query, and therefore this may need to perform an
            // update rather than an insert. Bouncing to the higher level allows restarting the
            // query in this case.
            doInsert();

            invariant(isEOF());
            if (_params.request->shouldReturnNewDocs()) {
                // Want to return the document we just inserted, create it as a WorkingSetMember
                // so that we can return it.
                BSONObj newObj = _specificStats.objInserted;
                *out = _ws->allocate();
                WorkingSetMember* member = _ws->get(*out);
                member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(),
                                                   newObj.getOwned());
                member->transitionToOwnedObj();
                return PlanStage::ADVANCED;
            }
        }

        // At this point either we're done updating and there was no insert to do,
        // or we're done updating and we're done inserting. Either way, we're EOF.
        invariant(isEOF());
        return PlanStage::IS_EOF;
    }

    // If we're here, then we still have to ask for results from the child and apply
    // updates to them. We should only get here if the collection exists.
    invariant(_collection);

    // It is possible that after an update was applied, a WriteConflictException
    // occurred and prevented us from returning ADVANCED with the requested version
    // of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.request->shouldReturnAnyDocs());

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        // Need to get these things from the result returned by the child.
        RecordId recordId;

        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry updating or returning
        // it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasRecordId()) {
            // We expect to be here because of an invalidation causing a force-fetch.
            ++_specificStats.nInvalidateSkips;
            return PlanStage::NEED_TIME;
        }
        recordId = member->recordId;

        // Updates can't have projections. This means that covering analysis will always add
        // a fetch. We should always get fetched data, and never just key data.
        invariant(member->hasObj());

        // We fill this with the new RecordIds of moved docs so we don't double-update.
        if (_updatedRecordIds && _updatedRecordIds->count(recordId) > 0) {
            // Found a RecordId that refers to a document we had already updated. Note that
            // we can never remove from _updatedRecordIds because updates by other clients
            // could cause us to encounter a document again later.
            return PlanStage::NEED_TIME;
        }

        bool docStillMatches;
        try {
            docStillMatches = write_stage_common::ensureStillMatches(
                _collection, getOpCtx(), _ws, id, _params.canonicalQuery);
        } catch (const WriteConflictException&) {
            // There was a problem trying to detect if the document still exists, so retry.
            memberFreer.Dismiss();
            return prepareToRetryWSM(id, out);
        }

        if (!docStillMatches) {
            // Either the document has been deleted, or it has been updated such that it no longer
            // matches the predicate.
            if (shouldRestartUpdateIfNoLongerMatches(_params)) {
                throw WriteConflictException();
            }
            return PlanStage::NEED_TIME;
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        member->makeObjOwnedIfNeeded();

        // Save state before making changes
        WorkingSetCommon::prepareForSnapshotChange(_ws);
        try {
            child()->saveState();
        } catch (const WriteConflictException&) {
            std::terminate();
        }

        // If we care about the pre-updated version of the doc, save it out here.
        BSONObj oldObj;
        if (_params.request->shouldReturnOldDocs()) {
            oldObj = member->obj.value().getOwned();
        }

        BSONObj newObj;
        try {
            // Do the update, get us the new version of the doc.
            newObj = transformAndUpdate(member->obj, recordId);
        } catch (const WriteConflictException&) {
            memberFreer.Dismiss();  // Keep this member around so we can retry updating it.
            return prepareToRetryWSM(id, out);
        }

        // Set member's obj to be the doc we want to return.
        if (_params.request->shouldReturnAnyDocs()) {
            if (_params.request->shouldReturnNewDocs()) {
                member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(),
                                                   newObj.getOwned());
            } else {
                invariant(_params.request->shouldReturnOldDocs());
                member->obj.setValue(oldObj);
            }
            member->recordId = RecordId();
            member->transitionToOwnedObj();
        }

        // This should be after transformAndUpdate to make sure we actually updated this doc.
        ++_specificStats.nMatched;

        // Restore state after modification

        // As restoreState may restore (recreate) cursors, make sure to restore the
        // state outside of the WriteUnitOfWork.
        try {
            child()->restoreState();
        } catch (const WriteConflictException&) {
            // Note we don't need to retry updating anything in this case since the update
            // already was committed. However, we still need to return the updated document
            // (if it was requested).
            if (_params.request->shouldReturnAnyDocs()) {
                // member->obj should refer to the document we want to return.
                invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

                _idReturning = id;
                // Keep this member around so that we can return it on the next work() call.
                memberFreer.Dismiss();
            }
            *out = WorkingSet::INVALID_ID;
            return NEED_YIELD;
        }

        if (_params.request->shouldReturnAnyDocs()) {
            // member->obj should refer to the document we want to return.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            memberFreer.Dismiss();  // Keep this member around so we can return it.
            *out = id;
            return PlanStage::ADVANCED;
        }

        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == status) {
        // The child is out of results, but we might not be done yet because we still might
        // have to do an insert.
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which case
        // 'id' is valid.  If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "update stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
            return PlanStage::FAILURE;
        }
        return status;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
    }

    return status;
}
Example No. 21
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }
        return;
    }

    // Early return from this point on will signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    if (!cbData.response.isOK()) {
        remote.status = cbData.response.getStatus();

        // If we failed to retrieve the batch because we couldn't contact the remote, we notify the
        // targeter that the host is unreachable. The caller can then retry on a new host.
        if (remote.status == ErrorCodes::HostUnreachable && remote.shardId) {
            auto shard = _params.shardRegistry->getShard(_params.txn, *remote.shardId);
            if (!shard) {
                remote.status =
                    Status(ErrorCodes::HostUnreachable,
                           str::stream() << "Could not find shard " << *remote.shardId
                                         << " containing host " << remote.hostAndPort.toString());
            } else {
                shard->getTargeter()->markHostUnreachable(remote.hostAndPort);
            }
        }

        return;
    }

    auto getMoreParseStatus = CursorResponse::parseFromBSON(cbData.response.getValue().data);
    if (!getMoreParseStatus.isOK()) {
        remote.status = getMoreParseStatus.getStatus();
        return;
    }

    auto cursorResponse = getMoreParseStatus.getValue();

    // If we have a cursor established, and we get a non-zero cursorid that is not equal to the
    // established cursorid, we will fail the operation.
    if (remote.cursorId && cursorResponse.cursorId != 0 &&
        *remote.cursorId != cursorResponse.cursorId) {
        remote.status = Status(ErrorCodes::BadValue,
                               str::stream() << "Expected cursorid " << *remote.cursorId
                                             << " but received " << cursorResponse.cursorId);
        return;
    }

    remote.cursorId = cursorResponse.cursorId;
    remote.cmdObj = boost::none;

    for (const auto& obj : cursorResponse.batch) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: " << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.batch.empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        auto nextBatchStatus = askForNextBatch_inlock(remoteIndex);
        if (!nextBatchStatus.isOK()) {
            remote.status = nextBatchStatus;
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want the waiter to be signalled on success
    // as well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
Example No. 22
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);

        switch (status) {
            case PlanStage::ADVANCED:
                break;

            case PlanStage::FAILURE:
            case PlanStage::DEAD:
                // The stage which produces a failure is responsible for allocating a working set
                // member with error details.
                invariant(WorkingSet::INVALID_ID != id);
                *out = id;
                return status;

            case PlanStage::NEED_TIME:
                return status;

            case PlanStage::NEED_YIELD:
                *out = id;
                return status;

            case PlanStage::IS_EOF:
                return status;

            default:
                MONGO_UNREACHABLE;
        }
    }

    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry deleting or returning it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    invariant(member->hasRecordId());
    RecordId recordId = member->recordId;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.
    invariant(member->hasObj());

    // Ensure the document still exists and matches the predicate.
    bool docStillMatches;
    try {
        docStillMatches = write_stage_common::ensureStillMatches(
            _collection, getOpCtx(), _ws, id, _params.canonicalQuery);
    } catch (const WriteConflictException&) {
        // There was a problem trying to detect if the document still exists, so retry.
        memberFreer.Dismiss();
        return prepareToRetryWSM(id, out);
    }

    if (!docStillMatches) {
        // Either the document has already been deleted, or it has been updated such that it no
        // longer matches the predicate.
        if (shouldRestartDeleteIfNoLongerMatches(_params)) {
            throw WriteConflictException();
        }
        return PlanStage::NEED_TIME;
    }

    // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState() is
    // allowed to free the memory.
    if (_params.returnDeleted) {
        // Save a copy of the document that is about to get deleted, but keep it in the RID_AND_OBJ
        // state in case we need to retry deleting it.
        BSONObj deletedDoc = member->obj.value();
        member->obj.setValue(deletedDoc.getOwned());
    }

    // TODO: Do we want to buffer docs and delete them in a group rather than saving/restoring state
    // repeatedly?

    WorkingSetCommon::prepareForSnapshotChange(_ws);
    try {
        child()->saveState();
    } catch (const WriteConflictException&) {
        std::terminate();
    }

    // Do the write, unless this is an explain.
    if (!_params.isExplain) {
        try {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(),
                                        _params.stmtId,
                                        recordId,
                                        _params.opDebug,
                                        _params.fromMigrate,
                                        false,
                                        _params.returnDeleted ? Collection::StoreDeletedDoc::On
                                                              : Collection::StoreDeletedDoc::Off);
            wunit.commit();
        } catch (const WriteConflictException&) {
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            return prepareToRetryWSM(id, out);
        }
    }
    ++_specificStats.docsDeleted;

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'recordId' from the WorkingSetMember before returning it.
        member->recordId = RecordId();
        member->transitionToOwnedObj();
    }

    // As restoreState may restore (recreate) cursors, cursors are tied to the transaction in which
    // they are created, and a WriteUnitOfWork is a transaction, make sure to restore the state
    // outside of the WriteUnitOfWork.
    try {
        child()->restoreState();
    } catch (const WriteConflictException&) {
        // Note we don't need to retry anything in this case since the delete already was committed.
        // However, we still need to return the deleted document (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
            memberFreer.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        return PlanStage::ADVANCED;
    }

    return PlanStage::NEED_TIME;
}
Example No. 23
bool MessagingPort::recv(Message& m) {
    try {
#ifdef MONGO_CONFIG_SSL
    again:
#endif
        // mmm( log() << "*  recv() sock:" << this->sock << endl; )
        MSGHEADER::Value header;
        int headerLen = sizeof(MSGHEADER::Value);
        psock->recv((char*)&header, headerLen);
        int len = header.constView().getMessageLength();

        if (len == 542393671) {
            // an http GET
            string msg =
                "It looks like you are trying to access MongoDB over HTTP on the native driver "
                "port.\n";
            LOG(psock->getLogLevel()) << msg;
            std::stringstream ss;
            ss << "HTTP/1.0 200 OK\r\nConnection: close\r\nContent-Type: "
                  "text/plain\r\nContent-Length: " << msg.size() << "\r\n\r\n" << msg;
            string s = ss.str();
            send(s.c_str(), s.size(), "http");
            return false;
        }
        // If responseTo is not 0 or -1 for the first packet, assume SSL
        else if (psock->isAwaitingHandshake()) {
#ifndef MONGO_CONFIG_SSL
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                uasserted(17133,
                          "SSL handshake requested, SSL feature not available in this build");
            }
#else
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                uassert(17132,
                        "SSL handshake received but server is started without SSL support",
                        sslGlobalParams.sslMode.load() != SSLParams::SSLMode_disabled);
                setX509SubjectName(
                    psock->doSSLHandshake(reinterpret_cast<const char*>(&header), sizeof(header)));
                psock->setHandshakeReceived();
                goto again;
            }
            uassert(17189,
                    "The server is configured to only allow SSL connections",
                    sslGlobalParams.sslMode.load() != SSLParams::SSLMode_requireSSL);
#endif  // MONGO_CONFIG_SSL
        }
        if (static_cast<size_t>(len) < sizeof(MSGHEADER::Value) ||
            static_cast<size_t>(len) > MaxMessageSizeBytes) {
            LOG(0) << "recv(): message len " << len << " is invalid. "
                   << "Min " << sizeof(MSGHEADER::Value) << " Max: " << MaxMessageSizeBytes;
            return false;
        }

        psock->setHandshakeReceived();
        int z = (len + 1023) & 0xfffffc00;
        verify(z >= len);
        MsgData::View md = reinterpret_cast<char*>(mongoMalloc(z));
        ScopeGuard guard = MakeGuard(free, md.view2ptr());
        verify(md.view2ptr());

        memcpy(md.view2ptr(), &header, headerLen);
        int left = len - headerLen;

        psock->recv(md.data(), left);

        guard.Dismiss();
        m.setData(md.view2ptr(), true);
        return true;

    } catch (const SocketException& e) {
        logger::LogSeverity severity = psock->getLogLevel();
        if (!e.shouldPrint())
            severity = severity.lessSevere();
        LOG(severity) << "SocketException: remote: " << remote() << " error: " << e;
        m.reset();
        return false;
    }
}
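recv() above guards a raw malloc'd buffer so that every early return frees it, then dismisses the guard once ownership is handed to the Message. Below is a sketch of the same free-unless-ownership-transferred idiom written with std::unique_ptr and a custom deleter, where release() plays the role of Dismiss(); readMessage and takeOwnership are hypothetical names, not MongoDB APIs.

#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <memory>

// Deleter so unique_ptr releases malloc'd memory with free().
struct FreeDeleter {
    void operator()(char* p) const {
        std::free(p);
    }
};

// Pretend sink that assumes ownership of the buffer (and frees it eventually),
// standing in for m.setData(md.view2ptr(), true) in the example.
void takeOwnership(char* buf) {
    std::free(buf);
}

bool readMessage(const char* src, std::size_t len, std::size_t maxLen) {
    std::unique_ptr<char, FreeDeleter> buf(static_cast<char*>(std::malloc(len)));
    if (!buf) {
        return false;
    }
    if (len > maxLen) {
        return false;  // buffer freed automatically, like the guard firing
    }
    std::memcpy(buf.get(), src, len);

    // Ownership handed off: the equivalent of guard.Dismiss() before m.setData().
    takeOwnership(buf.release());
    return true;
}

Either form expresses the same ownership hand-off; the guard variant in the example has the advantage of working for cleanups that are not simple deallocations.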
Example No. 24
bool tryLock(const Zstring& lockfilepath) //throw FileError
{
#ifdef ZEN_WIN
#ifdef TODO_MinFFS_ActivatePriviledge
    try { activatePrivilege(SE_BACKUP_NAME); }
    catch (const FileError&) {}
    try { activatePrivilege(SE_RESTORE_NAME); }
    catch (const FileError&) {}
#endif//TODO_MinFFS_ActivatePriviledge

    const HANDLE fileHandle = ::CreateFile(applyLongPathPrefix(lockfilepath).c_str(),              //_In_      LPCTSTR lpFileName,
                                           //use both when writing over network, see comment in file_io.cpp
                                           GENERIC_READ | GENERIC_WRITE,                           //_In_      DWORD dwDesiredAccess,
                                           FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, //_In_      DWORD dwShareMode,
                                           nullptr,               //_In_opt_  LPSECURITY_ATTRIBUTES lpSecurityAttributes,
                                           CREATE_NEW,            //_In_      DWORD dwCreationDisposition,
                                           FILE_ATTRIBUTE_NORMAL | FILE_FLAG_BACKUP_SEMANTICS, //_In_      DWORD dwFlagsAndAttributes,
                                           nullptr);              //_In_opt_  HANDLE hTemplateFile
    if (fileHandle == INVALID_HANDLE_VALUE)
    {
        const DWORD ec = ::GetLastError(); //copy before directly/indirectly making other system calls!
        if (ec == ERROR_FILE_EXISTS || //confirmed to be used
            ec == ERROR_ALREADY_EXISTS) //comment on msdn claims, this one is used on Windows Mobile 6
            return false;

        throw FileError(replaceCpy(_("Cannot write file %x."), L"%x", fmtPath(lockfilepath)), formatSystemError(L"CreateFile", ec));
    }
    ScopeGuard guardLockFile = zen::makeGuard([&] { removeFile(lockfilepath); });
    FileOutput fileOut(fileHandle, lockfilepath); //pass handle ownership

    //be careful to avoid CreateFile() + CREATE_ALWAYS on a hidden file -> see file_io.cpp
    //=> we don't need it that badly //::SetFileAttributes(applyLongPathPrefix(lockfilepath).c_str(), FILE_ATTRIBUTE_HIDDEN); //(try to) hide it

#elif defined ZEN_LINUX || defined ZEN_MAC
    const mode_t oldMask = ::umask(0); //important: we want the lock file to have exactly the permissions specified
    ZEN_ON_SCOPE_EXIT(::umask(oldMask));

    //O_EXCL contains a race condition on NFS file systems: http://linux.die.net/man/2/open
    const int fileHandle = ::open(lockfilepath.c_str(), O_CREAT | O_EXCL | O_WRONLY,
                                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
    if (fileHandle == -1)
    {
        if (errno == EEXIST)
            return false;
        else
            THROW_LAST_FILE_ERROR(replaceCpy(_("Cannot write file %x."), L"%x", fmtPath(lockfilepath)), L"open");
    }
    ScopeGuard guardLockFile = zen::makeGuard([&] { removeFile(lockfilepath); });
    FileOutput fileOut(fileHandle, lockfilepath); //pass handle ownership
#endif

    //write housekeeping info: user, process info, lock GUID
    ByteArray binStream = [&]
    {
        MemStreamOut streamOut;
        LockInformation(FromCurrentProcess()).toStream(streamOut);
        return streamOut.ref();
    }();

    if (!binStream.empty())
        fileOut.write(&*binStream.begin(), binStream.size()); //throw FileError

    guardLockFile.dismiss(); //lockfile created successfully
    return true;
}
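Unlike the Loki-style MakeGuard(function, args...) seen in the earlier examples, zen::makeGuard above takes an arbitrary lambda and exposes a lowercase dismiss(). Below is a minimal sketch of how such a lambda-based guard can be put together and used to roll back a half-created lock file; Guard, makeGuard, and writeLockFile are illustrative only, not the zen implementation.

#include <cstdio>
#include <utility>

// Guard that owns an arbitrary callable and runs it on scope exit unless disarmed.
template <class F>
class Guard {
public:
    explicit Guard(F f) : _f(std::move(f)) {}
    Guard(Guard&& other) noexcept : _f(std::move(other._f)), _armed(other._armed) {
        other._armed = false;
    }
    Guard(const Guard&) = delete;
    Guard& operator=(const Guard&) = delete;
    ~Guard() {
        if (_armed)
            _f();
    }
    void dismiss() {
        _armed = false;
    }

private:
    F _f;
    bool _armed = true;
};

template <class F>
Guard<F> makeGuard(F f) {
    return Guard<F>(std::move(f));
}

bool writeLockFile(const char* path, const char* payload) {
    std::FILE* fp = std::fopen(path, "wx");  // "x": fail if the file already exists (C11)
    if (!fp)
        return false;

    // Roll back the half-created file on any failure below.
    auto guard = makeGuard([path, fp] {
        std::fclose(fp);
        std::remove(path);
    });

    if (std::fputs(payload, fp) < 0) {
        return false;  // guard closes and removes the file
    }

    std::fclose(fp);
    guard.dismiss();  // lock file created successfully, keep it
    return true;
}

Taking a lambda keeps the call site self-documenting and avoids the bind-style argument lists that MakeGuard needs.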
Example No. 25
void DoExceptionTests( void )
{
	::std::cout << ::std::endl;

	{
		try
		{
			::std::cout << "The function should be called." << ::std::endl;
			ScopeGuard guard = MakeGuard( &CalledAlways );
			(void)guard;
			FunctionMightThrow( false );
		}
		catch ( ... )
		{
			::std::cout << "Caught exception." << ::std::endl;
			assert( false );
		}
	}

	::std::cout << ::std::endl;

	{
		try
		{
			::std::cout << "The function should be called." << ::std::endl;
			ScopeGuard guard = MakeGuard( &CalledAlways );
			(void)guard;
			FunctionMightThrow( true );
			assert( false );
		}
		catch ( ... )
		{
			::std::cout << "Caught exception." << ::std::endl;
		}
	}

	::std::cout << ::std::endl;

	{
		try
		{
			::std::cout << "The function should be called." << ::std::endl;
			ScopeGuard guard = MakeGuard( &CalledIfNoException );
			guard.SetExceptionPolicy( ScopeGuardImplBase::CallIfNoException );
			FunctionMightThrow( false );
			::std::cout << "No exception thrown." << ::std::endl;
		}
		catch ( ... )
		{
			::std::cout << "Caught exception." << ::std::endl;
			assert( false );
		}
	}

	::std::cout << ::std::endl;

	{
		try
		{
			::std::cout << "The function should not be called." << ::std::endl;
			ScopeGuard guard = MakeGuard( &CalledIfNoException );
			guard.SetExceptionPolicy( ScopeGuardImplBase::CallIfNoException );
			FunctionMightThrow( true );
			assert( false );
		}
		catch ( ... )
		{
			::std::cout << "Caught exception." << ::std::endl;
		}
	}

	::std::cout << ::std::endl;

	{
		try
		{
			::std::cout << "The function should not be called." << ::std::endl;
			ScopeGuard guard = MakeGuard( &CalledIfException );
			guard.SetExceptionPolicy( ScopeGuardImplBase::CallIfException );
			FunctionMightThrow( false );
			::std::cout << "No exception thrown." << ::std::endl;
		}
		catch ( ... )
		{
			::std::cout << "Caught exception." << ::std::endl;
			assert( false );
		}
	}

	::std::cout << ::std::endl;

	{
		try
		{
			::std::cout << "The function should be called." << ::std::endl;
			ScopeGuard guard = MakeGuard( &CalledIfException );
			guard.SetExceptionPolicy( ScopeGuardImplBase::CallIfException );
			FunctionMightThrow( true );
			assert( false );
		}
		catch ( ... )
		{
			::std::cout << "Caught exception." << ::std::endl;
		}
	}

	::std::cout << ::std::endl;
}
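The tests above exercise an exception policy: the guarded function runs always, only when an exception is propagating, or only when none is. Below is a sketch of how that behaviour can be approximated in C++17 with std::uncaught_exceptions(); ConditionalGuard and functionMightThrow are illustrative stand-ins, not the ScopeGuard implementation the tests use.

#include <exception>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <utility>

class ConditionalGuard {
public:
    enum Policy { CallAlways, CallIfException, CallIfNoException };

    ConditionalGuard(std::function<void()> fn, Policy policy)
        : _fn(std::move(fn)),
          _policy(policy),
          _exceptionsOnEntry(std::uncaught_exceptions()) {}

    ~ConditionalGuard() {
        // More uncaught exceptions now than at construction means this destructor
        // is running because stack unwinding is in progress.
        const bool unwinding = std::uncaught_exceptions() > _exceptionsOnEntry;
        if (_policy == CallAlways || (_policy == CallIfException && unwinding) ||
            (_policy == CallIfNoException && !unwinding)) {
            _fn();
        }
    }

private:
    std::function<void()> _fn;
    Policy _policy;
    int _exceptionsOnEntry;
};

void functionMightThrow(bool doThrow) {
    if (doThrow)
        throw std::runtime_error("boom");
}

int main() {
    try {
        ConditionalGuard guard([] { std::cout << "runs only if an exception escapes" << std::endl; },
                               ConditionalGuard::CallIfException);
        functionMightThrow(true);
    } catch (...) {
        std::cout << "caught exception" << std::endl;
    }
    return 0;
}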
Example No. 26
    /**
     * Runs a query using the following steps:
     *   1) Parsing.
     *   2) Acquire locks.
     *   3) Plan query, obtaining an executor that can run it.
     *   4) Setup a cursor for the query, which may be used on subsequent getMores.
     *   5) Generate the first batch.
     *   6) Save state for getMore.
     *   7) Generate response to send to the client.
     *
     * TODO: Rather than using the sharding version available in thread-local storage (i.e. the
     *       call to ShardingState::needCollectionMetadata() below), shard version information
     *       should be passed as part of the command parameter.
     */
    bool run(OperationContext* txn,
             const std::string& dbname,
             BSONObj& cmdObj,
             int options,
             std::string& errmsg,
             BSONObjBuilder& result) override {
        const std::string fullns = parseNs(dbname, cmdObj);
        const NamespaceString nss(fullns);
        if (!nss.isValid()) {
            return appendCommandStatus(result,
                                       {ErrorCodes::InvalidNamespace,
                                        str::stream() << "Invalid collection name: " << nss.ns()});
        }

        // Although it is a command, a find command gets counted as a query.
        globalOpCounters.gotQuery();

        if (txn->getClient()->isInDirectClient()) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()"));
        }

        // 1a) Parse the command BSON to a LiteParsedQuery.
        const bool isExplain = false;
        auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain);
        if (!lpqStatus.isOK()) {
            return appendCommandStatus(result, lpqStatus.getStatus());
        }

        auto& lpq = lpqStatus.getValue();

        // Validate term, if provided.
        if (auto term = lpq->getReplicationTerm()) {
            auto replCoord = repl::ReplicationCoordinator::get(txn);
            Status status = replCoord->updateTerm(*term);
            // Note: updateTerm returns ok if term stayed the same.
            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }

        // Fill out curop information.
        long long ntoreturn = lpq->getBatchSize().value_or(0);
        beginQueryOp(txn, nss, cmdObj, ntoreturn, lpq->getSkip());

        // 1b) Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery.
        WhereCallbackReal whereCallback(txn, nss.db());
        auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), whereCallback);
        if (!statusWithCQ.isOK()) {
            return appendCommandStatus(result, statusWithCQ.getStatus());
        }
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // 2) Acquire locks.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        const int dbProfilingLevel =
            ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

        ShardingState* const shardingState = ShardingState::get(txn);

        // It is possible that the sharding version will change during yield while we are
        // retrieving a plan executor. If this happens we will throw an error and mongos will
        // retry.
        const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());

        // 3) Get the execution plan for the query.
        auto statusWithPlanExecutor =
            getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }

        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while
        // the chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // Version changed while retrieving a PlanExecutor. Terminate the operation,
            // signaling that mongos should retry.
            throw SendStaleConfigException(nss.ns(),
                                           "version changed during find command",
                                           shardingVersionAtStart,
                                           shardingState->getVersion(nss.ns()));
        }

        if (!collection) {
            // No collection. Just fill out curop indicating that there were zero results and
            // there is no ClientCursor id, and then return.
            const long long numResults = 0;
            const CursorId cursorId = 0;
            endQueryOp(txn, *exec, dbProfilingLevel, numResults, cursorId);
            appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result);
            return true;
        }

        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // 4) If possible, register the execution plan inside a ClientCursor, and pin that
        // cursor. In this case, ownership of the PlanExecutor is transferred to the
        // ClientCursor, and 'exec' becomes null.
        //
        // First unregister the PlanExecutor so it can be re-registered with ClientCursor.
        exec->deregisterExec();

        // Create a ClientCursor containing this plan executor. We don't have to worry
        // about leaking it as it's inserted into a global map by its ctor.
        ClientCursor* cursor =
            new ClientCursor(collection->getCursorManager(),
                             exec.release(),
                             nss.ns(),
                             txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                             pq.getOptions(),
                             pq.getFilter());
        CursorId cursorId = cursor->cursorid();
        ClientCursorPin ccPin(collection->getCursorManager(), cursorId);

        // On early return, get rid of the cursor.
        ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin);

        invariant(!exec);
        PlanExecutor* cursorExec = cursor->getExecutor();

        // 5) Stream query results, adding them to a BSONArray as we go.
        BSONArrayBuilder firstBatch;
        BSONObj obj;
        PlanExecutor::ExecState state;
        long long numResults = 0;
        while (!enoughForFirstBatch(pq, numResults, firstBatch.len()) &&
               PlanExecutor::ADVANCED == (state = cursorExec->getNext(&obj, NULL))) {
            // If adding this object will cause us to exceed the BSON size limit, then we stash
            // it for later.
            if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) {
                cursorExec->enqueue(obj);
                break;
            }

            // Add result to output buffer.
            firstBatch.append(obj);
            numResults++;
        }

        // Throw an assertion if query execution fails for any reason.
        if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
            const std::unique_ptr<PlanStageStats> stats(cursorExec->getStats());
            error() << "Plan executor error during find command: " << PlanExecutor::statestr(state)
                    << ", stats: " << Explain::statsToBSON(*stats);

            return appendCommandStatus(result,
                                       Status(ErrorCodes::OperationFailed,
                                              str::stream()
                                                  << "Executor error during find command: "
                                                  << WorkingSetCommon::toStatusString(obj)));
        }

        // 6) Set up the cursor for getMore.
        if (shouldSaveCursor(txn, collection, state, cursorExec)) {
            // State will be restored on getMore.
            cursorExec->saveState();
            cursorExec->detachFromOperationContext();

            cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
            cursor->setPos(numResults);
        } else {
            cursorId = 0;
        }

        // Fill out curop based on the results.
        endQueryOp(txn, *cursorExec, dbProfilingLevel, numResults, cursorId);

        // 7) Generate the response object to send to the client.
        appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result);
        if (cursorId) {
            cursorFreer.Dismiss();
        }
        return true;
    }
Example No. 27
    virtual bool run(OperationContext* txn,
                     const string& dbname,
                     BSONObj& cmdObj,
                     int options,
                     string& errmsg,
                     BSONObjBuilder& result) {
        const NamespaceString ns(parseNs(dbname, cmdObj));

        Status status = userAllowedWriteNS(ns);
        if (!status.isOK())
            return appendCommandStatus(result, status);

        if (cmdObj["indexes"].type() != Array) {
            errmsg = "indexes has to be an array";
            result.append("cmdObj", cmdObj);
            return false;
        }

        std::vector<BSONObj> specs;
        {
            BSONObjIterator i(cmdObj["indexes"].Obj());
            while (i.more()) {
                BSONElement e = i.next();
                if (e.type() != Object) {
                    errmsg = "everything in indexes has to be an Object";
                    result.append("cmdObj", cmdObj);
                    return false;
                }
                specs.push_back(e.Obj());
            }
        }

        if (specs.size() == 0) {
            errmsg = "no indexes to add";
            return false;
        }

        // check specs
        for (size_t i = 0; i < specs.size(); i++) {
            BSONObj spec = specs[i];
            if (spec["ns"].eoo()) {
                spec = _addNsToSpec(ns, spec);
                specs[i] = spec;
            }

            if (spec["ns"].type() != String) {
                errmsg = "ns field must be a string";
                result.append("spec", spec);
                return false;
            }

            std::string nsFromUser = spec["ns"].String();
            if (nsFromUser.empty()) {
                errmsg = "ns field cannot be an empty string";
                result.append("spec", spec);
                return false;
            }

            if (ns != nsFromUser) {
                errmsg = str::stream() << "value of ns field '" << nsFromUser
                                       << "' doesn't match namespace " << ns.ns();
                result.append("spec", spec);
                return false;
            }
        }

        // now we know we have to create index(es)
        // Note: createIndexes command does not currently respect shard versioning.
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock dbLock(txn->lockState(), ns.db(), MODE_X);
        if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while creating indexes in " << ns.ns()));
        }

        Database* db = dbHolder().get(txn, ns.db());
        if (!db) {
            db = dbHolder().openDb(txn, ns.db());
        }

        Collection* collection = db->getCollection(ns.ns());
        if (collection) {
            result.appendBool("createdCollectionAutomatically", false);
        } else {
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                WriteUnitOfWork wunit(txn);
                collection = db->createCollection(txn, ns.ns(), CollectionOptions());
                invariant(collection);
                wunit.commit();
            }
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());
            result.appendBool("createdCollectionAutomatically", true);
        }

        const int numIndexesBefore = collection->getIndexCatalog()->numIndexesTotal(txn);
        result.append("numIndexesBefore", numIndexesBefore);

        auto client = txn->getClient();
        ScopeGuard lastOpSetterGuard =
            MakeObjGuard(repl::ReplClientInfo::forClient(client),
                         &repl::ReplClientInfo::setLastOpToSystemLastOpTime,
                         txn);

        MultiIndexBlock indexer(txn, collection);
        indexer.allowBackgroundBuilding();
        indexer.allowInterruption();

        const size_t origSpecsSize = specs.size();
        indexer.removeExistingIndexes(&specs);

        if (specs.size() == 0) {
            result.append("numIndexesAfter", numIndexesBefore);
            result.append("note", "all indexes already exist");
            return true;
        }

        if (specs.size() != origSpecsSize) {
            result.append("note", "index already exists");
        }

        for (size_t i = 0; i < specs.size(); i++) {
            const BSONObj& spec = specs[i];
            if (spec["unique"].trueValue()) {
                const Status status =
                    checkUniqueIndexConstraints(txn, ns.ns(), spec["key"].Obj());

                if (!status.isOK()) {
                    return appendCommandStatus(result, status);
                }
            }
            if (spec["v"].isNumber() && spec["v"].numberInt() == 0) {
                return appendCommandStatus(
                    result,
                    Status(ErrorCodes::CannotCreateIndex,
                           str::stream() << "illegal index specification: " << spec << ". "
                                         << "The option v:0 cannot be passed explicitly"));
            }
        }

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            uassertStatusOK(indexer.init(specs));
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());

        // If this is a background index build, replace the exclusive db lock with an intent
        // lock, so that other readers and writers can proceed during this phase.
        if (indexer.getBuildInBackground()) {
            txn->recoveryUnit()->abandonSnapshot();
            dbLock.relockWithMode(MODE_IX);
            if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
                return appendCommandStatus(
                    result,
                    Status(ErrorCodes::NotMaster,
                           str::stream() << "Not primary while creating background indexes in "
                                         << ns.ns()));
            }
        }

        try {
            Lock::CollectionLock colLock(txn->lockState(), ns.ns(), MODE_IX);
            uassertStatusOK(indexer.insertAllDocumentsInCollection());
        } catch (const DBException& e) {
            invariant(e.getCode() != ErrorCodes::WriteConflict);
            // Must have exclusive DB lock before we clean up the index build via the
            // destructor of 'indexer'.
            if (indexer.getBuildInBackground()) {
                try {
                    // This function cannot throw today, but we will preemptively prepare for
                    // that day, to avoid data corruption due to lack of index cleanup.
                    txn->recoveryUnit()->abandonSnapshot();
                    dbLock.relockWithMode(MODE_X);
                    if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns)) {
                        return appendCommandStatus(
                            result,
                            Status(ErrorCodes::NotMaster,
                                   str::stream()
                                       << "Not primary while creating background indexes in "
                                       << ns.ns() << ": cleaning up index build failure due to "
                                       << e.toString()));
                    }
                } catch (...) {
                    std::terminate();
                }
            }
            throw;
        }
        // Need to get the db lock back in exclusive mode to complete the index build.
        if (indexer.getBuildInBackground()) {
            txn->recoveryUnit()->abandonSnapshot();
            dbLock.relockWithMode(MODE_X);
            uassert(ErrorCodes::NotMaster,
                    str::stream() << "Not primary while completing index build in " << dbname,
                    repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(ns));

            Database* db = dbHolder().get(txn, ns.db());
            uassert(28551, "database dropped during index build", db);
            uassert(28552, "collection dropped during index build", db->getCollection(ns.ns()));
        }

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            WriteUnitOfWork wunit(txn);

            indexer.commit();

            for (size_t i = 0; i < specs.size(); i++) {
                std::string systemIndexes = ns.getSystemIndexesCollection();
                getGlobalServiceContext()->getOpObserver()->onCreateIndex(
                    txn, systemIndexes, specs[i]);
            }

            wunit.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());

        result.append("numIndexesAfter", collection->getIndexCatalog()->numIndexesTotal(txn));

        lastOpSetterGuard.Dismiss();

        return true;
    }
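The lastOpSetterGuard above runs setLastOpToSystemLastOpTime on the client at scope exit unless the command dismisses it once its own index writes have advanced the last op. Below is a minimal, self-contained sketch of that object-guard pattern, assuming only the standard library; ReplClientInfoLike, LastOpGuard and runCreateIndexesLike are hypothetical stand-ins, not MongoDB's ScopeGuard or ReplClientInfo.

#include <iostream>

// Hypothetical stand-in for repl::ReplClientInfo; only the member the guard calls is modelled.
struct ReplClientInfoLike {
    void setLastOpToSystemLastOpTime() { std::cout << "lastOp advanced to system lastOpTime\n"; }
};

// Minimal object guard: calls a member function on the bound object at scope exit
// unless Dismiss() is called first, mirroring the MakeObjGuard/Dismiss usage above.
class LastOpGuard {
public:
    explicit LastOpGuard(ReplClientInfoLike& info) : _info(info) {}
    ~LastOpGuard() {
        if (_active) _info.setLastOpToSystemLastOpTime();
    }
    void Dismiss() { _active = false; }
private:
    ReplClientInfoLike& _info;
    bool _active = true;
};

bool runCreateIndexesLike(bool didWrite) {
    ReplClientInfoLike info;
    LastOpGuard lastOpSetterGuard(info);

    if (!didWrite) {
        // Early return without performing a write: the guard fires and advances lastOp,
        // so callers waiting on write concern still have an op time to wait for.
        return true;
    }

    // The command's own writes already advanced lastOp, so the fallback is not needed.
    lastOpSetterGuard.Dismiss();
    return true;
}

int main() {
    runCreateIndexesLike(false);  // guard fires
    runCreateIndexesLike(true);   // guard dismissed
}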
Example #28
0
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData,
    OperationContext* opCtx,
    size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // If we're killed and we're not waiting on any more batches to come back, then we are
        // ready to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock(opCtx);
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // Early returns from this point on should signal anyone waiting on an event, if ready()
    // is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.data, remote)
                               : cbData.response.status);

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());
            remote.status = cursorResponseStatus.getStatus();
        }

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params->isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<ClusterQueryResult> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;
        }

        return;
    }

    // Response successfully received.

    auto cursorResponse = std::move(cursorResponseStatus.getValue());

    // Update the cursorId; it is sent as '0' when the cursor has been exhausted on the shard.
    remote.cursorId = cursorResponse.getCursorId();

    // Save the batch in the remote's buffer.
    if (!addBatchToBuffer(remoteIndex, cursorResponse.getBatch())) {
        return;
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    // (Note: tailable cursors are only valid on unsharded collections, so the end of the batch from
    // one shard means the end of the overall batch).
    if (_params->isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the
    // batchSize was zero), then we can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params->isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(opCtx, remoteIndex);
        if (!remote.status.isOK()) {
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want waiter to be signalled on success as
    // well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
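The signaller guard above guarantees that every early return from handleBatchResponse wakes anyone waiting on the current event, while the success path dismisses the guard and signals explicitly. Below is a minimal sketch of that "dismiss, then call the action yourself" idiom, assuming only the standard library; ScopeGuardLite, signalWaiters and handleResponse are hypothetical names, not the ScopeGuard used above.

#include <functional>
#include <iostream>

// Tiny lambda-based scope guard: runs its action at scope exit unless dismissed.
class ScopeGuardLite {
public:
    explicit ScopeGuardLite(std::function<void()> fn) : _fn(std::move(fn)) {}
    ~ScopeGuardLite() {
        if (_active) _fn();
    }
    void Dismiss() { _active = false; }
private:
    std::function<void()> _fn;
    bool _active = true;
};

void signalWaiters() { std::cout << "waiters signalled\n"; }

// Hypothetical response handler: every early return must signal waiters, and the
// success path dismisses the guard and signals explicitly, so the signal fires
// exactly once on every path.
bool handleResponse(bool parseOk, bool scheduleOk) {
    ScopeGuardLite signaller(signalWaiters);

    if (!parseOk) return false;     // guard signals on this early return
    if (!scheduleOk) return false;  // ...and on this one

    signaller.Dismiss();            // success: take over responsibility...
    signalWaiters();                // ...and signal exactly once ourselves
    return true;
}

int main() {
    handleResponse(true, true);
    handleResponse(false, true);
}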
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);

        switch (status) {
        case PlanStage::ADVANCED:
            break;

        case PlanStage::FAILURE:
        case PlanStage::DEAD:
            *out = id;

            // If a stage fails, it may create a status WSM to indicate why it failed, in which
            // case 'id' is valid. If 'id' is invalid, we create our own error message.
            if (WorkingSet::INVALID_ID == id) {
                const std::string errmsg = "delete stage failed to read in results from child";
                *out = WorkingSetCommon::allocateStatusMember(
                           _ws, Status(ErrorCodes::InternalError, errmsg));
            }
            return status;

        case PlanStage::NEED_TIME:
            return status;

        case PlanStage::NEED_YIELD:
            *out = id;
            return status;

        case PlanStage::IS_EOF:
            return status;

        default:
            MONGO_UNREACHABLE;
        }
    }

    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    if (!member->hasRecordId()) {
        // We expect to be here because of an invalidation causing a force-fetch.
        ++_specificStats.nInvalidateSkips;
        return PlanStage::NEED_TIME;
    }
    RecordId recordId = member->recordId;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.
    invariant(member->hasObj());

    try {
        // If the snapshot changed, then we have to make sure we have the latest copy of the
        // doc and that it still matches.
        std::unique_ptr<SeekableRecordCursor> cursor;
        if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
            cursor = _collection->getCursor(getOpCtx());
            if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                // Doc is already deleted. Nothing more to do.
                return PlanStage::NEED_TIME;
            }

            // Make sure the re-fetched doc still matches the predicate.
            if (_params.canonicalQuery &&
                    !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                // Doesn't match.
                return PlanStage::NEED_TIME;
            }
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        if (_params.returnDeleted) {
            // Save a copy of the document that is about to get deleted, but keep it in the
            // RID_AND_OBJ state in case we need to retry deleting it.
            BSONObj deletedDoc = member->obj.value();
            member->obj.setValue(deletedDoc.getOwned());
        }

        // TODO: Do we want to buffer docs and delete them in a group rather than
        // saving/restoring state repeatedly?

        try {
            WorkingSetCommon::prepareForSnapshotChange(_ws);
            child()->saveState();
        } catch (const WriteConflictException& wce) {
            std::terminate();
        }

        // Do the write, unless this is an explain.
        if (!_params.isExplain) {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(), recordId, _params.fromMigrate);
            wunit.commit();
        }

        ++_specificStats.docsDeleted;
    } catch (const WriteConflictException& wce) {
        // When we're doing a findAndModify with a sort, the sort will have a limit of 1, so will
        // not produce any more results even if there is another matching document. Re-throw the WCE
        // here so that these operations get another chance to find a matching document. The
        // findAndModify command should automatically retry if it gets a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
            throw;
        }
        _idRetrying = id;
        memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'recordId' from the WorkingSetMember before returning it.
        member->recordId = RecordId();
        member->transitionToOwnedObj();
    }

    // As restoreState may restore (recreate) cursors, and cursors are tied to the transaction
    // in which they are created (a WriteUnitOfWork is a transaction), make sure to restore the
    // state outside of the WriteUnitOfWork.
    try {
        child()->restoreState();
    } catch (const WriteConflictException& wce) {
        // Note we don't need to retry anything in this case since the delete already
        // was committed. However, we still need to return the deleted document
        // (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
            memberFreer.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        return PlanStage::ADVANCED;
    }

    return PlanStage::NEED_TIME;
}
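The memberFreer guard above frees the working-set member on every exit path unless the stage still needs it, either to retry the delete after a write conflict or to return the deleted document. Below is a minimal sketch of that pattern, assuming only the standard library; WorkingSetLite, FreeGuard and workOnMember are hypothetical stand-ins for the WorkingSet/ScopeGuard types used above.

#include <cstdio>
#include <unordered_map>

// Hypothetical working set: ids map to buffered documents.
struct WorkingSetLite {
    std::unordered_map<int, const char*> slots;
    void free(int id) {
        slots.erase(id);
        std::printf("freed member %d\n", id);
    }
};

// Frees a member at scope exit unless dismissed because the caller still needs it.
class FreeGuard {
public:
    FreeGuard(WorkingSetLite& ws, int id) : _ws(ws), _id(id) {}
    ~FreeGuard() {
        if (_active) _ws.free(_id);
    }
    void Dismiss() { _active = false; }
private:
    WorkingSetLite& _ws;
    int _id;
    bool _active = true;
};

// Returns true if the member should be handed back to the caller.
bool workOnMember(WorkingSetLite& ws, int id, bool returnDeleted) {
    FreeGuard memberFreer(ws, id);  // default: free the member when we return
    // ... perform the delete here ...
    if (returnDeleted) {
        memberFreer.Dismiss();      // keep the member so it can be returned
        return true;
    }
    return false;                   // guard frees the member
}

int main() {
    WorkingSetLite ws;
    ws.slots[7] = "doc";
    workOnMember(ws, 7, /*returnDeleted=*/false);
}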
Example #30
0
        virtual bool run(OperationContext* txn,
                         const string& dbname,
                         BSONObj& cmdObj,
                         int,
                         string& errmsg,
                         BSONObjBuilder& result,
                         bool fromRepl) {
            Lock::GlobalWrite globalWriteLock(txn->lockState());
            string source = cmdObj.getStringField( name.c_str() );
            string target = cmdObj.getStringField( "to" );

            // We stay in source context the whole time. This is mostly to set the CurOp namespace.
            Client::Context ctx(txn, source);

            if ( !NamespaceString::validCollectionComponent(target.c_str()) ) {
                errmsg = "invalid collection name: " + target;
                return false;
            }
            if ( source.empty() || target.empty() ) {
                errmsg = "invalid command syntax";
                return false;
            }

            if (!fromRepl) {  // Skip these checks for replicated renames; the primary already allowed the write.
                Status sourceStatus = userAllowedWriteNS(source);
                if (!sourceStatus.isOK()) {
                    errmsg = "error with source namespace: " + sourceStatus.reason();
                    return false;
                }

                Status targetStatus = userAllowedWriteNS(target);
                if (!targetStatus.isOK()) {
                    errmsg = "error with target namespace: " + targetStatus.reason();
                    return false;
                }
            }

            if (NamespaceString(source).coll() == "system.indexes"
                || NamespaceString(target).coll() == "system.indexes") {
                errmsg = "renaming system.indexes is not allowed";
                return false;
            }

            Database* const sourceDB = dbHolder().get(txn, nsToDatabase(source));
            Collection* const sourceColl = sourceDB ? sourceDB->getCollection(txn, source)
                                                    : NULL;
            if (!sourceColl) {
                errmsg = "source namespace does not exist";
                return false;
            }

            {
                // Ensure that collection name does not exceed maximum length.
                // Ensure that index names do not push the length over the max.
                // Iterator includes unfinished indexes.
                IndexCatalog::IndexIterator sourceIndIt =
                    sourceColl->getIndexCatalog()->getIndexIterator( txn, true );
                int longestIndexNameLength = 0;
                while ( sourceIndIt.more() ) {
                    int thisLength = sourceIndIt.next()->indexName().length();
                    if ( thisLength > longestIndexNameLength )
                        longestIndexNameLength = thisLength;
                }

                unsigned int longestAllowed =
                    min(int(NamespaceString::MaxNsCollectionLen),
                        int(NamespaceString::MaxNsLen) - 2/*strlen(".$")*/ - longestIndexNameLength);
                if (target.size() > longestAllowed) {
                    StringBuilder sb;
                    sb << "collection name length of " << target.size()
                       << " exceeds maximum length of " << longestAllowed
                       << ", allowing for index names";
                    errmsg = sb.str();
                    return false;
                }
            }

            const std::vector<BSONObj> indexesInProg = stopIndexBuilds(txn, sourceDB, cmdObj);
            // Dismissed on success
            ScopeGuard indexBuildRestorer = MakeGuard(IndexBuilder::restoreIndexes, indexesInProg);

            Database* const targetDB = dbHolder().openDb(txn, nsToDatabase(target));

            {
                WriteUnitOfWork wunit(txn);

                // Check if the target namespace exists and if dropTarget is true.
                // If target exists and dropTarget is not true, return false.
                if (targetDB->getCollection(txn, target)) {
                    if (!cmdObj["dropTarget"].trueValue()) {
                        errmsg = "target namespace exists";
                        return false;
                    }

                    Status s = targetDB->dropCollection(txn, target);
                    if ( !s.isOK() ) {
                        errmsg = s.toString();
                        return false;
                    }
                }

                // If we are renaming in the same database, just
                // rename the namespace and we're done.
                if (sourceDB == targetDB) {
                    Status s = targetDB->renameCollection(txn,
                                                          source,
                                                          target,
                                                          cmdObj["stayTemp"].trueValue() );
                    if (!s.isOK()) {
                        return appendCommandStatus(result, s);
                    }

                    if (!fromRepl) {
                        repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);
                    }

                    wunit.commit();
                    indexBuildRestorer.Dismiss();
                    return true;
                }

                wunit.commit();
            }

            // If we get here, we are renaming across databases, so we must copy all the data and
            // indexes, then remove the source collection.

            // Create the target collection. It will be removed if we fail to copy the collection.
            // TODO use a temp collection and unset the temp flag on success.
            Collection* targetColl = NULL;
            {
                CollectionOptions options;
                options.setNoIdIndex();

                if (sourceColl->isCapped()) {
                    const CollectionOptions sourceOpts =
                        sourceColl->getCatalogEntry()->getCollectionOptions(txn);

                    options.capped = true;
                    options.cappedSize = sourceOpts.cappedSize;
                    options.cappedMaxDocs = sourceOpts.cappedMaxDocs;
                }

                WriteUnitOfWork wunit(txn);

                // No logOp necessary because the entire renameCollection command is one logOp.
                targetColl = targetDB->createCollection(txn, target, options);
                if (!targetColl) {
                    errmsg = "Failed to create target collection.";
                    return false;
                }

                wunit.commit();
            }

            // Dismissed on success
            ScopeGuard targetCollectionDropper = MakeGuard(dropCollection, txn, targetDB, target);

            MultiIndexBlock indexer(txn, targetColl);
            indexer.allowInterruption();

            // Copy the index descriptions from the source collection, adjusting the ns field.
            {
                std::vector<BSONObj> indexesToCopy;
                IndexCatalog::IndexIterator sourceIndIt =
                    sourceColl->getIndexCatalog()->getIndexIterator( txn, true );
                while (sourceIndIt.more()) {
                    const BSONObj currIndex = sourceIndIt.next()->infoObj();

                    // Process the source index.
                    BSONObjBuilder newIndex;
                    newIndex.append("ns", target);
                    newIndex.appendElementsUnique(currIndex);
                    indexesToCopy.push_back(newIndex.obj());
                }
                indexer.init(indexesToCopy);
            }

            {
                // Copy over all the data from source collection to target collection.
                boost::scoped_ptr<RecordIterator> sourceIt(sourceColl->getIterator(txn));
                while (!sourceIt->isEOF()) {
                    txn->checkForInterrupt(false);

                    const BSONObj obj = sourceColl->docFor(txn, sourceIt->getNext());

                    WriteUnitOfWork wunit(txn);
                    // No logOp necessary because the entire renameCollection command is one logOp.
                    Status status = targetColl->insertDocument(txn, obj, &indexer, true).getStatus();
                    if (!status.isOK())
                        return appendCommandStatus(result, status);
                    wunit.commit();
                }
            }

            Status status = indexer.doneInserting();
            if (!status.isOK())
                return appendCommandStatus(result, status);

            {
                // Getting here means we successfully built the target copy. We now remove the
                // source collection and finalize the rename.
                WriteUnitOfWork wunit(txn);

                Status status = sourceDB->dropCollection(txn, source);
                if (!status.isOK())
                    return appendCommandStatus(result, status);

                indexer.commit();

                if (!fromRepl) {
                    repl::logOp(txn, "c", (dbname + ".$cmd").c_str(), cmdObj);
                }

                wunit.commit();
            }

            indexBuildRestorer.Dismiss();
            targetCollectionDropper.Dismiss();
            return true;
        }
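The rename command above stacks two rollback guards, indexBuildRestorer and targetCollectionDropper, and dismisses both only once the cross-database copy has fully succeeded; if any step fails, the guards undo the partial work in reverse order of declaration. Below is a minimal sketch of that rollback-stacking idiom, assuming only the standard library; UndoGuard and renameAcrossDatabases are hypothetical names, not the helpers used above.

#include <functional>
#include <iostream>

// Runs an undo action at scope exit unless dismissed after the operation succeeds.
class UndoGuard {
public:
    explicit UndoGuard(std::function<void()> undo) : _undo(std::move(undo)) {}
    ~UndoGuard() {
        if (_active) _undo();
    }
    void Dismiss() { _active = false; }
private:
    std::function<void()> _undo;
    bool _active = true;
};

bool renameAcrossDatabases(bool copySucceeds) {
    UndoGuard restoreIndexBuilds([] { std::cout << "restarting stopped index builds\n"; });
    UndoGuard dropTargetCollection([] { std::cout << "dropping half-built target collection\n"; });

    if (!copySucceeds) {
        return false;  // both guards fire (in reverse order), undoing the partial work
    }

    // Success: the target is complete, so neither undo action should run.
    restoreIndexBuilds.Dismiss();
    dropTargetCollection.Dismiss();
    return true;
}

int main() {
    renameAcrossDatabases(false);  // guards fire
    renameAcrossDatabases(true);   // guards dismissed
}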