Beispiel #1
0
    void Socket::recv( char * buf , int len ) {
        int retries = 0;
        while( len > 0 ) {
            int ret = -1;
            if (MONGO_FAIL_POINT(throwSockExcep)) {
#if defined(_WIN32)
                WSASetLastError(WSAENETUNREACH);
#else
                errno = ENETUNREACH;
#endif
            }
            else {
                ret = unsafe_recv(buf, len);
            }
            if (ret <= 0) {
                _handleRecvError(ret, len, &retries);
                continue;
            }

            if ( len <= 4 && ret != len )
                LOG(_logLevel) << "Socket recv() got " << ret <<
                    " bytes wanted len=" << len << endl;
            fassert(16508, ret <= len);
            len -= ret;
            buf += ret;
        }
    }
Beispiel #2
0
    // sends all data or throws an exception
    void Socket::send( const char * data , int len, const char *context ) {
        while( len > 0 ) {
            int ret = -1;
            if (MONGO_FAIL_POINT(throwSockExcep)) {
#if defined(_WIN32)
                WSASetLastError(WSAENETUNREACH);
#else
                errno = ENETUNREACH;
#endif
            }
            else {
                ret = _send(data, len);
            }

            if (ret == -1)
                _handleSendError(ret, context);

            _bytesOut += ret;

            fassert(16507, ret <= len);
            len -= ret;
            data += ret;

        }
    }
Beispiel #3
0
StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* txn,
                                                   const GetMoreRequest& request) {
    auto cursorManager = grid.getCursorManager();

    auto pinnedCursor = cursorManager->checkOutCursor(request.nss, request.cursorid);
    if (!pinnedCursor.isOK()) {
        return pinnedCursor.getStatus();
    }
    invariant(request.cursorid == pinnedCursor.getValue().getCursorId());

    // If the fail point is enabled, busy wait until it is disabled.
    while (MONGO_FAIL_POINT(keepCursorPinnedDuringGetMore)) {
    }

    if (request.awaitDataTimeout) {
        auto status = pinnedCursor.getValue().setAwaitDataTimeout(*request.awaitDataTimeout);
        if (!status.isOK()) {
            return status;
        }
    }

    std::vector<BSONObj> batch;
    int bytesBuffered = 0;
    long long batchSize = request.batchSize.value_or(0);
    long long startingFrom = pinnedCursor.getValue().getNumReturnedSoFar();
    auto cursorState = ClusterCursorManager::CursorState::NotExhausted;
    while (!FindCommon::enoughForGetMore(batchSize, batch.size())) {
        auto next = pinnedCursor.getValue().next();
        if (!next.isOK()) {
            return next.getStatus();
        }

        if (!next.getValue()) {
            // We reached end-of-stream.
            if (!pinnedCursor.getValue().isTailable()) {
                cursorState = ClusterCursorManager::CursorState::Exhausted;
            }
            break;
        }

        if (!FindCommon::haveSpaceForNext(*next.getValue(), batch.size(), bytesBuffered)) {
            pinnedCursor.getValue().queueResult(*next.getValue());
            break;
        }

        // Add doc to the batch. Account for the space overhead associated with returning this doc
        // inside a BSON array.
        bytesBuffered += (next.getValue()->objsize() + kPerDocumentOverheadBytesUpperBound);
        batch.push_back(std::move(*next.getValue()));
    }

    // Transfer ownership of the cursor back to the cursor manager.
    pinnedCursor.getValue().returnCursor(cursorState);

    CursorId idToReturn = (cursorState == ClusterCursorManager::CursorState::Exhausted)
        ? CursorId(0)
        : request.cursorid;
    return CursorResponse(request.nss, idToReturn, std::move(batch), startingFrom);
}
void NetworkInterfaceASIO::_asyncRunCommand(AsyncOp* op, NetworkOpHandler handler) {
    LOG(2) << "Starting asynchronous command " << op->request().id << " on host "
           << op->request().target.toString();

    if (MONGO_FAIL_POINT(NetworkInterfaceASIOasyncRunCommandFail)) {
        _validateAndRun(op, asio::error::basic_errors::network_unreachable, [] {});
        return;
    }

    // We invert the following steps below to run a command:
    // 1 - send the given command
    // 2 - receive a header for the response
    // 3 - validate and receive response body
    // 4 - advance the state machine by calling handler()
    auto cmd = op->command();

    // Step 4
    auto recvMessageCallback = [this, cmd, handler, op](std::error_code ec, size_t bytes) {
        // We don't call _validateAndRun here as we assume the caller will.
        handler(ec, bytes);
    };

    // Step 3
    auto recvHeaderCallback = [this, cmd, handler, recvMessageCallback, op](std::error_code ec,
                                                                            size_t bytes) {
        // The operation could have been canceled after starting the command, but before
        // receiving the header
        _validateAndRun(op, ec, [this, op, recvMessageCallback, ec, bytes, cmd, handler] {
            // validate response id
            uint32_t expectedId = cmd->toSend().header().getId();
            uint32_t actualId = cmd->header().constView().getResponseToMsgId();
            if (actualId != expectedId) {
                LOG(3) << "got wrong response:"
                       << " expected response id: " << expectedId
                       << ", got response id: " << actualId;
                return handler(make_error_code(ErrorCodes::ProtocolError), bytes);
            }

            asyncRecvMessageBody(cmd->conn().stream(),
                                 &cmd->header(),
                                 &cmd->toRecv(),
                                 std::move(recvMessageCallback));
        });
    };

    // Step 2
    auto sendMessageCallback = [this, cmd, handler, recvHeaderCallback, op](std::error_code ec,
                                                                            size_t bytes) {
        _validateAndRun(op, ec, [this, cmd, op, recvHeaderCallback] {
            asyncRecvMessageHeader(
                cmd->conn().stream(), &cmd->header(), std::move(recvHeaderCallback));
        });


    };

    // Step 1
    asyncSendMessage(cmd->conn().stream(), &cmd->toSend(), std::move(sendMessageCallback));
}
bool OperationContext::hasDeadlineExpired() const {
    if (!hasDeadline()) {
        return false;
    }
    if (MONGO_FAIL_POINT(maxTimeNeverTimeOut)) {
        return false;
    }
    if (MONGO_FAIL_POINT(maxTimeAlwaysTimeOut)) {
        return true;
    }

    // TODO: Remove once all OperationContexts are properly connected to Clients and ServiceContexts
    // in tests.
    if (MONGO_unlikely(!getClient() || !getServiceContext())) {
        return false;
    }

    const auto now = getServiceContext()->getFastClockSource()->now();
    return now >= getDeadline();
}
Beispiel #6
0
void CollectionCloner::_insertDocumentsCallback(
    const executor::TaskExecutor::CallbackArgs& cbd,
    bool lastBatch,
    std::shared_ptr<OnCompletionGuard> onCompletionGuard) {
    if (!cbd.status.isOK()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, cbd.status);
        return;
    }

    UniqueLock lk(_mutex);
    std::vector<BSONObj> docs;
    if (_documentsToInsert.size() == 0) {
        warning() << "_insertDocumentsCallback, but no documents to insert for ns:" << _destNss;
        if (lastBatch) {
            onCompletionGuard->setResultAndCancelRemainingWork_inlock(lk, Status::OK());
        }
        return;
    }
    _documentsToInsert.swap(docs);
    _stats.documentsCopied += docs.size();
    ++_stats.fetchBatches;
    _progressMeter.hit(int(docs.size()));
    invariant(_collLoader);
    const auto status = _collLoader->insertDocuments(docs.cbegin(), docs.cend());
    if (!status.isOK()) {
        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lk, status);
        return;
    }

    MONGO_FAIL_POINT_BLOCK(initialSyncHangDuringCollectionClone, options) {
        const BSONObj& data = options.getData();
        if (data["namespace"].String() == _destNss.ns() &&
            static_cast<int>(_stats.documentsCopied) >= data["numDocsToClone"].numberInt()) {
            lk.unlock();
            log() << "initial sync - initialSyncHangDuringCollectionClone fail point "
                     "enabled. Blocking until fail point is disabled.";
            while (MONGO_FAIL_POINT(initialSyncHangDuringCollectionClone) && !_isShuttingDown()) {
                mongo::sleepsecs(1);
            }
            lk.lock();
        }
    }

    if (lastBatch) {
        // Clean up resources once the last batch has been copied over and set the status to OK.
        onCompletionGuard->setResultAndCancelRemainingWork_inlock(lk, Status::OK());
    }
}
Beispiel #7
0
    // sends all data or throws an exception
    void Socket::send( const char * data , int len, const char *context ) {
        while( len > 0 ) {
            int ret = _send( data , len  );
            if (ret == -1 || MONGO_FAIL_POINT(throwSockExcep)) {
                _handleSendError(ret, context);
            }

            _bytesOut += ret;

            fassert(16507, ret <= len);
            len -= ret;
            data += ret;

        }
    }
void WriteUnitOfWork::commit() {
    invariant(!_committed);
    invariant(!_released);
    invariant(_opCtx->_ruState == RecoveryUnitState::kActiveUnitOfWork);
    if (_toplevel) {
        if (MONGO_FAIL_POINT(sleepBeforeCommit)) {
            sleepFor(Milliseconds(100));
        }

        _opCtx->recoveryUnit()->commitUnitOfWork();
        _opCtx->_ruState = RecoveryUnitState::kNotInUnitOfWork;
    }
    _opCtx->lockState()->endWriteUnitOfWork();
    _committed = true;
}
    RecordFetcher* MmapV1ExtentManager::recordNeedsFetch( const DiskLoc& loc ) const {
        Record* record = _recordForV1( loc );

        // For testing: if failpoint is enabled we randomly request fetches without
        // going to the RecordAccessTracker.
        if ( MONGO_FAIL_POINT( recordNeedsFetchFail ) ) {
            needsFetchFailCounter.increment();
            if ( ( needsFetchFailCounter.get() % kNeedsFetchFailFreq ) == 0 ) {
                return new MmapV1RecordFetcher( record );
            }
        }

        if ( !_recordAccessTracker->checkAccessedAndMark( record ) ) {
            return new MmapV1RecordFetcher( record );
        }

        return NULL;
    }
    std::unique_ptr<RecordFetcher> MmapV1ExtentManager::recordNeedsFetch(const DiskLoc& loc) const {
        if (loc.isNull()) return {};
        MmapV1RecordHeader* record = _recordForV1( loc );

        // For testing: if failpoint is enabled we randomly request fetches without
        // going to the RecordAccessTracker.
        if ( MONGO_FAIL_POINT( recordNeedsFetchFail ) ) {
            needsFetchFailCounter.increment();
            if ( ( needsFetchFailCounter.get() % kNeedsFetchFailFreq ) == 0 ) {
                return stdx::make_unique<MmapV1RecordFetcher>( record );
            }
        }

        if ( !_recordAccessTracker->checkAccessedAndMark( record ) ) {
            return stdx::make_unique<MmapV1RecordFetcher>( record );
        }

        return {};
    }
Beispiel #11
0
void Socket::recv(char* buf, int len) {
    while (len > 0) {
        int ret = -1;
        if (MONGO_FAIL_POINT(throwSockExcep)) {
#if defined(_WIN32)
            WSASetLastError(WSAENETUNREACH);
#else
            errno = ENETUNREACH;
#endif
            if (ret <= 0) {
                handleRecvError(ret, len);
                continue;
            }
        } else {
            ret = unsafe_recv(buf, len);
        }

        fassert(16508, ret <= len);
        len -= ret;
        buf += ret;
    }
}
Beispiel #12
0
int Balancer::_moveChunks(OperationContext* txn,
                          const BalancerChunkSelectionPolicy::MigrateInfoVector& candidateChunks,
                          const MigrationSecondaryThrottleOptions& secondaryThrottle,
                          bool waitForDelete) {
    int movedCount = 0;

    for (const auto& migrateInfo : candidateChunks) {
        // If the balancer was disabled since we started this round, don't start new chunks
        // moves.
        if (!Grid::get(txn)->getBalancerConfiguration()->isBalancerActive() ||
            MONGO_FAIL_POINT(skipBalanceRound)) {
            LOG(1) << "Stopping balancing round early as balancing was disabled";
            return movedCount;
        }

        // Changes to metadata, borked metadata, and connectivity problems between shards
        // should cause us to abort this chunk move, but shouldn't cause us to abort the entire
        // round of chunks.
        //
        // TODO(spencer): We probably *should* abort the whole round on issues communicating
        // with the config servers, but its impossible to distinguish those types of failures
        // at the moment.
        //
        // TODO: Handle all these things more cleanly, since they're expected problems

        const NamespaceString nss(migrateInfo.ns);

        try {
            shared_ptr<DBConfig> cfg =
                uassertStatusOK(grid.catalogCache()->getDatabase(txn, nss.db().toString()));

            // NOTE: We purposely do not reload metadata here, since _getCandidateChunks already
            // tried to do so once
            shared_ptr<ChunkManager> cm = cfg->getChunkManager(txn, migrateInfo.ns);
            uassert(28628,
                    str::stream()
                        << "Collection " << migrateInfo.ns
                        << " was deleted while balancing was active. Aborting balancing round.",
                    cm);

            shared_ptr<Chunk> c = cm->findIntersectingChunk(txn, migrateInfo.minKey);

            if (c->getMin().woCompare(migrateInfo.minKey) ||
                c->getMax().woCompare(migrateInfo.maxKey)) {
                // Likely a split happened somewhere, so force reload the chunk manager
                cm = cfg->getChunkManager(txn, migrateInfo.ns, true);
                invariant(cm);

                c = cm->findIntersectingChunk(txn, migrateInfo.minKey);

                if (c->getMin().woCompare(migrateInfo.minKey) ||
                    c->getMax().woCompare(migrateInfo.maxKey)) {
                    log() << "chunk mismatch after reload, ignoring will retry issue "
                          << migrateInfo;

                    continue;
                }
            }

            BSONObj res;
            if (c->moveAndCommit(txn,
                                 migrateInfo.to,
                                 Grid::get(txn)->getBalancerConfiguration()->getMaxChunkSizeBytes(),
                                 secondaryThrottle,
                                 waitForDelete,
                                 0, /* maxTimeMS */
                                 res)) {
                movedCount++;
                continue;
            }

            log() << "balancer move failed: " << res << ", migrate: " << migrateInfo;

            Status moveStatus = getStatusFromCommandResult(res);

            if (moveStatus == ErrorCodes::ChunkTooBig || res["chunkTooBig"].trueValue()) {
                // Reload just to be safe
                cm = cfg->getChunkManager(txn, migrateInfo.ns);
                invariant(cm);

                c = cm->findIntersectingChunk(txn, migrateInfo.minKey);

                log() << "performing a split because migrate failed for size reasons";

                auto splitStatus = c->split(txn, Chunk::normal, NULL);
                if (!splitStatus.isOK()) {
                    log() << "marking chunk as jumbo: " << c->toString();

                    c->markAsJumbo(txn);

                    // We increment moveCount so we do another round right away
                    movedCount++;
                }
            }
        } catch (const DBException& ex) {
            log() << "balancer move " << migrateInfo << " failed" << causedBy(ex);
        }
    }

    return movedCount;
}
Beispiel #13
0
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();
        if (!_init(txn.get())) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    Seconds balanceRoundInterval(kBalanceRoundDefaultInterval);

    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();

        BalanceRoundDetails roundDetails;

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping(txn.get(), false);

            MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) {
                const BSONObj& data = scopedBalancerRoundInterval.getData();
                balanceRoundInterval = Seconds(data["sleepSecs"].numberInt());
            }

            // Use fresh shard state and balancer settings
            Grid::get(txn.get())->shardRegistry()->reload(txn.get());

            auto balancerConfig = Grid::get(txn.get())->getBalancerConfiguration();
            Status refreshStatus = balancerConfig->refreshAndCheck(txn.get());
            if (!refreshStatus.isOK()) {
                warning() << "Skipping balancing round" << causedBy(refreshStatus);
                sleepFor(balanceRoundInterval);
                continue;
            }

            // now make sure we should even be running
            if (!balancerConfig->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(txn.get(), true);

                sleepFor(balanceRoundInterval);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            {
                auto scopedDistLock = grid.catalogManager(txn.get())
                                          ->distLock(txn.get(),
                                                     "balancer",
                                                     "doing balance round",
                                                     DistLockManager::kSingleLockAttemptTimeout);

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(txn.get(), true);

                    sleepFor(balanceRoundInterval);  // no need to wake up soon
                    continue;
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << balancerConfig->waitForDelete()
                       << ", secondaryThrottle: "
                       << balancerConfig->getSecondaryThrottle().toBSON();

                OCCASIONALLY warnOnMultiVersion(
                    uassertStatusOK(_clusterStats->getStats(txn.get())));

                Status status = _enforceTagRanges(txn.get());
                if (!status.isOK()) {
                    warning() << "Failed to enforce tag ranges" << causedBy(status);
                } else {
                    LOG(1) << "Done enforcing tag range boundaries.";
                }

                const auto candidateChunks = uassertStatusOK(
                    _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime = _moveChunks(txn.get(),
                                                    candidateChunks,
                                                    balancerConfig->getSecondaryThrottle(),
                                                    balancerConfig->waitForDelete());

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    grid.catalogManager(txn.get())
                        ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(txn.get(), true);

            sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            grid.catalogManager(txn.get())
                ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            sleepFor(balanceRoundInterval);
        }
    }
}
Beispiel #14
0
    void Socket::_handleSendError(int ret, const char* context) {
#ifdef MONGO_SSL
        if (_ssl) {
            LOG(_logLevel) << "SSL Error ret: " << ret << " err: " << SSL_get_error(_ssl , ret) 
                           << " " << ERR_error_string(ERR_get_error(), NULL) 
                           << endl;
            throw SocketException(SocketException::SEND_ERROR , remoteString());
        }
#endif

#if defined(_WIN32)
        const int mongo_errno = WSAGetLastError();
        if ( mongo_errno == WSAETIMEDOUT && _timeout != 0 ) {
#else
        const int mongo_errno = errno;
        if ( ( mongo_errno == EAGAIN || mongo_errno == EWOULDBLOCK ) && _timeout != 0 ) {
#endif
            LOG(_logLevel) << "Socket " << context << 
                " send() timed out " << remoteString() << endl;
            throw SocketException(SocketException::SEND_TIMEOUT , remoteString());
        }
        else {
            LOG(_logLevel) << "Socket " << context << " send() "
                           << errnoWithDescription(mongo_errno) << ' ' << remoteString() << endl;
            throw SocketException(SocketException::SEND_ERROR , remoteString());            
        }
    }

    void Socket::_handleRecvError(int ret, int len, int* retries) {
        if (ret == 0 || MONGO_FAIL_POINT(throwSockExcep)) {
            LOG(3) << "Socket recv() conn closed? " << remoteString() << endl;
            throw SocketException(SocketException::CLOSED , remoteString());
        }
     
        // ret < 0
#ifdef MONGO_SSL
        if (_ssl) {
            LOG(_logLevel) << "SSL Error ret: " << ret << " err: " << SSL_get_error(_ssl , ret) 
                           << " " << ERR_error_string(ERR_get_error(), NULL) 
                           << endl;
            throw SocketException(SocketException::RECV_ERROR, remoteString());
        }
#endif

#if defined(_WIN32)
        int e = WSAGetLastError();
#else
        int e = errno;
# if defined(EINTR)
        if (e == EINTR) {
            LOG(_logLevel) << "EINTR retry " << ++*retries << endl;
            return;
        }
# endif
#endif

#if defined(_WIN32)
        // Windows
        if ((e == EAGAIN || e == WSAETIMEDOUT) && _timeout > 0) { 
#else
        if (e == EAGAIN && _timeout > 0) { 
#endif
            // this is a timeout
            LOG(_logLevel) << "Socket recv() timeout  " << remoteString() <<endl;
            throw SocketException(SocketException::RECV_TIMEOUT, remoteString());
        }

        LOG(_logLevel) << "Socket recv() " << 
            errnoWithDescription(e) << " " << remoteString() <<endl;
        throw SocketException(SocketException::RECV_ERROR , remoteString());
    }

    void Socket::setTimeout( double secs ) {
        setSockTimeouts( _fd, secs );
    }

#if defined(_WIN32)
    struct WinsockInit {
        WinsockInit() {
            WSADATA d;
            if ( WSAStartup(MAKEWORD(2,2), &d) != 0 ) {
                out() << "ERROR: wsastartup failed " << errnoWithDescription() << endl;
                problem() << "ERROR: wsastartup failed " << errnoWithDescription() << endl;
                _exit(EXIT_NTSERVICE_ERROR);
            }
        }
    } winsock_init;
#endif

} // namespace mongo
Beispiel #15
0
    /** sends all data or throws an exception
     * @param context descriptive for logging
     */
    void Socket::send( const vector< pair< char *, int > > &data, const char *context ) {

#ifdef MONGO_SSL
        if ( _ssl ) {
            _send( data , context );
            return;
        }
#endif

#if defined(_WIN32)
        // TODO use scatter/gather api
        _send( data , context );
#else
        vector<struct iovec> d( data.size() );
        int i = 0;
        for (vector< pair<char *, int> >::const_iterator j = data.begin(); 
             j != data.end(); 
             ++j) {
            if ( j->second > 0 ) {
                d[ i ].iov_base = j->first;
                d[ i ].iov_len = j->second;
                ++i;
                _bytesOut += j->second;
            }
        }
        struct msghdr meta;
        memset( &meta, 0, sizeof( meta ) );
        meta.msg_iov = &d[ 0 ];
        meta.msg_iovlen = d.size();

        while( meta.msg_iovlen > 0 ) {
            int ret = ::sendmsg( _fd , &meta , portSendFlags );
            if ( ret == -1 || MONGO_FAIL_POINT(throwSockExcep)) {
                if ( errno != EAGAIN || _timeout == 0 ) {
                    LOG(_logLevel) << "Socket " << context << 
                        " send() " << errnoWithDescription() << ' ' << remoteString() << endl;
                    throw SocketException( SocketException::SEND_ERROR , remoteString() );
                }
                else {
                    LOG(_logLevel) << "Socket " << context << 
                        " send() remote timeout " << remoteString() << endl;
                    throw SocketException( SocketException::SEND_TIMEOUT , remoteString() );
                }
            }
            else {
                struct iovec *& i = meta.msg_iov;
                while( ret > 0 ) {
                    if ( i->iov_len > unsigned( ret ) ) {
                        i->iov_len -= ret;
                        i->iov_base = (char*)(i->iov_base) + ret;
                        ret = 0;
                    }
                    else {
                        ret -= i->iov_len;
                        ++i;
                        --(meta.msg_iovlen);
                    }
                }
            }
        }
#endif
    }
void WiredTigerOplogManager::_oplogJournalThreadLoop(WiredTigerSessionCache* sessionCache,
                                                     WiredTigerRecordStore* oplogRecordStore,
                                                     const bool updateOldestTimestamp) noexcept {
    Client::initThread("WTOplogJournalThread");

    // This thread updates the oplog read timestamp, the timestamp used to read from the oplog with
    // forward cursors.  The timestamp is used to hide oplog entries that might be committed but
    // have uncommitted entries ahead of them.
    while (true) {
        stdx::unique_lock<stdx::mutex> lk(_oplogVisibilityStateMutex);
        {
            MONGO_IDLE_THREAD_BLOCK;
            _opsWaitingForJournalCV.wait(lk,
                                         [&] { return _shuttingDown || _opsWaitingForJournal; });

            // If we're not shutting down and nobody is actively waiting for the oplog to become
            // durable, delay journaling a bit to reduce the sync rate.
            auto journalDelay = Milliseconds(storageGlobalParams.journalCommitIntervalMs.load());
            if (journalDelay == Milliseconds(0)) {
                journalDelay = Milliseconds(WiredTigerKVEngine::kDefaultJournalDelayMillis);
            }
            auto now = Date_t::now();
            auto deadline = now + journalDelay;
            auto shouldSyncOpsWaitingForJournal = [&] {
                return _shuttingDown || _opsWaitingForVisibility ||
                    oplogRecordStore->haveCappedWaiters();
            };

            // Eventually it would be more optimal to merge this with the normal journal flushing
            // and block for either oplog tailers or operations waiting for oplog visibility. For
            // now this loop will poll once a millisecond up to the journalDelay to see if we have
            // any waiters yet. This reduces sync-related I/O on the primary when secondaries are
            // lagged, but will avoid significant delays in confirming majority writes on replica
            // sets with infrequent writes.
            // Callers of waitForAllEarlierOplogWritesToBeVisible() like causally consistent reads
            // will preempt this delay.
            while (now < deadline &&
                   !_opsWaitingForJournalCV.wait_until(
                       lk, now.toSystemTimePoint(), shouldSyncOpsWaitingForJournal)) {
                now += Milliseconds(1);
            }
        }

        while (!_shuttingDown && MONGO_FAIL_POINT(WTPausePrimaryOplogDurabilityLoop)) {
            lk.unlock();
            sleepmillis(10);
            lk.lock();
        }

        if (_shuttingDown) {
            log() << "oplog journal thread loop shutting down";
            return;
        }
        invariant(_opsWaitingForJournal);
        _opsWaitingForJournal = false;
        lk.unlock();

        const uint64_t newTimestamp = fetchAllCommittedValue(sessionCache->conn());

        // The newTimestamp may actually go backward during secondary batch application,
        // where we commit data file changes separately from oplog changes, so ignore
        // a non-incrementing timestamp.
        if (newTimestamp <= _oplogReadTimestamp.load()) {
            LOG(2) << "no new oplog entries were made visible: " << newTimestamp;
            continue;
        }

        // In order to avoid oplog holes after an unclean shutdown, we must ensure this proposed
        // oplog read timestamp's documents are durable before publishing that timestamp.
        sessionCache->waitUntilDurable(/*forceCheckpoint=*/false, false);

        lk.lock();
        // Publish the new timestamp value.  Avoid going backward.
        auto oldTimestamp = getOplogReadTimestamp();
        if (newTimestamp > oldTimestamp) {
            _setOplogReadTimestamp(lk, newTimestamp);
        }
        lk.unlock();

        if (updateOldestTimestamp) {
            const bool force = false;
            sessionCache->getKVEngine()->setOldestTimestamp(Timestamp(newTimestamp), force);
        }

        // Wake up any await_data cursors and tell them more data might be visible now.
        oplogRecordStore->notifyCappedWaitersIfNeeded();
    }
}
Beispiel #17
0
    /**
     * Called by db/instance.cpp.  This is the getMore entry point.
     *
     * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls 
     *        when this method returns an empty result, incrementing pass on each call.  
     *        Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
     */
    QueryResult::View getMore(OperationContext* txn,
                              const char* ns,
                              int ntoreturn,
                              long long cursorid,
                              CurOp& curop,
                              int pass,
                              bool& exhaust,
                              bool* isCursorAuthorized) {

        // For testing, we may want to fail if we receive a getmore.
        if (MONGO_FAIL_POINT(failReceivedGetmore)) {
            invariant(0);
        }

        exhaust = false;

        const NamespaceString nss(ns);

        // Depending on the type of cursor being operated on, we hold locks for the whole getMore,
        // or none of the getMore, or part of the getMore.  The three cases in detail:
        //
        // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
        // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors don't
        //    own any collection state.
        // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
        //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
        //    release), but the pin and unpin of the cursor must occur under the collection lock.
        //    We don't use our AutoGetCollectionForRead "ctx" to relock, because
        //    AutoGetCollectionForRead checks the sharding version (and we want the relock for the
        //    unpin to succeed even if the sharding version has changed).
        //
        // Note that we declare our locks before our ClientCursorPin, in order to ensure that the
        // pin's destructor is called before the lock destructors (so that the unpin occurs under
        // the lock).
        boost::scoped_ptr<AutoGetCollectionForRead> ctx;
        boost::scoped_ptr<Lock::DBLock> unpinDBLock;
        boost::scoped_ptr<Lock::CollectionLock> unpinCollLock;

        CursorManager* cursorManager;
        CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
        if (globalCursorManager->ownsCursorId(cursorid)) {
            cursorManager = globalCursorManager;
        }
        else {
            ctx.reset(new AutoGetCollectionForRead(txn, nss));
            Collection* collection = ctx->getCollection();
            uassert( 17356, "collection dropped between getMore calls", collection );
            cursorManager = collection->getCursorManager();
        }

        LOG(5) << "Running getMore, cursorid: " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                true);
        uassertStatusOK(status);

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorManager, cursorid);
        ClientCursor* cc = ccPin.c();

        // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
        // used to create the execution machinery inside the cursor with our OperationContext.
        // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
        // that further calls to getMore won't fail, and that the provided OperationContext
        // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
        //
        // This must be destroyed before the ClientCursor is destroyed.
        std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        const int InitialBufSize =
            512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(InitialBufSize);
        bb.skip(sizeof(QueryResult::Value));

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Check for spoofing of the ns such that it does not match the one originally
            // there for the cursor.
            uassert(ErrorCodes::Unauthorized,
                    str::stream() << "Requested getMore on namespace " << ns << ", but cursor "
                                  << cursorid << " belongs to namespace " << cc->ns(),
                    ns == cc->ns());
            *isCursorAuthorized = true;

            // Restore the RecoveryUnit if we need to.
            if (txn->getClient()->isInDirectClient()) {
                if (cc->hasRecoveryUnit())
                    invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
            }
            else {
                if (!cc->hasRecoveryUnit()) {
                    // Start using a new RecoveryUnit
                    cc->setOwnedRecoveryUnit(
                        getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit());

                }
                // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
                ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
            }

            // Reset timeout timer on the cursor since the cursor is still in use.
            cc->setIdleTime(0);

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (0 == pass) { 
                cc->updateSlaveLocation(txn); 
            }

            if (cc->isAggCursor()) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            // If we're replaying the oplog, we save the last time that we read.
            Timestamp slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            PlanExecutor* exec = cc->getExecutor();
            const int queryOptions = cc->queryOptions();

            // Get results out of the executor.
            exec->restoreState(txn);

            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || bsonTimestamp == e.type()) {
                        slaveReadTill = e.timestamp();
                    }
                }

                if (enoughForGetMore(ntoreturn, numResults, bb.len())) {
                    break;
                }
            }

            if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
                // Propagate this error to caller.
                if (PlanExecutor::FAILURE == state) {
                    scoped_ptr<PlanStageStats> stats(exec->getStats());
                    error() << "Plan executor error, stats: "
                            << Explain::statsToBSON(*stats);
                    uasserted(17406, "getMore executor error: " +
                              WorkingSetCommon::toStatusString(obj));
                }

                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                //
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }

            const bool shouldSaveCursor =
                    shouldSaveCursorGetMore(state, exec, isCursorTailable(cc));

            // In order to deregister a cursor, we need to be holding the DB + collection lock and
            // if the cursor is aggregation, we release these locks.
            if (cc->isAggCursor()) {
                invariant(NULL == ctx.get());
                unpinDBLock.reset(new Lock::DBLock(txn->lockState(), nss.db(), MODE_IS));
                unpinCollLock.reset(new Lock::CollectionLock(txn->lockState(), nss.ns(), MODE_IS));
            }

            // Our two possible ClientCursorPin cleanup paths are:
            // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin.
            // 2) If the cursor is going to be saved, we simply let the pin go out of scope.  In
            //    this case, the pin's destructor will be invoked, which will call release() on the
            //    pin.  Because our ClientCursorPin is declared after our lock is declared, this
            //    will happen under the lock.
            if (!shouldSaveCursor) {
                ruSwapper.reset();
                ccPin.deleteUnderlying();

                // cc is now invalid, as is the executor
                cursorid = 0;
                cc = NULL;
                curop.debug().cursorExhausted = true;

                LOG(5) << "getMore NOT saving client cursor, ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                exec->saveState();
                LOG(5) << "getMore saving client cursor ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;

                if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                    if (!txn->getClient()->isInDirectClient()) {
                        // Don't stash the RU. Get a new one on the next getMore.
                        ruSwapper->dismiss();
                    }

                    if ((queryOptions & QueryOption_AwaitData)
                            && (numResults == 0)
                            && (pass < 1000)) {
                        // Bubble up to the AwaitData handling code in receivedGetMore which will
                        // try again.
                        return NULL;
                    }
                }

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult::View qr = bb.buf();
        qr.msgdata().setLen(bb.len());
        qr.msgdata().setOperation(opReply);
        qr.setResultFlags(resultFlags);
        qr.setCursorId(cursorid);
        qr.setStartingFrom(startingResult);
        qr.setNReturned(numResults);
        bb.decouple();
        LOG(5) << "getMore returned " << numResults << " results\n";
        return qr;
    }
Beispiel #18
0
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();
    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count());

    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time.  this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        //
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to process with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", int(fetcherMaxTimeMS.count()));
}
Beispiel #19
0
Status dropCollection(OperationContext* opCtx,
                      const NamespaceString& collectionName,
                      BSONObjBuilder& result,
                      const repl::OpTime& dropOpTime,
                      DropCollectionSystemCollectionMode systemCollectionMode) {
    if (!serverGlobalParams.quiet.load()) {
        log() << "CMD: drop " << collectionName;
    }

    return writeConflictRetry(opCtx, "drop", collectionName.ns(), [&] {
        AutoGetDb autoDb(opCtx, collectionName.db(), MODE_X);
        Database* const db = autoDb.getDb();
        Collection* coll = db ? db->getCollection(opCtx, collectionName) : nullptr;
        auto view =
            db && !coll ? db->getViewCatalog()->lookup(opCtx, collectionName.ns()) : nullptr;

        if (MONGO_FAIL_POINT(hangDuringDropCollection)) {
            log() << "hangDuringDropCollection fail point enabled. Blocking until fail point is "
                     "disabled.";
            MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangDuringDropCollection);
        }

        if (!db || (!coll && !view)) {
            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        const bool shardVersionCheck = true;
        OldClientContext context(opCtx, collectionName.ns(), shardVersionCheck);

        bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, collectionName);

        if (userInitiatedWritesAndNotPrimary) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping collection "
                                        << collectionName);
        }

        WriteUnitOfWork wunit(opCtx);
        if (!result.hasField("ns")) {
            result.append("ns", collectionName.ns());
        }

        if (coll) {
            invariant(!view);
            int numIndexes = coll->getIndexCatalog()->numIndexesTotal(opCtx);

            BackgroundOperation::assertNoBgOpInProgForNs(collectionName.ns());

            Status s = systemCollectionMode ==
                    DropCollectionSystemCollectionMode::kDisallowSystemCollectionDrops
                ? db->dropCollection(opCtx, collectionName.ns(), dropOpTime)
                : db->dropCollectionEvenIfSystem(opCtx, collectionName, dropOpTime);

            if (!s.isOK()) {
                return s;
            }

            result.append("nIndexesWas", numIndexes);
        } else {
            invariant(view);
            Status status = db->dropView(opCtx, collectionName.ns());
            if (!status.isOK()) {
                return status;
            }
        }
        wunit.commit();

        return Status::OK();
    });
}
Beispiel #20
0
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL, *stale = NULL;
        BSONObj oldest;

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                _currentSyncTarget = NULL;
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer) {
                _condvar.wait(lock);
            }
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }

        verify(r.conn() == NULL);

        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                r.resetConnection();
                theReplSet->veto(current);
                sleepsecs(1);
                continue;
            }

            if (isStale(r, oldest)) {
                r.resetConnection();
                theReplSet->veto(current, 600);
                stale = target;
                continue;
            }

            // if we made it here, the target is up and not stale
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }

            boost::unique_lock<boost::mutex> oplogLockSSF(theReplSet->syncSourceFeedback.oplock);
            theReplSet->syncSourceFeedback.connect(target);

            return;
        }

        // the only viable sync target was stale
        if (stale) {
            theReplSet->goStale(stale, oldest);
            sleepsecs(120);
        }

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
Beispiel #21
0
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    ApplyBatchFinalizer finalizer(replCoord);

    OperationContextImpl txn;
    OpTime originalEndOpTime(getMinValid(&txn).end);

    while (!inShutdown()) {
        OpQueue ops;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }
            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back();
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) &&  // tryPopAndWaitForMore returns
                                                                  // true when we need to end a
                                                                  // batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
            if (inShutdown())
                return;
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj lastOp = ops.back();
        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
Beispiel #22
0
void BackgroundSync::_produce(OperationContext* txn) {

    while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) {
        sleepmillis(0);
    }

    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (!_replCoord->isCatchingUp() &&
            (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary())) {
            return;
        }

        if (_inShutdown_inlock()) {
            return;
        }
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    SyncSourceResolverResponse syncSourceResp;
    SyncSourceResolver* syncSourceResolver;
    OpTime minValid;
    if (_replCoord->getMemberState().recovering()) {
        auto minValidSaved = StorageInterface::get(txn)->getMinValid(txn);
        if (minValidSaved > lastOpTimeFetched) {
            minValid = minValidSaved;
        }
    }
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
        _syncSourceResolver = stdx::make_unique<SyncSourceResolver>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            _replCoord,
            lastOpTimeFetched,
            minValid,
            [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; });
        syncSourceResolver = _syncSourceResolver.get();
    }
    // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls
    // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to
    // acquired before BackgroundSync's.
    // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because
    // we do not destroy this instance outside of this function.
    auto status = _syncSourceResolver->startup();
    if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) {
        return;
    }
    fassertStatusOK(40349, status);
    syncSourceResolver->join();
    syncSourceResolver = nullptr;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        _syncSourceResolver.reset();
    }

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        if (_replCoord->isCatchingUp()) {
            warning() << "Too stale to catch up.";
            log() << "Our newest OpTime : " << lastOpTimeFetched;
            log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
                  << " from " << syncSourceResp.getSyncSource();
            sleepsecs(1);
            return;
        }

        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode: " << status;
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        if (!_replCoord->isCatchingUp()) {
            _replCoord->signalUpstreamUpdater();
        }
    }

    // Set the applied point if unset. This is most likely the first time we've established a sync
    // source since stepping down or otherwise clearing the applied point. We need to set this here,
    // before the OplogWriter gets a chance to append to the oplog.
    if (StorageInterface::get(txn)->getAppliedThrough(txn).isNull()) {
        StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime());
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, _replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto executor = _replicationCoordinatorExternalState->getTaskExecutor();
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher = stdx::make_unique<OplogFetcher>(
            executor,
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString(rsOplogName),
            config,
            _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(),
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3),
            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << redact(fetcherReturnStatus);
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        if (_replCoord->isCatchingUp()) {
            warning() << "Rollback situation detected in catch-up mode; catch-up mode will end.";
            sleepsecs(1);
            return;
        }

        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kRollbackOplogSocketTimeout));
            };
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << redact(fetcherReturnStatus);

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        const auto minValid = StorageInterface::get(txn)->getMinValid(txn);
        if (lastApplied < minValid) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream() << "need to rollback, but in inconsistent state. "
                                                << "minvalid: "
                                                << minValid.toString()
                                                << " > our last optime: "
                                                << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher stopped querying remote oplog with error: "
                  << redact(fetcherReturnStatus);
    }
}
DocumentSource::GetNextResult Exchange::getNext(OperationContext* opCtx, size_t consumerId) {
    // Grab a lock.
    stdx::unique_lock<stdx::mutex> lk(_mutex);

    for (;;) {
        // Execute only in case we have not encountered an error.
        if (!_errorInLoadNextBatch.isOK()) {
            uasserted(ErrorCodes::ExchangePassthrough,
                      "Exchange failed due to an error on different thread.");
        }

        // Check if we have a document.
        if (!_consumers[consumerId]->isEmpty()) {
            auto doc = _consumers[consumerId]->getNext();
            unblockLoading(consumerId);

            return doc;
        }

        // There is not any document so try to load more from the source.
        if (_loadingThreadId == kInvalidThreadId) {
            LOG(3) << "A consumer " << consumerId << " begins loading";

            try {
                // This consumer won the race and will fill the buffers.
                _loadingThreadId = consumerId;

                _pipeline->reattachToOperationContext(opCtx);

                // This will return when some exchange buffer is full and we cannot make any forward
                // progress anymore.
                // The return value is an index of a full consumer buffer.
                size_t fullConsumerId = loadNextBatch();

                if (MONGO_FAIL_POINT(exchangeFailLoadNextBatch)) {
                    log() << "exchangeFailLoadNextBatch fail point enabled.";
                    uasserted(ErrorCodes::FailPointEnabled,
                              "Asserting on loading the next batch due to failpoint.");
                }

                _pipeline->detachFromOperationContext();

                // The loading cannot continue until the consumer with the full buffer consumes some
                // documents.
                _loadingThreadId = fullConsumerId;

                // Wake up everybody and try to make some progress.
                _haveBufferSpace.notify_all();
            } catch (const DBException& ex) {
                _errorInLoadNextBatch = ex.toStatus();

                // We have to wake up all other blocked threads so they can detect the error and
                // fail too. They can be woken up only after _errorInLoadNextBatch has been set.
                _haveBufferSpace.notify_all();

                throw;
            }
        } else {
            // Some other consumer is already loading the buffers. There is nothing else we can do
            // but wait.
            _haveBufferSpace.wait(lk);
        }
    }
}
Beispiel #24
0
    bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
        bool ok = true;

        DbMessage d(m);

        const char *ns = d.getns();
        int ntoreturn = d.pullInt();
        long long cursorid = d.pullInt64();

        curop.debug().ns = ns;
        curop.debug().ntoreturn = ntoreturn;
        curop.debug().cursorid = cursorid;

        shared_ptr<AssertionException> ex;
        scoped_ptr<Timer> timer;
        int pass = 0;
        bool exhaust = false;
        QueryResult* msgdata = 0;
        OpTime last;
        while( 1 ) {
            bool isCursorAuthorized = false;
            try {
                const NamespaceString nsString( ns );
                uassert( 16258, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid() );

                Status status = cc().getAuthorizationManager()->checkAuthForGetMore(ns);
                uassert(16543, status.reason(), status.isOK());

                if (str::startsWith(ns, "local.oplog.")){
                    while (MONGO_FAIL_POINT(rsStopGetMore)) {
                        sleepmillis(0);
                    }

                    if (pass == 0) {
                        mutex::scoped_lock lk(OpTime::m);
                        last = OpTime::getLast(lk);
                    }
                    else {
                        last.waitForDifferent(1000/*ms*/);
                    }
                }

                msgdata = processGetMore(ns,
                                         ntoreturn,
                                         cursorid,
                                         curop,
                                         pass,
                                         exhaust,
                                         &isCursorAuthorized);
            }
            catch ( AssertionException& e ) {
                if ( isCursorAuthorized ) {
                    // If a cursor with id 'cursorid' was authorized, it may have been advanced
                    // before an exception terminated processGetMore.  Erase the ClientCursor
                    // because it may now be out of sync with the client's iteration state.
                    // SERVER-7952
                    // TODO Temporary code, see SERVER-4563 for a cleanup overview.
                    ClientCursor::erase( cursorid );
                }
                ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
                ok = false;
                break;
            }
            
            if (msgdata == 0) {
                // this should only happen with QueryOption_AwaitData
                exhaust = false;
                massert(13073, "shutting down", !inShutdown() );
                if ( ! timer ) {
                    timer.reset( new Timer() );
                }
                else {
                    if ( timer->seconds() >= 4 ) {
                        // after about 4 seconds, return. pass stops at 1000 normally.
                        // we want to return occasionally so slave can checkpoint.
                        pass = 10000;
                    }
                }
                pass++;
                if (debug)
                    sleepmillis(20);
                else
                    sleepmillis(2);
                
                // note: the 1100 is beacuse of the waitForDifferent above
                // should eventually clean this up a bit
                curop.setExpectedLatencyMs( 1100 + timer->millis() );
                
                continue;
            }
            break;
        };

        if (ex) {
            exhaust = false;

            BSONObjBuilder err;
            ex->getInfo().append( err );
            BSONObj errObj = err.done();

            log() << errObj << endl;

            curop.debug().exceptionInfo = ex->getInfo();

            if (ex->getCode() == 13436) {
                replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj);
                curop.debug().responseLength = dbresponse.response->header()->dataLen();
                curop.debug().nreturned = 1;
                return ok;
            }

            msgdata = emptyMoreResult(cursorid);
        }

        Message *resp = new Message();
        resp->setData(msgdata, true);
        curop.debug().responseLength = resp->header()->dataLen();
        curop.debug().nreturned = msgdata->nReturned;

        dbresponse.response = resp;
        dbresponse.responseTo = m.header()->id;
        
        if( exhaust ) {
            curop.debug().exhaust = true;
            dbresponse.exhaustNS = ns;
        }

        return ok;
    }
Beispiel #25
0
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);
        // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();
    // no more references to oplog reader from here on.

    // If this status is not OK after the fetcher returns from wait(),
    // proceed to execute rollback
    Status remoteOplogStartStatus = Status::OK();

    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      &remoteOplogStartStatus);

    auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter"
                              << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))
                              << "tailable" << true << "oplogReplay" << true << "awaitData" << true
                              << "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    cmdObj,
                    fetcherCallback,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Execute rollback if necessary.
    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    if (!remoteOplogStartStatus.isOK()) {
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
                if (!connection.get()) {
                    connection.reset(new ConnectionPool::ConnectionPtr(
                        &connectionPool, source, Date_t::now(), oplogSocketTimeout));
                };
                return connection->get();
            };

        log() << "starting rollback: " << remoteOplogStartStatus;
        _rollback(txn, source, getConnection);
        stop();
    }
}
// Theory of operation for waitForConditionOrInterruptNoAssertUntil and markKilled:
//
// An operation indicates to potential killers that it is waiting on a condition variable by setting
// _waitMutex and _waitCV, while holding the lock on its parent Client. It then unlocks its Client,
// unblocking any killers, which are required to have locked the Client before calling markKilled.
//
// When _waitMutex and _waitCV are set, killers must lock _waitMutex before setting the _killCode,
// and must signal _waitCV before releasing _waitMutex. Unfortunately, they must lock _waitMutex
// without holding a lock on Client to avoid a deadlock with callers of
// waitForConditionOrInterruptNoAssertUntil(). So, in the event that _waitMutex is set, the killer
// increments _numKillers, drops the Client lock, acquires _waitMutex and then re-acquires the
// Client lock. We know that the Client, its OperationContext and _waitMutex will remain valid
// during this period because the caller of waitForConditionOrInterruptNoAssertUntil will not return
// while _numKillers > 0 and will not return until it has itself reacquired _waitMutex. Instead,
// that caller will keep waiting on _waitCV until _numKillers drops to 0.
//
// In essence, when _waitMutex is set, _killCode is guarded by _waitMutex and _waitCV, but when
// _waitMutex is not set, it is guarded by the Client spinlock. Changing _waitMutex is itself
// guarded by the Client spinlock and _numKillers.
//
// When _numKillers does drop to 0, the waiter will null out _waitMutex and _waitCV.
//
// This implementation adds a minimum of two spinlock acquire-release pairs to every condition
// variable wait.
StatusWith<stdx::cv_status> OperationContext::waitForConditionOrInterruptNoAssertUntil(
    stdx::condition_variable& cv, stdx::unique_lock<stdx::mutex>& m, Date_t deadline) noexcept {
    invariant(getClient());
    {
        stdx::lock_guard<Client> clientLock(*getClient());
        invariant(!_waitMutex);
        invariant(!_waitCV);
        invariant(0 == _numKillers);

        // This interrupt check must be done while holding the client lock, so as not to race with a
        // concurrent caller of markKilled.
        auto status = checkForInterruptNoAssert();
        if (!status.isOK()) {
            return status;
        }
        _waitMutex = m.mutex();
        _waitCV = &cv;
    }

    // If the maxTimeNeverTimeOut failpoint is set, behave as though the operation's deadline does
    // not exist. Under normal circumstances, if the op has an existing deadline which is sooner
    // than the deadline passed into this method, we replace our deadline with the op's. This means
    // that we expect to time out at the same time as the existing deadline expires. If, when we
    // time out, we find that the op's deadline has not expired (as will always be the case if
    // maxTimeNeverTimeOut is set) then we assume that the incongruity is due to a clock mismatch
    // and return _timeoutError regardless. To prevent this behaviour, only consider the op's
    // deadline in the event that the maxTimeNeverTimeOut failpoint is not set.
    bool opHasDeadline = (hasDeadline() && !MONGO_FAIL_POINT(maxTimeNeverTimeOut));

    if (opHasDeadline) {
        deadline = std::min(deadline, getDeadline());
    }

    const auto waitStatus = [&] {
        if (Date_t::max() == deadline) {
            Waitable::wait(_baton.get(), getServiceContext()->getPreciseClockSource(), cv, m);
            return stdx::cv_status::no_timeout;
        }
        return getServiceContext()->getPreciseClockSource()->waitForConditionUntil(
            cv, m, deadline, _baton.get());
    }();

    // Continue waiting on cv until no other thread is attempting to kill this one.
    Waitable::wait(_baton.get(), getServiceContext()->getPreciseClockSource(), cv, m, [this] {
        stdx::lock_guard<Client> clientLock(*getClient());
        if (0 == _numKillers) {
            _waitMutex = nullptr;
            _waitCV = nullptr;
            return true;
        }
        return false;
    });

    auto status = checkForInterruptNoAssert();
    if (!status.isOK()) {
        return status;
    }
    if (opHasDeadline && waitStatus == stdx::cv_status::timeout && deadline == getDeadline()) {
        // It's possible that the system clock used in stdx::condition_variable::wait_until
        // is slightly ahead of the FastClock used in checkForInterrupt. In this case,
        // we treat the operation as though it has exceeded its time limit, just as if the
        // FastClock and system clock had agreed.
        if (!_hasArtificialDeadline) {
            markKilled(_timeoutError);
        }
        return Status(_timeoutError, "operation exceeded time limit");
    }

    return waitStatus;
}
Beispiel #27
0
    /**
     * Called by db/instance.cpp.  This is the getMore entry point.
     *
     * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls 
     *        when this method returns an empty result, incrementing pass on each call.  
     *        Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
     */
    QueryResult::View newGetMore(OperationContext* txn,
                            const char* ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust,
                            bool* isCursorAuthorized,
                            bool fromDBDirectClient) {

        // For testing, we may want to fail if we receive a getmore.
        if (MONGO_FAIL_POINT(failReceivedGetmore)) {
            invariant(0);
        }

        exhaust = false;

        // This is a read lock.
        const NamespaceString nss(ns);
        scoped_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, nss));
        Collection* collection = ctx->getCollection();
        uassert( 17356, "collection dropped between getMore calls", collection );

        QLOG() << "Running getMore, cursorid: " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                true);
        uassertStatusOK(status);

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(collection, cursorid);
        ClientCursor* cc = ccPin.c();

        // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
        // used to create the execution machinery inside the cursor with our OperationContext.
        // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
        // that further calls to getMore won't fail, and that the provided OperationContext
        // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
        //
        // This must be destroyed before the ClientCursor is destroyed.
        std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        const int InitialBufSize =
            512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(InitialBufSize);
        bb.skip(sizeof(QueryResult::Value));

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // Restore the RecoveryUnit if we need to.
            if (fromDBDirectClient) {
                if (cc->hasRecoveryUnit())
                    invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
            }
            else {
                if (!cc->hasRecoveryUnit()) {
                    // Start using a new RecoveryUnit
                    cc->setOwnedRecoveryUnit(
                        getGlobalEnvironment()->getGlobalStorageEngine()->newRecoveryUnit(txn));

                }
                // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
                ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
            }

            // Reset timeout timer on the cursor since the cursor is still in use.
            cc->setIdleTime(0);

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (0 == pass) { 
                cc->updateSlaveLocation(txn, curop); 
            }

            if (cc->isAggCursor) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            PlanExecutor* exec = cc->getExecutor();
            const int queryOptions = cc->queryOptions();

            // Get results out of the executor.
            exec->restoreState(txn);

            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            // We save the client cursor when there might be more results, and hence we may receive
            // another getmore. If we receive a EOF or an error, or 'exec' is dead, then we know
            // that we will not be producing more results. We indicate that the cursor is closed by
            // sending a cursorId of 0 back to the client.
            //
            // On the other hand, if we retrieve all results necessary for this batch, then
            // 'saveClientCursor' is true and we send a valid cursorId back to the client. In
            // this case, there may or may not actually be more results (for example, the next call
            // to getNext(...) might just return EOF).
            bool saveClientCursor = false;

            if (PlanExecutor::DEAD == state || PlanExecutor::EXEC_ERROR == state) {
                // Propagate this error to caller.
                if (PlanExecutor::EXEC_ERROR == state) {
                    scoped_ptr<PlanStageStats> stats(exec->getStats());
                    error() << "Plan executor error, stats: "
                            << Explain::statsToBSON(*stats);
                    uasserted(17406, "getMore executor error: " +
                              WorkingSetCommon::toStatusString(obj));
                }

                // If we're dead there's no way to get more results.
                saveClientCursor = false;

                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                //
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }
            else if (PlanExecutor::IS_EOF == state) {
                // EOF is also end of the line unless it's tailable.
                saveClientCursor = queryOptions & QueryOption_CursorTailable;
            }
            else {
                verify(PlanExecutor::ADVANCED == state);
                saveClientCursor = true;
            }

            if (!saveClientCursor) {
                ruSwapper.reset();
                ccPin.deleteUnderlying();
                // cc is now invalid, as is the executor
                cursorid = 0;
                cc = NULL;
                QLOG() << "getMore NOT saving client cursor, ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                exec->saveState();
                QLOG() << "getMore saving client cursor ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;

                if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                    if (!fromDBDirectClient) {
                        // Don't stash the RU. Get a new one on the next getMore.
                        ruSwapper.reset();
                        delete cc->releaseOwnedRecoveryUnit();
                    }

                    if ((queryOptions & QueryOption_AwaitData)
                            && (numResults == 0)
                            && (pass < 1000)) {
                        // Bubble up to the AwaitData handling code in receivedGetMore which will
                        // try again.
                        return NULL;
                    }
                }

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult::View qr = bb.buf();
        qr.msgdata().setLen(bb.len());
        qr.msgdata().setOperation(opReply);
        qr.setResultFlags(resultFlags);
        qr.setCursorId(cursorid);
        qr.setStartingFrom(startingResult);
        qr.setNReturned(numResults);
        bb.decouple();
        QLOG() << "getMore returned " << numResults << " results\n";
        return qr;
    }
Beispiel #28
0
    /* tail an oplog.  ok to return, will be re-called. */
    void SyncTail::oplogApplication() {
        while( 1 ) {
            OpQueue ops;

            verify( !Lock::isLocked() );

            Timer batchTimer;
            int lastTimeChecked = 0;

            do {
                if (theReplSet->isPrimary()) {
                    massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                    return;
                }

                int now = batchTimer.seconds();

                // apply replication batch limits
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
                // occasionally check some things
                // (always checked in the first iteration of this do-while loop, because
                // ops is empty)
                if (ops.empty() || now > lastTimeChecked) {
                    {
                        boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
                        if (theReplSet->initialSyncRequested) {
                            // got a resync command
                            return;
                        }
                    }
                    lastTimeChecked = now;
                    // can we become secondary?
                    // we have to check this before calling mgr, as we must be a secondary to
                    // become primary
                    if (!theReplSet->isSecondary()) {
                        OpTime minvalid;

                        OperationContextImpl txn;
                        theReplSet->tryToGoLiveAsASecondary(&txn, minvalid);
                    }

                    // normally msgCheckNewState gets called periodically, but in a single node
                    // replset there are no heartbeat threads, so we do it here to be sure.  this is
                    // relevant if the singleton member has done a stepDown() and needs to come back
                    // up.
                    if (theReplSet->config().members.size() == 1 &&
                        theReplSet->myConfig().potentiallyHot()) {
                        Manager* mgr = theReplSet->mgr;
                        // When would mgr be null?  During replsettest'ing, in which case we should
                        // fall through and actually apply ops as if we were a real secondary.
                        if (mgr) { 
                            mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                            sleepsecs(1);
                            // There should never be ops to sync in a 1-member set, anyway
                            return;
                        }
                    }
                }

                const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
                if (!ops.empty() && slaveDelaySecs > 0) {
                    const BSONObj& lastOp = ops.getDeque().back();
                    const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                    // Stop the batch as the lastOp is too new to be applied. If we continue
                    // on, we can get ops that are way ahead of the delay and this will
                    // make this thread sleep longer when handleSlaveDelay is called
                    // and apply ops much sooner than we like.
                    if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                        break;
                    }
                }
                // keep fetching more ops as long as we haven't filled up a full batch yet
            } while (!tryPopAndWaitForMore(&ops) && // tryPopAndWaitForMore returns true 
                                                    // when we need to end a batch early
                   (ops.getSize() < replBatchLimitBytes));

            // For pausing replication in tests
            while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                sleepmillis(0);
            }

            const BSONObj& lastOp = ops.getDeque().back();
            setOplogVersion(lastOp);
            handleSlaveDelay(lastOp);

            // Set minValid to the last op to be applied in this next batch.
            // This will cause this node to go into RECOVERING state
            // if we should crash and restart before updating the oplog
            theReplSet->setMinValid(lastOp);

            if (BackgroundSync::get()->isAssumingPrimary()) {
                LOG(1) << "about to apply batch up to optime: "
                       << ops.getDeque().back()["ts"]._opTime().toStringPretty();
            }
            
            multiApply(ops.getDeque(), multiSyncApply);

            if (BackgroundSync::get()->isAssumingPrimary()) {
                LOG(1) << "about to update oplog to optime: "
                       << ops.getDeque().back()["ts"]._opTime().toStringPretty();
            }
            
            applyOpsToOplog(&ops.getDeque());

            // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
            if (!theReplSet->mgr) {
                BSONObj op;
                if (!peek(&op)) {
                    return;
                }
            }
        }
    }
Beispiel #29
0
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r(false /* doHandshake */);

        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }

            r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
        }

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            while (!inShutdown()) {

                if (!r.moreInCurrentBatch()) {
                    int bs = r.currentBatchMessageSize();
                    if( bs > 0 && bs < BatchIsSmallish ) {
                        // on a very low latency network, if we don't wait a little, we'll be 
                        // getting ops to write almost one at a time.  this will both be expensive
                        // for the upstream server as well as postentiallyd efating our parallel 
                        // application of batches on the secondary.
                        //
                        // the inference here is basically if the batch is really small, we are 
                        // "caught up".
                        //
                        dassert( !Lock::isLocked() );
                        sleepmillis(SleepToAllowBatchingMillis);
                    }
  
                    if (theReplSet->gotForceSync()) {
                        return;
                    }

                    if (isAssumingPrimary() || theReplSet->isPrimary()) {
                        return;
                    }

                    // re-evaluate quality of sync target
                    if (shouldChangeSyncTarget()) {
                        return;
                    }
                    //record time for each getmore
                    {
                        TimerHolder batchTimer(&getmoreReplStats);
                        r.more();
                    }
                    //increment
                    networkByteStats.increment(r.currentBatchMessageSize());

                }

                if (!r.more())
                    break;

                BSONObj o = r.nextSafe().getOwned();
                opsReadStats.increment();

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    _appliedBuffer = false;
                }

                OCCASIONALLY {
                    LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
                }
                // the blocking queue will wait (forever) until there's room for us to push
                _buffer.push(o);
                bufferCountGauge.increment();
                bufferSizeGauge.increment(getSize(o));

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    _lastH = o["h"].numberLong();
                    _lastOpTimeFetched = o["ts"]._opTime();
                }
            } // end while

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                    return;
                }
            }


            r.tailCheck();
            if( !r.haveCursor() ) {
                LOG(1) << "replSet end syncTail pass" << rsLog;
                return;
            }

            // looping back is ok because this is a tailable cursor
        }
    }
Beispiel #30
0
    QueryResult* processGetMore(const char* ns,
                                int ntoreturn,
                                long long cursorid,
                                CurOp& curop,
                                int pass,
                                bool& exhaust,
                                bool* isCursorAuthorized ) {

        bool hasRunner = false;

        // Scoped to kill the pin after seeing if the runner's there.
        {
            // See if there's a runner.  We do this until agg. is behind a Runner instead of a CC.
            ClientCursorPin p(cursorid);
            ClientCursor *cc = p.c();
            if (NULL != cc && NULL != cc->getRunner()) {
                hasRunner = true;
            }
        }

        if (hasRunner) {
            return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                              isCursorAuthorized);
        }

        exhaust = false;

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));
        // call this readlocked so state can't change
        replVerifyReadsOk();

        ClientCursorPin p(cursorid);
        ClientCursor *cc = p.c();

        if ( unlikely(!cc) ) {
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Some internal users create a ClientCursor with a Runner.  Don't crash if this
            // happens.  Instead, hand them off to the new framework.
            if (NULL != cc->getRunner()) {
                p.release();
                return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, isCursorAuthorized);
            }

            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

            *isCursorAuthorized = true;

            // This must be done after auth check to ensure proper cleanup.
            uassert(16951, "failing getmore due to set failpoint",
                    !MONGO_FAIL_POINT(getMoreError));

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros( cc->getLeftoverMaxTimeMicros() );
            killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if ( pass == 0 )
                cc->updateSlaveLocation( curop );

            int queryOptions = cc->queryOptions();
            
            curop.debug().query = cc->query();
            curop.setQuery( cc->query() );

            start = cc->pos();
            Cursor *c = cc->c();

            if (!c->requiresLock()) {
                // make sure it won't be destroyed under us
                fassert(16952, !c->shouldDestroyOnNSDeletion());
                fassert(16953, !c->supportYields());
                ctx.reset(); // unlocks
            }

            c->recoverFromYield();
            DiskLoc last;

            // This metadata may be stale, but it's the state of chunking when the cursor was
            // created.
            CollectionMetadataPtr metadata = cc->getCollMetadata();
            KeyPattern keyPattern( metadata ? metadata->getKeyPattern() : BSONObj() );

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        // when a tailable cursor hits "EOF", ok() goes false, and current() is
                        // null.  however advance() can still be retries as a reactivation attempt.
                        // when there is new data, it will return true.  that's what we are doing
                        // here.
                        if ( c->advance() )
                            continue;

                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    p.release();
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }

                MatchDetails details;
                if ( cc->fields && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                    // field projection specified, and contains an array operator
                    details.requestElemMatchKey();
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches( &details ) ) {
                }
                else if ( metadata && !metadata->keyBelongsToMe( extractKey(c, keyPattern ) ) ) {
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current()
                               << endl;
                }
                else {
                    if( c->getsetdup(c->currLoc()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        last = c->currLoc();
                        n++;

                        // Fill out the fields requested by the query.
                        const Projection::KeyOnly *keyFieldsOnly = c->keyFieldsOnly();
                        if ( keyFieldsOnly ) {
                            fillQueryResultFromObj( b, 0, keyFieldsOnly->hydrate(
                            c->currKey() ), &details );
                        }
                        else {
                            DiskLoc loc = c->currLoc();
                            fillQueryResultFromObj( b, cc->fields.get(), c->current(), &details,
                                    ( ( cc->pq.get() && cc->pq->showDiskLoc() ) ? &loc : 0 ) );
                        }

                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            cc->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();

                if ( ! cc->yieldSometimes( ( c->ok() && c->keyFieldsOnly() ) ?
                                          ClientCursor::DontNeed : ClientCursor::WillNeed ) ) {
                    ClientCursor::erase(cursorid);
                    cursorid = 0;
                    cc = 0;
                    break;
                }
            }
            
            if ( cc ) {
                if ( c->supportYields() ) {
                    ClientCursor::YieldData data;
                    verify( cc->prepareToYield( data ) );
                }
                else {
                    cc->c()->noteLocation();
                }
                cc->storeOpForSlave( last );
                exhaust = cc->queryOptions() & QueryOption_Exhaust;

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }