Exemplo n.º 1
0
Status NetworkInterfaceASIO::startCommand(const TaskExecutor::CallbackHandle& cbHandle,
                                          RemoteCommandRequest& request,
                                          const RemoteCommandCompletionFn& onFinish) {
    MONGO_ASIO_INVARIANT(onFinish, "Invalid completion function");
    {
        stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);
        const auto insertResult = _inGetConnection.emplace(cbHandle);
        // We should never see the same CallbackHandle added twice
        MONGO_ASIO_INVARIANT_INLOCK(insertResult.second, "Same CallbackHandle added twice");
    }

    if (inShutdown()) {
        return {ErrorCodes::ShutdownInProgress, "NetworkInterfaceASIO shutdown in progress"};
    }

    LOG(2) << "startCommand: " << redact(request.toString());

    auto getConnectionStartTime = now();

    auto statusMetadata = attachMetadataIfNeeded(request, _metadataHook.get());
    if (!statusMetadata.isOK()) {
        return statusMetadata;
    }

    auto nextStep = [this, getConnectionStartTime, cbHandle, request, onFinish](
        StatusWith<ConnectionPool::ConnectionHandle> swConn) {

        if (!swConn.isOK()) {
            LOG(2) << "Failed to get connection from pool for request " << request.id << ": "
                   << swConn.getStatus();

            bool wasPreviouslyCanceled = false;
            {
                stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);
                wasPreviouslyCanceled = _inGetConnection.erase(cbHandle) == 0;
            }

            Status status = wasPreviouslyCanceled
                ? Status(ErrorCodes::CallbackCanceled, "Callback canceled")
                : swConn.getStatus();
            if (status.code() == ErrorCodes::NetworkInterfaceExceededTimeLimit) {
                status = Status(ErrorCodes::ExceededTimeLimit, status.reason());
            }
            if (status.code() == ErrorCodes::ExceededTimeLimit) {
                _numTimedOutOps.fetchAndAdd(1);
            }
            if (status.code() != ErrorCodes::CallbackCanceled) {
                _numFailedOps.fetchAndAdd(1);
            }

            onFinish({status, now() - getConnectionStartTime});
            signalWorkAvailable();
            return;
        }

        auto conn = static_cast<connection_pool_asio::ASIOConnection*>(swConn.getValue().get());

        AsyncOp* op = nullptr;

        stdx::unique_lock<stdx::mutex> lk(_inProgressMutex);

        const auto eraseCount = _inGetConnection.erase(cbHandle);

        // If we didn't find the request, we've been canceled
        if (eraseCount == 0) {
            lk.unlock();

            onFinish({ErrorCodes::CallbackCanceled,
                      "Callback canceled",
                      now() - getConnectionStartTime});

            // Though we were canceled, we know that the stream is fine, so indicate success.
            conn->indicateSuccess();

            signalWorkAvailable();

            return;
        }

        // We can't release the AsyncOp until we know we were not canceled.
        auto ownedOp = conn->releaseAsyncOp();
        op = ownedOp.get();

        // This AsyncOp may be recycled. We expect timeout and canceled to be clean.
        // If this op was most recently used to connect, its state transitions won't have been
        // reset, so we do that here.
        MONGO_ASIO_INVARIANT_INLOCK(!op->canceled(), "AsyncOp has dirty canceled flag", op);
        MONGO_ASIO_INVARIANT_INLOCK(!op->timedOut(), "AsyncOp has dirty timeout flag", op);
        op->clearStateTransitions();

        // Now that we're inProgress, an external cancel can touch our op, but
        // not until we release the inProgressMutex.
        _inProgress.emplace(op, std::move(ownedOp));

        op->_cbHandle = std::move(cbHandle);
        op->_request = std::move(request);
        op->_onFinish = std::move(onFinish);
        op->_connectionPoolHandle = std::move(swConn.getValue());
        op->startProgress(getConnectionStartTime);

        // This ditches the lock and gets us onto the strand (so we're
        // threadsafe)
        op->_strand.post([this, op, getConnectionStartTime] {
            const auto timeout = op->_request.timeout;

            // Set timeout now that we have the correct request object
            if (timeout != RemoteCommandRequest::kNoTimeout) {
                // Subtract the time it took to get the connection from the pool from the request
                // timeout.
                auto getConnectionDuration = now() - getConnectionStartTime;
                if (getConnectionDuration >= timeout) {
                    // We only assume that the request timer is guaranteed to fire *after* the
                    // timeout duration - but make no stronger assumption. It is thus possible that
                    // we have already exceeded the timeout. In this case we timeout the operation
                    // manually.
                    std::stringstream msg;
                    msg << "Remote command timed out while waiting to get a connection from the "
                        << "pool, took " << getConnectionDuration << ", timeout was set to "
                        << timeout;
                    auto rs = ResponseStatus(ErrorCodes::NetworkInterfaceExceededTimeLimit,
                                             msg.str(),
                                             getConnectionDuration);
                    return _completeOperation(op, rs);
                }

                // The above conditional guarantees that the adjusted timeout will never underflow.
                MONGO_ASIO_INVARIANT(timeout > getConnectionDuration, "timeout underflowed", op);
                const auto adjustedTimeout = timeout - getConnectionDuration;
                const auto requestId = op->_request.id;

                try {
                    op->_timeoutAlarm =
                        op->_owner->_timerFactory->make(&op->_strand, adjustedTimeout);
                } catch (std::system_error& e) {
                    severe() << "Failed to construct timer for AsyncOp: " << e.what();
                    fassertFailed(40334);
                }

                std::shared_ptr<AsyncOp::AccessControl> access;
                std::size_t generation;
                {
                    stdx::lock_guard<stdx::mutex> lk(op->_access->mutex);
                    access = op->_access;
                    generation = access->id;
                }

                op->_timeoutAlarm->asyncWait(
                    [this, op, access, generation, requestId, adjustedTimeout](std::error_code ec) {
                        // We must pass a check for safe access before using op inside the
                        // callback or we may attempt access on an invalid pointer.
                        stdx::lock_guard<stdx::mutex> lk(access->mutex);
                        if (generation != access->id) {
                            // The operation has been cleaned up, do not access.
                            return;
                        }

                        if (!ec) {
                            LOG(2) << "Request " << requestId << " timed out"
                                   << ", adjusted timeout after getting connection from pool was "
                                   << adjustedTimeout << ", op was " << redact(op->toString());

                            op->timeOut_inlock();
                        } else {
                            LOG(2) << "Failed to time request " << requestId
                                   << "out: " << ec.message() << ", op was "
                                   << redact(op->toString());
                        }
                    });
            }

            _beginCommunication(op);
        });
    };

    _connectionPool.get(request.target, request.timeout, nextStep);
    return Status::OK();
}
// NOTE: This method may only be called by ASIO threads
// (do not call from methods entered by TaskExecutor threads)
void NetworkInterfaceASIO::_completeOperation(AsyncOp* op, ResponseStatus resp) {
    auto metadata = op->getResponseMetadata();
    if (!metadata.isEmpty()) {
        resp.metadata = metadata;
    }

    // Cancel this operation's timeout. Note that the timeout callback may already be running,
    // may have run, or may have already been scheduled to run in the near future.
    if (op->_timeoutAlarm) {
        op->_timeoutAlarm->cancel();
    }

    if (resp.status.code() == ErrorCodes::ExceededTimeLimit) {
        _numTimedOutOps.fetchAndAdd(1);
    }

    if (op->_inSetup) {
        // If we are in setup we should only be here if we failed to connect.
        MONGO_ASIO_INVARIANT(!resp.isOK(), "Failed to connect in setup", op);
        // If we fail during connection, we won't be able to access any of op's members after
        // calling finish(), so we return here.
        log() << "Failed to connect to " << op->request().target << " - " << resp.status;
        _numFailedOps.fetchAndAdd(1);
        op->finish(resp);
        return;
    }

    if (op->_inRefresh) {
        // If we are in refresh we should only be here if we failed to heartbeat.
        MONGO_ASIO_INVARIANT(!resp.isOK(), "In refresh, but did not fail to heartbeat", op);
        // If we fail during heartbeating, we won't be able to access any of op's members after
        // calling finish(), so we return here.
        log() << "Failed asio heartbeat to " << op->request().target << " - "
              << redact(resp.status);
        _numFailedOps.fetchAndAdd(1);
        op->finish(resp);
        return;
    }

    if (!resp.isOK()) {
        // In the case that resp is not OK, but _inSetup is false, we are using a connection
        // that
        // we got from the pool to execute a command, but it failed for some reason.
        LOG(2) << "Failed to execute command: " << redact(op->request().toString())
               << " reason: " << redact(resp.status);

        if (resp.status.code() != ErrorCodes::CallbackCanceled) {
            _numFailedOps.fetchAndAdd(1);
        }
    } else {
        _numSucceededOps.fetchAndAdd(1);
    }

    std::unique_ptr<AsyncOp> ownedOp;

    {
        stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);

        auto iter = _inProgress.find(op);

        MONGO_ASIO_INVARIANT_INLOCK(
            iter != _inProgress.end(), "Could not find AsyncOp in _inProgress", op);

        ownedOp = std::move(iter->second);
        _inProgress.erase(iter);
    }

    op->finish(resp);

    MONGO_ASIO_INVARIANT(static_cast<bool>(ownedOp), "Invalid AsyncOp", op);

    auto conn = std::move(op->_connectionPoolHandle);
    auto asioConn = static_cast<connection_pool_asio::ASIOConnection*>(conn.get());

    // Prevent any other threads or callbacks from accessing this op so we may safely complete
    // and destroy it. It is key that we do this after we remove the op from the _inProgress map
    // or someone else in cancelCommand could read the bumped generation and cancel the next
    // command that uses this op. See SERVER-20556.
    {
        stdx::lock_guard<stdx::mutex> lk(op->_access->mutex);
        ++(op->_access->id);
    }

    // We need to bump the generation BEFORE we call reset() or we could flip the timeout in the
    // timeout callback before returning the AsyncOp to the pool.
    ownedOp->reset();

    asioConn->bindAsyncOp(std::move(ownedOp));
    if (!resp.isOK()) {
        asioConn->indicateFailure(resp.status);
    } else {
        asioConn->indicateUsed();
        asioConn->indicateSuccess();
    }

    signalWorkAvailable();
}