Code Example #1
void AsyncRequestsSender::_scheduleRequests() {
    invariant(!_stopRetrying);
    // Schedule remote work on hosts for which we have not sent a request or need to retry.
    for (size_t i = 0; i < _remotes.size(); ++i) {
        auto& remote = _remotes[i];

        // First check if the remote had a retriable error, and if so, clear its response field so
        // it will be retried.
        if (remote.swResponse && !remote.done) {
            // We check the response status, the command status, and the write concern status
            // for a retriable error.
            Status status = remote.swResponse->getStatus();
            if (status.isOK()) {
                status = getStatusFromCommandResult(remote.swResponse->getValue().data);
            }

            if (status.isOK()) {
                status = getWriteConcernStatusFromCommandResult(remote.swResponse->getValue().data);
            }

            if (!status.isOK()) {
                // There was an error with either the response or the command.
                auto shard = remote.getShard();
                if (!shard) {
                    remote.swResponse =
                        Status(ErrorCodes::ShardNotFound,
                               str::stream() << "Could not find shard " << remote.shardId);
                } else {
                    if (remote.shardHostAndPort) {
                        shard->updateReplSetMonitor(*remote.shardHostAndPort, status);
                    }
                    if (shard->isRetriableError(status.code(), _retryPolicy) &&
                        remote.retryCount < kMaxNumFailedHostRetryAttempts) {
                        LOG(1) << "Command to remote " << remote.shardId << " at host "
                               << *remote.shardHostAndPort
                               << " failed with retriable error and will be retried "
                               << causedBy(redact(status));
                        ++remote.retryCount;
                        remote.swResponse.reset();
                    }
                }
            }
        }

        // If the remote does not have a response or pending request, schedule remote work for it.
        if (!remote.swResponse && !remote.cbHandle.isValid()) {
            auto scheduleStatus = _scheduleRequest(i);
            if (!scheduleStatus.isOK()) {
                remote.swResponse = std::move(scheduleStatus);

                // Push a noop response to the queue to indicate that a remote is ready for
                // re-processing due to failure.
                _responseQueue.producer.push(boost::none);
            }
        }
    }
}
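
The loop above drives a scatter-gather retry scheme: a remote whose last response carries a retriable error has its response slot cleared and its retry count bumped, and any remote with an empty slot and no in-flight request is then (re)scheduled. Below is a minimal, self-contained sketch of that pattern; the Status, Remote, isRetriable, and kMaxRetries names are illustrative stand-ins, not the real AsyncRequestsSender types.

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Hypothetical stand-ins for illustration only; not the real MongoDB types.
struct Status {
    bool ok;
    std::string reason;
};

constexpr int kMaxRetries = 3;  // plays the role of kMaxNumFailedHostRetryAttempts

// Assume, for this sketch, that unreachable-host errors are the retriable class.
bool isRetriable(const Status& s) {
    return !s.ok && s.reason == "HostUnreachable";
}

struct Remote {
    std::optional<Status> response;  // engaged once an attempt has completed
    int retryCount = 0;
    bool done = false;
};

// Mirrors the shape of _scheduleRequests(): a failed-but-retriable response is
// cleared, so an empty slot marks the remote as needing to be (re)scheduled.
void scheduleRequests(std::vector<Remote>& remotes) {
    for (auto& remote : remotes) {
        if (remote.response && !remote.done && isRetriable(*remote.response) &&
            remote.retryCount < kMaxRetries) {
            ++remote.retryCount;
            remote.response.reset();  // empty slot == "send (again)"
        }
        if (!remote.response) {
            std::cout << "scheduling request, attempt " << (remote.retryCount + 1) << "\n";
            // A real implementation would hand the request to an executor here.
        }
    }
}

int main() {
    std::vector<Remote> remotes(1);
    remotes[0].response = Status{false, "HostUnreachable"};  // simulate a transient failure
    scheduleRequests(remotes);  // clears the error and reschedules the remote
}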
Code Example #2
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
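    // All response processing below happens under the merger's mutex.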
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // Make a best effort to parse the response and retrieve the cursor id. We need the cursor
        // id in order to issue a killCursors command against it.
        if (cbData.response.isOK()) {
            auto cursorResponse = parseCursorResponse(cbData.response.getValue().data, remote);
            if (cursorResponse.isOK()) {
                remote.cursorId = cursorResponse.getValue().getCursorId();
            }
        }

        // If we're killed and we're not waiting on any more batches to come back, then we are ready
        // to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // From this point on, any early return should signal anyone waiting on an event, if ready()
    // is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

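    // If the transport-level response is OK, parse it into a CursorResponse; otherwise forward
    // the transport error as the status.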
    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.getValue().data, remote)
                               : cbData.response.getStatus());

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard " << *remote.shardId
                                                 << " containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());

            // Retry initial cursor establishment if possible.  Never retry getMores to avoid
            // accidentally skipping results.
            if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
                shard->isRetriableError(cursorResponseStatus.getStatus().code(),
                                        Shard::RetryPolicy::kIdempotent)) {
                invariant(remote.shardId);
                LOG(1) << "Initial cursor establishment failed with retriable error and will be "
                          "retried"
                       << causedBy(redact(cursorResponseStatus.getStatus()));

                ++remote.retryCount;

                // Since we may have just told the targeter that its last chosen host could be
                // faulty, the call below may end up targeting a different host.
                remote.status = askForNextBatch_inlock(remoteIndex);
                if (remote.status.isOK()) {
                    return;
                }

                // If we end up here, it means we failed to schedule the retry request, which is a
                // more severe error that should not be retried. Just pass through to the error
                // handling logic below.
            } else {
                remote.status = cursorResponseStatus.getStatus();
            }
        }

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params.isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<BSONObj> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;
        }

        return;
    }

    // Cursor id successfully established.
    auto cursorResponse = std::move(cursorResponseStatus.getValue());
    remote.cursorId = cursorResponse.getCursorId();
    remote.initialCmdObj = boost::none;

    for (const auto& obj : cursorResponse.getBatch()) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: "
                                                 << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.getBatch().empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(remoteIndex);
        if (!remote.status.isOK()) {
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want the waiter to be signalled on success
    // as well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
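
One detail worth isolating from the function above is its signalling discipline: a scope guard wakes waiters on every early return, while the success path dismisses the guard and then signals explicitly. The guard class below is a hand-rolled stand-in for MongoDB's MakeGuard utility, shown only to illustrate the pattern, not to reproduce the real API.

#include <functional>
#include <iostream>

// Minimal RAII guard; a stand-in for the ScopeGuard/MakeGuard used above.
class ScopeGuard {
public:
    explicit ScopeGuard(std::function<void()> fn) : _fn(std::move(fn)) {}
    ~ScopeGuard() {
        if (_active) _fn();
    }
    void dismiss() { _active = false; }

private:
    std::function<void()> _fn;
    bool _active = true;
};

void signalWaiters() {
    std::cout << "waiters signalled\n";
}

// Every early return fires the guard; the success path dismisses it and
// signals explicitly, so waiters are woken exactly once on all exit paths.
void handleResponse(bool parseFailed, bool scheduleFailed) {
    ScopeGuard signaller(signalWaiters);

    if (parseFailed) return;     // guard signals here
    if (scheduleFailed) return;  // and here

    signaller.dismiss();
    signalWaiters();  // success path signals deliberately, not via unwinding
}

int main() {
    handleResponse(true, false);   // early-return path
    handleResponse(false, false);  // success path
}

Dismissing and then signalling by hand, rather than letting the guard fire on the success path too, keeps the wake-up an explicit, visible step at the end of normal processing while still guaranteeing it on every error path.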