void AsyncRequestsSender::_scheduleRequests() {
    invariant(!_stopRetrying);

    // Schedule remote work on hosts for which we have not sent a request or need to retry.
    for (size_t i = 0; i < _remotes.size(); ++i) {
        auto& remote = _remotes[i];

        // First check if the remote had a retriable error, and if so, clear its response field so
        // it will be retried.
        if (remote.swResponse && !remote.done) {
            // We check both the response status and command status for a retriable error.
            Status status = remote.swResponse->getStatus();
            if (status.isOK()) {
                status = getStatusFromCommandResult(remote.swResponse->getValue().data);
            }

            if (status.isOK()) {
                status = getWriteConcernStatusFromCommandResult(remote.swResponse->getValue().data);
            }

            if (!status.isOK()) {
                // There was an error with either the response or the command.
                auto shard = remote.getShard();
                if (!shard) {
                    remote.swResponse =
                        Status(ErrorCodes::ShardNotFound,
                               str::stream() << "Could not find shard " << remote.shardId);
                } else {
                    if (remote.shardHostAndPort) {
                        shard->updateReplSetMonitor(*remote.shardHostAndPort, status);
                    }
                    if (shard->isRetriableError(status.code(), _retryPolicy) &&
                        remote.retryCount < kMaxNumFailedHostRetryAttempts) {
                        LOG(1) << "Command to remote " << remote.shardId << " at host "
                               << *remote.shardHostAndPort
                               << " failed with retriable error and will be retried "
                               << causedBy(redact(status));
                        ++remote.retryCount;
                        remote.swResponse.reset();
                    }
                }
            }
        }

        // If the remote does not have a response or pending request, schedule remote work for it.
        if (!remote.swResponse && !remote.cbHandle.isValid()) {
            auto scheduleStatus = _scheduleRequest(i);
            if (!scheduleStatus.isOK()) {
                remote.swResponse = std::move(scheduleStatus);
                // Push a noop response to the queue to indicate that a remote is ready for
                // re-processing due to failure.
                _responseQueue.producer.push(boost::none);
            }
        }
    }
}
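The core retry logic above follows a simple pattern: when a stored response carries a retriable error, the response is cleared and a per-remote retry counter is incremented, and the remote is only re-scheduled while that counter stays below a fixed cap. The following is a minimal, self-contained sketch of that pattern in isolation; the types, names (Remote, kMaxRetries, isRetriable), and error codes are illustrative stand-ins, not the MongoDB implementation.

// Sketch of the retry-cap pattern: clear a failed response for re-scheduling only while
// the error is retriable and the per-remote retry counter is below a fixed maximum.
#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

namespace sketch {

constexpr std::size_t kMaxRetries = 3;  // stand-in for kMaxNumFailedHostRetryAttempts

struct Remote {
    std::optional<int> errorCode;  // std::nullopt means "no response yet / cleared for retry"
    std::size_t retryCount = 0;
};

bool isRetriable(int code) {
    // Hypothetical classification; the real policy lives behind Shard::isRetriableError().
    return code == 1;  // e.g. "host unreachable"
}

void clearRetriableResponses(std::vector<Remote>& remotes) {
    for (auto& remote : remotes) {
        if (remote.errorCode && isRetriable(*remote.errorCode) &&
            remote.retryCount < kMaxRetries) {
            ++remote.retryCount;
            remote.errorCode.reset();  // cleared responses get re-scheduled on the next pass
        }
    }
}

}  // namespace sketch

int main() {
    std::vector<sketch::Remote> remotes{{1, 0}, {2, 0}};
    sketch::clearRetriableResponses(remotes);
    // Remote 0 had a retriable error and was cleared; remote 1 keeps its non-retriable error.
    std::cout << remotes[0].errorCode.has_value() << " " << remotes[1].errorCode.has_value()
              << "\n";  // prints "0 1"
}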
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // Make a best effort to parse the response and retrieve the cursor id. We need the cursor
        // id in order to issue a killCursors command against it.
        if (cbData.response.isOK()) {
            auto cursorResponse = parseCursorResponse(cbData.response.getValue().data, remote);
            if (cursorResponse.isOK()) {
                remote.cursorId = cursorResponse.getValue().getCursorId();
            }
        }

        // If we're killed and we're not waiting on any more batches to come back, then we are
        // ready to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // Early returns from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.getValue().data, remote)
                               : cbData.response.getStatus());

    if (!cursorResponseStatus.isOK()) {
        auto shard = remote.getShard();
        if (!shard) {
            remote.status = Status(cursorResponseStatus.getStatus().code(),
                                   str::stream() << "Could not find shard " << *remote.shardId
                                                 << " containing host "
                                                 << remote.getTargetHost().toString());
        } else {
            shard->updateReplSetMonitor(remote.getTargetHost(), cursorResponseStatus.getStatus());

            // Retry initial cursor establishment if possible. Never retry getMores to avoid
            // accidentally skipping results.
            if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
                shard->isRetriableError(cursorResponseStatus.getStatus().code(),
                                        Shard::RetryPolicy::kIdempotent)) {
                invariant(remote.shardId);
                LOG(1) << "Initial cursor establishment failed with retriable error and will be "
                          "retried"
                       << causedBy(redact(cursorResponseStatus.getStatus()));

                ++remote.retryCount;

                // Since we potentially updated the targeter that the last host it chose might be
                // faulty, the call below may end up getting a different host.
                remote.status = askForNextBatch_inlock(remoteIndex);
                if (remote.status.isOK()) {
                    return;
                }

                // If we end up here, it means we failed to schedule the retry request, which is a
                // more severe error that should not be retried. Just pass through to the error
                // handling logic below.
            } else {
                remote.status = cursorResponseStatus.getStatus();
            }
        }

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params.isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<BSONObj> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;
        }

        return;
    }

    // Cursor id successfully established.
    auto cursorResponse = std::move(cursorResponseStatus.getValue());
    remote.cursorId = cursorResponse.getCursorId();
    remote.initialCmdObj = boost::none;

    for (const auto& obj : cursorResponse.getBatch()) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status = Status(ErrorCodes::InternalError,
                                   str::stream() << "Missing field '"
                                                 << ClusterClientCursorParams::kSortKeyField
                                                 << "' in document: " << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.getBatch().empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(remoteIndex);
        if (!remote.status.isOK()) {
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want the waiter to be signalled on success as
    // well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
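handleBatchResponse() relies on a scope guard so that anyone waiting on the current event is signalled no matter which early-return path is taken; on the success path the guard is dismissed and the signal is issued explicitly after the final state updates. Below is a minimal sketch of that idiom in generic C++; ScopeSignaller, signalWaiters, and processBatch are illustrative names and not the MongoDB ScopeGuard/MakeGuard implementation.

// Sketch of the "signal on every exit path" idiom: a guard runs a callback on early returns,
// and is dismissed on the success path so the callback can be invoked explicitly at the end.
#include <functional>
#include <iostream>

class ScopeSignaller {
public:
    explicit ScopeSignaller(std::function<void()> onExit) : _onExit(std::move(onExit)) {}
    ~ScopeSignaller() {
        if (_active) {
            _onExit();  // covers every early return after construction
        }
    }
    void dismiss() {
        _active = false;
    }

private:
    std::function<void()> _onExit;
    bool _active = true;
};

void signalWaiters() {
    std::cout << "waiters signalled\n";
}

void processBatch(bool earlyError) {
    ScopeSignaller signaller(signalWaiters);

    if (earlyError) {
        return;  // the guard's destructor signals waiters
    }

    // Success path: finish mutating state first, then signal explicitly.
    signaller.dismiss();
    signalWaiters();
}

int main() {
    processBatch(true);   // signalled via the guard
    processBatch(false);  // signalled explicitly after dismiss()
}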