Status NetworkInterfaceASIO::startCommand(const TaskExecutor::CallbackHandle& cbHandle,
                                          RemoteCommandRequest& request,
                                          const RemoteCommandCompletionFn& onFinish) {
    MONGO_ASIO_INVARIANT(onFinish, "Invalid completion function");
    {
        stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);
        const auto insertResult = _inGetConnection.emplace(cbHandle);
        // We should never see the same CallbackHandle added twice
        MONGO_ASIO_INVARIANT_INLOCK(insertResult.second, "Same CallbackHandle added twice");
    }

    if (inShutdown()) {
        return {ErrorCodes::ShutdownInProgress, "NetworkInterfaceASIO shutdown in progress"};
    }

    LOG(2) << "startCommand: " << redact(request.toString());

    auto getConnectionStartTime = now();

    auto statusMetadata = attachMetadataIfNeeded(request, _metadataHook.get());
    if (!statusMetadata.isOK()) {
        return statusMetadata;
    }

    auto nextStep = [this, getConnectionStartTime, cbHandle, request, onFinish](
        StatusWith<ConnectionPool::ConnectionHandle> swConn) {

        if (!swConn.isOK()) {
            LOG(2) << "Failed to get connection from pool for request " << request.id << ": "
                   << swConn.getStatus();

            bool wasPreviouslyCanceled = false;
            {
                stdx::lock_guard<stdx::mutex> lk(_inProgressMutex);
                wasPreviouslyCanceled = _inGetConnection.erase(cbHandle) == 0;
            }

            Status status = wasPreviouslyCanceled
                ? Status(ErrorCodes::CallbackCanceled, "Callback canceled")
                : swConn.getStatus();
            if (status.code() == ErrorCodes::NetworkInterfaceExceededTimeLimit) {
                status = Status(ErrorCodes::ExceededTimeLimit, status.reason());
            }
            if (status.code() == ErrorCodes::ExceededTimeLimit) {
                _numTimedOutOps.fetchAndAdd(1);
            }
            if (status.code() != ErrorCodes::CallbackCanceled) {
                _numFailedOps.fetchAndAdd(1);
            }

            onFinish({status, now() - getConnectionStartTime});
            signalWorkAvailable();
            return;
        }

        auto conn = static_cast<connection_pool_asio::ASIOConnection*>(swConn.getValue().get());

        AsyncOp* op = nullptr;

        stdx::unique_lock<stdx::mutex> lk(_inProgressMutex);

        const auto eraseCount = _inGetConnection.erase(cbHandle);

        // If we didn't find the request, we've been canceled
        if (eraseCount == 0) {
            lk.unlock();

            onFinish({ErrorCodes::CallbackCanceled,
                      "Callback canceled",
                      now() - getConnectionStartTime});

            // Though we were canceled, we know that the stream is fine, so indicate success.
            conn->indicateSuccess();

            signalWorkAvailable();
            return;
        }

        // We can't release the AsyncOp until we know we were not canceled.
        auto ownedOp = conn->releaseAsyncOp();
        op = ownedOp.get();

        // This AsyncOp may be recycled. We expect timeout and canceled to be clean.
        // If this op was most recently used to connect, its state transitions won't have been
        // reset, so we do that here.
        MONGO_ASIO_INVARIANT_INLOCK(!op->canceled(), "AsyncOp has dirty canceled flag", op);
        MONGO_ASIO_INVARIANT_INLOCK(!op->timedOut(), "AsyncOp has dirty timeout flag", op);
        op->clearStateTransitions();

        // Now that we're inProgress, an external cancel can touch our op, but
        // not until we release the inProgressMutex.
        _inProgress.emplace(op, std::move(ownedOp));

        op->_cbHandle = std::move(cbHandle);
        op->_request = std::move(request);
        op->_onFinish = std::move(onFinish);
        op->_connectionPoolHandle = std::move(swConn.getValue());
        op->startProgress(getConnectionStartTime);

        // This ditches the lock and gets us onto the strand (so we're
        // threadsafe)
        op->_strand.post([this, op, getConnectionStartTime] {
            const auto timeout = op->_request.timeout;

            // Set timeout now that we have the correct request object
            if (timeout != RemoteCommandRequest::kNoTimeout) {
                // Subtract the time it took to get the connection from the pool from the
                // request timeout.
                auto getConnectionDuration = now() - getConnectionStartTime;
                if (getConnectionDuration >= timeout) {
                    // We only assume that the request timer is guaranteed to fire *after* the
                    // timeout duration - but make no stronger assumption. It is thus possible
                    // that we have already exceeded the timeout. In this case we timeout the
                    // operation manually.
                    std::stringstream msg;
                    msg << "Remote command timed out while waiting to get a connection from the "
                        << "pool, took " << getConnectionDuration << ", timeout was set to "
                        << timeout;
                    auto rs = ResponseStatus(ErrorCodes::NetworkInterfaceExceededTimeLimit,
                                             msg.str(),
                                             getConnectionDuration);
                    return _completeOperation(op, rs);
                }

                // The above conditional guarantees that the adjusted timeout will never
                // underflow.
                MONGO_ASIO_INVARIANT(timeout > getConnectionDuration, "timeout underflowed", op);
                const auto adjustedTimeout = timeout - getConnectionDuration;
                const auto requestId = op->_request.id;

                try {
                    op->_timeoutAlarm =
                        op->_owner->_timerFactory->make(&op->_strand, adjustedTimeout);
                } catch (std::system_error& e) {
                    severe() << "Failed to construct timer for AsyncOp: " << e.what();
                    fassertFailed(40334);
                }

                std::shared_ptr<AsyncOp::AccessControl> access;
                std::size_t generation;
                {
                    stdx::lock_guard<stdx::mutex> lk(op->_access->mutex);
                    access = op->_access;
                    generation = access->id;
                }

                op->_timeoutAlarm->asyncWait(
                    [this, op, access, generation, requestId, adjustedTimeout](
                        std::error_code ec) {
                        // We must pass a check for safe access before using op inside the
                        // callback or we may attempt access on an invalid pointer.
                        stdx::lock_guard<stdx::mutex> lk(access->mutex);
                        if (generation != access->id) {
                            // The operation has been cleaned up, do not access.
                            return;
                        }

                        if (!ec) {
                            LOG(2) << "Request " << requestId << " timed out"
                                   << ", adjusted timeout after getting connection from pool "
                                   << "was " << adjustedTimeout << ", op was "
                                   << redact(op->toString());

                            op->timeOut_inlock();
                        } else {
                            LOG(2) << "Failed to time request " << requestId
                                   << " out: " << ec.message() << ", op was "
                                   << redact(op->toString());
                        }
                    });
            }

            _beginCommunication(op);
        });
    };

    _connectionPool.get(request.target, request.timeout, nextStep);

    return Status::OK();
}
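
// Usage sketch (illustrative only, not part of this file): callers normally reach
// startCommand() through a TaskExecutor, which mints the CallbackHandle and owns the
// completion lifecycle, rather than invoking it directly. Assuming a
// NetworkInterfaceASIO* `net` and a hypothetical makeCallbackHandle() helper, a
// direct call would look roughly like this (constructor arguments are an assumption
// about the RemoteCommandRequest overload in use):
//
//     RemoteCommandRequest request(HostAndPort("db1.example.net", 27017),
//                                  "admin",
//                                  BSON("ping" << 1),
//                                  Milliseconds(5000));  // per-request timeout
//
//     uassertStatusOK(net->startCommand(
//         makeCallbackHandle(),  // hypothetical helper; real handles come from the executor
//         request,
//         [](const TaskExecutor::ResponseStatus& resp) {
//             // Runs on a network thread; resp carries either the command reply or an
//             // error such as CallbackCanceled or ExceededTimeLimit, plus elapsed time.
//         }));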