/**
 * Attempts to acquire the distributed lock 'name', sleeping for up to 'lockTryInterval' between
 * attempts while the lock cannot be obtained.
 *
 * 'waitFor' controls how long to keep trying:
 *   - zero:     exactly one acquisition attempt is made;
 *   - positive: attempts are repeated until that much time has elapsed;
 *   - negative: attempts are repeated with no deadline.
 *
 * Each attempt first tries to grab the lock. If that fails with LockStateChangeFailed, the lock
 * is looked up by name and, when canOvertakeLock() allows it, an overtake is attempted.
 *
 * Returns a ScopedDistLock on success. Returns LockBusy when the loop ends without acquiring the
 * lock. Any other error reported by the catalog or by canOvertakeLock() is returned as-is.
 */
StatusWith<DistLockManager::ScopedDistLock> ReplSetDistLockManager::lock(
    StringData name,
    StringData whyMessage,
    milliseconds waitFor,
    milliseconds lockTryInterval) {
    Timer timer(_serviceContext->getTickSource());
    Timer msgTimer(_serviceContext->getTickSource());

    while (waitFor <= milliseconds::zero() || milliseconds(timer.millis()) < waitFor) {
        // A fresh session id is generated for every attempt.
        OID lockSessionID = OID::gen();
        string who = str::stream() << _processID << ":" << getThreadName();

        auto lockExpiration = _lockExpiration;
        MONGO_FAIL_POINT_BLOCK(setDistLockTimeout, customTimeout) {
            const BSONObj& data = customTimeout.getData();
            lockExpiration = stdx::chrono::milliseconds(data["timeoutMs"].numberInt());
        }

        LOG(1) << "trying to acquire new distributed lock for " << name
               << " ( lock timeout : " << durationCount<Milliseconds>(lockExpiration)
               << " ms, ping interval : " << durationCount<Milliseconds>(_pingInterval)
               << " ms, process : " << _processID << " )"
               << " with lockSessionID: " << lockSessionID << ", why: " << whyMessage;

        auto lockResult =
            _catalog->grabLock(name, lockSessionID, who, _processID, Date_t::now(), whyMessage);

        auto status = lockResult.getStatus();

        if (status.isOK()) {
            // Lock is acquired since findAndModify was able to successfully modify
            // the lock document.
            LOG(0) << "distributed lock '" << name << "' acquired, ts : " << lockSessionID;
            return ScopedDistLock(lockSessionID, this);
        }

        if (status != ErrorCodes::LockStateChangeFailed) {
            // An error occurred but the write might have actually been applied on the
            // other side. Schedule an unlock to clean it up just in case.
            queueUnlock(lockSessionID);
            return status;
        }

        // Get info from current lock and check if we can overtake it.
        auto getLockStatusResult = _catalog->getLockByName(name);
        const auto& getLockStatus = getLockStatusResult.getStatus();
        if (!getLockStatusResult.isOK() && getLockStatus != ErrorCodes::LockNotFound) {
            return getLockStatus;
        }

        // Note: Only attempt to overtake locks that actually exist. If the lock was not
        // found, use the normal grab lock path to acquire it.
        if (getLockStatusResult.isOK()) {
            auto currentLock = getLockStatusResult.getValue();
            auto canOvertakeResult = canOvertakeLock(currentLock, lockExpiration);

            if (!canOvertakeResult.isOK()) {
                return canOvertakeResult.getStatus();
            }

            if (canOvertakeResult.getValue()) {
                auto overtakeResult = _catalog->overtakeLock(name,
                                                             lockSessionID,
                                                             currentLock.getLockID(),
                                                             who,
                                                             _processID,
                                                             Date_t::now(),
                                                             whyMessage);

                const auto& overtakeStatus = overtakeResult.getStatus();

                if (overtakeResult.isOK()) {
                    // Lock is acquired since findAndModify was able to successfully modify
                    // the lock document.
                    LOG(0) << "lock '" << name << "' successfully forced";
                    LOG(0) << "distributed lock '" << name << "' acquired, ts : " << lockSessionID;
                    return ScopedDistLock(lockSessionID, this);
                }

                if (overtakeStatus != ErrorCodes::LockStateChangeFailed) {
                    // An error occurred but the write might have actually been applied on the
                    // other side. Schedule an unlock to clean it up just in case.
                    queueUnlock(lockSessionID);
                    return overtakeStatus;
                }
            }
        }

        LOG(1) << "distributed lock '" << name << "' was not acquired.";

        // A zero waitFor requests a single attempt only.
        if (waitFor == milliseconds::zero()) {
            break;
        }

        // Periodically message for debugging reasons
        if (msgTimer.seconds() > 10) {
            LOG(0) << "waited " << timer.seconds() << "s for distributed lock " << name << " for "
                   << whyMessage;

            msgTimer.reset();
        }

        if (waitFor < milliseconds::zero()) {
            // No deadline: there is no remaining time to clamp against. Clamping here would
            // always produce a zero-length sleep and turn this loop into a busy spin, so back
            // off for the full retry interval instead.
            sleepFor(lockTryInterval);
        } else {
            // Never sleep past the deadline.
            const milliseconds timeRemaining =
                std::max(milliseconds::zero(), waitFor - milliseconds(timer.millis()));
            sleepFor(std::min(lockTryInterval, timeRemaining));
        }
    }

    return {ErrorCodes::LockBusy, str::stream() << "timed out waiting for " << name};
}
/**
 * Attempts to acquire the legacy distributed lock 'name', sleeping for up to 'lockTryInterval'
 * between attempts while the lock is reported as busy.
 *
 * 'waitFor' controls how long to keep trying:
 *   - zero:     exactly one acquisition attempt is made;
 *   - positive: attempts are repeated until that much time has elapsed;
 *   - negative: attempts are repeated with no deadline.
 *
 * Returns a ScopedDistLock on success. Returns LockBusy immediately if the manager has been
 * stopped. Returns UnsupportedFormat if the acquired lock document cannot be parsed. Otherwise
 * returns the status of the last failed attempt; this is initialized to a LockBusy
 * "timed out" status.
 */
StatusWith<DistLockManager::ScopedDistLock> LegacyDistLockManager::lock(
    OperationContext* txn,
    StringData name,
    StringData whyMessage,
    milliseconds waitFor,
    milliseconds lockTryInterval) {
    auto distLock = stdx::make_unique<DistributedLock>(_configServer, name.toString());

    {
        stdx::lock_guard<stdx::mutex> sl(_mutex);

        if (_isStopped) {
            return Status(ErrorCodes::LockBusy, "legacy distlock manager is stopped");
        }

        if (_pingerEnabled) {
            auto pingStatus = _pinger->startPing(*(distLock.get()), kDefaultPingInterval);
            if (!pingStatus.isOK()) {
                return pingStatus;
            }
        }
    }

    // Status reported if the loop ends without ever acquiring the lock.
    auto lastStatus =
        Status(ErrorCodes::LockBusy, str::stream() << "timed out waiting for " << name);

    Timer timer;
    Timer msgTimer;
    while (waitFor <= milliseconds::zero() || milliseconds(timer.millis()) < waitFor) {
        bool acquired = false;
        BSONObj lockDoc;

        try {
            acquired = distLock->lock_try(
                whyMessage.toString(), &lockDoc, durationCount<Seconds>(kDefaultSocketTimeout));

            if (!acquired) {
                lastStatus = Status(ErrorCodes::LockBusy,
                                    str::stream() << "Lock for " << whyMessage << " is taken.");
            }
        } catch (const LockException& lockExcep) {
            // If the exception carries a lock id that must be released, hand it to the pinger.
            OID needUnlockID(lockExcep.getMustUnlockID());
            if (needUnlockID.isSet()) {
                _pinger->addUnlockOID(needUnlockID);
            }

            lastStatus = lockExcep.toStatus();
        } catch (...) {
            lastStatus = exceptionToStatus();
        }

        if (acquired) {
            verify(!lockDoc.isEmpty());

            auto locksTypeResult = LocksType::fromBSON(lockDoc);
            if (!locksTypeResult.isOK()) {
                return StatusWith<ScopedDistLock>(
                    ErrorCodes::UnsupportedFormat,
                    str::stream() << "error while parsing lock document: " << lockDoc << " : "
                                  << locksTypeResult.getStatus().toString());
            }

            const LocksType& lock = locksTypeResult.getValue();
            dassert(lock.isLockIDSet());

            {
                // Record the lock under its id; ownership of distLock moves into the map.
                stdx::lock_guard<stdx::mutex> sl(_mutex);
                _lockMap.insert(std::make_pair(lock.getLockID(), std::move(distLock)));
            }

            return ScopedDistLock(txn, lock.getLockID(), this);
        }

        // A zero waitFor requests a single attempt only.
        if (waitFor == milliseconds::zero())
            break;

        // Only a busy lock is worth retrying; every other failure is returned to the caller.
        if (lastStatus != ErrorCodes::LockBusy) {
            return lastStatus;
        }

        // Periodically message for debugging reasons
        if (msgTimer.seconds() > 10) {
            log() << "waited " << timer.seconds() << "s for distributed lock " << name << " for "
                  << whyMessage << ": " << lastStatus.toString();

            msgTimer.reset();
        }

        if (waitFor < milliseconds::zero()) {
            // No deadline: there is no remaining time to clamp against. Clamping here would
            // always produce a zero-length sleep and turn this loop into a busy spin, so back
            // off for the full retry interval instead.
            sleepFor(lockTryInterval);
        } else {
            // Never sleep past the deadline.
            const milliseconds timeRemaining =
                std::max(milliseconds::zero(), waitFor - milliseconds(timer.millis()));
            sleepFor(std::min(lockTryInterval, timeRemaining));
        }
    }

    return lastStatus;
}
/**
 * Attempts to acquire the distributed lock 'name', sleeping for up to 'lockTryInterval' between
 * attempts while the lock cannot be obtained.
 *
 * 'waitFor' controls how long to keep trying:
 *   - zero:     exactly one acquisition attempt is made;
 *   - positive: attempts are repeated until that much time has elapsed;
 *   - negative: attempts are repeated with no deadline.
 *
 * Each attempt first tries to grab the lock. A retriable error triggers a synchronous unlock of
 * the attempt's session id followed by an immediate retry, at most kMaxNumLockAcquireRetries
 * times per attempt. If the grab fails with LockStateChangeFailed, the lock is looked up by name
 * and, when canOvertakeLock() allows it, an overtake is attempted.
 *
 * Returns a ScopedDistLock on success. Returns LockBusy when the loop ends without acquiring the
 * lock. Any other error reported by the catalog or by canOvertakeLock() is returned as-is.
 */
StatusWith<DistLockManager::ScopedDistLock> ReplSetDistLockManager::lock(
    OperationContext* txn,
    StringData name,
    StringData whyMessage,
    milliseconds waitFor,
    milliseconds lockTryInterval) {
    Timer timer(_serviceContext->getTickSource());
    Timer msgTimer(_serviceContext->getTickSource());

    // Counts how many attempts have been made to grab the lock, which have failed with network
    // error. This value is reset for each lock acquisition attempt because these are
    // independent write operations.
    int networkErrorRetries = 0;

    // Distributed lock acquisition works by trying to update the state of the lock to 'taken'.
    // If the lock is currently taken, we will back off for up to lockTryInterval and try the
    // acquisition again, repeating this until waitFor has elapsed. If a network error occurs at
    // a lock acquisition attempt, the lock acquisition will be retried immediately.
    while (waitFor <= milliseconds::zero() || milliseconds(timer.millis()) < waitFor) {
        // A fresh session id is generated for every pass through the loop.
        const OID lockSessionID = OID::gen();
        const string who = str::stream() << _processID << ":" << getThreadName();

        auto lockExpiration = _lockExpiration;
        MONGO_FAIL_POINT_BLOCK(setDistLockTimeout, customTimeout) {
            const BSONObj& data = customTimeout.getData();
            lockExpiration = stdx::chrono::milliseconds(data["timeoutMs"].numberInt());
        }

        LOG(1) << "trying to acquire new distributed lock for " << name
               << " ( lock timeout : " << durationCount<Milliseconds>(lockExpiration)
               << " ms, ping interval : " << durationCount<Milliseconds>(_pingInterval)
               << " ms, process : " << _processID << " )"
               << " with lockSessionID: " << lockSessionID << ", why: " << whyMessage;

        auto lockResult = _catalog->grabLock(
            txn, name, lockSessionID, who, _processID, Date_t::now(), whyMessage);

        auto status = lockResult.getStatus();

        if (status.isOK()) {
            // Lock is acquired since findAndModify was able to successfully modify
            // the lock document.
            log() << "distributed lock '" << name << "' acquired for '" << whyMessage
                  << "', ts : " << lockSessionID;
            return ScopedDistLock(txn, lockSessionID, this);
        }

        // If a network error occurred, unlock the lock synchronously and try again
        if (ShardRegistry::kAllRetriableErrors.count(status.code()) &&
            networkErrorRetries < kMaxNumLockAcquireRetries) {
            LOG(1) << "Failed to acquire distributed lock because of retriable error. Retrying "
                      "acquisition by first unlocking the stale entry, which possibly exists now"
                   << causedBy(status);

            networkErrorRetries++;

            // From here on 'status' holds the outcome of the unlock, not of the grab.
            status = _catalog->unlock(txn, lockSessionID);
            if (status.isOK()) {
                // We certainly do not own the lock, so we can retry
                continue;
            }

            // Fall-through to the error checking logic below
            invariant(status != ErrorCodes::LockStateChangeFailed);

            LOG(1)
                << "Failed to retry acqusition of distributed lock. No more attempts will be made"
                << causedBy(status);
        }

        if (status != ErrorCodes::LockStateChangeFailed) {
            // An error occurred but the write might have actually been applied on the
            // other side. Schedule an unlock to clean it up just in case.
            queueUnlock(lockSessionID);
            return status;
        }

        // Get info from current lock and check if we can overtake it.
        auto getLockStatusResult = _catalog->getLockByName(txn, name);
        const auto& getLockStatus = getLockStatusResult.getStatus();
        if (!getLockStatusResult.isOK() && getLockStatus != ErrorCodes::LockNotFound) {
            return getLockStatus;
        }

        // Note: Only attempt to overtake locks that actually exist. If the lock was not
        // found, use the normal grab lock path to acquire it.
        if (getLockStatusResult.isOK()) {
            auto currentLock = getLockStatusResult.getValue();
            auto canOvertakeResult = canOvertakeLock(txn, currentLock, lockExpiration);

            if (!canOvertakeResult.isOK()) {
                return canOvertakeResult.getStatus();
            }

            if (canOvertakeResult.getValue()) {
                auto overtakeResult = _catalog->overtakeLock(txn,
                                                             name,
                                                             lockSessionID,
                                                             currentLock.getLockID(),
                                                             who,
                                                             _processID,
                                                             Date_t::now(),
                                                             whyMessage);

                const auto& overtakeStatus = overtakeResult.getStatus();

                if (overtakeResult.isOK()) {
                    // Lock is acquired since findAndModify was able to successfully modify
                    // the lock document.
                    LOG(0) << "lock '" << name << "' successfully forced";
                    LOG(0) << "distributed lock '" << name << "' acquired, ts : " << lockSessionID;
                    return ScopedDistLock(txn, lockSessionID, this);
                }

                if (overtakeStatus != ErrorCodes::LockStateChangeFailed) {
                    // An error occurred but the write might have actually been applied on the
                    // other side. Schedule an unlock to clean it up just in case.
                    queueUnlock(lockSessionID);
                    return overtakeStatus;
                }
            }
        }

        LOG(1) << "distributed lock '" << name << "' was not acquired.";

        // A zero waitFor requests a single attempt only.
        if (waitFor == milliseconds::zero()) {
            break;
        }

        // Periodically message for debugging reasons
        if (msgTimer.seconds() > 10) {
            LOG(0) << "waited " << timer.seconds() << "s for distributed lock " << name << " for "
                   << whyMessage;

            msgTimer.reset();
        }

        // A new lock acquisition attempt will begin now (because the previous found the lock to be
        // busy, so reset the retries counter)
        networkErrorRetries = 0;

        if (waitFor < milliseconds::zero()) {
            // No deadline: there is no remaining time to clamp against. Clamping here would
            // always produce a zero-length sleep and turn this loop into a busy spin, so back
            // off for the full retry interval instead.
            sleepFor(lockTryInterval);
        } else {
            // Never sleep past the deadline.
            const milliseconds timeRemaining =
                std::max(milliseconds::zero(), waitFor - milliseconds(timer.millis()));
            sleepFor(std::min(lockTryInterval, timeRemaining));
        }
    }

    return {ErrorCodes::LockBusy, str::stream() << "timed out waiting for " << name};
}