StatusWith<DistLockManager::ScopedDistLock> ReplSetDistLockManager::lock(
    StringData name, StringData whyMessage, milliseconds waitFor, milliseconds lockTryInterval) {
    Timer timer(_serviceContext->getTickSource());
    Timer msgTimer(_serviceContext->getTickSource());

    while (waitFor <= milliseconds::zero() || milliseconds(timer.millis()) < waitFor) {
        OID lockSessionID = OID::gen();
        string who = str::stream() << _processID << ":" << getThreadName();

        auto lockExpiration = _lockExpiration;
        MONGO_FAIL_POINT_BLOCK(setDistLockTimeout, customTimeout) {
            const BSONObj& data = customTimeout.getData();
            lockExpiration = stdx::chrono::milliseconds(data["timeoutMs"].numberInt());
        }

        LOG(1) << "trying to acquire new distributed lock for " << name
               << " ( lock timeout : " << durationCount<Milliseconds>(lockExpiration)
               << " ms, ping interval : " << durationCount<Milliseconds>(_pingInterval)
               << " ms, process : " << _processID << " )"
               << " with lockSessionID: " << lockSessionID << ", why: " << whyMessage;

        auto lockResult =
            _catalog->grabLock(name, lockSessionID, who, _processID, Date_t::now(), whyMessage);

        auto status = lockResult.getStatus();

        if (status.isOK()) {
            // Lock is acquired since findAndModify was able to successfully modify
            // the lock document.
            LOG(0) << "distributed lock '" << name << "' acquired, ts : " << lockSessionID;
            return ScopedDistLock(lockSessionID, this);
        }

        if (status != ErrorCodes::LockStateChangeFailed) {
            // An error occurred but the write might have actually been applied on the
            // other side. Schedule an unlock to clean it up just in case.
            queueUnlock(lockSessionID);
            return status;
        }

        // Get info from current lock and check if we can overtake it.
        auto getLockStatusResult = _catalog->getLockByName(name);
        const auto& getLockStatus = getLockStatusResult.getStatus();
        if (!getLockStatusResult.isOK() && getLockStatus != ErrorCodes::LockNotFound) {
            return getLockStatus;
        }

        // Note: Only attempt to overtake locks that actually exist. If the lock was not
        // found, use the normal grab lock path to acquire it.
        if (getLockStatusResult.isOK()) {
            auto currentLock = getLockStatusResult.getValue();
            auto canOvertakeResult = canOvertakeLock(currentLock, lockExpiration);

            if (!canOvertakeResult.isOK()) {
                return canOvertakeResult.getStatus();
            }

            if (canOvertakeResult.getValue()) {
                auto overtakeResult = _catalog->overtakeLock(name,
                                                             lockSessionID,
                                                             currentLock.getLockID(),
                                                             who,
                                                             _processID,
                                                             Date_t::now(),
                                                             whyMessage);

                const auto& overtakeStatus = overtakeResult.getStatus();

                if (overtakeResult.isOK()) {
                    // Lock is acquired since findAndModify was able to successfully modify
                    // the lock document.
                    LOG(0) << "lock '" << name << "' successfully forced";
                    LOG(0) << "distributed lock '" << name << "' acquired, ts : " << lockSessionID;
                    return ScopedDistLock(lockSessionID, this);
                }

                if (overtakeStatus != ErrorCodes::LockStateChangeFailed) {
                    // An error occurred but the write might have actually been applied on the
                    // other side. Schedule an unlock to clean it up just in case.
                    queueUnlock(lockSessionID);
                    return overtakeStatus;
                }
            }
        }

        LOG(1) << "distributed lock '" << name << "' was not acquired.";

        if (waitFor == milliseconds::zero()) {
            break;
        }

        // Periodically message for debugging reasons
        if (msgTimer.seconds() > 10) {
            LOG(0) << "waited " << timer.seconds() << "s for distributed lock " << name << " for "
                   << whyMessage;
            msgTimer.reset();
        }

        milliseconds timeRemaining =
            std::max(milliseconds::zero(), waitFor - milliseconds(timer.millis()));
        sleepFor(std::min(lockTryInterval, timeRemaining));
    }

    return {ErrorCodes::LockBusy, str::stream() << "timed out waiting for " << name};
}
StatusWith<DistLockManager::ScopedDistLock> ReplSetDistLockManager::lock(
    OperationContext* txn,
    StringData name,
    StringData whyMessage,
    milliseconds waitFor,
    milliseconds lockTryInterval) {
    Timer timer(_serviceContext->getTickSource());
    Timer msgTimer(_serviceContext->getTickSource());

    // Counts how many attempts have been made to grab the lock, which have failed with network
    // error. This value is reset for each lock acquisition attempt because these are
    // independent write operations.
    int networkErrorRetries = 0;

    // Distributed lock acquisition works by trying to update the state of the lock to 'taken'.
    // If the lock is currently taken, we will back off for lockTryInterval and try the
    // acquisition again, repeating this until the waitFor timeout has been reached. If a network
    // error occurs during an acquisition attempt, the acquisition will be retried immediately.
    while (waitFor <= milliseconds::zero() || milliseconds(timer.millis()) < waitFor) {
        const OID lockSessionID = OID::gen();
        const string who = str::stream() << _processID << ":" << getThreadName();

        auto lockExpiration = _lockExpiration;
        MONGO_FAIL_POINT_BLOCK(setDistLockTimeout, customTimeout) {
            const BSONObj& data = customTimeout.getData();
            lockExpiration = stdx::chrono::milliseconds(data["timeoutMs"].numberInt());
        }

        LOG(1) << "trying to acquire new distributed lock for " << name
               << " ( lock timeout : " << durationCount<Milliseconds>(lockExpiration)
               << " ms, ping interval : " << durationCount<Milliseconds>(_pingInterval)
               << " ms, process : " << _processID << " )"
               << " with lockSessionID: " << lockSessionID << ", why: " << whyMessage;

        auto lockResult = _catalog->grabLock(
            txn, name, lockSessionID, who, _processID, Date_t::now(), whyMessage);

        auto status = lockResult.getStatus();

        if (status.isOK()) {
            // Lock is acquired since findAndModify was able to successfully modify
            // the lock document.
            log() << "distributed lock '" << name << "' acquired for '" << whyMessage
                  << "', ts : " << lockSessionID;
            return ScopedDistLock(txn, lockSessionID, this);
        }

        // If a network error occurred, unlock the lock synchronously and try again
        if (ShardRegistry::kAllRetriableErrors.count(status.code()) &&
            networkErrorRetries < kMaxNumLockAcquireRetries) {
            LOG(1) << "Failed to acquire distributed lock because of retriable error. Retrying "
                      "acquisition by first unlocking the stale entry, which possibly exists now"
                   << causedBy(status);

            networkErrorRetries++;

            status = _catalog->unlock(txn, lockSessionID);
            if (status.isOK()) {
                // We certainly do not own the lock, so we can retry
                continue;
            }

            // Fall-through to the error checking logic below
            invariant(status != ErrorCodes::LockStateChangeFailed);

            LOG(1) << "Failed to retry acquisition of distributed lock. No more attempts will be "
                      "made" << causedBy(status);
        }

        if (status != ErrorCodes::LockStateChangeFailed) {
            // An error occurred but the write might have actually been applied on the
            // other side. Schedule an unlock to clean it up just in case.
            queueUnlock(lockSessionID);
            return status;
        }

        // Get info from current lock and check if we can overtake it.
        auto getLockStatusResult = _catalog->getLockByName(txn, name);
        const auto& getLockStatus = getLockStatusResult.getStatus();
        if (!getLockStatusResult.isOK() && getLockStatus != ErrorCodes::LockNotFound) {
            return getLockStatus;
        }

        // Note: Only attempt to overtake locks that actually exist. If the lock was not
        // found, use the normal grab lock path to acquire it.
        if (getLockStatusResult.isOK()) {
            auto currentLock = getLockStatusResult.getValue();
            auto canOvertakeResult = canOvertakeLock(txn, currentLock, lockExpiration);

            if (!canOvertakeResult.isOK()) {
                return canOvertakeResult.getStatus();
            }

            if (canOvertakeResult.getValue()) {
                auto overtakeResult = _catalog->overtakeLock(txn,
                                                             name,
                                                             lockSessionID,
                                                             currentLock.getLockID(),
                                                             who,
                                                             _processID,
                                                             Date_t::now(),
                                                             whyMessage);

                const auto& overtakeStatus = overtakeResult.getStatus();

                if (overtakeResult.isOK()) {
                    // Lock is acquired since findAndModify was able to successfully modify
                    // the lock document.
                    LOG(0) << "lock '" << name << "' successfully forced";
                    LOG(0) << "distributed lock '" << name << "' acquired, ts : " << lockSessionID;
                    return ScopedDistLock(txn, lockSessionID, this);
                }

                if (overtakeStatus != ErrorCodes::LockStateChangeFailed) {
                    // An error occurred but the write might have actually been applied on the
                    // other side. Schedule an unlock to clean it up just in case.
                    queueUnlock(lockSessionID);
                    return overtakeStatus;
                }
            }
        }

        LOG(1) << "distributed lock '" << name << "' was not acquired.";

        if (waitFor == milliseconds::zero()) {
            break;
        }

        // Periodically message for debugging reasons
        if (msgTimer.seconds() > 10) {
            LOG(0) << "waited " << timer.seconds() << "s for distributed lock " << name << " for "
                   << whyMessage;
            msgTimer.reset();
        }

        // A new lock acquisition attempt will begin now because the previous one found the lock
        // to be busy, so reset the network error retries counter.
        networkErrorRetries = 0;

        const milliseconds timeRemaining =
            std::max(milliseconds::zero(), waitFor - milliseconds(timer.millis()));
        sleepFor(std::min(lockTryInterval, timeRemaining));
    }

    return {ErrorCodes::LockBusy, str::stream() << "timed out waiting for " << name};
}
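
// Usage sketch (illustrative only, not part of this file): how a caller might acquire and hold
// a distributed lock via the OperationContext-aware overload above. The helper name, the
// "balancer" lock name, and the timeout values are hypothetical; only the lock() signature and
// the RAII behavior of ScopedDistLock come from the code above.
Status doWorkUnderDistLock(OperationContext* txn, ReplSetDistLockManager* distLockManager) {
    // Wait up to 30 seconds for the lock, retrying roughly every 500 ms while it is busy.
    auto scopedLockResult = distLockManager->lock(
        txn, "balancer", "usage sketch", milliseconds(30000), milliseconds(500));
    if (!scopedLockResult.isOK()) {
        // Acquisition failed or timed out (e.g. ErrorCodes::LockBusy).
        return scopedLockResult.getStatus();
    }

    DistLockManager::ScopedDistLock scopedLock = std::move(scopedLockResult.getValue());

    // ... perform the work that must be serialized across the cluster ...

    return Status::OK();  // 'scopedLock' releases the distributed lock on destruction
}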