/**
 * Extracts the optime of the first document returned by the query against 'candidate'.
 *
 * If the response has no documents, the first document is empty, or the optime of the first
 * document is null, 'candidate' is blacklisted for the corresponding duration and a
 * default-constructed OpTime is returned.
 */
OpTime SyncSourceResolver::_parseRemoteEarliestOpTime(const HostAndPort& candidate,
                                                      const Fetcher::QueryResponse& queryResponse) {
    if (queryResponse.documents.empty()) {
        // The query returned no documents at all.
        const auto blacklistUntil = _taskExecutor->now() + kOplogEmptyBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to empty oplog for "
              << kOplogEmptyBlacklistDuration << " until: " << blacklistUntil;
        _syncSourceSelector->blacklistSyncSource(candidate, blacklistUntil);
        return OpTime();
    }

    const auto& oldestDoc = queryResponse.documents.front();
    if (oldestDoc.isEmpty()) {
        // A document came back, but it has no fields.
        const auto blacklistUntil = _taskExecutor->now() + kFirstOplogEntryEmptyBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to empty first document for "
              << kFirstOplogEntryEmptyBlacklistDuration << " until: " << blacklistUntil;
        _syncSourceSelector->blacklistSyncSource(candidate, blacklistUntil);
        return OpTime();
    }

    const auto earliestOpTime = OplogEntry(oldestDoc).getOpTime();
    if (earliestOpTime.isNull()) {
        // The first document is not empty, but its optime is null.
        const auto blacklistUntil =
            _taskExecutor->now() + kFirstOplogEntryNullTimestampBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to null timestamp in first document for "
              << kFirstOplogEntryNullTimestampBlacklistDuration << " until: " << blacklistUntil;
        _syncSourceSelector->blacklistSyncSource(candidate, blacklistUntil);
        return OpTime();
    }

    return earliestOpTime;
}
/**
 * Default constructor.
 *
 * Starts with health -1, all recorded times at 0 and no auth issue, and marks the stored
 * heartbeat response as RS_UNKNOWN with default-constructed election and op times.
 */
MemberHeartbeatData::MemberHeartbeatData()
    : _health(-1), _upSince(0), _lastHeartbeat(0), _lastHeartbeatRecv(0), _authIssue(false) {
    // No heartbeat has been recorded yet, so the last response carries placeholder values.
    _lastResponse.setState(MemberState::RS_UNKNOWN);
    _lastResponse.setElectionTime(OpTime());
    _lastResponse.setOpTime(OpTime());
}
/**
 * Records that the member is down as of 'now'.
 *
 * The stored heartbeat response is replaced by a fresh one in state RS_DOWN that carries
 * 'heartbeatMessage', default election/op times and an empty sync source.
 */
void MemberHeartbeatData::setDownValues(Date_t now, const std::string& heartbeatMessage) {
    // Discard whatever the member last reported and describe it as down.
    _lastResponse = ReplSetHeartbeatResponse();
    _lastResponse.setState(MemberState::RS_DOWN);
    _lastResponse.setElectionTime(OpTime());
    _lastResponse.setOpTime(OpTime());
    _lastResponse.setHbMsg(heartbeatMessage);
    _lastResponse.setSyncingTo("");

    // Bookkeeping fields.
    _authIssue = false;
    _lastHeartbeat = now;
    _upSince = 0;
    _health = 0;
}
/**
 * Records, as of 'now', that there is an authentication issue with the member.
 *
 * The stored heartbeat response is replaced by a fresh one in state RS_UNKNOWN with default
 * election/op times, an empty heartbeat message and an empty sync source.
 */
void MemberHeartbeatData::setAuthIssue(Date_t now) {
    // Discard whatever the member last reported.
    _lastResponse = ReplSetHeartbeatResponse();
    _lastResponse.setState(MemberState::RS_UNKNOWN);
    _lastResponse.setElectionTime(OpTime());
    _lastResponse.setOpTime(OpTime());
    _lastResponse.setHbMsg("");
    _lastResponse.setSyncingTo("");

    _authIssue = true;
    _lastHeartbeat = now;
    _upSince = 0;
    // Health is set to 0 so that this member doesn't count towards majority.
    _health = 0;
}
/**
 * Returns the "ts" field of the singleton document in 'minvalidNS' as an OpTime, or a
 * default-constructed OpTime when no such document is found.
 */
OpTime getMinValid(OperationContext* txn) {
    BSONObj minValidDoc;
    if (!Helpers::getSingleton(txn, minvalidNS, minValidDoc)) {
        return OpTime();
    }
    return minValidDoc["ts"]._opTime();
}
/**
 * Reads the most recent entry of 'rsOplogName' and returns its timestamp as an OpTime
 * (with term 0).
 *
 * Returns NoMatchingDocument when no entry is found, NoSuchKey when the entry has no
 * 'tsFieldName' field, TypeMismatch when that field is not a Timestamp, and the converted
 * status of any DBException thrown along the way.
 */
StatusWith<OpTime> ReplicationCoordinatorExternalStateImpl::loadLastOpTime(
    OperationContext* txn) {
    // TODO: handle WriteConflictExceptions below
    try {
        BSONObj newestEntry;
        const bool foundEntry = Helpers::getLast(txn, rsOplogName.c_str(), newestEntry);
        if (!foundEntry) {
            return StatusWith<OpTime>(ErrorCodes::NoMatchingDocument,
                                      str::stream() << "Did not find any entries in "
                                                    << rsOplogName);
        }

        BSONElement tsElement = newestEntry[tsFieldName];
        if (tsElement.eoo()) {
            return StatusWith<OpTime>(ErrorCodes::NoSuchKey,
                                      str::stream() << "Most recent entry in " << rsOplogName
                                                    << " missing \"" << tsFieldName
                                                    << "\" field");
        }

        const bool isTimestamp = (tsElement.type() == bsonTimestamp);
        if (!isTimestamp) {
            return StatusWith<OpTime>(ErrorCodes::TypeMismatch,
                                      str::stream() << "Expected type of \"" << tsFieldName
                                                    << "\" in most recent " << rsOplogName
                                                    << " entry to have type Timestamp, but found "
                                                    << typeName(tsElement.type()));
        }

        // TODO(siyuan) add term
        return StatusWith<OpTime>(OpTime(tsElement.timestamp(), 0));
    } catch (const DBException& ex) {
        return StatusWith<OpTime>(ex.toStatus());
    }
}
void Manager::noteARemoteIsPrimary(const Member *m) { if( rs->box.getPrimary() == m ) return; rs->_self->lhb() = ""; // this is what actually puts arbiters into ARBITER state if( rs->iAmArbiterOnly() ) { rs->box.set(MemberState::RS_ARBITER, m); return; } if (rs->box.getState().primary()) { OpTime remoteElectionTime = m->hbinfo().electionTime; LOG(1) << "another primary seen with election time " << remoteElectionTime; if (remoteElectionTime == OpTime()) { // This primary didn't deliver an electionTime in its heartbeat; // assume it's a pre-2.6 primary and always step down ourselves. log() << "stepping down; another primary seen in replicaset"; rs->relinquish(); } // 2.6 or greater primary. Step down whoever has the older election time. else if (remoteElectionTime > rs->getElectionTime()) { log() << "stepping down; another primary was elected more recently"; rs->relinquish(); } else { // else, stick around log() << "another PRIMARY detected but it should step down" " since it was elected earlier than me"; return; } } rs->box.noteRemoteIsPrimary(m); }
void ReplSource::forceResync( const char *requester ) { BSONObj info; { dbtemprelease t; oplogReader.connect(hostName); /* todo use getDatabaseNames() method here */ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info ); massert( 10385 , "Unable to get database list", ok ); } BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); while( i.moreWithEOO() ) { BSONElement e = i.next(); if ( e.eoo() ) break; string name = e.embeddedObject().getField( "name" ).valuestr(); if ( !e.embeddedObject().getBoolField( "empty" ) ) { if ( name != "local" ) { if ( only.empty() || only == name ) { resyncDrop( name.c_str(), requester ); } } } } syncedTo = OpTime(); addDbNextPass.clear(); save(); }
void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long originalTerm) { invariant(_voteRequester); invariant(!_electionWinnerDeclarer); LoseElectionGuardV1 lossGuard(this); if (_topCoord->getTerm() != originalTerm) { log() << "not becoming primary, we have been superceded already"; return; } const VoteRequester::VoteRequestResult endResult = _voteRequester->getResult(); if (endResult == VoteRequester::InsufficientVotes) { log() << "not becoming primary, we received insufficient votes"; return; } else if (endResult == VoteRequester::StaleTerm) { log() << "not becoming primary, we have been superceded already"; return; } else if (endResult != VoteRequester::SuccessfullyElected) { log() << "not becoming primary, we received an unexpected problem"; return; } log() << "election succeeded, assuming primary role in term " << _topCoord->getTerm(); // Prevent last committed optime from updating until we finish draining. _setFirstOpTimeOfMyTerm( OpTime(Timestamp(std::numeric_limits<int>::max(), 0), std::numeric_limits<int>::max())); _performPostMemberStateUpdateAction(kActionWinElection); _voteRequester.reset(nullptr); _replExecutor.signalEvent(_electionFinishedEvent); lossGuard.dismiss(); }
/**
 * Builds the OpTime of this entry from its timestamp and term. When getTerm() yields no
 * value, OpTime::kUninitializedTerm is used as the term.
 */
OpTime OplogEntry::getOpTime() const {
    long long term = OpTime::kUninitializedTerm;
    if (const auto entryTerm = getTerm()) {
        term = entryTerm.get();
    }
    return OpTime(getTimestamp(), term);
}
/**
 * Parses 'argsObj' into this object's list of updates.
 *
 * 'argsObj' may only contain the legal update-position fields and must contain an array
 * under kUpdateArrayFieldName. Each array entry is parsed into an UpdateInfo; its optime
 * may be given either as an object (parsed with bsonExtractOpTimeField) or as a bare
 * timestamp (combined with OpTime::kUninitializedTerm).
 *
 * Returns the first non-OK status encountered, or Status::OK() once all entries are parsed.
 * Entries appended before a failure remain in '_updates'.
 */
Status OldUpdatePositionArgs::initialize(const BSONObj& argsObj) {
    Status status = bsonCheckOnlyHasFieldsForCommand(
        "OldUpdatePositionArgs", argsObj, kLegalUpdatePositionFieldNames);
    if (!status.isOK())
        return status;

    // grab the array of changes
    BSONElement updateArray;
    status = bsonExtractTypedField(argsObj, kUpdateArrayFieldName, Array, &updateArray);
    if (!status.isOK())
        return status;

    // now parse each array entry into an update
    BSONObjIterator i(updateArray.Obj());
    while (i.more()) {
        BSONObj entry = i.next().Obj();
        status = bsonCheckOnlyHasFields("UpdateInfoArgs", entry, kLegalUpdateInfoFieldNames);
        if (!status.isOK())
            return status;

        OpTime opTime;
        if (entry[kOpTimeFieldName].isABSONObj()) {
            // In protocol version 1, { ts: <timestamp>, t: term }
            // Reuses the function-level 'status' so that no inner declaration shadows it.
            status = bsonExtractOpTimeField(entry, kOpTimeFieldName, &opTime);
            if (!status.isOK())
                return status;
        } else {
            Timestamp ts;
            status = bsonExtractTimestampField(entry, kOpTimeFieldName, &ts);
            if (!status.isOK())
                return status;
            opTime = OpTime(ts, OpTime::kUninitializedTerm);
        }

        // TODO(spencer): The following three fields are optional in 3.0, but should be made
        // required or ignored in 3.0
        long long cfgver;
        status = bsonExtractIntegerFieldWithDefault(entry, kConfigVersionFieldName, -1, &cfgver);
        if (!status.isOK())
            return status;

        OID rid;
        status = bsonExtractOIDFieldWithDefault(entry, kMemberRIDFieldName, OID(), &rid);
        if (!status.isOK())
            return status;

        long long memberID;
        status = bsonExtractIntegerFieldWithDefault(entry, kMemberIdFieldName, -1, &memberID);
        if (!status.isOK())
            return status;

        _updates.push_back(UpdateInfo(rid, opTime, cfgver, memberID));
    }

    return Status::OK();
}
/**
 * Takes a DBRead lock on "local.replset.minvalid" and returns the "ts" field of the
 * singleton document in 'minvalidNS' as an OpTime, or a default-constructed OpTime when no
 * such document is found.
 */
OpTime ReplSetImpl::getMinValid(OperationContext* txn) {
    Lock::DBRead readLock(txn->lockState(), "local.replset.minvalid");

    BSONObj minValidDoc;
    const bool found = Helpers::getSingleton(txn, minvalidNS, minValidDoc);
    if (!found) {
        return OpTime();
    }
    return minValidDoc["ts"]._opTime();
}
/**
 * Takes a DBRead lock on "local.replset.minvalid" and returns the "ts" field of that
 * namespace's singleton document as an OpTime, or a default-constructed OpTime when no such
 * document is found.
 */
OpTime ReplSetImpl::getMinValid() {
    Lock::DBRead readLock("local.replset.minvalid");

    BSONObj minValidDoc;
    if (!Helpers::getSingleton("local.replset.minvalid", minValidDoc)) {
        // Nothing stored.
        return OpTime();
    }
    return minValidDoc["ts"]._opTime();
}
/**
 * Creates a local operation context, takes a DBRead lock on "local.replset.minvalid" and
 * returns the "ts" field of that namespace's singleton document as an OpTime, or a
 * default-constructed OpTime when no such document is found.
 */
OpTime ReplSetImpl::getMinValid() {
    OperationContextImpl txn;  // XXX?
    Lock::DBRead readLock(txn.lockState(), "local.replset.minvalid");

    BSONObj minValidDoc;
    const bool found = Helpers::getSingleton(&txn, "local.replset.minvalid", minValidDoc);
    if (!found) {
        return OpTime();
    }
    return minValidDoc["ts"]._opTime();
}
/**
 * Writes the current date (or the next global optime, when the modifier type is not date)
 * into the element addressed by '_updatePath', creating the element and any missing part of
 * its path first when it does not exist yet.
 *
 * On success, '_preparedState->elemFound' / 'idxFound' point at the element that was set.
 * Returns a non-OK status if the element cannot be created, attached or assigned.
 */
Status ModifierCurrentDate::apply() const {
    const auto lastPartIdx = _updatePath.numParts() - 1;
    const bool destExists =
        (_preparedState->elemFound.ok() && _preparedState->idxFound == lastPartIdx);

    mutablebson::Document& doc = _preparedState->doc;
    StringData lastPart = _updatePath.getPart(lastPartIdx);

    // If the element exists already, that is what we want to work with.
    mutablebson::Element target = destExists ? _preparedState->elemFound : doc.end();

    if (!destExists) {
        // Create the final element that's going to be set in 'doc', filled in with a
        // place-holder/empty value of the right type.
        target = _typeIsDate ? doc.makeElementDate(lastPart, Date_t())
                             : doc.makeElementTimestamp(lastPart, OpTime());
        if (!target.ok()) {
            return Status(ErrorCodes::InternalError, "can't create new element");
        }

        // Two cases as far as attaching the new element goes: (a) none of the parts in the
        // element's path exist, or (b) some parts of the path exist but not all.
        if (_preparedState->elemFound.ok()) {
            _preparedState->idxFound++;
        } else {
            _preparedState->elemFound = doc.root();
            _preparedState->idxFound = 0;
        }

        // createPathAt() will complete the path and attach 'target' at the end of it.
        Status pathStatus = pathsupport::createPathAt(
            _updatePath, _preparedState->idxFound, _preparedState->elemFound, target);
        if (!pathStatus.isOK()) {
            return pathStatus;
        }
    }

    dassert(target.ok());

    // By the time we are here the element is in place and only its value needs updating.
    Status setStatus = _typeIsDate ? target.setValueDate(mongo::jsTime())
                                   : target.setValueTimestamp(getNextGlobalOptime());
    if (!setStatus.isOK()) {
        return setStatus;
    }

    // Point elemFound/idxFound at the changed element for oplog logging.
    _preparedState->elemFound = target;
    _preparedState->idxFound = lastPartIdx;

    return Status::OK();
}
void BackgroundSync::stop() { boost::unique_lock<boost::mutex> lock(_mutex); _pause = true; _syncSourceHost = HostAndPort(); _lastOpTimeFetched = OpTime(0,0); _lastFetchedHash = 0; _condvar.notify_all(); }
void BackgroundSync::stop() { boost::unique_lock<boost::mutex> lock(_mutex); _pause = true; _currentSyncTarget = NULL; _lastOpTimeFetched = OpTime(0,0); _lastH = 0; _condvar.notify_all(); }
void BackgroundSync::stop() { stdx::lock_guard<stdx::mutex> lock(_mutex); _pause = true; _syncSourceHost = HostAndPort(); _lastOpTimeFetched = OpTime(); _lastFetchedHash = 0; _appliedBufferCondition.notify_all(); _pausedCondition.notify_all(); }
/**
 * Extracts the optime of the first document returned by the query against 'candidate'.
 *
 * If the response has no documents, the first document is empty, its optime cannot be
 * parsed, or the parsed optime is null, 'candidate' is blacklisted for the corresponding
 * duration and a default-constructed OpTime is returned.
 */
OpTime SyncSourceResolver::_parseRemoteEarliestOpTime(const HostAndPort& candidate,
                                                      const Fetcher::QueryResponse& queryResponse) {
    if (queryResponse.documents.empty()) {
        // The query returned no documents at all.
        const auto blacklistUntil = _taskExecutor->now() + kOplogEmptyBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to empty oplog for "
              << kOplogEmptyBlacklistDuration << " until: " << blacklistUntil;
        _syncSourceSelector->blacklistSyncSource(candidate, blacklistUntil);
        return OpTime();
    }

    const auto& oldestDoc = queryResponse.documents.front();
    if (oldestDoc.isEmpty()) {
        // A document came back, but it has no fields.
        const auto blacklistUntil = _taskExecutor->now() + kFirstOplogEntryEmptyBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to empty first document for "
              << kFirstOplogEntryEmptyBlacklistDuration << " until: " << blacklistUntil;
        _syncSourceSelector->blacklistSyncSource(candidate, blacklistUntil);
        return OpTime();
    }

    const auto parsedOpTime = OpTime::parseFromOplogEntry(oldestDoc);
    if (!parsedOpTime.isOK()) {
        // The document could not be turned into an optime.
        const auto blacklistUntil =
            _taskExecutor->now() + kFirstOplogEntryNullTimestampBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to error parsing OpTime from the oldest"
              << " oplog entry for " << kFirstOplogEntryNullTimestampBlacklistDuration
              << " until: " << blacklistUntil << ". Error: " << parsedOpTime.getStatus()
              << ", Entry: " << redact(oldestDoc);
        _syncSourceSelector->blacklistSyncSource(candidate, blacklistUntil);
        return OpTime();
    }

    if (parsedOpTime.getValue().isNull()) {
        // Parsing succeeded, but the resulting optime is null.
        const auto blacklistUntil =
            _taskExecutor->now() + kFirstOplogEntryNullTimestampBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to null timestamp in first document for "
              << kFirstOplogEntryNullTimestampBlacklistDuration << " until: " << blacklistUntil;
        _syncSourceSelector->blacklistSyncSource(candidate, blacklistUntil);
        return OpTime();
    }

    return parsedOpTime.getValue();
}
void BackgroundSync::stop() { stdx::lock_guard<stdx::mutex> lock(_mutex); _stopped = true; _syncSourceHost = HostAndPort(); _lastOpTimeFetched = OpTime(); _lastFetchedHash = 0; if (_oplogFetcher) { _oplogFetcher->shutdown(); } }
/**
 * Under a DBRead lock on 'rsoplog', reads the last entry of 'rsoplog' and passes its "ts"
 * value to the global replication coordinator as this node's last optime. When no entry is
 * found, a default-constructed OpTime is passed instead.
 *
 * Unless 'quiet' is set, a null "ts" in the last entry raises user assertion 13290.
 */
void ReplSetImpl::loadLastOpTimeWritten(OperationContext* txn, bool quiet) {
    Lock::DBRead readLock(txn->lockState(), rsoplog);

    BSONObj newestEntry;
    if (!Helpers::getLast(txn, rsoplog, newestEntry)) {
        // No entry found.
        getGlobalReplicationCoordinator()->setMyLastOptime(txn, OpTime());
        return;
    }

    OpTime lastOpTime = newestEntry["ts"]._opTime();
    uassert(13290, "bad replSet oplog entry?", quiet || !lastOpTime.isNull());
    getGlobalReplicationCoordinator()->setMyLastOptime(txn, lastOpTime);
}
void ReplSource::syncToTailOfRemoteLog() { string _ns = ns(); BSONObjBuilder b; if ( !only.empty() ) { b.appendRegex("ns", string("^") + only); } BSONObj last = oplogReader.findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) ); if ( !last.isEmpty() ) { BSONElement ts = last.getField( "ts" ); massert( 10386 , "non Date ts found: " + last.toString(), ts.type() == Date || ts.type() == Timestamp ); syncedTo = OpTime( ts.date() ); } }
/**
 * Moves to RS_RECOVERING, drops "local.oplog.rs", flags an initial sync request on
 * 'theReplSet', and resets 'lastOpTimeWritten' and '_veto'.
 *
 * 'errmsg' is not written to by this function. Always returns true.
 */
bool ReplSetImpl::resync(string& errmsg) {
    changeState(MemberState::RS_RECOVERING);

    Client::Context ctx("local");
    ctx.db()->dropCollection("local.oplog.rs");

    {
        // The request flag is only touched while holding initialSyncMutex.
        boost::unique_lock<boost::mutex> initialSyncLock(theReplSet->initialSyncMutex);
        theReplSet->initialSyncRequested = true;
    }

    _veto.clear();
    lastOpTimeWritten = OpTime();
    return true;
}
/**
 * Builds an OpTime from the timestamp and term fields of 'obj'.
 *
 * The timestamp field is required; the term falls back to kUninitializedTerm when absent.
 * Returns the extraction error if either field cannot be read.
 */
StatusWith<OpTime> OpTime::parseFromOplogEntry(const BSONObj& obj) {
    Timestamp timestamp;
    Status tsStatus = bsonExtractTimestampField(obj, kTimestampFieldName, &timestamp);
    if (!tsStatus.isOK()) {
        return tsStatus;
    }

    // Default to -1 if the term is absent.
    long long term;
    Status termStatus =
        bsonExtractIntegerFieldWithDefault(obj, kTermFieldName, kUninitializedTerm, &term);
    if (!termStatus.isOK()) {
        return termStatus;
    }

    return OpTime(timestamp, term);
}
void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long originalTerm) { invariant(_voteRequester); LoseElectionGuardV1 lossGuard(this); LockGuard lk(_topoMutex); if (_topCoord->getTerm() != originalTerm) { log() << "not becoming primary, we have been superceded already"; return; } const VoteRequester::Result endResult = _voteRequester->getResult(); switch (endResult) { case VoteRequester::Result::kInsufficientVotes: log() << "not becoming primary, we received insufficient votes"; return; case VoteRequester::Result::kStaleTerm: log() << "not becoming primary, we have been superceded already"; return; case VoteRequester::Result::kSuccessfullyElected: log() << "election succeeded, assuming primary role in term " << _topCoord->getTerm(); break; } { // Mark all nodes that responded to our vote request as up to avoid immediately // relinquishing primary. stdx::lock_guard<stdx::mutex> lk(_mutex); Date_t now = _replExecutor.now(); const unordered_set<HostAndPort> liveNodes = _voteRequester->getResponders(); for (auto& nodeInfo : _slaveInfo) { if (liveNodes.count(nodeInfo.hostAndPort)) { nodeInfo.down = false; nodeInfo.lastUpdate = now; } } } // Prevent last committed optime from updating until we finish draining. _setFirstOpTimeOfMyTerm( OpTime(Timestamp(std::numeric_limits<int>::max(), 0), std::numeric_limits<int>::max())); _performPostMemberStateUpdateAction(kActionWinElection); _voteRequester.reset(nullptr); _replExecutor.signalEvent(_electionFinishedEvent); lossGuard.dismiss(); }
/**
 * Produces the next OpTime from the wall clock and the previously issued value 'last'.
 *
 *  - Same second as 'last': the increment of 'last' is bumped and the new value is returned.
 *  - Clock went backwards relative to 'last': the value is obtained from skewed().
 *  - Later second: 'last' is reset to (t, 1) and returned.
 *
 * All waiters on 'notifier' are woken before returning.
 *
 * The value computed for this call ('result') is what gets returned, rather than re-reading
 * the shared 'last' after the notification.
 *
 * NOTE(review): this function reads and writes the shared 'last' without taking a lock
 * itself -- presumably callers hold the relevant mutex; confirm against call sites.
 */
/*static*/ OpTime OpTime::_now() {
    OpTime result;
    unsigned t = (unsigned)time(0);
    if (last.secs == t) {
        last.i++;
        result = last;
    } else if (t < last.secs) {
        result = skewed();  // separate function to keep out of the hot code path
    } else {
        last = OpTime(t, 1);
        result = last;
    }
    notifier.notify_all();
    return result;
}
/**
 * Starts the resolver.
 *
 * Under '_mutex', moves the state from kPreStart to kRunning; any other state yields an
 * error status (IllegalOperation when already running, ShutdownInProgress when shutting
 * down or complete). Once the lock is released, begins choosing and probing sync sources
 * with a default-constructed OpTime.
 */
Status SyncSourceResolver::startup() {
    {
        stdx::lock_guard<stdx::mutex> guard(_mutex);
        switch (_state) {
            case State::kRunning:
                return Status(ErrorCodes::IllegalOperation,
                              "sync source resolver already started");
            case State::kShuttingDown:
                return Status(ErrorCodes::ShutdownInProgress,
                              "sync source resolver shutting down");
            case State::kComplete:
                return Status(ErrorCodes::ShutdownInProgress, "sync source resolver completed");
            case State::kPreStart:
                _state = State::kRunning;
                break;
        }
    }

    return _chooseAndProbeNextSyncSource(OpTime());
}
// Applies a batch of oplog entries, by using a set of threads to apply the operations and then // writes the oplog entries to the local oplog. OpTime SyncTail::multiApply(OperationContext* txn, const OpQueue& ops) { invariant(_applyFunc); if (getGlobalServiceContext()->getGlobalStorageEngine()->isMmapV1()) { // Use a ThreadPool to prefetch all the operations in a batch. prefetchOps(ops.getDeque(), &_prefetcherPool); } std::vector<std::vector<BSONObj>> writerVectors(replWriterThreadCount); fillWriterVectors(txn, ops.getDeque(), &writerVectors); LOG(2) << "replication batch size is " << ops.getDeque().size() << endl; // We must grab this because we're going to grab write locks later. // We hold this mutex the entire time we're writing; it doesn't matter // because all readers are blocked anyway. stdx::lock_guard<SimpleMutex> fsynclk(filesLockedFsync); // stop all readers until we're done Lock::ParallelBatchWriterMode pbwm(txn->lockState()); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); if (replCoord->getMemberState().primary() && !replCoord->isWaitingForApplierToDrain()) { severe() << "attempting to replicate ops while primary"; fassertFailed(28527); } applyOps(writerVectors, &_writerPool, _applyFunc, this); OpTime lastOpTime; { ON_BLOCK_EXIT([&] { _writerPool.join(); }); std::vector<BSONObj> raws; raws.reserve(ops.getDeque().size()); for (auto&& op : ops.getDeque()) { raws.emplace_back(op.raw); } lastOpTime = writeOpsToOplog(txn, raws); if (inShutdown()) { return OpTime(); } } // We have now written all database writes and updated the oplog to match. return lastOpTime; }
void BackgroundSync::stop() { { boost::unique_lock<boost::mutex> lock(_mutex); _pause = true; _currentSyncTarget = NULL; _lastOpTimeFetched = OpTime(0,0); _lastH = 0; _queueCounter.numElems = 0; } if (!_buffer.empty()) { log() << "replset " << _buffer.size() << " ops were not applied from buffer, this should " << "cause a rollback on the former primary" << rsLog; } // get rid of pending ops _buffer.clear(); }
/**
 * Parses the optional read-after-optime section of 'cmdObj' into '_opTime'.
 *
 * Returns Status::OK() without touching '_opTime' when the root field is absent. Returns
 * FailedToParse when the root field is present but not an object, and the extraction error
 * when the nested optime object, its timestamp, or its term cannot be read.
 */
Status ReadAfterOpTimeArgs::initialize(const BSONObj& cmdObj) {
    auto afterElem = cmdObj[ReadAfterOpTimeArgs::kRootFieldName];

    if (afterElem.eoo()) {
        // Nothing was specified; this is not an error.
        return Status::OK();
    }

    if (!afterElem.isABSONObj()) {
        return Status(ErrorCodes::FailedToParse, "'after' field should be an object");
    }

    BSONObj readAfterObj = afterElem.Obj();
    BSONElement opTimeElem;
    auto opTimeStatus = bsonExtractTypedField(
        readAfterObj, ReadAfterOpTimeArgs::kOpTimeFieldName, Object, &opTimeElem);
    if (!opTimeStatus.isOK()) {
        return opTimeStatus;
    }

    BSONObj opTimeObj = opTimeElem.Obj();

    // The timestamp is extracted through an out-parameter, so its address is passed.
    Timestamp timestamp;
    auto timestampStatus = bsonExtractTimestampField(
        opTimeObj, ReadAfterOpTimeArgs::kOpTimestampFieldName, &timestamp);
    if (!timestampStatus.isOK()) {
        return timestampStatus;
    }

    long long termNumber;
    auto termStatus =
        bsonExtractIntegerField(opTimeObj, ReadAfterOpTimeArgs::kOpTermFieldName, &termNumber);
    if (!termStatus.isOK()) {
        return termStatus;
    }

    _opTime = OpTime(timestamp, termNumber);

    return Status::OK();
}