StatusWith<string> isValidShard(const string& name, const ConnectionString& shardConnectionString, ScopedDbConnection& conn) { if (conn->type() == ConnectionString::SYNC) { return Status(ErrorCodes::BadValue, "can't use sync cluster as a shard; for a replica set, " "you have to use <setname>/<server1>,<server2>,..."); } BSONObj resIsMongos; // (ok == 0) implies that it is a mongos if (conn->runCommand("admin", BSON("isdbgrid" << 1), resIsMongos)) { return Status(ErrorCodes::BadValue, "can't add a mongos process as a shard"); } BSONObj resIsMaster; if (!conn->runCommand("admin", BSON("isMaster" << 1), resIsMaster)) { return Status(ErrorCodes::OperationFailed, str::stream() << "failed running isMaster: " << resIsMaster); } // if the shard has only one host, make sure it is not part of a replica set string setName = resIsMaster["setName"].str(); string commandSetName = shardConnectionString.getSetName(); if (commandSetName.empty() && !setName.empty()) { return Status(ErrorCodes::BadValue, str::stream() << "host is part of set " << setName << "; " << "use replica set url format " << "<setname>/<server1>,<server2>, ..."); } if (!commandSetName.empty() && setName.empty()) { return Status(ErrorCodes::OperationFailed, str::stream() << "host did not return a set name; " << "is the replica set still initializing? " << resIsMaster); } // if the shard is part of replica set, make sure it is the right one if (!commandSetName.empty() && (commandSetName != setName)) { return Status(ErrorCodes::OperationFailed, str::stream() << "host is part of a different set: " << setName); } if (setName.empty()) { // check this isn't a --configsvr BSONObj res; bool ok = conn->runCommand("admin", BSON("replSetGetStatus" << 1), res); if(!ok && res["info"].type() == String && res["info"].String() == "configsvr") { return Status(ErrorCodes::BadValue, "the specified mongod is a --configsvr and " "should thus not be a shard server"); } } // if the shard is part of a replica set, // make sure all the hosts mentioned in 'shardConnectionString' are part of // the set. It is fine if not all members of the set are present in 'shardConnectionString'. bool foundAll = true; string offendingHost; if (!commandSetName.empty()) { set<string> hostSet; BSONObjIterator iter(resIsMaster["hosts"].Obj()); while (iter.more()) { hostSet.insert(iter.next().String()); // host:port } if (resIsMaster["passives"].isABSONObj()) { BSONObjIterator piter(resIsMaster["passives"].Obj()); while (piter.more()) { hostSet.insert(piter.next().String()); // host:port } } if (resIsMaster["arbiters"].isABSONObj()) { BSONObjIterator piter(resIsMaster["arbiters"].Obj()); while (piter.more()) { hostSet.insert(piter.next().String()); // host:port } } vector<HostAndPort> hosts = shardConnectionString.getServers(); for (size_t i = 0; i < hosts.size(); i++) { if (!hosts[i].hasPort()) { hosts[i] = HostAndPort(hosts[i].host(), hosts[i].port()); } string host = hosts[i].toString(); // host:port if (hostSet.find(host) == hostSet.end()) { offendingHost = host; foundAll = false; break; } } } if (!foundAll) { return Status(ErrorCodes::OperationFailed, str::stream() << "in seed list " << shardConnectionString.toString() << ", host " << offendingHost << " does not belong to replica set " << setName); } string shardName(name); // shard name defaults to the name of the replica set if (name.empty() && !setName.empty()) { shardName = setName; } // disallow adding shard replica set with name 'config' if (shardName == "config") { return Status(ErrorCodes::BadValue, "use of shard replica set with name 'config' is not allowed"); } return shardName; }
void BackgroundSync::produce(OperationContext* txn) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { boost::unique_lock<boost::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } // Wait until we've applied the ops we have before we choose a sync target while (!_appliedBuffer && !inShutdown()) { _condvar.wait(lock); } if (inShutdown()) { return; } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; { boost::unique_lock<boost::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } _syncSourceReader.resetConnection(); _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord); { boost::unique_lock<boost::mutex> lock(_mutex); // no server found if (_syncSourceReader.getHost().empty()) { lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = _syncSourceReader.getHost(); _replCoord->signalUpstreamUpdater(); } _syncSourceReader.tailingQueryGTE(rsoplog, lastOpTimeFetched); // if target cut connections between connecting and querying (for // example, because it stepped down) we might not have a cursor if (!_syncSourceReader.haveCursor()) { return; } if (_rollbackIfNeeded(txn, _syncSourceReader)) { stop(); return; } while (!inShutdown()) { if (!_syncSourceReader.moreInCurrentBatch()) { // Check some things periodically // (whenever we run out of items in the // current cursor batch) int bs = _syncSourceReader.currentBatchMessageSize(); if( bs > 0 && bs < BatchIsSmallish ) { // on a very low latency network, if we don't wait a little, we'll be // getting ops to write almost one at a time. this will both be expensive // for the upstream server as well as potentially defeating our parallel // application of batches on the secondary. // // the inference here is basically if the batch is really small, we are // "caught up". // sleepmillis(SleepToAllowBatchingMillis); } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getCurrentMemberState().primary()) { return; } // re-evaluate quality of sync target if (shouldChangeSyncSource()) { return; } { //record time for each getmore TimerHolder batchTimer(&getmoreReplStats); // This calls receiveMore() on the oplogreader cursor. // It can wait up to five seconds for more data. _syncSourceReader.more(); } networkByteStats.increment(_syncSourceReader.currentBatchMessageSize()); if (!_syncSourceReader.moreInCurrentBatch()) { // If there is still no data from upstream, check a few more things // and then loop back for another pass at getting more data { boost::unique_lock<boost::mutex> lock(_mutex); if (_pause) { return; } } _syncSourceReader.tailCheck(); if( !_syncSourceReader.haveCursor() ) { LOG(1) << "replSet end syncTail pass"; return; } continue; } } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getCurrentMemberState().primary()) { LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer"; return; } // At this point, we are guaranteed to have at least one thing to read out // of the oplogreader cursor. BSONObj o = _syncSourceReader.nextSafe().getOwned(); opsReadStats.increment(); { boost::unique_lock<boost::mutex> lock(_mutex); _appliedBuffer = false; } OCCASIONALLY { LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes"; } bufferCountGauge.increment(); bufferSizeGauge.increment(getSize(o)); _buffer.push(o); { boost::unique_lock<boost::mutex> lock(_mutex); _lastFetchedHash = o["h"].numberLong(); _lastOpTimeFetched = o["ts"]._opTime(); LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty(); } } }
Status MemberConfig::initialize(const BSONObj& mcfg) { Status status = bsonCheckOnlyHasFields( "replica set member configuration", mcfg, kLegalMemberConfigFieldNames); if (!status.isOK()) return status; // // Parse _id field. // BSONElement idElement = mcfg[kIdFieldName]; if (idElement.eoo()) { return Status(ErrorCodes::NoSuchKey, str::stream() << kIdFieldName << " field is missing"); } if (!idElement.isNumber()) { return Status(ErrorCodes::TypeMismatch, str::stream() << kIdFieldName << " field has non-numeric type " << typeName(idElement.type())); } _id = idElement.numberInt(); // // Parse h field. // std::string hostAndPortString; status = bsonExtractStringField(mcfg, kHostFieldName, &hostAndPortString); if (!status.isOK()) return status; boost::trim(hostAndPortString); status = _host.initialize(hostAndPortString); if (!status.isOK()) return status; if (!_host.hasPort()) { // make port explicit even if default. _host = HostAndPort(_host.host(), _host.port()); } // // Parse votes field. // BSONElement votesElement = mcfg[kVotesFieldName]; int votes; if (votesElement.eoo()) { votes = kVotesFieldDefault; } else if (votesElement.isNumber()) { votes = votesElement.numberInt(); } else { return Status(ErrorCodes::TypeMismatch, str::stream() << kVotesFieldName << " field value has non-numeric type " << typeName(votesElement.type())); } if (votes != 0 && votes != 1) { return Status(ErrorCodes::BadValue, str::stream() << kVotesFieldName << " field value is " << votesElement.numberInt() << " but must be 0 or 1"); } _isVoter = bool(votes); // // Parse priority field. // BSONElement priorityElement = mcfg[kPriorityFieldName]; if (priorityElement.eoo()) { _priority = kPriorityFieldDefault; } else if (priorityElement.isNumber()) { _priority = priorityElement.numberDouble(); } else { return Status(ErrorCodes::TypeMismatch, str::stream() << kPriorityFieldName << " field has non-numeric type " << typeName(priorityElement.type())); } // // Parse arbiterOnly field. // status = bsonExtractBooleanFieldWithDefault(mcfg, kArbiterOnlyFieldName, kArbiterOnlyFieldDefault, &_arbiterOnly); if (!status.isOK()) return status; // // Parse slaveDelay field. // BSONElement slaveDelayElement = mcfg[kSlaveDelayFieldName]; if (slaveDelayElement.eoo()) { _slaveDelay = kSlaveDelayFieldDefault; } else if (slaveDelayElement.isNumber()) { _slaveDelay = Seconds(slaveDelayElement.numberInt()); } else { return Status(ErrorCodes::TypeMismatch, str::stream() << kSlaveDelayFieldName << " field value has non-numeric type " << typeName(slaveDelayElement.type())); } // // Parse hidden field. // status = bsonExtractBooleanFieldWithDefault(mcfg, kHiddenFieldName, kHiddenFieldDefault, &_hidden); if (!status.isOK()) return status; // // Parse buildIndexes field. // status = bsonExtractBooleanFieldWithDefault(mcfg, kBuildIndexesFieldName, kBuildIndexesFieldDefault, &_buildIndexes); if (!status.isOK()) return status; // // Parse "tags" field. // _tags.clear(); BSONElement tagsElement; status = bsonExtractTypedField(mcfg, kTagsFieldName, Object, &tagsElement); if (status.isOK()) { for (BSONObj::iterator tagIter(tagsElement.Obj()); tagIter.more();) { const BSONElement& tag = tagIter.next(); if (tag.type() != String) { return Status(ErrorCodes::TypeMismatch, str::stream() << "tags." << tag.fieldName() << " field has non-string value of type " << typeName(tag.type())); } _tags.push_back(ReplicaSetTag(tag.fieldName(), tag.String())); } } else if (ErrorCodes::NoSuchKey != status) { return status; } return Status::OK(); }
Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc, long long term) { // Old versions set this even though they returned not "ok" _mismatch = doc[kMismatchFieldName].trueValue(); if (_mismatch) return Status(ErrorCodes::InconsistentReplicaSetNames, "replica set name doesn't match."); // Old versions sometimes set the replica set name ("set") but ok:0 const BSONElement replSetNameElement = doc[kReplSetFieldName]; if (replSetNameElement.eoo()) { _setName.clear(); } else if (replSetNameElement.type() != String) { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kReplSetFieldName << "\" field in response to replSetHeartbeat to have " "type String, but found " << typeName(replSetNameElement.type())); } else { _setName = replSetNameElement.String(); } if (_setName.empty() && !doc[kOkFieldName].trueValue()) { std::string errMsg = doc[kErrMsgFieldName].str(); BSONElement errCodeElem = doc[kErrorCodeFieldName]; if (errCodeElem.ok()) { if (!errCodeElem.isNumber()) return Status(ErrorCodes::BadValue, "Error code is not a number!"); int errorCode = errCodeElem.numberInt(); return Status(ErrorCodes::Error(errorCode), errMsg); } return Status(ErrorCodes::UnknownError, errMsg); } const BSONElement hasDataElement = doc[kHasDataFieldName]; _hasDataSet = !hasDataElement.eoo(); _hasData = hasDataElement.trueValue(); const BSONElement electionTimeElement = doc[kElectionTimeFieldName]; if (electionTimeElement.eoo()) { _electionTimeSet = false; } else if (electionTimeElement.type() == bsonTimestamp) { _electionTimeSet = true; _electionTime = electionTimeElement.timestamp(); } else if (electionTimeElement.type() == Date) { _electionTimeSet = true; _electionTime = Timestamp(electionTimeElement.date()); } else { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kElectionTimeFieldName << "\" field in response to replSetHeartbeat " "command to have type Date or Timestamp, but found type " << typeName(electionTimeElement.type())); } const BSONElement timeElement = doc[kTimeFieldName]; if (timeElement.eoo()) { _timeSet = false; } else if (timeElement.isNumber()) { _timeSet = true; _time = Seconds(timeElement.numberLong()); } else { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kTimeFieldName << "\" field in response to replSetHeartbeat " "command to have a numeric type, but found type " << typeName(timeElement.type())); } _isReplSet = doc[kIsReplSetFieldName].trueValue(); // In order to support both the 3.0(V0) and 3.2(V1) heartbeats we must parse the OpTime // field based on its type. If it is a Date, we parse it as the timestamp and use // initialize's term argument to complete the OpTime type. If it is an Object, then it's // V1 and we construct an OpTime out of its nested fields. const BSONElement opTimeElement = doc[kOpTimeFieldName]; if (opTimeElement.eoo()) { _opTimeSet = false; } else if (opTimeElement.type() == bsonTimestamp) { _opTimeSet = true; _opTime = OpTime(opTimeElement.timestamp(), term); } else if (opTimeElement.type() == Date) { _opTimeSet = true; _opTime = OpTime(Timestamp(opTimeElement.date()), term); } else if (opTimeElement.type() == Object) { BSONObj opTime = opTimeElement.Obj(); Timestamp ts; Status status = bsonExtractTimestampField(opTime, kTimestampFieldName, &ts); if (!status.isOK()) return status; long long term; status = bsonExtractIntegerField(opTime, kTermFieldName, &term); if (!status.isOK()) return status; _opTimeSet = true; _opTime = OpTime(ts, term); // since a v1 OpTime was in the response, the member must be part of a replset _isReplSet = true; } else { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kOpTimeFieldName << "\" field in response to replSetHeartbeat " "command to have type Date or Timestamp, but found type " << typeName(opTimeElement.type())); } const BSONElement electableElement = doc[kIsElectableFieldName]; if (electableElement.eoo()) { _electableSet = false; } else { _electableSet = true; _electable = electableElement.trueValue(); } const BSONElement memberStateElement = doc[kMemberStateFieldName]; if (memberStateElement.eoo()) { _stateSet = false; } else if (memberStateElement.type() != NumberInt && memberStateElement.type() != NumberLong) { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kMemberStateFieldName << "\" field in response to replSetHeartbeat " "command to have type NumberInt or NumberLong, but found type " << typeName(memberStateElement.type())); } else { long long stateInt = memberStateElement.numberLong(); if (stateInt < 0 || stateInt > MemberState::RS_MAX) { return Status(ErrorCodes::BadValue, str::stream() << "Value for \"" << kMemberStateFieldName << "\" in response to replSetHeartbeat is " "out of range; legal values are non-negative and no more than " << MemberState::RS_MAX); } _stateSet = true; _state = MemberState(static_cast<int>(stateInt)); } _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue(); // Not required for the case of uninitialized members -- they have no config const BSONElement configVersionElement = doc[kConfigVersionFieldName]; // If we have an optime then we must have a configVersion if (_opTimeSet && configVersionElement.eoo()) { return Status(ErrorCodes::NoSuchKey, str::stream() << "Response to replSetHeartbeat missing required \"" << kConfigVersionFieldName << "\" field even though initialized"); } // If there is a "v" (config version) then it must be an int. if (!configVersionElement.eoo() && configVersionElement.type() != NumberInt) { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kConfigVersionFieldName << "\" field in response to replSetHeartbeat to have " "type NumberInt, but found " << typeName(configVersionElement.type())); } _configVersion = configVersionElement.numberInt(); const BSONElement hbMsgElement = doc[kHbMessageFieldName]; if (hbMsgElement.eoo()) { _hbmsg.clear(); } else if (hbMsgElement.type() != String) { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kHbMessageFieldName << "\" field in response to replSetHeartbeat to have " "type String, but found " << typeName(hbMsgElement.type())); } else { _hbmsg = hbMsgElement.String(); } const BSONElement syncingToElement = doc[kSyncSourceFieldName]; if (syncingToElement.eoo()) { _syncingTo = HostAndPort(); } else if (syncingToElement.type() != String) { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kSyncSourceFieldName << "\" field in response to replSetHeartbeat to " "have type String, but found " << typeName(syncingToElement.type())); } else { _syncingTo = HostAndPort(syncingToElement.String()); } const BSONElement rsConfigElement = doc[kConfigFieldName]; if (rsConfigElement.eoo()) { _configSet = false; _config = ReplicaSetConfig(); return Status::OK(); } else if (rsConfigElement.type() != Object) { return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << kConfigFieldName << "\" in response to replSetHeartbeat to have type " "Object, but found " << typeName(rsConfigElement.type())); } _configSet = true; return _config.initialize(rsConfigElement.Obj()); }
Status MemberConfig::initialize(const BSONObj& mcfg, ReplicaSetTagConfig* tagConfig) { Status status = bsonCheckOnlyHasFields( "replica set member configuration", mcfg, kLegalMemberConfigFieldNames); if (!status.isOK()) return status; // // Parse _id field. // BSONElement idElement = mcfg[kIdFieldName]; if (idElement.eoo()) { return Status(ErrorCodes::NoSuchKey, str::stream() << kIdFieldName << " field is missing"); } if (!idElement.isNumber()) { return Status(ErrorCodes::TypeMismatch, str::stream() << kIdFieldName << " field has non-numeric type " << typeName(idElement.type())); } _id = idElement.numberInt(); // // Parse h field. // std::string hostAndPortString; status = bsonExtractStringField(mcfg, kHostFieldName, &hostAndPortString); if (!status.isOK()) return status; boost::trim(hostAndPortString); status = _host.initialize(hostAndPortString); if (!status.isOK()) return status; if (!_host.hasPort()) { // make port explicit even if default. _host = HostAndPort(_host.host(), _host.port()); } // // Parse votes field. // BSONElement votesElement = mcfg[kVotesFieldName]; if (votesElement.eoo()) { _votes = kVotesFieldDefault; } else if (votesElement.isNumber()) { _votes = votesElement.numberInt(); } else { return Status(ErrorCodes::TypeMismatch, str::stream() << kVotesFieldName << " field value has non-numeric type " << typeName(votesElement.type())); } // // Parse priority field. // BSONElement priorityElement = mcfg[kPriorityFieldName]; if (priorityElement.eoo()) { _priority = kPriorityFieldDefault; } else if (priorityElement.isNumber()) { _priority = priorityElement.numberDouble(); } else { return Status(ErrorCodes::TypeMismatch, str::stream() << kPriorityFieldName << " field has non-numeric type " << typeName(priorityElement.type())); } // // Parse arbiterOnly field. // status = bsonExtractBooleanFieldWithDefault( mcfg, kArbiterOnlyFieldName, kArbiterOnlyFieldDefault, &_arbiterOnly); if (!status.isOK()) return status; // // Parse slaveDelay field. // BSONElement slaveDelayElement = mcfg[kSlaveDelayFieldName]; if (slaveDelayElement.eoo()) { _slaveDelay = kSlaveDelayFieldDefault; } else if (slaveDelayElement.isNumber()) { _slaveDelay = Seconds(slaveDelayElement.numberInt()); } else { return Status(ErrorCodes::TypeMismatch, str::stream() << kSlaveDelayFieldName << " field value has non-numeric type " << typeName(slaveDelayElement.type())); } // // Parse hidden field. // status = bsonExtractBooleanFieldWithDefault(mcfg, kHiddenFieldName, kHiddenFieldDefault, &_hidden); if (!status.isOK()) return status; // // Parse buildIndexes field. // status = bsonExtractBooleanFieldWithDefault( mcfg, kBuildIndexesFieldName, kBuildIndexesFieldDefault, &_buildIndexes); if (!status.isOK()) return status; // // Parse "tags" field. // _tags.clear(); BSONElement tagsElement; status = bsonExtractTypedField(mcfg, kTagsFieldName, Object, &tagsElement); if (status.isOK()) { for (BSONObj::iterator tagIter(tagsElement.Obj()); tagIter.more();) { const BSONElement& tag = tagIter.next(); if (tag.type() != String) { return Status(ErrorCodes::TypeMismatch, str::stream() << "tags." << tag.fieldName() << " field has non-string value of type " << typeName(tag.type())); } _tags.push_back(tagConfig->makeTag(tag.fieldNameStringData(), tag.valueStringData())); } } else if (ErrorCodes::NoSuchKey != status) { return status; } // // Add internal tags based on other member properties. // // Add a voter tag if this non-arbiter member votes; use _id for uniquity. const std::string id = str::stream() << _id; if (isVoter() && !_arbiterOnly) { _tags.push_back(tagConfig->makeTag(kInternalVoterTagName, id)); } // Add an electable tag if this member is electable. if (isElectable()) { _tags.push_back(tagConfig->makeTag(kInternalElectableTagName, id)); } // Add a tag for generic counting of this node. if (!_arbiterOnly) { _tags.push_back(tagConfig->makeTag(kInternalAllTagName, id)); } return Status::OK(); }
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() || inShutdownStrict()) { return; } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } OplogReader syncSourceReader; syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord); // no server found if (syncSourceReader.getHost().empty()) { sleepsecs(1); // if there is no one to sync from return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_pause) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; _syncSourceHost = syncSourceReader.getHost(); _replCoord->signalUpstreamUpdater(); } const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout); // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared // if sync source feedback fails. const HostAndPort source = syncSourceReader.getHost(); syncSourceReader.resetConnection(); // no more references to oplog reader from here on. // If this status is not OK after the fetcher returns from wait(), // proceed to execute rollback Status remoteOplogStartStatus = Status::OK(); auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback, this, stdx::placeholders::_1, stdx::placeholders::_3, stdx::cref(source), lastOpTimeFetched, lastHashFetched, &remoteOplogStartStatus); BSONObjBuilder cmdBob; cmdBob.append("find", nsToCollectionSubstring(rsOplogName)); cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))); cmdBob.append("tailable", true); cmdBob.append("oplogReplay", true); cmdBob.append("awaitData", true); cmdBob.append("maxTimeMS", durationCount<Milliseconds>(fetcherMaxTimeMS)); BSONObjBuilder metadataBob; if (_replCoord->isV1ElectionProtocol()) { cmdBob.append("term", _replCoord->getTerm()); metadataBob.append(rpc::kReplSetMetadataFieldName, 1); } auto cmdObj = cmdBob.obj(); auto metadataObj = metadataBob.obj(); Fetcher fetcher( taskExecutor, source, nsToDatabase(rsOplogName), cmdObj, fetcherCallback, metadataObj); auto scheduleStatus = fetcher.schedule(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } fetcher.wait(); // If the background sync is paused after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isPaused()) { return; } // Execute rollback if necessary. // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. if (!remoteOplogStartStatus.isOK()) { const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), oplogSocketTimeout)); }; return connection->get(); }; log() << "starting rollback: " << remoteOplogStartStatus; _rollback(txn, source, getConnection); stop(); } }
HostAndPort ReplicationCoordinatorExternalStateImpl::getClientHostAndPort( const OperationContext* txn) { return HostAndPort(txn->getClient()->clientAddress(true)); }
virtual bool run( const string& badns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { if( cmdObj["targets"].trueValue() == false ){ result.append( "errmsg" , "No targets specified." ); return false; } if( cmdObj.getObjectField("targets").couldBeArray() ){ vector<BSONElement> targets = cmdObj.getField("targets").Array(); for( vector<BSONElement>::iterator i=targets.begin(); i!=targets.end(); ++i){ BSONElement be = *i; BSONObj bo = be.embeddedObject(); HostAndPort hp; try{ hp = HostAndPort( bo["server"].valuestrsafe() ); } catch( std::exception& e ){ result.append( bo["server"].valuestrsafe() , "Not a valid host:port pair." ); continue; } // delete exisitng target if( bo["remove"].trueValue() ){ if( PingMonitorThreadManager::hasTarget( hp ) ){ PingMonitorThreadManager::removeTarget( hp ); result.append( bo["server"].valuestrsafe() , "Removed from targets" ); } else result.append( bo["server"].valuestrsafe() , "Server does not exist amongst monitoring targets" ); } // update target settings else if( PingMonitorThreadManager::hasTarget( hp ) ){ if( bo["collectionPrefix"].trueValue() ){ result.append( bo["server"].valuestrsafe() , "Cannot reset collectionPrefix. To use this prefix, ensure no other target is using it and then remove this target using db.runCommand( { pingMonitorConfigure : 1 , targets : [ { server : <host:port> , remove : true } ] } ) , then re-create the target with the desired collectionPrefix. Warning: by doing this, all monitoring data up to this point will be lost." ); continue; } if( bo.getField("on").eoo() == false ){ PingMonitorThreadManager::amendTarget( hp , bo.getBoolField("on") ); } if( bo["interval"].trueValue() && bo["interval"].isNumber() ) PingMonitorThreadManager::amendTarget( hp , bo["interval"].numberInt() ); result.append( bo["server"].valuestrsafe() , PingMonitorThreadManager::getTargetInfo( hp )["info"].embeddedObject() ); } // create new target else{ bool on = true; int interval = 15; string collectionPrefix = ""; if( bo.getField("on").eoo() == false ){ on = bo.getBoolField("on"); } if( bo["interval"].trueValue() && bo["interval"].isNumber() ) interval = bo["interval"].numberInt(); if( bo["collectionPrefix"].trueValue() ) collectionPrefix = bo["collectionPrefix"].valuestrsafe(); BSONObj success = PingMonitorThreadManager::createTarget( hp , on , interval , collectionPrefix ); if( success["ok"].boolean() ){ result.append( bo["server"].valuestrsafe() , PingMonitorThreadManager::getTargetInfo( hp )["info"].embeddedObject() ); continue; } else result.append( bo["server"].valuestrsafe() , success["errmsg"].valuestrsafe() ); } } } else{ result.append("errmsg" , "Invalid input format for targets. Please use an array of JSON documents, for example, db.runCommand( { pingMonitorConfigure : 1 , targets : [ { server : <host:port> , on : true } )." ); } return true; }
HostAndPort ReplicationCoordinatorMock::chooseNewSyncSource(const OpTime& lastOpTimeFetched) { return HostAndPort(); }
HostAndPort ReplicationCoordinatorMock::chooseNewSyncSource() { invariant(false); return HostAndPort(); }
HostAndPort ReplicationCoordinatorMock::getMyHostAndPort() const { return HostAndPort(); }
HostAndPort HostAndPort::fromParts(const ::std::string& host, unsigned int port) { return HostAndPort(host, port); }
HostAndPort HostAndPort::fromHost(const ::std::string& host) { return HostAndPort(host); }
void BackgroundSync::clearSyncTarget() { boost::unique_lock<boost::mutex> lock(_mutex); _syncSourceHost = HostAndPort(); }
auto_ptr<DBClientCursor> DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip , const BSONObj *fieldsToReturn , int queryOptions , int batchSize) { //if ( ! query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions ) return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions , batchSize ); // //verify( query.obj.isEmpty() ); //throw UserException( (string)"yay:" + ns ); } void DBDirectClient::killCursor( long long id ) { ClientCursor::erase( id ); } HostAndPort DBDirectClient::_clientHost = HostAndPort( "0.0.0.0" , 0 ); unsigned long long DBDirectClient::count(const string &ns, const BSONObj& query, int options, int limit, int skip ) { Lock::DBRead lk( ns ); string errmsg; long long res = runCount( ns.c_str() , _countCmd( ns , query , options , limit , skip ) , errmsg ); if ( res == -1 ) return 0; uassert( 13637 , str::stream() << "count failed in DBDirectClient: " << errmsg , res >= 0 ); return (unsigned long long )res; } DBClientBase * createDirectClient() { return new DBDirectClient(); }
void ReplSetImpl::loadConfig(OperationContext* txn) { startupStatus = LOADINGCONFIG; startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)"); LOG(1) << "loadConfig() " << rsConfigNs << endl; while (1) { try { OwnedPointerVector<ReplSetConfig> configs; try { configs.mutableVector().push_back(ReplSetConfig::makeDirect()); } catch (DBException& e) { log() << "replSet exception loading our local replset configuration object : " << e.toString() << rsLog; } for (vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++) { try { configs.mutableVector().push_back(ReplSetConfig::make(*i)); } catch (DBException& e) { log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog; } } ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); { scoped_lock lck(replSettings.discoveredSeeds_mx); if (replSettings.discoveredSeeds.size() > 0) { for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); i != replSettings.discoveredSeeds.end(); i++) { try { configs.mutableVector().push_back( ReplSetConfig::make(HostAndPort(*i))); } catch (DBException&) { LOG(1) << "replSet exception trying to load config from discovered " "seed " << *i << rsLog; replSettings.discoveredSeeds.erase(*i); } } } } if (!replSettings.reconfig.isEmpty()) { try { configs.mutableVector().push_back(ReplSetConfig::make(replSettings.reconfig, true)); } catch (DBException& re) { log() << "replSet couldn't load reconfig: " << re.what() << rsLog; replSettings.reconfig = BSONObj(); } } int nok = 0; int nempty = 0; for (vector<ReplSetConfig*>::iterator i = configs.mutableVector().begin(); i != configs.mutableVector().end(); i++) { if ((*i)->ok()) nok++; if ((*i)->empty()) nempty++; } if (nok == 0) { if (nempty == (int) configs.mutableVector().size()) { startupStatus = EMPTYCONFIG; startupStatusMsg.set("can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)"); log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog; static unsigned once; if (++once == 1) { log() << "replSet info you may need to run replSetInitiate -- rs.initia" "te() in the shell -- if that is not already done" << rsLog; } if (_seeds->size() == 0) { LOG(1) << "replSet info no seed hosts were specified on the --replSet " "command line" << rsLog; } } else { startupStatus = EMPTYUNREACHABLE; startupStatusMsg.set("can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)"); log() << "replSet can't get " << rsConfigNs << " config from self or any seed (yet)" << rsLog; } sleepsecs(1); continue; } if (!_loadConfigFinish(txn, configs.mutableVector())) { log() << "replSet info Couldn't load config yet. Sleeping 20sec and will try " "again." << rsLog; sleepsecs(20); continue; } } catch (DBException& e) { startupStatus = BADCONFIG; startupStatusMsg.set("replSet error loading set config (BADCONFIG)"); log() << "replSet error loading configurations " << e.toString() << rsLog; log() << "replSet error replication will not start" << rsLog; sethbmsg("error loading set config"); _fatal(); throw; } break; } startupStatusMsg.set("? started"); startupStatus = STARTED; }
void BackgroundSync::_produce( OperationContext* txn, ReplicationCoordinatorExternalState* replicationCoordinatorExternalState) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() || inShutdownStrict()) { return; } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; HostAndPort source; { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } SyncSourceResolverResponse syncSourceResp = _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched); if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources were too stale. error() << "too stale to catch up -- entering maintenance mode"; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; StorageInterface::get(txn) ->setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen}); auto status = _replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode."; } bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << _replCoord->getMemberState(); } return; } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceHost = syncSourceResp.getSyncSource(); source = _syncSourceHost; } else { if (!syncSourceResp.isOK()) { log() << "failed to find sync source, received error " << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. sleepsecs(1); return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_stopped) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; _replCoord->signalUpstreamUpdater(); } // "lastFetched" not used. Already set in _enqueueDocuments. Status fetcherReturnStatus = Status::OK(); DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState( _replCoord, replicationCoordinatorExternalState, this); OplogFetcher* oplogFetcher; try { auto config = _replCoord->getConfig(); auto onOplogFetcherShutdownCallbackFn = [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) { fetcherReturnStatus = status; }; stdx::lock_guard<stdx::mutex> lock(_mutex); _oplogFetcher = stdx::make_unique<OplogFetcher>(&_threadPoolTaskExecutor, OpTimeWithHash(lastHashFetched, lastOpTimeFetched), source, NamespaceString(rsOplogName), config, &dataReplicatorExternalState, stdx::bind(&BackgroundSync::_enqueueDocuments, this, stdx::placeholders::_1, stdx::placeholders::_2, stdx::placeholders::_3, stdx::placeholders::_4), onOplogFetcherShutdownCallbackFn); oplogFetcher = _oplogFetcher.get(); } catch (const mongo::DBException& ex) { fassertFailedWithStatus(34440, exceptionToStatus()); } LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at " << oplogFetcher->getCommandObject_forTest()["filter"]; auto scheduleStatus = oplogFetcher->startup(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } oplogFetcher->join(); LOG(1) << "fetcher stopped reading remote oplog on " << source; // If the background sync is stopped after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isStopped()) { return; } if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) { // This is bad because it means that our source // has not returned oplog entries in ascending ts order, and they need to be. warning() << fetcherReturnStatus.toString(); // Do not blacklist the server here, it will be blacklisted when we try to reuse it, // if it can't return a matching oplog start from the last fetch oplog ts field. return; } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing || fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) { // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), kOplogSocketTimeout)); }; return connection->get(); }; { stdx::lock_guard<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; } log() << "Starting rollback due to " << fetcherReturnStatus; // Wait till all buffered oplog entries have drained and been applied. auto lastApplied = _replCoord->getMyLastAppliedOpTime(); if (lastApplied != lastOpTimeFetched) { log() << "Waiting for all operations from " << lastApplied << " until " << lastOpTimeFetched << " to be applied before starting rollback."; while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) { sleepmillis(10); if (isStopped() || inShutdown()) { return; } } } // check that we are at minvalid, otherwise we cannot roll back as we may be in an // inconsistent state BatchBoundaries boundaries = StorageInterface::get(txn)->getMinValid(txn); if (!boundaries.start.isNull() || boundaries.end > lastApplied) { fassertNoTrace(18750, Status(ErrorCodes::UnrecoverableRollbackError, str::stream() << "need to rollback, but in inconsistent state. " << "minvalid: " << boundaries.end.toString() << " > our last optime: " << lastApplied.toString())); } _rollback(txn, source, getConnection); stop(); } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source " << source << " for " << blacklistDuration << "."; _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration); } else if (!fetcherReturnStatus.isOK()) { warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString(); } }
void ReplSetConfig::from(BSONObj o) { static const string legal[] = {"_id","version", "members","settings"}; static const set<string> legals(legal, legal + 4); assertOnlyHas(o, legals); md5 = o.md5(); _id = o["_id"].String(); if( o["version"].ok() ) { version = o["version"].numberInt(); uassert(13115, "bad " + rsConfigNs + " config: version", version > 0); } if( o["settings"].ok() ) { BSONObj settings = o["settings"].Obj(); if( settings["heartbeatConnRetries "].ok() ) ho.heartbeatConnRetries = settings["heartbeatConnRetries "].numberInt(); if( settings["heartbeatSleep"].ok() ) ho.heartbeatSleepMillis = (unsigned) (settings["heartbeatSleep"].Number() * 1000); if( settings["heartbeatTimeout"].ok() ) ho.heartbeatTimeoutMillis = (unsigned) (settings["heartbeatTimeout"].Number() * 1000); ho.check(); try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); } catch(...) { } } set<string> hosts; set<int> ords; vector<BSONElement> members; try { members = o["members"].Array(); } catch(...) { uasserted(13131, "replSet error parsing (or missing) 'members' field in config object"); } unsigned localhosts = 0; for( unsigned i = 0; i < members.size(); i++ ) { BSONObj mobj = members[i].Obj(); MemberCfg m; try { static const string legal[] = { "_id","votes","priority","host", "hidden","slaveDelay", "arbiterOnly","buildIndexes","tags","initialSync" // deprecated }; static const set<string> legals(legal, legal + 10); assertOnlyHas(mobj, legals); try { m._id = (int) mobj["_id"].Number(); } catch(...) { /* TODO: use of string exceptions may be problematic for reconfig case! */ throw "_id must be numeric"; } string s; try { s = mobj["host"].String(); m.h = HostAndPort(s); } catch(...) { throw string("bad or missing host field? ") + mobj.toString(); } if( m.h.isLocalHost() ) localhosts++; m.arbiterOnly = mobj["arbiterOnly"].trueValue(); m.slaveDelay = mobj["slaveDelay"].numberInt(); if( mobj.hasElement("hidden") ) m.hidden = mobj["hidden"].trueValue(); if( mobj.hasElement("buildIndexes") ) m.buildIndexes = mobj["buildIndexes"].trueValue(); if( mobj.hasElement("priority") ) m.priority = mobj["priority"].Number(); if( mobj.hasElement("votes") ) m.votes = (unsigned) mobj["votes"].Number(); if( mobj.hasElement("tags") ) { vector<BSONElement> v = mobj["tags"].Array(); for( unsigned i = 0; i < v.size(); i++ ) m.tags.insert( v[i].String() ); } m.check(); } catch( const char * p ) { log() << "replSet cfg parsing exception for members[" << i << "] " << p << rsLog; stringstream ss; ss << "replSet members[" << i << "] " << p; uassert(13107, ss.str(), false); } catch(DBException& e) { log() << "replSet cfg parsing exception for members[" << i << "] " << e.what() << rsLog; stringstream ss; ss << "bad config for member[" << i << "] " << e.what(); uassert(13135, ss.str(), false); } if( !(ords.count(m._id) == 0 && hosts.count(m.h.toString()) == 0) ) { log() << "replSet " << o.toString() << rsLog; uassert(13108, "bad replset config -- duplicate hosts in the config object?", false); } hosts.insert(m.h.toString()); ords.insert(m._id); this->members.push_back(m); } uassert(13393, "can't use localhost in repl set member names except when using it for all members", localhosts == 0 || localhosts == members.size()); uassert(13117, "bad " + rsConfigNs + " config", !_id.empty()); }
HostAndPort TopologyCoordinatorMock::getSyncSourceAddress() const { return HostAndPort(); }