// Sends every replication handshake command to the current upstream connection.
//
// The handshake objects (for this node as well as all chained members) are produced by the
// global replication coordinator and are run, in order, against the "admin" database on
// _connection.
//
// Returns true when every handshake command succeeded. On the first command that fails, or on
// a DBException, the connection is reset and false is returned. Trips massert 17447 when a
// failed command's error message contains "no such cmd".
bool SyncSourceFeedback::replHandshake(OperationContext* txn) {
    std::vector<BSONObj> handshakes;
    getGlobalReplicationCoordinator()->prepareReplSetUpdatePositionCommandHandshakes(
            txn, &handshakes);
    LOG(1) << "handshaking upstream updater";

    typedef std::vector<BSONObj>::iterator HandshakeIterator;
    for (HandshakeIterator cur = handshakes.begin(); cur != handshakes.end(); ++cur) {
        BSONObj reply;
        try {
            LOG(2) << "Sending to " << _connection.get()->toString() << " the replication "
                    "handshake: " << *cur;
            if (_connection->runCommand("admin", *cur, reply)) {
                // This handshake went through; move on to the next one.
                continue;
            }

            // The command was rejected. An upstream node that does not know the command at all
            // is treated as a fatal version mismatch.
            massert(17447, "upstream updater is not supported by the member from which we"
                    " are syncing, please update all nodes to 2.6 or later.",
                    res_has_no_such_cmd_placeholder);
        }
        catch (const DBException& ex) {
            log() << "SyncSourceFeedback error sending handshake: " << ex.what() << endl;
            _resetConnection();
            return false;
        }
    }
    return true;
}
// Sends this node's replSetUpdatePosition command to the upstream connection.
//
// Returns Status::OK() without sending anything when this node is primary. Returns
// NodeNotFound when a handshake is still pending. Otherwise returns the status extracted from
// the command reply, or the status of the DBException thrown while sending. Any failure after
// the command was built resets the connection.
Status SyncSourceFeedback::updateUpstream(OperationContext* txn) {
    ReplicationCoordinator* const coordinator = getGlobalReplicationCoordinator();
    const bool isPrimary = coordinator->getCurrentMemberState().primary();
    if (isPrimary) {
        // A primary has nobody upstream to report to.
        return Status::OK();
    }

    BSONObjBuilder updateBuilder;
    {
        boost::unique_lock<boost::mutex> guard(_mtx);
        if (_handshakeNeeded) {
            // Position updates are held back until the pending handshake has been sent.
            return Status(ErrorCodes::NodeNotFound,
                          "Need to send handshake before updating position upstream");
        }
        coordinator->prepareReplSetUpdatePositionCommand(txn, &updateBuilder);
    }

    BSONObj reply;
    LOG(2) << "Sending slave oplog progress to upstream updater: " << updateBuilder.done();
    try {
        _connection->runCommand("admin", updateBuilder.obj(), reply);
    }
    catch (const DBException& ex) {
        log() << "SyncSourceFeedback error sending update: " << ex.what() << endl;
        _resetConnection();
        return ex.toStatus();
    }

    // The boolean result of runCommand is ignored; success is judged from the reply document.
    Status outcome = Command::getStatusFromCommandResult(reply);
    if (outcome.isOK()) {
        return outcome;
    }

    log() << "SyncSourceFeedback error sending update, response: " << reply.toString() <<endl;
    _resetConnection();
    return outcome;
}
// Sends this node's replSetUpdatePosition command to the upstream connection.
//
// Returns true when this node is primary (nothing to send) or when the command succeeded.
// Returns false when a handshake is still pending, when sending throws a DBException, or when
// the command reports failure; the latter two cases also reset the connection.
bool SyncSourceFeedback::updateUpstream(OperationContext* txn) {
    ReplicationCoordinator* const coordinator = getGlobalReplicationCoordinator();
    const bool isPrimary = coordinator->getCurrentMemberState().primary();
    if (isPrimary) {
        // A primary has nobody upstream to report to.
        return true;
    }

    BSONObjBuilder updateBuilder;
    {
        boost::unique_lock<boost::mutex> guard(_mtx);
        if (_handshakeNeeded) {
            // Position updates are held back until the pending handshake has been sent.
            return false;
        }
        coordinator->prepareReplSetUpdatePositionCommand(txn, &updateBuilder);
    }

    BSONObj reply;
    LOG(2) << "Sending slave oplog progress to upstream updater: " << updateBuilder.done();

    bool commandSucceeded;
    try {
        commandSucceeded = _connection->runCommand("admin", updateBuilder.obj(), reply);
    }
    catch (const DBException& ex) {
        log() << "SyncSourceFeedback error sending update: " << ex.what() << endl;
        _resetConnection();
        return false;
    }

    if (commandSucceeded) {
        return true;
    }

    log() << "SyncSourceFeedback error sending update, response: " << reply.toString() <<endl;
    _resetConnection();
    return false;
}
void SyncSourceFeedback::run() { Client::initThread("SyncSourceFeedback"); ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); while (true) { // breaks once _shutdownSignaled is true { stdx::unique_lock<stdx::mutex> lock(_mtx); while (!_positionChanged && !_shutdownSignaled) { if (_cond.wait_for(lock, _keepAliveInterval) == stdx::cv_status::timeout) { break; } } if (_shutdownSignaled) { break; } _positionChanged = false; } auto txn = cc().makeOperationContext(); MemberState state = replCoord->getMemberState(); if (state.primary() || state.startup()) { _resetConnection(); continue; } const HostAndPort target = BackgroundSync::get()->getSyncTarget(); if (_syncTarget != target) { _resetConnection(); _syncTarget = target; } if (!hasConnection()) { // fix connection if need be if (target.empty()) { sleepmillis(500); stdx::unique_lock<stdx::mutex> lock(_mtx); _positionChanged = true; continue; } if (!_connect(txn.get(), target)) { sleepmillis(500); stdx::unique_lock<stdx::mutex> lock(_mtx); _positionChanged = true; continue; } } Status status = updateUpstream(txn.get()); if (!status.isOK()) { sleepmillis(500); stdx::unique_lock<stdx::mutex> lock(_mtx); _positionChanged = true; } } }
// Sends every replication handshake command to the current upstream connection.
//
// Returns true immediately when this node is primary. Otherwise the handshake objects (for
// this node as well as all chained members) are obtained from the replication coordinator and
// run, in order, against the "admin" database on _connection.
//
// On the first command that fails, the connection is reset and false is returned. If the
// failure indicates that the sync source could not find the node in its config, the sync source
// is additionally blacklisted for 10 seconds and the sync target is cleared. A DBException also
// resets the connection and yields false. Trips massert 17447 when a failed command's error
// message contains "no such cmd".
bool SyncSourceFeedback::replHandshake(OperationContext* txn) {
    ReplicationCoordinator* const coordinator = getGlobalReplicationCoordinator();
    if (coordinator->getCurrentMemberState().primary()) {
        // A primary has nobody upstream to handshake with.
        return true;
    }

    std::vector<BSONObj> handshakes;
    coordinator->prepareReplSetUpdatePositionCommandHandshakes(txn, &handshakes);
    LOG(1) << "handshaking upstream updater";

    typedef std::vector<BSONObj>::iterator HandshakeIterator;
    for (HandshakeIterator cur = handshakes.begin(); cur != handshakes.end(); ++cur) {
        BSONObj reply;
        try {
            LOG(2) << "Sending to " << _connection.get()->toString() << " the replication "
                    "handshake: " << *cur;
            if (_connection->runCommand("admin", *cur, reply)) {
                // This handshake went through; move on to the next one.
                continue;
            }

            std::string failureText = reply["errmsg"].valuestrsafe();
            massert(17447, "upstream updater is not supported by the member from which we"
                    " are syncing, please update all nodes to 2.6 or later.",
                    failureText.find("no such cmd") == std::string::npos);

            log() << "replSet error while handshaking the upstream updater: "
                  << failureText;

            // The sync source reports this through an error code or, alternatively, through a
            // specific error message.
            // TODO(dannenberg) after 2.8, remove the string comparison
            const bool unknownToSyncSource =
                reply["code"].numberInt() == ErrorCodes::NodeNotFound ||
                failureText.find("could not be found in replica set config while attempting "
                                 "to associate it with") != std::string::npos;
            if (unknownToSyncSource) {
                // Blacklist the sync target for 10 seconds and look for a new one.
                coordinator->blacklistSyncSource(_syncTarget,
                                                 Date_t(curTimeMillis64() + 10*1000));
                BackgroundSync::get()->clearSyncTarget();
            }

            _resetConnection();
            return false;
        }
        catch (const DBException& ex) {
            log() << "SyncSourceFeedback error sending handshake: " << ex.what() << endl;
            _resetConnection();
            return false;
        }
    }
    return true;
}
void SyncSourceFeedback::run() { Client::initThread("SyncSourceFeedbackThread"); OperationContextImpl txn; bool positionChanged = false; bool handshakeNeeded = false; ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); while (!inShutdown()) { // TODO(spencer): Remove once legacy repl coordinator is gone. { boost::unique_lock<boost::mutex> lock(_mtx); while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) { _cond.wait(lock); } if (_shutdownSignaled) { break; } positionChanged = _positionChanged; handshakeNeeded = _handshakeNeeded; _positionChanged = false; _handshakeNeeded = false; } MemberState state = replCoord->getCurrentMemberState(); if (state.primary() || state.fatal() || state.startup()) { continue; } const Member* target = BackgroundSync::get()->getSyncTarget(); if (_syncTarget != target) { _resetConnection(); _syncTarget = target; } if (!hasConnection()) { // fix connection if need be if (!target) { sleepmillis(500); continue; } if (!_connect(&txn, target->fullName())) { sleepmillis(500); continue; } } if (handshakeNeeded) { if (!replHandshake(&txn)) { boost::unique_lock<boost::mutex> lock(_mtx); _handshakeNeeded = true; continue; } } if (positionChanged) { if (!updateUpstream(&txn)) { boost::unique_lock<boost::mutex> lock(_mtx); _positionChanged = true; } } } cc().shutdown(); }
// Sends this node's replSetUpdatePosition command to the upstream connection.
//
// Returns Status::OK() without sending anything when this node is primary or when the
// replication coordinator could not build the command. Returns NodeNotFound when a handshake
// is still pending. Otherwise returns the status extracted from the command reply, or the
// status of the DBException thrown while sending. On either kind of send failure the sync
// target is blacklisted for 500 ms, the sync target is cleared, and the connection is reset.
Status SyncSourceFeedback::updateUpstream(OperationContext* txn) {
    ReplicationCoordinator* const coordinator = getGlobalReplicationCoordinator();
    const bool isPrimary = coordinator->getMemberState().primary();
    if (isPrimary) {
        // A primary has nobody upstream to report to.
        return Status::OK();
    }

    BSONObjBuilder updateBuilder;
    {
        boost::unique_lock<boost::mutex> guard(_mtx);
        if (_handshakeNeeded) {
            // Position updates are held back until the pending handshake has been sent.
            return Status(ErrorCodes::NodeNotFound,
                          "Need to send handshake before updating position upstream");
        }

        const bool commandBuilt = coordinator->prepareReplSetUpdatePositionCommand(&updateBuilder);
        if (!commandBuilt) {
            // The command could not be created, likely because the node was removed from the
            // set.
            return Status::OK();
        }
    }

    BSONObj reply;
    LOG(2) << "Sending slave oplog progress to upstream updater: " << updateBuilder.done();
    try {
        _connection->runCommand("admin", updateBuilder.obj(), reply);
    }
    catch (const DBException& ex) {
        log() << "SyncSourceFeedback error sending update: " << ex.what() << endl;
        // Blacklist the sync target for half a second and look for a new one.
        coordinator->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 500));
        BackgroundSync::get()->clearSyncTarget();
        _resetConnection();
        return ex.toStatus();
    }

    // The boolean result of runCommand is ignored; success is judged from the reply document.
    Status outcome = Command::getStatusFromCommandResult(reply);
    if (outcome.isOK()) {
        return outcome;
    }

    log() << "SyncSourceFeedback error sending update, response: " << reply.toString() <<endl;
    // Blacklist the sync target for half a second and look for a new one.
    coordinator->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 500));
    BackgroundSync::get()->clearSyncTarget();
    _resetConnection();
    return outcome;
}
// Sends this node's replSetUpdatePosition command to the upstream connection.
//
// Returns Status::OK() without sending anything when this node is primary or when the
// replication coordinator could not build the command. Otherwise returns the status extracted
// from the command reply, or the status of the DBException thrown while sending.
//
// On a DBException the sync target is blacklisted for 500 ms, the sync target is cleared, and
// the connection is reset. The same happens for a non-OK reply, except when the reply is
// InvalidReplicaSetConfig and carries a "cfgver" that is not lower than this node's own config
// version; in that case the connection is kept.
Status SyncSourceFeedback::updateUpstream(OperationContext* txn) {
    ReplicationCoordinator* const coordinator = getGlobalReplicationCoordinator();
    const bool isPrimary = coordinator->getMemberState().primary();
    if (isPrimary) {
        // A primary has nobody upstream to report to.
        return Status::OK();
    }

    BSONObjBuilder updateBuilder;
    {
        stdx::unique_lock<stdx::mutex> guard(_mtx);
        const bool commandBuilt = coordinator->prepareReplSetUpdatePositionCommand(&updateBuilder);
        if (!commandBuilt) {
            // The command could not be created, likely because the node was removed from the
            // set.
            return Status::OK();
        }
    }

    BSONObj reply;
    LOG(2) << "Sending slave oplog progress to upstream updater: " << updateBuilder.done();
    try {
        _connection->runCommand("admin", updateBuilder.obj(), reply);
    }
    catch (const DBException& ex) {
        log() << "SyncSourceFeedback error sending update: " << ex.what() << endl;
        // Blacklist the sync target for half a second and look for a new one.
        coordinator->blacklistSyncSource(_syncTarget, Date_t::now() + Milliseconds(500));
        BackgroundSync::get()->clearSyncTarget();
        _resetConnection();
        return ex.toStatus();
    }

    // The boolean result of runCommand is ignored; success is judged from the reply document.
    Status outcome = Command::getStatusFromCommandResult(reply);
    if (outcome.isOK()) {
        return outcome;
    }

    log() << "SyncSourceFeedback error sending update, response: " << reply.toString() <<endl;

    // Blacklist the sync target for half a second and look for a new one, unless the update was
    // rejected because the sync source has a newer config.
    const bool abandonSyncSource =
        outcome != ErrorCodes::InvalidReplicaSetConfig ||
        reply["cfgver"].eoo() ||
        reply["cfgver"].numberLong() < coordinator->getConfig().getConfigVersion();
    if (abandonSyncSource) {
        coordinator->blacklistSyncSource(_syncTarget, Date_t::now() + Milliseconds(500));
        BackgroundSync::get()->clearSyncTarget();
        _resetConnection();
    }
    return outcome;
}
// Establishes the feedback connection to `host` unless one already exists.
//
// Returns true right away when hasConnection() is already true. Otherwise a fresh
// DBClientConnection is created and connected to `host`; when authorization is enabled,
// replAuthenticate() must also succeed. If connecting or authenticating fails, or a
// DBException is thrown, the connection is reset and false is returned. On success the result
// of hasConnection() is returned.
bool SyncSourceFeedback::_connect(OperationContext* txn, const HostAndPort& host) {
    if (hasConnection()) {
        return true;
    }

    log() << "replset setting syncSourceFeedback to " << host.toString() << rsLog;
    _connection.reset(new DBClientConnection(false, 0, OplogReader::tcp_timeout));

    string errmsg;
    try {
        // Connect first; authenticate only if that worked and authorization is enabled.
        bool established = _connection->connect(host, errmsg);
        if (established && getGlobalAuthorizationManager()->isAuthEnabled()) {
            established = replAuthenticate();
        }

        if (!established) {
            _resetConnection();
            log() << "repl: " << errmsg << endl;
            return false;
        }
    }
    catch (const DBException& ex) {
        log() << "Error connecting to " << host.toString() << ": " << ex.what();
        _resetConnection();
        return false;
    }

    return hasConnection();
}