/// @brief looks up server `id` in the agency's Plan/DBServers list
///
/// @param id  the server id to look for
/// @return ROLE_PRIMARY when `id` is a key of the list; ROLE_SECONDARY when
///         `id` is the string value stored under some key (that key is then
///         remembered in _idOfPrimary); ROLE_UNDEFINED when the list cannot
///         be fetched or parsed, or when it does not mention `id` at all.
ServerState::RoleEnum ServerState::checkServersList (std::string const& id) {
  // fetch value at Plan/DBServers
  // we need to do this to determine the server's role
  const std::string key = "Plan/DBServers";

  AgencyComm comm;
  AgencyCommResult result;

  {
    // the locker only lives for the duration of the fetch
    AgencyCommLocker locker("Plan", "READ");

    if (locker.successful()) {
      result = comm.getValues(key, true);
    }
  }

  if (! result.successful()) {
    const std::string endpoints = AgencyComm::getEndpointsString();

    LOG_TRACE("Could not fetch configuration from agency endpoints (%s): "
              "got status code %d, message: %s, key: %s",
              endpoints.c_str(),
              result._statusCode,
              result.errorMessage().c_str(),
              key.c_str());

    return ServerState::ROLE_UNDEFINED;
  }

  // do not inspect the values of a response that could not be parsed
  // (same handling as in checkCoordinatorsList)
  if (! result.parse("Plan/DBServers/", false)) {
    LOG_TRACE("Got an invalid JSON response for Plan/DBServers");

    return ServerState::ROLE_UNDEFINED;
  }

  // check if we can find ourselves in the list returned by the agency
  if (result._values.find(id) != result._values.end()) {
    // we are a key in the list. this means we are a primary server
    return ServerState::ROLE_PRIMARY;
  }

  // check if we are a secondary: in that case our id is stored as the
  // value of the entry that belongs to our primary
  for (auto const& entry : result._values) {
    const std::string name
      = triagens::basics::JsonHelper::getStringValue(entry.second._json, "");

    if (name == id) {
      _idOfPrimary = entry.first;

      return ServerState::ROLE_SECONDARY;
    }
  }

  return ServerState::ROLE_UNDEFINED;
}
int ServerState::lookupLocalInfoToId (std::string const& localInfo, std::string& id) { // fetch value at Plan/DBServers // we need to do this to determine the server's role const std::string key = "Target/MapLocalToID"; int count = 0; while (++count <= 600) { AgencyComm comm; AgencyCommResult result; { AgencyCommLocker locker("Target", "READ"); if (locker.successful()) { result = comm.getValues(key, true); } } if (! result.successful()) { const std::string endpoints = AgencyComm::getEndpointsString(); LOG_DEBUG("Could not fetch configuration from agency endpoints (%s): " "got status code %d, message: %s, key: %s", endpoints.c_str(), result._statusCode, result.errorMessage().c_str(), key.c_str()); } else { result.parse("Target/MapLocalToID/", false); std::map<std::string, AgencyCommResultEntry>::const_iterator it = result._values.find(localInfo); if (it != result._values.end()) { TRI_json_t const* json = it->second._json; Json j(TRI_UNKNOWN_MEM_ZONE, json, Json::NOFREE); id = triagens::basics::JsonHelper::getStringValue(json, "ID", ""); if (id.empty()) { LOG_ERROR("ID not set!"); return TRI_ERROR_CLUSTER_COULD_NOT_DETERMINE_ID; } std::string description = triagens::basics::JsonHelper::getStringValue(json, "Description", ""); if (! description.empty()) { setDescription(description); } return TRI_ERROR_NO_ERROR; } } sleep(1); }; return TRI_ERROR_CLUSTER_COULD_NOT_DETERMINE_ID; }
void ApplicationCluster::close () { if (! enabled()) { return; } if (_heartbeat != 0) { _heartbeat->stop(); } // change into shutdown state ServerState::instance()->setState(ServerState::STATE_SHUTDOWN); AgencyComm comm; comm.sendServerState(0.0); }
/// @brief looks up server `id` in the agency's Plan/Coordinators list
///
/// @param id  the server id to look for
/// @return ROLE_COORDINATOR when `id` is a key of the list; ROLE_UNDEFINED
///         when the list cannot be fetched or parsed, or does not contain
///         `id`.
ServerState::RoleEnum ServerState::checkCoordinatorsList (std::string const& id) {
  // the role is derived from the contents of Plan/Coordinators
  const std::string agencyKey = "Plan/Coordinators";

  AgencyComm agency;
  AgencyCommResult response;

  {
    // the locker only lives for the duration of the fetch
    AgencyCommLocker planLock("Plan", "READ");

    if (planLock.successful()) {
      response = agency.getValues(agencyKey, true);
    }
  }

  if (! response.successful()) {
    const std::string agencyEndpoints = AgencyComm::getEndpointsString();

    LOG_TRACE("Could not fetch configuration from agency endpoints (%s): "
              "got status code %d, message: %s, key: %s",
              agencyEndpoints.c_str(),
              response._statusCode,
              response.errorMessage().c_str(),
              agencyKey.c_str());

    return ServerState::ROLE_UNDEFINED;
  }

  if (! response.parse("Plan/Coordinators/", false)) {
    LOG_TRACE("Got an invalid JSON response for Plan/Coordinators");

    return ServerState::ROLE_UNDEFINED;
  }

  // being listed by the agency means we are a coordinator
  const bool listed = (response._values.find(id) != response._values.end());

  return listed ? ServerState::ROLE_COORDINATOR : ServerState::ROLE_UNDEFINED;
}
/// @brief returns the role of this server, determining it on first use
///
/// The value of loadRole() is returned directly when it is already defined
/// or when the cluster is not enabled. Otherwise, a server without an id
/// first announces itself under Current/NewServers/<local info> in the
/// agency; then the role is computed by determineRole(), saved with
/// storeRole() and returned.
///
/// @return the role, or ROLE_UNDEFINED when the announcement could not be
///         written to the agency
ServerState::RoleEnum ServerState::getRole () {
  auto known = loadRole();

  if (known != ServerState::ROLE_UNDEFINED || ! _clusterEnabled) {
    return known;
  }

  std::string info = _localInfo;
  std::string id = _id;

  if (id.empty()) {
    // We need to announce ourselves in the agency to get a role configured:
    LOG_DEBUG("Announcing our birth in Current/NewServers to the agency...");

    AgencyComm agency;

    Json announcement(Json::Object, 1);
    announcement("endpoint", Json(TRI_UNKNOWN_MEM_ZONE, getAddress()));

    std::string description = getDescription();

    if (! description.empty()) {
      announcement("Description", Json(TRI_UNKNOWN_MEM_ZONE, description));
    }

    AgencyCommResult stored
      = agency.setValue("Current/NewServers/"+_localInfo, announcement.json(), 0.0);

    if (! stored.successful()) {
      LOG_ERROR("Could not talk to agency!");

      return ROLE_UNDEFINED;
    }

    std::string announced = announcement.toString();

    LOG_DEBUG("Have stored %s under Current/NewServers/%s in agency.",
              announced.c_str(),
              _localInfo.c_str());
  }

  // role not yet set: work it out and remember it
  RoleEnum found = determineRole(info, id);
  std::string foundName = roleToString(found);

  LOG_DEBUG("Found my role: %s", foundName.c_str());

  storeRole(found);

  return found;
}
/// @brief handles a shutdown request
///
/// Only DELETE requests are accepted; anything else is answered with 405.
/// Request value `remove_from_cluster` = "1" makes the cluster feature
/// unregister this server on shutdown. `shutdown_cluster` = "1" additionally
/// writes `true` to the agency key "Shutdown" (answering 500 if that write
/// fails) and implies the removal. Afterwards the server shutdown is
/// started and "OK" is sent as the result.
///
/// @return always RestStatus::DONE
RestStatus RestShutdownHandler::execute() {
  if (_request->requestType() != rest::RequestType::DELETE_REQ) {
    generateError(rest::ResponseCode::METHOD_NOT_ALLOWED, 405);
    return RestStatus::DONE;
  }

  bool removeParamFound = false;
  std::string const& removeParam =
      _request->value("remove_from_cluster", removeParamFound);
  bool unregisterServer = removeParamFound && removeParam == "1";

  bool clusterParamFound = false;
  std::string const& clusterParam =
      _request->value("shutdown_cluster", clusterParamFound);

  if (clusterParamFound && clusterParam == "1") {
    // flag the shutdown in the agency
    AgencyComm agencyComm;

    VPackBuilder flag;
    flag.add(VPackValue(true));

    AgencyCommResult written = agencyComm.setValue("Shutdown", flag.slice(), 0.0);

    if (!written.successful()) {
      generateError(rest::ResponseCode::SERVER_ERROR, 500);
      return RestStatus::DONE;
    }

    unregisterServer = true;
  }

  if (unregisterServer) {
    ClusterFeature* cluster =
        ApplicationServer::getFeature<ClusterFeature>("Cluster");
    cluster->setUnregisterOnShutdown(true);
  }

  ApplicationServer::server->beginShutdown();

  try {
    VPackBuilder body;
    body.add(VPackValue("OK"));
    generateResult(rest::ResponseCode::OK, body.slice());
  } catch (...) {
    // Ignore the error
  }

  return RestStatus::DONE;
}
void ApplicationCluster::stop () { if (! enabled()) { return; } // change into shutdown state ServerState::instance()->setState(ServerState::STATE_SHUTDOWN); AgencyComm comm; comm.sendServerState(0.0); if (_heartbeat != 0) { _heartbeat->stop(); } { AgencyCommLocker locker("Current", "WRITE"); if (locker.successful()) { // unregister ourselves ServerState::RoleEnum role = ServerState::instance()->getRole(); if (role == ServerState::ROLE_PRIMARY) { comm.removeValues("Current/DBServers/" + _myId, false); } else if (role == ServerState::ROLE_COORDINATOR) { comm.removeValues("Current/Coordinators/" + _myId, false); } // unregister ourselves comm.removeValues("Current/ServersRegistered/" + _myId, false); } } ClusterComm::cleanup(); ClusterInfo::cleanup(); AgencyComm::cleanup(); }
void ClusterFeature::unprepare() { if (_enableCluster) { if (_heartbeatThread != nullptr) { _heartbeatThread->beginShutdown(); } // change into shutdown state ServerState::instance()->setState(ServerState::STATE_SHUTDOWN); AgencyComm comm; comm.sendServerState(0.0); if (_heartbeatThread != nullptr) { int counter = 0; while (_heartbeatThread->isRunning()) { usleep(100000); // emit warning after 5 seconds if (++counter == 10 * 5) { LOG(WARN) << "waiting for heartbeat thread to finish"; } } } if (_unregisterOnShutdown) { ServerState::instance()->unregister(); } } if (!_enableCluster) { ClusterComm::cleanup(); return; } // change into shutdown state ServerState::instance()->setState(ServerState::STATE_SHUTDOWN); AgencyComm comm; comm.sendServerState(0.0); // Try only once to unregister because maybe the agencycomm // is shutting down as well... ServerState::RoleEnum role = ServerState::instance()->getRole(); AgencyWriteTransaction unreg; // Remove from role if (role == ServerState::ROLE_PRIMARY) { unreg.operations.push_back(AgencyOperation( "Current/DBServers/" + _myId, AgencySimpleOperationType::DELETE_OP)); } else if (role == ServerState::ROLE_COORDINATOR) { unreg.operations.push_back(AgencyOperation( "Current/Coordinators/" + _myId, AgencySimpleOperationType::DELETE_OP)); } // Unregister unreg.operations.push_back( AgencyOperation("Current/ServersRegistered/" + _myId, AgencySimpleOperationType::DELETE_OP)); comm.sendTransactionWithFailover(unreg, 120.0); while (_heartbeatThread->isRunning()) { usleep(50000); } AgencyComm::cleanup(); ClusterComm::cleanup(); }
void ClusterFeature::start() { // return if cluster is disabled if (!_enableCluster) { return; } ServerState::instance()->setState(ServerState::STATE_STARTUP); // the agency about our state AgencyComm comm; comm.sendServerState(0.0); std::string const version = comm.getVersion(); ServerState::instance()->setInitialized(); std::string const endpoints = AgencyComm::getEndpointsString(); ServerState::RoleEnum role = ServerState::instance()->getRole(); LOG(INFO) << "Cluster feature is turned on. Agency version: " << version << ", Agency endpoints: " << endpoints << ", server id: '" << _myId << "', internal address: " << _myAddress << ", role: " << ServerState::roleToString(role); if (!_disableHeartbeat) { AgencyCommResult result = comm.getValues("Sync/HeartbeatIntervalMs"); if (result.successful()) { velocypack::Slice HeartbeatIntervalMs = result.slice()[0].get(std::vector<std::string>( {AgencyComm::prefix(), "Sync", "HeartbeatIntervalMs"})); if (HeartbeatIntervalMs.isInteger()) { try { _heartbeatInterval = HeartbeatIntervalMs.getUInt(); LOG(INFO) << "using heartbeat interval value '" << _heartbeatInterval << " ms' from agency"; } catch (...) { // Ignore if it is not a small int or uint } } } // no value set in agency. use default if (_heartbeatInterval == 0) { _heartbeatInterval = 5000; // 1/s LOG(WARN) << "unable to read heartbeat interval from agency. 
Using " << "default value '" << _heartbeatInterval << " ms'"; } // start heartbeat thread _heartbeatThread = std::make_shared<HeartbeatThread>( _agencyCallbackRegistry.get(), _heartbeatInterval * 1000, 5, SchedulerFeature::SCHEDULER->ioService()); if (!_heartbeatThread->init() || !_heartbeatThread->start()) { LOG(FATAL) << "heartbeat could not connect to agency endpoints (" << endpoints << ")"; FATAL_ERROR_EXIT(); } while (!_heartbeatThread->isReady()) { // wait until heartbeat is ready usleep(10000); } } AgencyCommResult result; while (true) { VPackBuilder builder; try { VPackObjectBuilder b(&builder); builder.add("endpoint", VPackValue(_myAddress)); } catch (...) { LOG(FATAL) << "out of memory"; FATAL_ERROR_EXIT(); } result = comm.setValue("Current/ServersRegistered/" + _myId, builder.slice(), 0.0); if (!result.successful()) { LOG(FATAL) << "unable to register server in agency: http code: " << result.httpCode() << ", body: " << result.body(); FATAL_ERROR_EXIT(); } else { break; } sleep(1); } if (role == ServerState::ROLE_COORDINATOR) { ServerState::instance()->setState(ServerState::STATE_SERVING); } else if (role == ServerState::ROLE_PRIMARY) { ServerState::instance()->setState(ServerState::STATE_SERVINGASYNC); } else if (role == ServerState::ROLE_SECONDARY) { ServerState::instance()->setState(ServerState::STATE_SYNCING); } }
static void raceForClusterBootstrap() { AgencyComm agency; auto ci = ClusterInfo::instance(); while (true) { AgencyCommResult result = agency.getValues("Bootstrap"); if (!result.successful()) { // Error in communication, note that value not found is not an error LOG_TOPIC(TRACE, Logger::STARTUP) << "raceForClusterBootstrap: no agency communication"; sleep(1); continue; } VPackSlice value = result.slice()[0].get( std::vector<std::string>({agency.prefix(), "Bootstrap"})); if (value.isString()) { // key was found and is a string if (value.copyString().find("done") != std::string::npos) { // all done, let's get out of here: LOG_TOPIC(TRACE, Logger::STARTUP) << "raceForClusterBootstrap: bootstrap already done"; return; } LOG_TOPIC(DEBUG, Logger::STARTUP) << "raceForClusterBootstrap: somebody else does the bootstrap"; sleep(1); continue; } // No value set, we try to do the bootstrap ourselves: VPackBuilder b; b.add(VPackValue(arangodb::ServerState::instance()->getId())); result = agency.casValue("Bootstrap", b.slice(), false, 300, 15); if (!result.successful()) { LOG_TOPIC(DEBUG, Logger::STARTUP) << "raceForClusterBootstrap: lost race, somebody else will bootstrap"; // Cannot get foot into the door, try again later: sleep(1); continue; } // OK, we handle things now, let's see whether a DBserver is there: auto dbservers = ci->getCurrentDBServers(); if (dbservers.size() == 0) { LOG_TOPIC(TRACE, Logger::STARTUP) << "raceForClusterBootstrap: no DBservers, waiting"; agency.removeValues("Bootstrap", false); sleep(1); continue; } LOG_TOPIC(DEBUG, Logger::STARTUP) << "raceForClusterBootstrap: race won, we do the bootstrap"; auto vocbase = DatabaseFeature::DATABASE->systemDatabase(); V8DealerFeature::DEALER->loadJavascriptFiles(vocbase, "server/bootstrap/cluster-bootstrap.js", 0); LOG_TOPIC(DEBUG, Logger::STARTUP) << "raceForClusterBootstrap: bootstrap done"; b.clear(); b.add(VPackValue(arangodb::ServerState::instance()->getId() + ": done")); result = agency.setValue("Bootstrap", 
b.slice(), 0); if (result.successful()) { return; } LOG_TOPIC(TRACE, Logger::STARTUP) << "raceForClusterBootstrap: could not indicate success"; sleep(1); } }
bool ApplicationCluster::open () { if (! enabled()) { return true; } ServerState::RoleEnum role = ServerState::instance()->getRole(); // tell the agency that we are ready { AgencyComm comm; AgencyCommResult result; AgencyCommLocker locker("Current", "WRITE"); if (locker.successful()) { TRI_json_t* ep = TRI_CreateString2CopyJson(TRI_UNKNOWN_MEM_ZONE, _myAddress.c_str(), _myAddress.size()); if (ep == 0) { locker.unlock(); LOG_FATAL_AND_EXIT("out of memory"); } TRI_json_t* json = TRI_CreateArray2Json(TRI_UNKNOWN_MEM_ZONE, 1); if (json == 0) { TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, ep); locker.unlock(); LOG_FATAL_AND_EXIT("out of memory"); } TRI_Insert2ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "endpoint", ep); result = comm.setValue("Current/ServersRegistered/" + _myId, json, 0.0); TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, json); } if (! result.successful()) { locker.unlock(); LOG_FATAL_AND_EXIT("unable to register server in agency: http code: %d, body: %s", (int) result.httpCode(), result.body().c_str()); } if (role == ServerState::ROLE_COORDINATOR) { TRI_json_t* json = TRI_CreateString2CopyJson(TRI_UNKNOWN_MEM_ZONE, "none", 4); if (json == 0) { locker.unlock(); LOG_FATAL_AND_EXIT("out of memory"); } ServerState::instance()->setState(ServerState::STATE_SERVING); // register coordinator AgencyCommResult result = comm.setValue("Current/Coordinators/" + _myId, json, 0.0); TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, json); if (! result.successful()) { locker.unlock(); LOG_FATAL_AND_EXIT("unable to register coordinator in agency"); } } else if (role == ServerState::ROLE_PRIMARY) { TRI_json_t* json = TRI_CreateString2CopyJson(TRI_UNKNOWN_MEM_ZONE, "none", 4); if (json == 0) { locker.unlock(); LOG_FATAL_AND_EXIT("out of memory"); } ServerState::instance()->setState(ServerState::STATE_SERVINGASYNC); // register server AgencyCommResult result = comm.setValue("Current/DBServers/" + _myId, json, 0.0); TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, json); if (! 
result.successful()) { locker.unlock(); LOG_FATAL_AND_EXIT("unable to register db server in agency"); } } else if (role == ServerState::ROLE_SECONDARY) { locker.unlock(); LOG_FATAL_AND_EXIT("secondary server tasks are currently not implemented"); } } return true; }
bool ApplicationCluster::start () { // set authentication data ServerState::instance()->setAuthentication(_username, _password); // overwrite memory area _username = _password = "******"; ServerState::instance()->setDataPath(_dataPath); ServerState::instance()->setLogPath(_logPath); ServerState::instance()->setAgentPath(_agentPath); ServerState::instance()->setArangodPath(_arangodPath); ServerState::instance()->setDBserverConfig(_dbserverConfig); ServerState::instance()->setCoordinatorConfig(_coordinatorConfig); ServerState::instance()->setDisableDispatcherFrontend(_disableDispatcherFrontend); ServerState::instance()->setDisableDispatcherKickstarter(_disableDispatcherKickstarter); if (! enabled()) { return true; } ServerState::instance()->setId(_myId); // perfom an initial connect to the agency const std::string endpoints = AgencyComm::getEndpointsString(); if (! AgencyComm::tryConnect()) { LOG_FATAL_AND_EXIT("Could not connect to agency endpoints (%s)", endpoints.c_str()); } ServerState::RoleEnum role = ServerState::instance()->getRole(); if (role == ServerState::ROLE_UNDEFINED) { // no role found LOG_FATAL_AND_EXIT("unable to determine unambiguous role for server '%s'. No role configured in agency (%s)", _myId.c_str(), endpoints.c_str()); } // check if my-address is set if (_myAddress.empty()) { // no address given, now ask the agency for out address _myAddress = ServerState::instance()->getAddress(); } else { // register our own address ServerState::instance()->setAddress(_myAddress); } if (_myAddress.empty()) { LOG_FATAL_AND_EXIT("unable to determine internal address for server '%s'. 
" "Please specify --cluster.my-address or configure the address for this server in the agency.", _myId.c_str()); } // now we can validate --cluster.my-address const string unified = triagens::rest::Endpoint::getUnifiedForm(_myAddress); if (unified.empty()) { LOG_FATAL_AND_EXIT("invalid endpoint '%s' specified for --cluster.my-address", _myAddress.c_str()); } ServerState::instance()->setState(ServerState::STATE_STARTUP); // initialise ConnectionManager library httpclient::ConnectionManager::instance()->initialise(); // the agency about our state AgencyComm comm; comm.sendServerState(0.0); const std::string version = comm.getVersion(); ServerState::instance()->setInitialised(); LOG_INFO("Cluster feature is turned on. " "Agency version: %s, Agency endpoints: %s, " "server id: '%s', internal address: %s, role: %s", version.c_str(), endpoints.c_str(), _myId.c_str(), _myAddress.c_str(), ServerState::roleToString(role).c_str()); if (! _disableHeartbeat) { AgencyCommResult result = comm.getValues("Sync/HeartbeatIntervalMs", false); if (result.successful()) { result.parse("", false); std::map<std::string, AgencyCommResultEntry>::const_iterator it = result._values.begin(); if (it != result._values.end()) { _heartbeatInterval = triagens::basics::JsonHelper::stringUInt64((*it).second._json); LOG_INFO("using heartbeat interval value '%llu ms' from agency", (unsigned long long) _heartbeatInterval); } } // no value set in agency. use default if (_heartbeatInterval == 0) { _heartbeatInterval = 1000; // 1/s LOG_WARNING("unable to read heartbeat interval from agency. Using default value '%llu ms'", (unsigned long long) _heartbeatInterval); } // start heartbeat thread _heartbeat = new HeartbeatThread(_server, _dispatcher, _applicationV8, _heartbeatInterval * 1000, 5); if (_heartbeat == 0) { LOG_FATAL_AND_EXIT("unable to start cluster heartbeat thread"); } if (! _heartbeat->init() || ! 
_heartbeat->start()) { LOG_FATAL_AND_EXIT("heartbeat could not connect to agency endpoints (%s)", endpoints.c_str()); } while (! _heartbeat->ready()) { // wait until heartbeat is ready usleep(10000); } } return true; }