void replica::assign_primary(configuration_update_request& proposal) { dassert(proposal.node == primary_address(), ""); if (status() == PS_PRIMARY) { dwarn( "%s: invalid assgin primary proposal as the node is in %s", name(), enum_to_string(status())); return; } if (proposal.type == CT_UPGRADE_TO_PRIMARY && (status() != PS_SECONDARY || _secondary_states.checkpoint_task != nullptr)) { dwarn( "%s: invalid upgrade to primary proposal as the node is in %s or during checkpointing", name(), enum_to_string(status())); // TODO: tell meta server so new primary is built more quickly return; } proposal.config.primary = primary_address(); replica_helper::remove_node(primary_address(), proposal.config.secondaries); update_configuration_on_meta_server(proposal.type, proposal.node, proposal.config); }
bool failure_detector::end_ping_internal(::dsn::error_code err, const beacon_ack& ack) { /* * the caller of the end_ping_internal should lock necessarily!!! */ uint64_t beacon_send_time = ack.time; auto node = ack.this_node; uint64_t now = now_ms(); master_map::iterator itr = _masters.find(node); if (itr == _masters.end()) { dwarn("received beacon ack without corresponding master, ignore it, " "remote_master[%s], local_worker[%s]", node.to_string(), primary_address().to_string()); return false; } master_record& record = itr->second; if (!ack.allowed) { dwarn("worker rejected, stop sending beacon message, " "remote_master[%s], local_worker[%s]", node.to_string(), primary_address().to_string()); record.rejected = true; record.send_beacon_timer->cancel(true); return false; } if (!is_time_greater_than(beacon_send_time, record.last_send_time_for_beacon_with_ack)) { // out-dated beacon acks, do nothing dinfo("ignore out dated beacon acks"); return false; } // now the ack is applicable if (err != ERR_OK) { dwarn("ping master failed, err=%s", err.to_string()); return true; } // update last_send_time_for_beacon_with_ack record.last_send_time_for_beacon_with_ack = beacon_send_time; record.rejected = false; if (record.is_alive == false && now - record.last_send_time_for_beacon_with_ack <= _lease_milliseconds) { // report master connected report(node, true, true); itr->second.is_alive = true; on_master_connected(node); } return true; }
void meta_server_failure_detector::acquire_leader_lock() { // // try to get the leader lock until it is done // dsn::dist::distributed_lock_service::lock_options opt = {true, true}; std::string local_owner_id; while (true) { error_code err; auto tasks = _lock_svc->lock( _primary_lock_id, primary_address().to_std_string(), // lock granted LPC_META_SERVER_LEADER_LOCK_CALLBACK, [this, &err, &local_owner_id](error_code ec, const std::string& owner, uint64_t version) { err = ec; local_owner_id = owner; }, // lease expire LPC_META_SERVER_LEADER_LOCK_CALLBACK, [this](error_code ec, const std::string& owner, uint64_t version) { // let's take the easy way right now dsn_exit(0); }, opt ); _lock_grant_task = tasks.first; _lock_expire_task = tasks.second; _lock_grant_task->wait(); if (err == ERR_OK) { rpc_address addr; if (addr.from_string_ipv4(local_owner_id.c_str())) { dassert(primary_address() == addr, ""); set_primary(addr); break; } } } }
void meta_server_failure_detector::on_ping(const fd::beacon_msg& beacon, ::dsn::service::rpc_replier<fd::beacon_ack>& reply) { fd::beacon_ack ack; ack.this_node = beacon.to; if (!is_primary()) { end_point master; if (_state->get_meta_server_primary(master)) { ack.time = beacon.time; ack.is_master = false; ack.primary_node = master; } else { ack.time = beacon.time; ack.is_master = false; ack.primary_node = end_point::INVALID; } } else { failure_detector::on_ping_internal(beacon, ack); ack.primary_node = primary_address(); } reply(ack); }
void replica::on_config_sync(const partition_configuration& config) { ddebug( "%s: configuration sync", name()); // no outdated update if (config.ballot < get_ballot()) return; if (status() == PS_PRIMARY || nullptr != _primary_states.reconfiguration_task) { // nothing to do as pirmary always holds the truth } else { update_configuration(config); if (status() == PS_INACTIVE && !_inactive_is_transient) { if (config.primary == primary_address() // dead primary || config.primary.is_invalid() // primary is dead (otherwise let primary remove this) ) { _stub->remove_replica_on_meta_server(config); } } } }
void meta_server_failure_detector::set_primary(rpc_address primary) { bool old = _is_primary; { utils::auto_lock<zlock> l(_primary_address_lock); _primary_address = primary; _is_primary = (primary == primary_address()); } if (!old && _is_primary) { node_states ns; _state->get_node_state(ns); for (auto& pr : ns) { register_worker(pr.first, pr.second); } } if (old && !_is_primary) { clear_workers(); } }
void meta_service::on_log_completed(error_code err, size_t size, blob buffer, std::shared_ptr<configuration_update_request> req, dsn_message_t resp) { dassert(err == ERR_OK, "log operation failed, cannot proceed, err = %s", err.to_string()); dassert(buffer.length() == size, "log size must equal to the specified buffer size"); configuration_update_response response; update_configuration(*req, response); if (resp != nullptr) { meta_response_header rhdr; rhdr.err = err; rhdr.primary_address = primary_address(); marshall(resp, rhdr); marshall(resp, response); dsn_rpc_reply(resp); } else { err.end_tracking(); } }
void meta_service::on_request(message_ptr& msg) { meta_request_header hdr; unmarshall(msg, hdr); meta_response_header rhdr; bool is_primary = _state->get_meta_server_primary(rhdr.primary_address); if (is_primary) is_primary = (primary_address() == rhdr.primary_address); rhdr.err = ERR_SUCCESS; message_ptr resp = msg->create_response(); if (!is_primary) { rhdr.err = ERR_TALK_TO_OTHERS; marshall(resp, rhdr); } else if (hdr.rpc_tag == RPC_CM_QUERY_NODE_PARTITIONS) { configuration_query_by_node_request request; configuration_query_by_node_response response; unmarshall(msg, request); query_configuration_by_node(request, response); marshall(resp, rhdr); marshall(resp, response); } else if (hdr.rpc_tag == RPC_CM_QUERY_PARTITION_CONFIG_BY_INDEX) { configuration_query_by_index_request request; configuration_query_by_index_response response; unmarshall(msg, request); query_configuration_by_index(request, response); marshall(resp, rhdr); marshall(resp, response); } else if (hdr.rpc_tag == RPC_CM_UPDATE_PARTITION_CONFIGURATION) { update_configuration(msg, resp); return; } else { dassert(false, "unknown rpc tag %x (%s)", hdr.rpc_tag, task_code(hdr.rpc_tag).to_string()); } rpc::reply(resp); }
void replica::broadcast_group_check() { dassert (nullptr != _primary_states.group_check_task, ""); if (_primary_states.group_check_pending_replies.size() > 0) { dwarn( "%s: %u group check replies are still pending when doing next round check", name(), static_cast<int>(_primary_states.group_check_pending_replies.size()) ); for (auto it = _primary_states.group_check_pending_replies.begin(); it != _primary_states.group_check_pending_replies.end(); it++) { it->second->cancel(true); } _primary_states.group_check_pending_replies.clear(); } for (auto it = _primary_states.statuses.begin(); it != _primary_states.statuses.end(); it++) { if (it->first == primary_address()) continue; end_point addr = it->first; std::shared_ptr<group_check_request> request(new group_check_request); request->app_type = _primary_states.membership.app_type; request->node = addr; _primary_states.get_replica_config(addr, request->config); request->last_committed_decree = last_committed_decree(); request->learner_signature = 0; if (it->second == PS_POTENTIAL_SECONDARY) { auto it2 = _primary_states.learners.find(it->first); dassert (it2 != _primary_states.learners.end(), ""); request->learner_signature = it2->second.signature; } task_ptr callback_task = rpc::call_typed( addr, RPC_GROUP_CHECK, request, this, &replica::on_group_check_reply, gpid_to_hash(get_gpid()) ); _primary_states.group_check_pending_replies[addr] = callback_task; ddebug( "%s: init_group_check for %s:%d", name(), addr.name.c_str(), addr.port ); } }
void replica::on_group_check(const group_check_request& request, __out_param group_check_response& response) { ddebug( "%s: on_group_check from %s:%d", name(), request.config.primary.name.c_str(), request.config.primary.port ); if (request.config.ballot < get_ballot()) { response.err = ERR_VERSION_OUTDATED; return; } else if (request.config.ballot > get_ballot()) { update_local_configuration(request.config); } else if (is_same_ballot_status_change_allowed(status(), request.config.status)) { update_local_configuration(request.config, true); } switch (status()) { case PS_INACTIVE: break; case PS_SECONDARY: if (request.last_committed_decree > last_committed_decree()) { _prepare_list->commit(request.last_committed_decree, true); } break; case PS_POTENTIAL_SECONDARY: init_learn(request.learner_signature); break; case PS_ERROR: break; default: dassert (false, ""); } response.gpid = get_gpid(); response.node = primary_address(); response.err = ERR_SUCCESS; if (status() == PS_ERROR) { response.err = ERR_INVALID_STATE; } response.last_committed_decree_in_app = _app->last_committed_decree(); response.last_committed_decree_in_prepare_list = last_committed_decree(); response.learner_status_ = _potential_secondary_states.learning_status; response.learner_signature = _potential_secondary_states.learning_signature; }
void meta_service::update_configuration(dsn_message_t req, dsn_message_t resp) { if (_state->freezed()) { meta_response_header rhdr; rhdr.err = ERR_OK; rhdr.primary_address = primary_address(); configuration_update_request request; configuration_update_response response; ::unmarshall(req, request); response.err = ERR_STATE_FREEZED; _state->query_configuration_by_gpid(request.config.gpid, response.config); ::marshall(resp, rhdr); ::marshall(resp, response); dsn_rpc_reply(resp); return; } void* ptr; size_t sz; dsn_msg_read_next(req, &ptr, &sz); dsn_msg_read_commit(req, 0); // commit 0 so we can read again uint64_t offset; int len = (int)sz + sizeof(int32_t); char* buffer = new char[len]; *(int32_t*)buffer = (int)sz; memcpy(buffer + sizeof(int32_t), ptr, sz); auto tmp = std::shared_ptr<char>(buffer); blob bb2(tmp, 0, len); auto request = std::shared_ptr<configuration_update_request>(new configuration_update_request()); ::unmarshall(req, *request); { zauto_lock l(_log_lock); offset = _offset; _offset += len; file::write(_log, buffer, len, offset, LPC_CM_LOG_UPDATE, this, std::bind(&meta_service::on_log_completed, this, std::placeholders::_1, std::placeholders::_2, bb2, request, resp)); } }
void meta_service::on_config_changed(global_partition_id gpid) { end_point primary; if (_state->get_meta_server_primary(primary) && primary == primary_address()) { _failure_detector->set_primary(true); _balancer->run(gpid); } else { _failure_detector->set_primary(false); } }
bool meta_server_failure_detector::acquire_leader_lock() { error_code err; auto tasks = _lock_svc->lock( _primary_lock_id, _local_owner_id, true, // lock granted LPC_META_SERVER_LEADER_LOCK_CALLBACK, [this, &err](error_code ec, const std::string& owner, uint64_t version) { err = ec; }, // lease expire LPC_META_SERVER_LEADER_LOCK_CALLBACK, [this](error_code ec, const std::string& owner, uint64_t version) { // let's take the easy way right now dsn_terminate(); // reset primary //rpc_address addr; //set_primary(addr); //_state->on_become_non_leader(); //// set another round of service //acquire_leader_lock(); } ); _lock_grant_task = tasks.first; _lock_expire_task = tasks.second; _lock_grant_task->wait(); if (err == ERR_OK) { rpc_address addr; if (addr.from_string_ipv4(_local_owner_id.c_str())) { dassert(primary_address() == addr, ""); set_primary(addr); return true; } } return false; }
meta_server_failure_detector::meta_server_failure_detector(server_state* state, meta_service* svc) { _state = state; _svc = svc; _is_primary = false; // TODO: config _lock_svc = dsn::utils::factory_store< ::dsn::dist::distributed_lock_service>::create( "distributed_lock_service_simple", PROVIDER_TYPE_MAIN ); _primary_lock_id = "dsn.meta.server.leader"; _local_owner_id = primary_address().to_string(); }
bool replica::update_configuration(const partition_configuration& config) { dassert (config.ballot >= get_ballot(), ""); replica_configuration rconfig; replica_helper::get_replica_config(config, primary_address(), rconfig); if (rconfig.status == PS_PRIMARY && (rconfig.ballot > get_ballot() || status() != PS_PRIMARY) ) { _primary_states.reset_membership(config, config.primary != primary_address()); } if (config.ballot > get_ballot() || is_same_ballot_status_change_allowed(status(), rconfig.status) ) { return update_local_configuration(rconfig, true); } else return false; }
// local timers void meta_service::on_load_balance_timer() { if (_state->freezed()) return; ::dsn::rpc_address primary; if (_state->get_meta_server_primary(primary) && primary == primary_address()) { _failure_detector->set_primary(true); _balancer->run(); } else { _failure_detector->set_primary(false); } }
void failure_detector::on_ping_internal(const beacon_msg& beacon, /*out*/ beacon_ack& ack) { ack.time = beacon.time; ack.this_node = beacon.to_addr; ack.primary_node = primary_address(); ack.is_master = true; ack.allowed = true; zauto_lock l(_lock); uint64_t now = now_ms(); auto node = beacon.from_addr; worker_map::iterator itr = _workers.find(node); if (itr == _workers.end()) { // if is a new worker, check allow list first if need if (_use_allow_list && _allow_list.find(node) == _allow_list.end()) { dwarn("new worker[%s] is rejected", node.to_string()); ack.allowed = false; return; } // create new entry for node worker_record record(node, now); record.is_alive = true; _workers.insert(std::make_pair(node, record)); report(node, false, true); on_worker_connected(node); } else if (is_time_greater_than(now, itr->second.last_beacon_recv_time)) { // update last_beacon_recv_time itr->second.last_beacon_recv_time = now; if (itr->second.is_alive == false) { itr->second.is_alive = true; report(node, false, true); on_worker_connected(node); } } }
void replica::handle_remote_failure(partition_status st, const end_point& node, int error) { ddebug( "%s: handle remote failure error %u, status = %s, node = %s:%d", name(), error, enum_to_string(st), node.name.c_str(), static_cast<int>(node.port) ); dassert (status() == PS_PRIMARY, ""); dassert(node != primary_address(), ""); switch (st) { case PS_SECONDARY: dassert (_primary_states.check_exist(node, PS_SECONDARY), ""); { configuration_update_request request; request.node = node; request.type = CT_DOWNGRADE_TO_INACTIVE; request.config = _primary_states.membership; downgrade_to_inactive_on_primary(request); } break; case PS_POTENTIAL_SECONDARY: // potential secondary failure does not lead to ballot change // therefore, it is possible to have multiple exec here if (_primary_states.learners.erase(node) > 0) { if (_primary_states.check_exist(node, PS_INACTIVE)) _primary_states.statuses[node] = PS_INACTIVE; else _primary_states.statuses.erase(node); } break; case PS_INACTIVE: case PS_ERROR: break; default: dassert (false, ""); break; } }
void meta_server_failure_detector::on_ping(const fd::beacon_msg& beacon, ::dsn::rpc_replier<fd::beacon_ack>& reply) { fd::beacon_ack ack; ack.this_node = beacon.to; if (!is_primary()) { ack.time = beacon.time; ack.is_master = false; ack.primary_node = _primary_address; } else { failure_detector::on_ping_internal(beacon, ack); ack.primary_node = primary_address(); } reply(ack); }
void meta_server_failure_detector::set_primary(rpc_address primary) { /* * we don't do register worker things in set_primary * as only nodes sync from meta_state_service are useful, * but currently, we haven't do sync yet */ bool old = _is_primary; { utils::auto_lock<zlock> l(_primary_address_lock); _primary_address = primary; _is_primary = (primary == primary_address()); } if (old && !_is_primary) { clear_workers(); } }
void meta_service::on_log_completed(error_code err, int size, char* buffer, message_ptr req, message_ptr resp) { free(buffer); dassert(err == ERR_SUCCESS, "log operation failed, cannot proceed, err = %s", err.to_string()); configuration_update_request request; configuration_update_response response; unmarshall(req, request); update_configuration(request, response); meta_response_header rhdr; rhdr.err = err; rhdr.primary_address = primary_address(); marshall(resp, rhdr); marshall(resp, response); rpc::reply(resp); }
void meta_service::update_configuration(message_ptr req, message_ptr resp) { if (_state->freezed()) { meta_response_header rhdr; rhdr.err = 0; rhdr.primary_address = primary_address(); configuration_update_request request; configuration_update_response response; unmarshall(req, request); response.err = ERR_STATE_FREEZED; _state->query_configuration_by_gpid(request.config.gpid, response.config); marshall(resp, rhdr); marshall(resp, response); rpc::reply(resp); return; } auto bb = req->reader().get_remaining_buffer(); uint64_t offset; int len = bb.length() + sizeof(int32_t); char* buffer = (char*)malloc(len); *(int32_t*)buffer = bb.length(); memcpy(buffer + sizeof(int32_t), bb.data(), bb.length()); { zauto_lock l(_log_lock); offset = _offset; _offset += len; file::write(_log, buffer, len, offset, LPC_CM_LOG_UPDATE, this, std::bind(&meta_service::on_log_completed, this, std::placeholders::_1, std::placeholders::_2, buffer, req, resp)); } }
// local timers void meta_service::on_load_balance_timer() { // first time is to activate the LB (an initial period of time is presevered for most machine to join in) if (!_started) { _started = true; _state->benign_unfree(); return; } end_point primary; if (_state->get_meta_server_primary(primary) && primary == primary_address()) { _failure_detector->set_primary(true); _balancer->run(); } else { _failure_detector->set_primary(false); } }
void failure_detector::send_beacon(::dsn::rpc_address target, uint64_t time) { beacon_msg beacon; beacon.time = time; beacon.from_addr= primary_address(); beacon.to_addr = target; dinfo("send ping message, from[%s], to[%s], time[%" PRId64 "]", beacon.from_addr.to_string(), beacon.to_addr.to_string(), time); ::dsn::rpc::call( target, RPC_FD_FAILURE_DETECTOR_PING, beacon, this, [=](error_code err, beacon_ack&& resp) { if (err != ::dsn::ERR_OK) { beacon_ack ack; ack.time = beacon.time; ack.this_node = beacon.to_addr; ack.primary_node.set_invalid(); ack.is_master = false; ack.allowed = true; end_ping(err, ack, nullptr); } else { end_ping(err, std::move(resp), nullptr); } }, 0, std::chrono::milliseconds(_check_interval_milliseconds), 0 ); }
/*---------------------------------------------------------------------- | NPT_NetworkInterface::GetNetworkInterfaces +---------------------------------------------------------------------*/ NPT_Result NPT_NetworkInterface::GetNetworkInterfaces(NPT_List<NPT_NetworkInterface*>& interfaces) { int result = 0; struct ifaddrs * ifaddrsList = NULL; struct ifaddrs * ifaddr = NULL; NPT_Flags flags = 0; result = getifaddrs(&ifaddrsList); if( result != 0 || ifaddrsList == NULL) return NPT_FAILURE; for(ifaddr = ifaddrsList; NULL!= ifaddr; ifaddr = ifaddr->ifa_next) { if(ifaddr->ifa_addr == NULL /*|| ifaddr->ifa_addr->sa_family == AF_INET6*/) continue; // process the flags if ((ifaddr->ifa_flags & IFF_UP) == 0) { // the interface is not up, ignore it continue; } if (ifaddr->ifa_flags & IFF_BROADCAST) { flags |= NPT_NETWORK_INTERFACE_FLAG_BROADCAST; } if (ifaddr->ifa_flags & IFF_LOOPBACK) { flags |= NPT_NETWORK_INTERFACE_FLAG_LOOPBACK; } #if defined(IFF_POINTOPOINT) if (ifaddr->ifa_flags & IFF_POINTOPOINT) { flags |= NPT_NETWORK_INTERFACE_FLAG_POINT_TO_POINT; } #endif // defined(IFF_POINTOPOINT) if (ifaddr->ifa_flags & IFF_PROMISC) { flags |= NPT_NETWORK_INTERFACE_FLAG_PROMISCUOUS; } if (ifaddr->ifa_flags & IFF_MULTICAST) { flags |= NPT_NETWORK_INTERFACE_FLAG_MULTICAST; } NPT_NetworkInterface* interface = NULL; for (NPT_List<NPT_NetworkInterface*>::Iterator iface_iter = interfaces.GetFirstItem(); iface_iter; ++iface_iter) { if ((*iface_iter)->GetName() == (const char*)ifaddr->ifa_name) { interface = *iface_iter; break; } } // create a new interface object if(interface == NULL) interface = new NPT_NetworkInterface(ifaddr->ifa_name, flags); if (interface == NULL) continue; // get the mac address NPT_MacAddress::Type mac_addr_type; unsigned int mac_addr_length = IFHWADDRLEN; switch (ifaddr->ifa_addr->sa_family) { case AF_LOCAL: case AF_INET: #if defined(AF_LINK) case AF_LINK: #endif mac_addr_type = NPT_MacAddress::TYPE_ETHERNET; #if defined(ARPHRD_LOOPBACK) mac_addr_type = NPT_MacAddress::TYPE_LOOPBACK; length = 0; #endif break; #if defined(AF_PPP) case AF_PPP: mac_addr_type = NPT_MacAddress::TYPE_PPP; mac_addr_length = 0; break; #endif #if defined(AF_IEEE80211) case AF_IEEE80211: mac_addr_type = NPT_MacAddress::TYPE_IEEE_802_11; break; #endif default: mac_addr_type = NPT_MacAddress::TYPE_UNKNOWN; mac_addr_length = sizeof(ifaddr->ifa_addr->sa_data); break; } if(interface->GetMacAddress().GetLength() == 0) interface->SetMacAddress(mac_addr_type, (const unsigned char*)ifaddr->ifa_addr->sa_data, mac_addr_length); #if defined(NPT_CONFIG_HAVE_NET_IF_DL_H) if (ifaddr->ifa_addr->sa_family == AF_LINK) { //Refer to LLADDR struct sockaddr_dl * socket_dl = (struct sockaddr_dl *)ifaddr->ifa_addr; interface->SetMacAddress(mac_addr_type, (const unsigned char*)socket_dl->sdl_data+socket_dl->sdl_nlen, socket_dl->sdl_alen); } #endif switch (ifaddr->ifa_addr->sa_family) { case AF_INET: { // primary address NPT_IpAddress primary_address( ntohl(((struct sockaddr_in *)ifaddr->ifa_addr)->sin_addr.s_addr)); // broadcast address NPT_IpAddress broadcast_address; if ((flags & NPT_NETWORK_INTERFACE_FLAG_BROADCAST) && ifaddr->ifa_dstaddr) { broadcast_address.Set(ntohl(((struct sockaddr_in*)ifaddr->ifa_dstaddr)->sin_addr.s_addr)); } // point to point address NPT_IpAddress destination_address; if ((flags & NPT_NETWORK_INTERFACE_FLAG_POINT_TO_POINT) && ifaddr->ifa_dstaddr) { destination_address.Set(ntohl(((struct sockaddr_in*)ifaddr->ifa_dstaddr)->sin_addr.s_addr)); } // netmask NPT_IpAddress netmask(0xFFFFFFFF); if(ifaddr->ifa_netmask) netmask.Set(ntohl(((struct sockaddr_in*)ifaddr->ifa_netmask)->sin_addr.s_addr)); // add the address to the interface NPT_NetworkInterfaceAddress iface_address(primary_address, broadcast_address, destination_address, netmask); interface->AddAddress(iface_address); break; } } // add the interface to the list interfaces.Add(interface); } freeifaddrs(ifaddrsList); return NPT_SUCCESS; }
void replica::on_update_configuration_on_meta_server_reply(error_code err, dsn_message_t request, dsn_message_t response, std::shared_ptr<configuration_update_request> req) { check_hashed_access(); if (PS_INACTIVE != status() || _stub->is_connected() == false) { _primary_states.reconfiguration_task = nullptr; err.end_tracking(); return; } configuration_update_response resp; if (err == ERR_OK) { ::unmarshall(response, resp); err = resp.err; } if (err != ERR_OK) { ddebug( "%s: update configuration reply with err %s, request ballot %lld", name(), err.to_string(), req->config.ballot ); if (err != ERR_INVALID_VERSION) { rpc_address target(_stub->_failure_detector->get_servers()); dsn_msg_add_ref(request); // added for another round of rpc::call _primary_states.reconfiguration_task = rpc::call( target, request, this, std::bind(&replica::on_update_configuration_on_meta_server_reply, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, req), gpid_to_hash(get_gpid()) ); return; } } ddebug( "%s: update configuration reply with err %s, ballot %lld, local %lld", name(), resp.err.to_string(), resp.config.ballot, get_ballot() ); if (resp.config.ballot < get_ballot()) { _primary_states.reconfiguration_task = nullptr; return; } // post-update work items? if (resp.err == ERR_OK) { dassert (req->config.gpid == resp.config.gpid, ""); dassert (req->config.app_type == resp.config.app_type, ""); dassert (req->config.primary == resp.config.primary, ""); dassert (req->config.secondaries == resp.config.secondaries, ""); switch (req->type) { case CT_UPGRADE_TO_PRIMARY: _primary_states.last_prepare_decree_on_new_primary = _prepare_list->max_decree(); break; case CT_ASSIGN_PRIMARY: case CT_DOWNGRADE_TO_SECONDARY: case CT_DOWNGRADE_TO_INACTIVE: case CT_UPGRADE_TO_SECONDARY: break; case CT_REMOVE: if (req->node != primary_address()) { replica_configuration rconfig; replica_helper::get_replica_config(resp.config, req->node, rconfig); rpc::call_one_way_typed(req->node, RPC_REMOVE_REPLICA, rconfig, gpid_to_hash(get_gpid())); } break; default: dassert (false, ""); } } update_configuration(resp.config); _primary_states.reconfiguration_task = nullptr; }
/*---------------------------------------------------------------------- | NPT_NetworkInterface::GetNetworkInterfaces +---------------------------------------------------------------------*/ NPT_Result NPT_NetworkInterface::GetNetworkInterfaces(NPT_List<NPT_NetworkInterface*>& interfaces) { // create a socket to talk to the TCP/IP stack SOCKET net; if((net = WSASocket(AF_INET, SOCK_DGRAM, IPPROTO_UDP, NULL, 0, 0)) == INVALID_SOCKET) { return NPT_FAILURE; } // get a list of interfaces INTERFACE_INFO query[32]; // get up to 32 interfaces DWORD bytes_returned; int io_result = WSAIoctl(net, SIO_GET_INTERFACE_LIST, NULL, 0, &query, sizeof(query), &bytes_returned, NULL, NULL); if (io_result == SOCKET_ERROR) { closesocket(net); return NPT_FAILURE; } // we don't need the socket anymore closesocket(net); // Display interface information int interface_count = (bytes_returned/sizeof(INTERFACE_INFO)); unsigned int iface_index = 0; for (int i=0; i<interface_count; i++) { SOCKADDR_IN* address; NPT_Flags flags = 0; // primary address address = (SOCKADDR_IN*)&query[i].iiAddress; NPT_IpAddress primary_address(ntohl(address->sin_addr.s_addr)); // netmask address = (SOCKADDR_IN*)&query[i].iiNetmask; NPT_IpAddress netmask(ntohl(address->sin_addr.s_addr)); // broadcast address address = (SOCKADDR_IN*)&query[i].iiBroadcastAddress; NPT_IpAddress broadcast_address(ntohl(address->sin_addr.s_addr)); { // broadcast address is incorrect unsigned char addr[4]; for(int i=0; i<4; i++) { addr[i] = (primary_address.AsBytes()[i] & netmask.AsBytes()[i]) | ~netmask.AsBytes()[i]; } broadcast_address.Set(addr); } // ignore interfaces that are not up if (!(query[i].iiFlags & IFF_UP)) { continue; } if (query[i].iiFlags & IFF_BROADCAST) { flags |= NPT_NETWORK_INTERFACE_FLAG_BROADCAST; } if (query[i].iiFlags & IFF_MULTICAST) { flags |= NPT_NETWORK_INTERFACE_FLAG_MULTICAST; } if (query[i].iiFlags & IFF_LOOPBACK) { flags |= NPT_NETWORK_INTERFACE_FLAG_LOOPBACK; } if (query[i].iiFlags & IFF_POINTTOPOINT) { flags |= NPT_NETWORK_INTERFACE_FLAG_POINT_TO_POINT; } // mac address (no support for this for now) NPT_MacAddress mac; // create an interface object char iface_name[5]; iface_name[0] = 'i'; iface_name[1] = 'f'; iface_name[2] = '0'+(iface_index/10); iface_name[3] = '0'+(iface_index%10); iface_name[4] = '\0'; NPT_NetworkInterface* iface = new NPT_NetworkInterface(iface_name, mac, flags); // set the interface address NPT_NetworkInterfaceAddress iface_address( primary_address, broadcast_address, NPT_IpAddress::Any, netmask); iface->AddAddress(iface_address); // add the interface to the list interfaces.Add(iface); // increment the index (used for generating the name iface_index++; } return NPT_SUCCESS; }
/*---------------------------------------------------------------------- | NPT_NetworkInterface::GetNetworkInterfaces +---------------------------------------------------------------------*/ NPT_Result NPT_NetworkInterface::GetNetworkInterfaces(NPT_List<NPT_NetworkInterface*>& interfaces) { int net = socket(AF_INET, SOCK_DGRAM, 0); // Try to get the config until we have enough memory for it // According to "Unix Network Programming", some implementations // do not return an error when the supplied buffer is too small // so we need to try, increasing the buffer size every time, // until we get the same size twice. We cannot assume success when // the returned size is smaller than the supplied buffer, because // some implementations can return less that the buffer size if // another structure does not fit. unsigned int buffer_size = 4096; // initial guess unsigned int last_size = 0; struct ifconf config; unsigned char* buffer; for (; buffer_size < 65536;) { buffer = new unsigned char[buffer_size]; config.ifc_len = buffer_size; config.ifc_buf = (char*)buffer; if (ioctl(net, SIOCGIFCONF, &config) < 0) { if (errno != EINVAL || last_size != 0) { return NPT_ERROR_BASE_UNIX-errno; } } else { if ((unsigned int)config.ifc_len == last_size) { // same size, we can use the buffer break; } // different size, we need to reallocate last_size = config.ifc_len; } // supply 4096 more bytes more next time around buffer_size += 4096; delete[] buffer; } // iterate over all objects unsigned char *entries; for (entries = buffer; entries < buffer+config.ifc_len;) { struct ifreq* entry = (struct ifreq*)entries; // get the size of the addresses unsigned int address_length; #if defined(NPT_CONFIG_HAVE_SOCKADDR_SA_LEN) address_length = sizeof(struct sockaddr) > entry->ifr_addr.sa_len ? sizeof(sockaddr) : entry->ifr_addr.sa_len; #else switch (entry->ifr_addr.sa_family) { #if defined(AF_INET6) case AF_INET6: address_length = sizeof(struct sockaddr_in6); break; #endif // defined(AF_INET6) default: address_length = sizeof(struct sockaddr); break; } #endif // point to the next entry entries += address_length + sizeof(entry->ifr_name); // ignore anything except AF_INET and AF_LINK addresses if (entry->ifr_addr.sa_family != AF_INET #if defined(AF_LINK) && entry->ifr_addr.sa_family != AF_LINK #endif ) { continue; } // get detailed info about the interface NPT_Flags flags = 0; #if defined(SIOCGIFFLAGS) struct ifreq query = *entry; if (ioctl(net, SIOCGIFFLAGS, &query) < 0) continue; // process the flags if ((query.ifr_flags & IFF_UP) == 0) { // the interface is not up, ignore it continue; } if (query.ifr_flags & IFF_BROADCAST) { flags |= NPT_NETWORK_INTERFACE_FLAG_BROADCAST; } if (query.ifr_flags & IFF_LOOPBACK) { flags |= NPT_NETWORK_INTERFACE_FLAG_LOOPBACK; } #if defined(IFF_POINTOPOINT) if (query.ifr_flags & IFF_POINTOPOINT) { flags |= NPT_NETWORK_INTERFACE_FLAG_POINT_TO_POINT; } #endif // defined(IFF_POINTOPOINT) if (query.ifr_flags & IFF_PROMISC) { flags |= NPT_NETWORK_INTERFACE_FLAG_PROMISCUOUS; } if (query.ifr_flags & IFF_MULTICAST) { flags |= NPT_NETWORK_INTERFACE_FLAG_MULTICAST; } #endif // defined(SIOCGIFFLAGS) // get a pointer to an interface we've looped over before // or create a new one NPT_NetworkInterface* interface = NULL; for (NPT_List<NPT_NetworkInterface*>::Iterator iface_iter = interfaces.GetFirstItem(); iface_iter; ++iface_iter) { if ((*iface_iter)->GetName() == (const char*)entry->ifr_name) { interface = *iface_iter; break; } } if (interface == NULL) { // create a new interface object interface = new NPT_NetworkInterface(entry->ifr_name, flags); // add the interface to the list interfaces.Add(interface); // get the mac address #if defined(SIOCGIFHWADDR) if (ioctl(net, SIOCGIFHWADDR, &query) == 0) { NPT_MacAddress::Type mac_addr_type; unsigned int mac_addr_length = IFHWADDRLEN; switch (query.ifr_addr.sa_family) { #if defined(ARPHRD_ETHER) case ARPHRD_ETHER: mac_addr_type = NPT_MacAddress::TYPE_ETHERNET; break; #endif #if defined(ARPHRD_LOOPBACK) case ARPHRD_LOOPBACK: mac_addr_type = NPT_MacAddress::TYPE_LOOPBACK; length = 0; break; #endif #if defined(ARPHRD_PPP) case ARPHRD_PPP: mac_addr_type = NPT_MacAddress::TYPE_PPP; mac_addr_length = 0; break; #endif #if defined(ARPHRD_IEEE80211) case ARPHRD_IEEE80211: mac_addr_type = NPT_MacAddress::TYPE_IEEE_802_11; break; #endif default: mac_addr_type = NPT_MacAddress::TYPE_UNKNOWN; mac_addr_length = sizeof(query.ifr_addr.sa_data); break; } interface->SetMacAddress(mac_addr_type, (const unsigned char*)query.ifr_addr.sa_data, mac_addr_length); } #endif } switch (entry->ifr_addr.sa_family) { case AF_INET: { // primary address NPT_IpAddress primary_address(ntohl(((struct sockaddr_in*)&entry->ifr_addr)->sin_addr.s_addr)); // broadcast address NPT_IpAddress broadcast_address; #if defined(SIOCGIFBRDADDR) if (flags & NPT_NETWORK_INTERFACE_FLAG_BROADCAST) { if (ioctl(net, SIOCGIFBRDADDR, &query) == 0) { broadcast_address.Set(ntohl(((struct sockaddr_in*)&query.ifr_addr)->sin_addr.s_addr)); } } #endif // point to point address NPT_IpAddress destination_address; #if defined(SIOCGIFDSTADDR) if (flags & NPT_NETWORK_INTERFACE_FLAG_POINT_TO_POINT) { if (ioctl(net, SIOCGIFDSTADDR, &query) == 0) { destination_address.Set(ntohl(((struct sockaddr_in*)&query.ifr_addr)->sin_addr.s_addr)); } } #endif // netmask NPT_IpAddress netmask(0xFFFFFFFF); #if defined(SIOCGIFNETMASK) if (ioctl(net, SIOCGIFNETMASK, &query) == 0) { netmask.Set(ntohl(((struct sockaddr_in*)&query.ifr_addr)->sin_addr.s_addr)); } #endif // add the address to the interface NPT_NetworkInterfaceAddress iface_address( primary_address, broadcast_address, destination_address, netmask); interface->AddAddress(iface_address); break; } #if defined(AF_LINK) && defined(NPT_CONFIG_HAVE_SOCKADDR_DL) case AF_LINK: { struct sockaddr_dl* mac_addr = (struct sockaddr_dl*)&entry->ifr_addr; NPT_MacAddress::Type mac_addr_type = NPT_MacAddress::TYPE_UNKNOWN; switch (mac_addr->sdl_type) { #if defined(IFT_LOOP) case IFT_LOOP: mac_addr_type = NPT_MacAddress::TYPE_LOOPBACK; break; #endif #if defined(IFT_ETHER) case IFT_ETHER: mac_addr_type = NPT_MacAddress::TYPE_ETHERNET; break; #endif #if defined(IFT_PPP) case IFT_PPP: mac_addr_type = NPT_MacAddress::TYPE_PPP; break; #endif } interface->SetMacAddress(mac_addr_type, (const unsigned char*)(&mac_addr->sdl_data[mac_addr->sdl_nlen]), mac_addr->sdl_alen); break; } #endif } } // free resources delete[] buffer; close(net); return NPT_SUCCESS; }
void meta_service::start(const char* data_dir, bool clean_state) { dassert(!_started, "meta service is already started"); _data_dir = data_dir; std::string checkpoint_path = _data_dir + "/checkpoint"; std::string oplog_path = _data_dir + "/oplog"; if (clean_state) { try { if (!dsn::utils::filesystem::remove_path(checkpoint_path)) { dassert(false, "Fail to remove file %s.", checkpoint_path.c_str()); } if (!dsn::utils::filesystem::remove_path(oplog_path)) { dassert(false, "Fail to remove file %s.", oplog_path.c_str()); } } catch (std::exception& ex) { ex; } } else { if (!dsn::utils::filesystem::create_directory(_data_dir)) { dassert(false, "Fail to create directory %s.", _data_dir.c_str()); } if (dsn::utils::filesystem::file_exists(checkpoint_path)) { _state->load(checkpoint_path.c_str()); } if (dsn::utils::filesystem::file_exists(oplog_path)) { replay_log(oplog_path.c_str()); _state->save(checkpoint_path.c_str()); if (!dsn::utils::filesystem::remove_path(oplog_path)) { dassert(false, "Fail to remove file %s.", oplog_path.c_str()); } } } _log = dsn_file_open((_data_dir + "/oplog").c_str(), O_RDWR | O_CREAT, 0666); _balancer = new load_balancer(_state); _failure_detector = new meta_server_failure_detector(_state, this); // TODO: use zookeeper for leader election _failure_detector->set_primary(primary_address()); // make sure the delay is larger than fd.grace to ensure // all machines are in the correct state (assuming connected initially) tasking::enqueue(LPC_LBM_START, this, &meta_service::on_load_balance_start, 0, _opts.fd_grace_seconds * 1000); auto err = _failure_detector->start( _opts.fd_check_interval_seconds, _opts.fd_beacon_interval_seconds, _opts.fd_lease_seconds, _opts.fd_grace_seconds, false ); dassert(err == ERR_OK, "FD start failed, err = %s", err.to_string()); register_rpc_handler( RPC_CM_QUERY_NODE_PARTITIONS, "RPC_CM_QUERY_NODE_PARTITIONS", &meta_service::on_query_configuration_by_node ); register_rpc_handler( RPC_CM_QUERY_PARTITION_CONFIG_BY_INDEX, "RPC_CM_QUERY_PARTITION_CONFIG_BY_INDEX", &meta_service::on_query_configuration_by_index ); register_rpc_handler( RPC_CM_UPDATE_PARTITION_CONFIGURATION, "RPC_CM_UPDATE_PARTITION_CONFIGURATION", &meta_service::on_update_configuration ); }
void meta_service::start(const char* data_dir, bool clean_state) { _data_dir = data_dir; if (clean_state) { try { boost::filesystem::remove(_data_dir + "/checkpoint"); boost::filesystem::remove(_data_dir + "/oplog"); } catch (std::exception& ex) { ex; } } else { if (!boost::filesystem::exists(_data_dir)) { boost::filesystem::create_directory(_data_dir); } if (boost::filesystem::exists(_data_dir + "/checkpoint")) { _state->load((_data_dir + "/checkpoint").c_str()); } if (boost::filesystem::exists(_data_dir + "/oplog")) { replay_log((_data_dir + "/oplog").c_str()); _state->save((_data_dir + "/checkpoint").c_str()); boost::filesystem::remove(_data_dir + "/oplog"); } } _log = file::open((_data_dir + "/oplog").c_str(), O_RDWR | O_CREAT, 0666); _balancer = new load_balancer(_state); _failure_detector = new meta_server_failure_detector(_state); end_point primary; if (_state->get_meta_server_primary(primary) && primary == primary_address()) { _failure_detector->set_primary(true); } else _failure_detector->set_primary(false); register_rpc_handler(RPC_CM_CALL, "RPC_CM_CALL", &meta_service::on_request); _balancer_timer = tasking::enqueue(LPC_LBM_RUN, this, &meta_service::on_load_balance_timer, 0, _opts.fd_grace_seconds * 1000 + 1, // delay 10000 ); _failure_detector->start( _opts.fd_check_interval_seconds, _opts.fd_beacon_interval_seconds, _opts.fd_lease_seconds, _opts.fd_grace_seconds, false ); }