void replica::on_group_check(const group_check_request& request, __out_param group_check_response& response) { ddebug( "%s: on_group_check from %s:%d", name(), request.config.primary.name.c_str(), request.config.primary.port ); if (request.config.ballot < get_ballot()) { response.err = ERR_VERSION_OUTDATED; return; } else if (request.config.ballot > get_ballot()) { update_local_configuration(request.config); } else if (is_same_ballot_status_change_allowed(status(), request.config.status)) { update_local_configuration(request.config, true); } switch (status()) { case PS_INACTIVE: break; case PS_SECONDARY: if (request.last_committed_decree > last_committed_decree()) { _prepare_list->commit(request.last_committed_decree, true); } break; case PS_POTENTIAL_SECONDARY: init_learn(request.learner_signature); break; case PS_ERROR: break; default: dassert (false, ""); } response.gpid = get_gpid(); response.node = primary_address(); response.err = ERR_SUCCESS; if (status() == PS_ERROR) { response.err = ERR_INVALID_STATE; } response.last_committed_decree_in_app = _app->last_committed_decree(); response.last_committed_decree_in_prepare_list = last_committed_decree(); response.learner_status_ = _potential_secondary_states.learning_status; response.learner_signature = _potential_secondary_states.learning_signature; }
void replica::on_config_sync(const partition_configuration& config) { ddebug( "%s: configuration sync", name()); // no outdated update if (config.ballot < get_ballot()) return; if (status() == PS_PRIMARY || nullptr != _primary_states.reconfiguration_task) { // nothing to do as pirmary always holds the truth } else { update_configuration(config); if (status() == PS_INACTIVE && !_inactive_is_transient) { if (config.primary == primary_address() // dead primary || config.primary.is_invalid() // primary is dead (otherwise let primary remove this) ) { _stub->remove_replica_on_meta_server(config); } } } }
/// Callback for the reply to a group-check request.
///
/// Replies are ignored unless this replica is still primary and the request's
/// ballot is not older than the local one. The pending-reply entry for the
/// target node is erased; a transport error or an error in the response body
/// is passed to handle_remote_failure(). A successful reply from a potential
/// secondary that reports LearningSucceeded is passed to
/// handle_learning_succeeded_on_primary().
///
/// @param err   error of the RPC itself.
/// @param req   the request that was sent.
/// @param resp  the response; only read when `err` is ERR_OK.
void replica::on_group_check_reply(error_code err, const std::shared_ptr<group_check_request>& req, const std::shared_ptr<group_check_response>& resp)
{
    check_hashed_access();

    if (partition_status::PS_PRIMARY != status() || req->config.ballot < get_ballot())
    {
        return;
    }

    // exactly one pending entry is expected for this node
    auto erased = _primary_states.group_check_pending_replies.erase(req->node);
    dassert (erased == 1, "");

    if (err != ERR_OK)
    {
        handle_remote_failure(req->config.status, req->node, err);
        return;
    }

    if (resp->err != ERR_OK)
    {
        handle_remote_failure(req->config.status, req->node, resp->err);
        return;
    }

    if (resp->learner_status_ == learner_status::LearningSucceeded
        && req->config.status == partition_status::PS_POTENTIAL_SECONDARY)
    {
        handle_learning_succeeded_on_primary(req->node, resp->learner_signature);
    }
}
/// Handles a CT_REMOVE proposal.
///
/// Does nothing unless the proposal carries the current ballot and this
/// replica is primary. The proposed configuration is asserted to match the
/// current membership, `proposal.node` is taken out of it according to the
/// node's status, and the result is sent to the meta server.
///
/// @param proposal  the proposal; `proposal.config` is modified in place.
void replica::remove(configuration_update_request& proposal)
{
    if (!(proposal.config.ballot == get_ballot() && status() == PS_PRIMARY))
    {
        return;
    }

    dassert (proposal.config.gpid == _primary_states.membership.gpid, "");
    dassert (proposal.config.app_type == _primary_states.membership.app_type, "");
    dassert (proposal.config.primary == _primary_states.membership.primary, "");
    dassert (proposal.config.secondaries == _primary_states.membership.secondaries, "");

    auto node_status = _primary_states.get_node_status(proposal.node);
    if (node_status == PS_PRIMARY)
    {
        dassert (proposal.config.primary == proposal.node, "");
        proposal.config.primary.set_invalid();
    }
    else if (node_status == PS_SECONDARY)
    {
        auto removed = replica_helper::remove_node(proposal.node, proposal.config.secondaries);
        dassert (removed, "");
    }
    // PS_POTENTIAL_SECONDARY and every other status: the configuration is left untouched

    update_configuration_on_meta_server(CT_REMOVE, proposal.node, proposal.config);
}
void replica::replay_prepare_list() { decree start = last_committed_decree() + 1; decree end = _prepare_list->max_decree(); ddebug( "%s: replay prepare list from %lld to %lld, ballot = %lld", name(), start, end, get_ballot() ); for (decree decree = start; decree <= end; decree++) { mutation_ptr old = _prepare_list->get_mutation_by_decree(decree); mutation_ptr mu = new_mutation(decree); if (old != nullptr) { mu->copy_from(old); } else { mu->rpc_code = RPC_REPLICATION_WRITE_EMPTY; ddebug( "%s: emit empty mutation %lld when replay prepare list", name(), decree ); } init_prepare(mu); } }
// from primary void replica::on_remove(const replica_configuration& request) { if (request.ballot < get_ballot()) return; dassert (request.status == PS_INACTIVE, ""); update_local_configuration(request); }
/// Creates a mutation whose header carries this replica's gpid and current
/// ballot, the given decree, and an invalid log offset.
///
/// @param decree  decree to store in the mutation header.
/// @return the newly allocated mutation.
mutation_ptr replica::new_mutation(decree decree)
{
    mutation_ptr created(new mutation());

    auto& header = created->data.header;
    header.gpid = get_gpid();
    header.ballot = get_ballot();
    header.decree = decree;
    header.log_offset = invalid_offset;

    return created;
}
bool replica::update_configuration(const partition_configuration& config) { dassert (config.ballot >= get_ballot(), ""); replica_configuration rconfig; replica_helper::get_replica_config(config, primary_address(), rconfig); if (rconfig.status == PS_PRIMARY && (rconfig.ballot > get_ballot() || status() != PS_PRIMARY) ) { _primary_states.reset_membership(config, config.primary != primary_address()); } if (config.ballot > get_ballot() || is_same_ballot_status_change_allowed(status(), rconfig.status) ) { return update_local_configuration(rconfig, true); } else return false; }
/// Callback invoked when appending a mutation to the log has completed.
///
/// On success the mutation is marked as logged. Mutations from an older
/// ballot, or completions that arrive while the replica is inactive, are then
/// ignored. Otherwise a primary tries to commit (or handles the local
/// failure), while a secondary or potential secondary handles any failure and
/// acks the prepare unless acking before logging is allowed by the options.
///
/// @param mu    the mutation whose log write finished.
/// @param err   result of the log write.
/// @param size  not used by this function.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    const auto role = status();
    if (role == PS_PRIMARY)
    {
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
    }
    else if (role == PS_SECONDARY || role == PS_POTENTIAL_SECONDARY)
    {
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        // with ack-before-logging enabled, no ack is sent from here
        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
    }
    else if (role == PS_ERROR)
    {
        // nothing to do
    }
    else
    {
        dassert (false, "");
    }
}
/// Handles a CT_DOWNGRADE_TO_SECONDARY proposal.
///
/// Does nothing unless the proposal carries the current ballot and this
/// replica is primary. The proposed configuration is asserted to match the
/// current membership and `proposal.node` is asserted to be its primary. The
/// primary slot is then invalidated, the node is appended to the secondaries,
/// and the result is sent to the meta server.
///
/// @param proposal  the proposal; `proposal.config` is modified in place.
void replica::downgrade_to_secondary_on_primary(configuration_update_request& proposal)
{
    if (!(proposal.config.ballot == get_ballot() && status() == PS_PRIMARY))
    {
        return;
    }

    auto& proposed = proposal.config;

    dassert (proposed.gpid == _primary_states.membership.gpid, "");
    dassert (proposed.app_type == _primary_states.membership.app_type, "");
    dassert (proposed.primary == _primary_states.membership.primary, "");
    dassert (proposed.secondaries == _primary_states.membership.secondaries, "");
    dassert (proposal.node == proposed.primary, "");

    // move the node from the primary slot to the secondary list
    proposed.primary.set_invalid();
    proposed.secondaries.push_back(proposal.node);

    update_configuration_on_meta_server(CT_DOWNGRADE_TO_SECONDARY, proposal.node, proposal.config);
}
/// Replies to the prepare message attached to `mu`.
///
/// The ack carries this replica's gpid and ballot, the given error, the
/// mutation's decree, and the last committed decrees of the app and of the
/// prepare list.
///
/// @param err  error code to report in the ack.
/// @param mu   mutation being acked; it must have a prepare message (asserted).
void replica::ack_prepare_message(error_code err, mutation_ptr& mu)
{
    prepare_ack ack;

    // identity and outcome of the prepare
    ack.gpid = get_gpid();
    ack.err = err;
    ack.ballot = get_ballot();
    ack.decree = mu->data.header.decree;

    // for PS_POTENTIAL_SECONDARY ONLY
    ack.last_committed_decree_in_app = _app->last_committed_decree();
    ack.last_committed_decree_in_prepare_list = last_committed_decree();

    dassert(nullptr != mu->prepare_msg(), "");
    reply(mu->prepare_msg(), ack);

    ddebug("%s: mutation %s ack_prepare_message, err = %s", name(), mu->name(), err.to_string());
}
/// Callback invoked when appending a mutation to the log has completed.
///
/// On success the mutation is marked as logged. Unless the mutation is from
/// an older ballot or the replica is inactive, a primary then tries to commit
/// (or handles the local failure), and a secondary or potential secondary
/// handles any failure and always acks the prepare.
///
/// @param mu    the mutation whose log write finished.
/// @param err   result of the log write (ERR_SUCCESS on success).
/// @param size  not used by this function.
void replica::on_append_log_completed(mutation_ptr& mu, uint32_t err, uint32_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %u", name(), mu->name(), err);

    const bool succeeded = (err == ERR_SUCCESS);
    if (succeeded)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (!succeeded)
        {
            handle_local_failure(err);
        }
        else
        {
            do_possible_commit_on_primary(mu);
        }
        break;

    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (!succeeded)
        {
            handle_local_failure(err);
        }
        // the ack is sent whether or not the write succeeded
        ack_prepare_message(err, mu);
        break;

    case PS_ERROR:
        break;

    default:
        dassert (false, "");
        break;
    }
}
/// Handles a CT_DOWNGRADE_TO_INACTIVE proposal.
///
/// Does nothing unless the proposal carries the current ballot and this
/// replica is primary. The proposed configuration is asserted to match the
/// current membership. `proposal.node` is then removed from it: the primary
/// slot is invalidated if the node is the primary, otherwise the node is
/// removed from the secondaries. The result is sent to the meta server.
///
/// @param proposal  the proposal; `proposal.config` is modified in place.
void replica::downgrade_to_inactive_on_primary(configuration_update_request& proposal)
{
    if (!(proposal.config.ballot == get_ballot() && status() == PS_PRIMARY))
    {
        return;
    }

    auto& proposed = proposal.config;

    dassert (proposed.gpid == _primary_states.membership.gpid, "");
    dassert (proposed.app_type == _primary_states.membership.app_type, "");
    dassert (proposed.primary == _primary_states.membership.primary, "");
    dassert (proposed.secondaries == _primary_states.membership.secondaries, "");

    if (proposal.node == proposed.primary)
    {
        proposed.primary.set_invalid();
    }
    else
    {
        // the node is expected to be one of the secondaries
        auto removed = replica_helper::remove_node(proposal.node, proposed.secondaries);
        dassert (removed, "");
    }

    update_configuration_on_meta_server(CT_DOWNGRADE_TO_INACTIVE, proposal.node, proposal.config);
}
// @ secondary void replica::on_copy_checkpoint(const replica_configuration& request, /*out*/ learn_response& response) { check_hashed_access(); if (request.ballot > get_ballot()) { if (!update_local_configuration(request)) { response.err = ERR_INVALID_STATE; return; } } if (status() != PS_SECONDARY) { response.err = ERR_INVALID_STATE; return; } if (_app->last_durable_decree() == 0) { response.err = ERR_OBJECT_NOT_FOUND; return; } blob placeholder; int err = _app->get_checkpoint(0, placeholder, response.state); if (err != 0) { response.err = ERR_LEARN_FILE_FAILED; } else { response.err = ERR_OK; response.last_committed_decree = last_committed_decree(); response.base_local_dir = _app->data_dir(); response.address = _stub->_primary_address; } }
/// Handles a proposal to add `proposal.node` as a learner (potential
/// secondary).
///
/// Only acts when this replica is primary. The proposal is asserted to carry
/// the current ballot and membership, and the node is asserted to be neither
/// primary nor secondary. A node that is already a learner is left alone.
/// Otherwise a learner record with a random signature is stored, the node is
/// marked PS_POTENTIAL_SECONDARY, and a one-way RPC_LEARN_ADD_LEARNER request
/// is sent to it.
void replica::add_potential_secondary(configuration_update_request& proposal)
{
    if (status() != PS_PRIMARY)
    {
        return;
    }

    dassert (proposal.config.ballot == get_ballot(), "");
    dassert (proposal.config.gpid == _primary_states.membership.gpid, "");
    dassert (proposal.config.app_type == _primary_states.membership.app_type, "");
    dassert (proposal.config.primary == _primary_states.membership.primary, "");
    dassert (proposal.config.secondaries == _primary_states.membership.secondaries, "");
    dassert (!_primary_states.check_exist(proposal.node, PS_PRIMARY), "");
    dassert (!_primary_states.check_exist(proposal.node, PS_SECONDARY), "");

    // already registered as a learner
    if (_primary_states.learners.find(proposal.node) != _primary_states.learners.end())
    {
        return;
    }

    remote_learner_state learner;
    learner.prepare_start_decree = invalid_decree;
    learner.signature = random64(0, static_cast<uint64_t>(-1LL));
    learner.timeout_task = nullptr; // TODO: add timer for learner task

    _primary_states.learners[proposal.node] = learner;
    _primary_states.statuses[proposal.node] = PS_POTENTIAL_SECONDARY;

    group_check_request add_learner;
    add_learner.app_type = _primary_states.membership.app_type;
    add_learner.node = proposal.node;
    _primary_states.get_replica_config(proposal.node, add_learner.config);
    add_learner.last_committed_decree = last_committed_decree();
    add_learner.learner_signature = learner.signature;

    rpc::call_one_way_typed(proposal.node, RPC_LEARN_ADD_LEARNER, add_learner, gpid_to_hash(get_gpid()));
}
/// Starts the prepare phase for a mutation on the primary.
///
/// Steps, in order: reject when the group has fewer members than
/// `mutation_2pc_min_replica_count`; assign ballot and decree; reject when
/// the mutation would extend a prepare list that already holds
/// `staleness_for_commit` entries; prepare locally; send prepare messages to
/// all secondaries and to the learners whose prepare start decree has been
/// reached; finally append the mutation to the log. Every rejection, and a
/// failure to start the log write, is reported to the client through
/// response_client_message().
///
/// @param mu  the mutation to prepare; the replica must be primary (asserted).
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_SUCCESS;

    const int member_count = static_cast<int>(_primary_states.membership.secondaries.size()) + 1;
    if (member_count < _options.mutation_2pc_min_replica_count)
    {
        err = ERR_NOT_ENOUGH_MEMBER;
        response_client_message(mu->client_request, err);
        return;
    }

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree != invalid_decree)
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    else
    {
        // no decree assigned yet: take the next one after the prepare list
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }

    if (mu->data.header.decree > _prepare_list->max_decree()
        && _prepare_list->count() >= _options.staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        response_client_message(mu->client_request, err);
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare without log
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_SUCCESS)
    {
        response_client_message(mu->client_request, err);
        return;
    }

    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    //
    // TODO: bounded staleness on secondaries
    //
    dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");

    // remote prepare
    dassert (mu->remote_tasks().size() == 0, "");
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    // only learners that have reached their prepare start decree take part
    uint8_t count = 0;
    for (auto& learner : _primary_states.learners)
    {
        if (learner.second.prepare_start_decree != invalid_decree
            && mu->data.header.decree >= learner.second.prepare_start_decree)
        {
            send_prepare_message(learner.first, PS_POTENTIAL_SECONDARY, mu, _options.prepare_timeout_ms_for_potential_secondaries);
            count++;
        }
    }
    mu->set_left_potential_secondary_ack_count(count);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    if (nullptr == mu->log_task())
    {
        err = ERR_FILE_OPERATION_FAILED;
        handle_local_failure(err);
        response_client_message(mu->client_request, err);
    }
}
void replica::on_group_check(const group_check_request& request, /*out*/ group_check_response& response) { check_hashed_access(); ddebug( "%s: process group check, primary = %s, ballot = %" PRId64 ", status = %s, last_committed_decree = %" PRId64, name(), request.config.primary.to_string(), request.config.ballot, enum_to_string(request.config.status), request.last_committed_decree ); if (request.config.ballot < get_ballot()) { response.err = ERR_VERSION_OUTDATED; dwarn("%s: on_group_check reply %s", name(), response.err.to_string()); return; } else if (request.config.ballot > get_ballot()) { if (!update_local_configuration(request.config)) { response.err = ERR_INVALID_STATE; dwarn("%s: on_group_check reply %s", name(), response.err.to_string()); return; } } else if (is_same_ballot_status_change_allowed(status(), request.config.status)) { update_local_configuration(request.config, true); } switch (status()) { case partition_status::PS_INACTIVE: break; case partition_status::PS_SECONDARY: if (request.last_committed_decree > last_committed_decree()) { _prepare_list->commit(request.last_committed_decree, COMMIT_TO_DECREE_HARD); } break; case partition_status::PS_POTENTIAL_SECONDARY: init_learn(request.config.learner_signature); break; case partition_status::PS_ERROR: break; default: dassert (false, ""); } response.pid = get_gpid(); response.node = _stub->_primary_address; response.err = ERR_OK; if (status() == partition_status::PS_ERROR) { response.err = ERR_INVALID_STATE; dwarn("%s: on_group_check reply %s", name(), response.err.to_string()); } response.last_committed_decree_in_app = _app->last_committed_decree(); response.last_committed_decree_in_prepare_list = last_committed_decree(); response.learner_status_ = _potential_secondary_states.learning_status; response.learner_signature = _potential_secondary_states.learning_version; }
/// Starts the prepare phase for a mutation on the primary.
///
/// Steps, in order: assign ballot and decree; reject with
/// ERR_CAPACITY_EXCEEDED when the decree is more than `staleness_for_commit`
/// ahead of the last committed decree; prepare locally; send prepare messages
/// to all secondaries and to the learners whose prepare start decree has been
/// reached. A mutation that is already logged is then considered for commit
/// right away; otherwise it is appended to the log. On rejection every client
/// request attached to the mutation is answered with the error.
///
/// @param mu  the mutation to prepare; the replica must be primary (asserted).
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_OK;

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree != invalid_decree)
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    else
    {
        // no decree assigned yet: take the next one after the prepare list
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }

    dinfo("%s: mutation %s init_prepare, mutation_tid=%" PRIu64, name(), mu->name(), mu->tid());

    if (mu->data.header.decree > last_committed_decree() + _options->staleness_for_commit)
    {
        // check bounded staleness
        err = ERR_CAPACITY_EXCEEDED;
    }
    else
    {
        dassert (mu->data.header.decree > last_committed_decree(), "");

        // local prepare
        err = _prepare_list->prepare(mu, PS_PRIMARY);
    }

    if (err != ERR_OK)
    {
        // answer every client request attached to this mutation
        for (auto& r : mu->client_requests)
        {
            response_client_message(r, err);
        }
        return;
    }

    // remote prepare
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options->prepare_timeout_ms_for_secondaries);
    }

    // only learners that have reached their prepare start decree take part
    uint8_t count = 0;
    for (auto& learner : _primary_states.learners)
    {
        if (learner.second.prepare_start_decree != invalid_decree
            && mu->data.header.decree >= learner.second.prepare_start_decree)
        {
            send_prepare_message(learner.first, PS_POTENTIAL_SECONDARY, mu,
                                 _options->prepare_timeout_ms_for_potential_secondaries,
                                 learner.second.signature);
            count++;
        }
    }
    mu->set_left_potential_secondary_ack_count(count);

    if (!mu->is_logged())
    {
        dassert(mu->data.header.log_offset == invalid_offset, "");
        dassert(mu->log_task() == nullptr, "");
        mu->log_task() = _stub->_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            std::bind(&replica::on_append_log_completed, this, mu,
                      std::placeholders::_1,
                      std::placeholders::_2),
            gpid_to_hash(get_gpid())
            );
        dassert(nullptr != mu->log_task(), "");
    }
    else
    {
        do_possible_commit_on_primary(mu);
    }
}
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, error_code err, dsn_message_t request, dsn_message_t reply) { check_hashed_access(); mutation_ptr mu = pr.first; partition_status targetStatus = pr.second; // skip callback for old mutations if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status()) return; dassert (mu->data.header.ballot == get_ballot(), ""); ::dsn::rpc_address node = dsn_msg_to_address(request); partition_status st = _primary_states.get_node_status(node); // handle reply prepare_ack resp; // handle error if (err != ERR_OK) { resp.err = err; } else { ::unmarshall(reply, resp); } ddebug( "%s: mutation %s on_prepare_reply from %s, err = %s", name(), mu->name(), node.to_string(), resp.err.to_string() ); if (resp.err == ERR_OK) { dassert (resp.ballot == get_ballot(), ""); dassert (resp.decree == mu->data.header.decree, ""); switch (targetStatus) { case PS_SECONDARY: dassert (_primary_states.check_exist(node, PS_SECONDARY), ""); dassert (mu->left_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; case PS_POTENTIAL_SECONDARY: dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; default: dwarn( "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name() ); break; } } // failure handling else { // retry for INACTIVE state when there are still time if (resp.err == ERR_INACTIVE_STATE && !mu->is_prepare_close_to_timeout(2, targetStatus == PS_SECONDARY ? _options->prepare_timeout_ms_for_secondaries : _options->prepare_timeout_ms_for_potential_secondaries) ) { send_prepare_message(node, targetStatus, mu, targetStatus == PS_SECONDARY ? 
_options->prepare_timeout_ms_for_secondaries : _options->prepare_timeout_ms_for_potential_secondaries); return; } // make sure this is before any later commit ops // because now commit ops may lead to new prepare ops // due to replication throttling handle_remote_failure(st, node, resp.err); // note targetStatus and (curent) status may diff if (targetStatus == PS_POTENTIAL_SECONDARY) { dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } } } }
/// Callback for the reply to a configuration-update request sent to the meta
/// server.
///
/// The reply is dropped (and the reconfiguration task cleared) when the
/// replica is not inactive or the stub is not connected. Any error other
/// than ERR_INVALID_VERSION makes the same request be sent again. Otherwise,
/// unless the returned ballot is older than the local one, post-update work
/// for the request type is done on success, the returned configuration is
/// applied, and the reconfiguration task is cleared.
///
/// @param err       error of the RPC itself; replaced by the response's error
///                  once the response has been unmarshalled.
/// @param request   the original request message, reused for retries.
/// @param response  the reply message; only read when `err` is ERR_OK.
/// @param req       the configuration-update request that was sent.
void replica::on_update_configuration_on_meta_server_reply(error_code err, dsn_message_t request, dsn_message_t response, std::shared_ptr<configuration_update_request> req)
{
    check_hashed_access();

    if (PS_INACTIVE != status() || !_stub->is_connected())
    {
        _primary_states.reconfiguration_task = nullptr;
        err.end_tracking();
        return;
    }

    configuration_update_response resp;
    if (err == ERR_OK)
    {
        ::unmarshall(response, resp);
        err = resp.err;
    }

    if (err != ERR_OK)
    {
        ddebug(
            "%s: update configuration reply with err %s, request ballot %lld",
            name(),
            err.to_string(),
            req->config.ballot
            );

        if (err != ERR_INVALID_VERSION)
        {
            // send the same request again, to the servers known to the failure detector
            rpc_address target(_stub->_failure_detector->get_servers());
            dsn_msg_add_ref(request); // added for another round of rpc::call
            _primary_states.reconfiguration_task = rpc::call(
                target,
                request,
                this,
                std::bind(&replica::on_update_configuration_on_meta_server_reply, this,
                          std::placeholders::_1,
                          std::placeholders::_2,
                          std::placeholders::_3,
                          req),
                gpid_to_hash(get_gpid())
                );
            return;
        }
    }

    ddebug(
        "%s: update configuration reply with err %s, ballot %lld, local %lld",
        name(),
        resp.err.to_string(),
        resp.config.ballot,
        get_ballot()
        );

    if (resp.config.ballot < get_ballot())
    {
        _primary_states.reconfiguration_task = nullptr;
        return;
    }

    // post-update work items?
    if (resp.err == ERR_OK)
    {
        dassert (req->config.gpid == resp.config.gpid, "");
        dassert (req->config.app_type == resp.config.app_type, "");
        dassert (req->config.primary == resp.config.primary, "");
        dassert (req->config.secondaries == resp.config.secondaries, "");

        switch (req->type)
        {
        case CT_ASSIGN_PRIMARY:
        case CT_DOWNGRADE_TO_SECONDARY:
        case CT_DOWNGRADE_TO_INACTIVE:
        case CT_UPGRADE_TO_SECONDARY:
            // nothing extra to do
            break;

        case CT_UPGRADE_TO_PRIMARY:
            _primary_states.last_prepare_decree_on_new_primary = _prepare_list->max_decree();
            break;

        case CT_REMOVE:
            // tell the removed node, unless the removed node is primary_address()
            if (req->node != primary_address())
            {
                replica_configuration removed_config;
                replica_helper::get_replica_config(resp.config, req->node, removed_config);
                rpc::call_one_way_typed(req->node, RPC_REMOVE_REPLICA, removed_config, gpid_to_hash(get_gpid()));
            }
            break;

        default:
            dassert (false, "");
        }
    }

    update_configuration(resp.config);
    _primary_states.reconfiguration_task = nullptr;
}
/// Handles a prepare message received from the primary.
///
/// The replica configuration and the mutation are read from the message.
/// Mutations of an older ballot are dropped; a newer ballot first updates the
/// local configuration. Inactive or error replicas ack with an error. A
/// potential secondary restarts learning when the learner signature differs
/// and drops the prepare unless it is learning with prepare or has finished
/// learning. Decrees that are already committed, and duplicates that are
/// already logged, are acked immediately. Everything else is put into the
/// prepare list and appended to the log.
///
/// @param request  the prepare message.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    {
        rpc_read_stream reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    decree prepared_decree = mu->data.header.decree;

    dinfo("%s: mutation %s on_prepare", name(), mu->name());

    dassert(mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        derror("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        // no need response because the rpc should have been cancelled on primary in this case
        return;
    }

    // update configuration when necessary
    if (rconfig.ballot > get_ballot() && !update_local_configuration(rconfig))
    {
        derror(
            "%s: mutation %s on_prepare failed as update local configuration failed, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(ERR_INVALID_STATE, mu);
        return;
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        derror(
            "%s: mutation %s on_prepare failed as invalid replica state, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );
        // transient inactivity is reported with its own error code
        if (PS_INACTIVE == status() && _inactive_is_transient)
        {
            ack_prepare_message(ERR_INACTIVE_STATE, mu);
        }
        else
        {
            ack_prepare_message(ERR_INVALID_STATE, mu);
        }
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        // new learning process
        if (rconfig.learner_signature != _potential_secondary_states.learning_signature)
        {
            init_learn(rconfig.learner_signature);
            // no need response as rpc is already gone
            return;
        }

        const bool accepts_prepare =
            _potential_secondary_states.learning_status == LearningWithPrepare
            || _potential_secondary_states.learning_status == LearningSucceeded;
        if (!accepts_prepare)
        {
            derror(
                "%s: mutation %s on_prepare skipped as invalid learning status, state = %s, learning_status = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );
            // no need response as rpc is already gone
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed: just ack
    if (prepared_decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto existing = _prepare_list->get_mutation_by_decree(prepared_decree);
    if (existing != nullptr && existing->data.header.ballot == mu->data.header.ballot)
    {
        if (!existing->is_logged())
        {
            derror("%s: mutation %s on_prepare skipped as it is duplicate", name(), mu->name());
            // response will be unnecessary when we add retry logic in rpc engine.
            // the retried rpc will use the same id therefore it will be considered responded
            // even if the response is for a previous try.
        }
        else
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options->max_mutation_count_in_prepare_list, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options->staleness_for_commit, "");
    }

    // write log
    dassert(mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );
}
/// Callback invoked when appending a mutation to the shared log has
/// completed.
///
/// On success the mutation is marked as logged. Unless the mutation is from
/// an older ballot or the replica is inactive, a primary tries to commit (or
/// handles the local failure), and a secondary or potential secondary handles
/// any failure and always acks the prepare. A log failure is additionally
/// reported to the stub. On success, when a private log exists and the
/// replica is not in PS_ERROR, the mutation is also appended to the private
/// log.
///
/// @param mu    the mutation whose log write finished.
/// @param err   result of the shared-log write.
/// @param size  size reported by the log write; only used for logging.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // `size` is a size_t but the format uses %u, so it is cast explicitly to
    // keep the argument type in step with the conversion specifier.
    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), static_cast<unsigned int>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                //
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                // same explicit cast as above: %u expects an unsigned int
                dinfo("%s: append private log completed for mutation %s, size = %u, err = %s",
                      name(), mu->name(), static_cast<unsigned int>(size), err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                           name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
/// Starts the prepare phase for a mutation on the primary.
///
/// Steps, in order: reject when the group has fewer members than
/// `mutation_2pc_min_replica_count`; assign ballot and decree; reject with
/// ERR_CAPACITY_EXCEEDED when the decree is more than `staleness_for_commit`
/// ahead of the last committed decree; prepare locally; send prepare messages
/// to all secondaries and to the learners whose prepare start decree has been
/// reached; finally append the mutation to the log. Rejections are reported
/// to the client through response_client_message().
///
/// @param mu  the mutation to prepare; the replica must be primary (asserted).
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_OK;

    const int member_count = static_cast<int>(_primary_states.membership.secondaries.size()) + 1;
    if (member_count < _options.mutation_2pc_min_replica_count)
    {
        err = ERR_NOT_ENOUGH_MEMBER;
        response_client_message(mu->client_msg(), err);
        return;
    }

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree != invalid_decree)
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    else
    {
        // no decree assigned yet: take the next one after the prepare list
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }

    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options.staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        response_client_message(mu->client_msg(), err);
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        response_client_message(mu->client_msg(), err);
        return;
    }

    // remote prepare
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    // only learners that have reached their prepare start decree take part
    uint8_t count = 0;
    for (auto& learner : _primary_states.learners)
    {
        if (learner.second.prepare_start_decree != invalid_decree
            && mu->data.header.decree >= learner.second.prepare_start_decree)
        {
            send_prepare_message(learner.first, PS_POTENTIAL_SECONDARY, mu, _options.prepare_timeout_ms_for_potential_secondaries);
            count++;
        }
    }
    mu->set_left_potential_secondary_ack_count(count);

    // it is possible to do commit here when logging is not required for acking prepare.
    // however, it is only possible when replica count == 1 at this moment in the
    // replication group, and we don't want to do this as it is too fragile now.
    // do_possible_commit_on_primary(mu);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );
    dassert(nullptr != mu->log_task(), "");
}
/// Callback invoked when appending a mutation to the shared log has
/// completed.
///
/// On success the mutation is marked as logged. Unless the mutation is from
/// an older ballot or the replica is inactive, a primary tries to commit (or
/// handles the local failure), and a secondary or potential secondary handles
/// any failure and always acks the prepare. A log failure is additionally
/// reported to the stub. On success, when a private log exists and the
/// replica is not in PS_ERROR, the mutation is also appended to the private
/// log without a completion callback.
///
/// @param mu    the mutation whose log write finished.
/// @param err   result of the shared-log write.
/// @param size  size reported by the log write; only used for logging.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // `size` is a size_t but the format uses %u, so it is cast explicitly to
    // keep the argument type in step with the conversion specifier.
    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), static_cast<unsigned int>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
/// Entry point for a configuration proposal.
///
/// Proposals older than the local ballot are dropped, and so are proposals
/// that arrive while a reconfiguration task is pending. A proposal with a
/// newer ballot first updates the configuration and is dropped if that fails.
/// The proposal is then dispatched to the handler for its type.
///
/// @param proposal  the proposal; handlers may modify it.
void replica::on_config_proposal(configuration_update_request& proposal)
{
    check_hashed_access();

    ddebug(
        "%s: on_config_proposal %s for %s",
        name(),
        enum_to_string(proposal.type),
        proposal.node.to_string()
        );

    if (proposal.config.ballot < get_ballot())
    {
        dwarn(
            "%s: on_config_proposal is out-dated, %lld vs %lld",
            name(),
            proposal.config.ballot,
            get_ballot()
            );
        return;
    }

    if (_primary_states.reconfiguration_task != nullptr)
    {
        dinfo(
            "%s: reconfiguration on the way, skip the incoming proposal",
            name()
            );
        return;
    }

    // is closing or update failed
    if (proposal.config.ballot > get_ballot() && !update_configuration(proposal.config))
    {
        return;
    }

    switch (proposal.type)
    {
    case CT_ASSIGN_PRIMARY:
    case CT_UPGRADE_TO_PRIMARY:
        assign_primary(proposal);
        break;

    case CT_ADD_SECONDARY:
        add_potential_secondary(proposal);
        break;

    case CT_DOWNGRADE_TO_SECONDARY:
        downgrade_to_secondary_on_primary(proposal);
        break;

    case CT_DOWNGRADE_TO_INACTIVE:
        downgrade_to_inactive_on_primary(proposal);
        break;

    case CT_REMOVE:
        remove(proposal);
        break;

    default:
        // unknown proposal type
        dassert (false, "");
    }
}
// Handles an incoming prepare message.
//
// The message is decoded into a replica configuration plus a mutation. The
// request is then filtered (stale ballot, failed configuration update,
// inactive/error status, potential secondary that is not yet accepting
// prepares, already-committed decree, redundant prepare). A mutation that
// survives the filters is put into the prepare list and appended to the
// shared log; on_append_log_completed is bound as the completion callback.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    // decode: configuration first, then the mutation itself
    {
        msg_binary_reader reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    const decree incoming_decree = mu->data.header.decree;

    ddebug("%s: mutation %s on_prepare", name(), mu->name());

    dassert (mu->data.header.ballot == rconfig.ballot, "");

    // guard: mutation from an older view
    if (mu->data.header.ballot < get_ballot())
    {
        ddebug("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // update configuration when necessary
    if (rconfig.ballot > get_ballot() && !update_local_configuration(rconfig))
    {
        ddebug(
            "%s: mutation %s on_prepare to %s failed as update local configuration failed",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(ERR_INVALID_STATE, mu);
        return;
    }

    // guard: replica cannot take prepares in these states
    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        ddebug(
            "%s: mutation %s on_prepare to %s skipped",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
            );
        return;
    }

    // guard: a potential secondary only accepts prepares in two learning states
    if (PS_POTENTIAL_SECONDARY == status())
    {
        const bool accepting_prepare =
            _potential_secondary_states.learning_status == LearningWithPrepare ||
            _potential_secondary_states.learning_status == LearningSucceeded;

        if (!accepting_prepare)
        {
            ddebug(
                "%s: mutation %s on_prepare to %s skipped, learnings state = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );

            // no ack here: do not retry as there may be retries later
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed locally: acknowledge right away
    if (incoming_decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto existing = _prepare_list->get_mutation_by_decree(incoming_decree);
    const bool redundant =
        existing != nullptr && existing->data.header.ballot == mu->data.header.ballot;

    if (redundant)
    {
        ddebug("%s: mutation %s redundant prepare skipped", name(), mu->name());

        if (existing->is_logged() || _options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    // sanity-check how far ahead of the committed decree this mutation is
    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_start_prepare_for_potential_secondary, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    }

    // ack without logging
    if (_options.prepare_ack_on_secondary_before_logging_allowed)
    {
        ack_prepare_message(err, mu);
    }

    // write log
    dassert (mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    dassert(mu->log_task() != nullptr, "");
}
// Completion callback for the log append issued for mutation `mu`.
//
//   mu   - mutation whose append finished; marked logged when err == ERR_OK
//   err  - result of the append
//   size - size reported by the append (unused here)
//
// Mutations older than the current ballot, or completing while the replica is
// inactive, are ignored after the logged flag is updated. Otherwise the
// primary tries to commit (or handles the local failure), and secondaries /
// potential secondaries ack the prepare with the append result. A failed
// append is then reported to the stub; a successful one is additionally
// appended to the private log when one exists and the replica is not in
// PS_ERROR.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug("%s: mutation %s on_append_log_completed, err = %s",
           name(), mu->name(), err.to_string());

    const bool append_ok = (err == ERR_OK);
    if (append_ok)
    {
        mu->set_logged();
    }

    // skip old mutations
    const bool still_relevant =
        mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE;
    if (!still_relevant)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (append_ok)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;

    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (!append_ok)
        {
            handle_local_failure(err);
        }
        // the ack carries the append result either way
        ack_prepare_message(err, mu);
        break;

    case PS_ERROR:
        break;

    default:
        dassert (false, "");
        break;
    }

    // mutation log failure, propagated to all replicas
    if (!append_ok)
    {
        _stub->handle_log_failure(err);
        return;
    }

    // write local private log if necessary
    if (_private_log && status() != PS_ERROR)
    {
        _private_log->append(
            mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            [this](error_code private_err, size_t /*private_size*/)
            {
                if (private_err != ERR_OK)
                {
                    handle_local_failure(private_err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
bool replica::update_local_configuration(const replica_configuration& config, bool same_ballot/* = false*/) { dassert(config.ballot > get_ballot() || (same_ballot && config.ballot == get_ballot()), ""); dassert (config.gpid == get_gpid(), ""); partition_status old_status = status(); ballot old_ballot = get_ballot(); // skip unncessary configuration change if (old_status == config.status && old_ballot == config.ballot) return true; // skip invalid change switch (old_status) { case PS_ERROR: { ddebug( "%s: status change from %s @ %lld to %s @ %lld is not allowed", name(), enum_to_string(old_status), old_ballot, enum_to_string(config.status), config.ballot ); return false; } break; case PS_INACTIVE: if ((config.status == PS_PRIMARY || config.status == PS_SECONDARY) && !_inactive_is_transient) { ddebug( "%s: status change from %s @ %lld to %s @ %lld is not allowed when inactive state is not transient", name(), enum_to_string(old_status), old_ballot, enum_to_string(config.status), config.ballot ); return false; } break; case PS_POTENTIAL_SECONDARY: if (config.status == PS_ERROR || config.status == PS_INACTIVE) { if (!_potential_secondary_states.cleanup(false)) { dwarn( "%s: status change from %s @ %lld to %s @ %lld is not allowed coz learning remote state is still running", name(), enum_to_string(old_status), old_ballot, enum_to_string(config.status), config.ballot ); return false; } } break; case PS_SECONDARY: if (config.status != PS_SECONDARY && _secondary_states.checkpoint_task != nullptr) { dwarn( "%s: status change from %s @ %lld to %s @ %lld is not allowed coz checkpointing is still running", name(), enum_to_string(old_status), old_ballot, enum_to_string(config.status), config.ballot ); return false; } break; } uint64_t oldTs = _last_config_change_time_ms; _config = config; _last_config_change_time_ms =now_ms(); dassert (max_prepared_decree() >= last_committed_decree(), ""); switch (old_status) { case PS_PRIMARY: cleanup_preparing_mutations(true); switch 
(config.status) { case PS_PRIMARY: replay_prepare_list(); break; case PS_INACTIVE: _primary_states.cleanup(old_ballot != config.ballot); break; case PS_SECONDARY: case PS_ERROR: _primary_states.cleanup(); break; case PS_POTENTIAL_SECONDARY: dassert (false, "invalid execution path"); break; default: dassert (false, "invalid execution path"); } break; case PS_SECONDARY: cleanup_preparing_mutations(true); switch (config.status) { case PS_PRIMARY: init_group_check(); replay_prepare_list(); break; case PS_SECONDARY: break; case PS_POTENTIAL_SECONDARY: // InActive in config break; case PS_INACTIVE: break; case PS_ERROR: break; default: dassert (false, "invalid execution path"); } break; case PS_POTENTIAL_SECONDARY: switch (config.status) { case PS_PRIMARY: dassert (false, "invalid execution path"); break; case PS_SECONDARY: _prepare_list->truncate(_app->last_committed_decree()); _potential_secondary_states.cleanup(true); check_state_completeness(); break; case PS_POTENTIAL_SECONDARY: break; case PS_INACTIVE: _potential_secondary_states.cleanup(true); break; case PS_ERROR: _prepare_list->reset(_app->last_committed_decree()); _potential_secondary_states.cleanup(true); break; default: dassert (false, "invalid execution path"); } break; case PS_INACTIVE: switch (config.status) { case PS_PRIMARY: _inactive_is_transient = false; init_group_check(); replay_prepare_list(); break; case PS_SECONDARY: _inactive_is_transient = false; break; case PS_POTENTIAL_SECONDARY: _inactive_is_transient = false; break; case PS_INACTIVE: break; case PS_ERROR: _inactive_is_transient = false; break; default: dassert (false, "invalid execution path"); } break; case PS_ERROR: switch (config.status) { case PS_PRIMARY: dassert (false, "invalid execution path"); break; case PS_SECONDARY: dassert (false, "invalid execution path"); break; case PS_POTENTIAL_SECONDARY: dassert(false, "invalid execution path"); break; case PS_INACTIVE: dassert (false, "invalid execution path"); break; case PS_ERROR: break; 
default: dassert (false, "invalid execution path"); } break; default: dassert (false, "invalid execution path"); } dwarn( "%s: status change %s @ %lld => %s @ %lld, pre(%llu, %llu), app(%llu, %llu), duration=%llu ms", name(), enum_to_string(old_status), old_ballot, enum_to_string(status()), get_ballot(), _prepare_list->max_decree(), _prepare_list->last_committed_decree(), _app->last_committed_decree(), _app->last_durable_decree(), _last_config_change_time_ms - oldTs ); if (status() != old_status) { bool isClosing = (status() == PS_ERROR || (status() == PS_INACTIVE && get_ballot() > old_ballot)); _stub->notify_replica_state_update(config, isClosing); if (isClosing) { ddebug("%s: being close ...", name()); _stub->begin_close_replica(this); return false; } } else { _stub->notify_replica_state_update(config, false); } return true; }
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, int err, message_ptr& request, message_ptr& reply) { check_hashed_access(); mutation_ptr& mu = pr.first; partition_status targetStatus = pr.second; // skip callback for old mutations if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status()) return; dassert (mu->data.header.ballot == get_ballot(), ""); end_point node = request->header().to_address; partition_status st = _primary_states.GetNodeStatus(node); // handle reply prepare_ack resp; // handle error if (err) { resp.err = err; } else { unmarshall(reply, resp); ddebug( "%s: mutation %s on_prepare_reply from %s:%d", name(), mu->name(), node.name.c_str(), static_cast<int>(node.port) ); } if (resp.err == ERR_SUCCESS) { dassert (resp.ballot == get_ballot(), ""); dassert (resp.decree == mu->data.header.decree, ""); switch (targetStatus) { case PS_SECONDARY: dassert (_primary_states.check_exist(node, PS_SECONDARY), ""); dassert (mu->left_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; case PS_POTENTIAL_SECONDARY: dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; default: ddebug( "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name() ); break; } } // failure handling else { // note targetStatus and (curent) status may diff if (targetStatus == PS_POTENTIAL_SECONDARY) { dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } } handle_remote_failure(st, node, resp.err); } }