// Completion handler for a group-check RPC that was sent to `req->node`.
//   err  - RPC-level result; `resp` is only read when this is ERR_OK
//   req  - the request that was sent (target node, ballot and target status)
//   resp - the reply from the remote node
void replica::on_group_check_reply(error_code err, const std::shared_ptr<group_check_request>& req, const std::shared_ptr<group_check_response>& resp)
{
    check_hashed_access();

    // nothing to do when we are not primary any more ...
    if (partition_status::PS_PRIMARY != status())
    {
        return;
    }

    // ... or when the request was issued under an older ballot
    if (req->config.ballot < get_ballot())
    {
        return;
    }

    // exactly one pending entry is expected for this node
    auto erased = _primary_states.group_check_pending_replies.erase(req->node);
    dassert (erased == 1, "");

    // RPC-level failure
    if (err != ERR_OK)
    {
        handle_remote_failure(req->config.status, req->node, err);
        return;
    }

    // failure reported by the remote replica
    if (resp->err != ERR_OK)
    {
        handle_remote_failure(req->config.status, req->node, resp->err);
        return;
    }

    // a potential secondary that reports a finished learning process
    const bool learning_finished =
        resp->learner_status_ == learner_status::LearningSucceeded
        && req->config.status == partition_status::PS_POTENTIAL_SECONDARY;
    if (learning_finished)
    {
        handle_learning_succeeded_on_primary(req->node, resp->learner_signature);
    }
}
// Called when a checkpoint attempt on a secondary has finished with result `err`.
// After a successful checkpoint, brings the app up to the prepare list's
// last committed decree, either directly from the prepare list or by
// scheduling a catch-up from the private logs.
void replica::on_checkpoint_completed(error_code err)
{
    check_hashed_access();

    // closing or wrong timing: just mark checkpointing as done
    if (PS_SECONDARY != status() || ERR_WRONG_TIMING == err)
    {
        _secondary_states.checkpoint_task = nullptr;
        return;
    }

    // checkpoint failed: mark it done and report the local failure
    if (err != ERR_OK)
    {
        _secondary_states.checkpoint_task = nullptr;
        handle_local_failure(err);
        return;
    }

    auto committed = _prepare_list->last_committed_decree();

    // no missing commits: everything is ok now, done checkpointing
    if (!(committed > _app->last_committed_decree()))
    {
        _secondary_states.checkpoint_task = nullptr;
        return;
    }

    // missed commits are not covered by the prepare list, so they have to
    // be loaded via the private logs
    if (!(_app->last_committed_decree() > _prepare_list->min_decree()))
    {
        _secondary_states.checkpoint_task = tasking::enqueue(
            LPC_CHECKPOINT_REPLICA,
            this,
            [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
            gpid_to_hash(get_gpid())
            );
        return;
    }

    // missed commits are covered by the prepare list: replay them in order
    for (auto next = _app->last_committed_decree() + 1; next <= committed; next++)
    {
        auto pending = _prepare_list->get_mutation_by_decree(next);
        dassert(nullptr != pending, "");
        _app->write_internal(pending);
    }

    // everything is ok now, done checkpointing
    _secondary_states.checkpoint_task = nullptr;
}
void replica::on_copy_checkpoint_ack(error_code err, std::shared_ptr<replica_configuration>& req, std::shared_ptr<learn_response>& resp) { check_hashed_access(); if (PS_PRIMARY != status()) { _primary_states.checkpoint_task = nullptr; return; } if (err != ERR_OK || resp == nullptr) { dwarn("%s: copy checkpoint from secondary failed, err = %s", name(), err.to_string()); _primary_states.checkpoint_task = nullptr; return; } if (resp->err != ERR_OK) { dinfo("%s: copy checkpoint from secondary failed, err = %s", name(), resp->err.to_string()); _primary_states.checkpoint_task = nullptr; return; } if (resp->state.to_decree_included <= _app->last_durable_decree()) { dinfo("%s: copy checkpoint from secondary skipped, as its decree is not bigger than current durable_decree: %" PRIu64 " vs %" PRIu64 "", name(), resp->state.to_decree_included, _app->last_durable_decree() ); _primary_states.checkpoint_task = nullptr; return; } std::string ldir = utils::filesystem::path_combine( _app->learn_dir(), "checkpoint.copy" ); if (utils::filesystem::path_exists(ldir)) utils::filesystem::remove_path(ldir); _primary_states.checkpoint_task = file::copy_remote_files( resp->address, resp->base_local_dir, resp->state.files, ldir, false, LPC_REPLICA_COPY_LAST_CHECKPOINT_DONE, this, [this, resp](error_code err, size_t sz) { this->on_copy_checkpoint_file_completed(err, sz, resp); }, gpid_to_hash(get_gpid()) ); }
// Completion handler for the remote file copy started by on_copy_checkpoint_ack.
//   err  - result of the file copy
//   sz   - size reported by the copy operation (not used here)
//   resp - the learn response describing the copied checkpoint
//
// The copied checkpoint is applied only when the copy succeeded, this replica
// is still primary, and the checkpoint is newer than the local durable decree.
// The pending checkpoint task is cleared in every case.
void replica::on_copy_checkpoint_file_completed(error_code err, size_t sz, std::shared_ptr<learn_response> resp)
{
    check_hashed_access();

    // a failed copy may have left missing or partial files behind;
    // never apply a checkpoint from such a directory
    if (err != ERR_OK)
    {
        dwarn("%s: copy checkpoint files from secondary failed, err = %s", name(), err.to_string());
        _primary_states.checkpoint_task = nullptr;
        return;
    }

    if (PS_PRIMARY == status() && resp->state.to_decree_included > _app->last_durable_decree())
    {
        _app->apply_checkpoint(resp->state, CHKPT_COPY);
    }

    _primary_states.checkpoint_task = nullptr;
}
void replica::on_client_write(int code, dsn_message_t request) { check_hashed_access(); if (PS_PRIMARY != status()) { response_client_message(request, ERR_INVALID_STATE); return; } mutation_ptr mu = new_mutation(_prepare_list->max_decree() + 1); mu->set_client_request(code, request); init_prepare(mu); }
// Callback invoked when appending `mu` to the log has completed.
//   mu   - the mutation that was appended
//   err  - result of the append
//   size - size reported by the log (not used here)
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %s",
        name(),
        mu->name(),
        err.to_string()
        );

    const bool append_ok = (err == ERR_OK);
    if (append_ok)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot())
    {
        return;
    }
    if (status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        // on the primary a logged mutation may now be committable
        if (!append_ok)
        {
            handle_local_failure(err);
        }
        else
        {
            do_possible_commit_on_primary(mu);
        }
        break;

    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (!append_ok)
        {
            handle_local_failure(err);
        }
        // when acking before logging is allowed, the ack is not sent from here
        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
        break;

    case PS_ERROR:
        break;

    default:
        dassert (false, "");
        break;
    }
}
void replica::init_checkpoint() { check_hashed_access(); // only applicable to primary and secondary replicas if (status() != PS_PRIMARY && status() != PS_SECONDARY) return; // no need to checkpoint if (_app->is_delta_state_learning_supported()) return; // already running if (_secondary_states.checkpoint_task != nullptr) return; // private log must be enabled to make sure commits // are not lost during checkpinting dassert(nullptr != _private_log, "log_enable_private_prepare must be true for checkpointing"); // TODO: when NOT to checkpoint, but use private log replay to build the state if (last_committed_decree() - last_durable_decree() < 10000) return; // primary is downgraded to secondary for checkpointing as no write can be seen // during checkpointing (i.e., state is freezed) if (PS_PRIMARY == status()) { configuration_update_request proposal; proposal.config = _primary_states.membership; proposal.type = CT_DOWNGRADE_TO_SECONDARY; proposal.node = proposal.config.primary; downgrade_to_secondary_on_primary(proposal); } // secondary can start checkpint in the long running thread pool else { dassert(PS_SECONDARY == status(), ""); _secondary_states.checkpoint_task = tasking::enqueue( LPC_CHECKPOINT_REPLICA, this, &replica::checkpoint, gpid_to_hash(get_gpid()) ); } }
// Entry point for a client write request.
// Non-primary replicas reply with ERR_INVALID_STATE; the primary hands the
// request to the write queue and starts a prepare only when the queue
// returns a mutation.
void replica::on_client_write(int code, dsn_message_t request)
{
    check_hashed_access();

    const bool is_primary = (PS_PRIMARY == status());
    if (!is_primary)
    {
        response_client_message(request, ERR_INVALID_STATE);
        return;
    }

    auto ready_mu = _primary_states.write_queue.add_work(code, request, this);
    if (!ready_mu)
    {
        // the queue returned no mutation for now
        return;
    }

    init_prepare(ready_mu);
}
void replica::init_group_check() { check_hashed_access(); ddebug("%s: init group check", name()); if (partition_status::PS_PRIMARY != status() || _options->group_check_disabled) return; dassert (nullptr == _primary_states.group_check_task, ""); _primary_states.group_check_task = tasking::enqueue_timer( LPC_GROUP_CHECK, this, [this] {broadcast_group_check();}, std::chrono::milliseconds(_options->group_check_interval_ms), gpid_to_thread_hash(get_gpid()) ); }
// Callback invoked when appending `mu` to the log has completed.
//   mu   - the mutation that was appended
//   err  - result of the append (ERR_SUCCESS on success)
//   size - size reported by the log (not used here)
void replica::on_append_log_completed(mutation_ptr& mu, uint32_t err, uint32_t size)
{
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %u",
        name(),
        mu->name(),
        err
        );

    const bool append_ok = (err == ERR_SUCCESS);
    if (append_ok)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot())
    {
        return;
    }
    if (status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (!append_ok)
        {
            handle_local_failure(err);
        }
        else
        {
            do_possible_commit_on_primary(mu);
        }
        break;

    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (!append_ok)
        {
            handle_local_failure(err);
        }
        // the ack carries the append result, success or failure
        ack_prepare_message(err, mu);
        break;

    case PS_ERROR:
        break;

    default:
        dassert (false, "");
        break;
    }
}
// Entry point for a client write request.
// Replies with ERR_INVALID_STATE when this replica is not primary, and with
// ERR_NOT_ENOUGH_MEMBER when primary plus secondaries number fewer than
// `mutation_2pc_min_replica_count`. Otherwise the request goes to the write
// queue, and a prepare is started when the queue returns a mutation.
void replica::on_client_write(task_code code, dsn_message_t request)
{
    check_hashed_access();

    if (PS_PRIMARY != status())
    {
        response_client_message(request, ERR_INVALID_STATE);
        return;
    }

    // the primary itself counts as one member
    const int member_count = static_cast<int>(_primary_states.membership.secondaries.size()) + 1;
    if (member_count < _options->mutation_2pc_min_replica_count)
    {
        response_client_message(request, ERR_NOT_ENOUGH_MEMBER);
        return;
    }

    auto ready_mu = _primary_states.write_queue.add_work(code, request, this);
    if (!ready_mu)
    {
        return;
    }

    init_prepare(ready_mu);
}
// @ secondary void replica::on_copy_checkpoint(const replica_configuration& request, /*out*/ learn_response& response) { check_hashed_access(); if (request.ballot > get_ballot()) { if (!update_local_configuration(request)) { response.err = ERR_INVALID_STATE; return; } } if (status() != PS_SECONDARY) { response.err = ERR_INVALID_STATE; return; } if (_app->last_durable_decree() == 0) { response.err = ERR_OBJECT_NOT_FOUND; return; } blob placeholder; int err = _app->get_checkpoint(0, placeholder, response.state); if (err != 0) { response.err = ERR_LEARN_FILE_FAILED; } else { response.err = ERR_OK; response.last_committed_decree = last_committed_decree(); response.base_local_dir = _app->data_dir(); response.address = _stub->_primary_address; } }
// Callback invoked when appending `mu` to the log has completed.
// Beyond the per-status handling, a failed append is reported to the stub,
// and a successful one is followed by an append to the private log when
// there is one.
//   mu   - the mutation that was appended
//   err  - result of the append
//   size - size reported by the log (not used here)
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %s",
        name(),
        mu->name(),
        err.to_string()
        );

    const bool append_ok = (err == ERR_OK);
    if (append_ok)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot())
    {
        return;
    }
    if (status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (!append_ok)
        {
            handle_local_failure(err);
        }
        else
        {
            do_possible_commit_on_primary(mu);
        }
        break;

    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (!append_ok)
        {
            handle_local_failure(err);
        }
        ack_prepare_message(err, mu);
        break;

    case PS_ERROR:
        break;

    default:
        dassert (false, "");
        break;
    }

    // mutation log failure, propagated to all replicas
    if (!append_ok)
    {
        _stub->handle_log_failure(err);
        return;
    }

    // write local private log if necessary
    if (_private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            [this](error_code private_err, size_t private_size)
            {
                if (private_err != ERR_OK)
                {
                    handle_local_failure(private_err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
void replica::on_group_check(const group_check_request& request, /*out*/ group_check_response& response) { check_hashed_access(); ddebug( "%s: process group check, primary = %s, ballot = %" PRId64 ", status = %s, last_committed_decree = %" PRId64, name(), request.config.primary.to_string(), request.config.ballot, enum_to_string(request.config.status), request.last_committed_decree ); if (request.config.ballot < get_ballot()) { response.err = ERR_VERSION_OUTDATED; dwarn("%s: on_group_check reply %s", name(), response.err.to_string()); return; } else if (request.config.ballot > get_ballot()) { if (!update_local_configuration(request.config)) { response.err = ERR_INVALID_STATE; dwarn("%s: on_group_check reply %s", name(), response.err.to_string()); return; } } else if (is_same_ballot_status_change_allowed(status(), request.config.status)) { update_local_configuration(request.config, true); } switch (status()) { case partition_status::PS_INACTIVE: break; case partition_status::PS_SECONDARY: if (request.last_committed_decree > last_committed_decree()) { _prepare_list->commit(request.last_committed_decree, COMMIT_TO_DECREE_HARD); } break; case partition_status::PS_POTENTIAL_SECONDARY: init_learn(request.config.learner_signature); break; case partition_status::PS_ERROR: break; default: dassert (false, ""); } response.pid = get_gpid(); response.node = _stub->_primary_address; response.err = ERR_OK; if (status() == partition_status::PS_ERROR) { response.err = ERR_INVALID_STATE; dwarn("%s: on_group_check reply %s", name(), response.err.to_string()); } response.last_committed_decree_in_app = _app->last_committed_decree(); response.last_committed_decree_in_prepare_list = last_committed_decree(); response.learner_status_ = _potential_secondary_states.learning_status; response.learner_signature = _potential_secondary_states.learning_version; }
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, error_code err, dsn_message_t request, dsn_message_t reply) { check_hashed_access(); mutation_ptr mu = pr.first; partition_status targetStatus = pr.second; // skip callback for old mutations if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status()) return; dassert (mu->data.header.ballot == get_ballot(), ""); ::dsn::rpc_address node = dsn_msg_to_address(request); partition_status st = _primary_states.get_node_status(node); // handle reply prepare_ack resp; // handle error if (err != ERR_OK) { resp.err = err; } else { ::unmarshall(reply, resp); } ddebug( "%s: mutation %s on_prepare_reply from %s, err = %s", name(), mu->name(), node.to_string(), resp.err.to_string() ); if (resp.err == ERR_OK) { dassert (resp.ballot == get_ballot(), ""); dassert (resp.decree == mu->data.header.decree, ""); switch (targetStatus) { case PS_SECONDARY: dassert (_primary_states.check_exist(node, PS_SECONDARY), ""); dassert (mu->left_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; case PS_POTENTIAL_SECONDARY: dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; default: dwarn( "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name() ); break; } } // failure handling else { // retry for INACTIVE state when there are still time if (resp.err == ERR_INACTIVE_STATE && !mu->is_prepare_close_to_timeout(2, targetStatus == PS_SECONDARY ? _options->prepare_timeout_ms_for_secondaries : _options->prepare_timeout_ms_for_potential_secondaries) ) { send_prepare_message(node, targetStatus, mu, targetStatus == PS_SECONDARY ? 
_options->prepare_timeout_ms_for_secondaries : _options->prepare_timeout_ms_for_potential_secondaries); return; } // make sure this is before any later commit ops // because now commit ops may lead to new prepare ops // due to replication throttling handle_remote_failure(st, node, resp.err); // note targetStatus and (curent) status may diff if (targetStatus == PS_POTENTIAL_SECONDARY) { dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } } } }
// Callback invoked when appending `mu` to the shared log has completed.
//   mu   - the mutation that was appended
//   err  - result of the append
//   size - size reported by the log, only used for logging
//
// A failed append is reported to the stub; a successful one is followed by
// an append to the private log when there is one and the replica is not in
// PS_ERROR.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // size_t does not match "%u"; print it through a fixed-width type instead
    dinfo("%s: append shared log completed for mutation %s, size = %" PRIu64 ", err = %s",
        name(), mu->name(), static_cast<uint64_t>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
            name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                //
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                dinfo("%s: append private log completed for mutation %s, size = %" PRIu64 ", err = %s",
                    name(), mu->name(), static_cast<uint64_t>(size), err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                        name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
// Handles a prepare message from the primary.
// Reads the sender's configuration and the mutation from `request`, brings
// the local configuration up to date when needed, validates the replica
// state, and then either acks immediately (already committed, or a logged
// duplicate) or adds the mutation to the prepare list and appends it to the
// shared log.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    {
        rpc_read_stream reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    decree mu_decree = mu->data.header.decree;

    dinfo("%s: mutation %s on_prepare", name(), mu->name());

    dassert(mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        derror("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        // no response needed: the rpc should have been cancelled on the primary in this case
        return;
    }

    // update configuration when necessary
    if (rconfig.ballot > get_ballot())
    {
        if (!update_local_configuration(rconfig))
        {
            derror(
                "%s: mutation %s on_prepare failed as update local configuration failed, state = %s",
                name(), mu->name(),
                enum_to_string(status())
                );
            ack_prepare_message(ERR_INVALID_STATE, mu);
            return;
        }
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        derror(
            "%s: mutation %s on_prepare failed as invalid replica state, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );
        // a transient inactive state is reported with its own error code
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
            );
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        // new learning process
        if (rconfig.learner_signature != _potential_secondary_states.learning_signature)
        {
            init_learn(rconfig.learner_signature);
            // no response needed as the rpc is already gone
            return;
        }

        if (_potential_secondary_states.learning_status != LearningWithPrepare
            && _potential_secondary_states.learning_status != LearningSucceeded)
        {
            derror(
                "%s: mutation %s on_prepare skipped as invalid learning status, state = %s, learning_status = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );
            // no response needed as the rpc is already gone
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed locally: ack right away
    if (mu_decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto existing = _prepare_list->get_mutation_by_decree(mu_decree);
    if (existing != nullptr && existing->data.header.ballot == mu->data.header.ballot)
    {
        if (!existing->is_logged())
        {
            derror("%s: mutation %s on_prepare skipped as it is duplicate", name(), mu->name());
            // the response will be unnecessary when retry logic is added to the rpc engine:
            // the retried rpc will use the same id, so it will be considered responded
            // even if the response is for a previous try.
        }
        else
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code prepare_err = _prepare_list->prepare(mu, status());
    dassert (prepare_err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options->max_mutation_count_in_prepare_list, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options->staleness_for_commit, "");
    }

    // write log; on_append_log_completed runs when the append finishes
    dassert(mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );
}
// Callback invoked when appending `mu` to the shared log has completed.
//   mu   - the mutation that was appended
//   err  - result of the append
//   size - size reported by the log, only used for logging
//
// A failed append is reported to the stub; a successful one is followed by
// an append to the private log (without a completion callback) when there is
// one and the replica is not in PS_ERROR.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // size_t does not match "%u"; print it through a fixed-width type instead
    dinfo("%s: append shared log completed for mutation %s, size = %" PRIu64 ", err = %s",
        name(), mu->name(), static_cast<uint64_t>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
            name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
// Handles the meta server's reply to a configuration update request.
//   err      - RPC-level result; replaced by the error carried in the
//              response once the response has been unmarshalled
//   request  - the original request message, re-sent on retry
//   response - the reply message
//   req      - the configuration update that was proposed
//
// Any error other than ERR_INVALID_VERSION triggers another round of the
// same RPC. Otherwise the returned configuration is applied locally, unless
// its ballot is older than the local one.
void replica::on_update_configuration_on_meta_server_reply(error_code err, dsn_message_t request, dsn_message_t response, std::shared_ptr<configuration_update_request> req)
{
    check_hashed_access();

    // only handled while the replica is inactive and the stub is connected
    if (PS_INACTIVE != status() || !_stub->is_connected())
    {
        _primary_states.reconfiguration_task = nullptr;
        err.end_tracking();
        return;
    }

    configuration_update_response resp;
    if (err == ERR_OK)
    {
        ::unmarshall(response, resp);
        err = resp.err;
    }

    if (err != ERR_OK)
    {
        ddebug(
            "%s: update configuration reply with err %s, request ballot %lld",
            name(),
            err.to_string(),
            req->config.ballot
            );

        if (err != ERR_INVALID_VERSION)
        {
            rpc_address target(_stub->_failure_detector->get_servers());
            // added for another round of rpc::call
            dsn_msg_add_ref(request);
            _primary_states.reconfiguration_task = rpc::call(
                target,
                request,
                this,
                std::bind(&replica::on_update_configuration_on_meta_server_reply, this,
                    std::placeholders::_1,
                    std::placeholders::_2,
                    std::placeholders::_3,
                    req),
                gpid_to_hash(get_gpid())
                );
            return;
        }
    }

    ddebug(
        "%s: update configuration reply with err %s, ballot %lld, local %lld",
        name(),
        resp.err.to_string(),
        resp.config.ballot,
        get_ballot()
        );

    // the returned configuration is older than the local one
    if (resp.config.ballot < get_ballot())
    {
        _primary_states.reconfiguration_task = nullptr;
        return;
    }

    // post-update work items?
    if (resp.err == ERR_OK)
    {
        dassert (req->config.gpid == resp.config.gpid, "");
        dassert (req->config.app_type == resp.config.app_type, "");
        dassert (req->config.primary == resp.config.primary, "");
        dassert (req->config.secondaries == resp.config.secondaries, "");

        switch (req->type)
        {
        case CT_UPGRADE_TO_PRIMARY:
            _primary_states.last_prepare_decree_on_new_primary = _prepare_list->max_decree();
            break;

        case CT_ASSIGN_PRIMARY:
        case CT_DOWNGRADE_TO_SECONDARY:
        case CT_DOWNGRADE_TO_INACTIVE:
        case CT_UPGRADE_TO_SECONDARY:
            break;

        case CT_REMOVE:
            // tell the removed node, unless that node is the primary address
            if (req->node != primary_address())
            {
                replica_configuration removed_config;
                replica_helper::get_replica_config(resp.config, req->node, removed_config);
                rpc::call_one_way_typed(req->node, RPC_REMOVE_REPLICA, removed_config, gpid_to_hash(get_gpid()));
            }
            break;

        default:
            dassert (false, "");
        }
    }

    update_configuration(resp.config);
    _primary_states.reconfiguration_task = nullptr;
}
// Handles a configuration proposal and dispatches on its type.
// Proposals with an outdated ballot are dropped, as are proposals that
// arrive while a reconfiguration is in flight. A proposal with a newer
// ballot updates the local configuration first.
void replica::on_config_proposal(configuration_update_request& proposal)
{
    check_hashed_access();

    ddebug(
        "%s: on_config_proposal %s for %s",
        name(),
        enum_to_string(proposal.type),
        proposal.node.to_string()
        );

    // outdated proposal
    if (proposal.config.ballot < get_ballot())
    {
        dwarn(
            "%s: on_config_proposal is out-dated, %lld vs %lld",
            name(),
            proposal.config.ballot,
            get_ballot()
            );
        return;
    }

    // another reconfiguration is still in progress
    if (_primary_states.reconfiguration_task != nullptr)
    {
        dinfo(
            "%s: reconfiguration on the way, skip the incoming proposal",
            name()
            );
        return;
    }

    // catch up with a newer configuration; bail out if the replica is
    // closing or the update failed
    if (proposal.config.ballot > get_ballot() && !update_configuration(proposal.config))
    {
        return;
    }

    switch (proposal.type)
    {
    case CT_ASSIGN_PRIMARY:
    case CT_UPGRADE_TO_PRIMARY:
        assign_primary(proposal);
        break;

    case CT_ADD_SECONDARY:
        add_potential_secondary(proposal);
        break;

    case CT_DOWNGRADE_TO_SECONDARY:
        downgrade_to_secondary_on_primary(proposal);
        break;

    case CT_DOWNGRADE_TO_INACTIVE:
        downgrade_to_inactive_on_primary(proposal);
        break;

    case CT_REMOVE:
        remove(proposal);
        break;

    default:
        dassert (false, "");
    }
}
// Handles a prepare message from the primary.
// Reads the sender's configuration and the mutation from `request`, brings
// the local configuration up to date when needed, validates the replica
// state, and then either acks immediately or adds the mutation to the
// prepare list and appends it to the shared log. When
// `prepare_ack_on_secondary_before_logging_allowed` is set, the ack is sent
// before the log append is issued.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    {
        msg_binary_reader reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    decree mu_decree = mu->data.header.decree;

    ddebug( "%s: mutation %s on_prepare", name(), mu->name());

    dassert (mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        ddebug( "%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // update configuration when necessary
    if (rconfig.ballot > get_ballot())
    {
        if (!update_local_configuration(rconfig))
        {
            ddebug(
                "%s: mutation %s on_prepare to %s failed as update local configuration failed",
                name(), mu->name(),
                enum_to_string(status())
                );
            ack_prepare_message(ERR_INVALID_STATE, mu);
            return;
        }
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        ddebug(
            "%s: mutation %s on_prepare to %s skipped",
            name(), mu->name(),
            enum_to_string(status())
            );
        // a transient inactive state is reported with its own error code
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
            );
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        const bool accepts_prepare =
            _potential_secondary_states.learning_status == LearningWithPrepare
            || _potential_secondary_states.learning_status == LearningSucceeded;
        if (!accepts_prepare)
        {
            ddebug(
                "%s: mutation %s on_prepare to %s skipped, learnings state = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );
            // do not retry here, as there may be retries later
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed locally: ack right away
    if (mu_decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto existing = _prepare_list->get_mutation_by_decree(mu_decree);
    if (existing != nullptr && existing->data.header.ballot == mu->data.header.ballot)
    {
        ddebug( "%s: mutation %s redundant prepare skipped", name(), mu->name());

        if (existing->is_logged() || _options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code prepare_err = _prepare_list->prepare(mu, status());
    dassert (prepare_err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_start_prepare_for_potential_secondary, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    }

    // ack without logging
    if (_options.prepare_ack_on_secondary_before_logging_allowed)
    {
        ack_prepare_message(prepare_err, mu);
    }

    // write log; on_append_log_completed runs when the append finishes
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );
    dassert(mu->log_task() != nullptr, "");
}
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, int err, message_ptr& request, message_ptr& reply) { check_hashed_access(); mutation_ptr& mu = pr.first; partition_status targetStatus = pr.second; // skip callback for old mutations if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status()) return; dassert (mu->data.header.ballot == get_ballot(), ""); end_point node = request->header().to_address; partition_status st = _primary_states.GetNodeStatus(node); // handle reply prepare_ack resp; // handle error if (err) { resp.err = err; } else { unmarshall(reply, resp); ddebug( "%s: mutation %s on_prepare_reply from %s:%d", name(), mu->name(), node.name.c_str(), static_cast<int>(node.port) ); } if (resp.err == ERR_SUCCESS) { dassert (resp.ballot == get_ballot(), ""); dassert (resp.decree == mu->data.header.decree, ""); switch (targetStatus) { case PS_SECONDARY: dassert (_primary_states.check_exist(node, PS_SECONDARY), ""); dassert (mu->left_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; case PS_POTENTIAL_SECONDARY: dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; default: ddebug( "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name() ); break; } } // failure handling else { // note targetStatus and (curent) status may diff if (targetStatus == PS_POTENTIAL_SECONDARY) { dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } } handle_remote_failure(st, node, resp.err); } }
// Timer callback: first gives checkpointing a chance to start, then runs
// garbage collection.
void replica::on_checkpoint_timer()
{
    check_hashed_access();

    // init_checkpoint decides on its own whether a checkpoint is needed
    init_checkpoint();

    garbage_collection();
}
// Called when a checkpoint attempt on a secondary has finished with result `err`.
// After a successful checkpoint, brings the app up to the prepare list's
// last committed decree, either directly from the prepare list or by
// scheduling a catch-up from the private logs.
void replica::on_checkpoint_completed(error_code err)
{
    check_hashed_access();

    // closing or wrong timing or no need to operate
    const bool nothing_to_do =
        PS_SECONDARY != status()
        || err == ERR_WRONG_TIMING
        || err == ERR_NO_NEED_OPERATE;
    if (nothing_to_do)
    {
        _secondary_states.checkpoint_is_running = false;
        return;
    }

    // checkpoint failed: mark it done and report the local failure
    if (err != ERR_OK)
    {
        _secondary_states.checkpoint_is_running = false;
        handle_local_failure(err);
        return;
    }

    auto committed = _prepare_list->last_committed_decree();

    // no missing commits: everything is ok now, done checkpointing
    if (!(committed > _app->last_committed_decree()))
    {
        _secondary_states.checkpoint_is_running = false;
        return;
    }

    // missed commits are not covered by the prepare list, so they have to
    // be loaded via the private logs; the running flag is left untouched here
    if (!(_app->last_committed_decree() > _prepare_list->min_decree()))
    {
        tasking::enqueue(
            &_secondary_states.catchup_with_private_log_task,
            LPC_CATCHUP_WITH_PRIVATE_LOGS,
            this,
            [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
            gpid_to_hash(get_gpid())
            );
        return;
    }

    // missed commits are covered by the prepare list: replay them in order
    // and stop at the first write failure
    for (auto next = _app->last_committed_decree() + 1; next <= committed; next++)
    {
        auto pending = _prepare_list->get_mutation_by_decree(next);
        dassert(nullptr != pending, "");

        err = _app->write_internal(pending);
        if (ERR_OK != err)
        {
            _secondary_states.checkpoint_is_running = false;
            handle_local_failure(err);
            return;
        }
    }

    // everything is ok now, done checkpointing
    _secondary_states.checkpoint_is_running = false;
}