void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    // must run on the replica's own hashed thread
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %s",
        name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // ignore mutations from an older ballot, or when this replica is inactive
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    const partition_status st = status();
    if (PS_PRIMARY == st)
    {
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
    }
    else if (PS_SECONDARY == st || PS_POTENTIAL_SECONDARY == st)
    {
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        // when the option allows acking before logging, the ack has already
        // been sent elsewhere, so only ack here when that option is off
        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
    }
    else if (PS_ERROR == st)
    {
        // replica already failed locally; nothing further to do
    }
    else
    {
        dassert (false, "");
    }
}
void replica::on_append_log_completed(mutation_ptr& mu, uint32_t err, uint32_t size)
{
    // must run on the replica's own hashed thread
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %u",
        name(), mu->name(), err);

    if (err == ERR_SUCCESS)
    {
        mu->set_logged();
    }

    // only act when the mutation is still current and the replica is active
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (ERR_SUCCESS == err)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;

        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (ERR_SUCCESS != err)
            {
                handle_local_failure(err);
            }
            // ack regardless of success/failure so the primary can make progress
            ack_prepare_message(err, mu);
            break;

        case PS_ERROR:
            // replica already failed locally; nothing further to do
            break;

        default:
            dassert (false, "");
            break;
        }
    }
}
// Start the two-phase-commit pipeline for a mutation on the primary:
// assign its ballot/decree, prepare it locally, send prepare messages to
// secondaries and qualified learners, and append it to the shared log.
// On any error, all client requests attached to the mutation are answered
// with that error.
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    // answer every client request carried by this mutation with the error
    // (replaces the original goto-based error path)
    auto reply_error = [this, &mu](error_code e)
    {
        for (auto& r : mu->client_requests)
        {
            response_client_message(r, e);
        }
    };

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree == invalid_decree)
    {
        // fresh mutation: assign the next decree after the largest prepared one
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }
    else
    {
        // replayed/known mutation: keep its decree, refresh the ballot
        mu->set_id(get_ballot(), mu->data.header.decree);
    }

    dinfo("%s: mutation %s init_prepare, mutation_tid=%" PRIu64,
        name(), mu->name(), mu->tid());

    // bounded staleness: refuse new writes running too far ahead of commits
    if (mu->data.header.decree > last_committed_decree() + _options->staleness_for_commit)
    {
        reply_error(ERR_CAPACITY_EXCEEDED);
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    error_code err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        reply_error(err);
        return;
    }

    // remote prepare: all secondaries
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto it = _primary_states.membership.secondaries.begin();
        it != _primary_states.membership.secondaries.end(); ++it)
    {
        send_prepare_message(*it, PS_SECONDARY, mu,
            _options->prepare_timeout_ms_for_secondaries);
    }

    // remote prepare: learners that have entered their prepare phase.
    // use unsigned int (not uint8_t as before) so the counter cannot wrap
    // when there are 256 or more learners — it must match the width used
    // for the secondary ack count above.
    unsigned int count = 0;
    for (auto it = _primary_states.learners.begin();
        it != _primary_states.learners.end(); ++it)
    {
        if (it->second.prepare_start_decree != invalid_decree
            && mu->data.header.decree >= it->second.prepare_start_decree)
        {
            send_prepare_message(it->first, PS_POTENTIAL_SECONDARY, mu,
                _options->prepare_timeout_ms_for_potential_secondaries,
                it->second.signature);
            count++;
        }
    }
    mu->set_left_potential_secondary_ack_count(count);

    if (mu->is_logged())
    {
        do_possible_commit_on_primary(mu);
    }
    else
    {
        // append to the shared replication log; further commit progress is
        // driven from on_append_log_completed
        dassert(mu->data.header.log_offset == invalid_offset, "");
        dassert(mu->log_task() == nullptr, "");
        mu->log_task() = _stub->_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            std::bind(&replica::on_append_log_completed, this, mu,
                std::placeholders::_1, std::placeholders::_2),
            gpid_to_hash(get_gpid())
            );
        dassert(nullptr != mu->log_task(), "");
    }
}
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, error_code err, dsn_message_t request, dsn_message_t reply) { check_hashed_access(); mutation_ptr mu = pr.first; partition_status targetStatus = pr.second; // skip callback for old mutations if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status()) return; dassert (mu->data.header.ballot == get_ballot(), ""); ::dsn::rpc_address node = dsn_msg_to_address(request); partition_status st = _primary_states.get_node_status(node); // handle reply prepare_ack resp; // handle error if (err != ERR_OK) { resp.err = err; } else { ::unmarshall(reply, resp); } ddebug( "%s: mutation %s on_prepare_reply from %s, err = %s", name(), mu->name(), node.to_string(), resp.err.to_string() ); if (resp.err == ERR_OK) { dassert (resp.ballot == get_ballot(), ""); dassert (resp.decree == mu->data.header.decree, ""); switch (targetStatus) { case PS_SECONDARY: dassert (_primary_states.check_exist(node, PS_SECONDARY), ""); dassert (mu->left_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; case PS_POTENTIAL_SECONDARY: dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; default: dwarn( "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name() ); break; } } // failure handling else { // retry for INACTIVE state when there are still time if (resp.err == ERR_INACTIVE_STATE && !mu->is_prepare_close_to_timeout(2, targetStatus == PS_SECONDARY ? _options->prepare_timeout_ms_for_secondaries : _options->prepare_timeout_ms_for_potential_secondaries) ) { send_prepare_message(node, targetStatus, mu, targetStatus == PS_SECONDARY ? 
_options->prepare_timeout_ms_for_secondaries : _options->prepare_timeout_ms_for_potential_secondaries); return; } // make sure this is before any later commit ops // because now commit ops may lead to new prepare ops // due to replication throttling handle_remote_failure(st, node, resp.err); // note targetStatus and (curent) status may diff if (targetStatus == PS_POTENTIAL_SECONDARY) { dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } } } }
// Completion callback of the shared-log append for a mutation: drives commit
// (primary) or prepare-ack (secondary/learner), propagates log failures to
// all replicas on this stub, and then appends to the local private log.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    // must run on the replica's own hashed thread
    check_hashed_access();

    // FIX: "%zu" (was "%u") — size is a size_t; passing it through varargs
    // for "%u" reads the wrong width on LP64 platforms (undefined behavior)
    dinfo("%s: append shared log completed for mutation %s, size = %zu, err = %s",
        name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
            name(), mu->name(), err.to_string());
    }

    // act only when the mutation is still current and the replica is active
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack, success or failure, so the primary can make progress
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                //
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                // FIX: "%zu" (was "%u") for the size_t argument, as above
                dinfo("%s: append private log completed for mutation %s, size = %zu, err = %s",
                    name(), mu->name(), size, err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                        name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
// Completion callback of the shared-log append for a mutation: drives commit
// (primary) or prepare-ack (secondary/learner), propagates log failures to
// all replicas on this stub, and then appends to the local private log
// (fire-and-forget: no callback is registered on the private-log append).
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    // must run on the replica's own hashed thread
    check_hashed_access();

    // FIX: "%zu" (was "%u") — size is a size_t; passing it through varargs
    // for "%u" reads the wrong width on LP64 platforms (undefined behavior)
    dinfo("%s: append shared log completed for mutation %s, size = %zu, err = %s",
        name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
            name(), mu->name(), err.to_string());
    }

    // act only when the mutation is still current and the replica is active
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack, success or failure, so the primary can make progress
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    // must run on the replica's own hashed thread
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %s",
        name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // old mutations (and inactive replicas) are ignored entirely,
    // including the failure propagation and private-log write below
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    const partition_status st = status();
    if (PS_PRIMARY == st)
    {
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
    }
    else if (PS_SECONDARY == st || PS_POTENTIAL_SECONDARY == st)
    {
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }
        ack_prepare_message(err, mu);
    }
    else if (PS_ERROR == st)
    {
        // replica already failed locally; nothing further to do
    }
    else
    {
        dassert (false, "");
    }

    // mutation log failure, propagted to all replicas
    if (err != ERR_OK)
    {
        _stub->handle_log_failure(err);
    }
    // write local private log if necessary
    else if (_private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            [this](error_code lerr, size_t lsize)
            {
                // a failed private-log write fails this replica locally
                if (lerr != ERR_OK)
                {
                    handle_local_failure(lerr);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, int err, message_ptr& request, message_ptr& reply) { check_hashed_access(); mutation_ptr& mu = pr.first; partition_status targetStatus = pr.second; // skip callback for old mutations if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status()) return; dassert (mu->data.header.ballot == get_ballot(), ""); end_point node = request->header().to_address; partition_status st = _primary_states.GetNodeStatus(node); // handle reply prepare_ack resp; // handle error if (err) { resp.err = err; } else { unmarshall(reply, resp); ddebug( "%s: mutation %s on_prepare_reply from %s:%d", name(), mu->name(), node.name.c_str(), static_cast<int>(node.port) ); } if (resp.err == ERR_SUCCESS) { dassert (resp.ballot == get_ballot(), ""); dassert (resp.decree == mu->data.header.decree, ""); switch (targetStatus) { case PS_SECONDARY: dassert (_primary_states.check_exist(node, PS_SECONDARY), ""); dassert (mu->left_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; case PS_POTENTIAL_SECONDARY: dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } break; default: ddebug( "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name() ); break; } } // failure handling else { // note targetStatus and (curent) status may diff if (targetStatus == PS_POTENTIAL_SECONDARY) { dassert (mu->left_potential_secondary_ack_count() > 0, ""); if (0 == mu->decrease_left_potential_secondary_ack_count()) { do_possible_commit_on_primary(mu); } } handle_remote_failure(st, node, resp.err); } }