// Completion callback for appending mutation `mu` to the replication log.
//
// mu   - the mutation whose log append has finished
// err  - result of the append; ERR_OK marks the mutation as logged
// size - not used by this function
//
// Mutations from an older ballot, or arriving while the replica is inactive,
// are ignored after the "logged" flag has been updated.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %s",
        name(), mu->name(), err.to_string());

    const bool append_ok = (err == ERR_OK);
    if (append_ok)
    {
        mu->set_logged();
    }

    // stale ballot: nothing more to do for this mutation
    if (mu->data.header.ballot < get_ballot())
    {
        return;
    }

    // inactive replicas do not take part in the two-phase commit
    if (status() == PS_INACTIVE)
    {
        return;
    }

    const auto role = status();
    if (role == PS_PRIMARY)
    {
        if (append_ok)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
    }
    else if (role == PS_SECONDARY || role == PS_POTENTIAL_SECONDARY)
    {
        if (!append_ok)
        {
            handle_local_failure(err);
        }

        // the ack is sent here only when acking before logging is not allowed
        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
    }
    else if (role != PS_ERROR)
    {
        // any other status is unexpected at this point
        dassert (false, "");
    }
}
// Completion callback for a checkpoint started on a secondary.
//
// err - result of the checkpoint operation
//
// On success, any decrees committed in the prepare list but not yet applied
// to the app are replayed: directly from the prepare list when it still
// covers them, otherwise through a task that catches up from private logs.
// `_secondary_states.checkpoint_task` is cleared whenever checkpointing is
// finished (successfully or not).
void replica::on_checkpoint_completed(error_code err)
{
    check_hashed_access();

    // closing or wrong timing
    if (PS_SECONDARY != status() || ERR_WRONG_TIMING == err)
    {
        _secondary_states.checkpoint_task = nullptr;
        return;
    }

    // handle failure
    if (err != ERR_OK)
    {
        // done checkpointing
        _secondary_states.checkpoint_task = nullptr;
        handle_local_failure(err);
        return;
    }

    auto c = _prepare_list->last_committed_decree();

    // missing commits
    if (c > _app->last_committed_decree())
    {
        // missed ones are covered by prepare list
        if (_app->last_committed_decree() > _prepare_list->min_decree())
        {
            for (auto d = _app->last_committed_decree() + 1; d <= c; d++)
            {
                auto mu = _prepare_list->get_mutation_by_decree(d);
                dassert(nullptr != mu, "");

                // Do not silently drop a failed replay: stop, mark the
                // checkpoint as finished and report the local failure.
                err = _app->write_internal(mu);
                if (ERR_OK != err)
                {
                    _secondary_states.checkpoint_task = nullptr;
                    handle_local_failure(err);
                    return;
                }
            }

            // everything is ok now, done checkpointing
            _secondary_states.checkpoint_task = nullptr;
        }

        // missed ones need to be loaded via private logs
        else
        {
            _secondary_states.checkpoint_task = tasking::enqueue(
                LPC_CHECKPOINT_REPLICA,
                this,
                [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
                gpid_to_hash(get_gpid())
                );
        }
    }

    // no missing commits
    else
    {
        // everything is ok now, done checkpointing
        _secondary_states.checkpoint_task = nullptr;
    }
}
// Completion callback for appending mutation `mu` to the replication log.
//
// mu   - the mutation whose log append has finished
// err  - numeric result of the append; ERR_SUCCESS marks the mutation as logged
// size - not used by this function
//
// Mutations from an older ballot, or arriving while the replica is inactive,
// are ignored after the "logged" flag has been updated.
void replica::on_append_log_completed(mutation_ptr& mu, uint32_t err, uint32_t size)
{
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %u",
        name(), mu->name(), err);

    const bool append_ok = (err == ERR_SUCCESS);
    if (append_ok)
    {
        mu->set_logged();
    }

    // stale ballot: nothing more to do for this mutation
    if (mu->data.header.ballot < get_ballot())
    {
        return;
    }

    // inactive replicas do not take part in the two-phase commit
    if (status() == PS_INACTIVE)
    {
        return;
    }

    const auto role = status();
    if (role == PS_PRIMARY)
    {
        if (append_ok)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
    }
    else if (role == PS_SECONDARY || role == PS_POTENTIAL_SECONDARY)
    {
        if (!append_ok)
        {
            handle_local_failure(err);
        }

        // the prepare is acknowledged with whatever result the append produced
        ack_prepare_message(err, mu);
    }
    else if (role != PS_ERROR)
    {
        // any other status is unexpected at this point
        dassert (false, "");
    }
}
// Applies mutation `mu` to the local app, depending on the replica status.
//
// - inactive: applied only when it is exactly the next decree.
// - primary / secondary: must be the next decree (asserted); only a primary
//   with a client request whose source ip is non-zero asks the app to ack
//   the client.
// - potential secondary: applied only after learning has succeeded and when
//   it is the next decree; otherwise it is dropped.
// - error: nothing is applied.
//
// A non-success result from the app is reported via handle_local_failure().
void replica::execute_mutation(mutation_ptr& mu)
{
    dassert (nullptr != _app, "");

    int err = ERR_SUCCESS;

    switch (status())
    {
    case PS_INACTIVE:
        {
            if (_app->last_committed_decree() + 1 == mu->data.header.decree)
            {
                err = _app->write_internal(mu, false);
            }
        }
        break;

    case PS_PRIMARY:
    case PS_SECONDARY:
        {
            dassert (_app->last_committed_decree() + 1 == mu->data.header.decree, "");

            // reply to the client only from the primary, and only when there
            // is a request with a non-zero source ip to reply to
            const bool ack_client =
                (status() == PS_PRIMARY)
                && !(mu->client_request == nullptr)
                && !(mu->client_request->header().from_address.ip == 0);

            err = _app->write_internal(mu, ack_client);
        }
        break;

    case PS_POTENTIAL_SECONDARY:
        {
            if (LearningSucceeded != _potential_secondary_states.learning_status)
            {
                // drop mutations as learning will catch up
                ddebug("%s: mutation %s skipped coz learing buffer overflow", name(), mu->name());
            }
            else if (mu->data.header.decree == _app->last_committed_decree() + 1)
            {
                err = _app->write_internal(mu, false);
            }
            else
            {
                // anything that is not the next decree must already be applied
                dassert (mu->data.header.decree <= _app->last_committed_decree(), "");
            }
        }
        break;

    case PS_ERROR:
        break;
    }

    ddebug("TwoPhaseCommit, %s: mutation %s committed, err = %x", name(), mu->name(), err);

    if (err != ERR_SUCCESS)
    {
        handle_local_failure(err);
    }
}
// Completion callback for appending mutation `mu` to the shared log.
//
// mu   - the mutation whose shared-log append has finished
// err  - result of the append; ERR_OK marks the mutation as logged
// size - size reported by the log for this append (only logged here)
//
// For mutations of the current ballot on a non-inactive replica, the primary
// tries to commit (or reports local failure), while secondaries and potential
// secondaries always ack the prepare. Any append failure is also forwarded to
// the stub. On success the mutation is additionally appended to the private
// log when one exists and the replica is not in error state.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // `size` is a size_t; convert explicitly so the argument type matches "%u"
    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), static_cast<unsigned int>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                //
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                // same explicit conversion as above so "%u" gets an unsigned int
                dinfo("%s: append private log completed for mutation %s, size = %u, err = %s",
                      name(), mu->name(), static_cast<unsigned int>(size), err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                           name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
// Applies mutation `mu` to the local app according to the replica status,
// records the commit latency, reports app failures, and (on the primary)
// starts the next queued write if the write queue hands one out.
//
// - inactive: applied only when it is exactly the next decree, else skipped.
// - primary: must be the next decree (asserted).
// - secondary: applied unless a checkpoint is running; in that case it is
//   skipped and a private log is required to exist (asserted).
// - potential secondary: applied only in the LearningSucceeded or
//   LearningWithPrepareTransient learner states, else skipped.
// - error: nothing is applied.
void replica::execute_mutation(mutation_ptr& mu)
{
    dinfo("%s: execute mutation %s: request_count = %u",
          name(),
          mu->name(),
          static_cast<int>(mu->client_requests.size()));

    error_code err = ERR_OK;
    decree d = mu->data.header.decree;

    switch (status())
    {
    case partition_status::PS_INACTIVE:
        {
            if (_app->last_committed_decree() + 1 != d)
            {
                ddebug("%s: mutation %s commit to %s skipped, app.last_committed_decree = %" PRId64,
                       name(),
                       mu->name(),
                       enum_to_string(status()),
                       _app->last_committed_decree());
            }
            else
            {
                err = _app->write_internal(mu);
            }
        }
        break;

    case partition_status::PS_PRIMARY:
        {
            check_state_completeness();
            dassert(_app->last_committed_decree() + 1 == d, "");
            err = _app->write_internal(mu);
        }
        break;

    case partition_status::PS_SECONDARY:
        {
            if (_secondary_states.checkpoint_is_running)
            {
                ddebug("%s: mutation %s commit to %s skipped, app.last_committed_decree = %" PRId64,
                       name(),
                       mu->name(),
                       enum_to_string(status()),
                       _app->last_committed_decree());

                // the private log must hold the state; catch-up happens
                // later, once the checkpoint task has finished
                dassert(_private_log != nullptr, "");
            }
            else
            {
                check_state_completeness();
                dassert (_app->last_committed_decree() + 1 == d, "");
                err = _app->write_internal(mu);
            }
        }
        break;

    case partition_status::PS_POTENTIAL_SECONDARY:
        {
            const bool can_apply =
                _potential_secondary_states.learning_status == learner_status::LearningSucceeded
                || _potential_secondary_states.learning_status == learner_status::LearningWithPrepareTransient;

            if (can_apply)
            {
                dassert(_app->last_committed_decree() + 1 == d, "");
                err = _app->write_internal(mu);
            }
            else
            {
                // prepare also happens with learner_status::LearningWithPrepare;
                // in that case the private log keeps the state and catch-up
                // is done later, after the checkpoint task is finished
                ddebug("%s: mutation %s commit to %s skipped, app.last_committed_decree = %" PRId64,
                       name(),
                       mu->name(),
                       enum_to_string(status()),
                       _app->last_committed_decree());
            }
        }
        break;

    case partition_status::PS_ERROR:
        break;
    }

    ddebug("TwoPhaseCommit, %s: mutation %s committed, err = %s", name(), mu->name(), err.to_string());

    _counter_commit_latency.set(dsn_now_ns() - mu->create_ts_ns());

    if (err != ERR_OK)
    {
        handle_local_failure(err);
    }

    if (status() == partition_status::PS_PRIMARY)
    {
        // ask the write queue whether another mutation may be prepared now
        mutation_ptr next = _primary_states.write_queue.check_possible_work(
            static_cast<int>(_prepare_list->max_decree() - d));

        if (next)
        {
            init_prepare(next);
        }
    }
}
// Completion callback for appending mutation `mu` to the shared log.
//
// mu   - the mutation whose shared-log append has finished
// err  - result of the append; ERR_OK marks the mutation as logged
// size - size reported by the log for this append (only logged here)
//
// For mutations of the current ballot on a non-inactive replica, the primary
// tries to commit (or reports local failure), while secondaries and potential
// secondaries always ack the prepare. Any append failure is also forwarded to
// the stub. On success the mutation is additionally appended to the private
// log (without a completion callback) when one exists and the replica is not
// in error state.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // `size` is a size_t; convert explicitly so the argument type matches "%u"
    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), static_cast<unsigned int>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
// Completion callback for a checkpoint started on a secondary.
//
// err - result of the checkpoint operation
//
// On success, decrees committed in the prepare list but not yet applied to
// the app are replayed: directly from the prepare list when it still covers
// them, otherwise through a task that catches up from private logs (in that
// case `checkpoint_is_running` is left untouched by this function).
// In every other path `checkpoint_is_running` is reset to false.
void replica::on_checkpoint_completed(error_code err)
{
    check_hashed_access();

    // not a secondary any more, wrong timing, or nothing had to be done
    if (PS_SECONDARY != status()
        || err == ERR_WRONG_TIMING
        || err == ERR_NO_NEED_OPERATE)
    {
        _secondary_states.checkpoint_is_running = false;
        return;
    }

    // the checkpoint itself failed
    if (err != ERR_OK)
    {
        _secondary_states.checkpoint_is_running = false;
        handle_local_failure(err);
        return;
    }

    auto c = _prepare_list->last_committed_decree();

    // the app is already up to date with the prepare list: done
    if (c <= _app->last_committed_decree())
    {
        _secondary_states.checkpoint_is_running = false;
        return;
    }

    // the prepare list no longer covers the gap: load it from private logs
    if (_app->last_committed_decree() <= _prepare_list->min_decree())
    {
        tasking::enqueue(
            &_secondary_states.catchup_with_private_log_task,
            LPC_CATCHUP_WITH_PRIVATE_LOGS,
            this,
            [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
            gpid_to_hash(get_gpid())
            );
        return;
    }

    // replay the missed decrees straight from the prepare list
    auto next_decree = _app->last_committed_decree() + 1;
    while (next_decree <= c)
    {
        auto mu = _prepare_list->get_mutation_by_decree(next_decree);
        dassert(nullptr != mu, "");

        err = _app->write_internal(mu);
        if (ERR_OK != err)
        {
            _secondary_states.checkpoint_is_running = false;
            handle_local_failure(err);
            return;
        }

        next_decree++;
    }

    // everything has been applied, checkpointing is finished
    _secondary_states.checkpoint_is_running = false;
}
// Completion callback for appending mutation `mu` to the replication log.
//
// mu   - the mutation whose log append has finished
// err  - result of the append; ERR_OK marks the mutation as logged
// size - not used by this function
//
// Mutations from an older ballot, or arriving while the replica is inactive,
// are ignored after the "logged" flag has been updated. Otherwise the result
// is dispatched by status, then a failure is forwarded to the stub, or, on
// success, the mutation is appended to the private log when one exists and
// the replica is not in error state.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug(
        "%s: mutation %s on_append_log_completed, err = %s",
        name(), mu->name(), err.to_string());

    const bool append_ok = (err == ERR_OK);
    if (append_ok)
    {
        mu->set_logged();
    }

    // stale ballot: nothing more to do for this mutation
    if (mu->data.header.ballot < get_ballot())
    {
        return;
    }

    // inactive replicas do not take part in the two-phase commit
    if (status() == PS_INACTIVE)
    {
        return;
    }

    const auto role = status();
    if (role == PS_PRIMARY)
    {
        if (append_ok)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
    }
    else if (role == PS_SECONDARY || role == PS_POTENTIAL_SECONDARY)
    {
        if (!append_ok)
        {
            handle_local_failure(err);
        }
        ack_prepare_message(err, mu);
    }
    else if (role != PS_ERROR)
    {
        // any other status is unexpected at this point
        dassert (false, "");
    }

    if (!append_ok)
    {
        // mutation log failure, propagated to all replicas
        _stub->handle_log_failure(err);
        return;
    }

    // write local private log if necessary
    if (_private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            [this](error_code private_err, size_t)
            {
                if (private_err != ERR_OK)
                {
                    handle_local_failure(private_err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
// Starts the prepare phase for mutation `mu` on the primary.
//
// Steps, in order:
//   1. reject when there are too few members for two-phase commit;
//   2. assign ballot/decree to the mutation (a new decree when it has none);
//   3. reject when the prepare list is already at the staleness limit;
//   4. prepare locally, then send prepare messages to every secondary and to
//      each learner whose prepare_start_decree has been reached;
//   5. append the mutation to the replication log.
//
// Every rejection (and a failed log append) is answered to the client via
// response_client_message() with the corresponding error.
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_SUCCESS;
    uint8_t learner_targets = 0;

    // common failure exit: tell the client which error stopped the prepare
    auto reply_failure = [&]()
    {
        response_client_message(mu->client_request, err);
    };

    if (static_cast<int>(_primary_states.membership.secondaries.size()) + 1
        < _options.mutation_2pc_min_replica_count)
    {
        err = ERR_NOT_ENOUGH_MEMBER;
        reply_failure();
        return;
    }

    mu->data.header.last_committed_decree = last_committed_decree();

    // keep an already assigned decree, otherwise take the next free one
    if (mu->data.header.decree != invalid_decree)
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    else
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }

    if (mu->data.header.decree > _prepare_list->max_decree()
        && _prepare_list->count() >= _options.staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        reply_failure();
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare without log
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_SUCCESS)
    {
        reply_failure();
        return;
    }

    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    //
    // TODO: bounded staleness on secondaries
    //
    dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");

    // remote prepare: every secondary gets the mutation
    dassert (mu->remote_tasks().size() == 0, "");
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    // learners only get it once their prepare start decree has been reached
    learner_targets = 0;
    for (auto& learner : _primary_states.learners)
    {
        if (learner.second.prepare_start_decree == invalid_decree)
        {
            continue;
        }
        if (mu->data.header.decree >= learner.second.prepare_start_decree)
        {
            send_prepare_message(learner.first, PS_POTENTIAL_SECONDARY, mu,
                                 _options.prepare_timeout_ms_for_potential_secondaries);
            learner_targets++;
        }
    }
    mu->set_left_potential_secondary_ack_count(learner_targets);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    if (nullptr == mu->log_task())
    {
        err = ERR_FILE_OPERATION_FAILED;
        handle_local_failure(err);
        reply_failure();
        return;
    }
}
// Handles a prepare request received from the primary.
//
// request - message carrying the sender's replica configuration followed by
//           the mutation to prepare
//
// The request is dropped when its ballot is older than the local one, and the
// local configuration is updated first when the sender's ballot is newer.
// Inactive/error replicas answer with ERR_INVALID_STATE; potential
// secondaries outside the LearningWithPrepare / LearningSucceeded states
// drop the request silently. Already committed decrees and already prepared
// duplicates are acked with ERR_SUCCESS. Otherwise the mutation is added to
// the prepare list and appended to the replication log.
void replica::on_prepare(message_ptr& request)
{
    check_hashed_access();

    replica_configuration rconfig;
    unmarshall(request, rconfig);

    mutation_ptr mu = mutation::read_from(request);
    decree d = mu->data.header.decree;

    ddebug("%s: mutation %s on_prepare", name(), mu->name());

    dassert (mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        ddebug("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // update configuration when necessary
    if (rconfig.ballot > get_ballot())
    {
        update_local_configuration(rconfig);
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        ddebug("%s: mutation %s on_prepare to %s skipped",
               name(),
               mu->name(),
               enum_to_string(status()));
        ack_prepare_message(ERR_INVALID_STATE, mu);
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        const bool accepts_prepare =
            _potential_secondary_states.learning_status == LearningWithPrepare
            || _potential_secondary_states.learning_status == LearningSucceeded;

        if (!accepts_prepare)
        {
            ddebug("%s: mutation %s on_prepare to %s skipped, learnings state = %s",
                   name(),
                   mu->name(),
                   enum_to_string(status()),
                   enum_to_string(_potential_secondary_states.learning_status));

            // do not retry as there may retries later
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed locally: just confirm
    if (d <= last_committed_decree())
    {
        ack_prepare_message(ERR_SUCCESS, mu);
        return;
    }

    // real prepare start
    auto existing = _prepare_list->get_mutation_by_decree(d);
    if (existing != nullptr && existing->data.header.ballot == mu->data.header.ballot)
    {
        ddebug("%s: mutation %s redundant prepare skipped", name(), mu->name());

        // ack again only when the earlier copy has finished preparing
        if (existing->is_prepared())
        {
            ack_prepare_message(ERR_SUCCESS, mu);
        }
        return;
    }

    int err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_SUCCESS, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_start_prepare_for_potential_secondary, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    }

    // write log
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    // no log task was returned: report the failure to the primary and locally
    if (nullptr == mu->log_task())
    {
        err = ERR_FILE_OPERATION_FAILED;
        ack_prepare_message(err, mu);
        handle_local_failure(err);
    }
}