void replica::send_prepare_message(const dsn_address_t& addr, partition_status status, mutation_ptr& mu, int timeout_milliseconds) { dsn_message_t msg = dsn_msg_create_request(RPC_PREPARE, timeout_milliseconds, gpid_to_hash(get_gpid())); replica_configuration rconfig; _primary_states.get_replica_config(status, rconfig); { msg_binary_writer writer(msg); marshall(writer, get_gpid()); marshall(writer, rconfig); mu->write_to(writer); } mu->remote_tasks()[addr] = rpc::call(addr, msg, this, std::bind(&replica::on_prepare_reply, this, std::make_pair(mu, rconfig.status), std::placeholders::_1, std::placeholders::_2, std::placeholders::_3), gpid_to_hash(get_gpid()) ); ddebug( "%s: mutation %s send_prepare_message to %s:%hu as %s", name(), mu->name(), addr.name, addr.port, enum_to_string(rconfig.status) ); }
// Sends an RPC_PREPARE request for mutation `mu` to the replica at `addr`.
//
// The replica configuration placed in the request is obtained from
// _primary_states for `status` and `learn_signature`. The request body is
// written as: gpid, replica configuration, mutation. The outstanding rpc task
// is stored in mu->remote_tasks() keyed by `addr`, and the reply is forwarded
// to on_prepare_reply together with (mu, target status).
void replica::send_prepare_message(
    ::dsn::rpc_address addr,
    partition_status status,
    mutation_ptr& mu,
    int timeout_milliseconds,
    int64_t learn_signature)
{
    replica_configuration rconfig;
    _primary_states.get_replica_config(status, rconfig, learn_signature);

    dsn_message_t prepare_request = dsn_msg_create_request(
        RPC_PREPARE,
        timeout_milliseconds,
        gpid_to_hash(get_gpid())
        );

    {
        // the stream goes out of scope before the request is handed to rpc::call
        rpc_write_stream out(prepare_request);
        marshall(out, get_gpid());
        marshall(out, rconfig);
        mu->write_to(out);
    }

    // the callback keeps its own copies of the mutation pointer and the configuration
    mu->remote_tasks()[addr] = rpc::call(
        addr,
        prepare_request,
        this,
        [this, mu, rconfig](error_code err, dsn_message_t request, dsn_message_t reply)
        {
            on_prepare_reply(std::make_pair(mu, rconfig.status), err, request, reply);
        },
        gpid_to_hash(get_gpid())
        );

    ddebug(
        "%s: mutation %s send_prepare_message to %s as %s",
        name(),
        mu->name(),
        addr.to_string(),
        enum_to_string(rconfig.status)
        );
}
void replica::send_prepare_message(const end_point& addr, partition_status status, mutation_ptr& mu, int timeout_milliseconds) { message_ptr msg = message::create_request(RPC_PREPARE, timeout_milliseconds, gpid_to_hash(get_gpid())); marshall(msg, get_gpid()); replica_configuration rconfig; _primary_states.get_replica_config(status, rconfig); marshall(msg, rconfig); mu->write_to(msg); dbg_dassert (mu->remote_tasks().find(addr) == mu->remote_tasks().end()); mu->remote_tasks()[addr] = rpc::call(addr, msg, this, std::bind(&replica::on_prepare_reply, this, std::make_pair(mu, rconfig.status), std::placeholders::_1, std::placeholders::_2, std::placeholders::_3), gpid_to_hash(get_gpid()) ); ddebug( "%s: mutation %s send_prepare_message to %s:%d as %s", name(), mu->name(), addr.name.c_str(), static_cast<int>(addr.port), enum_to_string(rconfig.status) ); }
// Sends a read or write request to the replica server at `addr`.
//
// When request->header_pos is non-zero, the read/write header is first filled
// with _app_id and the partition index, marshalled into the buffer starting at
// header_pos, and the message's hash and timeout options are set; header_pos
// is then reset to 0. The rpc call itself is issued while holding
// request->lock, and its reply is routed to replica_rw_reply.
void replication_app_client_base::call_with_address(dsn::rpc_address addr, request_context_ptr request)
{
    auto& msg = request->request;

    dbg_dassert(!addr.is_invalid(), "");
    dbg_dassert(_app_id > 0, "");

    if (request->header_pos != 0)
    {
        if (!request->is_read)
        {
            auto& header = request->write_header;
            header.gpid.app_id = _app_id;
            header.gpid.pidx = request->partition_index;

            blob buffer(request->header_pos, 0, sizeof(request->write_header));
            binary_writer writer(buffer);
            marshall(writer, header);

            dsn_msg_options_t opts;
            opts.timeout_ms = request->timeout_ms;
            opts.thread_hash = gpid_to_hash(header.gpid);
            opts.vnid = *(uint64_t*)(&request->write_header.gpid);
            // TODO: DSN_MSGM_VNID | DSN_MSGM_CONTEXT are not supported yet
            dsn_msg_set_options(msg, &opts, DSN_MSGM_HASH | DSN_MSGM_TIMEOUT);
        }
        else
        {
            auto& header = request->read_header;
            header.gpid.app_id = _app_id;
            header.gpid.pidx = request->partition_index;

            blob buffer(request->header_pos, 0, sizeof(request->read_header));
            binary_writer writer(buffer);
            marshall(writer, header);

            dsn_msg_options_t opts;
            opts.timeout_ms = request->timeout_ms;
            opts.thread_hash = gpid_to_hash(header.gpid);
            opts.vnid = *(uint64_t*)(&request->read_header.gpid);
            // TODO: DSN_MSGM_VNID is not supported yet
            dsn_msg_set_options(msg, &opts, DSN_MSGM_HASH | DSN_MSGM_TIMEOUT);
        }

        // header has been written; do not write it again on a later call
        request->header_pos = 0;
    }

    {
        zauto_lock guard(request->lock);
        rpc::call(
            addr,
            msg,
            this,
            std::bind(
                &replication_app_client_base::replica_rw_reply,
                this,
                std::placeholders::_1,
                std::placeholders::_2,
                std::placeholders::_3,
                request
                )
            );
    }
}
// Applies the private-log state starting at the decree right after the app's
// last committed decree, then enqueues an LPC_CHECKPOINT_REPLICA_COMPLETED
// task that reports the result to the handler matching `s`:
//   PS_SECONDARY           -> on_checkpoint_completed
//   PS_POTENTIAL_SECONDARY -> on_learn_remote_state_completed
// Any other status triggers an assertion inside the task.
void replica::catch_up_with_private_logs(partition_status s)
{
    learn_state learned;
    _private_log->get_learn_state(
        get_gpid(),
        _app->last_committed_decree() + 1,
        learned
        );

    auto err = apply_learned_state_from_private_log(learned);

    tasking::enqueue(
        LPC_CHECKPOINT_REPLICA_COMPLETED,
        this,
        [this, err, s]()
        {
            switch (s)
            {
            case PS_SECONDARY:
                this->on_checkpoint_completed(err);
                break;
            case PS_POTENTIAL_SECONDARY:
                this->on_learn_remote_state_completed(err);
                break;
            default:
                dassert(false, "invalid state %s", enum_to_string(s));
                break;
            }
        },
        gpid_to_hash(get_gpid())
        );
}
// Allocates and fills a request_context for a write request.
//
// If _app_id is already known (not -1), the write header is marshalled into
// the request right away, header_pos is set to 0xffff, and the client hash is
// set from the gpid. Otherwise a placeholder is reserved in the request writer
// and its position is remembered in header_pos.
//
// The returned context is allocated with `new`. `reply_hash` is not used here.
replication_app_client_base::request_context* replication_app_client_base::create_write_context(
    int partition_index,
    task_code code,
    rpc_response_task_ptr callback,
    int reply_hash
    )
{
    auto ctx = new request_context;
    ctx->callback_task = callback;
    ctx->is_read = false;
    ctx->partition_index = partition_index;
    ctx->timeout_timer = nullptr;

    auto& header = ctx->write_header;
    header.gpid.app_id = _app_id;
    header.gpid.pidx = partition_index;
    header.code = code;

    if (header.gpid.app_id != -1)
    {
        ctx->header_pos = 0xffff;
        marshall(callback->get_request()->writer(), header);
        callback->get_request()->header().client.hash = gpid_to_hash(header.gpid);
    }
    else
    {
        ctx->header_pos = callback->get_request()->writer().write_placeholder();
        dbg_dassert(ctx->header_pos != 0xffff, "");
    }

    return ctx;
}
// Allocates and fills a request_context for a read request.
//
// If _app_id is already known (not -1), the read header is marshalled into the
// request right away, header_pos is set to 0xffff, and the client hash is set
// from the gpid. Otherwise a placeholder is reserved in the request writer and
// its position is remembered in header_pos.
//
// The returned context is allocated with `new`. `reply_hash` is not used here.
replication_app_client_base::request_context* replication_app_client_base::create_read_context(
    int partition_index,
    task_code code,
    rpc_response_task_ptr callback,
    read_semantic_t read_semantic,
    decree snapshot_decree, // only used when ReadSnapshot
    int reply_hash
    )
{
    auto ctx = new request_context;
    ctx->callback_task = callback;
    ctx->is_read = true;
    ctx->partition_index = partition_index;
    ctx->timeout_timer = nullptr;

    auto& header = ctx->read_header;
    header.gpid.app_id = _app_id;
    header.gpid.pidx = partition_index;
    header.code = code;
    header.semantic = read_semantic;
    header.version_decree = snapshot_decree;

    if (header.gpid.app_id != -1)
    {
        ctx->header_pos = 0xffff;
        marshall(callback->get_request()->writer(), header);
        callback->get_request()->header().client.hash = gpid_to_hash(header.gpid);
    }
    else
    {
        ctx->header_pos = callback->get_request()->writer().write_placeholder();
        dbg_dassert(ctx->header_pos != 0xffff, "");
    }

    return ctx;
}
// Completion handler for a checkpoint attempt on a secondary.
//
// Outcomes:
//  - not a secondary any more, or ERR_WRONG_TIMING: just clear checkpoint_task;
//  - any other error: clear checkpoint_task and call handle_local_failure;
//  - success with no commits missed: clear checkpoint_task;
//  - success with commits missed: replay them from the prepare list when the
//    app's last committed decree is above the list's min decree, otherwise
//    schedule catch_up_with_private_logs and keep that task in checkpoint_task.
void replica::on_checkpoint_completed(error_code err)
{
    check_hashed_access();

    // closing or wrong timing
    if (PS_SECONDARY != status() || ERR_WRONG_TIMING == err)
    {
        _secondary_states.checkpoint_task = nullptr;
        return;
    }

    // checkpoint failed: done checkpointing, report the failure
    if (err != ERR_OK)
    {
        _secondary_states.checkpoint_task = nullptr;
        handle_local_failure(err);
        return;
    }

    auto committed_in_list = _prepare_list->last_committed_decree();

    // no missing commits: everything is ok now, done checkpointing
    if (!(committed_in_list > _app->last_committed_decree()))
    {
        _secondary_states.checkpoint_task = nullptr;
        return;
    }

    // missed commits are not covered by the prepare list: load them via private logs
    if (!(_app->last_committed_decree() > _prepare_list->min_decree()))
    {
        _secondary_states.checkpoint_task = tasking::enqueue(
            LPC_CHECKPOINT_REPLICA,
            this,
            [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
            gpid_to_hash(get_gpid())
            );
        return;
    }

    // missed commits are covered by the prepare list: apply them one by one
    for (auto d = _app->last_committed_decree() + 1; d <= committed_in_list; d++)
    {
        auto missed = _prepare_list->get_mutation_by_decree(d);
        dassert(nullptr != missed, "");
        _app->write_internal(missed);
    }

    // everything is ok now, done checkpointing
    _secondary_states.checkpoint_task = nullptr;
}
void replica::on_copy_checkpoint_ack(error_code err, std::shared_ptr<replica_configuration>& req, std::shared_ptr<learn_response>& resp) { check_hashed_access(); if (PS_PRIMARY != status()) { _primary_states.checkpoint_task = nullptr; return; } if (err != ERR_OK || resp == nullptr) { dwarn("%s: copy checkpoint from secondary failed, err = %s", name(), err.to_string()); _primary_states.checkpoint_task = nullptr; return; } if (resp->err != ERR_OK) { dinfo("%s: copy checkpoint from secondary failed, err = %s", name(), resp->err.to_string()); _primary_states.checkpoint_task = nullptr; return; } if (resp->state.to_decree_included <= _app->last_durable_decree()) { dinfo("%s: copy checkpoint from secondary skipped, as its decree is not bigger than current durable_decree: %" PRIu64 " vs %" PRIu64 "", name(), resp->state.to_decree_included, _app->last_durable_decree() ); _primary_states.checkpoint_task = nullptr; return; } std::string ldir = utils::filesystem::path_combine( _app->learn_dir(), "checkpoint.copy" ); if (utils::filesystem::path_exists(ldir)) utils::filesystem::remove_path(ldir); _primary_states.checkpoint_task = file::copy_remote_files( resp->address, resp->base_local_dir, resp->state.files, ldir, false, LPC_REPLICA_COPY_LAST_CHECKPOINT_DONE, this, [this, resp](error_code err, size_t sz) { this->on_copy_checkpoint_file_completed(err, sz, resp); }, gpid_to_hash(get_gpid()) ); }
// run in background thread void replica::background_checkpoint() { auto err = _app->checkpoint(); tasking::enqueue( LPC_CHECKPOINT_REPLICA_COMPLETED, this, [this, err]() { this->on_checkpoint_completed(err); }, gpid_to_hash(get_gpid()) ); }
// meta server => partition server void server_load_balancer::send_proposal(::dsn::rpc_address node, const configuration_update_request& proposal) { dinfo("send proposal %s of %s, current ballot = %" PRId64, enum_to_string(proposal.type), proposal.node.to_string(), proposal.config.ballot ); rpc::call_one_way_typed(node, RPC_CONFIG_PROPOSAL, proposal, gpid_to_hash(proposal.config.gpid)); }
// for testing purpose only void replica::send_group_check_once_for_test(int delay_milliseconds) { dassert (_options.group_check_disabled, ""); _primary_states.group_check_task = tasking::enqueue( LPC_GROUP_CHECK, this, &replica::broadcast_group_check, gpid_to_hash(get_gpid()), delay_milliseconds ); }
// for testing purpose only void replica::send_group_check_once_for_test(int delay_milliseconds) { dassert (_options->group_check_disabled, ""); _primary_states.group_check_task = tasking::enqueue( LPC_GROUP_CHECK, this, [this] {broadcast_group_check();}, gpid_to_hash(get_gpid()), std::chrono::milliseconds(delay_milliseconds) ); }
void replica::broadcast_group_check() { dassert (nullptr != _primary_states.group_check_task, ""); if (_primary_states.group_check_pending_replies.size() > 0) { dwarn( "%s: %u group check replies are still pending when doing next round check", name(), static_cast<int>(_primary_states.group_check_pending_replies.size()) ); for (auto it = _primary_states.group_check_pending_replies.begin(); it != _primary_states.group_check_pending_replies.end(); it++) { it->second->cancel(true); } _primary_states.group_check_pending_replies.clear(); } for (auto it = _primary_states.statuses.begin(); it != _primary_states.statuses.end(); it++) { if (it->first == primary_address()) continue; end_point addr = it->first; std::shared_ptr<group_check_request> request(new group_check_request); request->app_type = _primary_states.membership.app_type; request->node = addr; _primary_states.get_replica_config(addr, request->config); request->last_committed_decree = last_committed_decree(); request->learner_signature = 0; if (it->second == PS_POTENTIAL_SECONDARY) { auto it2 = _primary_states.learners.find(it->first); dassert (it2 != _primary_states.learners.end(), ""); request->learner_signature = it2->second.signature; } task_ptr callback_task = rpc::call_typed( addr, RPC_GROUP_CHECK, request, this, &replica::on_group_check_reply, gpid_to_hash(get_gpid()) ); _primary_states.group_check_pending_replies[addr] = callback_task; ddebug( "%s: init_group_check for %s:%d", name(), addr.name.c_str(), addr.port ); } }
void replica::checkpoint() { auto lerr = _app->flush(true); auto err = lerr == 0 ? ERR_OK : (lerr == ERR_WRONG_TIMING ? ERR_WRONG_TIMING : ERR_LOCAL_APP_FAILURE); tasking::enqueue( LPC_CHECKPOINT_REPLICA_COMPLETED, this, [this, err]() { this->on_checkpoint_completed(err); }, gpid_to_hash(get_gpid()) ); }
// Applies the private-log state starting at the decree right after the app's
// last committed decree, then enqueues an LPC_CHECKPOINT_REPLICA_COMPLETED
// task reporting the result:
//  - for PS_POTENTIAL_SECONDARY, to on_learn_remote_state_completed, using
//    _potential_secondary_states.learn_remote_files_completed_task;
//  - for every other status, to on_checkpoint_completed, using
//    _secondary_states.checkpoint_completed_task.
void replica::catch_up_with_private_logs(partition_status s)
{
    learn_state learned;
    _private_log->get_learn_state(
        get_gpid(),
        _app->last_committed_decree() + 1,
        learned
        );

    auto err = apply_learned_state_from_private_log(learned);

    if (s != PS_POTENTIAL_SECONDARY)
    {
        tasking::enqueue(
            &_secondary_states.checkpoint_completed_task,
            LPC_CHECKPOINT_REPLICA_COMPLETED,
            this,
            [this, err]() { this->on_checkpoint_completed(err); },
            gpid_to_hash(get_gpid())
            );
        return;
    }

    tasking::enqueue(
        &_potential_secondary_states.learn_remote_files_completed_task,
        LPC_CHECKPOINT_REPLICA_COMPLETED,
        this,
        [this, err]() { this->on_learn_remote_state_completed(err); },
        gpid_to_hash(get_gpid())
        );
}
void simple_load_balancer::query_decree(std::shared_ptr<query_replica_decree_request> query) { rpc::call( query->node, RPC_QUERY_PN_DECREE, *query, this, [this, query](error_code err, query_replica_decree_response&& resp) { auto response = std::make_shared<query_replica_decree_response>(std::move(resp)); on_query_decree_ack(err, query, response); } , gpid_to_hash(query->pid), std::chrono::seconds(3)); }
// Starts the group check task on the primary.
//
// Does nothing unless this replica is primary and group check is enabled.
// Asserts that no group check task exists yet, then enqueues
// broadcast_group_check with a delay of 0 and an interval taken from
// _options.group_check_internal_ms, storing the task in group_check_task.
void replica::init_group_check()
{
    const bool is_primary = (PS_PRIMARY == status());
    if (!is_primary || _options.group_check_disabled)
    {
        return;
    }

    dassert (nullptr == _primary_states.group_check_task, "");

    _primary_states.group_check_task = tasking::enqueue(
        LPC_GROUP_CHECK,
        this,
        &replica::broadcast_group_check,
        gpid_to_hash(get_gpid()),
        0,
        _options.group_check_internal_ms
        );
}
void replica::init_checkpoint() { check_hashed_access(); // only applicable to primary and secondary replicas if (status() != PS_PRIMARY && status() != PS_SECONDARY) return; // no need to checkpoint if (_app->is_delta_state_learning_supported()) return; // already running if (_secondary_states.checkpoint_task != nullptr) return; // private log must be enabled to make sure commits // are not lost during checkpinting dassert(nullptr != _private_log, "log_enable_private_prepare must be true for checkpointing"); // TODO: when NOT to checkpoint, but use private log replay to build the state if (last_committed_decree() - last_durable_decree() < 10000) return; // primary is downgraded to secondary for checkpointing as no write can be seen // during checkpointing (i.e., state is freezed) if (PS_PRIMARY == status()) { configuration_update_request proposal; proposal.config = _primary_states.membership; proposal.type = CT_DOWNGRADE_TO_SECONDARY; proposal.node = proposal.config.primary; downgrade_to_secondary_on_primary(proposal); } // secondary can start checkpint in the long running thread pool else { dassert(PS_SECONDARY == status(), ""); _secondary_states.checkpoint_task = tasking::enqueue( LPC_CHECKPOINT_REPLICA, this, &replica::checkpoint, gpid_to_hash(get_gpid()) ); } }
// Builds the mutation queue of one partition.
//
// Registers a number counter named "<app_id>.<partition_index>.2pc#" in the
// "eon.replication" section and resets it to 0, clears the running-op count
// and the pending mutation, and obtains the virtual queue length pointer of
// RPC_PREPARE for this gpid's hash. Asserts that the app id is not 0.
mutation_queue::mutation_queue(gpid gpid, int max_concurrent_op /*= 2*/, bool batch_write_disabled /*= false*/)
    : _max_concurrent_op(max_concurrent_op),
      _batch_write_disabled(batch_write_disabled)
{
    std::stringstream counter_name;
    counter_name << gpid.get_app_id() << "." << gpid.get_partition_index() << "." << "2pc#";

    _current_op_counter.init(
        "eon.replication",
        counter_name.str().c_str(),
        COUNTER_TYPE_NUMBER,
        "current running 2pc#"
        );
    _current_op_counter.set(0);

    _current_op_count = 0;
    _pending_mutation = nullptr;

    dassert(gpid.get_app_id() != 0, "invalid gpid");

    _pcount = dsn_task_queue_virtual_length_ptr(
        RPC_PREPARE,
        gpid_to_hash(gpid)
        );
}
// Asks the meta server to apply a configuration change of kind `type`
// concerning `node`.
//
// `newConfig` gets its last_committed_decree refreshed; the request sent to
// the meta server is a copy of it with the ballot incremented by one. Before
// sending, the replica switches to a transient PS_INACTIVE state without a
// ballot change, and any reconfiguration task still recorded is cancelled.
// The reply goes to on_update_configuration_on_meta_server_reply.
void replica::update_configuration_on_meta_server(config_type type, ::dsn::rpc_address node, partition_configuration& newConfig)
{
    newConfig.last_committed_decree = last_committed_decree();

    const bool becoming_primary =
        (type == CT_ASSIGN_PRIMARY || type == CT_UPGRADE_TO_PRIMARY);
    if (!becoming_primary)
    {
        dassert (status() == PS_PRIMARY, "");
        dassert (newConfig.ballot == _primary_states.membership.ballot, "");
    }

    // disable 2pc during reconfiguration
    // it is possible to do this only for CT_DOWNGRADE_TO_SECONDARY,
    // but we choose to disable 2pc during all reconfiguration types
    // for simplicity at the cost of certain write throughput
    update_local_configuration_with_no_ballot_change(PS_INACTIVE);
    set_inactive_state_transient(true);

    dsn_message_t update_msg = dsn_msg_create_request(RPC_CM_UPDATE_PARTITION_CONFIGURATION, 0, 0);

    std::shared_ptr<configuration_update_request> request(new configuration_update_request);
    request->config = newConfig;
    request->config.ballot++;
    request->type = type;
    request->node = node;
    ::marshall(update_msg, *request);

    // only one reconfiguration may be in flight
    if (nullptr != _primary_states.reconfiguration_task)
    {
        _primary_states.reconfiguration_task->cancel(true);
    }

    rpc_address meta_servers(_stub->_failure_detector->get_servers());
    _primary_states.reconfiguration_task = rpc::call(
        meta_servers,
        update_msg,
        this,
        std::bind(
            &replica::on_update_configuration_on_meta_server_reply,
            this,
            std::placeholders::_1,
            std::placeholders::_2,
            std::placeholders::_3,
            request
            ),
        gpid_to_hash(get_gpid())
        );
}
void replica::init_group_check() { check_hashed_access(); ddebug("%s: init group check", name()); if (partition_status::PS_PRIMARY != status() || _options->group_check_disabled) return; dassert (nullptr == _primary_states.group_check_task, ""); _primary_states.group_check_task = tasking::enqueue_timer( LPC_GROUP_CHECK, this, [this] {broadcast_group_check();}, std::chrono::milliseconds(_options->group_check_interval_ms), gpid_to_hash(get_gpid()) ); }
// Registers proposal.node as a new learner (potential secondary) on the
// primary and notifies that node.
//
// Ignored when this replica is not primary or the node is already a learner.
// The proposal must match the current membership (asserted), and the node
// must not already be the primary or a secondary (asserted). A fresh learner
// state with a random 64-bit signature is recorded, the node's status becomes
// PS_POTENTIAL_SECONDARY, and a one-way RPC_LEARN_ADD_LEARNER carrying the
// signature is sent to it.
void replica::add_potential_secondary(configuration_update_request& proposal)
{
    if (status() != PS_PRIMARY)
    {
        return;
    }

    const auto& membership = _primary_states.membership;
    dassert (proposal.config.ballot == get_ballot(), "");
    dassert (proposal.config.gpid == membership.gpid, "");
    dassert (proposal.config.app_type == membership.app_type, "");
    dassert (proposal.config.primary == membership.primary, "");
    dassert (proposal.config.secondaries == membership.secondaries, "");
    dassert (!_primary_states.check_exist(proposal.node, PS_PRIMARY), "");
    dassert (!_primary_states.check_exist(proposal.node, PS_SECONDARY), "");

    // already learning: nothing to add
    const bool already_learner =
        (_primary_states.learners.find(proposal.node) != _primary_states.learners.end());
    if (already_learner)
    {
        return;
    }

    remote_learner_state learner;
    learner.prepare_start_decree = invalid_decree;
    learner.signature = random64(0, static_cast<uint64_t>(-1LL));
    learner.timeout_task = nullptr; // TODO: add timer for learner task

    _primary_states.learners[proposal.node] = learner;
    _primary_states.statuses[proposal.node] = PS_POTENTIAL_SECONDARY;

    group_check_request notify;
    notify.app_type = _primary_states.membership.app_type;
    notify.node = proposal.node;
    _primary_states.get_replica_config(proposal.node, notify.config);
    notify.last_committed_decree = last_committed_decree();
    notify.learner_signature = learner.signature;

    rpc::call_one_way_typed(
        proposal.node,
        RPC_LEARN_ADD_LEARNER,
        notify,
        gpid_to_hash(get_gpid())
        );
}
// Starts two-phase commit for mutation `mu` on the primary.
//
// Steps:
//  1. stamp the mutation with the current last committed decree and give it an
//     id (current ballot plus either its own decree or max_decree() + 1);
//  2. reject it with ERR_CAPACITY_EXCEEDED if its decree is further than
//     staleness_for_commit ahead of the last committed decree;
//  3. prepare it locally; a failure is reported to all client requests;
//  4. send prepare messages to every secondary and to every learner whose
//     prepare_start_decree is set and not above the mutation's decree;
//  5. if the mutation is already logged, try to commit; otherwise append it to
//     the shared log and continue in on_append_log_completed.
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    // answers every client request attached to the mutation with `reason`
    auto reject_clients = [this, &mu](error_code reason)
    {
        for (auto& r : mu->client_requests)
        {
            response_client_message(r, reason);
        }
    };

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree != invalid_decree)
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    else
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }

    dinfo("%s: mutation %s init_prepare, mutation_tid=%" PRIu64,
        name(), mu->name(), mu->tid());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options->staleness_for_commit)
    {
        reject_clients(ERR_CAPACITY_EXCEEDED);
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    error_code err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        reject_clients(err);
        return;
    }

    // remote prepare: secondaries
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (const auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options->prepare_timeout_ms_for_secondaries);
    }

    // remote prepare: learners that have reached the prepare stage
    uint8_t learner_count = 0;
    for (const auto& learner : _primary_states.learners)
    {
        const bool wants_prepare =
            learner.second.prepare_start_decree != invalid_decree &&
            mu->data.header.decree >= learner.second.prepare_start_decree;
        if (wants_prepare)
        {
            send_prepare_message(
                learner.first,
                PS_POTENTIAL_SECONDARY,
                mu,
                _options->prepare_timeout_ms_for_potential_secondaries,
                learner.second.signature
                );
            learner_count++;
        }
    }
    mu->set_left_potential_secondary_ack_count(learner_count);

    if (mu->is_logged())
    {
        do_possible_commit_on_primary(mu);
        return;
    }

    // not logged yet: append to the shared log
    dassert(mu->data.header.log_offset == invalid_offset, "");
    dassert(mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(
            &replica::on_append_log_completed,
            this,
            mu,
            std::placeholders::_1,
            std::placeholders::_2
            ),
        gpid_to_hash(get_gpid())
        );

    dassert(nullptr != mu->log_task(), "");
}
// Completion handler for appending mutation `mu` to the shared log.
//
// @param mu    the mutation that was appended
// @param err   result of the append
// @param size  size reported by the log for this append
//
// On success the mutation is marked as logged. Unless the mutation belongs to
// an older ballot or the replica is inactive, the primary then tries to
// commit, while secondaries and potential secondaries always ack the prepare
// (with the error, if any). Any append error is also reported to the stub.
// After a successful append the mutation is additionally written to the
// private log, if one exists and the replica is not in PS_ERROR.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // `size` is a size_t, so it must be printed with %zu rather than %u
    dinfo("%s: append shared log completed for mutation %s, size = %zu, err = %s",
        name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
            name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                //
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                dinfo("%s: append private log completed for mutation %s, size = %zu, err = %s",
                    name(), mu->name(), size, err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                        name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
// Handles an incoming RPC_PREPARE on a secondary or potential secondary.
//
// The request is decoded into a replica configuration and a mutation. The
// mutation is then
//  - dropped silently when its ballot is older than ours;
//  - rejected with ERR_INVALID_STATE when a newer configuration cannot be
//    applied, or (ERR_INACTIVE_STATE / ERR_INVALID_STATE) when the replica is
//    inactive or in error;
//  - dropped when a learner is given a new signature (a new learn round is
//    started instead) or is not yet in a learning status that accepts prepares;
//  - acked right away when its decree is already committed, or when the same
//    mutation is already in the prepare list and logged;
//  - otherwise put into the prepare list and appended to the shared log.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;
    {
        rpc_read_stream reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    const decree incoming_decree = mu->data.header.decree;

    dinfo("%s: mutation %s on_prepare", name(), mu->name());

    dassert(mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        derror("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        // no need response because the rpc should have been cancelled on primary in this case
        return;
    }

    // update configuration when necessary
    if (rconfig.ballot > get_ballot())
    {
        if (!update_local_configuration(rconfig))
        {
            derror(
                "%s: mutation %s on_prepare failed as update local configuration failed, state = %s",
                name(), mu->name(),
                enum_to_string(status())
                );
            ack_prepare_message(ERR_INVALID_STATE, mu);
            return;
        }
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        derror(
            "%s: mutation %s on_prepare failed as invalid replica state, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
            );
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        // new learning process
        if (rconfig.learner_signature != _potential_secondary_states.learning_signature)
        {
            init_learn(rconfig.learner_signature);
            // no need response as rpc is already gone
            return;
        }

        const auto learning = _potential_secondary_states.learning_status;
        if (learning != LearningWithPrepare && learning != LearningSucceeded)
        {
            derror(
                "%s: mutation %s on_prepare skipped as invalid learning status, state = %s, learning_status = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );
            // no need response as rpc is already gone
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed: just ack
    if (incoming_decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto existing = _prepare_list->get_mutation_by_decree(incoming_decree);
    if (existing != nullptr && existing->data.header.ballot == mu->data.header.ballot)
    {
        if (!existing->is_logged())
        {
            derror("%s: mutation %s on_prepare skipped as it is duplicate", name(), mu->name());
            // response will be unnecessary when we add retry logic in rpc engine.
            // the retried rpc will use the same id therefore it will be considered responsed
            // even the response is for a previous try.
        }
        else
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options->max_mutation_count_in_prepare_list, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options->staleness_for_commit, "");
    }

    // write the mutation to the shared log; the ack is sent from the completion handler
    dassert(mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(
            &replica::on_append_log_completed,
            this,
            mu,
            std::placeholders::_1,
            std::placeholders::_2
            ),
        gpid_to_hash(get_gpid())
        );
}
// Dispatches a client request to the replica serving its partition.
//
//  - If the request is within 100us of its deadline, it is ended with
//    ERR_TIMEOUT and deleted.
//  - If the target address is known, a header still pending (header_pos !=
//    0xffff) is written at the reserved position and the rpc is issued.
//  - If the address is unknown and `no_delay` is false, this function is
//    re-scheduled once after 1000 ms with no_delay = true.
//  - Otherwise, under _requests_lock, a timeout timer is armed if missing, the
//    request is queued for its partition, and a partition configuration query
//    is started unless one is already in flight for that partition.
void replication_app_client_base::call(request_context* request, bool no_delay)
{
    auto& msg = request->callback_task->get_request();
    auto now_ts = ::dsn::service::env::now_us();

    if (now_ts + 100 >= msg->header().client.timeout_ts_us) // < 100us
    {
        message_ptr nil(nullptr);
        end_request(request, ERR_TIMEOUT, nil);
        delete request;
        return;
    }

    end_point addr;
    int app_id;

    error_code err = get_address(
        request->partition_index,
        !request->is_read,
        addr,
        app_id,
        request->read_header.semantic
        );

    // target node in cache
    if (err == ERR_SUCCESS)
    {
        dbg_dassert(addr != end_point::INVALID, "");
        dassert(app_id > 0, "");

        if (request->header_pos != 0xffff)
        {
            if (!request->is_read)
            {
                request->write_header.gpid.app_id = app_id;
                marshall(msg->writer(), request->write_header, request->header_pos);
                msg->header().client.hash = gpid_to_hash(request->write_header.gpid);
            }
            else
            {
                request->read_header.gpid.app_id = app_id;
                marshall(msg->writer(), request->read_header, request->header_pos);
                msg->header().client.hash = gpid_to_hash(request->read_header.gpid);
            }

            request->header_pos = 0xffff;
        }

        rpc::call(
            addr,
            msg,
            this,
            std::bind(
                &replication_app_client_base::replica_rw_reply,
                this,
                std::placeholders::_1,
                std::placeholders::_2,
                std::placeholders::_3,
                request
                )
            );
        return;
    }

    // target node not known: delay 1 second for further config query
    if (!no_delay)
    {
        tasking::enqueue(
            LPC_REPLICATION_DELAY_QUERY_CONFIG,
            this,
            std::bind(&replication_app_client_base::call, this, request, true),
            0,
            1000
            );
        return;
    }

    zauto_lock l(_requests_lock);

    // init timeout timer if necessary
    if (request->timeout_timer == nullptr)
    {
        request->timeout_timer = tasking::enqueue(
            LPC_REPLICATION_CLIENT_REQUEST_TIMEOUT,
            this,
            std::bind(&replication_app_client_base::on_user_request_timeout, this, request),
            0,
            static_cast<int>((msg->header().client.timeout_ts_us - now_ts) / 1000)
            );
    }

    // put into pending queue of querying target partition
    auto pending = _pending_requests.find(request->partition_index);
    if (pending == _pending_requests.end())
    {
        auto pc = new partition_context;
        pc->query_config_task = nullptr;
        pending = _pending_requests.insert(
            pending_requests::value_type(request->partition_index, pc)
            ).first;
    }

    pending->second->requests.push_back(request);

    // init configuration query task if necessary
    if (pending->second->query_config_task == nullptr)
    {
        message_ptr query_msg = message::create_request(RPC_CM_CALL);

        meta_request_header hdr;
        hdr.rpc_tag = RPC_CM_QUERY_PARTITION_CONFIG_BY_INDEX;
        marshall(query_msg->writer(), hdr);

        configuration_query_by_index_request req;
        req.app_name = _app_name;
        req.partition_indices.push_back(request->partition_index);
        marshall(query_msg->writer(), req);

        pending->second->query_config_task = rpc::call_replicated(
            _last_contact_point,
            _meta_servers,
            query_msg,
            this,
            std::bind(
                &replication_app_client_base::query_partition_configuration_reply,
                this,
                std::placeholders::_1,
                std::placeholders::_2,
                std::placeholders::_3,
                request->partition_index
                )
            );
    }
}
// Completion handler for appending mutation `mu` to the shared log.
//
// @param mu    the mutation that was appended
// @param err   result of the append
// @param size  size reported by the log for this append
//
// On success the mutation is marked as logged. Unless the mutation belongs to
// an older ballot or the replica is inactive, the primary then tries to
// commit, while secondaries and potential secondaries always ack the prepare
// (with the error, if any). Any append error is also reported to the stub.
// After a successful append the mutation is additionally written to the
// private log (without a completion callback), if one exists and the replica
// is not in PS_ERROR.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // `size` is a size_t, so it must be printed with %zu rather than %u
    dinfo("%s: append shared log completed for mutation %s, size = %zu, err = %s",
        name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
            name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
// Starts two-phase commit for mutation `mu` on the primary.
//
// Steps:
//  1. reject with ERR_NOT_ENOUGH_MEMBER when primary + secondaries are fewer
//     than mutation_2pc_min_replica_count;
//  2. stamp the mutation with the current last committed decree and give it an
//     id (current ballot plus either its own decree or max_decree() + 1);
//  3. reject with ERR_CAPACITY_EXCEEDED if its decree is further than
//     staleness_for_commit ahead of the last committed decree;
//  4. prepare it locally; a failure is reported to the client;
//  5. send prepare messages to every secondary and to every learner whose
//     prepare_start_decree is set and not above the mutation's decree;
//  6. append the mutation to the shared log and continue in
//     on_append_log_completed.
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    // answers the client request attached to the mutation with `reason`
    auto reject_client = [this, &mu](error_code reason)
    {
        response_client_message(mu->client_msg(), reason);
    };

    const int replica_count = static_cast<int>(_primary_states.membership.secondaries.size()) + 1;
    if (replica_count < _options.mutation_2pc_min_replica_count)
    {
        reject_client(ERR_NOT_ENOUGH_MEMBER);
        return;
    }

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree != invalid_decree)
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    else
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }

    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options.staleness_for_commit)
    {
        reject_client(ERR_CAPACITY_EXCEEDED);
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    error_code err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        reject_client(err);
        return;
    }

    // remote prepare: secondaries
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (const auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    // remote prepare: learners that have reached the prepare stage
    uint8_t learner_count = 0;
    for (const auto& learner : _primary_states.learners)
    {
        const bool wants_prepare =
            learner.second.prepare_start_decree != invalid_decree &&
            mu->data.header.decree >= learner.second.prepare_start_decree;
        if (wants_prepare)
        {
            send_prepare_message(
                learner.first,
                PS_POTENTIAL_SECONDARY,
                mu,
                _options.prepare_timeout_ms_for_potential_secondaries
                );
            learner_count++;
        }
    }
    mu->set_left_potential_secondary_ack_count(learner_count);

    // it is possible to do commit here when logging is not required for acking prepare.
    // however, it is only possible when replica count == 1 at this moment in the
    // replication group, and we don't want to do this as it is too fragile now.
    // do_possible_commit_on_primary(mu);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(
            &replica::on_append_log_completed,
            this,
            mu,
            std::placeholders::_1,
            std::placeholders::_2
            ),
        gpid_to_hash(get_gpid())
        );

    dassert(nullptr != mu->log_task(), "");
}
// Handles an incoming RPC_PREPARE on a secondary or potential secondary.
//
// The request is decoded into a replica configuration and a mutation. The
// mutation is then
//  - dropped silently when its ballot is older than ours;
//  - rejected with ERR_INVALID_STATE when a newer configuration cannot be
//    applied, or (ERR_INACTIVE_STATE / ERR_INVALID_STATE) when the replica is
//    inactive or in error;
//  - dropped when a learner is not yet in a learning status that accepts
//    prepares;
//  - acked right away when its decree is already committed;
//  - treated as redundant when the same mutation is already in the prepare
//    list (acked only if that one is logged or acking before logging is
//    allowed);
//  - otherwise put into the prepare list, optionally acked before logging,
//    and appended to the shared log.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;
    {
        msg_binary_reader reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    const decree incoming_decree = mu->data.header.decree;

    ddebug( "%s: mutation %s on_prepare", name(), mu->name());

    dassert (mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        ddebug( "%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // update configuration when necessary
    if (rconfig.ballot > get_ballot())
    {
        if (!update_local_configuration(rconfig))
        {
            ddebug(
                "%s: mutation %s on_prepare to %s failed as update local configuration failed",
                name(), mu->name(),
                enum_to_string(status())
                );
            ack_prepare_message(ERR_INVALID_STATE, mu);
            return;
        }
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        ddebug(
            "%s: mutation %s on_prepare to %s skipped",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
            );
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        const auto learning = _potential_secondary_states.learning_status;
        const bool accepts_prepare =
            (learning == LearningWithPrepare || learning == LearningSucceeded);
        if (!accepts_prepare)
        {
            ddebug(
                "%s: mutation %s on_prepare to %s skipped, learnings state = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );
            // do not retry as there may retries later
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed: just ack
    if (incoming_decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto existing = _prepare_list->get_mutation_by_decree(incoming_decree);
    if (existing != nullptr && existing->data.header.ballot == mu->data.header.ballot)
    {
        ddebug( "%s: mutation %s redundant prepare skipped", name(), mu->name());

        if (existing->is_logged() || _options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_start_prepare_for_potential_secondary, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    }

    // ack without logging
    if (_options.prepare_ack_on_secondary_before_logging_allowed)
    {
        ack_prepare_message(err, mu);
    }

    // write log
    dassert (mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(
            &replica::on_append_log_completed,
            this,
            mu,
            std::placeholders::_1,
            std::placeholders::_2
            ),
        gpid_to_hash(get_gpid())
        );

    dassert(mu->log_task() != nullptr, "");
}