// Completion handler for a disk aio task.
//
// Logs when err is not ERR_OK, then dispatches on the task:
//  - tasks whose code is LPC_AIO_BATCH_WRITE are enqueued with the result and
//    one reference is released (the original comment says it was added in
//    process_write);
//  - otherwise the disk_file behind the task is notified; a non-null task
//    returned by on_read_completed is handed to the aio provider, and a
//    non-null task returned by on_write_completed is passed to process_write
//    together with the size reported through the out-parameter.
//
// delay_milliseconds is not used by this implementation.
void disk_engine::complete_io(aio_task *aio, error_code err, uint32_t bytes, int delay_milliseconds)
{
    if (err != ERR_OK)
    {
        dinfo("disk operation failure with code %s, err = %s, aio_task_id = %016" PRIx64,
              aio->spec().name.c_str(),
              err.to_string(),
              aio->id());
    }

    const size_t transferred = static_cast<size_t>(bytes);

    // batching
    if (aio->code() == LPC_AIO_BATCH_WRITE)
    {
        aio->enqueue(err, transferred);
        aio->release_ref(); // added in process_write
        return;
    }

    // no batching
    auto file = (disk_file *)(aio->aio()->file_object);

    if (aio->aio()->type == AIO_Read)
    {
        auto next = file->on_read_completed(aio, err, transferred);
        if (next)
        {
            _provider->aio(next);
        }
        return;
    }

    // write: on_write_completed receives the address of write_size as an
    // out-parameter; the value is only consumed when a task is returned
    uint32_t write_size;
    auto next = file->on_write_completed(aio, static_cast<void *>(&write_size), err, transferred);
    if (next)
    {
        process_write(next, write_size);
    }
}
// Callback invoked when the log write for a configuration update completes.
//
// Asserts that the write succeeded and that the buffer length equals `size`,
// applies *req through update_configuration, and then either
//  - marshals a meta_response_header plus the update response into `resp`
//    and replies, when `resp` is non-null, or
//  - calls err.end_tracking() when there is no response message.
void meta_service::on_log_completed(error_code err, size_t size, blob buffer, std::shared_ptr<configuration_update_request> req, dsn_message_t resp)
{
    dassert(err == ERR_OK, "log operation failed, cannot proceed, err = %s", err.to_string());
    dassert(buffer.length() == size, "log size must equal to the specified buffer size");

    configuration_update_response response;
    update_configuration(*req, response);

    if (resp == nullptr)
    {
        // nobody to reply to
        err.end_tracking();
        return;
    }

    meta_response_header header;
    header.err = err;
    header.primary_address = primary_address();

    marshall(resp, header);
    marshall(resp, response);

    dsn_rpc_reply(resp);
}
void replica::on_copy_checkpoint_ack(error_code err, std::shared_ptr<replica_configuration>& req, std::shared_ptr<learn_response>& resp) { check_hashed_access(); if (PS_PRIMARY != status()) { _primary_states.checkpoint_task = nullptr; return; } if (err != ERR_OK || resp == nullptr) { dwarn("%s: copy checkpoint from secondary failed, err = %s", name(), err.to_string()); _primary_states.checkpoint_task = nullptr; return; } if (resp->err != ERR_OK) { dinfo("%s: copy checkpoint from secondary failed, err = %s", name(), resp->err.to_string()); _primary_states.checkpoint_task = nullptr; return; } if (resp->state.to_decree_included <= _app->last_durable_decree()) { dinfo("%s: copy checkpoint from secondary skipped, as its decree is not bigger than current durable_decree: %" PRIu64 " vs %" PRIu64 "", name(), resp->state.to_decree_included, _app->last_durable_decree() ); _primary_states.checkpoint_task = nullptr; return; } std::string ldir = utils::filesystem::path_combine( _app->learn_dir(), "checkpoint.copy" ); if (utils::filesystem::path_exists(ldir)) utils::filesystem::remove_path(ldir); _primary_states.checkpoint_task = file::copy_remote_files( resp->address, resp->base_local_dir, resp->state.files, ldir, false, LPC_REPLICA_COPY_LAST_CHECKPOINT_DONE, this, [this, resp](error_code err, size_t sz) { this->on_copy_checkpoint_file_completed(err, sz, resp); }, gpid_to_hash(get_gpid()) ); }
/*callback*/
// Handles a replica's reply to a client read/write request.
//
//  - If the request context is already marked completed (the original note
//    says: already timed out before the replica replied), the error is
//    released with end_tracking() and nothing else happens.
//  - If the call succeeded, the error code is unmarshalled from the response;
//    ERR_OK and ERR_HANDLER_NOT_FOUND do not need a retry and end the request.
//  - Every other outcome drops the cached configuration of the partition
//    (it could be wrong) and re-issues the call.
//
// `request` is not used by this implementation.
void replication_app_client_base::replica_rw_reply(
    error_code err,
    dsn_message_t request,
    dsn_message_t response,
    request_context_ptr& rc
    )
{
    {
        zauto_lock l(rc->lock);
        if (rc->completed)
        {
            err.end_tracking();
            return;
        }
    }

    if (err == ERR_OK)
    {
        // the call itself succeeded: read the error code carried by the reply
        ::unmarshall(response, err);

        //
        // some error codes do not need retry
        //
        if (err == ERR_OK || err == ERR_HANDLER_NOT_FOUND)
        {
            end_request(rc, err, response);
            return;
        }
    }

    // retry path (transport failure or a retriable error from the replica)
    dinfo("%s.client: get error %s from replica with index %d",
          _app_name.c_str(),
          err.to_string(),
          rc->partition_index
          );

    // clear partition configuration as it could be wrong
    {
        zauto_write_lock l(_config_lock);
        _config_cache.erase(rc->partition_index);
    }

    // then retry
    call(rc.get(), false);
}
void replica::response_client_message(dsn_message_t request, error_code error) { if (nullptr == request) { error.end_tracking(); return; } ddebug("%s: reply client read/write, err = %s", name(), error.to_string()); dsn_rpc_reply(dsn_msg_create_response(request), error); }
void replica::response_client_message(message_ptr& request, error_code error, decree d/* = invalid_decree*/) { if (nullptr == request) return; message_ptr resp = request->create_response(); resp->writer().write(error); dassert(error != ERR_OK, ""); dinfo("handle replication request with rpc_id = %016llx failed, err = %s", request->header().rpc_id, error.to_string()); rpc::reply(resp); }
// Callback invoked when appending a mutation to the log completes.
//
// Marks the mutation as logged on success. Mutations whose ballot is older
// than the current one, or that complete while the replica is PS_INACTIVE,
// are skipped. Otherwise:
//  - primary: try to commit on success, handle a local failure on error;
//  - secondary / potential secondary: handle a local failure on error, then
//    ack the prepare unless acking before logging is allowed by the options;
//  - PS_ERROR: nothing to do.
// `size` is not used by this implementation.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug("%s: mutation %s on_append_log_completed, err = %s",
           name(), mu->name(), err.to_string());

    const bool succeeded = (err == ERR_OK);
    if (succeeded)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (succeeded)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;

    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (!succeeded)
        {
            handle_local_failure(err);
        }
        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
        break;

    case PS_ERROR:
        break;

    default:
        dassert (false, "");
        break;
    }
}
void replica::handle_local_failure(error_code error) { ddebug( "%s: handle local failure error %s, status = %s", name(), error.to_string(), enum_to_string(status()) ); if (status() == PS_PRIMARY) { _stub->remove_replica_on_meta_server(_primary_states.membership); } update_local_configuration_with_no_ballot_change(PS_ERROR); }
// Builds a prepare_ack for the given mutation and replies to the mutation's
// prepare message with it. Asserts that the mutation carries a prepare
// message.
void replica::ack_prepare_message(error_code err, mutation_ptr& mu)
{
    prepare_ack resp;

    resp.gpid   = get_gpid();
    resp.err    = err;
    resp.ballot = get_ballot();
    resp.decree = mu->data.header.decree;

    // for PS_POTENTIAL_SECONDARY ONLY
    resp.last_committed_decree_in_app          = _app->last_committed_decree();
    resp.last_committed_decree_in_prepare_list = last_committed_decree();

    dassert(nullptr != mu->prepare_msg(), "");
    reply(mu->prepare_msg(), resp);

    ddebug("%s: mutation %s ack_prepare_message, err = %s",
           name(), mu->name(), err.to_string());
}
// Completion handler for a disk aio task.
//
// Logs when err is not ERR_OK, enqueues the task with its result, releases
// one reference on the task (the original comment says it was added in
// start_io) and decrements the engine's outstanding request counter.
//
// delay_milliseconds is not used by this implementation.
void disk_engine::complete_io(aio_task* aio, error_code err, uint32_t bytes, int delay_milliseconds)
{
    // TODO: failure injection, profiling, throttling
    if (err != ERR_OK)
    {
        // %llx requires an unsigned long long argument; convert explicitly so
        // the call stays well-defined whatever integer type id() returns
        dinfo(
            "disk operation failure with code %s, err = %s, aio task id = %llx",
            aio->spec().name.c_str(),
            err.to_string(),
            static_cast<unsigned long long>(aio->id())
            );
    }

    aio->enqueue(err, bytes);
    aio->release_ref(); // added in start_io

    --_request_count;
}
// Callback invoked when the log write for a configuration update completes.
//
// Frees the log buffer, asserts that the write succeeded, decodes the update
// request from `req`, applies it, and replies through `resp` with a
// meta_response_header followed by the update response.
// `size` is not used by this implementation.
void meta_service::on_log_completed(error_code err, int size, char* buffer, message_ptr req, message_ptr resp)
{
    // the buffer is released here, before anything else is checked
    free(buffer);

    dassert(err == ERR_SUCCESS, "log operation failed, cannot proceed, err = %s", err.to_string());

    configuration_update_request update_req;
    configuration_update_response update_resp;
    unmarshall(req, update_req);

    update_configuration(update_req, update_resp);

    meta_response_header header;
    header.err = err;
    header.primary_address = primary_address();

    marshall(resp, header);
    marshall(resp, update_resp);

    rpc::reply(resp);
}
void replica::handle_remote_failure(partition_status st, ::dsn::rpc_address node, error_code error) { derror( "%s: handle remote failure error %s, status = %s, node = %s", name(), error.to_string(), enum_to_string(st), node.to_string() ); error.end_tracking(); dassert (status() == PS_PRIMARY, ""); dassert (node != _stub->_primary_address, ""); switch (st) { case PS_SECONDARY: dassert (_primary_states.check_exist(node, PS_SECONDARY), ""); { configuration_update_request request; request.node = node; request.type = CT_DOWNGRADE_TO_INACTIVE; request.config = _primary_states.membership; downgrade_to_inactive_on_primary(request); } break; case PS_POTENTIAL_SECONDARY: // potential secondary failure does not lead to ballot change // therefore, it is possible to have multiple exec here _primary_states.learners.erase(node); _primary_states.statuses.erase(node); break; case PS_INACTIVE: case PS_ERROR: break; default: dassert (false, ""); break; } }
// Handles the meta server's reply to a configuration update request.
//
// Flow, as implemented below:
//  1. Unless this replica is PS_INACTIVE and the stub is connected, the
//     pending reconfiguration task is cleared and the reply is ignored.
//  2. On a successful call the response is unmarshalled and its error code
//     replaces `err`.
//  3. Any error other than ERR_INVALID_VERSION re-sends the same request
//     (after adding a reference to it) with this function as the callback.
//  4. A response whose ballot is lower than the local ballot is dropped.
//  5. For an ERR_OK response the request and response configurations are
//     asserted to match and type-specific follow-up work is done.
//  6. The returned configuration is applied and the reconfiguration task is
//     cleared.
void replica::on_update_configuration_on_meta_server_reply(error_code err, dsn_message_t request, dsn_message_t response, std::shared_ptr<configuration_update_request> req)
{
    check_hashed_access();

    if (!(PS_INACTIVE == status() && _stub->is_connected()))
    {
        _primary_states.reconfiguration_task = nullptr;
        err.end_tracking();
        return;
    }

    configuration_update_response resp;
    if (err == ERR_OK)
    {
        ::unmarshall(response, resp);
        err = resp.err;
    }

    if (err != ERR_OK)
    {
        ddebug(
            "%s: update configuration reply with err %s, request ballot %lld",
            name(),
            err.to_string(),
            req->config.ballot
            );

        if (err != ERR_INVALID_VERSION)
        {
            rpc_address target(_stub->_failure_detector->get_servers());

            dsn_msg_add_ref(request); // added for another round of rpc::call
            _primary_states.reconfiguration_task = rpc::call(
                target,
                request,
                this,
                std::bind(&replica::on_update_configuration_on_meta_server_reply, this,
                          std::placeholders::_1,
                          std::placeholders::_2,
                          std::placeholders::_3,
                          req),
                gpid_to_hash(get_gpid())
                );
            return;
        }
    }

    ddebug(
        "%s: update configuration reply with err %s, ballot %lld, local %lld",
        name(),
        resp.err.to_string(),
        resp.config.ballot,
        get_ballot()
        );

    if (resp.config.ballot < get_ballot())
    {
        _primary_states.reconfiguration_task = nullptr;
        return;
    }

    // post-update work items?
    if (resp.err == ERR_OK)
    {
        dassert (req->config.gpid == resp.config.gpid, "");
        dassert (req->config.app_type == resp.config.app_type, "");
        dassert (req->config.primary == resp.config.primary, "");
        dassert (req->config.secondaries == resp.config.secondaries, "");

        switch (req->type)
        {
        case CT_UPGRADE_TO_PRIMARY:
            _primary_states.last_prepare_decree_on_new_primary = _prepare_list->max_decree();
            break;

        // no follow-up work for these update types
        case CT_ASSIGN_PRIMARY:
        case CT_DOWNGRADE_TO_SECONDARY:
        case CT_DOWNGRADE_TO_INACTIVE:
        case CT_UPGRADE_TO_SECONDARY:
            break;

        case CT_REMOVE:
            // send a one-way remove request to the removed node, unless that
            // node is this one
            if (req->node != primary_address())
            {
                replica_configuration rconfig;
                replica_helper::get_replica_config(resp.config, req->node, rconfig);
                rpc::call_one_way_typed(req->node, RPC_REMOVE_REPLICA, rconfig, gpid_to_hash(get_gpid()));
            }
            break;

        default:
            dassert (false, "");
        }
    }

    update_configuration(resp.config);
    _primary_states.reconfiguration_task = nullptr;
}
// Callback invoked when appending a mutation to the shared log completes.
//
//  - Marks the mutation as logged on success, logs an error otherwise.
//  - Unless the mutation's ballot is older than the current one or the
//    replica is PS_INACTIVE, reacts by status: the primary tries to commit
//    (or handles the local failure); secondaries and potential secondaries
//    handle a local failure if any and always ack the prepare.
//  - A log failure is propagated to the stub.
//  - On success, when a private log exists and the replica is not PS_ERROR,
//    the mutation is also appended to the private log.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // size_t is printed through %llu with an explicit cast: the previous %u
    // did not match the argument type
    dinfo("%s: append shared log completed for mutation %s, size = %llu, err = %s",
          name(), mu->name(), static_cast<unsigned long long>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    // (status() is re-read here on purpose: the handling above may have
    // called handle_local_failure)
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                //
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                dinfo("%s: append private log completed for mutation %s, size = %llu, err = %s",
                      name(), mu->name(), static_cast<unsigned long long>(size), err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                           name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
void replication_app_client_base::query_partition_configuration_reply(error_code err, dsn_message_t request, dsn_message_t response, request_context_ptr context) { int pidx = context->partition_index; bool conti = true; error_code client_err = ERR_OK; if (err == ERR_OK) { configuration_query_by_index_response resp; ::unmarshall(response, resp); if (resp.err == ERR_OK) { zauto_write_lock l(_config_lock); if (_app_id != -1 && _app_id != resp.app_id) { dassert(false, "app id is changed (mostly the app was removed and created with the same name), local Vs remote: %u vs %u ", _app_id, resp.app_id); } if ( _app_partition_count != -1 && _app_partition_count != resp.partition_count) { dassert(false, "partition count is changed (mostly the app was removed and created with the same name), local Vs remote: %u vs %u ", _app_partition_count, resp.partition_count); } _app_id = resp.app_id; _app_partition_count = resp.partition_count; for (auto it = resp.partitions.begin(); it != resp.partitions.end(); ++it) { partition_configuration& new_config = *it; auto it2 = _config_cache.find(new_config.gpid.pidx); if (it2 == _config_cache.end()) { _config_cache[new_config.gpid.pidx] = new_config; } else if (it2->second.ballot < new_config.ballot) { it2->second = new_config; } else { // nothing to do } } } else if (resp.err == ERR_OBJECT_NOT_FOUND) { derror("%s.client: query config reply err = %s, partition index = %d", _app_name.c_str(), resp.err.to_string(), context->partition_index ); conti = false; client_err = ERR_APP_NOT_EXIST; } else { derror("%s.client: query config reply err = %s, partition index = %d", _app_name.c_str(), resp.err.to_string(), context->partition_index ); conti = false; client_err = resp.err; } } else { derror("%s.client: query config reply err = %s, partition index = %d", _app_name.c_str(), err.to_string(), context->partition_index ); } // get address call if(pidx != -1) { // send pending client msgs partition_context* pc = nullptr; { zauto_lock l(_requests_lock); 
auto it = _pending_replica_requests.find(pidx); if (it != _pending_replica_requests.end()) { pc = it->second; _pending_replica_requests.erase(pidx); } } if (pc != nullptr) { for (auto& req : pc->requests) { if(conti) { call(req, true); } else { dsn_message_t nil(nullptr); end_request(req, client_err, nil); } } pc->requests.clear(); delete pc; } } else // just get app info { if(conti) { call(context, true); } else { dsn_message_t nil(nullptr); end_request(context, client_err, nil); } } }
// Callback invoked when appending a mutation to the shared log completes.
//
//  - Marks the mutation as logged on success, logs an error otherwise.
//  - Unless the mutation's ballot is older than the current one or the
//    replica is PS_INACTIVE, reacts by status: the primary tries to commit
//    (or handles the local failure); secondaries and potential secondaries
//    handle a local failure if any and always ack the prepare.
//  - A log failure is propagated to the stub.
//  - On success, when a private log exists and the replica is not PS_ERROR,
//    the mutation is also appended to the private log (no completion
//    callback is supplied).
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // size_t is printed through %llu with an explicit cast: the previous %u
    // did not match the argument type
    dinfo("%s: append shared log completed for mutation %s, size = %llu, err = %s",
          name(), mu->name(), static_cast<unsigned long long>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    // (status() is re-read here on purpose: the handling above may have
    // called handle_local_failure)
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
// Handles the meta server's reply to a "query by node" request and
// reconciles it with the apps known to this daemon.
//
//  - A failed call is ignored (the original comment notes the query is
//    retried when the timer fires again); so is a reply received while the
//    daemon is not online or one that carries an error.
//  - For every partition in the reply, this host must appear in the
//    partition's secondaries list. Partitions unknown to the local daemon are
//    reported back to the meta server with a CT_REMOVE update whose node is
//    taken from last_drops at the same index.
//  - Local apps that the meta server did not mention are killed.
//
// `request` is not used by this implementation.
void daemon_s_service::on_node_query_reply(error_code err, dsn_message_t request, dsn_message_t response)
{
    ddebug(
        "%s: node view replied, err = %s",
        primary_address().to_string(),
        err.to_string()
        );

    if (err != ERR_OK)
    {
        // retry when the timer fires again later
        return;
    }

    if (!_online)
        return;

    configuration_query_by_node_response resp;
    ::dsn::unmarshall(response, resp);

    if (resp.err != ERR_OK)
        return;

    // work on a snapshot of the local apps taken under the read lock
    apps sapps;
    {
        ::dsn::service::zauto_read_lock l(_lock);
        sapps = _apps;
    }

    // find apps on meta server but not on local daemon
    rpc_address host = primary_address();
    for (const auto& appc : resp.partitions)   // by reference: no per-partition copy
    {
        // host nodes stored in secondaries
        int i;
        for (i = 0; i < (int)appc.config.secondaries.size(); i++)
        {
            if (appc.config.secondaries[i] == host)
            {
                break;
            }
        }

        dassert(i < (int)appc.config.secondaries.size(),
            "host address %s must exist in secondary list of partition %d.%d",
            host.to_string(),
            appc.config.pid.get_app_id(),
            appc.config.pid.get_partition_index()
            );

        auto it = sapps.find(appc.config.pid);
        if (it == sapps.end())
        {
            // worker nodes stored in last-drops, at the index of the host in
            // secondaries; make that assumption explicit before indexing
            dassert(i < (int)appc.config.last_drops.size(),
                "worker address must exist in last-drops list of partition %d.%d",
                appc.config.pid.get_app_id(),
                appc.config.pid.get_partition_index()
                );

            configuration_update_request req;
            req.info = appc.info;
            req.config = appc.config;
            req.host_node = host;
            req.type = config_type::CT_REMOVE;
            req.node = appc.config.last_drops[i];

            std::shared_ptr<app_internal> app(new app_internal(req));
            app->exited = true;
            app->working_port = req.node.port();

            update_configuration_on_meta_server(config_type::CT_REMOVE, std::move(app));
        }
        else
        {
            // matched on daemon and meta server
            sapps.erase(it);
        }
    }

    // find apps on local daemon but not on meta server
    // (sapps is a local snapshot, so its entries can be moved from directly)
    for (auto& app : sapps)
    {
        kill_app(std::move(app.second));
    }
}
// Callback invoked when appending a mutation to the shared log completes.
//
// Marks the mutation as logged on success. Mutations whose ballot is older
// than the current one, or that complete while the replica is PS_INACTIVE,
// are skipped entirely. Otherwise the replica reacts by status (primary:
// commit or local failure; secondary / potential secondary: local failure if
// any, then ack), and afterwards either propagates the log failure to the
// stub or, on success, appends the mutation to the private log when one
// exists and the replica is not PS_ERROR.
// The outer `size` is not used by this implementation.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug("%s: mutation %s on_append_log_completed, err = %s",
           name(), mu->name(), err.to_string());

    const bool succeeded = (err == ERR_OK);
    if (succeeded)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (succeeded)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;

    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (!succeeded)
        {
            handle_local_failure(err);
        }
        ack_prepare_message(err, mu);
        break;

    case PS_ERROR:
        break;

    default:
        dassert (false, "");
        break;
    }

    if (!succeeded)
    {
        // mutation log failure, propagated to all replicas
        _stub->handle_log_failure(err);
    }
    else if (_private_log && status() != PS_ERROR)
    {
        // write local private log if necessary; status() is re-read here,
        // after the status handling above
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            [this](error_code err, size_t size)
            {
                if (err != ERR_OK)
                {
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
void partition_resolver_simple::query_config_reply(error_code err, dsn_message_t request, dsn_message_t response, int partition_index) { auto client_err = ERR_OK; if (err == ERR_OK) { configuration_query_by_index_response resp; unmarshall(response, resp); if (resp.err == ERR_OK) { zauto_write_lock l(_config_lock); if (_app_id != -1 && _app_id != resp.app_id) { dassert(false, "app id is changed (mostly the app was removed and created with the same name), local Vs remote: %u vs %u ", _app_id, resp.app_id); } if (_app_partition_count != -1 && _app_partition_count != resp.partition_count) { dassert(false, "partition count is changed (mostly the app was removed and created with the same name), local Vs remote: %u vs %u ", _app_partition_count, resp.partition_count); } _app_id = resp.app_id; _app_partition_count = resp.partition_count; _app_is_stateful = resp.is_stateful; for (auto it = resp.partitions.begin(); it != resp.partitions.end(); ++it) { auto& new_config = *it; dinfo("%s.client: query config reply, gpid = %d.%d, ballot = %" PRId64 ", primary = %s", _app_path.c_str(), new_config.pid.get_app_id(), new_config.pid.get_partition_index(), new_config.ballot, new_config.primary.to_string() ); auto it2 = _config_cache.find(new_config.pid.get_partition_index()); if (it2 == _config_cache.end()) { std::unique_ptr<partition_info> pi(new partition_info); pi->timeout_count = 0; pi->config = new_config; _config_cache.emplace(new_config.pid.get_partition_index(), std::move(pi)); } else if (_app_is_stateful && it2->second->config.ballot < new_config.ballot) { it2->second->timeout_count = 0; it2->second->config = new_config; } else if (!_app_is_stateful) { it2->second->timeout_count = 0; it2->second->config = new_config; } else { // nothing to do } } } else if (resp.err == ERR_OBJECT_NOT_FOUND) { derror("%s.client: query config reply, gpid = %d.%d, err = %s", _app_path.c_str(), _app_id, partition_index, resp.err.to_string() ); client_err = ERR_APP_NOT_EXIST; } else { 
derror("%s.client: query config reply, gpid = %d.%d, err = %s", _app_path.c_str(), _app_id, partition_index, resp.err.to_string() ); client_err = resp.err; } } else { derror("%s.client: query config reply, gpid = %d.%d, err = %s", _app_path.c_str(), _app_id, partition_index, err.to_string() ); } // get specific or all partition update if (partition_index != -1) { partition_context* pc = nullptr; { zauto_lock l(_requests_lock); auto it = _pending_requests.find(partition_index); if (it != _pending_requests.end()) { pc = it->second; _pending_requests.erase(partition_index); } } if (pc) { handle_pending_requests(pc->requests, client_err); delete pc; } } // get all partition update else { pending_replica_requests reqs; std::deque<request_context_ptr> reqs2; { zauto_lock l(_requests_lock); reqs.swap(_pending_requests); reqs2.swap(_pending_requests_before_partition_count_unknown); } if (!reqs2.empty()) { if (_app_partition_count != -1) { for (auto& req : reqs2) { dassert(req->partition_index == -1, ""); req->partition_index = get_partition_index(_app_partition_count, req->partition_hash); } } handle_pending_requests(reqs2, client_err); } for (auto& r : reqs) { if (r.second) { handle_pending_requests(r.second->requests, client_err); delete r.second; } } } }