Exemple #1
0
void replica::send_prepare_message(const dsn_address_t& addr, partition_status status, mutation_ptr& mu, int timeout_milliseconds)
{
    dsn_message_t msg = dsn_msg_create_request(RPC_PREPARE, timeout_milliseconds, gpid_to_hash(get_gpid()));
    replica_configuration rconfig;
    _primary_states.get_replica_config(status, rconfig);

    {
        msg_binary_writer writer(msg);
        marshall(writer, get_gpid());
        marshall(writer, rconfig);
        mu->write_to(writer);
    }

    mu->remote_tasks()[addr] = rpc::call(addr, msg,
                                         this,
                                         std::bind(&replica::on_prepare_reply,
                                                 this,
                                                 std::make_pair(mu, rconfig.status),
                                                 std::placeholders::_1,
                                                 std::placeholders::_2,
                                                 std::placeholders::_3),
                                         gpid_to_hash(get_gpid())
                                        );

    ddebug(
        "%s: mutation %s send_prepare_message to %s:%hu as %s",
        name(), mu->name(),
        addr.name, addr.port,
        enum_to_string(rconfig.status)
    );
}
Exemple #2
0
// Builds an RPC_PREPARE request for `mu` and issues it to `addr`.
//
// The payload consists of the replica's gpid, the replica configuration that
// _primary_states produces for (`status`, `learn_signature`), and the mutation.
// The task returned by rpc::call is remembered in mu->remote_tasks() under
// `addr`; the reply handler forwards to on_prepare_reply with the
// (mutation, configured status) pair.
//
//   addr                  destination of the request
//   status                status handed to get_replica_config
//   mu                    mutation to be prepared remotely
//   timeout_milliseconds  timeout passed to dsn_msg_create_request
//   learn_signature       signature handed to get_replica_config
void replica::send_prepare_message(
    ::dsn::rpc_address addr, 
    partition_status status, 
    mutation_ptr& mu, 
    int timeout_milliseconds,
    int64_t learn_signature)
{
    replica_configuration rconfig;
    _primary_states.get_replica_config(status, rconfig, learn_signature);

    const auto hash = gpid_to_hash(get_gpid());
    dsn_message_t msg = dsn_msg_create_request(RPC_PREPARE, timeout_milliseconds, hash);

    // the stream lives in its own scope so it is destroyed before the message is sent
    {
        rpc_write_stream payload(msg);
        marshall(payload, get_gpid());
        marshall(payload, rconfig);
        mu->write_to(payload);
    }

    // only the status is needed by the reply handler, so capture just that
    const auto target_status = rconfig.status;
    mu->remote_tasks()[addr] = rpc::call(
        addr,
        msg,
        this,
        [this, mu, target_status](error_code err, dsn_message_t request, dsn_message_t reply)
        {
            on_prepare_reply(std::make_pair(mu, target_status), err, request, reply);
        },
        hash);

    ddebug("%s: mutation %s send_prepare_message to %s as %s",
           name(), mu->name(), addr.to_string(), enum_to_string(rconfig.status));
}
Exemple #3
0
// Builds an RPC_PREPARE request for `mu` and issues it to `addr`.
//
// The message carries, in order, the replica's gpid, the replica configuration
// that _primary_states produces for `status`, and the mutation. A debug
// assertion checks that no task is registered for `addr` yet; the task
// returned by rpc::call is then stored in mu->remote_tasks() under `addr`, with
// on_prepare_reply bound as the reply handler.
//
//   addr                  destination of the request
//   status                status handed to get_replica_config
//   mu                    mutation to be prepared remotely
//   timeout_milliseconds  timeout passed to message::create_request
void replica::send_prepare_message(const end_point& addr, partition_status status, mutation_ptr& mu, int timeout_milliseconds)
{
    replica_configuration rconfig;
    _primary_states.get_replica_config(status, rconfig);

    const auto hash = gpid_to_hash(get_gpid());
    message_ptr msg = message::create_request(RPC_PREPARE, timeout_milliseconds, hash);

    // payload order: gpid, replica configuration, mutation
    marshall(msg, get_gpid());
    marshall(msg, rconfig);
    mu->write_to(msg);

    dbg_dassert (mu->remote_tasks().find(addr) == mu->remote_tasks().end());

    // context handed back to on_prepare_reply along with the rpc results
    auto reply_context = std::make_pair(mu, rconfig.status);

    mu->remote_tasks()[addr] = rpc::call(
        addr,
        msg,
        this,
        std::bind(&replica::on_prepare_reply,
                  this,
                  reply_context,
                  std::placeholders::_1,
                  std::placeholders::_2,
                  std::placeholders::_3),
        hash);

    ddebug("%s: mutation %s send_prepare_message to %s:%d as %s",
           name(), mu->name(),
           addr.name.c_str(), static_cast<int>(addr.port),
           enum_to_string(rconfig.status));
}
// Sends `request` to the server at `addr`.
//
// When request->header_pos is non-zero, the read or write header (selected by
// request->is_read) is first filled with _app_id and the partition index and
// marshalled through a binary_writer over a blob built from header_pos; the
// message's hash and timeout options are set as well. header_pos is then reset
// to 0, so a later call with the same request skips this step.
//
// The RPC is issued while holding request->lock, with replica_rw_reply bound
// as the reply handler and `request` passed along as its last argument.
void replication_app_client_base::call_with_address(dsn::rpc_address addr, request_context_ptr request)
{
    dbg_dassert(!addr.is_invalid(), "");
    dbg_dassert(_app_id > 0, "");

    auto& msg = request->request;

    if (request->header_pos != 0)
    {
        if (!request->is_read)
        {
            auto& header = request->write_header;
            header.gpid.app_id = _app_id;
            header.gpid.pidx = request->partition_index;

            blob header_blob(request->header_pos, 0, sizeof(header));
            binary_writer header_writer(header_blob);
            marshall(header_writer, header);

            dsn_msg_options_t options;
            options.timeout_ms = request->timeout_ms;
            options.thread_hash = gpid_to_hash(header.gpid);
            options.vnid = *reinterpret_cast<uint64_t*>(&header.gpid);

            // TODO: not supported yet DSN_MSGM_VNID | DSN_MSGM_CONTEXT
            dsn_msg_set_options(msg, &options, DSN_MSGM_HASH | DSN_MSGM_TIMEOUT);
        }
        else
        {
            auto& header = request->read_header;
            header.gpid.app_id = _app_id;
            header.gpid.pidx = request->partition_index;

            blob header_blob(request->header_pos, 0, sizeof(header));
            binary_writer header_writer(header_blob);
            marshall(header_writer, header);

            dsn_msg_options_t options;
            options.timeout_ms = request->timeout_ms;
            options.thread_hash = gpid_to_hash(header.gpid);
            options.vnid = *reinterpret_cast<uint64_t*>(&header.gpid);

            // TODO: not supported yet DSN_MSGM_VNID
            dsn_msg_set_options(msg, &options, DSN_MSGM_HASH | DSN_MSGM_TIMEOUT);
        }

        // header has been written; do not write it again for this request
        request->header_pos = 0;
    }

    {
        zauto_lock guard(request->lock);
        rpc::call(
            addr,
            msg,
            this,
            std::bind(&replication_app_client_base::replica_rw_reply,
                      this,
                      std::placeholders::_1,
                      std::placeholders::_2,
                      std::placeholders::_3,
                      request));
    }
}
Exemple #5
0
        // Fetches the learn state from _private_log, starting at the decree
        // right after the app's last committed one, applies it through
        // apply_learned_state_from_private_log, and enqueues a
        // LPC_CHECKPOINT_REPLICA_COMPLETED task that reports the result.
        //
        // `s` selects the completion handler: on_checkpoint_completed for
        // PS_SECONDARY, on_learn_remote_state_completed for
        // PS_POTENTIAL_SECONDARY. Any other value trips an assertion when the
        // task runs.
        void replica::catch_up_with_private_logs(partition_status s)
        {
            learn_state state;
            const auto start_decree = _app->last_committed_decree() + 1;
            _private_log->get_learn_state(get_gpid(), start_decree, state);

            auto err = apply_learned_state_from_private_log(state);

            tasking::enqueue(
                LPC_CHECKPOINT_REPLICA_COMPLETED,
                this,
                [this, err, s]() 
                {
                    if (PS_POTENTIAL_SECONDARY == s)
                    {
                        this->on_learn_remote_state_completed(err);
                    }
                    else if (PS_SECONDARY == s)
                    {
                        this->on_checkpoint_completed(err);
                    }
                    else
                    {
                        dassert(false, "invalid state %s", enum_to_string(s));
                    }
                },
                gpid_to_hash(get_gpid())
                );
        }
// Allocates and fills a request_context for a write to `partition_index`.
//
// The write header receives _app_id, the partition index and `code`. If the
// app id is -1, only a placeholder is reserved in the callback's request and
// its position is kept in header_pos. Otherwise header_pos is set to 0xffff,
// the header is marshalled right away and the request's client hash is derived
// from the gpid.
//
//   partition_index  target partition
//   code             task code stored in the write header
//   callback         response task whose request message receives the header
//   reply_hash       not used by this function
//
// Returns the context allocated with `new`.
// NOTE(review): ownership presumably passes to the caller - confirm.
replication_app_client_base::request_context* replication_app_client_base::create_write_context(
    int partition_index,
    task_code code,
    rpc_response_task_ptr callback,
    int reply_hash
)
{
    auto* context = new request_context;
    context->callback_task = callback;
    context->is_read = false;
    context->partition_index = partition_index;
    context->timeout_timer = nullptr;

    auto& header = context->write_header;
    header.gpid.app_id = _app_id;
    header.gpid.pidx = partition_index;
    header.code = code;

    if (header.gpid.app_id != -1)
    {
        // app id is known: write the header now
        context->header_pos = 0xffff;
        marshall(callback->get_request()->writer(), header);
        callback->get_request()->header().client.hash = gpid_to_hash(header.gpid);
    }
    else
    {
        // app id is not known yet: reserve room for the header
        context->header_pos = callback->get_request()->writer().write_placeholder();
        dbg_dassert(context->header_pos != 0xffff, "");
    }

    return context;
}
// Allocates and fills a request_context for a read from `partition_index`.
//
// The read header receives _app_id, the partition index, `code`, the read
// semantic and the snapshot decree. If the app id is -1, only a placeholder is
// reserved in the callback's request and its position is kept in header_pos.
// Otherwise header_pos is set to 0xffff, the header is marshalled right away
// and the request's client hash is derived from the gpid.
//
//   partition_index  target partition
//   code             task code stored in the read header
//   callback         response task whose request message receives the header
//   read_semantic    stored in the read header's `semantic` field
//   snapshot_decree  stored as version_decree; only used when ReadSnapshot
//   reply_hash       not used by this function
//
// Returns the context allocated with `new`.
// NOTE(review): ownership presumably passes to the caller - confirm.
replication_app_client_base::request_context* replication_app_client_base::create_read_context(
    int partition_index,
    task_code code,
    rpc_response_task_ptr callback,
    read_semantic_t read_semantic,
    decree snapshot_decree, // only used when ReadSnapshot
    int reply_hash
)
{
    auto* context = new request_context;
    context->callback_task = callback;
    context->is_read = true;
    context->partition_index = partition_index;
    context->timeout_timer = nullptr;

    auto& header = context->read_header;
    header.gpid.app_id = _app_id;
    header.gpid.pidx = partition_index;
    header.code = code;
    header.semantic = read_semantic;
    header.version_decree = snapshot_decree;

    if (header.gpid.app_id != -1)
    {
        // app id is known: write the header now
        context->header_pos = 0xffff;
        marshall(callback->get_request()->writer(), header);
        callback->get_request()->header().client.hash = gpid_to_hash(header.gpid);
    }
    else
    {
        // app id is not known yet: reserve room for the header
        context->header_pos = callback->get_request()->writer().write_placeholder();
        dbg_dassert(context->header_pos != 0xffff, "");
    }

    return context;
}
Exemple #8
0
        // Handles the result `err` of a checkpoint attempt.
        //
        // _secondary_states.checkpoint_task is cleared on every path except
        // one: when the app is behind the prepare list and the gap is not
        // covered by the list, a new LPC_CHECKPOINT_REPLICA task is stored
        // there that runs catch_up_with_private_logs(PS_SECONDARY).
        // Errors other than ERR_WRONG_TIMING are passed to handle_local_failure.
        void replica::on_checkpoint_completed(error_code err)
        {
            check_hashed_access();

            // closing or wrong timing: just drop the task
            if (PS_SECONDARY != status() || ERR_WRONG_TIMING == err)
            {
                _secondary_states.checkpoint_task = nullptr;
                return;
            }

            // checkpoint failed: done checkpointing, report the failure
            if (err != ERR_OK)
            {
                _secondary_states.checkpoint_task = nullptr;
                handle_local_failure(err);
                return;
            }

            auto list_committed = _prepare_list->last_committed_decree();

            // no missing commits: everything is ok, done checkpointing
            if (list_committed <= _app->last_committed_decree())
            {
                _secondary_states.checkpoint_task = nullptr;
                return;
            }

            // missing commits that have to be loaded via private logs
            if (_app->last_committed_decree() <= _prepare_list->min_decree())
            {
                _secondary_states.checkpoint_task = tasking::enqueue(
                    LPC_CHECKPOINT_REPLICA,
                    this,
                    [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
                    gpid_to_hash(get_gpid())
                    );
                return;
            }

            // missing commits are covered by the prepare list: apply them one by one
            for (auto d = _app->last_committed_decree() + 1; d <= list_committed; d++)
            {
                auto mu = _prepare_list->get_mutation_by_decree(d);
                dassert(nullptr != mu, "");
                _app->write_internal(mu);
            }

            // everything is ok now, done checkpointing
            _secondary_states.checkpoint_task = nullptr;
        }
Exemple #9
0
        // Handles the reply to a copy-checkpoint request.
        //
        // _primary_states.checkpoint_task is cleared and the function returns
        // when this replica is not primary, when the call failed (`err` is not
        // ERR_OK or `resp` is null), when the response carries an error, or
        // when the response's to_decree_included is not above the app's
        // last durable decree.
        //
        // Otherwise the "checkpoint.copy" directory under the app's learn dir
        // is removed if it exists, and a remote file copy into it is started;
        // its task is stored in _primary_states.checkpoint_task and its
        // completion is forwarded to on_copy_checkpoint_file_completed.
        //
        // `req` is not used by this function.
        void replica::on_copy_checkpoint_ack(error_code err, std::shared_ptr<replica_configuration>& req, std::shared_ptr<learn_response>& resp)
        {
            check_hashed_access();

            if (PS_PRIMARY != status())
            {
                _primary_states.checkpoint_task = nullptr;
                return;
            }

            // transport-level failure
            if (err != ERR_OK || resp == nullptr)
            {
                dwarn("%s: copy checkpoint from secondary failed, err = %s", name(), err.to_string());
                _primary_states.checkpoint_task = nullptr;
                return;
            }

            // failure reported inside the response
            if (resp->err != ERR_OK)
            {
                dinfo("%s: copy checkpoint from secondary failed, err = %s", name(), resp->err.to_string());
                _primary_states.checkpoint_task = nullptr;
                return;
            }

            // remote checkpoint is not newer than what is already durable locally
            if (resp->state.to_decree_included <= _app->last_durable_decree())
            {
                dinfo("%s: copy checkpoint from secondary skipped, as its decree is not bigger than current durable_decree: %" PRIu64 " vs %" PRIu64 "",
                    name(), resp->state.to_decree_included, _app->last_durable_decree()
                    );
                _primary_states.checkpoint_task = nullptr;
                return;
            }

            // start from an empty destination directory
            std::string copy_dir = utils::filesystem::path_combine(_app->learn_dir(), "checkpoint.copy");
            if (utils::filesystem::path_exists(copy_dir))
            {
                utils::filesystem::remove_path(copy_dir);
            }

            _primary_states.checkpoint_task = file::copy_remote_files(
                resp->address,
                resp->base_local_dir,
                resp->state.files,
                copy_dir,
                false,
                LPC_REPLICA_COPY_LAST_CHECKPOINT_DONE,
                this,
                [this, resp](error_code copy_err, size_t copied_size)
                {
                    this->on_copy_checkpoint_file_completed(copy_err, copied_size, resp);
                },
                gpid_to_hash(get_gpid())
                );
        }
Exemple #10
0
 // run in background thread
 void replica::background_checkpoint()
 {
     auto err = _app->checkpoint();
     tasking::enqueue(
         LPC_CHECKPOINT_REPLICA_COMPLETED,
         this,
         [this, err]() { this->on_checkpoint_completed(err); },
         gpid_to_hash(get_gpid())
         );
 }
Exemple #11
0
        // meta server => partition server
        //
        // Logs the proposal (type, proposed node and current ballot) and sends
        // it to `node` as a one-way RPC_CONFIG_PROPOSAL, hashed by the gpid of
        // the proposal's configuration.
        void server_load_balancer::send_proposal(::dsn::rpc_address node, const configuration_update_request& proposal)
        {
            dinfo("send proposal %s of %s, current ballot = %" PRId64,
                  enum_to_string(proposal.type), proposal.node.to_string(), proposal.config.ballot);

            const auto hash = gpid_to_hash(proposal.config.gpid);
            rpc::call_one_way_typed(node, RPC_CONFIG_PROPOSAL, proposal, hash);
        }
Exemple #12
0
// For testing purposes only.
//
// Asserts that group check is disabled in the options, then enqueues a
// LPC_GROUP_CHECK task that runs broadcast_group_check, passing
// `delay_milliseconds` as the last argument of tasking::enqueue. The task is
// stored in _primary_states.group_check_task.
void replica::send_group_check_once_for_test(int delay_milliseconds)
{
    dassert(_options.group_check_disabled, "");

    const auto hash = gpid_to_hash(get_gpid());
    _primary_states.group_check_task = tasking::enqueue(
        LPC_GROUP_CHECK, this, &replica::broadcast_group_check, hash, delay_milliseconds);
}
Exemple #13
0
// For testing purposes only.
//
// Asserts that group check is disabled in the options, then enqueues a
// LPC_GROUP_CHECK task that runs broadcast_group_check, passing
// `delay_milliseconds` (as std::chrono::milliseconds) as the last argument of
// tasking::enqueue. The task is stored in _primary_states.group_check_task.
void replica::send_group_check_once_for_test(int delay_milliseconds)
{
    dassert(_options->group_check_disabled, "");

    const auto hash = gpid_to_hash(get_gpid());
    _primary_states.group_check_task = tasking::enqueue(
        LPC_GROUP_CHECK,
        this,
        [this] { broadcast_group_check(); },
        hash,
        std::chrono::milliseconds(delay_milliseconds));
}
Exemple #14
0
// Sends an RPC_GROUP_CHECK request to every member in
// _primary_states.statuses except the primary itself.
//
// Replies still pending from an earlier round are cancelled and forgotten
// first (a warning is logged). For each remaining member a request is built
// from the membership's app type, the member's replica configuration and the
// last committed decree; potential secondaries additionally get their learner
// signature. The reply task of each call is recorded in
// _primary_states.group_check_pending_replies, keyed by the member's address,
// and on_group_check_reply is registered as the reply handler.
void replica::broadcast_group_check()
{
    dassert (nullptr != _primary_states.group_check_task, "");

    auto& pending_replies = _primary_states.group_check_pending_replies;
    if (!pending_replies.empty())
    {
        // the argument must be unsigned to match the %u conversion
        dwarn(
            "%s: %u group check replies are still pending when doing next round check",
            name(), static_cast<unsigned int>(pending_replies.size())
            );

        for (auto& pending : pending_replies)
        {
            pending.second->cancel(true);
        }
        pending_replies.clear();
    }

    for (auto& member : _primary_states.statuses)
    {
        // the primary does not check itself
        if (member.first == primary_address())
            continue;

        end_point addr = member.first;
        std::shared_ptr<group_check_request> request(new group_check_request);

        request->app_type = _primary_states.membership.app_type;
        request->node = addr;
        _primary_states.get_replica_config(addr, request->config);
        request->last_committed_decree = last_committed_decree();
        request->learner_signature = 0;
        if (member.second == PS_POTENTIAL_SECONDARY)
        {
            // learners carry the signature recorded when they were added
            auto learner = _primary_states.learners.find(member.first);
            dassert (learner != _primary_states.learners.end(), "");
            request->learner_signature = learner->second.signature;
        }

        task_ptr callback_task = rpc::call_typed(
            addr,
            RPC_GROUP_CHECK,
            request,            
            this,
            &replica::on_group_check_reply,
            gpid_to_hash(get_gpid())
            );

        pending_replies[addr] = callback_task;

        ddebug(
            "%s: init_group_check for %s:%d", name(), addr.name.c_str(), static_cast<int>(addr.port)
        );
    }
}
Exemple #15
0
 // Flushes the app via _app->flush(true) and maps the result to an error
 // code: 0 becomes ERR_OK, ERR_WRONG_TIMING is kept, and anything else
 // becomes ERR_LOCAL_APP_FAILURE. The mapped error is then delivered to
 // on_checkpoint_completed through a LPC_CHECKPOINT_REPLICA_COMPLETED task.
 void replica::checkpoint()
 {
     auto flush_result = _app->flush(true);
     auto err = (flush_result != 0)
         ? (flush_result == ERR_WRONG_TIMING ? ERR_WRONG_TIMING : ERR_LOCAL_APP_FAILURE)
         : ERR_OK;

     tasking::enqueue(
         LPC_CHECKPOINT_REPLICA_COMPLETED,
         this,
         [this, err]()
         {
             this->on_checkpoint_completed(err);
         },
         gpid_to_hash(get_gpid())
         );
 }
Exemple #16
0
        // Fetches the learn state from _private_log, starting at the decree
        // right after the app's last committed one, applies it through
        // apply_learned_state_from_private_log, and enqueues a
        // LPC_CHECKPOINT_REPLICA_COMPLETED task that reports the result.
        //
        // For PS_POTENTIAL_SECONDARY the task is bound to
        // _potential_secondary_states.learn_remote_files_completed_task and
        // calls on_learn_remote_state_completed; for every other status it is
        // bound to _secondary_states.checkpoint_completed_task and calls
        // on_checkpoint_completed.
        void replica::catch_up_with_private_logs(partition_status s)
        {
            learn_state state;
            const auto start_decree = _app->last_committed_decree() + 1;
            _private_log->get_learn_state(get_gpid(), start_decree, state);

            auto err = apply_learned_state_from_private_log(state);

            if (s != PS_POTENTIAL_SECONDARY)
            {
                tasking::enqueue(
                    &_secondary_states.checkpoint_completed_task,
                    LPC_CHECKPOINT_REPLICA_COMPLETED,
                    this,
                    [this, err]() { this->on_checkpoint_completed(err); },
                    gpid_to_hash(get_gpid())
                    );
            }
            else
            {
                tasking::enqueue(
                    &_potential_secondary_states.learn_remote_files_completed_task, 
                    LPC_CHECKPOINT_REPLICA_COMPLETED,
                    this,
                    [this, err]() { this->on_learn_remote_state_completed(err); },
                    gpid_to_hash(get_gpid())
                    );
            }
        }
Exemple #17
0
// Sends an RPC_QUERY_PN_DECREE request carrying `*query` to query->node.
//
// The call is hashed by the gpid in query->pid and is given
// std::chrono::seconds(3) as its last argument. The reply is moved into a
// shared_ptr and handed to on_query_decree_ack together with the error code
// and the original query.
void simple_load_balancer::query_decree(std::shared_ptr<query_replica_decree_request> query)
{
    rpc::call(
        query->node,
        RPC_QUERY_PN_DECREE,
        *query,
        this,
        [this, query](error_code err, query_replica_decree_response&& reply)
        {
            // on_query_decree_ack receives the reply through a shared_ptr
            auto shared_reply = std::make_shared<query_replica_decree_response>(std::move(reply));
            on_query_decree_ack(err, query, shared_reply);
        },
        gpid_to_hash(query->pid),
        std::chrono::seconds(3));
}
Exemple #18
0
// Starts the group check task on a primary replica.
//
// Does nothing unless this replica is PS_PRIMARY and group check is enabled in
// the options. Otherwise asserts that no group check task exists yet and
// enqueues a LPC_GROUP_CHECK task that runs broadcast_group_check, passing 0
// and _options.group_check_internal_ms as the trailing arguments of
// tasking::enqueue. The task is stored in _primary_states.group_check_task.
void replica::init_group_check()
{
    const bool enabled_on_primary =
        (PS_PRIMARY == status()) && !_options.group_check_disabled;
    if (!enabled_on_primary)
    {
        return;
    }

    dassert(nullptr == _primary_states.group_check_task, "");

    const auto hash = gpid_to_hash(get_gpid());
    _primary_states.group_check_task = tasking::enqueue(
        LPC_GROUP_CHECK,
        this,
        &replica::broadcast_group_check,
        hash,
        0,
        _options.group_check_internal_ms);
}
Exemple #19
0
        // Decides whether a checkpoint should be started and, if so, starts it.
        //
        // Nothing happens when the replica is neither primary nor secondary,
        // when the app supports delta state learning, when a checkpoint task
        // is already recorded in _secondary_states, or when fewer than 10000
        // decrees separate the last committed and last durable decree.
        //
        // A primary downgrades itself to secondary through
        // downgrade_to_secondary_on_primary; a secondary enqueues
        // replica::checkpoint as a LPC_CHECKPOINT_REPLICA task and stores it
        // in _secondary_states.checkpoint_task.
        void replica::init_checkpoint()
        {
            check_hashed_access();

            // only applicable to primary and secondary replicas
            if (status() != PS_PRIMARY && status() != PS_SECONDARY)
            {
                return;
            }

            // no need to checkpoint
            if (_app->is_delta_state_learning_supported())
            {
                return;
            }

            // already running
            if (_secondary_states.checkpoint_task != nullptr)
            {
                return;
            }

            // private log must be enabled to make sure commits
            // are not lost during checkpointing
            dassert(nullptr != _private_log, "log_enable_private_prepare must be true for checkpointing");

            // TODO: when NOT to checkpoint, but use private log replay to build the state
            if (last_committed_decree() - last_durable_decree() < 10000)
            {
                return;
            }

            // secondary can start the checkpoint in the long running thread pool
            if (PS_PRIMARY != status())
            {
                dassert(PS_SECONDARY == status(), "");

                _secondary_states.checkpoint_task = tasking::enqueue(
                    LPC_CHECKPOINT_REPLICA,
                    this,
                    &replica::checkpoint,
                    gpid_to_hash(get_gpid())
                    );
                return;
            }

            // primary is downgraded to secondary for checkpointing as no write can be seen
            // during checkpointing (i.e., state is frozen)
            configuration_update_request proposal;
            proposal.config = _primary_states.membership;
            proposal.type = CT_DOWNGRADE_TO_SECONDARY;
            proposal.node = proposal.config.primary;
            downgrade_to_secondary_on_primary(proposal);
        }
Exemple #20
0
// Constructs a mutation queue for partition `gpid`.
//
// Registers a COUNTER_TYPE_NUMBER performance counter named
// "<app_id>.<partition_index>.2pc#" under "eon.replication" and sets it to 0,
// resets the running-op count and the pending mutation, asserts that the app
// id is non-zero, and caches the pointer returned by
// dsn_task_queue_virtual_length_ptr for RPC_PREPARE and the gpid's hash.
//
//   gpid                  partition this queue belongs to
//   max_concurrent_op     stored in _max_concurrent_op (default noted as 2)
//   batch_write_disabled  stored in _batch_write_disabled (default noted as false)
mutation_queue::mutation_queue(gpid gpid, int max_concurrent_op /*= 2*/, bool batch_write_disabled /*= false*/)
    : _max_concurrent_op(max_concurrent_op), _batch_write_disabled(batch_write_disabled)
{
    std::stringstream name_builder;
    name_builder << gpid.get_app_id() << "." << gpid.get_partition_index() << "." << "2pc#";
    const std::string counter_name = name_builder.str();

    _current_op_counter.init("eon.replication", counter_name.c_str(), COUNTER_TYPE_NUMBER, "current running 2pc#");
    _current_op_counter.set(0);

    _current_op_count = 0;
    _pending_mutation = nullptr;

    dassert(gpid.get_app_id() != 0, "invalid gpid");
    _pcount = dsn_task_queue_virtual_length_ptr(RPC_PREPARE, gpid_to_hash(gpid));
}
Exemple #21
0
// Sends a configuration update of kind `type` for `node` to the meta server.
//
// `newConfig` has its last_committed_decree refreshed in place. The replica
// then switches to PS_INACTIVE (marked transient) without changing its ballot.
// The request carries a copy of `newConfig` with the ballot incremented by
// one, plus `type` and `node`. A previously outstanding reconfiguration task
// is cancelled, and the new RPC task - addressed to the servers reported by
// the failure detector - is stored in _primary_states.reconfiguration_task.
// Replies go to on_update_configuration_on_meta_server_reply together with
// the request.
void replica::update_configuration_on_meta_server(config_type type, ::dsn::rpc_address node, partition_configuration& newConfig)
{
    newConfig.last_committed_decree = last_committed_decree();

    // every type except the two "become primary" ones must come from the current primary
    const bool becomes_primary =
        (type == CT_ASSIGN_PRIMARY) || (type == CT_UPGRADE_TO_PRIMARY);
    if (!becomes_primary)
    {
        dassert (status() == PS_PRIMARY, "");
        dassert (newConfig.ballot == _primary_states.membership.ballot, "");
    }

    // disable 2pc during reconfiguration
    // it is possible to do this only for CT_DOWNGRADE_TO_SECONDARY,
    // but we choose to disable 2pc during all reconfiguration types
    // for simplicity at the cost of certain write throughput
    update_local_configuration_with_no_ballot_change(PS_INACTIVE);
    set_inactive_state_transient(true);

    // request proposes the next ballot
    auto request = std::shared_ptr<configuration_update_request>(new configuration_update_request);
    request->config = newConfig;
    request->config.ballot++;
    request->type = type;
    request->node = node;

    dsn_message_t msg = dsn_msg_create_request(RPC_CM_UPDATE_PARTITION_CONFIGURATION, 0, 0);
    ::marshall(msg, *request);

    // only one reconfiguration may be in flight
    if (nullptr != _primary_states.reconfiguration_task)
    {
        _primary_states.reconfiguration_task->cancel(true);
    }

    rpc_address target(_stub->_failure_detector->get_servers());
    _primary_states.reconfiguration_task = rpc::call(
        target,
        msg,
        this,
        std::bind(&replica::on_update_configuration_on_meta_server_reply,
                  this,
                  std::placeholders::_1,
                  std::placeholders::_2,
                  std::placeholders::_3,
                  request),
        gpid_to_hash(get_gpid()));
}
Exemple #22
0
// Starts the group check timer on a primary replica.
//
// Always logs "init group check". Does nothing further unless this replica is
// PS_PRIMARY and group check is enabled in the options. Otherwise asserts that
// no group check task exists yet and creates a LPC_GROUP_CHECK timer task via
// tasking::enqueue_timer that runs broadcast_group_check, using
// _options->group_check_interval_ms as the interval argument. The task is
// stored in _primary_states.group_check_task.
void replica::init_group_check()
{
    check_hashed_access();

    ddebug("%s: init group check", name());

    const bool enabled_on_primary =
        (partition_status::PS_PRIMARY == status()) && !_options->group_check_disabled;
    if (!enabled_on_primary)
    {
        return;
    }

    dassert(nullptr == _primary_states.group_check_task, "");

    const auto interval = std::chrono::milliseconds(_options->group_check_interval_ms);
    _primary_states.group_check_task = tasking::enqueue_timer(
        LPC_GROUP_CHECK,
        this,
        [this] { broadcast_group_check(); },
        interval,
        gpid_to_hash(get_gpid()));
}
Exemple #23
0
// Registers proposal.node as a learner and tells it to start learning.
//
// Ignored unless this replica is PS_PRIMARY. The proposal's configuration is
// asserted to match the current membership, and the node is asserted to be
// neither primary nor secondary. A node that is already in the learner table
// is left untouched. Otherwise a new learner state with a random 64-bit
// signature is recorded, the node's status becomes PS_POTENTIAL_SECONDARY, and
// a one-way RPC_LEARN_ADD_LEARNER request carrying that signature is sent to
// the node.
void replica::add_potential_secondary(configuration_update_request& proposal)
{
    if (status() != PS_PRIMARY)
    {
        return;
    }

    const auto& membership = _primary_states.membership;
    dassert (proposal.config.ballot == get_ballot(), "");
    dassert (proposal.config.gpid == membership.gpid, "");
    dassert (proposal.config.app_type == membership.app_type, "");
    dassert (proposal.config.primary == membership.primary, "");
    dassert (proposal.config.secondaries == membership.secondaries, "");
    dassert (!_primary_states.check_exist(proposal.node, PS_PRIMARY), "");
    dassert (!_primary_states.check_exist(proposal.node, PS_SECONDARY), "");

    // already a learner: nothing to add
    if (_primary_states.learners.find(proposal.node) != _primary_states.learners.end())
    {
        return;
    }

    remote_learner_state learner;
    learner.prepare_start_decree = invalid_decree;
    learner.signature = random64(0, static_cast<uint64_t>(-1LL));
    learner.timeout_task = nullptr; // TODO: add timer for learner task

    _primary_states.learners[proposal.node] = learner;
    _primary_states.statuses[proposal.node] = PS_POTENTIAL_SECONDARY;

    group_check_request request;
    request.app_type = _primary_states.membership.app_type;
    request.node = proposal.node;
    _primary_states.get_replica_config(proposal.node, request.config);
    request.last_committed_decree = last_committed_decree();
    request.learner_signature = learner.signature;

    rpc::call_one_way_typed(proposal.node, RPC_LEARN_ADD_LEARNER, request, gpid_to_hash(get_gpid()));
}
Exemple #24
0
// Starts the prepare phase for mutation `mu` on the primary.
//
// Steps, in order:
//   1. stamp the mutation with the last committed decree and assign its id:
//      the current ballot plus either its existing decree or, if it has none,
//      the prepare list's max decree + 1;
//   2. reject the mutation with ERR_CAPACITY_EXCEEDED when its decree is more
//      than _options->staleness_for_commit ahead of the last committed decree;
//   3. prepare it locally in _prepare_list; a failure is reported back to the
//      clients as well;
//   4. send prepare messages to all secondaries and to those learners whose
//      prepare_start_decree is set and not above the mutation's decree,
//      recording how many acks are expected from each group;
//   5. if the mutation is already logged, try to commit; otherwise append it
//      to the shared log with on_append_log_completed as the callback.
//
// On rejection (steps 2 and 3) every client request attached to the mutation
// receives the error via response_client_message.
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    // replies `e` to every client request carried by the mutation
    auto reject_clients = [this, &mu](error_code e)
    {
        for (auto& r : mu->client_requests)
        {
            response_client_message(r, e);
        }
    };

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree != invalid_decree)
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    else
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }

    dinfo("%s: mutation %s init_prepare, mutation_tid=%" PRIu64, name(), mu->name(), mu->tid());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options->staleness_for_commit)
    {
        reject_clients(ERR_CAPACITY_EXCEEDED);
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    error_code err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        reject_clients(err);
        return;
    }

    // remote prepare: secondaries
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options->prepare_timeout_ms_for_secondaries);
    }

    // remote prepare: learners that have reached the point of receiving prepares
    uint8_t count = 0;
    for (auto& learner : _primary_states.learners)
    {
        const auto& learner_state = learner.second;
        if (learner_state.prepare_start_decree == invalid_decree)
            continue;
        if (mu->data.header.decree < learner_state.prepare_start_decree)
            continue;

        send_prepare_message(learner.first, PS_POTENTIAL_SECONDARY, mu, _options->prepare_timeout_ms_for_potential_secondaries, learner_state.signature);
        count++;
    }
    mu->set_left_potential_secondary_ack_count(count);

    // already in the log: nothing to append, go straight to the commit check
    if (mu->is_logged())
    {
        do_possible_commit_on_primary(mu);
        return;
    }

    dassert(mu->data.header.log_offset == invalid_offset, "");
    dassert(mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    dassert(nullptr != mu->log_task(), "");
}
Exemple #25
0
// Completion callback for appending mutation `mu` to the shared log.
//
//   mu    mutation that was appended
//   err   result of the append
//   size  size reported by the log for the append
//
// On success the mutation is marked as logged. If the mutation's ballot is not
// older than the current one and the replica is not inactive, the result is
// handled according to the replica's status:
//   - primary: try to commit on success, otherwise handle_local_failure;
//   - secondary / potential secondary: handle_local_failure on error, and ack
//     the prepare message in every case;
//   - error: nothing.
// Any append error is also passed to _stub->handle_log_failure. On success,
// when a private log exists and the replica is not in PS_ERROR, the mutation
// is additionally appended to the private log.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // size_t is printed through an explicit unsigned long long cast so the
    // argument matches the %llu conversion on every platform
    dinfo("%s: append shared log completed for mutation %s, size = %llu, err = %s",
          name(), mu->name(), static_cast<unsigned long long>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                // 
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                // same size_t formatting fix as for the shared-log message
                dinfo("%s: append private log completed for mutation %s, size = %llu, err = %s",
                      name(), mu->name(), static_cast<unsigned long long>(size), err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                           name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
Exemple #26
0
// Handles a prepare request for one mutation.
//
// The request carries the sender's replica_configuration followed by the
// mutation itself. The handler:
//   1. drops the request when the mutation's ballot is older than ours,
//   2. adopts the sender's configuration when its ballot is newer,
//   3. rejects/skips the request when the local status cannot take prepares,
//   4. acks immediately when the decree is already committed or is a logged
//      duplicate,
//   5. otherwise puts the mutation into the prepare list and appends it to
//      the stub's log; on_append_log_completed is bound as the completion
//      callback of that append.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    // keep the reader in its own scope so it is gone once decoding is done
    {
        rpc_read_stream reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    const decree target_decree = mu->data.header.decree;

    dinfo("%s: mutation %s on_prepare", name(), mu->name());

    dassert(mu->data.header.ballot == rconfig.ballot, "");

    // --- ballot checks ---------------------------------------------------

    if (mu->data.header.ballot < get_ballot())
    {
        // stale view: no reply is sent, the rpc should already have been
        // cancelled on the primary in this case
        derror("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // a newer ballot means our configuration must be refreshed first
    if (rconfig.ballot > get_ballot() && !update_local_configuration(rconfig))
    {
        derror(
            "%s: mutation %s on_prepare failed as update local configuration failed, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(ERR_INVALID_STATE, mu);
        return;
    }

    // --- status checks ---------------------------------------------------

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        derror(
            "%s: mutation %s on_prepare failed as invalid replica state, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );

        // a transient inactive state gets its own error code
        const bool transient_inactive = (PS_INACTIVE == status() && _inactive_is_transient);
        ack_prepare_message(
            transient_inactive ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
            );
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        // a different signature starts a new learning process
        if (rconfig.learner_signature != _potential_secondary_states.learning_signature)
        {
            init_learn(rconfig.learner_signature);
            // no reply needed: the rpc is already gone
            return;
        }

        const bool learning_accepts_prepare =
            _potential_secondary_states.learning_status == LearningWithPrepare ||
            _potential_secondary_states.learning_status == LearningSucceeded;

        if (!learning_accepts_prepare)
        {
            derror(
                "%s: mutation %s on_prepare skipped as invalid learning status, state = %s, learning_status = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );

            // no reply needed: the rpc is already gone
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed locally: nothing left to do but confirm
    if (target_decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // --- actual prepare --------------------------------------------------

    auto existing = _prepare_list->get_mutation_by_decree(target_decree);
    const bool same_ballot_duplicate =
        existing != nullptr && existing->data.header.ballot == mu->data.header.ballot;

    if (same_ballot_duplicate)
    {
        if (!existing->is_logged())
        {
            derror("%s: mutation %s on_prepare skipped as it is duplicate", name(), mu->name());
            // a reply becomes unnecessary once the rpc engine has retry logic:
            // the retried rpc reuses the same id, so it is considered answered
            // even if the response belongs to an earlier attempt.
            return;
        }

        ack_prepare_message(ERR_OK, mu);
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    // the accepted decree must stay inside the window allowed for our role
    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options->max_mutation_count_in_prepare_list, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options->staleness_for_commit, "");
    }

    // hand the mutation to the log; the completion callback is
    // on_append_log_completed
    dassert(mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed,
                  this,
                  mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );
}
// Sends a client request to the node returned by get_address() for its
// partition, or parks it until that node is known.
//
// request  - the pending client request; it is deleted here when it has
//            (almost) timed out.
// no_delay - when false and the target is unknown, the call is retried once
//            after a delay with no_delay == true; when true the request is
//            queued and a partition-configuration query is started if none
//            is in flight for that partition.
void replication_app_client_base::call(request_context* request, bool no_delay)
{
    auto& msg = request->callback_task->get_request();
    auto current_us = ::dsn::service::env::now_us();

    // fewer than 100us left before the deadline: report a timeout right away
    if (msg->header().client.timeout_ts_us <= current_us + 100)
    {
        message_ptr nil(nullptr);
        end_request(request, ERR_TIMEOUT, nil);
        delete request;
        return;
    }

    end_point addr;
    int app_id;

    error_code resolve_err = get_address(
        request->partition_index,
        !request->is_read,
        addr,
        app_id,
        request->read_header.semantic
        );

    // case 1: target node found in the cache -> send the request now
    if (resolve_err == ERR_SUCCESS)
    {
        dbg_dassert(addr != end_point::INVALID, "");
        dassert(app_id > 0, "");

        // the header is written only once; header_pos is set to 0xffff afterwards
        const bool header_not_written = (request->header_pos != 0xffff);
        if (header_not_written)
        {
            if (!request->is_read)
            {
                request->write_header.gpid.app_id = app_id;
                marshall(msg->writer(), request->write_header, request->header_pos);
                msg->header().client.hash = gpid_to_hash(request->write_header.gpid);
            }
            else
            {
                request->read_header.gpid.app_id = app_id;
                marshall(msg->writer(), request->read_header, request->header_pos);
                msg->header().client.hash = gpid_to_hash(request->read_header.gpid);
            }

            request->header_pos = 0xffff;
        }

        rpc::call(
            addr,
            msg,
            this,
            std::bind(&replication_app_client_base::replica_rw_reply,
                      this,
                      std::placeholders::_1,
                      std::placeholders::_2,
                      std::placeholders::_3,
                      request)
        );
        return;
    }

    // case 2: target unknown, first attempt -> retry later with no_delay set
    if (!no_delay)
    {
        // delay 1 second for further config query
        tasking::enqueue(
            LPC_REPLICATION_DELAY_QUERY_CONFIG,
            this,
            std::bind(&replication_app_client_base::call, this, request, true),
            0,
            1000
        );
        return;
    }

    // case 3: target still unknown -> queue the request and query the config
    zauto_lock l(_requests_lock);

    // arm the timeout timer the first time the request gets here
    if (request->timeout_timer == nullptr)
    {
        request->timeout_timer = tasking::enqueue(
            LPC_REPLICATION_CLIENT_REQUEST_TIMEOUT,
            this,
            std::bind(&replication_app_client_base::on_user_request_timeout, this, request),
            0,
            static_cast<int>((msg->header().client.timeout_ts_us - current_us) / 1000)
        );
    }

    // locate (or create) the pending-queue entry of the target partition
    auto slot = _pending_requests.find(request->partition_index);
    if (slot == _pending_requests.end())
    {
        auto fresh_context = new partition_context;
        fresh_context->query_config_task = nullptr;
        slot = _pending_requests.insert(
            pending_requests::value_type(request->partition_index, fresh_context)).first;
    }

    auto& pending = slot->second;
    pending->requests.push_back(request);

    // a query for this partition is already in flight: nothing more to do
    if (pending->query_config_task != nullptr)
    {
        return;
    }

    // build and send the partition-configuration query
    message_ptr query_msg = message::create_request(RPC_CM_CALL);

    meta_request_header hdr;
    hdr.rpc_tag = RPC_CM_QUERY_PARTITION_CONFIG_BY_INDEX;
    marshall(query_msg->writer(), hdr);

    configuration_query_by_index_request req;
    req.app_name = _app_name;
    req.partition_indices.push_back(request->partition_index);
    marshall(query_msg->writer(), req);

    pending->query_config_task = rpc::call_replicated(
        _last_contact_point,
        _meta_servers,
        query_msg,
        this,
        std::bind(&replication_app_client_base::query_partition_configuration_reply,
                  this,
                  std::placeholders::_1,
                  std::placeholders::_2,
                  std::placeholders::_3,
                  request->partition_index)
    );
}
Exemple #28
0
// Completion handler for a log append of mutation `mu` (the log messages
// below refer to it as the shared log).
//
// mu   - the mutation whose append has finished.
// err  - result of the append; ERR_OK marks the mutation as logged.
// size - size value reported by the append; only used for logging here.
//
// Depending on the current partition status the handler then tries to commit
// (primary) or acks the prepare (secondary / potential secondary). Any append
// failure is additionally reported to the stub, and on success the mutation
// is also appended to the private log when one is configured.
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    // `size` is a size_t but the format uses %u, which consumes an
    // unsigned int. Passing a mismatched type through a printf-style variadic
    // call is undefined behaviour, so narrow it explicitly.
    // NOTE(review): assumes dinfo forwards to a printf-style formatter, as
    // its %s/%u usage suggests - confirm.
    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), static_cast<unsigned int>(size), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations (older ballot) and do nothing while inactive
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack, so that the sender learns about failures as well
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary; no tracker and no callback are
    // passed for this append
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
Exemple #29
0
// Starts the prepare phase for mutation `mu`; the replica is asserted to be
// in PS_PRIMARY status.
//
// Steps:
//   1. reject when there are too few members, when the decree would exceed
//      the staleness bound, or when the local prepare list refuses it; every
//      rejection is answered through response_client_message,
//   2. stamp the mutation with the current ballot and its decree,
//   3. send prepare messages to all secondaries and to the learners whose
//      prepare_start_decree has been reached,
//   4. append the mutation to the stub's log, with on_append_log_completed
//      bound as the completion callback.
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    // common exit for all rejections: tell the client why
    auto reject = [this, &mu](error_code reason)
    {
        response_client_message(mu->client_msg(), reason);
    };

    // the primary itself counts as one member
    const int member_count = static_cast<int>(_primary_states.membership.secondaries.size()) + 1;
    if (member_count < _options.mutation_2pc_min_replica_count)
    {
        reject(ERR_NOT_ENOUGH_MEMBER);
        return;
    }

    // assign the id: keep an already chosen decree, otherwise take the next one
    mu->data.header.last_committed_decree = last_committed_decree();
    const auto assigned_decree = (mu->data.header.decree == invalid_decree)
        ? _prepare_list->max_decree() + 1
        : mu->data.header.decree;
    mu->set_id(get_ballot(), assigned_decree);

    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options.staleness_for_commit)
    {
        reject(ERR_CAPACITY_EXCEEDED);
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    error_code err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        reject(err);
        return;
    }

    // remote prepare: secondaries
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count(
        static_cast<unsigned int>(_primary_states.membership.secondaries.size()));
    for (const auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    // remote prepare: learners that have reached their prepare start decree
    uint8_t learner_prepare_count = 0;
    for (const auto& learner : _primary_states.learners)
    {
        const bool prepare_started = learner.second.prepare_start_decree != invalid_decree;
        if (prepare_started && mu->data.header.decree >= learner.second.prepare_start_decree)
        {
            send_prepare_message(learner.first, PS_POTENTIAL_SECONDARY, mu, _options.prepare_timeout_ms_for_potential_secondaries);
            ++learner_prepare_count;
        }
    }
    mu->set_left_potential_secondary_ack_count(learner_prepare_count);

    // committing right here would be possible when logging is not required
    // for acking prepare, but only with a single replica in the group; that
    // is considered too fragile for now, hence:
    // do_possible_commit_on_primary(mu);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed,
                  this,
                  mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    dassert(nullptr != mu->log_task(), "");
}
Exemple #30
0
// Handles a prepare request for one mutation.
//
// The request carries the sender's replica_configuration followed by the
// mutation. Requests with an old ballot are dropped, a newer ballot triggers
// a local configuration update, and replicas whose status cannot take
// prepares reject or skip the request. Already committed decrees and
// redundant prepares are answered without preparing again. Otherwise the
// mutation enters the prepare list, is optionally acked before logging
// (prepare_ack_on_secondary_before_logging_allowed), and is appended to the
// stub's log with on_append_log_completed as the completion callback.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    // keep the reader in its own scope so it is gone once decoding is done
    {
        msg_binary_reader reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    const decree target_decree = mu->data.header.decree;

    ddebug( "%s: mutation %s on_prepare", name(), mu->name());

    dassert (mu->data.header.ballot == rconfig.ballot, "");

    // --- ballot checks ---------------------------------------------------

    // stale view: drop silently
    if (mu->data.header.ballot < get_ballot())
    {
        ddebug( "%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // a newer ballot means our configuration must be refreshed first
    if (rconfig.ballot > get_ballot() && !update_local_configuration(rconfig))
    {
        ddebug(
            "%s: mutation %s on_prepare  to %s failed as update local configuration failed",
            name(), mu->name(),
            enum_to_string(status())
        );
        ack_prepare_message(ERR_INVALID_STATE, mu);
        return;
    }

    // --- status checks ---------------------------------------------------

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        ddebug(
            "%s: mutation %s on_prepare  to %s skipped",
            name(), mu->name(),
            enum_to_string(status())
        );

        // a transient inactive state gets its own error code
        const bool transient_inactive = (PS_INACTIVE == status() && _inactive_is_transient);
        ack_prepare_message(
            transient_inactive ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
        );
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        const bool learning_accepts_prepare =
            _potential_secondary_states.learning_status == LearningWithPrepare ||
            _potential_secondary_states.learning_status == LearningSucceeded;

        if (!learning_accepts_prepare)
        {
            ddebug(
                "%s: mutation %s on_prepare to %s skipped, learnings state = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
            );

            // no reply here; there may be retries later
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed locally: nothing left to do but confirm
    if (target_decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // --- actual prepare --------------------------------------------------

    auto existing = _prepare_list->get_mutation_by_decree(target_decree);
    const bool same_ballot_duplicate =
        existing != nullptr && existing->data.header.ballot == mu->data.header.ballot;

    if (same_ballot_duplicate)
    {
        ddebug( "%s: mutation %s redundant prepare skipped", name(), mu->name());

        // confirm only if the earlier copy is logged, or acking before
        // logging is permitted
        if (existing->is_logged() || _options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    // the accepted decree must stay inside the window allowed for our role
    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_start_prepare_for_potential_secondary, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    }

    // optional early ack, before the mutation has been logged
    if (_options.prepare_ack_on_secondary_before_logging_allowed)
    {
        ack_prepare_message(err, mu);
    }

    // write log
    dassert (mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed,
                  this,
                  mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    dassert(mu->log_task() != nullptr, "");
}