Ejemplo n.º 1
0
void replica::on_group_check(const group_check_request& request, __out_param group_check_response& response)
{
    ddebug(
        "%s: on_group_check from %s:%d",
        name(), request.config.primary.name.c_str(), request.config.primary.port
        );
    
    if (request.config.ballot < get_ballot())
    {
        response.err = ERR_VERSION_OUTDATED;
        return;
    }
    else if (request.config.ballot > get_ballot())
    {
        update_local_configuration(request.config);
    }
    else if (is_same_ballot_status_change_allowed(status(), request.config.status))
    {
        update_local_configuration(request.config, true);
    }
    
    switch (status())
    {
    case PS_INACTIVE:
        break;
    case PS_SECONDARY:
        if (request.last_committed_decree > last_committed_decree())
        {
            _prepare_list->commit(request.last_committed_decree, true);
        }
        break;
    case PS_POTENTIAL_SECONDARY:
        init_learn(request.learner_signature);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
    }
    
    response.gpid = get_gpid();
    response.node = primary_address();
    response.err = ERR_SUCCESS;
    if (status() == PS_ERROR)
    {
        response.err = ERR_INVALID_STATE;
    }

    response.last_committed_decree_in_app = _app->last_committed_decree();
    response.last_committed_decree_in_prepare_list = last_committed_decree();
    response.learner_status_ = _potential_secondary_states.learning_status;
    response.learner_signature = _potential_secondary_states.learning_signature;
}
Ejemplo n.º 2
0
void replica::on_config_sync(const partition_configuration& config)
{
    ddebug( "%s: configuration sync", name());

    // no outdated update
    if (config.ballot < get_ballot())
        return;

    if (status() == PS_PRIMARY || nullptr != _primary_states.reconfiguration_task)
    {
        // nothing to do as pirmary always holds the truth
    }
    else
    {
        update_configuration(config);

        if (status() == PS_INACTIVE && !_inactive_is_transient)
        {
            if (config.primary == primary_address() // dead primary
                || config.primary.is_invalid() // primary is dead (otherwise let primary remove this)
                )
            {
                _stub->remove_replica_on_meta_server(config);
            }
        }
    }
}
Ejemplo n.º 3
0
void replica::on_group_check_reply(error_code err, const std::shared_ptr<group_check_request>& req, const std::shared_ptr<group_check_response>& resp)
{
    check_hashed_access();

    if (partition_status::PS_PRIMARY != status() || req->config.ballot < get_ballot())
    {
        return;
    }

    auto r = _primary_states.group_check_pending_replies.erase(req->node);
    dassert (r == 1, "");

    if (err != ERR_OK)
    {
        handle_remote_failure(req->config.status, req->node, err);
    }
    else
    {
        if (resp->err == ERR_OK)
        {
            if (resp->learner_status_ == learner_status::LearningSucceeded && req->config.status == partition_status::PS_POTENTIAL_SECONDARY)
            {
                handle_learning_succeeded_on_primary(req->node, resp->learner_signature);
            }
        }
        else
        {
            handle_remote_failure(req->config.status, req->node, resp->err);
        }
    }
}
Ejemplo n.º 4
0
void replica::remove(configuration_update_request& proposal)
{
    if (proposal.config.ballot != get_ballot() || status() != PS_PRIMARY)
        return;

    dassert (proposal.config.gpid == _primary_states.membership.gpid, "");
    dassert (proposal.config.app_type == _primary_states.membership.app_type, "");
    dassert (proposal.config.primary == _primary_states.membership.primary, "");
    dassert (proposal.config.secondaries == _primary_states.membership.secondaries, "");

    auto st = _primary_states.get_node_status(proposal.node);

    switch (st)
    {
    case PS_PRIMARY:
        dassert (proposal.config.primary == proposal.node, "");
        proposal.config.primary.set_invalid();
        break;
    case PS_SECONDARY:
        {
        auto rt = replica_helper::remove_node(proposal.node, proposal.config.secondaries);
        dassert (rt, "");
        }
        break;
    case PS_POTENTIAL_SECONDARY:
        break;
    }

    update_configuration_on_meta_server(CT_REMOVE, proposal.node, proposal.config);
}
Ejemplo n.º 5
0
void replica::replay_prepare_list()
{
    decree start = last_committed_decree() + 1;
    decree end = _prepare_list->max_decree();

    ddebug(
            "%s: replay prepare list from %lld to %lld, ballot = %lld",
            name(),
            start,
            end,
            get_ballot()
            );

    for (decree decree = start; decree <= end; decree++)
    {
        mutation_ptr old = _prepare_list->get_mutation_by_decree(decree);
        mutation_ptr mu = new_mutation(decree);

        if (old != nullptr)
        {
            mu->copy_from(old);
        }
        else
        {
            mu->rpc_code = RPC_REPLICATION_WRITE_EMPTY;
            ddebug(
                "%s: emit empty mutation %lld when replay prepare list",
                name(),
                decree
                );
        }

        init_prepare(mu);
    }
}
Ejemplo n.º 6
0
// from primary
void replica::on_remove(const replica_configuration& request)
{ 
    if (request.ballot < get_ballot())
        return;

    dassert (request.status == PS_INACTIVE, "");
    update_local_configuration(request);
}
Ejemplo n.º 7
0
mutation_ptr replica::new_mutation(decree decree)
{
    mutation_ptr mu(new mutation());
    mu->data.header.gpid = get_gpid();
    mu->data.header.ballot = get_ballot();
    mu->data.header.decree = decree;
    mu->data.header.log_offset = invalid_offset;
    return mu;
}
Ejemplo n.º 8
0
bool replica::update_configuration(const partition_configuration& config)
{
    dassert (config.ballot >= get_ballot(), "");
    
    replica_configuration rconfig;
    replica_helper::get_replica_config(config, primary_address(), rconfig);

    if (rconfig.status == PS_PRIMARY &&
        (rconfig.ballot > get_ballot() || status() != PS_PRIMARY)
        )
    {
        _primary_states.reset_membership(config, config.primary != primary_address());
    }

    if (config.ballot > get_ballot() ||
        is_same_ballot_status_change_allowed(status(), rconfig.status)
        )
    {
        return update_local_configuration(rconfig, true);
    }
    else
        return false;
}
Ejemplo n.º 9
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }
}
Ejemplo n.º 10
0
void replica::downgrade_to_secondary_on_primary(configuration_update_request& proposal)
{
    if (proposal.config.ballot != get_ballot() || status() != PS_PRIMARY)
        return;

    dassert (proposal.config.gpid == _primary_states.membership.gpid, "");
    dassert (proposal.config.app_type == _primary_states.membership.app_type, "");
    dassert (proposal.config.primary == _primary_states.membership.primary, "");
    dassert (proposal.config.secondaries == _primary_states.membership.secondaries, "");
    dassert (proposal.node == proposal.config.primary, "");

    proposal.config.primary.set_invalid();
    proposal.config.secondaries.push_back(proposal.node);

    update_configuration_on_meta_server(CT_DOWNGRADE_TO_SECONDARY, proposal.node, proposal.config);
}
Ejemplo n.º 11
0
void replica::ack_prepare_message(error_code err, mutation_ptr& mu)
{
    prepare_ack resp;
    resp.gpid = get_gpid();
    resp.err = err;
    resp.ballot = get_ballot();
    resp.decree = mu->data.header.decree;

    // for PS_POTENTIAL_SECONDARY ONLY
    resp.last_committed_decree_in_app = _app->last_committed_decree(); 
    resp.last_committed_decree_in_prepare_list = last_committed_decree();

    dassert(nullptr != mu->prepare_msg(), "");
    reply(mu->prepare_msg(), resp);

    ddebug("%s: mutation %s ack_prepare_message, err = %s", name(), mu->name(), err.to_string());
}
Ejemplo n.º 12
0
void replica::on_append_log_completed(mutation_ptr& mu, uint32_t err, uint32_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %u", name(), mu->name(), err);

    if (err == ERR_SUCCESS)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_SUCCESS)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_SUCCESS)
        {
            handle_local_failure(err);
        }
        ack_prepare_message(err, mu);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }
}
Ejemplo n.º 13
0
void replica::downgrade_to_inactive_on_primary(configuration_update_request& proposal)
{
    if (proposal.config.ballot != get_ballot() || status() != PS_PRIMARY)
        return;

    dassert (proposal.config.gpid == _primary_states.membership.gpid, "");
    dassert (proposal.config.app_type == _primary_states.membership.app_type, "");
    dassert (proposal.config.primary == _primary_states.membership.primary, "");
    dassert (proposal.config.secondaries == _primary_states.membership.secondaries, "");

    if (proposal.node == proposal.config.primary)
    {
        proposal.config.primary.set_invalid();
    }
    else
    {
        auto rt = replica_helper::remove_node(proposal.node, proposal.config.secondaries);
        dassert (rt, "");
    }

    update_configuration_on_meta_server(CT_DOWNGRADE_TO_INACTIVE, proposal.node, proposal.config);
}
Ejemplo n.º 14
0
        // @ secondary
        void replica::on_copy_checkpoint(const replica_configuration& request, /*out*/ learn_response& response)
        {
            check_hashed_access();

            if (request.ballot > get_ballot())
            {
                if (!update_local_configuration(request))
                {
                    response.err = ERR_INVALID_STATE;
                    return;
                }
            }

            if (status() != PS_SECONDARY)
            {
                response.err = ERR_INVALID_STATE;
                return;
            }

            if (_app->last_durable_decree() == 0)
            {
                response.err = ERR_OBJECT_NOT_FOUND;
                return;
            }

            blob placeholder;
            int err = _app->get_checkpoint(0, placeholder, response.state);
            if (err != 0)
            {
                response.err = ERR_LEARN_FILE_FAILED;
            }
            else
            {
                response.err = ERR_OK;
                response.last_committed_decree = last_committed_decree();
                response.base_local_dir = _app->data_dir();
                response.address = _stub->_primary_address;
            }
        }
Ejemplo n.º 15
0
void replica::add_potential_secondary(configuration_update_request& proposal)
{
    if (status() != PS_PRIMARY)
    {
        return;
    }   

    dassert (proposal.config.ballot == get_ballot(), "");
    dassert (proposal.config.gpid == _primary_states.membership.gpid, "");
    dassert (proposal.config.app_type == _primary_states.membership.app_type, "");
    dassert (proposal.config.primary == _primary_states.membership.primary, "");
    dassert (proposal.config.secondaries == _primary_states.membership.secondaries, "");
    dassert (!_primary_states.check_exist(proposal.node, PS_PRIMARY), "");
    dassert (!_primary_states.check_exist(proposal.node, PS_SECONDARY), "");

    if (_primary_states.learners.find(proposal.node) != _primary_states.learners.end())
    {
        return;
    }

    remote_learner_state state;
    state.prepare_start_decree = invalid_decree;
    state.signature = random64(0, (uint64_t)(-1LL));
    state.timeout_task = nullptr; // TODO: add timer for learner task

    _primary_states.learners[proposal.node] = state;
    _primary_states.statuses[proposal.node] = PS_POTENTIAL_SECONDARY;

    group_check_request request;
    request.app_type = _primary_states.membership.app_type;
    request.node = proposal.node;
    _primary_states.get_replica_config(proposal.node, request.config);
    request.last_committed_decree = last_committed_decree();
    request.learner_signature = state.signature;

    rpc::call_one_way_typed(proposal.node, RPC_LEARN_ADD_LEARNER, request, gpid_to_hash(get_gpid()));
}
Ejemplo n.º 16
0
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_SUCCESS;
    uint8_t count = 0;

    if (static_cast<int>(_primary_states.membership.secondaries.size()) + 1 < _options.mutation_2pc_min_replica_count)
    {
        err = ERR_NOT_ENOUGH_MEMBER;
        goto ErrOut;
    }
            
    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree == invalid_decree)
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }
    else
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }

    if (mu->data.header.decree > _prepare_list->max_decree() && _prepare_list->count() >= _options.staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        goto ErrOut;
    }
 
    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare without log
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_SUCCESS)
    {
        goto ErrOut;
    }
        
    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    //
    // TODO: bounded staleness on secondaries
    //
    dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    
    // remote prepare
    dassert (mu->remote_tasks().size() == 0, "");
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto it = _primary_states.membership.secondaries.begin(); it != _primary_states.membership.secondaries.end(); it++)
    {
        send_prepare_message(*it, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    count = 0;
    for (auto it = _primary_states.learners.begin(); it != _primary_states.learners.end(); it++)
    {
        if (it->second.prepare_start_decree != invalid_decree && mu->data.header.decree >= it->second.prepare_start_decree)
        {
            send_prepare_message(it->first, PS_POTENTIAL_SECONDARY, mu, _options.prepare_timeout_ms_for_potential_secondaries);
            count++;
        }
    }    
    mu->set_left_potential_secondary_ack_count(count);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu, 
            std::placeholders::_1, 
            std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    if (nullptr == mu->log_task())
    {
        err = ERR_FILE_OPERATION_FAILED;
        handle_local_failure(err);
        goto ErrOut;
    }

    return;

ErrOut:
    response_client_message(mu->client_request, err);
    return;
}
Ejemplo n.º 17
0
void replica::on_group_check(const group_check_request& request, /*out*/ group_check_response& response)
{
    check_hashed_access();

    ddebug(
        "%s: process group check, primary = %s, ballot = %" PRId64 ", status = %s, last_committed_decree = %" PRId64,
        name(), request.config.primary.to_string(),
        request.config.ballot, enum_to_string(request.config.status),
        request.last_committed_decree
        );
    
    if (request.config.ballot < get_ballot())
    {
        response.err = ERR_VERSION_OUTDATED;
        dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
        return;
    }
    else if (request.config.ballot > get_ballot())
    {
        if (!update_local_configuration(request.config))
        {
            response.err = ERR_INVALID_STATE;
            dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
            return;
        }
    }
    else if (is_same_ballot_status_change_allowed(status(), request.config.status))
    {
        update_local_configuration(request.config, true);
    }
    
    switch (status())
    {
    case partition_status::PS_INACTIVE:
        break;
    case partition_status::PS_SECONDARY:
        if (request.last_committed_decree > last_committed_decree())
        {
            _prepare_list->commit(request.last_committed_decree, COMMIT_TO_DECREE_HARD);
        }
        break;
    case partition_status::PS_POTENTIAL_SECONDARY:
        init_learn(request.config.learner_signature);
        break;
    case partition_status::PS_ERROR:
        break;
    default:
        dassert (false, "");
    }
    
    response.pid = get_gpid();
    response.node = _stub->_primary_address;
    response.err = ERR_OK;
    if (status() == partition_status::PS_ERROR)
    {
        response.err = ERR_INVALID_STATE;
        dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
    }

    response.last_committed_decree_in_app = _app->last_committed_decree();
    response.last_committed_decree_in_prepare_list = last_committed_decree();
    response.learner_status_ = _potential_secondary_states.learning_status;
    response.learner_signature = _potential_secondary_states.learning_version;
}
Ejemplo n.º 18
0
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_OK;
    uint8_t count = 0;
            
    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree == invalid_decree)
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }
    else
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    
    dinfo("%s: mutation %s init_prepare, mutation_tid=%" PRIu64, name(), mu->name(), mu->tid());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options->staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        goto ErrOut;
    }
 
    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        goto ErrOut;
    }
    
    // remote prepare
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto it = _primary_states.membership.secondaries.begin(); it != _primary_states.membership.secondaries.end(); ++it)
    {
        send_prepare_message(*it, PS_SECONDARY, mu, _options->prepare_timeout_ms_for_secondaries);
    }

    count = 0;
    for (auto it = _primary_states.learners.begin(); it != _primary_states.learners.end(); ++it)
    {
        if (it->second.prepare_start_decree != invalid_decree && mu->data.header.decree >= it->second.prepare_start_decree)
        {
            send_prepare_message(it->first, PS_POTENTIAL_SECONDARY, mu, _options->prepare_timeout_ms_for_potential_secondaries, it->second.signature);
            count++;
        }
    }    
    mu->set_left_potential_secondary_ack_count(count);

    if (mu->is_logged())
    {
        do_possible_commit_on_primary(mu);
    }
    else
    {
        dassert(mu->data.header.log_offset == invalid_offset, "");
        dassert(mu->log_task() == nullptr, "");

        mu->log_task() = _stub->_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            std::bind(&replica::on_append_log_completed, this, mu,
                      std::placeholders::_1,
                      std::placeholders::_2),
                      gpid_to_hash(get_gpid())
            );

        dassert(nullptr != mu->log_task(), "");
    }
    return;

ErrOut:
    for (auto& r : mu->client_requests)
    {
        response_client_message(r, err);
    }
    return;
}
Ejemplo n.º 19
0
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, error_code err, dsn_message_t request, dsn_message_t reply)
{
    check_hashed_access();

    mutation_ptr mu = pr.first;
    partition_status targetStatus = pr.second;

    // skip callback for old mutations
    if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status())
        return;
    
    dassert (mu->data.header.ballot == get_ballot(), "");

    ::dsn::rpc_address node = dsn_msg_to_address(request);
    partition_status st = _primary_states.get_node_status(node);

    // handle reply
    prepare_ack resp;

    // handle error
    if (err != ERR_OK)
    {
        resp.err = err;
    }
    else
    {
        ::unmarshall(reply, resp);
    }
    
    ddebug(
        "%s: mutation %s on_prepare_reply from %s, err = %s",
        name(), mu->name(),
        node.to_string(),
        resp.err.to_string()
        );
       
    if (resp.err == ERR_OK)
    {
        dassert (resp.ballot == get_ballot(), "");
        dassert (resp.decree == mu->data.header.decree, "");

        switch (targetStatus)
        {
        case PS_SECONDARY:
            dassert (_primary_states.check_exist(node, PS_SECONDARY), "");
            dassert (mu->left_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        case PS_POTENTIAL_SECONDARY:            
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        default:
            dwarn(
                "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name()
                );
            break;
        }
    }

    // failure handling
    else
    {
        // retry for INACTIVE state when there are still time
        if (resp.err == ERR_INACTIVE_STATE
            && !mu->is_prepare_close_to_timeout(2, targetStatus == PS_SECONDARY ? 
            _options->prepare_timeout_ms_for_secondaries :
            _options->prepare_timeout_ms_for_potential_secondaries)
            )
        {
            send_prepare_message(node, targetStatus, mu, targetStatus == PS_SECONDARY ?
                _options->prepare_timeout_ms_for_secondaries :
                _options->prepare_timeout_ms_for_potential_secondaries);
            return;
        }

        // make sure this is before any later commit ops
        // because now commit ops may lead to new prepare ops
        // due to replication throttling
        handle_remote_failure(st, node, resp.err);

        // note targetStatus and (curent) status may diff
        if (targetStatus == PS_POTENTIAL_SECONDARY)
        {
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
        }
    }
}
Ejemplo n.º 20
0
void replica::on_update_configuration_on_meta_server_reply(error_code err, dsn_message_t request, dsn_message_t response, std::shared_ptr<configuration_update_request> req)
{
    check_hashed_access();

    if (PS_INACTIVE != status() || _stub->is_connected() == false)
    {
        _primary_states.reconfiguration_task = nullptr;
        err.end_tracking();
        return;
    }

    configuration_update_response resp;
    if (err == ERR_OK)
    {
        ::unmarshall(response, resp);
        err = resp.err;
    }

    if (err != ERR_OK)
    {
        ddebug(
            "%s: update configuration reply with err %s, request ballot %lld",
            name(),
            err.to_string(),
            req->config.ballot
            );

        if (err != ERR_INVALID_VERSION)
        {
            rpc_address target(_stub->_failure_detector->get_servers());
            dsn_msg_add_ref(request); // added for another round of rpc::call
            _primary_states.reconfiguration_task = rpc::call(
                target,
                request,
                this,
                std::bind(&replica::on_update_configuration_on_meta_server_reply, this,
                std::placeholders::_1,
                std::placeholders::_2,
                std::placeholders::_3,
                req),
                gpid_to_hash(get_gpid())
                );
            return;
        }        
    }

    ddebug(
        "%s: update configuration reply with err %s, ballot %lld, local %lld",
        name(),
        resp.err.to_string(),
        resp.config.ballot,
        get_ballot()
        );
    
    if (resp.config.ballot < get_ballot())
    {
        _primary_states.reconfiguration_task = nullptr;
        return;
    }        
    
    // post-update work items?
    if (resp.err == ERR_OK)
    {        
        dassert (req->config.gpid == resp.config.gpid, "");
        dassert (req->config.app_type == resp.config.app_type, "");
        dassert (req->config.primary == resp.config.primary, "");
        dassert (req->config.secondaries == resp.config.secondaries, "");

        switch (req->type)
        {        
        case CT_UPGRADE_TO_PRIMARY:
            _primary_states.last_prepare_decree_on_new_primary = _prepare_list->max_decree();
            break;
        case CT_ASSIGN_PRIMARY:
        case CT_DOWNGRADE_TO_SECONDARY:
        case CT_DOWNGRADE_TO_INACTIVE:
        case CT_UPGRADE_TO_SECONDARY:
            break;
        case CT_REMOVE:
            if (req->node != primary_address())
            {
                replica_configuration rconfig;
                replica_helper::get_replica_config(resp.config, req->node, rconfig);
                rpc::call_one_way_typed(req->node, RPC_REMOVE_REPLICA, rconfig, gpid_to_hash(get_gpid()));
            }
            break;
        default:
            dassert (false, "");
        }
    }
    
    update_configuration(resp.config);
    _primary_states.reconfiguration_task = nullptr;
}
Ejemplo n.º 21
0
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    {
        rpc_read_stream reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    decree decree = mu->data.header.decree;

    dinfo("%s: mutation %s on_prepare", name(), mu->name());

    dassert(mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        derror("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        // no need response because the rpc should have been cancelled on primary in this case
        return;
    }

    // update configuration when necessary
    else if (rconfig.ballot > get_ballot())
    {
        if (!update_local_configuration(rconfig))
        {
            derror(
                "%s: mutation %s on_prepare failed as update local configuration failed, state = %s",
                name(), mu->name(),
                enum_to_string(status())
                );
            ack_prepare_message(ERR_INVALID_STATE, mu);
            return;
        }
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        derror(
            "%s: mutation %s on_prepare failed as invalid replica state, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
            );
        return;
    }
    else if (PS_POTENTIAL_SECONDARY == status())
    {
        // new learning process
        if (rconfig.learner_signature != _potential_secondary_states.learning_signature)
        {
            init_learn(rconfig.learner_signature);
            // no need response as rpc is already gone
            return;
        }

        if (!(_potential_secondary_states.learning_status == LearningWithPrepare
            || _potential_secondary_states.learning_status == LearningSucceeded))
        {
            derror(
                "%s: mutation %s on_prepare skipped as invalid learning status, state = %s, learning_status = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );

            // no need response as rpc is already gone
            return;
        }
    }

    dassert (rconfig.status == status(), "");    
    if (decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }
    
    // real prepare start
    auto mu2 = _prepare_list->get_mutation_by_decree(decree);
    if (mu2 != nullptr && mu2->data.header.ballot == mu->data.header.ballot)
    {
        if (mu2->is_logged())
        {
            ack_prepare_message(ERR_OK, mu);
        }
        else
        {
            derror("%s: mutation %s on_prepare skipped as it is duplicate", name(), mu->name());
            // response will be unnecessary when we add retry logic in rpc engine.
            // the retried rpc will use the same id therefore it will be considered responsed
            // even the response is for a previous try.
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options->max_mutation_count_in_prepare_list, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options->staleness_for_commit, "");
    }

    dassert(mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );
}
Ejemplo n.º 22
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                // 
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                dinfo("%s: append private log completed for mutation %s, size = %u, err = %s",
                      name(), mu->name(), size, err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                           name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
Ejemplo n.º 23
0
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_OK;
    uint8_t count = 0;

    if (static_cast<int>(_primary_states.membership.secondaries.size()) + 1 < _options.mutation_2pc_min_replica_count)
    {
        err = ERR_NOT_ENOUGH_MEMBER;
        goto ErrOut;
    }

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree == invalid_decree)
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }
    else
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }

    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options.staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        goto ErrOut;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        goto ErrOut;
    }

    // remote prepare
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto it = _primary_states.membership.secondaries.begin(); it != _primary_states.membership.secondaries.end(); it++)
    {
        send_prepare_message(*it, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    count = 0;
    for (auto it = _primary_states.learners.begin(); it != _primary_states.learners.end(); it++)
    {
        if (it->second.prepare_start_decree != invalid_decree && mu->data.header.decree >= it->second.prepare_start_decree)
        {
            send_prepare_message(it->first, PS_POTENTIAL_SECONDARY, mu, _options.prepare_timeout_ms_for_potential_secondaries);
            count++;
        }
    }
    mu->set_left_potential_secondary_ack_count(count);

    // it is possible to do commit here when logging is not required for acking prepare.
    // however, it is only possible when replica count == 1 at this moment in the
    // replication group, and we don't want to do this as it is too fragile now.
    // do_possible_commit_on_primary(mu);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
                                         LPC_WRITE_REPLICATION_LOG,
                                         this,
                                         std::bind(&replica::on_append_log_completed, this, mu,
                                                 std::placeholders::_1,
                                                 std::placeholders::_2),
                                         gpid_to_hash(get_gpid())
                                        );

    dassert(nullptr != mu->log_task(), "");
    return;

ErrOut:
    response_client_message(mu->client_msg(), err);
    return;
}
Ejemplo n.º 24
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
Ejemplo n.º 25
0
void replica::on_config_proposal(configuration_update_request& proposal)
{
    check_hashed_access();

    ddebug(
        "%s: on_config_proposal %s for %s", 
        name(),
        enum_to_string(proposal.type),
        proposal.node.to_string()
        );

    if (proposal.config.ballot < get_ballot())
    {
        dwarn(
            "%s: on_config_proposal is out-dated, %lld vs %lld",
            name(),
            proposal.config.ballot,
            get_ballot()
            );
        return;
    }   

    if (_primary_states.reconfiguration_task != nullptr)
    {
        dinfo(
            "%s: reconfiguration on the way, skip the incoming proposal",
            name()
            );
        return;
    }

    if (proposal.config.ballot > get_ballot())
    {
        if (!update_configuration(proposal.config))
        {
            // is closing or update failed
            return;
        }
    }
    
    switch (proposal.type)
    {
    case CT_ASSIGN_PRIMARY:
    case CT_UPGRADE_TO_PRIMARY:
        assign_primary(proposal);
        break;
    case CT_ADD_SECONDARY:
        add_potential_secondary(proposal);
        break;
    case CT_DOWNGRADE_TO_SECONDARY:
        downgrade_to_secondary_on_primary(proposal);
        break;
    case CT_DOWNGRADE_TO_INACTIVE:
        downgrade_to_inactive_on_primary(proposal);
        break;
    case CT_REMOVE:
        remove(proposal);
        break;
    default:
        dassert (false, "");
    }
}
Ejemplo n.º 26
0
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    {
        msg_binary_reader reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    decree decree = mu->data.header.decree;

    ddebug( "%s: mutation %s on_prepare", name(), mu->name());

    dassert (mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        ddebug( "%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // update configuration when necessary
    else if (rconfig.ballot > get_ballot())
    {
        if (!update_local_configuration(rconfig))
        {
            ddebug(
                "%s: mutation %s on_prepare  to %s failed as update local configuration failed",
                name(), mu->name(),
                enum_to_string(status())
            );
            ack_prepare_message(ERR_INVALID_STATE, mu);
            return;
        }
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        ddebug(
            "%s: mutation %s on_prepare  to %s skipped",
            name(), mu->name(),
            enum_to_string(status())
        );
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
        );
        return;
    }

    else if (PS_POTENTIAL_SECONDARY == status())
    {
        if (_potential_secondary_states.learning_status != LearningWithPrepare && _potential_secondary_states.learning_status != LearningSucceeded)
        {
            ddebug(
                "%s: mutation %s on_prepare to %s skipped, learnings state = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
            );

            // do not retry as there may retries later
            return;
        }
    }

    dassert (rconfig.status == status(), "");
    if (decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto mu2 = _prepare_list->get_mutation_by_decree(decree);
    if (mu2 != nullptr && mu2->data.header.ballot == mu->data.header.ballot)
    {
        ddebug( "%s: mutation %s redundant prepare skipped", name(), mu->name());

        if (mu2->is_logged() || _options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_start_prepare_for_potential_secondary, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    }

    // ack without logging
    if (_options.prepare_ack_on_secondary_before_logging_allowed)
    {
        ack_prepare_message(err, mu);
    }

    // write log
    dassert (mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(mu,
                                         LPC_WRITE_REPLICATION_LOG,
                                         this,
                                         std::bind(&replica::on_append_log_completed, this, mu, std::placeholders::_1, std::placeholders::_2),
                                         gpid_to_hash(get_gpid())
                                        );

    dassert(mu->log_task() != nullptr, "");
}
Ejemplo n.º 27
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        ack_prepare_message(err, mu);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }

    // mutation log failure, propagted to all replicas
    if (err != ERR_OK)
    {
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    else if (_private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
                             LPC_WRITE_REPLICATION_LOG,
                             this,
                             [this](error_code err, size_t size)
        {
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
        },
        gpid_to_hash(get_gpid())
                            );
    }
}
Ejemplo n.º 28
0
bool replica::update_local_configuration(const replica_configuration& config, bool same_ballot/* = false*/)
{
    dassert(config.ballot > get_ballot()
        || (same_ballot && config.ballot == get_ballot()), "");
    dassert (config.gpid == get_gpid(), "");

    partition_status old_status = status();
    ballot old_ballot = get_ballot();

    // skip unncessary configuration change
    if (old_status == config.status && old_ballot == config.ballot)
        return true;

    // skip invalid change
    switch (old_status)
    {
    case PS_ERROR:
        {
            ddebug(
                "%s: status change from %s @ %lld to %s @ %lld is not allowed",
                name(),
                enum_to_string(old_status),
                old_ballot,
                enum_to_string(config.status),
                config.ballot
                );
            return false;
        }
        break;
    case PS_INACTIVE:
        if ((config.status == PS_PRIMARY || config.status == PS_SECONDARY)
            && !_inactive_is_transient)
        {
            ddebug(
                "%s: status change from %s @ %lld to %s @ %lld is not allowed when inactive state is not transient",
                name(),
                enum_to_string(old_status),
                old_ballot,
                enum_to_string(config.status),
                config.ballot
                );
            return false;
        }
        break;
    case PS_POTENTIAL_SECONDARY:
        if (config.status == PS_ERROR || config.status == PS_INACTIVE)
        {
            if (!_potential_secondary_states.cleanup(false))
            {
                dwarn(
                    "%s: status change from %s @ %lld to %s @ %lld is not allowed coz learning remote state is still running",
                    name(),
                    enum_to_string(old_status),
                    old_ballot,
                    enum_to_string(config.status),
                    config.ballot
                    );
                return false;
            }
        }
        break;
    case PS_SECONDARY:
        if (config.status != PS_SECONDARY
            && _secondary_states.checkpoint_task != nullptr)
        {
            dwarn(
                "%s: status change from %s @ %lld to %s @ %lld is not allowed coz checkpointing is still running",
                name(),
                enum_to_string(old_status),
                old_ballot,
                enum_to_string(config.status),
                config.ballot
                );
            return false;
        }
        break;
    }

    uint64_t oldTs = _last_config_change_time_ms;
    _config = config;
    _last_config_change_time_ms =now_ms();
    dassert (max_prepared_decree() >= last_committed_decree(), "");
    
    switch (old_status)
    {
    case PS_PRIMARY:
        cleanup_preparing_mutations(true);
        switch (config.status)
        {
        case PS_PRIMARY:
            replay_prepare_list();
            break;
        case PS_INACTIVE:
            _primary_states.cleanup(old_ballot != config.ballot);
            break;
        case PS_SECONDARY:
        case PS_ERROR:
            _primary_states.cleanup();
            break;
        case PS_POTENTIAL_SECONDARY:
            dassert (false, "invalid execution path");
            break;
        default:
            dassert (false, "invalid execution path");
        }        
        break;
    case PS_SECONDARY:
        cleanup_preparing_mutations(true);
        switch (config.status)
        {
        case PS_PRIMARY:
            init_group_check();            
            replay_prepare_list();
            break;
        case PS_SECONDARY:
            break;
        case PS_POTENTIAL_SECONDARY:
            // InActive in config
            break;
        case PS_INACTIVE:
            break;
        case PS_ERROR:
            break;
        default:
            dassert (false, "invalid execution path");
        }
        break;
    case PS_POTENTIAL_SECONDARY:
        switch (config.status)
        {
        case PS_PRIMARY:
            dassert (false, "invalid execution path");
            break;
        case PS_SECONDARY:
            _prepare_list->truncate(_app->last_committed_decree());            
            _potential_secondary_states.cleanup(true);
            check_state_completeness();
            break;
        case PS_POTENTIAL_SECONDARY:
            break;
        case PS_INACTIVE:
            _potential_secondary_states.cleanup(true);
            break;
        case PS_ERROR:
            _prepare_list->reset(_app->last_committed_decree());
            _potential_secondary_states.cleanup(true);
            break;
        default:
            dassert (false, "invalid execution path");
        }
        break;
    case PS_INACTIVE:        
        switch (config.status)
        {
        case PS_PRIMARY:
            _inactive_is_transient = false;
            init_group_check();
            replay_prepare_list();
            break;
        case PS_SECONDARY:            
            _inactive_is_transient = false;
            break;
        case PS_POTENTIAL_SECONDARY:
            _inactive_is_transient = false;
            break;
        case PS_INACTIVE:
            break;
        case PS_ERROR:
            _inactive_is_transient = false;
            break;
        default:
            dassert (false, "invalid execution path");
        }
        break;
    case PS_ERROR:
        switch (config.status)
        {
        case PS_PRIMARY:
            dassert (false, "invalid execution path");
            break;
        case PS_SECONDARY:
            dassert (false, "invalid execution path");
            break;
        case PS_POTENTIAL_SECONDARY:
            dassert(false, "invalid execution path");
            break;
        case PS_INACTIVE:
            dassert (false, "invalid execution path");
            break;
        case PS_ERROR:
            break;
        default:
            dassert (false, "invalid execution path");
        }
        break;
    default:
        dassert (false, "invalid execution path");
    }

    dwarn(
        "%s: status change %s @ %lld => %s @ %lld, pre(%llu, %llu), app(%llu, %llu), duration=%llu ms",
        name(),
        enum_to_string(old_status),
        old_ballot,
        enum_to_string(status()),
        get_ballot(),
        _prepare_list->max_decree(),
        _prepare_list->last_committed_decree(),
        _app->last_committed_decree(),
        _app->last_durable_decree(),
        _last_config_change_time_ms - oldTs
        );

    if (status() != old_status)
    {
        bool isClosing = (status() == PS_ERROR || (status() == PS_INACTIVE && get_ballot() > old_ballot));
        _stub->notify_replica_state_update(config, isClosing);

        if (isClosing)
        {
            ddebug("%s: being close ...", name());
            _stub->begin_close_replica(this);
            return false;
        }
    }
    else
    {
        _stub->notify_replica_state_update(config, false);
    }
    return true;
}
Ejemplo n.º 29
0
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, int err, message_ptr& request, message_ptr& reply)
{
    check_hashed_access();

    mutation_ptr& mu = pr.first;
    partition_status targetStatus = pr.second;

    // skip callback for old mutations
    if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status())
        return;
    
    dassert (mu->data.header.ballot == get_ballot(), "");

    end_point node = request->header().to_address;
    partition_status st = _primary_states.GetNodeStatus(node);

    // handle reply
    prepare_ack resp;

    // handle error
    if (err)
    {
        resp.err = err;
    }
    else
    {
        unmarshall(reply, resp);        

        ddebug( 
            "%s: mutation %s on_prepare_reply from %s:%d", 
            name(), mu->name(),
            node.name.c_str(), static_cast<int>(node.port)
            );
    }
       
    if (resp.err == ERR_SUCCESS)
    {
        dassert (resp.ballot == get_ballot(), "");
        dassert (resp.decree == mu->data.header.decree, "");

        switch (targetStatus)
        {
        case PS_SECONDARY:
            dassert (_primary_states.check_exist(node, PS_SECONDARY), "");
            dassert (mu->left_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        case PS_POTENTIAL_SECONDARY:            
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        default:
            ddebug( 
                "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name()
                );
            break;
        }
    }

    // failure handling
    else
    {
        // note targetStatus and (curent) status may diff
        if (targetStatus == PS_POTENTIAL_SECONDARY)
        {
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
        }
        handle_remote_failure(st, node, resp.err);
    }
}