Beispiel #1
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }
}
Beispiel #2
0
void replica::on_append_log_completed(mutation_ptr& mu, uint32_t err, uint32_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %u", name(), mu->name(), err);

    if (err == ERR_SUCCESS)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_SUCCESS)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_SUCCESS)
        {
            handle_local_failure(err);
        }
        ack_prepare_message(err, mu);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }
}
Beispiel #3
0
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_OK;
    uint8_t count = 0;
            
    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree == invalid_decree)
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }
    else
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    
    dinfo("%s: mutation %s init_prepare, mutation_tid=%" PRIu64, name(), mu->name(), mu->tid());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options->staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        goto ErrOut;
    }
 
    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        goto ErrOut;
    }
    
    // remote prepare
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto it = _primary_states.membership.secondaries.begin(); it != _primary_states.membership.secondaries.end(); ++it)
    {
        send_prepare_message(*it, PS_SECONDARY, mu, _options->prepare_timeout_ms_for_secondaries);
    }

    count = 0;
    for (auto it = _primary_states.learners.begin(); it != _primary_states.learners.end(); ++it)
    {
        if (it->second.prepare_start_decree != invalid_decree && mu->data.header.decree >= it->second.prepare_start_decree)
        {
            send_prepare_message(it->first, PS_POTENTIAL_SECONDARY, mu, _options->prepare_timeout_ms_for_potential_secondaries, it->second.signature);
            count++;
        }
    }    
    mu->set_left_potential_secondary_ack_count(count);

    if (mu->is_logged())
    {
        do_possible_commit_on_primary(mu);
    }
    else
    {
        dassert(mu->data.header.log_offset == invalid_offset, "");
        dassert(mu->log_task() == nullptr, "");

        mu->log_task() = _stub->_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            this,
            std::bind(&replica::on_append_log_completed, this, mu,
                      std::placeholders::_1,
                      std::placeholders::_2),
                      gpid_to_hash(get_gpid())
            );

        dassert(nullptr != mu->log_task(), "");
    }
    return;

ErrOut:
    for (auto& r : mu->client_requests)
    {
        response_client_message(r, err);
    }
    return;
}
Beispiel #4
0
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, error_code err, dsn_message_t request, dsn_message_t reply)
{
    check_hashed_access();

    mutation_ptr mu = pr.first;
    partition_status targetStatus = pr.second;

    // skip callback for old mutations
    if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status())
        return;
    
    dassert (mu->data.header.ballot == get_ballot(), "");

    ::dsn::rpc_address node = dsn_msg_to_address(request);
    partition_status st = _primary_states.get_node_status(node);

    // handle reply
    prepare_ack resp;

    // handle error
    if (err != ERR_OK)
    {
        resp.err = err;
    }
    else
    {
        ::unmarshall(reply, resp);
    }
    
    ddebug(
        "%s: mutation %s on_prepare_reply from %s, err = %s",
        name(), mu->name(),
        node.to_string(),
        resp.err.to_string()
        );
       
    if (resp.err == ERR_OK)
    {
        dassert (resp.ballot == get_ballot(), "");
        dassert (resp.decree == mu->data.header.decree, "");

        switch (targetStatus)
        {
        case PS_SECONDARY:
            dassert (_primary_states.check_exist(node, PS_SECONDARY), "");
            dassert (mu->left_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        case PS_POTENTIAL_SECONDARY:            
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        default:
            dwarn(
                "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name()
                );
            break;
        }
    }

    // failure handling
    else
    {
        // retry for INACTIVE state when there are still time
        if (resp.err == ERR_INACTIVE_STATE
            && !mu->is_prepare_close_to_timeout(2, targetStatus == PS_SECONDARY ? 
            _options->prepare_timeout_ms_for_secondaries :
            _options->prepare_timeout_ms_for_potential_secondaries)
            )
        {
            send_prepare_message(node, targetStatus, mu, targetStatus == PS_SECONDARY ?
                _options->prepare_timeout_ms_for_secondaries :
                _options->prepare_timeout_ms_for_potential_secondaries);
            return;
        }

        // make sure this is before any later commit ops
        // because now commit ops may lead to new prepare ops
        // due to replication throttling
        handle_remote_failure(st, node, resp.err);

        // note targetStatus and (curent) status may diff
        if (targetStatus == PS_POTENTIAL_SECONDARY)
        {
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
        }
    }
}
Beispiel #5
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                // 
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                dinfo("%s: append private log completed for mutation %s, size = %u, err = %s",
                      name(), mu->name(), size, err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                           name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
Beispiel #6
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
Beispiel #7
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        ack_prepare_message(err, mu);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }

    // mutation log failure, propagted to all replicas
    if (err != ERR_OK)
    {
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    else if (_private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
                             LPC_WRITE_REPLICATION_LOG,
                             this,
                             [this](error_code err, size_t size)
        {
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
        },
        gpid_to_hash(get_gpid())
                            );
    }
}
Beispiel #8
0
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, int err, message_ptr& request, message_ptr& reply)
{
    check_hashed_access();

    mutation_ptr& mu = pr.first;
    partition_status targetStatus = pr.second;

    // skip callback for old mutations
    if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status())
        return;
    
    dassert (mu->data.header.ballot == get_ballot(), "");

    end_point node = request->header().to_address;
    partition_status st = _primary_states.GetNodeStatus(node);

    // handle reply
    prepare_ack resp;

    // handle error
    if (err)
    {
        resp.err = err;
    }
    else
    {
        unmarshall(reply, resp);        

        ddebug( 
            "%s: mutation %s on_prepare_reply from %s:%d", 
            name(), mu->name(),
            node.name.c_str(), static_cast<int>(node.port)
            );
    }
       
    if (resp.err == ERR_SUCCESS)
    {
        dassert (resp.ballot == get_ballot(), "");
        dassert (resp.decree == mu->data.header.decree, "");

        switch (targetStatus)
        {
        case PS_SECONDARY:
            dassert (_primary_states.check_exist(node, PS_SECONDARY), "");
            dassert (mu->left_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        case PS_POTENTIAL_SECONDARY:            
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        default:
            ddebug( 
                "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name()
                );
            break;
        }
    }

    // failure handling
    else
    {
        // note targetStatus and (curent) status may diff
        if (targetStatus == PS_POTENTIAL_SECONDARY)
        {
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
        }
        handle_remote_failure(st, node, resp.err);
    }
}