Пример #1
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }
}
Пример #2
0
        void replica::on_checkpoint_completed(error_code err)
        {
            check_hashed_access();

            // closing or wrong timing
            if (PS_SECONDARY != status() || ERR_WRONG_TIMING == err)
            {
                _secondary_states.checkpoint_task = nullptr;
                return;
            } 

            // handle failure
            if (err != ERR_OK)
            {
                // done checkpointing
                _secondary_states.checkpoint_task = nullptr;
                handle_local_failure(err);
                return;
            }

            auto c = _prepare_list->last_committed_decree();

            // missing commits
            if (c > _app->last_committed_decree())
            {
                // missed ones are covered by prepare list
                if (_app->last_committed_decree() > _prepare_list->min_decree())
                {
                    for (auto d = _app->last_committed_decree() + 1; d <= c; d++)
                    {
                        auto mu = _prepare_list->get_mutation_by_decree(d);
                        dassert(nullptr != mu, "");
                        _app->write_internal(mu);
                    }

                    // everything is ok now, done checkpointing
                    _secondary_states.checkpoint_task = nullptr;
                }

                // missed ones need to be loaded via private logs
                else
                {
                    _secondary_states.checkpoint_task = tasking::enqueue(
                        LPC_CHECKPOINT_REPLICA,
                        this,
                        [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
                        gpid_to_hash(get_gpid())
                        );
                }
            }

            // no missing commits
            else
            {
                // everything is ok now, done checkpointing
                _secondary_states.checkpoint_task = nullptr;
            }
        }
Пример #3
0
void replica::on_append_log_completed(mutation_ptr& mu, uint32_t err, uint32_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %u", name(), mu->name(), err);

    if (err == ERR_SUCCESS)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_SUCCESS)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_SUCCESS)
        {
            handle_local_failure(err);
        }
        ack_prepare_message(err, mu);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }
}
Пример #4
0
void replica::execute_mutation(mutation_ptr& mu)
{
    dassert (nullptr != _app, "");

    int err = ERR_SUCCESS;
    switch (status())
    {
    case PS_INACTIVE:
        if (_app->last_committed_decree() + 1 == mu->data.header.decree)
            err = _app->write_internal(mu, false);
        break;
    case PS_PRIMARY:
    case PS_SECONDARY:
        {
        dassert (_app->last_committed_decree() + 1 == mu->data.header.decree, "");
        bool ack_client = (status() == PS_PRIMARY);
        if (ack_client)
        {
            if (mu->client_request == nullptr)
                ack_client = false;
            else if (mu->client_request->header().from_address.ip == 0)
                ack_client = false;
        }
        err = _app->write_internal(mu, ack_client); 
        }
        break;
    case PS_POTENTIAL_SECONDARY:
        if (LearningSucceeded == _potential_secondary_states.learning_status)
        {
            if (mu->data.header.decree == _app->last_committed_decree() + 1)
            {
                err = _app->write_internal(mu, false); 
            }
            else
            {
                dassert (mu->data.header.decree <= _app->last_committed_decree(), "");
            }
        }
        else
        {
            // drop mutations as learning will catch up
            ddebug("%s: mutation %s skipped coz learing buffer overflow", name(), mu->name());
        }
        break;
    case PS_ERROR:
        break;
    }
    
    ddebug("TwoPhaseCommit, %s: mutation %s committed, err = %x", name(), mu->name(), err);

    if (err != ERR_SUCCESS)
    {
        handle_local_failure(err);
    }
}
Пример #5
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                // 
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                dinfo("%s: append private log completed for mutation %s, size = %u, err = %s",
                      name(), mu->name(), size, err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                           name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
Пример #6
0
void replica::execute_mutation(mutation_ptr& mu)
{
    dinfo("%s: execute mutation %s: request_count = %u",
        name(), 
        mu->name(), 
        static_cast<int>(mu->client_requests.size())
        );

    error_code err = ERR_OK;
    decree d = mu->data.header.decree;

    switch (status())
    {
    case partition_status::PS_INACTIVE:
        if (_app->last_committed_decree() + 1 == d)
        {
            err = _app->write_internal(mu);
        }
        else
        {
            ddebug(
                "%s: mutation %s commit to %s skipped, app.last_committed_decree = %" PRId64,
                name(), mu->name(),
                enum_to_string(status()),
                _app->last_committed_decree()
                );
        }
        break;
    case partition_status::PS_PRIMARY:
        {
            check_state_completeness();
            dassert(_app->last_committed_decree() + 1 == d, "");
            err = _app->write_internal(mu);
        }
        break;

    case partition_status::PS_SECONDARY:
        if (!_secondary_states.checkpoint_is_running)
        {
            check_state_completeness();
            dassert (_app->last_committed_decree() + 1 == d, "");
            err = _app->write_internal(mu);
        }
        else
        {
            ddebug(
                "%s: mutation %s commit to %s skipped, app.last_committed_decree = %" PRId64,
                name(), mu->name(),
                enum_to_string(status()),
                _app->last_committed_decree()
                );

            // make sure private log saves the state
            // catch-up will be done later after checkpoint task is fininished
            dassert(_private_log != nullptr, "");          
        }
        break;
    case partition_status::PS_POTENTIAL_SECONDARY:
        if (_potential_secondary_states.learning_status == learner_status::LearningSucceeded ||
            _potential_secondary_states.learning_status == learner_status::LearningWithPrepareTransient)
        {
            dassert(_app->last_committed_decree() + 1 == d, "");
            err = _app->write_internal(mu);
        }
        else
        {
            // prepare also happens with learner_status::LearningWithPrepare, in this case
            // make sure private log saves the state,
            // catch-up will be done later after the checkpoint task is finished

            ddebug(
                "%s: mutation %s commit to %s skipped, app.last_committed_decree = %" PRId64,
                name(), mu->name(),
                enum_to_string(status()),
                _app->last_committed_decree()
                );
        }
        break;
    case partition_status::PS_ERROR:
        break;
    }
    
    ddebug("TwoPhaseCommit, %s: mutation %s committed, err = %s", name(), mu->name(), err.to_string());

    _counter_commit_latency.set(dsn_now_ns() - mu->create_ts_ns());

    if (err != ERR_OK)
    {
        handle_local_failure(err);
    }

    if (status() == partition_status::PS_PRIMARY)
    {
        mutation_ptr next = _primary_states.write_queue.check_possible_work(
            static_cast<int>(_prepare_list->max_decree() - d)
            );

        if (next)
        {
            init_prepare(next);
        }
    }
}
Пример #7
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
Пример #8
0
        void replica::on_checkpoint_completed(error_code err)
        {
            check_hashed_access();

            // closing or wrong timing or no need operate
            if (PS_SECONDARY != status() || err == ERR_WRONG_TIMING || err == ERR_NO_NEED_OPERATE)
            {
                _secondary_states.checkpoint_is_running = false;
                return;
            } 

            // handle failure
            if (err != ERR_OK)
            {
                // done checkpointing
                _secondary_states.checkpoint_is_running = false;
                handle_local_failure(err);
                return;
            }

            auto c = _prepare_list->last_committed_decree();

            // missing commits
            if (c > _app->last_committed_decree())
            {
                // missed ones are covered by prepare list
                if (_app->last_committed_decree() > _prepare_list->min_decree())
                {
                    for (auto d = _app->last_committed_decree() + 1; d <= c; d++)
                    {
                        auto mu = _prepare_list->get_mutation_by_decree(d);
                        dassert(nullptr != mu, "");
                        err = _app->write_internal(mu);
                        if (ERR_OK != err)
                        {
                            _secondary_states.checkpoint_is_running = false;
                            handle_local_failure(err);
                            return;
                        }
                    }

                    // everything is ok now, done checkpointing
                    _secondary_states.checkpoint_is_running = false;
                }

                // missed ones need to be loaded via private logs
                else
                {
                    tasking::enqueue(
                        &_secondary_states.catchup_with_private_log_task,
                        LPC_CATCHUP_WITH_PRIVATE_LOGS,
                        this,
                        [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
                        gpid_to_hash(get_gpid())
                        );
                }
            }

            // no missing commits
            else
            {
                // everything is ok now, done checkpointing
                _secondary_states.checkpoint_is_running = false;
            }
        }
Пример #9
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        ack_prepare_message(err, mu);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }

    // mutation log failure, propagted to all replicas
    if (err != ERR_OK)
    {
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    else if (_private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
                             LPC_WRITE_REPLICATION_LOG,
                             this,
                             [this](error_code err, size_t size)
        {
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
        },
        gpid_to_hash(get_gpid())
                            );
    }
}
Пример #10
0
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_SUCCESS;
    uint8_t count = 0;

    if (static_cast<int>(_primary_states.membership.secondaries.size()) + 1 < _options.mutation_2pc_min_replica_count)
    {
        err = ERR_NOT_ENOUGH_MEMBER;
        goto ErrOut;
    }
            
    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree == invalid_decree)
    {
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }
    else
    {
        mu->set_id(get_ballot(), mu->data.header.decree);
    }

    if (mu->data.header.decree > _prepare_list->max_decree() && _prepare_list->count() >= _options.staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        goto ErrOut;
    }
 
    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare without log
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_SUCCESS)
    {
        goto ErrOut;
    }
        
    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    //
    // TODO: bounded staleness on secondaries
    //
    dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    
    // remote prepare
    dassert (mu->remote_tasks().size() == 0, "");
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto it = _primary_states.membership.secondaries.begin(); it != _primary_states.membership.secondaries.end(); it++)
    {
        send_prepare_message(*it, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    count = 0;
    for (auto it = _primary_states.learners.begin(); it != _primary_states.learners.end(); it++)
    {
        if (it->second.prepare_start_decree != invalid_decree && mu->data.header.decree >= it->second.prepare_start_decree)
        {
            send_prepare_message(it->first, PS_POTENTIAL_SECONDARY, mu, _options.prepare_timeout_ms_for_potential_secondaries);
            count++;
        }
    }    
    mu->set_left_potential_secondary_ack_count(count);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu, 
            std::placeholders::_1, 
            std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    if (nullptr == mu->log_task())
    {
        err = ERR_FILE_OPERATION_FAILED;
        handle_local_failure(err);
        goto ErrOut;
    }

    return;

ErrOut:
    response_client_message(mu->client_request, err);
    return;
}
Пример #11
0
void replica::on_prepare(message_ptr& request)
{
    check_hashed_access();

    replica_configuration rconfig;
    unmarshall(request, rconfig);    

    mutation_ptr mu = mutation::read_from(request);
    decree decree = mu->data.header.decree;

    ddebug( "%s: mutation %s on_prepare", name(), mu->name());

    dassert (mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        ddebug( "%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // update configuration when necessary
    else if (rconfig.ballot > get_ballot())
    {
        update_local_configuration(rconfig);
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        ddebug( 
            "%s: mutation %s on_prepare  to %s skipped",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(ERR_INVALID_STATE, mu);
        return;
    }

    else if (PS_POTENTIAL_SECONDARY == status())
    {
        if (_potential_secondary_states.learning_status != LearningWithPrepare && _potential_secondary_states.learning_status != LearningSucceeded)
        {
            ddebug( 
                "%s: mutation %s on_prepare to %s skipped, learnings state = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );

            // do not retry as there may retries later
            return;
        }
    }

    dassert (rconfig.status == status(), "");    
    if (decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_SUCCESS, mu);
        return;
    }
    
    // real prepare start
    auto mu2 = _prepare_list->get_mutation_by_decree(decree);
    if (mu2 != nullptr && mu2->data.header.ballot == mu->data.header.ballot)
    {
        ddebug( "%s: mutation %s redundant prepare skipped", name(), mu->name());

        if (mu2->is_prepared())
        {
            ack_prepare_message(ERR_SUCCESS, mu);
        }
        return;
    }

    int err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_SUCCESS, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_start_prepare_for_potential_secondary, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    }
    
    // write log
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu, std::placeholders::_1, std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );

    if (nullptr == mu->log_task())
    {
        err = ERR_FILE_OPERATION_FAILED;
        ack_prepare_message(err, mu);
        handle_local_failure(err);
    }
}