void replica::on_group_check_reply(error_code err, const std::shared_ptr<group_check_request>& req, const std::shared_ptr<group_check_response>& resp)
{
    check_hashed_access();

    if (partition_status::PS_PRIMARY != status() || req->config.ballot < get_ballot())
    {
        return;
    }

    auto r = _primary_states.group_check_pending_replies.erase(req->node);
    dassert (r == 1, "");

    if (err != ERR_OK)
    {
        handle_remote_failure(req->config.status, req->node, err);
    }
    else
    {
        if (resp->err == ERR_OK)
        {
            if (resp->learner_status_ == learner_status::LearningSucceeded && req->config.status == partition_status::PS_POTENTIAL_SECONDARY)
            {
                handle_learning_succeeded_on_primary(req->node, resp->learner_signature);
            }
        }
        else
        {
            handle_remote_failure(req->config.status, req->node, resp->err);
        }
    }
}
Example #2
0
        void replica::on_checkpoint_completed(error_code err)
        {
            check_hashed_access();

            // closing or wrong timing
            if (PS_SECONDARY != status() || ERR_WRONG_TIMING == err)
            {
                _secondary_states.checkpoint_task = nullptr;
                return;
            } 

            // handle failure
            if (err != ERR_OK)
            {
                // done checkpointing
                _secondary_states.checkpoint_task = nullptr;
                handle_local_failure(err);
                return;
            }

            auto c = _prepare_list->last_committed_decree();

            // missing commits
            if (c > _app->last_committed_decree())
            {
                // missed ones are covered by prepare list
                if (_app->last_committed_decree() > _prepare_list->min_decree())
                {
                    for (auto d = _app->last_committed_decree() + 1; d <= c; d++)
                    {
                        auto mu = _prepare_list->get_mutation_by_decree(d);
                        dassert(nullptr != mu, "");
                        _app->write_internal(mu);
                    }

                    // everything is ok now, done checkpointing
                    _secondary_states.checkpoint_task = nullptr;
                }

                // missed ones need to be loaded via private logs
                else
                {
                    _secondary_states.checkpoint_task = tasking::enqueue(
                        LPC_CHECKPOINT_REPLICA,
                        this,
                        [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
                        gpid_to_hash(get_gpid())
                        );
                }
            }

            // no missing commits
            else
            {
                // everything is ok now, done checkpointing
                _secondary_states.checkpoint_task = nullptr;
            }
        }
Example #3
0
        void replica::on_copy_checkpoint_ack(error_code err, std::shared_ptr<replica_configuration>& req, std::shared_ptr<learn_response>& resp)
        {
            check_hashed_access();

            if (PS_PRIMARY != status())
            {
                _primary_states.checkpoint_task = nullptr;
                return;
            }

            if (err != ERR_OK || resp == nullptr)
            {
                dwarn("%s: copy checkpoint from secondary failed, err = %s", name(), err.to_string());
                _primary_states.checkpoint_task = nullptr;
                return;
            }

            if (resp->err != ERR_OK)
            {
                dinfo("%s: copy checkpoint from secondary failed, err = %s", name(), resp->err.to_string());
                _primary_states.checkpoint_task = nullptr;
                return;
            }

            if (resp->state.to_decree_included <= _app->last_durable_decree())
            {
                dinfo("%s: copy checkpoint from secondary skipped, as its decree is not bigger than current durable_decree: %" PRIu64 " vs %" PRIu64 "",
                    name(), resp->state.to_decree_included, _app->last_durable_decree()
                    );
                _primary_states.checkpoint_task = nullptr;
                return;
            }
                
            std::string ldir = utils::filesystem::path_combine(
                _app->learn_dir(),
                "checkpoint.copy"
                );

            if (utils::filesystem::path_exists(ldir))
                utils::filesystem::remove_path(ldir);

            _primary_states.checkpoint_task = file::copy_remote_files(
                resp->address,
                resp->base_local_dir,
                resp->state.files,
                ldir,
                false,
                LPC_REPLICA_COPY_LAST_CHECKPOINT_DONE,
                this,
                [this, resp](error_code err, size_t sz)
                {
                    this->on_copy_checkpoint_file_completed(err, sz, resp);
                },
                gpid_to_hash(get_gpid())
                );
        }
Example #4
0
        void replica::on_copy_checkpoint_file_completed(error_code err, size_t sz, std::shared_ptr<learn_response> resp)
        {
            check_hashed_access();

            if (PS_PRIMARY == status() && resp->state.to_decree_included > _app->last_durable_decree())
            {
                _app->apply_checkpoint(resp->state, CHKPT_COPY);
            }

            _primary_states.checkpoint_task = nullptr;
        }
Example #5
0
void replica::on_client_write(int code, dsn_message_t request)
{
    check_hashed_access();

    if (PS_PRIMARY != status())
    {
        response_client_message(request, ERR_INVALID_STATE);
        return;
    }

    mutation_ptr mu = new_mutation(_prepare_list->max_decree() + 1);
    mu->set_client_request(code, request);
    init_prepare(mu);
}
Example #6
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        if (!_options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(err, mu);
        }
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }
}
Example #7
0
        void replica::init_checkpoint()
        {
            check_hashed_access();
            
            // only applicable to primary and secondary replicas
            if (status() != PS_PRIMARY && status() != PS_SECONDARY)
                return;

            // no need to checkpoint
            if (_app->is_delta_state_learning_supported())
                return;

            // already running
            if (_secondary_states.checkpoint_task != nullptr)
                return;

            // private log must be enabled to make sure commits
            // are not lost during checkpinting
            dassert(nullptr != _private_log, "log_enable_private_prepare must be true for checkpointing");

            // TODO: when NOT to checkpoint, but use private log replay to build the state
            if (last_committed_decree() - last_durable_decree() < 10000)
                return;

            // primary is downgraded to secondary for checkpointing as no write can be seen
            // during checkpointing (i.e., state is freezed)
            if (PS_PRIMARY == status())
            {
                configuration_update_request proposal;
                proposal.config = _primary_states.membership;
                proposal.type = CT_DOWNGRADE_TO_SECONDARY;
                proposal.node = proposal.config.primary;
                downgrade_to_secondary_on_primary(proposal);
            }

            // secondary can start checkpint in the long running thread pool
            else
            {
                dassert(PS_SECONDARY == status(), "");

                _secondary_states.checkpoint_task = tasking::enqueue(
                    LPC_CHECKPOINT_REPLICA,
                    this,
                    &replica::checkpoint,
                    gpid_to_hash(get_gpid())
                    );
            }
        }
Example #8
0
void replica::on_client_write(int code, dsn_message_t request)
{
    check_hashed_access();

    if (PS_PRIMARY != status())
    {
        response_client_message(request, ERR_INVALID_STATE);
        return;
    }

    auto mu = _primary_states.write_queue.add_work(code, request, this);
    if (mu)
    {
        init_prepare(mu);
    }
}
void replica::init_group_check()
{
    check_hashed_access();

    ddebug("%s: init group check", name());

    if (partition_status::PS_PRIMARY != status() || _options->group_check_disabled)
        return;

    dassert (nullptr == _primary_states.group_check_task, "");
    _primary_states.group_check_task = tasking::enqueue_timer(
        LPC_GROUP_CHECK,
        this,
        [this] {broadcast_group_check();},
        std::chrono::milliseconds(_options->group_check_interval_ms),
        gpid_to_thread_hash(get_gpid())
        );
}
Example #10
0
void replica::on_append_log_completed(mutation_ptr& mu, uint32_t err, uint32_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %u", name(), mu->name(), err);

    if (err == ERR_SUCCESS)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_SUCCESS)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_SUCCESS)
        {
            handle_local_failure(err);
        }
        ack_prepare_message(err, mu);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }
}
Example #11
0
void replica::on_client_write(task_code code, dsn_message_t request)
{
    check_hashed_access();

    if (PS_PRIMARY != status())
    {
        response_client_message(request, ERR_INVALID_STATE);
        return;
    }

    if (static_cast<int>(_primary_states.membership.secondaries.size()) + 1 < _options->mutation_2pc_min_replica_count)
    {
        response_client_message(request, ERR_NOT_ENOUGH_MEMBER);
        return;
    }

    auto mu = _primary_states.write_queue.add_work(code, request, this);
    if (mu)
    {
        init_prepare(mu);
    }
}
Example #12
0
        // @ secondary
        void replica::on_copy_checkpoint(const replica_configuration& request, /*out*/ learn_response& response)
        {
            check_hashed_access();

            if (request.ballot > get_ballot())
            {
                if (!update_local_configuration(request))
                {
                    response.err = ERR_INVALID_STATE;
                    return;
                }
            }

            if (status() != PS_SECONDARY)
            {
                response.err = ERR_INVALID_STATE;
                return;
            }

            if (_app->last_durable_decree() == 0)
            {
                response.err = ERR_OBJECT_NOT_FOUND;
                return;
            }

            blob placeholder;
            int err = _app->get_checkpoint(0, placeholder, response.state);
            if (err != 0)
            {
                response.err = ERR_LEARN_FILE_FAILED;
            }
            else
            {
                response.err = ERR_OK;
                response.last_committed_decree = last_committed_decree();
                response.base_local_dir = _app->data_dir();
                response.address = _stub->_primary_address;
            }
        }
Example #13
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    ddebug( "%s: mutation %s on_append_log_completed, err = %s", name(), mu->name(), err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }

    // skip old mutations
    if (mu->data.header.ballot < get_ballot() || status() == PS_INACTIVE)
    {
        return;
    }

    switch (status())
    {
    case PS_PRIMARY:
        if (err == ERR_OK)
        {
            do_possible_commit_on_primary(mu);
        }
        else
        {
            handle_local_failure(err);
        }
        break;
    case PS_SECONDARY:
    case PS_POTENTIAL_SECONDARY:
        if (err != ERR_OK)
        {
            handle_local_failure(err);
        }

        ack_prepare_message(err, mu);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
        break;
    }

    // mutation log failure, propagted to all replicas
    if (err != ERR_OK)
    {
        _stub->handle_log_failure(err);
    }

    // write local private log if necessary
    else if (_private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
                             LPC_WRITE_REPLICATION_LOG,
                             this,
                             [this](error_code err, size_t size)
        {
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
        },
        gpid_to_hash(get_gpid())
                            );
    }
}
void replica::on_group_check(const group_check_request& request, /*out*/ group_check_response& response)
{
    check_hashed_access();

    ddebug(
        "%s: process group check, primary = %s, ballot = %" PRId64 ", status = %s, last_committed_decree = %" PRId64,
        name(), request.config.primary.to_string(),
        request.config.ballot, enum_to_string(request.config.status),
        request.last_committed_decree
        );
    
    if (request.config.ballot < get_ballot())
    {
        response.err = ERR_VERSION_OUTDATED;
        dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
        return;
    }
    else if (request.config.ballot > get_ballot())
    {
        if (!update_local_configuration(request.config))
        {
            response.err = ERR_INVALID_STATE;
            dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
            return;
        }
    }
    else if (is_same_ballot_status_change_allowed(status(), request.config.status))
    {
        update_local_configuration(request.config, true);
    }
    
    switch (status())
    {
    case partition_status::PS_INACTIVE:
        break;
    case partition_status::PS_SECONDARY:
        if (request.last_committed_decree > last_committed_decree())
        {
            _prepare_list->commit(request.last_committed_decree, COMMIT_TO_DECREE_HARD);
        }
        break;
    case partition_status::PS_POTENTIAL_SECONDARY:
        init_learn(request.config.learner_signature);
        break;
    case partition_status::PS_ERROR:
        break;
    default:
        dassert (false, "");
    }
    
    response.pid = get_gpid();
    response.node = _stub->_primary_address;
    response.err = ERR_OK;
    if (status() == partition_status::PS_ERROR)
    {
        response.err = ERR_INVALID_STATE;
        dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
    }

    response.last_committed_decree_in_app = _app->last_committed_decree();
    response.last_committed_decree_in_prepare_list = last_committed_decree();
    response.learner_status_ = _potential_secondary_states.learning_status;
    response.learner_signature = _potential_secondary_states.learning_version;
}
Example #15
0
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, error_code err, dsn_message_t request, dsn_message_t reply)
{
    check_hashed_access();

    mutation_ptr mu = pr.first;
    partition_status targetStatus = pr.second;

    // skip callback for old mutations
    if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status())
        return;
    
    dassert (mu->data.header.ballot == get_ballot(), "");

    ::dsn::rpc_address node = dsn_msg_to_address(request);
    partition_status st = _primary_states.get_node_status(node);

    // handle reply
    prepare_ack resp;

    // handle error
    if (err != ERR_OK)
    {
        resp.err = err;
    }
    else
    {
        ::unmarshall(reply, resp);
    }
    
    ddebug(
        "%s: mutation %s on_prepare_reply from %s, err = %s",
        name(), mu->name(),
        node.to_string(),
        resp.err.to_string()
        );
       
    if (resp.err == ERR_OK)
    {
        dassert (resp.ballot == get_ballot(), "");
        dassert (resp.decree == mu->data.header.decree, "");

        switch (targetStatus)
        {
        case PS_SECONDARY:
            dassert (_primary_states.check_exist(node, PS_SECONDARY), "");
            dassert (mu->left_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        case PS_POTENTIAL_SECONDARY:            
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        default:
            dwarn(
                "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name()
                );
            break;
        }
    }

    // failure handling
    else
    {
        // retry for INACTIVE state when there are still time
        if (resp.err == ERR_INACTIVE_STATE
            && !mu->is_prepare_close_to_timeout(2, targetStatus == PS_SECONDARY ? 
            _options->prepare_timeout_ms_for_secondaries :
            _options->prepare_timeout_ms_for_potential_secondaries)
            )
        {
            send_prepare_message(node, targetStatus, mu, targetStatus == PS_SECONDARY ?
                _options->prepare_timeout_ms_for_secondaries :
                _options->prepare_timeout_ms_for_potential_secondaries);
            return;
        }

        // make sure this is before any later commit ops
        // because now commit ops may lead to new prepare ops
        // due to replication throttling
        handle_remote_failure(st, node, resp.err);

        // note targetStatus and (curent) status may diff
        if (targetStatus == PS_POTENTIAL_SECONDARY)
        {
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
        }
    }
}
Example #16
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != PS_INACTIVE)
    {
        switch (status())
        {
        case PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case PS_SECONDARY:
        case PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            [this, mu](error_code err, size_t size)
            {
                //
                // DO NOT CHANGE THIS CALLBACK HERE UNLESS
                // YOU FULLY UNDERSTAND WHAT WE DO HERE
                // 
                // AS PRIVATE LOG IS BATCHED, WE ONLY EXECUTE
                // THE FIRST CALLBACK IF THERE IS FAILURE TO
                // NOTIFY FAILURE. ALL OTHER TASKS ARE SIMPLY
                // CANCELLED!!!
                //
                // TODO: we do not need so many callbacks
                //

                dinfo("%s: append private log completed for mutation %s, size = %u, err = %s",
                      name(), mu->name(), size, err.to_string());

                if (err != ERR_OK)
                {
                    derror("%s: append private log failed for mutation %s, err = %s",
                           name(), mu->name(), err.to_string());
                    handle_local_failure(err);
                }
            },
            gpid_to_hash(get_gpid())
            );
    }
}
Example #17
0
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    {
        rpc_read_stream reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    decree decree = mu->data.header.decree;

    dinfo("%s: mutation %s on_prepare", name(), mu->name());

    dassert(mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        derror("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        // no need response because the rpc should have been cancelled on primary in this case
        return;
    }

    // update configuration when necessary
    else if (rconfig.ballot > get_ballot())
    {
        if (!update_local_configuration(rconfig))
        {
            derror(
                "%s: mutation %s on_prepare failed as update local configuration failed, state = %s",
                name(), mu->name(),
                enum_to_string(status())
                );
            ack_prepare_message(ERR_INVALID_STATE, mu);
            return;
        }
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        derror(
            "%s: mutation %s on_prepare failed as invalid replica state, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
            );
        return;
    }
    else if (PS_POTENTIAL_SECONDARY == status())
    {
        // new learning process
        if (rconfig.learner_signature != _potential_secondary_states.learning_signature)
        {
            init_learn(rconfig.learner_signature);
            // no need response as rpc is already gone
            return;
        }

        if (!(_potential_secondary_states.learning_status == LearningWithPrepare
            || _potential_secondary_states.learning_status == LearningSucceeded))
        {
            derror(
                "%s: mutation %s on_prepare skipped as invalid learning status, state = %s, learning_status = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );

            // no need response as rpc is already gone
            return;
        }
    }

    dassert (rconfig.status == status(), "");    
    if (decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }
    
    // real prepare start
    auto mu2 = _prepare_list->get_mutation_by_decree(decree);
    if (mu2 != nullptr && mu2->data.header.ballot == mu->data.header.ballot)
    {
        if (mu2->is_logged())
        {
            ack_prepare_message(ERR_OK, mu);
        }
        else
        {
            derror("%s: mutation %s on_prepare skipped as it is duplicate", name(), mu->name());
            // response will be unnecessary when we add retry logic in rpc engine.
            // the retried rpc will use the same id therefore it will be considered responsed
            // even the response is for a previous try.
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options->max_mutation_count_in_prepare_list, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options->staleness_for_commit, "");
    }

    dassert(mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid())
        );
}
Example #18
0
void replica::on_append_log_completed(mutation_ptr& mu, error_code err, size_t size)
{
    check_hashed_access();

    dinfo("%s: append shared log completed for mutation %s, size = %u, err = %s",
          name(), mu->name(), size, err.to_string());

    if (err == ERR_OK)
    {
        mu->set_logged();
    }
    else
    {
        derror("%s: append shared log failed for mutation %s, err = %s",
               name(), mu->name(), err.to_string());
    }

    // skip old mutations
    if (mu->data.header.ballot >= get_ballot() && status() != partition_status::PS_INACTIVE)
    {
        switch (status())
        {
        case partition_status::PS_PRIMARY:
            if (err == ERR_OK)
            {
                do_possible_commit_on_primary(mu);
            }
            else
            {
                handle_local_failure(err);
            }
            break;
        case partition_status::PS_SECONDARY:
        case partition_status::PS_POTENTIAL_SECONDARY:
            if (err != ERR_OK)
            {
                handle_local_failure(err);
            }
            // always ack
            ack_prepare_message(err, mu);
            break;
        case partition_status::PS_ERROR:
            break;
        default:
            dassert(false, "");
            break;
        }
    }

    if (err != ERR_OK)
    {
        // mutation log failure, propagate to all replicas
        _stub->handle_log_failure(err);
    }
   
    // write local private log if necessary
    if (err == ERR_OK && _private_log && status() != partition_status::PS_ERROR)
    {
        _private_log->append(mu,
            LPC_WRITE_REPLICATION_LOG,
            nullptr,
            nullptr,
            gpid_to_hash(get_gpid())
            );
    }
}
Example #19
0
void replica::on_update_configuration_on_meta_server_reply(error_code err, dsn_message_t request, dsn_message_t response, std::shared_ptr<configuration_update_request> req)
{
    check_hashed_access();

    if (PS_INACTIVE != status() || _stub->is_connected() == false)
    {
        _primary_states.reconfiguration_task = nullptr;
        err.end_tracking();
        return;
    }

    configuration_update_response resp;
    if (err == ERR_OK)
    {
        ::unmarshall(response, resp);
        err = resp.err;
    }

    if (err != ERR_OK)
    {
        ddebug(
            "%s: update configuration reply with err %s, request ballot %lld",
            name(),
            err.to_string(),
            req->config.ballot
            );

        if (err != ERR_INVALID_VERSION)
        {
            rpc_address target(_stub->_failure_detector->get_servers());
            dsn_msg_add_ref(request); // added for another round of rpc::call
            _primary_states.reconfiguration_task = rpc::call(
                target,
                request,
                this,
                std::bind(&replica::on_update_configuration_on_meta_server_reply, this,
                std::placeholders::_1,
                std::placeholders::_2,
                std::placeholders::_3,
                req),
                gpid_to_hash(get_gpid())
                );
            return;
        }        
    }

    ddebug(
        "%s: update configuration reply with err %s, ballot %lld, local %lld",
        name(),
        resp.err.to_string(),
        resp.config.ballot,
        get_ballot()
        );
    
    if (resp.config.ballot < get_ballot())
    {
        _primary_states.reconfiguration_task = nullptr;
        return;
    }        
    
    // post-update work items?
    if (resp.err == ERR_OK)
    {        
        dassert (req->config.gpid == resp.config.gpid, "");
        dassert (req->config.app_type == resp.config.app_type, "");
        dassert (req->config.primary == resp.config.primary, "");
        dassert (req->config.secondaries == resp.config.secondaries, "");

        switch (req->type)
        {        
        case CT_UPGRADE_TO_PRIMARY:
            _primary_states.last_prepare_decree_on_new_primary = _prepare_list->max_decree();
            break;
        case CT_ASSIGN_PRIMARY:
        case CT_DOWNGRADE_TO_SECONDARY:
        case CT_DOWNGRADE_TO_INACTIVE:
        case CT_UPGRADE_TO_SECONDARY:
            break;
        case CT_REMOVE:
            if (req->node != primary_address())
            {
                replica_configuration rconfig;
                replica_helper::get_replica_config(resp.config, req->node, rconfig);
                rpc::call_one_way_typed(req->node, RPC_REMOVE_REPLICA, rconfig, gpid_to_hash(get_gpid()));
            }
            break;
        default:
            dassert (false, "");
        }
    }
    
    update_configuration(resp.config);
    _primary_states.reconfiguration_task = nullptr;
}
Example #20
0
void replica::on_config_proposal(configuration_update_request& proposal)
{
    check_hashed_access();

    ddebug(
        "%s: on_config_proposal %s for %s", 
        name(),
        enum_to_string(proposal.type),
        proposal.node.to_string()
        );

    if (proposal.config.ballot < get_ballot())
    {
        dwarn(
            "%s: on_config_proposal is out-dated, %lld vs %lld",
            name(),
            proposal.config.ballot,
            get_ballot()
            );
        return;
    }   

    if (_primary_states.reconfiguration_task != nullptr)
    {
        dinfo(
            "%s: reconfiguration on the way, skip the incoming proposal",
            name()
            );
        return;
    }

    if (proposal.config.ballot > get_ballot())
    {
        if (!update_configuration(proposal.config))
        {
            // is closing or update failed
            return;
        }
    }
    
    switch (proposal.type)
    {
    case CT_ASSIGN_PRIMARY:
    case CT_UPGRADE_TO_PRIMARY:
        assign_primary(proposal);
        break;
    case CT_ADD_SECONDARY:
        add_potential_secondary(proposal);
        break;
    case CT_DOWNGRADE_TO_SECONDARY:
        downgrade_to_secondary_on_primary(proposal);
        break;
    case CT_DOWNGRADE_TO_INACTIVE:
        downgrade_to_inactive_on_primary(proposal);
        break;
    case CT_REMOVE:
        remove(proposal);
        break;
    default:
        dassert (false, "");
    }
}
Example #21
0
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    {
        msg_binary_reader reader(request);
        unmarshall(reader, rconfig);
        mu = mutation::read_from(reader, request);
    }

    decree decree = mu->data.header.decree;

    ddebug( "%s: mutation %s on_prepare", name(), mu->name());

    dassert (mu->data.header.ballot == rconfig.ballot, "");

    if (mu->data.header.ballot < get_ballot())
    {
        ddebug( "%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        return;
    }

    // update configuration when necessary
    else if (rconfig.ballot > get_ballot())
    {
        if (!update_local_configuration(rconfig))
        {
            ddebug(
                "%s: mutation %s on_prepare  to %s failed as update local configuration failed",
                name(), mu->name(),
                enum_to_string(status())
            );
            ack_prepare_message(ERR_INVALID_STATE, mu);
            return;
        }
    }

    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        ddebug(
            "%s: mutation %s on_prepare  to %s skipped",
            name(), mu->name(),
            enum_to_string(status())
        );
        ack_prepare_message(
            (PS_INACTIVE == status() && _inactive_is_transient) ? ERR_INACTIVE_STATE : ERR_INVALID_STATE,
            mu
        );
        return;
    }

    else if (PS_POTENTIAL_SECONDARY == status())
    {
        if (_potential_secondary_states.learning_status != LearningWithPrepare && _potential_secondary_states.learning_status != LearningSucceeded)
        {
            ddebug(
                "%s: mutation %s on_prepare to %s skipped, learnings state = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
            );

            // do not retry as there may retries later
            return;
        }
    }

    dassert (rconfig.status == status(), "");
    if (decree <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto mu2 = _prepare_list->get_mutation_by_decree(decree);
    if (mu2 != nullptr && mu2->data.header.ballot == mu->data.header.ballot)
    {
        ddebug( "%s: mutation %s redundant prepare skipped", name(), mu->name());

        if (mu2->is_logged() || _options.prepare_ack_on_secondary_before_logging_allowed)
        {
            ack_prepare_message(ERR_OK, mu);
        }
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    if (PS_POTENTIAL_SECONDARY == status())
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_start_prepare_for_potential_secondary, "");
    }
    else
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options.staleness_for_commit, "");
    }

    // ack without logging
    if (_options.prepare_ack_on_secondary_before_logging_allowed)
    {
        ack_prepare_message(err, mu);
    }

    // write log
    dassert (mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(mu,
                                         LPC_WRITE_REPLICATION_LOG,
                                         this,
                                         std::bind(&replica::on_append_log_completed, this, mu, std::placeholders::_1, std::placeholders::_2),
                                         gpid_to_hash(get_gpid())
                                        );

    dassert(mu->log_task() != nullptr, "");
}
Example #22
0
void replica::on_prepare_reply(std::pair<mutation_ptr, partition_status> pr, int err, message_ptr& request, message_ptr& reply)
{
    check_hashed_access();

    mutation_ptr& mu = pr.first;
    partition_status targetStatus = pr.second;

    // skip callback for old mutations
    if (mu->data.header.ballot < get_ballot() || PS_PRIMARY != status())
        return;
    
    dassert (mu->data.header.ballot == get_ballot(), "");

    end_point node = request->header().to_address;
    partition_status st = _primary_states.GetNodeStatus(node);

    // handle reply
    prepare_ack resp;

    // handle error
    if (err)
    {
        resp.err = err;
    }
    else
    {
        unmarshall(reply, resp);        

        ddebug( 
            "%s: mutation %s on_prepare_reply from %s:%d", 
            name(), mu->name(),
            node.name.c_str(), static_cast<int>(node.port)
            );
    }
       
    if (resp.err == ERR_SUCCESS)
    {
        dassert (resp.ballot == get_ballot(), "");
        dassert (resp.decree == mu->data.header.decree, "");

        switch (targetStatus)
        {
        case PS_SECONDARY:
            dassert (_primary_states.check_exist(node, PS_SECONDARY), "");
            dassert (mu->left_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        case PS_POTENTIAL_SECONDARY:            
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
            break;
        default:
            ddebug( 
                "%s: mutation %s prepare ack skipped coz the node is now inactive", name(), mu->name()
                );
            break;
        }
    }

    // failure handling
    else
    {
        // note targetStatus and (curent) status may diff
        if (targetStatus == PS_POTENTIAL_SECONDARY)
        {
            dassert (mu->left_potential_secondary_ack_count() > 0, "");
            if (0 == mu->decrease_left_potential_secondary_ack_count())
            {
                do_possible_commit_on_primary(mu);
            }
        }
        handle_remote_failure(st, node, resp.err);
    }
}
Example #23
0
 void replica::on_checkpoint_timer()
 {
     check_hashed_access();
     init_checkpoint();
     garbage_collection();
 }
Example #24
0
        void replica::on_checkpoint_completed(error_code err)
        {
            check_hashed_access();

            // closing or wrong timing or no need operate
            if (PS_SECONDARY != status() || err == ERR_WRONG_TIMING || err == ERR_NO_NEED_OPERATE)
            {
                _secondary_states.checkpoint_is_running = false;
                return;
            } 

            // handle failure
            if (err != ERR_OK)
            {
                // done checkpointing
                _secondary_states.checkpoint_is_running = false;
                handle_local_failure(err);
                return;
            }

            auto c = _prepare_list->last_committed_decree();

            // missing commits
            if (c > _app->last_committed_decree())
            {
                // missed ones are covered by prepare list
                if (_app->last_committed_decree() > _prepare_list->min_decree())
                {
                    for (auto d = _app->last_committed_decree() + 1; d <= c; d++)
                    {
                        auto mu = _prepare_list->get_mutation_by_decree(d);
                        dassert(nullptr != mu, "");
                        err = _app->write_internal(mu);
                        if (ERR_OK != err)
                        {
                            _secondary_states.checkpoint_is_running = false;
                            handle_local_failure(err);
                            return;
                        }
                    }

                    // everything is ok now, done checkpointing
                    _secondary_states.checkpoint_is_running = false;
                }

                // missed ones need to be loaded via private logs
                else
                {
                    tasking::enqueue(
                        &_secondary_states.catchup_with_private_log_task,
                        LPC_CATCHUP_WITH_PRIVATE_LOGS,
                        this,
                        [this]() { this->catch_up_with_private_logs(PS_SECONDARY); },
                        gpid_to_hash(get_gpid())
                        );
                }
            }

            // no missing commits
            else
            {
                // everything is ok now, done checkpointing
                _secondary_states.checkpoint_is_running = false;
            }
        }