Ejemplo n.º 1
0
        // Learning helper: serializes the current counters into a single buffer.
        //
        // Written in order: a magic marker, last_committed_decree(), the number of
        // counters, then every (name, value) pair. The buffer is appended to
        // state.meta and the function returns 0.
        // `start` and `learn_request` are not read by this implementation.
        int counter_service_impl::get_learn_state(decree start, const blob& learn_request, /*out*/ learn_state& state)
        {
            ::dsn::binary_writer serializer;

            // _lock is held for the rest of the function
            service::zauto_lock guard(_lock);

            int magic = 0xdeadbeef;
            serializer.write(magic);
            serializer.write(last_committed_decree());

            dassert(last_committed_decree() >= 0, "");

            int count = static_cast<int>(_counters.size());
            serializer.write(count);

            for (auto& entry : _counters)
            {
                serializer.write(entry.first);
                serializer.write(entry.second);
            }

            // wrap the written region as a blob: shared storage + offset of the data within it + length
            auto written = serializer.get_buffer();
            auto storage = written.buffer();
            const int offset = static_cast<int>(written.data() - written.buffer().get());

            state.meta.push_back(blob(storage, offset, written.length()));
            return 0;
        }
Ejemplo n.º 2
0
        // Writes all counters to "<data_dir>/checkpoint.<last committed decree>".
        //
        // File layout (raw bytes, host byte order): a uint32 counter count, then for
        // every counter a uint32 name length, the name bytes, and sizeof(int32_t)
        // bytes of the value.
        //
        // Returns 0 on success, including when the last committed decree is already
        // durable. Returns -1 when the file could not be opened or written; the last
        // durable decree is then left untouched so unsaved state is never reported
        // as persisted.
        // NOTE(review): -1 is used because no dedicated failure constant is visible
        // in this block - substitute the project's file-error code if one exists.
        int counter_service_impl::checkpoint()
        {
            service::zauto_lock l(_lock);

            if (last_committed_decree() == last_durable_decree())
            {
                ddebug("counter_service_impl create checkpoint succeed, checkpoint already the latest, last_durable_decree = %" PRId64 "", _last_durable_decree.load());
                return 0;
            }

            // TODO: should use async write instead
            // The path is built as a std::string: formatting it with sprintf into a
            // fixed 256-byte array overflows for a sufficiently long data directory.
            const std::string name = std::string(data_dir().c_str()) + "/checkpoint."
                + std::to_string(static_cast<long long>(last_committed_decree()));

            // Binary mode: the payload is raw bytes and must not be newline-translated.
            std::ofstream os(name.c_str(), std::ios::binary);

            const uint32_t count = static_cast<uint32_t>(_counters.size());
            os.write(reinterpret_cast<const char*>(&count), sizeof(count));

            for (const auto& kv : _counters)
            {
                const std::string& k = kv.first;
                const uint32_t sz = static_cast<uint32_t>(k.length());

                os.write(reinterpret_cast<const char*>(&sz), sizeof(sz));
                os.write(k.data(), sz);
                os.write(reinterpret_cast<const char*>(&kv.second), sizeof(int32_t));
            }

            // Close (and thereby flush) before declaring the checkpoint durable so a
            // failed open or write is noticed here instead of being dropped silently.
            os.close();
            if (!os)
            {
                ddebug("counter_service_impl create checkpoint failed, cannot write file %s", name.c_str());
                return -1;
            }

            _last_durable_decree = last_committed_decree();
            ddebug("counter_service_impl create checkpoint succeed, last_durable_decree = %" PRId64 "", _last_durable_decree.load());
            return 0;
        }
Ejemplo n.º 3
0
        // Writes all counters to "<data_dir>/checkpoint.<last committed decree>".
        //
        // File layout (raw bytes, host byte order): a uint32 counter count, then for
        // every counter a uint32 name length, the name bytes, and sizeof(int32_t)
        // bytes of the value.
        //
        // `force` is not consulted by this implementation.
        //
        // Returns ERR_OK on success, including when the last committed decree is
        // already durable. Returns -1 when the file could not be opened or written;
        // the last durable decree is then left untouched so unsaved state is never
        // reported as persisted.
        // NOTE(review): -1 is used because no dedicated failure constant is visible
        // in this block - substitute the project's file-error code if one exists.
        int counter_service_impl::flush(bool force)
        {
            zauto_lock l(_lock);

            if (last_committed_decree() == last_durable_decree())
            {
                return ERR_OK;
            }

            // TODO: should use async write instead
            // The path is built as a std::string: formatting it with sprintf into a
            // fixed 256-byte array overflows for a sufficiently long data directory.
            const std::string name = std::string(data_dir().c_str()) + "/checkpoint."
                + std::to_string(static_cast<long long>(last_committed_decree()));

            // Binary mode: the payload is raw bytes and must not be newline-translated.
            std::ofstream os(name.c_str(), std::ios::binary);

            const uint32_t count = static_cast<uint32_t>(_counters.size());
            os.write(reinterpret_cast<const char*>(&count), sizeof(count));

            for (const auto& kv : _counters)
            {
                const std::string& k = kv.first;
                const uint32_t sz = static_cast<uint32_t>(k.length());

                os.write(reinterpret_cast<const char*>(&sz), sizeof(sz));
                os.write(k.data(), sz);
                os.write(reinterpret_cast<const char*>(&kv.second), sizeof(int32_t));
            }

            // Close (and thereby flush) before declaring the state durable so a
            // failed open or write is noticed here instead of being dropped silently.
            os.close();
            if (!os)
            {
                return -1;
            }

            _last_durable_decree = last_committed_decree();
            return ERR_OK;
        }
Ejemplo n.º 4
0
void replica::on_group_check(const group_check_request& request, __out_param group_check_response& response)
{
    ddebug(
        "%s: on_group_check from %s:%d",
        name(), request.config.primary.name.c_str(), request.config.primary.port
        );
    
    if (request.config.ballot < get_ballot())
    {
        response.err = ERR_VERSION_OUTDATED;
        return;
    }
    else if (request.config.ballot > get_ballot())
    {
        update_local_configuration(request.config);
    }
    else if (is_same_ballot_status_change_allowed(status(), request.config.status))
    {
        update_local_configuration(request.config, true);
    }
    
    switch (status())
    {
    case PS_INACTIVE:
        break;
    case PS_SECONDARY:
        if (request.last_committed_decree > last_committed_decree())
        {
            _prepare_list->commit(request.last_committed_decree, true);
        }
        break;
    case PS_POTENTIAL_SECONDARY:
        init_learn(request.learner_signature);
        break;
    case PS_ERROR:
        break;
    default:
        dassert (false, "");
    }
    
    response.gpid = get_gpid();
    response.node = primary_address();
    response.err = ERR_SUCCESS;
    if (status() == PS_ERROR)
    {
        response.err = ERR_INVALID_STATE;
    }

    response.last_committed_decree_in_app = _app->last_committed_decree();
    response.last_committed_decree_in_prepare_list = last_committed_decree();
    response.learner_status_ = _potential_secondary_states.learning_status;
    response.learner_signature = _potential_secondary_states.learning_signature;
}
Ejemplo n.º 5
0
// RPC_SIMPLE_KV_WRITE
// Stores pr.value under pr.key (replacing any previous value) while holding
// _lock, logs the write, and replies with ERR_SUCCESS.
void simple_kv_service_impl::on_write(const kv_pair& pr, ::dsn::service::rpc_replier<int32_t>& reply)
{
    zauto_lock l(_lock);
    _store[pr.key] = pr.value;

    // The explicit cast makes the variadic argument have exactly the type %lld
    // expects, whatever integer type the decree is on this platform.
    dinfo("write %s, decree = %lld\n", pr.value.c_str(), static_cast<long long>(last_committed_decree()));
    reply(ERR_SUCCESS);
}
Ejemplo n.º 6
0
void replica::replay_prepare_list()
{
    decree start = last_committed_decree() + 1;
    decree end = _prepare_list->max_decree();

    ddebug(
            "%s: replay prepare list from %lld to %lld, ballot = %lld",
            name(),
            start,
            end,
            get_ballot()
            );

    for (decree decree = start; decree <= end; decree++)
    {
        mutation_ptr old = _prepare_list->get_mutation_by_decree(decree);
        mutation_ptr mu = new_mutation(decree);

        if (old != nullptr)
        {
            mu->copy_from(old);
        }
        else
        {
            mu->rpc_code = RPC_REPLICATION_WRITE_EMPTY;
            ddebug(
                "%s: emit empty mutation %lld when replay prepare list",
                name(),
                decree
                );
        }

        init_prepare(mu);
    }
}
Ejemplo n.º 7
0
// Asserts the ordering invariants between prepared, committed and durable
// decrees, and checks each log (shared, and private if present) against the
// app's recorded start offset.
void replica::check_state_completeness()
{
    /* prepared >= committed >= durable */
    dassert(max_prepared_decree() >= last_committed_decree(), "");
    dassert(last_committed_decree() >= last_durable_decree(), "");

    // shared log: the max garbage-collected decree must not exceed the durable decree
    auto shared_gced = _stub->_log->max_gced_decree(get_gpid(), _app->init_info().init_offset_in_shared_log);
    dassert(shared_gced <= last_durable_decree(), "");
    _stub->_log->check_valid_start_offset(get_gpid(), _app->init_info().init_offset_in_shared_log);

    // private log (optional): same checks against its own start offset
    if (_private_log != nullptr)
    {
        auto private_gced = _private_log->max_gced_decree(get_gpid(), _app->init_info().init_offset_in_private_log);
        dassert(private_gced <= last_durable_decree(), "");
        _private_log->check_valid_start_offset(get_gpid(), _app->init_info().init_offset_in_private_log);
    }
}
Ejemplo n.º 8
0
// Opens the app via open(create_new). When that succeeds and the app is not
// newly created, additionally loads "<replica dir>/.info" into _info; the result
// of that load becomes the return value. The commit-decree counter is advanced by
// last_committed_decree() regardless of the outcome.
error_code replication_app_base::open_internal(replica* r, bool create_new)
{
    auto result = open(create_new);
    const bool opened = (result == ERR_OK);

    if (opened)
    {
        // asserted right after a successful open: committed and durable decrees are equal
        dassert(last_committed_decree() == last_durable_decree(), "");
    }

    if (opened && !create_new)
    {
        const std::string info_file = utils::filesystem::path_combine(r->dir(), ".info");
        result = _info.load(info_file.c_str());
    }

    _app_commit_decree.add(last_committed_decree());

    return result;
}
Ejemplo n.º 9
0
void replica::broadcast_group_check()
{
    dassert (nullptr != _primary_states.group_check_task, "");
    if (_primary_states.group_check_pending_replies.size() > 0)
    {
        dwarn(
            "%s: %u group check replies are still pending when doing next round check",
            name(), static_cast<int>(_primary_states.group_check_pending_replies.size())
            );

        for (auto it = _primary_states.group_check_pending_replies.begin(); it != _primary_states.group_check_pending_replies.end(); it++)
        {
            it->second->cancel(true);
        }
        _primary_states.group_check_pending_replies.clear();
    }

    for (auto it = _primary_states.statuses.begin(); it != _primary_states.statuses.end(); it++)
    {
        if (it->first == primary_address())
            continue;

        end_point addr = it->first;
        std::shared_ptr<group_check_request> request(new group_check_request);

        request->app_type = _primary_states.membership.app_type;
        request->node = addr;
        _primary_states.get_replica_config(addr, request->config);
        request->last_committed_decree = last_committed_decree();
        request->learner_signature = 0;
        if (it->second == PS_POTENTIAL_SECONDARY)
        {
            auto it2 = _primary_states.learners.find(it->first);
            dassert (it2 != _primary_states.learners.end(), "");
            request->learner_signature = it2->second.signature;
        }

        task_ptr callback_task = rpc::call_typed(
            addr,
            RPC_GROUP_CHECK,
            request,            
            this,
            &replica::on_group_check_reply,
            gpid_to_hash(get_gpid())
            );

        _primary_states.group_check_pending_replies[addr] = callback_task;

        ddebug(
            "%s: init_group_check for %s:%d", name(), addr.name.c_str(), addr.port
        );
    }
}
Ejemplo n.º 10
0
// RPC_SIMPLE_KV_APPEND
// Appends pr.value to the value stored under pr.key, or stores pr.value as the
// new value when the key is absent. Runs under _lock, logs the operation, and
// replies with ERR_SUCCESS.
void simple_kv_service_impl::on_append(const kv_pair& pr, ::dsn::service::rpc_replier<int32_t>& reply)
{
    zauto_lock l(_lock);
    auto it = _store.find(pr.key);
    if (it != _store.end())
        it->second.append(pr.value);
    else
        _store[pr.key] = pr.value;

    // The explicit cast makes the variadic argument have exactly the type %lld
    // expects, whatever integer type the decree is on this platform.
    dinfo("append %s, decree = %lld\n", pr.value.c_str(), static_cast<long long>(last_committed_decree()));
    reply(ERR_SUCCESS);
}
Ejemplo n.º 11
0
// Applies every client request batched in `mu` to the local app.
//
// _batch_state is BS_NOT_BATCH for a single request; for a batch it is BS_BATCH
// and switches to BS_BATCH_LAST for the final request. Non-empty requests are
// dispatched through dispatch_rpc_call() and the elapsed time of each dispatch is
// recorded in _app_commit_latency. If _physical_error is non-zero after any
// request, the function returns ERR_LOCAL_APP_FAILURE immediately, without
// advancing the committed decree. On success the committed decree and the commit
// counters are advanced and ERR_OK is returned.
error_code replication_app_base::write_internal(mutation_ptr& mu)
{
    dassert (mu->data.header.decree == last_committed_decree() + 1, "");
    dassert(mu->client_requests.size() == mu->data.updates.size()
        && mu->client_requests.size() > 0,
        "data inconsistency in mutation");

    const int request_count = static_cast<int>(mu->client_requests.size());
    _batch_state = (request_count == 1) ? BS_NOT_BATCH : BS_BATCH;

    for (int idx = 0; idx < request_count; ++idx)
    {
        const bool is_last = (idx + 1 == request_count);
        if (is_last && _batch_state == BS_BATCH)
        {
            _batch_state = BS_BATCH_LAST;
        }

        auto& client_request = mu->client_requests[idx];

        // empty mutation writes carry nothing to dispatch
        if (client_request.code != RPC_REPLICATION_WRITE_EMPTY)
        {
            dinfo("%s: mutation %s dispatch rpc call: %s",
                  _replica->name(), mu->name(), dsn_task_code_to_string(client_request.code));
            binary_reader reader(mu->data.updates[idx]);
            dsn_message_t resp = (client_request.req ? dsn_msg_create_response(client_request.req) : nullptr);

            // measure how long the dispatch takes
            const uint64_t started_ns = dsn_now_ns();
            dispatch_rpc_call(client_request.code, reader, resp);
            const uint64_t elapsed_ns = dsn_now_ns() - started_ns;

            _app_commit_latency.set(elapsed_ns);
        }

        if (_physical_error != 0)
        {
            derror("%s: physical error %d occurs in replication local app %s",
                   _replica->name(), _physical_error, data_dir().c_str());
            return ERR_LOCAL_APP_FAILURE;
        }
    }

    ++_last_committed_decree;

    _replica->update_commit_statistics(request_count);
    _app_commit_throughput.add(static_cast<uint64_t>(request_count));
    _app_commit_decree.increment();

    return ERR_OK;
}
Ejemplo n.º 12
0
// Applies mutation `mu` to the local app: dispatches its client request through
// dispatch_rpc_call() (forwarding `ack_client`), then advances the last committed
// decree by one. Always returns 0.
int replication_app_base::write_internal(mutation_ptr& mu, bool ack_client)
{
    // mutations must be applied in decree order, with no gaps
    dassert (mu->data.header.decree == last_committed_decree() + 1, "");

    dispatch_rpc_call(mu->rpc_code, mu->client_request, ack_client);

    ++_last_committed_decree;
    return 0;
}
Ejemplo n.º 13
0
 // RPC_SIMPLE_KV_READ
 // Looks up `key` in _store under _lock. Replies with the stored value, or with
 // an empty string when the key is absent (nothing is logged in that case).
 void simple_kv_service_impl::on_read(const std::string& key, ::dsn::service::rpc_replier<std::string>& reply)
 {
     zauto_lock l(_lock);
     
     auto it = _store.find(key);
     if (it == _store.end())
     {
         reply("");
     }
     else
     {
         // The explicit cast makes the variadic argument have exactly the type
         // %lld expects, whatever integer type the decree is on this platform.
         dinfo("read %s, decree = %lld\n", it->second.c_str(), static_cast<long long>(last_committed_decree()));
         reply(it->second);
     }
 }
Ejemplo n.º 14
0
        // Decides whether a checkpoint should be started and, if so, kicks it off.
        //
        // Nothing happens unless the replica is a primary or secondary, the app does
        // not support delta-state learning, no checkpoint task is already recorded,
        // and at least 10000 decrees separate the committed and durable decrees.
        // A primary proposes its own downgrade to secondary; a secondary enqueues
        // replica::checkpoint as an LPC_CHECKPOINT_REPLICA task.
        void replica::init_checkpoint()
        {
            check_hashed_access();

            // only primary and secondary replicas take checkpoints
            const bool eligible = (status() == PS_PRIMARY || status() == PS_SECONDARY);
            if (!eligible)
                return;

            // apps that support delta-state learning need no checkpoint
            if (_app->is_delta_state_learning_supported())
                return;

            // a checkpoint task is already recorded
            if (nullptr != _secondary_states.checkpoint_task)
                return;

            // the private log must be enabled so that commits are not lost
            // while checkpointing
            dassert(nullptr != _private_log, "log_enable_private_prepare must be true for checkpointing");

            // TODO: when NOT to checkpoint, but use private log replay to build the state
            const auto pending_decrees = last_committed_decree() - last_durable_decree();
            if (pending_decrees < 10000)
                return;

            // a primary is first downgraded to secondary, since no write may be seen
            // while checkpointing (the state is frozen)
            if (PS_PRIMARY == status())
            {
                configuration_update_request proposal;
                proposal.config = _primary_states.membership;
                proposal.type = CT_DOWNGRADE_TO_SECONDARY;
                proposal.node = proposal.config.primary;
                downgrade_to_secondary_on_primary(proposal);
                return;
            }

            // a secondary can start the checkpoint right away as a separate task
            dassert(PS_SECONDARY == status(), "");

            _secondary_states.checkpoint_task = tasking::enqueue(
                LPC_CHECKPOINT_REPLICA,
                this,
                &replica::checkpoint,
                gpid_to_hash(get_gpid())
                );
        }
Ejemplo n.º 15
0
// Replies to the prepare message carried by `mu` with a prepare_ack holding the
// given error, the local gpid and ballot, the mutation's decree, and the commit
// progress of both the app and the prepare list.
void replica::ack_prepare_message(error_code err, mutation_ptr& mu)
{
    prepare_ack ack;
    ack.gpid = get_gpid();
    ack.err = err;
    ack.ballot = get_ballot();
    ack.decree = mu->data.header.decree;

    // commit progress (per the original note: for PS_POTENTIAL_SECONDARY only)
    ack.last_committed_decree_in_app = _app->last_committed_decree();
    ack.last_committed_decree_in_prepare_list = last_committed_decree();

    // there must be a prepare message to answer
    dassert(nullptr != mu->prepare_msg(), "");
    reply(mu->prepare_msg(), ack);

    ddebug("%s: mutation %s ack_prepare_message, err = %s", name(), mu->name(), err.to_string());
}
Ejemplo n.º 16
0
// Sends a configuration-update request of the given `type` for `node` to the
// meta server.
//
// `newConfig` gets its last_committed_decree refreshed in place. The request
// carries a copy of `newConfig` with the ballot incremented by one. The replica
// is switched to PS_INACTIVE (marked transient) before the request is sent, any
// previous reconfiguration task is cancelled, and the reply is routed to
// on_update_configuration_on_meta_server_reply together with the request.
void replica::update_configuration_on_meta_server(config_type type, ::dsn::rpc_address node, partition_configuration& newConfig)
{
    newConfig.last_committed_decree = last_committed_decree();

    const bool becoming_primary = (type == CT_ASSIGN_PRIMARY || type == CT_UPGRADE_TO_PRIMARY);
    if (!becoming_primary)
    {
        dassert (status() == PS_PRIMARY, "");
        dassert (newConfig.ballot == _primary_states.membership.ballot, "");
    }

    // 2pc is disabled while reconfiguring. Doing so only for
    // CT_DOWNGRADE_TO_SECONDARY would be possible, but it is done for every
    // reconfiguration type for simplicity, at the cost of some write throughput.
    update_local_configuration_with_no_ballot_change(PS_INACTIVE);
    set_inactive_state_transient(true);

    dsn_message_t msg = dsn_msg_create_request(RPC_CM_UPDATE_PARTITION_CONFIGURATION, 0, 0);

    std::shared_ptr<configuration_update_request> update(new configuration_update_request);
    update->config = newConfig;
    update->config.ballot++;
    update->type = type;
    update->node = node;

    ::marshall(msg, *update);

    // only one reconfiguration may be in flight: drop the previous one, if any
    if (_primary_states.reconfiguration_task != nullptr)
    {
        _primary_states.reconfiguration_task->cancel(true);
    }

    rpc_address meta_servers(_stub->_failure_detector->get_servers());
    _primary_states.reconfiguration_task = rpc::call(
        meta_servers,
        msg,
        this,
        std::bind(&replica::on_update_configuration_on_meta_server_reply, this,
        std::placeholders::_1,
        std::placeholders::_2,
        std::placeholders::_3,
        update),
        gpid_to_hash(get_gpid())
        );
}
Ejemplo n.º 17
0
// Returns the highest decree D such that every decree in
// (last_committed_decree(), D] has a mutation in the prepare list that is logged
// and whose ballot is not lower than that of the preceding mutation in the run.
// Returns last_committed_decree() when the very next decree does not qualify.
decree replica::last_prepared_decree() const
{
    decree prepared = last_committed_decree();
    ballot previous_ballot = 0;

    for (;;)
    {
        auto next = _prepare_list->get_mutation_by_decree(prepared + 1);

        const bool extends_run =
            next != nullptr
            && next->data.header.ballot >= previous_ballot
            && next->is_logged();
        if (!extends_run)
            break;

        previous_ballot = next->data.header.ballot;
        ++prepared;
    }

    return prepared;
}
Ejemplo n.º 18
0
// Returns the highest decree D such that every decree in
// (last_committed_decree(), D] has a mutation in the prepare list whose ballot is
// not lower than that of the preceding mutation in the run, and which is either
// logged or exempt from logging because
// _options.prepare_ack_on_secondary_before_logging_allowed is set.
// Returns last_committed_decree() when the very next decree does not qualify.
decree replica::last_prepared_decree() const
{
    decree prepared = last_committed_decree();
    ballot previous_ballot = 0;

    for (;;)
    {
        auto next = _prepare_list->get_mutation_by_decree(prepared + 1);

        const bool extends_run =
            next != nullptr
            && next->data.header.ballot >= previous_ballot
            && (next->is_logged() || _options.prepare_ack_on_secondary_before_logging_allowed);
        if (!extends_run)
            break;

        previous_ballot = next->data.header.ballot;
        ++prepared;
    }

    return prepared;
}
Ejemplo n.º 19
0
// Serves a client read. The request is answered with ERR_INVALID_STATE unless
// this replica is the primary and its last committed decree has reached
// _primary_states.last_prepare_decree_on_new_primary; otherwise the request is
// passed to the app through dsn_layer1_app_commit_rpc_request().
void replica::on_client_read(task_code code, dsn_message_t request)
{
    // Any non-primary status (which includes inactive and potential secondary)
    // is rejected, and so is a primary inside the small window where its state
    // is not the latest yet.
    const bool is_primary = (status() == partition_status::PS_PRIMARY);
    if (!is_primary ||
        last_committed_decree() < _primary_states.last_prepare_decree_on_new_primary)
    {
        response_client_message(request, ERR_INVALID_STATE);
        return;
    }

    dassert (_app != nullptr, "");

    dsn_layer1_app_commit_rpc_request(_app->app_context(), request, true);
}
Ejemplo n.º 20
0
// Serves a client read described by `meta`.
//
// Inactive replicas and potential secondaries always answer ERR_INVALID_STATE.
// A read with ReadLastUpdate semantic is additionally rejected unless this
// replica is the primary and its last committed decree has reached
// _primary_states.last_prepare_decree_on_new_primary. Accepted reads are
// dispatched to the app.
void replica::on_client_read(const read_request_header& meta, message_ptr& request)
{
    const bool not_serving = (status() == PS_INACTIVE || status() == PS_POTENTIAL_SECONDARY);
    if (not_serving)
    {
        response_client_message(request, ERR_INVALID_STATE);
        return;
    }

    // ReadLastUpdate reads are only answered by a primary that has caught up
    if (meta.semantic == read_semantic_t::ReadLastUpdate
        && (status() != PS_PRIMARY
            || last_committed_decree() < _primary_states.last_prepare_decree_on_new_primary))
    {
        response_client_message(request, ERR_INVALID_STATE);
        return;
    }

    dassert (_app != nullptr, "");
    _app->dispatch_rpc_call(meta.code, request, true);
}
Ejemplo n.º 21
0
        // @ secondary
        void replica::on_copy_checkpoint(const replica_configuration& request, /*out*/ learn_response& response)
        {
            check_hashed_access();

            if (request.ballot > get_ballot())
            {
                if (!update_local_configuration(request))
                {
                    response.err = ERR_INVALID_STATE;
                    return;
                }
            }

            if (status() != PS_SECONDARY)
            {
                response.err = ERR_INVALID_STATE;
                return;
            }

            if (_app->last_durable_decree() == 0)
            {
                response.err = ERR_OBJECT_NOT_FOUND;
                return;
            }

            blob placeholder;
            int err = _app->get_checkpoint(0, placeholder, response.state);
            if (err != 0)
            {
                response.err = ERR_LEARN_FILE_FAILED;
            }
            else
            {
                response.err = ERR_OK;
                response.last_committed_decree = last_committed_decree();
                response.base_local_dir = _app->data_dir();
                response.address = _stub->_primary_address;
            }
        }
Ejemplo n.º 22
0
// Applies mutation `mu` to the local app. A non-empty mutation is dispatched
// through dispatch_rpc_call() using its first update; an empty one triggers
// on_empty_write(). Returns ERR_OK when _physical_error is zero afterwards,
// otherwise logs the error and returns ERR_LOCAL_APP_FAILURE.
error_code replication_app_base::write_internal(mutation_ptr& mu)
{
    // mutations must be applied in decree order, with no gaps
    dassert (mu->data.header.decree == last_committed_decree() + 1, "");

    if (mu->rpc_code != RPC_REPLICATION_WRITE_EMPTY)
    {
        binary_reader reader(mu->data.updates[0]);
        // a response message is only created when there is a client message to answer
        dsn_message_t resp = (mu->client_msg() ? dsn_msg_create_response(mu->client_msg()) : nullptr);
        dispatch_rpc_call(mu->rpc_code, reader, resp);
    }
    else
    {
        on_empty_write();
    }

    if (_physical_error == 0)
    {
        return ERR_OK;
    }

    derror("physical error %d occurs in replication local app %s", _physical_error, data_dir().c_str());
    return ERR_LOCAL_APP_FAILURE;
}
Ejemplo n.º 23
0
void replica::cleanup_preparing_mutations(bool wait)
{
    decree start = last_committed_decree() + 1;
    decree end = _prepare_list->max_decree();

    for (decree decree = start; decree <= end; decree++)
    {
        mutation_ptr mu = _prepare_list->get_mutation_by_decree(decree);
        if (mu != nullptr)
        {
            mu->clear_prepare_or_commit_tasks();

            //
            // make sure the buffers from mutations are valid for underlying aio
            //
            if (wait) {
                _stub->_log->flush();
                mu->wait_log_task();
            }
        }
    }
}
Ejemplo n.º 24
0
void replica::cleanup_preparing_mutations(bool is_primary)
{
    decree start = last_committed_decree() + 1;
    decree end = _prepare_list->max_decree();

    for (decree decree = start; decree <= end; decree++)
    {
        mutation_ptr mu = _prepare_list->get_mutation_by_decree(decree);
        if (mu != nullptr)
        {
            int c = mu->clear_prepare_or_commit_tasks();
            if (!is_primary)
            {
                dassert (0 == c, "");
            }
            else
            {
            }

            mu->clear_log_task();
        }
    }
}
Ejemplo n.º 25
0
// Registers proposal.node as a learner (potential secondary) and notifies it.
//
// Does nothing unless this replica is the primary, or when the node is already a
// known learner. Otherwise a learner record with a random signature is stored,
// the node's status becomes PS_POTENTIAL_SECONDARY, and a one-way
// RPC_LEARN_ADD_LEARNER request is sent to the node.
void replica::add_potential_secondary(configuration_update_request& proposal)
{
    if (PS_PRIMARY != status())
    {
        return;
    }

    // the proposal must match the current membership and name a node that is
    // neither the primary nor a secondary
    const auto& membership = _primary_states.membership;
    dassert (proposal.config.ballot == get_ballot(), "");
    dassert (proposal.config.gpid == membership.gpid, "");
    dassert (proposal.config.app_type == membership.app_type, "");
    dassert (proposal.config.primary == membership.primary, "");
    dassert (proposal.config.secondaries == membership.secondaries, "");
    dassert (!_primary_states.check_exist(proposal.node, PS_PRIMARY), "");
    dassert (!_primary_states.check_exist(proposal.node, PS_SECONDARY), "");

    const bool already_learning =
        _primary_states.learners.find(proposal.node) != _primary_states.learners.end();
    if (already_learning)
    {
        return;
    }

    remote_learner_state learner;
    learner.prepare_start_decree = invalid_decree;
    learner.signature = random64(0, static_cast<uint64_t>(-1LL));
    learner.timeout_task = nullptr; // TODO: add timer for learner task

    _primary_states.learners[proposal.node] = learner;
    _primary_states.statuses[proposal.node] = PS_POTENTIAL_SECONDARY;

    group_check_request notification;
    notification.app_type = _primary_states.membership.app_type;
    notification.node = proposal.node;
    _primary_states.get_replica_config(proposal.node, notification.config);
    notification.last_committed_decree = last_committed_decree();
    notification.learner_signature = learner.signature;

    rpc::call_one_way_typed(proposal.node, RPC_LEARN_ADD_LEARNER, notification, gpid_to_hash(get_gpid()));
}
Ejemplo n.º 26
0
void replica::broadcast_group_check()
{
    dassert (nullptr != _primary_states.group_check_task, "");

    ddebug("%s: start to broadcast group check", name());

    if (_primary_states.group_check_pending_replies.size() > 0)
    {
        dwarn(
            "%s: %u group check replies are still pending when doing next round check, cancel first",
            name(), static_cast<int>(_primary_states.group_check_pending_replies.size())
            );

        for (auto it = _primary_states.group_check_pending_replies.begin(); it != _primary_states.group_check_pending_replies.end(); ++it)
        {
            it->second->cancel(true);
        }
        _primary_states.group_check_pending_replies.clear();
    }

    for (auto it = _primary_states.statuses.begin(); it != _primary_states.statuses.end(); ++it)
    {
        if (it->first == _stub->_primary_address)
            continue;

        ::dsn::rpc_address addr = it->first;
        std::shared_ptr<group_check_request> request(new group_check_request);

        request->app = _app_info;
        request->node = addr;
        _primary_states.get_replica_config(it->second, request->config);
        request->last_committed_decree = last_committed_decree();

        if (request->config.status == partition_status::PS_POTENTIAL_SECONDARY)
        {
            auto it = _primary_states.learners.find(addr);
            dassert(it != _primary_states.learners.end(), "learner %s is missing", addr.to_string());
            request->config.learner_signature = it->second.signature;
        }

        ddebug(
            "%s: send group check to %s with state %s",
            name(),
            addr.to_string(),
            enum_to_string(it->second)
        );

        dsn::task_ptr callback_task = rpc::call(
            addr,
            RPC_GROUP_CHECK,
            *request,            
            this,
            [=](error_code err, group_check_response&& resp)
            {
                auto alloc = std::make_shared<group_check_response>(std::move(resp));
                on_group_check_reply(err, request, alloc);
            },
            std::chrono::milliseconds(0),
            gpid_to_thread_hash(get_gpid())
            );

        _primary_states.group_check_pending_replies[addr] = callback_task;
    }

    // send empty prepare when necessary
    if (!_options->empty_write_disabled &&
        dsn_now_ms() >= _primary_states.last_prepare_ts_ms + _options->group_check_interval_ms)
    {
        mutation_ptr mu = new_mutation(invalid_decree);
        mu->add_client_request(RPC_REPLICATION_WRITE_EMPTY, nullptr);
        init_prepare(mu);
    }
}
Ejemplo n.º 27
0
void replica::on_group_check(const group_check_request& request, /*out*/ group_check_response& response)
{
    check_hashed_access();

    ddebug(
        "%s: process group check, primary = %s, ballot = %" PRId64 ", status = %s, last_committed_decree = %" PRId64,
        name(), request.config.primary.to_string(),
        request.config.ballot, enum_to_string(request.config.status),
        request.last_committed_decree
        );
    
    if (request.config.ballot < get_ballot())
    {
        response.err = ERR_VERSION_OUTDATED;
        dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
        return;
    }
    else if (request.config.ballot > get_ballot())
    {
        if (!update_local_configuration(request.config))
        {
            response.err = ERR_INVALID_STATE;
            dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
            return;
        }
    }
    else if (is_same_ballot_status_change_allowed(status(), request.config.status))
    {
        update_local_configuration(request.config, true);
    }
    
    switch (status())
    {
    case partition_status::PS_INACTIVE:
        break;
    case partition_status::PS_SECONDARY:
        if (request.last_committed_decree > last_committed_decree())
        {
            _prepare_list->commit(request.last_committed_decree, COMMIT_TO_DECREE_HARD);
        }
        break;
    case partition_status::PS_POTENTIAL_SECONDARY:
        init_learn(request.config.learner_signature);
        break;
    case partition_status::PS_ERROR:
        break;
    default:
        dassert (false, "");
    }
    
    response.pid = get_gpid();
    response.node = _stub->_primary_address;
    response.err = ERR_OK;
    if (status() == partition_status::PS_ERROR)
    {
        response.err = ERR_INVALID_STATE;
        dwarn("%s: on_group_check reply %s", name(), response.err.to_string());
    }

    response.last_committed_decree_in_app = _app->last_committed_decree();
    response.last_committed_decree_in_prepare_list = last_committed_decree();
    response.learner_status_ = _potential_secondary_states.learning_status;
    response.learner_signature = _potential_secondary_states.learning_version;
}
Ejemplo n.º 28
0
// Starts the prepare phase for mutation `mu`; asserts that this replica is the primary.
//
// Steps, in order:
//   1. fail with ERR_NOT_ENOUGH_MEMBER if the group (secondaries + 1) is smaller
//      than _options.mutation_2pc_min_replica_count;
//   2. stamp the mutation with the current ballot and a decree (the next decree
//      after the prepare list's max if the mutation has none yet);
//   3. fail with ERR_CAPACITY_EXCEEDED if the decree runs more than
//      _options.staleness_for_commit ahead of the last committed decree;
//   4. add the mutation to the local prepare list;
//   5. send prepare messages to all secondaries and to the learners whose
//      prepare_start_decree has been reached;
//   6. append the mutation to the log, with on_append_log_completed as callback.
//
// On any failure the client message of the mutation is answered with the error
// (see ErrOut) and none of the later steps run.
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    error_code err = ERR_OK;
    // number of learners that receive this prepare; declared here, ahead of the
    // first goto
    uint8_t count = 0;

    // not enough replicas in the group to run 2pc
    if (static_cast<int>(_primary_states.membership.secondaries.size()) + 1 < _options.mutation_2pc_min_replica_count)
    {
        err = ERR_NOT_ENOUGH_MEMBER;
        goto ErrOut;
    }

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree == invalid_decree)
    {
        // no decree assigned yet: take the one after the prepare list's max
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }
    else
    {
        // keep the decree already assigned, under the current ballot
        mu->set_id(get_ballot(), mu->data.header.decree);
    }

    ddebug("%s: mutation %s init_prepare", name(), mu->name());

    // check bounded staleness
    if (mu->data.header.decree > last_committed_decree() + _options.staleness_for_commit)
    {
        err = ERR_CAPACITY_EXCEEDED;
        goto ErrOut;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    err = _prepare_list->prepare(mu, PS_PRIMARY);
    if (err != ERR_OK)
    {
        goto ErrOut;
    }

    // remote prepare
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto it = _primary_states.membership.secondaries.begin(); it != _primary_states.membership.secondaries.end(); it++)
    {
        send_prepare_message(*it, PS_SECONDARY, mu, _options.prepare_timeout_ms_for_secondaries);
    }

    // learners only receive prepares from their prepare_start_decree onwards
    count = 0;
    for (auto it = _primary_states.learners.begin(); it != _primary_states.learners.end(); it++)
    {
        if (it->second.prepare_start_decree != invalid_decree && mu->data.header.decree >= it->second.prepare_start_decree)
        {
            send_prepare_message(it->first, PS_POTENTIAL_SECONDARY, mu, _options.prepare_timeout_ms_for_potential_secondaries);
            count++;
        }
    }
    mu->set_left_potential_secondary_ack_count(count);

    // it is possible to do commit here when logging is not required for acking prepare.
    // however, it is only possible when replica count == 1 at this moment in the
    // replication group, and we don't want to do this as it is too fragile now.
    // do_possible_commit_on_primary(mu);

    // local log
    dassert (mu->data.header.log_offset == invalid_offset, "");
    dassert (mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(mu,
                                         LPC_WRITE_REPLICATION_LOG,
                                         this,
                                         std::bind(&replica::on_append_log_completed, this, mu,
                                                 std::placeholders::_1,
                                                 std::placeholders::_2),
                                         gpid_to_hash(get_gpid())
                                        );

    dassert(nullptr != mu->log_task(), "");
    return;

    // common failure exit: report the error on the mutation's client message
ErrOut:
    response_client_message(mu->client_msg(), err);
    return;
}
Ejemplo n.º 29
0
// Begins replication of `mu` on this replica, which must currently be primary.
//
// Steps, in order:
//   1. Record last_committed_decree() in the mutation header and stamp the
//      mutation with the current ballot; a mutation that has no decree yet
//      receives _prepare_list->max_decree() + 1.
//   2. Refuse the mutation with ERR_CAPACITY_EXCEEDED when its decree is more
//      than _options->staleness_for_commit ahead of last_committed_decree().
//   3. Insert the mutation into the local prepare list.
//   4. Send a prepare message to every secondary, and to every learner whose
//      prepare_start_decree is set and not greater than the mutation's decree.
//   5. If the mutation is already logged, call do_possible_commit_on_primary();
//      otherwise append it to the log with on_append_log_completed as callback.
//
// When step 2 or 3 fails, every entry of mu->client_requests is answered with
// the error code and nothing is sent or logged.
void replica::init_prepare(mutation_ptr& mu)
{
    dassert (PS_PRIMARY == status(), "");

    // Single failure path: reply to each client request carried by the mutation.
    auto reject_all = [this, &mu](error_code failure)
    {
        for (auto& pending : mu->client_requests)
        {
            response_client_message(pending, failure);
        }
    };

    mu->data.header.last_committed_decree = last_committed_decree();
    if (mu->data.header.decree != invalid_decree)
    {
        // decree was chosen earlier; only (re)stamp it with the current ballot
        mu->set_id(get_ballot(), mu->data.header.decree);
    }
    else
    {
        // fresh mutation: take the next decree after the prepare list's maximum
        mu->set_id(get_ballot(), _prepare_list->max_decree() + 1);
    }

    dinfo("%s: mutation %s init_prepare, mutation_tid=%" PRIu64, name(), mu->name(), mu->tid());

    // bounded staleness: do not run too far ahead of what is committed
    if (mu->data.header.decree > last_committed_decree() + _options->staleness_for_commit)
    {
        reject_all(ERR_CAPACITY_EXCEEDED);
        return;
    }

    dassert (mu->data.header.decree > last_committed_decree(), "");

    // local prepare
    error_code local_result = _prepare_list->prepare(mu, PS_PRIMARY);
    if (local_result != ERR_OK)
    {
        reject_all(local_result);
        return;
    }

    // remote prepare: secondaries first
    mu->set_prepare_ts();
    mu->set_left_secondary_ack_count((unsigned int)_primary_states.membership.secondaries.size());
    for (auto& secondary : _primary_states.membership.secondaries)
    {
        send_prepare_message(secondary, PS_SECONDARY, mu, _options->prepare_timeout_ms_for_secondaries);
    }

    // remote prepare: learners that have reached the prepare stage for this decree
    uint8_t learner_acks = 0;
    for (auto& learner : _primary_states.learners)
    {
        const bool wants_prepare =
            learner.second.prepare_start_decree != invalid_decree &&
            mu->data.header.decree >= learner.second.prepare_start_decree;
        if (!wants_prepare)
        {
            continue;
        }

        send_prepare_message(learner.first, PS_POTENTIAL_SECONDARY, mu, _options->prepare_timeout_ms_for_potential_secondaries, learner.second.signature);
        learner_acks++;
    }
    mu->set_left_potential_secondary_ack_count(learner_acks);

    // already in the log: no append needed, go straight to the commit check
    if (mu->is_logged())
    {
        do_possible_commit_on_primary(mu);
        return;
    }

    // local log
    dassert(mu->data.header.log_offset == invalid_offset, "");
    dassert(mu->log_task() == nullptr, "");

    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid()));

    dassert(nullptr != mu->log_task(), "");
}
Ejemplo n.º 30
0
// Handles a prepare request received from the primary.
//
// The request carries the sender's replica configuration followed by the
// mutation. The handler then:
//   - drops the request silently when the mutation's ballot is older than ours;
//   - adopts the sender's configuration when its ballot is newer, acking
//     ERR_INVALID_STATE if that update fails;
//   - acks an error when this replica is inactive or in error state;
//   - for a potential secondary, restarts learning on a signature mismatch and
//     ignores the request unless learning is in the LearningWithPrepare or
//     LearningSucceeded stage;
//   - acks ERR_OK right away for decrees that are already committed, or for a
//     same-ballot duplicate that is already logged;
//   - otherwise adds the mutation to the prepare list and appends it to the
//     log, with on_append_log_completed as the completion callback.
void replica::on_prepare(dsn_message_t request)
{
    check_hashed_access();

    replica_configuration rconfig;
    mutation_ptr mu;

    // keep the read stream scoped to the decoding step only
    {
        rpc_read_stream stream(request);
        unmarshall(stream, rconfig);
        mu = mutation::read_from(stream, request);
    }

    decree incoming = mu->data.header.decree;

    dinfo("%s: mutation %s on_prepare", name(), mu->name());

    dassert(mu->data.header.ballot == rconfig.ballot, "");

    // stale view on the sender side
    if (mu->data.header.ballot < get_ballot())
    {
        derror("%s: mutation %s on_prepare skipped due to old view", name(), mu->name());
        // no need response because the rpc should have been cancelled on primary in this case
        return;
    }

    // update configuration when necessary
    if (rconfig.ballot > get_ballot() && !update_local_configuration(rconfig))
    {
        derror(
            "%s: mutation %s on_prepare failed as update local configuration failed, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );
        ack_prepare_message(ERR_INVALID_STATE, mu);
        return;
    }

    // replicas that are inactive or broken cannot take prepares
    if (PS_INACTIVE == status() || PS_ERROR == status())
    {
        derror(
            "%s: mutation %s on_prepare failed as invalid replica state, state = %s",
            name(), mu->name(),
            enum_to_string(status())
            );

        const bool transient_inactive = (PS_INACTIVE == status() && _inactive_is_transient);
        ack_prepare_message(transient_inactive ? ERR_INACTIVE_STATE : ERR_INVALID_STATE, mu);
        return;
    }

    if (PS_POTENTIAL_SECONDARY == status())
    {
        // new learning process
        if (rconfig.learner_signature != _potential_secondary_states.learning_signature)
        {
            init_learn(rconfig.learner_signature);
            // no need response as rpc is already gone
            return;
        }

        // prepares are only accepted in the last two learning stages
        if (_potential_secondary_states.learning_status != LearningWithPrepare
            && _potential_secondary_states.learning_status != LearningSucceeded)
        {
            derror(
                "%s: mutation %s on_prepare skipped as invalid learning status, state = %s, learning_status = %s",
                name(), mu->name(),
                enum_to_string(status()),
                enum_to_string(_potential_secondary_states.learning_status)
                );

            // no need response as rpc is already gone
            return;
        }
    }

    dassert (rconfig.status == status(), "");

    // already committed here: nothing left to do except acknowledge
    if (incoming <= last_committed_decree())
    {
        ack_prepare_message(ERR_OK, mu);
        return;
    }

    // real prepare start
    auto existing = _prepare_list->get_mutation_by_decree(incoming);
    if (existing != nullptr && existing->data.header.ballot == mu->data.header.ballot)
    {
        if (!existing->is_logged())
        {
            derror("%s: mutation %s on_prepare skipped as it is duplicate", name(), mu->name());
            // response will be unnecessary when we add retry logic in rpc engine.
            // the retried rpc will use the same id therefore it will be considered responsed
            // even the response is for a previous try.
            return;
        }

        ack_prepare_message(ERR_OK, mu);
        return;
    }

    error_code err = _prepare_list->prepare(mu, status());
    dassert (err == ERR_OK, "");

    // sanity-check how far ahead of the committed decree the mutation may be
    if (PS_POTENTIAL_SECONDARY != status())
    {
        dassert (PS_SECONDARY == status(), "");
        dassert (mu->data.header.decree <= last_committed_decree() + _options->staleness_for_commit, "");
    }
    else
    {
        dassert (mu->data.header.decree <= last_committed_decree() + _options->max_mutation_count_in_prepare_list, "");
    }

    // local log; the ack is presumably sent from on_append_log_completed — confirm there
    dassert(mu->log_task() == nullptr, "");
    mu->log_task() = _stub->_log->append(
        mu,
        LPC_WRITE_REPLICATION_LOG,
        this,
        std::bind(&replica::on_append_log_completed, this, mu,
                  std::placeholders::_1,
                  std::placeholders::_2),
        gpid_to_hash(get_gpid()));
}