예제 #1
0
void replication_options::initialize()
{
    dsn_app_info app_info;
    bool r = dsn_get_current_app_info(&app_info);
    dassert(r, "get current app info failed");
    app_name = app_info.name;
    app_dir = app_info.data_dir;

    // slog_dir:
    // - if config[slog_dir] is empty: "app_dir/slog"
    // - else: "config[slog_dir]/app_name/slog"
    slog_dir = dsn_config_get_value_string("replication", "slog_dir", "", "shared log directory");
    if (slog_dir.empty())
    {
        slog_dir = app_dir;
    }
    else
    {
        slog_dir = utils::filesystem::path_combine(slog_dir, app_name);
    }
    slog_dir = utils::filesystem::path_combine(slog_dir, "slog");

    // data_dirs
    // - if config[data_dirs] is empty: "app_dir/reps"
    // - else: "config[data_dirs]/app_name/reps"
    std::string dirs_str = dsn_config_get_value_string("replication", "data_dirs", "", "replica directory list");
    std::vector<std::string> dirs;
    ::dsn::utils::split_args(dirs_str.c_str(), dirs, ',');
    if (dirs.empty())
    {
        dirs.push_back(app_dir);
    }
    else
    {
        for (auto& dir : dirs)
        {
            dir = utils::filesystem::path_combine(dir, app_name);
        }
    }
    for (auto& dir : dirs)
    {
        data_dirs.push_back(utils::filesystem::path_combine(dir, "reps"));
    }

    prepare_timeout_ms_for_secondaries =
        (int)dsn_config_get_value_uint64("replication", 
        "prepare_timeout_ms_for_secondaries", 
        prepare_timeout_ms_for_secondaries,
        "timeout (ms) for prepare message to secondaries in two phase commit"
        );
    prepare_timeout_ms_for_potential_secondaries = 
        (int)dsn_config_get_value_uint64("replication", 
        "prepare_timeout_ms_for_potential_secondaries",
        prepare_timeout_ms_for_potential_secondaries,
        "timeout (ms) for prepare message to potential secondaries in two phase commit"
        );

    batch_write_disabled =
        dsn_config_get_value_bool("replication",
        "batch_write_disabled",
        batch_write_disabled,
        "whether to disable auto-batch of replicated write requests"
        );
    staleness_for_commit =
        (int)dsn_config_get_value_uint64("replication", 
        "staleness_for_commit", 
        staleness_for_commit,
        "how many concurrent two phase commit rounds are allowed"
        );
    max_mutation_count_in_prepare_list =
        (int)dsn_config_get_value_uint64("replication", 
        "max_mutation_count_in_prepare_list", 
        max_mutation_count_in_prepare_list,
        "maximum number of mutations in prepare list"
        );
    mutation_2pc_min_replica_count =
        (int)dsn_config_get_value_uint64("replication", 
        "mutation_2pc_min_replica_count",
        mutation_2pc_min_replica_count,
        "minimum number of alive replicas under which write is allowed"
        );

    group_check_disabled =
        dsn_config_get_value_bool("replication",
        "group_check_disabled",
        group_check_disabled,
        "whether group check is disabled"
        );
    group_check_interval_ms =
        (int)dsn_config_get_value_uint64("replication",
        "group_check_interval_ms", 
        group_check_interval_ms,
        "every what period (ms) we check the replica healthness"
        );

    checkpoint_disabled =
        dsn_config_get_value_bool("replication",
        "checkpoint_disabled",
        checkpoint_disabled,
        "whether checkpoint is disabled"
        );
    checkpoint_interval_seconds =
        (int)dsn_config_get_value_uint64("replication",
        "checkpoint_interval_seconds",
        checkpoint_interval_seconds,
        "every what period (seconds) we do checkpoints for replicated apps"
        ); 
    checkpoint_min_decree_gap = 
        (int64_t)dsn_config_get_value_uint64("replication",
        "checkpoint_min_decree_gap",
        checkpoint_min_decree_gap,
        "minimum decree gap that triggers checkpoint"
        );
    checkpoint_max_interval_hours = 
        (int)dsn_config_get_value_uint64("replication",
        "checkpoint_max_interval_hours",
        checkpoint_max_interval_hours,
        "maximum time interval (hours) where a new checkpoint must be created"
        );

    gc_disabled =
        dsn_config_get_value_bool("replication",
        "gc_disabled",
        gc_disabled,
        "whether to disable garbage collection"
        );
    gc_interval_ms =
        (int)dsn_config_get_value_uint64("replication", 
        "gc_interval_ms", 
        gc_interval_ms,
        "every what period (ms) we do garbage collection for dead replicas, on-disk state, log, etc."
        );
    gc_memory_replica_interval_ms =
        (int)dsn_config_get_value_uint64("replication", 
        "gc_memory_replica_interval_ms", 
        gc_memory_replica_interval_ms,
        "after closing a healthy replica (due to LB), the replica will remain in memory for this long (ms) for quick recover"
        );
    gc_disk_error_replica_interval_seconds =
        (int)dsn_config_get_value_uint64("replication", 
        "gc_disk_error_replica_interval_seconds", 
        gc_disk_error_replica_interval_seconds,
        "error replica are deleted after they have been closed and lasted on disk this long (seconds)"
        );

    fd_disabled =
        dsn_config_get_value_bool("replication",
        "fd_disabled",
        fd_disabled,
        "whether to disable failure detection"
        );
    fd_check_interval_seconds =
        (int)dsn_config_get_value_uint64("replication", 
        "fd_check_interval_seconds", 
        fd_check_interval_seconds,
        "every this period(seconds) the FD will check healthness of remote peers"
        );
    fd_beacon_interval_seconds =
        (int)dsn_config_get_value_uint64("replication", 
        "fd_beacon_interval_seconds", 
        fd_beacon_interval_seconds,
        "every this period(seconds) the FD sends beacon message to remote peers"
        );
    fd_lease_seconds =
        (int)dsn_config_get_value_uint64("replication", 
        "fd_lease_seconds", 
        fd_lease_seconds,
        "lease (seconds) get from remote FD master"
        );
    fd_grace_seconds =
        (int)dsn_config_get_value_uint64("replication", 
        "fd_grace_seconds", 
        fd_grace_seconds,
        "grace (seconds) assigned to remote FD slaves (grace > lease)"
        );

    log_private_disabled =
        dsn_config_get_value_bool("replication",
        "log_private_disabled",
        log_private_disabled,
        "whether to disable logging committed mutations for each app, which is used for easier learning"
        );
    log_private_file_size_mb =
        (int)dsn_config_get_value_uint64("replication",
        "log_private_file_size_mb",
        log_private_file_size_mb,
        "private log maximum segment file size (MB)"
        );
    log_private_batch_buffer_kb =
        (int)dsn_config_get_value_uint64("replication",
        "log_private_batch_buffer_kb",
        log_private_batch_buffer_kb,
        "private log buffer size (KB) for batching incoming logs"
        );
    log_private_force_flush =
        dsn_config_get_value_bool("replication",
        "log_private_force_flush",
        log_private_force_flush,
        "when write private log, whether to flush file after write done"
        );

    log_shared_file_size_mb =
        (int)dsn_config_get_value_uint64("replication", 
        "log_shared_file_size_mb",
        log_shared_file_size_mb,
        "shared log maximum segment file size (MB)"
        );
    log_shared_batch_buffer_kb =
        (int)dsn_config_get_value_uint64("replication", 
        "log_batch_buffer_KB_shared", 
        log_shared_batch_buffer_kb,
        "shared log buffer size (KB) for batching incoming logs"
        );
    log_shared_force_flush =
        dsn_config_get_value_bool("replication",
        "log_shared_force_flush",
        log_shared_force_flush,
        "when write shared log, whether to flush file after write done"
        );

    config_sync_disabled =
        dsn_config_get_value_bool("replication", 
        "config_sync_disabled",
        config_sync_disabled,
        "whether to disable replica configuration periodical sync with the meta server"
        );
    config_sync_interval_ms =
        (int)dsn_config_get_value_uint64("replication", 
        "config_sync_interval_ms", 
        config_sync_interval_ms,
        "every this period(ms) the replica syncs replica configuration with the meta server"
        );

    lb_interval_ms =
        (int)dsn_config_get_value_uint64("replication",
        "lb_interval_ms",
        lb_interval_ms,
        "every this period(ms) the meta server will do load balance"
        );

    write_empty_enabled =
        dsn_config_get_value_bool("replication",
            "write_empty_enabled",
            write_empty_enabled,
            "whether to enable empty write when no write requests are processed for more than group_check_period, default is true"
            );
    
    read_meta_servers();

    sanity_check();
}
// Initializes the ZooKeeper-backed distributed lock service.
//
// args[0] is the lock root path. The function attaches to a ZooKeeper session,
// then creates every node along the lock root path one level at a time,
// treating "node already exists" as success.
//
// Returns:
//   ERR_INVALID_PARAMETERS - args is empty
//   ERR_CORRUPTION         - the current app info could not be obtained
//   ERR_TIMEOUT            - the session was not connected after waiting
//   from_zerror(zerr)      - creating a node on the lock root path failed
//   ERR_OK                 - success; a reference on this object is taken
error_code distributed_lock_service_zookeeper::initialize(const std::vector<std::string>& args)
{
    if (args.empty())
    {
        derror("need parameters: <lock_root>");
        return ERR_INVALID_PARAMETERS;
    }

    dsn_app_info node;
    if (!dsn_get_current_app_info(&node))
    {
        derror("get current app info failed, can not init distributed_lock_service_zookeeper");
        return ERR_CORRUPTION;
    }

    _session = zookeeper_session_mgr::instance().get_session(&node);
    _zoo_state = _session->attach(
        this,
        std::bind(&distributed_lock_service_zookeeper::on_zoo_session_evt,
                  lock_srv_ptr(this),
                  std::placeholders::_1));
    if (_zoo_state != ZOO_CONNECTED_STATE)
    {
        // Not connected yet: wait up to the session manager's timeout, then
        // re-check (presumably on_zoo_session_evt updates _zoo_state -- confirm).
        _waiting_attach.wait_for(zookeeper_session_mgr::fast_instance().timeout());
        if (_zoo_state != ZOO_CONNECTED_STATE)
        {
            dwarn("attach to zookeeper session timeout, distributed lock service initialized failed");
            return ERR_TIMEOUT;
        }
    }

    // Split the lock root on '/' and create each prefix path in turn.
    std::vector<std::string> path_parts;
    utils::split_args(args[0].c_str(), path_parts, '/');
    std::string node_path;
    for (const auto& part : path_parts)
    {
        node_path += "/";
        node_path += part;

        utils::notify_event create_done;
        int create_rc;

        zookeeper_session::zoo_opcontext* create_op = zookeeper_session::create_context();
        create_op->_optype = zookeeper_session::ZOO_CREATE;
        create_op->_input._path = node_path;
        // Capturing locals by reference is fine here: wait() below blocks
        // until this callback has signalled completion.
        create_op->_callback_function =
            [&create_done, &create_rc](zookeeper_session::zoo_opcontext* finished_op)
        {
            create_rc = finished_op->_output.error;
            create_done.notify();
        };

        _session->visit(create_op);
        create_done.wait();

        const bool node_available = (create_rc == ZOK || create_rc == ZNODEEXISTS);
        if (!node_available)
        {
            derror("create zk node failed, path = %s, err = %s", node_path.c_str(), zerror(create_rc));
            return from_zerror(create_rc);
        }
    }

    // A lock root with no path components maps to the ZooKeeper root "/".
    if (node_path.empty())
    {
        _lock_root = "/";
    }
    else
    {
        _lock_root = node_path;
    }

    ddebug("init distributed_lock_service_zookeeper succeed, lock_root = %s", _lock_root.c_str());
    // Notice: the reference taken here is released in finalize.
    add_ref();
    return ERR_OK;
}