static void proposer_handle_msg(struct evproposer* p, struct bufferevent* bev) //proposer处理各种类型的消息 { paxos_msg msg; struct evbuffer* in; char buffer[PAXOS_MAX_VALUE_SIZE]; in = bufferevent_get_input(bev); evbuffer_remove(in, &msg, sizeof(paxos_msg)); //将消息文件头从输入buffer移动到msg中 if (msg.data_size > PAXOS_MAX_VALUE_SIZE) { //消息内容过大,丢弃 evbuffer_drain(in, msg.data_size); paxos_log_error("Discarding message of size %ld. Maximum is %d", msg.data_size, PAXOS_MAX_VALUE_SIZE); return; } evbuffer_remove(in, buffer, msg.data_size); //将消息的内容移动到buffer中 switch (msg.type) { case prepare_acks: proposer_handle_prepare_ack(p, (prepare_ack*)buffer); break; case accept_acks: proposer_handle_accept_ack(p, (accept_ack*)buffer); break; case submit: proposer_handle_client_msg(p, buffer, msg.data_size); break; default: paxos_log_error("Unknow msg type %d not handled", msg.type); return; } try_accept(p); }
struct storage* storage_open(int acceptor_id) { struct storage* s; s = malloc(sizeof(struct storage)); memset(s, 0, sizeof(struct storage)); s->acceptor_id = acceptor_id; //Create path to db file in db dir char* db_env_path; asprintf(&db_env_path, "%s_%d", paxos_config.bdb_env_path, acceptor_id); char* db_filename = paxos_config.bdb_db_filename; struct stat sb; //Check if the environment dir and db file exists int dir_exists = (stat(db_env_path, &sb) == 0); //Create the directory if it does not exist if (!dir_exists && (mkdir(db_env_path, S_IRWXU) != 0)) { paxos_log_error("Failed to create env dir %s: %s", db_env_path, strerror(errno)); return NULL; } //Delete and recreate an empty dir if not recovering if (paxos_config.bdb_trash_files && dir_exists) { char rm_command[600]; sprintf(rm_command, "rm -r %s", db_env_path); if ((system(rm_command) != 0) || (mkdir(db_env_path, S_IRWXU) != 0)) { paxos_log_error("Failed to recreate empty env dir %s: %s", db_env_path, strerror(errno)); } } char * db_file = db_filename; int ret = bdb_init_tx_handle(s, db_env_path); if (ret != 0) { paxos_log_error("Failed to open DB handle"); } if (bdb_init_db(s, db_file) != 0) { paxos_log_error("Failed to open DB file"); return NULL; } free(db_env_path); return s; }
struct evpaxos_config* evpaxos_config_read(const char* path) { struct stat sb; FILE* f = NULL; char line[512]; int linenumber = 1; struct evpaxos_config* c = NULL; if ((f = fopen(path, "r")) == NULL) { perror("fopen"); goto failure; } if (stat(path, &sb) == -1) { perror("stat"); goto failure; } if (!S_ISREG(sb.st_mode)) { paxos_log_error("Error: %s is not a regular file\n", path); goto failure; } c = malloc(sizeof(struct evpaxos_config)); if (c == NULL) { perror("malloc"); goto failure; } memset(c, 0, sizeof(struct evpaxos_config)); while (fgets(line, sizeof(line), f) != NULL) { if (line[0] != '#' && line[0] != '\n') { if (parse_line(c, line) == 0) { paxos_log_error("Please, check line %d\n", linenumber); paxos_log_error("Error parsing config file %s\n", path); goto failure; } } linenumber++; } fclose(f); return c; failure: free(c); if (f != NULL) fclose(f); return NULL; }
/*
 * Opens the LMDB environment for the storage behind `handle`.
 *
 * The environment directory is "<paxos_config.lmdb_env_path>_<acceptor_id>".
 * When paxos_config.trash_files is set the directory is first removed with
 * "rm -r" (testing only). The directory is created when it does not exist,
 * then lmdb_storage_init() is called on it.
 *
 * Returns 0 on success. On failure lmdb_storage_close() is called on the
 * storage and -1 is returned. The path string is released on every path.
 */
static int lmdb_storage_open(void* handle)
{
	struct lmdb_storage* s = handle;
	char* lmdb_env_path = NULL;
	struct stat sb;
	int dir_exists, result;
	size_t lmdb_env_path_length = strlen(paxos_config.lmdb_env_path) + 16;

	lmdb_env_path = malloc(lmdb_env_path_length);
	if (lmdb_env_path == NULL) {
		paxos_log_error("Failed to allocate env dir path");
		goto error;
	}
	snprintf(lmdb_env_path, lmdb_env_path_length, "%s_%d",
		paxos_config.lmdb_env_path, s->acceptor_id);

	/* Trash existing files -- testing only. */
	if (paxos_config.trash_files) {
		char rm_command[600];
		int len = snprintf(rm_command, sizeof(rm_command),
			"rm -r %s", lmdb_env_path);
		if (len < 0 || (size_t)len >= sizeof(rm_command)) {
			paxos_log_error("Env dir path too long: %s", lmdb_env_path);
			goto error;
		}
		/* The directory may legitimately be absent; result is ignored. */
		system(rm_command);
	}

	dir_exists = (stat(lmdb_env_path, &sb) == 0);

	if (!dir_exists && (mkdir(lmdb_env_path, S_IRWXU) != 0)) {
		paxos_log_error("Failed to create env dir %s: %s",
			lmdb_env_path, strerror(errno));
		goto error;
	}

	/* Parenthesised so that `result` holds the return code itself. */
	if ((result = lmdb_storage_init(s, lmdb_env_path)) != 0) {
		paxos_log_error("Failed to open DB handle");
		goto error;
	}

	paxos_log_info("lmdb storage opened successfully");
	free(lmdb_env_path);
	return 0;

error:
	free(lmdb_env_path);
	if (s) {
		lmdb_storage_close(s);
	}
	return -1;
}
/*
 * Looks up the record stored under `iid` and decodes it into `out`.
 *
 * Returns 1 when a record was found and decoded, 0 when there is no record
 * or when the lookup failed (the failure is logged).
 */
static int lmdb_storage_get(void* handle, iid_t iid, paxos_accepted* out)
{
	struct lmdb_storage* store = handle;
	MDB_val k, v;
	int rc;

	memset(&v, 0, sizeof(v));
	k.mv_data = &iid;
	k.mv_size = sizeof(iid_t);

	rc = mdb_get(store->txn, store->dbi, &k, &v);

	if (rc == MDB_NOTFOUND) {
		paxos_log_debug("There is no record for iid: %d", iid);
		return 0;
	}
	if (rc != 0) {
		paxos_log_error("Could not find record for iid: %d : %s",
			iid, mdb_strerror(rc));
		return 0;
	}

	paxos_accepted_from_buffer(v.mv_data, out);
	/* The decoded record must carry the instance id it was stored under. */
	assert(iid == out->iid);
	return 1;
}
/*
 * Error callback for the connection listener `l`: logs the current socket
 * error and asks the listener's event loop to exit. `arg` is unused.
 */
static void on_listener_error(struct evconnlistener* l, void* arg)
{
	struct event_base* loop;
	int code;

	/* Capture the socket error before making any other call. */
	code = EVUTIL_SOCKET_ERROR();
	loop = evconnlistener_get_base(l);

	paxos_log_error("Listener error %d: %s. Shutting down event loop.",
		code, evutil_socket_error_to_string(code));

	event_base_loopexit(loop, NULL);
}
/*
 * Deletes stored records with instance id up to and including `iid`.
 *
 * The trim point is first recorded via lmdb_storage_put_trim_instance().
 * A cursor then walks the database with MDB_NEXT, deleting every entry
 * whose key is non-zero and <= iid (key 0 is the one the trim point is
 * stored under), and stops at the first key greater than `iid` or when
 * the cursor reports an error or the end of the data.
 *
 * Does nothing when `iid` is 0. Always returns 0; failures are logged.
 */
static int lmdb_storage_trim(void* handle, iid_t iid)
{
	struct lmdb_storage* s = handle;
	int result;
	iid_t min = 0;
	MDB_cursor* cursor = NULL;
	MDB_val key, data;

	if (iid == 0)
		return 0;

	lmdb_storage_put_trim_instance(handle, iid);

	if ((result = mdb_cursor_open(s->txn, s->dbi, &cursor)) != 0) {
		paxos_log_error("Could not create cursor. %s", mdb_strerror(result));
		goto cleanup_exit;
	}

	key.mv_data = &min;
	key.mv_size = sizeof(iid_t);

	do {
		if ((result = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) != 0)
			goto cleanup_exit;

		/* Compare, do not assign: every key must be exactly one iid_t. */
		assert(key.mv_size == sizeof(iid_t));
		min = *(iid_t*)key.mv_data;

		if (min != 0 && min <= iid) {
			/* Keep the return code so the log shows the real error. */
			if ((result = mdb_cursor_del(cursor, 0)) != 0) {
				paxos_log_error("mdb_cursor_del failed. %s",
					mdb_strerror(result));
				goto cleanup_exit;
			}
		}
	} while (min <= iid);

cleanup_exit:
	if (cursor) {
		mdb_cursor_close(cursor);
	}
	return 0;
}
/*
 * Closes the Berkeley DB database and environment held by `s`, then frees
 * `s` itself. Both handles are closed even if the first close fails.
 *
 * Returns 0 when both closes succeeded, -1 otherwise.
 */
int storage_close(struct storage* s)
{
	int result = 0;
	DB* dbp = s->db;
	DB_ENV* dbenv = s->env;

	/* The database handle is closed before the environment it lives in. */
	if (dbp->close(dbp, 0) != 0) {
		paxos_log_error("DB close failed");
		result = -1;
	}

	if (dbenv->close(dbenv, 0) != 0) {
		paxos_log_error("DB_ENV close failed");
		result = -1;
	}

	free(s);

	/* Only claim success when nothing failed. */
	if (result == 0)
		paxos_log_info("Berkeley DB storage closed successfully");
	return result;
}
/*建立一个proposer对象,并启动它*/ struct evproposer* evproposer_init(int id, const char* config, struct event_base* b) { int port, acceptor_count; struct evproposer* p; /*读取配置文件*/ struct evpaxos_config* conf = evpaxos_config_read(config); if(conf == NULL) return NULL; /*非法的proposer id*/ if (id < 0 || id >= MAX_N_OF_PROPOSERS) { paxos_log_error("Invalid proposer id: %d", id); return NULL; } /*读取proposer的监听端口*/ port = evpaxos_proposer_listen_port(conf, id); /*读取acceptor的数量*/ acceptor_count = evpaxos_acceptor_count(conf); p = (struct evproposer *)malloc(sizeof(struct evproposer)); p->id = id; p->base = b; /*获得同时提交的议案数量*/ p->preexec_window = paxos_config.proposer_preexec_window; /*产生一个网络消息接收器*/ p->receiver = tcp_receiver_new(b, port, handle_request, p); /*产生一个acceptor的管理器*/ p->acceptors = peers_new(b); /*对每个acceptor发起连接*/ peers_connect_to_acceptors(p->acceptors, conf, handle_request, p); /*设置定时器*/ p->tv.tv_sec = paxos_config.proposer_timeout; p->tv.tv_usec = 0; /*产生一个libevent定时器事件对象,并设置一个定时器*/ p->timeout_ev = evtimer_new(b, proposer_check_timeouts, p); event_add(p->timeout_ev, &p->tv); /*产生一个proposer 消息处理器*/ p->state = proposer_new(p->id, acceptor_count); /*试探性执行prepare过程(提案第一阶段)*/ proposer_preexecute(p); evpaxos_config_free(conf); return p; }
static int bdb_init_db(struct storage* s, char* db_path) { int result; DB* dbp; //Create the DB file result = db_create(&(s->db), s->env, 0); if (result != 0) { paxos_log_error("Berkeley DB storage call to db_create failed: %s", db_strerror(result)); return -1; } dbp = s->db; // DB flags int flags = DB_CREATE; /*Create if not existing */ storage_tx_begin(s); //Open the DB file result = dbp->open(dbp, s->txn, /* Transaction pointer */ db_path, /* On-disk file that holds the database. */ NULL, /* Optional logical database name */ DB_BTREE, /* Database access method */ flags, /* Open flags */ 0); /* Default file permissions */ storage_tx_commit(s); if (result != 0) { paxos_log_error("Berkeley DB storage open failed: %s", db_strerror(result)); return -1; } return 0; }
static void on_peer_event(struct bufferevent* bev, short ev, void *arg) { struct peer* p = (struct peer*)arg; if (ev & BEV_EVENT_CONNECTED) { paxos_log_info("Connected to %s:%d", inet_ntoa(p->addr.sin_addr), ntohs(p->addr.sin_port)); p->status = ev; } else if (ev & BEV_EVENT_ERROR || ev & BEV_EVENT_EOF) { struct event_base* base; int err = EVUTIL_SOCKET_ERROR(); paxos_log_error("%s (%s:%d)", evutil_socket_error_to_string(err), inet_ntoa(p->addr.sin_addr), ntohs(p->addr.sin_port)); base = bufferevent_get_base(p->bev); bufferevent_free(p->bev); p->bev = bufferevent_socket_new(base, -1, BEV_OPT_CLOSE_ON_FREE); bufferevent_setcb(p->bev, on_read, NULL, on_peer_event, p); event_add(p->reconnect_ev, &reconnect_timeout); p->status = ev; } else { paxos_log_error("Event %d not handled", ev); } }
/*
 * Bufferevent event callback for an accepted client connection; `arg` is
 * the struct peer for that client.
 *
 * On EOF or error the client is removed from its owner's `clients` array:
 * later entries are shifted down one slot and their ids renumbered, the
 * array is shrunk, and the peer is released with free_peer(). Other events
 * are logged as unhandled.
 *
 * Presumably p->id is the index of `p` in the clients array -- the shift
 * loop relies on it.
 */
static void on_client_event(struct bufferevent* bev, short ev, void *arg)
{
	struct peer* p = (struct peer*)arg;

	if (ev & BEV_EVENT_EOF || ev & BEV_EVENT_ERROR) {
		int i;
		struct peer** clients = p->peers->clients;

		/* Close the gap left by the departing client. */
		for (i = p->id; i < p->peers->clients_count-1; ++i) {
			clients[i] = clients[i+1];
			clients[i]->id = i;
		}
		p->peers->clients_count--;

		if (p->peers->clients_count == 0) {
			/* Avoid realloc(ptr, 0), whose result is not portable. */
			free(clients);
			p->peers->clients = NULL;
		} else {
			struct peer** shrunk = realloc(clients,
				sizeof(struct peer*) * (p->peers->clients_count));
			/* On failure the old, larger array is still valid: keep it. */
			if (shrunk != NULL)
				p->peers->clients = shrunk;
		}

		free_peer(p);
	} else {
		paxos_log_error("Event %d not handled", ev);
	}
}
/*
 * Fetches the acceptor record stored under instance id `iid`.
 *
 * The lookup runs in the storage's current transaction with DB_DBT_MALLOC
 * set on the data DBT, so the returned buffer is the one filled in by the
 * get call.
 *
 * Returns the record, or NULL when no record exists or the read failed.
 */
acceptor_record* storage_get_record(struct storage* s, iid_t iid)
{
	DBT key, value;
	DB* db = s->db;
	acceptor_record* rec = NULL;
	int rc;

	memset(&key, 0, sizeof(DBT));
	memset(&value, 0, sizeof(DBT));

	/* The key is the instance id. */
	key.data = &iid;
	key.size = sizeof(iid_t);

	value.flags = DB_DBT_MALLOC;

	rc = db->get(db, s->txn, &key, &value, 0);

	switch (rc) {
	case 0:
		break;
	case DB_NOTFOUND:
	case DB_KEYEMPTY:
		paxos_log_debug("The record for iid: %d does not exist", iid);
		return NULL;
	default:
		paxos_log_error("Error while reading record with iid%u : %s",
			iid, db_strerror(rc));
		return NULL;
	}

	/* Record found: it must be present and carry the requested id. */
	rec = (acceptor_record*) value.data;
	assert(rec != NULL);
	assert(iid == rec->iid);
	return rec;
}
struct evproposer* evproposer_init(int id, const char* config_file, struct event_base* b) { int i; struct evproposer* p; struct config* conf = read_config(config_file); //读取配置文件 if (conf == NULL) return NULL; // Check id validity of proposer_id if (id < 0 || id >= MAX_N_OF_PROPOSERS) { //检查proposerid paxos_log_error("Invalid proposer id: %d", id); return NULL; } p = malloc(sizeof(struct evproposer)); p->id = id; p->base = b; p->preexec_window = paxos_config.proposer_preexec_window; //128 // Setup client listener p->receiver = tcp_receiver_new(b, &conf->proposers[id], handle_request, p); //创建新的接收器 // Setup connections to acceptors p->acceptors = peers_new(b, conf->acceptors_count); //连接池 for (i = 0; i < conf->acceptors_count; i++) peers_connect(p->acceptors, &conf->acceptors[i], handle_request, p); //连接各个acceptor // Setup timeout p->tv.tv_sec = paxos_config.proposer_timeout; p->tv.tv_usec = 0; p->timeout_ev = evtimer_new(b, proposer_check_timeouts, p); event_add(p->timeout_ev, &p->tv); //添加超时事件 p->state = proposer_new(p->id, conf->acceptors_count); //创建新的proposer状态机 free_config(conf); return p; }
/*
 * Stores `iid` as the value of key 0 in the storage's current transaction.
 * A failing put is logged and then trips the assertion. Always returns 0.
 */
static int lmdb_storage_put_trim_instance(void* handle, iid_t iid)
{
	struct lmdb_storage* store = handle;
	iid_t trim_key = 0;
	MDB_val k, v;
	int rc;

	k.mv_size = sizeof(iid_t);
	k.mv_data = &trim_key;

	v.mv_size = sizeof(iid_t);
	v.mv_data = &iid;

	rc = mdb_put(store->txn, store->dbi, &k, &v, 0);
	if (rc != 0) {
		paxos_log_error("%s\n", mdb_strerror(rc));
	}
	assert(rc == 0);

	return 0;
}
/*proposer处理网络消息接口*/ static void proposer_handle_msg(struct evproposer* p, struct bufferevent* bev) { paxos_msg msg; struct evbuffer* in; char* buffer = NULL; /*解读消息头*/ in = bufferevent_get_input(bev); evbuffer_remove(in, &msg, sizeof(paxos_msg)); /*解读消息体*/ if (msg.data_size > 0) { buffer = malloc(msg.data_size); evbuffer_remove(in, buffer, msg.data_size); } /*处理消息*/ switch (msg.type){ case prepare_acks: proposer_handle_prepare_ack(p, (prepare_ack*)buffer); break; case accept_acks: proposer_handle_accept_ack(p, (accept_ack*)buffer); break; case submit: proposer_handle_client_msg(p, buffer, msg.data_size); break; default: paxos_log_error("Unknow msg type %d not handled", msg.type); return; } /*尝试发起提议的第二阶段,阶段性检查*/ try_accept(p); if (buffer != NULL) free(buffer); }
int peers_listen(struct peers* p, int port) { struct sockaddr_in addr; unsigned flags = LEV_OPT_CLOSE_ON_EXEC | LEV_OPT_CLOSE_ON_FREE | LEV_OPT_REUSEABLE; /* listen on the given port at address 0.0.0.0 */ memset(&addr, 0, sizeof(struct sockaddr_in)); addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(0); addr.sin_port = htons(port); p->listener = evconnlistener_new_bind(p->base, on_accept, p, flags, -1, (struct sockaddr*)&addr, sizeof(addr)); if (p->listener == NULL) { paxos_log_error("Failed to bind on port %d", port); return 0; } evconnlistener_set_error_cb(p->listener, on_listener_error); paxos_log_info("Listening on port %d", port); return 1; }
static iid_t lmdb_storage_get_trim_instance(void* handle) { struct lmdb_storage* s = handle; int result; iid_t iid = 0, k = 0; MDB_val key, data; key.mv_data = &k; key.mv_size = sizeof(iid_t); if ((result = mdb_get(s->txn, s->dbi, &key, &data)) != 0) { if (result != MDB_NOTFOUND) { paxos_log_error("mdb_get failed: %s", mdb_strerror(result)); assert(result == 0); } else { iid = 0; } } else { iid = *(iid_t*)data.mv_data; } return iid; }
static int lmdb_storage_init(struct lmdb_storage* s, char* db_env_path) { int result; MDB_env* env = NULL; MDB_txn* txn = NULL; MDB_dbi dbi = 0; if ((result = mdb_env_create(&env)) != 0) { paxos_log_error("Could not create lmdb environment. %s", mdb_strerror(result)); goto error; } if ((result = mdb_env_set_mapsize(env, paxos_config.lmdb_mapsize)) != 0) { paxos_log_error("Could not set lmdb map size. %s", mdb_strerror(result)); goto error; } if ((result = mdb_env_open(env, db_env_path, !paxos_config.lmdb_sync ? MDB_NOSYNC : 0 | MDB_INTEGERKEY, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH)) != 0) { paxos_log_error("Could not open lmdb environment at %s. %s", db_env_path, mdb_strerror(result)); goto error; } if ((result = mdb_txn_begin(env, NULL, 0, &txn)) != 0) { paxos_log_error("Could not start txn on lmdb environment at %s. %s", db_env_path, mdb_strerror(result)); goto error; } if ((result = mdb_open(txn, NULL, 0, &dbi)) != 0) { paxos_log_error("Could not open db on lmdb environment at %s. %s", db_env_path, mdb_strerror(result)); goto error; } if ((result = mdb_set_compare(txn, dbi, lmdb_compare_iid)) != 0) { paxos_log_error("Could setup compare function on lmdb " "environment at %s. %s", db_env_path, mdb_strerror(result)); goto error; } if ((result = mdb_txn_commit(txn)) != 0) { paxos_log_error("Could commit txn on lmdb environment at %s. %s", db_env_path, mdb_strerror(result)); goto error; } s->env = env; s->dbi = dbi; return 0; error: if (txn) { mdb_txn_abort(txn); } if (dbi) { mdb_close(env, dbi); } if (env) { mdb_env_close(env); } return -1; }
static int bdb_init_tx_handle(struct storage* s, char* db_env_path) { int result; DB_ENV* dbenv; //Create environment handle result = db_env_create(&dbenv, 0); if (result != 0) { paxos_log_error("DB_ENV creation failed: %s", db_strerror(result)); return -1; } //Durability mode if (!paxos_config.bdb_sync) result = dbenv->set_flags(dbenv, DB_TXN_WRITE_NOSYNC, 1); if (result != 0) { paxos_log_error("DB_ENV set_flags failed: %s", db_strerror(result)); return -1; } //Redirect errors to sdout dbenv->set_errfile(dbenv, stdout); //Set the size of the memory cache result = dbenv->set_cachesize(dbenv, 0, paxos_config.bdb_cachesize, 1); if (result != 0) { paxos_log_error("DB_ENV set_cachesize failed: %s", db_strerror(result)); return -1; } //TODO see page size impact //Set page size for this db // result = dbp->set_pagesize(dbp, pagesize); // assert(result == 0); //FIXME set log size // Environment open flags int flags; flags = DB_CREATE | /* Create if not existing */ DB_RECOVER | /* Run normal recovery. */ DB_INIT_LOCK | /* Initialize the locking subsystem */ DB_INIT_LOG | /* Initialize the logging subsystem */ DB_INIT_TXN | /* Initialize the transactional subsystem. */ DB_THREAD | /* Cause the environment to be free-threaded */ DB_REGISTER | DB_INIT_MPOOL; /* Initialize the memory pool (in-memory cache) */ //Open the DB environment result = dbenv->open(dbenv, db_env_path, /* Environment directory */ flags, /* Open flags */ 0); /* Default file permissions */ if (result != 0) { paxos_log_error("DB_ENV open failed: %s", db_strerror(result)); return -1; } paxos_log_info("Berkeley DB storage opened successfully"); s->env = dbenv; return 0; }
static int parse_line(struct evpaxos_config* c, char* line) { int rv; char* tok; char* sep = " "; struct option* opt; line = strtrim(line); tok = strsep(&line, sep); if (strcasecmp(tok, "a") == 0 || strcasecmp(tok, "acceptor") == 0) { if (c->acceptors_count >= MAX_N_OF_PROPOSERS) { paxos_log_error("Number of acceptors exceded maximum of: %d\n", MAX_N_OF_PROPOSERS); return 0; } struct address* addr = &c->acceptors[c->acceptors_count++]; return parse_address(line, addr); } if (strcasecmp(tok, "p") == 0 || strcasecmp(tok, "proposer") == 0) { if (c->proposers_count >= MAX_N_OF_PROPOSERS) { paxos_log_error("Number of proposers exceded maximum of: %d\n", MAX_N_OF_PROPOSERS); return 0; } struct address* addr = &c->proposers[c->proposers_count++]; return parse_address(line, addr); } if (strcasecmp(tok, "l") == 0 || strcasecmp(tok, "learner") == 0) { if (c->learners_count >= MAX_N_OF_PROPOSERS) { paxos_log_error("Number of learners exceded maximum of: %d\n", MAX_N_OF_PROPOSERS); return 0; } struct address* addr = &c->learners[c->learners_count++]; return parse_address(line, addr); } if (strcasecmp(tok, "r") == 0 || strcasecmp(tok, "replica") == 0) { if (c->proposers_count >= MAX_N_OF_PROPOSERS || c->acceptors_count >= MAX_N_OF_PROPOSERS ) { paxos_log_error("Number of replicas exceded maximum of: %d\n", MAX_N_OF_PROPOSERS); return 0; } struct address* pro_addr = &c->proposers[c->proposers_count++]; struct address* acc_addr = &c->acceptors[c->acceptors_count++]; int rv = parse_address(line, pro_addr); address_copy(pro_addr, acc_addr); return rv; } line = strtrim(line); opt = lookup_option(tok); if (opt == NULL) return 0; switch (opt->type) { case option_boolean: rv = parse_boolean(line, opt->value); if (rv == 0) paxos_log_error("Expected 'yes' or 'no'\n"); break; case option_integer: rv = parse_integer(line, opt->value); if (rv == 0) paxos_log_error("Expected number\n"); break; case option_string: rv = parse_string(line, opt->value); if (rv == 0) paxos_log_error("Expected 
string\n"); break; case option_verbosity: rv = parse_verbosity(line, opt->value); if (rv == 0) paxos_log_error("Expected quiet, error, info, or debug\n"); break; case option_backend: rv = parse_backend(line, opt->value); if (rv == 0) paxos_log_error("Expected memory, bdb or lmdb\n"); break; case option_bytes: rv = parse_bytes(line, opt->value); if (rv == 0) paxos_log_error("Expected number of bytes.\n"); } return rv; }