/**
 * Apply every committed-but-unapplied log entry, oldest first.
 *
 * Calls raft_apply_entry() repeatedly while the last applied index is
 * behind the commit index.
 *
 * @param me_ The Raft server.
 * @return 0 once the last applied index has caught up with the commit index;
 *         otherwise the first non-zero code returned by raft_apply_entry()
 *         (this includes RAFT_ERR_SHUTDOWN).
 */
int raft_apply_all(raft_server_t* me_)
{
    while (raft_get_last_applied_idx(me_) < raft_get_commit_idx(me_))
    {
        int e = raft_apply_entry(me_);

        /* Stop on ANY failure, not only on shutdown: nothing here shows that
         * a failed apply advances the last applied index, so retrying it in
         * this loop could spin forever.
         * NOTE(review): assumes raft_apply_entry() returns 0 on success --
         * confirm against its definition. */
        if (0 != e)
            return e;
    }

    return 0;
}
/**
 * Tell whether the log entry named by an entry-response has been committed.
 *
 * @param me_ The Raft server.
 * @param r   Entry response holding the index and term of the entry.
 * @return 1 if the entry's index is at or below the commit index;
 *         0 if it is above the commit index or no entry exists at that index;
 *         -1 if the entry stored at that index has a different term.
 */
int raft_msg_entry_response_committed(raft_server_t* me_,
                                      const msg_entry_response_t* r)
{
    raft_entry_t* logged = raft_get_entry_from_idx(me_, r->idx);

    if (!logged)
        return 0;

    /* an entry from another leader has invalidated this entry message */
    if (logged->term != r->term)
        return -1;

    if (raft_get_commit_idx(me_) < r->idx)
        return 0;

    return 1;
}
/*
 * With commit index and last applied index both at 0, raft_apply_entry()
 * must leave both untouched. After one entry has been appended, a second
 * raft_apply_entry() is expected to leave both indices at 1.
 */
void TestRaft_server_wont_apply_entry_if_we_dont_have_entry_to_apply(CuTest* tc)
{
    char *payload = "aaa";
    raft_entry_t entry;
    void *server = raft_new();

    /* start with nothing committed and nothing applied */
    raft_set_commit_idx(server, 0);
    raft_set_last_applied_idx(server, 0);

    /* applying against an empty log changes neither index */
    raft_apply_entry(server);
    CuAssertTrue(tc, 0 == raft_get_last_applied_idx(server));
    CuAssertTrue(tc, 0 == raft_get_commit_idx(server));

    /* give the server a single entry ... */
    entry.term = 1;
    entry.id = 1;
    entry.data.buf = payload;
    entry.data.len = 3;
    raft_append_entry(server, &entry);

    /* ... and apply again */
    raft_apply_entry(server);
    CuAssertTrue(tc, 1 == raft_get_last_applied_idx(server));
    CuAssertTrue(tc, 1 == raft_get_commit_idx(server));
}
/*
 * A follower that holds four entries and then receives an appendentries
 * message with leader_commit = 3 must report success and end up with a
 * commit index of 3.
 */
void TestRaft_follower_recv_appendentries_set_commitidx_to_LeaderCommit(
    CuTest * tc)
{
    enum { ENTRY_COUNT = 4 };

    void *r = raft_new();
    raft_add_node(r, (void*)1, 1);
    raft_add_node(r, (void*)2, 0);

    msg_appendentries_t ae;
    msg_appendentries_response_t aer;

    /* first message: carries four entries, all of term 1, ids 1..4 */
    memset(&ae, 0, sizeof(ae));
    ae.term = 1;
    ae.prev_log_idx = 0;
    ae.prev_log_term = 1;

    msg_entry_t e[ENTRY_COUNT];
    memset(&e, 0, sizeof(e));
    int k;
    for (k = 0; k < ENTRY_COUNT; k++)
    {
        e[k].term = 1;
        e[k].id = k + 1;
    }
    ae.entries = e;
    ae.n_entries = ENTRY_COUNT;
    raft_recv_appendentries(r, 1, &ae, &aer);

    /* second message: no entries, only a leader commit index */
    memset(&ae, 0, sizeof(ae));
    ae.term = 1;
    ae.prev_log_term = 1;
    ae.prev_log_idx = 3;
    ae.leader_commit = 3;

    /* receipt of appendentries changes commit idx */
    raft_recv_appendentries(r, 1, &ae, &aer);
    CuAssertTrue(tc, 1 == aer.success);

    /* set to 3 because leaderCommit is lower */
    CuAssertTrue(tc, 3 == raft_get_commit_idx(r));
}
/**
 * Build an appendentries message for one peer and pass it to the
 * send_appendentries callback.
 *
 * The message carries the entries starting at the peer's next index, plus
 * the index/term of the entry just before them (both 0 when the peer's next
 * index is 1 or lower).
 *
 * @param me_  The Raft server.
 * @param node Destination peer; must be non-NULL and must not be this
 *             server's own node (both asserted).
 * @return -1 if no send_appendentries callback is registered, otherwise 0.
 *         The callback's own result is not inspected.
 */
int raft_send_appendentries(raft_server_t* me_, raft_node_t* node)
{
    raft_server_private_t* me = (raft_server_private_t*)me_;

    assert(node);
    assert(node != me->node);

    if (!me->cb.send_appendentries)
        return -1;

    msg_appendentries_t msg = {};
    msg.term = me->current_term;
    msg.leader_commit = raft_get_commit_idx(me_);

    int first_new_idx = raft_node_get_next_idx(node);
    msg.entries = raft_get_entries_from_idx(me_, first_new_idx, &msg.n_entries);

    if (first_new_idx <= 1)
    {
        /* nothing precedes the entries being sent */
        msg.prev_log_idx = 0;
        msg.prev_log_term = 0;
    }
    else
    {
        /* previous log is the log just before the new logs */
        raft_entry_t* before = raft_get_entry_from_idx(me_, first_new_idx - 1);

        msg.prev_log_idx = first_new_idx - 1;
        if (before)
            msg.prev_log_term = before->term;
        else
            msg.prev_log_term = 0;
    }

    __log(me_, node, "sending appendentries node: ci:%d t:%d lc:%d pli:%d plt:%d",
          raft_get_current_idx(me_),
          msg.term,
          msg.leader_commit,
          msg.prev_log_idx,
          msg.prev_log_term);

    me->cb.send_appendentries(me_, me->udata, node, &msg);
    return 0;
}
/**
 * Handle an appendentries message and reply to the sender through the
 * `send` callback (when one is registered).
 *
 * The reply's `success` field is 1 when the message was accepted and all of
 * its entries were appended, 0 otherwise.
 *
 * @param me_  The Raft server.
 * @param node Identifier of the sending node; passed back to the callback.
 * @param ae   The received appendentries message.
 * @return Always 1.
 */
int raft_recv_appendentries(
    raft_server_t* me_,
    const int node,
    msg_appendentries_t* ae)
{
    int i;
    raft_server_private_t* me = (void*)me_;
    msg_appendentries_response_t r;

    /* The whole struct is handed to cb.send as raw bytes, so zero it first:
     * the failure paths below only fill in term and success, and would
     * otherwise transmit indeterminate stack contents. */
    memset(&r, 0, sizeof(r));

    me->timeout_elapsed = 0;

    __log(me_, NULL, "received appendentries from: %d", node);

    r.term = me->current_term;

    /* we've found a leader who is legitimate */
    if (raft_is_leader(me_) && me->current_term <= ae->term)
        raft_become_follower(me_);

    /* 1. Reply false if term < currentTerm (§5.1) */
    if (ae->term < me->current_term)
    {
        __log(me_, NULL, "AE term is less than current term");
        r.success = 0;
        goto done;
    }

#if 0
    if (-1 != ae->prev_log_idx &&
        ae->prev_log_idx < raft_get_current_idx(me_))
    {
        __log(me_, NULL, "AE prev_idx is less than current idx");
        r.success = 0;
        goto done;
    }
#endif

    /* not the first appendentries we've received */
    if (0 != ae->prev_log_idx)
    {
        raft_entry_t* e = raft_get_entry_from_idx(me_, ae->prev_log_idx);

        if (!e)
        {
            __log(me_, NULL, "AE no log at prev_idx");
            r.success = 0;
            goto done;
        }

        /* 2. Reply false if log doesn't contain an entry at prevLogIndex
           whose term matches prevLogTerm (§5.3) */
        if (e->term != ae->prev_log_term)
        {
            __log(me_, NULL, "AE term doesn't match prev_idx");
            r.success = 0;
            goto done;
        }

        /* 3. If an existing entry conflicts with a new one (same index
           but different terms), delete the existing entry and all that
           follow it (§5.3).
           As written, anything stored after prev_log_idx is deleted
           whenever it exists; no term comparison is made here. */
        if (raft_get_entry_from_idx(me_, ae->prev_log_idx + 1))
            log_delete(me->log, ae->prev_log_idx + 1);
    }

    /* 5. If leaderCommit > commitIndex, set commitIndex =
        min(leaderCommit, last log index) */
    if (raft_get_commit_idx(me_) < ae->leader_commit)
    {
        raft_entry_t* e = log_peektail(me->log);

        if (e)
        {
            raft_set_commit_idx(me_, e->id < ae->leader_commit ?
                                e->id : ae->leader_commit);
            /* keep applying for as long as raft_apply_entry() returns 1 */
            while (1 == raft_apply_entry(me_))
                ;
        }
    }

    if (raft_is_candidate(me_))
        raft_become_follower(me_);

    raft_set_current_term(me_, ae->term);

    /* append all entries to log */
    for (i = 0; i < ae->n_entries; i++)
    {
        msg_entry_t* cmd = &ae->entries[i];
        raft_entry_t* c;

        /* TODO: replace malloc with mempool/arena */
        c = malloc(sizeof(*c));
        if (!c)
        {
            __log(me_, NULL, "AE failure; out of memory");
            r.success = 0;
            goto done;
        }

        c->term = me->current_term;
        c->len = cmd->len;
        c->id = cmd->id;
        c->data = malloc(cmd->len);

        /* malloc(0) may legitimately return NULL, so a NULL result is only
         * a failure when there is payload to copy */
        if (!c->data && 0 < cmd->len)
        {
            free(c);
            __log(me_, NULL, "AE failure; out of memory");
            r.success = 0;
            goto done;
        }
        if (0 < cmd->len)
            memcpy(c->data, cmd->data, cmd->len);

        /* NOTE(review): ownership of `c` after a failed append is not
         * visible from here, so it is deliberately not freed -- confirm
         * against raft_append_entry(). */
        if (0 == raft_append_entry(me_, c))
        {
            __log(me_, NULL, "AE failure; couldn't append entry");
            r.success = 0;
            goto done;
        }
    }

    r.success = 1;
    r.current_idx = raft_get_current_idx(me_);
    r.first_idx = ae->prev_log_idx + 1;

done:
    if (me->cb.send)
        me->cb.send(me->cb_ctx, me, node, RAFT_MSG_APPENDENTRIES_RESPONSE,
                    (void*)&r, sizeof(msg_appendentries_response_t));
    return 1;
}
/**
 * Handle an appendentries message and fill in the response for the caller
 * to send.
 *
 * Whether the message was accepted is reported through r->success, not
 * through the return value.
 *
 * @param me_  The Raft server.
 * @param node Identifier of the sending node (used for logging only).
 * @param ae   The received appendentries message.
 * @param r    Response to populate. `term` and `success` are always set;
 *             `current_idx` and `first_idx` are 0 unless the message was
 *             accepted.
 * @return 0 when the message was accepted or rejected by a protocol check;
 *         -1 when an entry could not be allocated or appended.
 */
int raft_recv_appendentries(
    raft_server_t* me_,
    const int node,
    msg_appendentries_t* ae,
    msg_appendentries_response_t *r
    )
{
    raft_server_private_t* me = (raft_server_private_t*)me_;

    me->timeout_elapsed = 0;

    __log(me_, "received appendentries from: %d", node);

    r->term = me->current_term;

    /* Give the index fields a defined value up front: the rejection paths
     * below return without touching them, and callers may pass an
     * uninitialised response struct. */
    r->current_idx = 0;
    r->first_idx = 0;

    /* we've found a leader who is legitimate */
    if (raft_is_leader(me_) && me->current_term <= ae->term)
        raft_become_follower(me_);

    /* 1. Reply false if term < currentTerm (§5.1) */
    if (ae->term < me->current_term)
    {
        __log(me_, "AE term is less than current term");
        r->success = 0;
        return 0;
    }

#if 0
    if (-1 != ae->prev_log_idx &&
        ae->prev_log_idx < raft_get_current_idx(me_))
    {
        __log(me_, "AE prev_idx is less than current idx");
        r->success = 0;
        return 0;
    }
#endif

    /* not the first appendentries we've received */
    if (0 != ae->prev_log_idx)
    {
        raft_entry_t* e = raft_get_entry_from_idx(me_, ae->prev_log_idx);

        if (!e)
        {
            __log(me_, "AE no log at prev_idx");
            r->success = 0;
            return 0;
        }

        /* 2. Reply false if log doesn't contain an entry at prevLogIndex
           whose term matches prevLogTerm (§5.3) */
        if (e->term != ae->prev_log_term)
        {
            __log(me_, "AE term doesn't match prev_idx");
            r->success = 0;
            return 0;
        }

        /* 3. If an existing entry conflicts with a new one (same index
           but different terms), delete the existing entry and all that
           follow it (§5.3).
           As written, anything stored after prev_log_idx is deleted
           whenever it exists; no term comparison is made here. */
        if (raft_get_entry_from_idx(me_, ae->prev_log_idx + 1))
            log_delete(me->log, ae->prev_log_idx + 1);
    }

    /* 5. If leaderCommit > commitIndex, set commitIndex =
        min(leaderCommit, last log index) */
    if (raft_get_commit_idx(me_) < ae->leader_commit)
    {
        raft_entry_t* e = log_peektail(me->log);

        if (e)
        {
            int id = e->id < ae->leader_commit ?
                     e->id : ae->leader_commit;
            raft_set_commit_idx(me_, id);
            /* keep applying for as long as raft_apply_entry() returns 0 */
            while (0 == raft_apply_entry(me_))
                ;
        }
    }

    if (raft_is_candidate(me_))
        raft_become_follower(me_);

    raft_set_current_term(me_, ae->term);

    int i;

    /* append all entries to log */
    for (i = 0; i < ae->n_entries; i++)
    {
        msg_entry_t* cmd = &ae->entries[i];

        /* TODO: replace malloc with mempool/arena */
        raft_entry_t* c = (raft_entry_t*)malloc(sizeof(raft_entry_t));
        if (!c)
        {
            __log(me_, "AE failure; out of memory");
            r->success = 0;
            return -1;
        }

        c->term = me->current_term;
        c->len = cmd->len;
        c->id = cmd->id;
        c->data = (unsigned char*)malloc(cmd->len);

        /* malloc(0) may legitimately return NULL, so a NULL result is only
         * a failure when there is payload to copy */
        if (!c->data && 0 < cmd->len)
        {
            free(c);
            __log(me_, "AE failure; out of memory");
            r->success = 0;
            return -1;
        }
        if (0 < cmd->len)
            memcpy(c->data, cmd->data, cmd->len);

        /* NOTE(review): ownership of `c` after a failed append is not
         * visible from here, so it is deliberately not freed -- confirm
         * against raft_append_entry(). */
        if (-1 == raft_append_entry(me_, c))
        {
            __log(me_, "AE failure; couldn't append entry");
            r->success = 0;
            return -1;
        }
    }

    r->success = 1;
    r->current_idx = raft_get_current_idx(me_);
    r->first_idx = ae->prev_log_idx + 1;
    return 0;
}
/**
 * Call raft_apply_entry() until the last applied index is no longer behind
 * the commit index.
 *
 * The result of each raft_apply_entry() call is ignored.
 *
 * @param me_ The Raft server.
 */
void raft_apply_all(raft_server_t* me_)
{
    for (;;)
    {
        /* done once nothing committed is left unapplied */
        if (!(raft_get_last_applied_idx(me_) < raft_get_commit_idx(me_)))
            break;

        raft_apply_entry(me_);
    }
}
/**
 * Handle an appendentries message from `node` and fill in the response.
 *
 * On acceptance the entries that are not already present are appended, the
 * commit index is raised towards ae->leader_commit, and `node` is recorded
 * as the current leader.
 *
 * @param me_  The Raft server.
 * @param node The sending node.
 * @param ae   The received appendentries message.
 * @param r    Response to populate: `term`, `success`, `current_idx` and
 *             `first_idx` are all written on every path.
 * @return 0 if the message was accepted (r->success == 1),
 *         -1 if it was rejected (r->success == 0, r->first_idx == 0).
 */
int raft_recv_appendentries(
    raft_server_t* me_,
    raft_node_t* node,
    msg_appendentries_t* ae,
    msg_appendentries_response_t *r
    )
{
    raft_server_private_t* me = (raft_server_private_t*)me_;

    me->timeout_elapsed = 0;

    /* only messages that carry entries are logged */
    if (0 < ae->n_entries)
        __log(me_, node, "recvd appendentries from: %lx, t:%d ci:%d lc:%d pli:%d plt:%d #%d",
              node,
              ae->term,
              raft_get_current_idx(me_),
              ae->leader_commit,
              ae->prev_log_idx,
              ae->prev_log_term,
              ae->n_entries);

    r->term = me->current_term;

    if (raft_is_candidate(me_) && me->current_term == ae->term)
    {
        /* a leader already exists for the term we are campaigning in */
        me->voted_for = -1;
        raft_become_follower(me_);
    }
    else if (me->current_term < ae->term)
    {
        /* sender is ahead of us: adopt its term */
        raft_set_current_term(me_, ae->term);
        r->term = ae->term;
        raft_become_follower(me_);
    }
    else if (ae->term < me->current_term)
    {
        /* 1. Reply false if term < currentTerm (§5.1) */
        __log(me_, node, "AE term %d is less than current term %d",
              ae->term, me->current_term);
        goto fail_with_current_idx;
    }

    /* Not the first appendentries we've received */
    /* NOTE: the log starts at 1 */
    if (0 < ae->prev_log_idx)
    {
        raft_entry_t* prev_ety = raft_get_entry_from_idx(me_, ae->prev_log_idx);

        if (!prev_ety)
        {
            __log(me_, node, "AE no log at prev_idx %d", ae->prev_log_idx);
            goto fail_with_current_idx;
        }

        /* 2. Reply false if log doesn't contain an entry at prevLogIndex
           whose term matches prevLogTerm (§5.3) */
        if (raft_get_current_idx(me_) < ae->prev_log_idx)
            goto fail_with_current_idx;

        if (prev_ety->term != ae->prev_log_term)
        {
            __log(me_, node, "AE term doesn't match prev_term (ie. %d vs %d) ci:%d pli:%d",
                  prev_ety->term, ae->prev_log_term,
                  raft_get_current_idx(me_), ae->prev_log_idx);
            assert(me->commit_idx < ae->prev_log_idx);
            /* Delete all the following log entries because they don't match */
            log_delete(me->log, ae->prev_log_idx);
            r->current_idx = ae->prev_log_idx - 1;
            goto fail;
        }
    }

    /* 3. If an existing entry conflicts with a new one (same index
       but different terms), delete the existing entry and all that
       follow it (§5.3) */
    if (ae->n_entries == 0 &&
        0 < ae->prev_log_idx &&
        ae->prev_log_idx + 1 < raft_get_current_idx(me_))
    {
        /* message without entries: drop what we hold past prev_log_idx */
        assert(me->commit_idx < ae->prev_log_idx + 1);
        log_delete(me->log, ae->prev_log_idx + 1);
    }

    r->current_idx = ae->prev_log_idx;

    /* Walk over the incoming entries we already hold; stop at the first one
     * that is missing locally or whose term differs. */
    int i;
    for (i = 0; i < ae->n_entries; i++)
    {
        msg_entry_t* incoming = &ae->entries[i];
        int incoming_idx = ae->prev_log_idx + 1 + i;
        raft_entry_t* stored = raft_get_entry_from_idx(me_, incoming_idx);

        r->current_idx = incoming_idx;

        if (!stored)
            break;

        if (stored->term != incoming->term)
        {
            assert(me->commit_idx < incoming_idx);
            log_delete(me->log, incoming_idx);
            break;
        }
    }

    /* Pick up remainder in case of mismatch or missing entry */
    while (i < ae->n_entries)
    {
        int rc = raft_append_entry(me_, &ae->entries[i]);
        if (-1 == rc)
            goto fail_with_current_idx;
        r->current_idx = ae->prev_log_idx + 1 + i;
        i++;
    }

    /* 4. If leaderCommit > commitIndex, set commitIndex =
        min(leaderCommit, index of most recent entry) */
    if (raft_get_commit_idx(me_) < ae->leader_commit)
    {
        int last_log_idx = max(raft_get_current_idx(me_), 1);
        raft_set_commit_idx(me_, min(last_log_idx, ae->leader_commit));
    }

    /* update current leader because we accepted appendentries from it */
    me->current_leader = node;

    r->success = 1;
    r->first_idx = ae->prev_log_idx + 1;
    return 0;

fail_with_current_idx:
    r->current_idx = raft_get_current_idx(me_);
fail:
    r->success = 0;
    r->first_idx = 0;
    return -1;
}
/**
 * Handle a peer's response to an appendentries message.
 *
 * On a failed response the peer's next index is moved back and the message
 * is retried. On a successful response the peer's next/match indices are
 * advanced, the commit index is raised if enough voting nodes have matched,
 * and any remaining entries are sent straight away.
 *
 * @param me_  The Raft server.
 * @param node The responding node; may be NULL when the sender is not part
 *             of this server's configuration.
 * @param r    The received response.
 * @return -1 if this server is not the leader (and the response was not
 *         discarded as stale first); 0 in every other case.
 */
int raft_recv_appendentries_response(raft_server_t* me_,
                                     raft_node_t* node,
                                     msg_appendentries_response_t* r)
{
    raft_server_private_t* me = (raft_server_private_t*)me_;

    __log(me_, node,
          "received appendentries response %s ci:%d rci:%d 1stidx:%d",
          r->success == 1 ? "SUCCESS" : "fail",
          raft_get_current_idx(me_),
          r->current_idx,
          r->first_idx);

    /* Stale response -- ignore.
     * `node` is checked first because it may be NULL here (see below) and
     * raft_node_get_match_idx() is not shown to accept a NULL node. */
    if (node &&
        r->current_idx != 0 &&
        r->current_idx <= raft_node_get_match_idx(node))
        return 0;

    if (!raft_is_leader(me_))
        return -1;

    /* If response contains term T > currentTerm: set currentTerm = T
       and convert to follower (§5.3) */
    if (me->current_term < r->term)
    {
        raft_set_current_term(me_, r->term);
        raft_become_follower(me_);
        return 0;
    }
    else if (me->current_term != r->term)
        return 0;

    /* stop processing, this is a node we don't have in our configuration */
    if (!node)
        return 0;

    if (0 == r->success)
    {
        /* If AppendEntries fails because of log inconsistency:
           decrement nextIndex and retry (§5.3) */
        int next_idx = raft_node_get_next_idx(node);
        assert(0 <= next_idx);

        /* jump straight back to what the peer reports when it is further
         * behind than a single step, otherwise step back by one */
        if (r->current_idx < next_idx - 1)
            raft_node_set_next_idx(node, min(r->current_idx + 1, raft_get_current_idx(me_)));
        else
            raft_node_set_next_idx(node, next_idx - 1);

        /* retry */
        raft_send_appendentries(me_, node);
        return 0;
    }

    assert(r->current_idx <= raft_get_current_idx(me_));

    raft_node_set_next_idx(node, r->current_idx + 1);
    raft_node_set_match_idx(node, r->current_idx);

    /* A non-voting node that is within one entry of our log is reported
     * through the node_has_sufficient_logs callback, at most once, and only
     * while voting_cfg_change_log_idx is -1. */
    if (!raft_node_is_voting(node) &&
        -1 == me->voting_cfg_change_log_idx &&
        raft_get_current_idx(me_) <= r->current_idx + 1 &&
        me->cb.node_has_sufficient_logs &&
        0 == raft_node_has_sufficient_logs(node)
        )
    {
        raft_node_set_has_sufficient_logs(node);
        me->cb.node_has_sufficient_logs(me_, me->udata, node);
    }

    /* Update commit idx */
    int votes = 1; /* include me */
    int point = r->current_idx;
    int i;
    for (i = 0; i < me->num_nodes; i++)
    {
        if (me->node == me->nodes[i] || !raft_node_is_voting(me->nodes[i]))
            continue;

        int match_idx = raft_node_get_match_idx(me->nodes[i]);

        if (0 < match_idx)
        {
            raft_entry_t* ety = raft_get_entry_from_idx(me_, match_idx);

            /* the lookup can come back empty; such a node simply does not
             * count towards the majority */
            if (ety && ety->term == me->current_term && point <= match_idx)
                votes++;
        }
    }

    if (me->num_nodes / 2 < votes && raft_get_commit_idx(me_) < point)
        raft_set_commit_idx(me_, point);

    /* Aggressively send remaining entries */
    if (raft_get_entry_from_idx(me_, raft_node_get_next_idx(node)))
        raft_send_appendentries(me_, node);

    /* periodic applies committed entries lazily */

    return 0;
}