/** * paxos_sync - GEvent-friendly wrapper around proposer_sync. */ int paxos_sync(void *data) { pax_uuid_t *uuid; // Set the session. We parametrize paxos_sync with a pointer to a session // ID when we add it to the main event loop. uuid = (pax_uuid_t *)data; pax = session_find(&state.sessions, uuid); if (is_proposer()) { proposer_sync(); } return TRUE; }
/**
 * do_continue_ack_redirect - Continuation for a redirect acknowledgement,
 * run once the attempt to reconnect to the purported proposer has resolved.
 *
 * On a successful reconnection we relinquish our proposership: the pending
 * prepare is freed, the defer list is destroyed, and we reintroduce
 * ourselves with a hello.  On failure we prepare again.
 *
 * Returns 0 if the acceptor was already reconnected; otherwise the result of
 * paxos_hello() (reconnected) or proposer_prepare() (reconnection failed).
 */
int
do_continue_ack_redirect(GIOChannel *chan, struct paxos_acceptor *acc,
    struct paxos_continuation *k)
{
  // Sanity check: the acceptor we were redirected to must have a lower paxid
  // than our own.
  assert(acc->pa_paxid < pax->self_id);

  // A connection to this acceptor already exists, so we should have stopped
  // being the proposer some time ago; there is nothing left to do.
  if (acc->pa_peer != NULL) {
    assert(!is_proposer());
    return 0;
  }

  // The old prepare goes away whether or not the reconnection worked.
  g_free(pax->prep);
  pax->prep = NULL;

  // Try to register the reconnection.
  acc->pa_peer = paxos_peer_init(chan);

  if (acc->pa_peer == NULL) {
    // Reconnection failed.  Prepare again, continuing to append to the
    // defer list.
    return proposer_prepare(NULL);
  }

  // Reconnection succeeded; one more acceptor is live.
  pax->live_count++;

  // Adopt this acceptor as proposer unless we have meanwhile reconnected to
  // an even higher-ranked one.
  if (acc->pa_paxid < pax->proposer->pa_paxid) {
    pax->proposer = acc;
  }

  // We are done trying to prepare, so the defer list is no longer needed.
  // XXX: Do we want to somehow pass it to the real proposer?  How do we
  // know which requests were made for us?
  instance_container_destroy(&pax->idefer);

  // Say hello.
  return paxos_hello(acc);
}
/**
 * do_continue_ack_refuse - Continuation for a refuse acknowledgement, run
 * once the attempt to reconnect to the purported proposer has resolved.
 *
 * If the reconnection succeeded, we drop any prepare in progress, say hello
 * to the acceptor and, when it outranks our current proposer, adopt it as
 * proposer and resend the request carried by the continuation.
 *
 * Returns 0 if the reconnection is ignored or fails; otherwise the status
 * accumulated via ERR_ACCUM from the hello and the resend.
 */
int
do_continue_ack_refuse(GIOChannel *chan, struct paxos_acceptor *acc,
    struct paxos_continuation *k)
{
  int r = 0;
  struct paxos_header hdr;
  struct paxos_request *req;
  struct yakyak yy;

  // A proposer that has finished preparing treats every higher-ranked
  // acceptor as dead.  Their parts may not have gone through yet, though, so
  // reconnection attempts are deliberately ignored.
  if (is_proposer() && pax->prep == NULL) {
    return 0;
  }

  // Register the reconnection; without a peer there is nothing more to do.
  acc->pa_peer = paxos_peer_init(chan);
  if (acc->pa_peer == NULL) {
    return r;
  }

  // Account for a new acceptor.
  pax->live_count++;

  // Discard any prep we hold.  The refuse was acknowledged while we were an
  // acceptor, but by the time this continuation runs we may have become the
  // proposer; if we are mid-prepare, we simply give up.  Should the acceptor
  // we are reconnecting to fail, the drop will be noticed and we will
  // reprepare then.
  g_free(pax->prep);
  pax->prep = NULL;
  instance_container_destroy(&pax->idefer);

  // Say hello.
  ERR_ACCUM(r, paxos_hello(acc));

  // Leave the proposer alone if we have already reconnected to an even
  // higher-ranked acceptor; in that case nothing is resent.
  if (!(acc->pa_paxid < pax->proposer->pa_paxid)) {
    return r;
  }

  pax->proposer = acc;

  // Resend our request, preferring the copy in the request cache and falling
  // back to the one stored in the continuation.
  // XXX: What about the problematic case where A is connected to B, B
  // thinks it's the proposer and accepts A's request, but in fact B is not
  // the proposer and C, the real proposer, gets neither of their requests?
  header_init(&hdr, OP_REQUEST, pax->proposer->pa_paxid);

  req = request_find(&pax->rcache, k->pk_data.req.pr_val.pv_reqid);
  if (req == NULL) {
    req = &k->pk_data.req;
  }

  yakyak_init(&yy, 2);
  paxos_header_pack(&yy, &hdr);
  paxos_request_pack(&yy, req);
  ERR_ACCUM(r, paxos_send_to_proposer(&yy));
  yakyak_destroy(&yy);

  return r;
}
/**
 * paxos_commit - Commit a value for an instance of the Paxos protocol.
 *
 * @inst: The instance being committed.  It is marked committed immediately
 *        and marked cached once its request (if it needs one) is available.
 *
 * We totally order calls to paxos_learn by instance number in order to make
 * the join and greet protocols behave properly.  This also gives our chat
 * clients an easy mechanism for totally ordering their logs without extra
 * work on their part.
 *
 * It is possible that failed DEC_PART decrees (i.e., decrees in which the
 * proposer attempts to disconnect an acceptor who a majority of acceptors
 * believe is still alive) could delay the learning of committed chat
 * messages.  To avoid this, once a proposer receives enough rejections
 * of the decree, the part decree is replaced with a null decree.  The
 * proposer can then issue the part again with a higher instance number
 * if desired.
 *
 * Returns:
 *  - the result of paxos_retrieve() if the instance needs a cached request
 *    that is not in the request cache (the commit is deferred);
 *  - 0 if the instance did not fill the hole and we must wait (we are the
 *    proposer, or the hole is committed but awaiting a retrieve);
 *  - the result of acceptor_retry() if the hole is missing or uncommitted
 *    and we are not the proposer;
 *  - 0 after learning the contiguous run of commits starting at the hole.
 * NOTE(review): ERR_RET presumably returns early from this function when
 * paxos_learn() reports an error -- confirm against the macro definition.
 */
int paxos_commit(struct paxos_instance *inst)
{
  int r;
  struct paxos_request *req = NULL;
  struct paxos_instance *it;

  // Mark the commit.
  inst->pi_committed = true;

  // Pull the request from the request cache if applicable.
  if (request_needs_cached(inst->pi_val.pv_dkind)) {
    req = request_find(&pax->rcache, inst->pi_val.pv_reqid);

    // If we can't find a request and need one, send out a retrieve to the
    // request originator and defer the commit.  Note that pi_cached is left
    // unset on this path, which is how the learning loop below recognizes
    // an instance that is committed but still waiting on its request.
    if (req == NULL) {
      return paxos_retrieve(inst);
    }
  }

  // Mark the cache.
  inst->pi_cached = true;

  // We should already have committed and learned everything before the hole.
  assert(inst->pi_hdr.ph_inum >= pax->ihole);

  // Since we want our learns to be totally ordered, if we didn't just fill
  // the hole, we cannot learn.
  if (inst->pi_hdr.ph_inum != pax->ihole) {
    // If we're the proposer, we have to just wait it out.
    if (is_proposer()) {
      return 0;
    }

    // If the hole has committed but is just waiting on a retrieve, we'll learn
    // when we receive the resend.
    if (pax->istart->pi_hdr.ph_inum == pax->ihole && pax->istart->pi_committed) {
      assert(!pax->istart->pi_cached);
      return 0;
    }

    // The hole is either missing or uncommitted and we are not the proposer,
    // so issue a retry.
    return acceptor_retry(pax->ihole);
  }

  // Set pax->istart to point to the instance numbered pax->ihole.  If it does
  // not already, a single step forward is taken and the assertion below
  // checks that this was enough.
  if (pax->istart->pi_hdr.ph_inum != pax->ihole) {
    pax->istart = LIST_NEXT(pax->istart, pi_le);
  }
  assert(pax->istart->pi_hdr.ph_inum == pax->ihole);

  // Now learn as many contiguous commits as we can.  This function is the
  // only path by which we learn commits, and we always learn in contiguous
  // blocks.  Therefore, it is an invariant of our system that all the
  // instances numbered lower than pax->ihole are learned and committed, and
  // none of the instances geq to pax->ihole are learned (although some may
  // be committed).
  //
  // We iterate over the instance list, detecting and breaking if we find a
  // hole and learning whenever we don't.  pax->ihole is advanced in the loop
  // increment, so it only moves past an instance once the loop body has run
  // to completion for that instance (no break, no early return).
  for (it = pax->istart; ; it = LIST_NEXT(it, pi_le), ++pax->ihole) {
    // If we reached the end of the list, set pax->istart to the last existing
    // instance.
    if (it == (void *)&pax->ilist) {
      pax->istart = LIST_LAST(&pax->ilist);
      break;
    }

    // If we skipped over an instance number because we were missing an
    // instance, set pax->istart to the last instance before the hole.
    if (it->pi_hdr.ph_inum != pax->ihole) {
      pax->istart = LIST_PREV(it, pi_le);
      break;
    }

    // If we found an uncommitted or uncached instance, set pax->istart to it.
    if (!it->pi_committed || !it->pi_cached) {
      pax->istart = it;
      break;
    }

    // By our invariant, since we are past our original hole, no instance
    // should be learned.
    assert(!it->pi_learned);

    // Grab its associated request.  This is guaranteed to exist because we
    // have checked that pi_cached holds.  Decree kinds that do not need a
    // cached request are learned with a NULL request.
    req = NULL;
    if (request_needs_cached(it->pi_val.pv_dkind)) {
      req = request_find(&pax->rcache, it->pi_val.pv_reqid);
      assert(req != NULL);
    }

    // Learn the value.
    ERR_RET(r, paxos_learn(it, req));
  }

  return 0;
}
/** * paxos_learn - Do something useful with the value of a commit. * * Note that we cannot free up the instance or any request associated with * it until a sync. */ int paxos_learn(struct paxos_instance *inst, struct paxos_request *req) { int r = 0; struct paxos_acceptor *acc; // Mark the learn. inst->pi_learned = true; // Act on the decree (e.g., display chat, record acceptor list changes). switch (inst->pi_val.pv_dkind) { case DEC_NULL: break; case DEC_CHAT: // Grab the message sender. acc = acceptor_find(&pax->alist, req->pr_val.pv_reqid.id); assert(acc != NULL); // Invoke client learning callback. state.learn.chat(req->pr_data, req->pr_size, acc->pa_desc, acc->pa_size, pax->client_data); break; case DEC_JOIN: // Check the adefer list to see if we received a hello already for the // newly joined acceptor. acc = acceptor_find(&pax->adefer, inst->pi_hdr.ph_inum); if (acc != NULL) { // We found a deferred hello. To complete the hello, just move our // acceptor over to the alist and increment the live count. LIST_REMOVE(&pax->adefer, acc, pa_le); pax->live_count++; } else { // We have not yet gotten the hello, so create a new acceptor. acc = g_malloc0(sizeof(*acc)); acc->pa_paxid = inst->pi_hdr.ph_inum; } acceptor_insert(&pax->alist, acc); // Copy over the identity information. acc->pa_size = req->pr_size; acc->pa_desc = g_memdup(req->pr_data, req->pr_size); // If we are the proposer, we are responsible for connecting to the new // acceptor, as well as for sending the new acceptor its paxid and other // initial data. if (is_proposer()) { proposer_welcome(acc); } // Invoke client learning callback. state.learn.join(req->pr_data, req->pr_size, acc->pa_desc, acc->pa_size, pax->client_data); break; case DEC_PART: case DEC_KILL: // Grab the acceptor from the alist. 
acc = acceptor_find(&pax->alist, inst->pi_val.pv_extra); if (acc == NULL) { // It is possible that we may part twice; for instance, if a proposer // issues a part for itself but its departure from the system is // detected by acceptors before the part commit is received. In this // case, just do nothing. break; } // Invoke client learning callback. state.learn.part(acc->pa_desc, acc->pa_size, acc->pa_desc, acc->pa_size, pax->client_data); // If we are being parted, leave the protocol. if (acc->pa_paxid == pax->self_id) { return paxos_end(pax); } // Take the parted acceptor off the list and do accounting if it was // still live. LIST_REMOVE(&pax->alist, acc, pa_le); if (acc->pa_peer != NULL) { pax->live_count--; } // If we just parted our proposer, "elect" a new one. If it's us, send // a prepare. if (acc->pa_paxid == pax->proposer->pa_paxid) { reset_proposer(); if (is_proposer()) { r = proposer_prepare(acc); } } // Free the parted acceptor. acceptor_destroy(acc); break; } return r; }
/**
 * paxos_request - Request that the proposer make a decree for us.
 *
 * @session: The session to act in, as handed to the client when the session
 *           was created.  It becomes the current session.
 * @dkind:   The kind of decree being requested.
 * @msg:     Data to attach to the request; a copy is made.
 * @len:     Size of @msg in bytes.
 *
 * If the request has data attached to it, we broadcast an out-of-band message
 * to all acceptors, asking that they cache our message until the proposer
 * commits it.
 *
 * We send the request as a header along with a two-object array consisting
 * of a paxos_value (itself an array) and a msgpack raw (i.e., a data
 * string).
 *
 * Ownership: a request that needs caching is inserted into the session's
 * request cache.  A request that does not need caching and is not handed to
 * proposer_decree_request() is freed before returning.
 *
 * Returns 1 if @session is NULL, a nonzero send status if sending fails, the
 * result of proposer_decree_request() if we are the proposer, and 0
 * otherwise.
 */
int
paxos_request(struct paxos_session *session, dkind_t dkind, const void *msg,
    size_t len)
{
  int r, needs_cached;
  struct paxos_header hdr;
  struct paxos_request *req;
  struct paxos_yak py;

  // Set the session.  The client should pass us a pointer to the correct
  // session object which we returned when the session was created.
  pax = session;

  // We can't make requests if we're not part of a protocol.
  if (pax == NULL) {
    return 1;
  }

  // Do we need to cache this request?
  needs_cached = request_needs_cached(dkind);

  // Initialize a header.  We overload ph_inum to the ID of the acceptor who
  // we believe to be the proposer.
  header_init(&hdr, OP_REQUEST, pax->proposer->pa_paxid);

  // Allocate a request and initialize it.
  req = g_malloc0(sizeof(*req));
  req->pr_val.pv_dkind = dkind;
  req->pr_val.pv_reqid.id = pax->self_id;
  req->pr_val.pv_reqid.gen = (++pax->req_id);  // Increment our req_id.
  req->pr_val.pv_extra = 0; // Always 0 for requests.

  req->pr_size = len;
  req->pr_data = g_memdup(msg, len);

  // Add it to the request cache if needed.
  if (needs_cached) {
    request_insert(&pax->rcache, req);
  }

  if (!is_proposer() || needs_cached) {
    // We need to send iff either we are not the proposer or the request
    // has nontrivial data.
    paxos_payload_init(&py, 2);
    paxos_header_pack(&py, &hdr);
    paxos_request_pack(&py, req);

    // Broadcast only if it needs caching.
    if (!needs_cached) {
      r = paxos_send_to_proposer(&py);
    } else {
      r = paxos_broadcast(&py);
    }

    paxos_payload_destroy(&py);
    if (r) {
      // An uncached request only reaches this branch when we are not the
      // proposer, so nothing else holds it: release it instead of leaking.
      // NOTE(review): assumes paxos_request_pack serializes the request
      // rather than retaining the pointer -- the payload is destroyed above.
      if (!needs_cached) {
        g_free(req->pr_data);
        g_free(req);
      }
      return r;
    }
  }

  // Decree the request if we're the proposer.
  if (is_proposer()) {
    return proposer_decree_request(req);
  }

  // We are not the proposer.  A cached request now belongs to the request
  // cache; an uncached one has been sent and is referenced nowhere else, so
  // free it here.
  if (!needs_cached) {
    g_free(req->pr_data);
    g_free(req);
  }

  return 0;
}