/**
 * acceptor_ack_refuse - Resolve an acceptor's claim that we do not know
 * the true proposer.
 *
 * If we send a request to someone who is not the proposer, but identifying
 * them as the proposer, we will receive a refuse.  Since the correctness
 * of the Paxos protocol guarantees that the acceptor list has a consistent
 * total ordering across the system, receiving a refuse means that there is
 * someone more fitting to be proposer than the acceptor we identified.
 *
 * Note, as with ack_redirect, that it is possible we noticed a proposer
 * failure and sent our request to the new proposer correctly before the new
 * proposer themselves recognized the failure.
 *
 * @hdr: Header of the refuse.  ph_inum is read as the paxid of the acceptor
 *       that the sender holds to be the proposer.
 * @o:   Payload of the refuse.  Must be a msgpack array with at least two
 *       elements; the second element is unpacked as the value of the
 *       refused request.
 *
 * Returns 0 if the refuse is ignored or a reconnection attempt was started.
 * The result of state.connect() is passed through ERR_RET (presumably
 * returning a failure code to our caller -- confirm against the macro).
 */
int acceptor_ack_refuse(struct paxos_header *hdr, msgpack_object *o)
{
  int r;
  struct paxos_acceptor *acc;
  struct paxos_continuation *k;

  // Check whether, since we sent our request, we have already found a more
  // suitable proposer, possibly due to another redirect, in which case we
  // can ignore this one.
  if (pax->proposer->pa_paxid <= hdr->ph_inum) {
    return 0;
  }

  // Pull out the acceptor struct corresponding to the purported proposer.
  // The lookup can come back empty (other callers in this file check for
  // that), in which case there is nobody to reconnect to and we ignore the
  // refuse rather than dereference a NULL acceptor.
  acc = acceptor_find(&pax->alist, hdr->ph_inum);
  if (acc == NULL) {
    return 0;
  }

  // We should have already set the pa_peer of this acceptor to NULL to
  // indicate the lost connection.
  assert(acc->pa_peer == NULL);

  // Validate the payload before allocating anything; we read element 1.
  assert(o->type == MSGPACK_OBJECT_ARRAY);
  assert(o->via.array.size > 1);

  // Defer computation until the client performs connection.  If it succeeds,
  // resend the request.  We bind the request's value as callback data.
  k = continuation_new(continue_ack_refuse, acc->pa_paxid);
  paxos_value_unpack(&k->pk_data.req.pr_val, o->via.array.ptr + 1);

  ERR_RET(r, state.connect(acc->pa_desc, acc->pa_size, &k->pk_cb));
  return 0;
}
/**
 * paxos_ack_retrieve - Acknowledge a retrieve.
 *
 * Unpacks the retriever's ID and the value it is asking about, looks the
 * request up in our request cache, and, if we hold it, resends it to the
 * retriever.
 *
 * @hdr: Header of the retrieve; forwarded unchanged to paxos_resend().
 * @o:   Payload; must be a two-element msgpack array holding the
 *       retriever's paxid followed by the value being retrieved.
 *
 * Returns 0 if we do not have the request cached; otherwise returns
 * whatever paxos_resend() returns.
 */
int paxos_ack_retrieve(struct paxos_header *hdr, msgpack_object *o)
{
  paxid_t retriever_id;
  msgpack_object *elems;
  struct paxos_value wanted;
  struct paxos_request *cached;
  struct paxos_acceptor *retriever;

  // The payload has to be exactly [paxid, value].
  assert(o->type == MSGPACK_OBJECT_ARRAY);
  assert(o->via.array.size == 2);
  elems = o->via.array.ptr;

  // First element: who is asking.  Second element: what they want.
  paxos_paxid_unpack(&retriever_id, &elems[0]);
  paxos_value_unpack(&wanted, &elems[1]);

  // Only cacheable decree kinds should ever be retrieved.
  assert(request_needs_cached(wanted.pv_dkind));

  // Nothing to do if the request is missing from our cache as well.
  cached = request_find(&pax->rcache, wanted.pv_reqid);
  if (cached == NULL) {
    return 0;
  }

  // We have it; look up the recipient and resend.
  // NOTE(review): the acceptor lookup result is handed to paxos_resend()
  // unchecked -- confirm that paxos_resend() tolerates a NULL acceptor.
  retriever = acceptor_find(&pax->alist, retriever_id);
  return paxos_resend(retriever, hdr, cached);
}
/** * paxos_retrieve - Ask the originator of request data to send us data which * we do not have in our cache. * * We call this function when and only when we are issued a commit for an * instance whose associated request is not in our request cache. */ int paxos_retrieve(struct paxos_instance *inst) { int r; struct paxos_header hdr; struct paxos_acceptor *acc; struct paxos_yak py; // Initialize a header. We set ph_inum to the instance number of the // request. header_init(&hdr, OP_RETRIEVE, inst->pi_hdr.ph_inum); // Pack the retrieve. paxos_payload_init(&py, 2); paxos_header_pack(&py, &hdr); paxos_payload_begin_array(&py, 2); paxos_paxid_pack(&py, pax->self_id); paxos_value_pack(&py, &inst->pi_val); // Determine the request originator and send. If we are no longer connected // to the request originator, broadcast the retrieve instead. acc = acceptor_find(&pax->alist, inst->pi_val.pv_reqid.id); if (acc == NULL || acc->pa_peer == NULL) { r = paxos_broadcast(&py); } else { r = paxos_send(acc, &py); } paxos_payload_destroy(&py); return r; }
/**
 * proposer_ack_request - Dispatch a request as a decree.
 *
 * @hdr: Header of the request.  The requester overloads ph_inum with the ID
 *       of the acceptor it believes to be the proposer.
 * @o:   Payload, unpacked into a freshly allocated request.
 *
 * Returns the result of decreeing the request, or of decreeing a part for
 * the requester when it named a lower-ranked acceptor as proposer.
 */
int proposer_ack_request(struct paxos_header *hdr, msgpack_object *o)
{
  struct paxos_request *request;
  struct paxos_acceptor *requester;

  // Allocate a zeroed request and unpack the payload into it.
  request = g_malloc0(sizeof *request);
  paxos_request_unpack(request, o);

  // Common case: the requester named us, or a higher-ranked acceptor, as
  // the proposer.
  //
  // A higher-ranked acceptor should be named only when we are preparing but
  // have lost our connection to the true proposer; in that case the decree
  // is deferred until after our prepare.  If we indeed are not the
  // proposer, our prepare will fail and we will be redirected at that
  // point.
  if (hdr->ph_inum <= pax->self_id) {
    // Add it to the request cache if needed.
    if (request_needs_cached(request->pr_val.pv_dkind)) {
      request_insert(&pax->rcache, request);
    }
    return proposer_decree_request(request);
  }

  // The requester has a live connection to us yet thinks a lower-ranked
  // acceptor is the proposer; kill it for having inconsistent state.  The
  // requester must be looked up before the request is destroyed, since the
  // lookup key lives inside the request.
  requester = acceptor_find(&pax->alist, request->pr_val.pv_reqid.id);
  request_destroy(request);
  return proposer_decree_part(requester, 1);
}
/** * proposer_ack_reject - Acknowledge an acceptor's reject. * * Increment the reject count of the appropriate Paxos instance. If we have * a majority of rejects, try to reconnect to the acceptor we attempted to * force part. If we are successful, re-decree null; otherwise, try the part * again. */ int proposer_ack_reject(struct paxos_header *hdr) { int r; struct paxos_instance *inst; struct paxos_acceptor *acc; struct paxos_continuation *k; // Our prepare succeeded, so we have only one possible ballot in our // lifetime in the system. assert(ballot_compare(hdr->ph_ballot, pax->ballot) == 0); // Find the decree of the correct instance and increment the reject count. inst = instance_find(&pax->ilist, hdr->ph_inum); inst->pi_rejects++; // Ignore the vote if we've already committed. if (inst->pi_committed) { return 0; } // We only reject parts. However, we may continue to receive rejects even // after a majority rejects, in which case we may have re-decreed null. if (inst->pi_val.pv_dkind == DEC_NULL) { return 0; } assert(inst->pi_val.pv_dkind == DEC_PART); // If we have been rejected by a majority, attempt reconnection. if (DEATH_ADJUSTED(inst->pi_rejects) >= majority()) { // See if we can reconnect to the acceptor we tried to part. acc = acceptor_find(&pax->alist, inst->pi_val.pv_extra); assert(acc->pa_peer == NULL); // Defer computation until the client performs connection. If it succeeds, // replace the part decree with a null decree; otherwise, just redecree // the part. We bind the instance number of the decree as callback data. k = continuation_new(continue_ack_reject, acc->pa_paxid); k->pk_data.inum = inst->pi_hdr.ph_inum; ERR_RET(r, state.connect(acc->pa_desc, acc->pa_size, &k->pk_cb)); return 0; } // If we have heard back from everyone but the accepts and rejects are tied, // just decree the part again. 
if (inst->pi_votes < majority() && DEATH_ADJUSTED(inst->pi_rejects) < majority() && inst->pi_votes + inst->pi_rejects == pax->live_count) { return paxos_broadcast_instance(inst); } return 0; }
/**
 * proposer_ack_redirect - Resolve an acceptor's claim that we are not the
 * true proposer.
 *
 * An acceptor that receives our prepare but does not believe us to be the
 * true proposer answers with a redirect.  Because the correctness of Paxos
 * guarantees a consistent total ordering of the acceptor list, a redirect
 * means there is someone more fitting to be proposer with whom we have lost
 * contact.
 *
 * That does not necessarily mean the identified proposer is still live: we
 * may have noticed a proposer failure and prepared before the redirecting
 * acceptor detected it.  To avoid acting on that as much as possible, we
 * wait for a majority of redirects before accepting defeat and attempting
 * reconnection to our superior.  If we "win" with a majority completing the
 * prepare, we drop the former proposer regardless of whether it still has
 * some connections open.
 *
 * @hdr: Header of the redirect; ph_inum names the acceptor the sender
 *       considers the proposer.
 * @o:   Payload; unpacked as the header of the prepare being redirected.
 *
 * Returns 0 when the redirect is ignored, merely counted, or a reconnect
 * was started, or the result of proposer_prepare() on a tie.  The result of
 * state.connect() goes through ERR_RET (presumably propagating a failure
 * code -- confirm against the macro).
 */
int proposer_ack_redirect(struct paxos_header *hdr, msgpack_object *o)
{
  int r;
  struct paxos_header prepare_hdr;
  struct paxos_acceptor *superior;
  struct paxos_continuation *cont;

  // We dispatched as the proposer, so there is no need to re-check whether
  // we think ourselves to be the proposer.  Just sanity check that the
  // supposed true proposer has a lower ID than we do, which the consistency
  // of proposer ranks should always ensure.
  assert(hdr->ph_inum < pax->self_id);

  // If we are not still preparing, either we succeeded or our prepare was
  // rejected.  In the latter case we would either have connected to the
  // true proposer (and dispatched as an acceptor) or have sent out another
  // prepare.  Hence a NULL prepare here means our prepare succeeded with a
  // majority vote, and the redirect should be ignored.
  if (pax->prep == NULL) {
    return 0;
  }

  // Ignore redirects that are not for our current prepare.
  paxos_header_unpack(&prepare_hdr, o);
  if (ballot_compare(prepare_hdr.ph_ballot, pax->prep->pp_ballot) != 0) {
    return 0;
  }

  // Acknowledge the rejection of our prepare.
  pax->prep->pp_redirects++;

  // With a majority of redirects our prepare will never succeed, but we
  // defer freeing it until reconnection occurs.  That deferral is what
  // backs the guarantee relied on by the NULL check above.
  // NOTE(review): the original comment words that guarantee in terms of a
  // "non-NULL" prepare, which appears inverted relative to the check --
  // confirm the intended wording.
  if (DEATH_ADJUSTED(pax->prep->pp_redirects) >= majority()) {
    // Connect to the higher-ranked acceptor named in the most recent
    // redirect (i.e., this one).  An even higher-ranked acceptor may exist,
    // but we will find that out when we try to send a request.
    superior = acceptor_find(&pax->alist, hdr->ph_inum);
    assert(superior->pa_peer == NULL);

    // Defer computation until the client performs connection.  If it
    // succeeds, give up the prepare; otherwise, reprepare.
    cont = continuation_new(continue_ack_redirect, superior->pa_paxid);
    ERR_RET(r, state.connect(superior->pa_desc, superior->pa_size,
                             &cont->pk_cb));
    return 0;
  }

  // From here on we only act on a tie: everyone has answered, yet neither
  // the acks nor the redirects reached a majority.
  if (pax->prep->pp_acks >= majority()) {
    return 0;
  }
  if (DEATH_ADJUSTED(pax->prep->pp_redirects) >= majority()) {
    return 0;
  }
  if (pax->prep->pp_acks + pax->prep->pp_redirects != pax->live_count) {
    return 0;
  }

  // Tied; throw the prepare away and prepare again.
  g_free(pax->prep);
  pax->prep = NULL;
  return proposer_prepare(NULL);
}
/** * paxos_learn - Do something useful with the value of a commit. * * Note that we cannot free up the instance or any request associated with * it until a sync. */ int paxos_learn(struct paxos_instance *inst, struct paxos_request *req) { int r = 0; struct paxos_acceptor *acc; // Mark the learn. inst->pi_learned = true; // Act on the decree (e.g., display chat, record acceptor list changes). switch (inst->pi_val.pv_dkind) { case DEC_NULL: break; case DEC_CHAT: // Grab the message sender. acc = acceptor_find(&pax->alist, req->pr_val.pv_reqid.id); assert(acc != NULL); // Invoke client learning callback. state.learn.chat(req->pr_data, req->pr_size, acc->pa_desc, acc->pa_size, pax->client_data); break; case DEC_JOIN: // Check the adefer list to see if we received a hello already for the // newly joined acceptor. acc = acceptor_find(&pax->adefer, inst->pi_hdr.ph_inum); if (acc != NULL) { // We found a deferred hello. To complete the hello, just move our // acceptor over to the alist and increment the live count. LIST_REMOVE(&pax->adefer, acc, pa_le); pax->live_count++; } else { // We have not yet gotten the hello, so create a new acceptor. acc = g_malloc0(sizeof(*acc)); acc->pa_paxid = inst->pi_hdr.ph_inum; } acceptor_insert(&pax->alist, acc); // Copy over the identity information. acc->pa_size = req->pr_size; acc->pa_desc = g_memdup(req->pr_data, req->pr_size); // If we are the proposer, we are responsible for connecting to the new // acceptor, as well as for sending the new acceptor its paxid and other // initial data. if (is_proposer()) { proposer_welcome(acc); } // Invoke client learning callback. state.learn.join(req->pr_data, req->pr_size, acc->pa_desc, acc->pa_size, pax->client_data); break; case DEC_PART: case DEC_KILL: // Grab the acceptor from the alist. 
acc = acceptor_find(&pax->alist, inst->pi_val.pv_extra); if (acc == NULL) { // It is possible that we may part twice; for instance, if a proposer // issues a part for itself but its departure from the system is // detected by acceptors before the part commit is received. In this // case, just do nothing. break; } // Invoke client learning callback. state.learn.part(acc->pa_desc, acc->pa_size, acc->pa_desc, acc->pa_size, pax->client_data); // If we are being parted, leave the protocol. if (acc->pa_paxid == pax->self_id) { return paxos_end(pax); } // Take the parted acceptor off the list and do accounting if it was // still live. LIST_REMOVE(&pax->alist, acc, pa_le); if (acc->pa_peer != NULL) { pax->live_count--; } // If we just parted our proposer, "elect" a new one. If it's us, send // a prepare. if (acc->pa_paxid == pax->proposer->pa_paxid) { reset_proposer(); if (is_proposer()) { r = proposer_prepare(acc); } } // Free the parted acceptor. acceptor_destroy(acc); break; } return r; }