Esempio n. 1
0
/**
 * acceptor_ack_refuse - Resolve an acceptor's claim that we do not know
 * the true proposer.
 *
 * If we send a request to someone who is not the proposer, but identifying
 * them as the proposer, we will receive a refuse.  Since the correctness
 * of the Paxos protocol guarantees that the acceptor list has a consistent
 * total ordering across the system, receiving a refuse means that there is
 * someone more fitting to be proposer than the acceptor we identified.
 *
 * Note, as with ack_redirect, that it is possible we noticed a proposer
 * failure and sent our request to the new proposer correctly before the new
 * proposer themselves recognized the failure.
 *
 * @hdr: Header of the refuse.  ph_inum is used as the ID of the purported
 *       proposer and is looked up in the acceptor list.
 * @o:   Payload of the refuse.  Must be a msgpack array with at least two
 *       elements; the second element is unpacked as the request's value.
 *
 * Returns 0 if the refuse is ignored or a reconnection attempt was started.
 * A failing state.connect() is handled by ERR_RET (presumably by returning
 * its error code -- confirm against the macro definition).
 */
int
acceptor_ack_refuse(struct paxos_header *hdr, msgpack_object *o)
{
  int r;
  struct paxos_acceptor *acc;
  struct paxos_continuation *k;

  // Check whether, since we sent our request, we have already found a more
  // suitable proposer, possibly due to another redirect, in which case we
  // can ignore this one.
  if (pax->proposer->pa_paxid <= hdr->ph_inum) {
    return 0;
  }

  // Make sure the payload is well-formed before we allocate anything or
  // index into it.  We read element 1 below, so the array needs at least
  // two entries.
  assert(o->type == MSGPACK_OBJECT_ARRAY);
  assert(o->via.array.size >= 2);

  // Pull out the acceptor struct corresponding to the purported proposer and
  // try to reconnect.  Note that we should have already set the pa_peer of
  // this acceptor to NULL to indicate the lost connection.
  acc = acceptor_find(&pax->alist, hdr->ph_inum);
  assert(acc != NULL);
  assert(acc->pa_peer == NULL);

  // Defer computation until the client performs connection.  If it succeeds,
  // resend the request.  We bind the request's value as callback data.
  k = continuation_new(continue_ack_refuse, acc->pa_paxid);
  paxos_value_unpack(&k->pk_data.req.pr_val, o->via.array.ptr + 1);

  // NOTE(review): if connect fails, who releases `k` is not visible from
  // this function -- confirm the continuation is not leaked on that path.
  ERR_RET(r, state.connect(acc->pa_desc, acc->pa_size, &k->pk_cb));
  return 0;
}
Esempio n. 2
0
/**
 * paxos_ack_retrieve - Answer a retrieve by resending the cached request.
 *
 * The payload is a two-element array: the paxid of the retriever followed
 * by the value whose request data is being asked for.  When the request is
 * found in our request cache it is resent to the retriever; when it is not,
 * the retrieve is silently dropped.
 *
 * @hdr: Header of the retrieve; forwarded unchanged to paxos_resend().
 * @o:   Payload of the retrieve (see above).
 *
 * Returns the result of paxos_resend() if the request was cached, else 0.
 */
int paxos_ack_retrieve(struct paxos_header *hdr, msgpack_object *o)
{
  paxid_t retriever_id;
  struct paxos_value wanted;
  struct paxos_request *cached;
  msgpack_object *elems;

  // The payload has to be an array of exactly two objects.
  assert(o->type == MSGPACK_OBJECT_ARRAY);
  assert(o->via.array.size == 2);
  elems = o->via.array.ptr;

  // Element 0 is the retriever's ID, element 1 the value being retrieved.
  paxos_paxid_unpack(&retriever_id, &elems[0]);
  paxos_value_unpack(&wanted, &elems[1]);

  // Only decree kinds that are kept in the request cache can be retrieved.
  assert(request_needs_cached(wanted.pv_dkind));

  // Nothing to do if we do not hold the request either.
  cached = request_find(&pax->rcache, wanted.pv_reqid);
  if (cached == NULL) {
    return 0;
  }

  // Look up the recipient and resend.
  // NOTE(review): the lookup result is passed on without a NULL check;
  // confirm paxos_resend() tolerates an unknown retriever.
  return paxos_resend(acceptor_find(&pax->alist, retriever_id), hdr, cached);
}
Esempio n. 3
0
/**
 * paxos_retrieve - Ask the originator of request data to send us data which
 * we do not have in our cache.
 *
 * We call this function when and only when we are issued a commit for an
 * instance whose associated request is not in our request cache.
 */
int
paxos_retrieve(struct paxos_instance *inst)
{
  int r;
  struct paxos_header hdr;
  struct paxos_acceptor *acc;
  struct paxos_yak py;

  // Initialize a header.  We set ph_inum to the instance number of the
  // request.
  header_init(&hdr, OP_RETRIEVE, inst->pi_hdr.ph_inum);

  // Pack the retrieve.
  paxos_payload_init(&py, 2);
  paxos_header_pack(&py, &hdr);
  paxos_payload_begin_array(&py, 2);
  paxos_paxid_pack(&py, pax->self_id);
  paxos_value_pack(&py, &inst->pi_val);

  // Determine the request originator and send.  If we are no longer connected
  // to the request originator, broadcast the retrieve instead.
  acc = acceptor_find(&pax->alist, inst->pi_val.pv_reqid.id);
  if (acc == NULL || acc->pa_peer == NULL) {
    r = paxos_broadcast(&py);
  } else {
    r = paxos_send(acc, &py);
  }
  paxos_payload_destroy(&py);

  return r;
}
Esempio n. 4
0
/**
 * proposer_ack_request - Turn an incoming request into a decree.
 *
 * @hdr: Header of the request.  The requester overloads ph_inum with the ID
 *       of the acceptor it believes to be the proposer.
 * @o:   Packed request; unpacked into a freshly allocated paxos_request.
 *
 * Returns the result of proposer_decree_request() on the normal path, or of
 * proposer_decree_part() when the requester is judged inconsistent.
 */
int
proposer_ack_request(struct paxos_header *hdr, msgpack_object *o)
{
  struct paxos_request *request;
  struct paxos_acceptor *requester;

  // Unpack into a zeroed, heap-allocated request.
  request = g_malloc0(sizeof(*request));
  paxos_request_unpack(request, o);

  // Normal path: the requester names us, or a higher-ranked acceptor, as
  // the proposer.  The latter should only happen while we are preparing
  // after losing our connection to the true proposer; the decree is then
  // deferred until after our prepare, and if we turn out not to be the
  // proposer, the prepare fails and we get redirected at that point.
  if (hdr->ph_inum <= pax->self_id) {
    // Cache the request when its decree kind calls for it.
    if (request_needs_cached(request->pr_val.pv_dkind)) {
      request_insert(&pax->rcache, request);
    }
    return proposer_decree_request(request);
  }

  // Otherwise the requester has a live connection to us yet believes a
  // lower-ranked acceptor is the proposer.  Its state is inconsistent, so
  // drop the request and issue a part for the requester.
  // NOTE(review): the second argument (1) presumably requests a forced
  // part/kill -- confirm against proposer_decree_part().
  requester = acceptor_find(&pax->alist, request->pr_val.pv_reqid.id);
  request_destroy(request);
  return proposer_decree_part(requester, 1);
}
Esempio n. 5
0
/**
 * proposer_ack_reject - Handle an acceptor's reject of one of our decrees.
 *
 * The reject count of the matching instance is bumped.  Once a majority has
 * rejected, we try to reconnect to the acceptor we were trying to force out;
 * the continuation then either re-decrees null (reconnect succeeded) or
 * retries the part.  If every live acceptor has answered without a majority
 * either way, the part is simply decreed again.
 *
 * @hdr: Header of the reject; ph_inum selects the instance and ph_ballot
 *       must equal our ballot.
 *
 * Returns 0 when the reject is merely recorded or a reconnect was started,
 * or the result of paxos_broadcast_instance() when the part is re-decreed.
 * A failing state.connect() is handled by ERR_RET.
 */
int
proposer_ack_reject(struct paxos_header *hdr)
{
  int r;
  struct paxos_instance *decree;
  struct paxos_acceptor *target;
  struct paxos_continuation *cont;

  // We prepared successfully, so there is exactly one ballot we can ever
  // have used during our lifetime in the system.
  assert(ballot_compare(hdr->ph_ballot, pax->ballot) == 0);

  // Record the reject on the instance it refers to.
  decree = instance_find(&pax->ilist, hdr->ph_inum);
  decree->pi_rejects++;

  // Nothing more to do if the instance is already committed, or if it has
  // already been re-decreed as null (rejects may keep trickling in after a
  // majority has rejected).
  if (decree->pi_committed || decree->pi_val.pv_dkind == DEC_NULL) {
    return 0;
  }

  // Parts are the only decrees that can be rejected.
  assert(decree->pi_val.pv_dkind == DEC_PART);

  // Majority of rejects: attempt to reconnect to the acceptor being parted.
  if (DEATH_ADJUSTED(decree->pi_rejects) >= majority()) {
    target = acceptor_find(&pax->alist, decree->pi_val.pv_extra);
    assert(target->pa_peer == NULL);

    // The rest of the work happens once the client has tried to connect.
    // The decree's instance number is bound as the callback data.
    cont = continuation_new(continue_ack_reject, target->pa_paxid);
    cont->pk_data.inum = decree->pi_hdr.ph_inum;
    ERR_RET(r, state.connect(target->pa_desc, target->pa_size,
          &cont->pk_cb));
    return 0;
  }

  // Keep waiting unless everyone has answered and neither the accepts nor
  // the rejects reached a majority.
  if (decree->pi_votes >= majority() ||
      DEATH_ADJUSTED(decree->pi_rejects) >= majority() ||
      decree->pi_votes + decree->pi_rejects != pax->live_count) {
    return 0;
  }

  // Tied vote: decree the part again.
  return paxos_broadcast_instance(decree);
}
Esempio n. 6
0
/**
 * proposer_ack_redirect - Handle an acceptor's claim that somebody else is
 * the true proposer.
 *
 * An acceptor that receives our prepare but does not consider us the
 * proposer answers with a redirect.  Because Paxos keeps the acceptor list
 * in one consistent total order, a redirect implies that an acceptor better
 * suited to be proposer exists and that we have lost contact with it.
 *
 * The acceptor named in the redirect need not still be alive: we may have
 * detected the old proposer's failure and prepared before the redirecting
 * acceptor noticed it.  To limit the effect of this, we only concede and try
 * to reconnect to our superior after a majority has redirected us.  If we
 * instead "win" because a majority completes the prepare, the former
 * proposer is dropped even if some of its connections remain open.
 *
 * @hdr: Header of the redirect; ph_inum is the ID of the acceptor the sender
 *       considers to be the proposer.
 * @o:   Payload; unpacked as the header of the prepare being redirected.
 *
 * Returns 0 when the redirect is ignored, merely counted, or a reconnect
 * was started; returns the result of proposer_prepare() when we prepare
 * again.  A failing state.connect() is handled by ERR_RET.
 */
int
proposer_ack_redirect(struct paxos_header *hdr, msgpack_object *o)
{
  int r;
  struct paxos_header prep_hdr;
  struct paxos_acceptor *superior;
  struct paxos_continuation *cont;

  // Dispatch already established that we act as proposer, so there is no
  // need to re-check that.  Just sanity check that the claimed proposer
  // outranks us, i.e., has a lower ID; rank consistency guarantees this.
  assert(hdr->ph_inum < pax->self_id);

  // No prepare in flight means our prepare succeeded.  (Had it been
  // rejected, we would either have reached the true proposer and be
  // dispatching as an acceptor, or have failed to connect and sent out a
  // fresh prepare.)  A majority affirmed us, so the redirect is moot.
  if (pax->prep == NULL) {
    return 0;
  }

  // Redirects for anything other than our current prepare are stale.
  paxos_header_unpack(&prep_hdr, o);
  if (ballot_compare(prep_hdr.ph_ballot, pax->prep->pp_ballot) != 0) {
    return 0;
  }

  // Count this redirect against the prepare.
  pax->prep->pp_redirects++;

  // Majority of redirects: the prepare can no longer succeed.  It is kept
  // allocated until the reconnect attempt resolves, which maintains the
  // invariant relied upon above: a self-identified proposer with a non-NULL
  // prepare has either prepared successfully or not yet begun its cycle.
  if (DEATH_ADJUSTED(pax->prep->pp_redirects) >= majority()) {
    // Reconnect to the acceptor named in this, the most recent, redirect.
    // An even higher-ranked acceptor may exist; we will learn of it when we
    // try to send a request.
    superior = acceptor_find(&pax->alist, hdr->ph_inum);
    assert(superior->pa_peer == NULL);

    // The continuation gives up the prepare if the connection succeeds and
    // prepares again otherwise.
    cont = continuation_new(continue_ack_redirect, superior->pa_paxid);
    ERR_RET(r, state.connect(superior->pa_desc, superior->pa_size,
          &cont->pk_cb));
    return 0;
  }

  // Keep waiting unless everyone has answered and neither the acks nor the
  // redirects reached a majority.
  if (pax->prep->pp_acks >= majority() ||
      DEATH_ADJUSTED(pax->prep->pp_redirects) >= majority() ||
      pax->prep->pp_acks + pax->prep->pp_redirects != pax->live_count) {
    return 0;
  }

  // Tied outcome: discard this prepare and start over.
  g_free(pax->prep);
  pax->prep = NULL;
  return proposer_prepare(NULL);
}
Esempio n. 7
0
/**
 * paxos_learn - Do something useful with the value of a commit.
 *
 * Note that we cannot free up the instance or any request associated with
 * it until a sync.
 */
int
paxos_learn(struct paxos_instance *inst, struct paxos_request *req)
{
  int r = 0;
  struct paxos_acceptor *acc;

  // Mark the learn.
  inst->pi_learned = true;

  // Act on the decree (e.g., display chat, record acceptor list changes).
  switch (inst->pi_val.pv_dkind) {
    case DEC_NULL:
      break;

    case DEC_CHAT:
      // Grab the message sender.
      acc = acceptor_find(&pax->alist, req->pr_val.pv_reqid.id);
      assert(acc != NULL);

      // Invoke client learning callback.
      state.learn.chat(req->pr_data, req->pr_size, acc->pa_desc, acc->pa_size,
          pax->client_data);
      break;

    case DEC_JOIN:
      // Check the adefer list to see if we received a hello already for the
      // newly joined acceptor.
      acc = acceptor_find(&pax->adefer, inst->pi_hdr.ph_inum);

      if (acc != NULL) {
        // We found a deferred hello.  To complete the hello, just move our
        // acceptor over to the alist and increment the live count.
        LIST_REMOVE(&pax->adefer, acc, pa_le);
        pax->live_count++;
      } else {
        // We have not yet gotten the hello, so create a new acceptor.
        acc = g_malloc0(sizeof(*acc));
        acc->pa_paxid = inst->pi_hdr.ph_inum;
      }
      acceptor_insert(&pax->alist, acc);

      // Copy over the identity information.
      acc->pa_size = req->pr_size;
      acc->pa_desc = g_memdup(req->pr_data, req->pr_size);

      // If we are the proposer, we are responsible for connecting to the new
      // acceptor, as well as for sending the new acceptor its paxid and other
      // initial data.
      if (is_proposer()) {
        proposer_welcome(acc);
      }

      // Invoke client learning callback.
      state.learn.join(req->pr_data, req->pr_size, acc->pa_desc, acc->pa_size,
          pax->client_data);
      break;

    case DEC_PART:
    case DEC_KILL:
      // Grab the acceptor from the alist.
      acc = acceptor_find(&pax->alist, inst->pi_val.pv_extra);
      if (acc == NULL) {
        // It is possible that we may part twice; for instance, if a proposer
        // issues a part for itself but its departure from the system is
        // detected by acceptors before the part commit is received.  In this
        // case, just do nothing.
        break;
      }

      // Invoke client learning callback.
      state.learn.part(acc->pa_desc, acc->pa_size, acc->pa_desc, acc->pa_size,
          pax->client_data);

      // If we are being parted, leave the protocol.
      if (acc->pa_paxid == pax->self_id) {
        return paxos_end(pax);
      }

      // Take the parted acceptor off the list and do accounting if it was
      // still live.
      LIST_REMOVE(&pax->alist, acc, pa_le);
      if (acc->pa_peer != NULL) {
        pax->live_count--;
      }

      // If we just parted our proposer, "elect" a new one.  If it's us, send
      // a prepare.
      if (acc->pa_paxid == pax->proposer->pa_paxid) {
        reset_proposer();
        if (is_proposer()) {
          r = proposer_prepare(acc);
        }
      }

      // Free the parted acceptor.
      acceptor_destroy(acc);

      break;
  }

  return r;
}