long gu_to_self_cancel(gu_to_t *to, gu_seqno_t seqno) { long err = 0; to_waiter_t *w; assert (seqno >= 0); if ((err = gu_mutex_lock (&to->lock))) { gu_fatal("Mutex lock failed (%d): %s", err, strerror(err)); abort(); } if ((w = to_get_waiter (to, seqno)) == NULL) { gu_mutex_unlock(&to->lock); return -EAGAIN; } /* we have a valid waiter now */ if (seqno > to->seqno) { // most probable case w->state = CANCELED; } else if (seqno == to->seqno) { // have to wake the next waiter as if we grabbed and now releasing TO to_release_and_wake_next (to, w); } else { // (seqno < to->seqno) // This waiter must have been canceled or even released by preceding // waiter. Do nothing. } gu_mutex_unlock(&to->lock); return err; }
long gu_to_interrupt (gu_to_t *to, gu_seqno_t seqno) { long rcode = 0; long err; to_waiter_t *w; assert (seqno >= 0); if ((err = gu_mutex_lock (&to->lock))) { gu_fatal("Mutex lock failed (%d): %s", err, strerror(err)); abort(); } if (seqno >= to->seqno) { if ((w = to_get_waiter (to, seqno)) == NULL) { gu_mutex_unlock(&to->lock); return -EAGAIN; } /* we have a valid waiter now */ switch (w->state) { case HOLDER: gu_debug ("trying to interrupt in use seqno: seqno = %llu, " "TO seqno = %llu", seqno, to->seqno); /* gu_mutex_unlock (&to->lock); */ rcode = -ERANGE; break; case CANCELED: gu_debug ("trying to interrupt canceled seqno: seqno = %llu, " "TO seqno = %llu", seqno, to->seqno); /* gu_mutex_unlock (&to->lock); */ rcode = -ERANGE; break; case WAIT: gu_debug ("signaling to interrupt wait seqno: seqno = %llu, " "TO seqno = %llu", seqno, to->seqno); rcode = to_wake_waiter (w); case RELEASED: w->state = INTERRUPTED; break; case INTERRUPTED: gu_debug ("TO waiter interrupt already seqno: seqno = %llu, " "TO seqno = %llu", seqno, to->seqno); break; } } else { gu_debug ("trying to interrupt used seqno: cancel seqno = %llu, " "TO seqno = %llu", seqno, to->seqno); /* gu_mutex_unlock (&to->lock); */ rcode = -ERANGE; } gu_mutex_unlock (&to->lock); return rcode; }
long gu_to_release (gu_to_t *to, gu_seqno_t seqno) { long err; to_waiter_t *w; assert (seqno >= 0); if ((err = gu_mutex_lock(&to->lock))) { gu_fatal("Mutex lock failed (%d): %s", err, strerror(err)); abort(); } if ((w = to_get_waiter (to, seqno)) == NULL) { gu_mutex_unlock(&to->lock); return -EAGAIN; } /* we have a valid waiter now */ if (seqno == to->seqno) { to_release_and_wake_next (to, w); } else if (seqno > to->seqno) { if (w->state != CANCELED) { gu_fatal("Illegal state in premature release: %d", w->state); abort(); } /* Leave state CANCELED so that real releaser can iterate */ } else { /* */ if (w->state != RELEASED) { gu_fatal("Outdated seqno and state not RELEASED: %d", w->state); abort(); } } gu_mutex_unlock(&to->lock); return err; }
// does basic sanity check of the component message (in response to #145) static void group_check_comp_msg (bool prim, long my_idx, long members) { if (my_idx >= 0) { if (my_idx < members) return; } else { if (!prim && (0 == members)) return; } gu_fatal ("Malformed component message from backend: " "%s, idx = %ld, members = %ld", prim ? "PRIMARY" : "NON-PRIMARY", my_idx, members); assert (0); gu_abort (); }
static inline long to_wake_waiter (to_waiter_t* w) { long err = 0; if (w->state == WAIT) { #ifdef TO_USE_SIGNAL err = gu_cond_signal (&w->cond); #else err = pthread_mutex_unlock (&w->mtx); #endif if (err) { gu_fatal ("gu_cond_signal failed: %d", err); } } return err; }
long gu_to_cancel (gu_to_t *to, gu_seqno_t seqno) { long err; to_waiter_t *w; assert (seqno >= 0); if ((err = gu_mutex_lock (&to->lock))) { gu_fatal("Mutex lock failed (%d): %s", err, strerror(err)); abort(); } // Check for queue overflow. This is totally unrecoverable. Abort. if ((w = to_get_waiter (to, seqno)) == NULL) { gu_mutex_unlock(&to->lock); abort(); } /* we have a valid waiter now */ if ((seqno > to->seqno) || (seqno == to->seqno && w->state != HOLDER)) { err = to_wake_waiter (w); w->state = CANCELED; } else if (seqno == to->seqno && w->state == HOLDER) { gu_warn("tried to cancel current TO holder, state %d seqno %llu", w->state, seqno); err = -ECANCELED; } else { gu_warn("trying to cancel used seqno: state %d cancel seqno = %llu, " "TO seqno = %llu", w->state, seqno, to->seqno); err = -ECANCELED; } gu_mutex_unlock (&to->lock); return err; }
long gu_to_grab (gu_to_t* to, gu_seqno_t seqno) { long err; to_waiter_t *w; assert (seqno >= 0); if ((err = gu_mutex_lock(&to->lock))) { gu_fatal("Mutex lock failed (%d): %s", err, strerror(err)); abort(); } if (seqno < to->seqno) { gu_mutex_unlock(&to->lock); return -ECANCELED; } if ((w = to_get_waiter (to, seqno)) == NULL) { gu_mutex_unlock(&to->lock); return -EAGAIN; } /* we have a valid waiter now */ switch (w->state) { case INTERRUPTED: w->state = RELEASED; err = -EINTR; break; case CANCELED: err = -ECANCELED; break; case RELEASED: if (seqno == to->seqno) { w->state = HOLDER; } else if (seqno < to->seqno) { gu_error("Trying to grab outdated seqno"); err = -ECANCELED; } else { /* seqno > to->seqno, wait for my turn */ w->state = WAIT; to->used++; #ifdef TO_USE_SIGNAL gu_cond_wait(&w->cond, &to->lock); #else pthread_mutex_lock (&w->mtx); pthread_mutex_unlock (&to->lock); pthread_mutex_lock (&w->mtx); // wait for unlock by other thread pthread_mutex_lock (&to->lock); pthread_mutex_unlock (&w->mtx); #endif to->used--; switch (w->state) { case WAIT:// should be most probable assert (seqno == to->seqno); w->state = HOLDER; break; case INTERRUPTED: w->state = RELEASED; err = -EINTR; break; case CANCELED: err = -ECANCELED; break; case RELEASED: /* this waiter has been cancelled */ assert(seqno < to->seqno); err = -ECANCELED; break; default: gu_fatal("Invalid cond wait exit state %d, seqno %llu(%llu)", w->state, seqno, to->seqno); abort(); } } break; default: gu_fatal("TO queue over wrap"); abort(); } gu_mutex_unlock(&to->lock); return err; }
/*! return true if this node is the sender to notify the calling thread of * success */ int gcs_group_handle_join_msg (gcs_group_t* group, const gcs_recv_msg_t* msg) { int const sender_idx = msg->sender_idx; gcs_node_t* sender = &group->nodes[sender_idx]; assert (GCS_MSG_JOIN == msg->type); // TODO: define an explicit type for the join message, like gcs_join_msg_t assert (msg->size == sizeof(gcs_seqno_t)); if (GCS_NODE_STATE_DONOR == sender->status || GCS_NODE_STATE_JOINER == sender->status) { int j; gcs_seqno_t seqno = gcs_seqno_gtoh(*(gcs_seqno_t*)msg->buf); gcs_node_t* peer = NULL; const char* peer_id = NULL; const char* peer_name = "left the group"; int peer_idx = -1; bool from_donor = false; const char* st_dir = NULL; // state transfer direction symbol if (GCS_NODE_STATE_DONOR == sender->status) { peer_id = sender->joiner; from_donor = true; st_dir = "to"; assert (group->last_applied_proto_ver >= 0); if (0 == group->last_applied_proto_ver) { /* #454 - we don't switch to JOINED here, * instead going straignt to SYNCED */ } else { assert(sender->count_last_applied); sender->status = GCS_NODE_STATE_JOINED; } } else { peer_id = sender->donor; st_dir = "from"; if (group->quorum.version < 2) { // #591 remove after quorum v1 is phased out sender->status = GCS_NODE_STATE_JOINED; group->prim_num++; } else { if (seqno >= 0) { sender->status = GCS_NODE_STATE_JOINED; group->prim_num++; } else { sender->status = GCS_NODE_STATE_PRIM; } } } // Try to find peer. for (j = 0; j < group->num; j++) { // #483 if (j == sender_idx) continue; if (!memcmp(peer_id, group->nodes[j].id, sizeof (group->nodes[j].id))) { peer_idx = j; peer = &group->nodes[peer_idx]; peer_name = peer->name; break; } } if (j == group->num) { gu_warn ("Could not find peer: %s", peer_id); } if (seqno < 0) { gu_warn ("%d.%d (%s): State transfer %s %d.%d (%s) failed: %d (%s)", sender_idx, sender->segment, sender->name, st_dir, peer_idx, peer ? peer->segment : -1, peer_name, (int)seqno, strerror((int)-seqno)); if (from_donor && peer_idx == group->my_idx && GCS_NODE_STATE_JOINER == group->nodes[peer_idx].status) { // this node will be waiting for SST forever. If it has only // one recv thread there is no (generic) way to wake it up. gu_fatal ("Will never receive state. Need to abort."); // return to core to shutdown the backend before aborting return -ENOTRECOVERABLE; } if (group->quorum.version < 2 && !from_donor && // #591 sender_idx == group->my_idx) { // remove after quorum v1 is phased out gu_fatal ("Faield to receive state. Need to abort."); return -ENOTRECOVERABLE; } } else { if (sender_idx == peer_idx) { gu_info ("Member %d.%d (%s) resyncs itself to group", sender_idx, sender->segment, sender->name); } else { gu_info ("%d.%d (%s): State transfer %s %d.%d (%s) complete.", sender_idx, sender->segment, sender->name, st_dir, peer_idx, peer ? peer->segment : -1, peer_name); } } } else { if (GCS_NODE_STATE_PRIM == sender->status) { gu_warn("Rejecting JOIN message from %d.%d (%s): new State Transfer" " required.", sender_idx, sender->segment, sender->name); } else { // should we freak out and throw an error? gu_warn("Protocol violation. JOIN message sender %d.%d (%s) is not " "in state transfer (%s). Message ignored.", sender_idx, sender->segment, sender->name, gcs_node_state_to_str(sender->status)); } return 0; } return (sender_idx == group->my_idx); }
gcs_group_state_t gcs_group_handle_comp_msg (gcs_group_t* group, const gcs_comp_msg_t* comp) { long new_idx, old_idx; gcs_node_t* new_nodes = NULL; ulong new_memb = 0; const bool prim_comp = gcs_comp_msg_primary (comp); const bool bootstrap = gcs_comp_msg_bootstrap(comp); const long new_my_idx = gcs_comp_msg_self (comp); const long new_nodes_num = gcs_comp_msg_num (comp); group_check_comp_msg (prim_comp, new_my_idx, new_nodes_num); if (new_my_idx >= 0) { gu_info ("New COMPONENT: primary = %s, bootstrap = %s, my_idx = %ld, " "memb_num = %ld", prim_comp ? "yes" : "no", bootstrap ? "yes" : "no", new_my_idx, new_nodes_num); new_nodes = group_nodes_init (group, comp); if (!new_nodes) { gu_fatal ("Could not allocate memory for %ld-node component.", gcs_comp_msg_num (comp)); assert(0); return (gcs_group_state_t)-ENOMEM; } if (GCS_GROUP_PRIMARY == group->state) { gu_debug ("#281: Saving %s over %s", gcs_node_state_to_str(group->nodes[group->my_idx].status), gcs_node_state_to_str(group->prim_state)); group->prim_state = group->nodes[group->my_idx].status; } } else { // Self-leave message gu_info ("Received self-leave message."); assert (0 == new_nodes_num); assert (!prim_comp); } if (prim_comp) { /* Got PRIMARY COMPONENT - Hooray! */ assert (new_my_idx >= 0); if (group->state == GCS_GROUP_PRIMARY) { /* we come from previous primary configuration, relax */ } else if (bootstrap) { /* Is there need to initialize something else in this case? */ group->nodes[group->my_idx].bootstrap = true; } else { const bool first_component = #ifndef GCS_CORE_TESTING (1 == group->num) && !strcmp (NODE_NO_ID, group->nodes[0].id); #else (1 == group->num); #endif if (1 == new_nodes_num && first_component) { /* bootstrap new configuration */ assert (GCS_GROUP_NON_PRIMARY == group->state); assert (1 == group->num); assert (0 == group->my_idx); // This bootstraps initial primary component for state exchange gu_uuid_generate (&group->prim_uuid, NULL, 0); group->prim_seqno = 0; group->prim_num = 1; group->state = GCS_GROUP_PRIMARY; if (group->act_id < 0) { // no history provided: start a new one group->act_id = GCS_SEQNO_NIL; gu_uuid_generate (&group->group_uuid, NULL, 0); gu_info ("Starting new group from scratch: "GU_UUID_FORMAT, GU_UUID_ARGS(&group->group_uuid)); } // the following should be removed under #474 group->nodes[0].status = GCS_NODE_STATE_JOINED; /* initialize node ID to the one given by the backend - this way * we'll be recognized as coming from prev. conf. in node array * remap below */ strncpy ((char*)group->nodes[0].id, new_nodes[0].id, sizeof (new_nodes[0].id) - 1); group->nodes[0].segment = new_nodes[0].segment; } } } else { group_go_non_primary (group); } /* Remap old node array to new one to preserve action continuity */ for (new_idx = 0; new_idx < new_nodes_num; new_idx++) { /* find member index in old component by unique member id */ for (old_idx = 0; old_idx < group->num; old_idx++) { // just scan through old group if (!strcmp(group->nodes[old_idx].id, new_nodes[new_idx].id)) { /* the node was in previous configuration with us */ /* move node context to new node array */ gcs_node_move (&new_nodes[new_idx], &group->nodes[old_idx]); break; } } /* if wasn't found in new configuration, new member - * need to do state exchange */ new_memb |= (old_idx == group->num); } /* free old nodes array */ group_nodes_free (group); group->my_idx = new_my_idx; group->num = new_nodes_num; group->nodes = new_nodes; if (gcs_comp_msg_primary(comp) || bootstrap) { /* TODO: for now pretend that we always have new nodes and perform * state exchange because old states can carry outdated node status. * (also protocol voting needs to be redone) * However this means aborting ongoing actions. Find a way to avoid * this extra state exchange. Generate new state messages on behalf * of other nodes? see #238 */ new_memb = true; /* if new nodes joined, reset ongoing actions and state messages */ if (new_memb) { group_nodes_reset (group); group->state = GCS_GROUP_WAIT_STATE_UUID; group->state_uuid = GU_UUID_NIL; // prepare for state exchange } else { if (GCS_GROUP_PRIMARY == group->state) { /* since we don't have any new nodes since last PRIMARY, we skip state exchange */ group_post_state_exchange (group); } } group_redo_last_applied (group); } return group->state; }
/*! Processes state messages and sets group parameters accordingly */ static void group_post_state_exchange (gcs_group_t* group) { const gcs_state_msg_t* states[group->num]; gcs_state_quorum_t* quorum = &group->quorum; bool new_exchange = gu_uuid_compare (&group->state_uuid, &GU_UUID_NIL); long i; /* Collect state messages from nodes. */ /* Looping here every time is suboptimal, but simply counting state messages * is not straightforward too: nodes may disappear, so the final count may * include messages from the disappeared nodes. * Let's put it this way: looping here is reliable and not that expensive.*/ for (i = 0; i < group->num; i++) { states[i] = group->nodes[i].state_msg; if (NULL == states[i] || (new_exchange && gu_uuid_compare (&group->state_uuid, gcs_state_msg_uuid(states[i])))) return; // not all states from THIS state exch. received, wait } gu_debug ("STATE EXCHANGE: "GU_UUID_FORMAT" complete.", GU_UUID_ARGS(&group->state_uuid)); gcs_state_msg_get_quorum (states, group->num, quorum); if (quorum->version >= 0) { if (quorum->version < 2) { group->last_applied_proto_ver = 0; } else { group->last_applied_proto_ver = 1; } } else { gu_fatal ("Negative quorum version: %d", quorum->version); abort(); } // Update each node state based on quorum outcome: // is it up to date, does it need SST and stuff for (i = 0; i < group->num; i++) { gcs_node_update_status (&group->nodes[i], quorum); } if (quorum->primary) { // primary configuration if (new_exchange) { // new state exchange happened group->state = GCS_GROUP_PRIMARY; group->act_id = quorum->act_id; group->conf_id = quorum->conf_id + 1; group->group_uuid = quorum->group_uuid; group->prim_uuid = group->state_uuid; group->state_uuid = GU_UUID_NIL; } else { // no state exchange happend, processing old state messages assert (GCS_GROUP_PRIMARY == group->state); group->conf_id++; } group->prim_seqno = group->conf_id; group->prim_num = 0; for (i = 0; i < group->num; i++) { group->prim_num += gcs_node_is_joined (group->nodes[i].status); } assert (group->prim_num > 0); } else { // non-primary configuration group_go_non_primary (group); } gu_info ("Quorum results:" "\n\tversion = %u," "\n\tcomponent = %s," "\n\tconf_id = %lld," "\n\tmembers = %d/%d (joined/total)," "\n\tact_id = %lld," "\n\tlast_appl. = %lld," "\n\tprotocols = %d/%d/%d (gcs/repl/appl)," "\n\tgroup UUID = "GU_UUID_FORMAT, quorum->version, quorum->primary ? "PRIMARY" : "NON-PRIMARY", quorum->conf_id, group->prim_num, group->num, quorum->act_id, group->last_applied, quorum->gcs_proto_ver, quorum->repl_proto_ver, quorum->appl_proto_ver, GU_UUID_ARGS(&quorum->group_uuid)); group_check_donor(group); }
long gcs_fifo_lite_destroy (gcs_fifo_lite_t* f) { if (f) { if (gu_mutex_lock (&f->lock)) { abort(); } if (f->destroyed) { gu_mutex_unlock (&f->lock); return -EALREADY; } f->closed = true; f->destroyed = true; /* get rid of "put" threads waiting for lock or signal */ while (pthread_cond_destroy (&f->put_cond)) { if (f->put_wait <= 0) { gu_fatal ("Can't destroy condition while nobody's waiting"); abort(); } f->put_wait = 0; gu_cond_broadcast (&f->put_cond); } while (f->used) { /* there are some items in FIFO - and that means * no gcs_fifo_lite_safe_get() is waiting on condition */ gu_mutex_unlock (&f->lock); /* let them get remaining items from FIFO, * we don't know how to deallocate them ourselves. * unfortunately this may take some time */ usleep (10000); /* sleep a bit to avoid busy loop */ gu_mutex_lock (&f->lock); } f->length = 0; /* now all we have - "get" threads waiting for lock or signal */ while (pthread_cond_destroy (&f->get_cond)) { if (f->get_wait <= 0) { gu_fatal ("Can't destroy condition while nobody's waiting"); abort(); } f->get_wait = 0; gu_cond_broadcast (&f->get_cond); } /* at this point there are only functions waiting for lock */ gu_mutex_unlock (&f->lock); while (gu_mutex_destroy (&f->lock)) { /* this should be fast provided safe get and safe put are * wtitten correctly. They should immediately freak out. */ gu_mutex_lock (&f->lock); gu_mutex_unlock (&f->lock); } /* now nobody's waiting for anything */ gu_free (f->queue); gu_free (f); return 0; } return -EINVAL; }