int gcs_group_handle_sync_msg (gcs_group_t* group, const gcs_recv_msg_t* msg) { int const sender_idx = msg->sender_idx; gcs_node_t* sender = &group->nodes[sender_idx]; assert (GCS_MSG_SYNC == msg->type); if (GCS_NODE_STATE_JOINED == sender->status || /* #454 - at this layer we jump directly from DONOR to SYNCED */ (0 == group->last_applied_proto_ver && GCS_NODE_STATE_DONOR == sender->status)) { sender->status = GCS_NODE_STATE_SYNCED; sender->count_last_applied = true; group_redo_last_applied (group);//from now on this node must be counted gu_info ("Member %d.%d (%s) synced with group.", sender_idx, sender->segment, sender->name); return (sender_idx == group->my_idx); } else { if (GCS_NODE_STATE_SYNCED != sender->status) { gu_warn ("SYNC message sender from non-JOINED %d.%d (%s). Ignored.", sender_idx, sender->segment, sender->name); } else { gu_debug ("Redundant SYNC message from %d.%d (%s).", sender_idx, sender->segment, sender->name); } return 0; } }
static void group_check_donor (gcs_group_t* group) { gcs_node_state_t const my_state = group->nodes[group->my_idx].status; const char* const donor_id = group->nodes[group->my_idx].donor; if (GCS_NODE_STATE_JOINER == my_state && memcmp (donor_id, group_empty_id, sizeof(group_empty_id))) { long i; for (i = 0; i < group->num; i++) { if (i != group->my_idx && !memcmp (donor_id, group->nodes[i].id, sizeof (group->nodes[i].id))) return; } gu_warn ("Donor %s is no longer in the group. State transfer cannot " "be completed, need to abort. Aborting...", donor_id); gu_abort(); } return; }
static inline void gcs_node_set_last_applied (gcs_node_t* node, gcs_seqno_t seqno) { if (gu_unlikely(seqno < node->last_applied)) { gu_warn ("Received bogus LAST message: %lld, from node %s, " "expected >= %lld. Ignoring.", seqno, node->id, node->last_applied); } else { node->last_applied = seqno; } }
static GCS_BACKEND_MSG_SIZE_FN(dummy_msg_size) { const long max_pkt_size = backend->conn->max_pkt_size; if (pkt_size > max_pkt_size) { gu_warn ("Requested packet size: %d, maximum possible packet size: %d", pkt_size, max_pkt_size); return (max_pkt_size - backend->conn->hdr_size); } return (pkt_size - backend->conn->hdr_size); }
long gu_to_destroy (gu_to_t** to) { gu_to_t *t = *to; long ret; ssize_t i; gu_mutex_lock (&t->lock); if (t->used) { gu_mutex_unlock (&t->lock); return -EBUSY; } for (i = 0; i < t->qlen; i++) { to_waiter_t *w = t->queue + i; #ifdef TO_USE_SIGNAL if (gu_cond_destroy (&w->cond)) { // @todo: what if someone is waiting? gu_warn ("Failed to destroy condition %d. Should not happen", i); } #else if (pthread_mutex_destroy (&w->mtx)) { // @todo: what if someone is waiting? gu_warn ("Failed to destroy mutex %d. Should not happen", i); } #endif } t->qlen = 0; gu_mutex_unlock (&t->lock); /* What else can be done here? */ ret = gu_mutex_destroy (&t->lock); if (ret) return -ret; // application can retry gu_free (t->queue); gu_free (t); *to = NULL; return 0; }
long gu_to_cancel (gu_to_t *to, gu_seqno_t seqno) { long err; to_waiter_t *w; assert (seqno >= 0); if ((err = gu_mutex_lock (&to->lock))) { gu_fatal("Mutex lock failed (%d): %s", err, strerror(err)); abort(); } // Check for queue overflow. This is totally unrecoverable. Abort. if ((w = to_get_waiter (to, seqno)) == NULL) { gu_mutex_unlock(&to->lock); abort(); } /* we have a valid waiter now */ if ((seqno > to->seqno) || (seqno == to->seqno && w->state != HOLDER)) { err = to_wake_waiter (w); w->state = CANCELED; } else if (seqno == to->seqno && w->state == HOLDER) { gu_warn("tried to cancel current TO holder, state %d seqno %llu", w->state, seqno); err = -ECANCELED; } else { gu_warn("trying to cancel used seqno: state %d cancel seqno = %llu, " "TO seqno = %llu", w->state, seqno, to->seqno); err = -ECANCELED; } gu_mutex_unlock (&to->lock); return err; }
/*! * Handles action message. Is called often - therefore, inlined * * @return */ static inline ssize_t gcs_node_handle_act_frag (gcs_node_t* node, const gcs_act_frag_t* frg, struct gcs_act* act, bool local) { if (gu_likely(GCS_ACT_SERVICE != frg->act_type)) { return gcs_defrag_handle_frag (&node->app, frg, act, local); } else if (GCS_ACT_SERVICE == frg->act_type) { return gcs_defrag_handle_frag (&node->oob, frg, act, local); } else { gu_warn ("Unrecognised action type: %d", frg->act_type); assert(0); return -EPROTO; } }
gcs_group_state_t gcs_group_handle_state_msg (gcs_group_t* group, const gcs_recv_msg_t* msg) { if (GCS_GROUP_WAIT_STATE_MSG == group->state) { gcs_state_msg_t* state = gcs_state_msg_read (msg->buf, msg->size); if (state) { const gu_uuid_t* state_uuid = gcs_state_msg_uuid (state); if (!gu_uuid_compare(&group->state_uuid, state_uuid)) { gu_info ("STATE EXCHANGE: got state msg: "GU_UUID_FORMAT " from %d (%s)", GU_UUID_ARGS(state_uuid), msg->sender_idx, gcs_state_msg_name(state)); if (gu_log_debug) group_print_state_debug(state); gcs_node_record_state (&group->nodes[msg->sender_idx], state); group_post_state_exchange (group); } else { gu_debug ("STATE EXCHANGE: stray state msg: "GU_UUID_FORMAT " from node %ld (%s), current state UUID: " GU_UUID_FORMAT, GU_UUID_ARGS(state_uuid), msg->sender_idx, gcs_state_msg_name(state), GU_UUID_ARGS(&group->state_uuid)); if (gu_log_debug) group_print_state_debug(state); gcs_state_msg_destroy (state); } } else { gu_warn ("Could not parse state message from node %d", msg->sender_idx, group->nodes[msg->sender_idx].name); } } return group->state; }
gcs_group_state_t gcs_group_handle_uuid_msg (gcs_group_t* group, const gcs_recv_msg_t* msg) { assert (msg->size == sizeof(gu_uuid_t)); if (GCS_GROUP_WAIT_STATE_UUID == group->state && 0 == msg->sender_idx /* check that it is from the representative */) { group->state_uuid = *(gu_uuid_t*)msg->buf; group->state = GCS_GROUP_WAIT_STATE_MSG; } else { gu_warn ("Stray state UUID msg: "GU_UUID_FORMAT " from node %ld (%s), current group state %s", GU_UUID_ARGS((gu_uuid_t*)msg->buf), msg->sender_idx, group->nodes[msg->sender_idx].name, gcs_group_state_str[group->state]); } return group->state; }
static int group_find_node_by_state (gcs_group_t* const group, int const joiner_idx, gcs_node_state_t const status) { gcs_segment_t const segment = group->nodes[joiner_idx].segment; int idx; int donor = -1; bool hnss = false; /* have nodes in the same segment */ for (idx = 0; idx < group->num; idx++) { if (joiner_idx == idx) continue; /* skip joiner */ gcs_node_t* node = &group->nodes[idx]; if (node->status >= status && group_node_is_stateful (group, node)) donor = idx; /* potential donor */ if (segment == node->segment) { if (donor == idx) return donor; /* found suitable donor in the * same segment */ if (node->status >= GCS_NODE_STATE_JOINER) hnss = true;; } } /* Have not found suitable donor in the same segment. */ if (!hnss && donor >= 0) { if (joiner_idx == group->my_idx) { gu_warn ("There are no nodes in the same segment that will ever " "be able to become donors, yet there is a suitable donor " "outside. Will use that one."); } return donor; } else { /* wait for a suitable donor to appear in the same segment */ return -EAGAIN; } }
/*! return true if this node is the sender to notify the calling thread of * success */ int gcs_group_handle_join_msg (gcs_group_t* group, const gcs_recv_msg_t* msg) { int const sender_idx = msg->sender_idx; gcs_node_t* sender = &group->nodes[sender_idx]; assert (GCS_MSG_JOIN == msg->type); // TODO: define an explicit type for the join message, like gcs_join_msg_t assert (msg->size == sizeof(gcs_seqno_t)); if (GCS_NODE_STATE_DONOR == sender->status || GCS_NODE_STATE_JOINER == sender->status) { int j; gcs_seqno_t seqno = gcs_seqno_gtoh(*(gcs_seqno_t*)msg->buf); gcs_node_t* peer = NULL; const char* peer_id = NULL; const char* peer_name = "left the group"; int peer_idx = -1; bool from_donor = false; const char* st_dir = NULL; // state transfer direction symbol if (GCS_NODE_STATE_DONOR == sender->status) { peer_id = sender->joiner; from_donor = true; st_dir = "to"; assert (group->last_applied_proto_ver >= 0); if (0 == group->last_applied_proto_ver) { /* #454 - we don't switch to JOINED here, * instead going straignt to SYNCED */ } else { assert(sender->count_last_applied); sender->status = GCS_NODE_STATE_JOINED; } } else { peer_id = sender->donor; st_dir = "from"; if (group->quorum.version < 2) { // #591 remove after quorum v1 is phased out sender->status = GCS_NODE_STATE_JOINED; group->prim_num++; } else { if (seqno >= 0) { sender->status = GCS_NODE_STATE_JOINED; group->prim_num++; } else { sender->status = GCS_NODE_STATE_PRIM; } } } // Try to find peer. for (j = 0; j < group->num; j++) { // #483 if (j == sender_idx) continue; if (!memcmp(peer_id, group->nodes[j].id, sizeof (group->nodes[j].id))) { peer_idx = j; peer = &group->nodes[peer_idx]; peer_name = peer->name; break; } } if (j == group->num) { gu_warn ("Could not find peer: %s", peer_id); } if (seqno < 0) { gu_warn ("%d.%d (%s): State transfer %s %d.%d (%s) failed: %d (%s)", sender_idx, sender->segment, sender->name, st_dir, peer_idx, peer ? peer->segment : -1, peer_name, (int)seqno, strerror((int)-seqno)); if (from_donor && peer_idx == group->my_idx && GCS_NODE_STATE_JOINER == group->nodes[peer_idx].status) { // this node will be waiting for SST forever. If it has only // one recv thread there is no (generic) way to wake it up. gu_fatal ("Will never receive state. Need to abort."); // return to core to shutdown the backend before aborting return -ENOTRECOVERABLE; } if (group->quorum.version < 2 && !from_donor && // #591 sender_idx == group->my_idx) { // remove after quorum v1 is phased out gu_fatal ("Faield to receive state. Need to abort."); return -ENOTRECOVERABLE; } } else { if (sender_idx == peer_idx) { gu_info ("Member %d.%d (%s) resyncs itself to group", sender_idx, sender->segment, sender->name); } else { gu_info ("%d.%d (%s): State transfer %s %d.%d (%s) complete.", sender_idx, sender->segment, sender->name, st_dir, peer_idx, peer ? peer->segment : -1, peer_name); } } } else { if (GCS_NODE_STATE_PRIM == sender->status) { gu_warn("Rejecting JOIN message from %d.%d (%s): new State Transfer" " required.", sender_idx, sender->segment, sender->name); } else { // should we freak out and throw an error? gu_warn("Protocol violation. JOIN message sender %d.%d (%s) is not " "in state transfer (%s). Message ignored.", sender_idx, sender->segment, sender->name, gcs_node_state_to_str(sender->status)); } return 0; } return (sender_idx == group->my_idx); }
/*! * Selects and returns the index of state transfer donor, if available. * Updates donor and joiner status if state transfer is possible * * @return * donor index or negative error code: * -EHOSTUNREACH if reqiested donor is not available * -EAGAIN if there were no nodes in the proper state. */ static int group_select_donor (gcs_group_t* group, int const str_version, int const joiner_idx, const char* const donor_string, const gu_uuid_t* ist_uuid, gcs_seqno_t ist_seqno, bool const desync) { static gcs_node_state_t const min_donor_state = GCS_NODE_STATE_SYNCED; int donor_idx; int const donor_len = strlen(donor_string); bool const required_donor = (donor_len > 0); if (desync) { /* sender wants to become "donor" itself */ assert(donor_len > 0); gcs_node_state_t const st = group->nodes[joiner_idx].status; if (st >= min_donor_state) donor_idx = joiner_idx; else donor_idx = -EAGAIN; } else { donor_idx = gcs_group_find_donor(group, str_version, joiner_idx, donor_string, donor_len, ist_uuid, ist_seqno); } if (donor_idx >= 0) { assert(donor_idx != joiner_idx || desync); gcs_node_t* const joiner = &group->nodes[joiner_idx]; gcs_node_t* const donor = &group->nodes[donor_idx]; if (desync) { gu_info ("Member %d.%d (%s) desyncs itself from group", donor_idx, donor->segment, donor->name); } else { gu_info ("Member %d.%d (%s) requested state transfer from '%s'. " "Selected %d.%d (%s)(%s) as donor.", joiner_idx, joiner->segment, joiner->name, required_donor ? donor_string : "*any*", donor_idx, donor->segment, donor->name, gcs_node_state_to_str(donor->status)); } // reserve donor, confirm joiner (! assignment order is significant !) joiner->status = GCS_NODE_STATE_JOINER; donor->status = GCS_NODE_STATE_DONOR; memcpy (donor->joiner, joiner->id, GCS_COMP_MEMB_ID_MAX_LEN+1); memcpy (joiner->donor, donor->id, GCS_COMP_MEMB_ID_MAX_LEN+1); } else { gu_warn ("Member %d.%d (%s) requested state transfer from '%s', " "but it is impossible to select State Transfer donor: %s", joiner_idx, group->nodes[joiner_idx].segment, group->nodes[joiner_idx].name, required_donor ? donor_string : "*any*", strerror (-donor_idx)); } return donor_idx; }
// Initialises core and backend objects + some common tests static inline void core_test_init () { long ret; action_t act; mark_point(); gu_config_t* config = gu_config_create (); fail_if (config == NULL); Core = gcs_core_create (config, NULL, "core_test", "aaa.bbb.ccc.ddd:xxxx", 0, 0); fail_if (NULL == Core); Backend = gcs_core_get_backend (Core); fail_if (NULL == Backend); Seqno = 0; // reset seqno ret = core_test_set_payload_size (FRAG_SIZE); fail_if (-EBADFD != ret, "Expected -EBADFD, got: %ld (%s)", ret, strerror(-ret)); ret = gcs_core_open (Core, "yadda-yadda", "owkmevc", 1); fail_if (-EINVAL != ret, "Expected -EINVAL, got %ld (%s)", ret, strerror(-ret)); ret = gcs_core_open (Core, "yadda-yadda", "dummy://", 1); fail_if (0 != ret, "Failed to open core connection: %ld (%s)", ret, strerror(-ret)); // receive first configuration message fail_if (CORE_RECV_ACT (&act, NULL, UNKNOWN_SIZE, GCS_ACT_CONF)); fail_if (core_test_check_conf(act.out, true, 0, 1)); free (act.out); // this will configure backend to have desired fragment size ret = core_test_set_payload_size (FRAG_SIZE); fail_if (0 != ret, "Failed to set up the message payload size: %ld (%s)", ret, strerror(-ret)); // try to send an action to check that everything's alright ret = gcs_core_send (Core, act1, sizeof(act1_str), GCS_ACT_TORDERED); fail_if (ret != sizeof(act1_str), "Expected %d, got %d (%s)", sizeof(act1_str), ret, strerror (-ret)); gu_warn ("Next CORE_RECV_ACT fails under valgrind"); act.in = act1; fail_if (CORE_RECV_ACT (&act, act1_str, sizeof(act1_str),GCS_ACT_TORDERED)); ret = gcs_core_send_join (Core, Seqno); fail_if (ret != 0, "gcs_core_send_join(): %ld (%s)", ret, strerror(-ret)); // no action to be received (we're joined already) ret = gcs_core_send_sync (Core, Seqno); fail_if (ret != 0, "gcs_core_send_sync(): %ld (%s)", ret, strerror(-ret)); fail_if (CORE_RECV_ACT(&act,NULL,sizeof(gcs_seqno_t),GCS_ACT_SYNC)); fail_if (Seqno != gcs_seqno_gtoh(*(gcs_seqno_t*)act.out)); gcs_core_send_lock_step (Core, true); mark_point(); }
/*! Processes a new action added to a slave queue. * @return length of sleep in nanoseconds or negative error code * or GU_TIME_ETERNITY for complete stop */ long long gcs_fc_process (gcs_fc_t* fc, ssize_t act_size) { fc->size += act_size; fc->act_count++; if (fc->size <= fc->soft_limit) { /* normal operation */ if (gu_unlikely(fc->debug > 0 && !(fc->act_count % fc->debug))) { gu_info ("FC: queue size: %zdb (%4.1f%% of soft limit)", fc->size, ((double)fc->size)/fc->soft_limit*100.0); } return 0; } else if (fc->size >= fc->hard_limit) { if (0.0 == fc->max_throttle) { /* we can accept total service outage */ return GU_TIME_ETERNITY; } else { gu_error ("Recv queue hard limit exceded. Can't continue."); return -ENOMEM; } } // else if (!(fc->act_count & 7)) { // do this for every 8th action else { long long end = gu_time_monotonic(); double interval = ((end - fc->start) * 1.0e-9); if (gu_unlikely (0 == fc->last_sleep)) { /* just tripped the soft limit, preparing constants for throttle */ fc->max_rate = (double)(fc->size - fc->init_size) / interval; double s = (1.0 - fc->max_throttle)/(fc->soft_limit-fc->hard_limit); assert (s < 0.0); fc->scale = s * fc->max_rate; fc->offset = (1.0 - s*fc->soft_limit) * fc->max_rate; // calculate time interval from the soft limit interval = interval * (double)(fc->size - fc->soft_limit) / (fc->size - fc->init_size); assert (interval >= 0.0); // Move reference point to soft limit fc->last_sleep = fc->soft_limit; fc->start = end - interval; gu_warn("Soft recv queue limit exceeded, starting replication " "throttle. Measured avg. rate: %f bytes/sec; " "Throttle parameters: scale=%f, offset=%f", fc->max_rate, fc->scale, fc->offset); } /* throttling operation */ double desired_rate = fc->size * fc->scale + fc->offset; // linear decay //double desired_rate = fc->max_rate * fc->max_throttle; // square wave assert (desired_rate <= fc->max_rate); double sleep = (double)(fc->size - fc->last_sleep) / desired_rate - interval; if (gu_unlikely(fc->debug > 0 && !(fc->act_count % fc->debug))) { gu_info ("FC: queue size: %zdb, length: %zd, " "measured rate: %fb/s, desired rate: %fb/s, " "interval: %5.3fs, sleep: %5.4fs. " "Sleeps initiated: %zd, for a total of %6.3fs", fc->size, fc->act_count, ((double)(fc->size - fc->last_sleep))/interval, desired_rate, interval, sleep, fc->sleep_count, fc->sleeps); fc->sleep_count = 0; fc->sleeps = 0.0; } if (gu_likely(sleep < min_sleep)) { #if 0 gu_info ("Skipping sleep: desired_rate = %f, sleep = %f (%f), " "interval = %f, fc->scale = %f, fc->offset = %f, " "fc->size = %zd", desired_rate, sleep, min_sleep, interval, fc->scale, fc->offset, fc->size); #endif return 0; } fc->last_sleep = fc->size; fc->start = end; fc->sleep_count++; fc->sleeps += sleep; return (1000000000LL * sleep); } return 0; }