Beispiel #1
0
long gu_to_self_cancel(gu_to_t *to, gu_seqno_t seqno)
{
    long         err = 0;
    to_waiter_t *w;

    assert (seqno >= 0);

    if ((err = gu_mutex_lock (&to->lock))) {
        gu_fatal("Mutex lock failed (%d): %s", err, strerror(err));
        abort();
    }

    if ((w = to_get_waiter (to, seqno)) == NULL) {
        gu_mutex_unlock(&to->lock);
        return -EAGAIN;
    }        
    /* we have a valid waiter now */

    if (seqno > to->seqno) { // most probable case
        w->state = CANCELED;
    }
    else if (seqno == to->seqno) {
        // have to wake the next waiter as if we grabbed and now releasing TO
        to_release_and_wake_next (to, w);
    }
    else { // (seqno < to->seqno)
        // This waiter must have been canceled or even released by preceding
        // waiter. Do nothing.
    }
    
    gu_mutex_unlock(&to->lock);
    
    return err;
}
Beispiel #2
0
long gu_to_interrupt (gu_to_t *to, gu_seqno_t seqno)
{
    long rcode = 0;
    long err;
    to_waiter_t *w;

    assert (seqno >= 0);

    if ((err = gu_mutex_lock (&to->lock))) {
        gu_fatal("Mutex lock failed (%d): %s", err, strerror(err));
        abort();
    }
    if (seqno >= to->seqno) {
        if ((w = to_get_waiter (to, seqno)) == NULL) {
            gu_mutex_unlock(&to->lock);
            return -EAGAIN;
        }        
        /* we have a valid waiter now */

        switch (w->state) {
        case HOLDER:
            gu_debug ("trying to interrupt in use seqno: seqno = %llu, "
                      "TO seqno = %llu", seqno, to->seqno);
            /* gu_mutex_unlock (&to->lock); */
            rcode = -ERANGE;
            break;
        case CANCELED:
            gu_debug ("trying to interrupt canceled seqno: seqno = %llu, "
                      "TO seqno = %llu", seqno, to->seqno);
            /* gu_mutex_unlock (&to->lock); */
            rcode = -ERANGE;
            break;
        case WAIT:
            gu_debug ("signaling to interrupt wait seqno: seqno = %llu, "
                      "TO seqno = %llu", seqno, to->seqno);
            rcode    = to_wake_waiter (w);
        case RELEASED:
            w->state = INTERRUPTED;
            break;
        case INTERRUPTED:
            gu_debug ("TO waiter interrupt already seqno: seqno = %llu, "
                      "TO seqno = %llu", seqno, to->seqno);
            break;
        }
    } else {
        gu_debug ("trying to interrupt used seqno: cancel seqno = %llu, "
                  "TO seqno = %llu", seqno, to->seqno);
        /* gu_mutex_unlock (&to->lock); */
        rcode = -ERANGE;
    }

    gu_mutex_unlock (&to->lock);
    return rcode;
}
Beispiel #3
0
long gu_to_release (gu_to_t *to, gu_seqno_t seqno)
{
    long         err;
    to_waiter_t *w;

    assert (seqno >= 0);

    if ((err = gu_mutex_lock(&to->lock))) {
        gu_fatal("Mutex lock failed (%d): %s", err, strerror(err));
        abort();
    }

    if ((w = to_get_waiter (to, seqno)) == NULL) {
        gu_mutex_unlock(&to->lock);
        return -EAGAIN;
    }        
    /* we have a valid waiter now */
    
    if (seqno == to->seqno) {
        to_release_and_wake_next (to, w);
    } else if (seqno > to->seqno) {
        if (w->state != CANCELED) {
            gu_fatal("Illegal state in premature release: %d", w->state);
            abort();
        }
        /* Leave state CANCELED so that real releaser can iterate  */
    } else {
        /* */
        if (w->state != RELEASED) {
            gu_fatal("Outdated seqno and state not RELEASED: %d", w->state);
            abort();
        }
    }

    gu_mutex_unlock(&to->lock);

    return err;
}
// does basic sanity check of the component message (in response to #145)
static void
group_check_comp_msg (bool prim, long my_idx, long members)
{
    if (my_idx >= 0) {
        if (my_idx < members) return;
    }
    else {
        if (!prim && (0 == members)) return;
    }

    gu_fatal ("Malformed component message from backend: "
              "%s, idx = %ld, members = %ld",
              prim ? "PRIMARY" : "NON-PRIMARY", my_idx, members);

    assert (0);
    gu_abort ();
}
Beispiel #5
0
static inline long
to_wake_waiter (to_waiter_t* w)
{
    long err = 0;

    if (w->state == WAIT) {
#ifdef TO_USE_SIGNAL
        err = gu_cond_signal (&w->cond);
#else
        err = pthread_mutex_unlock (&w->mtx);
#endif
        if (err) {
            gu_fatal ("gu_cond_signal failed: %d", err);
        }
    }
    return err;
}
Beispiel #6
0
long gu_to_cancel (gu_to_t *to, gu_seqno_t seqno)
{
    long         err;
    to_waiter_t *w;

    assert (seqno >= 0);

    if ((err = gu_mutex_lock (&to->lock))) {
        gu_fatal("Mutex lock failed (%d): %s", err, strerror(err));
        abort();
    }
    
    // Check for queue overflow. This is totally unrecoverable. Abort.
    if ((w = to_get_waiter (to, seqno)) == NULL) {
        gu_mutex_unlock(&to->lock);
        abort();
    }        
    /* we have a valid waiter now */

    if ((seqno > to->seqno) || 
        (seqno == to->seqno && w->state != HOLDER)) {
        err = to_wake_waiter (w);
        w->state = CANCELED;
    } else if (seqno == to->seqno && w->state == HOLDER) {
        gu_warn("tried to cancel current TO holder, state %d seqno %llu",
                 w->state, seqno);
        err = -ECANCELED;
    } else {
        gu_warn("trying to cancel used seqno: state %d cancel seqno = %llu, "
                "TO seqno = %llu", w->state, seqno, to->seqno);
        err = -ECANCELED;        
    }
    
    gu_mutex_unlock (&to->lock);
    return err;
}
Beispiel #7
0
long gu_to_grab (gu_to_t* to, gu_seqno_t seqno)
{
    long err;
    to_waiter_t *w;

    assert (seqno >= 0);

    if ((err = gu_mutex_lock(&to->lock))) {
        gu_fatal("Mutex lock failed (%d): %s", err, strerror(err));
        abort();
    }

    if (seqno < to->seqno) {
        gu_mutex_unlock(&to->lock);
        return -ECANCELED;
    }

    if ((w = to_get_waiter (to, seqno)) == NULL) {
        gu_mutex_unlock(&to->lock);
        return -EAGAIN;
    }        
    /* we have a valid waiter now */

    switch (w->state) {
    case INTERRUPTED:
        w->state = RELEASED;
        err = -EINTR;
        break;
    case CANCELED:
        err = -ECANCELED;
        break;
    case RELEASED:
        if (seqno == to->seqno) {
            w->state = HOLDER;
        } else if (seqno < to->seqno) {
            gu_error("Trying to grab outdated seqno");
            err = -ECANCELED;
        } else { /* seqno > to->seqno, wait for my turn */
            w->state = WAIT;
            to->used++;
#ifdef TO_USE_SIGNAL
            gu_cond_wait(&w->cond, &to->lock);
#else
            pthread_mutex_lock (&w->mtx);
            pthread_mutex_unlock (&to->lock);
            pthread_mutex_lock (&w->mtx); // wait for unlock by other thread
            pthread_mutex_lock (&to->lock);
            pthread_mutex_unlock (&w->mtx);
#endif
            to->used--;
            switch (w->state) { 
            case WAIT:// should be most probable
                assert (seqno == to->seqno);
                w->state = HOLDER;
                break;
            case INTERRUPTED:
                w->state = RELEASED;
                err      = -EINTR;
                break;
            case CANCELED:
                err = -ECANCELED;
                break;
            case RELEASED:
                /* this waiter has been cancelled */
                assert(seqno < to->seqno);
                err = -ECANCELED;
                break;
            default:
                gu_fatal("Invalid cond wait exit state %d, seqno %llu(%llu)",
                         w->state, seqno, to->seqno);
                abort();
            }
        }
        break;
    default:
        gu_fatal("TO queue over wrap");
        abort();
    }
    
    gu_mutex_unlock(&to->lock);
    return err;
}
/*! return true if this node is the sender to notify the calling thread of
 * success */
int
gcs_group_handle_join_msg  (gcs_group_t* group, const gcs_recv_msg_t* msg)
{
    int const   sender_idx = msg->sender_idx;
    gcs_node_t* sender    = &group->nodes[sender_idx];

    assert (GCS_MSG_JOIN == msg->type);

    // TODO: define an explicit type for the join message, like gcs_join_msg_t
    assert (msg->size == sizeof(gcs_seqno_t));

    if (GCS_NODE_STATE_DONOR  == sender->status ||
        GCS_NODE_STATE_JOINER == sender->status) {
        int j;
        gcs_seqno_t seqno     = gcs_seqno_gtoh(*(gcs_seqno_t*)msg->buf);
        gcs_node_t* peer      = NULL;
        const char* peer_id   = NULL;
        const char* peer_name = "left the group";
        int         peer_idx  = -1;
        bool        from_donor = false;
        const char* st_dir    = NULL; // state transfer direction symbol

        if (GCS_NODE_STATE_DONOR == sender->status) {
            peer_id    = sender->joiner;
            from_donor = true;
            st_dir     = "to";

            assert (group->last_applied_proto_ver >= 0);

            if (0 == group->last_applied_proto_ver) {
                /* #454 - we don't switch to JOINED here,
                 *        instead going straignt to SYNCED */
            }
            else {
                assert(sender->count_last_applied);
                sender->status = GCS_NODE_STATE_JOINED;
            }
        }
        else {
            peer_id = sender->donor;
            st_dir  = "from";

            if (group->quorum.version < 2) {
                // #591 remove after quorum v1 is phased out
                sender->status = GCS_NODE_STATE_JOINED;
                group->prim_num++;
            }
            else {
                if (seqno >= 0) {
                    sender->status = GCS_NODE_STATE_JOINED;
                    group->prim_num++;
                }
                else {
                    sender->status = GCS_NODE_STATE_PRIM;
                }
            }
        }

        // Try to find peer.
        for (j = 0; j < group->num; j++) {
// #483            if (j == sender_idx) continue;
            if (!memcmp(peer_id, group->nodes[j].id,
                        sizeof (group->nodes[j].id))) {
                peer_idx  = j;
                peer      = &group->nodes[peer_idx];
                peer_name = peer->name;
                break;
            }
        }

        if (j == group->num) {
            gu_warn ("Could not find peer: %s", peer_id);
        }

        if (seqno < 0) {
            gu_warn ("%d.%d (%s): State transfer %s %d.%d (%s) failed: %d (%s)",
                     sender_idx, sender->segment, sender->name, st_dir,
                     peer_idx, peer ? peer->segment : -1, peer_name,
                     (int)seqno, strerror((int)-seqno));

            if (from_donor && peer_idx == group->my_idx &&
                GCS_NODE_STATE_JOINER == group->nodes[peer_idx].status) {
                // this node will be waiting for SST forever. If it has only
                // one recv thread there is no (generic) way to wake it up.
                gu_fatal ("Will never receive state. Need to abort.");
                // return to core to shutdown the backend before aborting
                return -ENOTRECOVERABLE;
            }

            if (group->quorum.version < 2 && !from_donor && // #591
                sender_idx == group->my_idx) {
                // remove after quorum v1 is phased out
                gu_fatal ("Faield to receive state. Need to abort.");
                return -ENOTRECOVERABLE;
            }
        }
        else {
            if (sender_idx == peer_idx) {
                gu_info ("Member %d.%d (%s) resyncs itself to group",
                         sender_idx, sender->segment, sender->name);
            }
            else {
                gu_info ("%d.%d (%s): State transfer %s %d.%d (%s) complete.",
                         sender_idx, sender->segment, sender->name, st_dir,
                         peer_idx, peer ? peer->segment : -1, peer_name);
            }
        }
    }
    else {
        if (GCS_NODE_STATE_PRIM == sender->status) {
            gu_warn("Rejecting JOIN message from %d.%d (%s): new State Transfer"
                    " required.", sender_idx, sender->segment, sender->name);
        }
        else {
            // should we freak out and throw an error?
            gu_warn("Protocol violation. JOIN message sender %d.%d (%s) is not "
                    "in state transfer (%s). Message ignored.",
                    sender_idx, sender->segment, sender->name,
                    gcs_node_state_to_str(sender->status));
        }
        return 0;
    }

    return (sender_idx == group->my_idx);
}
gcs_group_state_t
gcs_group_handle_comp_msg (gcs_group_t* group, const gcs_comp_msg_t* comp)
{
    long        new_idx, old_idx;
    gcs_node_t* new_nodes = NULL;
    ulong       new_memb  = 0;

    const bool prim_comp     = gcs_comp_msg_primary  (comp);
    const bool bootstrap     = gcs_comp_msg_bootstrap(comp);
    const long new_my_idx    = gcs_comp_msg_self     (comp);
    const long new_nodes_num = gcs_comp_msg_num      (comp);

    group_check_comp_msg (prim_comp, new_my_idx, new_nodes_num);

    if (new_my_idx >= 0) {
        gu_info ("New COMPONENT: primary = %s, bootstrap = %s, my_idx = %ld, "
                 "memb_num = %ld", prim_comp ? "yes" : "no",
                 bootstrap ? "yes" : "no", new_my_idx, new_nodes_num);

        new_nodes = group_nodes_init (group, comp);

        if (!new_nodes) {
            gu_fatal ("Could not allocate memory for %ld-node component.",
                      gcs_comp_msg_num (comp));
            assert(0);
            return (gcs_group_state_t)-ENOMEM;
        }

        if (GCS_GROUP_PRIMARY == group->state) {
            gu_debug ("#281: Saving %s over %s",
                      gcs_node_state_to_str(group->nodes[group->my_idx].status),
                      gcs_node_state_to_str(group->prim_state));
            group->prim_state = group->nodes[group->my_idx].status;
        }
    }
    else {
        // Self-leave message
        gu_info ("Received self-leave message.");
        assert (0 == new_nodes_num);
        assert (!prim_comp);
    }

    if (prim_comp) {
        /* Got PRIMARY COMPONENT - Hooray! */
        assert (new_my_idx >= 0);
        if (group->state == GCS_GROUP_PRIMARY) {
            /* we come from previous primary configuration, relax */
        }
        else if (bootstrap)
        {
            /* Is there need to initialize something else in this case? */
            group->nodes[group->my_idx].bootstrap = true;
        }
        else {
            const bool first_component =
#ifndef GCS_CORE_TESTING
            (1 == group->num) && !strcmp (NODE_NO_ID, group->nodes[0].id);
#else
            (1 == group->num);
#endif

            if (1 == new_nodes_num && first_component) {
                /* bootstrap new configuration */
                assert (GCS_GROUP_NON_PRIMARY == group->state);
                assert (1 == group->num);
                assert (0 == group->my_idx);

                // This bootstraps initial primary component for state exchange
                gu_uuid_generate (&group->prim_uuid, NULL, 0);
                group->prim_seqno = 0;
                group->prim_num   = 1;
                group->state      = GCS_GROUP_PRIMARY;

                if (group->act_id < 0) {
                    // no history provided: start a new one
                    group->act_id  = GCS_SEQNO_NIL;
                    gu_uuid_generate (&group->group_uuid, NULL, 0);
                    gu_info ("Starting new group from scratch: "GU_UUID_FORMAT,
                             GU_UUID_ARGS(&group->group_uuid));
                }
// the following should be removed under #474
                group->nodes[0].status = GCS_NODE_STATE_JOINED;
                /* initialize node ID to the one given by the backend - this way
                 * we'll be recognized as coming from prev. conf. in node array
                 * remap below */
                strncpy ((char*)group->nodes[0].id, new_nodes[0].id,
                         sizeof (new_nodes[0].id) - 1);
                group->nodes[0].segment = new_nodes[0].segment;
            }
        }
    }
    else {
        group_go_non_primary (group);
    }

    /* Remap old node array to new one to preserve action continuity */
    for (new_idx = 0; new_idx < new_nodes_num; new_idx++) {
        /* find member index in old component by unique member id */
        for (old_idx = 0; old_idx < group->num; old_idx++) {
            // just scan through old group
            if (!strcmp(group->nodes[old_idx].id, new_nodes[new_idx].id)) {
                /* the node was in previous configuration with us */
                /* move node context to new node array */
                gcs_node_move (&new_nodes[new_idx], &group->nodes[old_idx]);
                break;
            }
        }
        /* if wasn't found in new configuration, new member -
         * need to do state exchange */
        new_memb |= (old_idx == group->num);
    }

    /* free old nodes array */
    group_nodes_free (group);

    group->my_idx = new_my_idx;
    group->num    = new_nodes_num;
    group->nodes  = new_nodes;

    if (gcs_comp_msg_primary(comp) || bootstrap) {
        /* TODO: for now pretend that we always have new nodes and perform
         * state exchange because old states can carry outdated node status.
         * (also protocol voting needs to be redone)
         * However this means aborting ongoing actions. Find a way to avoid
         * this extra state exchange. Generate new state messages on behalf
         * of other nodes? see #238 */
        new_memb = true;
        /* if new nodes joined, reset ongoing actions and state messages */
        if (new_memb) {
            group_nodes_reset (group);
            group->state      = GCS_GROUP_WAIT_STATE_UUID;
            group->state_uuid = GU_UUID_NIL; // prepare for state exchange
        }
        else {
            if (GCS_GROUP_PRIMARY == group->state) {
                /* since we don't have any new nodes since last PRIMARY,
                   we skip state exchange */
                group_post_state_exchange (group);
            }
        }
        group_redo_last_applied (group);
    }

    return group->state;
}
/*! Processes state messages and sets group parameters accordingly */
static void
group_post_state_exchange (gcs_group_t* group)
{
    const gcs_state_msg_t* states[group->num];
    gcs_state_quorum_t* quorum = &group->quorum;
    bool new_exchange = gu_uuid_compare (&group->state_uuid, &GU_UUID_NIL);
    long i;

    /* Collect state messages from nodes. */
    /* Looping here every time is suboptimal, but simply counting state messages
     * is not straightforward too: nodes may disappear, so the final count may
     * include messages from the disappeared nodes.
     * Let's put it this way: looping here is reliable and not that expensive.*/
    for (i = 0; i < group->num; i++) {
        states[i] = group->nodes[i].state_msg;
        if (NULL == states[i] ||
            (new_exchange &&
             gu_uuid_compare (&group->state_uuid,
                              gcs_state_msg_uuid(states[i]))))
            return; // not all states from THIS state exch. received, wait
    }
    gu_debug ("STATE EXCHANGE: "GU_UUID_FORMAT" complete.",
              GU_UUID_ARGS(&group->state_uuid));

    gcs_state_msg_get_quorum (states, group->num, quorum);

    if (quorum->version >= 0) {
        if (quorum->version < 2) {
            group->last_applied_proto_ver = 0;
        }
        else {
            group->last_applied_proto_ver = 1;
        }
    }
    else {
        gu_fatal ("Negative quorum version: %d", quorum->version);
        abort();
    }

    // Update each node state based on quorum outcome:
    // is it up to date, does it need SST and stuff
    for (i = 0; i < group->num; i++) {
        gcs_node_update_status (&group->nodes[i], quorum);
    }

    if (quorum->primary) {
        // primary configuration
        if (new_exchange) {
            // new state exchange happened
            group->state      = GCS_GROUP_PRIMARY;
            group->act_id     = quorum->act_id;
            group->conf_id    = quorum->conf_id + 1;
            group->group_uuid = quorum->group_uuid;
            group->prim_uuid  = group->state_uuid;
            group->state_uuid = GU_UUID_NIL;
        }
        else {
            // no state exchange happend, processing old state messages
            assert (GCS_GROUP_PRIMARY == group->state);
            group->conf_id++;
        }

        group->prim_seqno = group->conf_id;
        group->prim_num   = 0;

        for (i = 0; i < group->num; i++) {
            group->prim_num += gcs_node_is_joined (group->nodes[i].status);
        }

        assert (group->prim_num > 0);
    }
    else {
        // non-primary configuration
        group_go_non_primary (group);
    }

    gu_info ("Quorum results:"
             "\n\tversion    = %u,"
             "\n\tcomponent  = %s,"
             "\n\tconf_id    = %lld,"
             "\n\tmembers    = %d/%d (joined/total),"
             "\n\tact_id     = %lld,"
             "\n\tlast_appl. = %lld,"
             "\n\tprotocols  = %d/%d/%d (gcs/repl/appl),"
             "\n\tgroup UUID = "GU_UUID_FORMAT,
             quorum->version,
             quorum->primary ? "PRIMARY" : "NON-PRIMARY",
             quorum->conf_id,
             group->prim_num, group->num,
             quorum->act_id,
             group->last_applied,
             quorum->gcs_proto_ver, quorum->repl_proto_ver,
             quorum->appl_proto_ver,
             GU_UUID_ARGS(&quorum->group_uuid));

    group_check_donor(group);
}
long gcs_fifo_lite_destroy (gcs_fifo_lite_t* f)
{
    if (f) {
	if (gu_mutex_lock (&f->lock)) { abort(); }

	if (f->destroyed) {
	    gu_mutex_unlock (&f->lock);
	    return -EALREADY;
	}

        f->closed    = true;
	f->destroyed = true;

	/* get rid of "put" threads waiting for lock or signal */
	while (pthread_cond_destroy (&f->put_cond)) {
            if (f->put_wait <= 0) {
                gu_fatal ("Can't destroy condition while nobody's waiting");
                abort();
            }
            f->put_wait = 0;
            gu_cond_broadcast (&f->put_cond);
	}

	while (f->used) {
	    /* there are some items in FIFO - and that means
	     * no gcs_fifo_lite_safe_get() is waiting on condition */
	    gu_mutex_unlock (&f->lock);
	    /* let them get remaining items from FIFO,
	     * we don't know how to deallocate them ourselves.
	     * unfortunately this may take some time */
	    usleep (10000); /* sleep a bit to avoid busy loop */
	    gu_mutex_lock (&f->lock);
	}
	f->length = 0;

	/* now all we have - "get" threads waiting for lock or signal */
	while (pthread_cond_destroy (&f->get_cond)) {
            if (f->get_wait <= 0) {
                gu_fatal ("Can't destroy condition while nobody's waiting");
                abort();
            }
            f->get_wait = 0;
            gu_cond_broadcast (&f->get_cond);
	}

	/* at this point there are only functions waiting for lock */
	gu_mutex_unlock (&f->lock);
	while (gu_mutex_destroy (&f->lock)) {
	    /* this should be fast provided safe get and safe put are
	     * wtitten correctly. They should immediately freak out. */
	    gu_mutex_lock   (&f->lock);
	    gu_mutex_unlock (&f->lock);
	}

	/* now nobody's waiting for anything */
	gu_free (f->queue);
	gu_free (f);
	return 0;
    }
    return -EINVAL;
}