Пример #1
0
static void
join_make_offer(gpointer key, gpointer value, gpointer user_data)
{
    xmlNode *offer = NULL;
    crm_node_t *member = (crm_node_t *)value;

    CRM_ASSERT(member != NULL);
    if (crm_is_peer_active(member) == FALSE) {
        crm_info("Not making an offer to %s: not active (%s)", member->uname, member->state);
        if(member->expected == NULL && safe_str_eq(member->state, CRM_NODE_LOST)) {
            /* You would think this unsafe, but in fact this plus an
             * active resource is what causes it to be fenced.
             *
             * Yes, this does mean that any node that dies at the same
             * time as the old DC and is not running resource (still)
             * won't be fenced.
             *
             * I'm not happy about this either.
             */
            crm_update_peer_expected(__FUNCTION__, member, CRMD_JOINSTATE_DOWN);
        }
        return;
    }

    if (member->uname == NULL) {
        crm_err("No recipient for welcome message");
        return;
    }

    if (saved_ccm_membership_id != crm_peer_seq) {
        saved_ccm_membership_id = crm_peer_seq;
        crm_info("Making join offers based on membership %llu", crm_peer_seq);
    }

    if(user_data && member->join > crm_join_none) {
        crm_info("Skipping %s: already known %d", member->uname, member->join);
        return;
    }

    crm_update_peer_join(__FUNCTION__, (crm_node_t*)member, crm_join_none);

    offer = create_request(CRM_OP_JOIN_OFFER, NULL, member->uname,
                           CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);

    crm_xml_add_int(offer, F_CRM_JOIN_ID, current_join_id);
    /* send the welcome */
    crm_info("join-%d: Sending offer to %s", current_join_id, member->uname);

    send_cluster_message(member, crm_msg_crmd, offer, TRUE);
    free_xml(offer);

    crm_update_peer_join(__FUNCTION__, member, crm_join_welcomed);
    /* crm_update_peer_expected(__FUNCTION__, member, CRMD_JOINSTATE_PENDING); */
}
Пример #2
0
/*	A_DC_JOIN_PROCESS_ACK	*/
void
do_dc_join_ack(long long action,
               enum crmd_fsa_cause cause,
               enum crmd_fsa_state cur_state,
               enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    int join_id = -1;
    int call_id = 0;
    ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);

    const char *op = crm_element_value(join_ack->msg, F_CRM_TASK);
    const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);
    crm_node_t *peer = crm_get_peer(0, join_from);

    if (safe_str_neq(op, CRM_OP_JOIN_CONFIRM) || peer == NULL) {
        crm_debug("Ignoring op=%s message from %s", op, join_from);
        return;
    }

    crm_trace("Processing ack from %s", join_from);
    crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id);

    if (peer->join != crm_join_finalized) {
        crm_info("Join not in progress: ignoring join-%d from %s (phase = %d)",
                 join_id, join_from, peer->join);
        return;

    } else if (join_id != current_join_id) {
        crm_err("Invalid response from %s: join-%d vs. join-%d",
                join_from, join_id, current_join_id);
        crm_update_peer_join(__FUNCTION__, peer, crm_join_nack);
        return;
    }

    crm_update_peer_join(__FUNCTION__, peer, crm_join_confirmed);

    crm_info("join-%d: Updating node state to %s for %s",
             join_id, CRMD_JOINSTATE_MEMBER, join_from);

    /* update CIB with the current LRM status from the node
     * We dont need to notify the TE of these updates, a transition will
     *   be started in due time
     */
    erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local);
    fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml,
                   cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL);
    fsa_register_cib_callback(call_id, FALSE, NULL, join_update_complete_callback);
    crm_debug("join-%d: Registered callback for LRM update %d", join_id, call_id);
}
Пример #3
0
void
initialize_join(gboolean before)
{
    GHashTableIter iter;
    crm_node_t *peer = NULL;

    /* clear out/reset a bunch of stuff */
    crm_debug("join-%d: Initializing join data (flag=%s)",
              current_join_id, before ? "true" : "false");

    g_hash_table_iter_init(&iter, crm_peer_cache);
    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) {
        crm_update_peer_join(__FUNCTION__, peer, crm_join_none);
    }

    if (before) {
        if (max_generation_from != NULL) {
            free(max_generation_from);
            max_generation_from = NULL;
        }
        if (max_generation_xml != NULL) {
            free_xml(max_generation_xml);
            max_generation_xml = NULL;
        }
        clear_bit(fsa_input_register, R_HAVE_CIB);
        clear_bit(fsa_input_register, R_CIB_ASKED);
    }
}
Пример #4
0
static void
reap_dead_nodes(gpointer key, gpointer value, gpointer user_data)
{
    crm_node_t *node = value;

    if (crm_is_peer_active(node) == FALSE) {
        crm_update_peer_join(__FUNCTION__, node, crm_join_none);

        if(node && node->uname) {
            election_remove(fsa_election, node->uname);

            if (safe_str_eq(fsa_our_uname, node->uname)) {
                crm_err("We're not part of the cluster anymore");
                register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);

            } else if (AM_I_DC == FALSE && safe_str_eq(node->uname, fsa_our_dc)) {
                crm_warn("Our DC node (%s) left the cluster", node->uname);
                register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
            }
        }

        if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) {
            check_join_state(fsa_state, __FUNCTION__);
        }
        fail_incompletable_actions(transition_graph, node->uuid);
    }
}
Пример #5
0
static void
join_make_offer(gpointer key, gpointer value, gpointer user_data)
{
    xmlNode *offer = NULL;
    crm_node_t *member = (crm_node_t *)value;

    CRM_ASSERT(member != NULL);
    if (crm_is_peer_active(member) == FALSE) {
        crm_info("Not making an offer to %s: not active (%s)", member->uname, member->state);
        if(member->expected == NULL && safe_str_eq(member->state, CRM_NODE_LOST)) {
            crm_update_peer_expected(__FUNCTION__, member, CRMD_JOINSTATE_DOWN);
        }
        return;
    }

    if (member->uname == NULL) {
        crm_err("No recipient for welcome message");
        return;
    }

    if (saved_ccm_membership_id != crm_peer_seq) {
        saved_ccm_membership_id = crm_peer_seq;
        crm_info("Making join offers based on membership %llu", crm_peer_seq);
    }

    if(user_data && member->join > crm_join_none) {
        crm_info("Skipping %s: already known %d", member->uname, member->join);
        return;
    }

    crm_update_peer_join(__FUNCTION__, (crm_node_t*)member, crm_join_none);

    offer = create_request(CRM_OP_JOIN_OFFER, NULL, member->uname,
                           CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);

    crm_xml_add_int(offer, F_CRM_JOIN_ID, current_join_id);
    /* send the welcome */
    crm_info("join-%d: Sending offer to %s", current_join_id, member->uname);

    send_cluster_message(member, crm_msg_crmd, offer, TRUE);
    free_xml(offer);

    crm_update_peer_join(__FUNCTION__, member, crm_join_welcomed);
    /* crm_update_peer_expected(__FUNCTION__, member, CRMD_JOINSTATE_PENDING); */
}
Пример #6
0
void
finalize_join_for(gpointer key, gpointer value, gpointer user_data)
{
    xmlNode *acknak = NULL;
    xmlNode *tmp1 = NULL;
    crm_node_t *join_node = value;
    const char *join_to = join_node->uname;

    if(join_node->join != crm_join_integrated) {
        crm_trace("Skipping %s in state %d", join_to, join_node->join);
        return;
    }

    /* make sure a node entry exists for the new node */
    crm_trace("Creating node entry for %s", join_to);

    tmp1 = create_xml_node(NULL, XML_CIB_TAG_NODE);
    set_uuid(tmp1, XML_ATTR_UUID, join_node);
    crm_xml_add(tmp1, XML_ATTR_UNAME, join_to);

    fsa_cib_anon_update(XML_CIB_TAG_NODES, tmp1,
                        cib_scope_local | cib_quorum_override | cib_can_create);
    free_xml(tmp1);

    join_node = crm_get_peer(0, join_to);
    if (crm_is_peer_active(join_node) == FALSE) {
        /*
         * NACK'ing nodes that the membership layer doesn't know about yet
         * simply creates more churn
         *
         * Better to leave them waiting and let the join restart when
         * the new membership event comes in
         *
         * All other NACKs (due to versions etc) should still be processed
         */
        crm_update_peer_expected(__FUNCTION__, join_node, CRMD_JOINSTATE_PENDING);
        return;
    }

    /* send the ack/nack to the node */
    acknak = create_request(CRM_OP_JOIN_ACKNAK, NULL, join_to,
                            CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL);
    crm_xml_add_int(acknak, F_CRM_JOIN_ID, current_join_id);

    crm_debug("join-%d: ACK'ing join request from %s",
              current_join_id, join_to);
    crm_xml_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_TRUE);
    crm_update_peer_join(__FUNCTION__, join_node, crm_join_finalized);
    crm_update_peer_expected(__FUNCTION__, join_node, CRMD_JOINSTATE_MEMBER);

    send_cluster_message(crm_get_peer(0, join_to), crm_msg_crmd, acknak, TRUE);
    free_xml(acknak);
    return;
}
Пример #7
0
void
send_stonith_update(crm_action_t * action, const char *target, const char *uuid)
{
    int rc = pcmk_ok;
    crm_node_t *peer = NULL;

    /* zero out the node-status & remove all LRM status info */
    xmlNode *node_state = NULL;

    CRM_CHECK(target != NULL, return);
    CRM_CHECK(uuid != NULL, return);

    /* Make sure the membership and join caches are accurate */
    peer = crm_get_peer_full(0, target, CRM_GET_PEER_CLUSTER | CRM_GET_PEER_REMOTE);

    CRM_CHECK(peer != NULL, return);

    if (peer->uuid == NULL) {
        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
        peer->uuid = strdup(uuid);
    }
    crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL);
    crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0);
    crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN);
    crm_update_peer_join(__FUNCTION__, peer, crm_join_none);

    node_state =
        do_update_node_cib(peer,
                           node_update_cluster | node_update_peer | node_update_join |
                           node_update_expected, NULL, __FUNCTION__);

    /* Force our known ID */
    crm_xml_add(node_state, XML_ATTR_UUID, uuid);

    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
                                    cib_quorum_override | cib_scope_local | cib_can_create);

    /* Delay processing the trigger until the update completes */
    crm_debug("Sending fencing update %d for %s", rc, target);
    fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);

    /* Make sure it sticks */
    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */

    erase_status_tag(target, XML_CIB_TAG_LRM, cib_scope_local);
    erase_status_tag(target, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);

    free_xml(node_state);
    return;
}
Пример #8
0
/*	 A_DC_JOIN_PROCESS_REQ	*/
void
do_dc_join_filter_offer(long long action,
                        enum crmd_fsa_cause cause,
                        enum crmd_fsa_state cur_state,
                        enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    xmlNode *generation = NULL;

    int cmp = 0;
    int join_id = -1;
    gboolean ack_nack_bool = TRUE;
    const char *ack_nack = CRMD_JOINSTATE_MEMBER;
    ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);

    const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);
    const char *ref = crm_element_value(join_ack->msg, F_CRM_REFERENCE);

    crm_node_t *join_node = crm_get_peer(0, join_from);

    crm_debug("Processing req from %s", join_from);

    generation = join_ack->xml;
    crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id);

    if (max_generation_xml != NULL && generation != NULL) {
        int lpc = 0;

        const char *attributes[] = {
            XML_ATTR_GENERATION_ADMIN,
            XML_ATTR_GENERATION,
            XML_ATTR_NUMUPDATES,
        };

        for (lpc = 0; cmp == 0 && lpc < DIMOF(attributes); lpc++) {
            cmp = compare_int_fields(max_generation_xml, generation, attributes[lpc]);
        }
    }

    if (join_id != current_join_id) {
        crm_debug("Invalid response from %s: join-%d vs. join-%d",
                  join_from, join_id, current_join_id);
        check_join_state(cur_state, __FUNCTION__);
        return;

    } else if (join_node == NULL || crm_is_peer_active(join_node) == FALSE) {
        crm_err("Node %s is not a member", join_from);
        ack_nack_bool = FALSE;

    } else if (generation == NULL) {
        crm_err("Generation was NULL");
        ack_nack_bool = FALSE;

    } else if (max_generation_xml == NULL) {
        max_generation_xml = copy_xml(generation);
        max_generation_from = strdup(join_from);

    } else if (cmp < 0 || (cmp == 0 && safe_str_eq(join_from, fsa_our_uname))) {
        crm_debug("%s has a better generation number than"
                  " the current max %s", join_from, max_generation_from);
        if (max_generation_xml) {
            crm_log_xml_debug(max_generation_xml, "Max generation");
        }
        crm_log_xml_debug(generation, "Their generation");

        free(max_generation_from);
        free_xml(max_generation_xml);

        max_generation_from = strdup(join_from);
        max_generation_xml = copy_xml(join_ack->xml);
    }

    if (ack_nack_bool == FALSE) {
        /* NACK this client */
        ack_nack = CRMD_JOINSTATE_NACK;
        crm_update_peer_join(__FUNCTION__, join_node, crm_join_nack);
        crm_err("Rejecting cluster join request from %s " CRM_XS
                " NACK join-%d ref=%s", join_from, join_id, ref);

    } else {
        crm_debug("join-%d: Welcoming node %s (ref %s)", join_id, join_from, ref);
        crm_update_peer_join(__FUNCTION__, join_node, crm_join_integrated);
    }

    crm_update_peer_expected(__FUNCTION__, join_node, ack_nack);

    crm_debug("%u nodes have been integrated into join-%d",
              crmd_join_phase_count(crm_join_integrated), join_id);


    if (check_join_state(cur_state, __FUNCTION__) == FALSE) {
        /* don't waste time by invoking the PE yet; */
        crm_debug("join-%d: Still waiting on %d outstanding offers",
                  join_id, crmd_join_phase_count(crm_join_welcomed));
    }
}
Пример #9
0
/*	 A_DC_JOIN_OFFER_ONE	*/
void
do_dc_join_offer_one(long long action,
                     enum crmd_fsa_cause cause,
                     enum crmd_fsa_state cur_state,
                     enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    crm_node_t *member;
    ha_msg_input_t *welcome = NULL;

    const char *op = NULL;
    const char *join_to = NULL;

    if (msg_data->data) {
        welcome = fsa_typed_data(fsa_dt_ha_msg);

    } else {
        crm_info("An unknown node joined - (re-)offer to any unconfirmed nodes");
        g_hash_table_foreach(crm_peer_cache, join_make_offer, &member);
        check_join_state(cur_state, __FUNCTION__);
        return;
    }

    if (welcome == NULL) {
        crm_err("Attempt to send welcome message without a message to reply to!");
        return;
    }

    join_to = crm_element_value(welcome->msg, F_CRM_HOST_FROM);
    if (join_to == NULL) {
        crm_err("Attempt to send welcome message without a host to reply to!");
        return;
    }

    member = crm_get_peer(0, join_to);
    op = crm_element_value(welcome->msg, F_CRM_TASK);
    if (join_to != NULL && (cur_state == S_INTEGRATION || cur_state == S_FINALIZE_JOIN)) {
        /* note: it _is_ possible that a node will have been
         *  sick or starting up when the original offer was made.
         *  however, it will either re-announce itself in due course
         *  _or_ we can re-store the original offer on the client.
         */
        crm_trace("(Re-)offering membership to %s...", join_to);
    }

    crm_info("join-%d: Processing %s request from %s in state %s",
             current_join_id, op, join_to, fsa_state2string(cur_state));

    crm_update_peer_join(__FUNCTION__, member, crm_join_none);
    join_make_offer(NULL, member, NULL);

    /* always offer to the DC (ourselves)
     * this ensures the correct value for max_generation_from
     */
    member = crm_get_peer(0, fsa_our_uname);
    join_make_offer(NULL, member, NULL);

    /* this was a genuine join request, cancel any existing
     * transition and invoke the PE
     */
    abort_transition(INFINITY, tg_restart, "Node join", NULL);

    /* don't waste time by invoking the PE yet; */
    crm_debug("Waiting on %d outstanding join acks for join-%d",
              crmd_join_phase_count(crm_join_welcomed), current_join_id);
}
Пример #10
0
void
peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
{
    uint32_t old = 0;
    uint32_t changed = 0;
    bool appeared = FALSE;
    bool is_remote = is_set(node->flags, crm_remote_node);
    const char *status = NULL;

    /* Crmd waits to receive some information from the membership layer before
     * declaring itself operational. If this is being called for a cluster node,
     * indicate that we have it.
     */
    if (!is_remote) {
        set_bit(fsa_input_register, R_PEER_DATA);
    }

    if (node->uname == NULL) {
        return;
    }

    switch (type) {
        case crm_status_uname:
            /* If we've never seen the node, then it also won't be in the status section */
            crm_info("%s node %s is now %s",
                     (is_remote? "Remote" : "Cluster"),
                     node->uname, state_text(node->state));
            return;

        case crm_status_rstate:
        case crm_status_nstate:
            /* This callback should not be called unless the state actually
             * changed, but here's a failsafe just in case.
             */
            CRM_CHECK(safe_str_neq(data, node->state), return);

            crm_info("%s node %s is now %s (was %s)",
                     (is_remote? "Remote" : "Cluster"),
                     node->uname, state_text(node->state), state_text(data));

            if (safe_str_eq(CRM_NODE_MEMBER, node->state)) {
                appeared = TRUE;
                if (!is_remote) {
                    remove_stonith_cleanup(node->uname);
                }
            }

            crmd_alert_node_event(node);
            break;

        case crm_status_processes:
            if (data) {
                old = *(const uint32_t *)data;
                changed = node->processes ^ old;
            }

            status = (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS;
            crm_info("Client %s/%s now has status [%s] (DC=%s, changed=%6x)",
                     node->uname, peer2text(proc_flags), status,
                     AM_I_DC ? "true" : crm_str(fsa_our_dc), changed);

            if ((changed & proc_flags) == 0) {
                /* Peer process did not change */
                crm_trace("No change %6x %6x %6x", old, node->processes, proc_flags);
                return;
            } else if (is_not_set(fsa_input_register, R_CIB_CONNECTED)) {
                crm_trace("Not connected");
                return;
            } else if (fsa_state == S_STOPPING) {
                crm_trace("Stopping");
                return;
            }

            appeared = (node->processes & proc_flags) != 0;
            if (safe_str_eq(node->uname, fsa_our_uname) && (node->processes & proc_flags) == 0) {
                /* Did we get evicted? */
                crm_notice("Our peer connection failed");
                register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL);

            } else if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) {
                /* Did the DC leave us? */
                crm_notice("Our peer on the DC (%s) is dead", fsa_our_dc);
                register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);

                /* @COMPAT DC < 1.1.13: If a DC shuts down normally, we don't
                 * want to fence it. Newer DCs will send their shutdown request
                 * to all peers, who will update the DC's expected state to
                 * down, thus avoiding fencing. We can safely erase the DC's
                 * transient attributes when it leaves in that case. However,
                 * the only way to avoid fencing older DCs is to leave the
                 * transient attributes intact until it rejoins.
                 */
                if (compare_version(fsa_our_dc_version, "3.0.9") > 0) {
                    erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
                }

            } else if(AM_I_DC && appeared == FALSE) {
                crm_info("Peer %s left us", node->uname);
                erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
            }
            break;
    }

    if (AM_I_DC) {
        xmlNode *update = NULL;
        int flags = node_update_peer;
        gboolean alive = is_remote? appeared : crm_is_peer_active(node);
        crm_action_t *down = match_down_event(node->uuid, appeared);

        crm_trace("Alive=%d, appeared=%d, down=%d",
                  alive, appeared, (down? down->id : -1));

        if (alive && type == crm_status_processes) {
            register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
        }

        if (down) {
            const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK);

            if (safe_str_eq(task, CRM_OP_FENCE)) {

                /* tengine_stonith_callback() confirms fence actions */
                crm_trace("Updating CIB %s stonithd reported fencing of %s complete",
                          (down->confirmed? "after" : "before"), node->uname);

            } else if ((alive == FALSE) && safe_str_eq(task, CRM_OP_SHUTDOWN)) {
                crm_notice("%s of peer %s is complete "CRM_XS" op=%d",
                           task, node->uname, down->id);

                /* down->confirmed = TRUE; */
                stop_te_timer(down->timer);

                if (!is_remote) {
                    flags |= node_update_join | node_update_expected;
                    crmd_peer_down(node, FALSE);
                    check_join_state(fsa_state, __FUNCTION__);
                }

                update_graph(transition_graph, down);
                trigger_graph();

            } else {
                crm_trace("Node %s is %salive, was expected to %s (op %d)",
                          node->uname, (alive? "" : "not "), task, down->id);
            }

        } else if (appeared == FALSE) {
            crm_notice("Stonith/shutdown of %s not matched", node->uname);

            if (!is_remote) {
                crm_update_peer_join(__FUNCTION__, node, crm_join_none);
                check_join_state(fsa_state, __FUNCTION__);
            }

            abort_transition(INFINITY, tg_restart, "Node failure", NULL);
            fail_incompletable_actions(transition_graph, node->uuid);

        } else {
            crm_trace("Node %s came up, was not expected to be down",
                      node->uname);
        }

        if (is_remote) {
            /* A pacemaker_remote node won't have its cluster status updated
             * in the CIB by membership-layer callbacks, so do it here.
             */
            flags |= node_update_cluster;

            /* Trigger resource placement on newly integrated nodes */
            if (appeared) {
                abort_transition(INFINITY, tg_restart,
                                 "pacemaker_remote node integrated", NULL);
            }
        }

        /* Update the CIB node state */
        update = create_node_state_update(node, flags, NULL, __FUNCTION__);
        fsa_cib_anon_update(XML_CIB_TAG_STATUS, update,
                            cib_scope_local | cib_quorum_override | cib_can_create);
        free_xml(update);
    }

    trigger_fsa(fsa_source);
}
Пример #11
0
void
peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
{
    uint32_t old = 0;
    uint32_t changed = 0;
    bool appeared = FALSE;
    const char *status = NULL;

    set_bit(fsa_input_register, R_PEER_DATA);
    if (node->uname == NULL) {
        return;
    }

    switch (type) {
        case crm_status_uname:
            /* If we've never seen the node, then it also wont be in the status section */
            crm_info("%s is now %s", node->uname, node->state);
            return;
        case crm_status_rstate:
            crm_info("Remote node %s is now %s (was %s)", node->uname, node->state, (const char *)data);
            /* Keep going */
        case crm_status_nstate:
            crm_info("%s is now %s (was %s)", node->uname, node->state, (const char *)data);
            if (safe_str_eq(data, node->state)) {
                /* State did not change */
                return;
            } else if(safe_str_eq(CRM_NODE_MEMBER, node->state)) {
                appeared = TRUE;
            }
            break;
        case crm_status_processes:
            if (data) {
                old = *(const uint32_t *)data;
                changed = node->processes ^ old;
            }

            /* crmd_proc_update(node, proc_flags); */
            status = (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS;
            crm_info("Client %s/%s now has status [%s] (DC=%s, changed=%6x)",
                     node->uname, peer2text(proc_flags), status,
                     AM_I_DC ? "true" : crm_str(fsa_our_dc), changed);

            if ((changed & proc_flags) == 0) {
                /* Peer process did not change */
                crm_trace("No change %6x %6x %6x", old, node->processes, proc_flags);
                return;
            } else if (is_not_set(fsa_input_register, R_CIB_CONNECTED)) {
                crm_trace("Not connected");
                return;
            } else if (fsa_state == S_STOPPING) {
                crm_trace("Stopping");
                return;
            }

            appeared = (node->processes & proc_flags) != 0;
            if (safe_str_eq(node->uname, fsa_our_uname) && (node->processes & proc_flags) == 0) {
                /* Did we get evicted? */
                crm_notice("Our peer connection failed");
                register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL);

            } else if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) {
                /* Did the DC leave us? */
                crm_notice("Our peer on the DC (%s) is dead", fsa_our_dc);
                register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);

            } else if(AM_I_DC && appeared == FALSE) {
                crm_info("Peer %s left us", node->uname);
                /* crm_update_peer_join(__FUNCTION__, node, crm_join_none); */
            }
            break;
    }

    if (AM_I_DC) {
        xmlNode *update = NULL;
        int flags = node_update_peer;
        gboolean alive = crm_is_peer_active(node);
        crm_action_t *down = match_down_event(0, node->uuid, NULL, appeared);

        crm_trace("Alive=%d, appear=%d, down=%p", alive, appeared, down);

        if (alive && type == crm_status_processes) {
            register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
        }

        if (down) {
            const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK);

            if (alive && safe_str_eq(task, CRM_OP_FENCE)) {
                crm_info("Node return implies stonith of %s (action %d) completed", node->uname,
                         down->id);
                erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local);
                erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
                /* down->confirmed = TRUE; Only stonith-ng returning should imply completion */
                down->sent_update = TRUE;       /* Prevent tengine_stonith_callback() from calling send_stonith_update() */

            } else if (safe_str_eq(task, CRM_OP_FENCE)) {
                crm_trace("Waiting for stonithd to report the fencing of %s is complete", node->uname); /* via tengine_stonith_callback() */

            } else if (alive == FALSE) {
                crm_notice("%s of %s (op %d) is complete", task, node->uname, down->id);
                /* down->confirmed = TRUE; Only stonith-ng returning should imply completion */
                stop_te_timer(down->timer);

                flags |= node_update_join | node_update_expected;
                crmd_peer_down(node, FALSE);
                check_join_state(fsa_state, __FUNCTION__);

                update_graph(transition_graph, down);
                trigger_graph();

            } else {
                crm_trace("Other %p", down);
            }

        } else if (appeared == FALSE) {
            crm_notice("Stonith/shutdown of %s not matched", node->uname);

            crm_update_peer_join(__FUNCTION__, node, crm_join_none);
            check_join_state(fsa_state, __FUNCTION__);

            abort_transition(INFINITY, tg_restart, "Node failure", NULL);
            fail_incompletable_actions(transition_graph, node->uuid);

        } else {
            crm_trace("Other %p", down);
        }

        update = do_update_node_cib(node, flags, NULL, __FUNCTION__);
        fsa_cib_anon_update(XML_CIB_TAG_STATUS, update,
                            cib_scope_local | cib_quorum_override | cib_can_create);
        free_xml(update);
    }

    trigger_fsa(fsa_source);
}
Пример #12
0
static void
tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
{
    if(te_client_id == NULL) {
        te_client_id = g_strdup_printf("%s.%d", crm_system_name, getpid());
    }

    if (st_event == NULL) {
        crm_err("Notify data not found");
        return;
    }

    if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) {
        crm_crit("We were alegedly just fenced by %s for %s with %s!", st_event->executioner,
                 st_event->origin, st_event->device); /* Dumps blackbox if enabled */

        qb_log_fini(); /* Try to get the above log message to disk - somehow */

        /* Get out ASAP and do not come back up.
         *
         * Triggering a reboot is also not the worst idea either since
         * the rest of the cluster thinks we're safely down
         */

#ifdef RB_HALT_SYSTEM
        reboot(RB_HALT_SYSTEM);
#endif

        /*
         * If reboot() fails or is not supported, coming back up will
         * probably lead to a situation where the other nodes set our
         * status to 'lost' because of the fencing callback and will
         * discard subsequent election votes with:
         *
         * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster)
         *
         * So just stay dead, something is seriously messed up anyway.
         *
         */
        exit(100); /* None of our wrappers since we already called qb_log_fini() */
        return;
    }

    if (st_event->result == pcmk_ok &&
        safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
        st_fail_count_reset(st_event->target);
    }

    crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s",
               st_event->target, st_event->result == pcmk_ok ? "" : " not",
               st_event->action,
               st_event->executioner ? st_event->executioner : "<anyone>",
               st_event->origin, pcmk_strerror(st_event->result), st_event->id,
               st_event->client_origin ? st_event->client_origin : "<unknown>");

#if SUPPORT_CMAN
    if (st_event->result == pcmk_ok && is_cman_cluster()) {
        int local_rc = 0;
        char *target_copy = strdup(st_event->target);

        /* In case fenced hasn't noticed yet
         *
         * Any fencing that has been inititated will be completed by way of the fence_pcmk redirect
         */
        local_rc = fenced_external(target_copy);
        if (local_rc != 0) {
            crm_err("Could not notify CMAN that '%s' is now fenced: %d", st_event->target,
                    local_rc);
        } else {
            crm_notice("Notified CMAN that '%s' is now fenced", st_event->target);
        }
        free(target_copy);
    }
#endif

     if (st_event->result == pcmk_ok) {
         crm_node_t *peer = crm_get_peer_full(0, st_event->target, CRM_GET_PEER_REMOTE | CRM_GET_PEER_CLUSTER);
         const char *uuid = crm_peer_uuid(peer);
         gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname);

        crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
        if(AM_I_DC) {
            /* The DC always sends updates */
            send_stonith_update(NULL, st_event->target, uuid);

            if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) {

                /* Abort the current transition graph if it wasn't us
                 * that invoked stonith to fence someone
                 */
                crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
                abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
            }

            /* Assume it was our leader if we dont currently have one */
        } else if (fsa_our_dc == NULL || safe_str_eq(fsa_our_dc, st_event->target)) {
            crm_notice("Target %s our leader %s (recorded: %s)",
                       fsa_our_dc ? "was" : "may have been", st_event->target,
                       fsa_our_dc ? fsa_our_dc : "<unset>");

            /* Given the CIB resyncing that occurs around elections,
             * have one node update the CIB now and, if the new DC is different,
             * have them do so too after the election
             */
            if (we_are_executioner) {
                send_stonith_update(NULL, st_event->target, uuid);
            }
            stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(st_event->target));

        }

        crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL);
        crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0);
        crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN);
        crm_update_peer_join(__FUNCTION__, peer, crm_join_none);
     }
}