Beispiel #1
0
crm_node_t *crm_update_ccm_node(
    const oc_ev_membership_t *oc, int offset, const char *state, uint64_t seq)
{
    crm_node_t *node = NULL;
    const char *uuid = NULL;
    CRM_CHECK(oc->m_array[offset].node_uname != NULL, return NULL);
    uuid = get_uuid(oc->m_array[offset].node_uname);
    node = crm_update_peer(oc->m_array[offset].node_id,
			   oc->m_array[offset].node_born_on, seq, -1, 0,
			   uuid, oc->m_array[offset].node_uname, NULL, state);

    if(safe_str_eq(CRM_NODE_ACTIVE, state)) {
	/* Heartbeat doesn't send status notifications for nodes that were already part of the cluster */
	crm_update_peer_proc(
	    oc->m_array[offset].node_uname, crm_proc_ais, ONLINESTATUS);

	/* Nor does it send status notifications for processes that were already active */
	crm_update_peer_proc(
	   oc->m_array[offset].node_uname, crm_proc_crmd, ONLINESTATUS);
    }
    return node;
}
Beispiel #2
0
void
send_stonith_update(crm_action_t * action, const char *target, const char *uuid)
{
    int rc = pcmk_ok;
    crm_node_t *peer = NULL;

    /* zero out the node-status & remove all LRM status info */
    xmlNode *node_state = NULL;

    CRM_CHECK(target != NULL, return);
    CRM_CHECK(uuid != NULL, return);

    if (get_node_uuid(0, target) == NULL) {
        set_node_uuid(target, uuid);
    }

    /* Make sure the membership and join caches are accurate */
    peer = crm_get_peer(0, target);
    if (peer->uuid == NULL) {
        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
        peer->uuid = strdup(uuid);
    }
    crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL);
    crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0);
    crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN);
    erase_node_from_join(target);

    node_state =
        do_update_node_cib(peer,
                           node_update_cluster | node_update_peer | node_update_join |
                           node_update_expected, NULL, __FUNCTION__);

    /* Force our known ID */
    crm_xml_add(node_state, XML_ATTR_UUID, uuid);

    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
                                    cib_quorum_override | cib_scope_local | cib_can_create);

    /* Delay processing the trigger until the update completes */
    crm_debug("Sending fencing update %d for %s", rc, target);
    fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);

    /* Make sure it sticks */
    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */

    erase_status_tag(target, XML_CIB_TAG_LRM, cib_scope_local);
    erase_status_tag(target, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);

    free_xml(node_state);
    return;
}
Beispiel #3
0
gboolean
cluster_connect_cpg(crm_cluster_t *cluster)
{
    int rc = -1;
    int fd = 0;
    int retries = 0;
    uint32_t id = 0;
    crm_node_t *peer = NULL;
    cpg_handle_t handle = 0;

    struct mainloop_fd_callbacks cpg_fd_callbacks = {
        .dispatch = pcmk_cpg_dispatch,
        .destroy = cluster->destroy,
    };

    cpg_callbacks_t cpg_callbacks = {
        .cpg_deliver_fn = cluster->cpg.cpg_deliver_fn,
        .cpg_confchg_fn = cluster->cpg.cpg_confchg_fn,
        /* .cpg_deliver_fn = pcmk_cpg_deliver, */
        /* .cpg_confchg_fn = pcmk_cpg_membership, */
    };

    cpg_evicted = FALSE;
    cluster->group.length = 0;
    cluster->group.value[0] = 0;

    /* group.value is char[128] */
    strncpy(cluster->group.value, crm_system_name?crm_system_name:"unknown", 127);
    cluster->group.value[127] = 0;
    cluster->group.length = 1 + QB_MIN(127, strlen(cluster->group.value));

    cs_repeat(retries, 30, rc = cpg_initialize(&handle, &cpg_callbacks));
    if (rc != CS_OK) {
        crm_err("Could not connect to the Cluster Process Group API: %d", rc);
        goto bail;
    }

    id = get_local_nodeid(handle);
    if (id == 0) {
        crm_err("Could not get local node id from the CPG API");
        goto bail;

    }
    cluster->nodeid = id;

    retries = 0;
    cs_repeat(retries, 30, rc = cpg_join(handle, &cluster->group));
    if (rc != CS_OK) {
        crm_err("Could not join the CPG group '%s': %d", crm_system_name, rc);
        goto bail;
    }

    rc = cpg_fd_get(handle, &fd);
    if (rc != CS_OK) {
        crm_err("Could not obtain the CPG API connection: %d", rc);
        goto bail;
    }

    pcmk_cpg_handle = handle;
    cluster->cpg_handle = handle;
    mainloop_add_fd("corosync-cpg", G_PRIORITY_MEDIUM, fd, cluster, &cpg_fd_callbacks);

  bail:
    if (rc != CS_OK) {
        cpg_finalize(handle);
        return FALSE;
    }

    peer = crm_get_peer(id, NULL);
    crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg, ONLINESTATUS);
    return TRUE;
}

gboolean
send_cluster_message_cs(xmlNode * msg, gboolean local, crm_node_t * node, enum crm_ais_msg_types dest)
{
    gboolean rc = TRUE;
    char *data = NULL;

    data = dump_xml_unformatted(msg);
    rc = send_cluster_text(crm_class_cluster, data, local, node, dest);
    free(data);
    return rc;
}

gboolean
send_cluster_text(int class, const char *data,
              gboolean local, crm_node_t * node, enum crm_ais_msg_types dest)
{
    static int msg_id = 0;
    static int local_pid = 0;
    static int local_name_len = 0;
    static const char *local_name = NULL;

    char *target = NULL;
    struct iovec *iov;
    AIS_Message *msg = NULL;
    enum crm_ais_msg_types sender = text2msg_type(crm_system_name);

    /* There are only 6 handlers registered to crm_lib_service in plugin.c */
    CRM_CHECK(class < 6, crm_err("Invalid message class: %d", class);
              return FALSE);

#if !SUPPORT_PLUGIN
    CRM_CHECK(dest != crm_msg_ais, return FALSE);
#endif

    if(local_name == NULL) {
        local_name = get_local_node_name();
    }
    if(local_name_len == 0 && local_name) {
        local_name_len = strlen(local_name);
    }

    if (data == NULL) {
        data = "";
    }

    if (local_pid == 0) {
        local_pid = getpid();
    }

    if (sender == crm_msg_none) {
        sender = local_pid;
    }

    msg = calloc(1, sizeof(AIS_Message));

    msg_id++;
    msg->id = msg_id;
    msg->header.id = class;
    msg->header.error = CS_OK;

    msg->host.type = dest;
    msg->host.local = local;

    if (node) {
        if (node->uname) {
            target = strdup(node->uname);
            msg->host.size = strlen(node->uname);
            memset(msg->host.uname, 0, MAX_NAME);
            memcpy(msg->host.uname, node->uname, msg->host.size);
        } else {
            target = crm_strdup_printf("%u", node->id);
        }
        msg->host.id = node->id;
    } else {
        target = strdup("all");
    }

    msg->sender.id = 0;
    msg->sender.type = sender;
    msg->sender.pid = local_pid;
    msg->sender.size = local_name_len;
    memset(msg->sender.uname, 0, MAX_NAME);
    if(local_name && msg->sender.size) {
        memcpy(msg->sender.uname, local_name, msg->sender.size);
    }

    msg->size = 1 + strlen(data);
    msg->header.size = sizeof(AIS_Message) + msg->size;

    if (msg->size < CRM_BZ2_THRESHOLD) {
        msg = realloc_safe(msg, msg->header.size);
        memcpy(msg->data, data, msg->size);

    } else {
        char *compressed = NULL;
        unsigned int new_size = 0;
        char *uncompressed = strdup(data);

        if (crm_compress_string(uncompressed, msg->size, 0, &compressed, &new_size)) {

            msg->header.size = sizeof(AIS_Message) + new_size;
            msg = realloc_safe(msg, msg->header.size);
            memcpy(msg->data, compressed, new_size);

            msg->is_compressed = TRUE;
            msg->compressed_size = new_size;

        } else {
            msg = realloc_safe(msg, msg->header.size);
            memcpy(msg->data, data, msg->size);
        }

        free(uncompressed);
        free(compressed);
    }

    iov = calloc(1, sizeof(struct iovec));
    iov->iov_base = msg;
    iov->iov_len = msg->header.size;

    if (msg->compressed_size) {
        crm_trace("Queueing CPG message %u to %s (%llu bytes, %d bytes compressed payload): %.200s",
                  msg->id, target, (unsigned long long) iov->iov_len,
                  msg->compressed_size, data);
    } else {
        crm_trace("Queueing CPG message %u to %s (%llu bytes, %d bytes payload): %.200s",
                  msg->id, target, (unsigned long long) iov->iov_len,
                  msg->size, data);
    }
    free(target);

#if SUPPORT_PLUGIN
    /* The plugin is the only time we dont use CPG messaging */
    if(get_cluster_type() == pcmk_cluster_classic_ais) {
        return send_plugin_text(class, iov);
    }
#endif

    send_cpg_iov(iov);

    return TRUE;
}

enum crm_ais_msg_types
text2msg_type(const char *text)
{
    int type = crm_msg_none;

    CRM_CHECK(text != NULL, return type);
    if (safe_str_eq(text, "ais")) {
        type = crm_msg_ais;
    } else if (safe_str_eq(text, "crm_plugin")) {
        type = crm_msg_ais;
    } else if (safe_str_eq(text, CRM_SYSTEM_CIB)) {
        type = crm_msg_cib;
    } else if (safe_str_eq(text, CRM_SYSTEM_CRMD)) {
        type = crm_msg_crmd;
    } else if (safe_str_eq(text, CRM_SYSTEM_DC)) {
        type = crm_msg_crmd;
    } else if (safe_str_eq(text, CRM_SYSTEM_TENGINE)) {
        type = crm_msg_te;
    } else if (safe_str_eq(text, CRM_SYSTEM_PENGINE)) {
        type = crm_msg_pe;
    } else if (safe_str_eq(text, CRM_SYSTEM_LRMD)) {
        type = crm_msg_lrmd;
    } else if (safe_str_eq(text, CRM_SYSTEM_STONITHD)) {
        type = crm_msg_stonithd;
    } else if (safe_str_eq(text, "stonith-ng")) {
        type = crm_msg_stonith_ng;
    } else if (safe_str_eq(text, "attrd")) {
        type = crm_msg_attrd;

    } else {
        /* This will normally be a transient client rather than
         * a cluster daemon.  Set the type to the pid of the client
         */
        int scan_rc = sscanf(text, "%d", &type);

        if (scan_rc != 1 || type <= crm_msg_stonith_ng) {
            /* Ensure it's sane */
            type = crm_msg_none;
        }
    }
    return type;
}
Beispiel #4
0
void
pcmk_cpg_membership(cpg_handle_t handle,
                    const struct cpg_name *groupName,
                    const struct cpg_address *member_list, size_t member_list_entries,
                    const struct cpg_address *left_list, size_t left_list_entries,
                    const struct cpg_address *joined_list, size_t joined_list_entries)
{
    int i;
    gboolean found = FALSE;
    static int counter = 0;
    uint32_t local_nodeid = get_local_nodeid(handle);

    for (i = 0; i < left_list_entries; i++) {
        crm_node_t *peer = crm_find_peer(left_list[i].nodeid, NULL);

        crm_info("Node %u left group %s (peer=%s, counter=%d.%d)",
                 left_list[i].nodeid, groupName->value,
                 (peer? peer->uname : "<none>"), counter, i);
        if (peer) {
            crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg, OFFLINESTATUS);
        }
    }

    for (i = 0; i < joined_list_entries; i++) {
        crm_info("Node %u joined group %s (counter=%d.%d)",
                 joined_list[i].nodeid, groupName->value, counter, i);
    }

    for (i = 0; i < member_list_entries; i++) {
        crm_node_t *peer = crm_get_peer(member_list[i].nodeid, NULL);

        crm_info("Node %u still member of group %s (peer=%s, counter=%d.%d)",
                 member_list[i].nodeid, groupName->value,
                 (peer? peer->uname : "<none>"), counter, i);

        /* Anyone that is sending us CPG messages must also be a _CPG_ member.
         * But it's _not_ safe to assume it's in the quorum membership.
         * We may have just found out it's dead and are processing the last couple of messages it sent
         */
        peer = crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg, ONLINESTATUS);
        if(peer && peer->state && crm_is_peer_active(peer) == FALSE) {
            time_t now = time(NULL);

            /* Co-opt the otherwise unused votes field */
            if(peer->votes == 0) {
                peer->votes = now;

            } else if(now > (60 + peer->votes)) {
                /* On the otherhand, if we're still getting messages, at a certain point
                 * we need to acknowledge our internal cache is probably wrong
                 *
                 * Set the threshold to 1 minute
                 */
                crm_err("Node %s[%u] appears to be online even though we think it is dead", peer->uname, peer->id);
                if (crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_MEMBER, 0)) {
                    peer->votes = 0;
                }
            }
        }

        if (local_nodeid == member_list[i].nodeid) {
            found = TRUE;
        }
    }

    if (!found) {
        crm_err("We're not part of CPG group '%s' anymore!", groupName->value);
        cpg_evicted = TRUE;
    }

    counter++;
}
Beispiel #5
0
crm_node_t *
crm_update_peer(const char *source, unsigned int id, uint64_t born, uint64_t seen, int32_t votes,
                uint32_t children, const char *uuid, const char *uname, const char *addr,
                const char *state)
{
#if SUPPORT_PLUGIN
    gboolean addr_changed = FALSE;
    gboolean votes_changed = FALSE;
#endif
    crm_node_t *node = NULL;

    id = get_corosync_id(id, uuid);
    node = crm_get_peer(id, uname);

    CRM_ASSERT(node != NULL);

    if (node->uuid == NULL) {
        if (is_openais_cluster()) {
            /* Yes, overrule whatever was passed in */
            crm_peer_uuid(node);

        } else if (uuid != NULL) {
            node->uuid = strdup(uuid);
        }
    }

    if (children > 0) {
        crm_update_peer_proc(source, node, children, state);
    }

    if (state != NULL) {
        crm_update_peer_state(source, node, state, seen);
    }
#if SUPPORT_HEARTBEAT
    if (born != 0) {
        node->born = born;
    }
#endif

#if SUPPORT_PLUGIN
    /* These were only used by the plugin */
    if (born != 0) {
        node->born = born;
    }

    if (votes > 0 && node->votes != votes) {
        votes_changed = TRUE;
        node->votes = votes;
    }

    if (addr != NULL) {
        if (node->addr == NULL || crm_str_eq(node->addr, addr, FALSE) == FALSE) {
            addr_changed = TRUE;
            free(node->addr);
            node->addr = strdup(addr);
        }
    }
    if (addr_changed || votes_changed) {
        crm_info("%s: Node %s: id=%u state=%s addr=%s%s votes=%d%s born=" U64T " seen=" U64T
                 " proc=%.32x", source, node->uname, node->id, node->state,
                 node->addr, addr_changed ? " (new)" : "", node->votes,
                 votes_changed ? " (new)" : "", node->born, node->last_seen, node->processes);
    }
#endif

    return node;
}
Beispiel #6
0
static void
tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
{
    if(te_client_id == NULL) {
        te_client_id = g_strdup_printf("%s.%d", crm_system_name, getpid());
    }

    if (st_event == NULL) {
        crm_err("Notify data not found");
        return;
    }

    if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) {
        crm_crit("We were alegedly just fenced by %s for %s with %s!", st_event->executioner,
                 st_event->origin, st_event->device); /* Dumps blackbox if enabled */

        qb_log_fini(); /* Try to get the above log message to disk - somehow */

        /* Get out ASAP and do not come back up.
         *
         * Triggering a reboot is also not the worst idea either since
         * the rest of the cluster thinks we're safely down
         */

#ifdef RB_HALT_SYSTEM
        reboot(RB_HALT_SYSTEM);
#endif

        /*
         * If reboot() fails or is not supported, coming back up will
         * probably lead to a situation where the other nodes set our
         * status to 'lost' because of the fencing callback and will
         * discard subsequent election votes with:
         *
         * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster)
         *
         * So just stay dead, something is seriously messed up anyway.
         *
         */
        exit(100); /* None of our wrappers since we already called qb_log_fini() */
        return;
    }

    if (st_event->result == pcmk_ok &&
        safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
        st_fail_count_reset(st_event->target);
    }

    crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s",
               st_event->target, st_event->result == pcmk_ok ? "" : " not",
               st_event->action,
               st_event->executioner ? st_event->executioner : "<anyone>",
               st_event->origin, pcmk_strerror(st_event->result), st_event->id,
               st_event->client_origin ? st_event->client_origin : "<unknown>");

#if SUPPORT_CMAN
    if (st_event->result == pcmk_ok && is_cman_cluster()) {
        int local_rc = 0;
        char *target_copy = strdup(st_event->target);

        /* In case fenced hasn't noticed yet
         *
         * Any fencing that has been inititated will be completed by way of the fence_pcmk redirect
         */
        local_rc = fenced_external(target_copy);
        if (local_rc != 0) {
            crm_err("Could not notify CMAN that '%s' is now fenced: %d", st_event->target,
                    local_rc);
        } else {
            crm_notice("Notified CMAN that '%s' is now fenced", st_event->target);
        }
        free(target_copy);
    }
#endif

     if (st_event->result == pcmk_ok) {
         crm_node_t *peer = crm_get_peer_full(0, st_event->target, CRM_GET_PEER_REMOTE | CRM_GET_PEER_CLUSTER);
         const char *uuid = crm_peer_uuid(peer);
         gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname);

        crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
        if(AM_I_DC) {
            /* The DC always sends updates */
            send_stonith_update(NULL, st_event->target, uuid);

            if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) {

                /* Abort the current transition graph if it wasn't us
                 * that invoked stonith to fence someone
                 */
                crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
                abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
            }

            /* Assume it was our leader if we dont currently have one */
        } else if (fsa_our_dc == NULL || safe_str_eq(fsa_our_dc, st_event->target)) {
            crm_notice("Target %s our leader %s (recorded: %s)",
                       fsa_our_dc ? "was" : "may have been", st_event->target,
                       fsa_our_dc ? fsa_our_dc : "<unset>");

            /* Given the CIB resyncing that occurs around elections,
             * have one node update the CIB now and, if the new DC is different,
             * have them do so too after the election
             */
            if (we_are_executioner) {
                send_stonith_update(NULL, st_event->target, uuid);
            }
            stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(st_event->target));

        }

        crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL);
        crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0);
        crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN);
        crm_update_peer_join(__FUNCTION__, peer, crm_join_none);
     }
}
Beispiel #7
0
static void
crmd_cs_dispatch(cpg_handle_t handle,
                         const struct cpg_name *groupName,
                         uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
{
    int seq = 0;
    xmlNode *xml = NULL;
    const char *seq_s = NULL;
    crm_node_t *peer = NULL;
    enum crm_proc_flag flag = crm_proc_cpg;

    uint32_t kind = 0;
    const char *from = NULL;
    char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from);

    if(data == NULL) {
        return;
    }
    xml = string2xml(data);
    if (xml == NULL) {
        crm_err("Could not parse message content (%d): %.100s", kind, data);
        free(data);
        return;
    }

    switch (kind) {
        case crm_class_members:
            seq_s = crm_element_value(xml, "id");
            seq = crm_int_helper(seq_s, NULL);
            set_bit(fsa_input_register, R_PEER_DATA);
            post_cache_update(seq);

            /* fall through */
        case crm_class_quorum:
            crm_update_quorum(crm_have_quorum, FALSE);
            if (AM_I_DC) {
                const char *votes = crm_element_value(xml, "expected");

                if (votes == NULL || check_number(votes) == FALSE) {
                    crm_log_xml_err(xml, "Invalid quorum/membership update");

                } else {
                    int rc = update_attr_delegate(fsa_cib_conn,
                                                  cib_quorum_override | cib_scope_local |
                                                  cib_inhibit_notify,
                                                  XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL,
                                                  XML_ATTR_EXPECTED_VOTES, votes, FALSE, NULL);

                    crm_info("Setting expected votes to %s", votes);
                    if (pcmk_ok > rc) {
                        crm_err("Quorum update failed: %s", pcmk_strerror(rc));
                    }
                }
            }
            break;

        case crm_class_cluster:
            crm_xml_add(xml, F_ORIG, from);
            /* crm_xml_add_int(xml, F_SEQ, wrapper->id); Fake? */

            if (is_heartbeat_cluster()) {
                flag = crm_proc_heartbeat;

            } else if (is_classic_ais_cluster()) {
                flag = crm_proc_plugin;
            }

            peer = crm_get_peer(0, from);
            if (is_not_set(peer->processes, flag)) {
                /* If we can still talk to our peer process on that node,
                 * then its also part of the corosync membership
                 */
                crm_warn("Receiving messages from a node we think is dead: %s[%d]", peer->uname,
                         peer->id);
                crm_update_peer_proc(__FUNCTION__, peer, flag, ONLINESTATUS);
            }
            crmd_ha_msg_filter(xml);
            break;

        case crm_class_rmpeer:
            /* Ignore */
            break;

        case crm_class_notify:
        case crm_class_nodeid:
            crm_err("Unexpected message class (%d): %.100s", kind, data);
            break;

        default:
            crm_err("Invalid message class (%d): %.100s", kind, data);
    }

    free(data);
    free_xml(xml);
}