void
attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *data)
{
    if (election_state(writer) == election_won
        && kind == crm_status_nstate
        && safe_str_eq(peer->state, CRM_NODE_MEMBER)) {

        attrd_peer_sync(peer, NULL);

    } else if (kind == crm_status_nstate
               && safe_str_neq(peer->state, CRM_NODE_MEMBER)) {

        attrd_peer_remove(peer->uname, __FUNCTION__);
        if (peer_writer && safe_str_eq(peer->uname, peer_writer)) {
            free(peer_writer);
            peer_writer = NULL;
            crm_notice("Lost attribute writer %s", peer->uname);
        }

    } else if (kind == crm_status_processes) {
        if (is_set(peer->processes, crm_proc_cpg)) {
            crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_MEMBER, 0);
        } else {
            crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0);
        }
    }
}
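/* attrd_peer_change_cb() above is a membership status callback. For context,
 * attrd registers such a callback with the cluster layer at startup via
 * crm_set_status_callback(). A minimal sketch of that registration is shown
 * below; the helper name attrd_register_peer_callback() is hypothetical and
 * used only for illustration.
 */
static void
attrd_register_peer_callback(void)  /* hypothetical helper, for illustration only */
{
    /* Ask libcrmcluster to invoke attrd_peer_change_cb() whenever a peer's
     * membership state or process set changes.
     */
    crm_set_status_callback(&attrd_peer_change_cb);
}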
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * \param[in] node_name  Name of lost node
 */
static void
remote_node_down(const char *node_name)
{
    xmlNode *update;
    int call_id = 0;
    int call_opt = crmd_cib_smart_opt();
    crm_node_t *node;

    /* Clear all node attributes */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Ensure node is in the remote peer cache with lost state */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);

    /* Notify DC */
    send_remote_state_message(node_name, FALSE);

    /* Update CIB node state */
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    do_update_node_cib(node, node_update_cluster, update, __FUNCTION__);
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_ERR, "%s CIB node state update", node_name);
    }
    free_xml(update);
}
/*!
 * \internal
 * \brief Handle effects of a remote RA command on node state
 *
 * \param[in] cmd  Completed remote RA command
 */
static void
check_remote_node_state(remote_ra_cmd_t *cmd)
{
    /* Only successful actions can change node state */
    if (cmd->rc != PCMK_OCF_OK) {
        return;
    }

    if (safe_str_eq(cmd->action, "start")) {
        remote_node_up(cmd->rsc_id);

    } else if (safe_str_eq(cmd->action, "migrate_from")) {
        /* After a successful migration, we don't need to do remote_node_up()
         * because the DC already knows the node is up, and we don't want to
         * clear LRM history etc. We do need to add the remote node to this
         * host's remote peer cache, because (unless it happens to be DC)
         * it hasn't been tracking the remote node, and other code relies on
         * the cache to distinguish remote nodes from unseen cluster nodes.
         */
        crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);

        CRM_CHECK(node != NULL, return);
        crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0);

    } else if (safe_str_eq(cmd->action, "stop")) {
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * \param[in] node_name  Name of lost node
 */
static void
remote_node_down(const char *node_name)
{
    xmlNode *update;
    int call_id = 0;
    int call_opt = crmd_cib_smart_opt();
    crm_node_t *node;

    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Purge node's operation history and transient attributes from CIB */
    erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);

    /* Ensure node is in the remote peer cache with lost state */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);

    /* Notify DC */
    send_remote_state_message(node_name, FALSE);

    /* Update CIB node state */
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    do_update_node_cib(node, node_update_cluster, update, __FUNCTION__);
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_ERR, "%s CIB node state update", node_name);
    }
    free_xml(update);
}
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node joining
 *
 * \param[in] node_name  Name of newly integrated pacemaker_remote node
 */
static void
remote_node_up(const char *node_name)
{
    int call_opt, call_id = 0;
    xmlNode *update, *state;
    crm_node_t *node;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing pacemaker_remote node %s", node_name);

    /* Clear node's operation history. The node's transient attributes should
     * and normally will be cleared when the node leaves, but since remote node
     * state has a number of corner cases, clear them here as well, to be sure.
     */
    call_opt = crmd_cib_smart_opt();
    erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);

    /* Clear node's probed attribute */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Ensure node is in the remote peer cache with member status */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0);

    /* pacemaker_remote nodes don't participate in the membership layer,
     * so cluster nodes don't automatically get notified when they come and go.
     * We send a cluster message to the DC, and update the CIB node state entry,
     * so the DC will get it sooner (via message) or later (via CIB refresh),
     * and any other interested parties can query the CIB.
     */
    send_remote_state_message(node_name, TRUE);

    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_cluster, update,
                                     __FUNCTION__);

    /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
     * needs to be fenced, this flag will allow various actions to determine
     * whether the fencing has happened yet.
     */
    crm_xml_add(state, XML_NODE_IS_FENCED, "0");

    /* TODO: If the remote connection drops, and this (async) CIB update either
     * failed or has not yet completed, later actions could mistakenly think the
     * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
     * previously set, because it won't have been cleared). This could prevent
     * actual fencing or allow recurring monitor failures to be cleared too
     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
     */
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
    }
    free_xml(update);
}
void
send_stonith_update(crm_action_t * action, const char *target, const char *uuid)
{
    int rc = pcmk_ok;
    crm_node_t *peer = NULL;

    /* zero out the node-status & remove all LRM status info */
    xmlNode *node_state = NULL;

    CRM_CHECK(target != NULL, return);
    CRM_CHECK(uuid != NULL, return);

    if (get_node_uuid(0, target) == NULL) {
        set_node_uuid(target, uuid);
    }

    /* Make sure the membership and join caches are accurate */
    peer = crm_get_peer(0, target);
    if (peer->uuid == NULL) {
        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
        peer->uuid = strdup(uuid);
    }
    crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL);
    crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0);
    crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN);
    erase_node_from_join(target);

    node_state = do_update_node_cib(peer,
                                    node_update_cluster | node_update_peer
                                    | node_update_join | node_update_expected,
                                    NULL, __FUNCTION__);

    /* Force our known ID */
    crm_xml_add(node_state, XML_ATTR_UUID, uuid);

    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
                                    cib_quorum_override | cib_scope_local | cib_can_create);

    /* Delay processing the trigger until the update completes */
    crm_debug("Sending fencing update %d for %s", rc, target);
    fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);

    /* Make sure it sticks */
    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */

    erase_status_tag(target, XML_CIB_TAG_LRM, cib_scope_local);
    erase_status_tag(target, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);

    free_xml(node_state);
    return;
}
/*!
 * \brief Handle a CRM_OP_REMOTE_STATE message by updating remote peer cache
 *
 * \param[in] msg  Message XML
 *
 * \return Next FSA input
 */
static enum crmd_fsa_input
handle_remote_state(xmlNode *msg)
{
    const char *remote_uname = ID(msg);
    const char *remote_is_up = crm_element_value(msg, XML_NODE_IN_CLUSTER);
    crm_node_t *remote_peer;

    CRM_CHECK(remote_uname && remote_is_up, return I_NULL);

    remote_peer = crm_remote_peer_get(remote_uname);
    CRM_CHECK(remote_peer, return I_NULL);

    crm_update_peer_state(__FUNCTION__, remote_peer,
                          crm_is_true(remote_is_up)? CRM_NODE_MEMBER : CRM_NODE_LOST,
                          0);
    return I_NULL;
}
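/* For context: the CRM_OP_REMOTE_STATE message handled above is produced by
 * send_remote_state_message(), which the remote_node_up()/remote_node_down()
 * excerpts call but which is not shown here. Below is a minimal sketch of what
 * that sender could look like, assuming the usual crmd/libcrmcommon helpers
 * (create_request(), crm_xml_add(), send_cluster_message()) and setting only
 * the two fields the handler reads (XML_ATTR_ID and XML_NODE_IN_CLUSTER).
 * Treat it as an illustration, not the canonical implementation.
 */
static void
send_remote_state_message(const char *node_name, gboolean node_up)
{
    /* If there is no DC, the caller's CIB node-state update is the fallback;
     * the message only lets the DC learn of the change sooner.
     */
    if (fsa_our_dc) {
        xmlNode *msg = create_request(CRM_OP_REMOTE_STATE, NULL, fsa_our_dc,
                                      CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);

        crm_info("Notifying DC %s of pacemaker_remote node %s %s",
                 fsa_our_dc, node_name, (node_up? "coming up" : "going down"));

        /* These are the fields handle_remote_state() reads back out */
        crm_xml_add(msg, XML_ATTR_ID, node_name);
        crm_xml_add(msg, XML_NODE_IN_CLUSTER, (node_up? "true" : "false"));

        send_cluster_message(crm_get_peer(0, fsa_our_dc), crm_msg_crmd, msg, TRUE);
        free_xml(msg);
    }
}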
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * \param[in] node_name  Name of lost node
 * \param[in] opts       Whether to keep or erase LRM history
 */
static void
remote_node_down(const char *node_name, const enum down_opts opts)
{
    xmlNode *update;
    int call_id = 0;
    int call_opt = crmd_cib_smart_opt();
    crm_node_t *node;

    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Purge node's transient attributes */
    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);

    /* Normally, the LRM operation history should be kept until the node comes
     * back up. However, after a successful fence, we want to clear it, so we
     * don't think resources are still running on the node.
     */
    if (opts == DOWN_ERASE_LRM) {
        erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
    }

    /* Ensure node is in the remote peer cache with lost state */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);

    /* Notify DC */
    send_remote_state_message(node_name, FALSE);

    /* Update CIB node state */
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    create_node_state_update(node, node_update_cluster, update, __FUNCTION__);
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_ERR, "%s CIB node state update", node_name);
    }
    free_xml(update);
}
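/* The down_opts argument above distinguishes a clean disconnect from a
 * post-fencing cleanup. The enum itself is not part of this excerpt; a
 * plausible definition, inferred from the DOWN_ERASE_LRM check above (the
 * DOWN_KEEP_LRM name is an assumption), would be:
 */
enum down_opts {
    DOWN_KEEP_LRM,      /* leave the node's LRM operation history in the CIB */
    DOWN_ERASE_LRM      /* purge the node's LRM history (e.g. after fencing) */
};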
void
pcmk_cpg_membership(cpg_handle_t handle,
                    const struct cpg_name *groupName,
                    const struct cpg_address *member_list, size_t member_list_entries,
                    const struct cpg_address *left_list, size_t left_list_entries,
                    const struct cpg_address *joined_list, size_t joined_list_entries)
{
    int i;
    gboolean found = FALSE;
    static int counter = 0;
    uint32_t local_nodeid = get_local_nodeid(handle);

    for (i = 0; i < left_list_entries; i++) {
        crm_node_t *peer = crm_find_peer(left_list[i].nodeid, NULL);

        crm_info("Node %u left group %s (peer=%s, counter=%d.%d)",
                 left_list[i].nodeid, groupName->value,
                 (peer? peer->uname : "<none>"), counter, i);
        if (peer) {
            crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg, OFFLINESTATUS);
        }
    }

    for (i = 0; i < joined_list_entries; i++) {
        crm_info("Node %u joined group %s (counter=%d.%d)",
                 joined_list[i].nodeid, groupName->value, counter, i);
    }

    for (i = 0; i < member_list_entries; i++) {
        crm_node_t *peer = crm_get_peer(member_list[i].nodeid, NULL);

        crm_info("Node %u still member of group %s (peer=%s, counter=%d.%d)",
                 member_list[i].nodeid, groupName->value,
                 (peer? peer->uname : "<none>"), counter, i);

        /* Anyone that is sending us CPG messages must also be a _CPG_ member.
         * But it's _not_ safe to assume it's in the quorum membership.
         * We may have just found out it's dead and are processing the last
         * couple of messages it sent.
         */
        peer = crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg, ONLINESTATUS);

        if (peer && peer->state && crm_is_peer_active(peer) == FALSE) {
            time_t now = time(NULL);

            /* Co-opt the otherwise unused votes field */
            if (peer->votes == 0) {
                peer->votes = now;

            } else if (now > (60 + peer->votes)) {
                /* On the other hand, if we're still getting messages, at a
                 * certain point we need to acknowledge that our internal cache
                 * is probably wrong.
                 *
                 * Set the threshold to 1 minute.
                 */
                crm_err("Node %s[%u] appears to be online even though we think it is dead",
                        peer->uname, peer->id);
                if (crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_MEMBER, 0)) {
                    peer->votes = 0;
                }
            }
        }

        if (local_nodeid == member_list[i].nodeid) {
            found = TRUE;
        }
    }

    if (!found) {
        crm_err("We're not part of CPG group '%s' anymore!", groupName->value);
        cpg_evicted = TRUE;
    }

    counter++;
}
crm_node_t *
crm_update_peer(const char *source, unsigned int id, uint64_t born, uint64_t seen,
                int32_t votes, uint32_t children, const char *uuid, const char *uname,
                const char *addr, const char *state)
{
#if SUPPORT_PLUGIN
    gboolean addr_changed = FALSE;
    gboolean votes_changed = FALSE;
#endif
    crm_node_t *node = NULL;

    id = get_corosync_id(id, uuid);
    node = crm_get_peer(id, uname);

    CRM_ASSERT(node != NULL);

    if (node->uuid == NULL) {
        if (is_openais_cluster()) {
            /* Yes, overrule whatever was passed in */
            crm_peer_uuid(node);

        } else if (uuid != NULL) {
            node->uuid = strdup(uuid);
        }
    }

    if (children > 0) {
        crm_update_peer_proc(source, node, children, state);
    }

    if (state != NULL) {
        crm_update_peer_state(source, node, state, seen);
    }

#if SUPPORT_HEARTBEAT
    if (born != 0) {
        node->born = born;
    }
#endif

#if SUPPORT_PLUGIN
    /* These were only used by the plugin */
    if (born != 0) {
        node->born = born;
    }

    if (votes > 0 && node->votes != votes) {
        votes_changed = TRUE;
        node->votes = votes;
    }

    if (addr != NULL) {
        if (node->addr == NULL || crm_str_eq(node->addr, addr, FALSE) == FALSE) {
            addr_changed = TRUE;
            free(node->addr);
            node->addr = strdup(addr);
        }
    }

    if (addr_changed || votes_changed) {
        crm_info("%s: Node %s: id=%u state=%s addr=%s%s votes=%d%s born=" U64T " seen=" U64T " proc=%.32x",
                 source, node->uname, node->id, node->state,
                 node->addr, addr_changed ? " (new)" : "",
                 node->votes, votes_changed ? " (new)" : "",
                 node->born, node->last_seen, node->processes);
    }
#endif

    return node;
}
static void
pcmk_quorum_notification(quorum_handle_t handle,
                         uint32_t quorate,
                         uint64_t ring_id, uint32_t view_list_entries, uint32_t * view_list)
{
    int i;
    GHashTableIter iter;
    crm_node_t *node = NULL;
    static gboolean init_phase = TRUE;

    if (quorate != crm_have_quorum) {
        crm_notice("Membership " U64T ": quorum %s (%lu)", ring_id,
                   quorate ? "acquired" : "lost", (long unsigned int)view_list_entries);
        crm_have_quorum = quorate;

    } else {
        crm_info("Membership " U64T ": quorum %s (%lu)", ring_id,
                 quorate ? "retained" : "still lost", (long unsigned int)view_list_entries);
    }

    if (view_list_entries == 0 && init_phase) {
        crm_info("Corosync membership is still forming, ignoring");
        return;
    }

    init_phase = FALSE;

    g_hash_table_iter_init(&iter, crm_peer_cache);
    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
        node->last_seen = 0;
    }

    for (i = 0; i < view_list_entries; i++) {
        uint32_t id = view_list[i];
        char *name = NULL;

        crm_debug("Member[%d] %u ", i, id);

        node = crm_get_peer(id, NULL);
        if (node->uname == NULL) {
            crm_info("Obtaining name for new node %u", id);
            name = corosync_node_name(0, id);
            node = crm_get_peer(id, name);
        }

        crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, ring_id);
        free(name);
    }

    crm_trace("Reaping unseen nodes...");
    g_hash_table_iter_init(&iter, crm_peer_cache);
    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
        if (node->last_seen != ring_id && node->state) {
            crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);

        } else if (node->last_seen != ring_id) {
            crm_info("State of node %s[%u] is still unknown", node->uname, node->id);
        }
    }

    if (quorum_app_callback) {
        quorum_app_callback(ring_id, quorate);
    }
}
static void
tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
{
    if (te_client_id == NULL) {
        te_client_id = g_strdup_printf("%s.%d", crm_system_name, getpid());
    }

    if (st_event == NULL) {
        crm_err("Notify data not found");
        return;
    }

    if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) {
        crm_crit("We were allegedly just fenced by %s for %s with %s!",
                 st_event->executioner, st_event->origin, st_event->device);

        /* Dumps blackbox if enabled */
        qb_log_fini(); /* Try to get the above log message to disk - somehow */

        /* Get out ASAP and do not come back up.
         *
         * Triggering a reboot is also not the worst idea either since
         * the rest of the cluster thinks we're safely down
         */
#ifdef RB_HALT_SYSTEM
        reboot(RB_HALT_SYSTEM);
#endif

        /*
         * If reboot() fails or is not supported, coming back up will
         * probably lead to a situation where the other nodes set our
         * status to 'lost' because of the fencing callback and will
         * discard subsequent election votes with:
         *
         * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster)
         *
         * So just stay dead, something is seriously messed up anyway.
         *
         */
        exit(100); /* None of our wrappers since we already called qb_log_fini() */
        return;
    }

    if (st_event->result == pcmk_ok &&
        safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
        st_fail_count_reset(st_event->target);
    }

    crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s",
               st_event->target,
               st_event->result == pcmk_ok ? "" : " not",
               st_event->action,
               st_event->executioner ? st_event->executioner : "<anyone>",
               st_event->origin, pcmk_strerror(st_event->result), st_event->id,
               st_event->client_origin ? st_event->client_origin : "<unknown>");

#if SUPPORT_CMAN
    if (st_event->result == pcmk_ok && is_cman_cluster()) {
        int local_rc = 0;
        char *target_copy = strdup(st_event->target);

        /* In case fenced hasn't noticed yet
         *
         * Any fencing that has been initiated will be completed by way of the
         * fence_pcmk redirect
         */
        local_rc = fenced_external(target_copy);
        if (local_rc != 0) {
            crm_err("Could not notify CMAN that '%s' is now fenced: %d",
                    st_event->target, local_rc);
        } else {
            crm_notice("Notified CMAN that '%s' is now fenced", st_event->target);
        }
        free(target_copy);
    }
#endif

    if (st_event->result == pcmk_ok) {
        crm_node_t *peer = crm_get_peer_full(0, st_event->target,
                                             CRM_GET_PEER_REMOTE | CRM_GET_PEER_CLUSTER);
        const char *uuid = crm_peer_uuid(peer);
        gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname);

        crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
        if (AM_I_DC) {
            /* The DC always sends updates */
            send_stonith_update(NULL, st_event->target, uuid);

            if (st_event->client_origin
                && safe_str_neq(st_event->client_origin, te_client_id)) {

                /* Abort the current transition graph if it wasn't us
                 * that invoked stonith to fence someone
                 */
                crm_info("External fencing operation from %s fenced %s",
                         st_event->client_origin, st_event->target);
                abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
            }

            /* Assume it was our leader if we don't currently have one */
        } else if (fsa_our_dc == NULL || safe_str_eq(fsa_our_dc, st_event->target)) {
            crm_notice("Target %s our leader %s (recorded: %s)",
                       fsa_our_dc ? "was" : "may have been", st_event->target,
                       fsa_our_dc ? fsa_our_dc : "<unset>");

            /* Given the CIB resyncing that occurs around elections,
             * have one node update the CIB now and, if the new DC is different,
             * have them do so too after the election
             */
            if (we_are_executioner) {
                send_stonith_update(NULL, st_event->target, uuid);
            }
            stonith_cleanup_list = g_list_append(stonith_cleanup_list,
                                                 strdup(st_event->target));
        }

        crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL);
        crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0);
        crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN);
        crm_update_peer_join(__FUNCTION__, peer, crm_join_none);
    }
}
static void
pcmk_quorum_notification(quorum_handle_t handle,
                         uint32_t quorate,
                         uint64_t ring_id, uint32_t view_list_entries, uint32_t * view_list)
{
    int i;
    GHashTableIter iter;
    crm_node_t *node = NULL;
    static gboolean init_phase = TRUE;

    if (quorate != crm_have_quorum) {
        if (quorate) {
            crm_notice("Quorum acquired " CRM_XS " membership=%" U64T " members=%lu",
                       ring_id, (long unsigned int)view_list_entries);
        } else {
            crm_warn("Quorum lost " CRM_XS " membership=%" U64T " members=%lu",
                     ring_id, (long unsigned int)view_list_entries);
        }
        crm_have_quorum = quorate;

    } else {
        crm_info("Quorum %s " CRM_XS " membership=%" U64T " members=%lu",
                 (quorate? "retained" : "still lost"), ring_id,
                 (long unsigned int)view_list_entries);
    }

    if (view_list_entries == 0 && init_phase) {
        crm_info("Corosync membership is still forming, ignoring");
        return;
    }

    init_phase = FALSE;

    /* Reset last_seen for all cached nodes so we can tell which ones aren't
     * in the view list */
    g_hash_table_iter_init(&iter, crm_peer_cache);
    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
        node->last_seen = 0;
    }

    /* Update the peer cache for each node in view list */
    for (i = 0; i < view_list_entries; i++) {
        uint32_t id = view_list[i];

        crm_debug("Member[%d] %u ", i, id);

        /* Get this node's peer cache entry (adding one if not already there) */
        node = crm_get_peer(id, NULL);
        if (node->uname == NULL) {
            char *name = corosync_node_name(0, id);

            crm_info("Obtaining name for new node %u", id);
            node = crm_get_peer(id, name);
            free(name);
        }

        /* Update the node state (including updating last_seen to ring_id) */
        crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, ring_id);
    }

    /* Remove any peer cache entries we didn't update */
    crm_reap_unseen_nodes(ring_id);

    if (quorum_app_callback) {
        quorum_app_callback(ring_id, quorate);
    }
}
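/* crm_reap_unseen_nodes() is called above but is not part of this excerpt. The
 * earlier version of pcmk_quorum_notification() shown previously did the same
 * work inline, so a sketch consistent with that loop would look roughly like
 * the following (a hypothetical reconstruction, not the canonical
 * implementation; the int/pcmk_ok return convention is an assumption):
 */
int
crm_reap_unseen_nodes(uint64_t ring_id)
{
    GHashTableIter iter;
    crm_node_t *node = NULL;

    crm_trace("Reaping unseen nodes...");
    g_hash_table_iter_init(&iter, crm_peer_cache);
    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
        if (node->last_seen != ring_id) {
            if (node->state) {
                /* Mark any cached node missing from the current view as lost */
                crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);
            } else {
                crm_info("State of node %s[%u] is still unknown",
                         node->uname, node->id);
            }
        }
    }
    return pcmk_ok;
}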