static void tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) { if(te_client_id == NULL) { te_client_id = crm_strdup_printf("%s.%d", crm_system_name, getpid()); } if (st_event == NULL) { crm_err("Notify data not found"); return; } crmd_notify_fencing_op(st_event); if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) { crm_notice("%s was successfully unfenced by %s (at the request of %s)", st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin); /* TODO: Hook up st_event->device */ return; } else if (safe_str_eq("on", st_event->action)) { crm_err("Unfencing of %s by %s failed: %s (%d)", st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", pcmk_strerror(st_event->result), st_event->result); return; } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { crm_crit("We were allegedly just fenced by %s for %s!", st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin); /* Dumps blackbox if enabled */ qb_log_fini(); /* Try to get the above log message to disk - somehow */ /* Get out ASAP and do not come back up. * * Triggering a reboot is also not the worst idea either since * the rest of the cluster thinks we're safely down */ #ifdef RB_HALT_SYSTEM reboot(RB_HALT_SYSTEM); #endif /* * If reboot() fails or is not supported, coming back up will * probably lead to a situation where the other nodes set our * status to 'lost' because of the fencing callback and will * discard subsequent election votes with: * * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) * * So just stay dead, something is seriously messed up anyway. * */ exit(100); /* None of our wrappers since we already called qb_log_fini() */ return; } if (st_event->result == pcmk_ok && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { st_fail_count_reset(st_event->target); } crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s", st_event->target, st_event->result == pcmk_ok ? "" : " not", st_event->action, st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin, pcmk_strerror(st_event->result), st_event->id, st_event->client_origin ? st_event->client_origin : "<unknown>"); #if SUPPORT_CMAN if (st_event->result == pcmk_ok && is_cman_cluster()) { int local_rc = 0; int confirm = 0; char *target_copy = strdup(st_event->target); /* In case fenced hasn't noticed yet * * Any fencing that has been inititated will be completed by way of the fence_pcmk redirect */ local_rc = fenced_external(target_copy); if (local_rc != 0) { crm_err("Could not notify CMAN that '%s' is now fenced: %d", st_event->target, local_rc); } else { crm_notice("Notified CMAN that '%s' is now fenced", st_event->target); } /* In case fenced is already trying to shoot it */ confirm = open("/var/run/cluster/fenced_override", O_NONBLOCK|O_WRONLY); if (confirm > 0) { int ignore = 0; int len = strlen(target_copy); errno = 0; local_rc = write(confirm, target_copy, len); ignore = write(confirm, "\n", 1); if(ignore < 0 && errno == EBADF) { crm_trace("CMAN not expecting %s to be fenced (yet)", st_event->target); } else if (local_rc < len) { crm_perror(LOG_ERR, "Confirmation of CMAN fencing event for '%s' failed: %d", st_event->target, local_rc); } else { fsync(confirm); crm_notice("Confirmed CMAN fencing event for '%s'", st_event->target); } close(confirm); } free(target_copy); } #endif if (st_event->result == pcmk_ok) { crm_node_t *peer = crm_find_peer_full(0, st_event->target, CRM_GET_PEER_REMOTE | CRM_GET_PEER_CLUSTER); const char *uuid = NULL; gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); if (peer == NULL) { return; } uuid = crm_peer_uuid(peer); crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); if(AM_I_DC) { /* The DC always sends updates */ send_stonith_update(NULL, st_event->target, uuid); if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) { /* Abort the current transition graph if it wasn't us * that invoked stonith to fence someone */ crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); } /* Assume it was our leader if we dont currently have one */ } else if (fsa_our_dc == NULL || safe_str_eq(fsa_our_dc, st_event->target)) { crm_notice("Target %s our leader %s (recorded: %s)", fsa_our_dc ? "was" : "may have been", st_event->target, fsa_our_dc ? fsa_our_dc : "<unset>"); /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ if (we_are_executioner) { send_stonith_update(NULL, st_event->target, uuid); } stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(st_event->target)); } crmd_peer_down(peer, TRUE); } }
static void tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) { if(te_client_id == NULL) { te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, (unsigned long) getpid()); } if (st_event == NULL) { crm_err("Notify data not found"); return; } crmd_alert_fencing_op(st_event); if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) { crm_notice("%s was successfully unfenced by %s (at the request of %s)", st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin); /* TODO: Hook up st_event->device */ return; } else if (safe_str_eq("on", st_event->action)) { crm_err("Unfencing of %s by %s failed: %s (%d)", st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", pcmk_strerror(st_event->result), st_event->result); return; } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { crm_crit("We were allegedly just fenced by %s for %s!", st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin); /* Dumps blackbox if enabled */ qb_log_fini(); /* Try to get the above log message to disk - somehow */ /* Get out ASAP and do not come back up. * * Triggering a reboot is also not the worst idea either since * the rest of the cluster thinks we're safely down */ #ifdef RB_HALT_SYSTEM reboot(RB_HALT_SYSTEM); #endif /* * If reboot() fails or is not supported, coming back up will * probably lead to a situation where the other nodes set our * status to 'lost' because of the fencing callback and will * discard subsequent election votes with: * * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) * * So just stay dead, something is seriously messed up anyway. * */ exit(CRM_EX_FATAL); // None of our wrappers since we already called qb_log_fini() return; } /* Update the count of stonith failures for this target, in case we become * DC later. The current DC has already updated its fail count in * tengine_stonith_callback(). */ if (!AM_I_DC && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { if (st_event->result == pcmk_ok) { st_fail_count_reset(st_event->target); } else { st_fail_count_increment(st_event->target); } } crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " CRM_XS " initiator=%s ref=%s", st_event->target, st_event->result == pcmk_ok ? "" : " not", st_event->action, st_event->executioner ? st_event->executioner : "<anyone>", (st_event->client_origin? st_event->client_origin : "<unknown>"), pcmk_strerror(st_event->result), st_event->origin, st_event->id); if (st_event->result == pcmk_ok) { crm_node_t *peer = crm_find_peer_full(0, st_event->target, CRM_GET_PEER_ANY); const char *uuid = NULL; gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); if (peer == NULL) { return; } uuid = crm_peer_uuid(peer); crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); if(AM_I_DC) { /* The DC always sends updates */ send_stonith_update(NULL, st_event->target, uuid); /* @TODO Ideally, at this point, we'd check whether the fenced node * hosted any guest nodes, and call remote_node_down() for them. * Unfortunately, the controller doesn't have a simple, reliable way * to map hosts to guests. It might be possible to track this in the * peer cache via crm_remote_peer_cache_refresh(). For now, we rely * on the PE creating fence pseudo-events for the guests. */ if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) { /* Abort the current transition graph if it wasn't us * that invoked stonith to fence someone */ crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); } /* Assume it was our leader if we don't currently have one */ } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target)) && !is_set(peer->flags, crm_remote_node)) { crm_notice("Target %s our leader %s (recorded: %s)", fsa_our_dc ? "was" : "may have been", st_event->target, fsa_our_dc ? fsa_our_dc : "<unset>"); /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ if (we_are_executioner) { send_stonith_update(NULL, st_event->target, uuid); } add_stonith_cleanup(st_event->target); } /* If the target is a remote node, and we host its connection, * immediately fail all monitors so it can be recovered quickly. * The connection won't necessarily drop when a remote node is fenced, * so the failure might not otherwise be detected until the next poke. */ if (is_set(peer->flags, crm_remote_node)) { remote_ra_fail(st_event->target); } crmd_peer_down(peer, TRUE); } }