Ejemplo n.º 1
0
static void
tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
{
    if(te_client_id == NULL) {
        te_client_id = crm_strdup_printf("%s.%d", crm_system_name, getpid());
    }

    if (st_event == NULL) {
        crm_err("Notify data not found");
        return;
    }

    crmd_notify_fencing_op(st_event);

    if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) {
        crm_notice("%s was successfully unfenced by %s (at the request of %s)",
                   st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin);
                /* TODO: Hook up st_event->device */
        return;

    } else if (safe_str_eq("on", st_event->action)) {
        crm_err("Unfencing of %s by %s failed: %s (%d)",
                st_event->target, st_event->executioner ? st_event->executioner : "<anyone>",
                pcmk_strerror(st_event->result), st_event->result);
        return;

    } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) {
        crm_crit("We were allegedly just fenced by %s for %s!",
                 st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin); /* Dumps blackbox if enabled */

        qb_log_fini(); /* Try to get the above log message to disk - somehow */

        /* Get out ASAP and do not come back up.
         *
         * Triggering a reboot is also not the worst idea either since
         * the rest of the cluster thinks we're safely down
         */

#ifdef RB_HALT_SYSTEM
        reboot(RB_HALT_SYSTEM);
#endif

        /*
         * If reboot() fails or is not supported, coming back up will
         * probably lead to a situation where the other nodes set our
         * status to 'lost' because of the fencing callback and will
         * discard subsequent election votes with:
         *
         * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster)
         *
         * So just stay dead, something is seriously messed up anyway.
         *
         */
        exit(100); /* None of our wrappers since we already called qb_log_fini() */
        return;
    }

    if (st_event->result == pcmk_ok &&
        safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
        st_fail_count_reset(st_event->target);
    }

    crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s",
               st_event->target, st_event->result == pcmk_ok ? "" : " not",
               st_event->action,
               st_event->executioner ? st_event->executioner : "<anyone>",
               st_event->origin, pcmk_strerror(st_event->result), st_event->id,
               st_event->client_origin ? st_event->client_origin : "<unknown>");

#if SUPPORT_CMAN
    if (st_event->result == pcmk_ok && is_cman_cluster()) {
        int local_rc = 0;
        int confirm = 0;
        char *target_copy = strdup(st_event->target);

        /* In case fenced hasn't noticed yet
         *
         * Any fencing that has been inititated will be completed by way of the fence_pcmk redirect
         */
        local_rc = fenced_external(target_copy);
        if (local_rc != 0) {
            crm_err("Could not notify CMAN that '%s' is now fenced: %d", st_event->target,
                    local_rc);
        } else {
            crm_notice("Notified CMAN that '%s' is now fenced", st_event->target);
        }

        /* In case fenced is already trying to shoot it */
        confirm = open("/var/run/cluster/fenced_override", O_NONBLOCK|O_WRONLY);
        if (confirm >= 0) {
            int ignore = 0;
            int len = strlen(target_copy);

            errno = 0;
            local_rc = write(confirm, target_copy, len);
            ignore = write(confirm, "\n", 1);

            if(ignore < 0 && errno == EBADF) {
                crm_trace("CMAN not expecting %s to be fenced (yet)", st_event->target);

            } else if (local_rc < len) {
                crm_perror(LOG_ERR, "Confirmation of CMAN fencing event for '%s' failed: %d", st_event->target, local_rc);

            } else {
                fsync(confirm);
                crm_notice("Confirmed CMAN fencing event for '%s'", st_event->target);
            }
            close(confirm);
        }
        free(target_copy);
    }
#endif

    if (st_event->result == pcmk_ok) {
        crm_node_t *peer = crm_find_peer_full(0, st_event->target, CRM_GET_PEER_ANY);
        const char *uuid = NULL;
        gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname);

        if (peer == NULL) {
            return;
        }

        uuid = crm_peer_uuid(peer);

        crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
        if(AM_I_DC) {
            /* The DC always sends updates */
            send_stonith_update(NULL, st_event->target, uuid);

            if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) {

                /* Abort the current transition graph if it wasn't us
                 * that invoked stonith to fence someone
                 */
                crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
                abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
            }

            /* Assume it was our leader if we dont currently have one */
        } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target))
            && !is_set(peer->flags, crm_remote_node)) {

            crm_notice("Target %s our leader %s (recorded: %s)",
                       fsa_our_dc ? "was" : "may have been", st_event->target,
                       fsa_our_dc ? fsa_our_dc : "<unset>");

            /* Given the CIB resyncing that occurs around elections,
             * have one node update the CIB now and, if the new DC is different,
             * have them do so too after the election
             */
            if (we_are_executioner) {
                send_stonith_update(NULL, st_event->target, uuid);
            }
            add_stonith_cleanup(st_event->target);
        }

        /* If the target is a remote node, and we host its connection,
         * immediately fail all monitors so it can be recovered quickly.
         * The connection won't necessarily drop when a remote node is fenced,
         * so the failure might not otherwise be detected until the next poke.
         */
        if (is_set(peer->flags, crm_remote_node)) {
            remote_ra_fail(st_event->target);
        }

        crmd_peer_down(peer, TRUE);
     }
}
Ejemplo n.º 2
0
static void
tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
{
    if(te_client_id == NULL) {
        te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
                                         (unsigned long) getpid());
    }

    if (st_event == NULL) {
        crm_err("Notify data not found");
        return;
    }

    crmd_alert_fencing_op(st_event);

    if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) {
        crm_notice("%s was successfully unfenced by %s (at the request of %s)",
                   st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin);
                /* TODO: Hook up st_event->device */
        return;

    } else if (safe_str_eq("on", st_event->action)) {
        crm_err("Unfencing of %s by %s failed: %s (%d)",
                st_event->target, st_event->executioner ? st_event->executioner : "<anyone>",
                pcmk_strerror(st_event->result), st_event->result);
        return;

    } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) {
        crm_crit("We were allegedly just fenced by %s for %s!",
                 st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin); /* Dumps blackbox if enabled */

        qb_log_fini(); /* Try to get the above log message to disk - somehow */

        /* Get out ASAP and do not come back up.
         *
         * Triggering a reboot is also not the worst idea either since
         * the rest of the cluster thinks we're safely down
         */

#ifdef RB_HALT_SYSTEM
        reboot(RB_HALT_SYSTEM);
#endif

        /*
         * If reboot() fails or is not supported, coming back up will
         * probably lead to a situation where the other nodes set our
         * status to 'lost' because of the fencing callback and will
         * discard subsequent election votes with:
         *
         * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster)
         *
         * So just stay dead, something is seriously messed up anyway.
         *
         */
        exit(CRM_EX_FATAL); // None of our wrappers since we already called qb_log_fini()
        return;
    }

    /* Update the count of stonith failures for this target, in case we become
     * DC later. The current DC has already updated its fail count in
     * tengine_stonith_callback().
     */
    if (!AM_I_DC && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) {
        if (st_event->result == pcmk_ok) {
            st_fail_count_reset(st_event->target);
        } else {
            st_fail_count_increment(st_event->target);
        }
    }

    crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
               CRM_XS " initiator=%s ref=%s",
               st_event->target, st_event->result == pcmk_ok ? "" : " not",
               st_event->action,
               st_event->executioner ? st_event->executioner : "<anyone>",
               (st_event->client_origin? st_event->client_origin : "<unknown>"),
               pcmk_strerror(st_event->result),
               st_event->origin, st_event->id);

    if (st_event->result == pcmk_ok) {
        crm_node_t *peer = crm_find_peer_full(0, st_event->target, CRM_GET_PEER_ANY);
        const char *uuid = NULL;
        gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname);

        if (peer == NULL) {
            return;
        }

        uuid = crm_peer_uuid(peer);

        crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
        if(AM_I_DC) {
            /* The DC always sends updates */
            send_stonith_update(NULL, st_event->target, uuid);

            /* @TODO Ideally, at this point, we'd check whether the fenced node
             * hosted any guest nodes, and call remote_node_down() for them.
             * Unfortunately, the controller doesn't have a simple, reliable way
             * to map hosts to guests. It might be possible to track this in the
             * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
             * on the PE creating fence pseudo-events for the guests.
             */

            if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) {

                /* Abort the current transition graph if it wasn't us
                 * that invoked stonith to fence someone
                 */
                crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
                abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
            }

            /* Assume it was our leader if we don't currently have one */
        } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target))
            && !is_set(peer->flags, crm_remote_node)) {

            crm_notice("Target %s our leader %s (recorded: %s)",
                       fsa_our_dc ? "was" : "may have been", st_event->target,
                       fsa_our_dc ? fsa_our_dc : "<unset>");

            /* Given the CIB resyncing that occurs around elections,
             * have one node update the CIB now and, if the new DC is different,
             * have them do so too after the election
             */
            if (we_are_executioner) {
                send_stonith_update(NULL, st_event->target, uuid);
            }
            add_stonith_cleanup(st_event->target);
        }

        /* If the target is a remote node, and we host its connection,
         * immediately fail all monitors so it can be recovered quickly.
         * The connection won't necessarily drop when a remote node is fenced,
         * so the failure might not otherwise be detected until the next poke.
         */
        if (is_set(peer->flags, crm_remote_node)) {
            remote_ra_fail(st_event->target);
        }

        crmd_peer_down(peer, TRUE);
     }
}