Example #1
static enum crmd_fsa_input
handle_failcount_op(xmlNode * stored_msg)
{
    const char *rsc = NULL;
    xmlNode *xml_rsc = get_xpath_object("//" XML_CIB_TAG_RESOURCE, stored_msg, LOG_ERR);

    if (xml_rsc) {
        rsc = ID(xml_rsc);
    }

    if (rsc) {
        char *attr = NULL;

        crm_info("Removing failcount for %s", rsc);

        attr = crm_concat("fail-count", rsc, '-');
        update_attrd(NULL, attr, NULL, NULL);
        free(attr);

        attr = crm_concat("last-failure", rsc, '-');
        update_attrd(NULL, attr, NULL, NULL);
        free(attr);

        lrm_clear_last_failure(rsc);
    } else {
        crm_log_xml_warn(stored_msg, "invalid failcount op");
    }

    return I_NULL;
}
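
Note on signatures: the examples on this page span two generations of update_attrd(). Examples #1 and #9 use the older four-argument form; the rest pass a trailing is_remote_node flag. A hedged sketch of the newer prototype, inferred from the call sites below rather than copied from a header (the fourth argument appears to be a user name used for ACL checks; this is an assumption):

/* Sketch only: prototype reconstructed from the five-argument call sites
 * on this page. A NULL value asks attrd to delete the attribute, as in the
 * fail-count/last-failure clearing above.
 */
void update_attrd(const char *host, const char *name, const char *value,
                  const char *user_name, gboolean is_remote_node);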
Example #2
enum crmd_fsa_input
handle_shutdown_request(xmlNode * stored_msg)
{
    /* handle here to avoid potential version issues
     *   where the shutdown message/procedure may have
     *   been changed in later versions.
     *
     * This way the DC is always in control of the shutdown
     */

    char *now_s = NULL;
    time_t now = time(NULL);
    const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM);

    if (host_from == NULL) {
        /* we're shutting down and are the DC */
        host_from = fsa_our_uname;
    }

    crm_info("Creating shutdown request for %s (state=%s)", host_from, fsa_state2string(fsa_state));
    crm_log_xml_trace(stored_msg, "message");

    now_s = crm_itoa(now);
    update_attrd(host_from, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, FALSE);
    free(now_s);

    /* will be picked up by the TE as long as it's running */
    return I_NULL;
}
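
Examples #2, #7, #8 and #9 all record the shutdown request by setting the node's shutdown attribute to the current epoch time. The idiom, condensed into a hypothetical helper (assumes <time.h> and the five-argument update_attrd):

/* Hypothetical helper illustrating the shutdown-timestamp idiom */
static void
set_shutdown_attr(const char *host, gboolean is_remote_node)
{
    time_t now = time(NULL);
    char *now_s = crm_itoa(now);    /* attrd stores the timestamp as a string */

    update_attrd(host, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, is_remote_node);
    free(now_s);
}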
Example #3
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node joining
 *
 * \param[in] node_name  Name of newly integrated pacemaker_remote node
 */
static void
remote_node_up(const char *node_name)
{
    int call_opt, call_id = 0;
    xmlNode *update, *state;
    crm_node_t *node;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing pacemaker_remote node %s", node_name);

    /* Clear node's operation history. The node's transient attributes should
     * and normally will be cleared when the node leaves, but since remote node
     * state has a number of corner cases, clear them here as well, to be sure.
     */
    call_opt = crmd_cib_smart_opt();
    erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);

    /* Clear node's probed attribute */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Ensure node is in the remote peer cache with member status */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0);

    /* pacemaker_remote nodes don't participate in the membership layer,
     * so cluster nodes don't automatically get notified when they come and go.
     * We send a cluster message to the DC, and update the CIB node state entry,
     * so the DC will get it sooner (via message) or later (via CIB refresh),
     * and any other interested parties can query the CIB.
     */
    send_remote_state_message(node_name, TRUE);

    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_cluster, update,
                                     __FUNCTION__);

    /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
     * needs to be fenced, this flag will allow various actions to determine
     * whether the fencing has happened yet.
     */
    crm_xml_add(state, XML_NODE_IS_FENCED, "0");

    /* TODO: If the remote connection drops, and this (async) CIB update either
     * failed or has not yet completed, later actions could mistakenly think the
     * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
     * previously set, because it won't have been cleared). This could prevent
     * actual fencing or allow recurring monitor failures to be cleared too
     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
     */
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
    }
    free_xml(update);
}
Example #4
void
do_dc_join_final(long long action,
                 enum crmd_fsa_cause cause,
                 enum crmd_fsa_state cur_state,
                 enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    crm_debug("Ensuring DC, quorum and node attributes are up-to-date");
    update_attrd(NULL, NULL, NULL, NULL, FALSE);
    crm_update_quorum(crm_have_quorum, TRUE);
}
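
Here every argument is NULL (plus FALSE): per the comment in Example #6, this asks attrd to write out all attributes rather than update a single one. Wrapped in a hypothetical helper for emphasis:

/* Hypothetical wrapper; the "write everything" semantics are taken from
 * the comment in Example #6.
 */
static void
flush_all_attributes(void)
{
    /* NULL host and NULL name ask attrd to write out every attribute */
    update_attrd(NULL, NULL, NULL, NULL, FALSE);
}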
Example #5
static enum crmd_fsa_input
handle_failcount_op(xmlNode * stored_msg)
{
    const char *rsc = NULL;
    const char *uname = NULL;
    gboolean is_remote_node = FALSE;
    xmlNode *xml_rsc = get_xpath_object("//" XML_CIB_TAG_RESOURCE, stored_msg, LOG_ERR);

    if (xml_rsc) {
        rsc = ID(xml_rsc);
    }

    uname = crm_element_value(stored_msg, XML_LRM_ATTR_TARGET);
    if (crm_element_value(stored_msg, XML_LRM_ATTR_ROUTER_NODE)) {
        is_remote_node = TRUE;
    }

    if (rsc) {
        char *attr = NULL;

        crm_info("Removing failcount for %s", rsc);

        attr = crm_concat("fail-count", rsc, '-');
        update_attrd(uname, attr, NULL, NULL, is_remote_node);
        free(attr);

        attr = crm_concat("last-failure", rsc, '-');
        update_attrd(uname, attr, NULL, NULL, is_remote_node);
        free(attr);

        lrm_clear_last_failure(rsc, uname);
    } else {
        crm_log_xml_warn(stored_msg, "invalid failcount op");
    }

    return I_NULL;
}
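
Compared with Example #1, this version resolves the target node from the message and forwards an is_remote_node flag. The clearing pattern itself, factored into a hypothetical helper (a NULL value deletes the attribute):

static void
clear_resource_failure_attrs(const char *uname, const char *rsc,
                             gboolean is_remote_node)
{
    char *attr = crm_concat("fail-count", rsc, '-');

    update_attrd(uname, attr, NULL, NULL, is_remote_node);  /* NULL => delete */
    free(attr);

    attr = crm_concat("last-failure", rsc, '-');
    update_attrd(uname, attr, NULL, NULL, is_remote_node);
    free(attr);
}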
Example #6
void
do_dc_join_final(long long action,
                 enum crmd_fsa_cause cause,
                 enum crmd_fsa_state cur_state,
                 enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    crm_debug("Ensuring DC, quorum and node attributes are up-to-date");
#if !HAVE_ATOMIC_ATTRD
    /* Ask attrd to write all attributes to disk. This is not needed for
     * atomic attrd because atomic attrd does a peer sync and write-out
     * when winning an election.
     */
    update_attrd(NULL, NULL, NULL, NULL, FALSE);
#endif
    crm_update_quorum(crm_have_quorum, TRUE);
}
Example #7
static void
remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
{
    lrm_state_t *lrm_state = userdata;
    const char *op = crm_element_value(msg, F_LRMD_IPC_OP);
    const char *session = crm_element_value(msg, F_LRMD_IPC_SESSION);
    int msg_id = 0;

    /* Sessions are raw IPC connections; all we do is proxy
     * requests/responses exactly as they are given to us at
     * the IPC level. */

    CRM_CHECK(op != NULL, return);
    CRM_CHECK(session != NULL, return);

    crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
    /* This is a message from a remote IPC client going to the real IPC server */

    if (safe_str_eq(op, LRMD_IPC_OP_SHUTDOWN_REQ)) {
        char *now_s = NULL;
        time_t now = time(NULL);

        crm_notice("Graceful proxy shutdown of %s", lrm_state->node_name);

        now_s = crm_itoa(now);
        update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE);
        free(now_s);

        remote_proxy_ack_shutdown(lrmd);
        return;

    } else if (safe_str_eq(op, LRMD_IPC_OP_NEW)) {
        int rc;
        const char *channel = crm_element_value(msg, F_LRMD_IPC_IPC_SERVER);

        CRM_CHECK(channel != NULL, return);

        if (remote_proxy_new(lrm_state->node_name, session, channel) == NULL) {
            remote_proxy_notify_destroy(lrmd, session);
        }
        crm_trace("new remote proxy client established to %s, session id %s", channel, session);

        /* Look up stonith-watchdog-timeout and send to the remote peer for validation */
        rc = fsa_cib_conn->cmds->query(fsa_cib_conn, XML_CIB_TAG_CRMCONFIG, NULL, cib_scope_local);
        fsa_cib_conn->cmds->register_callback_full(fsa_cib_conn, rc, 10, FALSE, lrmd, "remote_config_check", remote_config_check, NULL);
        
    } else if (safe_str_eq(op, LRMD_IPC_OP_DESTROY)) {
Example #8
static void
remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
{
    lrm_state_t *lrm_state = userdata;
    const char *op = crm_element_value(msg, F_LRMD_IPC_OP);
    const char *session = crm_element_value(msg, F_LRMD_IPC_SESSION);
    int msg_id = 0;

    /* Sessions are raw IPC connections; all we do is proxy
     * requests/responses exactly as they are given to us at
     * the IPC level. */

    CRM_CHECK(op != NULL, return);
    CRM_CHECK(session != NULL, return);

    crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
    /* This is a message from a remote IPC client going to the real IPC server */

    if (safe_str_eq(op, LRMD_IPC_OP_SHUTDOWN_REQ)) {
        char *now_s = NULL;
        time_t now = time(NULL);

        crm_notice("Graceful proxy shutdown of %s", lrm_state->node_name);

        now_s = crm_itoa(now);
        update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE);
        free(now_s);

        remote_proxy_ack_shutdown(lrmd);
        return;

    } else if (safe_str_eq(op, LRMD_IPC_OP_NEW)) {
        const char *channel = crm_element_value(msg, F_LRMD_IPC_IPC_SERVER);

        CRM_CHECK(channel != NULL, return);

        if (remote_proxy_new(lrm_state->node_name, session, channel) == NULL) {
            remote_proxy_notify_destroy(lrmd, session);
        }
        crm_trace("new remote proxy client established to %s, session id %s", channel, session);
    } else if (safe_str_eq(op, LRMD_IPC_OP_DESTROY)) {
Example #9
enum crmd_fsa_input
handle_shutdown_request(xmlNode * stored_msg)
{
    /* handle here to avoid potential version issues
     *   where the shutdown message/procedure may have
     *   been changed in later versions.
     *
     * This way the DC is always in control of the shutdown
     */

    char *now_s = NULL;
    time_t now = time(NULL);
    xmlNode *node_state = NULL;
    const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM);

    if (host_from == NULL) {
        /* we're shutting down and are the DC */
        host_from = fsa_our_uname;
    }

    crm_info("Creating shutdown request for %s (state=%s)", host_from, fsa_state2string(fsa_state));

    crm_log_xml(LOG_MSG, "message", stored_msg);

    node_state = create_node_state(host_from, NULL, NULL, NULL, NULL,
                                   CRMD_STATE_INACTIVE, FALSE, __FUNCTION__);

    fsa_cib_anon_update(XML_CIB_TAG_STATUS, node_state, cib_quorum_override);
    crm_log_xml_debug_2(node_state, "Shutdown update");
    free_xml(node_state);

    now_s = crm_itoa(now);
    update_attrd(host_from, XML_CIB_ATTR_SHUTDOWN, now_s, NULL);
    crm_free(now_s);

    /* will be picked up by the TE as long as it's running */
    return I_NULL;
}
Example #10
static gboolean
update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int target_rc, gboolean do_update)
{
    int interval = 0;

    char *task = NULL;
    char *rsc_id = NULL;
    char *attr_name = NULL;

    const char *value = NULL;
    const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
    const char *on_uname = get_uname_from_event(event);
    const char *origin = crm_element_value(event, XML_ATTR_ORIGIN);

    if (rc == 99) {
        /* this is an internal code for "we're busy, try again" */
        return FALSE;

    } else if (rc == target_rc) {
        return FALSE;
    }

    if (safe_str_eq(origin, "build_active_RAs")) {
        crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
                  id, rc, on_uname);
        return FALSE;
    }

    CRM_LOG_ASSERT(on_uname != NULL);
    if (on_uname == NULL) {
        return TRUE;
    }

    if (failed_stop_offset == NULL) {
        failed_stop_offset = strdup(INFINITY_S);
    }

    if (failed_start_offset == NULL) {
        failed_start_offset = strdup(INFINITY_S);
    }

    CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval), crm_err("Couldn't parse: %s", ID(event));
              goto bail);
    CRM_CHECK(task != NULL, goto bail);
    CRM_CHECK(rsc_id != NULL, goto bail);

    if (do_update || interval > 0) {
        do_update = TRUE;

    } else if (safe_str_eq(task, CRMD_ACTION_START)) {
        do_update = TRUE;
        value = failed_start_offset;

    } else if (safe_str_eq(task, CRMD_ACTION_STOP)) {
        do_update = TRUE;
        value = failed_stop_offset;

    } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) {
        do_update = TRUE;

    } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) {
        do_update = TRUE;
    }

    if (value == NULL || safe_str_neq(value, INFINITY_S)) {
        value = XML_NVPAIR_ATTR_VALUE "++";
    }

    if (do_update) {
        char *now = crm_itoa(time(NULL));
        gboolean is_remote_node = get_is_remote_from_event(event);

        crm_warn("Updating failcount for %s on %s after failed %s:"
                 " rc=%d (update=%s, time=%s)", rsc_id, on_uname, task, rc, value, now);

        attr_name = crm_concat("fail-count", rsc_id, '-');
        update_attrd(on_uname, attr_name, value, NULL, is_remote_node);
        free(attr_name);

        attr_name = crm_concat("last-failure", rsc_id, '-');
        update_attrd(on_uname, attr_name, now, NULL, is_remote_node);
        free(attr_name);

        free(now);
    }

  bail:
    free(rsc_id);
    free(task);
    return TRUE;
}
Example #11
/*!
 * \internal
 * \brief Update failure-related node attributes if warranted
 *
 * \param[in] event            XML describing operation that (maybe) failed
 * \param[in] event_node_uuid  Node that event occurred on
 * \param[in] rc               Actual operation return code
 * \param[in] target_rc        Expected operation return code
 * \param[in] do_update        If TRUE, do update regardless of operation type
 * \param[in] ignore_failures  If TRUE, update last failure but not fail count
 *
 * \return TRUE if this was not a direct nack, success or lrm status refresh
 */
static gboolean
update_failcount(xmlNode * event, const char *event_node_uuid, int rc,
                 int target_rc, gboolean do_update, gboolean ignore_failures)
{
    int interval = 0;

    char *task = NULL;
    char *rsc_id = NULL;

    const char *value = NULL;
    const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
    const char *on_uname = crm_peer_uname(event_node_uuid);
    const char *origin = crm_element_value(event, XML_ATTR_ORIGIN);

    /* Nothing needs to be done for success, lrm status refresh,
     * or direct nack (internal code for "busy, try again")
     */
    if ((rc == CRM_DIRECT_NACK_RC) || (rc == target_rc)) {
        return FALSE;
    } else if (safe_str_eq(origin, "build_active_RAs")) {
        crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
                  id, rc, on_uname);
        return FALSE;
    }

    /* Sanity check */
    CRM_CHECK(on_uname != NULL, return TRUE);
    CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval),
              crm_err("Couldn't parse: %s", ID(event)); goto bail);
    CRM_CHECK(task != NULL, goto bail);
    CRM_CHECK(rsc_id != NULL, goto bail);

    /* Decide whether update is necessary and what value to use */
    if ((interval > 0) || safe_str_eq(task, CRMD_ACTION_PROMOTE)
        || safe_str_eq(task, CRMD_ACTION_DEMOTE)) {
        do_update = TRUE;

    } else if (safe_str_eq(task, CRMD_ACTION_START)) {
        do_update = TRUE;
        if (failed_start_offset == NULL) {
            failed_start_offset = strdup(INFINITY_S);
        }
        value = failed_start_offset;

    } else if (safe_str_eq(task, CRMD_ACTION_STOP)) {
        do_update = TRUE;
        if (failed_stop_offset == NULL) {
            failed_stop_offset = strdup(INFINITY_S);
        }
        value = failed_stop_offset;
    }

    /* Fail count will be either incremented or set to infinity */
    if (value == NULL || safe_str_neq(value, INFINITY_S)) {
        value = XML_NVPAIR_ATTR_VALUE "++";
    }

    if (do_update) {
        char *now = crm_itoa(time(NULL));
        char *attr_name = NULL;
        gboolean is_remote_node = FALSE;

        if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) {
            is_remote_node = TRUE;
        }

        crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)",
                 (ignore_failures? "last failure" : "failcount"),
                 rsc_id, on_uname, task, rc, value, now);

        /* Update the fail count, if we're not ignoring failures */
        if (!ignore_failures) {
            attr_name = crm_concat("fail-count", rsc_id, '-');
            update_attrd(on_uname, attr_name, value, NULL, is_remote_node);
            free(attr_name);
        }

        /* Update the last failure time (even if we're ignoring failures,
         * so that failure can still be detected and shown, e.g. by crm_mon)
         */
        attr_name = crm_concat("last-failure", rsc_id, '-');
        update_attrd(on_uname, attr_name, now, NULL, is_remote_node);
        free(attr_name);

        free(now);
    }

  bail:
    free(rsc_id);
    free(task);
    return TRUE;
}
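
The update value XML_NVPAIR_ATTR_VALUE "++" deserves a note: XML_NVPAIR_ATTR_VALUE is the literal string "value", so attrd receives "value++", which it expands to "current value plus one" (my reading of attrd's ++/+= handling; treat as an assumption). As a hypothetical helper:

static void
bump_fail_count(const char *uname, const char *rsc_id, gboolean is_remote_node)
{
    char *attr_name = crm_concat("fail-count", rsc_id, '-');

    /* "value++" asks attrd to increment the attribute's current value */
    update_attrd(uname, attr_name, XML_NVPAIR_ATTR_VALUE "++", NULL,
                 is_remote_node);
    free(attr_name);
}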
Example #12
/* i.e., this is the notification that we have (or have not) been accepted */
void
do_cl_join_finalize_respond(long long action,
                            enum crmd_fsa_cause cause,
                            enum crmd_fsa_state cur_state,
                            enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    xmlNode *tmp1 = NULL;
    gboolean was_nack = TRUE;
    static gboolean first_join = TRUE;
    ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
    const char *start_state = daemon_option("node_start_state");

    int join_id = -1;
    const char *op = crm_element_value(input->msg, F_CRM_TASK);
    const char *ack_nack = crm_element_value(input->msg, CRM_OP_JOIN_ACKNAK);
    const char *welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);

    if (safe_str_neq(op, CRM_OP_JOIN_ACKNAK)) {
        crm_trace("Ignoring op=%s message", op);
        return;
    }

    /* determine whether it was an ack or a nack */
    if (crm_is_true(ack_nack)) {
        was_nack = FALSE;
    }

    crm_element_value_int(input->msg, F_CRM_JOIN_ID, &join_id);

    if (was_nack) {
        crm_err("Shutting down because cluster join with leader %s failed "
                CRM_XS" join-%d NACK'd", welcome_from, join_id);
        register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
        return;
    }

    if (AM_I_DC == FALSE && safe_str_eq(welcome_from, fsa_our_uname)) {
        crm_warn("Discarding our own welcome - we're no longer the DC");
        return;
    }

    if (update_dc(input->msg) == FALSE) {
        crm_warn("Discarding %s from node %s (expected from %s)",
                 op, welcome_from, fsa_our_dc);
        return;
    }

    update_dc_expected(input->msg);

    /* send our status section to the DC */
    tmp1 = do_lrm_query(TRUE, fsa_our_uname);
    if (tmp1 != NULL) {
        xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1, fsa_our_dc,
                                        CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);

        crm_xml_add_int(reply, F_CRM_JOIN_ID, join_id);

        crm_debug("Confirming join-%d: sending local operation history to %s",
                  join_id, fsa_our_dc);

        /*
         * If this is the node's first join since the crmd started on it, clear
         * any previous transient node attributes, to handle the case where
         * the node restarted so quickly that the cluster layer didn't notice.
         *
         * Do not remove the resources though, they'll be cleaned up in
         * do_dc_join_ack(). Removing them here creates a race condition if the
         * crmd is being recovered. Instead of a list of active resources from
         * the lrmd, we may end up with a blank status section. If we are _NOT_
         * lucky, we will probe for the "wrong" instance of anonymous clones and
         * end up with multiple active instances on the machine.
         */
        if (first_join && is_not_set(fsa_input_register, R_SHUTDOWN)) {
            first_join = FALSE;
            erase_status_tag(fsa_our_uname, XML_TAG_TRANSIENT_NODEATTRS, 0);
            update_attrd(fsa_our_uname, "terminate", NULL, NULL, FALSE);
            update_attrd(fsa_our_uname, XML_CIB_ATTR_SHUTDOWN, "0", NULL, FALSE);

            if (start_state) {
                set_join_state(start_state);
            }
        }

        send_cluster_message(crm_get_peer(0, fsa_our_dc), crm_msg_crmd, reply, TRUE);
        free_xml(reply);

        if (AM_I_DC == FALSE) {
            register_fsa_input_adv(cause, I_NOT_DC, NULL, A_NOTHING, TRUE, __FUNCTION__);
            update_attrd(NULL, NULL, NULL, NULL, FALSE);
        }

        free_xml(tmp1);

    } else {
        crm_err("Could not confirm join-%d with %s: Local operation history failed",
                join_id, fsa_our_dc);
        register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
    }
}
Example #13
void
tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
{
    char *uuid = NULL;
    int stonith_id = -1;
    int transition_id = -1;
    crm_action_t *action = NULL;
    int call_id = data->call_id;
    int rc = data->rc;
    char *userdata = data->userdata;

    CRM_CHECK(userdata != NULL, return);
    crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
               pcmk_strerror(rc), rc);

    if (AM_I_DC == FALSE) {
        return;
    }

    /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
    /*       op->call_id, op->optype, op->node_name, op->op_result, */
    /*       (char *)op->node_list, op->private_data); */

    /* filter out old STONITH actions */
    CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL),
              goto bail);

    if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid)
        || transition_graph->id != transition_id) {
        crm_info("Ignoring STONITH action initiated outside of the current transition");
        goto bail;
    }

    action = get_action(stonith_id, FALSE);
    if (action == NULL) {
        crm_err("Stonith action not matched");
        goto bail;
    }

    stop_te_timer(action->timer);
    if (rc == pcmk_ok) {
        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
        const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
        const char *op = crm_meta_value(action->params, "stonith_action"); 

        crm_info("Stonith operation %d for %s passed", call_id, target);
        if (action->confirmed == FALSE) {
            te_action_confirmed(action);
            if (safe_str_eq("on", op)) {
                const char *value = NULL;
                char *now = crm_itoa(time(NULL));

                update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE);
                free(now);

                value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
                update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE);

                value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
                update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE);

            } else if (action->sent_update == FALSE) {
                send_stonith_update(action, target, uuid);
                action->sent_update = TRUE;
            }
        }
        st_fail_count_reset(target);

    } else {
        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
        enum transition_action abort_action = tg_restart;

        action->failed = TRUE;
        crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
                   call_id, target, pcmk_strerror(rc));

        /* If no fence devices were available, there's no use in immediately
         * checking again, so don't start a new transition in that case.
         */
        if (rc == -ENODEV) {
            crm_warn("No devices found in cluster to fence %s, giving up",
                     target);
            abort_action = tg_stop;
        }

        /* Increment the fail count now, so abort_for_stonith_failure() can
         * check it. Non-DC nodes will increment it in tengine_stonith_notify().
         */
        st_fail_count_increment(target);
        abort_for_stonith_failure(abort_action, target, NULL);
    }

    update_graph(transition_graph, action);
    trigger_graph();

  bail:
    free(userdata);
    free(uuid);
    return;
}
Example #14
/* i.e., this is the notification that we have (or have not) been accepted */
void
do_cl_join_finalize_respond(long long action,
                            enum crmd_fsa_cause cause,
                            enum crmd_fsa_state cur_state,
                            enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    xmlNode *tmp1 = NULL;
    gboolean was_nack = TRUE;
    static gboolean first_join = TRUE;
    ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);

    int join_id = -1;
    const char *op = crm_element_value(input->msg, F_CRM_TASK);
    const char *ack_nack = crm_element_value(input->msg, CRM_OP_JOIN_ACKNAK);
    const char *welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);

    if (safe_str_neq(op, CRM_OP_JOIN_ACKNAK)) {
        crm_trace("Ignoring op=%s message", op);
        return;
    }

    /* determine whether it was an ack or a nack */
    if (crm_is_true(ack_nack)) {
        was_nack = FALSE;
    }

    crm_element_value_int(input->msg, F_CRM_JOIN_ID, &join_id);

    if (was_nack) {
        crm_err("Join (join-%d) with leader %s failed (NACK'd): Shutting down",
                join_id, welcome_from);
        register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
        return;
    }

    if (AM_I_DC == FALSE && safe_str_eq(welcome_from, fsa_our_uname)) {
        crm_warn("Discarding our own welcome - we're no longer the DC");
        return;
    }

    if (update_dc(input->msg) == FALSE) {
        crm_warn("Discarding %s from %s (expected %s)", op, welcome_from, fsa_our_dc);
        return;
    }

    /* send our status section to the DC */
    crm_debug("Confirming join join-%d: %s", join_id, crm_element_value(input->msg, F_CRM_TASK));
    tmp1 = do_lrm_query(TRUE, fsa_our_uname);
    if (tmp1 != NULL) {
        xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1, fsa_our_dc,
                                        CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);

        crm_xml_add_int(reply, F_CRM_JOIN_ID, join_id);

        crm_debug("join-%d: Join complete."
                  "  Sending local LRM status to %s", join_id, fsa_our_dc);

        if (first_join) {
            first_join = FALSE;

            /*
             * Clear any previous transient node attribute and lrm operations
             *
             * Corosync has a nasty habit of not being able to tell if a
             *   node is returning or didn't leave in the first place.
             * This confuses Pacemaker because it never gets a "node up"
             *   event which is normally used to clean up the status section.
             *
             * Do not remove the resources though, they'll be cleaned up in
             *   do_dc_join_ack().  Removing them here creates a race
             *   condition if the crmd is being recovered.
             * Instead of a list of active resources from the lrmd
             *   we may end up with a blank status section.
             * If we are _NOT_ lucky, we will probe for the "wrong" instance
             *   of anonymous clones and end up with multiple active
             *   instances on the machine.
             */
            erase_status_tag(fsa_our_uname, XML_TAG_TRANSIENT_NODEATTRS, 0);

            /* Just in case attrd was still around too */
            if (is_not_set(fsa_input_register, R_SHUTDOWN)) {
                update_attrd(fsa_our_uname, "terminate", NULL, NULL, FALSE);
                update_attrd(fsa_our_uname, XML_CIB_ATTR_SHUTDOWN, "0", NULL, FALSE);
            }
        }

        send_cluster_message(crm_get_peer(0, fsa_our_dc), crm_msg_crmd, reply, TRUE);
        free_xml(reply);

        if (AM_I_DC == FALSE) {
            register_fsa_input_adv(cause, I_NOT_DC, NULL, A_NOTHING, TRUE, __FUNCTION__);
            update_attrd(NULL, NULL, NULL, NULL, FALSE);
        }

        free_xml(tmp1);

    } else {
        crm_err("Could not send our LRM state to the DC");
        register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
    }
}