void
send_stonith_update(stonith_ops_t * op)
{
	enum cib_errors rc = cib_ok;
	const char *target = op->node_name;
	const char *uuid   = op->node_uuid;
	
	/* zero out the node-status & remove all LRM status info */
	xmlNode *node_state = create_xml_node(NULL, XML_CIB_TAG_STATE);
	
	CRM_CHECK(op->node_name != NULL, return);
	CRM_CHECK(op->node_uuid != NULL, return);
	
	crm_xml_add(node_state, XML_ATTR_UUID,  uuid);
	crm_xml_add(node_state, XML_ATTR_UNAME, target);
	crm_xml_add(node_state, XML_CIB_ATTR_HASTATE,   DEADSTATUS);
	crm_xml_add(node_state, XML_CIB_ATTR_INCCM,     XML_BOOLEAN_NO);
	crm_xml_add(node_state, XML_CIB_ATTR_CRMDSTATE, OFFLINESTATUS);
	crm_xml_add(node_state, XML_CIB_ATTR_JOINSTATE, CRMD_JOINSTATE_DOWN);
	crm_xml_add(node_state, XML_CIB_ATTR_EXPSTATE,  CRMD_JOINSTATE_DOWN);
	crm_xml_add(node_state, XML_ATTR_ORIGIN,   __FUNCTION__);
	
	rc = fsa_cib_conn->cmds->update(
		fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
		cib_quorum_override|cib_scope_local|cib_can_create);	
	
	if(rc < cib_ok) {
		crm_err("CIB update failed: %s", cib_error2string(rc));
		abort_transition(
			INFINITY, tg_shutdown, "CIB update failed", node_state);
		
	} else {
		/* delay processing the trigger until the update completes */
	    add_cib_op_callback(fsa_cib_conn, rc, FALSE, crm_strdup(target), cib_fencing_updated);
	}

	erase_status_tag(op->node_name, XML_CIB_TAG_LRM, cib_scope_local);
	erase_status_tag(op->node_name, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
	
	free_xml(node_state);

#if 0
	/* Make sure the membership cache is accurate */ 
	crm_update_peer(0, 0, 0, -1, 0, uuid, target, NULL, CRM_NODE_LOST);
#endif
	
	return;
}
Beispiel #2
0
void
send_stonith_update(crm_action_t * action, const char *target, const char *uuid)
{
    int rc = pcmk_ok;
    crm_node_t *peer = NULL;

    /* zero out the node-status & remove all LRM status info */
    xmlNode *node_state = NULL;

    CRM_CHECK(target != NULL, return);
    CRM_CHECK(uuid != NULL, return);

    if(get_node_uuid(0, target) == NULL) {
        set_node_uuid(target, uuid);
    }

    /* Make sure the membership and join caches are accurate */
    peer = crm_get_peer(0, target);
    if(peer->uuid == NULL) {
        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
        peer->uuid = strdup(uuid);
    }
    crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL);
    crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0);
    crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN);
    erase_node_from_join(target);

    node_state = do_update_node_cib(peer, node_update_cluster|node_update_peer|node_update_join|node_update_expected, NULL, __FUNCTION__);

    /* Force our known ID */
    crm_xml_add(node_state, XML_ATTR_UUID, uuid);

    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
                                    cib_quorum_override | cib_scope_local | cib_can_create);

    /* Delay processing the trigger until the update completes */
    crm_debug("Sending fencing update %d for %s", rc, target);
    add_cib_op_callback(fsa_cib_conn, rc, FALSE, strdup(target), cib_fencing_updated);

    /* Make sure it sticks */
    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */

    erase_status_tag(target, XML_CIB_TAG_LRM, cib_scope_local);
    erase_status_tag(target, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);

    free_xml(node_state);
    return;
}
Beispiel #3
0
void
send_stonith_update(crm_action_t * action, const char *target, const char *uuid)
{
    enum cib_errors rc = cib_ok;

    /* zero out the node-status & remove all LRM status info */
    xmlNode *node_state = create_xml_node(NULL, XML_CIB_TAG_STATE);

    CRM_CHECK(target != NULL, return);
    CRM_CHECK(uuid != NULL, return);

    crm_xml_add(node_state, XML_ATTR_UUID, uuid);
    crm_xml_add(node_state, XML_ATTR_UNAME, target);
    crm_xml_add(node_state, XML_CIB_ATTR_HASTATE, DEADSTATUS);
    crm_xml_add(node_state, XML_CIB_ATTR_INCCM, XML_BOOLEAN_NO);
    crm_xml_add(node_state, XML_CIB_ATTR_CRMDSTATE, OFFLINESTATUS);
    crm_xml_add(node_state, XML_CIB_ATTR_JOINSTATE, CRMD_JOINSTATE_DOWN);
    crm_xml_add(node_state, XML_CIB_ATTR_EXPSTATE, CRMD_JOINSTATE_DOWN);
    crm_xml_add(node_state, XML_ATTR_ORIGIN, __FUNCTION__);

    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
                                    cib_quorum_override | cib_scope_local | cib_can_create);

    /* Delay processing the trigger until the update completes */
    crm_info("Sending fencing update %d for %s", rc, target);
    add_cib_op_callback(fsa_cib_conn, rc, FALSE, crm_strdup(target), cib_fencing_updated);

    /* Make sure it sticks */
    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */

    erase_status_tag(target, XML_CIB_TAG_LRM, cib_scope_local);
    erase_status_tag(target, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);

    free_xml(node_state);

    /* Make sure the membership cache is accurate */
    crm_update_peer(0, 0, 0, -1, crm_proc_none, uuid, target, NULL, CRM_NODE_LOST);

    return;
}
Beispiel #4
0
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * \param[in] node_name  Name of lost node
 * \param[in] opts       Whether to keep or erase LRM history
 */
static void
remote_node_down(const char *node_name, const enum down_opts opts)
{
    xmlNode *update;
    int call_id = 0;
    int call_opt = crmd_cib_smart_opt();
    crm_node_t *node;

    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Purge node's transient attributes */
    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);

    /* Normally, the LRM operation history should be kept until the node comes
     * back up. However, after a successful fence, we want to clear it, so we
     * don't think resources are still running on the node.
     */
    if (opts == DOWN_ERASE_LRM) {
        erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
    }

    /* Ensure node is in the remote peer cache with lost state */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);

    /* Notify DC */
    send_remote_state_message(node_name, FALSE);

    /* Update CIB node state */
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    create_node_state_update(node, node_update_cluster, update, __FUNCTION__);
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_ERR, "%s CIB node state update", node_name);
    }
    free_xml(update);
}
Beispiel #5
0
/*	A_DC_JOIN_PROCESS_ACK	*/
void
do_dc_join_ack(long long action,
               enum crmd_fsa_cause cause,
               enum crmd_fsa_state cur_state,
               enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    int join_id = -1;
    int call_id = 0;
    ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);

    const char *op = crm_element_value(join_ack->msg, F_CRM_TASK);
    const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);
    crm_node_t *peer = crm_get_peer(0, join_from);

    if (safe_str_neq(op, CRM_OP_JOIN_CONFIRM) || peer == NULL) {
        crm_debug("Ignoring op=%s message from %s", op, join_from);
        return;
    }

    crm_trace("Processing ack from %s", join_from);
    crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id);

    if (peer->join != crm_join_finalized) {
        crm_info("Join not in progress: ignoring join-%d from %s (phase = %d)",
                 join_id, join_from, peer->join);
        return;

    } else if (join_id != current_join_id) {
        crm_err("Invalid response from %s: join-%d vs. join-%d",
                join_from, join_id, current_join_id);
        crm_update_peer_join(__FUNCTION__, peer, crm_join_nack);
        return;
    }

    crm_update_peer_join(__FUNCTION__, peer, crm_join_confirmed);

    crm_info("join-%d: Updating node state to %s for %s",
             join_id, CRMD_JOINSTATE_MEMBER, join_from);

    /* update CIB with the current LRM status from the node
     * We dont need to notify the TE of these updates, a transition will
     *   be started in due time
     */
    erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local);
    fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml,
                   cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL);
    fsa_register_cib_callback(call_id, FALSE, NULL, join_update_complete_callback);
    crm_debug("join-%d: Registered callback for LRM update %d", join_id, call_id);
}
/*
	CRM_OP_JOIN_ACKNACKを送信後、ノードからCRM_OP_JOIN_CONFIRMメッセージが送られて来たときの処理
*/
void
do_dc_join_ack(long long action,
	       enum crmd_fsa_cause cause,
	       enum crmd_fsa_state cur_state,
	       enum crmd_fsa_input current_input,
	       fsa_data_t *msg_data)
{
	int join_id = -1;
	int call_id = 0;
	ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);

	const char *join_id_s  = NULL;
	const char *join_state = NULL;
	const char *op         = crm_element_value(join_ack->msg, F_CRM_TASK);
	const char *join_from  = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);

	if(safe_str_neq(op, CRM_OP_JOIN_CONFIRM)) {
		/* CRM_OP_JOIN_CONFIRMメッセージ以外のメッセージは無視してログを出力 */
		crm_debug("Ignoring op=%s message from %s", op, join_from);
		return;
	} 

	crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id);
	join_id_s = crm_element_value(join_ack->msg, F_CRM_JOIN_ID);

	/* now update them to "member" */
	
	crm_debug_2("Processing ack from %s", join_from);

	/* CRM_OP_JOIN_CONFIRMを送ってきたノードがfinalized_nodesハッシュテーブルに含まれるかサーチする */
	join_state = (const char *)
		g_hash_table_lookup(finalized_nodes, join_from);
	
	if(join_state == NULL) {
		/* 含まれていない場合は、無視 */
		crm_err("Join not in progress: ignoring join-%d from %s",
			join_id, join_from);
		return;
		
	} else if(safe_str_neq(join_state, CRMD_JOINSTATE_MEMBER)) {
		crm_err("Node %s wasnt invited to join the cluster",join_from);
		/* 含まれていて、CRMD_JOINSTATE_MEMBER状態の場合は、finalized_nodesハッシュテーブルからCRM_OP_JOIN_CONFIRMを送ってきたノードを削除 */
		g_hash_table_remove(finalized_nodes, join_from);
		return;
		
	} else if(join_id != current_join_id) {
		crm_err("Invalid response from %s: join-%d vs. join-%d",
			join_from, join_id, current_join_id);
		/* JOIN_IDが異なる場合も、finalized_nodesハッシュテーブルからCRM_OP_JOIN_CONFIRMを送ってきたノードを削除 */
		g_hash_table_remove(finalized_nodes, join_from);
		return;
	}
	
	/* 上記以外の場合も、finalized_nodesハッシュテーブルからCRM_OP_JOIN_CONFIRMを送ってきたノードを削除 */
	g_hash_table_remove(finalized_nodes, join_from);
	
	/* confirmed_nodesハッシュテーブルにRM_OP_JOIN_CONFIRMを送ってきたノードでサーチ */
	if(g_hash_table_lookup(confirmed_nodes, join_from) != NULL) {
		/* 既に登録済みの場合は、エラーメッセージを表示 */
		crm_err("join-%d: hash already contains confirmation from %s",
			join_id, join_from);
	}

	/* confirmed_nodesハッシュテーブルにCRM_OP_JOIN_CONFIRMを送ってきたノードを追加 */
	g_hash_table_insert(
		confirmed_nodes, crm_strdup(join_from), crm_strdup(join_id_s));

	/* join完了ログを出す */
 	crm_info("join-%d: Updating node state to %s for %s",
 		 join_id, CRMD_JOINSTATE_MEMBER, join_from);

	/* update CIB with the current LRM status from the node
	 * We dont need to notify the TE of these updates, a transition will
	 *   be started in due time
	 */
	/* CIBからCRM_OP_JOIN_CONFIRMを送ってきたノードのXML_CIB_TAG_LRM情報を消去 */
	erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local);
	/* CIBを更新 */
	/* CRM_OP_JOIN_CONFIRMを送ってきたノードのXML_CIB_TAG_LRM情報も追加される */
	/* クラスタ構成ノードのLRMD情報が最新で更新 */
	fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml,
		       cib_scope_local|cib_quorum_override|cib_can_create, call_id);
	/* CIBの更新コールバックをセット */
	add_cib_op_callback(
		fsa_cib_conn, call_id, FALSE, NULL, join_update_complete_callback);
 	crm_debug("join-%d: Registered callback for LRM update %d",
		  join_id, call_id);
}
Beispiel #7
0
void
peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
{
    uint32_t old = 0;
    uint32_t changed = 0;
    bool appeared = FALSE;
    bool is_remote = is_set(node->flags, crm_remote_node);
    const char *status = NULL;

    /* Crmd waits to receive some information from the membership layer before
     * declaring itself operational. If this is being called for a cluster node,
     * indicate that we have it.
     */
    if (!is_remote) {
        set_bit(fsa_input_register, R_PEER_DATA);
    }

    if (node->uname == NULL) {
        return;
    }

    switch (type) {
        case crm_status_uname:
            /* If we've never seen the node, then it also won't be in the status section */
            crm_info("%s node %s is now %s",
                     (is_remote? "Remote" : "Cluster"),
                     node->uname, state_text(node->state));
            return;

        case crm_status_rstate:
        case crm_status_nstate:
            /* This callback should not be called unless the state actually
             * changed, but here's a failsafe just in case.
             */
            CRM_CHECK(safe_str_neq(data, node->state), return);

            crm_info("%s node %s is now %s (was %s)",
                     (is_remote? "Remote" : "Cluster"),
                     node->uname, state_text(node->state), state_text(data));

            if (safe_str_eq(CRM_NODE_MEMBER, node->state)) {
                appeared = TRUE;
                if (!is_remote) {
                    remove_stonith_cleanup(node->uname);
                }
            }

            crmd_alert_node_event(node);
            break;

        case crm_status_processes:
            if (data) {
                old = *(const uint32_t *)data;
                changed = node->processes ^ old;
            }

            status = (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS;
            crm_info("Client %s/%s now has status [%s] (DC=%s, changed=%6x)",
                     node->uname, peer2text(proc_flags), status,
                     AM_I_DC ? "true" : crm_str(fsa_our_dc), changed);

            if ((changed & proc_flags) == 0) {
                /* Peer process did not change */
                crm_trace("No change %6x %6x %6x", old, node->processes, proc_flags);
                return;
            } else if (is_not_set(fsa_input_register, R_CIB_CONNECTED)) {
                crm_trace("Not connected");
                return;
            } else if (fsa_state == S_STOPPING) {
                crm_trace("Stopping");
                return;
            }

            appeared = (node->processes & proc_flags) != 0;
            if (safe_str_eq(node->uname, fsa_our_uname) && (node->processes & proc_flags) == 0) {
                /* Did we get evicted? */
                crm_notice("Our peer connection failed");
                register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL);

            } else if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) {
                /* Did the DC leave us? */
                crm_notice("Our peer on the DC (%s) is dead", fsa_our_dc);
                register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);

                /* @COMPAT DC < 1.1.13: If a DC shuts down normally, we don't
                 * want to fence it. Newer DCs will send their shutdown request
                 * to all peers, who will update the DC's expected state to
                 * down, thus avoiding fencing. We can safely erase the DC's
                 * transient attributes when it leaves in that case. However,
                 * the only way to avoid fencing older DCs is to leave the
                 * transient attributes intact until it rejoins.
                 */
                if (compare_version(fsa_our_dc_version, "3.0.9") > 0) {
                    erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
                }

            } else if(AM_I_DC && appeared == FALSE) {
                crm_info("Peer %s left us", node->uname);
                erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
            }
            break;
    }

    if (AM_I_DC) {
        xmlNode *update = NULL;
        int flags = node_update_peer;
        gboolean alive = is_remote? appeared : crm_is_peer_active(node);
        crm_action_t *down = match_down_event(node->uuid, appeared);

        crm_trace("Alive=%d, appeared=%d, down=%d",
                  alive, appeared, (down? down->id : -1));

        if (alive && type == crm_status_processes) {
            register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
        }

        if (down) {
            const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK);

            if (safe_str_eq(task, CRM_OP_FENCE)) {

                /* tengine_stonith_callback() confirms fence actions */
                crm_trace("Updating CIB %s stonithd reported fencing of %s complete",
                          (down->confirmed? "after" : "before"), node->uname);

            } else if ((alive == FALSE) && safe_str_eq(task, CRM_OP_SHUTDOWN)) {
                crm_notice("%s of peer %s is complete "CRM_XS" op=%d",
                           task, node->uname, down->id);

                /* down->confirmed = TRUE; */
                stop_te_timer(down->timer);

                if (!is_remote) {
                    flags |= node_update_join | node_update_expected;
                    crmd_peer_down(node, FALSE);
                    check_join_state(fsa_state, __FUNCTION__);
                }

                update_graph(transition_graph, down);
                trigger_graph();

            } else {
                crm_trace("Node %s is %salive, was expected to %s (op %d)",
                          node->uname, (alive? "" : "not "), task, down->id);
            }

        } else if (appeared == FALSE) {
            crm_notice("Stonith/shutdown of %s not matched", node->uname);

            if (!is_remote) {
                crm_update_peer_join(__FUNCTION__, node, crm_join_none);
                check_join_state(fsa_state, __FUNCTION__);
            }

            abort_transition(INFINITY, tg_restart, "Node failure", NULL);
            fail_incompletable_actions(transition_graph, node->uuid);

        } else {
            crm_trace("Node %s came up, was not expected to be down",
                      node->uname);
        }

        if (is_remote) {
            /* A pacemaker_remote node won't have its cluster status updated
             * in the CIB by membership-layer callbacks, so do it here.
             */
            flags |= node_update_cluster;

            /* Trigger resource placement on newly integrated nodes */
            if (appeared) {
                abort_transition(INFINITY, tg_restart,
                                 "pacemaker_remote node integrated", NULL);
            }
        }

        /* Update the CIB node state */
        update = create_node_state_update(node, flags, NULL, __FUNCTION__);
        fsa_cib_anon_update(XML_CIB_TAG_STATUS, update,
                            cib_scope_local | cib_quorum_override | cib_can_create);
        free_xml(update);
    }

    trigger_fsa(fsa_source);
}
Beispiel #8
0
void
peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
{
    uint32_t old = 0;
    uint32_t changed = 0;
    bool appeared = FALSE;
    const char *status = NULL;

    set_bit(fsa_input_register, R_PEER_DATA);
    if (node->uname == NULL) {
        return;
    }

    switch (type) {
        case crm_status_uname:
            /* If we've never seen the node, then it also wont be in the status section */
            crm_info("%s is now %s", node->uname, node->state);
            return;
        case crm_status_rstate:
            crm_info("Remote node %s is now %s (was %s)", node->uname, node->state, (const char *)data);
            /* Keep going */
        case crm_status_nstate:
            crm_info("%s is now %s (was %s)", node->uname, node->state, (const char *)data);
            if (safe_str_eq(data, node->state)) {
                /* State did not change */
                return;
            } else if(safe_str_eq(CRM_NODE_MEMBER, node->state)) {
                appeared = TRUE;
            }
            break;
        case crm_status_processes:
            if (data) {
                old = *(const uint32_t *)data;
                changed = node->processes ^ old;
            }

            /* crmd_proc_update(node, proc_flags); */
            status = (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS;
            crm_info("Client %s/%s now has status [%s] (DC=%s, changed=%6x)",
                     node->uname, peer2text(proc_flags), status,
                     AM_I_DC ? "true" : crm_str(fsa_our_dc), changed);

            if ((changed & proc_flags) == 0) {
                /* Peer process did not change */
                crm_trace("No change %6x %6x %6x", old, node->processes, proc_flags);
                return;
            } else if (is_not_set(fsa_input_register, R_CIB_CONNECTED)) {
                crm_trace("Not connected");
                return;
            } else if (fsa_state == S_STOPPING) {
                crm_trace("Stopping");
                return;
            }

            appeared = (node->processes & proc_flags) != 0;
            if (safe_str_eq(node->uname, fsa_our_uname) && (node->processes & proc_flags) == 0) {
                /* Did we get evicted? */
                crm_notice("Our peer connection failed");
                register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL);

            } else if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) {
                /* Did the DC leave us? */
                crm_notice("Our peer on the DC (%s) is dead", fsa_our_dc);
                register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);

            } else if(AM_I_DC && appeared == FALSE) {
                crm_info("Peer %s left us", node->uname);
                /* crm_update_peer_join(__FUNCTION__, node, crm_join_none); */
            }
            break;
    }

    if (AM_I_DC) {
        xmlNode *update = NULL;
        int flags = node_update_peer;
        gboolean alive = crm_is_peer_active(node);
        crm_action_t *down = match_down_event(0, node->uuid, NULL, appeared);

        crm_trace("Alive=%d, appear=%d, down=%p", alive, appeared, down);

        if (alive && type == crm_status_processes) {
            register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
        }

        if (down) {
            const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK);

            if (alive && safe_str_eq(task, CRM_OP_FENCE)) {
                crm_info("Node return implies stonith of %s (action %d) completed", node->uname,
                         down->id);
                erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local);
                erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
                /* down->confirmed = TRUE; Only stonith-ng returning should imply completion */
                down->sent_update = TRUE;       /* Prevent tengine_stonith_callback() from calling send_stonith_update() */

            } else if (safe_str_eq(task, CRM_OP_FENCE)) {
                crm_trace("Waiting for stonithd to report the fencing of %s is complete", node->uname); /* via tengine_stonith_callback() */

            } else if (alive == FALSE) {
                crm_notice("%s of %s (op %d) is complete", task, node->uname, down->id);
                /* down->confirmed = TRUE; Only stonith-ng returning should imply completion */
                stop_te_timer(down->timer);

                flags |= node_update_join | node_update_expected;
                crmd_peer_down(node, FALSE);
                check_join_state(fsa_state, __FUNCTION__);

                update_graph(transition_graph, down);
                trigger_graph();

            } else {
                crm_trace("Other %p", down);
            }

        } else if (appeared == FALSE) {
            crm_notice("Stonith/shutdown of %s not matched", node->uname);

            crm_update_peer_join(__FUNCTION__, node, crm_join_none);
            check_join_state(fsa_state, __FUNCTION__);

            abort_transition(INFINITY, tg_restart, "Node failure", NULL);
            fail_incompletable_actions(transition_graph, node->uuid);

        } else {
            crm_trace("Other %p", down);
        }

        update = do_update_node_cib(node, flags, NULL, __FUNCTION__);
        fsa_cib_anon_update(XML_CIB_TAG_STATUS, update,
                            cib_scope_local | cib_quorum_override | cib_can_create);
        free_xml(update);
    }

    trigger_fsa(fsa_source);
}
Beispiel #9
0
void
send_stonith_update(crm_action_t * action, const char *target, const char *uuid)
{
    int rc = pcmk_ok;
    crm_node_t *peer = NULL;

    /* We (usually) rely on the membership layer to do node_update_cluster,
     * and the peer status callback to do node_update_peer, because the node
     * might have already rejoined before we get the stonith result here.
     */
    int flags = node_update_join | node_update_expected;

    /* zero out the node-status & remove all LRM status info */
    xmlNode *node_state = NULL;

    CRM_CHECK(target != NULL, return);
    CRM_CHECK(uuid != NULL, return);

    /* Make sure the membership and join caches are accurate */
    peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);

    CRM_CHECK(peer != NULL, return);

    if (peer->state == NULL) {
        /* Usually, we rely on the membership layer to update the cluster state
         * in the CIB. However, if the node has never been seen, do it here, so
         * the node is not considered unclean.
         */
        flags |= node_update_cluster;
    }

    if (peer->uuid == NULL) {
        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
        peer->uuid = strdup(uuid);
    }

    crmd_peer_down(peer, TRUE);

    /* Generate a node state update for the CIB */
    node_state = do_update_node_cib(peer, flags, NULL, __FUNCTION__);

    /* we have to mark whether or not remote nodes have already been fenced */
    if (peer->flags & crm_remote_node) {
        time_t now = time(NULL);
        char *now_s = crm_itoa(now);
        crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
        free(now_s);
    }

    /* Force our known ID */
    crm_xml_add(node_state, XML_ATTR_UUID, uuid);

    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
                                    cib_quorum_override | cib_scope_local | cib_can_create);

    /* Delay processing the trigger until the update completes */
    crm_debug("Sending fencing update %d for %s", rc, target);
    fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);

    /* Make sure it sticks */
    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */

    erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local);
    erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);

    free_xml(node_state);
    return;
}
Beispiel #10
0
/* aka. this is notification that we have (or have not) been accepted */
void
do_cl_join_finalize_respond(long long action,
                            enum crmd_fsa_cause cause,
                            enum crmd_fsa_state cur_state,
                            enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    xmlNode *tmp1 = NULL;
    gboolean was_nack = TRUE;
    static gboolean first_join = TRUE;
    ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
    const char *start_state = daemon_option("node_start_state");

    int join_id = -1;
    const char *op = crm_element_value(input->msg, F_CRM_TASK);
    const char *ack_nack = crm_element_value(input->msg, CRM_OP_JOIN_ACKNAK);
    const char *welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);

    if (safe_str_neq(op, CRM_OP_JOIN_ACKNAK)) {
        crm_trace("Ignoring op=%s message", op);
        return;
    }

    /* calculate if it was an ack or a nack */
    if (crm_is_true(ack_nack)) {
        was_nack = FALSE;
    }

    crm_element_value_int(input->msg, F_CRM_JOIN_ID, &join_id);

    if (was_nack) {
        crm_err("Shutting down because cluster join with leader %s failed "
                CRM_XS" join-%d NACK'd", welcome_from, join_id);
        register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
        return;
    }

    if (AM_I_DC == FALSE && safe_str_eq(welcome_from, fsa_our_uname)) {
        crm_warn("Discarding our own welcome - we're no longer the DC");
        return;
    }

    if (update_dc(input->msg) == FALSE) {
        crm_warn("Discarding %s from node %s (expected from %s)",
                 op, welcome_from, fsa_our_dc);
        return;
    }

    update_dc_expected(input->msg);

    /* send our status section to the DC */
    tmp1 = do_lrm_query(TRUE, fsa_our_uname);
    if (tmp1 != NULL) {
        xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1, fsa_our_dc,
                                        CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);

        crm_xml_add_int(reply, F_CRM_JOIN_ID, join_id);

        crm_debug("Confirming join-%d: sending local operation history to %s",
                  join_id, fsa_our_dc);

        /*
         * If this is the node's first join since the crmd started on it, clear
         * any previous transient node attributes, to handle the case where
         * the node restarted so quickly that the cluster layer didn't notice.
         *
         * Do not remove the resources though, they'll be cleaned up in
         * do_dc_join_ack(). Removing them here creates a race condition if the
         * crmd is being recovered. Instead of a list of active resources from
         * the lrmd, we may end up with a blank status section. If we are _NOT_
         * lucky, we will probe for the "wrong" instance of anonymous clones and
         * end up with multiple active instances on the machine.
         */
        if (first_join && is_not_set(fsa_input_register, R_SHUTDOWN)) {
            first_join = FALSE;
            erase_status_tag(fsa_our_uname, XML_TAG_TRANSIENT_NODEATTRS, 0);
            update_attrd(fsa_our_uname, "terminate", NULL, NULL, FALSE);
            update_attrd(fsa_our_uname, XML_CIB_ATTR_SHUTDOWN, "0", NULL, FALSE);

            if (start_state) {
                set_join_state(start_state);
            }
        }

        send_cluster_message(crm_get_peer(0, fsa_our_dc), crm_msg_crmd, reply, TRUE);
        free_xml(reply);

        if (AM_I_DC == FALSE) {
            register_fsa_input_adv(cause, I_NOT_DC, NULL, A_NOTHING, TRUE, __FUNCTION__);
            update_attrd(NULL, NULL, NULL, NULL, FALSE);
        }

        free_xml(tmp1);

    } else {
        crm_err("Could not confirm join-%d with %s: Local operation history failed",
                join_id, fsa_our_dc);
        register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
    }
}
Beispiel #11
0
/*	A_DC_JOIN_PROCESS_ACK	*/
void
do_dc_join_ack(long long action,
               enum crmd_fsa_cause cause,
               enum crmd_fsa_state cur_state,
               enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    int join_id = -1;
    int call_id = 0;
    ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);

    const char *join_id_s = NULL;
    const char *join_state = NULL;
    const char *op = crm_element_value(join_ack->msg, F_CRM_TASK);
    const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);

    if (safe_str_neq(op, CRM_OP_JOIN_CONFIRM)) {
        crm_debug("Ignoring op=%s message from %s", op, join_from);
        return;
    }

    crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id);
    join_id_s = crm_element_value(join_ack->msg, F_CRM_JOIN_ID);

    /* now update them to "member" */

    crm_trace("Processing ack from %s", join_from);

    join_state = (const char *)
        g_hash_table_lookup(finalized_nodes, join_from);

    if (join_state == NULL) {
        crm_err("Join not in progress: ignoring join-%d from %s", join_id, join_from);
        return;

    } else if (safe_str_neq(join_state, CRMD_JOINSTATE_MEMBER)) {
        crm_err("Node %s wasnt invited to join the cluster", join_from);
        g_hash_table_remove(finalized_nodes, join_from);
        return;

    } else if (join_id != current_join_id) {
        crm_err("Invalid response from %s: join-%d vs. join-%d",
                join_from, join_id, current_join_id);
        g_hash_table_remove(finalized_nodes, join_from);
        return;
    }

    g_hash_table_remove(finalized_nodes, join_from);

    if (g_hash_table_lookup(confirmed_nodes, join_from) != NULL) {
        crm_err("join-%d: hash already contains confirmation from %s", join_id, join_from);
    }

    g_hash_table_insert(confirmed_nodes, strdup(join_from), strdup(join_id_s));

    crm_info("join-%d: Updating node state to %s for %s",
             join_id, CRMD_JOINSTATE_MEMBER, join_from);

    /* update CIB with the current LRM status from the node
     * We dont need to notify the TE of these updates, a transition will
     *   be started in due time
     */
    erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local);
    fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml,
                   cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL);
    add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, join_update_complete_callback);
    crm_debug("join-%d: Registered callback for LRM update %d", join_id, call_id);
}
Beispiel #12
0
/* aka. this is notification that we have (or have not) been accepted */
void
do_cl_join_finalize_respond(long long action,
                            enum crmd_fsa_cause cause,
                            enum crmd_fsa_state cur_state,
                            enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    xmlNode *tmp1 = NULL;
    gboolean was_nack = TRUE;
    static gboolean first_join = TRUE;
    ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);

    int join_id = -1;
    const char *op = crm_element_value(input->msg, F_CRM_TASK);
    const char *ack_nack = crm_element_value(input->msg, CRM_OP_JOIN_ACKNAK);
    const char *welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);

    if (safe_str_neq(op, CRM_OP_JOIN_ACKNAK)) {
        crm_trace("Ignoring op=%s message", op);
        return;
    }

    /* calculate if it was an ack or a nack */
    if (crm_is_true(ack_nack)) {
        was_nack = FALSE;
    }

    crm_element_value_int(input->msg, F_CRM_JOIN_ID, &join_id);

    if (was_nack) {
        crm_err("Join (join-%d) with leader %s failed (NACK'd): Shutting down",
                join_id, welcome_from);
        register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
        return;
    }

    if (AM_I_DC == FALSE && safe_str_eq(welcome_from, fsa_our_uname)) {
        crm_warn("Discarding our own welcome - we're no longer the DC");
        return;
    }

    if (update_dc(input->msg) == FALSE) {
        crm_warn("Discarding %s from %s (expected %s)", op, welcome_from, fsa_our_dc);
        return;
    }

    /* send our status section to the DC */
    crm_debug("Confirming join join-%d: %s", join_id, crm_element_value(input->msg, F_CRM_TASK));
    tmp1 = do_lrm_query(TRUE, fsa_our_uname);
    if (tmp1 != NULL) {
        xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1, fsa_our_dc,
                                        CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);

        crm_xml_add_int(reply, F_CRM_JOIN_ID, join_id);

        crm_debug("join-%d: Join complete."
                  "  Sending local LRM status to %s", join_id, fsa_our_dc);

        if (first_join) {
            first_join = FALSE;

            /*
             * Clear any previous transient node attribute and lrm operations
             *
             * Corosync has a nasty habit of not being able to tell if a
             *   node is returning or didn't leave in the first place.
             * This confuses Pacemaker because it never gets a "node up"
             *   event which is normally used to clean up the status section.
             *
             * Do not remove the resources though, they'll be cleaned up in
             *   do_dc_join_ack().  Removing them here creates a race
             *   condition if the crmd is being recovered.
             * Instead of a list of active resources from the lrmd
             *   we may end up with a blank status section.
             * If we are _NOT_ lucky, we will probe for the "wrong" instance
             *   of anonymous clones and end up with multiple active
             *   instances on the machine.
             */
            erase_status_tag(fsa_our_uname, XML_TAG_TRANSIENT_NODEATTRS, 0);

            /* Just in case attrd was still around too */
            if (is_not_set(fsa_input_register, R_SHUTDOWN)) {
                update_attrd(fsa_our_uname, "terminate", NULL, NULL, FALSE);
                update_attrd(fsa_our_uname, XML_CIB_ATTR_SHUTDOWN, "0", NULL, FALSE);
            }
        }

        send_cluster_message(crm_get_peer(0, fsa_our_dc), crm_msg_crmd, reply, TRUE);
        free_xml(reply);

        if (AM_I_DC == FALSE) {
            register_fsa_input_adv(cause, I_NOT_DC, NULL, A_NOTHING, TRUE, __FUNCTION__);
            update_attrd(NULL, NULL, NULL, NULL, FALSE);
        }

        free_xml(tmp1);

    } else {
        crm_err("Could not send our LRM state to the DC");
        register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
    }
}