Beispiel #1
0
static gboolean remote_op_timeout(gpointer userdata)
{
    remote_fencing_op_t *op = userdata;
    op->query_timer = 0;

    if(op->state == st_done) {
        crm_debug("Action %s (%s) for %s already completed", op->action, op->id, op->target);
        return FALSE;
    }

    crm_debug("Action %s (%s) for %s timed out", op->action, op->id, op->target);
    remote_op_done(op, NULL, -ETIME);
    op->state = st_failed;

    return FALSE;
}
Beispiel #2
0
int
stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op)
{
    xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);

    op->state = st_done;
    op->completed = time(NULL);
    op->delegate = strdup("a human");

    crm_notice("Injecting manual confirmation that %s is safely off/down",
               crm_element_value(dev, F_STONITH_TARGET));

    remote_op_done(op, msg, pcmk_ok, FALSE);

    /* Replies are sent via done_cb->stonith_send_async_reply()->do_local_reply() */
    return -EINPROGRESS;
}
Beispiel #3
0
static void
handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc)
{
    GListPtr iter = NULL;

    for (iter = op->duplicates; iter != NULL; iter = iter->next) {
        remote_fencing_op_t *other = iter->data;

        if (other->state == st_duplicate) {
            /* Ie. it hasn't timed out already */
            other->state = op->state;
            crm_debug("Peforming duplicate notification for %s@%s.%.8s = %s", other->client_name,
                      other->originator, other->id, pcmk_strerror(rc));
            remote_op_done(other, data, rc, TRUE);

        } else {
            crm_err("Skipping duplicate notification for %s@%s - %d", other->client_name,
                    other->originator, other->state);
        }
    }
}
Beispiel #4
0
void
call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
{
    const char *device = NULL;
    int timeout = op->base_timeout;

    crm_trace("State for %s.%.8s: %s %d", op->target, op->client_name, op->id, op->state);
    if (peer == NULL && !is_set(op->call_options, st_opt_topology)) {
        peer = stonith_choose_peer(op);
    }

    if (!op->op_timer_total) {
        int total_timeout = get_op_total_timeout(op, peer, op->base_timeout);

        op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout;
        op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
        report_timeout_period(op, op->total_timeout);
        crm_info("Total remote op timeout set to %d for fencing of node %s for %s.%.8s",
                 total_timeout, op->target, op->client_name, op->id);
    }

    if (is_set(op->call_options, st_opt_topology) && op->devices) {
        /* Ignore any preference, they might not have the device we need */
        /* When using topology, the stonith_choose_peer function pops off
         * the peer from the op's query results.  Make sure to calculate
         * the op_timeout before calling this function when topology is in use */
        peer = stonith_choose_peer(op);
        device = op->devices->data;
        timeout = get_device_timeout(peer, device, op->base_timeout);
    }

    if (peer) {
        int timeout_one = 0;
        xmlNode *query = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);

        crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
        crm_xml_add(query, F_STONITH_TARGET, op->target);
        crm_xml_add(query, F_STONITH_ACTION, op->action);
        crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
        crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
        crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
        crm_xml_add_int(query, F_STONITH_TIMEOUT, timeout);

        if (device) {
            timeout_one =
                TIMEOUT_MULTIPLY_FACTOR * get_device_timeout(peer, device, op->base_timeout);
            crm_info("Requesting that %s perform op %s %s with %s for %s (%ds)", peer->host,
                     op->action, op->target, device, op->client_name, timeout_one);
            crm_xml_add(query, F_STONITH_DEVICE, device);
            crm_xml_add(query, F_STONITH_MODE, "slave");

        } else {
            timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(peer, op->base_timeout);
            crm_info("Requesting that %s perform op %s %s for %s (%ds)",
                     peer->host, op->action, op->target, op->client_name, timeout_one);
            crm_xml_add(query, F_STONITH_MODE, "smart");
        }

        op->state = st_exec;
        if (op->op_timer_one) {
            g_source_remove(op->op_timer_one);
        }
        op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);

        send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, query, FALSE);
        peer->tried = TRUE;
        free_xml(query);
        return;

    } else if (op->owner == FALSE) {
        crm_err("The termination of %s for %s is not ours to control", op->target, op->client_name);

    } else if (op->query_timer == 0) {
        /* We've exhausted all available peers */
        crm_info("No remaining peers capable of terminating %s for %s (%d)", op->target,
                 op->client_name, op->state);
        CRM_LOG_ASSERT(op->state < st_done);
        remote_op_timeout(op);

    } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
        int rc = -EHOSTUNREACH;

        /* if the operation never left the query state,
         * but we have all the expected replies, then no devices
         * are available to execute the fencing operation. */
        if (op->state == st_query) {
           crm_info("None of the %d peers have devices capable of terminating %s for %s (%d)",
                   op->replies, op->target, op->client_name, op->state);

            rc = -ENODEV;
        } else {
           crm_info("None of the %d peers are capable of terminating %s for %s (%d)",
                   op->replies, op->target, op->client_name, op->state);
        }

        op->state = st_failed;
        remote_op_done(op, NULL, rc, FALSE);

    } else if (device) {
        crm_info("Waiting for additional peers capable of terminating %s with %s for %s.%.8s",
                 op->target, device, op->client_name, op->id);
    } else {
        crm_info("Waiting for additional peers capable of terminating %s for %s%.8s",
                 op->target, op->client_name, op->id);
    }
}
Beispiel #5
0
remote_fencing_op_t *
initiate_remote_stonith_op(crm_client_t * client, xmlNode * request, gboolean manual_ack)
{
    int query_timeout = 0;
    xmlNode *query = NULL;
    const char *client_id = NULL;
    remote_fencing_op_t *op = NULL;

    if (client) {
        client_id = client->id;
    } else {
        client_id = crm_element_value(request, F_STONITH_CLIENTID);
    }

    CRM_LOG_ASSERT(client_id != NULL);
    op = create_remote_stonith_op(client_id, request, FALSE);
    op->owner = TRUE;
    if (manual_ack) {
        crm_notice("Initiating manual confirmation for %s: %s",
                   op->target, op->id);
        return op;
    }
    
    CRM_CHECK(op->action, return NULL);

    if (stonith_topology_next(op) != pcmk_ok) {
        op->state = st_failed;
    }

    switch (op->state) {
        case st_failed:
            crm_warn("Initiation of remote operation %s for %s: failed (%s)", op->action,
                     op->target, op->id);
            remote_op_done(op, NULL, -EINVAL, FALSE);
            return op;

        case st_duplicate:
            crm_info("Initiating remote operation %s for %s: %s (duplicate)", op->action,
                     op->target, op->id);
            return op;

        default:
            crm_notice("Initiating remote operation %s for %s: %s (%d)", op->action, op->target,
                       op->id, op->state);
    }

    query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY, NULL, 0);

    crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
    crm_xml_add(query, F_STONITH_TARGET, op->target);
    crm_xml_add(query, F_STONITH_ACTION, op->action);
    crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
    crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
    crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
    crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);

    send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
    free_xml(query);

    query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
    op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);

    return op;
}
Beispiel #6
0
int
process_remote_stonith_exec(xmlNode * msg)
{
    int rc = 0;
    const char *id = NULL;
    const char *device = NULL;
    remote_fencing_op_t *op = NULL;
    xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);

    CRM_CHECK(dev != NULL, return -EPROTO);

    id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
    CRM_CHECK(id != NULL, return -EPROTO);

    dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR);
    CRM_CHECK(dev != NULL, return -EPROTO);

    crm_element_value_int(dev, F_STONITH_RC, &rc);

    device = crm_element_value(dev, F_STONITH_DEVICE);

    if (remote_op_list) {
        op = g_hash_table_lookup(remote_op_list, id);
    }

    if (op == NULL && rc == pcmk_ok) {
        /* Record successful fencing operations */
        const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);

        op = create_remote_stonith_op(client_id, dev, TRUE);
    }

    if (op == NULL) {
        /* Could be for an event that began before we started */
        /* TODO: Record the op for later querying */
        crm_info("Unknown or expired remote op: %s", id);
        return -EOPNOTSUPP;
    }

    if (op->devices && device && safe_str_neq(op->devices->data, device)) {
        crm_err
            ("Received outdated reply for device %s (instead of %s) to %s node %s. Operation already timed out at remote level.",
             device, op->devices->data, op->action, op->target);
        return rc;
    }

    if (safe_str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast")) {
        crm_debug("Marking call to %s for %s on behalf of %s@%s.%.8s: %s (%d)",
                  op->action, op->target, op->client_name, op->id, op->originator,
                  pcmk_strerror(rc), rc);
        if (rc == pcmk_ok) {
            op->state = st_done;
        } else {
            op->state = st_failed;
        }
        remote_op_done(op, msg, rc, FALSE);
        return pcmk_ok;
    } else if (safe_str_neq(op->originator, stonith_our_uname)) {
        /* If this isn't a remote level broadcast, and we are not the
         * originator of the operation, we should not be receiving this msg. */
        crm_err
            ("%s received non-broadcast fencing result for operation it does not own (device %s targeting %s)",
             stonith_our_uname, device, op->target);
        return rc;
    }

    if (is_set(op->call_options, st_opt_topology)) {
        const char *device = crm_element_value(msg, F_STONITH_DEVICE);

        crm_notice("Call to %s for %s on behalf of %s@%s: %s (%d)",
                   device, op->target, op->client_name, op->originator,
                   pcmk_strerror(rc), rc);

        /* We own the op, and it is complete. broadcast the result to all nodes
         * and notify our local clients. */
        if (op->state == st_done) {
            remote_op_done(op, msg, rc, FALSE);
            return rc;
        }

        /* An operation completed succesfully but has not yet been marked as done.
         * Continue the topology if more devices exist at the current level, otherwise
         * mark as done. */
        if (rc == pcmk_ok) {
            if (op->devices) {
                /* Success, are there any more? */
                op->devices = op->devices->next;
            }
            /* if no more devices at this fencing level, we are done,
             * else we need to contine with executing the next device in the list */
            if (op->devices == NULL) {
                crm_trace("Marking complex fencing op for %s as complete", op->target);
                op->state = st_done;
                remote_op_done(op, msg, rc, FALSE);
                return rc;
            }
        } else {
            /* This device failed, time to try another topology level. If no other
             * levels are available, mark this operation as failed and report results. */
            if (stonith_topology_next(op) != pcmk_ok) {
                op->state = st_failed;
                remote_op_done(op, msg, rc, FALSE);
                return rc;
            }
        }
    } else if (rc == pcmk_ok && op->devices == NULL) {
        crm_trace("All done for %s", op->target);

        op->state = st_done;
        remote_op_done(op, msg, rc, FALSE);
        return rc;
    }

    /* Retry on failure or execute the rest of the topology */
    crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator,
              op->client_name, rc);
    call_remote_stonith(op, NULL);
    return rc;
}
Beispiel #7
0
int process_remote_stonith_exec(xmlNode *msg) 
{
    int rc = 0;
    const char *id = NULL;
    remote_fencing_op_t *op = NULL;
    xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE, msg, LOG_ERR);

    CRM_CHECK(dev != NULL, return -EPROTO);

    id = crm_element_value(dev, F_STONITH_REMOTE);
    CRM_CHECK(id != NULL, return -EPROTO);

    dev = get_xpath_object("//@"F_STONITH_RC, msg, LOG_ERR);
    CRM_CHECK(dev != NULL, return -EPROTO);

    crm_element_value_int(dev, F_STONITH_RC, &rc);

    if(remote_op_list) {
        op = g_hash_table_lookup(remote_op_list, id);
    }

    if(op == NULL && rc == pcmk_ok) {
        /* Record successful fencing operations */
        const char *client_id = crm_element_value(msg, F_STONITH_CLIENTID);

        op = create_remote_stonith_op(client_id, msg, TRUE);
    }

    if(op == NULL) {
        /* Could be for an event that began before we started */
        /* TODO: Record the op for later querying */
        crm_info("Unknown or expired remote op: %s", id);
        return -EOPNOTSUPP;
    }

    if(is_set(op->call_options, st_opt_topology)) {
        const char *device = crm_element_value(msg, F_STONITH_DEVICE);

        crm_notice("Call to %s for %s on behalf of %s: %s (%d)", device, op->target, op->originator, rc == pcmk_ok?"passed":"failed", rc);
        if(safe_str_eq(op->originator, stonith_our_uname)) {

            if(op->state == st_done) {
                remote_op_done(op, msg, rc);
                return rc;

            } else if(rc == pcmk_ok && op->devices) {
                /* Success, are there any more? */
                op->devices = op->devices->next;
            }

            if(op->devices == NULL) {
                crm_trace("Broadcasting completion of complex fencing op for %s", op->target);
                send_cluster_message(NULL, crm_msg_stonith_ng, msg, FALSE);
                op->state = st_done;
                return rc;
            }

        } else {
            op->state = st_done;
            remote_op_done(op, msg, rc);
        }

    } else if(rc == pcmk_ok && op->devices == NULL) {
        crm_trace("All done for %s", op->target);

        op->state = st_done;
        remote_op_done(op, msg, rc);
        return rc;
    }

    /* Retry on failure or execute the rest of the topology */
    crm_trace("Next for %s (rc was %d)", op->target, rc);
    call_remote_stonith(op, NULL);
    return rc;
}