Example #1
0
/*
 * Callback run when the query timer of a remote fencing operation fires.
 *
 * data is the remote_fencing_op_t the timer belongs to.
 *
 * Depending on the operation's progress this either does nothing (already
 * done or executing), continues with whatever query results were collected,
 * or cancels the overall operation timer and runs the operation timeout
 * handler directly.
 *
 * Always returns FALSE (presumably so that GLib does not re-arm the timer;
 * confirm against where the timer is registered).
 */
static gboolean remote_op_query_timeout(gpointer data)
{
    remote_fencing_op_t *op = data;

    /* The timer has fired; forget its source id */
    op->query_timer = 0;

    if (op->state == st_done) {
        crm_debug("Operation %s for %s already completed", op->id, op->target);
        return FALSE;
    }

    if (op->state == st_exec) {
        crm_debug("Operation %s for %s already in progress", op->id, op->target);
        return FALSE;
    }

    if (op->query_results) {
        /* At least one peer replied: proceed with what we have */
        crm_debug("Query %s for %s complete: %d", op->id, op->target, op->state);
        call_remote_stonith(op, NULL);
        return FALSE;
    }

    /* No replies at all: drop the pending operation timer (if any) and
     * invoke the operation timeout handler right away. */
    if (op->op_timer) {
        g_source_remove(op->op_timer);
        op->op_timer = 0;
    }
    remote_op_timeout(op);

    return FALSE;
}
Example #2
0
/*
 * Callback run when the per-attempt timer (op_timer_one) of a remote fencing
 * operation fires.
 *
 * userdata is the remote_fencing_op_t the timer belongs to.
 *
 * Clears the stored timer id, logs the timeout and hands the operation back
 * to call_remote_stonith() with no preferred peer.
 *
 * Always returns FALSE.
 */
static gboolean
remote_op_timeout_one(gpointer userdata)
{
    remote_fencing_op_t *op = userdata;

    /* The timer has fired; forget its source id */
    op->op_timer_one = 0;

    /* "%.8s" prints at most the first 8 characters of the operation id
     * ("%8s" would only pad to a minimum width and print the whole id). */
    crm_notice("Remote %s operation on %s for %s.%.8s timed out",
               op->action, op->target, op->client_name, op->id);
    call_remote_stonith(op, NULL);
    return FALSE;
}
Example #3
0
/*
 * Process the result of a fencing action executed for a remote operation.
 *
 * msg is the XML reply; it must carry the F_STONITH_REMOTE_OP_ID and
 * F_STONITH_RC attributes somewhere in its tree.
 *
 * Returns:
 *   -EPROTO      if a required attribute is missing
 *   -EOPNOTSUPP  if the operation is unknown and could not be recorded
 *   pcmk_ok      after handling a "broadcast" result
 *   otherwise the result code (F_STONITH_RC) carried by the reply
 */
int
process_remote_stonith_exec(xmlNode * msg)
{
    int rc = 0;
    const char *id = NULL;
    const char *device = NULL;
    remote_fencing_op_t *op = NULL;
    xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);

    CRM_CHECK(dev != NULL, return -EPROTO);

    id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
    CRM_CHECK(id != NULL, return -EPROTO);

    dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR);
    CRM_CHECK(dev != NULL, return -EPROTO);

    crm_element_value_int(dev, F_STONITH_RC, &rc);

    device = crm_element_value(dev, F_STONITH_DEVICE);

    if (remote_op_list) {
        op = g_hash_table_lookup(remote_op_list, id);
    }

    if (op == NULL && rc == pcmk_ok) {
        /* Record successful fencing operations */
        const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);

        op = create_remote_stonith_op(client_id, dev, TRUE);
    }

    if (op == NULL) {
        /* Could be for an event that began before we started */
        /* TODO: Record the op for later querying */
        crm_info("Unknown or expired remote op: %s", id);
        return -EOPNOTSUPP;
    }

    /* A reply naming a device other than the one at the head of the op's
     * device list is treated as stale and ignored. */
    if (op->devices && device && safe_str_neq(op->devices->data, device)) {
        crm_err
            ("Received outdated reply for device %s (instead of %s) to %s node %s. Operation already timed out at remote level.",
             device, op->devices->data, op->action, op->target);
        return rc;
    }

    if (safe_str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast")) {
        /* Arguments follow the format: client@originator.<first 8 chars of id> */
        crm_debug("Marking call to %s for %s on behalf of %s@%s.%.8s: %s (%d)",
                  op->action, op->target, op->client_name, op->originator, op->id,
                  pcmk_strerror(rc), rc);
        if (rc == pcmk_ok) {
            op->state = st_done;
        } else {
            op->state = st_failed;
        }
        remote_op_done(op, msg, rc, FALSE);
        return pcmk_ok;
    } else if (safe_str_neq(op->originator, stonith_our_uname)) {
        /* If this isn't a remote level broadcast, and we are not the
         * originator of the operation, we should not be receiving this msg. */
        crm_err
            ("%s received non-broadcast fencing result for operation it does not own (device %s targeting %s)",
             stonith_our_uname, device, op->target);
        return rc;
    }

    if (is_set(op->call_options, st_opt_topology)) {
        /* Read from msg itself (the value above came from the node carrying
         * F_STONITH_RC); a distinct name avoids shadowing the outer variable. */
        const char *msg_device = crm_element_value(msg, F_STONITH_DEVICE);

        crm_notice("Call to %s for %s on behalf of %s@%s: %s (%d)",
                   msg_device, op->target, op->client_name, op->originator,
                   pcmk_strerror(rc), rc);

        /* We own the op, and it is complete. broadcast the result to all nodes
         * and notify our local clients. */
        if (op->state == st_done) {
            remote_op_done(op, msg, rc, FALSE);
            return rc;
        }

        /* An operation completed successfully but has not yet been marked as done.
         * Continue the topology if more devices exist at the current level, otherwise
         * mark as done. */
        if (rc == pcmk_ok) {
            if (op->devices) {
                /* Success, are there any more? */
                op->devices = op->devices->next;
            }
            /* if no more devices at this fencing level, we are done,
             * else we need to continue with executing the next device in the list */
            if (op->devices == NULL) {
                crm_trace("Marking complex fencing op for %s as complete", op->target);
                op->state = st_done;
                remote_op_done(op, msg, rc, FALSE);
                return rc;
            }
        } else {
            /* This device failed, time to try another topology level. If no other
             * levels are available, mark this operation as failed and report results. */
            if (stonith_topology_next(op) != pcmk_ok) {
                op->state = st_failed;
                remote_op_done(op, msg, rc, FALSE);
                return rc;
            }
        }
    } else if (rc == pcmk_ok && op->devices == NULL) {
        crm_trace("All done for %s", op->target);

        op->state = st_done;
        remote_op_done(op, msg, rc, FALSE);
        return rc;
    }

    /* Retry on failure or execute the rest of the topology */
    crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->client_name,
              op->originator, rc);
    call_remote_stonith(op, NULL);
    return rc;
}
Example #4
0
/*
 * Process a peer's reply to a fencing device query.
 *
 * msg is the XML reply; it must carry the F_STONITH_REMOTE_OP_ID and
 * "st-available-devices" attributes somewhere in its tree.
 *
 * The reply is counted against the operation, and - when it reports usable
 * devices - stored in the operation's sorted list of query results. If the
 * operation is still in the query state, fencing may be started right away
 * depending on topology, verified devices, the action and the number of
 * replies received so far.
 *
 * Returns:
 *   -EPROTO      if a required attribute (or the sender name, when a result
 *                has to be recorded) is missing
 *   -EOPNOTSUPP  if the operation is unknown
 *   -ENOMEM      if the result record could not be allocated
 *   pcmk_ok      otherwise
 */
int
process_remote_stonith_query(xmlNode * msg)
{
    int devices = 0;
    gboolean host_is_target = FALSE;
    const char *id = NULL;
    const char *host = NULL;
    remote_fencing_op_t *op = NULL;
    st_query_result_t *result = NULL;
    uint32_t active = fencing_active_peers();
    xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
    xmlNode *child = NULL;

    CRM_CHECK(dev != NULL, return -EPROTO);

    id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
    CRM_CHECK(id != NULL, return -EPROTO);

    dev = get_xpath_object("//@st-available-devices", msg, LOG_ERR);
    CRM_CHECK(dev != NULL, return -EPROTO);
    crm_element_value_int(dev, "st-available-devices", &devices);

    /* The table may not exist yet; treat that like an unknown operation */
    if (remote_op_list) {
        op = g_hash_table_lookup(remote_op_list, id);
    }
    if (op == NULL) {
        crm_debug("Unknown or expired remote op: %s", id);
        return -EOPNOTSUPP;
    }

    op->replies++;
    host = crm_element_value(msg, F_ORIG);
    host_is_target = safe_str_eq(host, op->target);

    if (devices <= 0) {
        /* If we're doing 'known' then we might need to fire anyway */
        crm_trace("Query result from %s (%d devices)", host, devices);
        if(op->state == st_query && (op->replies >= op->replies_expected || op->replies >= active)) {
            crm_info("All queries have arrived, continuing (%d, %d, %d) ", op->replies_expected, active, op->replies);
            call_remote_stonith(op, NULL);
        }
        return pcmk_ok;

    } else if (host_is_target) {
        if (op->call_options & st_opt_allow_suicide) {
            crm_trace("Allowing %s to potentialy fence itself", op->target);
        } else {
            crm_info("Ignoring reply from %s, hosts are not permitted to commit suicide",
                     op->target);
            return pcmk_ok;
        }
    }

    /* The sender name is copied into the result below; strdup(NULL) is
     * undefined, so reject replies that do not identify their sender. */
    CRM_CHECK(host != NULL, return -EPROTO);

    crm_info("Query result %d of %d from %s (%d devices)", op->replies, op->replies_expected, host, devices);
    result = calloc(1, sizeof(st_query_result_t));
    if (result != NULL) {
        result->host = strdup(host);
    }
    if (result == NULL || result->host == NULL) {
        crm_err("Could not allocate query result from %s for remote op %s", host, id);
        free(result);
        return -ENOMEM;
    }
    result->devices = devices;
    result->custom_action_timeouts = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL);
    result->verified_devices = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL);

    /* Record every device the peer listed, plus any per-device action
     * timeout and "verified" flag it reported. */
    for (child = __xml_first_child(dev); child != NULL; child = __xml_next(child)) {
        const char *device = ID(child);
        int action_timeout = 0;
        int verified = 0;

        if (device) {
            result->device_list = g_list_prepend(result->device_list, strdup(device));
            crm_element_value_int(child, F_STONITH_ACTION_TIMEOUT, &action_timeout);
            crm_element_value_int(child, F_STONITH_DEVICE_VERIFIED, &verified);
            if (action_timeout) {
                crm_trace("Peer %s with device %s returned action timeout %d",
                          result->host, device, action_timeout);
                g_hash_table_insert(result->custom_action_timeouts,
                                    strdup(device), GINT_TO_POINTER(action_timeout));
            }
            if (verified) {
                crm_trace("Peer %s has confirmed a verified device %s", result->host, device);
                g_hash_table_insert(result->verified_devices,
                                    strdup(device), GINT_TO_POINTER(verified));
            }
        }
    }

    CRM_CHECK(devices == g_list_length(result->device_list),
              crm_err("Mis-match: Query claimed to have %d devices but %d found", devices,
                      g_list_length(result->device_list)));

    /* From here on the operation's result list references the record */
    op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);

    if (is_set(op->call_options, st_opt_topology)) {
        /* If we start the fencing before all the topology results are in,
         * it is possible fencing levels will be skipped because of the missing
         * query results. */
        if (op->state == st_query && all_topology_devices_found(op)) {
            /* All the query results are in for the topology, start the fencing ops. */
            crm_trace("All topology devices found");
            call_remote_stonith(op, result);

        } else if(op->state == st_query && (op->replies >= op->replies_expected || op->replies >= active)) {
            crm_info("All topology queries have arrived, continuing (%d, %d, %d) ", op->replies_expected, active, op->replies);
            call_remote_stonith(op, NULL);
        }

    } else if (op->state == st_query) {
        /* We have a result for a non-topology fencing op that looks promising,
         * go ahead and start fencing before query timeout */
        if (host_is_target == FALSE && g_hash_table_size(result->verified_devices)) {
            /* we have a verified device living on a peer that is not the target */
            crm_trace("Found %d verified devices", g_hash_table_size(result->verified_devices));
            call_remote_stonith(op, result);

        } else if (safe_str_eq(op->action, "on")) {
            crm_trace("Unfencing %s", op->target);
            call_remote_stonith(op, result);

        } else if(op->replies >= op->replies_expected || op->replies >= active) {
            crm_info("All queries have arrived, continuing (%d, %d, %d) ", op->replies_expected, active, op->replies);
            call_remote_stonith(op, NULL);

        } else {
            crm_trace("Waiting for more peer results before launching fencing operation");
        }

    } else if (op->state == st_done) {
        crm_info("Discarding query result from %s (%d devices): Operation is in state %d",
                 result->host, result->devices, op->state);
    }

    return pcmk_ok;
}
Example #5
0
/*
 * Process the result of a fencing action executed for a remote operation.
 *
 * msg is the XML reply; it must carry the F_STONITH_REMOTE and F_STONITH_RC
 * attributes somewhere in its tree.
 *
 * Returns:
 *   -EPROTO      if a required attribute is missing
 *   -EOPNOTSUPP  if the operation is unknown and could not be recorded
 *   otherwise the result code (F_STONITH_RC) carried by the reply
 */
int process_remote_stonith_exec(xmlNode *msg) 
{
    int rc = 0;
    const char *id = NULL;
    remote_fencing_op_t *op = NULL;
    xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE, msg, LOG_ERR);

    CRM_CHECK(dev != NULL, return -EPROTO);

    id = crm_element_value(dev, F_STONITH_REMOTE);
    CRM_CHECK(id != NULL, return -EPROTO);

    dev = get_xpath_object("//@"F_STONITH_RC, msg, LOG_ERR);
    CRM_CHECK(dev != NULL, return -EPROTO);

    crm_element_value_int(dev, F_STONITH_RC, &rc);

    if(remote_op_list) {
        op = g_hash_table_lookup(remote_op_list, id);
    }

    if(op == NULL && rc == pcmk_ok) {
        /* Record successful fencing operations */
        const char *client_id = crm_element_value(msg, F_STONITH_CLIENTID);

        op = create_remote_stonith_op(client_id, msg, TRUE);
    }

    if(op == NULL) {
        /* Could be for an event that began before we started */
        /* TODO: Record the op for later querying */
        crm_info("Unknown or expired remote op: %s", id);
        return -EOPNOTSUPP;
    }

    if(is_set(op->call_options, st_opt_topology)) {
        const char *device = crm_element_value(msg, F_STONITH_DEVICE);

        crm_notice("Call to %s for %s on behalf of %s: %s (%d)", device, op->target, op->originator, rc == pcmk_ok?"passed":"failed", rc);
        if(safe_str_eq(op->originator, stonith_our_uname)) {

            if(op->state == st_done) {
                remote_op_done(op, msg, rc);
                return rc;

            } else if(rc == pcmk_ok && op->devices) {
                /* Success, are there any more? */
                op->devices = op->devices->next;
            }

            if(op->devices == NULL) {
                crm_trace("Broadcasting completion of complex fencing op for %s", op->target);
                send_cluster_message(NULL, crm_msg_stonith_ng, msg, FALSE);
                op->state = st_done;
                return rc;
            }

        } else {
            /* We did not originate this operation: record the outcome and
             * stop. Without this return the code fell through to the retry
             * path below and re-dispatched an operation just marked done. */
            op->state = st_done;
            remote_op_done(op, msg, rc);
            return rc;
        }

    } else if(rc == pcmk_ok && op->devices == NULL) {
        crm_trace("All done for %s", op->target);

        op->state = st_done;
        remote_op_done(op, msg, rc);
        return rc;
    }

    /* Retry on failure or execute the rest of the topology */
    crm_trace("Next for %s (rc was %d)", op->target, rc);
    call_remote_stonith(op, NULL);
    return rc;
}
Example #6
0
/*
 * Process a peer's reply to a fencing device query.
 *
 * msg is the XML reply; it must carry the F_STONITH_REMOTE and
 * "st-available-devices" attributes somewhere in its tree.
 *
 * The reply is counted against the operation and, when it reports usable
 * devices, stored in the operation's sorted list of query results. If the
 * operation is still in the query state and st_opt_all_replies is not set,
 * fencing is started immediately using this peer.
 *
 * Returns:
 *   -EPROTO      if a required attribute (or the sender name, when a result
 *                has to be recorded) is missing
 *   -EOPNOTSUPP  if the operation is unknown
 *   -ENOMEM      if the result record could not be allocated
 *   pcmk_ok      otherwise
 */
int process_remote_stonith_query(xmlNode *msg) 
{
    int devices = 0;
    const char *id = NULL;
    const char *host = NULL;
    remote_fencing_op_t *op = NULL;
    st_query_result_t *result = NULL;
    xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE, msg, LOG_ERR);
    xmlNode *child = NULL;

    CRM_CHECK(dev != NULL, return -EPROTO);

    id = crm_element_value(dev, F_STONITH_REMOTE);
    CRM_CHECK(id != NULL, return -EPROTO);

    dev = get_xpath_object("//@st-available-devices", msg, LOG_ERR);
    CRM_CHECK(dev != NULL, return -EPROTO);
    crm_element_value_int(dev, "st-available-devices", &devices);

    /* The table may not exist yet; treat that like an unknown operation */
    if(remote_op_list) {
        op = g_hash_table_lookup(remote_op_list, id);
    }
    if(op == NULL) {
        crm_debug("Unknown or expired remote op: %s", id);
        return -EOPNOTSUPP;
    }

    op->replies++;
    host = crm_element_value(msg, F_ORIG);

    if(devices <= 0) {
        /* If we're doing 'known' then we might need to fire anyway */
        crm_trace("Query result from %s (%d devices)", host, devices);
        return pcmk_ok;

    } else if(safe_str_eq(host, op->target)) {
        /* Only a reply from the target itself is subject to the suicide
         * check, so only log about it in that case. */
        if(op->call_options & st_opt_allow_suicide) {
            crm_trace("Allowing %s to potentialy fence itself", op->target);

        } else {
            crm_info("Ignoring reply from %s, hosts are not permitted to commit suicide", op->target);
            return pcmk_ok;
        }
    }

    /* The sender name is copied into the result below; strdup(NULL) is
     * undefined, so reject replies that do not identify their sender. */
    CRM_CHECK(host != NULL, return -EPROTO);

    crm_debug("Query result from %s (%d devices)", host, devices);
    result = calloc(1, sizeof(st_query_result_t));
    if(result != NULL) {
        result->host = strdup(host);
    }
    if(result == NULL || result->host == NULL) {
        crm_err("Could not allocate query result from %s for remote op %s", host, id);
        free(result);
        return -ENOMEM;
    }
    result->devices = devices;

    /* Record every device the peer listed */
    for (child = __xml_first_child(dev); child != NULL; child = __xml_next(child)) {
        const char *device = ID(child);
        if(device) {
            result->device_list = g_list_prepend(result->device_list, strdup(device));
        }
    }

    CRM_CHECK(devices == g_list_length(result->device_list),
              crm_err("Mis-match: Query claimed to have %d devices but %d found", devices, g_list_length(result->device_list)));

    /* From here on the operation's result list references the record */
    op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);

    if(op->state == st_query && is_set(op->call_options, st_opt_all_replies) == FALSE) {
        call_remote_stonith(op, result);

    } else if(op->state == st_done) {
        crm_info("Discarding query result from %s (%d devices): Operation is in state %d",
                 result->host, result->devices, op->state);
    }

    return pcmk_ok;
}