Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F4638757
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
49 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
index 5aad414e67..fa682a3201 100644
--- a/crmd/te_callbacks.c
+++ b/crmd/te_callbacks.c
@@ -1,838 +1,837 @@
/*
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <crm_internal.h>
#include <sys/stat.h>
#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/msg_xml.h>
#include <tengine.h>
#include <te_callbacks.h>
#include <crmd_fsa.h>
#include <crm/cluster.h> /* For ONLINESTATUS etc */
/* Prototype only; the definition is not in this part of the file */
void te_update_confirm(const char *event, xmlNode * msg);
/* Transition engine UUID; tengine_stonith_callback() compares it with the
 * UUID decoded from a fencing operation's transition key */
extern char *te_uuid;
gboolean shuttingdown = FALSE;
/* Graph inspected by the callbacks below; te_update_diff() treats NULL as "no graph" */
crm_graph_t *transition_graph;
crm_trigger_t *transition_trigger = NULL;
/* Earlier, node-specific variant of the template below (kept for reference) */
/* #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */
/* printf-style XPath template (one %s: the operation ID) used by
 * te_legacy_update_diff() to check whether a removed operation also appears
 * in the "added" section of the same diff */
#define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']"
/*!
 * \internal
 * \brief Find the ID of the node state element enclosing a resource operation
 *
 * Walks up the parent chain from \p rsc_op until an element named
 * XML_CIB_TAG_STATE is reached.
 *
 * \param[in] rsc_op  XML element to start from
 *
 * \return ID of the enclosing node state element, or NULL if there is none
 */
static const char *
get_node_id(xmlNode * rsc_op)
{
    xmlNode *ancestor = NULL;

    for (ancestor = rsc_op; ancestor != NULL; ancestor = ancestor->parent) {
        if (safe_str_neq(XML_CIB_TAG_STATE, TYPE(ancestor)) == FALSE) {
            /* Reached the node state element */
            break;
        }
    }

    CRM_CHECK(ancestor != NULL, return NULL);
    return ID(ancestor);
}
/*!
 * \internal
 * \brief Handle a format-1 ("legacy") CIB diff
 *
 * Called by te_update_diff() when the diff's "format" is 1. Runs a series of
 * XPath searches over the diff and either aborts the current transition
 * (configuration, ticket or transient attribute changes, LRM refreshes,
 * unexpected operation removals) or passes each added resource operation to
 * process_graph_event().
 *
 * \param[in] event  Notification event name (not used in this function)
 * \param[in] diff   Diff XML to inspect; the function returns early if NULL
 */
static void
te_legacy_update_diff(const char *event, xmlNode * diff)
{
int lpc, max;
/* Holds the most recent search result; whatever it holds is released at "bail" */
xmlXPathObject *xpathObj = NULL;
CRM_CHECK(diff != NULL, return);
xml_log_patchset(LOG_TRACE, __FUNCTION__, diff);
if (cib_config_changed(NULL, NULL, &diff)) {
abort_transition(INFINITY, tg_restart, "Non-status change", diff);
goto bail; /* configuration changed */
}
/* Tickets Attributes - Added/Updated */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_TICKETS);
if (numXpathResults(xpathObj) > 0) {
xmlNode *aborted = getXpathResult(xpathObj, 0);
abort_transition(INFINITY, tg_restart, "Ticket attribute: update", aborted);
goto bail;
}
freeXpathObject(xpathObj);
/* Tickets Attributes - Removed */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_CIB_TAG_TICKETS);
if (numXpathResults(xpathObj) > 0) {
xmlNode *aborted = getXpathResult(xpathObj, 0);
abort_transition(INFINITY, tg_restart, "Ticket attribute: removal", aborted);
goto bail;
}
freeXpathObject(xpathObj);
/* Transient Attributes - Added/Updated */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//"
XML_TAG_TRANSIENT_NODEATTRS "//" XML_CIB_TAG_NVPAIR);
max = numXpathResults(xpathObj);
for (lpc = 0; lpc < max; lpc++) {
xmlNode *attr = getXpathResult(xpathObj, lpc);
const char *name = crm_element_value(attr, XML_NVPAIR_ATTR_NAME);
const char *value = NULL;
/* Only the CRM_OP_PROBED attribute has its value examined ... */
if (safe_str_eq(CRM_OP_PROBED, name)) {
value = crm_element_value(attr, XML_NVPAIR_ATTR_VALUE);
}
/* ... so any other attribute (value stays NULL), or a CRM_OP_PROBED
 * value that crm_is_true() rejects, aborts the transition */
if (crm_is_true(value) == FALSE) {
abort_transition(INFINITY, tg_restart, "Transient attribute: update", attr);
crm_log_xml_trace(attr, "Abort");
goto bail;
}
}
freeXpathObject(xpathObj);
/* Transient Attributes - Removed */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//"
XML_TAG_TRANSIENT_NODEATTRS);
if (numXpathResults(xpathObj) > 0) {
xmlNode *aborted = getXpathResult(xpathObj, 0);
abort_transition(INFINITY, tg_restart, "Transient attribute: removal", aborted);
goto bail;
}
freeXpathObject(xpathObj);
/*
* Check for and fast-track the processing of LRM refreshes
* In large clusters this can result in _huge_ speedups
*
* Unfortunately we can only do so when there are no pending actions
* Otherwise we could miss updates we're waiting for and stall
*
*/
/* Reset first: the previous result was just freed, and the search below is conditional */
xpathObj = NULL;
if (transition_graph->pending == 0) {
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//"
XML_LRM_TAG_RESOURCE);
}
max = numXpathResults(xpathObj);
if (max > 1) {
/* Updates by, or in response to, TE actions will never contain updates
* for more than one resource at a time
*/
crm_debug("Detected LRM refresh - %d resources updated: Skipping all resource events", max);
crm_log_xml_trace(diff, "lrm-refresh");
abort_transition(INFINITY, tg_restart, "LRM Refresh", NULL);
goto bail;
}
freeXpathObject(xpathObj);
/* Process operation updates */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_LRM_TAG_RSC_OP);
if (numXpathResults(xpathObj)) {
/*
* Each match sits below a node_state element of the status section
* (status / node_state / lrm / lrm_resources / ...); get_node_id()
* walks back up to that node_state to find the node the operation ran on.
*/
/* NOTE: these declarations shadow the function-level lpc and max */
int lpc = 0, max = numXpathResults(xpathObj);
for (lpc = 0; lpc < max; lpc++) {
xmlNode *rsc_op = getXpathResult(xpathObj, lpc);
const char *node = get_node_id(rsc_op);
process_graph_event(rsc_op, node);
}
}
freeXpathObject(xpathObj);
/* Detect deleted (as opposed to replaced or added) actions - eg. crm_resource -C */
xpathObj = xpath_search(diff, "//" XML_TAG_DIFF_REMOVED "//" XML_LRM_TAG_RSC_OP);
max = numXpathResults(xpathObj);
for (lpc = 0; lpc < max; lpc++) {
int path_max = 0;
const char *op_id = NULL;
char *rsc_op_xpath = NULL;
xmlXPathObject *op_match = NULL;
xmlNode *match = getXpathResult(xpathObj, lpc);
CRM_LOG_ASSERT(match != NULL);
if(match == NULL) { continue; };
op_id = ID(match);
/* NOTE(review): op_id is passed to strlen() unchecked - confirm ID() cannot return NULL here */
/* Template length plus ID length is enough: the "%s" in the template is replaced by the ID */
path_max = strlen(rsc_op_template) + strlen(op_id) + 1;
rsc_op_xpath = calloc(1, path_max);
snprintf(rsc_op_xpath, path_max, rsc_op_template, op_id);
/* Was an operation with the same ID also added by this diff? */
op_match = xpath_search(diff, rsc_op_xpath);
if (numXpathResults(op_match) == 0) {
/* Prevent false positives by matching cancellations too */
const char *node = get_node_id(match);
crm_action_t *cancelled = get_cancel_action(op_id, node);
if (cancelled == NULL) {
crm_debug("No match for deleted action %s (%s on %s)", rsc_op_xpath, op_id,
node);
abort_transition(INFINITY, tg_restart, "Resource op removal", match);
/* Release the per-iteration allocations before leaving the loop */
freeXpathObject(op_match);
free(rsc_op_xpath);
goto bail;
} else {
crm_debug("Deleted lrm_rsc_op %s on %s was for graph event %d",
op_id, node, cancelled->id);
}
}
freeXpathObject(op_match);
free(rsc_op_xpath);
}
/* Common exit: release whichever search result is still held */
bail:
freeXpathObject(xpathObj);
}
/*!
 * \internal
 * \brief Process the resource history contained in one change of a CIB diff
 *
 * If the history covers more than one resource and no graph actions are
 * pending, the update is treated as an LRM refresh and the transition is
 * aborted. Otherwise every operation of every resource is passed to
 * process_graph_event().
 *
 * \param[in] node    Node ID handed on to process_graph_event()
 * \param[in] xml     An XML_CIB_TAG_LRM or XML_LRM_TAG_RESOURCES element
 *                    (NULL is accepted and ignored)
 * \param[in] change  Diff change element, used for trace logging only
 * \param[in] op      Diff operation name (currently unused)
 * \param[in] xpath   Diff path (currently unused)
 */
static void process_resource_updates(
    const char *node, xmlNode *xml, xmlNode *change, const char *op, const char *xpath)
{
    xmlNode *cIter = NULL;
    xmlNode *rsc = NULL;
    xmlNode *rsc_op = NULL;
    int num_resources = 0;

    if (xml == NULL) {
        return;
    }

    if (strcmp((const char *) xml->name, XML_CIB_TAG_LRM) == 0) {
        xml = first_named_child(xml, XML_LRM_TAG_RESOURCES);
        crm_trace("Got %p in %s", xml, XML_CIB_TAG_LRM);
        if (xml == NULL) {
            /* An lrm element without a resources child has nothing to
             * process; returning avoids dereferencing NULL below */
            return;
        }
    }

    CRM_ASSERT(strcmp((const char *) xml->name, XML_LRM_TAG_RESOURCES) == 0);

    for (cIter = xml->children; cIter; cIter = cIter->next) {
        num_resources++;
    }

    /*
     * Check for and fast-track the processing of LRM refreshes.
     * In large clusters this can result in _huge_ speedups.
     *
     * Unfortunately we can only do so when there are no pending actions,
     * otherwise we could miss updates we're waiting for and stall. This is
     * the same condition te_legacy_update_diff() applies.
     */
    if ((num_resources > 1) && (transition_graph->pending == 0)) {
        crm_debug("Detected LRM refresh - %d resources updated", num_resources);
        crm_log_xml_trace(change, "lrm-refresh");
        abort_transition(INFINITY, tg_restart, "LRM Refresh", NULL);
        return;
    }

    for (rsc = __xml_first_child(xml); rsc != NULL; rsc = __xml_next(rsc)) {
        crm_trace("Processing %s", ID(rsc));
        for (rsc_op = __xml_first_child(rsc); rsc_op != NULL; rsc_op = __xml_next(rsc_op)) {
            crm_trace("Processing %s", ID(rsc_op));
            process_graph_event(rsc_op, node);
        }
    }
}
#define NODE_PATT "/lrm[@id="
/*!
 * \internal
 * \brief Extract the node ID from the lrm element of a diff path
 *
 * Looks for NODE_PATT in \p xpath, skips the quote character that follows
 * it, and copies everything up to the next single quote.
 *
 * \param[in] xpath  Path to search (may be NULL)
 *
 * \return Newly allocated node ID that the caller must free(), or NULL if
 *         the pattern is absent, the ID is not terminated by a single
 *         quote, or memory could not be allocated
 */
static char *get_node_from_xpath(const char *xpath)
{
    const char *id_start = NULL;
    const char *id_end = NULL;
    char *nodeid = NULL;
    size_t id_len = 0;

    if (xpath == NULL) {
        return NULL;
    }

    id_start = strstr(xpath, NODE_PATT);
    if (id_start == NULL) {
        return NULL;
    }

    id_start += strlen(NODE_PATT);
    if (*id_start == '\0') {
        /* Path ends right after the pattern: there is no quote to skip */
        return NULL;
    }
    id_start++;                 /* skip the opening quote */

    id_end = strchr(id_start, '\'');
    if (id_end == NULL) {
        /* Malformed path: report "no node" instead of asserting */
        return NULL;
    }

    id_len = (size_t) (id_end - id_start);
    nodeid = malloc(id_len + 1);
    if (nodeid == NULL) {
        return NULL;
    }
    memcpy(nodeid, id_start, id_len);
    nodeid[id_len] = '\0';
    return nodeid;
}
/*!
 * \internal
 * \brief Extract the node ID from the node_state element of a diff path
 *
 * Copies the text between "node_state[@id='" and the next single quote.
 *
 * \param[in] xpath  Path to search (may be NULL)
 *
 * \return Newly allocated node ID that the caller must free(), or NULL if
 *         the path has no node_state ID, the ID is not terminated by a
 *         single quote, or memory could not be allocated
 */
static char *extract_node_uuid(const char *xpath)
{
    static const char prefix[] = "node_state[@id='";
    const char *id_start = NULL;
    const char *id_end = NULL;
    char *node_uuid = NULL;
    size_t id_len = 0;

    if (xpath == NULL) {
        return NULL;
    }

    id_start = strstr(xpath, prefix);
    if (id_start == NULL) {
        return NULL;
    }
    id_start += sizeof(prefix) - 1;     /* sizeof counts the trailing NUL */

    id_end = strchr(id_start, '\'');
    if (id_end == NULL) {
        return NULL;
    }

    /* Copy just the ID; no scratch copy of the whole path is needed */
    id_len = (size_t) (id_end - id_start);
    node_uuid = malloc(id_len + 1);
    if (node_uuid == NULL) {
        return NULL;
    }
    memcpy(node_uuid, id_start, id_len);
    node_uuid[id_len] = '\0';
    return node_uuid;
}
/*!
 * \internal
 * \brief Abort the transition unless a change is an expected node removal
 *
 * Any operation other than "delete" aborts immediately. For deletions, the
 * node ID is taken from \p xpath and looked up with match_down_event(); the
 * transition is left alone only when a matching action exists and has
 * already been executed.
 *
 * \param[in] xpath   Diff path of the change
 * \param[in] op      Diff operation name
 * \param[in] change  Diff change element passed to abort_transition()
 * \param[in] reason  Abort reason passed to abort_transition()
 */
static void abort_unless_down(const char *xpath, const char *op, xmlNode *change, const char *reason)
{
    char *uuid = NULL;
    crm_action_t *down_action = NULL;

    if (safe_str_neq(op, "delete")) {
        abort_transition(INFINITY, tg_restart, reason, change);
        return;
    }

    uuid = extract_node_uuid(xpath);
    if (uuid == NULL) {
        crm_err("Could not extract node ID from %s", xpath);
        abort_transition(INFINITY, tg_restart, reason, change);
        return;
    }

    down_action = match_down_event(0, uuid, NULL, FALSE);
    if (down_action != NULL && down_action->executed != false) {
        crm_trace("Expecting changes to %s (%s)", uuid, xpath);
    } else {
        crm_trace("Not expecting %s to be down (%s)", uuid, xpath);
        abort_transition(INFINITY, tg_restart, reason, change);
    }

    free(uuid);
}
/*!
 * \brief Handle a CIB diff notification on behalf of the transition engine
 *
 * Ignores the notification when there is no transition graph, when the
 * message carries a negative result code, or when the graph is complete and
 * the FSA is in a state other than S_IDLE, S_TRANSITION_ENGINE or
 * S_POLICY_ENGINE. Format-1 diffs are handed to te_legacy_update_diff();
 * for format-2 diffs each change element is examined and either aborts the
 * transition, confirms a cancelled operation, or has its resource operations
 * passed to process_graph_event().
 *
 * \param[in] event  Notification event name (only forwarded to the legacy handler)
 * \param[in] msg    Notification message holding the result code, the
 *                   operation name and the diff itself
 */
void
te_update_diff(const char *event, xmlNode * msg)
{
int rc = -EINVAL;
/* Format 1 is assumed when the diff has no "format" value */
int format = 1;
xmlNode *change = NULL;
const char *op = NULL;
xmlNode *diff = NULL;
int p_add[] = { 0, 0, 0 };
int p_del[] = { 0, 0, 0 };
CRM_CHECK(msg != NULL, return);
crm_element_value_int(msg, F_CIB_RC, &rc);
if (transition_graph == NULL) {
crm_trace("No graph");
return;
} else if (rc < pcmk_ok) {
crm_trace("Filter rc=%d (%s)", rc, pcmk_strerror(rc));
return;
} else if (transition_graph->complete == TRUE
&& fsa_state != S_IDLE
&& fsa_state != S_TRANSITION_ENGINE && fsa_state != S_POLICY_ENGINE) {
crm_trace("Filter state=%s, complete=%d", fsa_state2string(fsa_state),
transition_graph->complete);
return;
}
op = crm_element_value(msg, F_CIB_OPERATION);
diff = get_message_xml(msg, F_CIB_UPDATE_RESULT);
/* The version triples are only used for the debug message below */
xml_patch_versions(diff, p_add, p_del);
crm_debug("Processing (%s) diff: %d.%d.%d -> %d.%d.%d (%s)", op,
p_del[0], p_del[1], p_del[2], p_add[0], p_add[1], p_add[2],
fsa_state2string(fsa_state));
crm_element_value_int(diff, "format", &format);
switch(format) {
case 1:
te_legacy_update_diff(event, diff);
return;
case 2:
/* Cool, we know what to do here */
crm_log_xml_trace(diff, "Patch:Raw");
break;
default:
crm_warn("Unknown patch format: %d", format);
return;
}
/* Format 2: examine each change element in turn */
for (change = __xml_first_child(diff); change != NULL; change = __xml_next(change)) {
const char *name = NULL;
/* NOTE: shadows the function-level op (the CIB operation name) */
const char *op = crm_element_value(change, XML_DIFF_OP);
const char *xpath = crm_element_value(change, XML_DIFF_PATH);
xmlNode *match = NULL;
const char *node = NULL;
/* Locate the changed element, if any: for "create" it is the change's
 * first child, for "modify" the first child of the XML_DIFF_RESULT
 * element; other operations leave match (and name) NULL */
if(op == NULL) {
continue;
} else if(strcmp(op, "create") == 0) {
match = change->children;
} else if(strcmp(op, "move") == 0) {
continue;
} else if(strcmp(op, "modify") == 0) {
match = first_named_child(change, XML_DIFF_RESULT);
if(match) {
match = match->children;
}
}
if(match) {
name = (const char *)match->name;
}
crm_trace("Handling %s operation for %s %p, %s", op, xpath, match, name);
/* The path-based checks come first; the element-name checks further down
 * are only reached when none of the path patterns applies */
if(xpath == NULL) {
/* Version field, ignore */
} else if(strstr(xpath, "/cib/configuration")) {
abort_transition(INFINITY, tg_restart, "Non-status change", change);
break; /* Won't be packaged with any resource operations we may be waiting for */
} else if(strstr(xpath, "/"XML_CIB_TAG_TICKETS) || safe_str_eq(name, XML_CIB_TAG_TICKETS)) {
abort_transition(INFINITY, tg_restart, "Ticket attribute change", change);
break; /* Won't be packaged with any resource operations we may be waiting for */
} else if(strstr(xpath, "/"XML_TAG_TRANSIENT_NODEATTRS"[") || safe_str_eq(name, XML_TAG_TRANSIENT_NODEATTRS)) {
abort_unless_down(xpath, op, change, "Transient attribute change");
break; /* Won't be packaged with any resource operations we may be waiting for */
} else if(strstr(xpath, "/"XML_LRM_TAG_RSC_OP"[") && safe_str_eq(op, "delete")) {
/* A resource operation was deleted: either the confirmation of a
 * cancel action in the graph, or a removal that aborts the transition */
crm_action_t *cancel = NULL;
char *mutable_key = strdup(xpath);
char *key, *node_uuid;
/* Extract the part of xpath between last pair of single quotes */
key = strrchr(mutable_key, '\'');
if (key != NULL) {
*key = '\0';
key = strrchr(mutable_key, '\'');
}
/* key points at the opening quote (or is NULL); the post-increment
 * steps past the quote once the NULL test has been made */
if (key++ == NULL) {
crm_warn("Ignoring malformed CIB update (resource deletion)");
free(mutable_key);
continue;
}
/* NOTE(review): node_uuid may be NULL here and is used unchecked below */
node_uuid = extract_node_uuid(xpath);
cancel = get_cancel_action(key, node_uuid);
if (cancel == NULL) {
abort_transition(INFINITY, tg_restart, "Resource operation removal", change);
} else {
crm_info("Cancellation of %s on %s confirmed (%d)", key, node_uuid, cancel->id);
stop_te_timer(cancel->timer);
te_action_confirmed(cancel);
update_graph(transition_graph, cancel);
trigger_graph();
}
free(mutable_key);
free(node_uuid);
} else if(strstr(xpath, "/"XML_CIB_TAG_LRM"[") && safe_str_eq(op, "delete")) {
abort_unless_down(xpath, op, change, "Resource state removal");
} else if(strstr(xpath, "/"XML_CIB_TAG_STATE"[") && safe_str_eq(op, "delete")) {
abort_unless_down(xpath, op, change, "Node state removal");
} else if(name == NULL) {
crm_debug("No result for %s operation to %s", op, xpath);
CRM_ASSERT(strcmp(op, "delete") == 0 || strcmp(op, "move") == 0);
} else if(strcmp(name, XML_TAG_CIB) == 0) {
/* Whole CIB: process every node's history, then abort if a
 * configuration section is present as well */
xmlNode *state = NULL;
xmlNode *status = first_named_child(match, XML_CIB_TAG_STATUS);
xmlNode *config = first_named_child(match, XML_CIB_TAG_CONFIGURATION);
for (state = __xml_first_child(status); state != NULL; state = __xml_next(state)) {
xmlNode *lrm = first_named_child(state, XML_CIB_TAG_LRM);
node = ID(state);
process_resource_updates(node, lrm, change, op, xpath);
}
if(config) {
abort_transition(INFINITY, tg_restart, "Non-status change", change);
}
} else if(strcmp(name, XML_CIB_TAG_STATUS) == 0) {
xmlNode *state = NULL;
for (state = __xml_first_child(match); state != NULL; state = __xml_next(state)) {
xmlNode *lrm = first_named_child(state, XML_CIB_TAG_LRM);
node = ID(state);
process_resource_updates(node, lrm, change, op, xpath);
}
} else if(strcmp(name, XML_CIB_TAG_STATE) == 0) {
xmlNode *lrm = first_named_child(match, XML_CIB_TAG_LRM);
node = ID(match);
process_resource_updates(node, lrm, change, op, xpath);
} else if(strcmp(name, XML_CIB_TAG_LRM) == 0) {
node = ID(match);
process_resource_updates(node, match, change, op, xpath);
} else if(strcmp(name, XML_LRM_TAG_RESOURCES) == 0) {
/* From here down the changed element no longer carries the node ID,
 * so it is recovered from the path instead */
char *local_node = get_node_from_xpath(xpath);
process_resource_updates(local_node, match, change, op, xpath);
free(local_node);
} else if(strcmp(name, XML_LRM_TAG_RESOURCE) == 0) {
xmlNode *rsc_op;
char *local_node = get_node_from_xpath(xpath);
for (rsc_op = __xml_first_child(match); rsc_op != NULL; rsc_op = __xml_next(rsc_op)) {
process_graph_event(rsc_op, local_node);
}
free(local_node);
} else if(strcmp(name, XML_LRM_TAG_RSC_OP) == 0) {
char *local_node = get_node_from_xpath(xpath);
process_graph_event(match, local_node);
free(local_node);
} else {
crm_err("Ignoring %s operation for %s %p, %s", op, xpath, match, name);
}
}
}
/*!
 * \brief Process a message addressed to the transition engine
 *
 * The only command handled is CRM_OP_INVOKE_LRM sent by CRM_SYSTEM_LRMD (an
 * (N)ACK): every resource operation found in \p xml_data is passed to
 * process_graph_event(). A message without a task is silently accepted, and
 * any other command is logged as unknown.
 *
 * \param[in] msg       Message to process
 * \param[in] xml_data  Message payload searched for resource operations
 *
 * \return FALSE if the message is not addressed to CRM_SYSTEM_TENGINE or an
 *         (N)ACK contains no resource operation, TRUE otherwise
 */
gboolean
process_te_message(xmlNode * msg, xmlNode * xml_data)
{
    const char *from = crm_element_value(msg, F_ORIG);
    const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO);
    const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM);
    const char *ref = crm_element_value(msg, F_CRM_REFERENCE);
    const char *op = crm_element_value(msg, F_CRM_TASK);
    const char *type = crm_element_value(msg, F_CRM_MSG_TYPE);

    crm_trace("Processing %s (%s) message", op, ref);
    crm_log_xml_trace(msg, "ipc");

    if (op != NULL) {
        if (sys_to == NULL || strcasecmp(sys_to, CRM_SYSTEM_TENGINE) != 0) {
            crm_trace("Bad sys-to %s", crm_str(sys_to));
            return FALSE;
        }

        if (safe_str_eq(op, CRM_OP_INVOKE_LRM)
            && safe_str_eq(sys_from, CRM_SYSTEM_LRMD)) {
            xmlXPathObject *found_ops = NULL;
            int n_ops = 0;
            int idx = 0;

            crm_log_xml_trace(msg, "Processing (N)ACK");
            crm_debug("Processing (N)ACK %s from %s", crm_element_value(msg, F_CRM_REFERENCE), from);

            found_ops = xpath_search(xml_data, "//" XML_LRM_TAG_RSC_OP);
            n_ops = numXpathResults(found_ops);
            if (n_ops == 0) {
                /* An (N)ACK must carry at least one operation */
                crm_log_xml_err(msg, "Invalid (N)ACK");
                freeXpathObject(found_ops);
                return FALSE;
            }

            for (idx = 0; idx < n_ops; idx++) {
                xmlNode *rsc_op = getXpathResult(found_ops, idx);
                const char *node = get_node_id(rsc_op);

                process_graph_event(rsc_op, node);
            }
            freeXpathObject(found_ops);

        } else {
            crm_err("Unknown command: %s::%s from %s", type, op, sys_from);
        }
    }

    crm_trace("finished processing message");
    return TRUE;
}
/* Fencing failure records keyed by target name; created on first use by
 * st_fail_count_increment() and NULL until then */
GHashTable *stonith_failures = NULL;
/* Fencing failure bookkeeping for one target */
struct st_fail_rec {
/* Failures recorded since the record was created or last reset */
int count;
/* Result code given to the latest st_fail_count_increment() call (0 after a reset) */
int last_rc;
};
/*!
 * \brief Check whether fencing of any target should be given up on
 *
 * \return TRUE if any recorded target has failed more than 10 times or last
 *         failed with -ENODEV, FALSE otherwise (including when nothing has
 *         been recorded yet)
 */
gboolean
too_many_st_failures(void)
{
    GHashTableIter iter;
    const char *target = NULL;
    struct st_fail_rec *rec = NULL;

    if (stonith_failures != NULL) {
        g_hash_table_iter_init(&iter, stonith_failures);
        while (g_hash_table_iter_next(&iter, (gpointer *) & target, (gpointer *) & rec)) {
            if (rec->count > 10) {
                crm_notice("Too many failures to fence %s (%d), giving up", target, rec->count);
                return TRUE;
            }
            if (rec->last_rc == -ENODEV) {
                crm_notice("No devices found in cluster to fence %s, giving up", target);
                return TRUE;
            }
        }
    }

    return FALSE;
}
/*!
 * \brief Clear the recorded fencing failures for a target
 *
 * Does nothing when no failures have been recorded at all or none for
 * \p target.
 *
 * \param[in] target  Name of the fencing target
 */
void
st_fail_count_reset(const char *target)
{
    struct st_fail_rec *entry = NULL;

    if (stonith_failures == NULL) {
        return;
    }

    entry = g_hash_table_lookup(stonith_failures, target);
    if (entry == NULL) {
        return;
    }

    entry->count = 0;
    entry->last_rc = 0;
}
/*!
 * \internal
 * \brief Record one more fencing failure for a target
 *
 * Creates the failure table on first use. A target without a record gets a
 * new one with a count of 1; otherwise the existing count is incremented.
 * In both cases the record remembers \p rc as the latest result.
 *
 * \param[in] target  Name of the fencing target (ignored if NULL)
 * \param[in] rc      Result code of the failed fencing operation
 */
static void
st_fail_count_increment(const char *target, int rc)
{
    struct st_fail_rec *rec = NULL;

    if (target == NULL) {
        /* A NULL name can be neither hashed nor duplicated */
        return;
    }

    if (stonith_failures == NULL) {
        stonith_failures =
            g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, free);
    }

    rec = g_hash_table_lookup(stonith_failures, target);
    if (rec) {
        rec->count++;

    } else {
        char *key = strdup(target);

        rec = malloc(sizeof(*rec));
        if (key == NULL || rec == NULL) {
            /* Out of memory: drop this failure rather than insert a NULL key */
            free(key);
            free(rec);
            return;
        }

        rec->count = 1;
        /* The table owns key and rec from here on (see the destroy functions above) */
        g_hash_table_insert(stonith_failures, key, rec);
    }

    rec->last_rc = rc;
}
/*!
 * \brief Handle the result of a fencing operation
 *
 * Decodes the transition key carried as user data, ignores results that do
 * not belong to the current transition, and otherwise confirms or fails the
 * matching graph action and updates the per-target failure records.
 *
 * \param[in] stonith  Fencing connection (not used in this function)
 * \param[in] data     Callback data holding the call ID, result code and the
 *                     transition key (user data); the key is freed here on
 *                     every path that reaches the "bail" label
 */
void
tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
{
    char *uuid = NULL;
    int target_rc = -1;
    int stonith_id = -1;
    int transition_id = -1;
    crm_action_t *action = NULL;
    int call_id = data->call_id;
    int rc = data->rc;
    char *userdata = data->userdata;

    CRM_CHECK(userdata != NULL, return);
    crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
               pcmk_strerror(rc), rc);

    if (AM_I_DC == FALSE) {
        /* NOTE(review): userdata is not freed on this path, unlike every
         * path through "bail" - confirm who owns it when we are not the DC */
        return;
    }

    /* filter out old STONITH actions */
    CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, &target_rc),
              crm_err("Invalid event detected");
              goto bail;
        );

    if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid)
        || transition_graph->id != transition_id) {
        crm_info("Ignoring STONITH action initiated outside of the current transition");
        goto bail;
    }

    /* Look the action up without confirming it; confirmation happens below
     * only when the operation succeeded */
    action = get_action(stonith_id, FALSE);
    if (action == NULL) {
        crm_err("Stonith action not matched");
        goto bail;
    }

    stop_te_timer(action->timer);
    if (rc == pcmk_ok) {
        const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
        /* Named differently from the outer uuid (the decoded transition
         * UUID freed at "bail") so that one does not hide the other */
        const char *target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
        const char *op = crm_meta_value(action->params, "stonith_action");

        crm_debug("Stonith operation %d for %s passed", call_id, target);
        if (action->confirmed == FALSE) {
            te_action_confirmed(action);
            /* No update is sent for an "on" action */
            if (action->sent_update == FALSE && safe_str_neq("on", op)) {
                send_stonith_update(action, target, target_uuid);
            }
        }
        st_fail_count_reset(target);

    } else {
        const char *target = crm_element_value_const(action->xml, XML_LRM_ATTR_TARGET);

        action->failed = TRUE;
        crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
                   call_id, target, pcmk_strerror(rc));
        abort_transition(INFINITY, tg_restart, "Stonith failed", NULL);
        st_fail_count_increment(target, rc);
    }

    update_graph(transition_graph, action);
    trigger_graph();

  bail:
    free(userdata);
    free(uuid);
    return;
}
/*!
 * \brief CIB callback for the update made after a fencing operation
 *
 * Logs the outcome; a failed update additionally aborts the transition with
 * tg_shutdown.
 *
 * \param[in] msg        CIB reply, logged on failure
 * \param[in] call_id    CIB call ID
 * \param[in] rc         Result of the update (negative on failure)
 * \param[in] output     Unused
 * \param[in] user_data  String naming what the update was for (used in log messages)
 */
void
cib_fencing_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
    if (rc >= pcmk_ok) {
        crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
        return;
    }

    crm_err("Fencing update %d for %s: failed - %s (%d)",
            call_id, (char *)user_data, pcmk_strerror(rc), rc);
    crm_log_xml_warn(msg, "Failed update");
    abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL);
}
/*!
 * \brief CIB callback that logs a failed action update
 *
 * \param[in] msg        Unused
 * \param[in] call_id    CIB call ID
 * \param[in] rc         Result of the update (negative on failure)
 * \param[in] output     Unused
 * \param[in] user_data  Unused
 */
void
cib_action_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
    if (rc >= pcmk_ok) {
        return;
    }
    crm_err("Update %d FAILED: %s", call_id, pcmk_strerror(rc));
}
/*!
 * \brief Timer callback invoked when a graph action's timer pops
 *
 * Stops the timer and then, depending on the graph state and the timer's
 * reason, either ignores the timeout, only warns about it, or fails the
 * action and aborts the transition. For a lost resource action other than a
 * cancel, a timeout result is also recorded via cib_action_update().
 *
 * \param[in] data  The crm_action_timer_t that expired
 *
 * \return Always FALSE
 */
gboolean
action_timer_callback(gpointer data)
{
    crm_action_timer_t *timer = (crm_action_timer_t *) data;

    CRM_CHECK(data != NULL, return FALSE);

    stop_te_timer(timer);

    crm_warn("Timer popped (timeout=%d, abort_level=%d, complete=%s)",
             timer->timeout,
             transition_graph->abort_priority, transition_graph->complete ? "true" : "false");

    CRM_CHECK(timer->action != NULL, return FALSE);

    if (transition_graph->complete) {
        crm_warn("Ignoring timeout while not in transition");
        return FALSE;
    }

    if (timer->reason == timeout_action_warn) {
        /* Warn only, without checking the FSA state: we might be in
         * S_INTEGRATION or some other state waiting for this action so we
         * can close the transition and continue
         */
        print_action(LOG_WARNING, "Action missed its timeout: ", timer->action);
        return FALSE;
    }

    /* Otherwise the action is considered lost: fail it */
    {
        crm_action_t *lost = timer->action;
        const char *task = crm_element_value(lost->xml, XML_LRM_ATTR_TASK);

        print_action(LOG_ERR, "Aborting transition, action lost: ", lost);

        lost->failed = TRUE;
        te_action_confirmed(lost);
        abort_transition(INFINITY, tg_restart, "Action lost", NULL);

        update_graph(transition_graph, lost);
        trigger_graph();

        /* Only resource actions are recorded in the CIB, and cancels never are */
        if (lost->type == action_type_rsc && !safe_str_eq(task, RSC_CANCEL)) {
            cib_action_update(lost, PCMK_LRM_OP_TIMEOUT, PCMK_OCF_UNKNOWN_ERROR);
        }
    }

    return FALSE;
}
diff --git a/crmd/te_events.c b/crmd/te_events.c
index db780dd59c..73d04538ac 100644
--- a/crmd/te_events.c
+++ b/crmd/te_events.c
@@ -1,630 +1,628 @@
/*
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <crm_internal.h>
#include <sys/param.h>
#include <crm/crm.h>
#include <crm/cib.h>
#include <crm/msg_xml.h>
#include <crm/common/xml.h>
#include <tengine.h>
#include <crmd_fsa.h>
/* Fail-count value used after a failed stop; update_failcount() sets it to
 * INFINITY_S if it is still NULL when first needed */
char *failed_stop_offset = NULL;
/* Fail-count value used after a failed start; update_failcount() sets it to
 * INFINITY_S if it is still NULL when first needed */
char *failed_start_offset = NULL;
/*!
 * \brief Fail graph actions that can no longer complete because a node is down
 *
 * Walks every synapse that is neither confirmed nor failed and marks as
 * failed each unconfirmed, non-pseudo, non-fencing action whose target node
 * or router node is \p down_node. If any action was failed, the transition
 * is aborted.
 *
 * \param[in] graph      Transition graph to scan (NULL or complete graphs are ignored)
 * \param[in] down_node  ID of the node that went down
 *
 * \return TRUE if at least one action was failed, FALSE otherwise
 */
gboolean
fail_incompletable_actions(crm_graph_t * graph, const char *down_node)
{
    xmlNode *last_action = NULL;
    GListPtr gIter = NULL;
    GListPtr gIter2 = NULL;

    if (graph == NULL || graph->complete) {
        return FALSE;
    }

    for (gIter = graph->synapses; gIter != NULL; gIter = gIter->next) {
        synapse_t *synapse = (synapse_t *) gIter->data;

        if (synapse->confirmed || synapse->failed) {
            /* We've already been here */
            continue;
        }

        for (gIter2 = synapse->actions; gIter2 != NULL; gIter2 = gIter2->next) {
            /* Declared per action so that a router match found for one
             * action can never be applied to a later action that has no
             * router node of its own */
            const char *target_uuid = NULL;
            const char *router = NULL;
            const char *router_uuid = NULL;
            crm_action_t *action = (crm_action_t *) gIter2->data;

            if (action->type == action_type_pseudo || action->confirmed) {
                continue;
            } else if (action->type == action_type_crm) {
                const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);

                if (safe_str_eq(task, CRM_OP_FENCE)) {
                    continue;
                }
            }

            target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
            router = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
            if (router) {
                crm_node_t *node = crm_get_peer(0, router);

                if (node) {
                    router_uuid = node->uuid;
                }
            }

            if (safe_str_eq(target_uuid, down_node) || safe_str_eq(router_uuid, down_node)) {
                action->failed = TRUE;
                synapse->failed = TRUE;
                last_action = action->xml;
                stop_te_timer(action->timer);
                update_graph(graph, action);

                if (synapse->executed) {
                    crm_notice("Action %d (%s) was pending on %s (offline)",
                               action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node);
                } else {
                    crm_info("Action %d (%s) is scheduled for %s (offline)",
                             action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node);
                }
            }
        }
    }

    if (last_action != NULL) {
        crm_info("Node %s shutdown resulted in un-runnable actions", down_node);
        abort_transition(INFINITY, tg_restart, "Node failure", last_action);
        return TRUE;
    }

    return FALSE;
}
/*!
 * \internal
 * \brief Update a resource's failure-related node attributes if warranted
 *
 * Successes, direct nacks and old failures replayed by an LRM status refresh
 * are ignored. Otherwise failed recurring operations, promotes, demotes,
 * starts and stops always trigger an update; other operations only when
 * \p do_update is set. The fail count is incremented unless the
 * start or stop failure offset is INFINITY_S, in which case that value is
 * used instead.
 *
 * \param[in] event            XML describing the operation that (maybe) failed
 * \param[in] event_node_uuid  Node that the event occurred on
 * \param[in] rc               Actual operation return code
 * \param[in] target_rc        Expected operation return code
 * \param[in] do_update        If TRUE, update regardless of operation type
 * \param[in] ignore_failures  If TRUE, update last failure but not fail count
 *
 * \return TRUE if this was not a direct nack, success or lrm status refresh
 */
static gboolean
update_failcount(xmlNode * event, const char *event_node_uuid, int rc,
                 int target_rc, gboolean do_update, gboolean ignore_failures)
{
    int interval = 0;
    char *task = NULL;
    char *rsc_id = NULL;
    char **offset = NULL;       /* which failure offset applies, if any */
    const char *value = NULL;

    const char *op_key = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
    const char *uname = crm_peer_uname(event_node_uuid);
    const char *origin = crm_element_value(event, XML_ATTR_ORIGIN);

    /* Success and direct nack (internal code for "busy, try again")
     * need no update
     */
    if ((rc == CRM_DIRECT_NACK_RC) || (rc == target_rc)) {
        return FALSE;
    }

    /* Neither does a failure that an lrm status refresh merely replays */
    if (safe_str_eq(origin, "build_active_RAs")) {
        crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
                  op_key, rc, uname);
        return FALSE;
    }

    /* Sanity check */
    CRM_CHECK(uname != NULL, return TRUE);
    CRM_CHECK(parse_op_key(op_key, &rsc_id, &task, &interval),
              crm_err("Couldn't parse: %s", ID(event)); goto bail);
    CRM_CHECK(task != NULL, goto bail);
    CRM_CHECK(rsc_id != NULL, goto bail);

    /* Decide whether update is necessary and what value to use */
    if ((interval > 0) || safe_str_eq(task, CRMD_ACTION_PROMOTE)
        || safe_str_eq(task, CRMD_ACTION_DEMOTE)) {
        do_update = TRUE;

    } else if (safe_str_eq(task, CRMD_ACTION_START)) {
        offset = &failed_start_offset;

    } else if (safe_str_eq(task, CRMD_ACTION_STOP)) {
        offset = &failed_stop_offset;
    }

    if (offset != NULL) {
        /* Start and stop failures always update, using their own offset */
        do_update = TRUE;
        if (*offset == NULL) {
            *offset = strdup(INFINITY_S);
        }
        value = *offset;
    }

    /* Fail count will be either incremented or set to infinity */
    if (value == NULL || safe_str_neq(value, INFINITY_S)) {
        value = XML_NVPAIR_ATTR_VALUE "++";
    }

    if (do_update) {
        char *now = crm_itoa(time(NULL));
        char *attr_name = NULL;
        gboolean is_remote_node = FALSE;

        if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) {
            is_remote_node = TRUE;
        }

        crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)",
                 (ignore_failures? "last failure" : "failcount"),
                 rsc_id, uname, task, rc, value, now);

        /* The fail count itself is left alone when failures are ignored */
        if (!ignore_failures) {
            attr_name = crm_concat("fail-count", rsc_id, '-');
            update_attrd(uname, attr_name, value, NULL, is_remote_node);
            free(attr_name);
        }

        /* The last failure time is always updated, so that the failure can
         * still be detected and shown, e.g. by crm_mon
         */
        attr_name = crm_concat("last-failure", rsc_id, '-');
        update_attrd(uname, attr_name, now, NULL, is_remote_node);
        free(attr_name);

        free(now);
    }

  bail:
    free(rsc_id);
    free(task);
    return TRUE;
}
/*!
 * \internal
 * \brief Reduce an operation result to "done" or "error" using its return code
 *
 * \param[in] action       CRM action instance of the operation
 * \param[in] orig_status  Originally reported operation status (only used
 *                         to trace a re-mapping)
 * \param[in] rc           Actual operation return code
 * \param[in] target_rc    Expected operation return code
 *
 * \return PCMK_LRM_OP_DONE if rc equals target_rc, PCMK_LRM_OP_ERROR otherwise
 *
 * \note This assumes that PCMK_LRM_OP_PENDING operations have already been
 *       filtered (otherwise they will get simplified as well).
 */
static int
status_from_rc(crm_action_t * action, int orig_status, int rc, int target_rc)
{
    if (rc != target_rc) {
        /* A direct nack is an error too, but is not worth a warning */
        if (rc != CRM_DIRECT_NACK_RC) {
            const char *op_key = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
            const char *on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);

            crm_warn("Action %d (%s) on %s failed (target: %d vs. rc: %d): %s",
                     action->id, op_key, on_node, target_rc, rc,
                     services_lrm_status_str(PCMK_LRM_OP_ERROR));
        }
        return PCMK_LRM_OP_ERROR;
    }

    crm_trace("Target rc: == %d", rc);
    if (orig_status != PCMK_LRM_OP_DONE) {
        crm_trace("Re-mapping op status to PCMK_LRM_OP_DONE for rc=%d", rc);
    }
    return PCMK_LRM_OP_DONE;
}
/*!
 * \internal
 * \brief React to a confirmed action on a remote-node connection resource
 *
 * Detects when a remote node has failed or has been re-integrated into the
 * cluster: a failed connection (other than a failed probe) fails all actions
 * still in flight on that node, and a successful start of a node currently
 * in the "lost" state aborts the transition so that resources can be placed
 * on it.
 *
 * \param[in] action  Confirmed graph action
 * \param[in] event   Event XML, passed on when the transition is aborted
 */
static void
process_remote_node_action(crm_action_t *action, xmlNode *event)
{
    xmlNode *child = NULL;
    const char *task = NULL;

    if (crm_remote_peer_cache_size() == 0) {
        return;
    } else if (action->type != action_type_rsc) {
        return;
    } else if (action->confirmed == FALSE) {
        return;
    }

    task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
    if (!action->failed && safe_str_neq(task, "start")) {
        /* We only care about failed remote nodes, or remote nodes that have
         * just come online, so skip only what is neither. Both conditions
         * must hold: skipping on either one alone would drop every
         * successful start and every non-start failure handled below. */
        return;
    }

    for (child = __xml_first_child(action->xml); child != NULL; child = __xml_next(child)) {
        const char *provider;
        const char *type;
        const char *rsc;
        crm_node_t *remote_peer;

        if (safe_str_neq(crm_element_name(child), XML_CIB_TAG_RESOURCE)) {
            continue;
        }

        provider = crm_element_value(child, XML_AGENT_ATTR_PROVIDER);
        type = crm_element_value(child, XML_ATTR_TYPE);
        rsc = ID(child);

        /* Only the pacemaker:remote connection resource is of interest */
        if (safe_str_neq(provider, "pacemaker") || safe_str_neq(type, "remote") || rsc == NULL) {
            break;
        }

        remote_peer = crm_get_peer_full(0, rsc, CRM_GET_PEER_REMOTE);
        if (remote_peer == NULL) {
            break;
        }

        /* If a remote node connection failed, and this failure is not related
         * to a probe action, make sure to cancel any in-flight operations
         * occurring on that remote node since those actions will time out.
         * We don't want to wait around for the timeouts. */
        if (action->failed &&
            !(safe_str_eq(task, "monitor") && action->interval == 0)) {

            /* The rsc id is actually the remote node id. We want to mark all
             * in-flight actions on a failed remote node as incompletable. */
            fail_incompletable_actions(transition_graph, rsc);

        } else if (!action->failed &&
                   safe_str_eq(remote_peer->state, CRM_NODE_LOST) &&
                   safe_str_eq(task, "start")) {
            /* A remote node will be placed in the "lost" state after it has
             * been successfully fenced. After successfully connecting to a
             * remote node after being fenced, we need to abort the transition
             * so resources can be placed on the newly integrated remote node. */
            abort_transition(INFINITY, tg_restart, "Remote-node re-discovered.", event);
        }

        /* Only the first resource element is examined */
        return;
    }
}
/*!
 * \internal
 * \brief Confirm an action and update the graph, aborting the transition on failure
 *
 * \param[in,out] action           CRM action instance of this operation
 * \param[in]     event            Event instance of this operation
 * \param[in]     op_status        Originally reported operation status; it is
 *                                 re-mapped from the return codes before use
 * \param[in]     op_rc            Actual operation return code
 * \param[in]     target_rc        Expected operation return code
 * \param[in]     ignore_failures  Whether to ignore operation failures
 *
 * \note This assumes that PCMK_LRM_OP_PENDING operations have already been
 *       filtered (otherwise they may be treated as failures).
 */
static void
match_graph_event(crm_action_t *action, xmlNode *event, int op_status,
                  int op_rc, int target_rc, gboolean ignore_failures)
{
    const char *on_node = NULL;
    const char *event_key = NULL;
    const char *suffix = "";

    /* Remap operation status based on return code */
    op_status = status_from_rc(action, op_status, op_rc, target_rc);

    /* Process OP status */
    if (op_status == PCMK_LRM_OP_DONE) {
        /* Success: nothing to flag */

    } else if (op_status == PCMK_LRM_OP_ERROR
               || op_status == PCMK_LRM_OP_TIMEOUT
               || op_status == PCMK_LRM_OP_NOTSUPPORTED) {
        if (ignore_failures) {
            suffix = ", ignoring failure";
        } else {
            action->failed = TRUE;
        }

    } else if (op_status == PCMK_LRM_OP_CANCELLED) {
        /* do nothing?? */
        crm_err("Don't know what to do for cancelled ops yet");

    } else {
        /* Anything else, e.g. PCMK_LRM_OP_ERROR_HARD,
         * PCMK_LRM_OP_ERROR_FATAL or PCMK_LRM_OP_NOT_INSTALLED
         */
        action->failed = TRUE;
        crm_err("Unsupported action result: %d", op_status);
    }

    /* stop this event's timer if it had one */
    stop_te_timer(action->timer);
    te_action_confirmed(action);

    update_graph(transition_graph, action);
    trigger_graph();

    if (action->failed) {
        abort_transition(action->synapse->priority + 1, tg_restart, "Event failed", event);
    }

    event_key = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
    on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
    crm_info("Action %s (%d) confirmed on %s (rc=%d%s)",
             crm_str(event_key), action->id, crm_str(on_node), op_rc, suffix);

    /* determine if this action affects a remote-node's online/offline status */
    process_remote_node_action(action, event);
}
/*!
 * \brief Find an action in the current transition graph by its ID
 *
 * Walks every synapse of the global transition_graph and every action within
 * each synapse, returning the first action whose ID equals \p id.
 *
 * \param[in] id         Action ID to look for
 * \param[in] confirmed  If TRUE, stop the matching action's timer and mark the
 *                       action as confirmed before returning it
 *
 * \return The matching action, or NULL if no action has that ID
 */
crm_action_t *
get_action(int id, gboolean confirmed)
{
    GListPtr syn_item = NULL;

    for (syn_item = transition_graph->synapses; syn_item != NULL; syn_item = syn_item->next) {
        synapse_t *syn = (synapse_t *) syn_item->data;
        GListPtr act_item = NULL;

        for (act_item = syn->actions; act_item != NULL; act_item = act_item->next) {
            crm_action_t *candidate = (crm_action_t *) act_item->data;

            if (candidate->id != id) {
                continue;
            }

            if (confirmed) {
                stop_te_timer(candidate->timer);
                te_action_confirmed(candidate);
            }
            return candidate;
        }
    }

    return NULL;
}
/*!
 * \brief Find a cancel action in the current transition graph
 *
 * Searches every action of every synapse in the global transition_graph for
 * one whose task is CRMD_ACTION_CANCEL, whose task key equals \p id and,
 * when \p node is given, whose target UUID equals \p node.
 *
 * \param[in] id    Operation key that the cancel action must carry
 * \param[in] node  Target node UUID to match, or NULL to accept any node
 *
 * \return The first matching cancel action, or NULL if there is none
 */
crm_action_t *
get_cancel_action(const char *id, const char *node)
{
    GListPtr gIter = NULL;
    GListPtr gIter2 = NULL;

    gIter = transition_graph->synapses;
    for (; gIter != NULL; gIter = gIter->next) {
        synapse_t *synapse = (synapse_t *) gIter->data;

        gIter2 = synapse->actions;
        for (; gIter2 != NULL; gIter2 = gIter2->next) {
            const char *task = NULL;
            const char *target = NULL;
            crm_action_t *action = (crm_action_t *) gIter2->data;

            /* Only cancel actions are of interest */
            task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
            if (safe_str_neq(CRMD_ACTION_CANCEL, task)) {
                continue;
            }

            /* From here on, "task" holds the operation key, not the task name */
            task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
            if (safe_str_neq(task, id)) {
                /* task, id and node may all be NULL here, so guard the %s arguments */
                crm_trace("Wrong key %s for %s on %s",
                          crm_str(task), crm_str(id), crm_str(node));
                continue;
            }

            /* The node check is skipped entirely when the caller passed no node */
            target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
            if (node && safe_str_neq(target, node)) {
                crm_trace("Wrong node %s for %s on %s",
                          crm_str(target), crm_str(id), crm_str(node));
                continue;
            }

            crm_trace("Found %s on %s", crm_str(id), crm_str(node));
            return action;
        }
    }

    return NULL;
}
/*!
 * \brief Find the graph action that corresponds to a node going down
 *
 * Searches the global transition_graph. If \p id is positive, the first action
 * with exactly that ID is returned regardless of its type, task or target.
 * Otherwise only actions of type action_type_crm are considered, excluding
 * CRM_OP_LRM_REFRESH tasks and (when \p filter is given) any task other than
 * \p filter; the first such action whose target UUID equals \p target matches.
 *
 * \param[in] id      Action ID to match, or a value <= 0 to match by target
 * \param[in] target  Node UUID the action must target (used for matching when
 *                    \p id is not positive, and in log messages)
 * \param[in] filter  If not NULL, only actions with this task name are eligible
 * \param[in] quiet   If true, do not warn when a search by target finds nothing
 *
 * \return The matching action, or NULL if none was found
 */
crm_action_t *
match_down_event(int id, const char *target, const char *filter, bool quiet)
{
    const char *this_action = NULL;
    const char *this_node = NULL;
    crm_action_t *match = NULL;

    GListPtr gIter = NULL;
    GListPtr gIter2 = NULL;

    gIter = transition_graph->synapses;
    for (; gIter != NULL; gIter = gIter->next) {
        synapse_t *synapse = (synapse_t *) gIter->data;

        /* lookup event */
        gIter2 = synapse->actions;
        for (; gIter2 != NULL; gIter2 = gIter2->next) {
            crm_action_t *action = (crm_action_t *) gIter2->data;

            /* An explicit ID wins over every other criterion */
            if (id > 0 && action->id == id) {
                match = action;
                break;
            }

            this_action = crm_element_value(action->xml, XML_LRM_ATTR_TASK);

            if (action->type != action_type_crm) {
                continue;

            } else if (safe_str_eq(this_action, CRM_OP_LRM_REFRESH)) {
                continue;

            } else if (filter != NULL && safe_str_neq(this_action, filter)) {
                continue;
            }

            this_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);

            if (this_node == NULL) {
                crm_log_xml_err(action->xml, "No node uuid");
            }

            if (safe_str_neq(this_node, target)) {
                /* this_node may be NULL (see above), so guard the %s argument */
                crm_debug("Action %d : Node mismatch: %s", action->id, crm_str(this_node));
                continue;
            }

            match = action;
            id = action->id;    /* remembered only for the log message below */
            break;
        }

        if (match != NULL) {
            break;
        }
    }

    if (match != NULL) {
        crm_debug("Match found for action %d: %s on %s", id,
                  crm_str(crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY)),
                  crm_str(target));

    } else if (id > 0) {
        crm_err("No match for action %d", id);

    } else if (quiet == FALSE) {
        crm_warn("No match for shutdown action on %s", crm_str(target));
    }

    return match;
}
/*!
 * \brief Process an operation result event against the current transition graph
 *
 * Decodes the event's transition key and classifies the event: initiated
 * outside the cluster, initiated by a different node, belonging to another
 * transition, arriving after the graph completed, referring to an unknown
 * action, or matching an action of the current graph. Every case except a
 * successful match aborts the transition; a match is handed to
 * match_graph_event(). Events with status PCMK_LRM_OP_PENDING are ignored
 * once their transition key has been decoded successfully.
 *
 * \param[in] event       Operation result XML (must not be NULL)
 * \param[in] event_node  Passed through to update_failcount()
 *                        (presumably the node the event came from; confirm
 *                        against callers)
 *
 * \return TRUE if the event was initiated by a different node or belongs to a
 *         different transition, and update_failcount() did not return true for
 *         it; FALSE in every other case
 */
gboolean
process_graph_event(xmlNode * event, const char *event_node)
{
    int rc = -1;
    int status = -1;
    int callid = -1;

    int action_num = -1;
    crm_action_t *action = NULL;

    int target_rc = -1;
    int transition_num = -1;
    char *update_te_uuid = NULL;

    gboolean stop_early = FALSE;
    gboolean ignore_failures = FALSE;
    const char *id = NULL;
    const char *desc = NULL;
    const char *magic = NULL;

    CRM_ASSERT(event != NULL);

/*
<lrm_rsc_op id="rsc_east-05_last_0" operation_key="rsc_east-05_monitor_0"
    operation="monitor" crm-debug-origin="do_update_resource"
    crm_feature_set="3.0.6"
    transition-key="9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa"
    transition-magic="0:7;9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa"
    call-id="17" rc-code="7" op-status="0" interval="0"
    last-run="1355361636" last-rc-change="1355361636" exec-time="128"
    queue-time="0" op-digest="c81f5f40b1c9e859c992e800b1aa6972"/>
*/

    id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
    crm_element_value_int(event, XML_LRM_ATTR_RC, &rc);
    crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status);
    crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid);

    magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY);
    if (magic == NULL) {
        /* non-change */
        return FALSE;
    }

    if (decode_transition_key(magic, &update_te_uuid, &transition_num,
                              &action_num, &target_rc) == FALSE) {
        /* id is read from the event and may be absent, so guard the %s argument */
        crm_err("Invalid event %s.%d detected: %s", crm_str(id), callid, magic);
        abort_transition(INFINITY, tg_restart, "Bad event", event);
        return FALSE;
    }

    if (status == PCMK_LRM_OP_PENDING) {
        goto bail;
    }

    if (transition_num == -1) {
        desc = "initiated outside of the cluster";
        abort_transition(INFINITY, tg_restart, "Unexpected event", event);

    } else if ((action_num < 0) || (crm_str_eq(update_te_uuid, te_uuid, TRUE) == FALSE)) {
        desc = "initiated by a different node";
        abort_transition(INFINITY, tg_restart, "Foreign event", event);
        stop_early = TRUE;      /* This could be an lrm status refresh */

    } else if (transition_graph->id != transition_num) {
        desc = "arrived really late";
        abort_transition(INFINITY, tg_restart, "Old event", event);
        stop_early = TRUE;      /* This could be an lrm status refresh */

    } else if (transition_graph->complete) {
        desc = "arrived late";
        abort_transition(INFINITY, tg_restart, "Inactive graph", event);

    } else {
        action = get_action(action_num, FALSE);

        if (action == NULL) {
            desc = "unknown";
            abort_transition(INFINITY, tg_restart, "Unknown event", event);

        } else {
            ignore_failures = safe_str_eq(
                crm_meta_value(action->params, XML_OP_ATTR_ON_FAIL), "ignore");
            match_graph_event(action, event, status, rc, target_rc, ignore_failures);
        }
    }

    if (action && (rc == target_rc)) {
        crm_trace("Processed update to %s: %s", crm_str(id), magic);

    } else {
        if (update_failcount(event, event_node, rc, target_rc,
                             (transition_num == -1), ignore_failures)) {
            /* Turns out this wasn't an lrm status refresh update after all */
            stop_early = FALSE;
            desc = "failed";
        }

        /* desc is still NULL when an action was matched, its rc differs from
         * target_rc, and update_failcount() returned false; id may be NULL
         * too, so neither may be handed to %s unguarded
         */
        crm_info("Detected action (%d.%d) %s.%d=%s: %s", transition_num,
                 action_num, crm_str(id), callid, services_ocf_exitcode_str(rc),
                 crm_str(desc));
    }

  bail:
    free(update_te_uuid);
    return stop_early;
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Thu, Jul 10, 1:16 AM (10 h, 13 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2009454
Default Alt Text
(49 KB)
Attached To
Mode
rP Pacemaker
Attached
Detach File
Event Timeline
Log In to Comment