Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c
index eba5f11a0a..68742c29cf 100644
--- a/crmd/te_callbacks.c
+++ b/crmd/te_callbacks.c
@@ -1,790 +1,786 @@
/*
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <crm_internal.h>
#include <sys/stat.h>
#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/msg_xml.h>
#include <tengine.h>
#include <te_callbacks.h>
#include <crmd_fsa.h>
#include <crm/cluster.h> /* For ONLINESTATUS etc */
void te_update_confirm(const char *event, xmlNode * msg);
extern char *te_uuid;
gboolean shuttingdown = FALSE;
crm_graph_t *transition_graph;
crm_trigger_t *transition_trigger = NULL;
/* #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */
#define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']"
static const char *
get_node_id(xmlNode * rsc_op)
{
xmlNode *node = rsc_op;
while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) {
node = node->parent;
}
CRM_CHECK(node != NULL, return NULL);
return ID(node);
}
static void
te_legacy_update_diff(const char *event, xmlNode * diff)
{
int lpc, max;
xmlXPathObject *xpathObj = NULL;
CRM_CHECK(diff != NULL, return);
xml_log_patchset(LOG_TRACE, __FUNCTION__, diff);
if (cib_config_changed(NULL, NULL, &diff)) {
abort_transition(INFINITY, tg_restart, "Non-status change", diff);
goto bail; /* configuration changed */
}
/* Tickets Attributes - Added/Updated */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_TICKETS);
if (numXpathResults(xpathObj) > 0) {
xmlNode *aborted = getXpathResult(xpathObj, 0);
abort_transition(INFINITY, tg_restart, "Ticket attribute: update", aborted);
goto bail;
}
freeXpathObject(xpathObj);
/* Tickets Attributes - Removed */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_CIB_TAG_TICKETS);
if (numXpathResults(xpathObj) > 0) {
xmlNode *aborted = getXpathResult(xpathObj, 0);
abort_transition(INFINITY, tg_restart, "Ticket attribute: removal", aborted);
goto bail;
}
freeXpathObject(xpathObj);
/* Transient Attributes - Added/Updated */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//"
XML_TAG_TRANSIENT_NODEATTRS "//" XML_CIB_TAG_NVPAIR);
max = numXpathResults(xpathObj);
for (lpc = 0; lpc < max; lpc++) {
xmlNode *attr = getXpathResult(xpathObj, lpc);
const char *name = crm_element_value(attr, XML_NVPAIR_ATTR_NAME);
const char *value = NULL;
if (safe_str_eq(CRM_OP_PROBED, name)) {
value = crm_element_value(attr, XML_NVPAIR_ATTR_VALUE);
}
if (crm_is_true(value) == FALSE) {
abort_transition(INFINITY, tg_restart, "Transient attribute: update", attr);
crm_log_xml_trace(attr, "Abort");
goto bail;
}
}
freeXpathObject(xpathObj);
/* Transient Attributes - Removed */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//"
XML_TAG_TRANSIENT_NODEATTRS);
if (numXpathResults(xpathObj) > 0) {
xmlNode *aborted = getXpathResult(xpathObj, 0);
abort_transition(INFINITY, tg_restart, "Transient attribute: removal", aborted);
goto bail;
}
freeXpathObject(xpathObj);
/*
* Check for and fast-track the processing of LRM refreshes
* In large clusters this can result in _huge_ speedups
*
* Unfortunately we can only do so when there are no pending actions
* Otherwise we could miss updates we're waiting for and stall
*
*/
xpathObj = NULL;
if (transition_graph->pending == 0) {
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//"
XML_LRM_TAG_RESOURCE);
}
max = numXpathResults(xpathObj);
if (max > 1) {
/* Updates by, or in response to, TE actions will never contain updates
* for more than one resource at a time
*/
crm_debug("Detected LRM refresh - %d resources updated: Skipping all resource events", max);
crm_log_xml_trace(diff, "lrm-refresh");
abort_transition(INFINITY, tg_restart, "LRM Refresh", NULL);
goto bail;
}
freeXpathObject(xpathObj);
/* Process operation updates */
xpathObj =
xpath_search(diff,
"//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_LRM_TAG_RSC_OP);
if (numXpathResults(xpathObj)) {
/*
<status>
<node_state id="node1" state=CRMD_JOINSTATE_MEMBER exp_state="active">
<lrm>
<lrm_resources>
<rsc_state id="" rsc_id="rsc4" node_id="node1" rsc_state="stopped"/>
*/
int lpc = 0, max = numXpathResults(xpathObj);
for (lpc = 0; lpc < max; lpc++) {
xmlNode *rsc_op = getXpathResult(xpathObj, lpc);
const char *node = get_node_id(rsc_op);
process_graph_event(rsc_op, node);
}
}
freeXpathObject(xpathObj);
/* Detect deleted (as opposed to replaced or added) actions - eg. crm_resource -C */
xpathObj = xpath_search(diff, "//" XML_TAG_DIFF_REMOVED "//" XML_LRM_TAG_RSC_OP);
max = numXpathResults(xpathObj);
for (lpc = 0; lpc < max; lpc++) {
int path_max = 0;
const char *op_id = NULL;
char *rsc_op_xpath = NULL;
xmlXPathObject *op_match = NULL;
xmlNode *match = getXpathResult(xpathObj, lpc);
CRM_LOG_ASSERT(match != NULL);
if(match == NULL) { continue; };
op_id = ID(match);
path_max = strlen(rsc_op_template) + strlen(op_id) + 1;
rsc_op_xpath = calloc(1, path_max);
snprintf(rsc_op_xpath, path_max, rsc_op_template, op_id);
op_match = xpath_search(diff, rsc_op_xpath);
if (numXpathResults(op_match) == 0) {
/* Prevent false positives by matching cancelations too */
const char *node = get_node_id(match);
crm_action_t *cancelled = get_cancel_action(op_id, node);
if (cancelled == NULL) {
crm_debug("No match for deleted action %s (%s on %s)", rsc_op_xpath, op_id,
node);
abort_transition(INFINITY, tg_restart, "Resource op removal", match);
freeXpathObject(op_match);
free(rsc_op_xpath);
goto bail;
} else {
crm_debug("Deleted lrm_rsc_op %s on %s was for graph event %d",
op_id, node, cancelled->id);
}
}
freeXpathObject(op_match);
free(rsc_op_xpath);
}
bail:
freeXpathObject(xpathObj);
}
static void process_resource_updates(
const char *node, xmlNode *xml, xmlNode *change, const char *op, const char *xpath)
{
xmlNode *cIter = NULL;
xmlNode *rsc = NULL;
xmlNode *rsc_op = NULL;
int num_resources = 0;
if(xml == NULL) {
return;
} else if(strcmp((const char*)xml->name, XML_CIB_TAG_LRM) == 0) {
xml = first_named_child(xml, XML_LRM_TAG_RESOURCES);
crm_trace("Got %p in %s", xml, XML_CIB_TAG_LRM);
}
CRM_ASSERT(strcmp((const char*)xml->name, XML_LRM_TAG_RESOURCES) == 0);
for(cIter = xml->children; cIter; cIter = cIter->next) {
num_resources++;
}
if(num_resources > 1) {
/*
* Check for and fast-track the processing of LRM refreshes
* In large clusters this can result in _huge_ speedups
*
* Unfortunately we can only do so when there are no pending actions
* Otherwise we could miss updates we're waiting for and stall
*
*/
crm_debug("Detected LRM refresh - %d resources updated", num_resources);
crm_log_xml_trace(change, "lrm-refresh");
abort_transition(INFINITY, tg_restart, "LRM Refresh", NULL);
return;
}
for (rsc = __xml_first_child(xml); rsc != NULL; rsc = __xml_next(rsc)) {
crm_trace("Processing %s", ID(rsc));
for (rsc_op = __xml_first_child(rsc); rsc_op != NULL; rsc_op = __xml_next(rsc_op)) {
crm_trace("Processing %s", ID(rsc_op));
process_graph_event(rsc_op, node);
}
}
}
#define NODE_PATT "/lrm[@id="
static char *get_node_from_xpath(const char *xpath)
{
char *nodeid = NULL;
char *tmp = strstr(xpath, NODE_PATT);
if(tmp) {
tmp += strlen(NODE_PATT);
tmp += 1;
nodeid = strdup(tmp);
tmp = strstr(nodeid, "\'");
CRM_ASSERT(tmp);
tmp[0] = 0;
}
return nodeid;
}
void
te_update_diff(const char *event, xmlNode * msg)
{
int rc = -EINVAL;
int format = 1;
xmlNode *change = NULL;
const char *op = NULL;
xmlNode *diff = NULL;
int p_add[] = { 0, 0, 0 };
int p_del[] = { 0, 0, 0 };
CRM_CHECK(msg != NULL, return);
crm_element_value_int(msg, F_CIB_RC, &rc);
if (transition_graph == NULL) {
crm_trace("No graph");
return;
} else if (rc < pcmk_ok) {
crm_trace("Filter rc=%d (%s)", rc, pcmk_strerror(rc));
return;
} else if (transition_graph->complete == TRUE
&& fsa_state != S_IDLE
&& fsa_state != S_TRANSITION_ENGINE && fsa_state != S_POLICY_ENGINE) {
crm_trace("Filter state=%s, complete=%d", fsa_state2string(fsa_state),
transition_graph->complete);
return;
}
op = crm_element_value(msg, F_CIB_OPERATION);
diff = get_message_xml(msg, F_CIB_UPDATE_RESULT);
xml_patch_versions(diff, p_add, p_del);
crm_debug("Processing (%s) diff: %d.%d.%d -> %d.%d.%d (%s)", op,
p_del[0], p_del[1], p_del[2], p_add[0], p_add[1], p_add[2],
fsa_state2string(fsa_state));
crm_element_value_int(diff, "format", &format);
switch(format) {
case 1:
te_legacy_update_diff(event, diff);
return;
case 2:
/* Cool, we know what to do here */
crm_log_xml_trace(diff, "Patch:Raw");
break;
default:
crm_warn("Unknown patch format: %d", format);
return;
}
for (change = __xml_first_child(diff); change != NULL; change = __xml_next(change)) {
const char *name = NULL;
const char *op = crm_element_value(change, XML_DIFF_OP);
const char *xpath = crm_element_value(change, XML_DIFF_PATH);
xmlNode *match = NULL;
const char *node = NULL;
if(op == NULL) {
continue;
} else if(strcmp(op, "create") == 0) {
match = change->children;
} else if(strcmp(op, "move") == 0) {
continue;
} else if(strcmp(op, "modify") == 0) {
match = first_named_child(change, XML_DIFF_RESULT);
if(match) {
match = match->children;
}
}
if(match) {
name = (const char *)match->name;
}
crm_trace("Handling %s operation for %s %p, %s", op, xpath, match, name);
if(xpath == NULL) {
/* Version field, ignore */
} else if(strstr(xpath, "/cib/configuration")) {
abort_transition(INFINITY, tg_restart, "Non-status change", change);
break; /* Wont be packaged with any resource operations we may be waiting for */
} else if(strstr(xpath, "/"XML_CIB_TAG_TICKETS) || safe_str_eq(name, XML_CIB_TAG_TICKETS)) {
abort_transition(INFINITY, tg_restart, "Ticket attribute change", change);
break; /* Wont be packaged with any resource operations we may be waiting for */
} else if(strstr(xpath, "/"XML_TAG_TRANSIENT_NODEATTRS"[") || safe_str_eq(name, XML_TAG_TRANSIENT_NODEATTRS)) {
abort_transition(INFINITY, tg_restart, "Transient attribute change", change);
break; /* Wont be packaged with any resource operations we may be waiting for */
} else if(strstr(xpath, "/"XML_LRM_TAG_RSC_OP"[") && safe_str_eq(op, "delete")) {
crm_action_t *cancel = NULL;
char *mutable_key = strdup(xpath);
char *mutable_node = strdup(xpath);
char *search = NULL;
const char *key = NULL;
const char *node_uuid = NULL;
search = strrchr(mutable_key, '\'');
search[0] = 0;
key = strrchr(mutable_key, '\'') + 1;
node_uuid = strstr(mutable_node, "node_state[@id=\'") + strlen("node_state[@id=\'");
search = strchr(node_uuid, '\'');
search[0] = 0;
cancel = get_cancel_action(key, node_uuid);
if (cancel == NULL) {
abort_transition(INFINITY, tg_restart, "Resource operation removal", change);
} else {
crm_info("Cancellation of %s on %s confirmed (%d)", key, node_uuid, cancel->id);
stop_te_timer(cancel->timer);
te_action_confirmed(cancel);
update_graph(transition_graph, cancel);
trigger_graph();
}
free(mutable_node);
free(mutable_key);
} else if(strstr(xpath, "/"XML_CIB_TAG_LRM"[") && safe_str_eq(op, "delete")) {
abort_transition(INFINITY, tg_restart, "Resource state removal", change);
} else if(strstr(xpath, "/"XML_CIB_TAG_STATE"[") && safe_str_eq(op, "delete")) {
abort_transition(INFINITY, tg_restart, "Node state removal", change);
} else if(name == NULL) {
crm_debug("No result for %s operation to %s", op, xpath);
CRM_ASSERT(strcmp(op, "delete") == 0 || strcmp(op, "move") == 0);
} else if(strcmp(name, XML_TAG_CIB) == 0) {
xmlNode *state = NULL;
xmlNode *status = first_named_child(match, XML_CIB_TAG_STATUS);
xmlNode *config = first_named_child(match, XML_CIB_TAG_CONFIGURATION);
for (state = __xml_first_child(status); state != NULL; state = __xml_next(state)) {
xmlNode *lrm = first_named_child(state, XML_CIB_TAG_LRM);
node = ID(state);
process_resource_updates(node, lrm, change, op, xpath);
}
if(config) {
abort_transition(INFINITY, tg_restart, "Non-status change", change);
}
} else if(strcmp(name, XML_CIB_TAG_STATUS) == 0) {
xmlNode *state = NULL;
for (state = __xml_first_child(match); state != NULL; state = __xml_next(state)) {
xmlNode *lrm = first_named_child(state, XML_CIB_TAG_LRM);
node = ID(state);
process_resource_updates(node, lrm, change, op, xpath);
}
} else if(strcmp(name, XML_CIB_TAG_STATE) == 0) {
xmlNode *lrm = first_named_child(match, XML_CIB_TAG_LRM);
node = ID(match);
process_resource_updates(node, lrm, change, op, xpath);
} else if(strcmp(name, XML_CIB_TAG_LRM) == 0) {
node = ID(match);
process_resource_updates(node, match, change, op, xpath);
} else if(strcmp(name, XML_LRM_TAG_RESOURCES) == 0) {
char *local_node = get_node_from_xpath(xpath);
process_resource_updates(local_node, match, change, op, xpath);
free(local_node);
} else if(strcmp(name, XML_LRM_TAG_RESOURCE) == 0) {
xmlNode *rsc_op;
char *local_node = get_node_from_xpath(xpath);
for (rsc_op = __xml_first_child(match); rsc_op != NULL; rsc_op = __xml_next(rsc_op)) {
process_graph_event(rsc_op, local_node);
}
free(local_node);
} else if(strcmp(name, XML_LRM_TAG_RSC_OP) == 0) {
char *local_node = get_node_from_xpath(xpath);
process_graph_event(match, local_node);
free(local_node);
} else {
crm_err("Ignoring %s operation for %s %p, %s", op, xpath, match, name);
}
}
}
gboolean
process_te_message(xmlNode * msg, xmlNode * xml_data)
{
const char *from = crm_element_value(msg, F_ORIG);
const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO);
const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM);
const char *ref = crm_element_value(msg, F_CRM_REFERENCE);
const char *op = crm_element_value(msg, F_CRM_TASK);
const char *type = crm_element_value(msg, F_CRM_MSG_TYPE);
crm_trace("Processing %s (%s) message", op, ref);
crm_log_xml_trace(msg, "ipc");
if (op == NULL) {
/* error */
} else if (sys_to == NULL || strcasecmp(sys_to, CRM_SYSTEM_TENGINE) != 0) {
crm_trace("Bad sys-to %s", crm_str(sys_to));
return FALSE;
} else if (safe_str_eq(op, CRM_OP_INVOKE_LRM)
&& safe_str_eq(sys_from, CRM_SYSTEM_LRMD)
/* && safe_str_eq(type, XML_ATTR_RESPONSE) */
) {
xmlXPathObject *xpathObj = NULL;
crm_log_xml_trace(msg, "Processing (N)ACK");
crm_debug("Processing (N)ACK %s from %s", crm_element_value(msg, F_CRM_REFERENCE), from);
xpathObj = xpath_search(xml_data, "//" XML_LRM_TAG_RSC_OP);
if (numXpathResults(xpathObj)) {
int lpc = 0, max = numXpathResults(xpathObj);
for (lpc = 0; lpc < max; lpc++) {
xmlNode *rsc_op = getXpathResult(xpathObj, lpc);
const char *node = get_node_id(rsc_op);
process_graph_event(rsc_op, node);
}
freeXpathObject(xpathObj);
} else {
crm_log_xml_err(msg, "Invalid (N)ACK");
freeXpathObject(xpathObj);
return FALSE;
}
} else {
crm_err("Unknown command: %s::%s from %s", type, op, sys_from);
}
crm_trace("finished processing message");
return TRUE;
}
GHashTable *stonith_failures = NULL;
struct st_fail_rec {
int count;
int last_rc;
};
gboolean
too_many_st_failures(void)
{
GHashTableIter iter;
const char *key = NULL;
struct st_fail_rec *value = NULL;
if (stonith_failures == NULL) {
return FALSE;
}
g_hash_table_iter_init(&iter, stonith_failures);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
if (value->count > 10) {
crm_notice("Too many failures to fence %s (%d), giving up", key, value->count);
return TRUE;
} else if (value->last_rc == -ENODEV) {
crm_notice("No devices found in cluster to fence %s, giving up", key);
return TRUE;
}
}
return FALSE;
}
void
st_fail_count_reset(const char *target)
{
struct st_fail_rec *rec = NULL;
if (stonith_failures) {
rec = g_hash_table_lookup(stonith_failures, target);
}
if (rec) {
rec->count = 0;
rec->last_rc = 0;
}
}
static void
st_fail_count_increment(const char *target, int rc)
{
struct st_fail_rec *rec = NULL;
if (stonith_failures == NULL) {
stonith_failures =
g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, free);
}
rec = g_hash_table_lookup(stonith_failures, target);
if (rec) {
rec->count++;
} else {
rec = malloc(sizeof(struct st_fail_rec));
if(rec == NULL) {
return;
}
rec->count = 1;
g_hash_table_insert(stonith_failures, strdup(target), rec);
}
rec->last_rc = rc;
}
void
tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data)
{
char *uuid = NULL;
int target_rc = -1;
int stonith_id = -1;
int transition_id = -1;
crm_action_t *action = NULL;
int call_id = data->call_id;
int rc = data->rc;
char *userdata = data->userdata;
CRM_CHECK(userdata != NULL, return);
crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
pcmk_strerror(rc), rc);
if (AM_I_DC == FALSE) {
return;
}
/* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
/* op->call_id, op->optype, op->node_name, op->op_result, */
/* (char *)op->node_list, op->private_data); */
/* filter out old STONITH actions */
CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, &target_rc),
crm_err("Invalid event detected");
goto bail;
);
if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid)
|| transition_graph->id != transition_id) {
crm_info("Ignoring STONITH action initiated outside of the current transition");
goto bail;
}
/* this will mark the event complete if a match is found */
action = get_action(stonith_id, FALSE);
if (action == NULL) {
crm_err("Stonith action not matched");
goto bail;
}
stop_te_timer(action->timer);
if (rc == pcmk_ok) {
const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
const char *op = crm_meta_value(action->params, "stonith_action");
crm_debug("Stonith operation %d for %s passed", call_id, target);
if (action->confirmed == FALSE) {
te_action_confirmed(action);
if (action->sent_update == FALSE && safe_str_neq("on", op)) {
send_stonith_update(action, target, uuid);
}
}
st_fail_count_reset(target);
} else {
const char *target = crm_element_value_const(action->xml, XML_LRM_ATTR_TARGET);
- const char *allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL);
action->failed = TRUE;
- if (crm_is_true(allow_fail) == FALSE) {
- crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", call_id,
- target, pcmk_strerror(rc));
- abort_transition(INFINITY, tg_restart, "Stonith failed", NULL);
- }
-
+ crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
+ call_id, target, pcmk_strerror(rc));
+ abort_transition(INFINITY, tg_restart, "Stonith failed", NULL);
st_fail_count_increment(target, rc);
}
update_graph(transition_graph, action);
trigger_graph();
bail:
free(userdata);
free(uuid);
return;
}
void
cib_fencing_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
if (rc < pcmk_ok) {
crm_err("Fencing update %d for %s: failed - %s (%d)",
call_id, (char *)user_data, pcmk_strerror(rc), rc);
crm_log_xml_warn(msg, "Failed update");
abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL);
} else {
crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
}
free(user_data);
}
void
cib_action_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
if (rc < pcmk_ok) {
crm_err("Update %d FAILED: %s", call_id, pcmk_strerror(rc));
}
}
gboolean
action_timer_callback(gpointer data)
{
crm_action_timer_t *timer = NULL;
CRM_CHECK(data != NULL, return FALSE);
timer = (crm_action_timer_t *) data;
stop_te_timer(timer);
crm_warn("Timer popped (timeout=%d, abort_level=%d, complete=%s)",
timer->timeout,
transition_graph->abort_priority, transition_graph->complete ? "true" : "false");
CRM_CHECK(timer->action != NULL, return FALSE);
if (transition_graph->complete) {
crm_warn("Ignoring timeout while not in transition");
} else if (timer->reason == timeout_action_warn) {
print_action(LOG_WARNING, "Action missed its timeout: ", timer->action);
/* Don't check the FSA state
*
* We might also be in S_INTEGRATION or some other state waiting for this
* action so we can close the transition and continue
*/
} else {
/* fail the action */
gboolean send_update = TRUE;
const char *task = crm_element_value(timer->action->xml, XML_LRM_ATTR_TASK);
print_action(LOG_ERR, "Aborting transition, action lost: ", timer->action);
timer->action->failed = TRUE;
te_action_confirmed(timer->action);
abort_transition(INFINITY, tg_restart, "Action lost", NULL);
update_graph(transition_graph, timer->action);
trigger_graph();
if (timer->action->type != action_type_rsc) {
send_update = FALSE;
} else if (safe_str_eq(task, RSC_CANCEL)) {
/* we dont need to update the CIB with these */
send_update = FALSE;
}
if (send_update) {
cib_action_update(timer->action, PCMK_LRM_OP_TIMEOUT, PCMK_OCF_UNKNOWN_ERROR);
}
}
return FALSE;
}
diff --git a/crmd/te_events.c b/crmd/te_events.c
index 8343a4fc62..bc8bebe11e 100644
--- a/crmd/te_events.c
+++ b/crmd/te_events.c
@@ -1,590 +1,604 @@
/*
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <crm_internal.h>
#include <sys/param.h>
#include <crm/crm.h>
#include <crm/cib.h>
#include <crm/msg_xml.h>
#include <crm/common/xml.h>
#include <tengine.h>
#include <crmd_fsa.h>
char *failed_stop_offset = NULL;
char *failed_start_offset = NULL;
-int match_graph_event(int action_id, xmlNode * event, const char *event_node,
- int op_status, int op_rc, int target_rc);
-
gboolean
fail_incompletable_actions(crm_graph_t * graph, const char *down_node)
{
const char *target_uuid = NULL;
const char *router = NULL;
const char *router_uuid = NULL;
xmlNode *last_action = NULL;
GListPtr gIter = NULL;
GListPtr gIter2 = NULL;
if (graph == NULL || graph->complete) {
return FALSE;
}
gIter = graph->synapses;
for (; gIter != NULL; gIter = gIter->next) {
synapse_t *synapse = (synapse_t *) gIter->data;
if (synapse->confirmed) {
continue;
}
gIter2 = synapse->actions;
for (; gIter2 != NULL; gIter2 = gIter2->next) {
crm_action_t *action = (crm_action_t *) gIter2->data;
if (action->type == action_type_pseudo || action->confirmed) {
continue;
} else if (action->type == action_type_crm) {
const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
if (safe_str_eq(task, CRM_OP_FENCE)) {
continue;
}
}
target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
router = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
if (router) {
crm_node_t *node = crm_get_peer(0, router);
if (node) {
router_uuid = node->uuid;
}
}
if (safe_str_eq(target_uuid, down_node) || safe_str_eq(router_uuid, down_node)) {
action->failed = TRUE;
synapse->failed = TRUE;
last_action = action->xml;
stop_te_timer(action->timer);
update_graph(graph, action);
if (synapse->executed) {
crm_notice("Action %d (%s) was pending on %s (offline)",
action->id, ID(action->xml), down_node);
} else {
crm_notice("Action %d (%s) is scheduled for %s (offline)",
action->id, ID(action->xml), down_node);
}
}
}
}
if (last_action != NULL) {
crm_warn("Node %s shutdown resulted in un-runnable actions", down_node);
abort_transition(INFINITY, tg_restart, "Node failure", last_action);
return TRUE;
}
return FALSE;
}
/*!
* \internal
* \brief Update failure-related node attributes if warranted
*
* \param[in] event XML describing operation that (maybe) failed
* \param[in] event_node_uuid Node that event occurred on
* \param[in] rc Actual operation return code
* \param[in] target_rc Expected operation return code
* \param[in] do_update If TRUE, do update regardless of operation type
*
* \return TRUE if this was not a direct nack, success or lrm status refresh
*/
static gboolean
update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int target_rc, gboolean do_update)
{
int interval = 0;
char *task = NULL;
char *rsc_id = NULL;
const char *value = NULL;
const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
const char *on_uname = crm_peer_uname(event_node_uuid);
const char *origin = crm_element_value(event, XML_ATTR_ORIGIN);
/* Nothing needs to be done for success, lrm status refresh,
* or direct nack (internal code for "busy, try again")
*/
if ((rc == CRM_DIRECT_NACK_RC) || (rc == target_rc)) {
return FALSE;
} else if (safe_str_eq(origin, "build_active_RAs")) {
crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
id, rc, on_uname);
return FALSE;
}
/* Sanity check */
CRM_CHECK(on_uname != NULL, return TRUE);
CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval),
crm_err("Couldn't parse: %s", ID(event)); goto bail);
CRM_CHECK(task != NULL, goto bail);
CRM_CHECK(rsc_id != NULL, goto bail);
/* Decide whether update is necessary and what value to use */
if ((interval > 0) || safe_str_eq(task, CRMD_ACTION_PROMOTE)
|| safe_str_eq(task, CRMD_ACTION_DEMOTE)) {
do_update = TRUE;
} else if (safe_str_eq(task, CRMD_ACTION_START)) {
do_update = TRUE;
if (failed_start_offset == NULL) {
failed_start_offset = strdup(INFINITY_S);
}
value = failed_start_offset;
} else if (safe_str_eq(task, CRMD_ACTION_STOP)) {
do_update = TRUE;
if (failed_stop_offset == NULL) {
failed_stop_offset = strdup(INFINITY_S);
}
value = failed_stop_offset;
}
/* Fail count will be either incremented or set to infinity */
if (value == NULL || safe_str_neq(value, INFINITY_S)) {
value = XML_NVPAIR_ATTR_VALUE "++";
}
if (do_update) {
char *now = crm_itoa(time(NULL));
char *attr_name = NULL;
gboolean is_remote_node = FALSE;
if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) {
is_remote_node = TRUE;
}
crm_warn("Updating failcount for %s on %s after failed %s:"
" rc=%d (update=%s, time=%s)", rsc_id, on_uname, task, rc, value, now);
attr_name = crm_concat("fail-count", rsc_id, '-');
update_attrd(on_uname, attr_name, value, NULL, is_remote_node);
free(attr_name);
attr_name = crm_concat("last-failure", rsc_id, '-');
update_attrd(on_uname, attr_name, now, NULL, is_remote_node);
free(attr_name);
free(now);
}
bail:
free(rsc_id);
free(task);
return TRUE;
}
+/*!
+ * \internal
+ * \brief Return simplified operation status based on operation return code
+ *
+ * \param[in] action CRM action instance of operation
+ * \param[in] orig_status Original reported operation status
+ * \param[in] rc Actual operation return code
+ * \param[in] target_rc Expected operation return code
+ *
+ * \return PCMK_LRM_OP_DONE if rc equals target_rc, PCMK_LRM_OP_ERROR otherwise
+ *
+ * \note This assumes that PCMK_LRM_OP_PENDING operations have already been
+ * filtered (otherwise they will get simplified as well).
+ */
static int
status_from_rc(crm_action_t * action, int orig_status, int rc, int target_rc)
{
- int status = orig_status;
-
if (target_rc == rc) {
crm_trace("Target rc: == %d", rc);
- if (status != PCMK_LRM_OP_DONE) {
- crm_trace("Re-mapping op status to" " PCMK_LRM_OP_DONE for rc=%d", rc);
- status = PCMK_LRM_OP_DONE;
+ if (orig_status != PCMK_LRM_OP_DONE) {
+ crm_trace("Re-mapping op status to PCMK_LRM_OP_DONE for rc=%d", rc);
}
-
- } else {
- status = PCMK_LRM_OP_ERROR;
+ return PCMK_LRM_OP_DONE;
}
- if ((rc != CRM_DIRECT_NACK_RC) && (status != PCMK_LRM_OP_DONE)) {
- const char *task, *uname;
+ if (rc != CRM_DIRECT_NACK_RC) {
+ const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
+ const char *uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
- task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
- uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
crm_warn("Action %d (%s) on %s failed (target: %d vs. rc: %d): %s",
- action->id, task, uname, target_rc, rc, services_lrm_status_str(status));
+ action->id, task, uname, target_rc, rc,
+ services_lrm_status_str(PCMK_LRM_OP_ERROR));
}
-
- return status;
+ return PCMK_LRM_OP_ERROR;
}
static void
process_remote_node_action(crm_action_t *action, xmlNode *event)
{
xmlNode *child = NULL;
/* The whole point of this function is to detect when a remote-node
* is integrated into the cluster, and abort the transition if that remote-node
* was fenced earlier in the transition. This allows a new transition to be
* generated so resources can be placed on the new node.
*/
if (crm_remote_peer_cache_size() == 0) {
return;
} else if (action->type != action_type_rsc) {
return;
} else if (action->failed || action->confirmed == FALSE) {
return;
} else if (safe_str_neq(crm_element_value(action->xml, XML_LRM_ATTR_TASK), "start")) {
return;
}
for (child = __xml_first_child(action->xml); child != NULL; child = __xml_next(child)) {
const char *provider;
const char *type;
const char *rsc;
crm_node_t *remote_peer;
if (safe_str_neq(crm_element_name(child), XML_CIB_TAG_RESOURCE)) {
continue;
}
provider = crm_element_value(child, XML_AGENT_ATTR_PROVIDER);
type = crm_element_value(child, XML_ATTR_TYPE);
rsc = ID(child);
if (safe_str_neq(provider, "pacemaker") || safe_str_neq(type, "remote") || rsc == NULL) {
break;
}
remote_peer = crm_get_peer_full(0, rsc, CRM_GET_PEER_REMOTE);
if (remote_peer == NULL) {
break;
}
/* A remote node will be placed in the "lost" state after
* it has been successfully fenced. After successfully connecting
* to a remote-node after being fenced, we need to abort the transition
* so resources can be placed on the newly integrated remote-node */
if (safe_str_eq(remote_peer->state, CRM_NODE_LOST)) {
abort_transition(INFINITY, tg_restart, "Remote-node re-discovered.", event);
}
return;
}
}
-/*
- * returns the ID of the action if a match is found
- * returns -1 if a match was not found
- * returns -2 if a match was found but the action failed (and was
- * not allowed to)
+/*!
+ * \internal
+ * \brief Confirm action and update transition graph, aborting transition on failures
+ *
+ * \param[in] action_id Action number of operation
+ * \param[in] event Event instance of this operation
+ * \param[in] orig_status Original reported operation status
+ * \param[in] op_rc Actual operation return code
+ * \param[in] target_rc Expected operation return code
+ *
+ * \return 0 on success, -1 if action does not exist
+ * \note This assumes that PCMK_LRM_OP_PENDING operations have already been
+ * filtered (otherwise they may be treated as failures).
*/
-int
-match_graph_event(int action_id, xmlNode * event, const char *event_node,
- int op_status, int op_rc, int target_rc)
+static int
+match_graph_event(int action_id, xmlNode *event, int op_status, int op_rc, int target_rc)
{
const char *target = NULL;
const char *allow_fail = NULL;
const char *this_event = NULL;
crm_action_t *action = NULL;
action = get_action(action_id, FALSE);
if (action == NULL) {
return -1;
}
+ /* Remap operation status based on return code */
op_status = status_from_rc(action, op_status, op_rc, target_rc);
/* Process OP status */
switch (op_status) {
- case PCMK_LRM_OP_PENDING:
- crm_debug("Ignoring pending operation");
- return action->id;
- break;
case PCMK_LRM_OP_DONE:
break;
case PCMK_LRM_OP_ERROR:
case PCMK_LRM_OP_TIMEOUT:
case PCMK_LRM_OP_NOTSUPPORTED:
action->failed = TRUE;
break;
case PCMK_LRM_OP_CANCELLED:
/* do nothing?? */
crm_err("Dont know what to do for cancelled ops yet");
break;
default:
+ /*
+ PCMK_LRM_OP_ERROR_HARD,
+ PCMK_LRM_OP_ERROR_FATAL,
+ PCMK_LRM_OP_NOT_INSTALLED
+ */
action->failed = TRUE;
crm_err("Unsupported action result: %d", op_status);
}
/* stop this event's timer if it had one */
stop_te_timer(action->timer);
te_action_confirmed(action);
update_graph(transition_graph, action);
trigger_graph();
if (action->failed) {
allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL);
if (crm_is_true(allow_fail)) {
action->failed = FALSE;
}
}
if (action->failed) {
abort_transition(action->synapse->priority + 1, tg_restart, "Event failed", event);
}
this_event = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
crm_info("Action %s (%d) confirmed on %s (rc=%d)",
crm_str(this_event), action->id, crm_str(target), op_rc);
/* determine if this action affects a remote-node's online/offline status */
process_remote_node_action(action, event);
- return action->id;
+ return 0;
}
crm_action_t *
get_action(int id, gboolean confirmed)
{
GListPtr gIter = NULL;
GListPtr gIter2 = NULL;
gIter = transition_graph->synapses;
for (; gIter != NULL; gIter = gIter->next) {
synapse_t *synapse = (synapse_t *) gIter->data;
gIter2 = synapse->actions;
for (; gIter2 != NULL; gIter2 = gIter2->next) {
crm_action_t *action = (crm_action_t *) gIter2->data;
if (action->id == id) {
if (confirmed) {
stop_te_timer(action->timer);
te_action_confirmed(action);
}
return action;
}
}
}
return NULL;
}
crm_action_t *
get_cancel_action(const char *id, const char *node)
{
GListPtr gIter = NULL;
GListPtr gIter2 = NULL;
gIter = transition_graph->synapses;
for (; gIter != NULL; gIter = gIter->next) {
synapse_t *synapse = (synapse_t *) gIter->data;
gIter2 = synapse->actions;
for (; gIter2 != NULL; gIter2 = gIter2->next) {
const char *task = NULL;
const char *target = NULL;
crm_action_t *action = (crm_action_t *) gIter2->data;
task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
if (safe_str_neq(CRMD_ACTION_CANCEL, task)) {
continue;
}
task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
if (safe_str_neq(task, id)) {
crm_trace("Wrong key %s for %s on %s", task, id, node);
continue;
}
target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
if (node && safe_str_neq(target, node)) {
crm_trace("Wrong node %s for %s on %s", target, id, node);
continue;
}
crm_trace("Found %s on %s", id, node);
return action;
}
}
return NULL;
}
crm_action_t *
match_down_event(int id, const char *target, const char *filter, bool quiet)
{
const char *this_action = NULL;
const char *this_node = NULL;
crm_action_t *match = NULL;
GListPtr gIter = NULL;
GListPtr gIter2 = NULL;
gIter = transition_graph->synapses;
for (; gIter != NULL; gIter = gIter->next) {
synapse_t *synapse = (synapse_t *) gIter->data;
/* lookup event */
gIter2 = synapse->actions;
for (; gIter2 != NULL; gIter2 = gIter2->next) {
crm_action_t *action = (crm_action_t *) gIter2->data;
if (id > 0 && action->id == id) {
match = action;
break;
}
this_action = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
if (action->type != action_type_crm) {
continue;
} else if (safe_str_eq(this_action, CRM_OP_LRM_REFRESH)) {
continue;
} else if (filter != NULL && safe_str_neq(this_action, filter)) {
continue;
}
this_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
if (this_node == NULL) {
crm_log_xml_err(action->xml, "No node uuid");
}
if (safe_str_neq(this_node, target)) {
crm_debug("Action %d : Node mismatch: %s", action->id, this_node);
continue;
}
match = action;
id = action->id;
break;
}
if (match != NULL) {
/* stop this event's timer if it had one */
break;
}
}
if (match != NULL) {
/* stop this event's timer if it had one */
crm_debug("Match found for action %d: %s on %s", id,
crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target);
} else if (id > 0) {
crm_err("No match for action %d", id);
} else if(quiet == FALSE) {
crm_warn("No match for shutdown action on %s", target);
}
return match;
}
gboolean
process_graph_event(xmlNode * event, const char *event_node)
{
int rc = -1;
int status = -1;
int callid = -1;
int action = -1;
int target_rc = -1;
int transition_num = -1;
char *update_te_uuid = NULL;
gboolean stop_early = FALSE;
gboolean passed = FALSE;
const char *id = NULL;
const char *desc = NULL;
const char *magic = NULL;
CRM_ASSERT(event != NULL);
/*
<lrm_rsc_op id="rsc_east-05_last_0" operation_key="rsc_east-05_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" transition-key="9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" transition-magic="0:7;9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" call-id="17" rc-code="7" op-status="0" interval="0" last-run="1355361636" last-rc-change="1355361636" exec-time="128" queue-time="0" op-digest="c81f5f40b1c9e859c992e800b1aa6972"/>
*/
id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
crm_element_value_int(event, XML_LRM_ATTR_RC, &rc);
crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status);
crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid);
magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY);
if (magic == NULL) {
/* non-change */
return FALSE;
}
if (decode_transition_key(magic, &update_te_uuid, &transition_num, &action, &target_rc) ==
FALSE) {
crm_err("Invalid event %s.%d detected: %s", id, callid, magic);
abort_transition(INFINITY, tg_restart, "Bad event", event);
return FALSE;
}
if (status == PCMK_LRM_OP_PENDING) {
goto bail;
}
if (transition_num == -1) {
desc = "initiated outside of the cluster";
abort_transition(INFINITY, tg_restart, "Unexpected event", event);
} else if (action < 0 || crm_str_eq(update_te_uuid, te_uuid, TRUE) == FALSE) {
desc = "initiated by a different node";
abort_transition(INFINITY, tg_restart, "Foreign event", event);
stop_early = TRUE; /* This could be an lrm status refresh */
} else if (transition_graph->id != transition_num) {
desc = "arrived really late";
abort_transition(INFINITY, tg_restart, "Old event", event);
stop_early = TRUE; /* This could be an lrm status refresh */
} else if (transition_graph->complete) {
desc = "arrived late";
abort_transition(INFINITY, tg_restart, "Inactive graph", event);
- } else if (match_graph_event(action, event, event_node, status, rc, target_rc) < 0) {
+ } else if (match_graph_event(action, event, status, rc, target_rc) < 0) {
desc = "unknown";
abort_transition(INFINITY, tg_restart, "Unknown event", event);
} else if (rc == target_rc) {
passed = TRUE;
crm_trace("Processed update to %s: %s", id, magic);
}
if (passed == FALSE) {
if (update_failcount(event, event_node, rc, target_rc, transition_num == -1)) {
/* Turns out this wasn't an lrm status refresh update aferall */
stop_early = FALSE;
desc = "failed";
}
crm_info("Detected action (%d.%d) %s.%d=%s: %s", transition_num, action, id, callid,
services_ocf_exitcode_str(rc), desc);
}
bail:
free(update_te_uuid);
return stop_early;
}

File Metadata

Mime Type
text/x-diff
Expires
Sat, Jan 25, 12:20 PM (20 h, 8 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1322538
Default Alt Text
(47 KB)

Event Timeline