diff --git a/crmd/te_actions.c b/crmd/te_actions.c index a62806fa7a..71d572dd02 100644 --- a/crmd/te_actions.c +++ b/crmd/te_actions.c @@ -1,562 +1,565 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *te_uuid = NULL; -int active_timeout = 0; -int stonith_op_active = 0; void send_rsc_command(crm_action_t *action); -extern crm_action_timer_t *transition_timer; static void -te_start_action_timer(crm_action_t *action) +te_start_action_timer(crm_graph_t *graph, crm_action_t *action) { crm_malloc0(action->timer, sizeof(crm_action_timer_t)); action->timer->timeout = action->timeout; - action->timer->reason = timeout_action_warn; + action->timer->reason = timeout_action; action->timer->action = action; action->timer->source_id = Gmain_timeout_add( - action->timer->timeout, + action->timer->timeout + graph->network_delay, action_timer_callback, (void*)action->timer); CRM_ASSERT(action->timer->source_id != 0); } static gboolean te_pseudo_action(crm_graph_t *graph, crm_action_t *pseudo) { crm_info("Pseudo action %d fired and confirmed", pseudo->id); pseudo->confirmed = TRUE; update_graph(graph, pseudo); trigger_graph(); return TRUE; } void send_stonith_update(stonith_ops_t * op) { enum cib_errors rc = cib_ok; const char *target = op->node_name; const char *uuid = op->node_uuid; /* zero out the node-status & remove all LRM status info */ xmlNode *node_state = create_xml_node(NULL, XML_CIB_TAG_STATE); CRM_CHECK(op->node_name != NULL, return); CRM_CHECK(op->node_uuid != NULL, return); crm_xml_add(node_state, XML_ATTR_UUID, uuid); crm_xml_add(node_state, XML_ATTR_UNAME, target); crm_xml_add(node_state, XML_CIB_ATTR_HASTATE, DEADSTATUS); crm_xml_add(node_state, XML_CIB_ATTR_INCCM, XML_BOOLEAN_NO); crm_xml_add(node_state, XML_CIB_ATTR_CRMDSTATE, OFFLINESTATUS); crm_xml_add(node_state, XML_CIB_ATTR_JOINSTATE, CRMD_JOINSTATE_DOWN); crm_xml_add(node_state, XML_CIB_ATTR_EXPSTATE, CRMD_JOINSTATE_DOWN); crm_xml_add(node_state, XML_ATTR_ORIGIN, __FUNCTION__); rc = fsa_cib_conn->cmds->update( fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, cib_quorum_override|cib_scope_local|cib_can_create); if(rc < cib_ok) { crm_err("CIB update failed: %s", cib_error2string(rc)); abort_transition( INFINITY, tg_shutdown, "CIB update failed", node_state); } else { /* delay processing the trigger until the update completes */ add_cib_op_callback(fsa_cib_conn, rc, FALSE, crm_strdup(target), cib_fencing_updated); } erase_status_tag(op->node_name, XML_CIB_TAG_LRM); erase_status_tag(op->node_name, XML_TAG_TRANSIENT_NODEATTRS); free_xml(node_state); + +#if 0 + /* Make sure the membership cache is accurate */ + crm_update_peer(0, 0, 0, -1, 0, uuid, target, NULL, CRM_NODE_LOST); +#endif return; } static gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action) { const char *id = NULL; const char *uuid = NULL; const char *target = NULL; const char *type = NULL; stonith_ops_t * st_op = NULL; id = ID(action->xml); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); type = crm_meta_value(action->params, "stonith_action"); CRM_CHECK(id != NULL, crm_log_xml_warn(action->xml, "BadAction"); return FALSE); CRM_CHECK(uuid != NULL, crm_log_xml_warn(action->xml, "BadAction"); return FALSE); CRM_CHECK(type != NULL, crm_log_xml_warn(action->xml, "BadAction"); return FALSE); CRM_CHECK(target != NULL, crm_log_xml_warn(action->xml, "BadAction"); return FALSE); te_log_action(LOG_INFO, "Executing %s fencing operation (%s) on %s (timeout=%d)", type, id, target, transition_graph->stonith_timeout); /* Passing NULL means block until we can connect... */ te_connect_stonith(NULL); crm_malloc0(st_op, sizeof(stonith_ops_t)); if(safe_str_eq(type, "poweroff")) { st_op->optype = POWEROFF; } else { st_op->optype = RESET; } st_op->timeout = transition_graph->stonith_timeout; st_op->node_name = crm_strdup(target); st_op->node_uuid = crm_strdup(uuid); st_op->private_data = generate_transition_key( transition_graph->id, action->id, 0, te_uuid); CRM_ASSERT(stonithd_input_IPC_channel() != NULL); if (ST_OK != stonithd_node_fence( st_op )) { crm_err("Cannot fence %s: stonithd_node_fence() call failed ", target); - - } else { - if(stonith_op_active == 0) { - crm_info("Storing current transition timeout (%d) during stonith op", - transition_graph->transition_timeout); - active_timeout = transition_graph->transition_timeout; - transition_graph->transition_timeout = 0; - } - stonith_op_active++; } return TRUE; } static int get_target_rc(crm_action_t *action) { const char *target_rc_s = crm_meta_value(action->params, XML_ATTR_TE_TARGET_RC); if(target_rc_s != NULL) { return crm_parse_int(target_rc_s, "0"); } return 0; } static gboolean te_crm_command(crm_graph_t *graph, crm_action_t *action) { char *counter = NULL; - xmlNode *cmd = NULL; + xmlNode *cmd = NULL; + gboolean is_local = FALSE; const char *id = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; - gboolean ret = TRUE; + gboolean rc = TRUE; id = ID(action->xml); task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); CRM_CHECK(on_node != NULL && strlen(on_node) != 0, te_log_action(LOG_ERR, "Corrupted command (id=%s) %s: no node", crm_str(id), crm_str(task)); return FALSE); te_log_action(LOG_INFO, "Executing crm-event (%s): %s on %s", crm_str(id), crm_str(task), on_node); - if(safe_str_eq(on_node, fsa_our_uname) - && safe_str_eq(task, CRM_OP_SHUTDOWN)) { + if(safe_str_eq(on_node, fsa_our_uname)) { + is_local = TRUE; + } + + if(is_local && safe_str_eq(task, CRM_OP_SHUTDOWN)) { /* defer until everything else completes */ te_log_action(LOG_INFO, "crm-event (%s) is a local shutdown", crm_str(id)); graph->completion_action = tg_shutdown; graph->abort_reason = "local shutdown"; action->confirmed = TRUE; update_graph(graph, action); trigger_graph(); return TRUE; } cmd = create_request(task, NULL, on_node, CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL); counter = generate_transition_key( transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(cmd, XML_ATTR_TRANSITION_KEY, counter); - ret = send_cluster_message(on_node, crm_msg_crmd, cmd, TRUE); + + rc = send_cluster_message(on_node, crm_msg_crmd, cmd, TRUE); crm_free(counter); free_xml(cmd); value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); - if(ret == FALSE) { + if(rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if(crm_is_true(value)) { crm_info("Skipping wait for %d", action->id); action->confirmed = TRUE; update_graph(graph, action); trigger_graph(); - } else if(ret && action->timeout > 0) { - crm_debug("Setting timer for action %d",action->id); - action->timer->reason = timeout_action_warn; - te_start_action_timer(action); + } else { + if(action->timeout <= 0) { + crm_err("Action %d: %s on %s had an invalid timeout (%dms). Using %dms instead", + action->id, task, on_node, action->timeout, graph->network_delay); + action->timeout = graph->network_delay; + } + te_start_action_timer(graph, action); } return TRUE; } -static gboolean -te_rsc_command(crm_graph_t *graph, crm_action_t *action) -{ - /* never overwrite stop actions in the CIB with - * anything other than completed results - * - * Writing pending stops makes it look like the - * resource is running again - */ - const char *task = NULL; - const char *on_node = NULL; - action->executed = FALSE; - - on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); - CRM_CHECK(on_node != NULL && strlen(on_node) != 0, - te_log_action(LOG_ERR, "Corrupted command(id=%s) %s: no node", - ID(action->xml), crm_str(task)); - return FALSE); - - send_rsc_command(action); - return TRUE; -} - gboolean cib_action_update(crm_action_t *action, int status, int op_rc) { char *op_id = NULL; char *code = NULL; char *digest = NULL; xmlNode *tmp = NULL; xmlNode *params = NULL; xmlNode *state = NULL; xmlNode *rsc = NULL; xmlNode *xml_op = NULL; xmlNode *action_rsc = NULL; enum cib_errors rc = cib_ok; const char *name = NULL; const char *value = NULL; const char *rsc_id = NULL; const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); const char *target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); int call_options = cib_quorum_override|cib_scope_local; if(status == LRM_OP_PENDING) { crm_debug("%s %d: Recording pending operation %s on %s", crm_element_name(action->xml), action->id, task_uuid, target); } else { crm_warn("%s %d: %s on %s timed out", crm_element_name(action->xml), action->id, task_uuid, target); } action_rsc = find_xml_node(action->xml, XML_CIB_TAG_RESOURCE, TRUE); if(action_rsc == NULL) { return FALSE; } rsc_id = ID(action_rsc); CRM_CHECK(rsc_id != NULL, crm_log_xml_err(action->xml, "Bad:action"); return FALSE); /* update the CIB */ state = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(state, XML_ATTR_UUID, target_uuid); crm_xml_add(state, XML_ATTR_UNAME, target); rsc = create_xml_node(state, XML_CIB_TAG_LRM); crm_xml_add(rsc, XML_ATTR_ID, target_uuid); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCES); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCE); crm_xml_add(rsc, XML_ATTR_ID, rsc_id); name = XML_ATTR_TYPE; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_CLASS; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_PROVIDER; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); xml_op = create_xml_node(rsc, XML_LRM_TAG_RSC_OP); crm_xml_add(xml_op, XML_ATTR_ID, task); op_id = generate_op_key(rsc_id, task, action->interval); crm_xml_add(xml_op, XML_ATTR_ID, op_id); crm_free(op_id); - crm_xml_add(xml_op, XML_LRM_ATTR_CALLID, "-1"); + crm_xml_add_int(xml_op, XML_LRM_ATTR_CALLID, -1); crm_xml_add(xml_op, XML_LRM_ATTR_TASK, task); crm_xml_add(xml_op, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET); crm_xml_add_int(xml_op, XML_LRM_ATTR_OPSTATUS, status); crm_xml_add_int(xml_op, XML_LRM_ATTR_INTERVAL, action->interval); crm_xml_add_int(xml_op, XML_LRM_ATTR_RC, op_rc); crm_xml_add(xml_op, XML_ATTR_ORIGIN, __FUNCTION__); code = generate_transition_key( transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(xml_op, XML_ATTR_TRANSITION_KEY, code); crm_free(code); code = generate_transition_magic( crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY), status, op_rc); crm_xml_add(xml_op, XML_ATTR_TRANSITION_MAGIC, code); crm_free(code); tmp = find_xml_node(action->xml, "attributes", TRUE); params = create_xml_node(NULL, XML_TAG_PARAMS); copy_in_properties(params, tmp); filter_action_parameters(params, CRM_FEATURE_SET); digest = calculate_xml_digest(params, TRUE, FALSE); /* info for now as this area has been problematic to debug */ crm_debug("Calculated digest %s for %s (%s)\n", digest, ID(xml_op), crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC)); crm_log_xml(LOG_DEBUG, "digest:source", params); crm_xml_add(xml_op, XML_LRM_ATTR_OP_DIGEST, digest); crm_free(digest); free_xml(params); crm_debug_3("Updating CIB with \"%s\" (%s): %s %s on %s", status<0?"new action":XML_ATTR_TIMEOUT, crm_element_name(action->xml), crm_str(task), rsc_id, target); rc = fsa_cib_conn->cmds->update( fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); - crm_debug("Updating CIB with %s action %d: %s on %s (call_id=%d)", + crm_debug_2("Updating CIB with %s action %d: %s on %s (call_id=%d)", op_status2text(status), action->id, task_uuid, target, rc); add_cib_op_callback(fsa_cib_conn, rc, FALSE, NULL, cib_action_updated); free_xml(state); action->sent_update = TRUE; if(rc < cib_ok) { return FALSE; } return TRUE; } -static void update_transition_timer(crm_action_t *action) -{ - int action_timeout = action->timeout + transition_graph->network_delay; - int current_timeout = transition_graph->transition_timeout; - - if(stonith_op_active > 0) { - current_timeout = active_timeout; - } - - if(current_timeout < action_timeout) { - crm_debug("Action %d: Increasing transition %d timeout to %d (%d + %d)", - action->id, transition_graph->id, action_timeout, - action->timeout, transition_graph->network_delay); - - if(stonith_op_active > 0) { - active_timeout = action_timeout; - - } else { - transition_graph->transition_timeout = action_timeout; - } - } -} - -void -send_rsc_command(crm_action_t *action) +static gboolean +te_rsc_command(crm_graph_t *graph, crm_action_t *action) { + /* never overwrite stop actions in the CIB with + * anything other than completed results + * + * Writing pending stops makes it look like the + * resource is running again + */ xmlNode *cmd = NULL; xmlNode *rsc_op = NULL; - char *counter = NULL; + gboolean rc = TRUE; + gboolean is_local = FALSE; + + char *counter = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *task_uuid = NULL; CRM_ASSERT(action != NULL); CRM_ASSERT(action->xml != NULL); + action->executed = FALSE; + on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + + CRM_CHECK(on_node != NULL && strlen(on_node) != 0, + te_log_action(LOG_ERR, "Corrupted command(id=%s) %s: no node", + ID(action->xml), crm_str(task)); + return FALSE); + rsc_op = action->xml; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); on_node = crm_element_value(rsc_op, XML_LRM_ATTR_TARGET); counter = generate_transition_key( transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(rsc_op, XML_ATTR_TRANSITION_KEY, counter); - crm_info("Initiating action %d: %s %s on %s", - action->id, task, task_uuid, on_node); - - crm_free(counter); - - if(rsc_op != NULL) { - crm_log_xml_debug_2(rsc_op, "Performing"); + if(safe_str_eq(on_node, fsa_our_uname)) { + is_local = TRUE; } + + crm_info("Initiating action %d: %s %s on %s %s", + action->id, task, task_uuid, on_node, is_local?"(local)":""); + cmd = create_request(CRM_OP_INVOKE_LRM, rsc_op, on_node, CRM_SYSTEM_LRMD, CRM_SYSTEM_TENGINE, NULL); - send_cluster_message(on_node, crm_msg_lrmd, cmd, TRUE); + if(is_local) { + /* shortcut local resource commands */ + ha_msg_input_t data = { + .msg = cmd, + .xml = rsc_op, + }; + + fsa_data_t msg = { + .id = 0, + .data = &data, + .data_type = fsa_dt_ha_msg, + .fsa_input = I_NULL, + .fsa_cause = C_FSA_INTERNAL, + .actions = A_LRM_INVOKE, + .origin = __FUNCTION__, + }; + + crm_info("Performing %s locally", counter); + do_lrm_invoke(A_LRM_INVOKE, C_FSA_INTERNAL, fsa_state, I_NULL, &msg); + + } else { + rc = send_cluster_message(on_node, crm_msg_lrmd, cmd, TRUE); + } + + crm_free(counter); free_xml(cmd); action->executed = TRUE; value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); - if(crm_is_true(value)) { + if(rc == FALSE) { + crm_err("Action %d failed: send", action->id); + return FALSE; + + } else if(crm_is_true(value)) { crm_debug("Skipping wait for %d", action->id); action->confirmed = TRUE; update_graph(transition_graph, action); trigger_graph(); - } else if(action->timeout > 0) { - crm_debug_3("Setting timer for action %s", task_uuid); - update_transition_timer(action); - te_start_action_timer(action); + } else { + if(action->timeout <= 0) { + crm_err("Action %d: %s %s on %s had an invalid timeout (%dms). Using %dms instead", + action->id, task, task_uuid, on_node, action->timeout, graph->network_delay); + action->timeout = graph->network_delay; + } + te_start_action_timer(graph, action); } value = crm_meta_value(action->params, XML_OP_ATTR_PENDING); if(crm_is_true(value)) { /* write a "pending" entry to the CIB, inhibit notification */ crm_info("Recording pending op %s in the CIB", task_uuid); cib_action_update(action, LRM_OP_PENDING, EXECRA_STATUS_UNKNOWN); - } + } + + return TRUE; } crm_graph_functions_t te_graph_fns = { te_pseudo_action, te_rsc_command, te_crm_command, te_fence_node }; void notify_crmd(crm_graph_t *graph) { int log_level = LOG_DEBUG; const char *type = "unknown"; enum crmd_fsa_input event = I_NULL; crm_debug("Processing transition completion in state %s", fsa_state2string(fsa_state)); - stop_te_timer(transition_timer); CRM_CHECK(graph->complete, graph->complete = TRUE); switch(graph->completion_action) { case tg_stop: type = "stop"; /* fall through */ case tg_done: type = "done"; log_level = LOG_INFO; if(fsa_state == S_TRANSITION_ENGINE) { event = I_TE_SUCCESS; } break; case tg_restart: type = "restart"; if(fsa_state == S_TRANSITION_ENGINE) { event = I_PE_CALC; } else if(fsa_state == S_POLICY_ENGINE) { register_fsa_action(A_PE_INVOKE); } break; case tg_shutdown: type = "shutdown"; if(is_set(fsa_input_register, R_SHUTDOWN)) { event = I_STOP; } else { event = I_TERMINATE; } } te_log_action(log_level, "Transition %d status: %s - %s", graph->id, type, crm_str(graph->abort_reason)); graph->abort_reason = NULL; graph->completion_action = tg_done; clear_bit_inplace(fsa_input_register, R_IN_TRANSITION); if(event != I_NULL) { register_fsa_input(C_FSA_INTERNAL, event, NULL); } } diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c index c80789e722..a2e73642fe 100644 --- a/crmd/te_callbacks.c +++ b/crmd/te_callbacks.c @@ -1,578 +1,500 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include void te_update_confirm(const char *event, xmlNode *msg); extern char *te_uuid; gboolean shuttingdown = FALSE; crm_graph_t *transition_graph; GTRIGSource *transition_trigger = NULL; -crm_action_timer_t *transition_timer = NULL; /* #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */ #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']" static const char *get_node_id(xmlNode *rsc_op) { xmlNode *node = rsc_op; while(node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { node = node->parent; } CRM_CHECK(node != NULL, return NULL); return ID(node); } static void process_resource_updates(xmlXPathObject *xpathObj) { /* */ int lpc = 0, max = xpathObj->nodesetval->nodeNr; for(lpc = 0; lpc < max; lpc++) { xmlNode *rsc_op = getXpathResult(xpathObj, lpc); const char *node = get_node_id(rsc_op); process_graph_event(rsc_op, node); } } void te_update_diff(const char *event, xmlNode *msg) { int rc = -1; const char *op = NULL; xmlNode *diff = NULL; xmlNode *cib_top = NULL; xmlXPathObject *xpathObj = NULL; int diff_add_updates = 0; int diff_add_epoch = 0; int diff_add_admin_epoch = 0; int diff_del_updates = 0; int diff_del_epoch = 0; int diff_del_admin_epoch = 0; CRM_CHECK(msg != NULL, return); crm_element_value_int(msg, F_CIB_RC, &rc); if(transition_graph == NULL) { crm_debug_3("No graph"); return; } else if(rc < cib_ok) { crm_debug_3("Filter rc=%d (%s)", rc, cib_error2string(rc)); return; } else if(transition_graph->complete == TRUE && fsa_state != S_IDLE && fsa_state != S_TRANSITION_ENGINE && fsa_state != S_POLICY_ENGINE) { crm_debug_2("Filter state=%s, complete=%d", fsa_state2string(fsa_state), transition_graph->complete); return; } op = crm_element_value(msg, F_CIB_OPERATION); diff = get_message_xml(msg, F_CIB_UPDATE_RESULT); cib_diff_version_details( diff, &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates, &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates); crm_debug("Processing diff (%s): %d.%d.%d -> %d.%d.%d (%s)", op, diff_del_admin_epoch,diff_del_epoch,diff_del_updates, diff_add_admin_epoch,diff_add_epoch,diff_add_updates, fsa_state2string(fsa_state)); log_cib_diff(LOG_DEBUG_2, diff, op); /* Process anything that was added */ cib_top = get_xpath_object("//"F_CIB_UPDATE_RESULT"//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB, diff, LOG_ERR); if(need_abort(cib_top)) { goto bail; /* configuration changed */ } /* Process anything that was removed */ cib_top = get_xpath_object("//"F_CIB_UPDATE_RESULT"//"XML_TAG_DIFF_REMOVED"//"XML_TAG_CIB, diff, LOG_ERR); if(need_abort(cib_top)) { goto bail; /* configuration changed */ } /* Transient Attributes - Added/Updated */ xpathObj = xpath_search(diff,"//"F_CIB_UPDATE_RESULT"//"XML_TAG_DIFF_ADDED"//"XML_TAG_TRANSIENT_NODEATTRS); if(xpathObj && xpathObj->nodesetval->nodeNr > 0) { xmlNode *aborted = getXpathResult(xpathObj, 0); abort_transition(INFINITY, tg_restart, "Transient attribute: update", aborted); goto bail; } /* Transient Attributes - Removed */ xpathObj = xpath_search(diff,"//"F_CIB_UPDATE_RESULT"//"XML_TAG_DIFF_REMOVED"//"XML_TAG_TRANSIENT_NODEATTRS); if(xpathObj && xpathObj->nodesetval->nodeNr > 0) { xmlNode *aborted = getXpathResult(xpathObj, 0); abort_transition(INFINITY, tg_restart, "Transient attribute: removal", aborted); goto bail; } /* Check for node state updates... possibly from a shutdown we requested */ xpathObj = xpath_search(diff, "//"F_CIB_UPDATE_RESULT"//"XML_TAG_DIFF_ADDED"//"XML_CIB_TAG_STATE); if(xpathObj && xpathObj->nodesetval->nodeNr > 0) { int lpc = 0, max = xpathObj->nodesetval->nodeNr; for(lpc = 0; lpc < max; lpc++) { xmlNode *node = getXpathResult(xpathObj, lpc); const char *event_node = crm_element_value(node, XML_ATTR_ID); const char *ccm_state = crm_element_value(node, XML_CIB_ATTR_INCCM); const char *ha_state = crm_element_value(node, XML_CIB_ATTR_HASTATE); const char *shutdown_s = crm_element_value(node, XML_CIB_ATTR_SHUTDOWN); const char *crmd_state = crm_element_value(node, XML_CIB_ATTR_CRMDSTATE); if(safe_str_eq(ccm_state, XML_BOOLEAN_FALSE) || safe_str_eq(ha_state, DEADSTATUS) || safe_str_eq(crmd_state, CRMD_JOINSTATE_DOWN)) { crm_action_t *shutdown = match_down_event(0, event_node, NULL); if(shutdown != NULL) { const char *task = crm_element_value(shutdown->xml, XML_LRM_ATTR_TASK); if(safe_str_neq(task, CRM_OP_FENCE)) { /* Wait for stonithd to tell us it is complete via tengine_stonith_callback() */ update_graph(transition_graph, shutdown); trigger_graph(); } } else { crm_info("Stonith/shutdown of %s not matched", event_node); abort_transition(INFINITY, tg_restart, "Node failure", node); } fail_incompletable_actions(transition_graph, event_node); } if(shutdown_s) { int shutdown = crm_parse_int(shutdown_s, NULL); if(shutdown > 0) { crm_info("Aborting on "XML_CIB_ATTR_SHUTDOWN" attribute for %s", event_node); abort_transition(INFINITY, tg_restart, "Shutdown request", node); } } } xmlXPathFreeObject(xpathObj); xpathObj = NULL; } /* * Check for and fast-track the processing of LRM refreshes * In large clusters this can result in _huge_ speedups * * Unfortunately we can only do so when there are no pending actions * Otherwise we could miss updates we're waiting for and stall * */ xpathObj = NULL; if(transition_graph->pending == 0) { xpathObj = xpath_search(diff, "//"F_CIB_UPDATE_RESULT"//"XML_TAG_DIFF_ADDED"//"XML_LRM_TAG_RESOURCE); } if(xpathObj && xpathObj->nodesetval->nodeNr > 0) { int updates = xpathObj->nodesetval->nodeNr; xmlXPathFreeObject(xpathObj); xpathObj = NULL; if(updates > 1) { /* Updates by, or in response to, TE actions will never contain updates * for more than one resource at a time */ crm_info("Detected LRM refresh - %d resources updated: Skipping all resource events", updates); abort_transition(INFINITY, tg_restart, "LRM Refresh", diff); goto bail; } } /* Process operation updates */ xpathObj = xpath_search(diff, "//"F_CIB_UPDATE_RESULT"//"XML_TAG_DIFF_ADDED"//"XML_LRM_TAG_RSC_OP); if(xpathObj && xpathObj->nodesetval->nodeNr > 0) { process_resource_updates(xpathObj); xmlXPathFreeObject(xpathObj); } /* Detect deleted (as opposed to replaced or added) actions - eg. crm_resource -C */ xpathObj = xpath_search(diff, "//"XML_TAG_DIFF_REMOVED"//"XML_LRM_TAG_RSC_OP); if(xpathObj) { int lpc = 0, max = xpathObj->nodesetval->nodeNr; for(lpc = 0; lpc < max; lpc++) { int max = 0; const char *op_id = NULL; char *rsc_op_xpath = NULL; xmlXPathObject *op_match = NULL; xmlNode *match = getXpathResult(xpathObj, lpc); CRM_CHECK(match != NULL, continue); op_id = ID(match); max = strlen(rsc_op_template) + strlen(op_id) + 1; crm_malloc0(rsc_op_xpath, max); snprintf(rsc_op_xpath, max, rsc_op_template, op_id); op_match = xpath_search(diff, rsc_op_xpath); if(op_match && op_match->nodesetval->nodeNr > 0) { /* XML deletion had a corresponding add */ xmlXPathFreeObject(op_match); } else { /* Prevent false positives by matching cancelations too */ const char *node = get_node_id(match); crm_action_t *cancelled = get_cancel_action(op_id, node); if(cancelled == NULL) { crm_info("No match for deleted action %s (%s on %s)", rsc_op_xpath, op_id, node); abort_transition(INFINITY, tg_restart, "Resource op removal", match); goto bail; } else { crm_debug("Deleted lrm_rsc_op %s on %s was for graph event %d", op_id, node, cancelled->id); } } crm_free(rsc_op_xpath); } } bail: if(xpathObj) { xmlXPathFreeObject(xpathObj); } } gboolean process_te_message(xmlNode *msg, xmlNode *xml_data) { const char *from = crm_element_value(msg, F_ORIG); const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); const char *ref = crm_element_value(msg, XML_ATTR_REFERENCE); const char *op = crm_element_value(msg, F_CRM_TASK); const char *type = crm_element_value(msg, F_CRM_MSG_TYPE); crm_debug_2("Processing %s (%s) message", op, ref); crm_log_xml(LOG_DEBUG_3, "ipc", msg); if(op == NULL){ /* error */ } else if(sys_to == NULL || strcasecmp(sys_to, CRM_SYSTEM_TENGINE) != 0) { crm_debug_2("Bad sys-to %s", crm_str(sys_to)); return FALSE; } else if(safe_str_eq(op, CRM_OP_INVOKE_LRM) && safe_str_eq(sys_from, CRM_SYSTEM_LRMD) /* && safe_str_eq(type, XML_ATTR_RESPONSE) */ ){ xmlXPathObject *xpathObj = NULL; crm_log_xml(LOG_DEBUG_2, "Processing (N)ACK", msg); crm_info("Processing (N)ACK %s from %s", crm_element_value(msg, XML_ATTR_REFERENCE), from); xpathObj = xpath_search(xml_data, "//"XML_LRM_TAG_RSC_OP); if(xpathObj) { process_resource_updates(xpathObj); xmlXPathFreeObject(xpathObj); xpathObj = NULL; } else { crm_log_xml(LOG_ERR, "Invalid (N)ACK", msg); return FALSE; } } else { crm_err("Unknown command: %s::%s from %s", type, op, sys_from); } crm_debug_3("finished processing message"); return TRUE; } void tengine_stonith_callback(stonith_ops_t * op) { const char *allow_fail = NULL; int target_rc = -1; int stonith_id = -1; int transition_id = -1; char *uuid = NULL; crm_action_t *stonith_action = NULL; if(op == NULL) { crm_err("Called with a NULL op!"); return; } crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", op->call_id, op->optype, op->node_name, op->op_result, (char *)op->node_list, op->private_data); - - /* restore the orignal transition timeout */ - stonith_op_active--; - if(stonith_op_active == 0) { - crm_info("Restoring transition timeout: %d", active_timeout); - transition_graph->transition_timeout = active_timeout; - } /* this will mark the event complete if a match is found */ CRM_CHECK(op->private_data != NULL, return); /* filter out old STONITH actions */ CRM_CHECK(decode_transition_key( op->private_data, &uuid, &transition_id, &stonith_id, &target_rc), crm_err("Invalid event detected"); goto bail; ); if(transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid) || transition_graph->id != transition_id) { crm_info("Ignoring STONITH action initiated outside" " of the current transition"); } stonith_action = get_action(stonith_id, TRUE); if(stonith_action == NULL) { crm_err("Stonith action not matched"); goto bail; } switch(op->op_result) { case STONITH_SUCCEEDED: send_stonith_update(op); break; case STONITH_CANNOT: case STONITH_TIMEOUT: case STONITH_GENERIC: stonith_action->failed = TRUE; allow_fail = crm_meta_value(stonith_action->params, XML_ATTR_TE_ALLOWFAIL); if(FALSE == crm_is_true(allow_fail)) { crm_err("Stonith of %s failed (%d)..." " aborting transition.", op->node_name, op->op_result); abort_transition(INFINITY, tg_restart, "Stonith failed", NULL); } break; default: crm_err("Unsupported action result: %d", op->op_result); abort_transition(INFINITY, tg_restart, "Unsupport Stonith result", NULL); } update_graph(transition_graph, stonith_action); trigger_graph(); bail: crm_free(uuid); return; } void cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { if(rc < cib_ok) { crm_err("CIB update failed: %s", cib_error2string(rc)); crm_log_xml_warn(msg, "Failed update"); } crm_free(user_data); } void cib_action_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { if(rc < cib_ok) { crm_err("Update %d FAILED: %s", call_id, cib_error2string(rc)); } } void cib_failcount_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { if(rc < cib_ok) { crm_err("Update %d FAILED: %s", call_id, cib_error2string(rc)); } } gboolean action_timer_callback(gpointer data) { crm_action_timer_t *timer = NULL; - if(data == NULL) { - crm_err("Timer popped with no data"); - return FALSE; - } + CRM_CHECK(data != NULL, return FALSE); timer = (crm_action_timer_t*)data; stop_te_timer(timer); crm_warn("Timer popped (timeout=%d, abort_level=%d, complete=%s)", timer->timeout, transition_graph->abort_priority, transition_graph->complete?"true":"false"); CRM_CHECK(timer->action != NULL, return FALSE); if(transition_graph->complete) { crm_warn("Ignoring timeout while not in transition"); } else if(timer->reason == timeout_action_warn) { print_action( LOG_WARNING,"Action missed its timeout: ", timer->action); - } else { - /* fail the action */ - cib_action_update(timer->action, LRM_OP_TIMEOUT, EXECRA_UNKNOWN_ERROR); - } - - return FALSE; -} - - -static int -unconfirmed_actions(gboolean send_updates) -{ - int unconfirmed = 0; - const char *key = NULL; - const char *task = NULL; - const char *node = NULL; - - crm_debug_2("Unconfirmed actions..."); - slist_iter( - synapse, synapse_t, transition_graph->synapses, lpc, - - /* lookup event */ - slist_iter( - action, crm_action_t, synapse->actions, lpc2, - if(action->executed == FALSE) { - continue; - - } else if(action->confirmed) { - continue; - } - - unconfirmed++; - task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); - node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); - key = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); - - crm_info("Action %s %d unconfirmed from %s", - key, action->id, node); - if(action->type != action_type_rsc) { - continue; - } else if(send_updates == FALSE) { - continue; - } else if(safe_str_eq(task, "cancel")) { - /* we dont need to update the CIB with these */ - continue; - } else if(safe_str_eq(task, "stop")) { - /* *never* update the CIB with these */ - continue; - } - cib_action_update(action, LRM_OP_PENDING, EXECRA_STATUS_UNKNOWN); - ); - ); - if(unconfirmed > 0) { - crm_warn("Waiting on %d unconfirmed actions", unconfirmed); - } - return unconfirmed; -} - -gboolean -global_timer_callback(gpointer data) -{ - crm_action_timer_t *timer = NULL; - - if(data == NULL) { - crm_err("Timer popped with no data"); - return FALSE; - } - - timer = (crm_action_timer_t*)data; - stop_te_timer(timer); - - if(transition_graph == NULL) { - crm_err("No current graph"); - return FALSE; - } - - crm_warn("Timer popped (abort_level=%d, complete=%s)", - transition_graph->abort_priority, - transition_graph->complete?"true":"false"); - - CRM_CHECK(timer->action == NULL, return FALSE); - - if(fsa_state != S_TRANSITION_ENGINE && fsa_state != S_POLICY_ENGINE) { - crm_err("Discarding transition timeout in state: %s", fsa_state2string(fsa_state)); + } else if(fsa_state != S_TRANSITION_ENGINE && fsa_state != S_POLICY_ENGINE) { + crm_err("Discarding action timeout in state: %s", fsa_state2string(fsa_state)); } else if(transition_graph->complete) { - crm_err("Ignoring timeout while not in transition"); + crm_err("Ignoring action timeout while not in transition"); - } else if(timer->reason == timeout_abort) { - int unconfirmed = unconfirmed_actions(FALSE); - crm_warn("Transition abort timeout reached..." - " marking transition complete."); - - transition_graph->complete = TRUE; - abort_transition(INFINITY, tg_restart, "Global Timeout", NULL); - - if(unconfirmed != 0) { - crm_warn("Writing %d unconfirmed actions to the CIB", - unconfirmed); - unconfirmed_actions(TRUE); - } + } else { + /* fail the action */ + gboolean send_update = TRUE; + const char *task = crm_element_value(timer->action->xml, XML_LRM_ATTR_TASK); + print_action(LOG_ERR, "Aborting transition, action lost: ", timer->action); + + timer->action->failed = TRUE; + timer->action->confirmed = TRUE; + abort_transition(INFINITY, tg_restart, "Action lost", NULL); + + update_graph(transition_graph, timer->action); + trigger_graph(); + + if(timer->action->type != action_type_rsc) { + send_update = FALSE; + } else if(safe_str_eq(task, "cancel")) { + /* we dont need to update the CIB with these */ + send_update = FALSE; + } else if(safe_str_eq(task, "stop")) { + /* *never* update the CIB with these */ + send_update = FALSE; + } + + if(send_update) { + /* cib_action_update(timer->action, LRM_OP_PENDING, EXECRA_STATUS_UNKNOWN); */ + cib_action_update(timer->action, LRM_OP_TIMEOUT, EXECRA_UNKNOWN_ERROR); + } } - return FALSE; + + return FALSE; } diff --git a/crmd/te_events.c b/crmd/te_events.c index c3aed22430..38b71f04b0 100644 --- a/crmd/te_events.c +++ b/crmd/te_events.c @@ -1,500 +1,501 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include char *failed_stop_offset = NULL; char *failed_start_offset = NULL; int match_graph_event(int action_id, xmlNode *event, const char *event_node, int op_status, int op_rc, int target_rc); gboolean need_abort(xmlNode *update) { xmlNode *xml = NULL; if(update == NULL) { return FALSE; } xml_prop_iter(update, name, value, if(safe_str_eq(name, XML_ATTR_HAVE_QUORUM)) { goto do_abort; /* possibly not required */ } else if(safe_str_eq(name, XML_ATTR_GENERATION)) { goto do_abort; } else if(safe_str_eq(name, XML_ATTR_GENERATION_ADMIN)) { goto do_abort; } continue; do_abort: abort_transition(INFINITY, tg_restart, "Non-status change", NULL); crm_info("Aborting on change to %s", name); return TRUE; ); xml = get_object_root(XML_CIB_TAG_CONFIGURATION, update); if(xml != NULL) { abort_transition(INFINITY, tg_restart, "Non-status change", xml); return TRUE; } return FALSE; } gboolean fail_incompletable_actions(crm_graph_t *graph, const char *down_node) { const char *target = NULL; xmlNode *last_action = NULL; slist_iter( synapse, synapse_t, graph->synapses, lpc, if (synapse->confirmed) { continue; } slist_iter( action, crm_action_t, synapse->actions, lpc, if(action->type == action_type_pseudo || action->confirmed) { continue; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if(safe_str_eq(target, down_node)) { action->failed = TRUE; last_action = action->xml; update_graph(graph, action); crm_notice("Action %d (%s) is scheduled for %s (offline)", action->id, ID(action->xml), down_node); } ); ); if(last_action != NULL) { crm_warn("Node %s shutdown resulted in un-runnable actions", down_node); abort_transition(INFINITY, tg_restart, "Node failure", last_action); return TRUE; } return FALSE; } static gboolean update_failcount(xmlNode *event, const char *event_node, int rc, int target_rc, gboolean force) { int interval = 0; char *task = NULL; char *rsc_id = NULL; char *attr_name = NULL; const char *id = ID(event); const char *on_uuid = event_node; const char *value = NULL; if(rc == 99) { /* this is an internal code for "we're busy, try again" */ return FALSE; } else if(rc == target_rc) { return FALSE; } if(failed_stop_offset == NULL) { failed_stop_offset = crm_strdup(INFINITY_S); } if(failed_start_offset == NULL) { failed_start_offset = crm_strdup(INFINITY_S); } CRM_CHECK(on_uuid != NULL, return TRUE); CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval), crm_err("Couldn't parse: %s", ID(event)); goto bail); CRM_CHECK(task != NULL, goto bail); CRM_CHECK(rsc_id != NULL, goto bail); if(safe_str_eq(task, CRMD_ACTION_START)) { interval = 1; value = failed_start_offset; } else if(safe_str_eq(task, CRMD_ACTION_STOP)) { interval = 1; value = failed_stop_offset; } if(value == NULL || safe_str_neq(value, INFINITY_S)) { value = XML_NVPAIR_ATTR_VALUE"++"; } if(interval > 0 || force) { int call_id = 0; char *now = crm_itoa(time(NULL)); attr_name = crm_concat("fail-count", rsc_id, '-'); crm_warn("Updating failcount for %s on %s after failed %s:" " rc=%d (update=%s, time=%s)", rsc_id, on_uuid, task, rc, value, now); /* don't let notificatios of these updates cause new transitions */ call_id = update_attr(fsa_cib_conn, cib_inhibit_notify, XML_CIB_TAG_STATUS, on_uuid, NULL,NULL, attr_name, value, FALSE); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, cib_failcount_updated); crm_free(attr_name); attr_name = crm_concat("last-failure", rsc_id, '-'); /* don't let notificatios of these updates cause new transitions */ call_id = update_attr(fsa_cib_conn, cib_inhibit_notify, XML_CIB_TAG_STATUS, on_uuid, NULL,NULL, attr_name, now, FALSE); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, cib_failcount_updated); crm_free(attr_name); crm_free(now); } bail: crm_free(rsc_id); crm_free(task); return TRUE; } static int status_from_rc(crm_action_t *action, int orig_status, int rc, int target_rc) { int status = orig_status; if(target_rc == rc) { crm_debug_2("Target rc: == %d", rc); if(status != LRM_OP_DONE) { crm_debug_2("Re-mapping op status to" " LRM_OP_DONE for rc=%d", rc); status = LRM_OP_DONE; } } else { status = LRM_OP_ERROR; } /* 99 is the code we use for direct nack's */ if(rc != 99 && status != LRM_OP_DONE) { const char *task, *uname; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_warn("Action %d (%s) on %s failed (target: %d vs. rc: %d): %s", action->id, task, uname, target_rc, rc, op_status2text(status)); } return status; } /* * returns the ID of the action if a match is found * returns -1 if a match was not found * returns -2 if a match was found but the action failed (and was * not allowed to) */ int match_graph_event(int action_id, xmlNode *event, const char *event_node, int op_status, int op_rc, int target_rc) { const char *target = NULL; const char *allow_fail = NULL; const char *this_event = NULL; crm_action_t *action = NULL; action = get_action(action_id, FALSE); if(action == NULL) { return -1; } op_status = status_from_rc(action, op_status, op_rc, target_rc); if(op_status != LRM_OP_DONE) { update_failcount(event, event_node, op_rc, target_rc, FALSE); } /* Process OP status */ switch(op_status) { case LRM_OP_PENDING: crm_debug("Ignoring pending operation"); return action->id; break; case LRM_OP_DONE: break; case LRM_OP_ERROR: case LRM_OP_TIMEOUT: case LRM_OP_NOTSUPPORTED: action->failed = TRUE; break; case LRM_OP_CANCELLED: /* do nothing?? */ crm_err("Dont know what to do for cancelled ops yet"); break; default: action->failed = TRUE; crm_err("Unsupported action result: %d", op_status); } /* stop this event's timer if it had one */ stop_te_timer(action->timer); action->confirmed = TRUE; update_graph(transition_graph, action); trigger_graph(); if(action->failed) { allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL); if(crm_is_true(allow_fail)) { action->failed = FALSE; } } if(action->failed) { abort_transition(action->synapse->priority+1, tg_restart, "Event failed", event); } this_event = ID(event); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); te_log_action(LOG_INFO, "Action %s (%d) confirmed on %s (rc=%d)", crm_str(this_event), action->id, crm_str(target), op_status); return action->id; } crm_action_t * get_action(int id, gboolean confirmed) { slist_iter( synapse, synapse_t, transition_graph->synapses, lpc, slist_iter( action, crm_action_t, synapse->actions, lpc2, if(action->id == id) { if(confirmed) { stop_te_timer(action->timer); action->confirmed = TRUE; } return action; } ) ); return NULL; } crm_action_t * get_cancel_action(const char *id, const char *node) { const char *task = NULL; const char *target = NULL; slist_iter( synapse, synapse_t, transition_graph->synapses, lpc, slist_iter( action, crm_action_t, synapse->actions, lpc2, task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if(safe_str_neq(CRMD_ACTION_CANCEL, task)) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); if(safe_str_neq(task, id)) { continue; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if(safe_str_neq(target, node)) { continue; } return action; ); ); return NULL; } crm_action_t * match_down_event(int id, const char *target, const char *filter) { const char *this_action = NULL; const char *this_node = NULL; crm_action_t *match = NULL; slist_iter( synapse, synapse_t, transition_graph->synapses, lpc, /* lookup event */ slist_iter( action, crm_action_t, synapse->actions, lpc2, if(id > 0 && action->id == id) { match = action; break; } this_action = crm_element_value( action->xml, XML_LRM_ATTR_TASK); if(action->type != action_type_crm) { continue; } else if(safe_str_eq(this_action, CRM_OP_LRM_REFRESH)){ continue; } else if(filter != NULL && safe_str_neq(this_action, filter)) { continue; } this_node = crm_element_value( action->xml, XML_LRM_ATTR_TARGET_UUID); if(this_node == NULL) { crm_log_xml_err(action->xml, "No node uuid"); } if(safe_str_neq(this_node, target)) { crm_debug("Action %d : Node mismatch: %s", action->id, this_node); continue; } match = action; break; ); if(match != NULL) { /* stop this event's timer if it had one */ break; } ); if(match != NULL) { /* stop this event's timer if it had one */ crm_debug("Match found for action %d: %s on %s", id, crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target); stop_te_timer(match->timer); match->confirmed = TRUE; } else if(id > 0) { crm_err("No match for action %d", id); } else { crm_warn("No match for shutdown action on %s", target); } return match; } gboolean process_graph_event(xmlNode *event, const char *event_node) { int rc = -1; int status = -1; int action = -1; int target_rc = -1; int transition_num = -1; char *update_te_uuid = NULL; gboolean stop_early = FALSE; gboolean passed = FALSE; const char *id = NULL; const char *magic = NULL; CRM_ASSERT(event != NULL); id = ID(event); magic = crm_element_value(event, XML_ATTR_TRANSITION_MAGIC); if(magic == NULL) { /* non-change */ return FALSE; } CRM_CHECK(decode_transition_magic( magic, &update_te_uuid, &transition_num, &action, &status, &rc, &target_rc), crm_err("Invalid event %s detected", id); abort_transition(INFINITY, tg_restart,"Bad event", event); + return FALSE; ); if(status == LRM_OP_PENDING) { goto bail; } if(transition_num == -1) { crm_err("Action %s (%s) initiated outside of a transition", id, magic); abort_transition(INFINITY, tg_restart,"Unexpected event",event); } else if(action < 0 || crm_str_eq(update_te_uuid, te_uuid, TRUE) == FALSE) { - crm_info("Action %s (%s) initiated by a different transitioner", - id, magic); + crm_info("Action %s/%d (%s) initiated by a different transitioner", + id, action, magic); abort_transition(INFINITY, tg_restart,"Foreign event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if(transition_graph->id != transition_num) { crm_info("Detected action %s from a different transition:" " %d vs. %d", id, transition_num, transition_graph->id); abort_transition(INFINITY, tg_restart,"Old event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if(transition_graph->complete) { crm_info("Action %s arrived after a completed transition", id); abort_transition(INFINITY, tg_restart, "Inactive graph", event); } else if(match_graph_event( action, event, event_node, status, rc, target_rc) < 0) { crm_err("Unknown graph action %s", id); abort_transition(INFINITY, tg_restart, "Unknown event", event); } else { passed = TRUE; crm_debug_2("Processed update to %s: %s", id, magic); } if(passed == FALSE) { if(update_failcount(event, event_node, rc, target_rc, transition_num == -1)) { /* Turns out this wasn't an lrm status refresh update aferall */ stop_early = FALSE; } } bail: crm_free(update_te_uuid); return stop_early; } diff --git a/crmd/te_utils.c b/crmd/te_utils.c index 3bf59eaa4b..e0397cf62d 100644 --- a/crmd/te_utils.c +++ b/crmd/te_utils.c @@ -1,293 +1,297 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include GCHSource *stonith_src = NULL; GTRIGSource *stonith_reconnect = NULL; +static gboolean +fail_incompletable_stonith(crm_graph_t *graph) +{ + const char *task = NULL; + xmlNode *last_action = NULL; + + slist_iter( + synapse, synapse_t, graph->synapses, lpc, + if (synapse->confirmed) { + continue; + } + + slist_iter( + action, crm_action_t, synapse->actions, lpc, + + if(action->type != action_type_crm || action->confirmed) { + continue; + } + + task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); + if(task && safe_str_eq(task, CRM_OP_FENCE)) { + action->failed = TRUE; + last_action = action->xml; + update_graph(graph, action); + crm_notice("Failing action %d (%s): STONITHd terminated", + action->id, ID(action->xml)); + } + ); + ); + + if(last_action != NULL) { + crm_warn("STONITHd failure resulted in un-runnable actions"); + abort_transition(INFINITY, tg_restart, "Stonith failure", last_action); + return TRUE; + } + + return FALSE; +} + static void tengine_stonith_connection_destroy(gpointer user_data) { if(stonith_src == NULL) { crm_info("Fencing daemon disconnected"); } else { crm_crit("Fencing daemon connection failed"); G_main_set_trigger(stonith_reconnect); } - stonith_op_active = 0; - if(transition_graph) { - transition_graph->transition_timeout = active_timeout; - - crm_info("Restoring transition timeout: %d", - transition_graph->transition_timeout); - } - /* cbchan will be garbage at this point, arrange for it to be reset */ set_stonithd_input_IPC_channel_NULL(); stonith_src = NULL; + + fail_incompletable_stonith(transition_graph); + trigger_graph(); return; } static gboolean tengine_stonith_dispatch(IPC_Channel *sender, void *user_data) { while(stonithd_op_result_ready()) { if (sender->ch_status != IPC_CONNECT) { /* The message which was pending for us is that * the IPC status is now IPC_DISCONNECT */ break; } if(ST_FAIL == stonithd_receive_ops_result(FALSE)) { crm_err("stonithd_receive_ops_result() failed"); } } if (sender->ch_status != IPC_CONNECT) { tengine_stonith_connection_destroy(NULL); return FALSE; } return TRUE; } gboolean te_connect_stonith(gpointer user_data) { int lpc = 0; int rc = ST_OK; IPC_Channel *fence_ch = NULL; if(stonith_src != NULL) { crm_debug_2("Still connected"); return TRUE; } for(lpc = 0; lpc < 30; lpc++) { crm_info("Attempting connection to fencing daemon..."); sleep(1); rc = stonithd_signon("tengine"); if(rc == ST_OK) { break; } if(user_data != NULL) { crm_err("Sign-in failed: triggered a retry"); G_main_set_trigger(stonith_reconnect); return TRUE; } crm_err("Sign-in failed: pausing and trying again in 2s..."); sleep(1); } CRM_ASSERT(rc == ST_OK); /* If not, we failed 30 times... just get out */ CRM_ASSERT(stonithd_set_stonith_ops_callback( tengine_stonith_callback) == ST_OK); crm_debug_2("Grabbing IPC channel"); fence_ch = stonithd_input_IPC_channel(); CRM_ASSERT(fence_ch != NULL); crm_debug_2("Attaching to mainloop"); stonith_src = G_main_add_IPC_Channel( G_PRIORITY_LOW, fence_ch, FALSE, tengine_stonith_dispatch, NULL, tengine_stonith_connection_destroy); CRM_ASSERT(stonith_src != NULL); crm_info("Connected"); return TRUE; } -gboolean -start_global_timer(crm_action_timer_t *timer, int timeout) -{ - CRM_ASSERT(timer != NULL); - CRM_CHECK(timer->source_id == 0, return FALSE); - - if(stonith_op_active > 0) { - crm_debug("Skipping transition timer while stonith op is active"); - - } else if(timeout <= 0) { - crm_err("Tried to start timer with period: %d", timeout); - - } else if(timer->source_id == 0) { - crm_debug_2("Starting abort timer: %dms", timeout); - timer->timeout = timeout; - timer->source_id = Gmain_timeout_add( - timeout, global_timer_callback, (void*)timer); - CRM_ASSERT(timer->source_id != 0); - return TRUE; - - } else { - crm_err("Timer is already active with period: %d", timer->timeout); - } - - return FALSE; -} - gboolean stop_te_timer(crm_action_timer_t *timer) { const char *timer_desc = "action timer"; if(timer == NULL) { return FALSE; } if(timer->reason == timeout_abort) { timer_desc = "global timer"; crm_debug_2("Stopping %s", timer_desc); } if(timer->source_id != 0) { crm_debug_2("Stopping %s", timer_desc); Gmain_timeout_remove(timer->source_id); timer->source_id = 0; } else { crm_debug_2("%s was already stopped", timer_desc); return FALSE; } return TRUE; } gboolean te_graph_trigger(gpointer user_data) { enum transition_status graph_rc = -1; crm_debug_2("Invoking graph %d in state %s", transition_graph->id, fsa_state2string(fsa_state)); switch(fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: return TRUE; break; default: break; } if(transition_graph->complete == FALSE) { graph_rc = run_graph(transition_graph); print_graph(LOG_DEBUG_3, transition_graph); if(graph_rc == transition_active) { crm_debug_3("Transition not yet complete"); - stop_te_timer(transition_timer); - start_global_timer( - transition_timer, transition_graph->transition_timeout); return TRUE; } else if(graph_rc == transition_pending) { crm_debug_3("Transition not yet complete - no actions fired"); return TRUE; } if(graph_rc != transition_complete) { crm_err("Transition failed: %s", transition_status(graph_rc)); print_graph(LOG_WARNING, transition_graph); } } crm_info("Transition %d is now complete", transition_graph->id); transition_graph->complete = TRUE; notify_crmd(transition_graph); return TRUE; } void trigger_graph_processing(const char *fn, int line) { G_main_set_trigger(transition_trigger); crm_debug_2("%s:%d - Triggered graph processing", fn, line); } void abort_transition_graph( int abort_priority, enum transition_action abort_action, const char *abort_text, xmlNode *reason, const char *fn, int line) { int log_level = LOG_INFO; const char *magic = NULL; CRM_CHECK(transition_graph != NULL, return); if(reason) { magic = crm_element_value(reason, XML_ATTR_TRANSITION_MAGIC); do_crm_log(log_level, "%s:%d - Triggered transition abort (complete=%d, tag=%s, id=%s, magic=%s) : %s", fn, line, transition_graph->complete, TYPE(reason), ID(reason), magic?magic:"NA", abort_text); } else { do_crm_log(log_level, "%s:%d - Triggered transition abort (complete=%d) : %s", fn, line, transition_graph->complete, abort_text); } switch(fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: do_crm_log(log_level, "Abort suppressed: state=%s (complete=%d)", fsa_state2string(fsa_state), transition_graph->complete); return; default: break; } if(magic == NULL) { crm_log_xml(log_level+1, "Cause", reason); } if(transition_graph->complete) { register_fsa_input(C_FSA_INTERNAL, I_PE_CALC, NULL); return; } update_abort_priority( transition_graph, abort_priority, abort_action, abort_text); G_main_set_trigger(transition_trigger); } diff --git a/crmd/tengine.c b/crmd/tengine.c index b26d3cff08..c1759aad14 100644 --- a/crmd/tengine.c +++ b/crmd/tengine.c @@ -1,279 +1,271 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include /* for access */ #include #include #include /* for calls to open */ #include /* for calls to open */ #include /* for calls to open */ #include /* for getpwuid */ #include /* for initgroups */ #include /* for getrlimit */ #include /* for getrlimit */ #include #include #include #include #include #include #include #include #include extern crm_graph_functions_t te_graph_fns; struct crm_subsystem_s *te_subsystem = NULL; static void global_cib_callback(const xmlNode *msg, int callid ,int rc, xmlNode *output) { } static crm_graph_t *create_blank_graph(void) { crm_graph_t *a_graph = unpack_graph(NULL, NULL); a_graph->complete = TRUE; a_graph->abort_reason = "DC Takeover"; a_graph->completion_action = tg_restart; return a_graph; } /* A_TE_START, A_TE_STOP, A_TE_RESTART */ void do_te_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { int dummy; gboolean init_ok = TRUE; cl_uuid_t new_uuid; char uuid_str[UU_UNPARSE_SIZEOF]; if(action & A_TE_STOP) { - stop_te_timer(transition_timer); if(transition_graph) { destroy_graph(transition_graph); transition_graph = NULL; } if(fsa_cib_conn && cib_ok != fsa_cib_conn->cmds->del_notify_callback( fsa_cib_conn, T_CIB_DIFF_NOTIFY, te_update_diff)) { crm_err("Could not set CIB notification callback"); init_ok = FALSE; } clear_bit_inplace(fsa_input_register, te_subsystem->flag_connected); crm_info("Transitioner is now inactive"); if(stonith_src) { GCHSource *source = stonith_src; crm_info("Disconnecting STONITH..."); stonith_src = NULL; /* so that we don't try to reconnect */ G_main_del_IPC_Channel(source); stonithd_signoff(); } } if((action & A_TE_START) == 0) { return; } else if(is_set(fsa_input_register, te_subsystem->flag_connected)) { crm_debug("The transitioner is already active"); return; } else if((action & A_TE_START) && cur_state == S_STOPPING) { crm_info("Ignoring request to start %s while shutting down", te_subsystem->name); return; } cl_uuid_generate(&new_uuid); cl_uuid_unparse(&new_uuid, uuid_str); te_uuid = crm_strdup(uuid_str); crm_info("Registering TE UUID: %s", te_uuid); if(transition_trigger == NULL) { transition_trigger = G_main_add_TriggerHandler( G_PRIORITY_LOW, te_graph_trigger, NULL, NULL); } if(stonith_reconnect == NULL) { stonith_reconnect = G_main_add_TriggerHandler( G_PRIORITY_LOW, te_connect_stonith, &dummy, NULL); } if(cib_ok != fsa_cib_conn->cmds->add_notify_callback( fsa_cib_conn, T_CIB_DIFF_NOTIFY, te_update_diff)) { crm_err("Could not set CIB notification callback"); init_ok = FALSE; } if(cib_EXISTS != fsa_cib_conn->cmds->add_notify_callback( fsa_cib_conn, T_CIB_DIFF_NOTIFY, te_update_diff)) { crm_err("Set duplicate CIB notification callback"); } if(cib_ok != fsa_cib_conn->cmds->set_op_callback(fsa_cib_conn, global_cib_callback)) { crm_err("Could not set CIB global callback"); init_ok = FALSE; } if(init_ok) { G_main_set_trigger(stonith_reconnect); set_graph_functions(&te_graph_fns); if(transition_graph) { destroy_graph(transition_graph); } /* create a blank one */ - transition_graph = create_blank_graph(); - crm_malloc0(transition_timer, sizeof(crm_action_timer_t)); - transition_timer->source_id = 0; - transition_timer->reason = timeout_abort; - transition_timer->action = NULL; - crm_debug("Transitioner is now active"); + transition_graph = create_blank_graph(); set_bit_inplace(fsa_input_register, te_subsystem->flag_connected); } } /* A_TE_INVOKE, A_TE_CANCEL */ void do_te_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { if(AM_I_DC == FALSE) { crm_err("Not DC: No need to invoke the TE (anymore): %s", fsa_action2string(action)); return; } else if(fsa_state != S_TRANSITION_ENGINE && (action & A_TE_INVOKE)) { crm_err("No need to invoke the TE (%s) in state %s", fsa_action2string(action), fsa_state2string(fsa_state)); return; } if(action & A_TE_CANCEL) { crm_debug("Cancelling the active Transition"); abort_transition(INFINITY, tg_restart, "Peer Cancelled", NULL); } else if(action & A_TE_HALT) { abort_transition(INFINITY, tg_stop, "Peer Halt", NULL); } else if(action & A_TE_INVOKE) { const char *value = NULL; xmlNode *graph_data = NULL; ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); const char *ref = crm_element_value(input->msg, XML_ATTR_REFERENCE); const char *graph_file = crm_element_value(input->msg, F_CRM_TGRAPH); const char *graph_input = crm_element_value(input->msg, F_CRM_TGRAPH_INPUT); if(graph_file == NULL && input->xml == NULL) { crm_log_xml_err(input->msg, "Bad command"); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return; } if(transition_graph->complete == FALSE) { crm_info("Another transition is already active"); abort_transition(INFINITY, tg_restart, "Transition Active", NULL); return; } - stop_te_timer(transition_timer); graph_data = input->xml; if(graph_data == NULL && graph_file != NULL) { graph_data = filename2xml(graph_file); } CRM_CHECK(graph_data != NULL, crm_err("Input raised by %s is invalid", msg_data->origin); crm_log_xml_err(input->msg, "Bad command"); return); destroy_graph(transition_graph); transition_graph = unpack_graph(graph_data, graph_input); CRM_CHECK(transition_graph != NULL, transition_graph = create_blank_graph(); return); crm_info("Processing graph %d (ref=%s) derived from %s", transition_graph->id, ref, graph_input); value = crm_element_value(graph_data, "failed-stop-offset"); if(value) { failed_stop_offset = crm_strdup(value); } value = crm_element_value(graph_data, "failed-start-offset"); if(value) { failed_start_offset = crm_strdup(value); } trigger_graph(); print_graph(LOG_DEBUG_2, transition_graph); if(graph_data != input->xml) { free_xml(graph_data); } } } #if 0 gboolean shuttingdown; gboolean tengine_shutdown(int nsig, gpointer unused) { shuttingdown = TRUE; abort_transition(INFINITY, tg_shutdown, "Shutdown", NULL); return TRUE; } gboolean te_stop(void) { destroy_graph(transition_graph); - crm_free(transition_timer); #if SUPPORT_HEARTBEAT if(is_heartbeat_cluster()) { stonithd_signoff(); } #endif crm_free(te_uuid); } #endif