diff --git a/crmd/te_actions.c b/crmd/te_actions.c index 07aaf0f180..e122a941c2 100644 --- a/crmd/te_actions.c +++ b/crmd/te_actions.c @@ -1,547 +1,698 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include +#include char *te_uuid = NULL; - +GHashTable *te_targets = NULL; void send_rsc_command(crm_action_t * action); +static void te_update_job_count(crm_action_t * action, int offset); static void te_start_action_timer(crm_graph_t * graph, crm_action_t * action) { action->timer = calloc(1, sizeof(crm_action_timer_t)); action->timer->timeout = action->timeout; action->timer->reason = timeout_action; action->timer->action = action; action->timer->source_id = g_timeout_add(action->timer->timeout + graph->network_delay, action_timer_callback, (void *)action->timer); CRM_ASSERT(action->timer->source_id != 0); } static gboolean te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) { crm_debug("Pseudo action %d fired and confirmed", pseudo->id); - pseudo->confirmed = TRUE; + te_action_confirmed(pseudo); update_graph(graph, pseudo); trigger_graph(); return TRUE; } void send_stonith_update(crm_action_t * action, const char *target, const char *uuid) { int rc = pcmk_ok; crm_node_t *peer = NULL; /* zero out the node-status & remove all LRM status info */ xmlNode *node_state = NULL; CRM_CHECK(target != NULL, return); CRM_CHECK(uuid != NULL, return); /* Make sure the membership and join caches are accurate */ peer = crm_get_peer_full(0, target, CRM_GET_PEER_CLUSTER | CRM_GET_PEER_REMOTE); CRM_CHECK(peer != NULL, return); if (peer->uuid == NULL) { crm_info("Recording uuid '%s' for node '%s'", uuid, target); peer->uuid = strdup(uuid); } crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL); crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0); crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN); crm_update_peer_join(__FUNCTION__, peer, crm_join_none); node_state = do_update_node_cib(peer, node_update_cluster | node_update_peer | node_update_join | node_update_expected, NULL, __FUNCTION__); /* Force our known ID */ crm_xml_add(node_state, XML_ATTR_UUID, uuid); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, cib_quorum_override | cib_scope_local | cib_can_create); /* Delay processing the trigger until the update completes */ crm_debug("Sending fencing update %d for %s", rc, target); fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated); /* Make sure it sticks */ /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ erase_status_tag(target, XML_CIB_TAG_LRM, cib_scope_local); erase_status_tag(target, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); free_xml(node_state); return; } static gboolean te_fence_node(crm_graph_t * graph, crm_action_t * action) { int rc = 0; const char *id = NULL; const char *uuid = NULL; const char *target = NULL; const char *type = NULL; gboolean invalid_action = FALSE; enum stonith_call_options options = st_opt_none; id = ID(action->xml); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); type = crm_meta_value(action->params, "stonith_action"); CRM_CHECK(id != NULL, invalid_action = TRUE); CRM_CHECK(uuid != NULL, invalid_action = TRUE); CRM_CHECK(type != NULL, invalid_action = TRUE); CRM_CHECK(target != NULL, invalid_action = TRUE); if (invalid_action) { crm_log_xml_warn(action->xml, "BadAction"); return FALSE; } crm_notice("Executing %s fencing operation (%s) on %s (timeout=%d)", type, id, target, transition_graph->stonith_timeout); /* Passing NULL means block until we can connect... */ te_connect_stonith(NULL); if (crmd_join_phase_count(crm_join_confirmed) == 1) { options |= st_opt_allow_suicide; } rc = stonith_api->cmds->fence(stonith_api, options, target, type, transition_graph->stonith_timeout / 1000, 0); stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000, st_opt_timeout_updates, generate_transition_key(transition_graph->id, action->id, 0, te_uuid), "tengine_stonith_callback", tengine_stonith_callback); return TRUE; } static int get_target_rc(crm_action_t * action) { const char *target_rc_s = crm_meta_value(action->params, XML_ATTR_TE_TARGET_RC); if (target_rc_s != NULL) { return crm_parse_int(target_rc_s, "0"); } return 0; } static gboolean te_crm_command(crm_graph_t * graph, crm_action_t * action) { char *counter = NULL; xmlNode *cmd = NULL; gboolean is_local = FALSE; const char *id = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *router_node = NULL; gboolean rc = TRUE; gboolean no_wait = FALSE; id = ID(action->xml); task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if (!router_node) { router_node = on_node; } CRM_CHECK(on_node != NULL && strlen(on_node) != 0, crm_err("Corrupted command (id=%s) %s: no node", crm_str(id), crm_str(task)); return FALSE); crm_info("Executing crm-event (%s): %s on %s%s%s", crm_str(id), crm_str(task), on_node, is_local ? " (local)" : "", no_wait ? " - no waiting" : ""); if (safe_str_eq(router_node, fsa_our_uname)) { is_local = TRUE; } value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); if (crm_is_true(value)) { no_wait = TRUE; } if (is_local && safe_str_eq(task, CRM_OP_SHUTDOWN)) { /* defer until everything else completes */ crm_info("crm-event (%s) is a local shutdown", crm_str(id)); graph->completion_action = tg_shutdown; graph->abort_reason = "local shutdown"; - action->confirmed = TRUE; + te_action_confirmed(action); update_graph(graph, action); trigger_graph(); return TRUE; } else if (safe_str_eq(task, CRM_OP_SHUTDOWN)) { crm_node_t *peer = crm_get_peer(0, router_node); crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN); } cmd = create_request(task, action->xml, router_node, CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL); counter = generate_transition_key(transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(cmd, XML_ATTR_TRANSITION_KEY, counter); rc = send_cluster_message(crm_get_peer(0, router_node), crm_msg_crmd, cmd, TRUE); free(counter); free_xml(cmd); if (rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if (no_wait) { - action->confirmed = TRUE; + te_action_confirmed(action); update_graph(graph, action); trigger_graph(); } else { if (action->timeout <= 0) { crm_err("Action %d: %s on %s had an invalid timeout (%dms). Using %dms instead", action->id, task, on_node, action->timeout, graph->network_delay); action->timeout = graph->network_delay; } te_start_action_timer(graph, action); } return TRUE; } gboolean cib_action_update(crm_action_t * action, int status, int op_rc) { lrmd_event_data_t *op = NULL; xmlNode *state = NULL; xmlNode *rsc = NULL; xmlNode *xml_op = NULL; xmlNode *action_rsc = NULL; int rc = pcmk_ok; const char *name = NULL; const char *value = NULL; const char *rsc_id = NULL; const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); const char *target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); int call_options = cib_quorum_override | cib_scope_local; int target_rc = get_target_rc(action); if (status == PCMK_LRM_OP_PENDING) { crm_debug("%s %d: Recording pending operation %s on %s", crm_element_name(action->xml), action->id, task_uuid, target); } else { crm_warn("%s %d: %s on %s timed out", crm_element_name(action->xml), action->id, task_uuid, target); } action_rsc = find_xml_node(action->xml, XML_CIB_TAG_RESOURCE, TRUE); if (action_rsc == NULL) { return FALSE; } rsc_id = ID(action_rsc); CRM_CHECK(rsc_id != NULL, crm_log_xml_err(action->xml, "Bad:action"); return FALSE); /* update the CIB */ state = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(state, XML_ATTR_UUID, target_uuid); crm_xml_add(state, XML_ATTR_UNAME, target); rsc = create_xml_node(state, XML_CIB_TAG_LRM); crm_xml_add(rsc, XML_ATTR_ID, target_uuid); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCES); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCE); crm_xml_add(rsc, XML_ATTR_ID, rsc_id); name = XML_ATTR_TYPE; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_CLASS; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_PROVIDER; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); op = convert_graph_action(NULL, action, status, op_rc); op->call_id = -1; op->user_data = generate_transition_key(transition_graph->id, action->id, target_rc, te_uuid); xml_op = create_operation_update(rsc, op, CRM_FEATURE_SET, target_rc, __FUNCTION__, LOG_INFO); lrmd_free_event(op); crm_trace("Updating CIB with \"%s\" (%s): %s %s on %s", status < 0 ? "new action" : XML_ATTR_TIMEOUT, crm_element_name(action->xml), crm_str(task), rsc_id, target); crm_log_xml_trace(xml_op, "Op"); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); crm_trace("Updating CIB with %s action %d: %s on %s (call_id=%d)", services_lrm_status_str(status), action->id, task_uuid, target, rc); fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated); free_xml(state); action->sent_update = TRUE; if (rc < pcmk_ok) { return FALSE; } return TRUE; } static gboolean te_rsc_command(crm_graph_t * graph, crm_action_t * action) { /* never overwrite stop actions in the CIB with * anything other than completed results * * Writing pending stops makes it look like the * resource is running again */ xmlNode *cmd = NULL; xmlNode *rsc_op = NULL; gboolean rc = TRUE; gboolean no_wait = FALSE; gboolean is_local = FALSE; char *counter = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *router_node = NULL; const char *task_uuid = NULL; CRM_ASSERT(action != NULL); CRM_ASSERT(action->xml != NULL); action->executed = FALSE; on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); CRM_CHECK(on_node != NULL && strlen(on_node) != 0, crm_err("Corrupted command(id=%s) %s: no node", ID(action->xml), crm_str(task)); return FALSE); rsc_op = action->xml; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); router_node = crm_element_value(rsc_op, XML_LRM_ATTR_ROUTER_NODE); if (!router_node) { router_node = on_node; } counter = generate_transition_key(transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(rsc_op, XML_ATTR_TRANSITION_KEY, counter); if (safe_str_eq(router_node, fsa_our_uname)) { is_local = TRUE; } value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); if (crm_is_true(value)) { no_wait = TRUE; } crm_notice("Initiating action %d: %s %s on %s%s%s", action->id, task, task_uuid, on_node, is_local ? " (local)" : "", no_wait ? " - no waiting" : ""); cmd = create_request(CRM_OP_INVOKE_LRM, rsc_op, router_node, CRM_SYSTEM_LRMD, CRM_SYSTEM_TENGINE, NULL); if (is_local) { /* shortcut local resource commands */ ha_msg_input_t data = { .msg = cmd, .xml = rsc_op, }; fsa_data_t msg = { .id = 0, .data = &data, .data_type = fsa_dt_ha_msg, .fsa_input = I_NULL, .fsa_cause = C_FSA_INTERNAL, .actions = A_LRM_INVOKE, .origin = __FUNCTION__, }; do_lrm_invoke(A_LRM_INVOKE, C_FSA_INTERNAL, fsa_state, I_NULL, &msg); } else { rc = send_cluster_message(crm_get_peer(0, router_node), crm_msg_lrmd, cmd, TRUE); } free(counter); free_xml(cmd); action->executed = TRUE; + if (rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if (no_wait) { crm_info("Action %d confirmed - no wait", action->id); - action->confirmed = TRUE; + action->confirmed = TRUE; /* Just mark confirmed. + * Don't bump the job count only to immediately decrement it + */ update_graph(transition_graph, action); trigger_graph(); } else { if (action->timeout <= 0) { crm_err("Action %d: %s %s on %s had an invalid timeout (%dms). Using %dms instead", action->id, task, task_uuid, on_node, action->timeout, graph->network_delay); action->timeout = graph->network_delay; } + te_update_job_count(action, 1); te_start_action_timer(graph, action); } value = crm_meta_value(action->params, XML_OP_ATTR_PENDING); if (crm_is_true(value) && safe_str_neq(task, CRMD_ACTION_CANCEL) && safe_str_neq(task, CRMD_ACTION_DELETE)) { /* write a "pending" entry to the CIB, inhibit notification */ crm_debug("Recording pending op %s in the CIB", task_uuid); cib_action_update(action, PCMK_LRM_OP_PENDING, PCMK_OCF_UNKNOWN); } return TRUE; } +struct te_peer_s +{ + char *name; + int jobs; +}; + +static void te_peer_free(gpointer p) +{ + struct te_peer_s *peer = p; + + free(peer->name); + free(peer); +} + +void te_reset_job_counts(void) +{ + GHashTableIter iter; + struct te_peer_s *peer = NULL; + + if(te_targets == NULL) { + te_targets = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, te_peer_free); + } + + g_hash_table_iter_init(&iter, te_targets); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & peer)) { + peer->jobs = 0; + } +} + +static void +te_update_job_count_on(const char *target, int offset) +{ + struct te_peer_s *r = NULL; + + if(target == NULL || te_targets == NULL) { + return; + } + + r = g_hash_table_lookup(te_targets, target); + if(r == NULL) { + r = calloc(1, sizeof(struct te_peer_s)); + r->name = strdup(target); + g_hash_table_insert(te_targets, r->name, r); + } + + r->jobs += offset; + crm_trace("jobs[%s] = %d", target, r->jobs); +} + +static void +te_update_job_count(crm_action_t * action, int offset) +{ + const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); + const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + + if (action->type != action_type_rsc || target == NULL) { + /* No limit on these */ + return; + } + + if (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) { + xmlNode *meta = get_xpath_object("//[@"XML_LRM_ATTR_MIGRATE_SOURCE"]", action->xml, LOG_ERR); + const char *t1 = crm_element_value(meta, XML_LRM_ATTR_MIGRATE_SOURCE); + const char *t2 = crm_element_value(meta, XML_LRM_ATTR_MIGRATE_TARGET); + + te_update_job_count_on(t1, offset); + te_update_job_count_on(t2, offset); + + } else { + + te_update_job_count_on(target, offset); + } +} + +static gboolean +te_should_perform_action_on(const char *action, const char *target) +{ + int limit = 0; + struct te_peer_s *r = NULL; + + if(target == NULL) { + /* No limit on these */ + return TRUE; + + } else if(te_targets == NULL) { + return FALSE; + } + + r = g_hash_table_lookup(te_targets, target); + limit = throttle_get_job_limit(target); + + if(r == NULL) { + r = calloc(1, sizeof(struct te_peer_s)); + r->name = strdup(target); + g_hash_table_insert(te_targets, r->name, r); + } + + if(limit <= r->jobs) { + crm_trace("Peer %s is over their job limit of %d (%d): deferring %s", + target, limit, r->jobs, action); + return FALSE; + } + return TRUE; +} + +static gboolean +te_should_perform_action(crm_graph_t * graph, crm_action_t * action) +{ + const char *target = NULL; + const char *id = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); + const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); + + if (action->type != action_type_rsc) { + /* No limit on these */ + return TRUE; + } + + if (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) { + target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE); + if(te_should_perform_action_on(id, target) == FALSE) { + return FALSE; + } + + target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET); + + } else { + target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + } + + return te_should_perform_action_on(id, target); +} + +void +te_action_confirmed(crm_action_t * action) +{ + const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + + if (action->confirmed == FALSE && action->type == action_type_rsc && target != NULL) { + te_update_job_count(action, -1); + } + action->confirmed = TRUE; +} + + crm_graph_functions_t te_graph_fns = { te_pseudo_action, te_rsc_command, te_crm_command, - te_fence_node + te_fence_node, + te_should_perform_action, }; void notify_crmd(crm_graph_t * graph) { const char *type = "unknown"; enum crmd_fsa_input event = I_NULL; crm_debug("Processing transition completion in state %s", fsa_state2string(fsa_state)); CRM_CHECK(graph->complete, graph->complete = TRUE); switch (graph->completion_action) { case tg_stop: type = "stop"; /* fall through */ case tg_done: type = "done"; if (fsa_state == S_TRANSITION_ENGINE) { event = I_TE_SUCCESS; } break; case tg_restart: type = "restart"; if (fsa_state == S_TRANSITION_ENGINE) { if (too_many_st_failures() == FALSE) { if (transition_timer->period_ms > 0) { crm_timer_stop(transition_timer); crm_timer_start(transition_timer); } else { event = I_PE_CALC; } } else { event = I_TE_SUCCESS; } } else if (fsa_state == S_POLICY_ENGINE) { register_fsa_action(A_PE_INVOKE); } break; case tg_shutdown: type = "shutdown"; if (is_set(fsa_input_register, R_SHUTDOWN)) { event = I_STOP; } else { crm_err("We didn't ask to be shut down, yet our" " PE is telling us too."); event = I_TERMINATE; } } crm_debug("Transition %d status: %s - %s", graph->id, type, crm_str(graph->abort_reason)); graph->abort_reason = NULL; graph->completion_action = tg_done; clear_bit(fsa_input_register, R_IN_TRANSITION); if (event != I_NULL) { register_fsa_input(C_FSA_INTERNAL, event, NULL); } else if (fsa_source) { mainloop_set_trigger(fsa_source); } } diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c index 4b8aac54b8..0151295134 100644 --- a/crmd/te_callbacks.c +++ b/crmd/te_callbacks.c @@ -1,556 +1,556 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include /* For ONLINESTATUS etc */ void te_update_confirm(const char *event, xmlNode * msg); extern char *te_uuid; gboolean shuttingdown = FALSE; crm_graph_t *transition_graph; crm_trigger_t *transition_trigger = NULL; /* #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */ #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']" static const char * get_node_id(xmlNode * rsc_op) { xmlNode *node = rsc_op; while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { node = node->parent; } CRM_CHECK(node != NULL, return NULL); return ID(node); } static void process_resource_updates(xmlXPathObject * xpathObj) { /* */ int lpc = 0, max = numXpathResults(xpathObj); for (lpc = 0; lpc < max; lpc++) { xmlNode *rsc_op = getXpathResult(xpathObj, lpc); const char *node = get_node_id(rsc_op); process_graph_event(rsc_op, node); } } void te_update_diff(const char *event, xmlNode * msg) { int lpc, max; int rc = -1; const char *op = NULL; xmlNode *diff = NULL; xmlXPathObject *xpathObj = NULL; int diff_add_updates = 0; int diff_add_epoch = 0; int diff_add_admin_epoch = 0; int diff_del_updates = 0; int diff_del_epoch = 0; int diff_del_admin_epoch = 0; CRM_CHECK(msg != NULL, return); crm_element_value_int(msg, F_CIB_RC, &rc); if (transition_graph == NULL) { crm_trace("No graph"); return; } else if (rc < pcmk_ok) { crm_trace("Filter rc=%d (%s)", rc, pcmk_strerror(rc)); return; } else if (transition_graph->complete == TRUE && fsa_state != S_IDLE && fsa_state != S_TRANSITION_ENGINE && fsa_state != S_POLICY_ENGINE) { crm_trace("Filter state=%s, complete=%d", fsa_state2string(fsa_state), transition_graph->complete); return; } op = crm_element_value(msg, F_CIB_OPERATION); diff = get_message_xml(msg, F_CIB_UPDATE_RESULT); cib_diff_version_details(diff, &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates, &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates); crm_debug("Processing diff (%s): %d.%d.%d -> %d.%d.%d (%s)", op, diff_del_admin_epoch, diff_del_epoch, diff_del_updates, diff_add_admin_epoch, diff_add_epoch, diff_add_updates, fsa_state2string(fsa_state)); log_cib_diff(LOG_DEBUG_2, diff, __FUNCTION__); if (cib_config_changed(NULL, NULL, &diff)) { abort_transition(INFINITY, tg_restart, "Non-status change", diff); goto bail; /* configuration changed */ } /* Tickets Attributes - Added/Updated */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_TICKETS); if (numXpathResults(xpathObj) > 0) { xmlNode *aborted = getXpathResult(xpathObj, 0); abort_transition(INFINITY, tg_restart, "Ticket attribute: update", aborted); goto bail; } freeXpathObject(xpathObj); /* Tickets Attributes - Removed */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_CIB_TAG_TICKETS); if (numXpathResults(xpathObj) > 0) { xmlNode *aborted = getXpathResult(xpathObj, 0); abort_transition(INFINITY, tg_restart, "Ticket attribute: removal", aborted); goto bail; } freeXpathObject(xpathObj); /* Transient Attributes - Added/Updated */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_TAG_TRANSIENT_NODEATTRS "//" XML_CIB_TAG_NVPAIR); max = numXpathResults(xpathObj); for (lpc = 0; lpc < max; lpc++) { xmlNode *attr = getXpathResult(xpathObj, lpc); const char *name = crm_element_value(attr, XML_NVPAIR_ATTR_NAME); const char *value = NULL; if (safe_str_eq(CRM_OP_PROBED, name)) { value = crm_element_value(attr, XML_NVPAIR_ATTR_VALUE); } if (crm_is_true(value) == FALSE) { abort_transition(INFINITY, tg_restart, "Transient attribute: update", attr); crm_log_xml_trace(attr, "Abort"); goto bail; } } freeXpathObject(xpathObj); /* Transient Attributes - Removed */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_TAG_TRANSIENT_NODEATTRS); if (numXpathResults(xpathObj) > 0) { xmlNode *aborted = getXpathResult(xpathObj, 0); abort_transition(INFINITY, tg_restart, "Transient attribute: removal", aborted); goto bail; } freeXpathObject(xpathObj); /* * Check for and fast-track the processing of LRM refreshes * In large clusters this can result in _huge_ speedups * * Unfortunately we can only do so when there are no pending actions * Otherwise we could miss updates we're waiting for and stall * */ xpathObj = NULL; if (transition_graph->pending == 0) { xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_LRM_TAG_RESOURCE); } max = numXpathResults(xpathObj); if (max > 1) { /* Updates by, or in response to, TE actions will never contain updates * for more than one resource at a time */ crm_debug("Detected LRM refresh - %d resources updated: Skipping all resource events", max); crm_log_xml_trace(diff, "lrm-refresh"); abort_transition(INFINITY, tg_restart, "LRM Refresh", NULL); goto bail; } freeXpathObject(xpathObj); /* Process operation updates */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_LRM_TAG_RSC_OP); if (numXpathResults(xpathObj)) { process_resource_updates(xpathObj); } freeXpathObject(xpathObj); /* Detect deleted (as opposed to replaced or added) actions - eg. crm_resource -C */ xpathObj = xpath_search(diff, "//" XML_TAG_DIFF_REMOVED "//" XML_LRM_TAG_RSC_OP); max = numXpathResults(xpathObj); for (lpc = 0; lpc < max; lpc++) { int path_max = 0; const char *op_id = NULL; char *rsc_op_xpath = NULL; xmlXPathObject *op_match = NULL; xmlNode *match = getXpathResult(xpathObj, lpc); CRM_CHECK(match != NULL, continue); op_id = ID(match); path_max = strlen(rsc_op_template) + strlen(op_id) + 1; rsc_op_xpath = calloc(1, path_max); snprintf(rsc_op_xpath, path_max, rsc_op_template, op_id); op_match = xpath_search(diff, rsc_op_xpath); if (numXpathResults(op_match) == 0) { /* Prevent false positives by matching cancelations too */ const char *node = get_node_id(match); crm_action_t *cancelled = get_cancel_action(op_id, node); if (cancelled == NULL) { crm_debug("No match for deleted action %s (%s on %s)", rsc_op_xpath, op_id, node); abort_transition(INFINITY, tg_restart, "Resource op removal", match); freeXpathObject(op_match); free(rsc_op_xpath); goto bail; } else { crm_debug("Deleted lrm_rsc_op %s on %s was for graph event %d", op_id, node, cancelled->id); } } freeXpathObject(op_match); free(rsc_op_xpath); } bail: freeXpathObject(xpathObj); } gboolean process_te_message(xmlNode * msg, xmlNode * xml_data) { const char *from = crm_element_value(msg, F_ORIG); const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); const char *ref = crm_element_value(msg, F_CRM_REFERENCE); const char *op = crm_element_value(msg, F_CRM_TASK); const char *type = crm_element_value(msg, F_CRM_MSG_TYPE); crm_trace("Processing %s (%s) message", op, ref); crm_log_xml_trace(msg, "ipc"); if (op == NULL) { /* error */ } else if (sys_to == NULL || strcasecmp(sys_to, CRM_SYSTEM_TENGINE) != 0) { crm_trace("Bad sys-to %s", crm_str(sys_to)); return FALSE; } else if (safe_str_eq(op, CRM_OP_INVOKE_LRM) && safe_str_eq(sys_from, CRM_SYSTEM_LRMD) /* && safe_str_eq(type, XML_ATTR_RESPONSE) */ ) { xmlXPathObject *xpathObj = NULL; crm_log_xml_trace(msg, "Processing (N)ACK"); crm_debug("Processing (N)ACK %s from %s", crm_element_value(msg, F_CRM_REFERENCE), from); xpathObj = xpath_search(xml_data, "//" XML_LRM_TAG_RSC_OP); if (numXpathResults(xpathObj)) { process_resource_updates(xpathObj); freeXpathObject(xpathObj); } else { crm_log_xml_err(msg, "Invalid (N)ACK"); freeXpathObject(xpathObj); return FALSE; } } else { crm_err("Unknown command: %s::%s from %s", type, op, sys_from); } crm_trace("finished processing message"); return TRUE; } GHashTable *stonith_failures = NULL; struct st_fail_rec { int count; int last_rc; }; gboolean too_many_st_failures(void) { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *value = NULL; if (stonith_failures == NULL) { return FALSE; } g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { if (value->count > 10) { crm_notice("Too many failures to fence %s (%d), giving up", key, value->count); return TRUE; } else if (value->last_rc == -ENODEV) { crm_notice("No devices found in cluster to fence %s, giving up", key); return TRUE; } } return FALSE; } void st_fail_count_reset(const char *target) { struct st_fail_rec *rec = NULL; if (stonith_failures) { rec = g_hash_table_lookup(stonith_failures, target); } if (rec) { rec->count = 0; rec->last_rc = 0; } } static void st_fail_count_increment(const char *target, int rc) { struct st_fail_rec *rec = NULL; if (stonith_failures == NULL) { stonith_failures = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, free); } rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count++; } else { rec = malloc(sizeof(struct st_fail_rec)); if(rec == NULL) { return; } rec->count = 1; g_hash_table_insert(stonith_failures, strdup(target), rec); } rec->last_rc = rc; } void tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) { char *uuid = NULL; int target_rc = -1; int stonith_id = -1; int transition_id = -1; crm_action_t *action = NULL; int call_id = data->call_id; int rc = data->rc; char *userdata = data->userdata; CRM_CHECK(userdata != NULL, return); crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, pcmk_strerror(rc), rc); if (AM_I_DC == FALSE) { return; } /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ /* op->call_id, op->optype, op->node_name, op->op_result, */ /* (char *)op->node_list, op->private_data); */ /* filter out old STONITH actions */ CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, &target_rc), crm_err("Invalid event detected"); goto bail; ); if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid) || transition_graph->id != transition_id) { crm_info("Ignoring STONITH action initiated outside of the current transition"); goto bail; } /* this will mark the event complete if a match is found */ action = get_action(stonith_id, FALSE); if (action == NULL) { crm_err("Stonith action not matched"); goto bail; } stop_te_timer(action->timer); if (rc == pcmk_ok) { const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); crm_debug("Stonith operation %d for %s passed", call_id, target); if (action->confirmed == FALSE) { - action->confirmed = TRUE; + te_action_confirmed(action); if (action->sent_update == FALSE) { send_stonith_update(action, target, uuid); } } st_fail_count_reset(target); } else { const char *target = crm_element_value_const(action->xml, XML_LRM_ATTR_TARGET); const char *allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL); action->failed = TRUE; if (crm_is_true(allow_fail) == FALSE) { crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", call_id, target, pcmk_strerror(rc)); abort_transition(INFINITY, tg_restart, "Stonith failed", NULL); } st_fail_count_increment(target, rc); } update_graph(transition_graph, action); trigger_graph(); bail: free(userdata); free(uuid); return; } void cib_fencing_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc < pcmk_ok) { crm_err("Fencing update %d for %s: failed - %s (%d)", call_id, (char *)user_data, pcmk_strerror(rc), rc); crm_log_xml_warn(msg, "Failed update"); abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL); } else { crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); } free(user_data); } void cib_action_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc < pcmk_ok) { crm_err("Update %d FAILED: %s", call_id, pcmk_strerror(rc)); } } gboolean action_timer_callback(gpointer data) { crm_action_timer_t *timer = NULL; CRM_CHECK(data != NULL, return FALSE); timer = (crm_action_timer_t *) data; stop_te_timer(timer); crm_warn("Timer popped (timeout=%d, abort_level=%d, complete=%s)", timer->timeout, transition_graph->abort_priority, transition_graph->complete ? "true" : "false"); CRM_CHECK(timer->action != NULL, return FALSE); if (transition_graph->complete) { crm_warn("Ignoring timeout while not in transition"); } else if (timer->reason == timeout_action_warn) { print_action(LOG_WARNING, "Action missed its timeout: ", timer->action); /* Don't check the FSA state * * We might also be in S_INTEGRATION or some other state waiting for this * action so we can close the transition and continue */ } else { /* fail the action */ gboolean send_update = TRUE; const char *task = crm_element_value(timer->action->xml, XML_LRM_ATTR_TASK); print_action(LOG_ERR, "Aborting transition, action lost: ", timer->action); timer->action->failed = TRUE; - timer->action->confirmed = TRUE; + te_action_confirmed(timer->action); abort_transition(INFINITY, tg_restart, "Action lost", NULL); update_graph(transition_graph, timer->action); trigger_graph(); if (timer->action->type != action_type_rsc) { send_update = FALSE; } else if (safe_str_eq(task, RSC_CANCEL)) { /* we dont need to update the CIB with these */ send_update = FALSE; } if (send_update) { cib_action_update(timer->action, PCMK_LRM_OP_TIMEOUT, PCMK_OCF_UNKNOWN_ERROR); } } return FALSE; } diff --git a/crmd/te_events.c b/crmd/te_events.c index 8174b98b54..d202eee75f 100644 --- a/crmd/te_events.c +++ b/crmd/te_events.c @@ -1,607 +1,607 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include char *failed_stop_offset = NULL; char *failed_start_offset = NULL; int match_graph_event(int action_id, xmlNode * event, const char *event_node, int op_status, int op_rc, int target_rc); gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node) { const char *target = NULL; xmlNode *last_action = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; if (graph == NULL || graph->complete) { return FALSE; } gIter = graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; if (synapse->confirmed) { continue; } gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (action->type == action_type_pseudo || action->confirmed) { continue; } else if (action->type == action_type_crm) { const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRM_OP_FENCE)) { continue; } } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (safe_str_eq(target, down_node)) { action->failed = TRUE; synapse->failed = TRUE; last_action = action->xml; stop_te_timer(action->timer); update_graph(graph, action); if (synapse->executed) { crm_notice("Action %d (%s) was pending on %s (offline)", action->id, ID(action->xml), down_node); } else { crm_notice("Action %d (%s) is scheduled for %s (offline)", action->id, ID(action->xml), down_node); } } } } if (last_action != NULL) { crm_warn("Node %s shutdown resulted in un-runnable actions", down_node); abort_transition(INFINITY, tg_restart, "Node failure", last_action); return TRUE; } return FALSE; } static const char * get_uname_from_event(xmlNode * event) { xmlNode *node = event; while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { node = node->parent; } CRM_CHECK(node != NULL, return NULL); return crm_element_value(node, XML_ATTR_UNAME); } static gboolean get_is_remote_from_event(xmlNode * event) { xmlNode *node = event; while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { node = node->parent; } CRM_CHECK(node != NULL, return FALSE); return crm_element_value(node, XML_NODE_IS_REMOTE) ? TRUE : FALSE; } static gboolean update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int target_rc, gboolean do_update) { int interval = 0; char *task = NULL; char *rsc_id = NULL; char *attr_name = NULL; const char *value = NULL; const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); const char *on_uname = get_uname_from_event(event); const char *origin = crm_element_value(event, XML_ATTR_ORIGIN); if (rc == 99) { /* this is an internal code for "we're busy, try again" */ return FALSE; } else if (rc == target_rc) { return FALSE; } if (safe_str_eq(origin, "build_active_RAs")) { crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh", id, rc, on_uname); return FALSE; } CRM_LOG_ASSERT(on_uname != NULL); if (on_uname == NULL) { return TRUE; } if (failed_stop_offset == NULL) { failed_stop_offset = strdup(INFINITY_S); } if (failed_start_offset == NULL) { failed_start_offset = strdup(INFINITY_S); } CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval), crm_err("Couldn't parse: %s", ID(event)); goto bail); CRM_CHECK(task != NULL, goto bail); CRM_CHECK(rsc_id != NULL, goto bail); if (do_update || interval > 0) { do_update = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_START)) { do_update = TRUE; value = failed_start_offset; } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { do_update = TRUE; value = failed_stop_offset; } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { do_update = TRUE; value = failed_stop_offset; } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { do_update = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { do_update = TRUE; } if (value == NULL || safe_str_neq(value, INFINITY_S)) { value = XML_NVPAIR_ATTR_VALUE "++"; } if (do_update) { char *now = crm_itoa(time(NULL)); gboolean is_remote_node = get_is_remote_from_event(event); crm_warn("Updating failcount for %s on %s after failed %s:" " rc=%d (update=%s, time=%s)", rsc_id, on_uname, task, rc, value, now); attr_name = crm_concat("fail-count", rsc_id, '-'); update_attrd(on_uname, attr_name, value, NULL, is_remote_node); free(attr_name); attr_name = crm_concat("last-failure", rsc_id, '-'); update_attrd(on_uname, attr_name, now, NULL, is_remote_node); free(attr_name); free(now); } bail: free(rsc_id); free(task); return TRUE; } static int status_from_rc(crm_action_t * action, int orig_status, int rc, int target_rc) { int status = orig_status; if (target_rc == rc) { crm_trace("Target rc: == %d", rc); if (status != PCMK_LRM_OP_DONE) { crm_trace("Re-mapping op status to" " PCMK_LRM_OP_DONE for rc=%d", rc); status = PCMK_LRM_OP_DONE; } } else { status = PCMK_LRM_OP_ERROR; } /* 99 is the code we use for direct nack's */ if (rc != 99 && status != PCMK_LRM_OP_DONE) { const char *task, *uname; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_warn("Action %d (%s) on %s failed (target: %d vs. rc: %d): %s", action->id, task, uname, target_rc, rc, services_lrm_status_str(status)); } return status; } static void process_remote_node_action(crm_action_t *action, xmlNode *event) { xmlNode *child = NULL; /* The whole point of this function is to detect when a remote-node * is integrated into the cluster, and abort the transition if that remote-node * was fenced earlier in the transition. This allows a new transition to be * generated so resources can be placed on the new node. */ if (crm_remote_peer_cache_size() == 0) { return; } else if (action->type != action_type_rsc) { return; } else if (action->failed || action->confirmed == FALSE) { return; } else if (safe_str_neq(crm_element_value(action->xml, XML_LRM_ATTR_TASK), "start")) { return; } for (child = __xml_first_child(action->xml); child != NULL; child = __xml_next(child)) { const char *provider; const char *type; const char *rsc; crm_node_t *remote_peer; if (safe_str_neq(crm_element_name(child), XML_CIB_TAG_RESOURCE)) { continue; } provider = crm_element_value(child, XML_AGENT_ATTR_PROVIDER); type = crm_element_value(child, XML_ATTR_TYPE); rsc = ID(child); if (safe_str_neq(provider, "pacemaker") || safe_str_neq(type, "remote") || rsc == NULL) { break; } remote_peer = crm_get_peer_full(0, rsc, CRM_GET_PEER_REMOTE); if (remote_peer == NULL) { break; } /* A remote node will be placed in the "lost" state after * it has been successfully fenced. After successfully connecting * to a remote-node after being fenced, we need to abort the transition * so resources can be placed on the newly integrated remote-node */ if (safe_str_eq(remote_peer->state, CRM_NODE_LOST)) { abort_transition(INFINITY, tg_restart, "Remote-node re-discovered.", event); } return; } } /* * returns the ID of the action if a match is found * returns -1 if a match was not found * returns -2 if a match was found but the action failed (and was * not allowed to) */ int match_graph_event(int action_id, xmlNode * event, const char *event_node, int op_status, int op_rc, int target_rc) { const char *target = NULL; const char *allow_fail = NULL; const char *this_event = NULL; crm_action_t *action = NULL; action = get_action(action_id, FALSE); if (action == NULL) { return -1; } op_status = status_from_rc(action, op_status, op_rc, target_rc); if (op_status != PCMK_LRM_OP_DONE) { update_failcount(event, event_node, op_rc, target_rc, FALSE); } /* Process OP status */ switch (op_status) { case PCMK_LRM_OP_PENDING: crm_debug("Ignoring pending operation"); return action->id; break; case PCMK_LRM_OP_DONE: break; case PCMK_LRM_OP_ERROR: case PCMK_LRM_OP_TIMEOUT: case PCMK_LRM_OP_NOTSUPPORTED: action->failed = TRUE; break; case PCMK_LRM_OP_CANCELLED: /* do nothing?? */ crm_err("Dont know what to do for cancelled ops yet"); break; default: action->failed = TRUE; crm_err("Unsupported action result: %d", op_status); } /* stop this event's timer if it had one */ stop_te_timer(action->timer); - action->confirmed = TRUE; + te_action_confirmed(action); update_graph(transition_graph, action); trigger_graph(); if (action->failed) { allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL); if (crm_is_true(allow_fail)) { action->failed = FALSE; } } if (action->failed) { abort_transition(action->synapse->priority + 1, tg_restart, "Event failed", event); } this_event = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_info("Action %s (%d) confirmed on %s (rc=%d)", crm_str(this_event), action->id, crm_str(target), op_status); /* determine if this action affects a remote-node's online/offline status */ process_remote_node_action(action, event); return action->id; } crm_action_t * get_action(int id, gboolean confirmed) { GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (action->id == id) { if (confirmed) { stop_te_timer(action->timer); - action->confirmed = TRUE; + te_action_confirmed(action); } return action; } } } return NULL; } crm_action_t * get_cancel_action(const char *id, const char *node) { const char *task = NULL; const char *target = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (safe_str_neq(CRMD_ACTION_CANCEL, task)) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); if (safe_str_neq(task, id)) { continue; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (safe_str_neq(target, node)) { continue; } return action; } } return NULL; } crm_action_t * match_down_event(int id, const char *target, const char *filter, bool quiet) { const char *this_action = NULL; const char *this_node = NULL; crm_action_t *match = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; /* lookup event */ gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (id > 0 && action->id == id) { match = action; break; } this_action = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (action->type != action_type_crm) { continue; } else if (safe_str_eq(this_action, CRM_OP_LRM_REFRESH)) { continue; } else if (filter != NULL && safe_str_neq(this_action, filter)) { continue; } this_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (this_node == NULL) { crm_log_xml_err(action->xml, "No node uuid"); } if (safe_str_neq(this_node, target)) { crm_debug("Action %d : Node mismatch: %s", action->id, this_node); continue; } match = action; id = action->id; break; } if (match != NULL) { /* stop this event's timer if it had one */ break; } } if (match != NULL) { /* stop this event's timer if it had one */ crm_debug("Match found for action %d: %s on %s", id, crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target); } else if (id > 0) { crm_err("No match for action %d", id); } else if(quiet == FALSE) { crm_warn("No match for shutdown action on %s", target); } return match; } gboolean process_graph_event(xmlNode * event, const char *event_node) { int rc = -1; int status = -1; int callid = -1; int action = -1; int target_rc = -1; int transition_num = -1; char *update_te_uuid = NULL; gboolean stop_early = FALSE; gboolean passed = FALSE; const char *id = NULL; const char *desc = NULL; const char *magic = NULL; CRM_ASSERT(event != NULL); /* */ id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); crm_element_value_int(event, XML_LRM_ATTR_RC, &rc); crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status); crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid); magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY); if (magic == NULL) { /* non-change */ return FALSE; } if (decode_transition_key(magic, &update_te_uuid, &transition_num, &action, &target_rc) == FALSE) { crm_err("Invalid event %s.%d detected: %s", id, callid, magic); abort_transition(INFINITY, tg_restart, "Bad event", event); return FALSE; } if (status == PCMK_LRM_OP_PENDING) { goto bail; } if (transition_num == -1) { desc = "initiated outside of the cluster"; abort_transition(INFINITY, tg_restart, "Unexpected event", event); } else if (action < 0 || crm_str_eq(update_te_uuid, te_uuid, TRUE) == FALSE) { desc = "initiated by a different node"; abort_transition(INFINITY, tg_restart, "Foreign event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if (transition_graph->id != transition_num) { desc = "arrived really late"; abort_transition(INFINITY, tg_restart, "Old event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if (transition_graph->complete) { desc = "arrived late"; abort_transition(INFINITY, tg_restart, "Inactive graph", event); } else if (match_graph_event(action, event, event_node, status, rc, target_rc) < 0) { desc = "unknown"; abort_transition(INFINITY, tg_restart, "Unknown event", event); } else if (rc == target_rc) { passed = TRUE; crm_trace("Processed update to %s: %s", id, magic); } if (passed == FALSE) { if (update_failcount(event, event_node, rc, target_rc, transition_num == -1)) { /* Turns out this wasn't an lrm status refresh update aferall */ stop_early = FALSE; desc = "failed"; } crm_info("Detected action (%d.%d) %s.%d=%s: %s", transition_num, action, id, callid, services_ocf_exitcode_str(rc), desc); } bail: free(update_te_uuid); return stop_early; } diff --git a/crmd/tengine.c b/crmd/tengine.c index 8e236f1f4e..765628c04b 100644 --- a/crmd/tengine.c +++ b/crmd/tengine.c @@ -1,229 +1,230 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include /* for access */ #include /* for calls to open */ #include /* for calls to open */ #include /* for calls to open */ #include /* for getpwuid */ #include /* for initgroups */ #include /* for getrlimit */ #include /* for getrlimit */ #include #include #include #include #include #include #include #include extern crm_graph_functions_t te_graph_fns; struct crm_subsystem_s *te_subsystem = NULL; stonith_t *stonith_api = NULL; static void global_cib_callback(const xmlNode * msg, int callid, int rc, xmlNode * output) { } static crm_graph_t * create_blank_graph(void) { crm_graph_t *a_graph = unpack_graph(NULL, NULL); a_graph->complete = TRUE; a_graph->abort_reason = "DC Takeover"; a_graph->completion_action = tg_restart; return a_graph; } /* A_TE_START, A_TE_STOP, A_TE_RESTART */ void do_te_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean init_ok = TRUE; if (action & A_TE_STOP) { if (transition_graph) { destroy_graph(transition_graph); transition_graph = NULL; } if (fsa_cib_conn) { fsa_cib_conn->cmds->del_notify_callback(fsa_cib_conn, T_CIB_DIFF_NOTIFY, te_update_diff); } clear_bit(fsa_input_register, te_subsystem->flag_connected); crm_info("Transitioner is now inactive"); } if ((action & A_TE_START) == 0) { return; } else if (is_set(fsa_input_register, te_subsystem->flag_connected)) { crm_debug("The transitioner is already active"); return; } else if ((action & A_TE_START) && cur_state == S_STOPPING) { crm_info("Ignoring request to start %s while shutting down", te_subsystem->name); return; } te_uuid = crm_generate_uuid(); crm_info("Registering TE UUID: %s", te_uuid); if (pcmk_ok != fsa_cib_conn->cmds->add_notify_callback(fsa_cib_conn, T_CIB_DIFF_NOTIFY, te_update_diff)) { crm_err("Could not set CIB notification callback"); init_ok = FALSE; } if (pcmk_ok != fsa_cib_conn->cmds->set_op_callback(fsa_cib_conn, global_cib_callback)) { crm_err("Could not set CIB global callback"); init_ok = FALSE; } if (init_ok) { set_graph_functions(&te_graph_fns); if (transition_graph) { destroy_graph(transition_graph); } /* create a blank one */ crm_debug("Transitioner is now active"); transition_graph = create_blank_graph(); set_bit(fsa_input_register, te_subsystem->flag_connected); } } /* A_TE_INVOKE, A_TE_CANCEL */ void do_te_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (AM_I_DC == FALSE || (fsa_state != S_TRANSITION_ENGINE && (action & A_TE_INVOKE))) { crm_notice("No need to invoke the TE (%s) in state %s", fsa_action2string(action), fsa_state2string(fsa_state)); return; } if (action & A_TE_CANCEL) { crm_debug("Cancelling the transition: %s", transition_graph->complete ? "inactive" : "active"); abort_transition(INFINITY, tg_restart, "Peer Cancelled", NULL); if (transition_graph->complete == FALSE) { crmd_fsa_stall(FALSE); } } else if (action & A_TE_HALT) { crm_debug("Halting the transition: %s", transition_graph->complete ? "inactive" : "active"); abort_transition(INFINITY, tg_stop, "Peer Halt", NULL); if (transition_graph->complete == FALSE) { crmd_fsa_stall(FALSE); } } else if (action & A_TE_INVOKE) { const char *value = NULL; xmlNode *graph_data = NULL; ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); const char *ref = crm_element_value(input->msg, XML_ATTR_REFERENCE); const char *graph_file = crm_element_value(input->msg, F_CRM_TGRAPH); const char *graph_input = crm_element_value(input->msg, F_CRM_TGRAPH_INPUT); if (graph_file == NULL && input->xml == NULL) { crm_log_xml_err(input->msg, "Bad command"); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return; } if (transition_graph->complete == FALSE) { crm_info("Another transition is already active"); abort_transition(INFINITY, tg_restart, "Transition Active", NULL); return; } if (fsa_pe_ref == NULL || safe_str_neq(fsa_pe_ref, ref)) { crm_info("Transition is redundant: %s vs. %s", crm_str(fsa_pe_ref), crm_str(ref)); abort_transition(INFINITY, tg_restart, "Transition Redundant", NULL); } graph_data = input->xml; if (graph_data == NULL && graph_file != NULL) { graph_data = filename2xml(graph_file); } if (is_timer_started(transition_timer)) { crm_debug("The transitioner wait for a transition timer"); return; } CRM_CHECK(graph_data != NULL, crm_err("Input raised by %s is invalid", msg_data->origin); crm_log_xml_err(input->msg, "Bad command"); return); destroy_graph(transition_graph); transition_graph = unpack_graph(graph_data, graph_input); CRM_CHECK(transition_graph != NULL, transition_graph = create_blank_graph(); return); crm_info("Processing graph %d (ref=%s) derived from %s", transition_graph->id, ref, graph_input); + te_reset_job_counts(); value = crm_element_value(graph_data, "failed-stop-offset"); if (value) { free(failed_stop_offset); failed_stop_offset = strdup(value); } value = crm_element_value(graph_data, "failed-start-offset"); if (value) { free(failed_start_offset); failed_start_offset = strdup(value); } trigger_graph(); print_graph(LOG_DEBUG_2, transition_graph); if (graph_data != input->xml) { free_xml(graph_data); } } } diff --git a/crmd/tengine.h b/crmd/tengine.h index d5e317926e..1765dd439e 100644 --- a/crmd/tengine.h +++ b/crmd/tengine.h @@ -1,74 +1,78 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef TENGINE__H # define TENGINE__H # include # include # include # include extern stonith_t *stonith_api; extern GListPtr stonith_cleanup_list; extern void send_stonith_update(crm_action_t * stonith_action, const char *target, const char *uuid); /* tengine */ extern crm_action_t *match_down_event(int rc, const char *target, const char *filter, bool quiet); extern crm_action_t *get_cancel_action(const char *id, const char *node); extern gboolean cib_action_update(crm_action_t * action, int status, int op_rc); extern gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node); extern gboolean process_graph_event(xmlNode * event, const char *event_node); /* utils */ extern crm_action_t *get_action(int id, gboolean confirmed); extern gboolean start_global_timer(crm_action_timer_t * timer, int timeout); extern gboolean stop_te_timer(crm_action_timer_t * timer); extern const char *get_rsc_state(const char *task, enum op_status status); /* unpack */ extern gboolean process_te_message(xmlNode * msg, xmlNode * xml_data); extern crm_graph_t *transition_graph; extern crm_trigger_t *transition_trigger; extern char *te_uuid; extern void notify_crmd(crm_graph_t * graph); # include extern void trigger_graph_processing(const char *fn, int line); extern void abort_transition_graph(int abort_priority, enum transition_action abort_action, const char *abort_text, xmlNode * reason, const char *fn, int line); # define trigger_graph() trigger_graph_processing(__FUNCTION__, __LINE__) # define abort_transition(pri, action, text, reason) \ abort_transition_graph(pri, action, text, reason,__FUNCTION__,__LINE__); extern gboolean te_connect_stonith(gpointer user_data); extern crm_trigger_t *transition_trigger; extern crm_trigger_t *stonith_reconnect; extern char *failed_stop_offset; extern char *failed_start_offset; extern int active_timeout; extern int stonith_op_active; + +void te_action_confirmed(crm_action_t * action); +void te_reset_job_counts(void); + #endif diff --git a/crmd/throttle.c b/crmd/throttle.c index a5a61b2215..70be475b32 100644 --- a/crmd/throttle.c +++ b/crmd/throttle.c @@ -1,272 +1,270 @@ /* * Copyright (C) 2013 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include enum throttle_state_e { throttle_high = 0x0100, throttle_med = 0x0010, throttle_low = 0x0001, throttle_none = 0x0000, }; struct throttle_record_s { int cores; enum throttle_state_e mode; char *node; }; -static float cpu_target = 0.8; /* Ie. 80% configured by the user */ +static float cpu_target = 0.5; /* Ie. 50% configured by the user */ GHashTable *throttle_records = NULL; mainloop_timer_t *throttle_timer = NULL; static int throttle_num_cores(void) { static int cores = 0; char buffer[256]; char *iter = NULL; char *processor = NULL; FILE *stream = NULL; const char *cpufile = "/proc/cpuinfo"; if(cores) { return cores; } stream = fopen(cpufile, "r"); if(stream == NULL) { int rc = errno; crm_warn("Couldn't read %s: %s (%d)", cpufile, pcmk_strerror(rc), rc); return 0; } while (fgets(buffer, sizeof(buffer), stream)) { if(strstr(buffer, "processor") == buffer) { free(processor); processor = strdup(buffer); } } if(processor == NULL) { crm_warn("No processors found in %s", cpufile); return 0; } iter = processor + strlen("processor"); while(iter[0] == ':' || isspace(iter[0])) { iter++; } cores = strtol(iter, NULL, 10); crm_trace("Got %d from %s", cores, iter); cores++; /* Counting starts at 0 */ free(processor); fclose(stream); return cores; } static float throttle_load_avg(void) { float load = 0.0; char buffer[256]; const char *loadfile = "/proc/loadavg"; FILE *stream = fopen(loadfile, "r"); if(stream == NULL) { int rc = errno; crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc); return 0; } if(fgets(buffer, sizeof(buffer), stream)) { char *nl = strstr(buffer, "\n"); /* Grab the 1-minute average, ignore the rest */ load = strtof(buffer, NULL); if(nl) { nl[0] = 0; } crm_debug("Current load is %f (full: %s)", load, buffer); } fclose(stream); return load; } static enum throttle_state_e throttle_mode(void) { float load = throttle_load_avg(); int cores = throttle_num_cores(); float simple_load = 0.0; enum throttle_state_e mode = throttle_none; if(cores) { simple_load = load / cores; } else { simple_load = load; } if(simple_load > cpu_target) { mode |= throttle_high; - } else if(simple_load > 0.5 * cpu_target) { + } else if(simple_load > 0.66 * cpu_target) { mode |= throttle_med; - } else if(simple_load > 0.25 * cpu_target) { + } else if(simple_load > 0.33 * cpu_target) { mode |= throttle_low; } if(mode & throttle_high) { return throttle_high; } else if(mode & throttle_med) { return throttle_med; } else if(mode & throttle_low) { return throttle_low; } return throttle_none; } static void throttle_send_command(enum throttle_state_e mode) { xmlNode *xml = NULL; int cores = throttle_num_cores(); xml = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); crm_xml_add_int(xml, F_CRM_THROTTLE_MODE, mode); crm_xml_add_int(xml, F_CRM_THROTTLE_CORES, cores); send_cluster_message(NULL, crm_msg_crmd, xml, TRUE); free_xml(xml); crm_info("Updated throttle state to %.4x", mode); } static gboolean throttle_timer_cb(gpointer data) { static enum throttle_state_e last = throttle_none; enum throttle_state_e now = throttle_mode(); if(now != last) { crm_debug("New throttle mode: %.4x (was %.4x)", now, last); throttle_send_command(now); last = now; } return TRUE; } static void throttle_record_free(gpointer p) { struct throttle_record_s *r = p; free(r->node); free(r); } void throttle_init(void) { throttle_records = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, throttle_record_free); throttle_timer = mainloop_timer_add("throttle", 30* 1000, TRUE, throttle_timer_cb, NULL); crm_debug("load avg: %f on %d cores", throttle_load_avg(), throttle_num_cores()); mainloop_timer_start(throttle_timer); } void throttle_fini(void) { mainloop_timer_del(throttle_timer); throttle_timer = NULL; g_hash_table_destroy(throttle_records); throttle_records = NULL; } int throttle_get_job_limit(const char *node) { int jobs = 1; - int cores = 0; - enum throttle_state_e mode = 0; struct throttle_record_s *r = NULL; r = g_hash_table_lookup(throttle_records, node); if(r == NULL) { + r = calloc(1, sizeof(struct throttle_record_s)); + r->node = strdup(node); + r->mode = throttle_mode(); + r->cores = throttle_num_cores(); crm_trace("Defaulting to local values for unknown node %s", node); - mode = throttle_mode(); - cores = throttle_num_cores(); - } else { - mode = r->mode; - cores = r->cores; + g_hash_table_insert(throttle_records, r->node, r); } - switch(mode) { + switch(r->mode) { case throttle_high: jobs = 1; /* At least one job must always be allowed */ break; case throttle_med: - jobs = QB_MAX(1, cores / 2); + jobs = QB_MAX(1, r->cores / 2); break; case throttle_low: - jobs = QB_MAX(1, cores); + jobs = QB_MAX(1, r->cores); break; case throttle_none: - jobs = QB_MAX(1, cores * 2); + jobs = QB_MAX(1, r->cores * 2); break; default: - crm_err("Unknown throttle mode %.4x", mode); + crm_err("Unknown throttle mode %.4x on %s", r->mode, node); break; } return jobs; } void throttle_update(xmlNode *xml) { int cores = 0; enum throttle_state_e mode = 0; struct throttle_record_s *r = NULL; const char *from = crm_element_value(xml, F_CRM_HOST_FROM); crm_element_value_int(xml, F_CRM_THROTTLE_MODE, (int*)&mode); crm_element_value_int(xml, F_CRM_THROTTLE_CORES, &cores); r = g_hash_table_lookup(throttle_records, from); if(r == NULL) { r = calloc(1, sizeof(struct throttle_record_s)); r->node = strdup(from); g_hash_table_insert(throttle_records, r->node, r); } r->cores = cores; r->mode = mode; - crm_debug("Host %s has %d cores and throttle mode %.4x. Job limit = %d", + crm_debug("Host %s has %d cores and throttle mode %.4x. New job limit is %d", from, cores, mode, throttle_get_job_limit(from)); } diff --git a/include/crm/transition.h b/include/crm/transition.h index b84aaf890f..dc7f0cac04 100644 --- a/include/crm/transition.h +++ b/include/crm/transition.h @@ -1,146 +1,147 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include typedef enum { action_type_pseudo, action_type_rsc, action_type_crm } action_type_e; typedef struct te_timer_s crm_action_timer_t; typedef struct synapse_s { int id; int priority; gboolean ready; gboolean failed; gboolean executed; gboolean confirmed; GListPtr actions; /* crm_action_t* */ GListPtr inputs; /* crm_action_t* */ } synapse_t; typedef struct crm_action_s { int id; int timeout; int interval; GHashTable *params; action_type_e type; crm_action_timer_t *timer; synapse_t *synapse; gboolean sent_update; /* sent to the CIB */ gboolean executed; /* sent to the CRM */ gboolean confirmed; gboolean failed; gboolean can_fail; xmlNode *xml; } crm_action_t; enum timer_reason { timeout_action, timeout_action_warn, timeout_abort, }; struct te_timer_s { int source_id; int timeout; enum timer_reason reason; crm_action_t *action; }; /* order matters here */ enum transition_action { tg_done, tg_stop, tg_restart, tg_shutdown, }; typedef struct crm_graph_s { int id; char *source; int abort_priority; gboolean complete; const char *abort_reason; enum transition_action completion_action; int num_actions; int num_synapses; int batch_limit; int network_delay; int stonith_timeout; int transition_timeout; int fired; int pending; int skipped; int completed; int incomplete; GListPtr synapses; /* synpase_t* */ int migration_limit; GHashTable *migrating; } crm_graph_t; typedef struct crm_graph_functions_s { gboolean(*pseudo) (crm_graph_t * graph, crm_action_t * action); gboolean(*rsc) (crm_graph_t * graph, crm_action_t * action); gboolean(*crmd) (crm_graph_t * graph, crm_action_t * action); gboolean(*stonith) (crm_graph_t * graph, crm_action_t * action); + gboolean(*allowed) (crm_graph_t * graph, crm_action_t * action); } crm_graph_functions_t; enum transition_status { transition_active, transition_pending, /* active but no actions performed this time */ transition_complete, transition_stopped, transition_terminated, transition_action_failed, transition_failed, }; void set_default_graph_functions(void); void set_graph_functions(crm_graph_functions_t * fns); crm_graph_t *unpack_graph(xmlNode * xml_graph, const char *reference); int run_graph(crm_graph_t * graph); gboolean update_graph(crm_graph_t * graph, crm_action_t * action); void destroy_graph(crm_graph_t * graph); const char *transition_status(enum transition_status state); void print_graph(unsigned int log_level, crm_graph_t * graph); void print_action(int log_level, const char *prefix, crm_action_t * action); void update_abort_priority(crm_graph_t * graph, int priority, enum transition_action action, const char *abort_reason); const char *actiontype2text(action_type_e type); lrmd_event_data_t *convert_graph_action(xmlNode * resource, crm_action_t * action, int status, int rc); diff --git a/lib/transition/graph.c b/lib/transition/graph.c index 1965dde7bb..697004f0ee 100644 --- a/lib/transition/graph.c +++ b/lib/transition/graph.c @@ -1,414 +1,424 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include /* #include */ /* */ crm_graph_functions_t *graph_fns = NULL; static gboolean update_synapse_ready(synapse_t * synapse, int action_id) { GListPtr lpc = NULL; gboolean updates = FALSE; CRM_CHECK(synapse->executed == FALSE, return FALSE); CRM_CHECK(synapse->confirmed == FALSE, return FALSE); synapse->ready = TRUE; for (lpc = synapse->inputs; lpc != NULL; lpc = lpc->next) { crm_action_t *prereq = (crm_action_t *) lpc->data; crm_trace("Processing input %d", prereq->id); if (prereq->id == action_id) { crm_trace("Marking input %d of synapse %d confirmed", action_id, synapse->id); prereq->confirmed = TRUE; updates = TRUE; } else if (prereq->confirmed == FALSE) { synapse->ready = FALSE; } } if (updates) { crm_trace("Updated synapse %d", synapse->id); } return updates; } static gboolean update_synapse_confirmed(synapse_t * synapse, int action_id) { GListPtr lpc = NULL; gboolean updates = FALSE; gboolean is_confirmed = TRUE; CRM_CHECK(synapse->executed, return FALSE); CRM_CHECK(synapse->confirmed == FALSE, return TRUE); is_confirmed = TRUE; for (lpc = synapse->actions; lpc != NULL; lpc = lpc->next) { crm_action_t *action = (crm_action_t *) lpc->data; crm_trace("Processing action %d", action->id); if (action->id == action_id) { crm_trace("Confirmed: Action %d of Synapse %d", action_id, synapse->id); action->confirmed = TRUE; updates = TRUE; } else if (action->confirmed == FALSE) { is_confirmed = FALSE; crm_trace("Synapse %d still not confirmed after action %d", synapse->id, action_id); } } if (is_confirmed && synapse->confirmed == FALSE) { crm_trace("Confirmed: Synapse %d", synapse->id); synapse->confirmed = TRUE; updates = TRUE; } if (updates) { crm_trace("Updated synapse %d", synapse->id); } return updates; } gboolean update_graph(crm_graph_t * graph, crm_action_t * action) { gboolean rc = FALSE; gboolean updates = FALSE; GListPtr lpc = NULL; for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { synapse_t *synapse = (synapse_t *) lpc->data; if (synapse->confirmed || synapse->failed) { crm_trace("Synapse complete"); } else if (synapse->executed) { crm_trace("Synapse executed"); rc = update_synapse_confirmed(synapse, action->id); } else if (action->failed == FALSE || synapse->priority == INFINITY) { rc = update_synapse_ready(synapse, action->id); } updates = updates || rc; } if (updates) { crm_trace("Updated graph with completed action %d", action->id); } return updates; } static gboolean -should_fire_synapse(synapse_t * synapse) +should_fire_synapse(crm_graph_t * graph, synapse_t * synapse) { GListPtr lpc = NULL; CRM_CHECK(synapse->executed == FALSE, return FALSE); CRM_CHECK(synapse->confirmed == FALSE, return FALSE); crm_trace("Checking pre-reqs for %d", synapse->id); /* lookup prereqs */ synapse->ready = TRUE; for (lpc = synapse->inputs; lpc != NULL; lpc = lpc->next) { crm_action_t *prereq = (crm_action_t *) lpc->data; crm_trace("Processing input %d", prereq->id); if (prereq->confirmed == FALSE) { crm_trace("Inputs for synapse %d not satisfied", synapse->id); synapse->ready = FALSE; break; } } + if(synapse->ready && graph_fns->allowed) { + for (lpc = synapse->actions; lpc != NULL; lpc = lpc->next) { + crm_action_t *a = (crm_action_t *) lpc->data; + + if(graph_fns->allowed(graph, a) == FALSE) { + return FALSE; + } + } + } + return synapse->ready; } static gboolean initiate_action(crm_graph_t * graph, crm_action_t * action) { const char *id = NULL; CRM_CHECK(action->executed == FALSE, return FALSE); id = ID(action->xml); CRM_CHECK(id != NULL, return FALSE); action->executed = TRUE; if (action->type == action_type_pseudo) { - crm_trace("Executing pseudo-event: %d", action->id); + crm_trace("Executing pseudo-event: %s (%d)", id, action->id); return graph_fns->pseudo(graph, action); } else if (action->type == action_type_rsc) { - crm_trace("Executing rsc-event: %d", action->id); + crm_trace("Executing rsc-event: %s (%d)", id, action->id); return graph_fns->rsc(graph, action); } else if (action->type == action_type_crm) { const char *task = NULL; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); CRM_CHECK(task != NULL, return FALSE); if (safe_str_eq(task, CRM_OP_FENCE)) { - crm_trace("Executing STONITH-event: %d", action->id); + crm_trace("Executing STONITH-event: %s (%d)", id, action->id); return graph_fns->stonith(graph, action); } - crm_trace("Executing crm-event: %d", action->id); + crm_trace("Executing crm-event: %s (%d)", id, action->id); return graph_fns->crmd(graph, action); } crm_err("Failed on unsupported command type: %s (id=%s)", crm_element_name(action->xml), id); return FALSE; } static gboolean fire_synapse(crm_graph_t * graph, synapse_t * synapse) { GListPtr lpc = NULL; CRM_CHECK(synapse != NULL, return FALSE); CRM_CHECK(synapse->ready, return FALSE); CRM_CHECK(synapse->confirmed == FALSE, return TRUE); crm_trace("Synapse %d fired", synapse->id); synapse->executed = TRUE; for (lpc = synapse->actions; lpc != NULL; lpc = lpc->next) { crm_action_t *action = (crm_action_t *) lpc->data; /* allow some leeway */ gboolean passed = FALSE; /* Invoke the action and start the timer */ passed = initiate_action(graph, action); if (passed == FALSE) { crm_err("Failed initiating <%s id=%d> in synapse %d", crm_element_name(action->xml), action->id, synapse->id); synapse->confirmed = TRUE; action->confirmed = TRUE; action->failed = TRUE; return FALSE; } } return TRUE; } static gboolean count_migrating(crm_graph_t * graph, synapse_t * synapse) { GListPtr lpc = NULL; CRM_CHECK(synapse != NULL, return FALSE); for (lpc = synapse->actions; lpc != NULL; lpc = lpc->next) { crm_action_t *action = (crm_action_t *) lpc->data; const char *task = NULL; if (action->type != action_type_rsc) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (crm_str_eq(task, CRMD_ACTION_MIGRATE, TRUE) || crm_str_eq(task, CRMD_ACTION_MIGRATED, TRUE)) { const char *node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); int *counter = g_hash_table_lookup(graph->migrating, node); if (counter == NULL) { counter = calloc(1, sizeof(int)); g_hash_table_insert(graph->migrating, strdup(node), counter); } (*counter)++; } } return TRUE; } static gboolean migration_overrun(crm_graph_t * graph, synapse_t * synapse) { GListPtr lpc = NULL; CRM_CHECK(synapse != NULL, return FALSE); if (graph->migration_limit < 0) { return FALSE; } for (lpc = synapse->actions; lpc != NULL; lpc = lpc->next) { crm_action_t *action = (crm_action_t *) lpc->data; const char *task = NULL; if (action->type != action_type_rsc) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (crm_str_eq(task, CRMD_ACTION_MIGRATE, TRUE) || crm_str_eq(task, CRMD_ACTION_MIGRATED, TRUE)) { const char *node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); int *counter = g_hash_table_lookup(graph->migrating, node); if (counter && *counter >= graph->migration_limit) { return TRUE; } } } return FALSE; } int run_graph(crm_graph_t * graph) { GListPtr lpc = NULL; int stat_log_level = LOG_DEBUG; int pass_result = transition_active; const char *status = "In-progress"; if (graph_fns == NULL) { set_default_graph_functions(); } if (graph == NULL) { return transition_complete; } graph->fired = 0; graph->pending = 0; graph->skipped = 0; graph->completed = 0; graph->incomplete = 0; g_hash_table_remove_all(graph->migrating); crm_trace("Entering graph %d callback", graph->id); /* Pre-calculate the number of completed and in-flight operations */ for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { synapse_t *synapse = (synapse_t *) lpc->data; if (synapse->confirmed) { crm_trace("Synapse %d complete", synapse->id); graph->completed++; } else if (synapse->failed == FALSE && synapse->executed) { crm_trace("Synapse %d: confirmation pending", synapse->id); graph->pending++; if (graph->migration_limit >= 0) { count_migrating(graph, synapse); } } } /* Now check if there is work to do */ for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { synapse_t *synapse = (synapse_t *) lpc->data; if (graph->batch_limit > 0 && graph->pending >= graph->batch_limit) { crm_debug("Throttling output: batch limit (%d) reached", graph->batch_limit); break; } else if (graph->migration_limit >= 0 && migration_overrun(graph, synapse)) { crm_debug("Throttling output: migration limit (%d) reached", graph->migration_limit); break; } else if (synapse->failed) { graph->skipped++; continue; } else if (synapse->confirmed || synapse->executed) { /* Already handled */ continue; } if (synapse->priority < graph->abort_priority) { crm_trace("Skipping synapse %d: aborting", synapse->id); graph->skipped++; - } else if (should_fire_synapse(synapse)) { + } else if (should_fire_synapse(graph, synapse)) { crm_trace("Synapse %d fired", synapse->id); graph->fired++; if(fire_synapse(graph, synapse) == FALSE) { crm_err("Synapse %d failed to fire", synapse->id); stat_log_level = LOG_ERR; graph->abort_priority = INFINITY; graph->incomplete++; graph->fired--; } if (synapse->confirmed == FALSE) { graph->pending++; if (graph->migration_limit >= 0) { count_migrating(graph, synapse); } } } else { crm_trace("Synapse %d cannot fire", synapse->id); graph->incomplete++; } } if (graph->pending == 0 && graph->fired == 0) { graph->complete = TRUE; stat_log_level = LOG_NOTICE; pass_result = transition_complete; status = "Complete"; if (graph->incomplete != 0 && graph->abort_priority <= 0) { stat_log_level = LOG_WARNING; pass_result = transition_terminated; status = "Terminated"; } else if (graph->skipped != 0) { status = "Stopped"; } } else if (graph->fired == 0) { pass_result = transition_pending; } do_crm_log(stat_log_level, "Transition %d (Complete=%d, Pending=%d," " Fired=%d, Skipped=%d, Incomplete=%d, Source=%s): %s", graph->id, graph->completed, graph->pending, graph->fired, graph->skipped, graph->incomplete, graph->source, status); return pass_result; }