diff --git a/daemons/attrd/Makefile.am b/daemons/attrd/Makefile.am index 86ce9ed9cf..7e9db28ac8 100644 --- a/daemons/attrd/Makefile.am +++ b/daemons/attrd/Makefile.am @@ -1,38 +1,38 @@ # # Copyright 2004-2018 Andrew Beekhof # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # include $(top_srcdir)/Makefile.common halibdir = $(CRM_DAEMON_DIR) halib_PROGRAMS = pacemaker-attrd noinst_HEADERS = pacemaker-attrd.h pacemaker_attrd_CFLAGS = $(CFLAGS_HARDENED_EXE) pacemaker_attrd_LDFLAGS = $(LDFLAGS_HARDENED_EXE) pacemaker_attrd_LDADD = $(top_builddir)/lib/cluster/libcrmcluster.la \ $(top_builddir)/lib/pengine/libpe_rules.la \ $(top_builddir)/lib/common/libcrmcommon.la \ $(top_builddir)/lib/cib/libcib.la \ $(top_builddir)/lib/lrmd/liblrmd.la \ $(CLUSTERLIBS) pacemaker_attrd_SOURCES = pacemaker-attrd.c attrd_commands.c \ - attrd_utils.c attrd_alerts.c + attrd_utils.c attrd_alerts.c attrd_elections.c clean-generic: rm -f *.log *.debug *.xml *~ if BUILD_LEGACY_LINKS install-exec-hook: cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f attrd && $(LN_S) pacemaker-attrd attrd uninstall-hook: cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f attrd endif diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c index da366f558c..ed18b88807 100644 --- a/daemons/attrd/attrd_commands.c +++ b/daemons/attrd/attrd_commands.c @@ -1,1306 +1,1271 @@ /* * Copyright 2013-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include "pacemaker-attrd.h" /* * Legacy attrd (all pre-1.1.11 Pacemaker versions, plus all versions when used * with the no-longer-supported CMAN or corosync-plugin stacks) is unversioned. * * With atomic attrd, each attrd will send ATTRD_PROTOCOL_VERSION with every * peer request and reply. As of Pacemaker 2.0.0, at start-up each attrd will * also set a private attribute for itself with its version, so any attrd can * determine the minimum version supported by all peers. * * Protocol Pacemaker Significant changes * -------- --------- ------------------- * 1 1.1.11 ATTRD_OP_UPDATE (F_ATTRD_ATTRIBUTE only), * ATTRD_OP_PEER_REMOVE, ATTRD_OP_REFRESH, ATTRD_OP_FLUSH, * ATTRD_OP_SYNC, ATTRD_OP_SYNC_RESPONSE * 1 1.1.13 ATTRD_OP_UPDATE (with F_ATTR_REGEX), ATTRD_OP_QUERY * 1 1.1.15 ATTRD_OP_UPDATE_BOTH, ATTRD_OP_UPDATE_DELAY * 2 1.1.17 ATTRD_OP_CLEAR_FAILURE */ #define ATTRD_PROTOCOL_VERSION "2" int last_cib_op_done = 0; -char *peer_writer = NULL; GHashTable *attributes = NULL; void write_attribute(attribute_t *a, bool ignore_delay); void write_or_elect_attribute(attribute_t *a); void attrd_current_only_attribute_update(crm_node_t *peer, xmlNode *xml); void attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter); void attrd_peer_sync(crm_node_t *peer, xmlNode *xml); void attrd_peer_remove(const char *host, gboolean uncache, const char *source); static gboolean send_attrd_message(crm_node_t * node, xmlNode * data) { crm_xml_add(data, F_TYPE, T_ATTRD); crm_xml_add(data, F_ATTRD_VERSION, ATTRD_PROTOCOL_VERSION); - crm_xml_add_int(data, F_ATTRD_WRITER, election_state(writer)); - + attrd_xml_add_writer(data); return send_cluster_message(node, crm_msg_attrd, data, TRUE); } static gboolean attribute_timer_cb(gpointer data) { attribute_t *a = data; - crm_trace("Dampen interval expired for %s in state %d", a->id, election_state(writer)); + crm_trace("Dampen interval expired for %s", a->id); write_or_elect_attribute(a); return FALSE; } static void free_attribute_value(gpointer data) { attribute_value_t *v = data; free(v->nodename); free(v->current); free(v->requested); free(v); } void free_attribute(gpointer data) { attribute_t *a = data; if(a) { free(a->id); free(a->set); free(a->uuid); free(a->user); mainloop_timer_del(a->timer); g_hash_table_destroy(a->values); free(a); } } static xmlNode * build_attribute_xml( xmlNode *parent, const char *name, const char *set, const char *uuid, unsigned int timeout_ms, const char *user, gboolean is_private, const char *peer, uint32_t peerid, const char *value) { xmlNode *xml = create_xml_node(parent, __FUNCTION__); crm_xml_add(xml, F_ATTRD_ATTRIBUTE, name); crm_xml_add(xml, F_ATTRD_SET, set); crm_xml_add(xml, F_ATTRD_KEY, uuid); crm_xml_add(xml, F_ATTRD_USER, user); crm_xml_add(xml, F_ATTRD_HOST, peer); crm_xml_add_int(xml, F_ATTRD_HOST_ID, peerid); crm_xml_add(xml, F_ATTRD_VALUE, value); crm_xml_add_int(xml, F_ATTRD_DAMPEN, timeout_ms/1000); crm_xml_add_int(xml, F_ATTRD_IS_PRIVATE, is_private); return xml; } static void clear_attribute_value_seen(void) { GHashTableIter aIter; GHashTableIter vIter; attribute_t *a; attribute_value_t *v = NULL; g_hash_table_iter_init(&aIter, attributes); while (g_hash_table_iter_next(&aIter, NULL, (gpointer *) & a)) { g_hash_table_iter_init(&vIter, a->values); while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & v)) { v->seen = FALSE; crm_trace("Clear seen flag %s[%s] = %s.", a->id, v->nodename, v->current); } } } static attribute_t * create_attribute(xmlNode *xml) { int dampen = 0; const char *value = crm_element_value(xml, F_ATTRD_DAMPEN); attribute_t *a = calloc(1, sizeof(attribute_t)); a->id = crm_element_value_copy(xml, F_ATTRD_ATTRIBUTE); a->set = crm_element_value_copy(xml, F_ATTRD_SET); a->uuid = crm_element_value_copy(xml, F_ATTRD_KEY); a->values = g_hash_table_new_full(crm_strcase_hash, crm_strcase_equal, NULL, free_attribute_value); crm_element_value_int(xml, F_ATTRD_IS_PRIVATE, &a->is_private); #if ENABLE_ACL a->user = crm_element_value_copy(xml, F_ATTRD_USER); crm_trace("Performing all %s operations as user '%s'", a->id, a->user); #endif if(value) { dampen = crm_get_msec(value); crm_trace("Created attribute %s with delay %dms (%s)", a->id, dampen, value); } else { crm_trace("Created attribute %s with no delay", a->id); } if(dampen > 0) { a->timeout_ms = dampen; a->timer = mainloop_timer_add(a->id, a->timeout_ms, FALSE, attribute_timer_cb, a); } else if (dampen < 0) { crm_warn("Ignoring invalid delay %s for attribute %s", value, a->id); } g_hash_table_replace(attributes, a->id, a); return a; } /*! * \internal * \brief Respond to a client peer-remove request (i.e. propagate to all peers) * * \param[in] client_name Name of client that made request (for log messages) * \param[in] xml Root of request XML * * \return void */ void attrd_client_peer_remove(const char *client_name, xmlNode *xml) { // Host and ID are not used in combination, rather host has precedence const char *host = crm_element_value(xml, F_ATTRD_HOST); char *host_alloc = NULL; if (host == NULL) { int nodeid; crm_element_value_int(xml, F_ATTRD_HOST_ID, &nodeid); if (nodeid > 0) { crm_node_t *node = crm_find_peer(nodeid, NULL); char *host_alloc = NULL; if (node && node->uname) { // Use cached name if available host = node->uname; } else { // Otherwise ask cluster layer host_alloc = get_node_name(nodeid); host = host_alloc; } crm_xml_add(xml, F_ATTRD_HOST, host); } } if (host) { crm_info("Client %s is requesting all values for %s be removed", client_name, host); send_attrd_message(NULL, xml); /* ends up at attrd_peer_message() */ free(host_alloc); } else { crm_info("Ignoring request by client %s to remove all peer values without specifying peer", client_name); } } /*! * \internal * \brief Respond to a client update request * * \param[in] xml Root of request XML * * \return void */ void attrd_client_update(xmlNode *xml) { attribute_t *a = NULL; char *host = crm_element_value_copy(xml, F_ATTRD_HOST); const char *attr = crm_element_value(xml, F_ATTRD_ATTRIBUTE); const char *value = crm_element_value(xml, F_ATTRD_VALUE); const char *regex = crm_element_value(xml, F_ATTRD_REGEX); /* If a regex was specified, broadcast a message for each match */ if ((attr == NULL) && regex) { GHashTableIter aIter; regex_t *r_patt = calloc(1, sizeof(regex_t)); crm_debug("Setting %s to %s", regex, value); if (regcomp(r_patt, regex, REG_EXTENDED|REG_NOSUB)) { crm_err("Bad regex '%s' for update", regex); } else { g_hash_table_iter_init(&aIter, attributes); while (g_hash_table_iter_next(&aIter, (gpointer *) & attr, NULL)) { int status = regexec(r_patt, attr, 0, NULL, 0); if (status == 0) { crm_trace("Matched %s with %s", attr, regex); crm_xml_add(xml, F_ATTRD_ATTRIBUTE, attr); send_attrd_message(NULL, xml); } } } free(host); regfree(r_patt); free(r_patt); return; } else if (attr == NULL) { crm_err("Update request did not specify attribute or regular expression"); free(host); return; } if (host == NULL) { crm_trace("Inferring host"); host = strdup(attrd_cluster->uname); crm_xml_add(xml, F_ATTRD_HOST, host); crm_xml_add_int(xml, F_ATTRD_HOST_ID, attrd_cluster->nodeid); } a = g_hash_table_lookup(attributes, attr); /* If value was specified using ++ or += notation, expand to real value */ if (value) { if (attrd_value_needs_expansion(value)) { int int_value; attribute_value_t *v = NULL; if (a) { v = g_hash_table_lookup(a->values, host); } int_value = attrd_expand_value(value, (v? v->current : NULL)); crm_info("Expanded %s=%s to %d", attr, value, int_value); crm_xml_add_int(xml, F_ATTRD_VALUE, int_value); /* Replacing the value frees the previous memory, so re-query it */ value = crm_element_value(xml, F_ATTRD_VALUE); } } - if ((peer_writer == NULL) && (election_state(writer) != election_in_progress)) { - crm_info("Starting an election to determine the writer"); - election_vote(writer); - } + attrd_start_election_if_needed(); crm_debug("Broadcasting %s[%s]=%s%s", attr, host, value, - ((election_state(writer) == election_won)? " (writer)" : "")); + (attrd_election_won()? " (writer)" : "")); free(host); send_attrd_message(NULL, xml); /* ends up at attrd_peer_message() */ } /*! * \internal * \brief Respond to client clear-failure request * * \param[in] xml Request XML */ void attrd_client_clear_failure(xmlNode *xml) { #if 0 /* @TODO Track the minimum supported protocol version across all nodes, * then enable this more-efficient code. */ if (compare_version("2", minimum_protocol_version) <= 0) { /* Propagate to all peers (including ourselves). * This ends up at attrd_peer_message(). */ send_attrd_message(NULL, xml); return; } #endif const char *rsc = crm_element_value(xml, F_ATTRD_RESOURCE); const char *op = crm_element_value(xml, F_ATTRD_OPERATION); const char *interval_spec = crm_element_value(xml, F_ATTRD_INTERVAL); /* Map this to an update */ crm_xml_add(xml, F_ATTRD_TASK, ATTRD_OP_UPDATE); /* Add regular expression matching desired attributes */ if (rsc) { char *pattern; if (op == NULL) { pattern = crm_strdup_printf(ATTRD_RE_CLEAR_ONE, rsc); } else { guint interval_ms = crm_parse_interval_spec(interval_spec); pattern = crm_strdup_printf(ATTRD_RE_CLEAR_OP, rsc, op, interval_ms); } crm_xml_add(xml, F_ATTRD_REGEX, pattern); free(pattern); } else { crm_xml_add(xml, F_ATTRD_REGEX, ATTRD_RE_CLEAR_ALL); } /* Make sure attribute and value are not set, so we delete via regex */ if (crm_element_value(xml, F_ATTRD_ATTRIBUTE)) { crm_xml_replace(xml, F_ATTRD_ATTRIBUTE, NULL); } if (crm_element_value(xml, F_ATTRD_VALUE)) { crm_xml_replace(xml, F_ATTRD_VALUE, NULL); } attrd_client_update(xml); } /*! * \internal * \brief Respond to a client refresh request (i.e. write out all attributes) * * \return void */ void attrd_client_refresh(void) { crm_info("Updating all attributes"); write_attributes(TRUE, TRUE); } /*! * \internal * \brief Build the XML reply to a client query * * param[in] attr Name of requested attribute * param[in] host Name of requested host (or NULL for all hosts) * * \return New XML reply * \note Caller is responsible for freeing the resulting XML */ static xmlNode *build_query_reply(const char *attr, const char *host) { xmlNode *reply = create_xml_node(NULL, __FUNCTION__); attribute_t *a; if (reply == NULL) { return NULL; } crm_xml_add(reply, F_TYPE, T_ATTRD); crm_xml_add(reply, F_ATTRD_VERSION, ATTRD_PROTOCOL_VERSION); /* If desired attribute exists, add its value(s) to the reply */ a = g_hash_table_lookup(attributes, attr); if (a) { attribute_value_t *v; xmlNode *host_value; crm_xml_add(reply, F_ATTRD_ATTRIBUTE, attr); /* Allow caller to use "localhost" to refer to local node */ if (safe_str_eq(host, "localhost")) { host = attrd_cluster->uname; crm_trace("Mapped localhost to %s", host); } /* If a specific node was requested, add its value */ if (host) { v = g_hash_table_lookup(a->values, host); host_value = create_xml_node(reply, XML_CIB_TAG_NODE); if (host_value == NULL) { free_xml(reply); return NULL; } crm_xml_add(host_value, F_ATTRD_HOST, host); crm_xml_add(host_value, F_ATTRD_VALUE, (v? v->current : NULL)); /* Otherwise, add all nodes' values */ } else { GHashTableIter iter; g_hash_table_iter_init(&iter, a->values); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &v)) { host_value = create_xml_node(reply, XML_CIB_TAG_NODE); if (host_value == NULL) { free_xml(reply); return NULL; } crm_xml_add(host_value, F_ATTRD_HOST, v->nodename); crm_xml_add(host_value, F_ATTRD_VALUE, v->current); } } } return reply; } /*! * \internal * \brief Respond to a client query * * \param[in] client Who queried us * \param[in] query Root of query XML * * \return void */ void attrd_client_query(crm_client_t *client, uint32_t id, uint32_t flags, xmlNode *query) { const char *attr; const char *origin = crm_element_value(query, F_ORIG); ssize_t rc; xmlNode *reply; if (origin == NULL) { origin = "unknown client"; } crm_debug("Query arrived from %s", origin); /* Request must specify attribute name to query */ attr = crm_element_value(query, F_ATTRD_ATTRIBUTE); if (attr == NULL) { crm_warn("Ignoring malformed query from %s (no attribute name given)", origin); return; } /* Build the XML reply */ reply = build_query_reply(attr, crm_element_value(query, F_ATTRD_HOST)); if (reply == NULL) { crm_err("Could not respond to query from %s: could not create XML reply", origin); return; } crm_log_xml_trace(reply, "Reply"); /* Send the reply to the client */ client->request_id = 0; if ((rc = crm_ipcs_send(client, id, reply, flags)) < 0) { crm_err("Could not respond to query from %s: %s (%lld)", origin, pcmk_strerror(-rc), (long long) -rc); } free_xml(reply); } /*! * \internal * \brief Clear failure-related attributes * * \param[in] peer Peer that sent clear request * \param[in] xml Request XML */ static void attrd_peer_clear_failure(crm_node_t *peer, xmlNode *xml) { const char *rsc = crm_element_value(xml, F_ATTRD_RESOURCE); const char *host = crm_element_value(xml, F_ATTRD_HOST); const char *op = crm_element_value(xml, F_ATTRD_OPERATION); const char *interval_spec = crm_element_value(xml, F_ATTRD_INTERVAL); guint interval_ms = crm_parse_interval_spec(interval_spec); char *attr = NULL; GHashTableIter iter; regex_t regex; if (attrd_failure_regex(®ex, rsc, op, interval_ms) != pcmk_ok) { crm_info("Ignoring invalid request to clear failures for %s", (rsc? rsc : "all resources")); return; } crm_xml_add(xml, F_ATTRD_TASK, ATTRD_OP_UPDATE); /* Make sure value is not set, so we delete */ if (crm_element_value(xml, F_ATTRD_VALUE)) { crm_xml_replace(xml, F_ATTRD_VALUE, NULL); } g_hash_table_iter_init(&iter, attributes); while (g_hash_table_iter_next(&iter, (gpointer *) &attr, NULL)) { if (regexec(®ex, attr, 0, NULL, 0) == 0) { crm_trace("Matched %s when clearing %s", attr, (rsc? rsc : "all resources")); crm_xml_add(xml, F_ATTRD_ATTRIBUTE, attr); attrd_peer_update(peer, xml, host, FALSE); } } regfree(®ex); } /*! \internal \brief Broadcast private attribute for local node with protocol version */ void attrd_broadcast_protocol() { xmlNode *attrd_op = create_xml_node(NULL, __FUNCTION__); crm_xml_add(attrd_op, F_TYPE, T_ATTRD); crm_xml_add(attrd_op, F_ORIG, crm_system_name); crm_xml_add(attrd_op, F_ATTRD_TASK, ATTRD_OP_UPDATE); crm_xml_add(attrd_op, F_ATTRD_ATTRIBUTE, CRM_ATTR_PROTOCOL); crm_xml_add(attrd_op, F_ATTRD_VALUE, ATTRD_PROTOCOL_VERSION); crm_xml_add_int(attrd_op, F_ATTRD_IS_PRIVATE, 1); attrd_client_update(attrd_op); free_xml(attrd_op); } void attrd_peer_message(crm_node_t *peer, xmlNode *xml) { - int peer_state = 0; const char *op = crm_element_value(xml, F_ATTRD_TASK); const char *election_op = crm_element_value(xml, F_CRM_TASK); const char *host = crm_element_value(xml, F_ATTRD_HOST); + bool peer_won = FALSE; - if(election_op) { - enum election_result rc = 0; - - crm_xml_add(xml, F_CRM_HOST_FROM, peer->uname); - rc = election_count_vote(writer, xml, TRUE); - switch(rc) { - case election_start: - free(peer_writer); - peer_writer = NULL; - election_vote(writer); - break; - case election_lost: - free(peer_writer); - peer_writer = strdup(peer->uname); - break; - default: - election_check(writer); - break; - } + if (election_op) { + attrd_handle_election_op(peer, xml); return; } - crm_element_value_int(xml, F_ATTRD_WRITER, &peer_state); - if(election_state(writer) == election_won - && peer_state == election_won - && safe_str_neq(peer->uname, attrd_cluster->uname)) { - crm_notice("Detected another attribute writer: %s", peer->uname); - election_vote(writer); - - } else if(peer_state == election_won) { - if(peer_writer == NULL) { - peer_writer = strdup(peer->uname); - crm_notice("Recorded attribute writer: %s", peer->uname); - - } else if(safe_str_neq(peer->uname, peer_writer)) { - crm_notice("Recorded new attribute writer: %s (was %s)", peer->uname, peer_writer); - free(peer_writer); - peer_writer = strdup(peer->uname); - } - } + peer_won = attrd_check_for_new_writer(peer, xml); if (safe_str_eq(op, ATTRD_OP_UPDATE) || safe_str_eq(op, ATTRD_OP_UPDATE_BOTH) || safe_str_eq(op, ATTRD_OP_UPDATE_DELAY)) { attrd_peer_update(peer, xml, host, FALSE); } else if (safe_str_eq(op, ATTRD_OP_SYNC)) { attrd_peer_sync(peer, xml); } else if (safe_str_eq(op, ATTRD_OP_PEER_REMOVE)) { attrd_peer_remove(host, TRUE, peer->uname); } else if (safe_str_eq(op, ATTRD_OP_CLEAR_FAILURE)) { /* It is not currently possible to receive this as a peer command, * but will be, if we one day enable propagating this operation. */ attrd_peer_clear_failure(peer, xml); } else if (safe_str_eq(op, ATTRD_OP_SYNC_RESPONSE) && safe_str_neq(peer->uname, attrd_cluster->uname)) { xmlNode *child = NULL; crm_info("Processing %s from %s", op, peer->uname); /* Clear the seen flag for attribute processing held only in the own node. */ - if (peer_state == election_won) { + if (peer_won) { clear_attribute_value_seen(); } for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) { host = crm_element_value(child, F_ATTRD_HOST); attrd_peer_update(peer, child, host, TRUE); } - if (peer_state == election_won) { + if (peer_won) { /* Synchronize if there is an attribute held only by own node that Writer does not have. */ attrd_current_only_attribute_update(peer, xml); } } } void attrd_peer_sync(crm_node_t *peer, xmlNode *xml) { GHashTableIter aIter; GHashTableIter vIter; attribute_t *a = NULL; attribute_value_t *v = NULL; xmlNode *sync = create_xml_node(NULL, __FUNCTION__); crm_xml_add(sync, F_ATTRD_TASK, ATTRD_OP_SYNC_RESPONSE); g_hash_table_iter_init(&aIter, attributes); while (g_hash_table_iter_next(&aIter, NULL, (gpointer *) & a)) { g_hash_table_iter_init(&vIter, a->values); while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & v)) { crm_debug("Syncing %s[%s] = %s to %s", a->id, v->nodename, v->current, peer?peer->uname:"everyone"); build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, a->user, a->is_private, v->nodename, v->nodeid, v->current); } } crm_debug("Syncing values to %s", peer?peer->uname:"everyone"); send_attrd_message(peer, sync); free_xml(sync); } /*! * \internal * \brief Remove all attributes and optionally peer cache entries for a node * * \param[in] host Name of node to purge * \param[in] uncache If TRUE, remove node from peer caches * \param[in] source Who requested removal (only used for logging) */ void attrd_peer_remove(const char *host, gboolean uncache, const char *source) { attribute_t *a = NULL; GHashTableIter aIter; CRM_CHECK(host != NULL, return); crm_notice("Removing all %s attributes for peer %s", host, source); g_hash_table_iter_init(&aIter, attributes); while (g_hash_table_iter_next(&aIter, NULL, (gpointer *) & a)) { if(g_hash_table_remove(a->values, host)) { crm_debug("Removed %s[%s] for peer %s", a->id, host, source); } } if (uncache) { crm_remote_peer_cache_remove(host); reap_crm_member(0, host); } } /*! * \internal * \brief Return host's hash table entry (creating one if needed) * * \param[in] values Hash table of values * \param[in] host Name of peer to look up * \param[in] xml XML describing the attribute * * \return Pointer to new or existing hash table entry */ static attribute_value_t * attrd_lookup_or_create_value(GHashTable *values, const char *host, xmlNode *xml) { attribute_value_t *v = g_hash_table_lookup(values, host); int is_remote = 0; crm_element_value_int(xml, F_ATTRD_IS_REMOTE, &is_remote); if (is_remote) { /* If we previously assumed this node was an unseen cluster node, * remove its entry from the cluster peer cache. */ crm_node_t *dup = crm_find_peer(0, host); if (dup && (dup->uuid == NULL)) { reap_crm_member(0, host); } /* Ensure this host is in the remote peer cache */ CRM_ASSERT(crm_remote_peer_get(host) != NULL); } if (v == NULL) { v = calloc(1, sizeof(attribute_value_t)); CRM_ASSERT(v != NULL); v->nodename = strdup(host); CRM_ASSERT(v->nodename != NULL); v->is_remote = is_remote; g_hash_table_replace(values, v->nodename, v); } return(v); } void attrd_current_only_attribute_update(crm_node_t *peer, xmlNode *xml) { GHashTableIter aIter; GHashTableIter vIter; attribute_t *a; attribute_value_t *v = NULL; xmlNode *sync = create_xml_node(NULL, __FUNCTION__); gboolean build = FALSE; crm_xml_add(sync, F_ATTRD_TASK, ATTRD_OP_SYNC_RESPONSE); g_hash_table_iter_init(&aIter, attributes); while (g_hash_table_iter_next(&aIter, NULL, (gpointer *) & a)) { g_hash_table_iter_init(&vIter, a->values); while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & v)) { if (safe_str_eq(v->nodename, attrd_cluster->uname) && v->seen == FALSE) { crm_trace("Syncing %s[%s] = %s to everyone.(from local only attributes)", a->id, v->nodename, v->current); build = TRUE; build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, a->user, a->is_private, v->nodename, v->nodeid, v->current); } else { crm_trace("Local attribute(%s[%s] = %s) was ignore.(another host) : [%s]", a->id, v->nodename, v->current, attrd_cluster->uname); continue; } } } if (build) { crm_debug("Syncing values to everyone.(from local only attributes)"); send_attrd_message(NULL, sync); } free_xml(sync); } void attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter) { bool update_both = FALSE; attribute_t *a; attribute_value_t *v = NULL; const char *op = crm_element_value(xml, F_ATTRD_TASK); const char *attr = crm_element_value(xml, F_ATTRD_ATTRIBUTE); const char *value = crm_element_value(xml, F_ATTRD_VALUE); if (attr == NULL) { crm_warn("Could not update attribute: peer did not specify name"); return; } update_both = ((op == NULL) // ATTRD_OP_SYNC_RESPONSE has no F_ATTRD_TASK || safe_str_eq(op, ATTRD_OP_UPDATE_BOTH)); // Look up or create attribute entry a = g_hash_table_lookup(attributes, attr); if (a == NULL) { if (update_both || safe_str_eq(op, ATTRD_OP_UPDATE)) { a = create_attribute(xml); } else { crm_warn("Could not update %s: attribute not found", attr); return; } } // Update attribute dampening if (update_both || safe_str_eq(op, ATTRD_OP_UPDATE_DELAY)) { const char *dvalue = crm_element_value(xml, F_ATTRD_DAMPEN); int dampen = 0; if (dvalue == NULL) { crm_warn("Could not update %s: peer did not specify value for delay", attr); return; } dampen = crm_get_msec(dvalue); if (dampen < 0) { crm_warn("Could not update %s: invalid delay value %dms (%s)", attr, dampen, dvalue); return; } if (a->timeout_ms != dampen) { mainloop_timer_del(a->timer); a->timeout_ms = dampen; if (dampen > 0) { a->timer = mainloop_timer_add(attr, a->timeout_ms, FALSE, attribute_timer_cb, a); crm_info("Update attribute %s delay to %dms (%s)", attr, dampen, dvalue); } else { a->timer = NULL; crm_info("Update attribute %s to remove delay", attr); } /* If dampening changed, do an immediate write-out, * otherwise repeated dampening changes would prevent write-outs */ write_or_elect_attribute(a); } if (!update_both) { return; } } // If no host was specified, update all hosts recursively if (host == NULL) { GHashTableIter vIter; crm_debug("Setting %s for all hosts to %s", attr, value); xml_remove_prop(xml, F_ATTRD_HOST_ID); g_hash_table_iter_init(&vIter, a->values); while (g_hash_table_iter_next(&vIter, (gpointer *) & host, NULL)) { attrd_peer_update(peer, xml, host, filter); } return; } // Update attribute value for one host v = attrd_lookup_or_create_value(a->values, host, xml); if (filter && safe_str_neq(v->current, value) && safe_str_eq(host, attrd_cluster->uname)) { xmlNode *sync = create_xml_node(NULL, __FUNCTION__); crm_notice("%s[%s]: local value '%s' takes priority over '%s' from %s", attr, host, v->current, value, peer->uname); crm_xml_add(sync, F_ATTRD_TASK, ATTRD_OP_SYNC_RESPONSE); v = g_hash_table_lookup(a->values, host); build_attribute_xml(sync, attr, a->set, a->uuid, a->timeout_ms, a->user, a->is_private, v->nodename, v->nodeid, v->current); - crm_xml_add_int(sync, F_ATTRD_WRITER, election_state(writer)); + attrd_xml_add_writer(sync); /* Broadcast in case any other nodes had the inconsistent value */ send_attrd_message(NULL, sync); free_xml(sync); } else if (safe_str_neq(v->current, value)) { crm_info("Setting %s[%s]: %s -> %s from %s", attr, host, v->current, value, peer->uname); free(v->current); v->current = (value? strdup(value) : NULL); a->changed = TRUE; // Write out new value or start dampening timer if (a->timeout_ms && a->timer) { crm_trace("Delayed write out (%dms) for %s", a->timeout_ms, attr); mainloop_timer_start(a->timer); } else { write_or_elect_attribute(a); } } else { crm_trace("Unchanged %s[%s] from %s is %s", attr, host, peer->uname, value); } /* Set the seen flag for attribute processing held only in the own node. */ v->seen = TRUE; /* If this is a cluster node whose node ID we are learning, remember it */ if ((v->nodeid == 0) && (v->is_remote == FALSE) && (crm_element_value_int(xml, F_ATTRD_HOST_ID, (int*)&v->nodeid) == 0)) { crm_node_t *known_peer = crm_get_peer(v->nodeid, host); crm_trace("Learned %s has node id %s", known_peer->uname, known_peer->uuid); - if (election_state(writer) == election_won) { + if (attrd_election_won()) { write_attributes(FALSE, FALSE); } } } void write_or_elect_attribute(attribute_t *a) { - enum election_result rc = election_state(writer); - if(rc == election_won) { + if (attrd_election_won()) { write_attribute(a, FALSE); - - } else if(rc == election_in_progress) { - crm_trace("Election in progress to determine who will write out %s", a->id); - - } else if(peer_writer == NULL) { - crm_info("Starting an election to determine who will write out %s", a->id); - election_vote(writer); - } else { - crm_trace("%s will write out %s, we are in state %d", peer_writer, a->id, rc); + attrd_start_election_if_needed(); } } gboolean attrd_election_cb(gpointer user_data) { - free(peer_writer); - peer_writer = strdup(attrd_cluster->uname); + attrd_declare_winner(); /* Update the peers after an election */ attrd_peer_sync(NULL, NULL); /* Update the CIB after an election */ write_attributes(TRUE, FALSE); return FALSE; } void attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *data) { - if (kind == crm_status_nstate) { - if (safe_str_eq(peer->state, CRM_NODE_MEMBER)) { - /* If we're the writer, send new peers a list of all attributes - * (unless it's a remote node, which doesn't run its own attrd) - */ - if ((election_state(writer) == election_won) - && !is_set(peer->flags, crm_remote_node)) { - attrd_peer_sync(peer, NULL); + bool remove_voter = FALSE; + + switch (kind) { + case crm_status_uname: + break; + + case crm_status_processes: + if (is_not_set(peer->processes, crm_get_cluster_proc())) { + remove_voter = TRUE; } - } else { - /* Remove all attribute values associated with lost nodes */ - attrd_peer_remove(peer->uname, FALSE, "loss"); - if (peer_writer && safe_str_eq(peer->uname, peer_writer)) { - free(peer_writer); - peer_writer = NULL; - crm_notice("Lost attribute writer %s", peer->uname); + break; + + case crm_status_nstate: + if (safe_str_eq(peer->state, CRM_NODE_MEMBER)) { + /* If we're the writer, send new peers a list of all attributes + * (unless it's a remote node, which doesn't run its own attrd) + */ + if (attrd_election_won() + && !is_set(peer->flags, crm_remote_node)) { + attrd_peer_sync(peer, NULL); + } + } else { + // Remove all attribute values associated with lost nodes + attrd_peer_remove(peer->uname, FALSE, "loss"); + remove_voter = TRUE; } - } + break; + } + + // In case an election is in progress, remove any vote by the node + if (remove_voter) { + attrd_remove_voter(peer); } } static void attrd_cib_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { int level = LOG_ERR; GHashTableIter iter; const char *peer = NULL; attribute_value_t *v = NULL; char *name = user_data; attribute_t *a = g_hash_table_lookup(attributes, name); if(a == NULL) { crm_info("Attribute %s no longer exists", name); return; } a->update = 0; if (rc == pcmk_ok && call_id < 0) { rc = call_id; } switch (rc) { case pcmk_ok: level = LOG_INFO; last_cib_op_done = call_id; if (a->timer && !a->timeout_ms) { // Remove temporary dampening for failed writes mainloop_timer_del(a->timer); a->timer = NULL; } break; case -pcmk_err_diff_failed: /* When an attr changes while the CIB is syncing */ case -ETIME: /* When an attr changes while there is a DC election */ case -ENXIO: /* When an attr changes while the CIB is syncing a * newer config from a node that just came up */ level = LOG_WARNING; break; } do_crm_log(level, "CIB update %d result for %s: %s " CRM_XS " rc=%d", call_id, a->id, pcmk_strerror(rc), rc); g_hash_table_iter_init(&iter, a->values); while (g_hash_table_iter_next(&iter, (gpointer *) & peer, (gpointer *) & v)) { do_crm_log(level, "* %s[%s]=%s", a->id, peer, v->requested); free(v->requested); v->requested = NULL; if (rc != pcmk_ok) { a->changed = TRUE; /* Attempt write out again */ } } - if (a->changed && (election_state(writer) == election_won)) { + if (a->changed && attrd_election_won()) { /* If we're re-attempting a write because the original failed, delay * the next attempt so we don't potentially flood the CIB manager * and logs with a zillion attempts per second. * * @TODO We could elect a new writer instead. However, we'd have to * somehow downgrade our vote, and we'd still need something like this * if all peers similarly fail to write this attribute (which may * indicate a corrupted attribute entry rather than a CIB issue). */ if (a->timer) { // Attribute has a dampening value, so use that as delay if (!mainloop_timer_running(a->timer)) { crm_trace("Delayed re-attempted write (%dms) for %s", a->timeout_ms, name); mainloop_timer_start(a->timer); } } else { /* Set a temporary dampening of 2 seconds (timer will continue * to exist until the attribute's dampening gets set or the * write succeeds). */ a->timer = mainloop_timer_add(a->id, 2000, FALSE, attribute_timer_cb, a); mainloop_timer_start(a->timer); } } } void write_attributes(bool all, bool ignore_delay) { GHashTableIter iter; attribute_t *a = NULL; crm_debug("Writing out %s attributes", all? "all" : "changed"); g_hash_table_iter_init(&iter, attributes); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & a)) { if (!all && a->unknown_peer_uuids) { // Try writing this attribute again, in case peer ID was learned a->changed = TRUE; } if(all || a->changed) { write_attribute(a, ignore_delay); } else { crm_debug("Skipping unchanged attribute %s", a->id); } } } static void build_update_element(xmlNode *parent, attribute_t *a, const char *nodeid, const char *value) { const char *set = NULL; xmlNode *xml_obj = NULL; xml_obj = create_xml_node(parent, XML_CIB_TAG_STATE); crm_xml_add(xml_obj, XML_ATTR_ID, nodeid); xml_obj = create_xml_node(xml_obj, XML_TAG_TRANSIENT_NODEATTRS); crm_xml_add(xml_obj, XML_ATTR_ID, nodeid); xml_obj = create_xml_node(xml_obj, XML_TAG_ATTR_SETS); if (a->set) { crm_xml_set_id(xml_obj, "%s", a->set); } else { crm_xml_set_id(xml_obj, "%s-%s", XML_CIB_TAG_STATUS, nodeid); } set = ID(xml_obj); xml_obj = create_xml_node(xml_obj, XML_CIB_TAG_NVPAIR); if (a->uuid) { crm_xml_set_id(xml_obj, "%s", a->uuid); } else { crm_xml_set_id(xml_obj, "%s-%s", set, a->id); } crm_xml_add(xml_obj, XML_NVPAIR_ATTR_NAME, a->id); if(value) { crm_xml_add(xml_obj, XML_NVPAIR_ATTR_VALUE, value); } else { crm_xml_add(xml_obj, XML_NVPAIR_ATTR_VALUE, ""); crm_xml_add(xml_obj, "__delete__", XML_NVPAIR_ATTR_VALUE); } } static void set_alert_attribute_value(GHashTable *t, attribute_value_t *v) { attribute_value_t *a_v = NULL; a_v = calloc(1, sizeof(attribute_value_t)); CRM_ASSERT(a_v != NULL); a_v->nodeid = v->nodeid; a_v->nodename = strdup(v->nodename); if (v->current != NULL) { a_v->current = strdup(v->current); } g_hash_table_replace(t, a_v->nodename, a_v); } static void send_alert_attributes_value(attribute_t *a, GHashTable *t) { int rc = 0; attribute_value_t *at = NULL; GHashTableIter vIter; g_hash_table_iter_init(&vIter, t); while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & at)) { rc = attrd_send_attribute_alert(at->nodename, at->nodeid, a->id, at->current); crm_trace("Sent alerts for %s[%s]=%s: nodeid=%d rc=%d", a->id, at->nodename, at->current, at->nodeid, rc); } } #define s_if_plural(i) (((i) == 1)? "" : "s") void write_attribute(attribute_t *a, bool ignore_delay) { int private_updates = 0, cib_updates = 0; xmlNode *xml_top = NULL; attribute_value_t *v = NULL; GHashTableIter iter; enum cib_call_options flags = cib_quorum_override; GHashTable *alert_attribute_value = NULL; if (a == NULL) { return; } /* If this attribute will be written to the CIB ... */ if (!a->is_private) { /* Defer the write if now's not a good time */ if (the_cib == NULL) { crm_info("Write out of '%s' delayed: cib not connected", a->id); return; } else if (a->update && (a->update < last_cib_op_done)) { crm_info("Write out of '%s' continuing: update %d considered lost", a->id, a->update); } else if (a->update) { crm_info("Write out of '%s' delayed: update %d in progress", a->id, a->update); return; } else if (mainloop_timer_running(a->timer)) { if (ignore_delay) { /* 'refresh' forces a write of the current value of all attributes * Cancel any existing timers, we're writing it NOW */ mainloop_timer_stop(a->timer); crm_debug("Write out of '%s': timer is running but ignore delay", a->id); } else { crm_info("Write out of '%s' delayed: timer is running", a->id); return; } } /* Initialize the status update XML */ xml_top = create_xml_node(NULL, XML_CIB_TAG_STATUS); } /* Attribute will be written shortly, so clear changed flag */ a->changed = FALSE; /* We will check all peers' uuids shortly, so initialize this to false */ a->unknown_peer_uuids = FALSE; /* Make the table for the attribute trap */ alert_attribute_value = g_hash_table_new_full(crm_strcase_hash, crm_strcase_equal, NULL, free_attribute_value); /* Iterate over each peer value of this attribute */ g_hash_table_iter_init(&iter, a->values); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & v)) { crm_node_t *peer = crm_get_peer_full(v->nodeid, v->nodename, CRM_GET_PEER_ANY); /* If the value's peer info does not correspond to a peer, ignore it */ if (peer == NULL) { crm_notice("Cannot update %s[%s]=%s because peer not known", a->id, v->nodename, v->current); continue; } /* If we're just learning the peer's node id, remember it */ if (peer->id && (v->nodeid == 0)) { crm_trace("Learned ID %u for node %s", peer->id, v->nodename); v->nodeid = peer->id; } /* If this is a private attribute, no update needs to be sent */ if (a->is_private) { private_updates++; continue; } /* If the peer is found, but its uuid is unknown, defer write */ if (peer->uuid == NULL) { a->unknown_peer_uuids = TRUE; crm_notice("Cannot update %s[%s]=%s because peer UUID not known " "(will retry if learned)", a->id, v->nodename, v->current); continue; } /* Add this value to status update XML */ crm_debug("Updating %s[%s]=%s (peer known as %s, UUID %s, ID %u/%u)", a->id, v->nodename, v->current, peer->uname, peer->uuid, peer->id, v->nodeid); build_update_element(xml_top, a, peer->uuid, v->current); cib_updates++; /* Preservation of the attribute to transmit alert */ set_alert_attribute_value(alert_attribute_value, v); free(v->requested); v->requested = NULL; if (v->current) { v->requested = strdup(v->current); } else { /* Older attrd versions don't know about the cib_mixed_update * flag so make sure it goes to the local cib which does */ flags |= cib_mixed_update|cib_scope_local; } } if (private_updates) { crm_info("Processed %d private change%s for %s, id=%s, set=%s", private_updates, s_if_plural(private_updates), a->id, (a->uuid? a->uuid : "n/a"), (a->set? a->set : "n/a")); } if (cib_updates) { crm_log_xml_trace(xml_top, __FUNCTION__); a->update = cib_internal_op(the_cib, CIB_OP_MODIFY, NULL, XML_CIB_TAG_STATUS, xml_top, NULL, flags, a->user); crm_info("Sent CIB request %d with %d change%s for %s (id %s, set %s)", a->update, cib_updates, s_if_plural(cib_updates), a->id, (a->uuid? a->uuid : "n/a"), (a->set? a->set : "n/a")); the_cib->cmds->register_callback_full(the_cib, a->update, 120, FALSE, strdup(a->id), "attrd_cib_callback", attrd_cib_callback, free); /* Transmit alert of the attribute */ send_alert_attributes_value(a, alert_attribute_value); } g_hash_table_destroy(alert_attribute_value); free_xml(xml_top); } diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c new file mode 100644 index 0000000000..9b779aa332 --- /dev/null +++ b/daemons/attrd/attrd_elections.c @@ -0,0 +1,140 @@ +/* + * Copyright 2013-2018 Andrew Beekhof + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. + */ + +#include +#include +#include +#include + +#include "pacemaker-attrd.h" + +static char *peer_writer = NULL; +static election_t *writer = NULL; + +void +attrd_election_init() +{ + writer = election_init(T_ATTRD, attrd_cluster->uname, 120000, + attrd_election_cb); +} + +void +attrd_election_fini() +{ + election_fini(writer); +} + +void +attrd_start_election_if_needed() +{ + if ((peer_writer == NULL) + && (election_state(writer) != election_in_progress)) { + crm_info("Starting an election to determine the writer"); + election_vote(writer); + } +} + +bool +attrd_election_won() +{ + return (election_state(writer) == election_won); +} + +void +attrd_handle_election_op(const crm_node_t *peer, xmlNode *xml) +{ + enum election_result rc = 0; + enum election_result previous = election_state(writer); + + crm_xml_add(xml, F_CRM_HOST_FROM, peer->uname); + rc = election_count_vote(writer, xml, TRUE); + switch(rc) { + case election_start: + free(peer_writer); + peer_writer = NULL; + crm_debug("Unsetting writer (was %s) and starting new election", + peer_writer? peer_writer : "unset"); + election_vote(writer); + break; + + case election_lost: + /* Losing to this peer does not mean this peer definitely won + * (another peer may eventually win). However if we don't already + * have a writer, we tentatively record this peer as writer so that + * we don't enter "peer_writer == NULL" blocks after this point + * (which might start new elections). + * + * However, we don't do this if the state was already lost, because + * we may just be getting the current state back when processing a + * late no-vote. + */ + if ((peer_writer == NULL) || (previous != election_lost)) { + free(peer_writer); + peer_writer = strdup(peer->uname); + crm_debug("Election lost, presuming %s is writer for now", + peer_writer); + } + break; + + case election_in_progress: + election_check(writer); + break; + + default: + crm_info("Ignoring election op from %s due to error", peer->uname); + break; + } +} + +bool +attrd_check_for_new_writer(const crm_node_t *peer, const xmlNode *xml) +{ + int peer_state = 0; + + crm_element_value_int(xml, F_ATTRD_WRITER, &peer_state); + if (peer_state == election_won) { + if ((election_state(writer) == election_won) + && safe_str_neq(peer->uname, attrd_cluster->uname)) { + crm_notice("Detected another attribute writer (%s), starting new election", + peer->uname); + election_vote(writer); + + } else if (safe_str_neq(peer->uname, peer_writer)) { + crm_notice("Recorded new attribute writer: %s (was %s)", + peer->uname, (peer_writer? peer_writer : "unset")); + free(peer_writer); + peer_writer = strdup(peer->uname); + } + } + return (peer_state == election_won); +} + +void +attrd_declare_winner() +{ + crm_notice("Recorded local node as attribute writer (was %s)", + (peer_writer? peer_writer : "unset")); + free(peer_writer); + peer_writer = strdup(attrd_cluster->uname); +} + +void +attrd_remove_voter(const crm_node_t *peer) +{ + if (peer_writer && safe_str_eq(peer->uname, peer_writer)) { + free(peer_writer); + peer_writer = NULL; + crm_notice("Lost attribute writer %s", peer->uname); + } + election_remove(writer, peer->uname); +} + +void +attrd_xml_add_writer(xmlNode *xml) +{ + crm_xml_add_int(xml, F_ATTRD_WRITER, election_state(writer)); +} diff --git a/daemons/attrd/pacemaker-attrd.c b/daemons/attrd/pacemaker-attrd.c index b4e265b8fe..d96e666aeb 100644 --- a/daemons/attrd/pacemaker-attrd.c +++ b/daemons/attrd/pacemaker-attrd.c @@ -1,399 +1,396 @@ /* * Copyright 2013-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include - #include +#include #include #include "pacemaker-attrd.h" lrmd_t *the_lrmd = NULL; crm_cluster_t *attrd_cluster = NULL; -election_t *writer = NULL; crm_trigger_t *attrd_config_read = NULL; static crm_exit_t attrd_exit_status = CRM_EX_OK; static void attrd_cpg_dispatch(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { uint32_t kind = 0; xmlNode *xml = NULL; const char *from = NULL; char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from); if(data == NULL) { return; } if (kind == crm_class_cluster) { xml = string2xml(data); } if (xml == NULL) { crm_err("Bad message of class %d received from %s[%u]: '%.120s'", kind, from, nodeid, data); } else { crm_node_t *peer = crm_get_peer(nodeid, from); attrd_peer_message(peer, xml); } free_xml(xml); free(data); } static void attrd_cpg_destroy(gpointer unused) { if (attrd_shutting_down()) { crm_info("Corosync disconnection complete"); } else { crm_crit("Lost connection to cluster layer, shutting down"); attrd_exit_status = CRM_EX_DISCONNECT; attrd_shutdown(0); } } static void attrd_cib_replaced_cb(const char *event, xmlNode * msg) { crm_notice("Updating all attributes after %s event", event); - if(election_state(writer) == election_won) { + if (attrd_election_won()) { write_attributes(TRUE, FALSE); } } static void attrd_cib_destroy_cb(gpointer user_data) { cib_t *conn = user_data; conn->cmds->signoff(conn); /* Ensure IPC is cleaned up */ if (attrd_shutting_down()) { crm_info("Connection disconnection complete"); } else { /* eventually this should trigger a reconnect, not a shutdown */ crm_crit("Lost connection to the CIB manager, shutting down"); attrd_exit_status = CRM_EX_DISCONNECT; attrd_shutdown(0); } return; } static void attrd_erase_cb(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { do_crm_log_unlikely((rc? LOG_NOTICE : LOG_DEBUG), "Cleared transient attributes: %s " CRM_XS " xpath=%s rc=%d", pcmk_strerror(rc), (char *) user_data, rc); } #define XPATH_TRANSIENT "//node_state[@uname='%s']/" XML_TAG_TRANSIENT_NODEATTRS /*! * \internal * \brief Wipe all transient attributes for this node from the CIB * * Clear any previous transient node attributes from the CIB. This is * normally done by the DC's controller when this node leaves the cluster, but * this handles the case where the node restarted so quickly that the * cluster layer didn't notice. * * \todo If pacemaker-attrd respawns after crashing (see PCMK_respawned), * ideally we'd skip this and sync our attributes from the writer. * However, currently we reject any values for us that the writer has, in * attrd_peer_update(). */ static void attrd_erase_attrs() { int call_id; char *xpath = crm_strdup_printf(XPATH_TRANSIENT, attrd_cluster->uname); crm_info("Clearing transient attributes from CIB " CRM_XS " xpath=%s", xpath); call_id = the_cib->cmds->remove(the_cib, xpath, NULL, cib_quorum_override | cib_xpath); the_cib->cmds->register_callback_full(the_cib, call_id, 120, FALSE, xpath, "attrd_erase_cb", attrd_erase_cb, free); } static int attrd_cib_connect(int max_retry) { static int attempts = 0; int rc = -ENOTCONN; the_cib = cib_new(); if (the_cib == NULL) { return -ENOTCONN; } do { if(attempts > 0) { sleep(attempts); } attempts++; crm_debug("Connection attempt %d to the CIB manager", attempts); rc = the_cib->cmds->signon(the_cib, T_ATTRD, cib_command); } while(rc != pcmk_ok && attempts < max_retry); if (rc != pcmk_ok) { crm_err("Connection to the CIB manager failed: %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); goto cleanup; } crm_debug("Connected to the CIB manager after %d attempts", attempts); rc = the_cib->cmds->set_connection_dnotify(the_cib, attrd_cib_destroy_cb); if (rc != pcmk_ok) { crm_err("Could not set disconnection callback"); goto cleanup; } rc = the_cib->cmds->add_notify_callback(the_cib, T_CIB_REPLACE_NOTIFY, attrd_cib_replaced_cb); if(rc != pcmk_ok) { crm_err("Could not set CIB notification callback"); goto cleanup; } rc = the_cib->cmds->add_notify_callback(the_cib, T_CIB_DIFF_NOTIFY, attrd_cib_updated_cb); if (rc != pcmk_ok) { crm_err("Could not set CIB notification callback (update)"); goto cleanup; } // We have no attribute values in memory, wipe the CIB to match attrd_erase_attrs(); // Set a trigger for reading the CIB (for the alerts section) attrd_config_read = mainloop_add_trigger(G_PRIORITY_HIGH, attrd_read_options, NULL); // Always read the CIB at start-up mainloop_set_trigger(attrd_config_read); /* Set a private attribute for ourselves with the protocol version we * support. This lets all nodes determine the minimum supported version * across all nodes. It also ensures that the writer learns our node name, * so it can send our attributes to the CIB. */ attrd_broadcast_protocol(); return pcmk_ok; cleanup: the_cib->cmds->signoff(the_cib); cib_delete(the_cib); the_cib = NULL; return -ENOTCONN; } static int32_t attrd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; crm_client_t *client = crm_client_get(c); xmlNode *xml = crm_ipcs_recv(client, data, size, &id, &flags); const char *op; if (xml == NULL) { crm_debug("No msg from %d (%p)", crm_ipcs_client_pid(c), c); return 0; } #if ENABLE_ACL CRM_ASSERT(client->user != NULL); crm_acl_get_set_user(xml, F_ATTRD_USER, client->user); #endif crm_trace("Processing msg from %d (%p)", crm_ipcs_client_pid(c), c); crm_log_xml_trace(xml, __FUNCTION__); op = crm_element_value(xml, F_ATTRD_TASK); if (client->name == NULL) { const char *value = crm_element_value(xml, F_ORIG); client->name = crm_strdup_printf("%s.%d", value?value:"unknown", client->pid); } if (safe_str_eq(op, ATTRD_OP_PEER_REMOVE)) { attrd_send_ack(client, id, flags); attrd_client_peer_remove(client->name, xml); } else if (safe_str_eq(op, ATTRD_OP_CLEAR_FAILURE)) { attrd_send_ack(client, id, flags); attrd_client_clear_failure(xml); } else if (safe_str_eq(op, ATTRD_OP_UPDATE)) { attrd_send_ack(client, id, flags); attrd_client_update(xml); } else if (safe_str_eq(op, ATTRD_OP_UPDATE_BOTH)) { attrd_send_ack(client, id, flags); attrd_client_update(xml); } else if (safe_str_eq(op, ATTRD_OP_UPDATE_DELAY)) { attrd_send_ack(client, id, flags); attrd_client_update(xml); } else if (safe_str_eq(op, ATTRD_OP_REFRESH)) { attrd_send_ack(client, id, flags); attrd_client_refresh(); } else if (safe_str_eq(op, ATTRD_OP_QUERY)) { /* queries will get reply, so no ack is necessary */ attrd_client_query(client, id, flags, xml); } else { crm_info("Ignoring request from client %s with unknown operation %s", client->name, op); } free_xml(xml); return 0; } static int attrd_cluster_connect() { attrd_cluster = calloc(1, sizeof(crm_cluster_t)); attrd_cluster->destroy = attrd_cpg_destroy; attrd_cluster->cpg.cpg_deliver_fn = attrd_cpg_dispatch; attrd_cluster->cpg.cpg_confchg_fn = pcmk_cpg_membership; crm_set_status_callback(&attrd_peer_change_cb); if (crm_cluster_connect(attrd_cluster) == FALSE) { crm_err("Cluster connection failed"); return -ENOTCONN; } return pcmk_ok; } /* *INDENT-OFF* */ static struct crm_option long_options[] = { /* Top-level Options */ {"help", 0, 0, '?', "\tThis text"}, {"verbose", 0, 0, 'V', "\tIncrease debug output"}, {0, 0, 0, 0} }; /* *INDENT-ON* */ int main(int argc, char **argv) { int flag = 0; int index = 0; int argerr = 0; qb_ipcs_service_t *ipcs = NULL; attrd_init_mainloop(); crm_log_preinit(NULL, argc, argv); crm_set_options(NULL, "[options]", long_options, "Daemon for aggregating and atomically storing node attribute updates into the CIB"); mainloop_add_signal(SIGTERM, attrd_shutdown); while (1) { flag = crm_get_option(argc, argv, &index); if (flag == -1) break; switch (flag) { case 'V': crm_bump_log_level(argc, argv); break; case 'h': /* Help message */ crm_help(flag, CRM_EX_OK); break; default: ++argerr; break; } } if (optind > argc) { ++argerr; } if (argerr) { crm_help('?', CRM_EX_USAGE); } crm_log_init(T_ATTRD, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); crm_info("Starting up"); attributes = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, free_attribute); if (attrd_cluster_connect() != pcmk_ok) { attrd_exit_status = CRM_EX_FATAL; goto done; } crm_info("Cluster connection active"); + attrd_election_init(); + if (attrd_cib_connect(10) != pcmk_ok) { attrd_exit_status = CRM_EX_FATAL; goto done; } crm_info("CIB connection active"); - writer = election_init(T_ATTRD, attrd_cluster->uname, 120000, attrd_election_cb); attrd_init_ipc(&ipcs, attrd_ipc_dispatch); crm_info("Accepting attribute updates"); - attrd_run_mainloop(); done: crm_info("Shutting down attribute manager"); - election_fini(writer); + attrd_election_fini(); if (ipcs) { crm_client_disconnect_all(ipcs); qb_ipcs_destroy(ipcs); g_hash_table_destroy(attributes); } attrd_lrmd_disconnect(); attrd_cib_disconnect(); return crm_exit(attrd_exit_status); } diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h index 284f05d190..9390242cff 100644 --- a/daemons/attrd/pacemaker-attrd.h +++ b/daemons/attrd/pacemaker-attrd.h @@ -1,116 +1,126 @@ /* * Copyright 2013-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #ifndef PACEMAKER_ATTRD__H # define PACEMAKER_ATTRD__H #include #include #include #include #include #include void attrd_init_mainloop(void); void attrd_run_mainloop(void); gboolean attrd_shutting_down(void); void attrd_shutdown(int nsig); void attrd_init_ipc(qb_ipcs_service_t **ipcs, qb_ipcs_msg_process_fn dispatch_fn); void attrd_cib_disconnect(void); gboolean attrd_value_needs_expansion(const char *value); int attrd_expand_value(const char *value, const char *old_value); /* regular expression to clear failures of all resources */ #define ATTRD_RE_CLEAR_ALL \ "^(" CRM_FAIL_COUNT_PREFIX "|" CRM_LAST_FAILURE_PREFIX ")-" /* regular expression to clear failure of all operations for one resource * (format takes resource name) * * @COMPAT attributes set < 1.1.17: * also match older attributes that do not have the operation part */ #define ATTRD_RE_CLEAR_ONE ATTRD_RE_CLEAR_ALL "%s(#.+_[0-9]+)?$" /* regular expression to clear failure of one operation for one resource * (format takes resource name, operation name, and interval) * * @COMPAT attributes set < 1.1.17: * also match older attributes that do not have the operation part */ #define ATTRD_RE_CLEAR_OP ATTRD_RE_CLEAR_ALL "%s(#%s_%u)?$" int attrd_failure_regex(regex_t *regex, const char *rsc, const char *op, guint interval_ms); extern cib_t *the_cib; /* Alerts */ extern lrmd_t *the_lrmd; extern crm_trigger_t *attrd_config_read; void attrd_lrmd_disconnect(void); gboolean attrd_read_options(gpointer user_data); void attrd_cib_updated_cb(const char *event, xmlNode *msg); int attrd_send_attribute_alert(const char *node, int nodeid, const char *attr, const char *value); +// Elections +void attrd_election_init(void); +void attrd_election_fini(void); +void attrd_start_election_if_needed(void); +bool attrd_election_won(void); +void attrd_handle_election_op(const crm_node_t *peer, xmlNode *xml); +bool attrd_check_for_new_writer(const crm_node_t *peer, const xmlNode *xml); +void attrd_declare_winner(void); +void attrd_remove_voter(const crm_node_t *peer); +void attrd_xml_add_writer(xmlNode *xml); + typedef struct attribute_s { char *uuid; /* TODO: Remove if at all possible */ char *id; char *set; GHashTable *values; int update; int timeout_ms; /* TODO: refactor these three as a bitmask */ bool changed; /* whether attribute value has changed since last write */ bool unknown_peer_uuids; /* whether we know we're missing a peer uuid */ gboolean is_private; /* whether to keep this attribute out of the CIB */ mainloop_timer_t *timer; char *user; } attribute_t; typedef struct attribute_value_s { uint32_t nodeid; gboolean is_remote; char *nodename; char *current; char *requested; gboolean seen; } attribute_value_t; crm_cluster_t *attrd_cluster; GHashTable *attributes; -election_t *writer; #define attrd_send_ack(client, id, flags) \ crm_ipcs_send_ack((client), (id), (flags), "ack", __FUNCTION__, __LINE__) void write_attributes(bool all, bool ignore_delay); void attrd_broadcast_protocol(void); void attrd_peer_message(crm_node_t *client, xmlNode *msg); void attrd_client_peer_remove(const char *client_name, xmlNode *xml); void attrd_client_clear_failure(xmlNode *xml); void attrd_client_update(xmlNode *xml); void attrd_client_refresh(void); void attrd_client_query(crm_client_t *client, uint32_t id, uint32_t flags, xmlNode *query); void free_attribute(gpointer data); gboolean attrd_election_cb(gpointer user_data); void attrd_peer_change_cb(enum crm_status_type type, crm_node_t *peer, const void *data); #endif /* PACEMAKER_ATTRD__H */ diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c index 0649a44748..a188263109 100644 --- a/daemons/controld/controld_callbacks.c +++ b/daemons/controld/controld_callbacks.c @@ -1,331 +1,339 @@ /* * Copyright 2004-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* From join_dc... */ extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void crmd_ha_msg_filter(xmlNode * msg) { if (AM_I_DC) { const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) { const char *from = crm_element_value(msg, F_ORIG); if (safe_str_neq(from, fsa_our_uname)) { int level = LOG_INFO; const char *op = crm_element_value(msg, F_CRM_TASK); /* make sure the election happens NOW */ if (fsa_state != S_ELECTION) { ha_msg_input_t new_input; level = LOG_WARNING; new_input.msg = msg; register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, &new_input, __FUNCTION__); } do_crm_log(level, "Another DC detected: %s (op=%s)", from, op); goto done; } } } else { const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); if (safe_str_eq(sys_to, CRM_SYSTEM_DC)) { return; } } /* crm_log_xml_trace("HA[inbound]", msg); */ route_message(C_HA_MESSAGE, msg); done: trigger_fsa(fsa_source); } /*! * \internal * \brief Check whether a node is online * * \param[in] node Node to check * * \retval -1 if completely dead * \retval 0 if partially alive * \retval 1 if completely alive */ static int node_alive(const crm_node_t *node) { if (is_set(node->flags, crm_remote_node)) { // Pacemaker Remote nodes can't be partially alive return safe_str_eq(node->state, CRM_NODE_MEMBER)? 1: -1; } else if (crm_is_peer_active(node)) { // Completely up cluster node: both cluster member and peer return 1; } else if (is_not_set(node->processes, crm_get_cluster_proc()) && safe_str_neq(node->state, CRM_NODE_MEMBER)) { // Completely down cluster node: neither cluster member nor peer return -1; } // Partially up cluster node: only cluster member or only peer return 0; } #define state_text(state) ((state)? (const char *)(state) : "in unknown state") void peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { uint32_t old = 0; bool appeared = FALSE; bool is_remote = is_set(node->flags, crm_remote_node); /* The controller waits to receive some information from the membership * layer before declaring itself operational. If this is being called for a * cluster node, indicate that we have it. */ if (!is_remote) { set_bit(fsa_input_register, R_PEER_DATA); } if (node->uname == NULL) { return; } switch (type) { case crm_status_uname: /* If we've never seen the node, then it also won't be in the status section */ crm_info("%s node %s is now %s", (is_remote? "Remote" : "Cluster"), node->uname, state_text(node->state)); return; case crm_status_nstate: /* This callback should not be called unless the state actually * changed, but here's a failsafe just in case. */ CRM_CHECK(safe_str_neq(data, node->state), return); crm_info("%s node %s is now %s (was %s)", (is_remote? "Remote" : "Cluster"), node->uname, state_text(node->state), state_text(data)); if (safe_str_eq(CRM_NODE_MEMBER, node->state)) { appeared = TRUE; if (!is_remote) { remove_stonith_cleanup(node->uname); } + } else { + controld_remove_voter(node->uname); } crmd_alert_node_event(node); break; case crm_status_processes: CRM_CHECK(data != NULL, return); old = *(const uint32_t *)data; appeared = is_set(node->processes, crm_get_cluster_proc()); crm_info("Node %s is %s a peer " CRM_XS " DC=%s old=0x%07x new=0x%07x", node->uname, (appeared? "now" : "no longer"), (AM_I_DC? "true" : (fsa_our_dc? fsa_our_dc : "")), old, node->processes); if (is_not_set((node->processes ^ old), crm_get_cluster_proc())) { /* Peer status did not change. This should not be possible, * since we don't track process flags other than peer status. */ crm_trace("Process flag 0x%7x did not change from 0x%7x to 0x%7x", crm_get_cluster_proc(), old, node->processes); return; - } else if (is_not_set(fsa_input_register, R_CIB_CONNECTED)) { + } + + if (!appeared) { + controld_remove_voter(node->uname); + } + + if (is_not_set(fsa_input_register, R_CIB_CONNECTED)) { crm_trace("Ignoring peer status change because not connected to CIB"); return; } else if (fsa_state == S_STOPPING) { crm_trace("Ignoring peer status change because stopping"); return; } if (safe_str_eq(node->uname, fsa_our_uname) && !appeared) { /* Did we get evicted? */ crm_notice("Our peer connection failed"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL); } else if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) { /* Did the DC leave us? */ crm_notice("Our peer on the DC (%s) is dead", fsa_our_dc); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); /* @COMPAT DC < 1.1.13: If a DC shuts down normally, we don't * want to fence it. Newer DCs will send their shutdown request * to all peers, who will update the DC's expected state to * down, thus avoiding fencing. We can safely erase the DC's * transient attributes when it leaves in that case. However, * the only way to avoid fencing older DCs is to leave the * transient attributes intact until it rejoins. */ if (compare_version(fsa_our_dc_version, "3.0.9") > 0) { erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); } } else if(AM_I_DC) { if (appeared) { te_trigger_stonith_history_sync(); } else { erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); } } break; } if (AM_I_DC) { xmlNode *update = NULL; int flags = node_update_peer; int alive = node_alive(node); crm_action_t *down = match_down_event(node->uuid); crm_trace("Alive=%d, appeared=%d, down=%d", alive, appeared, (down? down->id : -1)); if (appeared && (alive > 0)) { register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } if (down) { const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRM_OP_FENCE)) { /* tengine_stonith_callback() confirms fence actions */ crm_trace("Updating CIB %s fencer reported fencing of %s complete", (down->confirmed? "after" : "before"), node->uname); } else if (!appeared && safe_str_eq(task, CRM_OP_SHUTDOWN)) { // Shutdown actions are immediately confirmed (i.e. no_wait) if (!is_remote) { flags |= node_update_join | node_update_expected; crmd_peer_down(node, FALSE); check_join_state(fsa_state, __FUNCTION__); } if (alive >= 0) { crm_info("%s of peer %s is in progress " CRM_XS " action=%d", task, node->uname, down->id); } else { crm_notice("%s of peer %s is complete " CRM_XS " action=%d", task, node->uname, down->id); update_graph(transition_graph, down); trigger_graph(); } } else { crm_trace("Node %s is %s, was expected to %s (op %d)", node->uname, ((alive > 0)? "alive" : ((alive < 0)? "dead" : "partially alive")), task, down->id); } } else if (appeared == FALSE) { crm_warn("Stonith/shutdown of node %s was not expected", node->uname); if (!is_remote) { crm_update_peer_join(__FUNCTION__, node, crm_join_none); check_join_state(fsa_state, __FUNCTION__); } abort_transition(INFINITY, tg_restart, "Node failure", NULL); fail_incompletable_actions(transition_graph, node->uuid); } else { crm_trace("Node %s came up, was not expected to be down", node->uname); } if (is_remote) { /* A pacemaker_remote node won't have its cluster status updated * in the CIB by membership-layer callbacks, so do it here. */ flags |= node_update_cluster; /* Trigger resource placement on newly integrated nodes */ if (appeared) { abort_transition(INFINITY, tg_restart, "pacemaker_remote node integrated", NULL); } } /* Update the CIB node state */ update = create_node_state_update(node, flags, NULL, __FUNCTION__); if (update == NULL) { crm_debug("Node state update not yet possible for %s", node->uname); } else { fsa_cib_anon_update(XML_CIB_TAG_STATUS, update); } free_xml(update); } trigger_fsa(fsa_source); } void crmd_cib_connection_destroy(gpointer user_data) { CRM_CHECK(user_data == fsa_cib_conn,;); crm_trace("Invoked"); trigger_fsa(fsa_source); fsa_cib_conn->state = cib_disconnected; if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Connection to the CIB manager terminated"); return; } // @TODO This should trigger a reconnect, not a shutdown crm_crit("Lost connection to the CIB manager, shutting down"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit(fsa_input_register, R_CIB_CONNECTED); return; } gboolean crm_fsa_trigger(gpointer user_data) { crm_trace("Invoked (queue len: %d)", g_list_length(fsa_message_queue)); s_crmd_fsa(C_FSA_INTERNAL); crm_trace("Exited (queue len: %d)", g_list_length(fsa_message_queue)); return TRUE; } diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index 633520beaa..cf2a92b10a 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -1,913 +1,901 @@ /* * Copyright 2004-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include qb_ipcs_service_t *ipcs = NULL; #if SUPPORT_COROSYNC extern gboolean crm_connect_corosync(crm_cluster_t * cluster); #endif void crm_shutdown(int nsig); gboolean crm_read_options(gpointer user_data); gboolean fsa_has_quorum = FALSE; crm_trigger_t *fsa_source = NULL; crm_trigger_t *config_read = NULL; bool no_quorum_suicide_escalation = FALSE; -static gboolean -election_timeout_popped(gpointer data) -{ - /* Not everyone voted */ - crm_info("Election failed: Declaring ourselves the winner"); - register_fsa_input(C_TIMER_POPPED, I_ELECTION_DC, NULL); - return FALSE; -} - /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; static crm_cluster_t *cluster = NULL; if (cluster == NULL) { cluster = calloc(1, sizeof(crm_cluster_t)); } if (action & A_HA_DISCONNECT) { crm_cluster_disconnect(cluster); crm_info("Disconnected from the cluster"); set_bit(fsa_input_register, R_HA_DISCONNECTED); } if (action & A_HA_CONNECT) { crm_set_status_callback(&peer_update_callback); crm_set_autoreap(FALSE); if (is_corosync_cluster()) { #if SUPPORT_COROSYNC registered = crm_connect_corosync(cluster); #endif } - fsa_election = election_init(NULL, cluster->uname, 60000/*60s*/, election_timeout_popped); + controld_election_init(cluster->uname); fsa_our_uname = cluster->uname; fsa_our_uuid = cluster->uuid; if(cluster->uuid == NULL) { crm_err("Could not obtain local uuid"); registered = FALSE; } if (registered == FALSE) { set_bit(fsa_input_register, R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } populate_cib_nodes(node_update_none, __FUNCTION__); clear_bit(fsa_input_register, R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ set_bit(fsa_input_register, R_SHUTDOWN); if (stonith_api) { /* Prevent it from coming up again */ clear_bit(fsa_input_register, R_ST_REQUIRED); crm_info("Disconnecting from fencer"); stonith_api->cmds->disconnect(stonith_api); } } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; set_bit(fsa_input_register, R_SHUTDOWN); crm_info("Sending shutdown request to all peers (DC is %s)", (fsa_our_dc? fsa_our_dc : "not set")); msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); /* set_bit(fsa_input_register, R_STAYDOWN); */ if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free_xml(msg); } extern crm_ipc_t *attrd_ipc; extern char *max_generation_from; extern xmlNode *max_generation_xml; extern GHashTable *resource_history; extern GHashTable *voted; extern char *te_client_id; crm_exit_t crmd_fast_exit(crm_exit_t exit_code) { if (is_set(fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn "CRM_XS" remapping exit code %d to %d", exit_code, CRM_EX_FATAL); exit_code = CRM_EX_FATAL; } else if ((exit_code == CRM_EX_OK) && is_set(fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); exit_code = CRM_EX_ERROR; } return crm_exit(exit_code); } crm_exit_t crmd_exit(crm_exit_t exit_code) { GListPtr gIter = NULL; GMainLoop *mloop = crmd_mainloop; static bool in_progress = FALSE; if (in_progress && (exit_code == CRM_EX_OK)) { crm_debug("Exit is already in progress"); return exit_code; } else if(in_progress) { crm_notice("Error during shutdown process, exiting now with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(exit_code); } in_progress = TRUE; crm_trace("Preparing to exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); /* Suppress secondary errors resulting from us disconnecting everything */ set_bit(fsa_input_register, R_HA_DISCONNECTED); /* Close all IPC servers and clients to ensure any and all shared memory files are cleaned up */ if(ipcs) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; } if (attrd_ipc) { crm_trace("Closing connection to pacemaker-attrd"); crm_ipc_close(attrd_ipc); crm_ipc_destroy(attrd_ipc); attrd_ipc = NULL; } pe_subsystem_free(); if(stonith_api) { crm_trace("Disconnecting fencing API"); clear_bit(fsa_input_register, R_ST_REQUIRED); stonith_api->cmds->free(stonith_api); stonith_api = NULL; } if ((exit_code == CRM_EX_OK) && (crmd_mainloop == NULL)) { crm_debug("No mainloop detected"); exit_code = CRM_EX_ERROR; } /* On an error, just get out. * * Otherwise, make the effort to have mainloop exit gracefully so * that it (mostly) cleans up after itself and valgrind has less * to report on - allowing real errors stand out */ if (exit_code != CRM_EX_OK) { crm_notice("Forcing immediate exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); crm_write_blackbox(SIGTRAP, NULL); return crmd_fast_exit(exit_code); } /* Clean up as much memory as possible for valgrind */ for (gIter = fsa_message_queue; gIter != NULL; gIter = gIter->next) { fsa_data_t *fsa_data = gIter->data; crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); } clear_bit(fsa_input_register, R_MEMBERSHIP); g_list_free(fsa_message_queue); fsa_message_queue = NULL; metadata_cache_fini(); - - election_fini(fsa_election); - fsa_election = NULL; + controld_election_fini(); /* Tear down the CIB manager connection, but don't free it yet -- it could * be used when we drain the mainloop later. */ cib_free_callbacks(fsa_cib_conn); fsa_cib_conn->cmds->signoff(fsa_cib_conn); verify_stopped(fsa_state, LOG_WARNING); clear_bit(fsa_input_register, R_LRM_CONNECTED); lrm_state_destroy_all(); /* This basically will not work, since mainloop has a reference to it */ mainloop_destroy_trigger(fsa_source); fsa_source = NULL; mainloop_destroy_trigger(config_read); config_read = NULL; mainloop_destroy_trigger(stonith_reconnect); stonith_reconnect = NULL; mainloop_destroy_trigger(transition_trigger); transition_trigger = NULL; crm_client_cleanup(); crm_peer_destroy(); crm_timer_stop(transition_timer); crm_timer_stop(integration_timer); crm_timer_stop(finalization_timer); crm_timer_stop(election_trigger); - election_timeout_stop(fsa_election); crm_timer_stop(shutdown_escalation_timer); crm_timer_stop(wait_timer); crm_timer_stop(recheck_timer); free(transition_timer); transition_timer = NULL; free(integration_timer); integration_timer = NULL; free(finalization_timer); finalization_timer = NULL; free(election_trigger); election_trigger = NULL; free(shutdown_escalation_timer); shutdown_escalation_timer = NULL; free(wait_timer); wait_timer = NULL; free(recheck_timer); recheck_timer = NULL; free(fsa_our_dc_version); fsa_our_dc_version = NULL; free(fsa_our_uname); fsa_our_uname = NULL; free(fsa_our_uuid); fsa_our_uuid = NULL; free(fsa_our_dc); fsa_our_dc = NULL; free(fsa_cluster_name); fsa_cluster_name = NULL; free(te_uuid); te_uuid = NULL; free(te_client_id); te_client_id = NULL; free(fsa_pe_ref); fsa_pe_ref = NULL; free(failed_stop_offset); failed_stop_offset = NULL; free(failed_start_offset); failed_start_offset = NULL; free(max_generation_from); max_generation_from = NULL; free_xml(max_generation_xml); max_generation_xml = NULL; mainloop_destroy_signal(SIGPIPE); mainloop_destroy_signal(SIGUSR1); mainloop_destroy_signal(SIGTERM); mainloop_destroy_signal(SIGTRAP); /* leave SIGCHLD engaged as we might still want to drain some service-actions */ if (mloop) { GMainContext *ctx = g_main_loop_get_context(crmd_mainloop); /* Don't re-enter this block */ crmd_mainloop = NULL; /* no signals on final draining anymore */ mainloop_destroy_signal(SIGCHLD); crm_trace("Draining mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); { int lpc = 0; while((g_main_context_pending(ctx) && lpc < 10)) { lpc++; crm_trace("Iteration %d", lpc); g_main_context_dispatch(ctx); } } crm_trace("Closing mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); g_main_loop_quit(mloop); /* Won't do anything yet, since we're inside it now */ g_main_loop_unref(mloop); } else { mainloop_destroy_signal(SIGCHLD); } cib_delete(fsa_cib_conn); fsa_cib_conn = NULL; throttle_fini(); /* Graceful */ crm_trace("Done preparing for exit with status %d (%s)", exit_code, crm_exit_str(exit_code)); return exit_code; } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_exit_t exit_code = CRM_EX_OK; int log_level = LOG_INFO; const char *exit_type = "gracefully"; if (action & A_EXIT_1) { log_level = LOG_ERR; exit_type = "forcefully"; exit_code = CRM_EX_ERROR; } verify_stopped(cur_state, LOG_ERR); do_crm_log(log_level, "Performing %s - %s exiting the controller", fsa_action2string(action), exit_type); crm_info("[%s] stopped (%d)", crm_system_name, exit_code); crmd_exit(exit_code); } static void sigpipe_ignore(int nsig) { return; } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int was_error = 0; crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); mainloop_add_signal(SIGPIPE, sigpipe_ignore); fsa_source = mainloop_add_trigger(G_PRIORITY_HIGH, crm_fsa_trigger, NULL); config_read = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); transition_trigger = mainloop_add_trigger(G_PRIORITY_LOW, te_graph_trigger, NULL); crm_debug("Creating CIB manager and executor objects"); fsa_cib_conn = cib_new(); lrm_state_init_local(); /* set up the timers */ transition_timer = calloc(1, sizeof(fsa_timer_t)); integration_timer = calloc(1, sizeof(fsa_timer_t)); finalization_timer = calloc(1, sizeof(fsa_timer_t)); election_trigger = calloc(1, sizeof(fsa_timer_t)); shutdown_escalation_timer = calloc(1, sizeof(fsa_timer_t)); wait_timer = calloc(1, sizeof(fsa_timer_t)); recheck_timer = calloc(1, sizeof(fsa_timer_t)); if (election_trigger != NULL) { election_trigger->source_id = 0; election_trigger->period_ms = -1; election_trigger->fsa_input = I_DC_TIMEOUT; election_trigger->callback = crm_timer_popped; election_trigger->repeat = FALSE; } else { was_error = TRUE; } if (transition_timer != NULL) { transition_timer->source_id = 0; transition_timer->period_ms = -1; transition_timer->fsa_input = I_PE_CALC; transition_timer->callback = crm_timer_popped; transition_timer->repeat = FALSE; } else { was_error = TRUE; } if (integration_timer != NULL) { integration_timer->source_id = 0; integration_timer->period_ms = -1; integration_timer->fsa_input = I_INTEGRATED; integration_timer->callback = crm_timer_popped; integration_timer->repeat = FALSE; } else { was_error = TRUE; } if (finalization_timer != NULL) { finalization_timer->source_id = 0; finalization_timer->period_ms = -1; finalization_timer->fsa_input = I_FINALIZED; finalization_timer->callback = crm_timer_popped; finalization_timer->repeat = FALSE; /* for possible enabling... a bug in the join protocol left * a slave in S_PENDING while we think it's in S_NOT_DC * * raising I_FINALIZED put us into a transition loop which is * never resolved. * in this loop we continually send probes which the node * NACK's because it's in S_PENDING * * if we have nodes where the cluster layer is active but the * CRM is not... then this will be handled in the * integration phase */ finalization_timer->fsa_input = I_ELECTION; } else { was_error = TRUE; } if (shutdown_escalation_timer != NULL) { shutdown_escalation_timer->source_id = 0; shutdown_escalation_timer->period_ms = -1; shutdown_escalation_timer->fsa_input = I_STOP; shutdown_escalation_timer->callback = crm_timer_popped; shutdown_escalation_timer->repeat = FALSE; } else { was_error = TRUE; } if (wait_timer != NULL) { wait_timer->source_id = 0; wait_timer->period_ms = 2000; wait_timer->fsa_input = I_NULL; wait_timer->callback = crm_timer_popped; wait_timer->repeat = FALSE; } else { was_error = TRUE; } if (recheck_timer != NULL) { recheck_timer->source_id = 0; recheck_timer->period_ms = -1; recheck_timer->fsa_input = I_PE_CALC; recheck_timer->callback = crm_timer_popped; recheck_timer->repeat = FALSE; } else { was_error = TRUE; } if (was_error) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } static int32_t crmd_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { crm_trace("Connection %p", c); if (crm_client_new(c, uid, gid) == NULL) { return -EIO; } return 0; } static void crmd_ipc_created(qb_ipcs_connection_t * c) { crm_trace("Connection %p", c); } static int32_t crmd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; crm_client_t *client = crm_client_get(c); xmlNode *msg = crm_ipcs_recv(client, data, size, &id, &flags); crm_trace("Invoked: %s", crm_client_name(client)); crm_ipcs_send_ack(client, id, flags, "ack", __FUNCTION__, __LINE__); if (msg == NULL) { return 0; } #if ENABLE_ACL CRM_ASSERT(client->user != NULL); crm_acl_get_set_user(msg, F_CRM_USER, client->user); #endif crm_trace("Processing msg from %s", crm_client_name(client)); crm_log_xml_trace(msg, "controller[inbound]"); crm_xml_add(msg, F_CRM_SYS_FROM, client->id); if (crmd_authorize_message(msg, client, NULL)) { route_message(C_IPC_MESSAGE, msg); } trigger_fsa(fsa_source); free_xml(msg); return 0; } static int32_t crmd_ipc_closed(qb_ipcs_connection_t * c) { crm_client_t *client = crm_client_get(c); if (client) { crm_trace("Disconnecting %sregistered client %s (%p/%p)", (client->userdata? "" : "un"), crm_client_name(client), c, client); free(client->userdata); crm_client_destroy(client); trigger_fsa(fsa_source); } return 0; } static void crmd_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p", c); crmd_ipc_closed(c); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { static struct qb_ipcs_service_handlers crmd_callbacks = { .connection_accept = crmd_ipc_accept, .connection_created = crmd_ipc_created, .msg_process = crmd_ipc_dispatch, .connection_closed = crmd_ipc_closed, .connection_destroyed = crmd_ipc_destroy }; if (cur_state != S_STARTING) { crm_err("Start cancelled... %s", fsa_state2string(cur_state)); return; } else if (is_set(fsa_input_register, R_MEMBERSHIP) == FALSE) { crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_LRM_CONNECTED) == FALSE) { crm_info("Delaying start, not connected to executor (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_READ_CONFIG) == FALSE) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_PEER_DATA) == FALSE) { crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); crmd_fsa_stall(TRUE); return; } crm_debug("Init server comms"); ipcs = crmd_ipc_server_init(&crmd_callbacks); if (ipcs == NULL) { crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } if (stonith_reconnect == NULL) { int dummy; stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy); } set_bit(fsa_input_register, R_ST_REQUIRED); mainloop_set_trigger(stonith_reconnect); crm_notice("The local CRM is operational"); clear_bit(fsa_input_register, R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { set_bit(fsa_input_register, R_IN_RECOVERY); crm_warn("Fast-tracking shutdown in response to errors"); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* *INDENT-OFF* */ static pe_cluster_option crmd_opts[] = { /* name, old-name, validate, values, default, short description, long description */ { "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact changeset it was built from. Used for diagnostic purposes." }, { "cluster-infrastructure", NULL, "string", NULL, "corosync", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." }, { XML_CONFIG_ATTR_DC_DEADTIME, NULL, "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." }, { XML_CONFIG_ATTR_RECHECK, NULL, "time", "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer, "Polling interval for time based changes to options, resource parameters and constraints.", "The Cluster is primarily event driven, however the configuration can have elements that change based on time." " To ensure these changes take effect, we can optionally poll the cluster's status for changes." }, { "load-threshold", NULL, "percentage", NULL, "80%", &check_utilization, "The maximum amount of system resources that should be used by nodes in the cluster", "The cluster will slow down its recovery process when the amount of system resources used" " (currently CPU) approaches this limit", }, { "node-action-limit", NULL, "integer", NULL, "0", &check_number, "The maximum number of jobs that can be scheduled per node. Defaults to 2x cores"}, { XML_CONFIG_ATTR_ELECTION_FAIL, NULL, "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { XML_CONFIG_ATTR_FORCE_QUIT, NULL, "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "join-integration-timeout", "crmd-integration-timeout", "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***", "If need to adjust this value, it probably indicates the presence of a bug" }, { "join-finalization-timeout", "crmd-finalization-timeout", "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***", "If you need to adjust this value, it probably indicates the presence of a bug" }, { "transition-delay", "crmd-transition-delay", "time", NULL, "0s", &check_timer, "*** Advanced Use Only *** Enabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\n" "Useful if your configuration is sensitive to the order in which ping updates arrive." }, { "stonith-watchdog-timeout", NULL, "time", NULL, NULL, &check_sbd_timeout, "How long to wait before we can assume nodes are safely down", NULL }, { "stonith-max-attempts",NULL,"integer",NULL,"10",&check_positive_number, "How many times stonith can fail before it will no longer be attempted on a target" }, { "no-quorum-policy", NULL, "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL }, }; /* *INDENT-ON* */ void crmd_metadata(void) { config_metadata("pacemaker-controld", "1.0", "controller properties", "Cluster properties used by Pacemaker's controller," " formerly known as crmd", crmd_opts, DIMOF(crmd_opts)); } static void verify_crmd_options(GHashTable * options) { verify_all_options(options, crmd_opts, DIMOF(crmd_opts)); } static const char * crmd_pref(GHashTable * options, const char *name) { return get_cluster_pref(options, crmd_opts, DIMOF(crmd_opts), name); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = crm_time_new(NULL); xmlNode *crmconfig = NULL; xmlNode *alerts = NULL; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == -EACCES || rc == -pcmk_err_schema_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); set_bit(fsa_input_register, R_STAYDOWN); } goto bail; } crmconfig = output; if ((crmconfig) && (crm_element_name(crmconfig)) && (strcmp(crm_element_name(crmconfig), XML_CIB_TAG_CRMCONFIG) != 0)) { crmconfig = first_named_child(crmconfig, XML_CIB_TAG_CRMCONFIG); } if (!crmconfig) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query for " XML_CIB_TAG_CRMCONFIG " section failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = crm_str_table_new(); unpack_instance_attributes(crmconfig, crmconfig, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, now); verify_crmd_options(config_hash); value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); election_trigger->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "node-action-limit"); /* Also checks migration-limit */ throttle_update_job_max(value); value = crmd_pref(config_hash, "load-threshold"); if(value) { throttle_set_load_target(strtof(value, NULL) / 100.0); } value = crmd_pref(config_hash, "no-quorum-policy"); if (safe_str_eq(value, "suicide") && pcmk_locate_sbd()) { no_quorum_suicide_escalation = TRUE; } value = crmd_pref(config_hash,"stonith-max-attempts"); update_stonith_max_attempts(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_FORCE_QUIT); shutdown_escalation_timer->period_ms = crm_get_msec(value); /* How long to declare an election over - even if not everyone voted */ crm_debug("Shutdown escalation occurs after: %dms", shutdown_escalation_timer->period_ms); value = crmd_pref(config_hash, XML_CONFIG_ATTR_ELECTION_FAIL); - election_timeout_set_period(fsa_election, crm_get_msec(value)); + controld_set_election_period(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_RECHECK); recheck_timer->period_ms = crm_get_msec(value); crm_debug("Checking for expired actions every %dms", recheck_timer->period_ms); value = crmd_pref(config_hash, "transition-delay"); transition_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "join-integration-timeout"); integration_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "join-finalization-timeout"); finalization_timer->period_ms = crm_get_msec(value); free(fsa_cluster_name); fsa_cluster_name = NULL; value = g_hash_table_lookup(config_hash, "cluster-name"); if (value) { fsa_cluster_name = strdup(value); } alerts = first_named_child(output, XML_CIB_TAG_ALERTS); crmd_unpack_alerts(alerts); set_bit(fsa_input_register, R_READ_CONFIG); crm_trace("Triggering FSA: %s", __FUNCTION__); mainloop_set_trigger(fsa_source); g_hash_table_destroy(config_hash); bail: crm_time_free(now); } gboolean crm_read_options(gpointer user_data) { int call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, "//" XML_CIB_TAG_CRMCONFIG " | //" XML_CIB_TAG_ALERTS, NULL, cib_xpath | cib_scope_local); fsa_register_cib_callback(call_id, FALSE, NULL, config_query_callback); crm_trace("Querying the CIB... call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { throttle_init(); mainloop_set_trigger(config_read); } void crm_shutdown(int nsig) { if (crmd_mainloop != NULL && g_main_is_running(crmd_mainloop)) { if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating the shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); } else { set_bit(fsa_input_register, R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); if (shutdown_escalation_timer->period_ms < 1) { const char *value = crmd_pref(NULL, XML_CONFIG_ATTR_FORCE_QUIT); int msec = crm_get_msec(value); crm_debug("Using default shutdown escalation: %dms", msec); shutdown_escalation_timer->period_ms = msec; } /* can't rely on this... */ crm_notice("Shutting down cluster resource manager " CRM_XS " limit=%dms", shutdown_escalation_timer->period_ms); crm_timer_start(shutdown_escalation_timer); } } else { crm_info("exit from shutdown"); crmd_exit(CRM_EX_OK); } } diff --git a/daemons/controld/controld_election.c b/daemons/controld/controld_election.c index 3a28e748a0..9fbf1e1ebc 100644 --- a/daemons/controld/controld_election.c +++ b/daemons/controld/controld_election.c @@ -1,232 +1,274 @@ /* * Copyright 2004-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include +#include #include #include #include #include +static election_t *fsa_election = NULL; + +static gboolean +election_win_cb(gpointer data) +{ + register_fsa_input(C_FSA_INTERNAL, I_ELECTION_DC, NULL); + return FALSE; +} + +void +controld_election_init(const char *uname) +{ + fsa_election = election_init("DC", uname, 60000 /*60s*/, election_win_cb); +} + +void +controld_remove_voter(const char *uname) +{ + election_remove(fsa_election, uname); +} + +void +controld_stop_election_timeout() +{ + election_timeout_stop(fsa_election); +} + +void +controld_election_fini() +{ + election_fini(fsa_election); + fsa_election = NULL; +} + +void +controld_set_election_period(const char *value) +{ + election_timeout_set_period(fsa_election, crm_get_msec(value)); +} + +void +controld_stop_election_timer() +{ + election_timeout_stop(fsa_election); +} + /* A_ELECTION_VOTE */ void do_election_vote(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean not_voting = FALSE; /* don't vote if we're in one of these states or wanting to shut down */ switch (cur_state) { case S_STARTING: case S_RECOVERY: case S_STOPPING: case S_TERMINATE: crm_warn("Not voting in election, we're in state %s", fsa_state2string(cur_state)); not_voting = TRUE; break; case S_ELECTION: case S_INTEGRATION: case S_RELEASE_DC: break; default: crm_err("Broken? Voting in state %s", fsa_state2string(cur_state)); break; } if (not_voting == FALSE) { if (is_set(fsa_input_register, R_STARTING)) { not_voting = TRUE; } } if (not_voting) { if (AM_I_DC) { register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL); } else { register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL); } return; } election_vote(fsa_election); return; } void do_election_check(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { - if (fsa_state != S_ELECTION) { + if (fsa_state == S_ELECTION) { + election_check(fsa_election); + } else { crm_debug("Ignoring election check because we are not in an election"); - - } else if(election_check(fsa_election)) { - register_fsa_input(C_FSA_INTERNAL, I_ELECTION_DC, NULL); } - - return; } /* A_ELECTION_COUNT */ void do_election_count_vote(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { enum election_result rc = 0; ha_msg_input_t *vote = fsa_typed_data(fsa_dt_ha_msg); if(crm_peer_cache == NULL) { if(is_not_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Internal error, no peer cache"); } return; } rc = election_count_vote(fsa_election, vote->msg, cur_state != S_STARTING); switch(rc) { case election_start: election_reset(fsa_election); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); break; case election_lost: update_dc(NULL); if (fsa_input_register & R_THE_DC) { register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL); fsa_cib_conn->cmds->set_slave(fsa_cib_conn, cib_scope_local); } else if (cur_state != S_STARTING) { register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL); } break; - case election_in_progress: - break; default: - crm_err("Unhandled election result: %d", rc); + crm_trace("Election message resulted in state %d", rc); } } static void feature_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_notice("Feature update failed: %s "CRM_XS" rc=%d", pcmk_strerror(rc), rc); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } /* A_DC_TAKEOVER */ void do_dc_takeover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int rc = pcmk_ok; xmlNode *cib = NULL; const char *cluster_type = name_for_cluster_type(get_cluster_type()); pid_t watchdog = pcmk_locate_sbd(); crm_info("Taking over DC status for this partition"); set_bit(fsa_input_register, R_THE_DC); execute_stonith_cleanup(); election_reset(fsa_election); set_bit(fsa_input_register, R_JOIN_OK); set_bit(fsa_input_register, R_INVOKE_PE); fsa_cib_conn->cmds->set_master(fsa_cib_conn, cib_scope_local); cib = create_xml_node(NULL, XML_TAG_CIB); crm_xml_add(cib, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET); fsa_cib_update(XML_TAG_CIB, cib, cib_quorum_override, rc, NULL); fsa_register_cib_callback(rc, FALSE, NULL, feature_update_callback); update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, XML_ATTR_HAVE_WATCHDOG, watchdog?"true":"false", FALSE, NULL, NULL); update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, "dc-version", PACEMAKER_VERSION "-" BUILD_VERSION, FALSE, NULL, NULL); update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, "cluster-infrastructure", cluster_type, FALSE, NULL, NULL); #if SUPPORT_COROSYNC if (fsa_cluster_name == NULL && is_corosync_cluster()) { char *cluster_name = corosync_cluster_name(); if (cluster_name) { update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, "cluster-name", cluster_name, FALSE, NULL, NULL); } free(cluster_name); } #endif mainloop_set_trigger(config_read); free_xml(cib); } /* A_DC_RELEASE */ void do_dc_release(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (action & A_DC_RELEASE) { crm_debug("Releasing the role of DC"); clear_bit(fsa_input_register, R_THE_DC); } else if (action & A_DC_RELEASED) { crm_info("DC role released"); #if 0 if (are there errors) { /* we can't stay up if not healthy */ /* or perhaps I_ERROR and go to S_RECOVER? */ result = I_SHUTDOWN; } #endif if (is_set(fsa_input_register, R_SHUTDOWN)) { xmlNode *update = NULL; crm_node_t *node = crm_get_peer(0, fsa_our_uname); crm_update_peer_expected(__FUNCTION__, node, CRMD_JOINSTATE_DOWN); update = create_node_state_update(node, node_update_expected, NULL, __FUNCTION__); fsa_cib_anon_update(XML_CIB_TAG_STATUS, update); free_xml(update); } register_fsa_input(C_FSA_INTERNAL, I_RELEASE_SUCCESS, NULL); } else { crm_err("Unknown DC action %s", fsa_action2string(action)); } crm_trace("Am I still the DC? %s", AM_I_DC ? XML_BOOLEAN_YES : XML_BOOLEAN_NO); } diff --git a/daemons/controld/controld_fsa.c b/daemons/controld/controld_fsa.c index 7302679eed..4017996a00 100644 --- a/daemons/controld/controld_fsa.c +++ b/daemons/controld/controld_fsa.c @@ -1,662 +1,658 @@ /* * Copyright 2004-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include char *fsa_our_dc = NULL; cib_t *fsa_cib_conn = NULL; char *fsa_our_dc_version = NULL; char *fsa_our_uuid = NULL; char *fsa_our_uname = NULL; char *fsa_cluster_name = NULL; -election_t *fsa_election = NULL; - fsa_timer_t *wait_timer = NULL; // How long to wait before retrying a cib or executor connection fsa_timer_t *recheck_timer = NULL; // Periodically re-run scheduler to handle time-based actions fsa_timer_t *election_trigger = NULL; /* How long to wait at startup, or after an election, for the DC to make contact */ fsa_timer_t *transition_timer = NULL; /* How long to delay the start of a new transition with the expectation something else might happen too */ fsa_timer_t *integration_timer = NULL; fsa_timer_t *finalization_timer = NULL; fsa_timer_t *shutdown_escalation_timer = NULL; /* How long to wait for the DC to stop all resources and give us the all-clear to shut down */ volatile gboolean do_fsa_stall = FALSE; volatile long long fsa_input_register = 0; volatile long long fsa_actions = A_NOTHING; volatile enum crmd_fsa_state fsa_state = S_STARTING; extern uint highest_born_on; extern uint num_join_invites; extern void initialize_join(gboolean before); #define DOT_PREFIX "actions:trace: " #define do_dot_log(fmt, args...) crm_trace( fmt, ##args) long long do_state_transition(long long actions, enum crmd_fsa_state cur_state, enum crmd_fsa_state next_state, fsa_data_t * msg_data); void s_crmd_fsa_actions(fsa_data_t * fsa_data); void log_fsa_input(fsa_data_t * stored_msg); void init_dotfile(void); void init_dotfile(void) { do_dot_log(DOT_PREFIX "digraph \"g\" {"); do_dot_log(DOT_PREFIX " size = \"30,30\""); do_dot_log(DOT_PREFIX " graph ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " bb = \"0,0,398.922306,478.927856\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " node ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " shape = \"ellipse\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " edge ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX "// special nodes"); do_dot_log(DOT_PREFIX " \"S_PENDING\" "); do_dot_log(DOT_PREFIX " ["); do_dot_log(DOT_PREFIX " color = \"blue\""); do_dot_log(DOT_PREFIX " fontcolor = \"blue\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " \"S_TERMINATE\" "); do_dot_log(DOT_PREFIX " ["); do_dot_log(DOT_PREFIX " color = \"red\""); do_dot_log(DOT_PREFIX " fontcolor = \"red\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX "// DC only nodes"); do_dot_log(DOT_PREFIX " \"S_INTEGRATION\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_POLICY_ENGINE\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_TRANSITION_ENGINE\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_RELEASE_DC\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_IDLE\" [ fontcolor = \"green\" ]"); } static void do_fsa_action(fsa_data_t * fsa_data, long long an_action, void (*function) (long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data)) { fsa_actions &= ~an_action; crm_trace(DOT_PREFIX "\t// %s", fsa_action2string(an_action)); function(an_action, fsa_data->fsa_cause, fsa_state, fsa_data->fsa_input, fsa_data); } static long long startup_actions = A_STARTUP | A_CIB_START | A_LRM_CONNECT | A_HA_CONNECT | A_READCONFIG | A_STARTED | A_CL_JOIN_QUERY; // A_LOG, A_WARN, A_ERROR void do_log(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { unsigned log_type = LOG_TRACE; if (action & A_LOG) { log_type = LOG_INFO; } else if (action & A_WARN) { log_type = LOG_WARNING; } else if (action & A_ERROR) { log_type = LOG_ERR; } do_crm_log(log_type, "Input %s received in state %s from %s", fsa_input2string(msg_data->fsa_input), fsa_state2string(cur_state), msg_data->origin); if (msg_data->data_type == fsa_dt_ha_msg) { ha_msg_input_t *input = fsa_typed_data(msg_data->data_type); crm_log_xml_debug(input->msg, __FUNCTION__); } else if (msg_data->data_type == fsa_dt_xml) { xmlNode *input = fsa_typed_data(msg_data->data_type); crm_log_xml_debug(input, __FUNCTION__); } else if (msg_data->data_type == fsa_dt_lrm) { lrmd_event_data_t *input = fsa_typed_data(msg_data->data_type); do_crm_log(log_type, "Resource %s: Call ID %d returned %d (%d)." " New status if rc=0: %s", input->rsc_id, input->call_id, input->rc, input->op_status, (char *)input->user_data); } } enum crmd_fsa_state s_crmd_fsa(enum crmd_fsa_cause cause) { fsa_data_t *fsa_data = NULL; long long register_copy = fsa_input_register; long long new_actions = A_NOTHING; enum crmd_fsa_state last_state; crm_trace("FSA invoked with Cause: %s\tState: %s", fsa_cause2string(cause), fsa_state2string(fsa_state)); fsa_dump_actions(fsa_actions, "Initial"); do_fsa_stall = FALSE; if (is_message() == FALSE && fsa_actions != A_NOTHING) { /* fake the first message so we can get into the loop */ fsa_data = calloc(1, sizeof(fsa_data_t)); fsa_data->fsa_input = I_NULL; fsa_data->fsa_cause = C_FSA_INTERNAL; fsa_data->origin = __FUNCTION__; fsa_data->data_type = fsa_dt_none; fsa_message_queue = g_list_append(fsa_message_queue, fsa_data); fsa_data = NULL; } while (is_message() && do_fsa_stall == FALSE) { crm_trace("Checking messages (%d remaining)", g_list_length(fsa_message_queue)); fsa_data = get_message(); if(fsa_data == NULL) { continue; } log_fsa_input(fsa_data); /* add any actions back to the queue */ fsa_actions |= fsa_data->actions; fsa_dump_actions(fsa_data->actions, "Restored actions"); /* get the next batch of actions */ new_actions = crmd_fsa_actions[fsa_data->fsa_input][fsa_state]; fsa_actions |= new_actions; fsa_dump_actions(new_actions, "New actions"); if (fsa_data->fsa_input != I_NULL && fsa_data->fsa_input != I_ROUTER) { crm_debug("Processing %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); } /* logging : *before* the state is changed */ if (is_set(fsa_actions, A_ERROR)) { do_fsa_action(fsa_data, A_ERROR, do_log); } if (is_set(fsa_actions, A_WARN)) { do_fsa_action(fsa_data, A_WARN, do_log); } if (is_set(fsa_actions, A_LOG)) { do_fsa_action(fsa_data, A_LOG, do_log); } /* update state variables */ last_state = fsa_state; fsa_state = crmd_fsa_state[fsa_data->fsa_input][fsa_state]; /* * Remove certain actions during shutdown */ if (fsa_state == S_STOPPING || ((fsa_input_register & R_SHUTDOWN) == R_SHUTDOWN)) { clear_bit(fsa_actions, startup_actions); } /* * Hook for change of state. * Allows actions to be added or removed when entering a state */ if (last_state != fsa_state) { fsa_actions = do_state_transition(fsa_actions, last_state, fsa_state, fsa_data); } else { do_dot_log(DOT_PREFIX "\t// FSA input: State=%s \tCause=%s" " \tInput=%s \tOrigin=%s() \tid=%d", fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_input2string(fsa_data->fsa_input), fsa_data->origin, fsa_data->id); } /* start doing things... */ s_crmd_fsa_actions(fsa_data); delete_fsa_input(fsa_data); fsa_data = NULL; } if (g_list_length(fsa_message_queue) > 0 || fsa_actions != A_NOTHING || do_fsa_stall) { crm_debug("Exiting the FSA: queue=%d, fsa_actions=0x%llx, stalled=%s", g_list_length(fsa_message_queue), fsa_actions, do_fsa_stall ? "true" : "false"); } else { crm_trace("Exiting the FSA"); } /* cleanup inputs? */ if (register_copy != fsa_input_register) { long long same = register_copy & fsa_input_register; fsa_dump_inputs(LOG_DEBUG, "Added", fsa_input_register ^ same); fsa_dump_inputs(LOG_DEBUG, "Removed", register_copy ^ same); } fsa_dump_actions(fsa_actions, "Remaining"); fsa_dump_queue(LOG_DEBUG); return fsa_state; } void s_crmd_fsa_actions(fsa_data_t * fsa_data) { /* * Process actions in order of priority but do only one * action at a time to avoid complicating the ordering. */ CRM_CHECK(fsa_data != NULL, return); while (fsa_actions != A_NOTHING && do_fsa_stall == FALSE) { /* regular action processing in order of action priority * * Make sure all actions that connect to required systems * are performed first */ if (fsa_actions & A_ERROR) { do_fsa_action(fsa_data, A_ERROR, do_log); } else if (fsa_actions & A_WARN) { do_fsa_action(fsa_data, A_WARN, do_log); } else if (fsa_actions & A_LOG) { do_fsa_action(fsa_data, A_LOG, do_log); /* get out of here NOW! before anything worse happens */ } else if (fsa_actions & A_EXIT_1) { do_fsa_action(fsa_data, A_EXIT_1, do_exit); /* sub-system restart */ } else if ((fsa_actions & O_LRM_RECONNECT) == O_LRM_RECONNECT) { do_fsa_action(fsa_data, O_LRM_RECONNECT, do_lrm_control); } else if ((fsa_actions & O_CIB_RESTART) == O_CIB_RESTART) { do_fsa_action(fsa_data, O_CIB_RESTART, do_cib_control); } else if ((fsa_actions & O_PE_RESTART) == O_PE_RESTART) { do_fsa_action(fsa_data, O_PE_RESTART, do_pe_control); } else if ((fsa_actions & O_TE_RESTART) == O_TE_RESTART) { do_fsa_action(fsa_data, O_TE_RESTART, do_te_control); /* essential start tasks */ } else if (fsa_actions & A_STARTUP) { do_fsa_action(fsa_data, A_STARTUP, do_startup); } else if (fsa_actions & A_CIB_START) { do_fsa_action(fsa_data, A_CIB_START, do_cib_control); } else if (fsa_actions & A_HA_CONNECT) { do_fsa_action(fsa_data, A_HA_CONNECT, do_ha_control); } else if (fsa_actions & A_READCONFIG) { do_fsa_action(fsa_data, A_READCONFIG, do_read_config); /* sub-system start/connect */ } else if (fsa_actions & A_LRM_CONNECT) { do_fsa_action(fsa_data, A_LRM_CONNECT, do_lrm_control); } else if (fsa_actions & A_TE_START) { do_fsa_action(fsa_data, A_TE_START, do_te_control); } else if (fsa_actions & A_PE_START) { do_fsa_action(fsa_data, A_PE_START, do_pe_control); /* Timers */ /* else if(fsa_actions & O_DC_TIMER_RESTART) { do_fsa_action(fsa_data, O_DC_TIMER_RESTART, do_timer_control) */ ; } else if (fsa_actions & A_DC_TIMER_STOP) { do_fsa_action(fsa_data, A_DC_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_INTEGRATE_TIMER_STOP) { do_fsa_action(fsa_data, A_INTEGRATE_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_INTEGRATE_TIMER_START) { do_fsa_action(fsa_data, A_INTEGRATE_TIMER_START, do_timer_control); } else if (fsa_actions & A_FINALIZE_TIMER_STOP) { do_fsa_action(fsa_data, A_FINALIZE_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_FINALIZE_TIMER_START) { do_fsa_action(fsa_data, A_FINALIZE_TIMER_START, do_timer_control); /* * Highest priority actions */ } else if (fsa_actions & A_MSG_ROUTE) { do_fsa_action(fsa_data, A_MSG_ROUTE, do_msg_route); } else if (fsa_actions & A_RECOVER) { do_fsa_action(fsa_data, A_RECOVER, do_recover); } else if (fsa_actions & A_CL_JOIN_RESULT) { do_fsa_action(fsa_data, A_CL_JOIN_RESULT, do_cl_join_finalize_respond); } else if (fsa_actions & A_CL_JOIN_REQUEST) { do_fsa_action(fsa_data, A_CL_JOIN_REQUEST, do_cl_join_offer_respond); } else if (fsa_actions & A_SHUTDOWN_REQ) { do_fsa_action(fsa_data, A_SHUTDOWN_REQ, do_shutdown_req); } else if (fsa_actions & A_ELECTION_VOTE) { do_fsa_action(fsa_data, A_ELECTION_VOTE, do_election_vote); } else if (fsa_actions & A_ELECTION_COUNT) { do_fsa_action(fsa_data, A_ELECTION_COUNT, do_election_count_vote); } else if (fsa_actions & A_LRM_EVENT) { do_fsa_action(fsa_data, A_LRM_EVENT, do_lrm_event); /* * High priority actions */ } else if (fsa_actions & A_STARTED) { do_fsa_action(fsa_data, A_STARTED, do_started); } else if (fsa_actions & A_CL_JOIN_QUERY) { do_fsa_action(fsa_data, A_CL_JOIN_QUERY, do_cl_join_query); } else if (fsa_actions & A_DC_TIMER_START) { do_fsa_action(fsa_data, A_DC_TIMER_START, do_timer_control); /* * Medium priority actions * - Membership */ } else if (fsa_actions & A_DC_TAKEOVER) { do_fsa_action(fsa_data, A_DC_TAKEOVER, do_dc_takeover); } else if (fsa_actions & A_DC_RELEASE) { do_fsa_action(fsa_data, A_DC_RELEASE, do_dc_release); } else if (fsa_actions & A_DC_JOIN_FINAL) { do_fsa_action(fsa_data, A_DC_JOIN_FINAL, do_dc_join_final); } else if (fsa_actions & A_ELECTION_CHECK) { do_fsa_action(fsa_data, A_ELECTION_CHECK, do_election_check); } else if (fsa_actions & A_ELECTION_START) { do_fsa_action(fsa_data, A_ELECTION_START, do_election_vote); } else if (fsa_actions & A_DC_JOIN_OFFER_ALL) { do_fsa_action(fsa_data, A_DC_JOIN_OFFER_ALL, do_dc_join_offer_all); } else if (fsa_actions & A_DC_JOIN_OFFER_ONE) { do_fsa_action(fsa_data, A_DC_JOIN_OFFER_ONE, do_dc_join_offer_one); } else if (fsa_actions & A_DC_JOIN_PROCESS_REQ) { do_fsa_action(fsa_data, A_DC_JOIN_PROCESS_REQ, do_dc_join_filter_offer); } else if (fsa_actions & A_DC_JOIN_PROCESS_ACK) { do_fsa_action(fsa_data, A_DC_JOIN_PROCESS_ACK, do_dc_join_ack); } else if (fsa_actions & A_DC_JOIN_FINALIZE) { do_fsa_action(fsa_data, A_DC_JOIN_FINALIZE, do_dc_join_finalize); } else if (fsa_actions & A_CL_JOIN_ANNOUNCE) { do_fsa_action(fsa_data, A_CL_JOIN_ANNOUNCE, do_cl_join_announce); /* * Low(er) priority actions * Make sure the CIB is always updated before invoking the * PE, and the PE before the TE */ } else if (fsa_actions & A_TE_HALT) { do_fsa_action(fsa_data, A_TE_HALT, do_te_invoke); } else if (fsa_actions & A_TE_CANCEL) { do_fsa_action(fsa_data, A_TE_CANCEL, do_te_invoke); } else if (fsa_actions & A_LRM_INVOKE) { do_fsa_action(fsa_data, A_LRM_INVOKE, do_lrm_invoke); } else if (fsa_actions & A_PE_INVOKE) { do_fsa_action(fsa_data, A_PE_INVOKE, do_pe_invoke); } else if (fsa_actions & A_TE_INVOKE) { do_fsa_action(fsa_data, A_TE_INVOKE, do_te_invoke); /* Shutdown actions */ } else if (fsa_actions & A_DC_RELEASED) { do_fsa_action(fsa_data, A_DC_RELEASED, do_dc_release); } else if (fsa_actions & A_PE_STOP) { do_fsa_action(fsa_data, A_PE_STOP, do_pe_control); } else if (fsa_actions & A_TE_STOP) { do_fsa_action(fsa_data, A_TE_STOP, do_te_control); } else if (fsa_actions & A_SHUTDOWN) { do_fsa_action(fsa_data, A_SHUTDOWN, do_shutdown); } else if (fsa_actions & A_LRM_DISCONNECT) { do_fsa_action(fsa_data, A_LRM_DISCONNECT, do_lrm_control); } else if (fsa_actions & A_HA_DISCONNECT) { do_fsa_action(fsa_data, A_HA_DISCONNECT, do_ha_control); } else if (fsa_actions & A_CIB_STOP) { do_fsa_action(fsa_data, A_CIB_STOP, do_cib_control); } else if (fsa_actions & A_STOP) { do_fsa_action(fsa_data, A_STOP, do_stop); /* exit gracefully */ } else if (fsa_actions & A_EXIT_0) { do_fsa_action(fsa_data, A_EXIT_0, do_exit); /* Error checking and reporting */ } else { crm_err("Action %s not supported "CRM_XS" 0x%llx", fsa_action2string(fsa_actions), fsa_actions); register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, fsa_data, NULL, __FUNCTION__); } } } void log_fsa_input(fsa_data_t * stored_msg) { CRM_ASSERT(stored_msg); crm_trace("Processing queued input %d", stored_msg->id); if (stored_msg->fsa_cause == C_LRM_OP_CALLBACK) { crm_trace("FSA processing LRM callback from %s", stored_msg->origin); } else if (stored_msg->data == NULL) { crm_trace("FSA processing input from %s", stored_msg->origin); } else { ha_msg_input_t *ha_input = fsa_typed_data_adv(stored_msg, fsa_dt_ha_msg, __FUNCTION__); crm_trace("FSA processing XML message from %s", stored_msg->origin); crm_log_xml_trace(ha_input->xml, "FSA message data"); } } long long do_state_transition(long long actions, enum crmd_fsa_state cur_state, enum crmd_fsa_state next_state, fsa_data_t * msg_data) { int level = LOG_INFO; long long tmp = actions; gboolean clear_recovery_bit = TRUE; enum crmd_fsa_cause cause = msg_data->fsa_cause; enum crmd_fsa_input current_input = msg_data->fsa_input; const char *state_from = fsa_state2string(cur_state); const char *state_to = fsa_state2string(next_state); const char *input = fsa_input2string(current_input); CRM_LOG_ASSERT(cur_state != next_state); do_dot_log(DOT_PREFIX "\t%s -> %s [ label=%s cause=%s origin=%s ]", state_from, state_to, input, fsa_cause2string(cause), msg_data->origin); if (cur_state == S_IDLE || next_state == S_IDLE) { level = LOG_NOTICE; } else if (cur_state == S_NOT_DC || next_state == S_NOT_DC) { level = LOG_NOTICE; } else if (cur_state == S_ELECTION) { level = LOG_NOTICE; } else if (cur_state == S_STARTING) { level = LOG_NOTICE; } else if (next_state == S_RECOVERY) { level = LOG_WARNING; } do_crm_log(level, "State transition %s -> %s " CRM_XS " input=%s cause=%s origin=%s", state_from, state_to, input, fsa_cause2string(cause), msg_data->origin); - /* the last two clauses might cause trouble later */ if (next_state != S_ELECTION && cur_state != S_RELEASE_DC) { - election_timeout_stop(fsa_election); -/* } else { */ -/* crm_timer_start(election_timeout); */ + controld_stop_election_timer(); } #if 0 if ((fsa_input_register & R_SHUTDOWN)) { set_bit(tmp, A_DC_TIMER_STOP); } #endif if (next_state == S_INTEGRATION) { set_bit(tmp, A_INTEGRATE_TIMER_START); } else { set_bit(tmp, A_INTEGRATE_TIMER_STOP); } if (next_state == S_FINALIZE_JOIN) { set_bit(tmp, A_FINALIZE_TIMER_START); } else { set_bit(tmp, A_FINALIZE_TIMER_STOP); } if (next_state != S_PENDING) { set_bit(tmp, A_DC_TIMER_STOP); } if (next_state != S_ELECTION) { highest_born_on = 0; } if (next_state != S_IDLE) { crm_timer_stop(recheck_timer); } if (cur_state == S_FINALIZE_JOIN && next_state == S_POLICY_ENGINE) { populate_cib_nodes(node_update_quick|node_update_all, __FUNCTION__); } switch (next_state) { case S_PENDING: fsa_cib_conn->cmds->set_slave(fsa_cib_conn, cib_scope_local); /* fall through */ case S_ELECTION: crm_trace("Resetting our DC to NULL on transition to %s", fsa_state2string(next_state)); update_dc(NULL); break; case S_NOT_DC: election_trigger->counter = 0; purge_stonith_cleanup(); if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("(Re)Issuing shutdown request now" " that we have a new DC"); set_bit(tmp, A_SHUTDOWN_REQ); } CRM_LOG_ASSERT(fsa_our_dc != NULL); if (fsa_our_dc == NULL) { crm_err("Reached S_NOT_DC without a DC" " being recorded"); } break; case S_RECOVERY: clear_recovery_bit = FALSE; break; case S_FINALIZE_JOIN: CRM_LOG_ASSERT(AM_I_DC); if (cause == C_TIMER_POPPED) { crm_warn("Progressed to state %s after %s", fsa_state2string(next_state), fsa_cause2string(cause)); } if (crmd_join_phase_count(crm_join_welcomed) > 0) { crm_warn("%u cluster nodes failed to respond" " to the join offer.", crmd_join_phase_count(crm_join_welcomed)); crmd_join_phase_log(LOG_NOTICE); } else { crm_debug("All %d cluster nodes responded to the join offer.", crmd_join_phase_count(crm_join_integrated)); } break; case S_POLICY_ENGINE: election_trigger->counter = 0; CRM_LOG_ASSERT(AM_I_DC); if (cause == C_TIMER_POPPED) { crm_info("Progressed to state %s after %s", fsa_state2string(next_state), fsa_cause2string(cause)); } if (crmd_join_phase_count(crm_join_finalized) > 0) { crm_err("%u cluster nodes failed to confirm their join.", crmd_join_phase_count(crm_join_finalized)); crmd_join_phase_log(LOG_NOTICE); } else if (crmd_join_phase_count(crm_join_confirmed) == crm_active_peers()) { crm_debug("All %u cluster nodes are" " eligible to run resources.", crm_active_peers()); } else if (crmd_join_phase_count(crm_join_confirmed) > crm_active_peers()) { crm_err("We have more confirmed nodes than our membership does: %d vs. %d", crmd_join_phase_count(crm_join_confirmed), crm_active_peers()); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } else if (saved_ccm_membership_id != crm_peer_seq) { crm_info("Membership changed: %llu -> %llu - join restart", saved_ccm_membership_id, crm_peer_seq); register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } else { crm_warn("Only %u of %u cluster " "nodes are eligible to run resources - continue %d", crmd_join_phase_count(crm_join_confirmed), crm_active_peers(), crmd_join_phase_count(crm_join_welcomed)); } /* initialize_join(FALSE); */ break; case S_STOPPING: case S_TERMINATE: /* possibly redundant */ set_bit(fsa_input_register, R_SHUTDOWN); break; case S_IDLE: CRM_LOG_ASSERT(AM_I_DC); if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("(Re)Issuing shutdown request now" " that we are the DC"); set_bit(tmp, A_SHUTDOWN_REQ); } if (recheck_timer->period_ms > 0) { crm_debug("Starting %s", get_timer_desc(recheck_timer)); crm_timer_start(recheck_timer); } break; default: break; } if (clear_recovery_bit && next_state != S_PENDING) { tmp &= ~A_RECOVER; } else if (clear_recovery_bit == FALSE) { tmp |= A_RECOVER; } if (tmp != actions) { /* fsa_dump_actions(actions ^ tmp, "New actions"); */ actions = tmp; } return actions; } diff --git a/daemons/controld/controld_fsa.h b/daemons/controld/controld_fsa.h index 254d78457f..e52a04e106 100644 --- a/daemons/controld/controld_fsa.h +++ b/daemons/controld/controld_fsa.h @@ -1,734 +1,719 @@ /* * Copyright 2004-2018 Andrew Beekhof * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef CRMD_FSA__H # define CRMD_FSA__H # include # include # include # include # include # include # include /*! States the controller can be in */ enum crmd_fsa_state { S_IDLE = 0, /* Nothing happening */ S_ELECTION, /* Take part in the election algorithm as * described below */ S_INTEGRATION, /* integrate that status of new nodes (which is * all of them if we have just been elected DC) * to form a complete and up-to-date picture of * the CIB */ S_FINALIZE_JOIN, /* integrate that status of new nodes (which is * all of them if we have just been elected DC) * to form a complete and up-to-date picture of * the CIB */ S_NOT_DC, /* we are in non-DC mode */ S_POLICY_ENGINE, /* Determine next stable state of the cluster */ S_RECOVERY, /* Something bad happened, check everything is ok * before continuing and attempt to recover if * required */ S_RELEASE_DC, /* we were the DC, but now we arent anymore, * possibly by our own request, and we should * release all unnecessary sub-systems, finish * any pending actions, do general cleanup and * unset anything that makes us think we are * special :) */ S_STARTING, /* we are just starting out */ S_PENDING, /* we are not a full/active member yet */ S_STOPPING, /* We are in the final stages of shutting down */ S_TERMINATE, /* We are going to shutdown, this is the equiv of * "Sending TERM signal to all processes" in Linux * and in worst case scenarios could be considered * a self STONITH */ S_TRANSITION_ENGINE, /* Attempt to make the calculated next stable * state of the cluster a reality */ S_HALT, /* Freeze - don't do anything * Something bad happened that needs the admin to fix * Wait for I_ELECTION */ /* ----------- Last input found in table is above ---------- */ S_ILLEGAL /* This is an illegal FSA state */ /* (must be last) */ }; # define MAXSTATE S_ILLEGAL /* Once we start and do some basic sanity checks, we go into the S_NOT_DC state and await instructions from the DC or input from the cluster layer which indicates the election algorithm needs to run. If the election algorithm is triggered, we enter the S_ELECTION state from where we can either go back to the S_NOT_DC state or progress to the S_INTEGRATION state (or S_RELEASE_DC if we used to be the DC - but aren't anymore). - - The election algorithm has been adapted from - http://www.cs.indiana.edu/cgi-bin/techreports/TRNNN.cgi?trnum=TR521 - - Loosely known as the Bully Algorithm, its major points are: - - Election is initiated by any node (N) that notices that the controller - is no longer responding - - Concurrent multiple elections are possible - - Algorithm: - + N sends ELECTION messages to all nodes that occur earlier in the - cluster layer's membership list - + If no one responds, N wins and becomes controller - + N sends out CONTROLLER messages to all other nodes in the partition - + If one of higher-ups answers, it takes over. N is done. + but aren't anymore). See the libcrmcluster API documentation for more + information about the election algorithm. Once the election is complete, if we are the DC, we enter the S_INTEGRATION state which is a DC-in-waiting style state. We are the DC, but we shouldn't do anything yet because we may not have an up-to-date picture of the cluster. There may of course be times when this fails, so we should go back to the S_RECOVERY stage and check everything is ok. We may also end up here if a new node came online, since each node is authorative on itself and we would want to incorporate its information into the CIB. Once we have the latest CIB, we then enter the S_POLICY_ENGINE state where invoke the scheduler. It is possible that between invoking the scheduler and receiving an answer, that we receive more input. In this case, we would discard the orginal result and invoke it again. Once we are satisfied with the output from the scheduler, we enter S_TRANSITION_ENGINE and feed the scheduler's output to the Transition Engine who attempts to make the scheduler's calculation a reality. If the transition completes successfully, we enter S_IDLE, otherwise we go back to S_POLICY_ENGINE with the current unstable state and try again. Of course, we may be asked to shutdown at any time, however we must progress to S_NOT_DC before doing so. Once we have handed over DC duties to another node, we can then shut down like everyone else, that is, by asking the DC for permission and waiting for it to take all our resources away. The case where we are the DC and the only node in the cluster is a special case and handled as an escalation which takes us to S_SHUTDOWN. Similarly, if any other point in the shutdown fails or stalls, this is escalated and we end up in S_TERMINATE. At any point, the controller can relay messages for its subsystems, but outbound messages (from subsystems) should probably be blocked until S_INTEGRATION (for the DC) or the join protocol has completed (for non-DC controllers). */ /*====================================== * * Inputs/Events/Stimuli to be given to the finite state machine * * Some of these a true events, and others are synthesised based on * the "register" (see below) and the contents or source of messages. * * The machine keeps processing until receiving I_NULL * *======================================*/ enum crmd_fsa_input { /* 0 */ I_NULL, /* Nothing happened */ /* 1 */ I_CIB_OP, /* An update to the CIB occurred */ I_CIB_UPDATE, /* An update to the CIB occurred */ I_DC_TIMEOUT, /* We have lost communication with the DC */ I_ELECTION, /* Someone started an election */ I_PE_CALC, /* The scheduler needs to be invoked */ I_RELEASE_DC, /* The election completed and we were not * elected, but we were the DC beforehand */ I_ELECTION_DC, /* The election completed and we were (re-)elected * DC */ I_ERROR, /* Something bad happened (more serious than * I_FAIL) and may not have been due to the action * being performed. For example, we may have lost * our connection to the CIB. */ /* 9 */ I_FAIL, /* The action failed to complete successfully */ I_INTEGRATED, I_FINALIZED, I_NODE_JOIN, /* A node has entered the cluster */ I_NOT_DC, /* We are not and were not the DC before or after * the current operation or state */ I_RECOVERED, /* The recovery process completed successfully */ I_RELEASE_FAIL, /* We could not give up DC status for some reason */ I_RELEASE_SUCCESS, /* We are no longer the DC */ I_RESTART, /* The current set of actions needs to be * restarted */ I_TE_SUCCESS, /* Some non-resource, non-cluster-layer action * is required of us, e.g. ping */ /* 20 */ I_ROUTER, /* Do our job as router and forward this to the * right place */ I_SHUTDOWN, /* We are asking to shutdown */ I_STOP, /* We have been told to shutdown */ I_TERMINATE, /* Actually exit */ I_STARTUP, I_PE_SUCCESS, /* The action completed successfully */ I_JOIN_OFFER, /* The DC is offering membership */ I_JOIN_REQUEST, /* The client is requesting membership */ I_JOIN_RESULT, /* If not the DC: The result of a join request * Else: A client is responding with its local state info */ I_WAIT_FOR_EVENT, /* we may be waiting for an async task to "happen" * and until it does, we can't do anything else */ I_DC_HEARTBEAT, /* The DC is telling us that it is alive and well */ I_LRM_EVENT, /* 30 */ I_PENDING, I_HALT, /* ------------ Last input found in table is above ----------- */ I_ILLEGAL /* This is an illegal value for an FSA input */ /* (must be last) */ }; # define MAXINPUT I_ILLEGAL # define I_MESSAGE I_ROUTER /*====================================== * * actions * * Some of the actions below will always occur together for now, but this may * not always be the case, so they are split up so that they can easily be * called independently in the future, if necessary. * * For example, separating A_LRM_CONNECT from A_STARTUP might be useful * if we ever try to recover from a faulty or disconnected executor. * *======================================*/ /* Don't do anything */ # define A_NOTHING 0x0000000000000000ULL /* -- Startup actions -- */ /* Hook to perform any actions (other than connecting to other daemons) * that might be needed as part of the startup. */ # define A_STARTUP 0x0000000000000001ULL /* Hook to perform any actions that might be needed as part * after startup is successful. */ # define A_STARTED 0x0000000000000002ULL /* Connect to cluster layer */ # define A_HA_CONNECT 0x0000000000000004ULL # define A_HA_DISCONNECT 0x0000000000000008ULL # define A_INTEGRATE_TIMER_START 0x0000000000000010ULL # define A_INTEGRATE_TIMER_STOP 0x0000000000000020ULL # define A_FINALIZE_TIMER_START 0x0000000000000040ULL # define A_FINALIZE_TIMER_STOP 0x0000000000000080ULL /* -- Election actions -- */ # define A_DC_TIMER_START 0x0000000000000100ULL # define A_DC_TIMER_STOP 0x0000000000000200ULL # define A_ELECTION_COUNT 0x0000000000000400ULL # define A_ELECTION_VOTE 0x0000000000000800ULL # define A_ELECTION_START 0x0000000000001000ULL /* -- Message processing -- */ /* Process the queue of requests */ # define A_MSG_PROCESS 0x0000000000002000ULL /* Send the message to the correct recipient */ # define A_MSG_ROUTE 0x0000000000004000ULL /* Send a welcome message to new node(s) */ # define A_DC_JOIN_OFFER_ONE 0x0000000000008000ULL /* -- Server Join protocol actions -- */ /* Send a welcome message to all nodes */ # define A_DC_JOIN_OFFER_ALL 0x0000000000010000ULL /* Process the remote node's ack of our join message */ # define A_DC_JOIN_PROCESS_REQ 0x0000000000020000ULL /* Send out the results of the Join phase */ # define A_DC_JOIN_FINALIZE 0x0000000000040000ULL /* Send out the results of the Join phase */ # define A_DC_JOIN_PROCESS_ACK 0x0000000000080000ULL /* -- Client Join protocol actions -- */ # define A_CL_JOIN_QUERY 0x0000000000100000ULL # define A_CL_JOIN_ANNOUNCE 0x0000000000200000ULL /* Request membership to the DC list */ # define A_CL_JOIN_REQUEST 0x0000000000400000ULL /* Did the DC accept or reject the request */ # define A_CL_JOIN_RESULT 0x0000000000800000ULL /* -- Recovery, DC start/stop -- */ /* Something bad happened, try to recover */ # define A_RECOVER 0x0000000001000000ULL /* Hook to perform any actions (apart from starting, the TE, scheduler, * and gathering the latest CIB) that might be necessary before * giving up the responsibilities of being the DC. */ # define A_DC_RELEASE 0x0000000002000000ULL /* */ # define A_DC_RELEASED 0x0000000004000000ULL /* Hook to perform any actions (apart from starting, the TE, scheduler, * and gathering the latest CIB) that might be necessary before * taking over the responsibilities of being the DC. */ # define A_DC_TAKEOVER 0x0000000008000000ULL /* -- Shutdown actions -- */ # define A_SHUTDOWN 0x0000000010000000ULL # define A_STOP 0x0000000020000000ULL # define A_EXIT_0 0x0000000040000000ULL # define A_EXIT_1 0x0000000080000000ULL # define A_SHUTDOWN_REQ 0x0000000100000000ULL # define A_ELECTION_CHECK 0x0000000200000000ULL # define A_DC_JOIN_FINAL 0x0000000400000000ULL /* -- CIB actions -- */ # define A_CIB_START 0x0000020000000000ULL # define A_CIB_STOP 0x0000040000000000ULL /* -- Transition Engine actions -- */ /* Attempt to reach the newly calculated cluster state. This is * only called once per transition (except if it is asked to * stop the transition or start a new one). * Once given a cluster state to reach, the TE will determine * tasks that can be performed in parallel, execute them, wait * for replies and then determine the next set until the new * state is reached or no further tasks can be taken. */ # define A_TE_INVOKE 0x0000100000000000ULL # define A_TE_START 0x0000200000000000ULL # define A_TE_STOP 0x0000400000000000ULL # define A_TE_CANCEL 0x0000800000000000ULL # define A_TE_HALT 0x0001000000000000ULL /* -- Scheduler actions -- */ /* Calculate the next state for the cluster. This is only * invoked once per needed calculation. */ # define A_PE_INVOKE 0x0002000000000000ULL # define A_PE_START 0x0004000000000000ULL # define A_PE_STOP 0x0008000000000000ULL /* -- Misc actions -- */ /* Add a system generate "block" so that resources arent moved * to or are activly moved away from the affected node. This * way we can return quickly even if busy with other things. */ # define A_NODE_BLOCK 0x0010000000000000ULL /* Update our information in the local CIB */ # define A_UPDATE_NODESTATUS 0x0020000000000000ULL # define A_READCONFIG 0x0080000000000000ULL /* -- LRM Actions -- */ /* Connect to pacemaker-execd */ # define A_LRM_CONNECT 0x0100000000000000ULL /* Disconnect from pacemaker-execd */ # define A_LRM_DISCONNECT 0x0200000000000000ULL # define A_LRM_INVOKE 0x0400000000000000ULL # define A_LRM_EVENT 0x0800000000000000ULL /* -- Logging actions -- */ # define A_LOG 0x1000000000000000ULL # define A_ERROR 0x2000000000000000ULL # define A_WARN 0x4000000000000000ULL # define O_EXIT (A_SHUTDOWN|A_STOP|A_LRM_DISCONNECT|A_HA_DISCONNECT|A_EXIT_0|A_CIB_STOP) # define O_RELEASE (A_DC_TIMER_STOP|A_DC_RELEASE|A_PE_STOP|A_TE_STOP|A_DC_RELEASED) # define O_PE_RESTART (A_PE_START|A_PE_STOP) # define O_TE_RESTART (A_TE_START|A_TE_STOP) # define O_CIB_RESTART (A_CIB_START|A_CIB_STOP) # define O_LRM_RECONNECT (A_LRM_CONNECT|A_LRM_DISCONNECT) # define O_DC_TIMER_RESTART (A_DC_TIMER_STOP|A_DC_TIMER_START) /*====================================== * * "register" contents * * Things we may want to remember regardless of which state we are in. * * These also count as inputs for synthesizing I_* * *======================================*/ # define R_THE_DC 0x00000001ULL /* Are we the DC? */ # define R_STARTING 0x00000002ULL /* Are we starting up? */ # define R_SHUTDOWN 0x00000004ULL /* Are we trying to shut down? */ # define R_STAYDOWN 0x00000008ULL /* Should we restart? */ # define R_JOIN_OK 0x00000010ULL /* Have we completed the join process */ # define R_READ_CONFIG 0x00000040ULL # define R_INVOKE_PE 0x00000080ULL // Should the scheduler be invoked? # define R_CIB_CONNECTED 0x00000100ULL /* Is the CIB connected? */ # define R_PE_CONNECTED 0x00000200ULL // Is the scheduler connected? # define R_TE_CONNECTED 0x00000400ULL /* Is the Transition Engine connected? */ # define R_LRM_CONNECTED 0x00000800ULL // Is pacemaker-execd connected? # define R_CIB_REQUIRED 0x00001000ULL /* Is the CIB required? */ # define R_PE_REQUIRED 0x00002000ULL // Is the scheduler required? # define R_TE_REQUIRED 0x00004000ULL /* Is the Transition Engine required? */ # define R_ST_REQUIRED 0x00008000ULL /* Is the Stonith daemon required? */ # define R_CIB_DONE 0x00010000ULL /* Have we calculated the CIB? */ # define R_HAVE_CIB 0x00020000ULL /* Do we have an up-to-date CIB */ # define R_CIB_ASKED 0x00040000ULL /* Have we asked for an up-to-date CIB */ # define R_MEMBERSHIP 0x00100000ULL /* Have we got cluster layer data yet */ # define R_PEER_DATA 0x00200000ULL /* Have we got T_CL_STATUS data yet */ # define R_HA_DISCONNECTED 0x00400000ULL /* did we sign out of our own accord */ # define R_REQ_PEND 0x01000000ULL /* Are there Requests waiting for processing? */ # define R_PE_PEND 0x02000000ULL // Are we awaiting reply from scheduler? # define R_TE_PEND 0x04000000ULL /* Has the TE been invoked and we're awaiting completion? */ # define R_RESP_PEND 0x08000000ULL /* Do we have clients waiting on a response? if so perhaps we shouldn't stop yet */ # define R_IN_TRANSITION 0x10000000ULL /* */ # define R_SENT_RSC_STOP 0x20000000ULL /* Have we sent a stop action to all * resources in preparation for * shutting down */ # define R_IN_RECOVERY 0x80000000ULL /* * Magic RC used within the controller to indicate direct nacks * (operation is invalid in current state) */ #define CRM_DIRECT_NACK_RC (99) enum crmd_fsa_cause { C_UNKNOWN = 0, C_STARTUP, C_IPC_MESSAGE, C_HA_MESSAGE, C_CRMD_STATUS_CALLBACK, C_LRM_OP_CALLBACK, C_TIMER_POPPED, C_SHUTDOWN, C_FSA_INTERNAL, }; typedef struct fsa_timer_s fsa_timer_t; struct fsa_timer_s { guint source_id; /* timer source id */ int period_ms; /* timer period */ enum crmd_fsa_input fsa_input; gboolean(*callback) (gpointer data); gboolean repeat; int counter; }; enum fsa_data_type { fsa_dt_none, fsa_dt_ha_msg, fsa_dt_xml, fsa_dt_lrm, }; typedef struct fsa_data_s fsa_data_t; struct fsa_data_s { int id; enum crmd_fsa_input fsa_input; enum crmd_fsa_cause fsa_cause; long long actions; const char *origin; void *data; enum fsa_data_type data_type; }; /* Global FSA stuff */ extern volatile gboolean do_fsa_stall; extern volatile enum crmd_fsa_state fsa_state; extern volatile long long fsa_input_register; extern volatile long long fsa_actions; extern cib_t *fsa_cib_conn; extern char *fsa_our_uname; extern char *fsa_our_uuid; extern char *fsa_pe_ref; // Last invocation of the scheduler extern char *fsa_our_dc; extern char *fsa_our_dc_version; extern GListPtr fsa_message_queue; extern char *fsa_cluster_name; -extern election_t *fsa_election; extern fsa_timer_t *election_trigger; -extern fsa_timer_t *election_timeout; extern fsa_timer_t *shutdown_escalation_timer; extern fsa_timer_t *transition_timer; extern fsa_timer_t *integration_timer; extern fsa_timer_t *finalization_timer; extern fsa_timer_t *wait_timer; extern fsa_timer_t *recheck_timer; extern crm_trigger_t *fsa_source; extern crm_trigger_t *config_read; extern unsigned long long saved_ccm_membership_id; extern gboolean ever_had_quorum; // These should be moved elsewhere void do_update_cib_nodes(gboolean overwrite, const char *caller); int crmd_cib_smart_opt(void); xmlNode *do_lrm_query(gboolean, const char *node_name); const char *fsa_input2string(enum crmd_fsa_input input); const char *fsa_state2string(enum crmd_fsa_state state); const char *fsa_cause2string(enum crmd_fsa_cause cause); const char *fsa_action2string(long long action); enum crmd_fsa_state s_crmd_fsa(enum crmd_fsa_cause cause); # define AM_I_DC is_set(fsa_input_register, R_THE_DC) # define AM_I_OPERATIONAL (is_set(fsa_input_register, R_STARTING) == FALSE) # define trigger_fsa(source) do { \ crm_trace("Triggering FSA: %s", __FUNCTION__); \ mainloop_set_trigger(source); \ } while(0) /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data); /* A_PE_INVOKE */ void do_pe_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data); /* A_ERROR */ void do_error(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_LOG */ void do_log(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_CIB_START, STOP, RESTART */ void do_cib_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_LRM_CONNECT */ void do_lrm_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_PE_START, STOP, RESTART */ void do_pe_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_TE_START, STOP, RESTART */ void do_te_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_MSG_ROUTE */ void do_msg_route(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_ELECTION_VOTE */ void do_election_vote(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_ELECTION_COUNT */ void do_election_count_vote(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_ELECTION_CHECK */ void do_election_check(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_TIMER_STOP */ void do_timer_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_TAKEOVER */ void do_dc_takeover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_RELEASE */ void do_dc_release(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_JOIN_OFFER_ALL */ void do_dc_join_offer_all(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_JOIN_OFFER_ONE */ void do_dc_join_offer_one(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_JOIN_ACK */ void do_dc_join_ack(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_JOIN_REQ */ void do_dc_join_filter_offer(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_JOIN_FINALIZE */ void do_dc_join_finalize(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_CL_JOIN_QUERY */ /* is there a DC out there? */ void do_cl_join_query(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data); /* A_CL_JOIN_ANNOUNCE */ void do_cl_join_announce(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data); /* A_CL_JOIN_REQUEST */ void do_cl_join_offer_respond(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data); /* A_CL_JOIN_RESULT */ void do_cl_join_finalize_respond(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data); /* A_UPDATE_NODESTATUS */ void do_update_node_status(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_LRM_INVOKE */ void do_lrm_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_LRM_EVENT */ void do_lrm_event(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_TE_INVOKE, A_TE_CANCEL */ void do_te_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_TE_INVOKE */ void do_te_copyto(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data); /* A_DC_JOIN_FINAL */ void do_dc_join_final(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data); # include #endif diff --git a/daemons/controld/controld_membership.c b/daemons/controld/controld_membership.c index c90f745511..7ab2ad611a 100644 --- a/daemons/controld/controld_membership.c +++ b/daemons/controld/controld_membership.c @@ -1,442 +1,440 @@ /* * Copyright 2004-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ /* put these first so that uuid_t is defined without conflicts */ #include #include #include #include #include #include #include #include #include #include #include #include #include gboolean membership_flux_hack = FALSE; void post_cache_update(int instance); int last_peer_update = 0; guint highest_born_on = -1; extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); static void reap_dead_nodes(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; if (crm_is_peer_active(node) == FALSE) { crm_update_peer_join(__FUNCTION__, node, crm_join_none); if(node && node->uname) { - election_remove(fsa_election, node->uname); - if (safe_str_eq(fsa_our_uname, node->uname)) { crm_err("We're not part of the cluster anymore"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); } else if (AM_I_DC == FALSE && safe_str_eq(node->uname, fsa_our_dc)) { crm_warn("Our DC node (%s) left the cluster", node->uname); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } } if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) { check_join_state(fsa_state, __FUNCTION__); } if(node && node->uuid) { fail_incompletable_actions(transition_graph, node->uuid); } } } gboolean ever_had_quorum = FALSE; void post_cache_update(int instance) { xmlNode *no_op = NULL; crm_peer_seq = instance; crm_debug("Updated cache after membership event %d.", instance); g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL); set_bit(fsa_input_register, R_MEMBERSHIP); if (AM_I_DC) { populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer | node_update_expected, __FUNCTION__); } /* * If we lost nodes, we should re-check the election status * Safe to call outside of an election */ register_fsa_action(A_ELECTION_CHECK); /* Membership changed, remind everyone we're here. * This will aid detection of duplicate DCs */ no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD, AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL); send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE); free_xml(no_op); } static void crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; last_peer_update = 0; if (rc == pcmk_ok) { crm_trace("Node update %d complete", call_id); } else if(call_id < pcmk_ok) { crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } /*! * \internal * \brief Create an XML node state tag with updates * * \param[in,out] node Node whose state will be used for update * \param[in] flags Bitmask of node_update_flags indicating what to update * \param[in,out] parent XML node to contain update (or NULL) * \param[in] source Who requested the update (only used for logging) * * \return Pointer to created node state tag */ xmlNode * create_node_state_update(crm_node_t *node, int flags, xmlNode *parent, const char *source) { const char *value = NULL; xmlNode *node_state; if (!node->state) { crm_info("Node update for %s cancelled: no state, not seen yet", node->uname); return NULL; } node_state = create_xml_node(parent, XML_CIB_TAG_STATE); if (is_set(node->flags, crm_remote_node)) { crm_xml_add(node_state, XML_NODE_IS_REMOTE, XML_BOOLEAN_TRUE); } set_uuid(node_state, XML_ATTR_UUID, node); if (crm_element_value(node_state, XML_ATTR_UUID) == NULL) { crm_info("Node update for %s cancelled: no id", node->uname); free_xml(node_state); return NULL; } crm_xml_add(node_state, XML_ATTR_UNAME, node->uname); if ((flags & node_update_cluster) && node->state) { crm_xml_add_boolean(node_state, XML_NODE_IN_CLUSTER, safe_str_eq(node->state, CRM_NODE_MEMBER)); } if (!is_set(node->flags, crm_remote_node)) { if (flags & node_update_peer) { value = OFFLINESTATUS; if (is_set(node->processes, crm_get_cluster_proc())) { value = ONLINESTATUS; } crm_xml_add(node_state, XML_NODE_IS_PEER, value); } if (flags & node_update_join) { if (node->join <= crm_join_none) { value = CRMD_JOINSTATE_DOWN; } else { value = CRMD_JOINSTATE_MEMBER; } crm_xml_add(node_state, XML_NODE_JOIN_STATE, value); } if (flags & node_update_expected) { crm_xml_add(node_state, XML_NODE_EXPECTED, node->expected); } } crm_xml_add(node_state, XML_ATTR_ORIGIN, source); return node_state; } static void remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *node_uuid = user_data; do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE, "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)", node_uuid, pcmk_strerror(rc), rc); } static void search_conflicting_node_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *new_node_uuid = user_data; xmlNode *node_xml = NULL; if (rc != pcmk_ok) { if (rc != -ENXIO) { crm_notice("Searching conflicting nodes for %s failed: %s (%d)", new_node_uuid, pcmk_strerror(rc), rc); } return; } else if (output == NULL) { return; } if (safe_str_eq(crm_element_name(output), XML_CIB_TAG_NODE)) { node_xml = output; } else { node_xml = __xml_first_child(output); } for (; node_xml != NULL; node_xml = __xml_next(node_xml)) { const char *node_uuid = NULL; const char *node_uname = NULL; GHashTableIter iter; crm_node_t *node = NULL; gboolean known = FALSE; if (safe_str_neq(crm_element_name(node_xml), XML_CIB_TAG_NODE)) { continue; } node_uuid = crm_element_value(node_xml, XML_ATTR_ID); node_uname = crm_element_value(node_xml, XML_ATTR_UNAME); if (node_uuid == NULL || node_uname == NULL) { continue; } g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (node->uuid && safe_str_eq(node->uuid, node_uuid) && node->uname && safe_str_eq(node->uname, node_uname)) { known = TRUE; break; } } if (known == FALSE) { int delete_call_id = 0; xmlNode *node_state_xml = NULL; crm_notice("Deleting unknown node %s/%s which has conflicting uname with %s", node_uuid, node_uname, new_node_uuid); delete_call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, XML_CIB_TAG_NODES, node_xml, cib_scope_local | cib_quorum_override); fsa_register_cib_callback(delete_call_id, FALSE, strdup(node_uuid), remove_conflicting_node_callback); node_state_xml = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(node_state_xml, XML_ATTR_ID, node_uuid); crm_xml_add(node_state_xml, XML_ATTR_UNAME, node_uname); delete_call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state_xml, cib_scope_local | cib_quorum_override); fsa_register_cib_callback(delete_call_id, FALSE, strdup(node_uuid), remove_conflicting_node_callback); free_xml(node_state_xml); } } } static void node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if(call_id < pcmk_ok) { crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else if(rc < pcmk_ok) { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } #define NODE_PATH_MAX 512 void populate_cib_nodes(enum node_update_flags flags, const char *source) { int call_id = 0; gboolean from_hashtable = TRUE; int call_options = cib_scope_local | cib_quorum_override; xmlNode *node_list = create_xml_node(NULL, XML_CIB_TAG_NODES); #if SUPPORT_COROSYNC if (is_not_set(flags, node_update_quick) && is_corosync_cluster()) { from_hashtable = corosync_initialize_nodelist(NULL, FALSE, node_list); } #endif if (from_hashtable) { GHashTableIter iter; crm_node_t *node = NULL; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { xmlNode *new_node = NULL; crm_trace("Creating node entry for %s/%s", node->uname, node->uuid); if(node->uuid && node->uname) { char xpath[NODE_PATH_MAX]; /* We need both to be valid */ new_node = create_xml_node(node_list, XML_CIB_TAG_NODE); crm_xml_add(new_node, XML_ATTR_ID, node->uuid); crm_xml_add(new_node, XML_ATTR_UNAME, node->uname); /* Search and remove unknown nodes with the conflicting uname from CIB */ snprintf(xpath, NODE_PATH_MAX, "/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_NODES "/" XML_CIB_TAG_NODE "[@uname='%s'][@id!='%s']", node->uname, node->uuid); call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, xpath, NULL, cib_scope_local | cib_xpath); fsa_register_cib_callback(call_id, FALSE, strdup(node->uuid), search_conflicting_node_callback); } } } crm_trace("Populating section from %s", from_hashtable ? "hashtable" : "cluster"); fsa_cib_update(XML_CIB_TAG_NODES, node_list, call_options, call_id, NULL); fsa_register_cib_callback(call_id, FALSE, NULL, node_list_update_callback); free_xml(node_list); if (call_id >= pcmk_ok && crm_peer_cache != NULL && AM_I_DC) { /* * There is no need to update the local CIB with our values if * we've not seen valid membership data */ GHashTableIter iter; crm_node_t *node = NULL; node_list = create_xml_node(NULL, XML_CIB_TAG_STATUS); g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { create_node_state_update(node, flags, node_list, source); } if (crm_remote_peer_cache) { g_hash_table_iter_init(&iter, crm_remote_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { create_node_state_update(node, flags, node_list, source); } } fsa_cib_update(XML_CIB_TAG_STATUS, node_list, call_options, call_id, NULL); fsa_register_cib_callback(call_id, FALSE, NULL, crmd_node_update_complete); last_peer_update = call_id; free_xml(node_list); } } static void cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Quorum update %d complete", call_id); } else { crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void crm_update_quorum(gboolean quorum, gboolean force_update) { ever_had_quorum |= quorum; if(ever_had_quorum && quorum == FALSE && no_quorum_suicide_escalation) { pcmk_panic(__FUNCTION__); } if (AM_I_DC && (force_update || fsa_has_quorum != quorum)) { int call_id = 0; xmlNode *update = NULL; int call_options = cib_scope_local | cib_quorum_override; update = create_xml_node(NULL, XML_TAG_CIB); crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum); crm_xml_add(update, XML_ATTR_DC_UUID, fsa_our_uuid); fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL); crm_debug("Updating quorum status to %s (call=%d)", quorum ? "true" : "false", call_id); fsa_register_cib_callback(call_id, FALSE, NULL, cib_quorum_update_complete); free_xml(update); /* Quorum changes usually cause a new transition via other activity: * quorum gained via a node joining will abort via the node join, * and quorum lost via a node leaving will usually abort via resource * activity and/or fencing. * * However, it is possible that nothing else causes a transition (e.g. * someone forces quorum via corosync-cmaptcl, or quorum is lost due to * a node in standby shutting down cleanly), so here ensure a new * transition is triggered. */ if (quorum) { /* If quorum was gained, abort after a short delay, in case multiple * nodes are joining around the same time, so the one that brings us * to quorum doesn't cause all the remaining ones to be fenced. */ abort_after_delay(INFINITY, tg_restart, "Quorum gained", 5000); } else { abort_transition(INFINITY, tg_restart, "Quorum lost", NULL); } } fsa_has_quorum = quorum; } diff --git a/daemons/controld/pacemaker-controld.h b/daemons/controld/pacemaker-controld.h index 98c0d82ec1..a4123a091d 100644 --- a/daemons/controld/pacemaker-controld.h +++ b/daemons/controld/pacemaker-controld.h @@ -1,26 +1,22 @@ -/* - * Copyright (C) 2004 Andrew Beekhof - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +/* + * Copyright 2004-2018 Andrew Beekhof + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ + #ifndef CRMD__H # define CRMD__H extern GMainLoop *crmd_mainloop; extern bool no_quorum_suicide_escalation; -extern void crmd_metadata(void); +void crmd_metadata(void); +void controld_election_init(const char *uname); +void controld_remove_voter(const char *uname); +void controld_stop_election_timeout(void); +void controld_election_fini(void); +void controld_set_election_period(const char *value); +void controld_stop_election_timer(void); #endif diff --git a/include/crm/cluster/election.h b/include/crm/cluster/election.h index 7ce20bf436..ab82ae6725 100644 --- a/include/crm/cluster/election.h +++ b/include/crm/cluster/election.h @@ -1,59 +1,84 @@ /* - * Copyright (C) 2009 Andrew Beekhof + * Copyright 2009-2018 Andrew Beekhof * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ + #ifndef CRM_COMMON_ELECTION__H # define CRM_COMMON_ELECTION__H #ifdef __cplusplus extern "C" { #endif /** * \file * \brief Functions for conducting elections + * + * An election is useful for a daemon that runs on all nodes but needs any one + * instance to perform a special role. + * + * Elections are closely tied to the cluster peer cache. Peers in the cache that + * are active members are eligible to vote. Elections are named for logging + * purposes, but only one election may exist at any time, so typically an + * election would be created at daemon start-up and freed at shutdown. + * + * Pacemaker's election procedure has been heavily adapted from the + * Invitation Algorithm variant of the Garcia-Molina Bully Algorithm: + * + * https://en.wikipedia.org/wiki/Bully_algorithm + * + * Elections are conducted via cluster messages. There are two types of + * messages: a "vote" is a declaration of the voting node's candidacy, and is + * always broadcast; a "no-vote" is a concession by the responding node, and is + * always a reply to the preferred node's vote. (These correspond to "invite" + * and "accept" in the traditional algorithm.) + * + * A vote together with any no-vote replies to it is considered an election + * round. Rounds are numbered with a simple counter unique to each node + * (this would be the group number in the traditional algorithm). Concurrent + * election rounds are possible. + * + * An election round is started when any node broadcasts a vote. When a node + * receives another node's vote, it compares itself against the sending node + * according to certain metrics, and either starts a new round (if it prefers + * itself) or replies to the other node with a no-vote (if it prefers that + * node). + * + * If a node receives no-votes from all other active nodes, it declares itself + * the winner. The library API does not notify other nodes of this; callers + * must implement that if desired. + * * \ingroup core */ typedef struct election_s election_t; -enum election_result -{ - election_start = 0, - election_in_progress, - election_lost, - election_won, - election_error, +/*! Possible election states */ +enum election_result { + election_start = 0, /*! new election needed */ + election_in_progress, /*! election started but not all peers have voted */ + election_lost, /*! local node lost most recent election */ + election_won, /*! local node won most recent election */ + election_error, /*! election message or election object invalid */ }; void election_fini(election_t *e); void election_reset(election_t *e); election_t *election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb); void election_timeout_set_period(election_t *e, guint period_ms); void election_timeout_stop(election_t *e); void election_vote(election_t *e); bool election_check(election_t *e); void election_remove(election_t *e, const char *uname); enum election_result election_state(election_t *e); enum election_result election_count_vote(election_t *e, xmlNode *vote, bool can_win); #ifdef __cplusplus } #endif #endif diff --git a/lib/cluster/election.c b/lib/cluster/election.c index 567729bc64..c39694a342 100644 --- a/lib/cluster/election.c +++ b/lib/cluster/election.c @@ -1,500 +1,703 @@ /* * Copyright 2004-2018 Andrew Beekhof * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #define STORM_INTERVAL 2 /* in seconds */ -struct election_s -{ - enum election_result state; - guint count; - char *name; - char *uname; - GSourceFunc cb; - GHashTable *voted; - mainloop_timer_t *timeout; /* When to stop if not everyone casts a vote */ +struct election_s { + enum election_result state; + guint count; // How many times local node has voted + char *name; // Descriptive name for this election + char *uname; // Local node's name + GSourceFunc cb; // Function to call if election is won + GHashTable *voted; // Key = node name, value = how node voted + mainloop_timer_t *timeout; // When to abort if all votes not received }; static void election_complete(election_t *e) { - crm_info("Election %s complete", e->name); e->state = election_won; if(e->cb) { e->cb(e); } election_reset(e); } static gboolean election_timer_cb(gpointer user_data) { election_t *e = user_data; - crm_info("Election %s %p timed out", e->name, e); + crm_info("%s timed out, declaring local node as winner", e->name); election_complete(e); return FALSE; } enum election_result election_state(election_t *e) { if(e) { return e->state; } return election_error; } +/*! + * \brief Create a new election object + * + * Every node that wishes to participate in an election must create an election + * object. Typically, this should be done once, at start-up. A caller should + * only create a single election object. + * + * \param[in] name Label for election (for logging) + * \param[in] uname Local node's name + * \param[in] period_ms How long to wait for all peers to vote + * \param[in] cb Function to call if local node wins election + * + * \return Newly allocated election object on success, NULL on error + * \note The caller is responsible for freeing the returned value using + * election_fini(). + */ election_t * election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb) { + election_t *e = NULL; + static guint count = 0; - election_t *e = calloc(1, sizeof(election_t)); - if(e != NULL) { - if(name) { - e->name = crm_strdup_printf("election-%s", name); - } else { - e->name = crm_strdup_printf("election-%u", count++); - } + CRM_CHECK(uname != NULL, return NULL); + + e = calloc(1, sizeof(election_t)); + if (e == NULL) { + crm_perror(LOG_CRIT, "Cannot create election"); + return NULL; + } - e->cb = cb; - e->uname = strdup(uname); - e->timeout = mainloop_timer_add(e->name, period_ms, FALSE, election_timer_cb, e); - crm_trace("Created %s %p", e->name, e); + e->uname = strdup(uname); + if (e->uname == NULL) { + crm_perror(LOG_CRIT, "Cannot create election"); + free(e); + return NULL; } + + e->name = name? crm_strdup_printf("election-%s", name) + : crm_strdup_printf("election-%u", count++); + e->cb = cb; + e->timeout = mainloop_timer_add(e->name, period_ms, FALSE, + election_timer_cb, e); + crm_trace("Created %s", e->name); return e; } +/*! + * \brief Disregard any previous vote by specified peer + * + * This discards any recorded vote from a specified peer. Election users should + * call this whenever a voting peer becomes inactive. + * + * \param[in] e Election object + * \param[in] uname Name of peer to disregard + */ void election_remove(election_t *e, const char *uname) { if(e && uname && e->voted) { + crm_trace("Discarding %s (no-)vote from lost peer %s", e->name, uname); g_hash_table_remove(e->voted, uname); } } +/*! + * \brief Stop election timer and disregard all votes + * + * \param[in] e Election object + */ void election_reset(election_t *e) { if (e != NULL) { crm_trace("Resetting election %s", e->name); mainloop_timer_stop(e->timeout); if (e->voted) { crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted)); g_hash_table_destroy(e->voted); e->voted = NULL; } } } +/*! + * \brief Free an election object + * + * Free all memory associated with an election object, stopping its + * election timer (if running). + * + * \param[in] e Election object + */ void election_fini(election_t *e) { if(e) { election_reset(e); crm_trace("Destroying %s", e->name); mainloop_timer_del(e->timeout); free(e->uname); free(e->name); free(e); } } static void election_timeout_start(election_t *e) { if(e) { mainloop_timer_start(e->timeout); } } +/*! + * \brief Stop an election's timer, if running + * + * \param[in] e Election object + */ void election_timeout_stop(election_t *e) { if(e) { mainloop_timer_stop(e->timeout); } } +/*! + * \brief Change an election's timeout (restarting timer if running) + * + * \param[in] e Election object + * \param[in] period New timeout + */ void election_timeout_set_period(election_t *e, guint period) { if(e) { mainloop_timer_set_period(e->timeout, period); } else { crm_err("No election defined"); } } static int crm_uptime(struct timeval *output) { static time_t expires = 0; static struct rusage info; time_t tm_now = time(NULL); if (expires < tm_now) { int rc = 0; info.ru_utime.tv_sec = 0; info.ru_utime.tv_usec = 0; rc = getrusage(RUSAGE_SELF, &info); output->tv_sec = 0; output->tv_usec = 0; if (rc < 0) { crm_perror(LOG_ERR, "Could not calculate the current uptime"); expires = 0; return -1; } crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec, (long)info.ru_utime.tv_usec); } expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */ output->tv_sec = info.ru_utime.tv_sec; output->tv_usec = info.ru_utime.tv_usec; return 1; } static int crm_compare_age(struct timeval your_age) { struct timeval our_age; crm_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */ if (our_age.tv_sec > your_age.tv_sec) { crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec); return 1; } else if (our_age.tv_sec < your_age.tv_sec) { crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec); return -1; } else if (our_age.tv_usec > your_age.tv_usec) { crm_debug("Win: %ld.%ld vs %ld.%ld (usec)", (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec); return 1; } else if (our_age.tv_usec < your_age.tv_usec) { crm_debug("Lose: %ld.%ld vs %ld.%ld (usec)", (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec); return -1; } return 0; } +/*! + * \brief Start a new election by offering local node's candidacy + * + * Broadcast a "vote" election message containing the local node's ID, + * (incremented) election counter, and uptime, and start the election timer. + * + * \param[in] e Election object + * \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if + * all active peers do so, or if the election times out, the local node + * wins the election. (If we lose to any peer vote, we will stop the + * timer, so a timeout means we did not lose -- either some peer did not + * vote, or we did not call election_check() in time.) + */ void election_vote(election_t *e) { struct timeval age; xmlNode *vote = NULL; crm_node_t *our_node; - if(e == NULL) { - crm_trace("Not voting in election: not initialized"); + if (e == NULL) { + crm_trace("Election vote requested, but no election available"); return; } our_node = crm_get_peer(0, e->uname); - if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) { - crm_trace("Cannot vote yet: %p", our_node); + if ((our_node == NULL) || (crm_is_peer_active(our_node) == FALSE)) { + crm_trace("Cannot vote in %s yet: local node not connected to cluster", + e->name); return; } + election_reset(e); e->state = election_in_progress; vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); e->count++; crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid); crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count); crm_uptime(&age); crm_xml_add_int(vote, F_CRM_ELECTION_AGE_S, age.tv_sec); crm_xml_add_int(vote, F_CRM_ELECTION_AGE_US, age.tv_usec); send_cluster_message(NULL, crm_msg_crmd, vote, TRUE); free_xml(vote); - crm_debug("Started election %d", e->count); - if (e->voted) { - g_hash_table_destroy(e->voted); - e->voted = NULL; - } - + crm_debug("Started %s round %d", e->name, e->count); election_timeout_start(e); return; } +/*! + * \brief Check whether local node has won an election + * + * If all known peers have sent no-vote messages, stop the election timer, set + * the election state to won, and call any registered win callback. + * + * \param[in] e Election object + * + * \return TRUE if local node has won, FALSE otherwise + * \note If all known peers have sent no-vote messages, but the election owner + * does not call this function, the election will not be won (and the + * callback will not be called) until the election times out. + * \note This should be called when election_count_vote() returns + * \c election_in_progress. + */ bool election_check(election_t *e) { int voted_size = 0; - int num_members = crm_active_peers(); + int num_members = 0; if(e == NULL) { - crm_trace("not initialized"); + crm_trace("Election check requested, but no election available"); return FALSE; } - - if (e->voted) { - voted_size = g_hash_table_size(e->voted); + if (e->voted == NULL) { + crm_trace("%s check requested, but no votes received yet", e->name); + return FALSE; } + + voted_size = g_hash_table_size(e->voted); + num_members = crm_active_peers(); + /* in the case of #voted > #members, it is better to * wait for the timeout and give the cluster time to * stabilize */ if (voted_size >= num_members) { /* we won and everyone has voted */ election_timeout_stop(e); if (voted_size > num_members) { GHashTableIter gIter; const crm_node_t *node; char *key = NULL; + crm_warn("Received too many votes in %s", e->name); g_hash_table_iter_init(&gIter, crm_peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) { if (crm_is_peer_active(node)) { - crm_err("member: %s proc=%.32x", node->uname, node->processes); + crm_warn("* expected vote: %s", node->uname); } } g_hash_table_iter_init(&gIter, e->voted); while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) { - crm_err("voted: %s", key); + crm_warn("* actual vote: %s", key); } } + crm_info("%s won by local node", e->name); election_complete(e); return TRUE; } else { - crm_debug("Still waiting on %d non-votes (%d total)", - num_members - voted_size, num_members); + crm_debug("%s still waiting on %d of %d votes", + e->name, num_members - voted_size, num_members); } return FALSE; } #define LOSS_DAMPEN 2 /* in seconds */ -/* A_ELECTION_COUNT */ +struct vote { + const char *op; + const char *from; + const char *version; + const char *election_owner; + int election_id; + struct timeval age; +}; + +/*! + * \brief Unpack an election message + * + * \param[in] e Election object + * \param[in] message Election message XML + * \param[out] vote Parsed fields from message + * + * \return TRUE if election message and election are valid, FALSE otherwise + * \note The parsed struct's pointer members are valid only for the lifetime of + * the message argument. + */ +static bool +parse_election_message(election_t *e, xmlNode *message, struct vote *vote) +{ + CRM_CHECK(message && vote, return FALSE); + + vote->election_id = -1; + vote->age.tv_sec = -1; + vote->age.tv_usec = -1; + + vote->op = crm_element_value(message, F_CRM_TASK); + vote->from = crm_element_value(message, F_CRM_HOST_FROM); + vote->version = crm_element_value(message, F_CRM_VERSION); + vote->election_owner = crm_element_value(message, F_CRM_ELECTION_OWNER); + + crm_element_value_int(message, F_CRM_ELECTION_ID, &(vote->election_id)); + + if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL) + || (vote->election_owner == NULL) || (vote->election_id < 0)) { + + crm_warn("Invalid %s message from %s in %s ", + (vote->op? vote->op : "election"), + (vote->from? vote->from : "unspecified node"), + (e? e->name : "election")); + return FALSE; + } + + // Op-specific validation + + if (crm_str_eq(vote->op, CRM_OP_VOTE, TRUE)) { + // Only vote ops have uptime + int age_s = -1; + int age_us = -1; + + // @TODO add functions to parse time_t / suseconds_t directly from XML + crm_element_value_int(message, F_CRM_ELECTION_AGE_S, &age_s); + crm_element_value_int(message, F_CRM_ELECTION_AGE_US, &age_us); + + if ((age_s < 0) || (age_us < 0)) { + crm_warn("Cannot count %s %s from %s because it is missing uptime", + (e? e->name : "election"), vote->op, vote->from); + return FALSE; + } + vote->age.tv_sec = age_s; + vote->age.tv_usec = age_us; + + } else if (!crm_str_eq(vote->op, CRM_OP_NOVOTE, TRUE)) { + crm_info("Cannot process %s message from %s because %s is not a known election op", + (e? e->name : "election"), vote->from, vote->op); + return FALSE; + } + + // Election validation + + if (e == NULL) { + crm_info("Cannot count %s from %s because no election available", + vote->op, vote->from); + return FALSE; + } + + /* If the membership cache is NULL, we REALLY shouldn't be voting -- + * the question is how we managed to get here. + */ + if (crm_peer_cache == NULL) { + crm_info("Cannot count %s %s from %s because no peer information available", + e->name, vote->op, vote->from); + return FALSE; + } + return TRUE; +} + +static void +record_vote(election_t *e, struct vote *vote) +{ + char *voter_copy = NULL; + char *vote_copy = NULL; + + CRM_ASSERT(e && vote && vote->from && vote->op); + if (e->voted == NULL) { + e->voted = crm_str_table_new(); + } + + voter_copy = strdup(vote->from); + vote_copy = strdup(vote->op); + CRM_ASSERT(voter_copy && vote_copy); + + g_hash_table_replace(e->voted, voter_copy, vote_copy); +} + +static void +send_no_vote(crm_node_t *peer, struct vote *vote) +{ + // @TODO probably shouldn't hardcode CRM_SYSTEM_CRMD and crm_msg_crmd + + xmlNode *novote = create_request(CRM_OP_NOVOTE, NULL, vote->from, + CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); + + crm_xml_add(novote, F_CRM_ELECTION_OWNER, vote->election_owner); + crm_xml_add_int(novote, F_CRM_ELECTION_ID, vote->election_id); + + send_cluster_message(peer, crm_msg_crmd, novote, TRUE); + free_xml(novote); +} + +/*! + * \brief Process an election message (vote or no-vote) from a peer + * + * \param[in] e Election object + * \param[in] vote Election message XML from peer + * \param[in] can_win Whether to consider the local node eligible for winning + * + * \return Election state after new vote is considered + * \note If the peer message is a vote, and we prefer the peer to win, this will + * send a no-vote reply to the peer. + * \note The situations "we lost to this vote" from "this is a late no-vote + * after we've already lost" both return election_lost. If a caller needs + * to distinguish them, it should save the current state before calling + * this function, and then compare the result. + */ enum election_result -election_count_vote(election_t *e, xmlNode *vote, bool can_win) +election_count_vote(election_t *e, xmlNode *message, bool can_win) { - int age = 0; - int election_id = -1; int log_level = LOG_INFO; gboolean done = FALSE; gboolean we_lose = FALSE; - const char *op = NULL; - const char *from = NULL; const char *reason = "unknown"; - const char *election_owner = NULL; + bool we_are_owner = FALSE; crm_node_t *our_node = NULL, *your_node = NULL; + time_t tm_now = time(NULL); + struct vote vote; + // @TODO these should be in election_t static int election_wins = 0; - - xmlNode *novote = NULL; - time_t tm_now = time(NULL); static time_t expires = 0; static time_t last_election_loss = 0; - /* if the membership copy is NULL we REALLY shouldn't be voting - * the question is how we managed to get here. - */ - - CRM_CHECK(vote != NULL, return election_error); - - if(e == NULL) { - crm_info("Not voting in election: not initialized"); - return election_lost; - - } else if(crm_peer_cache == NULL) { - crm_info("Not voting in election: no peer cache"); - return election_lost; + CRM_CHECK(message != NULL, return election_error); + if (parse_election_message(e, message, &vote) == FALSE) { + return election_error; } - op = crm_element_value(vote, F_CRM_TASK); - from = crm_element_value(vote, F_CRM_HOST_FROM); - election_owner = crm_element_value(vote, F_CRM_ELECTION_OWNER); - crm_element_value_int(vote, F_CRM_ELECTION_ID, &election_id); - - your_node = crm_get_peer(0, from); + your_node = crm_get_peer(0, vote.from); our_node = crm_get_peer(0, e->uname); - - if (e->voted == NULL) { - crm_debug("Created voted hash"); - e->voted = crm_str_table_new(); - } + we_are_owner = (our_node != NULL) + && crm_str_eq(our_node->uuid, vote.election_owner, TRUE); if(can_win == FALSE) { reason = "Not eligible"; we_lose = TRUE; } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) { reason = "We are not part of the cluster"; log_level = LOG_ERR; we_lose = TRUE; - } else if (election_id != e->count && crm_str_eq(our_node->uuid, election_owner, TRUE)) { + } else if (we_are_owner && (vote.election_id != e->count)) { log_level = LOG_TRACE; reason = "Superseded"; done = TRUE; } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) { /* Possibly we cached the message in the FSA queue at a point that it wasn't */ reason = "Peer is not part of our cluster"; log_level = LOG_WARNING; done = TRUE; - } else if (crm_str_eq(op, CRM_OP_NOVOTE, TRUE)) { - char *op_copy = strdup(op); - char *uname_copy = strdup(from); - - CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE)); - - /* update the list of nodes that have voted */ - g_hash_table_replace(e->voted, uname_copy, op_copy); + } else if (crm_str_eq(vote.op, CRM_OP_NOVOTE, TRUE) + || crm_str_eq(vote.from, e->uname, TRUE)) { + /* Receiving our own broadcast vote, or a no-vote from peer, is a vote + * for us to win + */ + if (!we_are_owner) { + crm_warn("Cannot count %s round %d %s from %s because we are not election owner (%s)", + e->name, vote.election_id, vote.op, vote.from, + vote.election_owner); + return election_error; + } + if (e->state != election_in_progress) { + // Should only happen if we already lost + crm_debug("Not counting %s round %d %s from %s because no election in progress", + e->name, vote.election_id, vote.op, vote.from); + return e->state; + } + record_vote(e, &vote); reason = "Recorded"; done = TRUE; } else { - struct timeval your_age; - const char *your_version = crm_element_value(vote, F_CRM_VERSION); - int tv_sec = 0; - int tv_usec = 0; - - crm_element_value_int(vote, F_CRM_ELECTION_AGE_S, &tv_sec); - crm_element_value_int(vote, F_CRM_ELECTION_AGE_US, &tv_usec); - - your_age.tv_sec = tv_sec; - your_age.tv_usec = tv_usec; + // A peer vote requires a comparison to determine which node is better + int age_result = crm_compare_age(vote.age); + int version_result = compare_version(vote.version, CRM_FEATURE_SET); - age = crm_compare_age(your_age); - if (crm_str_eq(from, e->uname, TRUE)) { - char *op_copy = strdup(op); - char *uname_copy = strdup(from); - - CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE)); - - /* update ourselves in the list of nodes that have voted */ - g_hash_table_replace(e->voted, uname_copy, op_copy); - reason = "Recorded"; - done = TRUE; - - } else if (compare_version(your_version, CRM_FEATURE_SET) < 0) { + if (version_result < 0) { reason = "Version"; we_lose = TRUE; - } else if (compare_version(your_version, CRM_FEATURE_SET) > 0) { + } else if (version_result > 0) { reason = "Version"; - } else if (age < 0) { + } else if (age_result < 0) { reason = "Uptime"; we_lose = TRUE; - } else if (age > 0) { + } else if (age_result > 0) { reason = "Uptime"; - } else if (e->uname == NULL) { - reason = "Unknown host name"; - we_lose = TRUE; - - } else if (strcasecmp(e->uname, from) > 0) { + } else if (strcasecmp(e->uname, vote.from) > 0) { reason = "Host name"; we_lose = TRUE; } else { reason = "Host name"; - CRM_ASSERT(strcasecmp(e->uname, from) < 0); -/* can't happen... - * } else if(strcasecmp(e->uname, from) == 0) { - * - */ } } if (expires < tm_now) { election_wins = 0; expires = tm_now + STORM_INTERVAL; } else if (done == FALSE && we_lose == FALSE) { int peers = 1 + g_hash_table_size(crm_peer_cache); + static bool wrote_blackbox = FALSE; // @TODO move to election_t /* If every node has to vote down every other node, thats N*(N-1) total elections * Allow some leeway before _really_ complaining */ election_wins++; if (election_wins > (peers * peers)) { - crm_warn("Election storm detected: %d elections in %d seconds", election_wins, - STORM_INTERVAL); + crm_warn("%s election storm detected: %d wins in %d seconds", + e->name, election_wins, STORM_INTERVAL); election_wins = 0; expires = tm_now + STORM_INTERVAL; - crm_write_blackbox(0, NULL); + if (wrote_blackbox == FALSE) { + /* It's questionable whether a black box (from every node in the + * cluster) would be truly helpful in diagnosing an election + * storm. It's also highly doubtful a production environment + * would get multiple election storms from distinct causes, so + * saving one blackbox per process lifetime should be + * sufficient. Alternatives would be to save a timestamp of the + * last blackbox write instead of a boolean, and write a new one + * if some amount of time has passed; or to save a storm count, + * write a blackbox on every Nth occurrence. + */ + crm_write_blackbox(0, NULL); + } } } if (done) { - do_crm_log(log_level + 1, "Election %d (current: %d, owner: %s): Processed %s from %s (%s)", - election_id, e->count, election_owner, op, from, reason); + do_crm_log(log_level + 1, + "Processed %s round %d %s (current round %d) from %s (%s)", + e->name, vote.election_id, vote.op, e->count, vote.from, + reason); return e->state; } else if (we_lose == FALSE) { - do_crm_log(log_level, "Election %d (owner: %s) pass: %s from %s (%s)", - election_id, election_owner, op, from, reason); - if (last_election_loss == 0 || tm_now - last_election_loss > (time_t) LOSS_DAMPEN) { + do_crm_log(log_level, "%s round %d (owner node ID %s) pass: %s from %s (%s)", + e->name, vote.election_id, vote.election_owner, vote.op, + vote.from, reason); + last_election_loss = 0; election_timeout_stop(e); /* Start a new election by voting down this, and other, peers */ e->state = election_start; return e->state; - } + } else { + char *loss_time = ctime(&last_election_loss); - crm_info("Election %d ignore: We already lost an election less than %ds ago (%s)", - election_id, LOSS_DAMPEN, ctime(&last_election_loss)); + if (loss_time) { + // Show only HH:MM:SS + loss_time += 11; + loss_time[8] = '\0'; + } + crm_info("Ignoring %s round %d (owner node ID %s) pass vs %s because we lost less than %ds ago at %s", + e->name, vote.election_id, vote.election_owner, vote.from, + LOSS_DAMPEN, (loss_time? loss_time : "unknown")); + } } - novote = create_request(CRM_OP_NOVOTE, NULL, from, - CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); - - do_crm_log(log_level, "Election %d (owner: %s) lost: %s from %s (%s)", - election_id, election_owner, op, from, reason); - - election_timeout_stop(e); - - crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner); - crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id); + last_election_loss = tm_now; - send_cluster_message(your_node, crm_msg_crmd, novote, TRUE); - free_xml(novote); + do_crm_log(log_level, "%s round %d (owner node ID %s) lost: %s from %s (%s)", + e->name, vote.election_id, vote.election_owner, vote.op, + vote.from, reason); - last_election_loss = tm_now; + election_reset(e); + send_no_vote(your_node, &vote); e->state = election_lost; return e->state; }