Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F2822936
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
44 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index cb9ee4e514..34759acc0d 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -1,186 +1,185 @@
/*
* Copyright 2013-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <crm/cluster.h>
#include <crm/cluster/election_internal.h>
#include <crm/common/xml.h>
#include "pacemaker-attrd.h"
static char *peer_writer = NULL;
static pcmk__election_t *writer = NULL;
static gboolean
attrd_election_cb(gpointer user_data)
{
attrd_declare_winner();
/* Update the peers after an election */
attrd_peer_sync(NULL);
/* After winning an election, update the CIB with the values of all
* attributes as the winner knows them.
*/
attrd_write_attributes(attrd_write_all);
return G_SOURCE_REMOVE;
}
void
attrd_election_init(void)
{
writer = election_init(pcmk_ipc_attrd, PCMK__VALUE_ATTRD,
- attrd_cluster->priv->node_name, 120000,
- attrd_election_cb);
+ attrd_cluster->priv->node_name, attrd_election_cb);
}
void
attrd_election_fini(void)
{
election_fini(writer);
}
void
attrd_start_election_if_needed(void)
{
if ((peer_writer == NULL)
&& (election_state(writer) != election_in_progress)
&& !attrd_shutting_down(false)) {
crm_info("Starting an election to determine the writer");
election_vote(writer);
}
}
bool
attrd_election_won(void)
{
return (election_state(writer) == election_won);
}
void
attrd_handle_election_op(const pcmk__node_status_t *peer, xmlNode *xml)
{
enum election_result rc = 0;
enum election_result previous = election_state(writer);
crm_xml_add(xml, PCMK__XA_SRC, peer->name);
// Don't become writer if we're shutting down
rc = election_count_vote(writer, xml, !attrd_shutting_down(false));
switch(rc) {
case election_start:
crm_debug("Unsetting writer (was %s) and starting new election",
peer_writer? peer_writer : "unset");
free(peer_writer);
peer_writer = NULL;
election_vote(writer);
break;
case election_lost:
/* The election API should really distinguish between "we just lost
* to this peer" and "we already lost previously, and we are
* discarding this vote for some reason", but it doesn't.
*
* In the first case, we want to tentatively set the peer writer to
* this peer, even though another peer may eventually win (which we
* will learn via attrd_check_for_new_writer()), so
* attrd_start_election_if_needed() doesn't start a new election.
*
* Approximate a test for that case as best as possible.
*/
if ((peer_writer == NULL) || (previous != election_lost)) {
pcmk__str_update(&peer_writer, peer->name);
crm_debug("Election lost, presuming %s is writer for now",
peer_writer);
}
break;
case election_in_progress:
election_check(writer);
break;
default:
crm_info("Ignoring election op from %s due to error", peer->name);
break;
}
}
bool
attrd_check_for_new_writer(const pcmk__node_status_t *peer, const xmlNode *xml)
{
int peer_state = 0;
crm_element_value_int(xml, PCMK__XA_ATTR_WRITER, &peer_state);
if (peer_state == election_won) {
if ((election_state(writer) == election_won)
&& !pcmk__str_eq(peer->name, attrd_cluster->priv->node_name,
pcmk__str_casei)) {
crm_notice("Detected another attribute writer (%s), starting new "
"election",
peer->name);
election_vote(writer);
} else if (!pcmk__str_eq(peer->name, peer_writer, pcmk__str_casei)) {
crm_notice("Recorded new attribute writer: %s (was %s)",
peer->name, pcmk__s(peer_writer, "unset"));
pcmk__str_update(&peer_writer, peer->name);
}
}
return (peer_state == election_won);
}
void
attrd_declare_winner(void)
{
crm_notice("Recorded local node as attribute writer (was %s)",
(peer_writer? peer_writer : "unset"));
pcmk__str_update(&peer_writer, attrd_cluster->priv->node_name);
}
void
attrd_remove_voter(const pcmk__node_status_t *peer)
{
election_remove(writer, peer->name);
if ((peer_writer != NULL)
&& pcmk__str_eq(peer->name, peer_writer, pcmk__str_casei)) {
free(peer_writer);
peer_writer = NULL;
crm_notice("Lost attribute writer %s", peer->name);
/* Clear any election dampening in effect. Otherwise, if the lost writer
* had just won, the election could fizzle out with no new writer.
*/
election_clear_dampening(writer);
/* If the writer received attribute updates during its shutdown, it will
* not have written them to the CIB. Ensure we get a new writer so they
* are written out. This means that every node that sees the writer
* leave will start a new election, but that's better than losing
* attributes.
*/
attrd_start_election_if_needed();
/* If an election is in progress, we need to call election_check(), in case
* this lost peer is the only one that hasn't voted, otherwise the election
* would be pending until it's timed out.
*/
} else if (election_state(writer) == election_in_progress) {
crm_debug("Checking election status upon loss of voter %s", peer->name);
election_check(writer);
}
}
void
attrd_xml_add_writer(xmlNode *xml)
{
crm_xml_add_int(xml, PCMK__XA_ATTR_WRITER, election_state(writer));
}
diff --git a/daemons/controld/controld_election.c b/daemons/controld/controld_election.c
index c1b14dc33b..33dac058f3 100644
--- a/daemons/controld/controld_election.c
+++ b/daemons/controld/controld_election.c
@@ -1,291 +1,291 @@
/*
* Copyright 2004-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <crm/common/xml.h>
#include <crm/cluster/internal.h>
#include <crm/cluster/election_internal.h>
#include <crm/crm.h>
#include <pacemaker-controld.h>
static pcmk__election_t *fsa_election = NULL;
static gboolean
election_win_cb(gpointer data)
{
register_fsa_input(C_FSA_INTERNAL, I_ELECTION_DC, NULL);
return FALSE;
}
void
controld_election_init(const char *uname)
{
- fsa_election = election_init(pcmk_ipc_controld, "DC", uname, 60000,
+ fsa_election = election_init(pcmk_ipc_controld, "DC", uname,
election_win_cb);
}
/*!
* \internal
* \brief Configure election options based on the CIB
*
* \param[in,out] options Name/value pairs for configured options
*/
void
controld_configure_election(GHashTable *options)
{
const char *value = g_hash_table_lookup(options, PCMK_OPT_ELECTION_TIMEOUT);
guint interval_ms = 0U;
pcmk_parse_interval_spec(value, &interval_ms);
election_timeout_set_period(fsa_election, interval_ms);
}
void
controld_remove_voter(const char *uname)
{
election_remove(fsa_election, uname);
if (pcmk__str_eq(uname, controld_globals.dc_name, pcmk__str_casei)) {
/* Clear any election dampening in effect. Otherwise, if the lost DC had
* just won, an immediate new election could fizzle out with no new DC.
*/
election_clear_dampening(fsa_election);
}
}
void
controld_election_fini(void)
{
election_fini(fsa_election);
fsa_election = NULL;
}
void
controld_stop_current_election_timeout(void)
{
election_timeout_stop(fsa_election);
}
/* A_ELECTION_VOTE */
void
do_election_vote(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
gboolean not_voting = FALSE;
/* don't vote if we're in one of these states or wanting to shut down */
switch (cur_state) {
case S_STARTING:
case S_RECOVERY:
case S_STOPPING:
case S_TERMINATE:
crm_warn("Not voting in election, we're in state %s", fsa_state2string(cur_state));
not_voting = TRUE;
break;
case S_ELECTION:
case S_INTEGRATION:
case S_RELEASE_DC:
break;
default:
crm_err("Broken? Voting in state %s", fsa_state2string(cur_state));
break;
}
if (not_voting == FALSE) {
if (pcmk_is_set(controld_globals.fsa_input_register, R_STARTING)) {
not_voting = TRUE;
}
}
if (not_voting) {
if (AM_I_DC) {
register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL);
} else {
register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL);
}
return;
}
election_vote(fsa_election);
return;
}
void
do_election_check(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
if (controld_globals.fsa_state == S_ELECTION) {
election_check(fsa_election);
} else {
crm_debug("Ignoring election check because we are not in an election");
}
}
/* A_ELECTION_COUNT */
void
do_election_count_vote(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
enum election_result rc = 0;
ha_msg_input_t *vote = fsa_typed_data(fsa_dt_ha_msg);
if (pcmk__peer_cache == NULL) {
if (!pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
crm_err("Internal error, no peer cache");
}
return;
}
rc = election_count_vote(fsa_election, vote->msg, cur_state != S_STARTING);
switch(rc) {
case election_start:
election_reset(fsa_election);
register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
break;
case election_lost:
update_dc(NULL);
if (pcmk_is_set(controld_globals.fsa_input_register, R_THE_DC)) {
cib_t *cib_conn = controld_globals.cib_conn;
register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL);
cib_conn->cmds->set_secondary(cib_conn, cib_none);
} else if (cur_state != S_STARTING) {
register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL);
}
break;
default:
crm_trace("Election message resulted in state %d", rc);
}
}
static void
feature_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL;
crm_notice("Feature update failed: %s " QB_XS " rc=%d",
pcmk_strerror(rc), rc);
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
}
/*!
* \internal
* \brief Update a node attribute in the CIB during a DC takeover
*
* \param[in] name Name of attribute to update
* \param[in] value New attribute value
*/
#define dc_takeover_update_attr(name, value) do { \
cib__update_node_attr(controld_globals.logger_out, \
controld_globals.cib_conn, cib_none, \
PCMK_XE_CRM_CONFIG, NULL, NULL, NULL, NULL, \
name, value, NULL, NULL); \
} while (0)
/* A_DC_TAKEOVER */
void
do_dc_takeover(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
xmlNode *cib = NULL;
const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer();
const char *cluster_layer_s = pcmk_cluster_layer_text(cluster_layer);
pid_t watchdog = pcmk__locate_sbd();
crm_info("Taking over DC status for this partition");
controld_set_fsa_input_flags(R_THE_DC);
execute_stonith_cleanup();
election_reset(fsa_election);
controld_set_fsa_input_flags(R_JOIN_OK|R_INVOKE_PE);
controld_globals.cib_conn->cmds->set_primary(controld_globals.cib_conn,
cib_none);
cib = pcmk__xe_create(NULL, PCMK_XE_CIB);
crm_xml_add(cib, PCMK_XA_CRM_FEATURE_SET, CRM_FEATURE_SET);
controld_update_cib(PCMK_XE_CIB, cib, cib_none, feature_update_callback);
dc_takeover_update_attr(PCMK_OPT_HAVE_WATCHDOG, pcmk__btoa(watchdog));
dc_takeover_update_attr(PCMK_OPT_DC_VERSION,
PACEMAKER_VERSION "-" BUILD_VERSION);
dc_takeover_update_attr(PCMK_OPT_CLUSTER_INFRASTRUCTURE, cluster_layer_s);
#if SUPPORT_COROSYNC
if ((controld_globals.cluster_name == NULL)
&& (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync)) {
char *cluster_name = pcmk__corosync_cluster_name();
if (cluster_name != NULL) {
dc_takeover_update_attr(PCMK_OPT_CLUSTER_NAME, cluster_name);
}
free(cluster_name);
}
#endif
controld_trigger_config();
pcmk__xml_free(cib);
}
/* A_DC_RELEASE */
void
do_dc_release(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
if (action & A_DC_RELEASE) {
crm_debug("Releasing the role of DC");
controld_clear_fsa_input_flags(R_THE_DC);
controld_expect_sched_reply(NULL);
} else if (action & A_DC_RELEASED) {
crm_info("DC role released");
if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
xmlNode *update = NULL;
pcmk__node_status_t *node =
pcmk__get_node(0, controld_globals.our_nodename,
NULL, pcmk__node_search_cluster_member);
pcmk__update_peer_expected(__func__, node, CRMD_JOINSTATE_DOWN);
update = create_node_state_update(node, node_update_expected, NULL,
__func__);
/* Don't need a based response because controld will stop. */
fsa_cib_anon_update_discard_reply(PCMK_XE_STATUS, update);
pcmk__xml_free(update);
}
register_fsa_input(C_FSA_INTERNAL, I_RELEASE_SUCCESS, NULL);
} else {
crm_err("Unknown DC action %s", fsa_action2string(action));
}
crm_trace("Am I still the DC? %s", pcmk__btoa(AM_I_DC));
}
diff --git a/include/crm/cluster/election_internal.h b/include/crm/cluster/election_internal.h
index 431ad4d7c1..b420ccebdf 100644
--- a/include/crm/cluster/election_internal.h
+++ b/include/crm/cluster/election_internal.h
@@ -1,95 +1,94 @@
/*
* Copyright 2009-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#ifndef PCMK__CRM_CLUSTER_ELECTION_INTERNAL__H
#define PCMK__CRM_CLUSTER_ELECTION_INTERNAL__H
#include <stdbool.h> // bool
#include <glib.h> // guint, GSourceFunc
#include <libxml/tree.h> // xmlNode
#include <crm/common/ipc.h> // enum pcmk_ipc_server
#ifdef __cplusplus
extern "C" {
#endif
/**
* \file
* \brief Functions for conducting elections
*
* An election is useful for a daemon that runs on all nodes but needs any one
* instance to perform a special role.
*
* Elections are closely tied to the cluster peer cache. Peers in the cache that
* are active members are eligible to vote. Elections are named for logging
* purposes, but only one election may exist at any time, so typically an
* election would be created at daemon start-up and freed at shutdown.
*
* Pacemaker's election procedure has been heavily adapted from the
* Invitation Algorithm variant of the Garcia-Molina Bully Algorithm:
*
* https://en.wikipedia.org/wiki/Bully_algorithm
*
* Elections are conducted via cluster messages. There are two types of
* messages: a "vote" is a declaration of the voting node's candidacy, and is
* always broadcast; a "no-vote" is a concession by the responding node, and is
* always a reply to the preferred node's vote. (These correspond to "invite"
* and "accept" in the traditional algorithm.)
*
* A vote together with any no-vote replies to it is considered an election
* round. Rounds are numbered with a simple counter unique to each node
* (this would be the group number in the traditional algorithm). Concurrent
* election rounds are possible.
*
* An election round is started when any node broadcasts a vote. When a node
* receives another node's vote, it compares itself against the sending node
* according to certain metrics, and either starts a new round (if it prefers
* itself) or replies to the other node with a no-vote (if it prefers that
* node).
*
* If a node receives no-votes from all other active nodes, it declares itself
* the winner. The library API does not notify other nodes of this; callers
* must implement that if desired.
*/
typedef struct pcmk__election pcmk__election_t;
/*! Possible election states */
enum election_result {
election_start = 0, /*! new election needed */
election_in_progress, /*! election started but not all peers have voted */
election_lost, /*! local node lost most recent election */
election_won, /*! local node won most recent election */
election_error, /*! election message or election object invalid */
};
void election_fini(pcmk__election_t *e);
void election_reset(pcmk__election_t *e);
pcmk__election_t *election_init(enum pcmk_ipc_server, const char *name,
- const char *uname, guint period_ms,
- GSourceFunc cb);
+ const char *uname, GSourceFunc cb);
void election_timeout_set_period(pcmk__election_t *e, guint period_ms);
void election_timeout_stop(pcmk__election_t *e);
void election_vote(pcmk__election_t *e);
bool election_check(pcmk__election_t *e);
void election_remove(pcmk__election_t *e, const char *uname);
enum election_result election_state(const pcmk__election_t *e);
enum election_result election_count_vote(pcmk__election_t *e,
const xmlNode *message, bool can_win);
void election_clear_dampening(pcmk__election_t *e);
#ifdef __cplusplus
}
#endif
#endif // PCMK__CRM_CLUSTER_ELECTION_INTERNAL__H
diff --git a/lib/cluster/election.c b/lib/cluster/election.c
index 540d204220..a4f1206061 100644
--- a/lib/cluster/election.c
+++ b/lib/cluster/election.c
@@ -1,724 +1,729 @@
/*
* Copyright 2004-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <crm/common/xml.h>
#include <crm/common/mainloop.h>
#include <crm/cluster/internal.h>
#include <crm/cluster/election_internal.h>
#include <crm/crm.h>
#define STORM_INTERVAL 2 /* in seconds */
struct pcmk__election {
enum pcmk_ipc_server server; // For message type
enum election_result state;
guint count; // How many times local node has voted
char *name; // Descriptive name for this election
char *uname; // Local node's name
GSourceFunc cb; // Function to call if election is won
GHashTable *voted; // Key = node name, value = how node voted
mainloop_timer_t *timeout; // When to abort if all votes not received
int election_wins; // Track wins, for storm detection
bool wrote_blackbox; // Write a storm blackbox at most once
time_t expires; // When storm detection period ends
time_t last_election_loss; // When dampening period ends
};
static void
election_complete(pcmk__election_t *e)
{
e->state = election_won;
if (e->cb != NULL) {
e->cb(e);
}
election_reset(e);
}
static gboolean
election_timer_cb(gpointer user_data)
{
pcmk__election_t *e = user_data;
crm_info("%s timed out, declaring local node as winner", e->name);
election_complete(e);
return FALSE;
}
/*!
* \brief Get current state of an election
*
* \param[in] e Election object
*
* \return Current state of \e
*/
enum election_result
election_state(const pcmk__election_t *e)
{
return (e == NULL)? election_error : e->state;
}
+/* The local node will be declared the winner if missing votes are not received
+ * within this time. The value is chosen to be the same as the default for the
+ * election-timeout cluster option.
+ */
+#define ELECTION_TIMEOUT_MS 120000
+
/*!
* \brief Create a new election object
*
* Every node that wishes to participate in an election must create an election
* object. Typically, this should be done once, at start-up. A caller should
* only create a single election object.
*
* \param[in] server Server to use for message type in election messages
* \param[in] name Label for election (for logging)
* \param[in] uname Local node's name
- * \param[in] period_ms How long to wait for all peers to vote
* \param[in] cb Function to call if local node wins election
*
* \return Newly allocated election object on success, NULL on error
* \note The caller is responsible for freeing the returned value using
* election_fini().
*/
pcmk__election_t *
election_init(enum pcmk_ipc_server server, const char *name, const char *uname,
- guint period_ms, GSourceFunc cb)
+ GSourceFunc cb)
{
pcmk__election_t *e = NULL;
static guint count = 0;
CRM_ASSERT(uname != NULL);
e = pcmk__assert_alloc(1, sizeof(pcmk__election_t));
e->server = server;
e->uname = pcmk__str_copy(uname);
e->name = name? crm_strdup_printf("election-%s", name)
: crm_strdup_printf("election-%u", count++);
e->cb = cb;
- e->timeout = mainloop_timer_add(e->name, period_ms, FALSE,
+ e->timeout = mainloop_timer_add(e->name, ELECTION_TIMEOUT_MS, FALSE,
election_timer_cb, e);
return e;
}
/*!
* \brief Disregard any previous vote by specified peer
*
* This discards any recorded vote from a specified peer. Election users should
* call this whenever a voting peer becomes inactive.
*
* \param[in,out] e Election object
* \param[in] uname Name of peer to disregard
*/
void
election_remove(pcmk__election_t *e, const char *uname)
{
if ((e != NULL) && (uname != NULL) && (e->voted != NULL)) {
crm_trace("Discarding %s (no-)vote from lost peer %s", e->name, uname);
g_hash_table_remove(e->voted, uname);
}
}
/*!
* \brief Stop election timer and disregard all votes
*
* \param[in,out] e Election object
*/
void
election_reset(pcmk__election_t *e)
{
if (e != NULL) {
crm_trace("Resetting election %s", e->name);
mainloop_timer_stop(e->timeout);
if (e->voted) {
crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
g_hash_table_destroy(e->voted);
e->voted = NULL;
}
}
}
/*!
* \brief Free an election object
*
* Free all memory associated with an election object, stopping its
* election timer (if running).
*
* \param[in,out] e Election object
*/
void
election_fini(pcmk__election_t *e)
{
if (e != NULL) {
election_reset(e);
crm_trace("Destroying %s", e->name);
mainloop_timer_del(e->timeout);
free(e->uname);
free(e->name);
free(e);
}
}
static void
election_timeout_start(pcmk__election_t *e)
{
if (e != NULL) {
mainloop_timer_start(e->timeout);
}
}
/*!
* \brief Stop an election's timer, if running
*
* \param[in,out] e Election object
*/
void
election_timeout_stop(pcmk__election_t *e)
{
if (e != NULL) {
mainloop_timer_stop(e->timeout);
}
}
/*!
* \brief Change an election's timeout (restarting timer if running)
*
* \param[in,out] e Election object
* \param[in] period New timeout
*/
void
election_timeout_set_period(pcmk__election_t *e, guint period)
{
if (e != NULL) {
mainloop_timer_set_period(e->timeout, period);
} else {
crm_err("No election defined");
}
}
static int
get_uptime(struct timeval *output)
{
static time_t expires = 0;
static struct rusage info;
time_t tm_now = time(NULL);
if (expires < tm_now) {
int rc = 0;
info.ru_utime.tv_sec = 0;
info.ru_utime.tv_usec = 0;
rc = getrusage(RUSAGE_SELF, &info);
output->tv_sec = 0;
output->tv_usec = 0;
if (rc < 0) {
crm_perror(LOG_ERR, "Could not calculate the current uptime");
expires = 0;
return -1;
}
crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
(long)info.ru_utime.tv_usec);
}
expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */
output->tv_sec = info.ru_utime.tv_sec;
output->tv_usec = info.ru_utime.tv_usec;
return 1;
}
static int
compare_age(struct timeval your_age)
{
struct timeval our_age;
get_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */
if (our_age.tv_sec > your_age.tv_sec) {
crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
return 1;
} else if (our_age.tv_sec < your_age.tv_sec) {
crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
return -1;
} else if (our_age.tv_usec > your_age.tv_usec) {
crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)",
(long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
return 1;
} else if (our_age.tv_usec < your_age.tv_usec) {
crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)",
(long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
return -1;
}
return 0;
}
/*!
* \brief Start a new election by offering local node's candidacy
*
* Broadcast a "vote" election message containing the local node's ID,
* (incremented) election counter, and uptime, and start the election timer.
*
* \param[in,out] e Election object
*
* \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if
* all active peers do so, or if the election times out, the local node
* wins the election. (If we lose to any peer vote, we will stop the
* timer, so a timeout means we did not lose -- either some peer did not
* vote, or we did not call election_check() in time.)
*/
void
election_vote(pcmk__election_t *e)
{
struct timeval age;
xmlNode *vote = NULL;
pcmk__node_status_t *our_node = NULL;
const char *message_type = NULL;
if (e == NULL) {
crm_trace("Election vote requested, but no election available");
return;
}
our_node = pcmk__get_node(0, e->uname, NULL,
pcmk__node_search_cluster_member);
if (!pcmk__cluster_is_node_active(our_node)) {
crm_trace("Cannot vote in %s yet: local node not connected to cluster",
e->name);
return;
}
election_reset(e);
e->state = election_in_progress;
message_type = pcmk__server_message_type(e->server);
/* @COMPAT We use message_type as the sender and recipient system for
* backward compatibility (see T566).
*/
vote = pcmk__new_request(e->server, message_type, NULL,
message_type, CRM_OP_VOTE, NULL);
e->count++;
crm_xml_add(vote, PCMK__XA_ELECTION_OWNER, our_node->xml_id);
crm_xml_add_int(vote, PCMK__XA_ELECTION_ID, e->count);
// Warning: PCMK__XA_ELECTION_AGE_NANO_SEC value is actually microseconds
get_uptime(&age);
crm_xml_add_timeval(vote, PCMK__XA_ELECTION_AGE_SEC,
PCMK__XA_ELECTION_AGE_NANO_SEC, &age);
pcmk__cluster_send_message(NULL, e->server, vote);
pcmk__xml_free(vote);
crm_debug("Started %s round %d", e->name, e->count);
election_timeout_start(e);
return;
}
/*!
* \brief Check whether local node has won an election
*
* If all known peers have sent no-vote messages, stop the election timer, set
* the election state to won, and call any registered win callback.
*
* \param[in,out] e Election object
*
* \return TRUE if local node has won, FALSE otherwise
* \note If all known peers have sent no-vote messages, but the election owner
* does not call this function, the election will not be won (and the
* callback will not be called) until the election times out.
* \note This should be called when election_count_vote() returns
* \c election_in_progress.
*/
bool
election_check(pcmk__election_t *e)
{
int voted_size = 0;
int num_members = 0;
if (e == NULL) {
crm_trace("Election check requested, but no election available");
return FALSE;
}
if (e->voted == NULL) {
crm_trace("%s check requested, but no votes received yet", e->name);
return FALSE;
}
voted_size = g_hash_table_size(e->voted);
num_members = pcmk__cluster_num_active_nodes();
/* in the case of #voted > #members, it is better to
* wait for the timeout and give the cluster time to
* stabilize
*/
if (voted_size >= num_members) {
/* we won and everyone has voted */
election_timeout_stop(e);
if (voted_size > num_members) {
GHashTableIter gIter;
const pcmk__node_status_t *node = NULL;
char *key = NULL;
crm_warn("Received too many votes in %s", e->name);
g_hash_table_iter_init(&gIter, pcmk__peer_cache);
while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
if (pcmk__cluster_is_node_active(node)) {
crm_warn("* expected vote: %s", node->name);
}
}
g_hash_table_iter_init(&gIter, e->voted);
while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
crm_warn("* actual vote: %s", key);
}
}
crm_info("%s won by local node", e->name);
election_complete(e);
return TRUE;
} else {
crm_debug("%s still waiting on %d of %d votes",
e->name, num_members - voted_size, num_members);
}
return FALSE;
}
#define LOSS_DAMPEN 2 /* in seconds */
struct vote {
const char *op;
const char *from;
const char *version;
const char *election_owner;
int election_id;
struct timeval age;
};
/*!
* \brief Unpack an election message
*
* \param[in] e Election object (for logging only)
* \param[in] message Election message XML
* \param[out] vote Parsed fields from message
*
* \return TRUE if election message and election are valid, FALSE otherwise
* \note The parsed struct's pointer members are valid only for the lifetime of
* the message argument.
*/
static bool
parse_election_message(const pcmk__election_t *e, const xmlNode *message,
struct vote *vote)
{
CRM_CHECK(message && vote, return FALSE);
vote->election_id = -1;
vote->age.tv_sec = -1;
vote->age.tv_usec = -1;
vote->op = crm_element_value(message, PCMK__XA_CRM_TASK);
vote->from = crm_element_value(message, PCMK__XA_SRC);
vote->version = crm_element_value(message, PCMK_XA_VERSION);
vote->election_owner = crm_element_value(message, PCMK__XA_ELECTION_OWNER);
crm_element_value_int(message, PCMK__XA_ELECTION_ID, &(vote->election_id));
if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL)
|| (vote->election_owner == NULL) || (vote->election_id < 0)) {
crm_warn("Invalid %s message from %s in %s ",
(vote->op? vote->op : "election"),
(vote->from? vote->from : "unspecified node"),
(e? e->name : "election"));
return FALSE;
}
// Op-specific validation
if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) {
/* Only vote ops have uptime.
Warning: PCMK__XA_ELECTION_AGE_NANO_SEC value is in microseconds.
*/
crm_element_value_timeval(message, PCMK__XA_ELECTION_AGE_SEC,
PCMK__XA_ELECTION_AGE_NANO_SEC, &(vote->age));
if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) {
crm_warn("Cannot count %s %s from %s because it is missing uptime",
(e? e->name : "election"), vote->op, vote->from);
return FALSE;
}
} else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) {
crm_info("Cannot process %s message from %s because %s is not a known election op",
(e? e->name : "election"), vote->from, vote->op);
return FALSE;
}
// Election validation
if (e == NULL) {
crm_info("Cannot count %s from %s because no election available",
vote->op, vote->from);
return FALSE;
}
/* If the membership cache is NULL, we REALLY shouldn't be voting --
* the question is how we managed to get here.
*/
if (pcmk__peer_cache == NULL) {
crm_info("Cannot count %s %s from %s because no peer information available",
e->name, vote->op, vote->from);
return FALSE;
}
return TRUE;
}
static void
record_vote(pcmk__election_t *e, struct vote *vote)
{
CRM_ASSERT(e && vote && vote->from && vote->op);
if (e->voted == NULL) {
e->voted = pcmk__strkey_table(free, free);
}
pcmk__insert_dup(e->voted, vote->from, vote->op);
}
static void
send_no_vote(pcmk__election_t *e, pcmk__node_status_t *peer, struct vote *vote)
{
const char *message_type = pcmk__server_message_type(e->server);
xmlNode *novote = pcmk__new_request(e->server, message_type,
vote->from, message_type,
CRM_OP_NOVOTE, NULL);
crm_xml_add(novote, PCMK__XA_ELECTION_OWNER, vote->election_owner);
crm_xml_add_int(novote, PCMK__XA_ELECTION_ID, vote->election_id);
pcmk__cluster_send_message(peer, e->server, novote);
pcmk__xml_free(novote);
}
/*!
* \brief Process an election message (vote or no-vote) from a peer
*
* \param[in,out] e Election object
* \param[in] message Election message XML from peer
* \param[in] can_win Whether local node is eligible to win
*
* \return Election state after new vote is considered
* \note If the peer message is a vote, and we prefer the peer to win, this will
* send a no-vote reply to the peer.
* \note The situations "we lost to this vote" from "this is a late no-vote
* after we've already lost" both return election_lost. If a caller needs
* to distinguish them, it should save the current state before calling
* this function, and then compare the result.
*/
enum election_result
election_count_vote(pcmk__election_t *e, const xmlNode *message, bool can_win)
{
int log_level = LOG_INFO;
gboolean done = FALSE;
gboolean we_lose = FALSE;
const char *reason = "unknown";
bool we_are_owner = FALSE;
pcmk__node_status_t *our_node = NULL;
pcmk__node_status_t *your_node = NULL;
time_t tm_now = time(NULL);
struct vote vote;
CRM_CHECK(message != NULL, return election_error);
if (parse_election_message(e, message, &vote) == FALSE) {
return election_error;
}
your_node = pcmk__get_node(0, vote.from, NULL,
pcmk__node_search_cluster_member);
our_node = pcmk__get_node(0, e->uname, NULL,
pcmk__node_search_cluster_member);
we_are_owner = (our_node != NULL)
&& pcmk__str_eq(our_node->xml_id, vote.election_owner,
pcmk__str_none);
if (!can_win) {
reason = "Not eligible";
we_lose = TRUE;
} else if (!pcmk__cluster_is_node_active(our_node)) {
reason = "We are not part of the cluster";
log_level = LOG_ERR;
we_lose = TRUE;
} else if (we_are_owner && (vote.election_id != e->count)) {
log_level = LOG_TRACE;
reason = "Superseded";
done = TRUE;
} else if (!pcmk__cluster_is_node_active(your_node)) {
/* Possibly we cached the message in the FSA queue at a point that it wasn't */
reason = "Peer is not part of our cluster";
log_level = LOG_WARNING;
done = TRUE;
} else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none)
|| pcmk__str_eq(vote.from, e->uname, pcmk__str_none)) {
/* Receiving our own broadcast vote, or a no-vote from peer, is a vote
* for us to win
*/
if (!we_are_owner) {
crm_warn("Cannot count %s round %d %s from %s because we are not election owner (%s)",
e->name, vote.election_id, vote.op, vote.from,
vote.election_owner);
return election_error;
}
if (e->state != election_in_progress) {
// Should only happen if we already lost
crm_debug("Not counting %s round %d %s from %s because no election in progress",
e->name, vote.election_id, vote.op, vote.from);
return e->state;
}
record_vote(e, &vote);
reason = "Recorded";
done = TRUE;
} else {
// A peer vote requires a comparison to determine which node is better
int age_result = compare_age(vote.age);
int version_result = compare_version(vote.version, CRM_FEATURE_SET);
if (version_result < 0) {
reason = "Version";
we_lose = TRUE;
} else if (version_result > 0) {
reason = "Version";
} else if (age_result < 0) {
reason = "Uptime";
we_lose = TRUE;
} else if (age_result > 0) {
reason = "Uptime";
} else if (strcasecmp(e->uname, vote.from) > 0) {
reason = "Host name";
we_lose = TRUE;
} else {
reason = "Host name";
}
}
if (e->expires < tm_now) {
e->election_wins = 0;
e->expires = tm_now + STORM_INTERVAL;
} else if (done == FALSE && we_lose == FALSE) {
int peers = 1 + g_hash_table_size(pcmk__peer_cache);
/* If every node has to vote down every other node, thats N*(N-1) total elections
* Allow some leeway before _really_ complaining
*/
e->election_wins++;
if (e->election_wins > (peers * peers)) {
crm_warn("%s election storm detected: %d wins in %d seconds",
e->name, e->election_wins, STORM_INTERVAL);
e->election_wins = 0;
e->expires = tm_now + STORM_INTERVAL;
if (e->wrote_blackbox == FALSE) {
/* It's questionable whether a black box (from every node in the
* cluster) would be truly helpful in diagnosing an election
* storm. It's also highly doubtful a production environment
* would get multiple election storms from distinct causes, so
* saving one blackbox per process lifetime should be
* sufficient. Alternatives would be to save a timestamp of the
* last blackbox write instead of a boolean, and write a new one
* if some amount of time has passed; or to save a storm count,
* write a blackbox on every Nth occurrence.
*/
crm_write_blackbox(0, NULL);
e->wrote_blackbox = TRUE;
}
}
}
if (done) {
do_crm_log(log_level + 1,
"Processed %s round %d %s (current round %d) from %s (%s)",
e->name, vote.election_id, vote.op, e->count, vote.from,
reason);
return e->state;
} else if (we_lose == FALSE) {
/* We track the time of the last election loss to implement an election
* dampening period, reducing the likelihood of an election storm. If
* this node has lost within the dampening period, don't start a new
* election, even if we win against a peer's vote -- the peer we lost to
* should win again.
*
* @TODO This has a problem case: if an election winner immediately
* leaves the cluster, and a new election is immediately called, all
* nodes could lose, with no new winner elected. The ideal solution
* would be to tie the election structure with the peer caches, which
* would allow us to clear the dampening when the previous winner
* leaves (and would allow other improvements as well).
*/
if ((e->last_election_loss == 0)
|| ((tm_now - e->last_election_loss) > (time_t) LOSS_DAMPEN)) {
do_crm_log(log_level, "%s round %d (owner node ID %s) pass: %s from %s (%s)",
e->name, vote.election_id, vote.election_owner, vote.op,
vote.from, reason);
e->last_election_loss = 0;
election_timeout_stop(e);
/* Start a new election by voting down this, and other, peers */
e->state = election_start;
return e->state;
} else {
char *loss_time = ctime(&e->last_election_loss);
if (loss_time) {
// Show only HH:MM:SS
loss_time += 11;
loss_time[8] = '\0';
}
crm_info("Ignoring %s round %d (owner node ID %s) pass vs %s because we lost less than %ds ago at %s",
e->name, vote.election_id, vote.election_owner, vote.from,
LOSS_DAMPEN, (loss_time? loss_time : "unknown"));
}
}
e->last_election_loss = tm_now;
do_crm_log(log_level, "%s round %d (owner node ID %s) lost: %s from %s (%s)",
e->name, vote.election_id, vote.election_owner, vote.op,
vote.from, reason);
election_reset(e);
send_no_vote(e, your_node, &vote);
e->state = election_lost;
return e->state;
}
/*!
* \brief Reset any election dampening currently in effect
*
* \param[in,out] e Election object to clear
*/
void
election_clear_dampening(pcmk__election_t *e)
{
e->last_election_loss = 0;
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sat, Jan 25, 6:55 AM (1 d, 15 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1296602
Default Alt Text
(44 KB)
Attached To
Mode
rP Pacemaker
Attached
Detach File
Event Timeline
Log In to Comment