Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F4639746
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
29 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/include/crm_internal.h b/include/crm_internal.h
index 19d3c29704..678781112f 100644
--- a/include/crm_internal.h
+++ b/include/crm_internal.h
@@ -1,142 +1,143 @@
/*
* Copyright 2006-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#ifndef CRM_INTERNAL__H
# define CRM_INTERNAL__H
# ifndef PCMK__CONFIG_H
# define PCMK__CONFIG_H
# include <config.h>
# endif
# include <portability.h>
/* Our minimum glib dependency is 2.42. Define that as both the minimum and
* maximum glib APIs that are allowed (i.e. APIs that were already deprecated
* in 2.42, and APIs introduced after 2.42, cannot be used by Pacemaker code).
*/
#define GLIB_VERSION_MIN_REQUIRED GLIB_VERSION_2_42
#define GLIB_VERSION_MAX_ALLOWED GLIB_VERSION_2_42
# include <glib.h>
# include <stdbool.h>
# include <libxml/tree.h>
/* Public API headers can guard including deprecated API headers with this
* symbol, thus preventing internal code (which includes this header) from using
* deprecated APIs, while still allowing external code to use them by default.
*/
#define PCMK_ALLOW_DEPRECATED 0
# include <crm/lrmd.h>
# include <crm/common/logging.h>
# include <crm/common/logging_internal.h>
# include <crm/common/ipc_internal.h>
# include <crm/common/options_internal.h>
# include <crm/common/output_internal.h>
# include <crm/common/xml_internal.h>
# include <crm/common/internal.h>
# include <locale.h>
# include <gettext.h>
#define N_(String) (String)
#ifdef ENABLE_NLS
# define _(String) gettext(String)
#else
# define _(String) (String)
#endif
/*
* XML attribute names used only by internal code
*/
#define PCMK__XA_ATTR_DAMPENING "attr_dampening"
#define PCMK__XA_ATTR_FORCE "attrd_is_force_write"
#define PCMK__XA_ATTR_INTERVAL "attr_clear_interval"
#define PCMK__XA_ATTR_IS_PRIVATE "attr_is_private"
#define PCMK__XA_ATTR_IS_REMOTE "attr_is_remote"
#define PCMK__XA_ATTR_NAME "attr_name"
#define PCMK__XA_ATTR_NODE_ID "attr_host_id"
#define PCMK__XA_ATTR_NODE_NAME "attr_host"
#define PCMK__XA_ATTR_OPERATION "attr_clear_operation"
#define PCMK__XA_ATTR_PATTERN "attr_regex"
#define PCMK__XA_ATTR_RESOURCE "attr_resource"
#define PCMK__XA_ATTR_SECTION "attr_section"
#define PCMK__XA_ATTR_SET "attr_set"
#define PCMK__XA_ATTR_SET_TYPE "attr_set_type"
#define PCMK__XA_ATTR_SYNC_POINT "attr_sync_point"
#define PCMK__XA_ATTR_USER "attr_user"
#define PCMK__XA_ATTR_UUID "attr_key"
#define PCMK__XA_ATTR_VALUE "attr_value"
#define PCMK__XA_ATTR_VERSION "attr_version"
#define PCMK__XA_ATTR_WRITER "attr_writer"
#define PCMK__XA_CONFIG_ERRORS "config-errors"
#define PCMK__XA_CONFIG_WARNINGS "config-warnings"
#define PCMK__XA_CONFIRM "confirm"
#define PCMK__XA_CONN_HOST "connection_host"
#define PCMK__XA_CRMD "crmd"
#define PCMK__XA_CRM_HOST_TO "crm_host_to"
#define PCMK__XA_CRM_SYS_FROM "crm_sys_from"
#define PCMK__XA_CRM_SYS_TO "crm_sys_to"
#define PCMK__XA_CRM_TASK "crm_task"
#define PCMK__XA_CRM_USER "crm_user"
#define PCMK__XA_DC_LEAVING "dc-leaving"
#define PCMK__XA_DIGEST "digest"
+#define PCMK__XA_ELECTION_ID "election-id"
#define PCMK__XA_EXPECTED "expected"
#define PCMK__XA_FILE "file"
#define PCMK__XA_GRAPH_ERRORS "graph-errors"
#define PCMK__XA_GRAPH_WARNINGS "graph-warnings"
#define PCMK__XA_IN_CCM "in_ccm"
#define PCMK__XA_JOIN "join"
#define PCMK__XA_JOIN_ID "join_id"
#define PCMK__XA_LONG_ID "long-id"
#define PCMK__XA_MODE "mode"
#define PCMK__XA_NODE_START_STATE "node_start_state"
#define PCMK__XA_PATH "path"
#define PCMK__XA_REAP "reap"
#define PCMK__XA_SCHEMA "schema"
#define PCMK__XA_SCHEMAS "schemas"
#define PCMK__XA_SRC "src"
#define PCMK__XA_SUBT "subt" // subtype
#define PCMK__XA_T "t" // type
#define PCMK__XA_TASK "task"
#define PCMK__XA_TRANSITION_KEY "transition-key"
#define PCMK__XA_TRANSITION_MAGIC "transition-magic"
#define PCMK__XA_UPTIME "uptime"
/*
* IPC service names that are only used internally
*/
# define PCMK__SERVER_BASED_RO "cib_ro"
# define PCMK__SERVER_BASED_RW "cib_rw"
# define PCMK__SERVER_BASED_SHM "cib_shm"
/*
* IPC commands that can be sent to Pacemaker daemons
*/
#define PCMK__ATTRD_CMD_PEER_REMOVE "peer-remove"
#define PCMK__ATTRD_CMD_UPDATE "update"
#define PCMK__ATTRD_CMD_UPDATE_BOTH "update-both"
#define PCMK__ATTRD_CMD_UPDATE_DELAY "update-delay"
#define PCMK__ATTRD_CMD_QUERY "query"
#define PCMK__ATTRD_CMD_REFRESH "refresh"
#define PCMK__ATTRD_CMD_FLUSH "flush"
#define PCMK__ATTRD_CMD_SYNC "sync"
#define PCMK__ATTRD_CMD_SYNC_RESPONSE "sync-response"
#define PCMK__ATTRD_CMD_CLEAR_FAILURE "clear-failure"
#define PCMK__ATTRD_CMD_CONFIRM "confirm"
#define PCMK__CONTROLD_CMD_NODES "list-nodes"
#endif /* CRM_INTERNAL__H */
diff --git a/lib/cluster/election.c b/lib/cluster/election.c
index 5c6f17d5de..ac58a6d799 100644
--- a/lib/cluster/election.c
+++ b/lib/cluster/election.c
@@ -1,727 +1,727 @@
/*
* Copyright 2004-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <crm/msg_xml.h>
#include <crm/common/xml.h>
#include <crm/common/mainloop.h>
#include <crm/cluster/internal.h>
#include <crm/cluster/election_internal.h>
#include <crm/crm.h>
#define STORM_INTERVAL 2 /* in seconds */
struct election_s {
enum election_result state;
guint count; // How many times local node has voted
char *name; // Descriptive name for this election
char *uname; // Local node's name
GSourceFunc cb; // Function to call if election is won
GHashTable *voted; // Key = node name, value = how node voted
mainloop_timer_t *timeout; // When to abort if all votes not received
int election_wins; // Track wins, for storm detection
bool wrote_blackbox; // Write a storm blackbox at most once
time_t expires; // When storm detection period ends
time_t last_election_loss; // When dampening period ends
};
static void
election_complete(election_t *e)
{
e->state = election_won;
if (e->cb != NULL) {
e->cb(e);
}
election_reset(e);
}
static gboolean
election_timer_cb(gpointer user_data)
{
election_t *e = user_data;
crm_info("%s timed out, declaring local node as winner", e->name);
election_complete(e);
return FALSE;
}
/*!
* \brief Get current state of an election
*
* \param[in] e Election object
*
* \return Current state of \e
*/
enum election_result
election_state(const election_t *e)
{
return (e == NULL)? election_error : e->state;
}
/*!
* \brief Create a new election object
*
* Every node that wishes to participate in an election must create an election
* object. Typically, this should be done once, at start-up. A caller should
* only create a single election object.
*
* \param[in] name Label for election (for logging)
* \param[in] uname Local node's name
* \param[in] period_ms How long to wait for all peers to vote
* \param[in] cb Function to call if local node wins election
*
* \return Newly allocated election object on success, NULL on error
* \note The caller is responsible for freeing the returned value using
* election_fini().
*/
election_t *
election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
{
election_t *e = NULL;
static guint count = 0;
CRM_CHECK(uname != NULL, return NULL);
e = calloc(1, sizeof(election_t));
if (e == NULL) {
crm_perror(LOG_CRIT, "Cannot create election");
return NULL;
}
e->uname = strdup(uname);
if (e->uname == NULL) {
crm_perror(LOG_CRIT, "Cannot create election");
free(e);
return NULL;
}
e->name = name? crm_strdup_printf("election-%s", name)
: crm_strdup_printf("election-%u", count++);
e->cb = cb;
e->timeout = mainloop_timer_add(e->name, period_ms, FALSE,
election_timer_cb, e);
crm_trace("Created %s", e->name);
return e;
}
/*!
* \brief Disregard any previous vote by specified peer
*
* This discards any recorded vote from a specified peer. Election users should
* call this whenever a voting peer becomes inactive.
*
* \param[in,out] e Election object
* \param[in] uname Name of peer to disregard
*/
void
election_remove(election_t *e, const char *uname)
{
if ((e != NULL) && (uname != NULL) && (e->voted != NULL)) {
crm_trace("Discarding %s (no-)vote from lost peer %s", e->name, uname);
g_hash_table_remove(e->voted, uname);
}
}
/*!
* \brief Stop election timer and disregard all votes
*
* \param[in,out] e Election object
*/
void
election_reset(election_t *e)
{
if (e != NULL) {
crm_trace("Resetting election %s", e->name);
mainloop_timer_stop(e->timeout);
if (e->voted) {
crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
g_hash_table_destroy(e->voted);
e->voted = NULL;
}
}
}
/*!
* \brief Free an election object
*
* Free all memory associated with an election object, stopping its
* election timer (if running).
*
* \param[in,out] e Election object
*/
void
election_fini(election_t *e)
{
if (e != NULL) {
election_reset(e);
crm_trace("Destroying %s", e->name);
mainloop_timer_del(e->timeout);
free(e->uname);
free(e->name);
free(e);
}
}
static void
election_timeout_start(election_t *e)
{
if (e != NULL) {
mainloop_timer_start(e->timeout);
}
}
/*!
* \brief Stop an election's timer, if running
*
* \param[in,out] e Election object
*/
void
election_timeout_stop(election_t *e)
{
if (e != NULL) {
mainloop_timer_stop(e->timeout);
}
}
/*!
* \brief Change an election's timeout (restarting timer if running)
*
* \param[in,out] e Election object
* \param[in] period New timeout
*/
void
election_timeout_set_period(election_t *e, guint period)
{
if (e != NULL) {
mainloop_timer_set_period(e->timeout, period);
} else {
crm_err("No election defined");
}
}
static int
get_uptime(struct timeval *output)
{
static time_t expires = 0;
static struct rusage info;
time_t tm_now = time(NULL);
if (expires < tm_now) {
int rc = 0;
info.ru_utime.tv_sec = 0;
info.ru_utime.tv_usec = 0;
rc = getrusage(RUSAGE_SELF, &info);
output->tv_sec = 0;
output->tv_usec = 0;
if (rc < 0) {
crm_perror(LOG_ERR, "Could not calculate the current uptime");
expires = 0;
return -1;
}
crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
(long)info.ru_utime.tv_usec);
}
expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */
output->tv_sec = info.ru_utime.tv_sec;
output->tv_usec = info.ru_utime.tv_usec;
return 1;
}
static int
compare_age(struct timeval your_age)
{
struct timeval our_age;
get_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */
if (our_age.tv_sec > your_age.tv_sec) {
crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
return 1;
} else if (our_age.tv_sec < your_age.tv_sec) {
crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
return -1;
} else if (our_age.tv_usec > your_age.tv_usec) {
crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)",
(long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
return 1;
} else if (our_age.tv_usec < your_age.tv_usec) {
crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)",
(long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
return -1;
}
return 0;
}
/*!
* \brief Start a new election by offering local node's candidacy
*
* Broadcast a "vote" election message containing the local node's ID,
* (incremented) election counter, and uptime, and start the election timer.
*
* \param[in,out] e Election object
*
* \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if
* all active peers do so, or if the election times out, the local node
* wins the election. (If we lose to any peer vote, we will stop the
* timer, so a timeout means we did not lose -- either some peer did not
* vote, or we did not call election_check() in time.)
*/
void
election_vote(election_t *e)
{
struct timeval age;
xmlNode *vote = NULL;
crm_node_t *our_node;
if (e == NULL) {
crm_trace("Election vote requested, but no election available");
return;
}
our_node = pcmk__get_node(0, e->uname, NULL, pcmk__node_search_cluster);
if ((our_node == NULL) || (crm_is_peer_active(our_node) == FALSE)) {
crm_trace("Cannot vote in %s yet: local node not connected to cluster",
e->name);
return;
}
election_reset(e);
e->state = election_in_progress;
vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
e->count++;
crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid);
- crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count);
+ crm_xml_add_int(vote, PCMK__XA_ELECTION_ID, e->count);
get_uptime(&age);
crm_xml_add_timeval(vote, F_CRM_ELECTION_AGE_S, F_CRM_ELECTION_AGE_US, &age);
send_cluster_message(NULL, crm_msg_crmd, vote, TRUE);
free_xml(vote);
crm_debug("Started %s round %d", e->name, e->count);
election_timeout_start(e);
return;
}
/*!
* \brief Check whether local node has won an election
*
* If all known peers have sent no-vote messages, stop the election timer, set
* the election state to won, and call any registered win callback.
*
* \param[in,out] e Election object
*
* \return TRUE if local node has won, FALSE otherwise
* \note If all known peers have sent no-vote messages, but the election owner
* does not call this function, the election will not be won (and the
* callback will not be called) until the election times out.
* \note This should be called when election_count_vote() returns
* \c election_in_progress.
*/
bool
election_check(election_t *e)
{
int voted_size = 0;
int num_members = 0;
if (e == NULL) {
crm_trace("Election check requested, but no election available");
return FALSE;
}
if (e->voted == NULL) {
crm_trace("%s check requested, but no votes received yet", e->name);
return FALSE;
}
voted_size = g_hash_table_size(e->voted);
num_members = crm_active_peers();
/* in the case of #voted > #members, it is better to
* wait for the timeout and give the cluster time to
* stabilize
*/
if (voted_size >= num_members) {
/* we won and everyone has voted */
election_timeout_stop(e);
if (voted_size > num_members) {
GHashTableIter gIter;
const crm_node_t *node;
char *key = NULL;
crm_warn("Received too many votes in %s", e->name);
g_hash_table_iter_init(&gIter, crm_peer_cache);
while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
if (crm_is_peer_active(node)) {
crm_warn("* expected vote: %s", node->uname);
}
}
g_hash_table_iter_init(&gIter, e->voted);
while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
crm_warn("* actual vote: %s", key);
}
}
crm_info("%s won by local node", e->name);
election_complete(e);
return TRUE;
} else {
crm_debug("%s still waiting on %d of %d votes",
e->name, num_members - voted_size, num_members);
}
return FALSE;
}
#define LOSS_DAMPEN 2 /* in seconds */
struct vote {
const char *op;
const char *from;
const char *version;
const char *election_owner;
int election_id;
struct timeval age;
};
/*!
* \brief Unpack an election message
*
* \param[in] e Election object (for logging only)
* \param[in] message Election message XML
* \param[out] vote Parsed fields from message
*
* \return TRUE if election message and election are valid, FALSE otherwise
* \note The parsed struct's pointer members are valid only for the lifetime of
* the message argument.
*/
static bool
parse_election_message(const election_t *e, const xmlNode *message,
struct vote *vote)
{
CRM_CHECK(message && vote, return FALSE);
vote->election_id = -1;
vote->age.tv_sec = -1;
vote->age.tv_usec = -1;
vote->op = crm_element_value(message, PCMK__XA_CRM_TASK);
vote->from = crm_element_value(message, PCMK__XA_SRC);
vote->version = crm_element_value(message, PCMK_XA_VERSION);
vote->election_owner = crm_element_value(message, F_CRM_ELECTION_OWNER);
- crm_element_value_int(message, F_CRM_ELECTION_ID, &(vote->election_id));
+ crm_element_value_int(message, PCMK__XA_ELECTION_ID, &(vote->election_id));
if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL)
|| (vote->election_owner == NULL) || (vote->election_id < 0)) {
crm_warn("Invalid %s message from %s in %s ",
(vote->op? vote->op : "election"),
(vote->from? vote->from : "unspecified node"),
(e? e->name : "election"));
return FALSE;
}
// Op-specific validation
if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) {
// Only vote ops have uptime
crm_element_value_timeval(message, F_CRM_ELECTION_AGE_S,
F_CRM_ELECTION_AGE_US, &(vote->age));
if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) {
crm_warn("Cannot count %s %s from %s because it is missing uptime",
(e? e->name : "election"), vote->op, vote->from);
return FALSE;
}
} else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) {
crm_info("Cannot process %s message from %s because %s is not a known election op",
(e? e->name : "election"), vote->from, vote->op);
return FALSE;
}
// Election validation
if (e == NULL) {
crm_info("Cannot count %s from %s because no election available",
vote->op, vote->from);
return FALSE;
}
/* If the membership cache is NULL, we REALLY shouldn't be voting --
* the question is how we managed to get here.
*/
if (crm_peer_cache == NULL) {
crm_info("Cannot count %s %s from %s because no peer information available",
e->name, vote->op, vote->from);
return FALSE;
}
return TRUE;
}
static void
record_vote(election_t *e, struct vote *vote)
{
char *voter_copy = NULL;
char *vote_copy = NULL;
CRM_ASSERT(e && vote && vote->from && vote->op);
if (e->voted == NULL) {
e->voted = pcmk__strkey_table(free, free);
}
voter_copy = strdup(vote->from);
vote_copy = strdup(vote->op);
CRM_ASSERT(voter_copy && vote_copy);
g_hash_table_replace(e->voted, voter_copy, vote_copy);
}
static void
send_no_vote(crm_node_t *peer, struct vote *vote)
{
// @TODO probably shouldn't hardcode CRM_SYSTEM_CRMD and crm_msg_crmd
xmlNode *novote = create_request(CRM_OP_NOVOTE, NULL, vote->from,
CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
crm_xml_add(novote, F_CRM_ELECTION_OWNER, vote->election_owner);
- crm_xml_add_int(novote, F_CRM_ELECTION_ID, vote->election_id);
+ crm_xml_add_int(novote, PCMK__XA_ELECTION_ID, vote->election_id);
send_cluster_message(peer, crm_msg_crmd, novote, TRUE);
free_xml(novote);
}
/*!
* \brief Process an election message (vote or no-vote) from a peer
*
* \param[in,out] e Election object
* \param[in] message Election message XML from peer
* \param[in] can_win Whether local node is eligible to win
*
* \return Election state after new vote is considered
* \note If the peer message is a vote, and we prefer the peer to win, this will
* send a no-vote reply to the peer.
* \note The situations "we lost to this vote" from "this is a late no-vote
* after we've already lost" both return election_lost. If a caller needs
* to distinguish them, it should save the current state before calling
* this function, and then compare the result.
*/
enum election_result
election_count_vote(election_t *e, const xmlNode *message, bool can_win)
{
int log_level = LOG_INFO;
gboolean done = FALSE;
gboolean we_lose = FALSE;
const char *reason = "unknown";
bool we_are_owner = FALSE;
crm_node_t *our_node = NULL, *your_node = NULL;
time_t tm_now = time(NULL);
struct vote vote;
CRM_CHECK(message != NULL, return election_error);
if (parse_election_message(e, message, &vote) == FALSE) {
return election_error;
}
your_node = pcmk__get_node(0, vote.from, NULL, pcmk__node_search_cluster);
our_node = pcmk__get_node(0, e->uname, NULL, pcmk__node_search_cluster);
we_are_owner = (our_node != NULL)
&& pcmk__str_eq(our_node->uuid, vote.election_owner,
pcmk__str_none);
if (!can_win) {
reason = "Not eligible";
we_lose = TRUE;
} else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
reason = "We are not part of the cluster";
log_level = LOG_ERR;
we_lose = TRUE;
} else if (we_are_owner && (vote.election_id != e->count)) {
log_level = LOG_TRACE;
reason = "Superseded";
done = TRUE;
} else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) {
/* Possibly we cached the message in the FSA queue at a point that it wasn't */
reason = "Peer is not part of our cluster";
log_level = LOG_WARNING;
done = TRUE;
} else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none)
|| pcmk__str_eq(vote.from, e->uname, pcmk__str_none)) {
/* Receiving our own broadcast vote, or a no-vote from peer, is a vote
* for us to win
*/
if (!we_are_owner) {
crm_warn("Cannot count %s round %d %s from %s because we are not election owner (%s)",
e->name, vote.election_id, vote.op, vote.from,
vote.election_owner);
return election_error;
}
if (e->state != election_in_progress) {
// Should only happen if we already lost
crm_debug("Not counting %s round %d %s from %s because no election in progress",
e->name, vote.election_id, vote.op, vote.from);
return e->state;
}
record_vote(e, &vote);
reason = "Recorded";
done = TRUE;
} else {
// A peer vote requires a comparison to determine which node is better
int age_result = compare_age(vote.age);
int version_result = compare_version(vote.version, CRM_FEATURE_SET);
if (version_result < 0) {
reason = "Version";
we_lose = TRUE;
} else if (version_result > 0) {
reason = "Version";
} else if (age_result < 0) {
reason = "Uptime";
we_lose = TRUE;
} else if (age_result > 0) {
reason = "Uptime";
} else if (strcasecmp(e->uname, vote.from) > 0) {
reason = "Host name";
we_lose = TRUE;
} else {
reason = "Host name";
}
}
if (e->expires < tm_now) {
e->election_wins = 0;
e->expires = tm_now + STORM_INTERVAL;
} else if (done == FALSE && we_lose == FALSE) {
int peers = 1 + g_hash_table_size(crm_peer_cache);
/* If every node has to vote down every other node, thats N*(N-1) total elections
* Allow some leeway before _really_ complaining
*/
e->election_wins++;
if (e->election_wins > (peers * peers)) {
crm_warn("%s election storm detected: %d wins in %d seconds",
e->name, e->election_wins, STORM_INTERVAL);
e->election_wins = 0;
e->expires = tm_now + STORM_INTERVAL;
if (e->wrote_blackbox == FALSE) {
/* It's questionable whether a black box (from every node in the
* cluster) would be truly helpful in diagnosing an election
* storm. It's also highly doubtful a production environment
* would get multiple election storms from distinct causes, so
* saving one blackbox per process lifetime should be
* sufficient. Alternatives would be to save a timestamp of the
* last blackbox write instead of a boolean, and write a new one
* if some amount of time has passed; or to save a storm count,
* write a blackbox on every Nth occurrence.
*/
crm_write_blackbox(0, NULL);
e->wrote_blackbox = TRUE;
}
}
}
if (done) {
do_crm_log(log_level + 1,
"Processed %s round %d %s (current round %d) from %s (%s)",
e->name, vote.election_id, vote.op, e->count, vote.from,
reason);
return e->state;
} else if (we_lose == FALSE) {
/* We track the time of the last election loss to implement an election
* dampening period, reducing the likelihood of an election storm. If
* this node has lost within the dampening period, don't start a new
* election, even if we win against a peer's vote -- the peer we lost to
* should win again.
*
* @TODO This has a problem case: if an election winner immediately
* leaves the cluster, and a new election is immediately called, all
* nodes could lose, with no new winner elected. The ideal solution
* would be to tie the election structure with the peer caches, which
* would allow us to clear the dampening when the previous winner
* leaves (and would allow other improvements as well).
*/
if ((e->last_election_loss == 0)
|| ((tm_now - e->last_election_loss) > (time_t) LOSS_DAMPEN)) {
do_crm_log(log_level, "%s round %d (owner node ID %s) pass: %s from %s (%s)",
e->name, vote.election_id, vote.election_owner, vote.op,
vote.from, reason);
e->last_election_loss = 0;
election_timeout_stop(e);
/* Start a new election by voting down this, and other, peers */
e->state = election_start;
return e->state;
} else {
char *loss_time = ctime(&e->last_election_loss);
if (loss_time) {
// Show only HH:MM:SS
loss_time += 11;
loss_time[8] = '\0';
}
crm_info("Ignoring %s round %d (owner node ID %s) pass vs %s because we lost less than %ds ago at %s",
e->name, vote.election_id, vote.election_owner, vote.from,
LOSS_DAMPEN, (loss_time? loss_time : "unknown"));
}
}
e->last_election_loss = tm_now;
do_crm_log(log_level, "%s round %d (owner node ID %s) lost: %s from %s (%s)",
e->name, vote.election_id, vote.election_owner, vote.op,
vote.from, reason);
election_reset(e);
send_no_vote(your_node, &vote);
e->state = election_lost;
return e->state;
}
/*!
* \brief Reset any election dampening currently in effect
*
* \param[in,out] e Election object to clear
*/
void
election_clear_dampening(election_t *e)
{
e->last_election_loss = 0;
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Thu, Jul 10, 3:22 AM (12 h, 25 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2009919
Default Alt Text
(29 KB)
Attached To
Mode
rP Pacemaker
Attached
Detach File
Event Timeline
Log In to Comment