Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F4624117
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
20 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/lib/cluster/election.c b/lib/cluster/election.c
index 34059b225f..7556f38198 100644
--- a/lib/cluster/election.c
+++ b/lib/cluster/election.c
@@ -1,604 +1,640 @@
/*
* Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <crm/msg_xml.h>
#include <crm/common/xml.h>
#include <crm/common/mainloop.h>
#include <crm/cluster/internal.h>
#include <crm/cluster/election.h>
#include <crm/crm.h>
#define STORM_INTERVAL 2 /* in seconds */
struct election_s {
enum election_result state;
guint count; // How many times local node has voted
char *name; // Descriptive name for this election
char *uname; // Local node's name
GSourceFunc cb; // Function to call if election is won
GHashTable *voted; // Key = node name, value = how node voted
mainloop_timer_t *timeout; // When to abort if all votes not received
};
static void election_complete(election_t *e)
{
e->state = election_won;
if(e->cb) {
e->cb(e);
}
election_reset(e);
}
static gboolean election_timer_cb(gpointer user_data)
{
election_t *e = user_data;
crm_info("%s timed out, declaring local node as winner", e->name);
election_complete(e);
return FALSE;
}
enum election_result
election_state(election_t *e)
{
if(e) {
return e->state;
}
return election_error;
}
/*!
* \brief Create a new election object
*
* Every node that wishes to participate in an election must create an election
* object. Typically, this should be done once, at start-up. A caller should
* only create a single election object.
*
* \param[in] name Label for election (for logging)
* \param[in] uname Local node's name
* \param[in] period_ms How long to wait for all peers to vote
* \param[in] cb Function to call if local node wins election
*
* \return Newly allocated election object on success, NULL on error
* \note The caller is responsible for freeing the returned value using
* election_fini().
*/
election_t *
election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
{
election_t *e = NULL;
static guint count = 0;
CRM_CHECK(uname != NULL, return NULL);
e = calloc(1, sizeof(election_t));
if (e == NULL) {
crm_perror(LOG_CRIT, "Cannot create election");
return NULL;
}
e->uname = strdup(uname);
if (e->uname == NULL) {
crm_perror(LOG_CRIT, "Cannot create election");
free(e);
return NULL;
}
e->name = name? crm_strdup_printf("election-%s", name)
: crm_strdup_printf("election-%u", count++);
e->cb = cb;
e->timeout = mainloop_timer_add(e->name, period_ms, FALSE,
election_timer_cb, e);
crm_trace("Created %s", e->name);
return e;
}
/*!
* \brief Disregard any previous vote by specified peer
*
* This discards any recorded vote from a specified peer. Election users should
* call this whenever a voting peer becomes inactive.
*
* \param[in] e Election object
* \param[in] uname Name of peer to disregard
*/
void
election_remove(election_t *e, const char *uname)
{
if(e && uname && e->voted) {
crm_trace("Discarding %s (no-)vote from lost peer %s", e->name, uname);
g_hash_table_remove(e->voted, uname);
}
}
/*!
* \brief Stop election timer and disregard all votes
*
* \param[in] e Election object
*/
void
election_reset(election_t *e)
{
if (e != NULL) {
crm_trace("Resetting election %s", e->name);
mainloop_timer_stop(e->timeout);
if (e->voted) {
crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
g_hash_table_destroy(e->voted);
e->voted = NULL;
}
}
}
/*!
* \brief Free an election object
*
* Free all memory associated with an election object, stopping its
* election timer (if running).
*
* \param[in] e Election object
*/
void
election_fini(election_t *e)
{
if(e) {
election_reset(e);
crm_trace("Destroying %s", e->name);
mainloop_timer_del(e->timeout);
free(e->uname);
free(e->name);
free(e);
}
}
static void
election_timeout_start(election_t *e)
{
if(e) {
mainloop_timer_start(e->timeout);
}
}
/*!
* \brief Stop an election's timer, if running
*
* \param[in] e Election object
*/
void
election_timeout_stop(election_t *e)
{
if(e) {
mainloop_timer_stop(e->timeout);
}
}
/*!
* \brief Change an election's timeout (restarting timer if running)
*
* \param[in] e Election object
* \param[in] period New timeout
*/
void
election_timeout_set_period(election_t *e, guint period)
{
if(e) {
mainloop_timer_set_period(e->timeout, period);
} else {
crm_err("No election defined");
}
}
static int
crm_uptime(struct timeval *output)
{
static time_t expires = 0;
static struct rusage info;
time_t tm_now = time(NULL);
if (expires < tm_now) {
int rc = 0;
info.ru_utime.tv_sec = 0;
info.ru_utime.tv_usec = 0;
rc = getrusage(RUSAGE_SELF, &info);
output->tv_sec = 0;
output->tv_usec = 0;
if (rc < 0) {
crm_perror(LOG_ERR, "Could not calculate the current uptime");
expires = 0;
return -1;
}
crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
(long)info.ru_utime.tv_usec);
}
expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */
output->tv_sec = info.ru_utime.tv_sec;
output->tv_usec = info.ru_utime.tv_usec;
return 1;
}
static int
crm_compare_age(struct timeval your_age)
{
struct timeval our_age;
crm_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */
if (our_age.tv_sec > your_age.tv_sec) {
crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
return 1;
} else if (our_age.tv_sec < your_age.tv_sec) {
crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
return -1;
} else if (our_age.tv_usec > your_age.tv_usec) {
crm_debug("Win: %ld.%ld vs %ld.%ld (usec)",
(long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
return 1;
} else if (our_age.tv_usec < your_age.tv_usec) {
crm_debug("Lose: %ld.%ld vs %ld.%ld (usec)",
(long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
return -1;
}
return 0;
}
/*!
* \brief Start a new election by offering local node's candidacy
*
* Broadcast a "vote" election message containing the local node's ID,
* (incremented) election counter, and uptime, and start the election timer.
*
* \param[in] e Election object
* \note Any nodes agreeing to the candidacy will send a "no-vote" reply, and if
* all active peers do so, or if the election times out, the local node
* wins the election. (If we lose to any peer vote, we will stop the
* timer, so a timeout means we did not lose -- either some peer did not
* vote, or we did not call election_check() in time.)
*/
void
election_vote(election_t *e)
{
struct timeval age;
xmlNode *vote = NULL;
crm_node_t *our_node;
if (e == NULL) {
crm_trace("Election vote requested, but no election available");
return;
}
our_node = crm_get_peer(0, e->uname);
if ((our_node == NULL) || (crm_is_peer_active(our_node) == FALSE)) {
crm_trace("Cannot vote in %s yet: local node not connected to cluster",
e->name);
return;
}
e->state = election_in_progress;
vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
e->count++;
crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid);
crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count);
crm_uptime(&age);
crm_xml_add_int(vote, F_CRM_ELECTION_AGE_S, age.tv_sec);
crm_xml_add_int(vote, F_CRM_ELECTION_AGE_US, age.tv_usec);
send_cluster_message(NULL, crm_msg_crmd, vote, TRUE);
free_xml(vote);
crm_debug("Started %s round %d", e->name, e->count);
if (e->voted) {
g_hash_table_destroy(e->voted);
e->voted = NULL;
}
election_timeout_start(e);
return;
}
/*!
* \brief Check whether local node has won an election
*
* If all known peers have sent no-vote messages, stop the election timer, set
* the election state to won, and call any registered win callback.
*
* \param[in] e Election object
*
* \return TRUE if local node has won, FALSE otherwise
* \note If all known peers have sent no-vote messages, but the election owner
* does not call this function, the election will not be won (and the
* callback will not be called) until the election times out.
* \note This should be called when election_count_vote() returns
* \c election_in_progress.
*/
bool
election_check(election_t *e)
{
int voted_size = 0;
int num_members = crm_active_peers();
if(e == NULL) {
crm_trace("Election check requested, but no election available");
return FALSE;
}
if (e->voted) {
voted_size = g_hash_table_size(e->voted);
}
/* in the case of #voted > #members, it is better to
* wait for the timeout and give the cluster time to
* stabilize
*/
if (voted_size >= num_members) {
/* we won and everyone has voted */
election_timeout_stop(e);
if (voted_size > num_members) {
GHashTableIter gIter;
const crm_node_t *node;
char *key = NULL;
crm_warn("Received too many votes in %s", e->name);
g_hash_table_iter_init(&gIter, crm_peer_cache);
while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
if (crm_is_peer_active(node)) {
crm_warn("* expected vote: %s", node->uname);
}
}
g_hash_table_iter_init(&gIter, e->voted);
while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
crm_warn("* actual vote: %s", key);
}
}
crm_info("%s won by local node", e->name);
election_complete(e);
return TRUE;
} else {
crm_debug("%s still waiting on %d of %d votes",
e->name, num_members - voted_size, num_members);
}
return FALSE;
}
#define LOSS_DAMPEN 2 /* in seconds */
/*!
* \brief Process an election message (vote or no-vote) from a peer
*
* \param[in] e Election object
* \param[in] vote Election message XML from peer
* \param[in] can_win Whether to consider the local node eligible for winning
*
* \return Election state after new vote is considered
* \note If the peer message is a vote, and we prefer the peer to win, this will
* send a no-vote reply to the peer.
* \note The situations "we lost to this vote" from "this is a late no-vote
* after we've already lost" both return election_lost. If a caller needs
* to distinguish them, it should save the current state before calling
* this function, and then compare the result.
*/
enum election_result
election_count_vote(election_t *e, xmlNode *vote, bool can_win)
{
int age = 0;
int election_id = -1;
int log_level = LOG_INFO;
gboolean done = FALSE;
gboolean we_lose = FALSE;
const char *op = NULL;
- const char *from = NULL;
const char *reason = "unknown";
- const char *election_owner = NULL;
+ const char *from = NULL; // voter's uname
+ const char *election_owner = NULL; // owner's uuid
+ bool we_are_owner = FALSE;
crm_node_t *our_node = NULL, *your_node = NULL;
xmlNode *novote = NULL;
time_t tm_now = time(NULL);
// @TODO these should be in election_t
static int election_wins = 0;
static time_t expires = 0;
static time_t last_election_loss = 0;
CRM_CHECK(vote != NULL, return election_error);
op = crm_element_value(vote, F_CRM_TASK);
from = crm_element_value(vote, F_CRM_HOST_FROM);
election_owner = crm_element_value(vote, F_CRM_ELECTION_OWNER);
crm_element_value_int(vote, F_CRM_ELECTION_ID, &election_id);
+ if ((op == NULL) || (from == NULL) || (election_owner == NULL)
+ || (election_id < 0)) {
+
+ crm_warn("Invalid %s message from %s in %s ",
+ (op? op : "election"),
+ (from? from : "unspecified node"),
+ (e? e->name : "election"));
+ return election_error;
+ }
+
+ /* There are two types of election ops: a vote is a declaration of the
+ * voter's candidacy in a new election; a no-vote is a vote for the
+ * recipient to win an existing election.
+ */
+ if (!crm_str_eq(op, CRM_OP_VOTE, TRUE)
+ && !crm_str_eq(op, CRM_OP_NOVOTE, TRUE)) {
+ crm_info("Cannot process %s message from %s because %s is not a known election op",
+ (e? e->name : "election"), from, op);
+ return election_error;
+ }
+
if (e == NULL) {
crm_info("Cannot count %s from %s because no election available",
op, from);
- return election_lost;
+ return election_error;
}
/* If the membership cache is NULL, we REALLY shouldn't be voting --
* the question is how we managed to get here.
*/
if (crm_peer_cache == NULL) {
crm_info("Cannot count %s %s from %s because no peer information available",
e->name, op, from);
- return election_lost;
+ return election_error;
}
your_node = crm_get_peer(0, from);
our_node = crm_get_peer(0, e->uname);
+ we_are_owner = our_node && crm_str_eq(our_node->uuid, election_owner, TRUE);
if (e->voted == NULL) {
e->voted = crm_str_table_new();
}
if(can_win == FALSE) {
reason = "Not eligible";
we_lose = TRUE;
} else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
reason = "We are not part of the cluster";
log_level = LOG_ERR;
we_lose = TRUE;
- } else if (election_id != e->count && crm_str_eq(our_node->uuid, election_owner, TRUE)) {
+ } else if (we_are_owner && (election_id != e->count)) {
log_level = LOG_TRACE;
reason = "Superseded";
done = TRUE;
} else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) {
/* Possibly we cached the message in the FSA queue at a point that it wasn't */
reason = "Peer is not part of our cluster";
log_level = LOG_WARNING;
done = TRUE;
} else if (crm_str_eq(op, CRM_OP_NOVOTE, TRUE)) {
- char *op_copy = strdup(op);
- char *uname_copy = strdup(from);
+ char *op_copy = NULL;
+ char *uname_copy = NULL;
+
+ if (!we_are_owner) {
+ crm_warn("Cannot count %s %s from %s because we are not election owner (%s)",
+ e->name, op, from, election_owner);
+ return election_error;
+ }
- CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
+ op_copy = strdup(op);
+ uname_copy = strdup(from);
/* update the list of nodes that have voted */
g_hash_table_replace(e->voted, uname_copy, op_copy);
reason = "Recorded";
done = TRUE;
} else {
struct timeval your_age;
const char *your_version = crm_element_value(vote, F_CRM_VERSION);
int tv_sec = 0;
int tv_usec = 0;
crm_element_value_int(vote, F_CRM_ELECTION_AGE_S, &tv_sec);
crm_element_value_int(vote, F_CRM_ELECTION_AGE_US, &tv_usec);
your_age.tv_sec = tv_sec;
your_age.tv_usec = tv_usec;
age = crm_compare_age(your_age);
if (crm_str_eq(from, e->uname, TRUE)) {
- char *op_copy = strdup(op);
- char *uname_copy = strdup(from);
+ char *op_copy = NULL;
+ char *uname_copy = NULL;
- CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
+ if (!we_are_owner) {
+ crm_warn("Cannot count our own %s %s because we are not election owner (%s)",
+ e->name, op, election_owner);
+ return election_error;
+ }
+ op_copy = strdup(op);
+ uname_copy = strdup(from);
/* update ourselves in the list of nodes that have voted */
g_hash_table_replace(e->voted, uname_copy, op_copy);
reason = "Recorded";
done = TRUE;
} else if (compare_version(your_version, CRM_FEATURE_SET) < 0) {
reason = "Version";
we_lose = TRUE;
} else if (compare_version(your_version, CRM_FEATURE_SET) > 0) {
reason = "Version";
} else if (age < 0) {
reason = "Uptime";
we_lose = TRUE;
} else if (age > 0) {
reason = "Uptime";
} else if (strcasecmp(e->uname, from) > 0) {
reason = "Host name";
we_lose = TRUE;
} else {
reason = "Host name";
}
}
if (expires < tm_now) {
election_wins = 0;
expires = tm_now + STORM_INTERVAL;
} else if (done == FALSE && we_lose == FALSE) {
int peers = 1 + g_hash_table_size(crm_peer_cache);
/* If every node has to vote down every other node, thats N*(N-1) total elections
* Allow some leeway before _really_ complaining
*/
election_wins++;
if (election_wins > (peers * peers)) {
crm_warn("%s election storm detected: %d wins in %d seconds",
e->name, election_wins, STORM_INTERVAL);
election_wins = 0;
expires = tm_now + STORM_INTERVAL;
crm_write_blackbox(0, NULL);
}
}
if (done) {
do_crm_log(log_level + 1,
"Processed %s round %d %s (current round %d) from %s (%s)",
e->name, election_id, op, e->count, from, reason);
return e->state;
} else if (we_lose == FALSE) {
if (last_election_loss == 0
|| tm_now - last_election_loss > (time_t) LOSS_DAMPEN) {
do_crm_log(log_level, "%s round %d (owner node ID %s) pass: %s from %s (%s)",
e->name, election_id, election_owner, op, from, reason);
last_election_loss = 0;
election_timeout_stop(e);
/* Start a new election by voting down this, and other, peers */
e->state = election_start;
return e->state;
} else {
char *loss_time = ctime(&last_election_loss);
if (loss_time) {
// Show only HH:MM:SS
loss_time += 11;
loss_time[8] = '\0';
}
crm_info("Ignoring %s round %d (owner node ID %s) pass vs %s because we lost less than %ds ago at %s",
e->name, election_id, election_owner, from, LOSS_DAMPEN,
loss_time? loss_time : "unknown");
}
}
novote = create_request(CRM_OP_NOVOTE, NULL, from,
CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
do_crm_log(log_level, "%s round %d (owner node ID %s) lost: %s from %s (%s)",
e->name, election_id, election_owner, op, from, reason);
election_timeout_stop(e);
crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner);
crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id);
send_cluster_message(your_node, crm_msg_crmd, novote, TRUE);
free_xml(novote);
last_election_loss = tm_now;
e->state = election_lost;
return e->state;
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Tue, Jul 8, 5:57 PM (1 d, 2 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2002412
Default Alt Text
(20 KB)
Attached To
Mode
rP Pacemaker
Attached
Detach File
Event Timeline
Log In to Comment