Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F4639351
election.c
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
18 KB
Referenced Files
None
Subscribers
None
election.c
View Options
/*
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <crm_internal.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <crm/msg_xml.h>
#include <crm/common/xml.h>
#include <crm/cluster/internal.h>
#include <crm/crm.h>
#include <crmd_fsa.h>
#include <crmd_messages.h>
#include <crmd_callbacks.h>
#include <tengine.h>
/* Length of the window used both for the getrusage() cache in crm_uptime()
 * and for the election-storm counter in do_election_count_vote() */
#define STORM_INTERVAL 2 /* in seconds */
/* Not referenced in the visible part of this file */
#define STORM_MULTIPLIER 5 /* multiplied by the number of nodes */
/* Tally of recorded votes: node name -> vote operation (both heap copies).
 * Created on demand by do_election_count_vote(); NULL when no tally exists */
GHashTable *voted = NULL;
/* Not referenced in the visible part of this file.
 * NOTE(review): -1 is converted to the unsigned type on initialization */
uint highest_born_on = -1;
/* Incremented by do_election_vote() before each vote is sent */
static int current_election_id = 1;
/*
 * Report this process's user CPU time (ru_utime from getrusage()).
 *
 * The getrusage() sample is cached in static storage. It is refreshed only
 * when the cache has expired, and every successful call pushes the expiry
 * to STORM_INTERVAL seconds from now.
 *
 * output: receives the sampled time; zeroed when sampling fails.
 *
 * Returns 1 on success, -1 if getrusage() failed.
 */
static int
crm_uptime(struct timeval *output)
{
    static time_t cache_valid_until = 0;
    static struct rusage cached_usage;
    const time_t now = time(NULL);

    if (cache_valid_until < now) {
        /* Cache is stale: take a fresh sample */
        if (getrusage(RUSAGE_SELF, &cached_usage) < 0) {
            output->tv_sec = 0;
            output->tv_usec = 0;
            crm_perror(LOG_ERR, "Could not calculate the current uptime");
            cache_valid_until = 0;  /* force a retry on the next call */
            return -1;
        }

        crm_debug("Current CPU usage is: %lds, %ldus", (long)cached_usage.ru_utime.tv_sec,
                  (long)cached_usage.ru_utime.tv_usec);
    }

    /* The expiry is measured from the last access, not the last sample */
    cache_valid_until = now + STORM_INTERVAL;

    output->tv_usec = cached_usage.ru_utime.tv_usec;
    output->tv_sec = cached_usage.ru_utime.tv_sec;
    return 1;
}
/*
 * Compare our own age, as reported by crm_uptime(), with a peer's.
 *
 * your_age: the peer's age.
 *
 * Returns 1 if we are older (we win), -1 if the peer is older or our own
 * age could not be determined, and 0 if both ages are identical.
 */
static int
crm_compare_age(struct timeval your_age)
{
    struct timeval mine;

    if (crm_uptime(&mine) < 0) {
        return -1;
    }

    /* We want these times to be "significantly" different:
     * whole seconds decide first, microseconds only break a tie */
    if (mine.tv_sec != your_age.tv_sec) {
        if (mine.tv_sec > your_age.tv_sec) {
            crm_debug("Win: %ld vs %ld (seconds)", (long)mine.tv_sec, (long)your_age.tv_sec);
            return 1;
        }
        crm_debug("Loose: %ld vs %ld (seconds)", (long)mine.tv_sec, (long)your_age.tv_sec);
        return -1;
    }

    if (mine.tv_usec != your_age.tv_usec) {
        if (mine.tv_usec > your_age.tv_usec) {
            crm_debug("Win: %ld.%ld vs %ld.%ld (usec)",
                      (long)mine.tv_sec, (long)mine.tv_usec,
                      (long)your_age.tv_sec, (long)your_age.tv_usec);
            return 1;
        }
        crm_debug("Loose: %ld.%ld vs %ld.%ld (usec)",
                  (long)mine.tv_sec, (long)mine.tv_usec,
                  (long)your_age.tv_sec, (long)your_age.tv_usec);
        return -1;
    }

    return 0;
}
/* A_ELECTION_VOTE */
/*
 * Send our vote for a new election to the cluster.
 *
 * No vote is sent while in S_STARTING, S_RECOVERY, S_STOPPING or
 * S_TERMINATE, or while R_STARTING is set; in that case I_RELEASE_DC (if we
 * are the DC) or I_PENDING is registered instead.
 *
 * Otherwise the election id is bumped, a CRM_OP_VOTE request carrying our
 * uuid, the election id and our age is sent, any existing vote tally is
 * discarded, and the election timer is started when in S_ELECTION or
 * S_RELEASE_DC.
 */
void
do_election_vote(long long action,
                 enum crmd_fsa_cause cause,
                 enum crmd_fsa_state cur_state,
                 enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    struct timeval uptime;
    xmlNode *ballot = NULL;
    gboolean abstain = FALSE;

    /* don't vote if we're in one of these states or wanting to shut down */
    if (cur_state == S_STARTING || cur_state == S_RECOVERY
        || cur_state == S_STOPPING || cur_state == S_TERMINATE) {
        crm_warn("Not voting in election, we're in state %s", fsa_state2string(cur_state));
        abstain = TRUE;

    } else if (is_set(fsa_input_register, R_STARTING)) {
        abstain = TRUE;
    }

    if (abstain) {
        if (AM_I_DC) {
            register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL);
        } else {
            register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL);
        }
        return;
    }

    ballot = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);

    current_election_id++;
    crm_xml_add(ballot, F_CRM_ELECTION_OWNER, fsa_our_uuid);
    crm_xml_add_int(ballot, F_CRM_ELECTION_ID, current_election_id);

    /* crm_uptime() zeroes its output on failure, so the result is always defined */
    crm_uptime(&uptime);
    crm_xml_add_int(ballot, F_CRM_ELECTION_AGE_S, uptime.tv_sec);
    crm_xml_add_int(ballot, F_CRM_ELECTION_AGE_US, uptime.tv_usec);

    send_cluster_message(NULL, crm_msg_crmd, ballot, TRUE);
    free_xml(ballot);

    crm_debug("Started election %d", current_election_id);

    /* A new election invalidates whatever was tallied so far */
    if (voted != NULL) {
        g_hash_table_destroy(voted);
    }
    voted = NULL;

    switch (cur_state) {
        case S_ELECTION:
        case S_RELEASE_DC:
            crm_timer_start(election_timeout);
            break;
        case S_INTEGRATION:
            break;
        default:
            crm_err("Broken? Voting in state %s", fsa_state2string(cur_state));
            break;
    }
}
/* Not referenced in the visible part of this file */
char *dc_hb_msg = NULL;
/* Not referenced in the visible part of this file */
int beat_num = 0;
/*
 * Does no work: ignores 'data' and always returns TRUE.
 */
gboolean
do_dc_heartbeat(gpointer data)
{
return TRUE;
}
struct election_data_s {
const char *winning_uname;
unsigned int winning_bornon;
};
/*
 * Check whether enough votes have been recorded to declare ourselves the
 * winner of the current election.
 *
 * Nothing is done unless the FSA is in S_ELECTION. Once the number of
 * recorded votes reaches the number of active peers, the election timer is
 * stopped, I_ELECTION_DC is registered and the vote tally is destroyed.
 * If more votes than active peers were recorded, both lists are logged as
 * errors to aid diagnosis.
 *
 * The tally may not exist at all (voted == NULL); in that case the vote
 * count is 0 and no attempt is made to iterate over or destroy it.
 */
void
do_election_check(long long action,
                  enum crmd_fsa_cause cause,
                  enum crmd_fsa_state cur_state,
                  enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    int voted_size = 0;
    int num_members = crm_active_peers();

    if (voted) {
        voted_size = g_hash_table_size(voted);
    }

    /* in the case of #voted > #members, it is better to
     * wait for the timeout and give the cluster time to
     * stabilize
     */
    if (fsa_state != S_ELECTION) {
        crm_debug("Ignore election check: we not in an election");

    } else if (voted_size >= num_members) {
        /* we won and everyone has voted */
        crm_timer_stop(election_timeout);
        register_fsa_input(C_FSA_INTERNAL, I_ELECTION_DC, NULL);

        if (voted_size > num_members) {
            GHashTableIter gIter;
            const crm_node_t *node;
            char *key = NULL;

            g_hash_table_iter_init(&gIter, crm_peer_cache);
            while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
                if (crm_is_peer_active(node)) {
                    crm_err("member: %s proc=%.32x", node->uname, node->processes);
                }
            }

            if (voted != NULL) {
                g_hash_table_iter_init(&gIter, voted);
                while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
                    crm_err("voted: %s", key);
                }
            }
        }

        /* g_hash_table_destroy() must not be handed a NULL table, which
         * would otherwise happen when no votes were recorded and there
         * are no active peers */
        if (voted != NULL) {
            crm_debug("Destroying voted hash");
            g_hash_table_destroy(voted);
            voted = NULL;
        }

    } else {
        crm_debug("Still waiting on %d non-votes (%d total)",
                  num_members - voted_size, num_members);
    }
}
#define loss_dampen 2 /* in seconds */
/* A_ELECTION_COUNT */
/*
 * Process a vote (CRM_OP_VOTE) or no-vote (CRM_OP_NOVOTE) message taken from
 * msg_data.
 *
 * The message is classified by the first rule that applies:
 *   - we are still starting, or are not an active cluster member: we lose
 *   - it refers to an election of ours that has been superseded: ignored
 *   - the sender is not an active peer: ignored
 *   - it is a no-vote: the sender is recorded in the tally
 *   - it is our own vote: we are recorded in the tally
 *   - otherwise the sender is compared with us by feature-set version, then
 *     age, then (heartbeat/classic AIS clusters only) "born" value, then
 *     host name
 *
 * Outcomes:
 *   - processed ("done"): only logged
 *   - lost: the DC is cleared, the election timer stopped, I_RELEASE_DC or
 *     I_PENDING registered, and a no-vote sent back to the sender
 *   - passed: I_ELECTION is registered and the tally destroyed, unless we
 *     lost an election less than loss_dampen seconds ago
 *
 * Wins within a STORM_INTERVAL window are counted and a warning is logged
 * when they exceed the square of (peer cache size + 1).
 */
void
do_election_count_vote(long long action,
                       enum crmd_fsa_cause cause,
                       enum crmd_fsa_state cur_state,
                       enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    int age = 0;
    int election_id = -1;
    int log_level = LOG_INFO;
    gboolean use_born_on = FALSE;
    gboolean done = FALSE;
    gboolean we_loose = FALSE;
    const char *op = NULL;
    const char *vote_from = NULL;
    const char *election_owner = NULL;
    const char *reason = "unknown";
    crm_node_t *our_node = NULL, *your_node = NULL;
    ha_msg_input_t *vote = fsa_typed_data(fsa_dt_ha_msg);
    static int election_wins = 0;
    time_t tm_now = time(NULL);
    static time_t expires = 0;
    static time_t last_election_loss = 0;

    /* if the membership copy is NULL we REALLY shouldnt be voting
     * the question is how we managed to get here.
     */
    CRM_CHECK(msg_data != NULL, return);
    CRM_CHECK(vote != NULL, crm_err("Bogus data from %s", msg_data->origin); return);
    CRM_CHECK(vote->msg != NULL, crm_err("Bogus data from %s", msg_data->origin); return);

    if(crm_peer_cache == NULL) {
        if(is_not_set(fsa_input_register, R_SHUTDOWN)) {
            crm_err("Internal error, no peer cache");
        }
        return;
    }

    op = crm_element_value(vote->msg, F_CRM_TASK);
    vote_from = crm_element_value(vote->msg, F_CRM_HOST_FROM);
    election_owner = crm_element_value(vote->msg, F_CRM_ELECTION_OWNER);
    crm_element_value_int(vote->msg, F_CRM_ELECTION_ID, &election_id);

    /* A message without a sender is treated as coming from ourselves */
    CRM_CHECK(vote_from != NULL, vote_from = fsa_our_uname);

    your_node = crm_get_peer(0, vote_from);
    our_node = crm_get_peer(0, fsa_our_uname);

    if (voted == NULL) {
        crm_debug("Created voted hash");
        voted = g_hash_table_new_full(crm_str_hash, g_str_equal,
                                      g_hash_destroy_str, g_hash_destroy_str);
    }

    if (is_heartbeat_cluster()) {
        use_born_on = TRUE;
    } else if (is_classic_ais_cluster()) {
        use_born_on = TRUE;
    }

    if (cur_state == S_STARTING) {
        reason = "Still starting";
        we_loose = TRUE;

    } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
        reason = "We are not part of the cluster";
        log_level = LOG_ERR;
        we_loose = TRUE;

    } else if (election_id != current_election_id && crm_str_eq(fsa_our_uuid, election_owner, TRUE)) {
        log_level = LOG_DEBUG_2;
        reason = "Superceeded";
        done = TRUE;

    } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) {
        /* Possibly we cached the message in the FSA queue at a point that it wasn't */
        reason = "Peer is not part of our cluster";
        log_level = LOG_WARNING;
        done = TRUE;

    } else if (crm_str_eq(op, CRM_OP_NOVOTE, TRUE)) {
        char *op_copy = strdup(op);
        char *uname_copy = strdup(vote_from);

        CRM_ASSERT(crm_str_eq(fsa_our_uuid, election_owner, TRUE));

        /* update the list of nodes that have voted */
        g_hash_table_replace(voted, uname_copy, op_copy);
        reason = "Recorded";
        done = TRUE;

    } else {
        struct timeval your_age;
        /* Parse into real ints: tv_sec and tv_usec are not int-sized on
         * every platform, so writing to them through an int pointer is
         * not safe (and stores into the wrong half on big-endian LP64) */
        int your_age_s = 0;
        int your_age_us = 0;
        const char *your_version = crm_element_value(vote->msg, F_CRM_VERSION);

        crm_element_value_int(vote->msg, F_CRM_ELECTION_AGE_S, &your_age_s);
        crm_element_value_int(vote->msg, F_CRM_ELECTION_AGE_US, &your_age_us);
        your_age.tv_sec = your_age_s;
        your_age.tv_usec = your_age_us;

        age = crm_compare_age(your_age);
        if(your_age.tv_sec == 0 && your_age.tv_usec == 0) {
            crm_log_xml_trace(vote->msg, "bad vote");
            crm_write_blackbox(0, NULL);
        }

        if (crm_str_eq(vote_from, fsa_our_uname, TRUE)) {
            char *op_copy = strdup(op);
            char *uname_copy = strdup(vote_from);

            CRM_ASSERT(crm_str_eq(fsa_our_uuid, election_owner, TRUE));

            /* update ourselves in the list of nodes that have voted */
            g_hash_table_replace(voted, uname_copy, op_copy);
            reason = "Recorded";
            done = TRUE;

        } else if (compare_version(your_version, CRM_FEATURE_SET) < 0) {
            reason = "Version";
            we_loose = TRUE;

        } else if (compare_version(your_version, CRM_FEATURE_SET) > 0) {
            reason = "Version";

        } else if (age < 0) {
            reason = "Uptime";
            we_loose = TRUE;

        } else if (age > 0) {
            reason = "Uptime";

            /* TODO: Check for y(our) born < 0 */
        } else if (use_born_on && your_node->born < our_node->born) {
            reason = "Born";
            we_loose = TRUE;

        } else if (use_born_on && your_node->born > our_node->born) {
            reason = "Born";

        } else if (fsa_our_uname == NULL) {
            reason = "Unknown host name";
            we_loose = TRUE;

        } else if (strcasecmp(fsa_our_uname, vote_from) > 0) {
            reason = "Host name";
            we_loose = TRUE;

        } else {
            reason = "Host name";
            CRM_ASSERT(strcasecmp(fsa_our_uname, vote_from) < 0);
            /* cant happen...
             * } else if(strcasecmp(fsa_our_uname, vote_from) == 0) {
             *
             */
        }
    }

    /* Election-storm detection: count wins inside the current window */
    if (expires < tm_now) {
        election_wins = 0;
        expires = tm_now + STORM_INTERVAL;

    } else if (done == FALSE && we_loose == FALSE) {
        int peers = 1 + g_hash_table_size(crm_peer_cache);

        /* If every node has to vote down every other node, thats N*(N-1) total elections
         * Allow some leway before _really_ complaining
         */
        election_wins++;
        if (election_wins > (peers * peers)) {
            crm_warn("Election storm detected: %d elections in %d seconds", election_wins,
                     STORM_INTERVAL);
            election_wins = 0;
            expires = tm_now + STORM_INTERVAL;
            crm_write_blackbox(0, NULL);
        }
    }

    if (done) {
        do_crm_log(log_level + 1, "Election %d (current: %d, owner: %s): Processed %s from %s (%s)",
                   election_id, current_election_id, election_owner, op, vote_from, reason);

    } else if (we_loose) {
        xmlNode *novote = create_request(CRM_OP_NOVOTE, NULL, vote_from,
                                         CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);

        do_crm_log(log_level, "Election %d (owner: %s) lost: %s from %s (%s)",
                   election_id, election_owner, op, vote_from, reason);
        update_dc(NULL);

        crm_timer_stop(election_timeout);
        if (fsa_input_register & R_THE_DC) {
            crm_trace("Give up the DC to %s", vote_from);
            register_fsa_input(C_FSA_INTERNAL, I_RELEASE_DC, NULL);
            fsa_cib_conn->cmds->set_slave(fsa_cib_conn, cib_scope_local);

        } else if (cur_state != S_STARTING) {
            crm_trace("We werent the DC anyway");
            register_fsa_input(C_FSA_INTERNAL, I_PENDING, NULL);
        }

        /* Tell the sender that we concede its election */
        crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner);
        crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id);

        send_cluster_message(crm_get_peer(0, vote_from), crm_msg_crmd, novote, TRUE);
        free_xml(novote);

        last_election_loss = tm_now;

    } else {
        do_crm_log(log_level, "Election %d (owner: %s) pass: %s from %s (%s)",
                   election_id, election_owner, op, vote_from, reason);

        if (last_election_loss) {
            /* Do not start a new election right after conceding one */
            if (tm_now - last_election_loss < (time_t) loss_dampen) {
                crm_info("Election %d ignore: We already lost an election less than %ds ago (%s)",
                         election_id, loss_dampen, ctime(&last_election_loss));
                update_dc(NULL);
                return;
            }
            last_election_loss = 0;
        }

        register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
        g_hash_table_destroy(voted);
        voted = NULL;
    }
}
/* A_ELECT_TIMER_START, A_ELECTION_TIMEOUT */
/* we won */
/*
 * No-op handler: the body is empty, so none of the arguments are used and
 * no timer is touched here.
 */
void
do_election_timer_ctrl(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
}
/*
 * CIB callback for the feature-set update issued by do_dc_takeover().
 *
 * rc: result of the update. Anything other than pcmk_ok is logged and
 *     reported to the FSA as I_ERROR; success needs no action.
 *
 * The remaining arguments are unused.
 */
static void
feature_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
    /* NOTE(review): looks unused, but register_fsa_error() is presumably a
     * macro that refers to a local named msg_data - confirm before removing */
    fsa_data_t *msg_data = NULL;

    if (rc == pcmk_ok) {
        return;
    }

    crm_notice("Update failed: %s (%d)", pcmk_strerror(rc), rc);
    register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
/* A_DC_TAKEOVER */
/*
 * Assume the DC role for this partition.
 *
 * In order: sets R_THE_DC; marks every target on stonith_cleanup_list as
 * clean and empties the list; on classic AIS clusters (when built with
 * corosync support) sends a quorum-class cluster text message; discards any
 * vote tally; sets R_JOIN_OK and R_INVOKE_PE; makes the local CIB the
 * master; records our feature set, "dc-version" and
 * "cluster-infrastructure" in the CIB; and triggers a configuration re-read.
 */
void
do_dc_takeover(long long action,
               enum crmd_fsa_cause cause,
               enum crmd_fsa_state cur_state,
               enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    int rc = pcmk_ok;
    xmlNode *cib = NULL;
    GListPtr entry = NULL;
    const char *cluster_type = name_for_cluster_type(get_cluster_type());

    crm_info("Taking over DC status for this partition");
    set_bit(fsa_input_register, R_THE_DC);

    /* Each list entry is a heap-allocated node name that we own */
    entry = stonith_cleanup_list;
    while (entry != NULL) {
        char *target = entry->data;
        crm_node_t *target_node = crm_get_peer(0, target);
        const char *uuid = crm_peer_uuid(target_node);

        crm_notice("Marking %s, target of a previous stonith action, as clean", target);
        send_stonith_update(NULL, target, uuid);
        free(target);

        entry = entry->next;
    }
    g_list_free(stonith_cleanup_list);
    stonith_cleanup_list = NULL;

#if SUPPORT_COROSYNC
    if (is_classic_ais_cluster()) {
        send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais);
    }
#endif

    if (voted != NULL) {
        crm_trace("Destroying voted hash");
        g_hash_table_destroy(voted);
        voted = NULL;
    }

    set_bit(fsa_input_register, R_JOIN_OK);
    set_bit(fsa_input_register, R_INVOKE_PE);

    fsa_cib_conn->cmds->set_master(fsa_cib_conn, cib_scope_local);

    /* Publish our feature set; failures surface in feature_update_callback() */
    cib = create_xml_node(NULL, XML_TAG_CIB);
    crm_xml_add(cib, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET);
    fsa_cib_update(XML_TAG_CIB, cib, cib_quorum_override, rc, NULL);
    fsa_register_cib_callback(rc, FALSE, NULL, feature_update_callback);

    update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL,
                         "dc-version", VERSION "-" BUILD_VERSION, FALSE, NULL);

    update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL,
                         "cluster-infrastructure", cluster_type, FALSE, NULL);

    mainloop_set_trigger(config_read);
    free_xml(cib);
}
/* A_DC_RELEASE */
/*
 * Give up the DC role.
 *
 * action: A_DC_RELEASE clears R_THE_DC; otherwise A_DC_RELEASED registers
 *         I_RELEASE_SUCCESS. A_DC_RELEASE takes precedence when both bits
 *         are set, and any other action is logged as an error.
 *
 * The resulting DC status is traced in every case.
 */
void
do_dc_release(long long action,
              enum crmd_fsa_cause cause,
              enum crmd_fsa_state cur_state,
              enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    const gboolean release_requested = (action & A_DC_RELEASE) != 0;
    const gboolean release_completed = (action & A_DC_RELEASED) != 0;

    if (release_requested) {
        crm_debug("Releasing the role of DC");
        clear_bit(fsa_input_register, R_THE_DC);

    } else if (release_completed) {
        crm_info("DC role released");
        /* Unimplemented idea kept from the original: if there are errors
         * we cant stay up if not healthy, so the result would be
         * I_SHUTDOWN - or perhaps I_ERROR and go to S_RECOVER? */
        register_fsa_input(C_FSA_INTERNAL, I_RELEASE_SUCCESS, NULL);

    } else {
        crm_err("Unknown action %s", fsa_action2string(action));
    }

    crm_trace("Am I still the DC? %s", AM_I_DC ? XML_BOOLEAN_YES : XML_BOOLEAN_NO);
}
File Metadata
Details
Attached
Mime Type
text/x-c
Expires
Thu, Jul 10, 2:50 AM (1 d, 10 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2009789
Default Alt Text
election.c (18 KB)
Attached To
Mode
rP Pacemaker
Attached
Detach File
Event Timeline
Log In to Comment