Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/exec/totemconfig.c b/exec/totemconfig.c
index a92978d2..1d44940d 100644
--- a/exec/totemconfig.c
+++ b/exec/totemconfig.c
@@ -1,2158 +1,2195 @@
/*
* Copyright (c) 2002-2005 MontaVista Software, Inc.
* Copyright (c) 2006-2018 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
* Jan Friesse (jfriesse@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <config.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <ifaddrs.h>
#include <netdb.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/param.h>
#include <sys/utsname.h>
#include <corosync/swab.h>
#include <qb/qblist.h>
#include <qb/qbdefs.h>
#include <libknet.h>
#include <corosync/totem/totem.h>
#include <corosync/config.h>
#include <corosync/logsys.h>
#include <corosync/icmap.h>
#include "util.h"
#include "totemconfig.h"
/*
 * Built-in defaults applied when the corresponding totem.* key is missing
 * from the configuration.
 */
#define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST 4
#define TOKEN_TIMEOUT 1000
#define TOKEN_WARNING 75
#define TOKEN_COEFFICIENT 650
#define JOIN_TIMEOUT 50
#define MERGE_TIMEOUT 200
#define DOWNCHECK_TIMEOUT 1000
#define FAIL_TO_RECV_CONST 2500
#define SEQNO_UNCHANGED_CONST 30
/*
 * Lower bound accepted for the timeouts checked in
 * totem_volatile_config_validate(). The expansion is fully parenthesized
 * so it behaves as a single operand in any expression
 * (e.g. "x / MINIMUM_TIMEOUT").
 */
#define MINIMUM_TIMEOUT ((int)(1000/HZ)*3)
#define MAX_NETWORK_DELAY 50
#define WINDOW_SIZE 50
#define MAX_MESSAGES 17
#define MISS_COUNT_CONST 5
/* These currently match the defaults in libknet.h */
#define KNET_PING_INTERVAL 1000
#define KNET_PING_TIMEOUT 2000
#define KNET_PING_PRECISION 2048
#define KNET_PONG_COUNT 2
#define KNET_PMTUD_INTERVAL 30
#define KNET_DEFAULT_TRANSPORT KNET_TRANSPORT_UDP
#define DEFAULT_PORT 5405
static char error_string_response[768];
static void add_totem_config_notification(struct totem_config *totem_config);
/*
 * Map an icmap key name ("totem.<something>") to the address of the
 * totem_config field that stores its value.
 *
 * Returns a pointer to the field, or NULL when param_name is not one of
 * the keys handled here. The caller must know the type of the field it
 * asked for.
 */
static void *totem_get_param_by_name(struct totem_config *totem_config, const char *param_name)
{
	void *field = NULL;

	if (strcmp(param_name, "totem.token") == 0) {
		field = &totem_config->token_timeout;
	} else if (strcmp(param_name, "totem.token_warning") == 0) {
		field = &totem_config->token_warning;
	} else if (strcmp(param_name, "totem.token_retransmit") == 0) {
		field = &totem_config->token_retransmit_timeout;
	} else if (strcmp(param_name, "totem.hold") == 0) {
		field = &totem_config->token_hold_timeout;
	} else if (strcmp(param_name, "totem.token_retransmits_before_loss_const") == 0) {
		field = &totem_config->token_retransmits_before_loss_const;
	} else if (strcmp(param_name, "totem.join") == 0) {
		field = &totem_config->join_timeout;
	} else if (strcmp(param_name, "totem.send_join") == 0) {
		field = &totem_config->send_join_timeout;
	} else if (strcmp(param_name, "totem.consensus") == 0) {
		field = &totem_config->consensus_timeout;
	} else if (strcmp(param_name, "totem.merge") == 0) {
		field = &totem_config->merge_timeout;
	} else if (strcmp(param_name, "totem.downcheck") == 0) {
		field = &totem_config->downcheck_timeout;
	} else if (strcmp(param_name, "totem.fail_recv_const") == 0) {
		field = &totem_config->fail_to_recv_const;
	} else if (strcmp(param_name, "totem.seqno_unchanged_const") == 0) {
		field = &totem_config->seqno_unchanged_const;
	} else if (strcmp(param_name, "totem.heartbeat_failures_allowed") == 0) {
		field = &totem_config->heartbeat_failures_allowed;
	} else if (strcmp(param_name, "totem.max_network_delay") == 0) {
		field = &totem_config->max_network_delay;
	} else if (strcmp(param_name, "totem.window_size") == 0) {
		field = &totem_config->window_size;
	} else if (strcmp(param_name, "totem.max_messages") == 0) {
		field = &totem_config->max_messages;
	} else if (strcmp(param_name, "totem.miss_count_const") == 0) {
		field = &totem_config->miss_count_const;
	} else if (strcmp(param_name, "totem.knet_pmtud_interval") == 0) {
		field = &totem_config->knet_pmtud_interval;
	} else if (strcmp(param_name, "totem.knet_compression_threshold") == 0) {
		field = &totem_config->knet_compression_threshold;
	} else if (strcmp(param_name, "totem.knet_compression_level") == 0) {
		field = &totem_config->knet_compression_level;
	} else if (strcmp(param_name, "totem.knet_compression_model") == 0) {
		field = &totem_config->knet_compression_model;
	}

	return field;
}
/*
 * Load the uint32 parameter key_name from icmap into its totem_config field.
 *
 * default_value is stored instead when the key cannot be read, when
 * key_name equals deleted_key (deleted_key may be NULL), or when the value
 * read is zero and allow_zero_value is false. The value that ends up in
 * totem_config is then published as "runtime.config.<key_name>".
 *
 * key_name must be a key known to totem_get_param_by_name().
 */
static void totem_volatile_config_set_uint32_value (struct totem_config *totem_config,
	const char *key_name, const char *deleted_key, unsigned int default_value,
	int allow_zero_value)
{
	char runtime_key_name[ICMAP_KEYNAME_MAXLEN];
	uint32_t *param = (uint32_t *)totem_get_param_by_name(totem_config, key_name);
	int use_default;

	use_default = (icmap_get_uint32(key_name, param) != CS_OK);
	if (!use_default && deleted_key != NULL) {
		use_default = (strcmp(deleted_key, key_name) == 0);
	}
	if (!use_default && !allow_zero_value) {
		use_default = (*param == 0);
	}
	if (use_default) {
		*param = default_value;
	}

	/*
	 * Mirror the effective value into the cmap runtime section. A key
	 * that would not fit into the buffer is skipped; this is not
	 * expected to happen.
	 */
	if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) {
		return ;
	}

	snprintf(runtime_key_name, sizeof(runtime_key_name), "runtime.config.%s", key_name);
	icmap_set_uint32(runtime_key_name, *param);
}
/*
 * Signed counterpart of totem_volatile_config_set_uint32_value(): load the
 * int32 parameter key_name from icmap into its totem_config field.
 *
 * default_value is stored instead when the key cannot be read, when
 * key_name equals deleted_key (deleted_key may be NULL), or when the value
 * read is zero and allow_zero_value is false. The value that ends up in
 * totem_config is then published as "runtime.config.<key_name>".
 */
static void totem_volatile_config_set_int32_value (struct totem_config *totem_config,
	const char *key_name, const char *deleted_key, int default_value,
	int allow_zero_value)
{
	char runtime_key_name[ICMAP_KEYNAME_MAXLEN];
	int32_t *param = (int32_t *)totem_get_param_by_name(totem_config, key_name);
	int use_default;

	use_default = (icmap_get_int32(key_name, param) != CS_OK);
	if (!use_default && deleted_key != NULL) {
		use_default = (strcmp(deleted_key, key_name) == 0);
	}
	if (!use_default && !allow_zero_value) {
		use_default = (*param == 0);
	}
	if (use_default) {
		*param = default_value;
	}

	/*
	 * Mirror the effective value into the cmap runtime section. A key
	 * that would not fit into the buffer is skipped; this is not
	 * expected to happen.
	 */
	if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) {
		return ;
	}

	snprintf(runtime_key_name, sizeof(runtime_key_name), "runtime.config.%s", key_name);
	icmap_set_int32(runtime_key_name, *param);
}
/*
 * Load the string parameter key_name from icmap into its totem_config field.
 *
 * A heap copy of default_value is stored instead when the key cannot be
 * read or when key_name equals deleted_key (deleted_key may be NULL). The
 * string previously held by the field is freed, so the field always owns a
 * heap-allocated string afterwards. The effective value is then published
 * as "runtime.config.<key_name>".
 *
 * key_name must be a key known to totem_get_param_by_name().
 */
static void totem_volatile_config_set_string_value (struct totem_config *totem_config,
	const char *key_name, const char *deleted_key, const char *default_value)
{
	char runtime_key_name[ICMAP_KEYNAME_MAXLEN];
	void **config_value;
	void *old_config_ptr;
	int have_value;

	config_value = totem_get_param_by_name(totem_config, key_name);
	old_config_ptr = *config_value;

	have_value = (icmap_get_string(key_name, (char **)config_value) == CS_OK);

	if (have_value && deleted_key != NULL && strcmp(deleted_key, key_name) == 0) {
		/*
		 * The key is being deleted but could still be read. Drop the
		 * string that was just fetched so it does not leak when the
		 * default replaces it below.
		 */
		free(*config_value);
		have_value = 0;
	}

	if (!have_value) {
		/* Need to strdup() here so that the free() below works for a default and a configured value */
		*config_value = strdup(default_value);
	}
	free(old_config_ptr);

	/*
	 * Store totem_config value to cmap runtime section. A key that would
	 * not fit into the buffer is skipped; this shouldn't happen.
	 */
	if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) {
		return ;
	}

	strcpy(runtime_key_name, "runtime.config.");
	strcat(runtime_key_name, key_name);

	icmap_set_string(runtime_key_name, (char *)*config_value);
}
/*
* Read the runtime-changeable totem values from cmap and store them into
* totem_config. If a key doesn't exist (or holds zero where zero is not
* allowed), its default value is stored instead. deleted_key is the name of
* the key being processed by a delete operation from cmap; it is treated as
* non-existing even if it can still be read. Can be NULL.
*
* The order of the calls matters: several defaults below are computed from
* values that were stored by earlier calls.
*/
static void totem_volatile_config_read (struct totem_config *totem_config, const char *deleted_key)
{
uint32_t u32;
totem_volatile_config_set_uint32_value(totem_config, "totem.token_retransmits_before_loss_const", deleted_key,
TOKEN_RETRANSMITS_BEFORE_LOSS_CONST, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.token", deleted_key, TOKEN_TIMEOUT, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.token_warning", deleted_key, TOKEN_WARNING, 1);
/*
* With more than two members on interface 0 the token timeout is extended
* by the coefficient for every member beyond the second.
*/
if (totem_config->interfaces[0].member_count > 2) {
/* The return value is ignored; presumably u32 keeps TOKEN_COEFFICIENT when the key is absent */
u32 = TOKEN_COEFFICIENT;
icmap_get_uint32("totem.token_coefficient", &u32);
totem_config->token_timeout += (totem_config->interfaces[0].member_count - 2) * u32;
/*
* Store totem_config value to cmap runtime section
*/
icmap_set_uint32("runtime.config.totem.token", totem_config->token_timeout);
}
totem_volatile_config_set_uint32_value(totem_config, "totem.max_network_delay", deleted_key, MAX_NETWORK_DELAY, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.window_size", deleted_key, WINDOW_SIZE, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.max_messages", deleted_key, MAX_MESSAGES, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.miss_count_const", deleted_key, MISS_COUNT_CONST, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.knet_pmtud_interval", deleted_key, KNET_PMTUD_INTERVAL, 0);
/* Default derives from the (possibly extended) token timeout and the retransmit count read above */
totem_volatile_config_set_uint32_value(totem_config, "totem.token_retransmit", deleted_key,
(int)(totem_config->token_timeout / (totem_config->token_retransmits_before_loss_const + 0.2)), 0);
/* Default derives from the token retransmit timeout stored by the previous call */
totem_volatile_config_set_uint32_value(totem_config, "totem.hold", deleted_key,
(int)(totem_config->token_retransmit_timeout * 0.8 - (1000/HZ)), 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.join", deleted_key, JOIN_TIMEOUT, 0);
/* Default is 1.2 times the token timeout */
totem_volatile_config_set_uint32_value(totem_config, "totem.consensus", deleted_key,
(int)(float)(1.2 * totem_config->token_timeout), 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.merge", deleted_key, MERGE_TIMEOUT, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.downcheck", deleted_key, DOWNCHECK_TIMEOUT, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.fail_recv_const", deleted_key, FAIL_TO_RECV_CONST, 0);
totem_volatile_config_set_uint32_value(totem_config, "totem.seqno_unchanged_const", deleted_key,
SEQNO_UNCHANGED_CONST, 0);
/* The remaining values default to 0 and accept 0 as a configured value */
totem_volatile_config_set_uint32_value(totem_config, "totem.send_join", deleted_key, 0, 1);
totem_volatile_config_set_uint32_value(totem_config, "totem.heartbeat_failures_allowed", deleted_key, 0, 1);
totem_volatile_config_set_uint32_value(totem_config, "totem.knet_compression_threshold", deleted_key, 0, 1);
totem_volatile_config_set_int32_value(totem_config, "totem.knet_compression_level", deleted_key, 0, 1);
totem_volatile_config_set_string_value(totem_config, "totem.knet_compression_model", deleted_key, "none");
}
/*
 * Check the values stored in totem_config for consistency.
 *
 * Every timeout is compared against MINIMUM_TIMEOUT, token_warning must be
 * within 0..100 and the consensus timeout may not be below the join
 * timeout. When more than one link is configured, every node of the first
 * configured link needs a nodelist 'name' and all configured links must
 * have the same member count.
 *
 * Returns 0 when the configuration is acceptable. On failure -1 is
 * returned and *error_string points to a static buffer describing the
 * first problem found.
 */
static int totem_volatile_config_validate (
	struct totem_config *totem_config,
	const char **error_string)
{
	static char local_error_reason[512];
	const char *error_reason = local_error_reason;
	char name_key[ICMAP_KEYNAME_MAXLEN];
	char *name_str;
	int i, num_configured, members;

	if (totem_config->max_network_delay < MINIMUM_TIMEOUT) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The max_network_delay parameter (%d ms) may not be less than (%d ms).",
			totem_config->max_network_delay, MINIMUM_TIMEOUT);
		goto parse_error;
	}

	if (totem_config->token_timeout < MINIMUM_TIMEOUT) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The token timeout parameter (%d ms) may not be less than (%d ms).",
			totem_config->token_timeout, MINIMUM_TIMEOUT);
		goto parse_error;
	}

	if (totem_config->token_warning > 100 || totem_config->token_warning < 0) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The token warning parameter (%d%%) must be between 0 (disabled) and 100.",
			totem_config->token_warning);
		goto parse_error;
	}

	if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The token retransmit timeout parameter (%d ms) may not be less than (%d ms).",
			totem_config->token_retransmit_timeout, MINIMUM_TIMEOUT);
		goto parse_error;
	}

	if (totem_config->token_hold_timeout < MINIMUM_TIMEOUT) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The token hold timeout parameter (%d ms) may not be less than (%d ms).",
			totem_config->token_hold_timeout, MINIMUM_TIMEOUT);
		goto parse_error;
	}

	if (totem_config->join_timeout < MINIMUM_TIMEOUT) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The join timeout parameter (%d ms) may not be less than (%d ms).",
			totem_config->join_timeout, MINIMUM_TIMEOUT);
		goto parse_error;
	}

	if (totem_config->consensus_timeout < MINIMUM_TIMEOUT) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The consensus timeout parameter (%d ms) may not be less than (%d ms).",
			totem_config->consensus_timeout, MINIMUM_TIMEOUT);
		goto parse_error;
	}

	if (totem_config->consensus_timeout < totem_config->join_timeout) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The consensus timeout parameter (%d ms) may not be less than join timeout (%d ms).",
			totem_config->consensus_timeout, totem_config->join_timeout);
		goto parse_error;
	}

	if (totem_config->merge_timeout < MINIMUM_TIMEOUT) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The merge timeout parameter (%d ms) may not be less than (%d ms).",
			totem_config->merge_timeout, MINIMUM_TIMEOUT);
		goto parse_error;
	}

	if (totem_config->downcheck_timeout < MINIMUM_TIMEOUT) {
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The downcheck timeout parameter (%d ms) may not be less than (%d ms).",
			totem_config->downcheck_timeout, MINIMUM_TIMEOUT);
		goto parse_error;
	}

	/*
	 * Check that we have nodelist 'name' if there is more than one link.
	 * The reference member count is taken from the first configured
	 * link, which is not necessarily link 0.
	 */
	num_configured = 0;
	members = -1;
	for (i = 0; i < INTERFACE_MAX; i++) {
		if (totem_config->interfaces[i].configured) {
			if (num_configured == 0) {
				members = totem_config->interfaces[i].member_count;
			}
			num_configured++;
		}
	}

	if (num_configured > 1) {
		for (i = 0; i < members; i++) {
			snprintf(name_key, sizeof(name_key), "nodelist.node.%d.name", i);

			if (icmap_get_string(name_key, &name_str) != CS_OK) {
				snprintf (local_error_reason, sizeof(local_error_reason),
					"for a multi-link configuration, all nodes must have a 'name' attribute");
				goto parse_error;
			}
			/* Only the presence of the key matters; release the copy */
			free(name_str);
		}

		/* Configured links may be sparse, so walk every slot and skip the unused ones */
		for (i = 0; i < INTERFACE_MAX; i++) {
			if (!totem_config->interfaces[i].configured) {
				continue;
			}
			if (totem_config->interfaces[i].member_count != members) {
				snprintf (local_error_reason, sizeof(local_error_reason),
					"Not all nodes have the same number of links");
				goto parse_error;
			}
		}
	}

	return 0;

parse_error:
	snprintf (error_string_response, sizeof(error_string_response),
		"parse error in config: %s\n", error_reason);
	*error_string = error_string_response;
	return (-1);
}
/*
 * Return the entry of the NULL-terminated list known_values that equals
 * value, or fallback when value matches none of them.
 */
static const char *totem_crypto_pick(const char *value,
	const char *const *known_values, const char *fallback)
{
	const char *const *candidate;

	for (candidate = known_values; *candidate != NULL; candidate++) {
		if (strcmp(value, *candidate) == 0) {
			return *candidate;
		}
	}

	return fallback;
}

/*
 * Read totem.crypto_model, totem.crypto_cipher and totem.crypto_hash from
 * icmap and store heap copies of the selected names in totem_config,
 * freeing the strings held before.
 *
 * A missing crypto_model selects "nss"; a missing or unrecognized cipher
 * or hash selects "none". An unrecognized crypto_model, or a cipher other
 * than "none" combined with hash "none", is an error.
 *
 * Returns 0 on success. On error -1 is returned, *error_string is set to
 * a static message and totem_config is left untouched.
 */
static int totem_get_crypto(struct totem_config *totem_config, const char **error_string)
{
	static const char *const known_models[] = { "nss", "openssl", NULL };
	static const char *const known_ciphers[] = {
		"none", "aes256", "aes192", "aes128", "3des", NULL
	};
	static const char *const known_hashes[] = {
		"none", "md5", "sha1", "sha256", "sha384", "sha512", NULL
	};
	const char *model = "nss";
	const char *cipher = "none";
	const char *hash = "none";
	char *cfg_value;

	if (icmap_get_string("totem.crypto_model", &cfg_value) == CS_OK) {
		/* A model that is present but unknown ends up as "none" and is rejected below */
		model = totem_crypto_pick(cfg_value, known_models, "none");
		free(cfg_value);
	}

	if (icmap_get_string("totem.crypto_cipher", &cfg_value) == CS_OK) {
		cipher = totem_crypto_pick(cfg_value, known_ciphers, "none");
		free(cfg_value);
	}

	if (icmap_get_string("totem.crypto_hash", &cfg_value) == CS_OK) {
		hash = totem_crypto_pick(cfg_value, known_hashes, "none");
		free(cfg_value);
	}

	if (strcmp(cipher, "none") != 0 && strcmp(hash, "none") == 0) {
		*error_string = "crypto_cipher requires crypto_hash with value other than none";
		return -1;
	}

	if (strcmp(model, "none") == 0) {
		*error_string = "crypto_model should be 'nss' or 'openssl'";
		return -1;
	}

	free(totem_config->crypto_cipher_type);
	free(totem_config->crypto_hash_type);
	free(totem_config->crypto_model);

	totem_config->crypto_cipher_type = strdup(cipher);
	totem_config->crypto_hash_type = strdup(hash);
	totem_config->crypto_model = strdup(model);

	return 0;
}
/*
 * Look up find_name in the nodelist.
 *
 * Every "nodelist.node.<pos>.name" and "nodelist.node.<pos>.ring0_addr"
 * value is compared with find_name. When strip_domain is non-zero, the
 * nodelist value is compared only up to (not including) its first '.',
 * so "node1" matches an entry "node1.example.com".
 *
 * Returns the position of the first matching node or -1 when there is no
 * match.
 */
static int nodelist_byname(const char *find_name, int strip_domain)
{
	icmap_iter_t iter;
	const char *iter_key;
	char name_str[ICMAP_KEYNAME_MAXLEN];
	int res = 0;
	unsigned int node_pos;
	char *name;
	size_t namelen;
	int matched;

	iter = icmap_iter_init("nodelist.node.");
	while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) {
		res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, name_str);
		if (res != 2) {
			continue;
		}
		/* ring0_addr is allowed as a fallback */
		if (strcmp(name_str, "name") && strcmp(name_str, "ring0_addr")) {
			continue;
		}
		if (icmap_get_string(iter_key, &name) != CS_OK) {
			continue;
		}

		namelen = strlen(name);
		if (strip_domain) {
			const char *dot = strchr(name, '.');

			if (dot != NULL) {
				/* Length of the host part in front of the first dot */
				namelen = (size_t)(dot - name);
			}
		}

		/* Both the length and the leading namelen characters have to agree */
		matched = (strlen(find_name) == namelen &&
		    strncmp(find_name, name, namelen) == 0);

		/* icmap_get_string() handed us a copy; release it on every path */
		free(name);

		if (matched) {
			icmap_iter_finalize(iter);
			return node_pos;
		}
	}
	icmap_iter_finalize(iter);
	return -1;
}
/*
 * Compare two socket addresses.
 *
 * Returns 1 when both have the same family and the leading
 * sizeof(struct sockaddr_in) (AF_INET) or sizeof(struct sockaddr_in6)
 * (AF_INET6) bytes are identical, 0 otherwise. Matching families other
 * than AF_INET/AF_INET6 trigger the assertion.
 */
static int ipaddr_equal(struct sockaddr_storage *addr1, struct sockaddr_storage *addr2)
{
	size_t cmp_len;

	if (addr1->ss_family != addr2->ss_family) {
		return 0;
	}

	switch (addr1->ss_family) {
	case AF_INET:
		cmp_len = sizeof(struct sockaddr_in);
		break;
	case AF_INET6:
		cmp_len = sizeof(struct sockaddr_in6);
		break;
	default:
		cmp_len = 0;
		break;
	}
	assert(cmp_len);

	return (memcmp(addr1, addr2, cmp_len) == 0) ? 1 : 0;
}
/*
 * Finds the local node and returns its position in the nodelist.
 * Uses nodelist.local_node_pos as a cache to save effort.
 *
 * When use_cache is non-zero and nodelist.local_node_pos can be read, that
 * value is returned directly. Otherwise the nodelist is searched, in this
 * order, for:
 *   1. the uname() nodename, exactly;
 *   2. the nodename with trailing domain components removed one by one,
 *      and finally against nodelist entries truncated at their first dot;
 *   3. the resolved name, its short form and the numeric address of every
 *      local IPv4/IPv6 interface address;
 *   4. a nodelist name/ring0_addr that resolves to one of the local
 *      interface addresses (covers aliases).
 *
 * On success the position is stored in nodelist.local_node_pos and
 * returned. Returns -1 when uname()/getifaddrs() fail or no node matches.
 */
static int find_local_node(int use_cache)
{
	char nodename2[PATH_MAX];
	char name_str[ICMAP_KEYNAME_MAXLEN];
	icmap_iter_t iter;
	const char *iter_key;
	unsigned int cached_pos;
	unsigned int iter_pos;
	char *dot = NULL;
	const char *node;
	struct ifaddrs *ifa, *ifa_list;
	struct sockaddr *sa;
	int found = 0;
	int node_pos = -1;
	int res;
	struct utsname utsname;

	/* Check for cached value first */
	if (use_cache) {
		if (icmap_get_uint32("nodelist.local_node_pos", &cached_pos) == CS_OK) {
			return cached_pos;
		}
	}

	res = uname(&utsname);
	if (res) {
		return -1;
	}

	node = utsname.nodename;

	/* 1. Exact match */
	node_pos = nodelist_byname(node, 0);
	if (node_pos > -1) {
		found = 1;
		goto ret_found;
	}

	/* 2. Try to match with increasingly more
	 * specific versions of it
	 */
	strcpy(nodename2, node);
	dot = strrchr(nodename2, '.');
	while (dot) {
		*dot = '\0';

		node_pos = nodelist_byname(nodename2, 0);
		if (node_pos > -1) {
			found = 1;
			goto ret_found;
		}
		dot = strrchr(nodename2, '.');
	}

	node_pos = nodelist_byname(nodename2, 1);
	if (node_pos > -1) {
		found = 1;
		goto ret_found;
	}

	/*
	 * The corosync.conf name may not be related to uname at all,
	 * they may match a hostname on some network interface.
	 */
	if (getifaddrs(&ifa_list))
		return -1;

	for (ifa = ifa_list; ifa; ifa = ifa->ifa_next) {
		socklen_t salen = 0;

		/* Restore this */
		strcpy(nodename2, node);
		sa = ifa->ifa_addr;
		if (!sa) {
			continue;
		}
		if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) {
			continue;
		}

		if (sa->sa_family == AF_INET) {
			salen = sizeof(struct sockaddr_in);
		}
		if (sa->sa_family == AF_INET6) {
			salen = sizeof(struct sockaddr_in6);
		}

		if (getnameinfo(sa, salen,
				nodename2, sizeof(nodename2),
				NULL, 0, 0) == 0) {

			node_pos = nodelist_byname(nodename2, 0);
			if (node_pos > -1) {
				found = 1;
				goto out;
			}

			/* Truncate this name and try again */
			dot = strchr(nodename2, '.');
			if (dot) {
				*dot = '\0';

				node_pos = nodelist_byname(nodename2, 0);
				if (node_pos > -1) {
					found = 1;
					goto out;
				}
			}
		}

		/*
		 * See if it's the IP address that's in corosync.conf.
		 * The family-specific length is required here: sizeof(*sa)
		 * is too short for an IPv6 address.
		 */
		if (getnameinfo(sa, salen,
				nodename2, sizeof(nodename2),
				NULL, 0, NI_NUMERICHOST))
			continue;

		node_pos = nodelist_byname(nodename2, 0);
		if (node_pos > -1) {
			found = 1;
			goto out;
		}
	}

out:
	if (found) {
		freeifaddrs(ifa_list);
		goto ret_found;
	}

	/*
	 * This section covers the usecase where the nodename specified in
	 * the configuration is an alias specified in /etc/hosts. For example:
	 * <ipaddr> hostname alias1 alias2
	 * with the node configured as "alias2".
	 * The above calls use uname and getnameinfo, which does not return
	 * aliases. Here we take the name from the nodelist, resolve it to an
	 * address and then compare against all known local ip addresses.
	 * If we have a match, we found our nodename. In theory this chunk of
	 * code could replace all the checks above, but let's avoid any
	 * possible regressions and use it as last.
	 *
	 * node_pos is -1 at this point and is only overwritten on a match,
	 * so an unsuccessful search reports -1 to the caller.
	 */
	iter = icmap_iter_init("nodelist.node.");
	while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) {
		char *dbnodename = NULL;
		struct addrinfo hints;
		struct addrinfo *result = NULL, *rp = NULL;

		res = sscanf(iter_key, "nodelist.node.%u.%s", &iter_pos, name_str);
		if (res != 2) {
			continue;
		}
		/* 'ring0_addr' is allowed as a fallback, but 'name' will be found first
		 * because the names are in alpha order.
		 */
		if (strcmp(name_str, "name") && strcmp(name_str, "ring0_addr")) {
			continue;
		}
		if (icmap_get_string(iter_key, &dbnodename) != CS_OK) {
			continue;
		}

		memset(&hints, 0, sizeof(struct addrinfo));
		hints.ai_family = AF_UNSPEC;
		hints.ai_socktype = SOCK_DGRAM;
		hints.ai_flags = 0;
		hints.ai_protocol = IPPROTO_UDP;

		res = getaddrinfo(dbnodename, NULL, &hints, &result);
		/* The name is not needed once it has been resolved */
		free(dbnodename);
		if (res) {
			continue;
		}

		for (rp = result; rp != NULL; rp = rp->ai_next) {
			for (ifa = ifa_list; ifa; ifa = ifa->ifa_next) {
				if (ifa->ifa_addr &&
				    ipaddr_equal((struct sockaddr_storage *)rp->ai_addr,
						 (struct sockaddr_storage *)ifa->ifa_addr)) {
					freeaddrinfo(result);
					node_pos = (int)iter_pos;
					found = 1;
					goto out2;
				}
			}
		}

		freeaddrinfo(result);
	}
out2:
	icmap_iter_finalize(iter);
	freeifaddrs(ifa_list);

ret_found:
	if (found) {
		res = icmap_set_uint32("nodelist.local_node_pos", node_pos);
	}

	return node_pos;
}
/*
 * Determine the address family to use for totem.
 *
 * The knet transport always yields AF_UNSPEC. Otherwise "totem.ip_version"
 * selects AF_INET ("ipv4") or AF_INET6 ("ipv6"); a missing key or any
 * other value yields AF_INET.
 */
static int totem_config_get_ip_version(struct totem_config *totem_config)
{
	char *version_str;
	int family = AF_INET;

	if (totem_config->transport_number == TOTEM_TRANSPORT_KNET) {
		return (AF_UNSPEC);
	}

	if (icmap_get_string("totem.ip_version", &version_str) != CS_OK) {
		return (family);
	}

	if (strcmp(version_str, "ipv6") == 0) {
		family = AF_INET6;
	}
	free(version_str);

	return (family);
}
/*
 * Derive a 16-bit cluster id from cluster_name.
 *
 * The accumulator is shifted left by one bit and the next character is
 * added, for every character of the name; the low 16 bits of the result
 * are returned. The arithmetic is unsigned so that long names wrap
 * around instead of overflowing a signed int.
 */
static uint16_t generate_cluster_id (const char *cluster_name)
{
	const size_t name_len = strlen(cluster_name);
	unsigned int value = 0;
	size_t i;

	for (i = 0; i < name_len; i++) {
		value <<= 1;
		/* The conversion keeps the low bits the same as the former signed addition */
		value += (unsigned int)cluster_name[i];
	}

	return (uint16_t)(value & 0xFFFF);
}
/*
 * Build a multicast address for a link from the cluster name.
 *
 * The cluster id generated from cluster_name plus linknumber is embedded
 * into "239.192.<hi>.<lo>" for AF_INET or "ff15::<id>" for AF_INET6, and
 * the text is parsed into *res. *res is zeroed before the family is
 * examined.
 *
 * Returns the result of totemip_parse(), or -1 when cluster_name is NULL
 * or ip_version is neither AF_INET nor AF_INET6.
 */
static int get_cluster_mcast_addr (
	const char *cluster_name,
	unsigned int linknumber,
	int ip_version,
	struct totem_ip_address *res)
{
	uint16_t clusterid;
	char addr[INET6_ADDRSTRLEN + 1];

	if (cluster_name == NULL) {
		return (-1);
	}

	clusterid = generate_cluster_id(cluster_name) + linknumber;
	memset (res, 0, sizeof(*res));

	if (ip_version == AF_INET) {
		/* NOTE(review): the low octet uses "% 0xFF", not "& 0xFF"; kept as is because it determines the generated address */
		snprintf(addr, sizeof(addr), "239.192.%d.%d", clusterid >> 8, clusterid % 0xFF);
	} else if (ip_version == AF_INET6) {
		snprintf(addr, sizeof(addr), "ff15::%x", clusterid);
	} else {
		/*
		 * Unknown family
		 */
		return (-1);
	}

	return (totemip_parse (res, addr, ip_version));
}
/*
* Derive a nodeid from the textual IPv4 address addr.
*
* The address is parsed with totemip_parse() and the first
* sizeof(unsigned int) bytes of the parsed address are used as the nodeid,
* byte-swapped on little-endian hosts. The top bit is cleared when
* totem_config->clear_node_high_bit is set.
*
* Returns the nodeid, or -1 (converted to unsigned int) when addr cannot
* be parsed as an IPv4 address.
*/
-static unsigned int generate_nodeid_for_duplicate_test(
+static unsigned int generate_nodeid(
struct totem_config *totem_config,
char *addr)
{
unsigned int nodeid;
struct totem_ip_address totemip;
/* AF_INET hard-coded here because auto-generated nodeids
are only for IPv4 */
if (totemip_parse(&totemip, addr, AF_INET) != 0)
return -1;
memcpy (&nodeid, &totemip.addr, sizeof (unsigned int));
/* The parsed address is presumably in network byte order; swap it on little-endian hosts */
#if __BYTE_ORDER == __LITTLE_ENDIAN
nodeid = swab32 (nodeid);
#endif
if (totem_config->clear_node_high_bit) {
nodeid &= 0x7FFFFFFF;
}
return nodeid;
}
/*
* Check the nodelist for nodeids that occur more than once.
*
* For every node the configured nodeid is used; when there is none, a
* nodeid is generated from the node's ring0_addr. That value is compared
* with the nodeids of the nodes at lower positions.
*
* Returns 0 when no duplicate was found. On the first duplicate -1 is
* returned, the message is logged and *error_string points to it.
*
* NOTE(review): the strings returned through ring0_addr and ring0_addr1
* are not freed anywhere in this function.
*/
static int check_for_duplicate_nodeids(
struct totem_config *totem_config,
const char **error_string)
{
icmap_iter_t iter;
icmap_iter_t subiter;
const char *iter_key;
int res = 0;
int retval = 0;
char tmp_key[ICMAP_KEYNAME_MAXLEN];
char *ring0_addr=NULL;
char *ring0_addr1=NULL;
unsigned int node_pos;
unsigned int node_pos1;
+ unsigned int last_node_pos = -1;
unsigned int nodeid;
unsigned int nodeid1;
int autogenerated;
iter = icmap_iter_init("nodelist.node.");
while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) {
res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key);
if (res != 2) {
continue;
}
- if (strcmp(tmp_key, "nodeid") != 0) {
+ /*
+ * This relies on the fact the icmap keys are always returned in order
+ * so all of the keys for a node will be grouped together. We're basically
+ * just running the code below once for each node.
+ */
+ if (last_node_pos == node_pos) {
continue;
}
+ last_node_pos = node_pos;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos);
autogenerated = 0;
/* Generated nodeids are only allowed for UDP/UDPU so ring0_addr is valid here */
if (icmap_get_uint32(tmp_key, &nodeid) != CS_OK) {
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos);
if (icmap_get_string(tmp_key, &ring0_addr) != CS_OK) {
continue;
}
/* Generate nodeid so we can check that auto-generated nodeids don't clash either */
- nodeid = generate_nodeid_for_duplicate_test(totem_config, ring0_addr);
+ nodeid = generate_nodeid(totem_config, ring0_addr);
/* -1 signals that ring0_addr could not be parsed; such a node is skipped */
if (nodeid == -1) {
continue;
}
autogenerated = 1;
}
/*
* Compare against every node at a lower position. Only keys whose last
* component is "nodeid" are considered by this inner loop.
*/
node_pos1 = 0;
subiter = icmap_iter_init("nodelist.node.");
while (((iter_key = icmap_iter_next(subiter, NULL, NULL)) != NULL) && (node_pos1 < node_pos)) {
res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos1, tmp_key);
if ((res != 2) || (node_pos1 >= node_pos)) {
continue;
}
if (strcmp(tmp_key, "nodeid") != 0) {
continue;
}
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos1);
if (icmap_get_uint32(tmp_key, &nodeid1) != CS_OK) {
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos1);
if (icmap_get_string(tmp_key, &ring0_addr1) != CS_OK) {
continue;
}
- nodeid1 = generate_nodeid_for_duplicate_test(totem_config, ring0_addr1);
+ nodeid1 = generate_nodeid(totem_config, ring0_addr1);
if (nodeid1 == -1) {
continue;
}
}
if (nodeid == nodeid1) {
retval = -1;
snprintf (error_string_response, sizeof(error_string_response),
"Nodeid %u%s%s%s appears twice in corosync.conf", nodeid,
autogenerated?"(autogenerated from ":"",
autogenerated?ring0_addr:"",
autogenerated?")":"");
/* NOTE(review): the message is passed as the format string; it can contain text taken from ring0_addr */
log_printf (LOGSYS_LEVEL_ERROR, error_string_response);
*error_string = error_string_response;
/* Leaves only the inner loop; the outer loop keeps checking the remaining nodes */
break;
}
}
icmap_iter_finalize(subiter);
}
icmap_iter_finalize(iter);
return retval;
}
/*
* This needs to be done last of all. It would be nice to do it when reading the
* interface params, but the totem params need to have them to be read first. We
* need both, so this is a way round that circular dependancy.
*/
static void calc_knet_ping_timers(struct totem_config *totem_config)
{
char runtime_key_name[ICMAP_KEYNAME_MAXLEN];
int interface;
for (interface = 0; interface < INTERFACE_MAX; interface++) {
if (totem_config->interfaces[interface].configured) {
if (!totem_config->interfaces[interface].knet_pong_count) {
totem_config->interfaces[interface].knet_pong_count = KNET_PONG_COUNT;
}
if (!totem_config->interfaces[interface].knet_ping_timeout) {
totem_config->interfaces[interface].knet_ping_timeout =
totem_config->token_timeout / totem_config->interfaces[interface].knet_pong_count;
}
snprintf(runtime_key_name, sizeof(runtime_key_name),
"runtime.config.totem.interface.%d.knet_ping_timeout", interface);
icmap_set_uint32(runtime_key_name, totem_config->interfaces[interface].knet_ping_timeout);
if (!totem_config->interfaces[interface].knet_ping_interval) {
totem_config->interfaces[interface].knet_ping_interval =
totem_config->token_timeout / (totem_config->interfaces[interface].knet_pong_count * 2);
}
snprintf(runtime_key_name, sizeof(runtime_key_name),
"runtime.config.totem.interface.%d.knet_ping_interval", interface);
icmap_set_uint32(runtime_key_name, totem_config->interfaces[interface].knet_ping_interval);
}
}
}
/*
 * Compute the difference between two totem interface arrays and apply it.
 *
 * For each ring, member addresses present in both set1 and set2 are
 * zeroed in both arrays; addresses found in only one of them are left
 * untouched. Afterwards every address still set in set1 is passed to
 * totempg_member_remove() and every address still set in a configured
 * ring of set2 is passed to totempg_member_add().
 *
 * Both arrays are modified.
 */
static void compute_interfaces_diff(struct totem_interface *set1,
	struct totem_interface *set2)
{
	int ring_no, old_idx, new_idx;
	struct totem_ip_address empty_ip_address;
	struct totem_ip_address *old_member;
	struct totem_ip_address *new_member;

	memset(&empty_ip_address, 0, sizeof(empty_ip_address));

	/* Pass 1: cancel out the members that both sets have in common */
	for (ring_no = 0; ring_no < INTERFACE_MAX; ring_no++) {
		if (!(set1[ring_no].configured || set2[ring_no].configured)) {
			continue;
		}

		for (old_idx = 0; old_idx < set1[ring_no].member_count; old_idx++) {
			old_member = &set1[ring_no].member_list[old_idx];

			for (new_idx = 0; new_idx < set2[ring_no].member_count; new_idx++) {
				new_member = &set2[ring_no].member_list[new_idx];

				if (memcmp(old_member, new_member,
				    sizeof(struct totem_ip_address)) != 0) {
					continue;
				}
				memset(old_member, 0, sizeof(struct totem_ip_address));
				memset(new_member, 0, sizeof(struct totem_ip_address));
			}
		}
	}

	/* Pass 2: whatever is left has to be removed from or added to totempg */
	for (ring_no = 0; ring_no < INTERFACE_MAX; ring_no++) {
		for (old_idx = 0; old_idx < set1[ring_no].member_count; old_idx++) {
			old_member = &set1[ring_no].member_list[old_idx];

			/*
			 * A member remaining in set1 is no longer part of
			 * set2, so the node has to be removed.
			 */
			if (memcmp(old_member, &empty_ip_address, sizeof(empty_ip_address)) == 0) {
				continue;
			}
			log_printf(LOGSYS_LEVEL_DEBUG,
				"removing dynamic member %s for ring %u",
				totemip_print(old_member),
				ring_no);

			totempg_member_remove(old_member, ring_no);
		}

		if (!set2[ring_no].configured) {
			continue;
		}

		for (new_idx = 0; new_idx < set2[ring_no].member_count; new_idx++) {
			new_member = &set2[ring_no].member_list[new_idx];

			/*
			 * A member remaining in set2 did not exist in set1,
			 * so it is a new node and has to be added.
			 */
			if (memcmp(new_member, &empty_ip_address, sizeof(empty_ip_address)) == 0) {
				continue;
			}
			log_printf(LOGSYS_LEVEL_DEBUG,
				"adding dynamic member %s for ring %u",
				totemip_print(new_member),
				ring_no);

			totempg_member_add(new_member, ring_no);
		}
	}
}
/*
 * Reconfigure links in totempg. Sets new local IP address and adds params
 * for new links.
 *
 * For every configured interface the local node's ring<N>_addr is read
 * from the nodelist and parsed. Interfaces without such a key, or with an
 * address that does not parse, are skipped. Unset knet parameters and the
 * port get their defaults before the interface is handed to
 * totempg_iface_set().
 */
static void reconfigure_links(struct totem_config *totem_config)
{
	int i;
	char tmp_key[ICMAP_KEYNAME_MAXLEN];
	char *addr_string;
	struct totem_ip_address local_ip;
	int err;
	int local_node_pos = find_local_node(0);

	for (i = 0; i<INTERFACE_MAX; i++) {
		if (!totem_config->interfaces[i].configured) {
			continue;
		}

		log_printf(LOGSYS_LEVEL_INFO, "Configuring link %d\n", i);

		snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring%u_addr", local_node_pos, i);
		if (icmap_get_string(tmp_key, &addr_string) != CS_OK) {
			continue;
		}

		err = totemip_parse(&local_ip, addr_string, AF_UNSPEC);
		/* The text form is not needed once it has been parsed; release the copy */
		free(addr_string);
		if (err != 0) {
			continue;
		}
		local_ip.nodeid = totem_config->node_id;

		/* In case this is a new link, fill in the defaults if there was no interface{} section for it */
		if (!totem_config->interfaces[i].knet_link_priority)
			totem_config->interfaces[i].knet_link_priority = 1;

		/* knet_ping_interval & knet_ping_timeout are set later once we know all the other params */
		if (!totem_config->interfaces[i].knet_ping_precision)
			totem_config->interfaces[i].knet_ping_precision = KNET_PING_PRECISION;
		if (!totem_config->interfaces[i].knet_pong_count)
			totem_config->interfaces[i].knet_pong_count = KNET_PONG_COUNT;
		if (!totem_config->interfaces[i].knet_transport)
			totem_config->interfaces[i].knet_transport = KNET_TRANSPORT_UDP;
		if (!totem_config->interfaces[i].ip_port)
			totem_config->interfaces[i].ip_port = DEFAULT_PORT + i;

		totempg_iface_set(&local_ip, totem_config->interfaces[i].ip_port, i);
	}
}
/*
 * Compare the new interface configuration with orig_interfaces and log an
 * error for every difference that cannot be applied on-the-fly: a changed
 * knet transport, or a changed member address at a position present in
 * both the new and the original member list. Nothing is modified; the
 * function only reports.
 */
static void check_things_have_not_changed(struct totem_config *totem_config)
{
	int link_no, member_no;
	const char *old_ip_str;
	char old_addr[INET6_ADDRSTRLEN];
	int mismatch = 0;

	for (link_no = 0; link_no < INTERFACE_MAX; link_no++) {
		if (!totem_config->interfaces[link_no].configured) {
			continue;
		}

		if (totem_config->interfaces[link_no].knet_transport !=
		    totem_config->orig_interfaces[link_no].knet_transport) {
			log_printf(LOGSYS_LEVEL_ERROR, "New config has different knet transport for link %d. Internal value was NOT changed.\n", link_no);
			mismatch = 1;
		}

		for (member_no = 0; member_no < min(totem_config->interfaces[link_no].member_count, totem_config->orig_interfaces[link_no].member_count); member_no++) {
			if (memcmp(&totem_config->interfaces[link_no].member_list[member_no],
				   &totem_config->orig_interfaces[link_no].member_list[member_no],
				   sizeof(struct totem_ip_address)) == 0) {
				continue;
			}

			/*
			 * Keep a private copy of the old address text:
			 * totemip_print() is called again for the new address
			 * in the log call below and presumably reuses its buffer.
			 */
			old_ip_str = totemip_print(&totem_config->orig_interfaces[link_no].member_list[member_no]);
			strncpy(old_addr, old_ip_str, sizeof(old_addr));
			old_addr[sizeof(old_addr) - 1] = '\0';

			log_printf(LOGSYS_LEVEL_ERROR, "new config has different address for link %d (addr changed from %s to %s). Internal value was NOT changed.\n", link_no, old_addr, totemip_print(&totem_config->interfaces[link_no].member_list[member_no]));
			mismatch = 1;
		}
	}

	if (mismatch) {
		log_printf(LOGSYS_LEVEL_ERROR, "To reconfigure an interface it must be deleted and recreated. A working interface needs to be available to corosync at all times");
	}
}
static void put_nodelist_members_to_config(struct totem_config *totem_config, int reload)
{
icmap_iter_t iter, iter2;
const char *iter_key, *iter_key2;
int res = 0;
unsigned int node_pos;
char tmp_key[ICMAP_KEYNAME_MAXLEN];
char tmp_key2[ICMAP_KEYNAME_MAXLEN];
char *node_addr_str;
int member_count;
unsigned int linknumber = 0;
int i, j;
+ int last_node_pos = -1;
struct totem_interface *new_interfaces = NULL;
if (reload) {
/*
* We need to compute diff only for reload. Also for initial configuration
* not all totem structures are initialized so corosync will crash during
* member_add/remove
*/
new_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX);
assert(new_interfaces != NULL);
}
/* Clear out nodelist so we can put the new one in if needed */
for (i = 0; i < INTERFACE_MAX; i++) {
for (j = 0; j < PROCESSOR_COUNT_MAX; j++) {
memset(&totem_config->interfaces[i].member_list[j], 0, sizeof(struct totem_ip_address));
}
totem_config->interfaces[i].member_count = 0;
}
iter = icmap_iter_init("nodelist.node.");
while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) {
res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key);
if (res != 2) {
continue;
}
- if (strcmp(tmp_key, "nodeid") != 0) {
+ /* If it's the same as the last node_pos then skip it */
+ if (node_pos == last_node_pos) {
continue;
}
+ last_node_pos = node_pos;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.", node_pos);
iter2 = icmap_iter_init(tmp_key);
while ((iter_key2 = icmap_iter_next(iter2, NULL, NULL)) != NULL) {
unsigned int nodeid;
+ char *str;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos);
if (icmap_get_uint32(tmp_key, &nodeid) != CS_OK) {
+ nodeid = 0;
}
res = sscanf(iter_key2, "nodelist.node.%u.ring%u%s", &node_pos, &linknumber, tmp_key2);
if (res != 3 || strcmp(tmp_key2, "_addr") != 0) {
continue;
}
if (icmap_get_string(iter_key2, &node_addr_str) != CS_OK) {
continue;
}
+ /* Generate nodeids if they are not provided and transport is UDP/U */
+ if (!nodeid &&
+ (totem_config->transport_number == TOTEM_TRANSPORT_UDP ||
+ totem_config->transport_number == TOTEM_TRANSPORT_UDPU)) {
+ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos);
+ if (icmap_get_string(tmp_key, &str) == CS_OK) {
+ nodeid = generate_nodeid(totem_config, str);
+ free(str);
+ log_printf(LOGSYS_LEVEL_DEBUG,
+ "Generated nodeid = 0x%x for %s\n", nodeid, str);
+ }
+ }
+
member_count = totem_config->interfaces[linknumber].member_count;
res = totemip_parse(&totem_config->interfaces[linknumber].member_list[member_count],
node_addr_str, totem_config->ip_version);
if (res != -1) {
totem_config->interfaces[linknumber].member_list[member_count].nodeid = nodeid;
totem_config->interfaces[linknumber].member_count++;
}
totem_config->interfaces[linknumber].configured = 1;
free(node_addr_str);
}
icmap_iter_finalize(iter2);
}
icmap_iter_finalize(iter);
if (reload) {
log_printf(LOGSYS_LEVEL_DEBUG, "About to reconfigure links from nodelist.\n");
reconfigure_links(totem_config);
memcpy(new_interfaces, totem_config->interfaces, sizeof (struct totem_interface) * INTERFACE_MAX);
check_things_have_not_changed(totem_config);
compute_interfaces_diff(totem_config->orig_interfaces, new_interfaces);
free(new_interfaces);
}
}
/*
 * icmap tracking callback for keys under "nodelist.node.".
 *
 * Rebuilds the member lists (as a reload) when the changed key has the form
 * nodelist.node.N.ringM_addr. Does nothing while
 * config.totemconfig_reload_in_progress is set. user_data is the
 * struct totem_config to update; event, new_val and old_val are not used.
 */
static void nodelist_dynamic_notify(
	int32_t event,
	const char *key_name,
	struct icmap_notify_value new_val,
	struct icmap_notify_value old_val,
	void *user_data)
{
	struct totem_config *totem_config = (struct totem_config *)user_data;
	char key_suffix[ICMAP_KEYNAME_MAXLEN];
	unsigned int node_idx;
	unsigned int link_idx;
	uint8_t reload_active;
	int fields;

	/*
	 * If a full reload is in progress then don't do anything until it's done and
	 * can reconfigure it all atomically
	 */
	if (icmap_get_uint8("config.totemconfig_reload_in_progress", &reload_active) == CS_OK && reload_active) {
		return;
	}

	fields = sscanf(key_name, "nodelist.node.%u.ring%u%s", &node_idx, &link_idx, key_suffix);

	/* Only address keys are of interest; everything else is ignored */
	if (fields == 3 && strcmp(key_suffix, "_addr") == 0) {
		put_nodelist_members_to_config(totem_config, 1);
	}
}
/*
 * Copy the local node's ringN_addr values from the nodelist into
 * totem.interface.N.bindnetaddr icmap keys.
 *
 * The local node is located with find_local_node(1); if none is found the
 * function does nothing. totem_config is not used by this function.
 */
static void config_convert_nodelist_to_interface(struct totem_config *totem_config)
{
	int res = 0;
	int node_pos;
	unsigned int key_node_pos;
	char tmp_key[ICMAP_KEYNAME_MAXLEN];
	char tmp_key2[ICMAP_KEYNAME_MAXLEN];
	char *node_addr_str;
	unsigned int linknumber = 0;
	icmap_iter_t iter;
	const char *iter_key;

	node_pos = find_local_node(1);
	if (node_pos < 0) {
		return;
	}

	/*
	 * We found node, so create interface section
	 */
	snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.", (unsigned int)node_pos);
	iter = icmap_iter_init(tmp_key);
	while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) {
		/*
		 * %u needs an unsigned int target, so the node index embedded in the
		 * key is scanned into a scratch variable rather than into the signed
		 * node_pos.
		 */
		res = sscanf(iter_key, "nodelist.node.%u.ring%u%s", &key_node_pos, &linknumber, tmp_key2);
		if (res != 3 || strcmp(tmp_key2, "_addr") != 0) {
			continue;
		}

		if (icmap_get_string(iter_key, &node_addr_str) != CS_OK) {
			continue;
		}

		snprintf(tmp_key2, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.bindnetaddr", linknumber);
		icmap_set_string(tmp_key2, node_addr_str);
		free(node_addr_str);
	}
	icmap_iter_finalize(iter);
}
static int get_interface_params(struct totem_config *totem_config,
const char **error_string, uint64_t *warnings,
int reload)
{
int res = 0;
unsigned int linknumber = 0;
int member_count = 0;
int i;
icmap_iter_t iter, member_iter;
const char *iter_key;
const char *member_iter_key;
char linknumber_key[ICMAP_KEYNAME_MAXLEN];
char tmp_key[ICMAP_KEYNAME_MAXLEN];
uint8_t u8;
uint32_t u32;
char *str;
char *cluster_name = NULL;
if (reload) {
for (i=0; i<INTERFACE_MAX; i++) {
/*
* Set back to defaults things that might have been configured and
* now have been taken out of corosync.conf. These won't be caught by the
* code below which only looks at interface{} sections that actually exist.
*/
totem_config->interfaces[i].configured = 0;
totem_config->interfaces[i].knet_ping_timeout = 0;
totem_config->interfaces[i].knet_ping_interval = 0;
totem_config->interfaces[i].knet_ping_precision = KNET_PING_PRECISION;
totem_config->interfaces[i].knet_pong_count = KNET_PONG_COUNT;
}
}
if (icmap_get_string("totem.cluster_name", &cluster_name) != CS_OK) {
cluster_name = NULL;
}
iter = icmap_iter_init("totem.interface.");
while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) {
res = sscanf(iter_key, "totem.interface.%[^.].%s", linknumber_key, tmp_key);
if (res != 2) {
continue;
}
if (strcmp(tmp_key, "bindnetaddr") != 0 && totem_config->transport_number == TOTEM_TRANSPORT_UDP) {
continue;
}
member_count = 0;
linknumber = atoi(linknumber_key);
if (linknumber >= INTERFACE_MAX) {
free(cluster_name);
snprintf (error_string_response, sizeof(error_string_response),
"parse error in config: interface ring number %u is bigger than allowed maximum %u\n",
linknumber, INTERFACE_MAX - 1);
*error_string = error_string_response;
return -1;
}
/* These things are only valid for the initial read */
if (!reload) {
/*
* Get the bind net address
*/
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.bindnetaddr", linknumber);
if (icmap_get_string(tmp_key, &str) == CS_OK) {
res = totemip_parse (&totem_config->interfaces[linknumber].bindnet, str,
totem_config->ip_version);
free(str);
}
/*
* Get interface multicast address
*/
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", linknumber);
if (icmap_get_string(tmp_key, &str) == CS_OK) {
res = totemip_parse (&totem_config->interfaces[linknumber].mcast_addr, str, totem_config->ip_version);
free(str);
} else {
/*
* User not specified address -> autogenerate one from cluster_name key
* (if available). Return code is intentionally ignored, because
* udpu doesn't need mcastaddr and validity of mcastaddr for udp is
* checked later anyway.
*/
(void)get_cluster_mcast_addr (cluster_name,
linknumber,
totem_config->ip_version,
&totem_config->interfaces[linknumber].mcast_addr);
}
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.broadcast", linknumber);
if (icmap_get_string(tmp_key, &str) == CS_OK) {
if (strcmp (str, "yes") == 0) {
totem_config->broadcast_use = 1;
}
free(str);
}
}
/* These things are only valid for the initial read OR a newly-defined link */
if (!reload || (totem_config->interfaces[linknumber].configured == 0)) {
/*
* Get mcast port
*/
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", linknumber);
if (icmap_get_uint16(tmp_key, &totem_config->interfaces[linknumber].ip_port) != CS_OK) {
if (totem_config->broadcast_use) {
totem_config->interfaces[linknumber].ip_port = DEFAULT_PORT + (2 * linknumber);
} else {
totem_config->interfaces[linknumber].ip_port = DEFAULT_PORT + linknumber;
}
}
/*
* Get the TTL
*/
totem_config->interfaces[linknumber].ttl = 1;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.ttl", linknumber);
if (icmap_get_uint8(tmp_key, &u8) == CS_OK) {
totem_config->interfaces[linknumber].ttl = u8;
}
totem_config->interfaces[linknumber].knet_transport = KNET_DEFAULT_TRANSPORT;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_transport", linknumber);
if (icmap_get_string(tmp_key, &str) == CS_OK) {
if (strcmp(str, "sctp") == 0) {
totem_config->interfaces[linknumber].knet_transport = KNET_TRANSPORT_SCTP;
}
else if (strcmp(str, "udp") == 0) {
totem_config->interfaces[linknumber].knet_transport = KNET_TRANSPORT_UDP;
}
else {
*error_string = "Unrecognised knet_transport. expected 'udp' or 'sctp'";
return -1;
}
}
}
totem_config->interfaces[linknumber].configured = 1;
/*
* Get the knet link params
*/
totem_config->interfaces[linknumber].knet_link_priority = 1;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_link_priority", linknumber);
if (icmap_get_uint8(tmp_key, &u8) == CS_OK) {
totem_config->interfaces[linknumber].knet_link_priority = u8;
}
totem_config->interfaces[linknumber].knet_ping_interval = 0; /* real default applied later */
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_interval", linknumber);
if (icmap_get_uint32(tmp_key, &u32) == CS_OK) {
totem_config->interfaces[linknumber].knet_ping_interval = u32;
}
totem_config->interfaces[linknumber].knet_ping_timeout = 0; /* real default applied later */
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_timeout", linknumber);
if (icmap_get_uint32(tmp_key, &u32) == CS_OK) {
totem_config->interfaces[linknumber].knet_ping_timeout = u32;
}
totem_config->interfaces[linknumber].knet_ping_precision = KNET_PING_PRECISION;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_precision", linknumber);
if (icmap_get_uint32(tmp_key, &u32) == CS_OK) {
totem_config->interfaces[linknumber].knet_ping_precision = u32;
}
totem_config->interfaces[linknumber].knet_pong_count = KNET_PONG_COUNT;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_pong_count", linknumber);
if (icmap_get_uint32(tmp_key, &u32) == CS_OK) {
totem_config->interfaces[linknumber].knet_pong_count = u32;
}
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.member.", linknumber);
member_iter = icmap_iter_init(tmp_key);
while ((member_iter_key = icmap_iter_next(member_iter, NULL, NULL)) != NULL) {
if (member_count == 0) {
if (icmap_get_string("nodelist.node.0.ring0_addr", &str) == CS_OK) {
free(str);
*warnings |= TOTEM_CONFIG_WARNING_MEMBERS_IGNORED;
break;
} else {
*warnings |= TOTEM_CONFIG_WARNING_MEMBERS_DEPRECATED;
}
}
if (icmap_get_string(member_iter_key, &str) == CS_OK) {
res = totemip_parse (&totem_config->interfaces[linknumber].member_list[member_count++],
str, totem_config->ip_version);
}
}
icmap_iter_finalize(member_iter);
totem_config->interfaces[linknumber].member_count = member_count;
}
icmap_iter_finalize(iter);
return 0;
}
/*
 * Populate totem_config from icmap for the initial start.
 *
 * totem_config: zeroed and filled in; interfaces[] is allocated here.
 * error_string: set to a message when -1 is returned.
 * warnings:     reset to 0, then TOTEM_CONFIG_* warning bits are OR-ed in.
 *
 * Returns 0 on success, a negative value on error.
 */
extern int totem_config_read (
	struct totem_config *totem_config,
	const char **error_string,
	uint64_t *warnings)
{
	int res = 0;
	char *str, *ring0_addr_str;
	char tmp_key[ICMAP_KEYNAME_MAXLEN];
	uint16_t u16;
	int i;
	int local_node_pos;
	int nodeid_set;

	*warnings = 0;

	memset (totem_config, 0, sizeof (struct totem_config));
	totem_config->interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX);
	if (totem_config->interfaces == 0) {
		*error_string = "Out of memory trying to allocate ethernet interface storage area";
		return -1;
	}

	/* knet is the default transport; totem.transport may override it */
	totem_config->transport_number = TOTEM_TRANSPORT_KNET;
	if (icmap_get_string("totem.transport", &str) == CS_OK) {
		if (strcmp (str, "udpu") == 0) {
			totem_config->transport_number = TOTEM_TRANSPORT_UDPU;
		}

		if (strcmp (str, "udp") == 0) {
			totem_config->transport_number = TOTEM_TRANSPORT_UDP;
		}

		if (strcmp (str, "knet") == 0) {
			totem_config->transport_number = TOTEM_TRANSPORT_KNET;
		}

		free(str);
	}

	memset (totem_config->interfaces, 0,
		sizeof (struct totem_interface) * INTERFACE_MAX);

	strcpy (totem_config->link_mode, "passive");

	icmap_get_uint32("totem.version", (uint32_t *)&totem_config->version);

	if (totem_get_crypto(totem_config, error_string) != 0) {
		return -1;
	}

	if (icmap_get_string("totem.link_mode", &str) == CS_OK) {
		if (strlen(str) >= TOTEM_LINK_MODE_BYTES) {
			*error_string = "totem.link_mode is too long";
			free(str);

			return -1;
		}
		strcpy (totem_config->link_mode, str);
		free(str);
	}

	icmap_get_uint32("totem.nodeid", &totem_config->node_id);

	totem_config->clear_node_high_bit = 0;
	if (icmap_get_string("totem.clear_node_high_bit", &str) == CS_OK) {
		if (strcmp (str, "yes") == 0) {
			totem_config->clear_node_high_bit = 1;
		}
		free(str);
	}

	icmap_get_uint32("totem.threads", &totem_config->threads);

	icmap_get_uint32("totem.netmtu", &totem_config->net_mtu);

	totem_config->ip_version = totem_config_get_ip_version(totem_config);

	if (icmap_get_string("totem.interface.0.bindnetaddr", &str) != CS_OK) {
		/*
		 * We were not able to find ring 0 bindnet addr. Try to use nodelist informations
		 */
		config_convert_nodelist_to_interface(totem_config);
	} else {
		if (icmap_get_string("nodelist.node.0.ring0_addr", &ring0_addr_str) == CS_OK) {
			/*
			 * Both bindnetaddr and ring0_addr are set.
			 * Log warning information, and use nodelist instead
			 */
			*warnings |= TOTEM_CONFIG_BINDNETADDR_NODELIST_SET;

			config_convert_nodelist_to_interface(totem_config);

			free(ring0_addr_str);
		}

		free(str);
	}

	/*
	 * Broadcast option is global but set in interface section,
	 * so reset before processing interfaces.
	 */
	totem_config->broadcast_use = 0;

	res = get_interface_params(totem_config, error_string, warnings, 0);
	if (res < 0) {
		return res;
	}

	/*
	 * Use broadcast is global, so if set, make sure to fill mcast addr correctly
	 * broadcast is only supported for UDP so just do interface 0;
	 */
	if (totem_config->broadcast_use) {
		totemip_parse (&totem_config->interfaces[0].mcast_addr,
			"255.255.255.255", 0);
	}

	/*
	 * Store automatically generated items back to icmap only for UDP
	 */
	if (totem_config->transport_number == TOTEM_TRANSPORT_UDP) {
		for (i = 0; i < INTERFACE_MAX; i++) {
			if (!totem_config->interfaces[i].configured) {
				continue;
			}
			snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", i);
			if (icmap_get_string(tmp_key, &str) == CS_OK) {
				free(str);
			} else {
				str = (char *)totemip_print(&totem_config->interfaces[i].mcast_addr);
				icmap_set_string(tmp_key, str);
			}

			snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", i);
			if (icmap_get_uint16(tmp_key, &u16) != CS_OK) {
				icmap_set_uint16(tmp_key, totem_config->interfaces[i].ip_port);
			}
		}
	}

	/*
	 * Check existence of nodelist
	 */
	if ((icmap_get_string("nodelist.node.0.name", &str) == CS_OK) ||
	    (icmap_get_string("nodelist.node.0.ring0_addr", &str) == CS_OK)) {
		free(str);
		/*
		 * find local node
		 */
		local_node_pos = find_local_node(1);
		if (local_node_pos != -1) {

			snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", local_node_pos);

			nodeid_set = (totem_config->node_id != 0);
			if (icmap_get_uint32(tmp_key, &totem_config->node_id) == CS_OK && nodeid_set) {
				*warnings |= TOTEM_CONFIG_WARNING_TOTEM_NODEID_IGNORED;
			}

			if ((totem_config->transport_number == TOTEM_TRANSPORT_KNET) && (!totem_config->node_id)) {
				*error_string = "With knet, you must specify nodeid for current node";
				return -1;
			}

			if ((totem_config->transport_number == TOTEM_TRANSPORT_UDP ||
			     totem_config->transport_number == TOTEM_TRANSPORT_UDPU) && (!totem_config->node_id)) {

				snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", local_node_pos);
				/*
				 * Only generate a nodeid when the address really exists. If the
				 * lookup fails, str still holds the pointer that was freed
				 * above and must not be read or freed again.
				 */
				if (icmap_get_string(tmp_key, &str) == CS_OK) {
					totem_config->node_id = generate_nodeid(totem_config, str);
					totem_config->interfaces[0].member_list[local_node_pos].nodeid = totem_config->node_id;

					free(str);
				}
			}

			/* Users must not change this */
			icmap_set_ro_access("nodelist.local_node_pos", 0, 1);
		}

		put_nodelist_members_to_config(totem_config, 0);
	}

	/*
	 * Get things that might change in the future (and can depend on totem_config->interfaces);
	 */
	totem_volatile_config_read(totem_config, NULL);

	calc_knet_ping_timers(totem_config);

	icmap_set_uint8("config.totemconfig_reload_in_progress", 0);

	add_totem_config_notification(totem_config);

	return 0;
}
int totem_config_validate (
struct totem_config *totem_config,
const char **error_string)
{
static char local_error_reason[512];
char parse_error[512];
const char *error_reason = local_error_reason;
int i,j;
uint32_t u32;
int num_configured = 0;
unsigned int interface_max = INTERFACE_MAX;
for (i = 0; i < INTERFACE_MAX; i++) {
if (totem_config->interfaces[i].configured) {
num_configured++;
}
}
if (num_configured == 0) {
error_reason = "No interfaces defined";
goto parse_error;
}
/* Check we found a local node name */
if (icmap_get_uint32("nodelist.local_node_pos", &u32) != CS_OK) {
error_reason = "No valid name found for local host";
goto parse_error;
}
for (i = 0; i < INTERFACE_MAX; i++) {
/*
* Some error checking of parsed data to make sure its valid
*/
struct totem_ip_address null_addr;
if (!totem_config->interfaces[i].configured) {
continue;
}
memset (&null_addr, 0, sizeof (struct totem_ip_address));
if ((totem_config->transport_number == TOTEM_TRANSPORT_UDP) &&
memcmp (&totem_config->interfaces[i].mcast_addr, &null_addr,
sizeof (struct totem_ip_address)) == 0) {
error_reason = "No multicast address specified";
goto parse_error;
}
if (totem_config->interfaces[i].ip_port == 0) {
error_reason = "No multicast port specified";
goto parse_error;
}
if (totem_config->interfaces[i].ttl > 255) {
error_reason = "Invalid TTL (should be 0..255)";
goto parse_error;
}
if (totem_config->transport_number != TOTEM_TRANSPORT_UDP &&
totem_config->interfaces[i].ttl != 1) {
error_reason = "Can only set ttl on multicast transport types";
goto parse_error;
}
if (totem_config->interfaces[i].knet_link_priority > 255) {
error_reason = "Invalid link priority (should be 0..255)";
goto parse_error;
}
if (totem_config->transport_number != TOTEM_TRANSPORT_KNET &&
totem_config->interfaces[i].knet_link_priority != 1) {
error_reason = "Can only set link priority on knet transport type";
goto parse_error;
}
if (totem_config->interfaces[i].mcast_addr.family == AF_INET6 &&
totem_config->node_id == 0) {
error_reason = "An IPV6 network requires that a node ID be specified.";
goto parse_error;
}
if (totem_config->broadcast_use == 0 && totem_config->transport_number == TOTEM_TRANSPORT_UDP) {
if (totem_config->interfaces[i].mcast_addr.family != totem_config->interfaces[i].bindnet.family) {
error_reason = "Multicast address family does not match bind address family";
goto parse_error;
}
if (totemip_is_mcast (&totem_config->interfaces[i].mcast_addr) != 0) {
error_reason = "mcastaddr is not a correct multicast address.";
goto parse_error;
}
}
/* Verify that all nodes on the same knet link have the same IP family */
for (j=1; j<totem_config->interfaces[i].member_count; j++) {
if (totem_config->interfaces[i].configured) {
if (totem_config->interfaces[i].member_list[j].family !=
totem_config->interfaces[i].member_list[0].family) {
snprintf (local_error_reason, sizeof(local_error_reason),
"Nodes for link %d have different IP families", i);
goto parse_error;
}
}
}
}
if (totem_config->version != 2) {
error_reason = "This totem parser can only parse version 2 configurations.";
goto parse_error;
}
if (totem_volatile_config_validate(totem_config, error_string) == -1) {
return (-1);
}
if (check_for_duplicate_nodeids(totem_config, error_string) == -1) {
return (-1);
}
/*
* KNET Link values validation
*/
if (strcmp (totem_config->link_mode, "active") &&
strcmp (totem_config->link_mode, "rr") &&
strcmp (totem_config->link_mode, "passive")) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The Knet link mode \"%s\" specified is invalid. It must be active, passive or rr.\n", totem_config->link_mode);
goto parse_error;
}
/* Only Knet does multiple interfaces */
if (totem_config->transport_number != TOTEM_TRANSPORT_KNET) {
interface_max = 1;
}
if (interface_max < num_configured) {
snprintf (parse_error, sizeof(parse_error),
"%d is too many configured interfaces for non-Knet transport.",
num_configured);
error_reason = parse_error;
goto parse_error;
}
/* Only knet allows crypto */
if (totem_config->transport_number != TOTEM_TRANSPORT_KNET) {
if ((strcmp(totem_config->crypto_cipher_type, "none") != 0) ||
(strcmp(totem_config->crypto_hash_type, "none") != 0)) {
snprintf (parse_error, sizeof(parse_error),
"crypto_cipher & crypto_hash are only valid for the Knet transport.");
error_reason = parse_error;
goto parse_error;
}
}
if (totem_config->net_mtu == 0) {
if (totem_config->transport_number == TOTEM_TRANSPORT_KNET) {
totem_config->net_mtu = KNET_MAX_PACKET_SIZE;
}
else {
totem_config->net_mtu = 1500;
}
}
return 0;
parse_error:
snprintf (error_string_response, sizeof(error_string_response),
"parse error in config: %s\n", error_reason);
*error_string = error_string_response;
return (-1);
}
/*
 * Load the private key from key_location into totem_config->private_key.
 *
 * At most TOTEM_PRIVATE_KEY_LEN_MAX bytes are read with a single read() call;
 * fewer than TOTEM_PRIVATE_KEY_LEN_MIN bytes is treated as an error.
 * Returns 0 and sets private_key_len on success. On failure returns -1 with
 * *error_string pointing at a description.
 */
static int read_keyfile (
	const char *key_location,
	struct totem_config *totem_config,
	const char **error_string)
{
	char errno_text[100];
	const char *errno_msg;
	int key_fd;
	int bytes_read;
	int read_errno;

	key_fd = open (key_location, O_RDONLY);
	if (key_fd == -1) {
		errno_msg = qb_strerror_r(errno, errno_text, sizeof(errno_text));
		snprintf (error_string_response, sizeof(error_string_response),
			"Could not open %s: %s\n",
			 key_location, errno_msg);
		*error_string = error_string_response;
		return (-1);
	}

	bytes_read = read (key_fd, totem_config->private_key, TOTEM_PRIVATE_KEY_LEN_MAX);
	/* Remember errno now; close() below may overwrite it */
	read_errno = errno;
	close (key_fd);

	if (bytes_read == -1) {
		errno_msg = qb_strerror_r (read_errno, errno_text, sizeof(errno_text));
		snprintf (error_string_response, sizeof(error_string_response),
			"Could not read %s: %s\n",
			 key_location, errno_msg);
		*error_string = error_string_response;
		return (-1);
	}

	if (bytes_read < TOTEM_PRIVATE_KEY_LEN_MIN) {
		snprintf (error_string_response, sizeof(error_string_response),
			"Could only read %d bits of minimum %u bits from %s.\n",
			 bytes_read * 8, TOTEM_PRIVATE_KEY_LEN_MIN * 8, key_location);
		*error_string = error_string_response;
		return (-1);
	}

	totem_config->private_key_len = bytes_read;

	return 0;
}
/*
 * Obtain the private key for the configured crypto settings.
 *
 * Nothing is loaded when both cipher and hash are "none". Otherwise the key
 * is taken, in this order, from the file named by totem.keyfile, from the
 * totem.key icmap value, or from the default key file
 * (COROSYNC_TOTEM_AUTHKEY_FILE or COROSYSCONFDIR "/authkey").
 * Returns 0 on success, -1 with *error_string set on failure.
 */
int totem_config_keyread (
	struct totem_config *totem_config,
	const char **error_string)
{
	char *keyfile_path = NULL;
	const char *fallback_path;
	size_t cmap_key_len;
	int have_key = 0;
	int rc;

	memset (totem_config->private_key, 0, sizeof(totem_config->private_key));
	totem_config->private_key_len = 0;

	/* No crypto configured, so no key is required */
	if (!strcmp(totem_config->crypto_cipher_type, "none") &&
	    !strcmp(totem_config->crypto_hash_type, "none")) {
		return (0);
	}

	if (icmap_get_string("totem.keyfile", &keyfile_path) == CS_OK) {
		/* cmap may store the location of the key file */
		rc = read_keyfile(keyfile_path, totem_config, error_string);
		free(keyfile_path);
		if (rc) {
			goto fail;
		}
		have_key = 1;
	} else if (icmap_get("totem.key", NULL, &cmap_key_len, NULL) == CS_OK) {
		/* Or the key itself may be in the cmap */
		if (cmap_key_len > sizeof(totem_config->private_key)) {
			sprintf(error_string_response, "key is too long");
			goto fail;
		}
		if (cmap_key_len < TOTEM_PRIVATE_KEY_LEN_MIN) {
			sprintf(error_string_response, "key is too short");
			goto fail;
		}
		if (icmap_get("totem.key", totem_config->private_key, &cmap_key_len, NULL) != CS_OK) {
			sprintf(error_string_response, "can't load private key");
			goto fail;
		}
		totem_config->private_key_len = cmap_key_len;
		have_key = 1;
	}

	/* In desperation we read the default filename */
	if (!have_key) {
		fallback_path = getenv("COROSYNC_TOTEM_AUTHKEY_FILE");
		if (fallback_path == NULL) {
			fallback_path = COROSYSCONFDIR "/authkey";
		}
		rc = read_keyfile(fallback_path, totem_config, error_string);
		if (rc) {
			goto fail;
		}
	}

	return (0);

fail:
	*error_string = error_string_response;
	return (-1);
}
/*
 * Write the timing and sizing values of totem_config to the log at debug
 * level. Read-only: the configuration is not modified.
 */
static void debug_dump_totem_config(const struct totem_config *totem_config)
{
	uint32_t warn_interval_ms;

	log_printf(LOGSYS_LEVEL_DEBUG, "Token Timeout (%d ms) retransmit timeout (%d ms)",
	    totem_config->token_timeout, totem_config->token_retransmit_timeout);

	if (!totem_config->token_warning) {
		log_printf(LOGSYS_LEVEL_DEBUG, "Token warnings disabled");
	} else {
		/* token_warning is a percentage of the token timeout */
		warn_interval_ms = totem_config->token_warning * totem_config->token_timeout / 100;

		log_printf(LOGSYS_LEVEL_DEBUG, "Token warning every %d ms (%d%% of Token Timeout)",
		    warn_interval_ms, totem_config->token_warning);

		if (warn_interval_ms < totem_config->token_retransmit_timeout) {
			log_printf (LOGSYS_LEVEL_DEBUG,
				"The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) "
				"which can lead to spurious token warnings. Consider increasing the token_warning parameter.",
				 warn_interval_ms, totem_config->token_retransmit_timeout);
		}
	}

	log_printf(LOGSYS_LEVEL_DEBUG, "token hold (%d ms) retransmits before loss (%d retrans)",
	    totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);

	log_printf(LOGSYS_LEVEL_DEBUG, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)",
	    totem_config->join_timeout, totem_config->send_join_timeout, totem_config->consensus_timeout,
	    totem_config->merge_timeout);

	log_printf(LOGSYS_LEVEL_DEBUG, "downcheck (%d ms) fail to recv const (%d msgs)",
	    totem_config->downcheck_timeout, totem_config->fail_to_recv_const);

	log_printf(LOGSYS_LEVEL_DEBUG,
	    "seqno unchanged const (%d rotations) Maximum network MTU %d",
	    totem_config->seqno_unchanged_const, totem_config->net_mtu);

	log_printf(LOGSYS_LEVEL_DEBUG,
	    "window size per rotation (%d messages) maximum messages per rotation (%d messages)",
	    totem_config->window_size, totem_config->max_messages);

	log_printf(LOGSYS_LEVEL_DEBUG, "missed count const (%d messages)", totem_config->miss_count_const);

	log_printf(LOGSYS_LEVEL_DEBUG, "heartbeat_failures_allowed (%d)",
	    totem_config->heartbeat_failures_allowed);

	log_printf(LOGSYS_LEVEL_DEBUG, "max_network_delay (%d ms)", totem_config->max_network_delay);
}
/*
 * icmap tracking callback for keys under "totem.".
 *
 * Re-reads the volatile totem values, dumps them at debug level and logs an
 * error if they do not validate. Keys that do not map to a totem_config
 * parameter are ignored, with the exception of totem.token_coefficient.
 * user_data is the struct totem_config to update; new_val and old_val are
 * not used.
 */
static void totem_change_notify(
	int32_t event,
	const char *key_name,
	struct icmap_notify_value new_val,
	struct icmap_notify_value old_val,
	void *user_data)
{
	struct totem_config *totem_config = (struct totem_config *)user_data;
	const char *validation_error;
	const char *removed_key;
	uint32_t *param;
	uint8_t reload_active;

	/*
	 * If a full reload is in progress then don't do anything until it's done and
	 * can reconfigure it all atomically
	 */
	if (icmap_get_uint8("config.reload_in_progress", &reload_active) == CS_OK && reload_active) {
		return;
	}

	/*
	 * Process change only if changed key is found in totem_config (-> param is not NULL)
	 * or for special key token_coefficient. token_coefficient key is not stored in
	 * totem_config, but it is used for computation of token timeout.
	 */
	param = totem_get_param_by_name(totem_config, key_name);
	if (param == NULL && strcmp(key_name, "totem.token_coefficient") != 0) {
		return;
	}

	/*
	 * Values other than UINT32 are not supported, or needed (yet).
	 * Only a delete event passes the key name on; every other event
	 * passes NULL.
	 */
	removed_key = (event == ICMAP_TRACK_DELETE) ? key_name : NULL;

	totem_volatile_config_read (totem_config, removed_key);
	log_printf(LOGSYS_LEVEL_DEBUG, "Totem related config key changed. Dumping actual totem config.");
	debug_dump_totem_config(totem_config);

	if (totem_volatile_config_validate(totem_config, &validation_error) == -1) {
		/*
		 * TODO: Consider corosync exit and/or load defaults for volatile
		 * values. For now, log error seems to be enough
		 */
		log_printf (LOGSYS_LEVEL_ERROR, "%s", validation_error);
	}
}
/*
 * icmap tracking callback for "config.reload_in_progress".
 *
 * A new value of 0 means the reload has completed. The current interfaces
 * are snapshotted into orig_interfaces, the interface and nodelist settings
 * are re-read, the volatile values are refreshed and validated, and totempg
 * is told to reconfigure. Any other value only raises
 * config.totemconfig_reload_in_progress.
 *
 * user_data is the struct totem_config to update; event, key_name and
 * old_val are not used.
 */
static void totem_reload_notify(
	int32_t event,
	const char *key_name,
	struct icmap_notify_value new_val,
	struct icmap_notify_value old_val,
	void *user_data)
{
	struct totem_config *totem_config = (struct totem_config *)user_data;
	const char *error_string = NULL;
	/* get_interface_params() ORs bits into this, so it must start from a defined value */
	uint64_t warnings = 0;

	/* Reload has completed */
	if (*(uint8_t *)new_val.data == 0) {

		totem_config->orig_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX);
		assert(totem_config->orig_interfaces != NULL);
		memcpy(totem_config->orig_interfaces, totem_config->interfaces, sizeof (struct totem_interface) * INTERFACE_MAX);

		/* A bad interface section is reported but does not stop the rest of the reload */
		if (get_interface_params(totem_config, &error_string, &warnings, 1) != 0) {
			log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
		}
		put_nodelist_members_to_config (totem_config, 1);
		totem_volatile_config_read (totem_config, NULL);

		calc_knet_ping_timers(totem_config);

		log_printf(LOGSYS_LEVEL_DEBUG, "Configuration reloaded. Dumping actual totem config.");
		debug_dump_totem_config(totem_config);
		if (totem_volatile_config_validate(totem_config, &error_string) == -1) {
			log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
			/*
			 * TODO: Consider corosync exit and/or load defaults for volatile
			 * values. For now, log error seems to be enough
			 */
		}

		/* Reinstate the local_node_pos */
		(void)find_local_node(0);

		/* Reconfigure network params as appropriate */
		totempg_reconfigure();

		/*
		 * NOTE(review): orig_interfaces keeps pointing at the freed snapshot
		 * after this call; confirm that no other path reads it before the
		 * next reload re-allocates it.
		 */
		free(totem_config->orig_interfaces);

		icmap_set_uint8("config.totemconfig_reload_in_progress", 0);
	} else {
		icmap_set_uint8("config.totemconfig_reload_in_progress", 1);
	}
}
/*
 * Register the icmap trackers that keep totem_config up to date:
 * changes under "totem.", changes of "config.reload_in_progress" and
 * changes under "nodelist.node.". Each callback receives totem_config as
 * its user data. The track handles are not kept.
 */
static void add_totem_config_notification(struct totem_config *totem_config)
{
	/* Events of interest for the two prefix-based trackers */
	const int32_t prefix_events =
		ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX;
	icmap_track_t track_handle;

	icmap_track_add("totem.",
		prefix_events,
		totem_change_notify,
		totem_config,
		&track_handle);

	icmap_track_add("config.reload_in_progress",
		ICMAP_TRACK_ADD | ICMAP_TRACK_MODIFY,
		totem_reload_notify,
		totem_config,
		&track_handle);

	icmap_track_add("nodelist.node.",
		prefix_events,
		nodelist_dynamic_notify,
		(void *)totem_config,
		&track_handle);
}
diff --git a/exec/totemip.c b/exec/totemip.c
index 1eb0889c..c0f753d1 100644
--- a/exec/totemip.c
+++ b/exec/totemip.c
@@ -1,533 +1,521 @@
/*
* Copyright (c) 2005-2011 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Patrick Caulfield (pcaulfie@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
/* IPv4/6 abstraction */
#include <config.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <net/if.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include <ifaddrs.h>
#include <corosync/totem/totemip.h>
#include <corosync/swab.h>
#define LOCALHOST_IPV4 "127.0.0.1"
#define LOCALHOST_IPV6 "::1"
#define NETLINK_BUFSIZE 16384
#ifdef SO_NOSIGPIPE
/*
 * Set the SO_NOSIGPIPE socket option on descriptor s.
 * Only compiled when the platform defines SO_NOSIGPIPE; the setsockopt()
 * result is not checked.
 */
void totemip_nosigpipe(int s)
{
	int enable = 1;

	setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, (void *)&enable, sizeof(enable));
}
#endif
/*
 * Compare two addresses.
 *
 * Returns 1 when both addresses have the same family and the same address
 * bytes (sizeof(struct in_addr) bytes for AF_INET, sizeof(struct in6_addr)
 * bytes for AF_INET6), 0 otherwise. The nodeid field is not compared.
 * Asserts when the common family is neither AF_INET nor AF_INET6.
 */
int totemip_equal(const struct totem_ip_address *addr1,
	const struct totem_ip_address *addr2)
{
	int cmp_len;

	if (addr1->family != addr2->family) {
		return 0;
	}

	switch (addr1->family) {
	case AF_INET:
		cmp_len = sizeof(struct in_addr);
		break;
	case AF_INET6:
		cmp_len = sizeof(struct in6_addr);
		break;
	default:
		cmp_len = 0;
		break;
	}
	assert(cmp_len);

	return (memcmp(addr1->addr, addr2->addr, cmp_len) == 0) ? 1 : 0;
}
/*
 * Copy a totem_ip_address: addr1 is the destination, addr2 the source.
 * The whole structure is copied byte for byte.
 */
void totemip_copy(struct totem_ip_address *addr1,
	const struct totem_ip_address *addr2)
{
	memcpy(addr1, addr2, sizeof(*addr1));
}
/*
 * Copy addr2 into addr1 while byte-swapping the integer fields:
 * nodeid is swapped as 32 bits, family as 16 bits. The TOTEMIP_ADDRLEN
 * address bytes are copied unchanged.
 */
void totemip_copy_endian_convert(struct totem_ip_address *addr1,
	const struct totem_ip_address *addr2)
{
	addr1->nodeid = swab32(addr2->nodeid);
	addr1->family = swab16(addr2->family);

	/* Raw address bytes are copied verbatim, no swapping. */
	memcpy(addr1->addr, addr2->addr, TOTEMIP_ADDRLEN);
}
/*
 * Multicast address range is 224.0.0.0 to 239.255.255.255, i.e. the
 * first 4 bits of the address are 1110 (0xE).
 * http://en.wikipedia.org/wiki/Multicast_address
 *
 * Returns -1 for an AF_INET address outside that range and 0 otherwise.
 * Addresses of any other family are not inspected and yield 0.
 */
int32_t totemip_is_mcast(struct totem_ip_address *ip_addr)
{
	uint32_t raw = 0;
	uint32_t host_order;

	if (ip_addr->family != AF_INET) {
		return 0;
	}

	/* memcpy instead of a cast: addr is a byte array */
	memcpy (&raw, ip_addr->addr, sizeof (uint32_t));
	host_order = ntohl(raw);

	return ((host_order >> 28) == 0xE) ? 0 : -1;
}
/*
 * Ordering function for totem_ip_address; parameters are void * so it can
 * be handed to qsort.
 *
 * The family is taken from the first argument only. AF_INET addresses are
 * ordered by their 32-bit value after a byte-order conversion, AF_INET6
 * addresses byte by byte. Any other family trips an assert.
 */
int totemip_compare(const void *a, const void *b)
{
	const struct totem_ip_address *lhs = (const struct totem_ip_address *)a;
	const struct totem_ip_address *rhs = (const struct totem_ip_address *)b;
	unsigned short family;

	/*
	 * Use memcpy to align since totem_ip_address is unaligned on various archs
	 */
	memcpy (&family, &lhs->family, sizeof (unsigned short));

	if (family == AF_INET) {
		struct in_addr v4_lhs;
		struct in_addr v4_rhs;

		memcpy (&v4_lhs, lhs->addr, sizeof (struct in_addr));
		memcpy (&v4_rhs, rhs->addr, sizeof (struct in_addr));

		if (v4_lhs.s_addr == v4_rhs.s_addr) {
			return (0);
		}
		return (htonl(v4_lhs.s_addr) < htonl(v4_rhs.s_addr)) ? -1 : +1;
	}

	if (family == AF_INET6) {
		struct in6_addr v6_lhs;
		struct in6_addr v6_rhs;
		int idx;

		/*
		 * We can only compare 8 bits at time for portability reasons
		 */
		memcpy (&v6_lhs, lhs->addr, sizeof (struct in6_addr));
		memcpy (&v6_rhs, rhs->addr, sizeof (struct in6_addr));

		for (idx = 0; idx < 16; idx++) {
			int diff = v6_lhs.s6_addr[idx] - v6_rhs.s6_addr[idx];

			if (diff) {
				return diff;
			}
		}
		return 0;
	}

	/*
	 * Family not set, should be!
	 */
	assert (0);
	return 0;
}
/*
 * Build a localhost totem_ip_address.
 *
 * The structure is zeroed first. For AF_INET the text "127.0.0.1" is parsed
 * into both the nodeid field and the address; for every other family value
 * the text "::1" is parsed into the address using that family.
 * Returns 0 on success, -1 when inet_pton() does not succeed.
 */
int totemip_localhost(int family, struct totem_ip_address *localhost)
{
	const char *loopback_text =
		(family == AF_INET) ? LOCALHOST_IPV4 : LOCALHOST_IPV6;

	memset (localhost, 0, sizeof (struct totem_ip_address));

	/* IPv4 only: nodeid receives the parsed loopback address as well */
	if (family == AF_INET &&
	    inet_pton(family, loopback_text, (char *)&localhost->nodeid) <= 0) {
		return -1;
	}

	if (inet_pton(family, loopback_text, (char *)localhost->addr) <= 0) {
		return -1;
	}

	localhost->family = family;

	return 0;
}
int totemip_localhost_check(const struct totem_ip_address *addr)
{
struct totem_ip_address localhost;
if (totemip_localhost(addr->family, &localhost))
return 0;
return totemip_equal(addr, &localhost);
}
/*
 * Format the address part of an AF_INET / AF_INET6 sockaddr as text.
 *
 * Returns a pointer to a static buffer that is overwritten by the next call,
 * or NULL when the address family is neither AF_INET nor AF_INET6.
 */
const char *totemip_sa_print(const struct sockaddr *sa)
{
	static char buf[INET6_ADDRSTRLEN];
	const void *raw_addr;

	buf[0] = 0;

	if (sa->sa_family == AF_INET) {
		raw_addr = &((struct sockaddr_in *)(sa))->sin_addr;
	} else if (sa->sa_family == AF_INET6) {
		raw_addr = &((struct sockaddr_in6 *)(sa))->sin6_addr;
	} else {
		return (NULL);
	}

	inet_ntop(sa->sa_family, raw_addr, buf, INET6_ADDRSTRLEN);

	return (buf);
}
/*
 * Format addr as text using its own family.
 * Returns whatever inet_ntop() returns; the text lives in a static buffer
 * that is overwritten by the next call.
 */
const char *totemip_print(const struct totem_ip_address *addr)
{
	static char buf[INET6_ADDRSTRLEN];
	const char *text;

	text = inet_ntop(addr->family, addr->addr, buf, sizeof(buf));

	return (text);
}
/*
 * Make a totem_ip_address into a usable sockaddr_storage.
 *
 * For AF_INET / AF_INET6 the matching sockaddr_in / sockaddr_in6 is zeroed
 * and filled inside saddr, and *addrlen receives its size. The port is
 * stored after a 16-bit byte-order conversion. For AF_INET6 the scope id is
 * set to the fixed value 2.
 * Returns 0 on success, -1 (saddr and addrlen untouched) for other families.
 */
int totemip_totemip_to_sockaddr_convert(struct totem_ip_address *ip_addr,
	uint16_t port, struct sockaddr_storage *saddr, int *addrlen)
{
	if (ip_addr->family == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)saddr;

		memset(v4, 0, sizeof(struct sockaddr_in));
#ifdef HAVE_SOCK_SIN_LEN
		v4->sin_len = sizeof(struct sockaddr_in);
#endif
		v4->sin_family = ip_addr->family;
		v4->sin_port = ntohs(port);
		memcpy(&v4->sin_addr, ip_addr->addr, sizeof(struct in_addr));
		*addrlen = sizeof(struct sockaddr_in);
		return 0;
	}

	if (ip_addr->family == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)saddr;

		memset(v6, 0, sizeof(struct sockaddr_in6));
#ifdef HAVE_SOCK_SIN6_LEN
		v6->sin6_len = sizeof(struct sockaddr_in6);
#endif
		v6->sin6_family = ip_addr->family;
		v6->sin6_port = ntohs(port);
		v6->sin6_scope_id = 2;
		memcpy(&v6->sin6_addr, ip_addr->addr, sizeof(struct in6_addr));
		*addrlen = sizeof(struct sockaddr_in6);
		return 0;
	}

	return -1;
}
/*
 * Converts an address string into a totem_ip_address.
 * family can be AF_INET, AF_INET6 or 0 (for "don't care").
 *
 * The string is resolved with getaddrinfo() and only the first result is
 * used. Only the family and addr fields of totemip are written.
 * Returns 0 on success, -1 when the lookup fails.
 */
int totemip_parse(struct totem_ip_address *totemip, const char *addr, int family)
{
	struct addrinfo hints;
	struct addrinfo *result;

	memset(&hints, 0, sizeof(hints));
	hints.ai_socktype = SOCK_DGRAM;
	hints.ai_protocol = IPPROTO_UDP;
	hints.ai_family   = family;

	/* Lookup the nodename address */
	if (getaddrinfo(addr, NULL, &hints, &result) != 0) {
		return -1;
	}

	totemip->family = result->ai_family;

	if (result->ai_family == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)result->ai_addr;

		memcpy(totemip->addr, &v4->sin_addr, sizeof(struct in_addr));
	} else {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)result->ai_addr;

		memcpy(totemip->addr, &v6->sin6_addr, sizeof(struct in6_addr));
	}

	freeaddrinfo(result);

	return 0;
}
/*
 * Make a sockaddr_* into a totem_ip_address.
 *
 * family is always copied from saddr and nodeid is always reset to 0.
 * The address bytes are copied only for AF_INET and AF_INET6.
 * Returns 0 for those two families, -1 for anything else.
 */
int totemip_sockaddr_to_totemip_convert(const struct sockaddr_storage *saddr,
	struct totem_ip_address *ip_addr)
{
	ip_addr->family = saddr->ss_family;
	ip_addr->nodeid = 0;

	switch (saddr->ss_family) {
	case AF_INET: {
		const struct sockaddr_in *v4 = (const struct sockaddr_in *)saddr;

		memcpy(ip_addr->addr, &v4->sin_addr, sizeof(struct in_addr));
		return 0;
	}
	case AF_INET6: {
		const struct sockaddr_in6 *v6 = (const struct sockaddr_in6 *)saddr;

		memcpy(ip_addr->addr, &v6->sin6_addr, sizeof(struct in6_addr));
		return 0;
	}
	default:
		return -1;
	}
}
/*
 * Build a list of the local IPv4/IPv6 interface addresses.
 *
 * addrs is initialised and then filled with one heap-allocated
 * struct totem_ip_if_address per usable getifaddrs() entry (entries without
 * address or netmask, or with an unsupported family, are skipped). Release
 * the list with totemip_freeifaddrs().
 *
 * Returns 0 on success. Returns -1 when getifaddrs() fails (addrs is left
 * untouched) or when building an entry fails (entries collected so far are
 * freed and addrs is left empty).
 */
int totemip_getifaddrs(struct qb_list_head *addrs)
{
	struct ifaddrs *ifap, *ifa;
	struct totem_ip_if_address *if_addr;

	if (getifaddrs(&ifap) != 0)
		return (-1);

	qb_list_init(addrs);

	for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
		if (ifa->ifa_addr == NULL || ifa->ifa_netmask == NULL)
			continue ;

		if ((ifa->ifa_addr->sa_family != AF_INET && ifa->ifa_addr->sa_family != AF_INET6) ||
		    (ifa->ifa_netmask->sa_family != AF_INET && ifa->ifa_netmask->sa_family != AF_INET6 &&
		     ifa->ifa_netmask->sa_family != 0))
			continue ;

		/* A netmask family of 0 is treated as "same family as the address" */
		if (ifa->ifa_netmask->sa_family == 0) {
			ifa->ifa_netmask->sa_family = ifa->ifa_addr->sa_family;
		}

		/*
		 * Zero the entry first and only then set up its list head; the
		 * previous order (init, then memset) wiped the list head again.
		 */
		if_addr = calloc(1, sizeof(struct totem_ip_if_address));
		if (if_addr == NULL) {
			goto error_free_ifaddrs;
		}
		qb_list_init(&if_addr->list);

		if_addr->interface_up = ifa->ifa_flags & IFF_UP;
		if_addr->interface_num = if_nametoindex(ifa->ifa_name);
		if_addr->name = strdup(ifa->ifa_name);
		if (if_addr->name == NULL) {
			goto error_free_addr;
		}

		if (totemip_sockaddr_to_totemip_convert((const struct sockaddr_storage *)ifa->ifa_addr,
		    &if_addr->ip_addr) == -1) {
			goto error_free_addr_name;
		}

		if (totemip_sockaddr_to_totemip_convert((const struct sockaddr_storage *)ifa->ifa_netmask,
		    &if_addr->mask_addr) == -1) {
			goto error_free_addr_name;
		}

		qb_list_add_tail(&if_addr->list, addrs);
	}

	freeifaddrs(ifap);

	return (0);

error_free_addr_name:
	free(if_addr->name);

error_free_addr:
	free(if_addr);

error_free_ifaddrs:
	totemip_freeifaddrs(addrs);
	freeifaddrs(ifap);
	return (-1);
}
/*
 * Free every struct totem_ip_if_address linked on addrs (including its name
 * string) and leave addrs as an initialised, empty list.
 */
void totemip_freeifaddrs(struct qb_list_head *addrs)
{
	struct qb_list_head *cursor, *next;
	struct totem_ip_if_address *entry;

	/* "safe" iteration: the current entry is unlinked and freed in the body */
	qb_list_for_each_safe(cursor, next, addrs) {
		entry = qb_list_entry(cursor, struct totem_ip_if_address, list);

		free(entry->name);
		qb_list_del(&entry->list);
		free(entry);
	}

	qb_list_init(addrs);
}
/*
 * Find the local interface address that corresponds to bindnet.
 *
 * Every address returned by totemip_getifaddrs() with the same family as
 * bindnet is examined. An address identical to bindnet is an exact match;
 * otherwise bindnet and the interface address are both ANDed with the
 * interface netmask and compared (network match). The first network match,
 * or any exact match, is copied to boundto (with nodeid taken from bindnet)
 * and its up flag / index are stored in *interface_up / *interface_num.
 * An exact match ends the search immediately.
 *
 * Returns 0 when a match was found, -1 when none was found or the interface
 * list could not be obtained. *interface_up and *interface_num are reset to
 * 0 on entry.
 *
 * NOTE(review): the lines prefixed with '-' below are lines this diff
 * removes; mask_high_bit is referenced only there.
 */
int totemip_iface_check(struct totem_ip_address *bindnet,
struct totem_ip_address *boundto,
int *interface_up,
int *interface_num,
int mask_high_bit)
{
struct qb_list_head addrs;
struct qb_list_head *list;
struct totem_ip_if_address *if_addr;
struct totem_ip_address bn_netaddr, if_netaddr;
socklen_t addr_len;
socklen_t si;
int res = -1;
int exact_match_found = 0;
int net_match_found = 0;
*interface_up = 0;
*interface_num = 0;
if (totemip_getifaddrs(&addrs) == -1) {
return (-1);
}
qb_list_for_each(list, &addrs) {
if_addr = qb_list_entry(list, struct totem_ip_if_address, list);
if (bindnet->family != if_addr->ip_addr.family)
continue ;
/* Number of address bytes to mask below; 0 means unsupported family */
addr_len = 0;
switch (bindnet->family) {
case AF_INET:
addr_len = sizeof(struct in_addr);
break;
case AF_INET6:
addr_len = sizeof(struct in6_addr);
break;
}
if (addr_len == 0)
continue ;
totemip_copy(&bn_netaddr, bindnet);
totemip_copy(&if_netaddr, &if_addr->ip_addr);
/* Exact match is tested on the unmasked addresses */
if (totemip_equal(&bn_netaddr, &if_netaddr)) {
exact_match_found = 1;
}
/* Reduce both copies to their network part using the interface netmask */
for (si = 0; si < addr_len; si++) {
bn_netaddr.addr[si] = bn_netaddr.addr[si] & if_addr->mask_addr.addr[si];
if_netaddr.addr[si] = if_netaddr.addr[si] & if_addr->mask_addr.addr[si];
}
/* Take an exact match always; take a network match only if it is the first one */
if (exact_match_found || (!net_match_found && totemip_equal(&bn_netaddr, &if_netaddr))) {
totemip_copy(boundto, &if_addr->ip_addr);
boundto->nodeid = bindnet->nodeid;
*interface_up = if_addr->interface_up;
*interface_num = if_addr->interface_num;
- if (boundto->family == AF_INET && boundto->nodeid == 0) {
- unsigned int nodeid = 0;
- memcpy (&nodeid, boundto->addr, sizeof (int));
-#if __BYTE_ORDER == __LITTLE_ENDIAN
- nodeid = swab32 (nodeid);
-#endif
- if (mask_high_bit) {
- nodeid &= 0x7FFFFFFF;
- }
- boundto->nodeid = nodeid;
- }
-
net_match_found = 1;
res = 0;
if (exact_match_found) {
goto finished;
}
}
}
finished:
totemip_freeifaddrs(&addrs);
return (res);
}
#define TOTEMIP_UDP_HEADER_SIZE 8
#define TOTEMIP_IPV4_HEADER_SIZE 20
#define TOTEMIP_IPV6_HEADER_SIZE 40

/*
 * Return the combined size in bytes of the UDP header and the IP header
 * for the given address family (AF_INET or AF_INET6), using the fixed
 * sizes defined above. Any other family yields 0.
 */
size_t totemip_udpip_header_size(int family)
{
	if (family == AF_INET) {
		return (TOTEMIP_UDP_HEADER_SIZE + TOTEMIP_IPV4_HEADER_SIZE);
	}

	if (family == AF_INET6) {
		return (TOTEMIP_UDP_HEADER_SIZE + TOTEMIP_IPV6_HEADER_SIZE);
	}

	return (0);
}
diff --git a/exec/votequorum.c b/exec/votequorum.c
index 246c5e0f..b105e7cc 100644
--- a/exec/votequorum.c
+++ b/exec/votequorum.c
@@ -1,3041 +1,3047 @@
/*
* Copyright (c) 2009-2015 Red Hat, Inc.
*
* All rights reserved.
*
* Authors: Christine Caulfield (ccaulfie@redhat.com)
* Fabio M. Di Nitto (fdinitto@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <config.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <qb/qblist.h>
#include <qb/qbipc_common.h>
#include "quorum.h"
#include <corosync/corodefs.h>
#include <corosync/logsys.h>
#include <corosync/coroapi.h>
#include <corosync/icmap.h>
#include <corosync/votequorum.h>
#include <corosync/ipc_votequorum.h>
#include "service.h"
#include "util.h"
LOGSYS_DECLARE_SUBSYS ("VOTEQ");
/*
* interface with corosync
*/
static struct corosync_api_v1 *corosync_api;
/*
* votequorum global config vars
*/
static char qdevice_name[VOTEQUORUM_QDEVICE_MAX_NAME_LEN];
/* Node entry returned by find_node_by_nodeid() for VOTEQUORUM_QDEVICE_NODEID */
static struct cluster_node *qdevice = NULL;
static unsigned int qdevice_timeout = VOTEQUORUM_QDEVICE_DEFAULT_TIMEOUT;
static unsigned int qdevice_sync_timeout = VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT;
/* Written by update_qdevice_can_operate(), mirrored to runtime.votequorum.qdevice_can_operate */
static uint8_t qdevice_can_operate = 1;
static void *qdevice_reg_conn = NULL;
/* Written by update_qdevice_master_wins(), mirrored to runtime.votequorum.qdevice_master_wins */
static uint8_t qdevice_master_wins = 0;
/* Published to runtime.votequorum.two_node by update_two_node() */
static uint8_t two_node = 0;
static uint8_t wait_for_all = 0;
/* Written by update_wait_for_all_status(), which also mirrors it into us->flags */
static uint8_t wait_for_all_status = 0;
/* auto_tie_breaker is (re)computed by parse_atb_string() */
static enum {ATB_NONE, ATB_LOWEST, ATB_HIGHEST, ATB_LIST} auto_tie_breaker = ATB_NONE, initial_auto_tie_breaker = ATB_NONE;
/* Recomputed by get_lowest_node_id() / get_highest_node_id() */
static int lowest_node_id = -1;
static int highest_node_id = -1;
#define DEFAULT_LMS_WIN 10000
static uint8_t last_man_standing = 0;
static uint32_t last_man_standing_window = DEFAULT_LMS_WIN;
static uint8_t allow_downscale = 0;
/* Written by update_ev_barrier(), mirrored to runtime.votequorum.ev_barrier */
static uint32_t ev_barrier = 0;
static uint8_t ev_tracking = 0;
/* Loaded from / stored to the "ev_tracking" file, see load_ev_tracking_barrier() */
static uint32_t ev_tracking_barrier = 0;
/* Descriptor used for the "ev_tracking" file */
static int ev_tracking_fd = -1;
/*
* votequorum_exec defines/structs/forward definitions
*/
/*
 * All request structs below are declared packed and start with a
 * qb_ipc_request_header aligned to 8 bytes. Integer fields have fixed
 * widths; do not reorder or resize members.
 */
struct req_exec_quorum_nodeinfo {
struct qb_ipc_request_header header __attribute__((aligned(8)));
uint32_t nodeid;
uint32_t votes;
uint32_t expected_votes;
/* presumably a bit mask of NODE_FLAGS_* values -- TODO confirm against the handler */
uint32_t flags;
} __attribute__((packed));
struct req_exec_quorum_reconfigure {
struct qb_ipc_request_header header __attribute__((aligned(8)));
uint32_t nodeid;
uint32_t value;
/* presumably one of VOTEQUORUM_RECONFIG_PARAM_* -- TODO confirm against the handler */
uint8_t param;
/* three explicit padding bytes after the 1-byte param */
uint8_t _pad0;
uint8_t _pad1;
uint8_t _pad2;
} __attribute__((packed));
struct req_exec_quorum_qdevice_reg {
struct qb_ipc_request_header header __attribute__((aligned(8)));
/* presumably VOTEQUORUM_QDEVICE_OPERATION_REGISTER / _UNREGISTER -- TODO confirm */
uint32_t operation;
char qdevice_name[VOTEQUORUM_QDEVICE_MAX_NAME_LEN];
} __attribute__((packed));
struct req_exec_quorum_qdevice_reconfigure {
struct qb_ipc_request_header header __attribute__((aligned(8)));
char oldname[VOTEQUORUM_QDEVICE_MAX_NAME_LEN];
char newname[VOTEQUORUM_QDEVICE_MAX_NAME_LEN];
} __attribute__((packed));
/*
* votequorum_exec onwire version (via totem)
*/
#include "votequorum.h"
/*
* votequorum_exec onwire messages (via totem)
*/
#define MESSAGE_REQ_EXEC_VOTEQUORUM_NODEINFO 0
#define MESSAGE_REQ_EXEC_VOTEQUORUM_RECONFIGURE 1
#define MESSAGE_REQ_EXEC_VOTEQUORUM_QDEVICE_REG 2
#define MESSAGE_REQ_EXEC_VOTEQUORUM_QDEVICE_RECONFIGURE 3
static void votequorum_exec_send_expectedvotes_notification(void);
static int votequorum_exec_send_quorum_notification(void *conn, uint64_t context);
static int votequorum_exec_send_nodelist_notification(void *conn, uint64_t context);
#define VOTEQUORUM_RECONFIG_PARAM_EXPECTED_VOTES 1
#define VOTEQUORUM_RECONFIG_PARAM_NODE_VOTES 2
#define VOTEQUORUM_RECONFIG_PARAM_CANCEL_WFA 3
static int votequorum_exec_send_reconfigure(uint8_t param, unsigned int nodeid, uint32_t value);
/*
* used by req_exec_quorum_qdevice_reg
*/
#define VOTEQUORUM_QDEVICE_OPERATION_UNREGISTER 0
#define VOTEQUORUM_QDEVICE_OPERATION_REGISTER 1
/*
* votequorum internal node status/view
*/
#define NODE_FLAGS_QUORATE 1
#define NODE_FLAGS_LEAVING 2
#define NODE_FLAGS_WFASTATUS 4
#define NODE_FLAGS_FIRST 8
#define NODE_FLAGS_QDEVICE_REGISTERED 16
#define NODE_FLAGS_QDEVICE_ALIVE 32
#define NODE_FLAGS_QDEVICE_CAST_VOTE 64
#define NODE_FLAGS_QDEVICE_MASTER_WINS 128
/* State of a node entry; numbering starts at 1, so a zeroed entry has no valid state */
typedef enum {
NODESTATE_MEMBER=1,
NODESTATE_DEAD,
NODESTATE_LEAVING
} nodestate_t;
/* One entry per known node; entries are linked on cluster_members_list */
struct cluster_node {
int node_id;
nodestate_t state;
uint32_t votes;
uint32_t expected_votes;
/* bit mask of NODE_FLAGS_* values */
uint32_t flags;
/* linkage into cluster_members_list (see node_add_ordered()) */
struct qb_list_head list;
};
/*
* votequorum internal quorum status
*/
static uint8_t quorum;
static uint8_t cluster_is_quorate;
/*
* votequorum membership data
*/
/* Entry of the local node; find_node_by_nodeid() returns it for us->node_id */
static struct cluster_node *us;
/* List of cluster_node entries, linked through cluster_node.list */
static struct qb_list_head cluster_members_list;
/* Node id arrays consulted by check_auto_tie_breaker() for the ATB_LIST case */
static unsigned int quorum_members[PROCESSOR_COUNT_MAX];
static unsigned int previous_quorum_members[PROCESSOR_COUNT_MAX];
/* Filled by parse_atb_string() from the configured node id list */
static unsigned int atb_nodelist[PROCESSOR_COUNT_MAX];
/* Number of valid entries in the three arrays above */
static int quorum_members_entries = 0;
static int previous_quorum_members_entries = 0;
static int atb_nodelist_entries = 0;
static struct memb_ring_id quorum_ringid;
/*
* pre allocate all cluster_nodes + one for qdevice
*/
/* Static pool handed out by allocate_node(); cluster_nodes_entries counts used slots */
static struct cluster_node cluster_nodes[PROCESSOR_COUNT_MAX+2];
static int cluster_nodes_entries = 0;
/*
* votequorum tracking
*/
/*
 * Private data block whose size is registered as private_data_size in
 * votequorum_service_engine.
 */
struct quorum_pd {
unsigned char track_flags;
int tracking_enabled;
uint64_t tracking_context;
/* list linkage; presumably into trackers_list -- TODO confirm */
struct qb_list_head list;
void *conn;
};
static struct qb_list_head trackers_list;
/*
* votequorum timers
*/
static corosync_timer_handle_t qdevice_timer;
static int qdevice_timer_set = 0;
static corosync_timer_handle_t last_man_standing_timer;
static int last_man_standing_timer_set = 0;
static int sync_nodeinfo_sent = 0;
static int sync_wait_for_poll_or_timeout = 0;
/*
* Service Interfaces required by service_message_handler struct
*/
static int sync_in_progress = 0;
static void votequorum_sync_init (
const unsigned int *trans_list,
size_t trans_list_entries,
const unsigned int *member_list,
size_t member_list_entries,
const struct memb_ring_id *ring_id);
static int votequorum_sync_process (void);
static void votequorum_sync_activate (void);
static void votequorum_sync_abort (void);
static quorum_set_quorate_fn_t quorum_callback;
/*
* votequorum_exec handler and definitions
*/
static char *votequorum_exec_init_fn (struct corosync_api_v1 *api);
static int votequorum_exec_exit_fn (void);
static int votequorum_exec_send_nodeinfo(uint32_t nodeid);
static void message_handler_req_exec_votequorum_nodeinfo (
const void *message,
unsigned int nodeid);
static void exec_votequorum_nodeinfo_endian_convert (void *message);
static void message_handler_req_exec_votequorum_reconfigure (
const void *message,
unsigned int nodeid);
static void exec_votequorum_reconfigure_endian_convert (void *message);
static void message_handler_req_exec_votequorum_qdevice_reg (
const void *message,
unsigned int nodeid);
static void exec_votequorum_qdevice_reg_endian_convert (void *message);
static void message_handler_req_exec_votequorum_qdevice_reconfigure (
const void *message,
unsigned int nodeid);
static void exec_votequorum_qdevice_reconfigure_endian_convert (void *message);
/*
 * Exec message dispatch table: each entry pairs a message handler with its
 * endian-conversion routine. The numbered positions line up with the
 * MESSAGE_REQ_EXEC_VOTEQUORUM_* ids defined above (0 = NODEINFO,
 * 1 = RECONFIGURE, 2 = QDEVICE_REG, 3 = QDEVICE_RECONFIGURE), so keep the
 * order in sync with those defines.
 */
static struct corosync_exec_handler votequorum_exec_engine[] =
{
{ /* 0 */
.exec_handler_fn = message_handler_req_exec_votequorum_nodeinfo,
.exec_endian_convert_fn = exec_votequorum_nodeinfo_endian_convert
},
{ /* 1 */
.exec_handler_fn = message_handler_req_exec_votequorum_reconfigure,
.exec_endian_convert_fn = exec_votequorum_reconfigure_endian_convert
},
{ /* 2 */
.exec_handler_fn = message_handler_req_exec_votequorum_qdevice_reg,
.exec_endian_convert_fn = exec_votequorum_qdevice_reg_endian_convert
},
{ /* 3 */
.exec_handler_fn = message_handler_req_exec_votequorum_qdevice_reconfigure,
.exec_endian_convert_fn = exec_votequorum_qdevice_reconfigure_endian_convert
},
};
/*
* Library Handler and Functions Definitions
*/
static int quorum_lib_init_fn (void *conn);
static int quorum_lib_exit_fn (void *conn);
static void qdevice_timer_fn(void *arg);
static void message_handler_req_lib_votequorum_getinfo (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_setexpected (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_setvotes (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_trackstart (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_trackstop (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_qdevice_register (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_qdevice_unregister (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_qdevice_update (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_qdevice_poll (void *conn,
const void *message);
static void message_handler_req_lib_votequorum_qdevice_master_wins (void *conn,
const void *message);
/*
 * Library request dispatch table. The position of an entry (see the
 * numbered markers) is its request id -- presumably matching the ids in
 * ipc_votequorum.h, TODO confirm. No entry requires flow control.
 */
static struct corosync_lib_handler quorum_lib_service[] =
{
{ /* 0 */
.lib_handler_fn = message_handler_req_lib_votequorum_getinfo,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 1 */
.lib_handler_fn = message_handler_req_lib_votequorum_setexpected,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 2 */
.lib_handler_fn = message_handler_req_lib_votequorum_setvotes,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 3 */
.lib_handler_fn = message_handler_req_lib_votequorum_trackstart,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 4 */
.lib_handler_fn = message_handler_req_lib_votequorum_trackstop,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 5 */
.lib_handler_fn = message_handler_req_lib_votequorum_qdevice_register,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 6 */
.lib_handler_fn = message_handler_req_lib_votequorum_qdevice_unregister,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 7 */
.lib_handler_fn = message_handler_req_lib_votequorum_qdevice_update,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 8 */
.lib_handler_fn = message_handler_req_lib_votequorum_qdevice_poll,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
},
{ /* 9 */
.lib_handler_fn = message_handler_req_lib_votequorum_qdevice_master_wins,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED
}
};
/*
 * Service engine descriptor for votequorum: ties together the library
 * handler table, the exec handler table, the init/exit hooks and the four
 * sync callbacks declared above. Handed out by
 * votequorum_get_service_engine_ver0().
 */
static struct corosync_service_engine votequorum_service_engine = {
.name = "corosync vote quorum service v1.0",
.id = VOTEQUORUM_SERVICE,
.priority = 2,
/* per-connection private data is a struct quorum_pd */
.private_data_size = sizeof (struct quorum_pd),
.allow_inquorate = CS_LIB_ALLOW_INQUORATE,
.flow_control = COROSYNC_LIB_FLOW_CONTROL_REQUIRED,
.lib_init_fn = quorum_lib_init_fn,
.lib_exit_fn = quorum_lib_exit_fn,
.lib_engine = quorum_lib_service,
/* entry counts are derived from the table sizes */
.lib_engine_count = sizeof (quorum_lib_service) / sizeof (struct corosync_lib_handler),
.exec_init_fn = votequorum_exec_init_fn,
.exec_exit_fn = votequorum_exec_exit_fn,
.exec_engine = votequorum_exec_engine,
.exec_engine_count = sizeof (votequorum_exec_engine) / sizeof (struct corosync_exec_handler),
.sync_init = votequorum_sync_init,
.sync_process = votequorum_sync_process,
.sync_activate = votequorum_sync_activate,
.sync_abort = votequorum_sync_abort
};
/*
 * Return the address of the static votequorum service engine descriptor.
 */
struct corosync_service_engine *votequorum_get_service_engine_ver0 (void)
{
	struct corosync_service_engine *engine = &votequorum_service_engine;

	return (engine);
}
/*
 * Single-entry service description naming the votequorum service, its
 * version and the function that returns its service engine.
 */
static struct default_service votequorum_service[] = {
{
.name = "corosync_votequorum",
.ver = 0,
.loader = votequorum_get_service_engine_ver0
},
};
/*
* common/utility macros/functions
*/
/* Larger of a and b. Each argument appears twice in the expansion, so avoid arguments with side effects. */
#define max(a,b) (((a) > (b)) ? (a) : (b))
/*
 * Insert newnode into cluster_members_list, keeping the list sorted by
 * ascending node_id. A node whose id is not smaller than any existing id
 * is appended at the end.
 */
static void node_add_ordered(struct cluster_node *newnode)
{
	struct cluster_node *node;
	struct qb_list_head *tmp;

	ENTER();

	qb_list_for_each(tmp, &cluster_members_list) {
		node = qb_list_entry(tmp, struct cluster_node, list);
		if (newnode->node_id < node->node_id) {
			break;
		}
	}

	/*
	 * tmp now points at the first entry with a larger node_id or, when the
	 * loop ran to completion (empty list included), at the list head itself.
	 * Inserting before tmp therefore covers all cases. The previous code
	 * inserted before the *last visited node*, which put a new highest id
	 * in front of the current last entry instead of behind it.
	 */
	qb_list_add_tail(&newnode->list, tmp);

	LEAVE();
}
/*
 * Obtain a zeroed cluster_node for nodeid.
 *
 * Entries come from the static cluster_nodes pool while it has free slots;
 * afterwards an entry in NODESTATE_DEAD state is taken off
 * cluster_members_list and recycled. The entry is linked into
 * cluster_members_list (ordered by node id) unless nodeid is
 * VOTEQUORUM_QDEVICE_NODEID.
 *
 * Returns the entry, or NULL when the pool is exhausted and no dead entry
 * is available.
 */
static struct cluster_node *allocate_node(unsigned int nodeid)
{
	struct cluster_node *cl = NULL;
	struct cluster_node *candidate;
	struct qb_list_head *tmp;

	ENTER();

	if (cluster_nodes_entries <= PROCESSOR_COUNT_MAX + 1) {
		cl = &cluster_nodes[cluster_nodes_entries];
		cluster_nodes_entries++;
	} else {
		/*
		 * Only accept an entry that really is dead. The previous code left
		 * cl pointing at the last (live) entry when no dead one existed,
		 * then recycled it and unlinked the list head.
		 */
		qb_list_for_each(tmp, &cluster_members_list) {
			candidate = qb_list_entry(tmp, struct cluster_node, list);
			if (candidate->state == NODESTATE_DEAD) {
				cl = candidate;
				break;
			}
		}
		/*
		 * this should never happen
		 */
		if (!cl) {
			log_printf(LOGSYS_LEVEL_CRIT, "Unable to find memory for node %u data!!", nodeid);
			goto out;
		}
		qb_list_del(&cl->list);
	}

	memset(cl, 0, sizeof(struct cluster_node));
	cl->node_id = nodeid;
	if (nodeid != VOTEQUORUM_QDEVICE_NODEID) {
		node_add_ordered(cl);
	}

out:
	LEAVE();

	return cl;
}
/*
 * Look up the cluster_node for nodeid.
 *
 * The local node id resolves to "us" and VOTEQUORUM_QDEVICE_NODEID to
 * "qdevice" without touching the list; any other id is searched in
 * cluster_members_list. Returns NULL when the id is not found.
 */
static struct cluster_node *find_node_by_nodeid(unsigned int nodeid)
{
	struct cluster_node *found = NULL;
	struct cluster_node *candidate;
	struct qb_list_head *iter;

	ENTER();

	if (nodeid == us->node_id) {
		found = us;
	} else if (nodeid == VOTEQUORUM_QDEVICE_NODEID) {
		found = qdevice;
	} else {
		qb_list_for_each(iter, &cluster_members_list) {
			candidate = qb_list_entry(iter, struct cluster_node, list);
			if (candidate->node_id == nodeid) {
				found = candidate;
				break;
			}
		}
	}

	LEAVE();

	return found;
}
/*
 * Recompute lowest_node_id as the smallest node id among our own id and
 * all list entries in NODESTATE_MEMBER state, log it and publish it under
 * runtime.votequorum.lowest_node_id.
 */
static void get_lowest_node_id(void)
{
	struct qb_list_head *iter;
	struct cluster_node *member;

	ENTER();

	lowest_node_id = us->node_id;

	qb_list_for_each(iter, &cluster_members_list) {
		member = qb_list_entry(iter, struct cluster_node, list);
		if (member->state != NODESTATE_MEMBER) {
			continue;
		}
		if (member->node_id < lowest_node_id) {
			lowest_node_id = member->node_id;
		}
	}

	log_printf(LOGSYS_LEVEL_DEBUG, "lowest node id: %d us: %d", lowest_node_id, us->node_id);
	icmap_set_uint32("runtime.votequorum.lowest_node_id", lowest_node_id);

	LEAVE();
}
/*
 * Recompute highest_node_id as the largest node id among our own id and
 * all list entries in NODESTATE_MEMBER state, log it and publish it under
 * runtime.votequorum.highest_node_id.
 */
static void get_highest_node_id(void)
{
	struct qb_list_head *iter;
	struct cluster_node *member;

	ENTER();

	highest_node_id = us->node_id;

	qb_list_for_each(iter, &cluster_members_list) {
		member = qb_list_entry(iter, struct cluster_node, list);
		if (member->state != NODESTATE_MEMBER) {
			continue;
		}
		if (member->node_id > highest_node_id) {
			highest_node_id = member->node_id;
		}
	}

	log_printf(LOGSYS_LEVEL_DEBUG, "highest node id: %d us: %d", highest_node_id, us->node_id);
	icmap_set_uint32("runtime.votequorum.highest_node_id", highest_node_id);

	LEAVE();
}
/*
 * Return 1 when cluster_members_list contains an entry in
 * NODESTATE_MEMBER state whose id equals lowest_node_id, 0 otherwise.
 */
static int check_low_node_id_partition(void)
{
	struct qb_list_head *iter;
	struct cluster_node *member;
	int present = 0;

	ENTER();

	qb_list_for_each(iter, &cluster_members_list) {
		member = qb_list_entry(iter, struct cluster_node, list);
		if (member->state != NODESTATE_MEMBER) {
			continue;
		}
		if (member->node_id == lowest_node_id) {
			present = 1;
		}
	}

	LEAVE();

	return present;
}
/*
 * Return 1 when cluster_members_list contains an entry in
 * NODESTATE_MEMBER state whose id equals highest_node_id, 0 otherwise.
 */
static int check_high_node_id_partition(void)
{
	struct qb_list_head *iter;
	struct cluster_node *member;
	int present = 0;

	ENTER();

	qb_list_for_each(iter, &cluster_members_list) {
		member = qb_list_entry(iter, struct cluster_node, list);
		if (member->state != NODESTATE_MEMBER) {
			continue;
		}
		if (member->node_id == highest_node_id) {
			present = 1;
		}
	}

	LEAVE();

	return present;
}
/*
 * Linear search: return 1 when nodeid occurs in the first "entries"
 * elements of members, 0 otherwise.
 */
static int is_in_nodelist(int nodeid, unsigned int *members, int entries)
{
	int found = 0;
	int idx = 0;

	ENTER();

	while (idx < entries) {
		if (nodeid == members[idx]) {
			found = 1;
			break;
		}
		idx++;
	}

	LEAVE();

	return found;
}
/*
 * Decide whether the auto tie breaker allows this partition to be quorate.
 *
 * ATB_LOWEST and ATB_HIGHEST are special cases that simply ask whether the
 * lowest / highest node id is a member of our partition.
 *
 * For a list of tie-breaker nodes the algorithm is:
 * travel the auto_tie_breaker list in order; for the first node that IS in
 * our current partition, check whether any node earlier in the list was in
 * the 'previous' partition. If none was, we are safe to be quorate; if one
 * was, we cannot be, as we don't know if that node is up or down.
 * If no list node is in the current partition we are NOT quorate.
 * Obviously, if the first node of the list is in our partition we are
 * quorate.
 *
 * Returns 1 when quorum may be granted, 0 otherwise.
 */
static int check_auto_tie_breaker(void)
{
	int cur, earlier;
	int decision;

	ENTER();

	switch (auto_tie_breaker) {
	case ATB_LOWEST:
		decision = check_low_node_id_partition();
		log_printf(LOGSYS_LEVEL_DEBUG, "ATB_LOWEST decision: %d", decision);
		LEAVE();
		return decision;
	case ATB_HIGHEST:
		decision = check_high_node_id_partition();
		log_printf(LOGSYS_LEVEL_DEBUG, "ATB_HIGHEST decision: %d", decision);
		LEAVE();
		return decision;
	default:
		/* Assume ATB_LIST, we should never be called for ATB_NONE */
		break;
	}

	for (cur = 0; cur < atb_nodelist_entries; cur++) {
		if (!is_in_nodelist(atb_nodelist[cur], quorum_members, quorum_members_entries)) {
			continue;
		}

		/*
		 * Node is in our partition. If any of its predecessors was in the
		 * previous quorum partition it might now be in the 'other half'
		 * (we got this far without seeing it here), so we can't be quorate.
		 */
		for (earlier = 0; earlier < cur; earlier++) {
			if (is_in_nodelist(atb_nodelist[earlier], previous_quorum_members, previous_quorum_members_entries)) {
				log_printf(LOGSYS_LEVEL_DEBUG, "ATB_LIST found node %d in previous partition but not here, quorum denied", atb_nodelist[earlier]);
				LEAVE();
				return 0;
			}
		}

		/*
		 * None of the earlier list nodes were in the previous partition;
		 * if there are enough votes, we can be quorate.
		 */
		log_printf(LOGSYS_LEVEL_DEBUG, "ATB_LIST found node %d in current partition, we can be quorate", atb_nodelist[cur]);
		LEAVE();
		return 1;
	}

	log_printf(LOGSYS_LEVEL_DEBUG, "ATB_LIST found no list nodes in current partition, we cannot be quorate");
	LEAVE();
	return 0;
}
/*
* atb_string can be either:
* 'lowest'
* 'highest'
* a list of nodeids
*/
static void parse_atb_string(char *atb_string)
{
char *ptr;
long num;
ENTER();
auto_tie_breaker = ATB_NONE;
if (!strcmp(atb_string, "lowest"))
auto_tie_breaker = ATB_LOWEST;
if (!strcmp(atb_string, "highest"))
auto_tie_breaker = ATB_HIGHEST;
if (atoi(atb_string)) {
atb_nodelist_entries = 0;
ptr = atb_string;
do {
num = strtol(ptr, &ptr, 10);
if (num) {
log_printf(LOGSYS_LEVEL_DEBUG, "ATB nodelist[%d] = %d", atb_nodelist_entries, num);
atb_nodelist[atb_nodelist_entries++] = num;
}
} while (num);
if (atb_nodelist_entries) {
auto_tie_breaker = ATB_LIST;
}
}
icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker);
log_printf(LOGSYS_LEVEL_DEBUG, "ATB type = %d", auto_tie_breaker);
/* Make sure we got something */
if (auto_tie_breaker == ATB_NONE) {
log_printf(LOGSYS_LEVEL_WARNING, "auto_tie_breaker_nodes is not valid. It must be 'lowest', 'highest' or a space-separated list of node IDs. auto_tie_breaker is disabled");
auto_tie_breaker = ATB_NONE;
}
LEAVE();
}
static int check_qdevice_master(void)
{
struct cluster_node *node = NULL;
struct qb_list_head *tmp;
int found = 0;
ENTER();
qb_list_for_each(tmp, &cluster_members_list) {
node = qb_list_entry(tmp, struct cluster_node, list);
if ((node->state == NODESTATE_MEMBER) &&
(node->flags & NODE_FLAGS_QDEVICE_MASTER_WINS) &&
(node->flags & NODE_FLAGS_QDEVICE_CAST_VOTE)) {
found = 1;
}
}
LEAVE();
return found;
}
/*
 * Log, at debug level, a Yes/No line for each NODE_FLAGS_* bit in flags.
 */
static void decode_flags(uint32_t flags)
{
	const char *quorate      = (flags & NODE_FLAGS_QUORATE) ? "Yes" : "No";
	const char *leaving      = (flags & NODE_FLAGS_LEAVING) ? "Yes" : "No";
	const char *wfa_status   = (flags & NODE_FLAGS_WFASTATUS) ? "Yes" : "No";
	const char *first        = (flags & NODE_FLAGS_FIRST) ? "Yes" : "No";
	const char *qdev_reg     = (flags & NODE_FLAGS_QDEVICE_REGISTERED) ? "Yes" : "No";
	const char *qdev_alive   = (flags & NODE_FLAGS_QDEVICE_ALIVE) ? "Yes" : "No";
	const char *qdev_vote    = (flags & NODE_FLAGS_QDEVICE_CAST_VOTE) ? "Yes" : "No";
	const char *qdev_mwins   = (flags & NODE_FLAGS_QDEVICE_MASTER_WINS) ? "Yes" : "No";

	ENTER();

	log_printf(LOGSYS_LEVEL_DEBUG,
		   "flags: quorate: %s Leaving: %s WFA Status: %s First: %s Qdevice: %s QdeviceAlive: %s QdeviceCastVote: %s QdeviceMasterWins: %s",
		   quorate, leaving, wfa_status, first,
		   qdev_reg, qdev_alive, qdev_vote, qdev_mwins);

	LEAVE();
}
/*
 * load/save are copied almost pristine from totemsrp.c
 *
 * Load ev_tracking_barrier from the "ev_tracking" file in the run
 * directory. When the file cannot be opened or does not hold a full
 * uint32_t, ev_tracking_barrier is reset to 0 and the file is (re)created
 * with that value.
 *
 * Returns 0 when the value was read or the file could be created (a failed
 * write is only logged), -1 when the file could not be created.
 *
 * NOTE(review): every successful path closes the descriptor but leaves its
 * number in ev_tracking_fd, while update_ev_tracking_barrier() later
 * lseek()s on ev_tracking_fd -- confirm whether the file is meant to stay
 * open.
 */
static int load_ev_tracking_barrier(void)
{
	char filename[PATH_MAX];
	int res = 0;

	ENTER();

	snprintf(filename, sizeof(filename) - 1, "%s/ev_tracking", get_run_dir());

	/* First attempt: read the value from an existing file */
	ev_tracking_fd = open(filename, O_RDWR, 0700);
	if (ev_tracking_fd != -1) {
		res = read (ev_tracking_fd, &ev_tracking_barrier, sizeof(uint32_t));
		close(ev_tracking_fd);
		if (res == sizeof (uint32_t)) {
			LEAVE();
			return 0;
		}
	}

	/* Fallback: start from 0 and create the file */
	ev_tracking_barrier = 0;
	umask(0);
	ev_tracking_fd = open (filename, O_CREAT|O_RDWR, 0700);
	if (ev_tracking_fd == -1) {
		log_printf(LOGSYS_LEVEL_WARNING,
			   "Unable to create %s file", filename);
		LEAVE();
		return -1;
	}

	res = write (ev_tracking_fd, &ev_tracking_barrier, sizeof (uint32_t));
	if ((res == -1) || (res != sizeof (uint32_t))) {
		log_printf(LOGSYS_LEVEL_WARNING,
			   "Unable to write to %s", filename);
	}
	close(ev_tracking_fd);

	LEAVE();

	return 0;
}
/*
 * Record the new wait_for_all status, mirror it into our node's
 * NODE_FLAGS_WFASTATUS flag bit and publish it in icmap under
 * runtime.votequorum.wait_for_all_status.
 */
static void update_wait_for_all_status(uint8_t wfa_status)
{
	ENTER();

	wait_for_all_status = wfa_status;

	if (!wait_for_all_status) {
		us->flags &= ~NODE_FLAGS_WFASTATUS;
	} else {
		us->flags |= NODE_FLAGS_WFASTATUS;
	}

	icmap_set_uint8("runtime.votequorum.wait_for_all_status",
			wait_for_all_status);

	LEAVE();
}
/*
 * Publish the current two_node setting in icmap under
 * runtime.votequorum.two_node.
 */
static void update_two_node(void)
{
	ENTER();

	icmap_set_uint8("runtime.votequorum.two_node", two_node);

	LEAVE();
}
/*
 * Store expected_votes as the new ev_barrier and publish it in icmap
 * under runtime.votequorum.ev_barrier.
 */
static void update_ev_barrier(uint32_t expected_votes)
{
	ENTER();

	ev_barrier = expected_votes;

	icmap_set_uint32("runtime.votequorum.ev_barrier", ev_barrier);

	LEAVE();
}
/*
 * Store the qdevice_can_operate status and publish it in icmap under
 * runtime.votequorum.qdevice_can_operate.
 */
static void update_qdevice_can_operate(uint8_t status)
{
	ENTER();

	qdevice_can_operate = status;

	icmap_set_uint8("runtime.votequorum.qdevice_can_operate",
			qdevice_can_operate);

	LEAVE();
}
/*
 * Store the qdevice_master_wins setting and publish it in icmap under
 * runtime.votequorum.qdevice_master_wins.
 */
static void update_qdevice_master_wins(uint8_t allow)
{
	ENTER();

	qdevice_master_wins = allow;

	icmap_set_uint8("runtime.votequorum.qdevice_master_wins",
			qdevice_master_wins);

	LEAVE();
}
/*
 * Set ev_tracking_barrier to ev_t_barrier, publish it in icmap under
 * runtime.votequorum.ev_tracking_barrier and rewrite the value at offset 0
 * of the file behind ev_tracking_fd, followed by a data sync.
 *
 * Failures to seek or write are logged as warnings; nothing is returned.
 */
static void update_ev_tracking_barrier(uint32_t ev_t_barrier)
{
	int written;

	ENTER();

	ev_tracking_barrier = ev_t_barrier;
	icmap_set_uint32("runtime.votequorum.ev_tracking_barrier", ev_tracking_barrier);

	if (lseek (ev_tracking_fd, 0, SEEK_SET) == 0) {
		written = write (ev_tracking_fd, &ev_tracking_barrier, sizeof (uint32_t));
		if (written != sizeof (uint32_t)) {
			log_printf(LOGSYS_LEVEL_WARNING,
				   "Unable to update ev_tracking_barrier on disk data!!!");
		}

		/* the sync is attempted even when the write came up short */
#ifdef HAVE_FDATASYNC
		fdatasync(ev_tracking_fd);
#else
		fsync(ev_tracking_fd);
#endif
	} else {
		log_printf(LOGSYS_LEVEL_WARNING,
			   "Unable to update ev_tracking_barrier on disk data!!!");
	}

	LEAVE();
}
/*
* quorum calculation core bits
*/
static int calculate_quorum(int allow_decrease, unsigned int max_expected, unsigned int *ret_total_votes)
{
struct qb_list_head *nodelist;
struct cluster_node *node;
unsigned int total_votes = 0;
unsigned int highest_expected = 0;
unsigned int newquorum, q1, q2;
unsigned int total_nodes = 0;
ENTER();
if ((allow_downscale) && (allow_decrease) && (max_expected)) {
max_expected = max(ev_barrier, max_expected);
}
qb_list_for_each(nodelist, &cluster_members_list) {
node = qb_list_entry(nodelist, struct cluster_node, list);
log_printf(LOGSYS_LEVEL_DEBUG, "node %u state=%d, votes=%u, expected=%u",
node->node_id, node->state, node->votes, node->expected_votes);
if (node->state == NODESTATE_MEMBER) {
highest_expected = max(highest_expected, node->expected_votes);
total_votes += node->votes;
total_nodes++;
}
}
if (us->flags & NODE_FLAGS_QDEVICE_CAST_VOTE) {
log_printf(LOGSYS_LEVEL_DEBUG, "node 0 state=1, votes=%u", qdevice->votes);
total_votes += qdevice->votes;
total_nodes++;
}
if (max_expected > 0) {
highest_expected = max_expected;
}
/*
* This quorum calculation is taken from the OpenVMS Cluster Systems
* manual, but, then, you guessed that didn't you
*/
q1 = (highest_expected + 2) / 2;
q2 = (total_votes + 2) / 2;
newquorum = max(q1, q2);
/*
* Normally quorum never decreases but the system administrator can
* force it down by setting expected votes to a maximum value
*/
if (!allow_decrease) {
newquorum = max(quorum, newquorum);
}
/*
* The special two_node mode allows each of the two nodes to retain
* quorum if the other fails. Only one of the two should live past
* fencing (as both nodes try to fence each other in split-brain.)
* Also: if there are more than two nodes, force us inquorate to avoid
* any damage or confusion.
*/
if (two_node && total_nodes <= 2) {
newquorum = 1;
}
if (ret_total_votes) {
*ret_total_votes = total_votes;
}
LEAVE();
return newquorum;
}
/*
 * Set expected_votes of every node in NODESTATE_MEMBER to
 * new_expected_votes. A value of 0 means "leave everything untouched".
 */
static void update_node_expected_votes(int new_expected_votes)
{
	struct qb_list_head *iter;
	struct cluster_node *member;

	if (!new_expected_votes) {
		return;
	}

	qb_list_for_each(iter, &cluster_members_list) {
		member = qb_list_entry(iter, struct cluster_node, list);
		if (member->state != NODESTATE_MEMBER) {
			continue;
		}
		member->expected_votes = new_expected_votes;
	}
}
/*
 * Decide whether this node is quorate given total_votes and publish the
 * result.
 *
 * Steps, in order:
 *  - while wait_for_all is pending, stay inquorate until total_votes
 *    equals our expected_votes;
 *  - compare total_votes against the current quorum;
 *  - let auto_tie_breaker or a qdevice master_wins partition grant quorum;
 *  - update cluster_is_quorate, our NODE_FLAGS_QUORATE bit and the
 *    wait_for_all status;
 *  - on a quorate/inquorate transition, and only when no sync is in
 *    progress, invoke the quorum callback and notify trackers.
 */
static void are_we_quorate(unsigned int total_votes)
{
	int quorate;
	int quorum_change = 0;

	ENTER();

	/*
	 * wait for all nodes to show up before granting quorum
	 */
	if ((wait_for_all) && (wait_for_all_status)) {
		if (total_votes != us->expected_votes) {
			log_printf(LOGSYS_LEVEL_NOTICE,
				   "Waiting for all cluster members. "
				   "Current votes: %d expected_votes: %d",
				   total_votes, us->expected_votes);
			cluster_is_quorate = 0;
			/* keep ENTER/LEAVE tracing balanced on this early exit */
			LEAVE();
			return;
		}
		update_wait_for_all_status(0);
	}

	if (quorum > total_votes) {
		quorate = 0;
	} else {
		quorate = 1;
		get_lowest_node_id();
		get_highest_node_id();
	}

	if ((auto_tie_breaker != ATB_NONE) &&
	    /* Must be a half (or half-1) split */
	    (total_votes == (us->expected_votes / 2)) &&
	    /* If the 'other' partition in a split might have quorum then we can't run ATB */
	    (previous_quorum_members_entries - quorum_members_entries < quorum) &&
	    (check_auto_tie_breaker() == 1)) {
		quorate = 1;
	}

	if ((qdevice_master_wins) &&
	    (!quorate) &&
	    (check_qdevice_master() == 1)) {
		log_printf(LOGSYS_LEVEL_DEBUG, "node is quorate as part of master_wins partition");
		quorate = 1;
	}

	if (cluster_is_quorate && !quorate) {
		quorum_change = 1;
		log_printf(LOGSYS_LEVEL_DEBUG, "quorum lost, blocking activity");
	}
	if (!cluster_is_quorate && quorate) {
		quorum_change = 1;
		log_printf(LOGSYS_LEVEL_DEBUG, "quorum regained, resuming activity");
	}

	cluster_is_quorate = quorate;
	if (cluster_is_quorate) {
		us->flags |= NODE_FLAGS_QUORATE;
	} else {
		us->flags &= ~NODE_FLAGS_QUORATE;
	}

	/* wait_for_all is re-armed whenever quorum is not held */
	if (wait_for_all) {
		if (quorate) {
			update_wait_for_all_status(0);
		} else {
			update_wait_for_all_status(1);
		}
	}

	if ((quorum_change) &&
	    (sync_in_progress == 0)) {
		quorum_callback(quorum_members, quorum_members_entries,
				cluster_is_quorate, &quorum_ringid);
		votequorum_exec_send_quorum_notification(NULL, 0L);
	}

	LEAVE();
}
static void get_total_votes(unsigned int *totalvotes, unsigned int *current_members)
{
unsigned int total_votes = 0;
unsigned int cluster_members = 0;
struct qb_list_head *nodelist;
struct cluster_node *node;
ENTER();
qb_list_for_each(nodelist, &cluster_members_list) {
node = qb_list_entry(nodelist, struct cluster_node, list);
if (node->state == NODESTATE_MEMBER) {
cluster_members++;
total_votes += node->votes;
}
}
if (qdevice->votes) {
total_votes += qdevice->votes;
cluster_members++;
}
*totalvotes = total_votes;
*current_members = cluster_members;
LEAVE();
}
/*
* Recalculate cluster quorum, set quorate and notify changes
*/
/*
 * Recompute quorum and the quorate state.
 *
 * allow_decrease    passed through to calculate_quorum().
 * by_current_nodes  when non-zero, the current member count is passed to
 *                   calculate_quorum() as max_expected and also written to
 *                   every member's expected_votes; when zero, 0 is passed
 *                   for both, which leaves them untouched.
 */
static void recalculate_quorum(int allow_decrease, int by_current_nodes)
{
	unsigned int votes_now = 0;
	unsigned int members_now = 0;

	ENTER();

	get_total_votes(&votes_now, &members_now);
	if (!by_current_nodes) {
		members_now = 0;
	}

	log_printf(LOGSYS_LEVEL_DEBUG, "total_votes=%d, expected_votes=%d", votes_now, us->expected_votes);

	/*
	 * expected_votes never lags behind the votes actually present
	 */
	if (votes_now > us->expected_votes) {
		us->expected_votes = votes_now;
		votequorum_exec_send_expectedvotes_notification();
	}

	/* persist a new high-water mark when tracking is enabled */
	if ((ev_tracking) &&
	    (us->expected_votes > ev_tracking_barrier)) {
		update_ev_tracking_barrier(us->expected_votes);
	}

	quorum = calculate_quorum(allow_decrease, members_now, &votes_now);
	update_node_expected_votes(members_now);

	are_we_quorate(votes_now);

	LEAVE();
}
/*
* configuration bits and pieces
*/
/*
 * Read the nodelist from icmap and derive vote information from it.
 *
 * votes           out: quorum_votes of the entry whose position equals
 *                 nodelist.local_node_pos (only written when that entry is
 *                 encountered during iteration).
 * nodes           out: number of node entries counted.
 * expected_votes  out: sum of quorum_votes over the counted entries; an
 *                 entry without a quorum_votes key contributes 1.
 *
 * Returns 0 (leaving the out parameters untouched) when
 * nodelist.local_node_pos is not set, 1 otherwise.
 *
 * NOTE(review): this span still carries '-'/'+' diff markers from the
 * patch view it was captured from; the code lines are left exactly as
 * found.
 */
static int votequorum_read_nodelist_configuration(uint32_t *votes,
uint32_t *nodes,
uint32_t *expected_votes)
{
icmap_iter_t iter;
const char *iter_key;
char tmp_key[ICMAP_KEYNAME_MAXLEN];
- uint32_t our_pos, node_pos;
+ uint32_t our_pos, node_pos, last_node_pos=-1;
uint32_t nodecount = 0;
uint32_t nodelist_expected_votes = 0;
uint32_t node_votes = 0;
int res = 0;
ENTER();
if (icmap_get_uint32("nodelist.local_node_pos", &our_pos) != CS_OK) {
log_printf(LOGSYS_LEVEL_DEBUG,
"No nodelist defined or our node is not in the nodelist");
return 0;
}
iter = icmap_iter_init("nodelist.node.");
while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) {
res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key);
if (res != 2) {
continue;
}
- if (strcmp(tmp_key, "nodeid") != 0) {
+ /*
+ * If current node_pos is the same as the last_node_pos then skip it
+ * so we only do the code below once per node.
+ * (icmap keys are always in order)
+ */
+ if (last_node_pos == node_pos) {
continue;
}
+ last_node_pos = node_pos;
nodecount++;
snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.quorum_votes", node_pos);
if (icmap_get_uint32(tmp_key, &node_votes) != CS_OK) {
node_votes = 1;
}
nodelist_expected_votes = nodelist_expected_votes + node_votes;
if (node_pos == our_pos) {
*votes = node_votes;
}
}
*expected_votes = nodelist_expected_votes;
*nodes = nodecount;
icmap_iter_finalize(iter);
LEAVE();
return 1;
}
/*
 * Check whether a quorum device is configured, i.e. quorum.device.model
 * exists in icmap and is a non-empty string.
 *
 * When it is, this also:
 *  - stores quorum.device.votes in *qdevice_votes, or (uint32_t)-1 if the
 *    key is absent;
 *  - loads qdevice_timeout and qdevice_sync_timeout, falling back to their
 *    VOTEQUORUM_QDEVICE_DEFAULT_* values;
 *  - marks the qdevice as able to operate.
 *
 * Returns 1 when a device is configured, 0 otherwise.
 */
static int votequorum_qdevice_is_configured(uint32_t *qdevice_votes)
{
	char *model = NULL;
	int configured = 0;

	ENTER();

	if (icmap_get_string("quorum.device.model", &model) != CS_OK) {
		LEAVE();
		return configured;
	}

	if (strlen(model)) {
		if (icmap_get_uint32("quorum.device.votes", qdevice_votes) != CS_OK) {
			/* sentinel meaning "votes not specified" */
			*qdevice_votes = -1;
		}
		if (icmap_get_uint32("quorum.device.timeout", &qdevice_timeout) != CS_OK) {
			qdevice_timeout = VOTEQUORUM_QDEVICE_DEFAULT_TIMEOUT;
		}
		if (icmap_get_uint32("quorum.device.sync_timeout", &qdevice_sync_timeout) != CS_OK) {
			qdevice_sync_timeout = VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT;
		}
		update_qdevice_can_operate(1);
		configured = 1;
	}

	free(model);

	LEAVE();

	return configured;
}
#define VOTEQUORUM_READCONFIG_STARTUP 0
#define VOTEQUORUM_READCONFIG_RUNTIME 1
/*
 * Read and validate the votequorum configuration from icmap and apply it
 * to this node's state (us->votes, us->expected_votes, qdevice->votes,
 * two_node, wait_for_all status, ev_barrier).
 *
 * runtime  VOTEQUORUM_READCONFIG_STARTUP (0) or
 *          VOTEQUORUM_READCONFIG_RUNTIME (1).
 *          At startup the special features (allow_downscale, wait_for_all,
 *          last_man_standing, expected_votes_tracking, auto_tie_breaker)
 *          are read and configuration conflicts are returned as errors.
 *          At runtime those features are not re-read; two_node,
 *          expected_votes and auto_tie_breaker are reset to their
 *          defaults / initial value first, and most conflicts are only
 *          logged, typically disabling quorum device operations.
 *
 * Returns NULL on success or a pointer to a static error string. The
 * auto_tie_breaker + last_man_standing conflict on an odd expected-votes
 * count returns an error regardless of the runtime flag.
 */
static char *votequorum_readconfig(int runtime)
{
uint32_t node_votes = 0, qdevice_votes = 0;
uint32_t node_expected_votes = 0, expected_votes = 0;
uint32_t node_count = 0;
uint8_t atb = 0;
int have_nodelist, have_qdevice;
char *atb_string = NULL;
char *error = NULL;
ENTER();
log_printf(LOGSYS_LEVEL_DEBUG, "Reading configuration (runtime: %d)", runtime);
/*
* Set the few things we re-read at runtime back to their defaults
*/
if (runtime) {
two_node = 0;
expected_votes = 0;
/* auto_tie_breaker cannot be changed by config reload, but
* we automatically disable it on odd-sized clusters without
* wait_for_all.
* We may need to re-enable it when membership changes to ensure
* that auto_tie_breaker is consistent across all nodes */
auto_tie_breaker = initial_auto_tie_breaker;
icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker);
}
/*
* gather basic data here
*/
icmap_get_uint32("quorum.expected_votes", &expected_votes);
have_nodelist = votequorum_read_nodelist_configuration(&node_votes, &node_count, &node_expected_votes);
have_qdevice = votequorum_qdevice_is_configured(&qdevice_votes);
icmap_get_uint8("quorum.two_node", &two_node);
/*
* do config verification and enablement
*/
if ((!have_nodelist) && (!expected_votes)) {
if (!runtime) {
error = (char *)"configuration error: nodelist or quorum.expected_votes must be configured!";
} else {
log_printf(LOGSYS_LEVEL_CRIT, "configuration error: nodelist or quorum.expected_votes must be configured!");
log_printf(LOGSYS_LEVEL_CRIT, "will continue with current runtime data");
}
goto out;
}
/*
* two_node and qdevice are not compatible in the same config.
* try to make an educated guess of what to do
*/
if ((two_node) && (have_qdevice)) {
if (!runtime) {
error = (char *)"configuration error: two_node and quorum device cannot be configured at the same time!";
goto out;
} else {
log_printf(LOGSYS_LEVEL_CRIT, "configuration error: two_node and quorum device cannot be configured at the same time!");
if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
log_printf(LOGSYS_LEVEL_CRIT, "quorum device is registered, disabling two_node");
two_node = 0;
} else {
log_printf(LOGSYS_LEVEL_CRIT, "quorum device is not registered, allowing two_node");
update_qdevice_can_operate(0);
}
}
}
/*
* Enable special features
*/
if (!runtime) {
if (two_node) {
wait_for_all = 1;
}
icmap_get_uint8("quorum.allow_downscale", &allow_downscale);
icmap_get_uint8("quorum.wait_for_all", &wait_for_all);
icmap_get_uint8("quorum.last_man_standing", &last_man_standing);
icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window);
icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking);
icmap_get_uint8("quorum.auto_tie_breaker", &atb);
icmap_get_string("quorum.auto_tie_breaker_node", &atb_string);
/* auto_tie_breaker defaults to LOWEST */
if (atb) {
auto_tie_breaker = ATB_LOWEST;
icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker);
}
else {
auto_tie_breaker = ATB_NONE;
if (atb_string) {
log_printf(LOGSYS_LEVEL_WARNING,
"auto_tie_breaker_node: is meaningless if auto_tie_breaker is set to 0");
}
}
if (atb && atb_string) {
parse_atb_string(atb_string);
}
free(atb_string);
initial_auto_tie_breaker = auto_tie_breaker;
/* allow_downscale requires ev_tracking */
if (allow_downscale) {
ev_tracking = 1;
}
if (ev_tracking) {
if (load_ev_tracking_barrier() < 0) {
LEAVE();
return ((char *)"Unable to load ev_tracking file!");
}
update_ev_tracking_barrier(ev_tracking_barrier);
}
}
/* two_node and auto_tie_breaker are not compatible as two_node uses
* a fence race to decide quorum whereas ATB decides based on node id
*/
if (two_node && auto_tie_breaker != ATB_NONE) {
log_printf(LOGSYS_LEVEL_CRIT, "two_node and auto_tie_breaker are both specified but are not compatible.");
log_printf(LOGSYS_LEVEL_CRIT, "two_node has been disabled, please fix your corosync.conf");
two_node = 0;
}
/* If ATB is set and the cluster has an odd number of nodes then wait_for_all needs
* to be set so that an isolated half+1 without the tie breaker node
* does not have quorum on reboot.
*/
if ((auto_tie_breaker != ATB_NONE) && (node_expected_votes % 2) &&
(!wait_for_all)) {
if (last_man_standing) {
/* if LMS is set too, it's a fatal configuration error. We can't dictate to the user what
* they might want so we'll just quit.
*/
log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set, the cluster has an odd number of nodes\n");
log_printf(LOGSYS_LEVEL_CRIT, "and last_man_standing is also set. With this situation a better\n");
log_printf(LOGSYS_LEVEL_CRIT, "solution would be to disable LMS, leave ATB enabled, and also\n");
log_printf(LOGSYS_LEVEL_CRIT, "enable wait_for_all (mandatory for ATB in odd-numbered clusters).\n");
log_printf(LOGSYS_LEVEL_CRIT, "Due to this ambiguity, corosync will fail to start. Please fix your corosync.conf\n");
error = (char *)"configuration error: auto_tie_breaker & last_man_standing not available in odd sized cluster";
goto out;
}
else {
log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set and the cluster has an odd number of nodes.\n");
log_printf(LOGSYS_LEVEL_CRIT, "wait_for_all needs to be set for this configuration but it is missing\n");
log_printf(LOGSYS_LEVEL_CRIT, "Therefore auto_tie_breaker has been disabled. Please fix your corosync.conf\n");
auto_tie_breaker = ATB_NONE;
icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker);
}
}
/*
* quorum device is not compatible with last_man_standing and auto_tie_breaker
* neither lms or atb can be set at runtime, so there is no need to check for
* runtime incompatibilities, but qdevice can be configured _after_ LMS and ATB have
* been enabled at startup.
*/
if ((have_qdevice) && (last_man_standing)) {
if (!runtime) {
error = (char *)"configuration error: quorum.device is not compatible with last_man_standing";
goto out;
} else {
log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device is not compatible with last_man_standing");
log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations");
update_qdevice_can_operate(0);
}
}
if ((have_qdevice) && (auto_tie_breaker != ATB_NONE)) {
if (!runtime) {
error = (char *)"configuration error: quorum.device is not compatible with auto_tie_breaker";
goto out;
} else {
log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device is not compatible with auto_tie_breaker");
log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations");
update_qdevice_can_operate(0);
}
}
if ((have_qdevice) && (allow_downscale)) {
if (!runtime) {
error = (char *)"configuration error: quorum.device is not compatible with allow_downscale";
goto out;
} else {
log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device is not compatible with allow_downscale");
log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations");
update_qdevice_can_operate(0);
}
}
/*
* if user specifies quorum.expected_votes + quorum.device but NOT the device.votes
* we don't know what the quorum device should vote.
*/
if ((expected_votes) && (have_qdevice) && (qdevice_votes == -1)) {
if (!runtime) {
error = (char *)"configuration error: quorum.device.votes must be specified when quorum.expected_votes is set";
goto out;
} else {
log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device.votes must be specified when quorum.expected_votes is set");
log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations");
update_qdevice_can_operate(0);
}
}
/*
* if user specifies a node list with uneven votes and no device.votes
* we cannot autocalculate the votes
*/
if ((have_qdevice) &&
(qdevice_votes == -1) &&
(have_nodelist) &&
(node_count != node_expected_votes)) {
if (!runtime) {
error = (char *)"configuration error: quorum.device.votes must be specified when not all nodes votes 1";
goto out;
} else {
log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device.votes must be specified when not all nodes votes 1");
log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations");
update_qdevice_can_operate(0);
}
}
/*
* validate quorum device votes vs expected_votes
*/
if ((qdevice_votes > 0) && (expected_votes)) {
int delta = expected_votes - qdevice_votes;
if (delta < 2) {
if (!runtime) {
error = (char *)"configuration error: quorum.device.votes is too high or expected_votes is too low";
goto out;
} else {
log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device.votes is too high or expected_votes is too low");
log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations");
update_qdevice_can_operate(0);
}
}
}
/*
* automatically calculate device votes and adjust expected_votes from nodelist
*/
if ((have_qdevice) &&
(qdevice_votes == -1) &&
(!expected_votes) &&
(have_nodelist) &&
(node_count == node_expected_votes)) {
qdevice_votes = node_expected_votes - 1;
node_expected_votes = node_expected_votes + qdevice_votes;
}
/*
* set this node votes and expected_votes
*/
log_printf(LOGSYS_LEVEL_DEBUG, "ev_tracking=%d, ev_tracking_barrier = %d: expected_votes = %d\n", ev_tracking, ev_tracking_barrier, expected_votes);
if (ev_tracking) {
expected_votes = ev_tracking_barrier;
}
if (have_nodelist) {
us->votes = node_votes;
us->expected_votes = node_expected_votes;
} else {
us->votes = 1;
icmap_get_uint32("quorum.votes", &us->votes);
}
if (expected_votes) {
us->expected_votes = expected_votes;
}
/*
* set qdevice votes
*/
if (!have_qdevice) {
qdevice->votes = 0;
}
if (qdevice_votes != -1) {
qdevice->votes = qdevice_votes;
}
update_ev_barrier(us->expected_votes);
update_two_node();
if (wait_for_all) {
update_wait_for_all_status(1);
}
out:
LEAVE();
return error;
}
/*
 * icmap tracking callback for changes under "nodelist.", "quorum." and
 * config.totemconfig_reload_in_progress.
 *
 * - Does nothing while a full totemconfig reload is flagged as in
 *   progress.
 * - A change of quorum.cancel_wait_for_all to a value >= 1 resets the key
 *   to 0 and multicasts a CANCEL_WFA reconfigure message.
 * - Anything else re-reads the configuration in runtime mode, sends node
 *   info for this node and the qdevice, and multicasts reconfigure
 *   messages if our votes or expected_votes changed.
 *
 * Only key_name is used; event, new_val, old_val and user_data are
 * ignored.
 */
static void votequorum_refresh_config(
	int32_t event,
	const char *key_name,
	struct icmap_notify_value new_val,
	struct icmap_notify_value old_val,
	void *user_data)
{
	int old_votes, old_expected_votes;
	/*
	 * Both are initialised because the icmap lookups leave them untouched
	 * on failure (e.g. when the key has just been deleted).
	 */
	uint8_t reloading = 0;
	uint8_t cancel_wfa = 0;

	ENTER();

	/*
	 * If a full reload is in progress then don't do anything until it's done and
	 * can reconfigure it all atomically
	 */
	if (icmap_get_uint8("config.totemconfig_reload_in_progress", &reloading) == CS_OK && reloading) {
		LEAVE();
		return ;
	}

	icmap_get_uint8("quorum.cancel_wait_for_all", &cancel_wfa);
	if (strcmp(key_name, "quorum.cancel_wait_for_all") == 0 &&
	    cancel_wfa >= 1) {
	        icmap_set_uint8("quorum.cancel_wait_for_all", 0);
		if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_CANCEL_WFA,
						     us->node_id, 0)) {
			log_printf(LOGSYS_LEVEL_ERROR, "Failed to send Cancel WFA message to other nodes");
		}
		LEAVE();
		return;
	}

	old_votes = us->votes;
	old_expected_votes = us->expected_votes;

	/*
	 * Reload the configuration
	 */
	votequorum_readconfig(VOTEQUORUM_READCONFIG_RUNTIME);

	/*
	 * activate new config
	 */
	votequorum_exec_send_nodeinfo(us->node_id);
	votequorum_exec_send_nodeinfo(VOTEQUORUM_QDEVICE_NODEID);
	if (us->votes != old_votes) {
		if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_NODE_VOTES,
						     us->node_id, us->votes)) {
			log_printf(LOGSYS_LEVEL_ERROR, "Failed to send new votes message to other nodes");
		}
	}
	if (us->expected_votes != old_expected_votes) {
		if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_EXPECTED_VOTES,
						     us->node_id, us->expected_votes)) {
			log_printf(LOGSYS_LEVEL_ERROR, "Failed to send expected votes message to other nodes");
		}
	}

	LEAVE();
}
/*
 * Register votequorum_refresh_config() as icmap tracker for:
 *  - every key under "nodelist." (add/delete/modify),
 *  - every key under "quorum." (add/delete/modify),
 *  - config.totemconfig_reload_in_progress (add/modify).
 *
 * The returned track handles are not kept and the results of
 * icmap_track_add() are not checked.
 */
static void votequorum_exec_add_config_notification(void)
{
	icmap_track_t nodelist_track = NULL;
	icmap_track_t quorum_track = NULL;
	icmap_track_t reload_track = NULL;

	ENTER();

	icmap_track_add("nodelist.",
			ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX,
			votequorum_refresh_config,
			NULL,
			&nodelist_track);

	icmap_track_add("quorum.",
			ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX,
			votequorum_refresh_config,
			NULL,
			&quorum_track);

	icmap_track_add("config.totemconfig_reload_in_progress",
			ICMAP_TRACK_ADD | ICMAP_TRACK_MODIFY,
			votequorum_refresh_config,
			NULL,
			&reload_track);

	LEAVE();
}
/*
* votequorum_exec core
*/
static int votequorum_exec_send_reconfigure(uint8_t param, unsigned int nodeid, uint32_t value)
{
struct req_exec_quorum_reconfigure req_exec_quorum_reconfigure;
struct iovec iov[1];
int ret;
ENTER();
req_exec_quorum_reconfigure.nodeid = nodeid;
req_exec_quorum_reconfigure.value = value;
req_exec_quorum_reconfigure.param = param;
req_exec_quorum_reconfigure._pad0 = 0;
req_exec_quorum_reconfigure._pad1 = 0;
req_exec_quorum_reconfigure._pad2 = 0;
req_exec_quorum_reconfigure.header.id = SERVICE_ID_MAKE(VOTEQUORUM_SERVICE, MESSAGE_REQ_EXEC_VOTEQUORUM_RECONFIGURE);
req_exec_quorum_reconfigure.header.size = sizeof(req_exec_quorum_reconfigure);
iov[0].iov_base = (void *)&req_exec_quorum_reconfigure;
iov[0].iov_len = sizeof(req_exec_quorum_reconfigure);
ret = corosync_api->totem_mcast (iov, 1, TOTEM_AGREED);
LEAVE();
return ret;
}
/*
 * Multicast the votes, expected_votes and flags of the node identified by
 * nodeid to the cluster. For real nodes (anything but
 * VOTEQUORUM_QDEVICE_NODEID) the flags are also decoded to the debug log.
 *
 * Returns -1 when the node is unknown, otherwise the result of
 * totem_mcast().
 */
static int votequorum_exec_send_nodeinfo(uint32_t nodeid)
{
	struct req_exec_quorum_nodeinfo req_exec_quorum_nodeinfo;
	struct iovec iov[1];
	struct cluster_node *node;
	int ret;

	ENTER();

	node = find_node_by_nodeid(nodeid);
	if (!node) {
		/* keep ENTER/LEAVE tracing balanced on this early exit */
		LEAVE();
		return -1;
	}

	req_exec_quorum_nodeinfo.nodeid = nodeid;
	req_exec_quorum_nodeinfo.votes = node->votes;
	req_exec_quorum_nodeinfo.expected_votes = node->expected_votes;
	req_exec_quorum_nodeinfo.flags = node->flags;
	if (nodeid != VOTEQUORUM_QDEVICE_NODEID) {
		decode_flags(node->flags);
	}

	req_exec_quorum_nodeinfo.header.id = SERVICE_ID_MAKE(VOTEQUORUM_SERVICE, MESSAGE_REQ_EXEC_VOTEQUORUM_NODEINFO);
	req_exec_quorum_nodeinfo.header.size = sizeof(req_exec_quorum_nodeinfo);

	iov[0].iov_base = (void *)&req_exec_quorum_nodeinfo;
	iov[0].iov_len = sizeof(req_exec_quorum_nodeinfo);

	ret = corosync_api->totem_mcast (iov, 1, TOTEM_AGREED);

	LEAVE();

	return ret;
}
/*
 * Multicast a quorum device rename request (oldname -> newname) to the
 * cluster.
 *
 * The message is zeroed first so no uninitialised stack bytes are sent,
 * and each name is copied with a bound of its destination field; a name
 * that does not fit is truncated rather than overflowing the message.
 *
 * Returns the result of totem_mcast().
 */
static int votequorum_exec_send_qdevice_reconfigure(const char *oldname, const char *newname)
{
	struct req_exec_quorum_qdevice_reconfigure req_exec_quorum_qdevice_reconfigure;
	struct iovec iov[1];
	int ret;

	ENTER();

	memset(&req_exec_quorum_qdevice_reconfigure, 0, sizeof(req_exec_quorum_qdevice_reconfigure));

	req_exec_quorum_qdevice_reconfigure.header.id = SERVICE_ID_MAKE(VOTEQUORUM_SERVICE, MESSAGE_REQ_EXEC_VOTEQUORUM_QDEVICE_RECONFIGURE);
	req_exec_quorum_qdevice_reconfigure.header.size = sizeof(req_exec_quorum_qdevice_reconfigure);

	/* snprintf always NUL-terminates within the given size */
	snprintf(req_exec_quorum_qdevice_reconfigure.oldname,
		 sizeof(req_exec_quorum_qdevice_reconfigure.oldname), "%s", oldname);
	snprintf(req_exec_quorum_qdevice_reconfigure.newname,
		 sizeof(req_exec_quorum_qdevice_reconfigure.newname), "%s", newname);

	iov[0].iov_base = (void *)&req_exec_quorum_qdevice_reconfigure;
	iov[0].iov_len = sizeof(req_exec_quorum_qdevice_reconfigure);

	ret = corosync_api->totem_mcast (iov, 1, TOTEM_AGREED);

	LEAVE();

	return ret;
}
/*
 * Multicast a quorum device registration operation to the cluster.
 *
 * operation         VOTEQUORUM_QDEVICE_OPERATION_* value to send.
 * qdevice_name_req  name of the quorum device the operation refers to.
 *
 * The message is zeroed first so no uninitialised stack bytes are sent,
 * and the name is copied with a bound of its destination field; a name
 * that does not fit is truncated rather than overflowing the message.
 *
 * Returns the result of totem_mcast().
 */
static int votequorum_exec_send_qdevice_reg(uint32_t operation, const char *qdevice_name_req)
{
	struct req_exec_quorum_qdevice_reg req_exec_quorum_qdevice_reg;
	struct iovec iov[1];
	int ret;

	ENTER();

	memset(&req_exec_quorum_qdevice_reg, 0, sizeof(req_exec_quorum_qdevice_reg));

	req_exec_quorum_qdevice_reg.header.id = SERVICE_ID_MAKE(VOTEQUORUM_SERVICE, MESSAGE_REQ_EXEC_VOTEQUORUM_QDEVICE_REG);
	req_exec_quorum_qdevice_reg.header.size = sizeof(req_exec_quorum_qdevice_reg);
	req_exec_quorum_qdevice_reg.operation = operation;

	/* snprintf always NUL-terminates within the given size */
	snprintf(req_exec_quorum_qdevice_reg.qdevice_name,
		 sizeof(req_exec_quorum_qdevice_reg.qdevice_name), "%s", qdevice_name_req);

	iov[0].iov_base = (void *)&req_exec_quorum_qdevice_reg;
	iov[0].iov_len = sizeof(req_exec_quorum_qdevice_reg);

	ret = corosync_api->totem_mcast (iov, 1, TOTEM_AGREED);

	LEAVE();

	return ret;
}
static int votequorum_exec_send_quorum_notification(void *conn, uint64_t context)
{
struct res_lib_votequorum_quorum_notification *res_lib_votequorum_notification;
struct qb_list_head *tmp;
struct cluster_node *node;
int i = 0;
int cluster_members = 0;
int size;
char buf[sizeof(struct res_lib_votequorum_quorum_notification) + sizeof(struct votequorum_node) * (PROCESSOR_COUNT_MAX + 2)];
ENTER();
log_printf(LOGSYS_LEVEL_DEBUG, "Sending quorum callback, quorate = %d", cluster_is_quorate);
qb_list_for_each(tmp, &cluster_members_list) {
node = qb_list_entry(tmp, struct cluster_node, list);
cluster_members++;
}
if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
cluster_members++;
}
size = sizeof(struct res_lib_votequorum_quorum_notification) + sizeof(struct votequorum_node) * cluster_members;
res_lib_votequorum_notification = (struct res_lib_votequorum_quorum_notification *)&buf;
res_lib_votequorum_notification->quorate = cluster_is_quorate;
res_lib_votequorum_notification->context = context;
res_lib_votequorum_notification->node_list_entries = cluster_members;
res_lib_votequorum_notification->header.id = MESSAGE_RES_VOTEQUORUM_QUORUM_NOTIFICATION;
res_lib_votequorum_notification->header.size = size;
res_lib_votequorum_notification->header.error = CS_OK;
/* Send all known nodes and their states */
qb_list_for_each(tmp, &cluster_members_list) {
node = qb_list_entry(tmp, struct cluster_node, list);
res_lib_votequorum_notification->node_list[i].nodeid = node->node_id;
res_lib_votequorum_notification->node_list[i++].state = node->state;
}
if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
res_lib_votequorum_notification->node_list[i].nodeid = VOTEQUORUM_QDEVICE_NODEID;
res_lib_votequorum_notification->node_list[i++].state = qdevice->state;
}
/* Send it to all interested parties */
if (conn) {
int ret = corosync_api->ipc_dispatch_send(conn, &buf, size);
LEAVE();
return ret;
} else {
struct quorum_pd *qpd;
qb_list_for_each(tmp, &trackers_list) {
qpd = qb_list_entry(tmp, struct quorum_pd, list);
res_lib_votequorum_notification->context = qpd->tracking_context;
corosync_api->ipc_dispatch_send(qpd->conn, &buf, size);
}
}
LEAVE();
return 0;
}
static int votequorum_exec_send_nodelist_notification(void *conn, uint64_t context)
{
struct res_lib_votequorum_nodelist_notification *res_lib_votequorum_notification;
int i = 0;
int size;
struct qb_list_head *tmp;
char buf[sizeof(struct res_lib_votequorum_nodelist_notification) + sizeof(uint32_t) * quorum_members_entries];
ENTER();
log_printf(LOGSYS_LEVEL_DEBUG, "Sending nodelist callback. ring_id = %d/%lld", quorum_ringid.nodeid, quorum_ringid.seq);
size = sizeof(struct res_lib_votequorum_nodelist_notification) + sizeof(uint32_t) * quorum_members_entries;
res_lib_votequorum_notification = (struct res_lib_votequorum_nodelist_notification *)&buf;
res_lib_votequorum_notification->node_list_entries = quorum_members_entries;
res_lib_votequorum_notification->ring_id.nodeid = quorum_ringid.nodeid;
res_lib_votequorum_notification->ring_id.seq = quorum_ringid.seq;
res_lib_votequorum_notification->context = context;
for (i=0; i<quorum_members_entries; i++) {
res_lib_votequorum_notification->node_list[i] = quorum_members[i];
}
res_lib_votequorum_notification->header.id = MESSAGE_RES_VOTEQUORUM_NODELIST_NOTIFICATION;
res_lib_votequorum_notification->header.size = size;
res_lib_votequorum_notification->header.error = CS_OK;
/* Send it to all interested parties */
if (conn) {
int ret = corosync_api->ipc_dispatch_send(conn, &buf, size);
LEAVE();
return ret;
} else {
struct quorum_pd *qpd;
qb_list_for_each(tmp, &trackers_list) {
qpd = qb_list_entry(tmp, struct quorum_pd, list);
res_lib_votequorum_notification->context = qpd->tracking_context;
corosync_api->ipc_dispatch_send(qpd->conn, &buf, size);
}
}
LEAVE();
return 0;
}
static void votequorum_exec_send_expectedvotes_notification(void)
{
struct res_lib_votequorum_expectedvotes_notification res_lib_votequorum_expectedvotes_notification;
struct quorum_pd *qpd;
struct qb_list_head *tmp;
ENTER();
log_printf(LOGSYS_LEVEL_DEBUG, "Sending expected votes callback");
res_lib_votequorum_expectedvotes_notification.header.id = MESSAGE_RES_VOTEQUORUM_EXPECTEDVOTES_NOTIFICATION;
res_lib_votequorum_expectedvotes_notification.header.size = sizeof(res_lib_votequorum_expectedvotes_notification);
res_lib_votequorum_expectedvotes_notification.header.error = CS_OK;
res_lib_votequorum_expectedvotes_notification.expected_votes = us->expected_votes;
qb_list_for_each(tmp, &trackers_list) {
qpd = qb_list_entry(tmp, struct quorum_pd, list);
res_lib_votequorum_expectedvotes_notification.context = qpd->tracking_context;
corosync_api->ipc_dispatch_send(qpd->conn, &res_lib_votequorum_expectedvotes_notification,
sizeof(struct res_lib_votequorum_expectedvotes_notification));
}
LEAVE();
}
/*
 * Endian conversion hook for qdevice reconfigure messages.
 * The message is left untouched; only the function trace is emitted.
 */
static void exec_votequorum_qdevice_reconfigure_endian_convert (void *message)
{
	ENTER();

	/* nothing to convert */

	LEAVE();
}
/*
 * Handle a qdevice rename request received from the cluster.
 *
 * The rename is applied only when the request's oldname matches the
 * currently recorded qdevice_name; otherwise the request is ignored.
 *
 * message  the received struct req_exec_quorum_qdevice_reconfigure.
 * nodeid   node the request came from (used for logging only).
 *
 * The names come off the wire, so the compare and the copy are both
 * bounded by VOTEQUORUM_QDEVICE_MAX_NAME_LEN, the size already used to
 * clear qdevice_name.
 */
static void message_handler_req_exec_votequorum_qdevice_reconfigure (
	const void *message,
	unsigned int nodeid)
{
	const struct req_exec_quorum_qdevice_reconfigure *req_exec_quorum_qdevice_reconfigure = message;

	ENTER();

	log_printf(LOGSYS_LEVEL_DEBUG, "Received qdevice name change req from node %u [from: %s to: %s]",
		   nodeid,
		   req_exec_quorum_qdevice_reconfigure->oldname,
		   req_exec_quorum_qdevice_reconfigure->newname);

	if (!strncmp(req_exec_quorum_qdevice_reconfigure->oldname, qdevice_name,
		     VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) {
		log_printf(LOGSYS_LEVEL_DEBUG, "Allowing qdevice rename");
		memset(qdevice_name, 0, VOTEQUORUM_QDEVICE_MAX_NAME_LEN);
		/*
		 * The buffer was just zeroed, so copying at most LEN - 1 bytes
		 * always leaves a terminating NUL in place.
		 */
		strncpy(qdevice_name, req_exec_quorum_qdevice_reconfigure->newname,
			VOTEQUORUM_QDEVICE_MAX_NAME_LEN - 1);
		/*
		 * TODO: notify qdevices about name change?
		 *       this is not relevant for now and can wait later on since
		 *       qdevices are local only and libvotequorum is not final
		 */
	}

	LEAVE();
}
/*
 * Endian conversion hook for qdevice registration messages:
 * byte-swaps the 32-bit operation field in place.
 */
static void exec_votequorum_qdevice_reg_endian_convert (void *message)
{
	struct req_exec_quorum_qdevice_reg *reg_msg = message;

	ENTER();

	reg_msg->operation = swab32(reg_msg->operation);

	LEAVE();
}
/*
 * Handle a qdevice register/unregister message received from the cluster.
 *
 * message: struct req_exec_quorum_qdevice_reg (operation + qdevice_name)
 * nodeid:  node the message came from
 *
 * REGISTER from a remote node only records the device name when none is
 * known yet. REGISTER from ourselves completes the pending local
 * registration and answers the waiting IPC connection (qdevice_reg_conn).
 * UNREGISTER wipes the recorded name once no member node has the
 * qdevice-registered flag set.
 */
static void message_handler_req_exec_votequorum_qdevice_reg (
	const void *message,
	unsigned int nodeid)
{
	const struct req_exec_quorum_qdevice_reg *req_exec_quorum_qdevice_reg = message;
	struct res_lib_votequorum_status res_lib_votequorum_status;
	int wipe_qdevice_name = 1;
	struct cluster_node *node = NULL;
	struct qb_list_head *tmp;
	cs_error_t error = CS_OK;

	ENTER();

	log_printf(LOGSYS_LEVEL_DEBUG, "Received qdevice op %u req from node %u [%s]",
		   req_exec_quorum_qdevice_reg->operation,
		   nodeid, req_exec_quorum_qdevice_reg->qdevice_name);

	switch(req_exec_quorum_qdevice_reg->operation)
	{
	case VOTEQUORUM_QDEVICE_OPERATION_REGISTER:
		if (nodeid != us->node_id) {
			if (!strlen(qdevice_name)) {
				log_printf(LOGSYS_LEVEL_DEBUG, "Remote qdevice name recorded");
				/*
				 * The name comes from a cluster message: bound the
				 * copy and terminate explicitly (strncpy does not
				 * terminate when the source fills the limit).
				 */
				strncpy(qdevice_name, req_exec_quorum_qdevice_reg->qdevice_name,
					VOTEQUORUM_QDEVICE_MAX_NAME_LEN - 1);
				qdevice_name[VOTEQUORUM_QDEVICE_MAX_NAME_LEN - 1] = '\0';
			}
			LEAVE();
			return;
		}

		/*
		 * protect against the case where we broadcast qdevice registration
		 * to new members, we receive the message back, but there is no registration
		 * connection in progress
		 */
		if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
			LEAVE();
			return;
		}

		/*
		 * this should NEVER happen
		 */
		if (!qdevice_reg_conn) {
			log_printf(LOGSYS_LEVEL_WARNING, "Unable to determine origin of the qdevice register call!");
			LEAVE();
			return;
		}

		/*
		 * registering our own device in this case
		 */
		if (!strlen(qdevice_name)) {
			/* Same bounded copy as for the remote case above. */
			strncpy(qdevice_name, req_exec_quorum_qdevice_reg->qdevice_name,
				VOTEQUORUM_QDEVICE_MAX_NAME_LEN - 1);
			qdevice_name[VOTEQUORUM_QDEVICE_MAX_NAME_LEN - 1] = '\0';
		}

		/*
		 * check if it is our device or something else
		 */
		if ((!strncmp(req_exec_quorum_qdevice_reg->qdevice_name,
			      qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN))) {
			us->flags |= NODE_FLAGS_QDEVICE_REGISTERED;
			votequorum_exec_send_nodeinfo(VOTEQUORUM_QDEVICE_NODEID);
			votequorum_exec_send_nodeinfo(us->node_id);
		} else {
			log_printf(LOGSYS_LEVEL_WARNING,
				   "A new qdevice with different name (new: %s old: %s) is trying to register!",
				   req_exec_quorum_qdevice_reg->qdevice_name, qdevice_name);
			error = CS_ERR_EXIST;
		}

		/* Answer the IPC client whose register call started this round trip. */
		res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
		res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
		res_lib_votequorum_status.header.error = error;
		corosync_api->ipc_response_send(qdevice_reg_conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
		qdevice_reg_conn = NULL;
		break;
	case VOTEQUORUM_QDEVICE_OPERATION_UNREGISTER:
		/* Keep the name while any member node still has a registered qdevice. */
		qb_list_for_each(tmp, &cluster_members_list) {
			node = qb_list_entry(tmp, struct cluster_node, list);
			if ((node->state == NODESTATE_MEMBER) &&
			    (node->flags & NODE_FLAGS_QDEVICE_REGISTERED)) {
				wipe_qdevice_name = 0;
			}
		}

		if (wipe_qdevice_name) {
			memset(qdevice_name, 0, VOTEQUORUM_QDEVICE_MAX_NAME_LEN);
		}

		break;
	}

	LEAVE();
}
/*
 * Byte-swap the 32-bit fields of a nodeinfo message in place:
 * nodeid, votes, expected_votes and flags.
 */
static void exec_votequorum_nodeinfo_endian_convert (void *message)
{
	struct req_exec_quorum_nodeinfo *info_msg = message;

	ENTER();

	info_msg->nodeid = swab32(info_msg->nodeid);
	info_msg->votes = swab32(info_msg->votes);
	info_msg->expected_votes = swab32(info_msg->expected_votes);
	info_msg->flags = swab32(info_msg->flags);

	LEAVE();
}
/*
 * Handle a nodeinfo message received from the cluster.
 *
 * message:       struct req_exec_quorum_nodeinfo describing one node
 *                (or the qdevice pseudo node)
 * sender_nodeid: node that sent the message
 *
 * The described node is looked up (and allocated when unknown), its
 * votes/flags/state/expected_votes are updated from the message, and quorum
 * is recalculated when anything relevant changed. For the qdevice pseudo
 * node only the votes are updated.
 */
static void message_handler_req_exec_votequorum_nodeinfo (
	const void *message,
	unsigned int sender_nodeid)
{
	const struct req_exec_quorum_nodeinfo *req_exec_quorum_nodeinfo = message;
	struct cluster_node *node = NULL;
	int old_votes;
	int old_expected;
	uint32_t old_flags;
	nodestate_t old_state;
	int new_node = 0;
	int allow_downgrade = 0;
	int by_node = 0;
	unsigned int nodeid = req_exec_quorum_nodeinfo->nodeid;

	ENTER();

	log_printf(LOGSYS_LEVEL_DEBUG, "got nodeinfo message from cluster node %u", sender_nodeid);
	log_printf(LOGSYS_LEVEL_DEBUG, "nodeinfo message[%u]: votes: %d, expected: %d flags: %d",
		   nodeid,
		   req_exec_quorum_nodeinfo->votes,
		   req_exec_quorum_nodeinfo->expected_votes,
		   req_exec_quorum_nodeinfo->flags);

	if (nodeid != VOTEQUORUM_QDEVICE_NODEID) {
		decode_flags(req_exec_quorum_nodeinfo->flags);
	}

	node = find_node_by_nodeid(nodeid);
	if (!node) {
		node = allocate_node(nodeid);
		new_node = 1;
	}
	if (!node) {
		corosync_api->error_memory_failure();
		LEAVE();
		return;
	}

	/* Remember the previous values so a change can be detected below. */
	if (new_node) {
		old_votes = 0;
		old_expected = 0;
		old_state = NODESTATE_DEAD;
		old_flags = 0;
	} else {
		old_votes = node->votes;
		old_expected = node->expected_votes;
		old_state = node->state;
		old_flags = node->flags;
	}

	if (nodeid == VOTEQUORUM_QDEVICE_NODEID) {
		struct cluster_node *sender_node = find_node_by_nodeid(sender_nodeid);

		assert(sender_node != NULL);

		/*
		 * Take the sender's value as-is when we are not quorate and the
		 * sender is; otherwise never lower the qdevice votes.
		 */
		if ((!cluster_is_quorate) &&
		    (sender_node->flags & NODE_FLAGS_QUORATE)) {
			node->votes = req_exec_quorum_nodeinfo->votes;
		} else {
			node->votes = max(node->votes, req_exec_quorum_nodeinfo->votes);
		}
		goto recalculate;
	}

	/* Update node state */
	node->flags = req_exec_quorum_nodeinfo->flags;
	node->votes = req_exec_quorum_nodeinfo->votes;
	node->state = NODESTATE_MEMBER;

	if (node->flags & NODE_FLAGS_LEAVING) {
		node->state = NODESTATE_LEAVING;
		allow_downgrade = 1;
		by_node = 1;
	}

	if ((!cluster_is_quorate) &&
	    (node->flags & NODE_FLAGS_QUORATE)) {
		allow_downgrade = 1;
		us->expected_votes = req_exec_quorum_nodeinfo->expected_votes;
	}

	if (node->flags & NODE_FLAGS_QUORATE || (ev_tracking)) {
		node->expected_votes = req_exec_quorum_nodeinfo->expected_votes;
	} else {
		node->expected_votes = us->expected_votes;
	}

	if ((last_man_standing) && (node->votes > 1)) {
		/*
		 * The two literals are concatenated by the compiler; the trailing
		 * space keeps "all" and "cluster" from running together.
		 */
		log_printf(LOGSYS_LEVEL_WARNING, "Last Man Standing feature is supported only when all "
						 "cluster nodes votes are set to 1. Disabling LMS.");
		last_man_standing = 0;
		if (last_man_standing_timer_set) {
			corosync_api->timer_delete(last_man_standing_timer);
			last_man_standing_timer_set = 0;
		}
	}

recalculate:
	if ((new_node) ||
	    (nodeid == us->node_id) ||
	    (node->flags & NODE_FLAGS_FIRST) ||
	    (old_votes != node->votes) ||
	    (old_expected != node->expected_votes) ||
	    (old_flags != node->flags) ||
	    (old_state != node->state)) {
		recalculate_quorum(allow_downgrade, by_node);
	}

	if ((wait_for_all) &&
	    (!(node->flags & NODE_FLAGS_WFASTATUS)) &&
	    (node->flags & NODE_FLAGS_QUORATE)) {
		update_wait_for_all_status(0);
	}

	LEAVE();
}
/*
 * Byte-swap the nodeid and value fields of a reconfigure message in place.
 */
static void exec_votequorum_reconfigure_endian_convert (void *message)
{
	struct req_exec_quorum_reconfigure *reconf_msg = message;

	ENTER();

	reconf_msg->nodeid = swab32(reconf_msg->nodeid);
	reconf_msg->value = swab32(reconf_msg->value);

	LEAVE();
}
/*
 * Apply a reconfigure request received from the cluster.
 *
 * message: struct req_exec_quorum_reconfigure (param selects the action,
 *          nodeid/value are its arguments)
 * nodeid:  node the request came from (only used for logging here)
 */
static void message_handler_req_exec_votequorum_reconfigure (
	const void *message,
	unsigned int nodeid)
{
	const struct req_exec_quorum_reconfigure *reconf = message;
	struct cluster_node *target;

	ENTER();

	log_printf(LOGSYS_LEVEL_DEBUG, "got reconfigure message from cluster node %u for %u",
		   nodeid, reconf->nodeid);

	switch(reconf->param)
	{
	case VOTEQUORUM_RECONFIG_PARAM_EXPECTED_VOTES:
		update_node_expected_votes(reconf->value);
		votequorum_exec_send_expectedvotes_notification();
		update_ev_barrier(reconf->value);
		if (ev_tracking) {
			us->expected_votes = max(us->expected_votes, ev_tracking_barrier);
		}
		/* Quorum may go down as well as up. */
		recalculate_quorum(1, 0);
		break;

	case VOTEQUORUM_RECONFIG_PARAM_NODE_VOTES:
		target = find_node_by_nodeid(reconf->nodeid);
		if (target) {
			target->votes = reconf->value;
			/* Quorum may go down as well as up. */
			recalculate_quorum(1, 0);
		}
		/* Unknown node: nothing to change. */
		break;

	case VOTEQUORUM_RECONFIG_PARAM_CANCEL_WFA:
		update_wait_for_all_status(0);
		log_printf(LOGSYS_LEVEL_INFO, "wait_for_all_status reset by user on node %d.",
			   reconf->nodeid);
		recalculate_quorum(0, 0);
		break;
	}

	LEAVE();
}
/*
 * Service exit hook.
 *
 * Returns the result of sending our nodeinfo when allow_downscale is set,
 * 0 otherwise.
 */
static int votequorum_exec_exit_fn (void)
{
	int rc = 0;

	ENTER();

	/*
	 * With downscaling allowed, flag ourselves as leaving and
	 * announce that to the other nodes.
	 */
	if (allow_downscale) {
		us->flags |= NODE_FLAGS_LEAVING;
		rc = votequorum_exec_send_nodeinfo(us->node_id);
	}

	/* Close the expected-votes tracking descriptor if one is open. */
	if (ev_tracking) {
		if (ev_tracking_fd != -1) {
			close(ev_tracking_fd);
		}
	}

	LEAVE();

	return rc;
}
/*
 * Call icmap_set_ro_access(key, CS_FALSE, CS_TRUE) for each quorum
 * option listed below, in the listed order.
 */
static void votequorum_set_icmap_ro_keys(void)
{
	static const char *const ro_keys[] = {
		"quorum.allow_downscale",
		"quorum.wait_for_all",
		"quorum.last_man_standing",
		"quorum.last_man_standing_window",
		"quorum.expected_votes_tracking",
		"quorum.auto_tie_breaker",
		"quorum.auto_tie_breaker_node",
	};
	unsigned int idx;

	for (idx = 0; idx < sizeof(ro_keys) / sizeof(ro_keys[0]); idx++) {
		icmap_set_ro_access(ro_keys[idx], CS_FALSE, CS_TRUE);
	}
}
/*
 * Service init hook: reset module state, allocate the node entries for the
 * qdevice and for the local node, read the configuration and announce
 * ourselves to the cluster.
 *
 * api: not referenced in this body; the module-level corosync_api is used.
 *
 * Returns NULL on success or an error string on failure.
 */
static char *votequorum_exec_init_fn (struct corosync_api_v1 *api)
{
	char *error = NULL;

	ENTER();

	/*
	 * make sure we start clean
	 */
	qb_list_init(&cluster_members_list);
	qb_list_init(&trackers_list);
	qdevice = NULL;
	us = NULL;
	memset(cluster_nodes, 0, sizeof(cluster_nodes));

	/*
	 * Allocate a cluster_node for qdevice
	 */
	qdevice = allocate_node(VOTEQUORUM_QDEVICE_NODEID);
	if (!qdevice) {
		LEAVE();
		return ((char *)"Could not allocate node.");
	}
	qdevice->votes = 0;
	memset(qdevice_name, 0, VOTEQUORUM_QDEVICE_MAX_NAME_LEN);

	/*
	 * Allocate a cluster_node for us
	 */
	us = allocate_node(corosync_api->totem_nodeid_get());
	if (!us) {
		LEAVE();
		return ((char *)"Could not allocate node.");
	}

	icmap_set_uint32("runtime.votequorum.this_node_id", us->node_id);

	us->state = NODESTATE_MEMBER;
	us->votes = 1;
	us->flags |= NODE_FLAGS_FIRST;

	error = votequorum_readconfig(VOTEQUORUM_READCONFIG_STARTUP);
	if (error) {
		/* Keep the ENTER/LEAVE trace balanced on this path as well. */
		LEAVE();
		return error;
	}
	recalculate_quorum(0, 0);

	/*
	 * Set RO keys in icmap
	 */
	votequorum_set_icmap_ro_keys();

	/*
	 * Listen for changes
	 */
	votequorum_exec_add_config_notification();

	/*
	 * Start us off with one node
	 */
	votequorum_exec_send_nodeinfo(us->node_id);

	LEAVE();

	return (NULL);
}
/*
* votequorum service core
*/
/*
 * Timer callback for the last-man-standing window.
 * Marks the timer as no longer set and, if the cluster is quorate,
 * recalculates quorum with both arguments set to 1.
 */
static void votequorum_last_man_standing_timer_fn(void *arg)
{
	ENTER();

	last_man_standing_timer_set = 0;

	if (!cluster_is_quorate) {
		LEAVE();
		return;
	}

	recalculate_quorum(1, 1);

	LEAVE();
}
/*
 * Start of a sync round.
 *
 * trans_list / trans_list_entries:   not used by this function
 * member_list / member_list_entries: node ids of the new membership
 * ring_id:                           ring id of the new membership
 *
 * Marks nodes that dropped out of the previous membership as dead, arms the
 * last-man-standing timer when applicable, stores the new membership and
 * ring id, and (with a registered, alive qdevice) arms the qdevice sync
 * timer so the sync waits for a poll or the timeout.
 */
static void votequorum_sync_init (
	const unsigned int *trans_list, size_t trans_list_entries,
	const unsigned int *member_list, size_t member_list_entries,
	const struct memb_ring_id *ring_id)
{
	int old_idx, new_idx;
	int still_member;
	int somebody_left = 0;
	struct cluster_node *gone;

	ENTER();

	sync_in_progress = 1;
	sync_nodeinfo_sent = 0;
	sync_wait_for_poll_or_timeout = 0;

	if (member_list_entries > 1) {
		us->flags &= ~NODE_FLAGS_FIRST;
	}

	/*
	 * Which nodes left is already recorded in the node db; all that is
	 * needed here is whether anybody left at all (for last_man_standing).
	 */
	for (old_idx = 0; old_idx < quorum_members_entries; old_idx++) {
		still_member = 0;
		for (new_idx = 0; new_idx < member_list_entries; new_idx++) {
			if (member_list[new_idx] == quorum_members[old_idx]) {
				still_member = 1;
				break;
			}
		}
		if (still_member) {
			continue;
		}

		somebody_left = 1;
		gone = find_node_by_nodeid(quorum_members[old_idx]);
		if (gone) {
			gone->state = NODESTATE_DEAD;
		}
	}

	if ((last_man_standing) &&
	    ((((member_list_entries >= quorum) && (somebody_left))) ||
	     ((member_list_entries <= quorum) && (auto_tie_breaker != ATB_NONE) && (check_low_node_id_partition() == 1)))) {
		/* Restart the window if a timer is already pending. */
		if (last_man_standing_timer_set) {
			corosync_api->timer_delete(last_man_standing_timer);
			last_man_standing_timer_set = 0;
		}
		corosync_api->timer_add_duration((unsigned long long)last_man_standing_window*1000000,
						 NULL, votequorum_last_man_standing_timer_fn,
						 &last_man_standing_timer);
		last_man_standing_timer_set = 1;
	}

	/* Rotate membership: current becomes previous, new list becomes current. */
	memcpy(previous_quorum_members, quorum_members, sizeof(unsigned int) * quorum_members_entries);
	previous_quorum_members_entries = quorum_members_entries;

	memcpy(quorum_members, member_list, sizeof(unsigned int) * member_list_entries);
	quorum_members_entries = member_list_entries;
	memcpy(&quorum_ringid, ring_id, sizeof(*ring_id));

	if ((us->flags & NODE_FLAGS_QDEVICE_REGISTERED) && (us->flags & NODE_FLAGS_QDEVICE_ALIVE)) {
		/*
		 * Reset poll timer. Sync waiting is interrupted on valid qdevice poll or after timeout
		 */
		if (qdevice_timer_set) {
			corosync_api->timer_delete(qdevice_timer);
		}
		corosync_api->timer_add_duration((unsigned long long)qdevice_sync_timeout*1000000, qdevice,
						 qdevice_timer_fn, &qdevice_timer);
		qdevice_timer_set = 1;
		sync_wait_for_poll_or_timeout = 1;

		log_printf(LOGSYS_LEVEL_INFO, "waiting for quorum device %s poll (but maximum for %u ms)",
			   qdevice_name, qdevice_sync_timeout);
	}

	LEAVE();
}
/*
 * Sync processing step.
 *
 * On the first call of a sync round, sends our nodeinfo, the qdevice
 * nodeinfo, the qdevice registration (when a name is recorded) and a
 * nodelist notification.
 *
 * Returns -1 while a registered qdevice is still expected to poll (or time
 * out), 0 otherwise.
 */
static int votequorum_sync_process (void)
{
	int waiting_for_qdevice;

	if (sync_nodeinfo_sent == 0) {
		votequorum_exec_send_nodeinfo(us->node_id);
		votequorum_exec_send_nodeinfo(VOTEQUORUM_QDEVICE_NODEID);
		if (strlen(qdevice_name) != 0) {
			votequorum_exec_send_qdevice_reg(VOTEQUORUM_QDEVICE_OPERATION_REGISTER,
							 qdevice_name);
		}
		votequorum_exec_send_nodelist_notification(NULL, 0LL);
		sync_nodeinfo_sent = 1;
	}

	/*
	 * Waiting for qdevice to poll with new ringid or timeout
	 */
	waiting_for_qdevice = (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) && sync_wait_for_poll_or_timeout;

	return (waiting_for_qdevice ? -1 : 0);
}
/*
 * End of a sync round: recalculate quorum, report the membership and
 * quorate state through quorum_callback, notify trackers, and clear the
 * sync-in-progress marker.
 */
static void votequorum_sync_activate (void)
{
	recalculate_quorum(0, 0);

	quorum_callback(quorum_members,
			quorum_members_entries,
			cluster_is_quorate,
			&quorum_ringid);

	votequorum_exec_send_quorum_notification(NULL, 0L);

	sync_in_progress = 0;
}
/*
* Named as the abort counterpart of the other votequorum_sync_* functions.
* The body is empty: calling it changes no state.
*/
static void votequorum_sync_abort (void)
{
}
/*
 * Public entry point: remember the corosync API and the quorate callback,
 * then link and initialise the votequorum service.
 *
 * api:              corosync API table, stored in corosync_api
 * q_set_quorate_fn: callback stored in quorum_callback; must not be NULL
 *
 * Returns NULL on success or an error string on failure.
 */
char *votequorum_init(struct corosync_api_v1 *api,
	quorum_set_quorate_fn_t q_set_quorate_fn)
{
	char *error;

	ENTER();

	if (q_set_quorate_fn == NULL) {
		/* Keep the ENTER/LEAVE trace balanced on every exit path. */
		LEAVE();
		return ((char *)"Quorate function not set");
	}

	corosync_api = api;
	quorum_callback = q_set_quorate_fn;

	error = corosync_service_link_and_init(corosync_api,
		&votequorum_service[0]);
	if (error) {
		LEAVE();
		return (error);
	}

	LEAVE();

	return (NULL);
}
/*
* Library Handler init/fini
*/
/*
 * Library connection init hook: initialise the connection's private data
 * (empty list link, back-pointer to the connection). Always returns 0.
 */
static int quorum_lib_init_fn (void *conn)
{
	struct quorum_pd *private_data;

	private_data = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn);

	ENTER();

	qb_list_init (&private_data->list);
	private_data->conn = conn;

	LEAVE();

	return (0);
}
/*
 * Library connection exit hook: if the connection had tracking enabled,
 * unlink its private data from the list it is on and re-initialise the
 * link. Always returns 0.
 */
static int quorum_lib_exit_fn (void *conn)
{
	struct quorum_pd *private_data;

	private_data = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn);

	ENTER();

	if (private_data->tracking_enabled) {
		qb_list_del (&private_data->list);
		qb_list_init (&private_data->list);
	}

	LEAVE();

	return (0);
}
/*
* library internal functions
*/
/*
 * qdevice timer callback.
 *
 * Does nothing unless the qdevice is currently flagged alive and the timer
 * is marked as set. Otherwise clears the alive and cast-vote flags,
 * announces our new nodeinfo and stops any sync wait for a qdevice poll.
 */
static void qdevice_timer_fn(void *arg)
{
	ENTER();

	if (!((us->flags & NODE_FLAGS_QDEVICE_ALIVE) && (qdevice_timer_set))) {
		LEAVE();
		return;
	}

	us->flags &= ~(NODE_FLAGS_QDEVICE_ALIVE | NODE_FLAGS_QDEVICE_CAST_VOTE);

	log_printf(LOGSYS_LEVEL_INFO, "lost contact with quorum device %s", qdevice_name);

	votequorum_exec_send_nodeinfo(us->node_id);

	qdevice_timer_set = 0;
	sync_wait_for_poll_or_timeout = 0;

	LEAVE();
}
/*
* Library Handler Functions
*/
/*
 * Library request: return quorum information for one node.
 *
 * conn:    IPC connection the response is sent to
 * message: struct req_lib_votequorum_getinfo; a nodeid equal to
 *          VOTEQUORUM_QDEVICE_NODEID is answered with the local node's data
 *
 * The response carries header.error = CS_ERR_NOT_EXIST when the node is
 * unknown, CS_OK otherwise.
 */
static void message_handler_req_lib_votequorum_getinfo (void *conn, const void *message)
{
	const struct req_lib_votequorum_getinfo *req_lib_votequorum_getinfo = message;
	struct res_lib_votequorum_getinfo res_lib_votequorum_getinfo;
	struct cluster_node *node;
	unsigned int highest_expected = 0;
	unsigned int total_votes = 0;
	cs_error_t error = CS_OK;
	uint32_t nodeid = req_lib_votequorum_getinfo->nodeid;

	ENTER();

	log_printf(LOGSYS_LEVEL_DEBUG, "got getinfo request on %p for node %u", conn, req_lib_votequorum_getinfo->nodeid);

	/*
	 * The whole structure is sent back to the client, including on the
	 * not-found path where no field below is filled in. Zero it so no
	 * uninitialised stack bytes leave the process.
	 */
	memset(&res_lib_votequorum_getinfo, 0, sizeof(res_lib_votequorum_getinfo));

	if (nodeid == VOTEQUORUM_QDEVICE_NODEID) {
		nodeid = us->node_id;
	}

	node = find_node_by_nodeid(nodeid);
	if (node) {
		struct cluster_node *iternode;
		struct qb_list_head *nodelist;

		/* Totals are taken over nodes currently in MEMBER state. */
		qb_list_for_each(nodelist, &cluster_members_list) {
			iternode = qb_list_entry(nodelist, struct cluster_node, list);

			if (iternode->state == NODESTATE_MEMBER) {
				highest_expected =
					max(highest_expected, iternode->expected_votes);
				total_votes += iternode->votes;
			}
		}

		if (node->flags & NODE_FLAGS_QDEVICE_CAST_VOTE) {
			total_votes += qdevice->votes;
		}

		/*
		 * Translate the internal node state to the library constant.
		 * (Previously an unconditional "state = node->state" after this
		 * switch overwrote the result and made the mapping dead code.)
		 */
		switch(node->state) {
		case NODESTATE_MEMBER:
			res_lib_votequorum_getinfo.state = VOTEQUORUM_NODESTATE_MEMBER;
			break;
		case NODESTATE_DEAD:
			res_lib_votequorum_getinfo.state = VOTEQUORUM_NODESTATE_DEAD;
			break;
		case NODESTATE_LEAVING:
			res_lib_votequorum_getinfo.state = VOTEQUORUM_NODESTATE_LEAVING;
			break;
		default:
			res_lib_votequorum_getinfo.state = node->state;
			break;
		}

		res_lib_votequorum_getinfo.votes = node->votes;
		res_lib_votequorum_getinfo.expected_votes = node->expected_votes;
		res_lib_votequorum_getinfo.highest_expected = highest_expected;

		res_lib_votequorum_getinfo.quorum = quorum;
		res_lib_votequorum_getinfo.total_votes = total_votes;
		res_lib_votequorum_getinfo.flags = 0;
		res_lib_votequorum_getinfo.nodeid = node->node_id;

		if (two_node) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_TWONODE;
		}
		if (cluster_is_quorate) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QUORATE;
		}
		if (wait_for_all) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_WAIT_FOR_ALL;
		}
		if (last_man_standing) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_LAST_MAN_STANDING;
		}
		if (auto_tie_breaker != ATB_NONE) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_AUTO_TIE_BREAKER;
		}
		if (allow_downscale) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_ALLOW_DOWNSCALE;
		}

		/* The name field was zeroed with the rest of the structure above. */
		strcpy(res_lib_votequorum_getinfo.qdevice_name, qdevice_name);
		res_lib_votequorum_getinfo.qdevice_votes = qdevice->votes;

		if (node->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QDEVICE_REGISTERED;
		}
		if (node->flags & NODE_FLAGS_QDEVICE_ALIVE) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QDEVICE_ALIVE;
		}
		if (node->flags & NODE_FLAGS_QDEVICE_CAST_VOTE) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QDEVICE_CAST_VOTE;
		}
		if (node->flags & NODE_FLAGS_QDEVICE_MASTER_WINS) {
			res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QDEVICE_MASTER_WINS;
		}
	} else {
		error = CS_ERR_NOT_EXIST;
	}

	res_lib_votequorum_getinfo.header.size = sizeof(res_lib_votequorum_getinfo);
	res_lib_votequorum_getinfo.header.id = MESSAGE_RES_VOTEQUORUM_GETINFO;
	res_lib_votequorum_getinfo.header.error = error;
	corosync_api->ipc_response_send(conn, &res_lib_votequorum_getinfo, sizeof(res_lib_votequorum_getinfo));
	log_printf(LOGSYS_LEVEL_DEBUG, "getinfo response error: %d", error);

	LEAVE();
}
static void message_handler_req_lib_votequorum_setexpected (void *conn, const void *message)
{
const struct req_lib_votequorum_setexpected *req_lib_votequorum_setexpected = message;
struct res_lib_votequorum_status res_lib_votequorum_status;
cs_error_t error = CS_OK;
unsigned int newquorum;
unsigned int total_votes;
uint8_t allow_downscale_status = 0;
ENTER();
allow_downscale_status = allow_downscale;
allow_downscale = 0;
/*
* Validate new expected votes
*/
newquorum = calculate_quorum(1, req_lib_votequorum_setexpected->expected_votes, &total_votes);
allow_downscale = allow_downscale_status;
if (newquorum < total_votes / 2 ||
newquorum > total_votes) {
error = CS_ERR_INVALID_PARAM;
goto error_exit;
}
update_node_expected_votes(req_lib_votequorum_setexpected->expected_votes);
if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_EXPECTED_VOTES, us->node_id,
req_lib_votequorum_setexpected->expected_votes)) {
error = CS_ERR_NO_RESOURCES;
}
error_exit:
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}
static void message_handler_req_lib_votequorum_setvotes (void *conn, const void *message)
{
const struct req_lib_votequorum_setvotes *req_lib_votequorum_setvotes = message;
struct res_lib_votequorum_status res_lib_votequorum_status;
struct cluster_node *node;
unsigned int newquorum;
unsigned int total_votes;
unsigned int saved_votes;
cs_error_t error = CS_OK;
unsigned int nodeid;
ENTER();
nodeid = req_lib_votequorum_setvotes->nodeid;
node = find_node_by_nodeid(nodeid);
if (!node) {
error = CS_ERR_NAME_NOT_FOUND;
goto error_exit;
}
/*
* Check votes is valid
*/
saved_votes = node->votes;
node->votes = req_lib_votequorum_setvotes->votes;
newquorum = calculate_quorum(1, 0, &total_votes);
if (newquorum < total_votes / 2 ||
newquorum > total_votes) {
node->votes = saved_votes;
error = CS_ERR_INVALID_PARAM;
goto error_exit;
}
if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_NODE_VOTES, nodeid,
req_lib_votequorum_setvotes->votes)) {
error = CS_ERR_NO_RESOURCES;
}
error_exit:
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}
static void message_handler_req_lib_votequorum_trackstart (void *conn,
const void *message)
{
const struct req_lib_votequorum_trackstart *req_lib_votequorum_trackstart = message;
struct res_lib_votequorum_status res_lib_votequorum_status;
struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn);
cs_error_t error = CS_OK;
ENTER();
/*
* If an immediate listing of the current cluster membership
* is requested, generate membership list
*/
if (req_lib_votequorum_trackstart->track_flags & CS_TRACK_CURRENT ||
req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES) {
log_printf(LOGSYS_LEVEL_DEBUG, "sending initial status to %p", conn);
votequorum_exec_send_nodelist_notification(conn, req_lib_votequorum_trackstart->context);
votequorum_exec_send_quorum_notification(conn, req_lib_votequorum_trackstart->context);
}
if (quorum_pd->tracking_enabled) {
error = CS_ERR_EXIST;
goto response_send;
}
/*
* Record requests for tracking
*/
if (req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES ||
req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES_ONLY) {
quorum_pd->track_flags = req_lib_votequorum_trackstart->track_flags;
quorum_pd->tracking_enabled = 1;
quorum_pd->tracking_context = req_lib_votequorum_trackstart->context;
qb_list_add (&quorum_pd->list, &trackers_list);
}
response_send:
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}
static void message_handler_req_lib_votequorum_trackstop (void *conn,
const void *message)
{
struct res_lib_votequorum_status res_lib_votequorum_status;
struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn);
int error = CS_OK;
ENTER();
if (quorum_pd->tracking_enabled) {
error = CS_OK;
quorum_pd->tracking_enabled = 0;
qb_list_del (&quorum_pd->list);
qb_list_init (&quorum_pd->list);
} else {
error = CS_ERR_NOT_EXIST;
}
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}
static void message_handler_req_lib_votequorum_qdevice_register (void *conn,
const void *message)
{
const struct req_lib_votequorum_qdevice_register *req_lib_votequorum_qdevice_register = message;
struct res_lib_votequorum_status res_lib_votequorum_status;
cs_error_t error = CS_OK;
ENTER();
if (!qdevice_can_operate) {
log_printf(LOGSYS_LEVEL_INFO, "Registration of quorum device is disabled by incorrect corosync.conf. See logs for more information");
error = CS_ERR_ACCESS;
goto out;
}
if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
if ((!strncmp(req_lib_votequorum_qdevice_register->name,
qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN))) {
goto out;
} else {
log_printf(LOGSYS_LEVEL_WARNING,
"A new qdevice with different name (new: %s old: %s) is trying to re-register!",
req_lib_votequorum_qdevice_register->name, qdevice_name);
error = CS_ERR_EXIST;
goto out;
}
} else {
if (qdevice_reg_conn != NULL) {
log_printf(LOGSYS_LEVEL_WARNING,
"Registration request already in progress");
error = CS_ERR_TRY_AGAIN;
goto out;
}
qdevice_reg_conn = conn;
if (votequorum_exec_send_qdevice_reg(VOTEQUORUM_QDEVICE_OPERATION_REGISTER,
req_lib_votequorum_qdevice_register->name) != 0) {
log_printf(LOGSYS_LEVEL_WARNING,
"Unable to send qdevice registration request to cluster");
error = CS_ERR_TRY_AGAIN;
qdevice_reg_conn = NULL;
} else {
LEAVE();
return;
}
}
out:
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}
static void message_handler_req_lib_votequorum_qdevice_unregister (void *conn,
const void *message)
{
const struct req_lib_votequorum_qdevice_unregister *req_lib_votequorum_qdevice_unregister = message;
struct res_lib_votequorum_status res_lib_votequorum_status;
cs_error_t error = CS_OK;
ENTER();
if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
if (strncmp(req_lib_votequorum_qdevice_unregister->name, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) {
error = CS_ERR_INVALID_PARAM;
goto out;
}
if (qdevice_timer_set) {
corosync_api->timer_delete(qdevice_timer);
qdevice_timer_set = 0;
sync_wait_for_poll_or_timeout = 0;
}
us->flags &= ~NODE_FLAGS_QDEVICE_REGISTERED;
us->flags &= ~NODE_FLAGS_QDEVICE_ALIVE;
us->flags &= ~NODE_FLAGS_QDEVICE_CAST_VOTE;
us->flags &= ~NODE_FLAGS_QDEVICE_MASTER_WINS;
votequorum_exec_send_nodeinfo(us->node_id);
votequorum_exec_send_qdevice_reg(VOTEQUORUM_QDEVICE_OPERATION_UNREGISTER,
req_lib_votequorum_qdevice_unregister->name);
} else {
error = CS_ERR_NOT_EXIST;
}
out:
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}
static void message_handler_req_lib_votequorum_qdevice_update (void *conn,
const void *message)
{
const struct req_lib_votequorum_qdevice_update *req_lib_votequorum_qdevice_update = message;
struct res_lib_votequorum_status res_lib_votequorum_status;
cs_error_t error = CS_OK;
ENTER();
if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
if (strncmp(req_lib_votequorum_qdevice_update->oldname, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) {
error = CS_ERR_INVALID_PARAM;
goto out;
}
votequorum_exec_send_qdevice_reconfigure(req_lib_votequorum_qdevice_update->oldname,
req_lib_votequorum_qdevice_update->newname);
} else {
error = CS_ERR_NOT_EXIST;
}
out:
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}
static void message_handler_req_lib_votequorum_qdevice_poll (void *conn,
const void *message)
{
const struct req_lib_votequorum_qdevice_poll *req_lib_votequorum_qdevice_poll = message;
struct res_lib_votequorum_status res_lib_votequorum_status;
cs_error_t error = CS_OK;
uint32_t oldflags;
ENTER();
if (!qdevice_can_operate) {
error = CS_ERR_ACCESS;
goto out;
}
if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
if (!(req_lib_votequorum_qdevice_poll->ring_id.nodeid == quorum_ringid.nodeid &&
req_lib_votequorum_qdevice_poll->ring_id.seq == quorum_ringid.seq)) {
log_printf(LOGSYS_LEVEL_DEBUG, "Received poll ring id (%u.%"PRIu64") != last sync "
"ring id (%u.%"PRIu64"). Ignoring poll call.",
req_lib_votequorum_qdevice_poll->ring_id.nodeid, req_lib_votequorum_qdevice_poll->ring_id.seq,
quorum_ringid.nodeid, quorum_ringid.seq);
error = CS_ERR_MESSAGE_ERROR;
goto out;
}
if (strncmp(req_lib_votequorum_qdevice_poll->name, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) {
error = CS_ERR_INVALID_PARAM;
goto out;
}
if (qdevice_timer_set) {
corosync_api->timer_delete(qdevice_timer);
qdevice_timer_set = 0;
}
oldflags = us->flags;
us->flags |= NODE_FLAGS_QDEVICE_ALIVE;
if (req_lib_votequorum_qdevice_poll->cast_vote) {
us->flags |= NODE_FLAGS_QDEVICE_CAST_VOTE;
} else {
us->flags &= ~NODE_FLAGS_QDEVICE_CAST_VOTE;
}
if (us->flags != oldflags) {
votequorum_exec_send_nodeinfo(us->node_id);
}
corosync_api->timer_add_duration((unsigned long long)qdevice_timeout*1000000, qdevice,
qdevice_timer_fn, &qdevice_timer);
qdevice_timer_set = 1;
sync_wait_for_poll_or_timeout = 0;
} else {
error = CS_ERR_NOT_EXIST;
}
out:
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}
static void message_handler_req_lib_votequorum_qdevice_master_wins (void *conn,
const void *message)
{
const struct req_lib_votequorum_qdevice_master_wins *req_lib_votequorum_qdevice_master_wins = message;
struct res_lib_votequorum_status res_lib_votequorum_status;
cs_error_t error = CS_OK;
uint32_t oldflags = us->flags;
ENTER();
if (!qdevice_can_operate) {
error = CS_ERR_ACCESS;
goto out;
}
if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) {
if (strncmp(req_lib_votequorum_qdevice_master_wins->name, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) {
error = CS_ERR_INVALID_PARAM;
goto out;
}
if (req_lib_votequorum_qdevice_master_wins->allow) {
us->flags |= NODE_FLAGS_QDEVICE_MASTER_WINS;
} else {
us->flags &= ~NODE_FLAGS_QDEVICE_MASTER_WINS;
}
if (us->flags != oldflags) {
votequorum_exec_send_nodeinfo(us->node_id);
}
update_qdevice_master_wins(req_lib_votequorum_qdevice_master_wins->allow);
} else {
error = CS_ERR_NOT_EXIST;
}
out:
res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
res_lib_votequorum_status.header.error = error;
corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));
LEAVE();
}

File Metadata

Mime Type
text/x-diff
Expires
Wed, Feb 26, 5:58 AM (23 h, 53 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1465194
Default Alt Text
(169 KB)

Event Timeline