Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/exec/totemconfig.c b/exec/totemconfig.c
index 7a30afdb..26a7142f 100644
--- a/exec/totemconfig.c
+++ b/exec/totemconfig.c
@@ -1,924 +1,931 @@
/*
* Copyright (c) 2002-2005 MontaVista Software, Inc.
* Copyright (c) 2006-2010 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <config.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/param.h>
#include <corosync/swab.h>
#include <corosync/list.h>
#include <corosync/totem/totem.h>
#include <corosync/engine/objdb.h>
#include <corosync/engine/config.h>
#include <corosync/engine/logsys.h>
#ifdef HAVE_LIBNSS
#include <nss.h>
#include <pk11pub.h>
#include <pkcs11.h>
#include <prerror.h>
#endif
#include "util.h"
#include "totemconfig.h"
#include "tlist.h" /* for HZ */
#define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST 4
#define TOKEN_TIMEOUT 1000
#define TOKEN_RETRANSMIT_TIMEOUT (int)(TOKEN_TIMEOUT / (TOKEN_RETRANSMITS_BEFORE_LOSS_CONST + 0.2))
#define TOKEN_HOLD_TIMEOUT (int)(TOKEN_RETRANSMIT_TIMEOUT * 0.8 - (1000/(int)HZ))
#define JOIN_TIMEOUT 50
#define MERGE_TIMEOUT 200
#define DOWNCHECK_TIMEOUT 1000
#define FAIL_TO_RECV_CONST 50
#define SEQNO_UNCHANGED_CONST 30
#define MINIMUM_TIMEOUT (int)(1000/HZ)*3
#define MAX_NETWORK_DELAY 50
#define WINDOW_SIZE 50
#define MAX_MESSAGES 17
+#define MISS_COUNT_CONST 5
#define RRP_PROBLEM_COUNT_TIMEOUT 2000
#define RRP_PROBLEM_COUNT_THRESHOLD_DEFAULT 10
#define RRP_PROBLEM_COUNT_THRESHOLD_MIN 5
static char error_string_response[512];
static struct objdb_iface_ver0 *global_objdb;
static void add_totem_config_notification(
struct objdb_iface_ver0 *objdb,
struct totem_config *totem_config,
hdb_handle_t totem_object_handle);
/* These just makes the code below a little neater */
static inline int objdb_get_string (
const struct objdb_iface_ver0 *objdb,
hdb_handle_t object_service_handle,
const char *key, const char **value)
{
int res;
*value = NULL;
if ( !(res = objdb->object_key_get (object_service_handle,
key,
strlen (key),
(void *)value,
NULL))) {
if (*value) {
return 0;
}
}
return -1;
}
static inline void objdb_get_int (
const struct objdb_iface_ver0 *objdb,
hdb_handle_t object_service_handle,
const char *key, unsigned int *intvalue)
{
char *value = NULL;
if (!objdb->object_key_get (object_service_handle,
key,
strlen (key),
(void *)&value,
NULL)) {
if (value) {
*intvalue = atoi(value);
}
}
}
static unsigned int totem_handle_find (
struct objdb_iface_ver0 *objdb,
hdb_handle_t *totem_find_handle) {
hdb_handle_t object_find_handle;
unsigned int res;
/*
* Find a network section
*/
objdb->object_find_create (
OBJECT_PARENT_HANDLE,
"network",
strlen ("network"),
&object_find_handle);
res = objdb->object_find_next (
object_find_handle,
totem_find_handle);
objdb->object_find_destroy (object_find_handle);
/*
* Network section not found in configuration, checking for totem
*/
if (res == -1) {
objdb->object_find_create (
OBJECT_PARENT_HANDLE,
"totem",
strlen ("totem"),
&object_find_handle);
res = objdb->object_find_next (
object_find_handle,
totem_find_handle);
objdb->object_find_destroy (object_find_handle);
}
if (res == -1) {
return (-1);
}
return (0);
}
static void totem_volatile_config_read (
struct objdb_iface_ver0 *objdb,
struct totem_config *totem_config,
hdb_handle_t object_totem_handle)
{
objdb_get_int (objdb,object_totem_handle, "token", &totem_config->token_timeout);
objdb_get_int (objdb,object_totem_handle, "token_retransmit", &totem_config->token_retransmit_timeout);
objdb_get_int (objdb,object_totem_handle, "hold", &totem_config->token_hold_timeout);
objdb_get_int (objdb,object_totem_handle, "token_retransmits_before_loss_const", &totem_config->token_retransmits_before_loss_const);
objdb_get_int (objdb,object_totem_handle, "join", &totem_config->join_timeout);
objdb_get_int (objdb,object_totem_handle, "send_join", &totem_config->send_join_timeout);
objdb_get_int (objdb,object_totem_handle, "consensus", &totem_config->consensus_timeout);
objdb_get_int (objdb,object_totem_handle, "merge", &totem_config->merge_timeout);
objdb_get_int (objdb,object_totem_handle, "downcheck", &totem_config->downcheck_timeout);
objdb_get_int (objdb,object_totem_handle, "fail_recv_const", &totem_config->fail_to_recv_const);
objdb_get_int (objdb,object_totem_handle, "seqno_unchanged_const", &totem_config->seqno_unchanged_const);
objdb_get_int (objdb,object_totem_handle, "rrp_token_expired_timeout", &totem_config->rrp_token_expired_timeout);
objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_timeout", &totem_config->rrp_problem_count_timeout);
objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_threshold", &totem_config->rrp_problem_count_threshold);
objdb_get_int (objdb,object_totem_handle, "heartbeat_failures_allowed", &totem_config->heartbeat_failures_allowed);
objdb_get_int (objdb,object_totem_handle, "max_network_delay", &totem_config->max_network_delay);
objdb_get_int (objdb,object_totem_handle, "window_size", &totem_config->window_size);
objdb_get_string (objdb, object_totem_handle, "vsftype", &totem_config->vsf_type);
objdb_get_int (objdb,object_totem_handle, "max_messages", &totem_config->max_messages);
+
+ objdb_get_int (objdb,object_totem_handle, "miss_count_const", &totem_config->miss_count_const);
}
static void totem_get_crypto_type(
const struct objdb_iface_ver0 *objdb,
hdb_handle_t object_totem_handle,
struct totem_config *totem_config)
{
const char *str;
totem_config->crypto_accept = TOTEM_CRYPTO_ACCEPT_OLD;
if (!objdb_get_string (objdb, object_totem_handle, "crypto_accept", &str)) {
if (strcmp(str, "new") == 0) {
totem_config->crypto_accept = TOTEM_CRYPTO_ACCEPT_NEW;
}
}
totem_config->crypto_type = TOTEM_CRYPTO_SOBER;
#ifdef HAVE_LIBNSS
/*
* We must set these even if the key does not exist.
* Encryption type can be set on-the-fly using CFG
*/
totem_config->crypto_crypt_type = CKM_AES_CBC_PAD;
totem_config->crypto_sign_type = CKM_SHA256_RSA_PKCS;
#endif
if (!objdb_get_string (objdb, object_totem_handle, "crypto_type", &str)) {
if (strcmp(str, "sober") == 0) {
return;
}
#ifdef HAVE_LIBNSS
if (strcmp(str, "nss") == 0) {
totem_config->crypto_type = TOTEM_CRYPTO_NSS;
}
#endif
}
}
extern int totem_config_read (
struct objdb_iface_ver0 *objdb,
struct totem_config *totem_config,
const char **error_string)
{
int res = 0;
hdb_handle_t object_totem_handle;
hdb_handle_t object_interface_handle;
hdb_handle_t object_member_handle;
const char *str;
unsigned int ringnumber = 0;
hdb_handle_t object_find_interface_handle;
hdb_handle_t object_find_member_handle;
const char *transport_type;
int member_count = 0;
res = totem_handle_find (objdb, &object_totem_handle);
if (res == -1) {
printf ("couldn't find totem handle\n");
return (-1);
}
memset (totem_config, 0, sizeof (struct totem_config));
totem_config->interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX);
if (totem_config->interfaces == 0) {
*error_string = "Out of memory trying to allocate ethernet interface storage area";
return -1;
}
memset (totem_config->interfaces, 0,
sizeof (struct totem_interface) * INTERFACE_MAX);
totem_config->secauth = 1;
strcpy (totem_config->rrp_mode, "none");
if (!objdb_get_string (objdb, object_totem_handle, "version", &str)) {
if (strcmp (str, "2") == 0) {
totem_config->version = 2;
}
}
if (!objdb_get_string (objdb, object_totem_handle, "secauth", &str)) {
if (strcmp (str, "on") == 0) {
totem_config->secauth = 1;
}
if (strcmp (str, "off") == 0) {
totem_config->secauth = 0;
}
}
if (totem_config->secauth == 1) {
totem_get_crypto_type(objdb, object_totem_handle, totem_config);
}
if (!objdb_get_string (objdb, object_totem_handle, "rrp_mode", &str)) {
strcpy (totem_config->rrp_mode, str);
}
/*
* Get interface node id
*/
objdb_get_int (objdb, object_totem_handle, "nodeid", &totem_config->node_id);
totem_config->clear_node_high_bit = 0;
if (!objdb_get_string (objdb,object_totem_handle, "clear_node_high_bit", &str)) {
if (strcmp (str, "yes") == 0) {
totem_config->clear_node_high_bit = 1;
}
}
objdb_get_int (objdb,object_totem_handle, "threads", &totem_config->threads);
objdb_get_int (objdb,object_totem_handle, "netmtu", &totem_config->net_mtu);
/*
* Get things that might change in the future
*/
totem_volatile_config_read (objdb, totem_config, object_totem_handle);
objdb->object_find_create (
object_totem_handle,
"interface",
strlen ("interface"),
&object_find_interface_handle);
while (objdb->object_find_next (
object_find_interface_handle,
&object_interface_handle) == 0) {
member_count = 0;
objdb_get_int (objdb, object_interface_handle, "ringnumber", &ringnumber);
/*
* Get interface multicast address
*/
if (!objdb_get_string (objdb, object_interface_handle, "mcastaddr", &str)) {
res = totemip_parse (&totem_config->interfaces[ringnumber].mcast_addr, str, 0);
}
totem_config->broadcast_use = 0;
if (!objdb_get_string (objdb, object_interface_handle, "broadcast", &str)) {
if (strcmp (str, "yes") == 0) {
totem_config->broadcast_use = 1;
totemip_parse (
&totem_config->interfaces[ringnumber].mcast_addr,
"255.255.255.255", 0);
}
}
/*
* Get mcast port
*/
if (!objdb_get_string (objdb, object_interface_handle, "mcastport", &str)) {
totem_config->interfaces[ringnumber].ip_port = atoi (str);
}
/*
* Get the bind net address
*/
if (!objdb_get_string (objdb, object_interface_handle, "bindnetaddr", &str)) {
res = totemip_parse (&totem_config->interfaces[ringnumber].bindnet, str,
totem_config->interfaces[ringnumber].mcast_addr.family);
}
objdb->object_find_create (
object_interface_handle,
"member",
strlen ("member"),
&object_find_member_handle);
while (objdb->object_find_next (
object_find_member_handle,
&object_member_handle) == 0) {
if (!objdb_get_string (objdb, object_member_handle, "memberaddr", &str)) {
res = totemip_parse (&totem_config->interfaces[ringnumber].member_list[member_count++], str, 0);
}
}
totem_config->interfaces[ringnumber].member_count = member_count;
totem_config->interface_count++;
}
objdb->object_find_destroy (object_find_interface_handle);
add_totem_config_notification(objdb, totem_config, object_totem_handle);
totem_config->transport_number = TOTEM_TRANSPORT_UDP;
objdb_get_string (objdb, object_totem_handle, "transport", &transport_type);
if (transport_type) {
if (strcmp (transport_type, "udpu") == 0) {
totem_config->transport_number = TOTEM_TRANSPORT_UDPU;
}
}
if (transport_type) {
if (strcmp (transport_type, "iba") == 0) {
totem_config->transport_number = TOTEM_TRANSPORT_RDMA;
}
}
return 0;
}
int totem_config_validate (
struct totem_config *totem_config,
const char **error_string)
{
static char local_error_reason[512];
char parse_error[512];
const char *error_reason = local_error_reason;
int i;
unsigned int interface_max = INTERFACE_MAX;
if (totem_config->interface_count == 0) {
error_reason = "No interfaces defined";
goto parse_error;
}
for (i = 0; i < totem_config->interface_count; i++) {
/*
* Some error checking of parsed data to make sure its valid
*/
struct totem_ip_address null_addr;
memset (&null_addr, 0, sizeof (struct totem_ip_address));
if ((totem_config->transport_number == 0) &&
memcmp (&totem_config->interfaces[i].mcast_addr, &null_addr,
sizeof (struct totem_ip_address)) == 0) {
error_reason = "No multicast address specified";
goto parse_error;
}
if (totem_config->interfaces[i].ip_port == 0) {
error_reason = "No multicast port specified";
goto parse_error;
}
if (totem_config->interfaces[i].mcast_addr.family == AF_INET6 &&
totem_config->node_id == 0) {
error_reason = "An IPV6 network requires that a node ID be specified.";
goto parse_error;
}
if (totem_config->broadcast_use == 0 && totem_config->transport_number == 0) {
if (totem_config->interfaces[i].mcast_addr.family != totem_config->interfaces[i].bindnet.family) {
error_reason = "Multicast address family does not match bind address family";
goto parse_error;
}
if (totem_config->interfaces[i].mcast_addr.family != totem_config->interfaces[i].bindnet.family) {
error_reason = "Not all bind address belong to the same IP family";
goto parse_error;
}
if (totemip_is_mcast (&totem_config->interfaces[i].mcast_addr) != 0) {
error_reason = "mcastaddr is not a correct multicast address.";
goto parse_error;
}
}
}
if (totem_config->version != 2) {
error_reason = "This totem parser can only parse version 2 configurations.";
goto parse_error;
}
if (totem_config->token_retransmits_before_loss_const == 0) {
totem_config->token_retransmits_before_loss_const =
TOKEN_RETRANSMITS_BEFORE_LOSS_CONST;
}
/*
* Setup timeout values that are not setup by user
*/
if (totem_config->token_timeout == 0) {
totem_config->token_timeout = TOKEN_TIMEOUT;
if (totem_config->token_retransmits_before_loss_const == 0) {
totem_config->token_retransmits_before_loss_const = TOKEN_RETRANSMITS_BEFORE_LOSS_CONST;
}
if (totem_config->token_retransmit_timeout == 0) {
totem_config->token_retransmit_timeout =
(int)(totem_config->token_timeout /
(totem_config->token_retransmits_before_loss_const + 0.2));
}
if (totem_config->token_hold_timeout == 0) {
totem_config->token_hold_timeout =
(int)(totem_config->token_retransmit_timeout * 0.8 -
(1000/HZ));
}
}
if (totem_config->max_network_delay == 0) {
totem_config->max_network_delay = MAX_NETWORK_DELAY;
}
if (totem_config->max_network_delay < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The max_network_delay parameter (%d ms) may not be less then (%d ms).",
totem_config->max_network_delay, MINIMUM_TIMEOUT);
goto parse_error;
}
if (totem_config->window_size == 0) {
totem_config->window_size = WINDOW_SIZE;
}
if (totem_config->max_messages == 0) {
totem_config->max_messages = MAX_MESSAGES;
}
+ if (totem_config->miss_count_const == 0) {
+ totem_config->miss_count_const = MISS_COUNT_CONST;
+ }
+
if (totem_config->token_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The token timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->token_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
if (totem_config->token_retransmit_timeout == 0) {
totem_config->token_retransmit_timeout =
(int)(totem_config->token_timeout /
(totem_config->token_retransmits_before_loss_const + 0.2));
}
if (totem_config->token_hold_timeout == 0) {
totem_config->token_hold_timeout =
(int)(totem_config->token_retransmit_timeout * 0.8 -
(1000/HZ));
}
if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The token retransmit timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->token_retransmit_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
if (totem_config->token_hold_timeout == 0) {
totem_config->token_hold_timeout = TOKEN_HOLD_TIMEOUT;
}
if (totem_config->token_hold_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The token hold timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->token_hold_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
if (totem_config->join_timeout == 0) {
totem_config->join_timeout = JOIN_TIMEOUT;
}
if (totem_config->join_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The join timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->join_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
if (totem_config->consensus_timeout == 0) {
totem_config->consensus_timeout = (int)(float)(1.2 * totem_config->token_timeout);
}
if (totem_config->consensus_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The consensus timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->consensus_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
if (totem_config->merge_timeout == 0) {
totem_config->merge_timeout = MERGE_TIMEOUT;
}
if (totem_config->merge_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The merge timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->merge_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
if (totem_config->downcheck_timeout == 0) {
totem_config->downcheck_timeout = DOWNCHECK_TIMEOUT;
}
if (totem_config->downcheck_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The downcheck timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->downcheck_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
/*
* RRP values validation
*/
if (strcmp (totem_config->rrp_mode, "none") &&
strcmp (totem_config->rrp_mode, "active") &&
strcmp (totem_config->rrp_mode, "passive")) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The RRP mode \"%s\" specified is invalid. It must be none, active, or passive.\n", totem_config->rrp_mode);
goto parse_error;
}
if (totem_config->rrp_problem_count_timeout == 0) {
totem_config->rrp_problem_count_timeout = RRP_PROBLEM_COUNT_TIMEOUT;
}
if (totem_config->rrp_problem_count_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The RRP problem count timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->rrp_problem_count_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
if (totem_config->rrp_problem_count_threshold == 0) {
totem_config->rrp_problem_count_threshold = RRP_PROBLEM_COUNT_THRESHOLD_DEFAULT;
}
if (totem_config->rrp_problem_count_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The RRP problem count threshold (%d problem count) may not be less then (%d problem count).",
totem_config->rrp_problem_count_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN);
goto parse_error;
}
if (totem_config->rrp_token_expired_timeout == 0) {
totem_config->rrp_token_expired_timeout =
totem_config->token_retransmit_timeout;
}
if (totem_config->rrp_token_expired_timeout < MINIMUM_TIMEOUT) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The RRP token expired timeout parameter (%d ms) may not be less then (%d ms).",
totem_config->rrp_token_expired_timeout, MINIMUM_TIMEOUT);
goto parse_error;
}
if (strcmp (totem_config->rrp_mode, "none") == 0) {
interface_max = 1;
}
if (interface_max < totem_config->interface_count) {
snprintf (parse_error, sizeof(parse_error),
"%d is too many configured interfaces for the rrp_mode setting %s.",
totem_config->interface_count,
totem_config->rrp_mode);
error_reason = parse_error;
goto parse_error;
}
if (totem_config->fail_to_recv_const == 0) {
totem_config->fail_to_recv_const = FAIL_TO_RECV_CONST;
}
if (totem_config->seqno_unchanged_const == 0) {
totem_config->seqno_unchanged_const = SEQNO_UNCHANGED_CONST;
}
if (totem_config->net_mtu == 0) {
totem_config->net_mtu = 1500;
}
if ((MESSAGE_QUEUE_MAX) < totem_config->max_messages) {
snprintf (local_error_reason, sizeof(local_error_reason),
"The max_messages parameter (%d messages) may not be greater then (%d messages).",
totem_config->max_messages, MESSAGE_QUEUE_MAX);
goto parse_error;
}
if (totem_config->threads > SEND_THREADS_MAX) {
totem_config->threads = SEND_THREADS_MAX;
}
if (totem_config->secauth == 0) {
totem_config->threads = 0;
}
if (totem_config->net_mtu > FRAME_SIZE_MAX) {
error_reason = "This net_mtu parameter is greater then the maximum frame size";
goto parse_error;
}
if (totem_config->vsf_type == NULL) {
totem_config->vsf_type = "none";
}
return (0);
parse_error:
snprintf (error_string_response, sizeof(error_string_response),
"parse error in config: %s\n", error_reason);
*error_string = error_string_response;
return (-1);
}
static int read_keyfile (
const char *key_location,
struct totem_config *totem_config,
const char **error_string)
{
int fd;
int res;
ssize_t expected_key_len = sizeof (totem_config->private_key);
int saved_errno;
char error_str[100];
fd = open (key_location, O_RDONLY);
if (fd == -1) {
strerror_r (errno, error_str, 100);
snprintf (error_string_response, sizeof(error_string_response),
"Could not open %s: %s\n",
key_location, error_str);
goto parse_error;
}
res = read (fd, totem_config->private_key, expected_key_len);
saved_errno = errno;
close (fd);
if (res == -1) {
strerror_r (errno, error_str, 100);
snprintf (error_string_response, sizeof(error_string_response),
"Could not read %s: %s\n",
key_location, error_str);
goto parse_error;
}
totem_config->private_key_len = expected_key_len;
if (res != expected_key_len) {
snprintf (error_string_response, sizeof(error_string_response),
"Could only read %d bits of 1024 bits from %s.\n",
res * 8, key_location);
goto parse_error;
}
return 0;
parse_error:
*error_string = error_string_response;
return (-1);
}
int totem_config_keyread (
struct objdb_iface_ver0 *objdb,
struct totem_config *totem_config,
const char **error_string)
{
int got_key = 0;
const char *key_location = NULL;
hdb_handle_t object_totem_handle;
int res;
memset (totem_config->private_key, 0, 128);
totem_config->private_key_len = 128;
if (totem_config->secauth == 0) {
return (0);
}
res = totem_handle_find (objdb, &object_totem_handle);
if (res == -1) {
return (-1);
}
/* objdb may store the location of the key file */
if (!objdb_get_string (objdb,object_totem_handle, "keyfile", &key_location)
&& key_location) {
res = read_keyfile(key_location, totem_config, error_string);
if (res) {
goto key_error;
}
got_key = 1;
} else { /* Or the key itself may be in the objdb */
char *key = NULL;
size_t key_len;
res = objdb->object_key_get (object_totem_handle,
"key",
strlen ("key"),
(void *)&key,
&key_len);
if (res == 0 && key) {
if (key_len > sizeof (totem_config->private_key)) {
goto key_error;
}
memcpy(totem_config->private_key, key, key_len);
totem_config->private_key_len = key_len;
got_key = 1;
}
}
/* In desperation we read the default filename */
if (!got_key) {
const char *filename = getenv("COROSYNC_TOTEM_AUTHKEY_FILE");
if (!filename)
filename = COROSYSCONFDIR "/authkey";
res = read_keyfile(filename, totem_config, error_string);
if (res)
goto key_error;
}
return (0);
key_error:
*error_string = error_string_response;
return (-1);
}
static void totem_key_change_notify(object_change_type_t change_type,
hdb_handle_t parent_object_handle,
hdb_handle_t object_handle,
const void *object_name_pt, size_t object_name_len,
const void *key_name_pt, size_t key_len,
const void *key_value_pt, size_t key_value_len,
void *priv_data_pt)
{
struct totem_config *totem_config = priv_data_pt;
if (memcmp(object_name_pt, "totem", object_name_len) == 0)
totem_volatile_config_read(global_objdb,
totem_config,
object_handle); // CHECK
}
static void totem_objdb_reload_notify(objdb_reload_notify_type_t type, int flush,
void *priv_data_pt)
{
struct totem_config *totem_config = priv_data_pt;
hdb_handle_t totem_object_handle;
if (totem_config == NULL)
return;
/*
* A new totem {} key might exist, cancel the
* existing notification at the start of reload,
* and start a new one on the new object when
* it's all settled.
*/
if (type == OBJDB_RELOAD_NOTIFY_START) {
global_objdb->object_track_stop(
totem_key_change_notify,
NULL,
NULL,
NULL,
totem_config);
}
if (type == OBJDB_RELOAD_NOTIFY_END ||
type == OBJDB_RELOAD_NOTIFY_FAILED) {
if (!totem_handle_find(global_objdb,
&totem_object_handle)) {
global_objdb->object_track_start(totem_object_handle,
1,
totem_key_change_notify,
NULL, // object_create_notify,
NULL, // object_destroy_notify,
NULL, // object_reload_notify
totem_config); // priv_data
/*
* Reload the configuration
*/
totem_volatile_config_read(global_objdb,
totem_config,
totem_object_handle);
}
else {
log_printf(LOGSYS_LEVEL_ERROR, "totem objdb tracking stopped, cannot find totem{} handle on objdb\n");
}
}
}
static void add_totem_config_notification(
struct objdb_iface_ver0 *objdb,
struct totem_config *totem_config,
hdb_handle_t totem_object_handle)
{
global_objdb = objdb;
objdb->object_track_start(totem_object_handle,
1,
totem_key_change_notify,
NULL, // object_create_notify,
NULL, // object_destroy_notify,
NULL, // object_reload_notify
totem_config); // priv_data
/*
* Reload notify must be on the parent object
*/
objdb->object_track_start(OBJECT_PARENT_HANDLE,
1,
NULL, // key_change_notify,
NULL, // object_create_notify,
NULL, // object_destroy_notify,
totem_objdb_reload_notify, // object_reload_notify
totem_config); // priv_data
}
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
index f7a66383..0eaa6383 100644
--- a/exec/totemsrp.c
+++ b/exec/totemsrp.c
@@ -1,4422 +1,4440 @@
/*
* Copyright (c) 2003-2006 MontaVista Software, Inc.
* Copyright (c) 2006-2009 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The first version of this code was based upon Yair Amir's PhD thesis:
* http://www.cs.jhu.edu/~yairamir/phd.ps) (ch4,5).
*
* The current version of totemsrp implements the Totem protocol specified in:
* http://citeseer.ist.psu.edu/amir95totem.html
*
* The deviations from the above published protocols are:
* - encryption of message contents with SOBER128
* - authentication of meessage contents with SHA1/HMAC
* - token hold mode where token doesn't rotate on unused ring - reduces cpu
* usage on 1.6ghz xeon from 35% to less then .1 % as measured by top
*/
#include <config.h>
#include <assert.h>
#ifdef HAVE_ALLOCA_H
#include <alloca.h>
#endif
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <netdb.h>
#include <sys/un.h>
#include <sys/ioctl.h>
#include <sys/param.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <sched.h>
#include <time.h>
#include <sys/time.h>
#include <sys/poll.h>
#include <limits.h>
#include <corosync/swab.h>
#include <corosync/cs_queue.h>
#include <corosync/sq.h>
#include <corosync/list.h>
#include <corosync/hdb.h>
#include <corosync/totem/coropoll.h>
#define LOGSYS_UTILS_ONLY 1
#include <corosync/engine/logsys.h>
#include "totemsrp.h"
#include "totemrrp.h"
#include "totemnet.h"
#include "wthread.h"
#include "crypto.h"
#include "tlist.h"
#define LOCALHOST_IP inet_addr("127.0.0.1")
#define QUEUE_RTR_ITEMS_SIZE_MAX 16384 /* allow 16384 retransmit items */
#define RETRANS_MESSAGE_QUEUE_SIZE_MAX 16384 /* allow 500 messages to be queued */
#define RECEIVED_MESSAGE_QUEUE_SIZE_MAX 500 /* allow 500 messages to be queued */
#define MAXIOVS 5
#define RETRANSMIT_ENTRIES_MAX 30
#define TOKEN_SIZE_MAX 64000 /* bytes */
#define LEAVE_DUMMY_NODEID 0
/*
* Rollover handling:
* SEQNO_START_MSG is the starting sequence number after a new configuration
* This should remain zero, unless testing overflow in which case
* 0x7ffff000 and 0xfffff000 are good starting values.
*
* SEQNO_START_TOKEN is the starting sequence number after a new configuration
* for a token. This should remain zero, unless testing overflow in which
* case 07fffff00 or 0xffffff00 are good starting values.
*
* SEQNO_START_MSG is the starting sequence number after a new configuration
* This should remain zero, unless testing overflow in which case
* 0x7ffff000 and 0xfffff000 are good values to start with
*/
#define SEQNO_START_MSG 0x0
#define SEQNO_START_TOKEN 0x0
/*
* These can be used ot test different rollover points
* #define SEQNO_START_MSG 0xfffffe00
* #define SEQNO_START_TOKEN 0xfffffe00
*/
/*
* These can be used to test the error recovery algorithms
* #define TEST_DROP_ORF_TOKEN_PERCENTAGE 30
* #define TEST_DROP_COMMIT_TOKEN_PERCENTAGE 30
* #define TEST_DROP_MCAST_PERCENTAGE 50
* #define TEST_RECOVERY_MSG_COUNT 300
*/
/*
* we compare incoming messages to determine if their endian is
* different - if so convert them
*
* do not change
*/
#define ENDIAN_LOCAL 0xff22
enum message_type {
MESSAGE_TYPE_ORF_TOKEN = 0, /* Ordering, Reliability, Flow (ORF) control Token */
MESSAGE_TYPE_MCAST = 1, /* ring ordered multicast message */
MESSAGE_TYPE_MEMB_MERGE_DETECT = 2, /* merge rings if there are available rings */
MESSAGE_TYPE_MEMB_JOIN = 3, /* membership join message */
MESSAGE_TYPE_MEMB_COMMIT_TOKEN = 4, /* membership commit token */
MESSAGE_TYPE_TOKEN_HOLD_CANCEL = 5, /* cancel the holding of the token */
};
enum encapsulation_type {
MESSAGE_ENCAPSULATED = 1,
MESSAGE_NOT_ENCAPSULATED = 2
};
/*
* New membership algorithm local variables
*/
struct srp_addr {
struct totem_ip_address addr[INTERFACE_MAX];
};
struct consensus_list_item {
struct srp_addr addr;
int set;
};
struct token_callback_instance {
struct list_head list;
int (*callback_fn) (enum totem_callback_token_type type, const void *);
enum totem_callback_token_type callback_type;
int delete;
void *data;
};
struct totemsrp_socket {
int mcast;
int token;
};
struct message_header {
char type;
char encapsulated;
unsigned short endian_detector;
unsigned int nodeid;
} __attribute__((packed));
struct mcast {
struct message_header header;
struct srp_addr system_from;
unsigned int seq;
int this_seqno;
struct memb_ring_id ring_id;
unsigned int node_id;
int guarantee;
} __attribute__((packed));
struct rtr_item {
struct memb_ring_id ring_id;
unsigned int seq;
}__attribute__((packed));
struct orf_token {
struct message_header header;
unsigned int seq;
unsigned int token_seq;
unsigned int aru;
unsigned int aru_addr;
struct memb_ring_id ring_id;
unsigned int backlog;
unsigned int fcc;
int retrans_flg;
int rtr_list_entries;
struct rtr_item rtr_list[0];
}__attribute__((packed));
struct memb_join {
struct message_header header;
struct srp_addr system_from;
unsigned int proc_list_entries;
unsigned int failed_list_entries;
unsigned long long ring_seq;
unsigned char end_of_memb_join[0];
/*
* These parts of the data structure are dynamic:
* struct srp_addr proc_list[];
* struct srp_addr failed_list[];
*/
} __attribute__((packed));
struct memb_merge_detect {
struct message_header header;
struct srp_addr system_from;
struct memb_ring_id ring_id;
} __attribute__((packed));
struct token_hold_cancel {
struct message_header header;
struct memb_ring_id ring_id;
} __attribute__((packed));
struct memb_commit_token_memb_entry {
struct memb_ring_id ring_id;
unsigned int aru;
unsigned int high_delivered;
unsigned int received_flg;
}__attribute__((packed));
struct memb_commit_token {
struct message_header header;
unsigned int token_seq;
struct memb_ring_id ring_id;
unsigned int retrans_flg;
int memb_index;
int addr_entries;
unsigned char end_of_commit_token[0];
/*
* These parts of the data structure are dynamic:
*
* struct srp_addr addr[PROCESSOR_COUNT_MAX];
* struct memb_commit_token_memb_entry memb_list[PROCESSOR_COUNT_MAX];
*/
}__attribute__((packed));
struct message_item {
struct mcast *mcast;
unsigned int msg_len;
};
struct sort_queue_item {
struct mcast *mcast;
unsigned int msg_len;
};
struct orf_token_mcast_thread_state {
char iobuf[9000];
prng_state prng_state;
};
enum memb_state {
MEMB_STATE_OPERATIONAL = 1,
MEMB_STATE_GATHER = 2,
MEMB_STATE_COMMIT = 3,
MEMB_STATE_RECOVERY = 4
};
struct totemsrp_instance {
int iface_changes;
int failed_to_recv;
/*
* Flow control mcasts and remcasts on last and current orf_token
*/
int fcc_remcast_last;
int fcc_mcast_last;
int fcc_remcast_current;
struct consensus_list_item consensus_list[PROCESSOR_COUNT_MAX];
int consensus_list_entries;
struct srp_addr my_id;
struct srp_addr my_proc_list[PROCESSOR_COUNT_MAX];
struct srp_addr my_failed_list[PROCESSOR_COUNT_MAX];
struct srp_addr my_new_memb_list[PROCESSOR_COUNT_MAX];
struct srp_addr my_trans_memb_list[PROCESSOR_COUNT_MAX];
struct srp_addr my_memb_list[PROCESSOR_COUNT_MAX];
struct srp_addr my_deliver_memb_list[PROCESSOR_COUNT_MAX];
struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX];
int my_proc_list_entries;
int my_failed_list_entries;
int my_new_memb_entries;
int my_trans_memb_entries;
int my_memb_entries;
int my_deliver_memb_entries;
int my_left_memb_entries;
struct memb_ring_id my_ring_id;
struct memb_ring_id my_old_ring_id;
int my_aru_count;
int my_merge_detect_timeout_outstanding;
unsigned int my_last_aru;
int my_seq_unchanged;
int my_received_flg;
unsigned int my_high_seq_received;
unsigned int my_install_seq;
int my_rotation_counter;
int my_set_retrans_flg;
int my_retrans_flg_count;
unsigned int my_high_ring_delivered;
int heartbeat_timeout;
/*
* Queues used to order, deliver, and recover messages
*/
struct cs_queue new_message_queue;
struct cs_queue retrans_message_queue;
struct sq regular_sort_queue;
struct sq recovery_sort_queue;
/*
* Received up to and including
*/
unsigned int my_aru;
unsigned int my_high_delivered;
struct list_head token_callback_received_listhead;
struct list_head token_callback_sent_listhead;
char orf_token_retransmit[TOKEN_SIZE_MAX];
int orf_token_retransmit_size;
unsigned int my_token_seq;
/*
* Timers
*/
poll_timer_handle timer_pause_timeout;
poll_timer_handle timer_orf_token_timeout;
poll_timer_handle timer_orf_token_retransmit_timeout;
poll_timer_handle timer_orf_token_hold_retransmit_timeout;
poll_timer_handle timer_merge_detect_timeout;
poll_timer_handle memb_timer_state_gather_join_timeout;
poll_timer_handle memb_timer_state_gather_consensus_timeout;
poll_timer_handle memb_timer_state_commit_timeout;
poll_timer_handle timer_heartbeat_timeout;
/*
* Function and data used to log messages
*/
int totemsrp_log_level_security;
int totemsrp_log_level_error;
int totemsrp_log_level_warning;
int totemsrp_log_level_notice;
int totemsrp_log_level_debug;
int totemsrp_subsys_id;
void (*totemsrp_log_printf) (
unsigned int rec_ident,
const char *function,
const char *file,
int line,
const char *format, ...)__attribute__((format(printf, 5, 6)));;
enum memb_state memb_state;
//TODO struct srp_addr next_memb;
hdb_handle_t totemsrp_poll_handle;
struct totem_ip_address mcast_address;
void (*totemsrp_deliver_fn) (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required);
void (*totemsrp_confchg_fn) (
enum totem_configuration_type configuration_type,
const unsigned int *member_list, size_t member_list_entries,
const unsigned int *left_list, size_t left_list_entries,
const unsigned int *joined_list, size_t joined_list_entries,
const struct memb_ring_id *ring_id);
void (*totemsrp_service_ready_fn) (void);
int global_seqno;
int my_token_held;
unsigned long long token_ring_id_seq;
unsigned int last_released;
unsigned int set_aru;
int old_ring_state_saved;
int old_ring_state_aru;
unsigned int old_ring_state_high_seq_received;
unsigned int my_last_seq;
struct timeval tv_old;
void *totemrrp_context;
struct totem_config *totem_config;
unsigned int use_heartbeat;
unsigned int my_trc;
unsigned int my_pbl;
unsigned int my_cbl;
unsigned long long int pause_timestamp;
struct memb_commit_token *commit_token;
totemsrp_stats_t stats;
void * token_recv_event_handle;
void * token_sent_event_handle;
char commit_token_storage[9000];
};
struct message_handlers {
int count;
int (*handler_functions[6]) (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed);
};
/*
* forward decls
*/
static int message_handler_orf_token (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed);
static int message_handler_mcast (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed);
static int message_handler_memb_merge_detect (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed);
static int message_handler_memb_join (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed);
static int message_handler_memb_commit_token (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed);
static int message_handler_token_hold_cancel (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed);
static void totemsrp_instance_initialize (struct totemsrp_instance *instance);
static unsigned int main_msgs_missing (void);
static void main_token_seqid_get (
const void *msg,
unsigned int *seqid,
unsigned int *token_is);
static void srp_addr_copy (struct srp_addr *dest, const struct srp_addr *src);
static void srp_addr_to_nodeid (
unsigned int *nodeid_out,
struct srp_addr *srp_addr_in,
unsigned int entries);
static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b);
static void memb_leave_message_send (struct totemsrp_instance *instance);
static void memb_ring_id_create_or_load (struct totemsrp_instance *, struct memb_ring_id *);
static void token_callbacks_execute (struct totemsrp_instance *instance, enum totem_callback_token_type type);
static void memb_state_gather_enter (struct totemsrp_instance *instance, int gather_from);
static void messages_deliver_to_app (struct totemsrp_instance *instance, int skip, unsigned int end_point);
static int orf_token_mcast (struct totemsrp_instance *instance, struct orf_token *oken,
int fcc_mcasts_allowed);
static void messages_free (struct totemsrp_instance *instance, unsigned int token_aru);
static void memb_ring_id_set_and_store (struct totemsrp_instance *instance,
const struct memb_ring_id *ring_id);
static void target_set_completed (void *context);
static void memb_state_commit_token_update (struct totemsrp_instance *instance);
static void memb_state_commit_token_target_set (struct totemsrp_instance *instance);
static int memb_state_commit_token_send (struct totemsrp_instance *instance);
static int memb_state_commit_token_send_recovery (struct totemsrp_instance *instance, struct memb_commit_token *memb_commit_token);
static void memb_state_commit_token_create (struct totemsrp_instance *instance);
static int token_hold_cancel_send (struct totemsrp_instance *instance);
static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out);
static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out);
static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out);
static void mcast_endian_convert (const struct mcast *in, struct mcast *out);
static void memb_merge_detect_endian_convert (
const struct memb_merge_detect *in,
struct memb_merge_detect *out);
static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in);
static void timer_function_orf_token_timeout (void *data);
static void timer_function_pause_timeout (void *data);
static void timer_function_heartbeat_timeout (void *data);
static void timer_function_token_retransmit_timeout (void *data);
static void timer_function_token_hold_retransmit_timeout (void *data);
static void timer_function_merge_detect_timeout (void *data);
void main_deliver_fn (
void *context,
const void *msg,
unsigned int msg_len);
void main_iface_change_fn (
void *context,
const struct totem_ip_address *iface_address,
unsigned int iface_no);
struct message_handlers totemsrp_message_handlers = {
6,
{
message_handler_orf_token,
message_handler_mcast,
message_handler_memb_merge_detect,
message_handler_memb_join,
message_handler_memb_commit_token,
message_handler_token_hold_cancel
}
};
static const char *rundir = NULL;
#define log_printf(level, format, args...) \
do { \
instance->totemsrp_log_printf ( \
LOGSYS_ENCODE_RECID(level, \
instance->totemsrp_subsys_id, \
LOGSYS_RECID_LOG), \
__FUNCTION__, __FILE__, __LINE__, \
format, ##args); \
} while (0);
static void totemsrp_instance_initialize (struct totemsrp_instance *instance)
{
memset (instance, 0, sizeof (struct totemsrp_instance));
list_init (&instance->token_callback_received_listhead);
list_init (&instance->token_callback_sent_listhead);
instance->my_received_flg = 1;
instance->my_token_seq = SEQNO_START_TOKEN - 1;
instance->memb_state = MEMB_STATE_OPERATIONAL;
instance->set_aru = -1;
instance->my_aru = SEQNO_START_MSG;
instance->my_high_seq_received = SEQNO_START_MSG;
instance->my_high_delivered = SEQNO_START_MSG;
instance->commit_token = (struct memb_commit_token *)instance->commit_token_storage;
}
static void main_token_seqid_get (
const void *msg,
unsigned int *seqid,
unsigned int *token_is)
{
const struct orf_token *token = msg;
*seqid = 0;
*token_is = 0;
if (token->header.type == MESSAGE_TYPE_ORF_TOKEN) {
*seqid = token->token_seq;
*token_is = 1;
}
}
static unsigned int main_msgs_missing (void)
{
// TODO
return (0);
}
static int pause_flush (struct totemsrp_instance *instance)
{
uint64_t now_msec;
uint64_t timestamp_msec;
int res = 0;
now_msec = (timerlist_nano_current_get () / TIMERLIST_NS_IN_MSEC);
timestamp_msec = instance->pause_timestamp / TIMERLIST_NS_IN_MSEC;
if ((now_msec - timestamp_msec) > (instance->totem_config->token_timeout / 2)) {
log_printf (instance->totemsrp_log_level_notice,
"Process pause detected for %d ms, flushing membership messages.\n", (unsigned int)(now_msec - timestamp_msec));
/*
* -1 indicates an error from recvmsg
*/
do {
res = totemrrp_mcast_recv_empty (instance->totemrrp_context);
} while (res == -1);
}
return (res);
}
static int token_event_stats_collector (enum totem_callback_token_type type, const void *void_instance)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)void_instance;
uint32_t time_now;
unsigned long long nano_secs = timerlist_nano_current_get ();
time_now = (nano_secs / TIMERLIST_NS_IN_MSEC);
if (type == TOTEM_CALLBACK_TOKEN_RECEIVED) {
/* incr latest token the index */
if (instance->stats.latest_token == (TOTEM_TOKEN_STATS_MAX - 1))
instance->stats.latest_token = 0;
else
instance->stats.latest_token++;
if (instance->stats.earliest_token == instance->stats.latest_token) {
/* we have filled up the array, start overwriting */
if (instance->stats.earliest_token == (TOTEM_TOKEN_STATS_MAX - 1))
instance->stats.earliest_token = 0;
else
instance->stats.earliest_token++;
instance->stats.token[instance->stats.earliest_token].rx = 0;
instance->stats.token[instance->stats.earliest_token].tx = 0;
instance->stats.token[instance->stats.earliest_token].backlog_calc = 0;
}
instance->stats.token[instance->stats.latest_token].rx = time_now;
instance->stats.token[instance->stats.latest_token].tx = 0; /* in case we drop the token */
} else {
instance->stats.token[instance->stats.latest_token].tx = time_now;
}
return 0;
}
/*
* Exported interfaces
*/
int totemsrp_initialize (
hdb_handle_t poll_handle,
void **srp_context,
struct totem_config *totem_config,
totemmrp_stats_t *stats,
void (*deliver_fn) (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required),
void (*confchg_fn) (
enum totem_configuration_type configuration_type,
const unsigned int *member_list, size_t member_list_entries,
const unsigned int *left_list, size_t left_list_entries,
const unsigned int *joined_list, size_t joined_list_entries,
const struct memb_ring_id *ring_id))
{
struct totemsrp_instance *instance;
unsigned int res;
instance = malloc (sizeof (struct totemsrp_instance));
if (instance == NULL) {
goto error_exit;
}
rundir = getenv ("COROSYNC_RUN_DIR");
if (rundir == NULL) {
rundir = LOCALSTATEDIR "/lib/corosync";
}
res = mkdir (rundir, 0700);
if (res == -1 && errno != EEXIST) {
goto error_destroy;
}
res = chdir (rundir);
if (res == -1) {
goto error_destroy;
}
totemsrp_instance_initialize (instance);
stats->srp = &instance->stats;
instance->stats.latest_token = 0;
instance->stats.earliest_token = 0;
instance->totem_config = totem_config;
/*
* Configure logging
*/
instance->totemsrp_log_level_security = totem_config->totem_logging_configuration.log_level_security;
instance->totemsrp_log_level_error = totem_config->totem_logging_configuration.log_level_error;
instance->totemsrp_log_level_warning = totem_config->totem_logging_configuration.log_level_warning;
instance->totemsrp_log_level_notice = totem_config->totem_logging_configuration.log_level_notice;
instance->totemsrp_log_level_debug = totem_config->totem_logging_configuration.log_level_debug;
instance->totemsrp_subsys_id = totem_config->totem_logging_configuration.log_subsys_id;
instance->totemsrp_log_printf = totem_config->totem_logging_configuration.log_printf;
/*
* Initialize local variables for totemsrp
*/
totemip_copy (&instance->mcast_address, &totem_config->interfaces[0].mcast_addr);
/*
* Display totem configuration
*/
log_printf (instance->totemsrp_log_level_debug,
"Token Timeout (%d ms) retransmit timeout (%d ms)\n",
totem_config->token_timeout, totem_config->token_retransmit_timeout);
log_printf (instance->totemsrp_log_level_debug,
"token hold (%d ms) retransmits before loss (%d retrans)\n",
totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
log_printf (instance->totemsrp_log_level_debug,
"join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)\n",
totem_config->join_timeout,
totem_config->send_join_timeout,
totem_config->consensus_timeout,
totem_config->merge_timeout);
log_printf (instance->totemsrp_log_level_debug,
"downcheck (%d ms) fail to recv const (%d msgs)\n",
totem_config->downcheck_timeout, totem_config->fail_to_recv_const);
log_printf (instance->totemsrp_log_level_debug,
"seqno unchanged const (%d rotations) Maximum network MTU %d\n", totem_config->seqno_unchanged_const, totem_config->net_mtu);
log_printf (instance->totemsrp_log_level_debug,
"window size per rotation (%d messages) maximum messages per rotation (%d messages)\n",
totem_config->window_size, totem_config->max_messages);
+ log_printf (instance->totemsrp_log_level_debug,
+ "missed count const (%d messages)\n",
+ totem_config->miss_count_const);
+
log_printf (instance->totemsrp_log_level_debug,
"send threads (%d threads)\n", totem_config->threads);
log_printf (instance->totemsrp_log_level_debug,
"RRP token expired timeout (%d ms)\n",
totem_config->rrp_token_expired_timeout);
log_printf (instance->totemsrp_log_level_debug,
"RRP token problem counter (%d ms)\n",
totem_config->rrp_problem_count_timeout);
log_printf (instance->totemsrp_log_level_debug,
"RRP threshold (%d problem count)\n",
totem_config->rrp_problem_count_threshold);
log_printf (instance->totemsrp_log_level_debug,
"RRP mode set to %s.\n", instance->totem_config->rrp_mode);
log_printf (instance->totemsrp_log_level_debug,
"heartbeat_failures_allowed (%d)\n", totem_config->heartbeat_failures_allowed);
log_printf (instance->totemsrp_log_level_debug,
"max_network_delay (%d ms)\n", totem_config->max_network_delay);
cs_queue_init (&instance->retrans_message_queue, RETRANS_MESSAGE_QUEUE_SIZE_MAX,
sizeof (struct message_item));
sq_init (&instance->regular_sort_queue,
QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0);
sq_init (&instance->recovery_sort_queue,
QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0);
instance->totemsrp_poll_handle = poll_handle;
instance->totemsrp_deliver_fn = deliver_fn;
instance->totemsrp_confchg_fn = confchg_fn;
instance->use_heartbeat = 1;
timer_function_pause_timeout (instance);
if ( totem_config->heartbeat_failures_allowed == 0 ) {
log_printf (instance->totemsrp_log_level_debug,
"HeartBeat is Disabled. To enable set heartbeat_failures_allowed > 0\n");
instance->use_heartbeat = 0;
}
if (instance->use_heartbeat) {
instance->heartbeat_timeout
= (totem_config->heartbeat_failures_allowed) * totem_config->token_retransmit_timeout
+ totem_config->max_network_delay;
if (instance->heartbeat_timeout >= totem_config->token_timeout) {
log_printf (instance->totemsrp_log_level_debug,
"total heartbeat_timeout (%d ms) is not less than token timeout (%d ms)\n",
instance->heartbeat_timeout,
totem_config->token_timeout);
log_printf (instance->totemsrp_log_level_debug,
"heartbeat_timeout = heartbeat_failures_allowed * token_retransmit_timeout + max_network_delay\n");
log_printf (instance->totemsrp_log_level_debug,
"heartbeat timeout should be less than the token timeout. HeartBeat is Diabled !!\n");
instance->use_heartbeat = 0;
}
else {
log_printf (instance->totemsrp_log_level_debug,
"total heartbeat_timeout (%d ms)\n", instance->heartbeat_timeout);
}
}
totemrrp_initialize (
poll_handle,
&instance->totemrrp_context,
totem_config,
instance,
main_deliver_fn,
main_iface_change_fn,
main_token_seqid_get,
main_msgs_missing,
target_set_completed);
/*
* Must have net_mtu adjusted by totemrrp_initialize first
*/
cs_queue_init (&instance->new_message_queue,
MESSAGE_QUEUE_MAX,
sizeof (struct message_item));
totemsrp_callback_token_create (instance,
&instance->token_recv_event_handle,
TOTEM_CALLBACK_TOKEN_RECEIVED,
0,
token_event_stats_collector,
instance);
totemsrp_callback_token_create (instance,
&instance->token_sent_event_handle,
TOTEM_CALLBACK_TOKEN_SENT,
0,
token_event_stats_collector,
instance);
*srp_context = instance;
return (0);
error_destroy:
free (instance);
error_exit:
return (-1);
}
void totemsrp_finalize (
void *srp_context)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
memb_leave_message_send (instance);
free (srp_context);
}
int totemsrp_ifaces_get (
void *srp_context,
unsigned int nodeid,
struct totem_ip_address *interfaces,
char ***status,
unsigned int *iface_count)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
int res = 0;
unsigned int found = 0;
unsigned int i;
for (i = 0; i < instance->my_memb_entries; i++) {
if (instance->my_memb_list[i].addr[0].nodeid == nodeid) {
found = 1;
break;
}
}
if (found) {
memcpy (interfaces, &instance->my_memb_list[i],
sizeof (struct srp_addr));
*iface_count = instance->totem_config->interface_count;
goto finish;
}
for (i = 0; i < instance->my_left_memb_entries; i++) {
if (instance->my_left_memb_list[i].addr[0].nodeid == nodeid) {
found = 1;
break;
}
}
if (found) {
memcpy (interfaces, &instance->my_left_memb_list[i],
sizeof (struct srp_addr));
*iface_count = instance->totem_config->interface_count;
} else {
res = -1;
}
finish:
totemrrp_ifaces_get (instance->totemrrp_context, status, NULL);
return (res);
}
int totemsrp_crypto_set (
void *srp_context,
unsigned int type)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
int res;
res = totemrrp_crypto_set(instance->totemrrp_context, type);
return (res);
}
unsigned int totemsrp_my_nodeid_get (
void *srp_context)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
unsigned int res;
res = instance->totem_config->interfaces[0].boundto.nodeid;
return (res);
}
int totemsrp_my_family_get (
void *srp_context)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
int res;
res = instance->totem_config->interfaces[0].boundto.family;
return (res);
}
int totemsrp_ring_reenable (
void *srp_context)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
totemrrp_ring_reenable (instance->totemrrp_context);
return (0);
}
/*
* Set operations for use by the membership algorithm
*/
static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b)
{
unsigned int i;
unsigned int res;
for (i = 0; i < 1; i++) {
res = totemip_equal (&a->addr[i], &b->addr[i]);
if (res == 0) {
return (0);
}
}
return (1);
}
static void srp_addr_copy (struct srp_addr *dest, const struct srp_addr *src)
{
unsigned int i;
for (i = 0; i < INTERFACE_MAX; i++) {
totemip_copy (&dest->addr[i], &src->addr[i]);
}
}
static void srp_addr_to_nodeid (
unsigned int *nodeid_out,
struct srp_addr *srp_addr_in,
unsigned int entries)
{
unsigned int i;
for (i = 0; i < entries; i++) {
nodeid_out[i] = srp_addr_in[i].addr[0].nodeid;
}
}
static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in)
{
int i;
for (i = 0; i < INTERFACE_MAX; i++) {
totemip_copy_endian_convert (&out->addr[i], &in->addr[i]);
}
}
static void memb_consensus_reset (struct totemsrp_instance *instance)
{
instance->consensus_list_entries = 0;
}
static void memb_set_subtract (
struct srp_addr *out_list, int *out_list_entries,
struct srp_addr *one_list, int one_list_entries,
struct srp_addr *two_list, int two_list_entries)
{
int found = 0;
int i;
int j;
*out_list_entries = 0;
for (i = 0; i < one_list_entries; i++) {
for (j = 0; j < two_list_entries; j++) {
if (srp_addr_equal (&one_list[i], &two_list[j])) {
found = 1;
break;
}
}
if (found == 0) {
srp_addr_copy (&out_list[*out_list_entries], &one_list[i]);
*out_list_entries = *out_list_entries + 1;
}
found = 0;
}
}
/*
* Set consensus for a specific processor
*/
static void memb_consensus_set (
struct totemsrp_instance *instance,
const struct srp_addr *addr)
{
int found = 0;
int i;
if (addr->addr[0].nodeid == LEAVE_DUMMY_NODEID)
return;
for (i = 0; i < instance->consensus_list_entries; i++) {
if (srp_addr_equal(addr, &instance->consensus_list[i].addr)) {
found = 1;
break; /* found entry */
}
}
srp_addr_copy (&instance->consensus_list[i].addr, addr);
instance->consensus_list[i].set = 1;
if (found == 0) {
instance->consensus_list_entries++;
}
return;
}
/*
* Is consensus set for a specific processor
*/
static int memb_consensus_isset (
struct totemsrp_instance *instance,
const struct srp_addr *addr)
{
int i;
for (i = 0; i < instance->consensus_list_entries; i++) {
if (srp_addr_equal (addr, &instance->consensus_list[i].addr)) {
return (instance->consensus_list[i].set);
}
}
return (0);
}
/*
* Is consensus agreed upon based upon consensus database
*/
static int memb_consensus_agreed (
struct totemsrp_instance *instance)
{
struct srp_addr token_memb[PROCESSOR_COUNT_MAX];
int token_memb_entries = 0;
int agreed = 1;
int i;
memb_set_subtract (token_memb, &token_memb_entries,
instance->my_proc_list, instance->my_proc_list_entries,
instance->my_failed_list, instance->my_failed_list_entries);
for (i = 0; i < token_memb_entries; i++) {
if (memb_consensus_isset (instance, &token_memb[i]) == 0) {
agreed = 0;
break;
}
}
assert (token_memb_entries >= 1);
return (agreed);
}
static void memb_consensus_notset (
struct totemsrp_instance *instance,
struct srp_addr *no_consensus_list,
int *no_consensus_list_entries,
struct srp_addr *comparison_list,
int comparison_list_entries)
{
int i;
*no_consensus_list_entries = 0;
for (i = 0; i < instance->my_proc_list_entries; i++) {
if (memb_consensus_isset (instance, &instance->my_proc_list[i]) == 0) {
srp_addr_copy (&no_consensus_list[*no_consensus_list_entries], &instance->my_proc_list[i]);
*no_consensus_list_entries = *no_consensus_list_entries + 1;
}
}
}
/*
* Is set1 equal to set2 Entries can be in different orders
*/
static int memb_set_equal (
struct srp_addr *set1, int set1_entries,
struct srp_addr *set2, int set2_entries)
{
int i;
int j;
int found = 0;
if (set1_entries != set2_entries) {
return (0);
}
for (i = 0; i < set2_entries; i++) {
for (j = 0; j < set1_entries; j++) {
if (srp_addr_equal (&set1[j], &set2[i])) {
found = 1;
break;
}
}
if (found == 0) {
return (0);
}
found = 0;
}
return (1);
}
/*
* Is subset fully contained in fullset
*/
static int memb_set_subset (
const struct srp_addr *subset, int subset_entries,
const struct srp_addr *fullset, int fullset_entries)
{
int i;
int j;
int found = 0;
if (subset_entries > fullset_entries) {
return (0);
}
for (i = 0; i < subset_entries; i++) {
for (j = 0; j < fullset_entries; j++) {
if (srp_addr_equal (&subset[i], &fullset[j])) {
found = 1;
}
}
if (found == 0) {
return (0);
}
found = 0;
}
return (1);
}
/*
* merge subset into fullset taking care not to add duplicates
*/
static void memb_set_merge (
const struct srp_addr *subset, int subset_entries,
struct srp_addr *fullset, int *fullset_entries)
{
int found = 0;
int i;
int j;
for (i = 0; i < subset_entries; i++) {
for (j = 0; j < *fullset_entries; j++) {
if (srp_addr_equal (&fullset[j], &subset[i])) {
found = 1;
break;
}
}
if (found == 0) {
srp_addr_copy (&fullset[*fullset_entries], &subset[i]);
*fullset_entries = *fullset_entries + 1;
}
found = 0;
}
return;
}
static void memb_set_and_with_ring_id (
struct srp_addr *set1,
struct memb_ring_id *set1_ring_ids,
int set1_entries,
struct srp_addr *set2,
int set2_entries,
struct memb_ring_id *old_ring_id,
struct srp_addr *and,
int *and_entries)
{
int i;
int j;
int found = 0;
*and_entries = 0;
for (i = 0; i < set2_entries; i++) {
for (j = 0; j < set1_entries; j++) {
if (srp_addr_equal (&set1[j], &set2[i])) {
if (memcmp (&set1_ring_ids[j], old_ring_id, sizeof (struct memb_ring_id)) == 0) {
found = 1;
}
break;
}
}
if (found) {
srp_addr_copy (&and[*and_entries], &set1[j]);
*and_entries = *and_entries + 1;
}
found = 0;
}
return;
}
#ifdef CODE_COVERAGE
static void memb_set_print (
char *string,
struct srp_addr *list,
int list_entries)
{
int i;
int j;
printf ("List '%s' contains %d entries:\n", string, list_entries);
for (i = 0; i < list_entries; i++) {
for (j = 0; j < INTERFACE_MAX; j++) {
printf ("Address %d\n", i);
printf ("\tiface %d %s\n", j, totemip_print (&list[i].addr[j]));
printf ("family %d\n", list[i].addr[j].family);
}
}
}
#endif
static void reset_token_retransmit_timeout (struct totemsrp_instance *instance)
{
poll_timer_delete (instance->totemsrp_poll_handle,
instance->timer_orf_token_retransmit_timeout);
poll_timer_add (instance->totemsrp_poll_handle,
instance->totem_config->token_retransmit_timeout,
(void *)instance,
timer_function_token_retransmit_timeout,
&instance->timer_orf_token_retransmit_timeout);
}
static void start_merge_detect_timeout (struct totemsrp_instance *instance)
{
if (instance->my_merge_detect_timeout_outstanding == 0) {
poll_timer_add (instance->totemsrp_poll_handle,
instance->totem_config->merge_timeout,
(void *)instance,
timer_function_merge_detect_timeout,
&instance->timer_merge_detect_timeout);
instance->my_merge_detect_timeout_outstanding = 1;
}
}
static void cancel_merge_detect_timeout (struct totemsrp_instance *instance)
{
poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_merge_detect_timeout);
instance->my_merge_detect_timeout_outstanding = 0;
}
/*
* ring_state_* is used to save and restore the sort queue
* state when a recovery operation fails (and enters gather)
*/
static void old_ring_state_save (struct totemsrp_instance *instance)
{
if (instance->old_ring_state_saved == 0) {
instance->old_ring_state_saved = 1;
memcpy (&instance->my_old_ring_id, &instance->my_ring_id,
sizeof (struct memb_ring_id));
instance->old_ring_state_aru = instance->my_aru;
instance->old_ring_state_high_seq_received = instance->my_high_seq_received;
log_printf (instance->totemsrp_log_level_debug,
"Saving state aru %x high seq received %x\n",
instance->my_aru, instance->my_high_seq_received);
}
}
static void ring_state_restore (struct totemsrp_instance *instance)
{
if (instance->old_ring_state_saved) {
memcpy (&instance->my_ring_id, &instance->my_old_ring_id,
sizeof (struct memb_ring_id));
instance->my_aru = instance->old_ring_state_aru;
instance->my_high_seq_received = instance->old_ring_state_high_seq_received;
log_printf (instance->totemsrp_log_level_debug,
"Restoring instance->my_aru %x my high seq received %x\n",
instance->my_aru, instance->my_high_seq_received);
}
}
static void old_ring_state_reset (struct totemsrp_instance *instance)
{
log_printf (instance->totemsrp_log_level_debug,
"Resetting old ring state\n");
instance->old_ring_state_saved = 0;
}
static void reset_pause_timeout (struct totemsrp_instance *instance)
{
poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_pause_timeout);
poll_timer_add (instance->totemsrp_poll_handle,
instance->totem_config->token_timeout / 5,
(void *)instance,
timer_function_pause_timeout,
&instance->timer_pause_timeout);
}
static void reset_token_timeout (struct totemsrp_instance *instance) {
poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout);
poll_timer_add (instance->totemsrp_poll_handle,
instance->totem_config->token_timeout,
(void *)instance,
timer_function_orf_token_timeout,
&instance->timer_orf_token_timeout);
}
static void reset_heartbeat_timeout (struct totemsrp_instance *instance) {
poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout);
poll_timer_add (instance->totemsrp_poll_handle,
instance->heartbeat_timeout,
(void *)instance,
timer_function_heartbeat_timeout,
&instance->timer_heartbeat_timeout);
}
static void cancel_token_timeout (struct totemsrp_instance *instance) {
poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout);
}
static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) {
poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout);
}
static void cancel_token_retransmit_timeout (struct totemsrp_instance *instance)
{
poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout);
}
static void start_token_hold_retransmit_timeout (struct totemsrp_instance *instance)
{
poll_timer_add (instance->totemsrp_poll_handle,
instance->totem_config->token_hold_timeout,
(void *)instance,
timer_function_token_hold_retransmit_timeout,
&instance->timer_orf_token_hold_retransmit_timeout);
}
static void cancel_token_hold_retransmit_timeout (struct totemsrp_instance *instance)
{
poll_timer_delete (instance->totemsrp_poll_handle,
instance->timer_orf_token_hold_retransmit_timeout);
}
static void memb_state_consensus_timeout_expired (
struct totemsrp_instance *instance)
{
struct srp_addr no_consensus_list[PROCESSOR_COUNT_MAX];
int no_consensus_list_entries;
instance->stats.consensus_timeouts++;
if (memb_consensus_agreed (instance)) {
memb_consensus_reset (instance);
memb_consensus_set (instance, &instance->my_id);
reset_token_timeout (instance); // REVIEWED
} else {
memb_consensus_notset (
instance,
no_consensus_list,
&no_consensus_list_entries,
instance->my_proc_list,
instance->my_proc_list_entries);
memb_set_merge (no_consensus_list, no_consensus_list_entries,
instance->my_failed_list, &instance->my_failed_list_entries);
memb_state_gather_enter (instance, 0);
}
}
static void memb_join_message_send (struct totemsrp_instance *instance);
static void memb_merge_detect_transmit (struct totemsrp_instance *instance);
/*
* Timers used for various states of the membership algorithm
*/
static void timer_function_pause_timeout (void *data)
{
struct totemsrp_instance *instance = data;
instance->pause_timestamp = timerlist_nano_current_get ();
reset_pause_timeout (instance);
}
static void timer_function_orf_token_timeout (void *data)
{
struct totemsrp_instance *instance = data;
switch (instance->memb_state) {
case MEMB_STATE_OPERATIONAL:
log_printf (instance->totemsrp_log_level_debug,
"The token was lost in the OPERATIONAL state.\n");
log_printf (instance->totemsrp_log_level_notice,
"A processor failed, forming new configuration.\n");
totemrrp_iface_check (instance->totemrrp_context);
memb_state_gather_enter (instance, 2);
instance->stats.operational_token_lost++;
break;
case MEMB_STATE_GATHER:
log_printf (instance->totemsrp_log_level_debug,
"The consensus timeout expired.\n");
memb_state_consensus_timeout_expired (instance);
memb_state_gather_enter (instance, 3);
instance->stats.gather_token_lost++;
break;
case MEMB_STATE_COMMIT:
log_printf (instance->totemsrp_log_level_debug,
"The token was lost in the COMMIT state.\n");
memb_state_gather_enter (instance, 4);
instance->stats.commit_token_lost++;
break;
case MEMB_STATE_RECOVERY:
log_printf (instance->totemsrp_log_level_debug,
"The token was lost in the RECOVERY state.\n");
ring_state_restore (instance);
memb_state_gather_enter (instance, 5);
instance->stats.recovery_token_lost++;
break;
}
}
static void timer_function_heartbeat_timeout (void *data)
{
struct totemsrp_instance *instance = data;
log_printf (instance->totemsrp_log_level_debug,
"HeartBeat Timer expired Invoking token loss mechanism in state %d \n", instance->memb_state);
timer_function_orf_token_timeout(data);
}
static void memb_timer_function_state_gather (void *data)
{
struct totemsrp_instance *instance = data;
switch (instance->memb_state) {
case MEMB_STATE_OPERATIONAL:
case MEMB_STATE_RECOVERY:
assert (0); /* this should never happen */
break;
case MEMB_STATE_GATHER:
case MEMB_STATE_COMMIT:
memb_join_message_send (instance);
/*
* Restart the join timeout
`*/
poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout);
poll_timer_add (instance->totemsrp_poll_handle,
instance->totem_config->join_timeout,
(void *)instance,
memb_timer_function_state_gather,
&instance->memb_timer_state_gather_join_timeout);
break;
}
}
static void memb_timer_function_gather_consensus_timeout (void *data)
{
struct totemsrp_instance *instance = data;
memb_state_consensus_timeout_expired (instance);
}
static void deliver_messages_from_recovery_to_regular (struct totemsrp_instance *instance)
{
unsigned int i;
struct sort_queue_item *recovery_message_item;
struct sort_queue_item regular_message_item;
unsigned int range = 0;
int res;
void *ptr;
struct mcast *mcast;
log_printf (instance->totemsrp_log_level_debug,
"recovery to regular %x-%x\n", SEQNO_START_MSG + 1, instance->my_aru);
range = instance->my_aru - SEQNO_START_MSG;
/*
* Move messages from recovery to regular sort queue
*/
// todo should i be initialized to 0 or 1 ?
for (i = 1; i <= range; i++) {
res = sq_item_get (&instance->recovery_sort_queue,
i + SEQNO_START_MSG, &ptr);
if (res != 0) {
continue;
}
recovery_message_item = ptr;
/*
* Convert recovery message into regular message
*/
mcast = recovery_message_item->mcast;
if (mcast->header.encapsulated == MESSAGE_ENCAPSULATED) {
/*
* Message is a recovery message encapsulated
* in a new ring message
*/
regular_message_item.mcast =
(struct mcast *)(((char *)recovery_message_item->mcast) + sizeof (struct mcast));
regular_message_item.msg_len =
recovery_message_item->msg_len - sizeof (struct mcast);
mcast = regular_message_item.mcast;
} else {
/*
* TODO this case shouldn't happen
*/
continue;
}
log_printf (instance->totemsrp_log_level_debug,
"comparing if ring id is for this processors old ring seqno %d\n",
mcast->seq);
/*
* Only add this message to the regular sort
* queue if it was originated with the same ring
* id as the previous ring
*/
if (memcmp (&instance->my_old_ring_id, &mcast->ring_id,
sizeof (struct memb_ring_id)) == 0) {
res = sq_item_inuse (&instance->regular_sort_queue, mcast->seq);
if (res == 0) {
sq_item_add (&instance->regular_sort_queue,
&regular_message_item, mcast->seq);
if (sq_lt_compare (instance->old_ring_state_high_seq_received, mcast->seq)) {
instance->old_ring_state_high_seq_received = mcast->seq;
}
}
} else {
log_printf (instance->totemsrp_log_level_debug,
"-not adding msg with seq no %x\n", mcast->seq);
}
}
}
/*
* Change states in the state machine of the membership algorithm
*/
static void memb_state_operational_enter (struct totemsrp_instance *instance)
{
struct srp_addr joined_list[PROCESSOR_COUNT_MAX];
int joined_list_entries = 0;
unsigned int aru_save;
unsigned int joined_list_totemip[PROCESSOR_COUNT_MAX];
unsigned int trans_memb_list_totemip[PROCESSOR_COUNT_MAX];
unsigned int new_memb_list_totemip[PROCESSOR_COUNT_MAX];
unsigned int left_list[PROCESSOR_COUNT_MAX];
memb_consensus_reset (instance);
old_ring_state_reset (instance);
deliver_messages_from_recovery_to_regular (instance);
log_printf (instance->totemsrp_log_level_debug,
"Delivering to app %x to %x\n",
instance->my_high_delivered + 1, instance->old_ring_state_high_seq_received);
aru_save = instance->my_aru;
instance->my_aru = instance->old_ring_state_aru;
messages_deliver_to_app (instance, 0, instance->old_ring_state_high_seq_received);
/*
* Calculate joined and left list
*/
memb_set_subtract (instance->my_left_memb_list,
&instance->my_left_memb_entries,
instance->my_memb_list, instance->my_memb_entries,
instance->my_trans_memb_list, instance->my_trans_memb_entries);
memb_set_subtract (joined_list, &joined_list_entries,
instance->my_new_memb_list, instance->my_new_memb_entries,
instance->my_trans_memb_list, instance->my_trans_memb_entries);
/*
* Install new membership
*/
instance->my_memb_entries = instance->my_new_memb_entries;
memcpy (&instance->my_memb_list, instance->my_new_memb_list,
sizeof (struct srp_addr) * instance->my_memb_entries);
instance->last_released = 0;
instance->my_set_retrans_flg = 0;
/*
* Deliver transitional configuration to application
*/
srp_addr_to_nodeid (left_list, instance->my_left_memb_list,
instance->my_left_memb_entries);
srp_addr_to_nodeid (trans_memb_list_totemip,
instance->my_trans_memb_list, instance->my_trans_memb_entries);
instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_TRANSITIONAL,
trans_memb_list_totemip, instance->my_trans_memb_entries,
left_list, instance->my_left_memb_entries,
0, 0, &instance->my_ring_id);
// TODO we need to filter to ensure we only deliver those
// messages which are part of instance->my_deliver_memb
messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received);
instance->my_aru = aru_save;
/*
* Deliver regular configuration to application
*/
srp_addr_to_nodeid (new_memb_list_totemip,
instance->my_new_memb_list, instance->my_new_memb_entries);
srp_addr_to_nodeid (joined_list_totemip, joined_list,
joined_list_entries);
instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_REGULAR,
new_memb_list_totemip, instance->my_new_memb_entries,
0, 0,
joined_list_totemip, joined_list_entries, &instance->my_ring_id);
/*
* The recovery sort queue now becomes the regular
* sort queue. It is necessary to copy the state
* into the regular sort queue.
*/
sq_copy (&instance->regular_sort_queue, &instance->recovery_sort_queue);
instance->my_last_aru = SEQNO_START_MSG;
sq_items_release (&instance->regular_sort_queue, SEQNO_START_MSG - 1);
/* When making my_proc_list smaller, ensure that the
* now non-used entries are zero-ed out. There are some suspect
* assert's that assume that there is always 2 entries in the list.
* These fail when my_proc_list is reduced to 1 entry (and the
* valid [0] entry is the same as the 'unused' [1] entry).
*/
memset(instance->my_proc_list, 0,
sizeof (struct srp_addr) * instance->my_proc_list_entries);
instance->my_proc_list_entries = instance->my_new_memb_entries;
memcpy (instance->my_proc_list, instance->my_new_memb_list,
sizeof (struct srp_addr) * instance->my_memb_entries);
instance->my_failed_list_entries = 0;
instance->my_high_delivered = instance->my_aru;
// TODO the recovery messages are leaked
log_printf (instance->totemsrp_log_level_debug,
"entering OPERATIONAL state.\n");
log_printf (instance->totemsrp_log_level_notice,
"A processor joined or left the membership and a new membership was formed.\n");
instance->memb_state = MEMB_STATE_OPERATIONAL;
instance->stats.operational_entered++;
instance->my_received_flg = 1;
reset_pause_timeout (instance);
/*
* Save ring id information from this configuration to determine
* which processors are transitioning from old regular configuration
* in to new regular configuration on the next configuration change
*/
memcpy (&instance->my_old_ring_id, &instance->my_ring_id,
sizeof (struct memb_ring_id));
return;
}
static void memb_state_gather_enter (
struct totemsrp_instance *instance,
int gather_from)
{
memb_set_merge (
&instance->my_id, 1,
instance->my_proc_list, &instance->my_proc_list_entries);
memb_join_message_send (instance);
/*
* Restart the join timeout
*/
poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout);
poll_timer_add (instance->totemsrp_poll_handle,
instance->totem_config->join_timeout,
(void *)instance,
memb_timer_function_state_gather,
&instance->memb_timer_state_gather_join_timeout);
/*
* Restart the consensus timeout
*/
poll_timer_delete (instance->totemsrp_poll_handle,
instance->memb_timer_state_gather_consensus_timeout);
poll_timer_add (instance->totemsrp_poll_handle,
instance->totem_config->consensus_timeout,
(void *)instance,
memb_timer_function_gather_consensus_timeout,
&instance->memb_timer_state_gather_consensus_timeout);
/*
* Cancel the token loss and token retransmission timeouts
*/
cancel_token_retransmit_timeout (instance); // REVIEWED
cancel_token_timeout (instance); // REVIEWED
cancel_merge_detect_timeout (instance);
memb_consensus_reset (instance);
memb_consensus_set (instance, &instance->my_id);
log_printf (instance->totemsrp_log_level_debug,
"entering GATHER state from %d.\n", gather_from);
instance->memb_state = MEMB_STATE_GATHER;
instance->stats.gather_entered++;
return;
}
static void timer_function_token_retransmit_timeout (void *data);
static void target_set_completed (
void *context)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)context;
memb_state_commit_token_send (instance);
}
static void memb_state_commit_enter (
struct totemsrp_instance *instance)
{
old_ring_state_save (instance);
memb_state_commit_token_update (instance);
memb_state_commit_token_target_set (instance);
poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout);
instance->memb_timer_state_gather_join_timeout = 0;
poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout);
instance->memb_timer_state_gather_consensus_timeout = 0;
memb_ring_id_set_and_store (instance, &instance->commit_token->ring_id);
instance->token_ring_id_seq = instance->my_ring_id.seq;
log_printf (instance->totemsrp_log_level_debug,
"entering COMMIT state.\n");
instance->memb_state = MEMB_STATE_COMMIT;
reset_token_retransmit_timeout (instance); // REVIEWED
reset_token_timeout (instance); // REVIEWED
instance->stats.commit_entered++;
/*
* reset all flow control variables since we are starting a new ring
*/
instance->my_trc = 0;
instance->my_pbl = 0;
instance->my_cbl = 0;
/*
* commit token sent after callback that token target has been set
*/
}
static void memb_state_recovery_enter (
struct totemsrp_instance *instance,
struct memb_commit_token *commit_token)
{
int i;
int local_received_flg = 1;
unsigned int low_ring_aru;
unsigned int range = 0;
unsigned int messages_originated = 0;
const struct srp_addr *addr;
struct memb_commit_token_memb_entry *memb_list;
struct memb_ring_id my_new_memb_ring_id_list[PROCESSOR_COUNT_MAX];
addr = (const struct srp_addr *)commit_token->end_of_commit_token;
memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries);
log_printf (instance->totemsrp_log_level_debug,
"entering RECOVERY state.\n");
instance->my_high_ring_delivered = 0;
sq_reinit (&instance->recovery_sort_queue, SEQNO_START_MSG);
cs_queue_reinit (&instance->retrans_message_queue);
low_ring_aru = instance->old_ring_state_high_seq_received;
memb_state_commit_token_send_recovery (instance, commit_token);
instance->my_token_seq = SEQNO_START_TOKEN - 1;
/*
* Build regular configuration
*/
totemrrp_processor_count_set (
instance->totemrrp_context,
commit_token->addr_entries);
/*
* Build transitional configuration
*/
for (i = 0; i < instance->my_new_memb_entries; i++) {
memcpy (&my_new_memb_ring_id_list[i],
&memb_list[i].ring_id,
sizeof (struct memb_ring_id));
}
memb_set_and_with_ring_id (
instance->my_new_memb_list,
my_new_memb_ring_id_list,
instance->my_new_memb_entries,
instance->my_memb_list,
instance->my_memb_entries,
&instance->my_old_ring_id,
instance->my_trans_memb_list,
&instance->my_trans_memb_entries);
for (i = 0; i < instance->my_trans_memb_entries; i++) {
log_printf (instance->totemsrp_log_level_debug,
"TRANS [%d] member %s:\n", i, totemip_print (&instance->my_trans_memb_list[i].addr[0]));
}
for (i = 0; i < instance->my_new_memb_entries; i++) {
log_printf (instance->totemsrp_log_level_debug,
"position [%d] member %s:\n", i, totemip_print (&addr[i].addr[0]));
log_printf (instance->totemsrp_log_level_debug,
"previous ring seq %lld rep %s\n",
memb_list[i].ring_id.seq,
totemip_print (&memb_list[i].ring_id.rep));
log_printf (instance->totemsrp_log_level_debug,
"aru %x high delivered %x received flag %d\n",
memb_list[i].aru,
memb_list[i].high_delivered,
memb_list[i].received_flg);
// assert (totemip_print (&memb_list[i].ring_id.rep) != 0);
}
/*
* Determine if any received flag is false
*/
for (i = 0; i < commit_token->addr_entries; i++) {
if (memb_set_subset (&instance->my_new_memb_list[i], 1,
instance->my_trans_memb_list, instance->my_trans_memb_entries) &&
memb_list[i].received_flg == 0) {
instance->my_deliver_memb_entries = instance->my_trans_memb_entries;
memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list,
sizeof (struct srp_addr) * instance->my_trans_memb_entries);
local_received_flg = 0;
break;
}
}
if (local_received_flg == 1) {
goto no_originate;
} /* Else originate messages if we should */
/*
* Calculate my_low_ring_aru, instance->my_high_ring_delivered for the transitional membership
*/
for (i = 0; i < commit_token->addr_entries; i++) {
if (memb_set_subset (&instance->my_new_memb_list[i], 1,
instance->my_deliver_memb_list,
instance->my_deliver_memb_entries) &&
memcmp (&instance->my_old_ring_id,
&memb_list[i].ring_id,
sizeof (struct memb_ring_id)) == 0) {
if (sq_lt_compare (memb_list[i].aru, low_ring_aru)) {
low_ring_aru = memb_list[i].aru;
}
if (sq_lt_compare (instance->my_high_ring_delivered, memb_list[i].high_delivered)) {
instance->my_high_ring_delivered = memb_list[i].high_delivered;
}
}
}
/*
* Copy all old ring messages to instance->retrans_message_queue
*/
range = instance->old_ring_state_high_seq_received - low_ring_aru;
if (range == 0) {
/*
* No messages to copy
*/
goto no_originate;
}
assert (range < QUEUE_RTR_ITEMS_SIZE_MAX);
log_printf (instance->totemsrp_log_level_debug,
"copying all old ring messages from %x-%x.\n",
low_ring_aru + 1, instance->old_ring_state_high_seq_received);
for (i = 1; i <= range; i++) {
struct sort_queue_item *sort_queue_item;
struct message_item message_item;
void *ptr;
int res;
res = sq_item_get (&instance->regular_sort_queue,
low_ring_aru + i, &ptr);
if (res != 0) {
continue;
}
sort_queue_item = ptr;
messages_originated++;
memset (&message_item, 0, sizeof (struct message_item));
// TODO LEAK
message_item.mcast = malloc (FRAME_SIZE_MAX);
assert (message_item.mcast);
message_item.mcast->header.type = MESSAGE_TYPE_MCAST;
srp_addr_copy (&message_item.mcast->system_from, &instance->my_id);
message_item.mcast->header.encapsulated = MESSAGE_ENCAPSULATED;
message_item.mcast->header.nodeid = instance->my_id.addr[0].nodeid;
assert (message_item.mcast->header.nodeid);
message_item.mcast->header.endian_detector = ENDIAN_LOCAL;
memcpy (&message_item.mcast->ring_id, &instance->my_ring_id,
sizeof (struct memb_ring_id));
message_item.msg_len = sort_queue_item->msg_len + sizeof (struct mcast);
memcpy (((char *)message_item.mcast) + sizeof (struct mcast),
sort_queue_item->mcast,
sort_queue_item->msg_len);
cs_queue_item_add (&instance->retrans_message_queue, &message_item);
}
log_printf (instance->totemsrp_log_level_debug,
"Originated %d messages in RECOVERY.\n", messages_originated);
goto originated;
no_originate:
log_printf (instance->totemsrp_log_level_debug,
"Did not need to originate any messages in recovery.\n");
originated:
instance->my_aru = SEQNO_START_MSG;
instance->my_aru_count = 0;
instance->my_seq_unchanged = 0;
instance->my_high_seq_received = SEQNO_START_MSG;
instance->my_install_seq = SEQNO_START_MSG;
instance->last_released = SEQNO_START_MSG;
reset_token_timeout (instance); // REVIEWED
reset_token_retransmit_timeout (instance); // REVIEWED
instance->memb_state = MEMB_STATE_RECOVERY;
instance->stats.recovery_entered++;
return;
}
void totemsrp_event_signal (void *srp_context, enum totem_event_type type, int value)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
token_hold_cancel_send (instance);
return;
}
int totemsrp_mcast (
void *srp_context,
struct iovec *iovec,
unsigned int iov_len,
int guarantee)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
int i;
struct message_item message_item;
char *addr;
unsigned int addr_idx;
if (cs_queue_is_full (&instance->new_message_queue)) {
log_printf (instance->totemsrp_log_level_debug, "queue full\n");
return (-1);
}
memset (&message_item, 0, sizeof (struct message_item));
/*
* Allocate pending item
*/
message_item.mcast = malloc (FRAME_SIZE_MAX);
if (message_item.mcast == 0) {
goto error_mcast;
}
/*
* Set mcast header
*/
message_item.mcast->header.type = MESSAGE_TYPE_MCAST;
message_item.mcast->header.endian_detector = ENDIAN_LOCAL;
message_item.mcast->header.encapsulated = MESSAGE_NOT_ENCAPSULATED;
message_item.mcast->header.nodeid = instance->my_id.addr[0].nodeid;
assert (message_item.mcast->header.nodeid);
message_item.mcast->guarantee = guarantee;
srp_addr_copy (&message_item.mcast->system_from, &instance->my_id);
addr = (char *)message_item.mcast;
addr_idx = sizeof (struct mcast);
for (i = 0; i < iov_len; i++) {
memcpy (&addr[addr_idx], iovec[i].iov_base, iovec[i].iov_len);
addr_idx += iovec[i].iov_len;
}
message_item.msg_len = addr_idx;
log_printf (instance->totemsrp_log_level_debug, "mcasted message added to pending queue\n");
instance->stats.mcast_tx++;
cs_queue_item_add (&instance->new_message_queue, &message_item);
return (0);
error_mcast:
return (-1);
}
/*
* Determine if there is room to queue a new message
*/
int totemsrp_avail (void *srp_context)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
int avail;
cs_queue_avail (&instance->new_message_queue, &avail);
return (avail);
}
/*
* ORF Token Management
*/
/*
* Recast message to mcast group if it is available
*/
static int orf_token_remcast (
struct totemsrp_instance *instance,
int seq)
{
struct sort_queue_item *sort_queue_item;
int res;
void *ptr;
struct sq *sort_queue;
if (instance->memb_state == MEMB_STATE_RECOVERY) {
sort_queue = &instance->recovery_sort_queue;
} else {
sort_queue = &instance->regular_sort_queue;
}
res = sq_in_range (sort_queue, seq);
if (res == 0) {
log_printf (instance->totemsrp_log_level_debug, "sq not in range\n");
return (-1);
}
/*
* Get RTR item at seq, if not available, return
*/
res = sq_item_get (sort_queue, seq, &ptr);
if (res != 0) {
return -1;
}
sort_queue_item = ptr;
totemrrp_mcast_noflush_send (
instance->totemrrp_context,
sort_queue_item->mcast,
sort_queue_item->msg_len);
return (0);
}
/*
* Free all freeable messages from ring
*/
static void messages_free (
struct totemsrp_instance *instance,
unsigned int token_aru)
{
struct sort_queue_item *regular_message;
unsigned int i;
int res;
int log_release = 0;
unsigned int release_to;
unsigned int range = 0;
release_to = token_aru;
if (sq_lt_compare (instance->my_last_aru, release_to)) {
release_to = instance->my_last_aru;
}
if (sq_lt_compare (instance->my_high_delivered, release_to)) {
release_to = instance->my_high_delivered;
}
/*
* Ensure we dont try release before an already released point
*/
if (sq_lt_compare (release_to, instance->last_released)) {
return;
}
range = release_to - instance->last_released;
assert (range < QUEUE_RTR_ITEMS_SIZE_MAX);
/*
* Release retransmit list items if group aru indicates they are transmitted
*/
for (i = 1; i <= range; i++) {
void *ptr;
res = sq_item_get (&instance->regular_sort_queue,
instance->last_released + i, &ptr);
if (res == 0) {
regular_message = ptr;
free (regular_message->mcast);
}
sq_items_release (&instance->regular_sort_queue,
instance->last_released + i);
log_release = 1;
}
instance->last_released += range;
if (log_release) {
log_printf (instance->totemsrp_log_level_debug,
"releasing messages up to and including %x\n", release_to);
}
}
static void update_aru (
struct totemsrp_instance *instance)
{
unsigned int i;
int res;
struct sq *sort_queue;
unsigned int range;
unsigned int my_aru_saved = 0;
if (instance->memb_state == MEMB_STATE_RECOVERY) {
sort_queue = &instance->recovery_sort_queue;
} else {
sort_queue = &instance->regular_sort_queue;
}
range = instance->my_high_seq_received - instance->my_aru;
if (range > 1024) {
return;
}
my_aru_saved = instance->my_aru;
for (i = 1; i <= range; i++) {
void *ptr;
res = sq_item_get (sort_queue, my_aru_saved + i, &ptr);
/*
* If hole, stop updating aru
*/
if (res != 0) {
break;
}
}
instance->my_aru += i - 1;
}
/*
* Multicasts pending messages onto the ring (requires orf_token possession)
*/
static int orf_token_mcast (
struct totemsrp_instance *instance,
struct orf_token *token,
int fcc_mcasts_allowed)
{
struct message_item *message_item = 0;
struct cs_queue *mcast_queue;
struct sq *sort_queue;
struct sort_queue_item sort_queue_item;
struct mcast *mcast;
unsigned int fcc_mcast_current;
if (instance->memb_state == MEMB_STATE_RECOVERY) {
mcast_queue = &instance->retrans_message_queue;
sort_queue = &instance->recovery_sort_queue;
reset_token_retransmit_timeout (instance); // REVIEWED
} else {
mcast_queue = &instance->new_message_queue;
sort_queue = &instance->regular_sort_queue;
}
for (fcc_mcast_current = 0; fcc_mcast_current < fcc_mcasts_allowed; fcc_mcast_current++) {
if (cs_queue_is_empty (mcast_queue)) {
break;
}
message_item = (struct message_item *)cs_queue_item_get (mcast_queue);
/* preincrement required by algo */
if (instance->old_ring_state_saved &&
(instance->memb_state == MEMB_STATE_GATHER ||
instance->memb_state == MEMB_STATE_COMMIT)) {
log_printf (instance->totemsrp_log_level_debug,
"not multicasting at seqno is %d\n",
token->seq);
return (0);
}
message_item->mcast->seq = ++token->seq;
message_item->mcast->this_seqno = instance->global_seqno++;
/*
* Build IO vector
*/
memset (&sort_queue_item, 0, sizeof (struct sort_queue_item));
sort_queue_item.mcast = message_item->mcast;
sort_queue_item.msg_len = message_item->msg_len;
mcast = sort_queue_item.mcast;
memcpy (&mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id));
/*
* Add message to retransmit queue
*/
sq_item_add (sort_queue, &sort_queue_item, message_item->mcast->seq);
totemrrp_mcast_noflush_send (
instance->totemrrp_context,
message_item->mcast,
message_item->msg_len);
/*
* Delete item from pending queue
*/
cs_queue_item_remove (mcast_queue);
/*
* If messages mcasted, deliver any new messages to totempg
*/
instance->my_high_seq_received = token->seq;
}
update_aru (instance);
/*
* Return 1 if more messages are available for single node clusters
*/
return (fcc_mcast_current);
}
/*
* Remulticasts messages in orf_token's retransmit list (requires orf_token)
* Modify's orf_token's rtr to include retransmits required by this process
*/
static int orf_token_rtr (
struct totemsrp_instance *instance,
struct orf_token *orf_token,
unsigned int *fcc_allowed)
{
unsigned int res;
unsigned int i, j;
unsigned int found;
unsigned int total_entries;
struct sq *sort_queue;
struct rtr_item *rtr_list;
unsigned int range = 0;
char retransmit_msg[1024];
char value[64];
if (instance->memb_state == MEMB_STATE_RECOVERY) {
sort_queue = &instance->recovery_sort_queue;
} else {
sort_queue = &instance->regular_sort_queue;
}
rtr_list = &orf_token->rtr_list[0];
strcpy (retransmit_msg, "Retransmit List: ");
if (orf_token->rtr_list_entries) {
log_printf (instance->totemsrp_log_level_debug,
"Retransmit List %d\n", orf_token->rtr_list_entries);
for (i = 0; i < orf_token->rtr_list_entries; i++) {
sprintf (value, "%x ", rtr_list[i].seq);
strcat (retransmit_msg, value);
}
strcat (retransmit_msg, "\n");
- log_printf (instance->totemsrp_log_level_debug,
+ log_printf (instance->totemsrp_log_level_notice,
"%s", retransmit_msg);
}
total_entries = orf_token->rtr_list_entries;
/*
* Retransmit messages on orf_token's RTR list from RTR queue
*/
for (instance->fcc_remcast_current = 0, i = 0;
instance->fcc_remcast_current < *fcc_allowed && i < orf_token->rtr_list_entries;) {
/*
* If this retransmit request isn't from this configuration,
* try next rtr entry
*/
if (memcmp (&rtr_list[i].ring_id, &instance->my_ring_id,
sizeof (struct memb_ring_id)) != 0) {
i += 1;
continue;
}
res = orf_token_remcast (instance, rtr_list[i].seq);
if (res == 0) {
/*
* Multicasted message, so no need to copy to new retransmit list
*/
orf_token->rtr_list_entries -= 1;
assert (orf_token->rtr_list_entries >= 0);
memmove (&rtr_list[i], &rtr_list[i + 1],
sizeof (struct rtr_item) * (orf_token->rtr_list_entries - i));
instance->stats.mcast_retx++;
instance->fcc_remcast_current++;
} else {
i += 1;
}
}
*fcc_allowed = *fcc_allowed - instance->fcc_remcast_current;
/*
* Add messages to retransmit to RTR list
* but only retry if there is room in the retransmit list
*/
range = orf_token->seq - instance->my_aru;
assert (range < QUEUE_RTR_ITEMS_SIZE_MAX);
for (i = 1; (orf_token->rtr_list_entries < RETRANSMIT_ENTRIES_MAX) &&
(i <= range); i++) {
/*
* Ensure message is within the sort queue range
*/
res = sq_in_range (sort_queue, instance->my_aru + i);
if (res == 0) {
break;
}
/*
* Find if a message is missing from this processor
*/
res = sq_item_inuse (sort_queue, instance->my_aru + i);
if (res == 0) {
+ /*
+ * Determine how many times we have missed receiving
+ * this sequence number. sq_item_miss_count increments
+ * a counter for the sequence number. The miss count
+ * will be returned and compared. This allows time for
+ * delayed multicast messages to be received before
+ * declaring the message is missing and requesting a
+ * retransmit.
+ */
+ res = sq_item_miss_count (sort_queue, instance->my_aru + i);
+ if (res < instance->totem_config->miss_count_const) {
+ continue;
+ }
+
/*
* Determine if missing message is already in retransmit list
*/
found = 0;
for (j = 0; j < orf_token->rtr_list_entries; j++) {
if (instance->my_aru + i == rtr_list[j].seq) {
found = 1;
}
}
if (found == 0) {
/*
* Missing message not found in current retransmit list so add it
*/
memcpy (&rtr_list[orf_token->rtr_list_entries].ring_id,
&instance->my_ring_id, sizeof (struct memb_ring_id));
rtr_list[orf_token->rtr_list_entries].seq = instance->my_aru + i;
orf_token->rtr_list_entries++;
}
}
}
return (instance->fcc_remcast_current);
}
static void token_retransmit (struct totemsrp_instance *instance)
{
totemrrp_token_send (instance->totemrrp_context,
instance->orf_token_retransmit,
instance->orf_token_retransmit_size);
}
/*
* Retransmit the regular token if no mcast or token has
* been received in retransmit token period retransmit
* the token to the next processor
*/
static void timer_function_token_retransmit_timeout (void *data)
{
struct totemsrp_instance *instance = data;
switch (instance->memb_state) {
case MEMB_STATE_GATHER:
break;
case MEMB_STATE_COMMIT:
case MEMB_STATE_OPERATIONAL:
case MEMB_STATE_RECOVERY:
token_retransmit (instance);
reset_token_retransmit_timeout (instance); // REVIEWED
break;
}
}
static void timer_function_token_hold_retransmit_timeout (void *data)
{
struct totemsrp_instance *instance = data;
switch (instance->memb_state) {
case MEMB_STATE_GATHER:
break;
case MEMB_STATE_COMMIT:
break;
case MEMB_STATE_OPERATIONAL:
case MEMB_STATE_RECOVERY:
token_retransmit (instance);
break;
}
}
static void timer_function_merge_detect_timeout(void *data)
{
struct totemsrp_instance *instance = data;
instance->my_merge_detect_timeout_outstanding = 0;
switch (instance->memb_state) {
case MEMB_STATE_OPERATIONAL:
if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) {
memb_merge_detect_transmit (instance);
}
break;
case MEMB_STATE_GATHER:
case MEMB_STATE_COMMIT:
case MEMB_STATE_RECOVERY:
break;
}
}
/*
* Send orf_token to next member (requires orf_token)
*/
static int token_send (
struct totemsrp_instance *instance,
struct orf_token *orf_token,
int forward_token)
{
int res = 0;
unsigned int orf_token_size;
orf_token_size = sizeof (struct orf_token) +
(orf_token->rtr_list_entries * sizeof (struct rtr_item));
memcpy (instance->orf_token_retransmit, orf_token, orf_token_size);
instance->orf_token_retransmit_size = orf_token_size;
orf_token->header.nodeid = instance->my_id.addr[0].nodeid;
assert (orf_token->header.nodeid);
if (forward_token == 0) {
return (0);
}
totemrrp_token_send (instance->totemrrp_context,
orf_token,
orf_token_size);
return (res);
}
static int token_hold_cancel_send (struct totemsrp_instance *instance)
{
struct token_hold_cancel token_hold_cancel;
/*
* Only cancel if the token is currently held
*/
if (instance->my_token_held == 0) {
return (0);
}
instance->my_token_held = 0;
/*
* Build message
*/
token_hold_cancel.header.type = MESSAGE_TYPE_TOKEN_HOLD_CANCEL;
token_hold_cancel.header.endian_detector = ENDIAN_LOCAL;
token_hold_cancel.header.encapsulated = 0;
token_hold_cancel.header.nodeid = instance->my_id.addr[0].nodeid;
memcpy (&token_hold_cancel.ring_id, &instance->my_ring_id,
sizeof (struct memb_ring_id));
assert (token_hold_cancel.header.nodeid);
instance->stats.token_hold_cancel_tx++;
totemrrp_mcast_flush_send (instance->totemrrp_context, &token_hold_cancel,
sizeof (struct token_hold_cancel));
return (0);
}
static int orf_token_send_initial (struct totemsrp_instance *instance)
{
struct orf_token orf_token;
int res;
orf_token.header.type = MESSAGE_TYPE_ORF_TOKEN;
orf_token.header.endian_detector = ENDIAN_LOCAL;
orf_token.header.encapsulated = 0;
orf_token.header.nodeid = instance->my_id.addr[0].nodeid;
assert (orf_token.header.nodeid);
orf_token.seq = SEQNO_START_MSG;
orf_token.token_seq = SEQNO_START_TOKEN;
orf_token.retrans_flg = 1;
instance->my_set_retrans_flg = 1;
instance->stats.orf_token_tx++;
if (cs_queue_is_empty (&instance->retrans_message_queue) == 1) {
orf_token.retrans_flg = 0;
instance->my_set_retrans_flg = 0;
} else {
orf_token.retrans_flg = 1;
instance->my_set_retrans_flg = 1;
}
orf_token.aru = 0;
orf_token.aru = SEQNO_START_MSG - 1;
orf_token.aru_addr = instance->my_id.addr[0].nodeid;
memcpy (&orf_token.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id));
orf_token.fcc = 0;
orf_token.backlog = 0;
orf_token.rtr_list_entries = 0;
res = token_send (instance, &orf_token, 1);
return (res);
}
static void memb_state_commit_token_update (
struct totemsrp_instance *instance)
{
struct srp_addr *addr;
struct memb_commit_token_memb_entry *memb_list;
unsigned int high_aru;
unsigned int i;
addr = (struct srp_addr *)instance->commit_token->end_of_commit_token;
memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries);
memcpy (instance->my_new_memb_list, addr,
sizeof (struct srp_addr) * instance->commit_token->addr_entries);
instance->my_new_memb_entries = instance->commit_token->addr_entries;
memcpy (&memb_list[instance->commit_token->memb_index].ring_id,
&instance->my_old_ring_id, sizeof (struct memb_ring_id));
memb_list[instance->commit_token->memb_index].aru = instance->old_ring_state_aru;
/*
* TODO high delivered is really instance->my_aru, but with safe this
* could change?
*/
instance->my_received_flg =
(instance->my_aru == instance->my_high_seq_received);
memb_list[instance->commit_token->memb_index].received_flg = instance->my_received_flg;
memb_list[instance->commit_token->memb_index].high_delivered = instance->my_high_delivered;
/*
* find high aru up to current memb_index for all matching ring ids
* if any ring id matching memb_index has aru less then high aru set
* received flag for that entry to false
*/
high_aru = memb_list[instance->commit_token->memb_index].aru;
for (i = 0; i <= instance->commit_token->memb_index; i++) {
if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id,
&memb_list[i].ring_id,
sizeof (struct memb_ring_id)) == 0) {
if (sq_lt_compare (high_aru, memb_list[i].aru)) {
high_aru = memb_list[i].aru;
}
}
}
for (i = 0; i <= instance->commit_token->memb_index; i++) {
if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id,
&memb_list[i].ring_id,
sizeof (struct memb_ring_id)) == 0) {
if (sq_lt_compare (memb_list[i].aru, high_aru)) {
memb_list[i].received_flg = 0;
if (i == instance->commit_token->memb_index) {
instance->my_received_flg = 0;
}
}
}
}
instance->commit_token->header.nodeid = instance->my_id.addr[0].nodeid;
instance->commit_token->memb_index += 1;
assert (instance->commit_token->memb_index <= instance->commit_token->addr_entries);
assert (instance->commit_token->header.nodeid);
}
static void memb_state_commit_token_target_set (
struct totemsrp_instance *instance)
{
struct srp_addr *addr;
unsigned int i;
addr = (struct srp_addr *)instance->commit_token->end_of_commit_token;
for (i = 0; i < instance->totem_config->interface_count; i++) {
totemrrp_token_target_set (
instance->totemrrp_context,
&addr[instance->commit_token->memb_index %
instance->commit_token->addr_entries].addr[i],
i);
}
}
static int memb_state_commit_token_send_recovery (
struct totemsrp_instance *instance,
struct memb_commit_token *commit_token)
{
struct srp_addr *addr;
struct memb_commit_token_memb_entry *memb_list;
unsigned int commit_token_size;
addr = (struct srp_addr *)commit_token->end_of_commit_token;
memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries);
commit_token->token_seq++;
commit_token_size = sizeof (struct memb_commit_token) +
((sizeof (struct srp_addr) +
sizeof (struct memb_commit_token_memb_entry)) * commit_token->addr_entries);
/*
* Make a copy for retransmission if necessary
*/
memcpy (instance->orf_token_retransmit, commit_token, commit_token_size);
instance->orf_token_retransmit_size = commit_token_size;
instance->stats.memb_commit_token_tx++;
totemrrp_token_send (instance->totemrrp_context,
commit_token,
commit_token_size);
/*
* Request retransmission of the commit token in case it is lost
*/
reset_token_retransmit_timeout (instance);
return (0);
}
static int memb_state_commit_token_send (
struct totemsrp_instance *instance)
{
struct srp_addr *addr;
struct memb_commit_token_memb_entry *memb_list;
unsigned int commit_token_size;
addr = (struct srp_addr *)instance->commit_token->end_of_commit_token;
memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries);
instance->commit_token->token_seq++;
commit_token_size = sizeof (struct memb_commit_token) +
((sizeof (struct srp_addr) +
sizeof (struct memb_commit_token_memb_entry)) * instance->commit_token->addr_entries);
/*
* Make a copy for retransmission if necessary
*/
memcpy (instance->orf_token_retransmit, instance->commit_token, commit_token_size);
instance->orf_token_retransmit_size = commit_token_size;
instance->stats.memb_commit_token_tx++;
totemrrp_token_send (instance->totemrrp_context,
instance->commit_token,
commit_token_size);
/*
* Request retransmission of the commit token in case it is lost
*/
reset_token_retransmit_timeout (instance);
return (0);
}
static int memb_lowest_in_config (struct totemsrp_instance *instance)
{
struct srp_addr token_memb[PROCESSOR_COUNT_MAX];
int token_memb_entries = 0;
int i;
struct totem_ip_address *lowest_addr;
memb_set_subtract (token_memb, &token_memb_entries,
instance->my_proc_list, instance->my_proc_list_entries,
instance->my_failed_list, instance->my_failed_list_entries);
/*
* find representative by searching for smallest identifier
*/
lowest_addr = &token_memb[0].addr[0];
for (i = 1; i < token_memb_entries; i++) {
if (totemip_compare(lowest_addr, &token_memb[i].addr[0]) > 0) {
totemip_copy (lowest_addr, &token_memb[i].addr[0]);
}
}
return (totemip_compare (lowest_addr, &instance->my_id.addr[0]) == 0);
}
static int srp_addr_compare (const void *a, const void *b)
{
const struct srp_addr *srp_a = (const struct srp_addr *)a;
const struct srp_addr *srp_b = (const struct srp_addr *)b;
return (totemip_compare (&srp_a->addr[0], &srp_b->addr[0]));
}
static void memb_state_commit_token_create (
struct totemsrp_instance *instance)
{
struct srp_addr token_memb[PROCESSOR_COUNT_MAX];
struct srp_addr *addr;
struct memb_commit_token_memb_entry *memb_list;
int token_memb_entries = 0;
log_printf (instance->totemsrp_log_level_debug,
"Creating commit token because I am the rep.\n");
memb_set_subtract (token_memb, &token_memb_entries,
instance->my_proc_list, instance->my_proc_list_entries,
instance->my_failed_list, instance->my_failed_list_entries);
memset (instance->commit_token, 0, sizeof (struct memb_commit_token));
instance->commit_token->header.type = MESSAGE_TYPE_MEMB_COMMIT_TOKEN;
instance->commit_token->header.endian_detector = ENDIAN_LOCAL;
instance->commit_token->header.encapsulated = 0;
instance->commit_token->header.nodeid = instance->my_id.addr[0].nodeid;
assert (instance->commit_token->header.nodeid);
totemip_copy(&instance->commit_token->ring_id.rep, &instance->my_id.addr[0]);
instance->commit_token->ring_id.seq = instance->token_ring_id_seq + 4;
/*
* This qsort is necessary to ensure the commit token traverses
* the ring in the proper order
*/
qsort (token_memb, token_memb_entries, sizeof (struct srp_addr),
srp_addr_compare);
instance->commit_token->memb_index = 0;
instance->commit_token->addr_entries = token_memb_entries;
addr = (struct srp_addr *)instance->commit_token->end_of_commit_token;
memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries);
memcpy (addr, token_memb,
token_memb_entries * sizeof (struct srp_addr));
memset (memb_list, 0,
sizeof (struct memb_commit_token_memb_entry) * token_memb_entries);
}
static void memb_join_message_send (struct totemsrp_instance *instance)
{
char memb_join_data[10000];
struct memb_join *memb_join = (struct memb_join *)memb_join_data;
char *addr;
unsigned int addr_idx;
memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN;
memb_join->header.endian_detector = ENDIAN_LOCAL;
memb_join->header.encapsulated = 0;
memb_join->header.nodeid = instance->my_id.addr[0].nodeid;
assert (memb_join->header.nodeid);
memb_join->ring_seq = instance->my_ring_id.seq;
memb_join->proc_list_entries = instance->my_proc_list_entries;
memb_join->failed_list_entries = instance->my_failed_list_entries;
srp_addr_copy (&memb_join->system_from, &instance->my_id);
/*
* This mess adds the joined and failed processor lists into the join
* message
*/
addr = (char *)memb_join;
addr_idx = sizeof (struct memb_join);
memcpy (&addr[addr_idx],
instance->my_proc_list,
instance->my_proc_list_entries *
sizeof (struct srp_addr));
addr_idx +=
instance->my_proc_list_entries *
sizeof (struct srp_addr);
memcpy (&addr[addr_idx],
instance->my_failed_list,
instance->my_failed_list_entries *
sizeof (struct srp_addr));
addr_idx +=
instance->my_failed_list_entries *
sizeof (struct srp_addr);
if (instance->totem_config->send_join_timeout) {
usleep (random() % (instance->totem_config->send_join_timeout * 1000));
}
instance->stats.memb_join_tx++;
totemrrp_mcast_flush_send (
instance->totemrrp_context,
memb_join,
addr_idx);
}
static void memb_leave_message_send (struct totemsrp_instance *instance)
{
char memb_join_data[10000];
struct memb_join *memb_join = (struct memb_join *)memb_join_data;
char *addr;
unsigned int addr_idx;
int active_memb_entries;
struct srp_addr active_memb[PROCESSOR_COUNT_MAX];
log_printf (instance->totemsrp_log_level_debug,
"sending join/leave message\n");
/*
* add us to the failed list, and remove us from
* the members list
*/
memb_set_merge(
&instance->my_id, 1,
instance->my_failed_list, &instance->my_failed_list_entries);
memb_set_subtract (active_memb, &active_memb_entries,
instance->my_proc_list, instance->my_proc_list_entries,
&instance->my_id, 1);
memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN;
memb_join->header.endian_detector = ENDIAN_LOCAL;
memb_join->header.encapsulated = 0;
memb_join->header.nodeid = LEAVE_DUMMY_NODEID;
memb_join->ring_seq = instance->my_ring_id.seq;
memb_join->proc_list_entries = active_memb_entries;
memb_join->failed_list_entries = instance->my_failed_list_entries;
srp_addr_copy (&memb_join->system_from, &instance->my_id);
memb_join->system_from.addr[0].nodeid = LEAVE_DUMMY_NODEID;
// TODO: CC Maybe use the actual join send routine.
/*
* This mess adds the joined and failed processor lists into the join
* message
*/
addr = (char *)memb_join;
addr_idx = sizeof (struct memb_join);
memcpy (&addr[addr_idx],
active_memb,
active_memb_entries *
sizeof (struct srp_addr));
addr_idx +=
active_memb_entries *
sizeof (struct srp_addr);
memcpy (&addr[addr_idx],
instance->my_failed_list,
instance->my_failed_list_entries *
sizeof (struct srp_addr));
addr_idx +=
instance->my_failed_list_entries *
sizeof (struct srp_addr);
if (instance->totem_config->send_join_timeout) {
usleep (random() % (instance->totem_config->send_join_timeout * 1000));
}
instance->stats.memb_join_tx++;
totemrrp_mcast_flush_send (
instance->totemrrp_context,
memb_join,
addr_idx);
}
static void memb_merge_detect_transmit (struct totemsrp_instance *instance)
{
struct memb_merge_detect memb_merge_detect;
memb_merge_detect.header.type = MESSAGE_TYPE_MEMB_MERGE_DETECT;
memb_merge_detect.header.endian_detector = ENDIAN_LOCAL;
memb_merge_detect.header.encapsulated = 0;
memb_merge_detect.header.nodeid = instance->my_id.addr[0].nodeid;
srp_addr_copy (&memb_merge_detect.system_from, &instance->my_id);
memcpy (&memb_merge_detect.ring_id, &instance->my_ring_id,
sizeof (struct memb_ring_id));
assert (memb_merge_detect.header.nodeid);
instance->stats.memb_merge_detect_tx++;
totemrrp_mcast_flush_send (instance->totemrrp_context,
&memb_merge_detect,
sizeof (struct memb_merge_detect));
}
static void memb_ring_id_create_or_load (
struct totemsrp_instance *instance,
struct memb_ring_id *memb_ring_id)
{
int fd;
int res;
char filename[256];
char error_str[100];
snprintf (filename, sizeof(filename), "%s/ringid_%s",
rundir, totemip_print (&instance->my_id.addr[0]));
fd = open (filename, O_RDONLY, 0700);
if (fd > 0) {
res = read (fd, &memb_ring_id->seq, sizeof (unsigned long long));
assert (res == sizeof (unsigned long long));
close (fd);
} else
if (fd == -1 && errno == ENOENT) {
memb_ring_id->seq = 0;
umask(0);
fd = open (filename, O_CREAT|O_RDWR, 0700);
if (fd >= 0) {
res = write (fd, &memb_ring_id->seq, sizeof (unsigned long long));
assert (res == sizeof (unsigned long long));
close (fd);
} else {
strerror_r (errno, error_str, 100);
log_printf (instance->totemsrp_log_level_warning,
"Couldn't create %s %s\n", filename, error_str);
}
} else {
strerror_r (errno, error_str, 100);
log_printf (instance->totemsrp_log_level_warning,
"Couldn't open %s %s\n", filename, error_str);
}
totemip_copy(&memb_ring_id->rep, &instance->my_id.addr[0]);
assert (!totemip_zero_check(&memb_ring_id->rep));
instance->token_ring_id_seq = memb_ring_id->seq;
}
static void memb_ring_id_set_and_store (
struct totemsrp_instance *instance,
const struct memb_ring_id *ring_id)
{
char filename[256];
int fd;
int res;
memcpy (&instance->my_ring_id, ring_id, sizeof (struct memb_ring_id));
snprintf (filename, sizeof(filename), "%s/ringid_%s",
rundir, totemip_print (&instance->my_id.addr[0]));
fd = open (filename, O_WRONLY, 0777);
if (fd == -1) {
fd = open (filename, O_CREAT|O_RDWR, 0777);
}
if (fd == -1) {
char error_str[100];
strerror_r(errno, error_str, 100);
log_printf (instance->totemsrp_log_level_warning,
"Couldn't store new ring id %llx to stable storage (%s)\n",
instance->my_ring_id.seq, error_str);
assert (0);
return;
}
log_printf (instance->totemsrp_log_level_debug,
"Storing new sequence id for ring %llx\n", instance->my_ring_id.seq);
//assert (fd > 0);
res = write (fd, &instance->my_ring_id.seq, sizeof (unsigned long long));
assert (res == sizeof (unsigned long long));
close (fd);
}
int totemsrp_callback_token_create (
void *srp_context,
void **handle_out,
enum totem_callback_token_type type,
int delete,
int (*callback_fn) (enum totem_callback_token_type type, const void *),
const void *data)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;
struct token_callback_instance *callback_handle;
token_hold_cancel_send (instance);
callback_handle = malloc (sizeof (struct token_callback_instance));
if (callback_handle == 0) {
return (-1);
}
*handle_out = (void *)callback_handle;
list_init (&callback_handle->list);
callback_handle->callback_fn = callback_fn;
callback_handle->data = (void *) data;
callback_handle->callback_type = type;
callback_handle->delete = delete;
switch (type) {
case TOTEM_CALLBACK_TOKEN_RECEIVED:
list_add (&callback_handle->list, &instance->token_callback_received_listhead);
break;
case TOTEM_CALLBACK_TOKEN_SENT:
list_add (&callback_handle->list, &instance->token_callback_sent_listhead);
break;
}
return (0);
}
void totemsrp_callback_token_destroy (void *srp_context, void **handle_out)
{
struct token_callback_instance *h;
if (*handle_out) {
h = (struct token_callback_instance *)*handle_out;
list_del (&h->list);
free (h);
h = NULL;
*handle_out = 0;
}
}
static void token_callbacks_execute (
struct totemsrp_instance *instance,
enum totem_callback_token_type type)
{
struct list_head *list;
struct list_head *list_next;
struct list_head *callback_listhead = 0;
struct token_callback_instance *token_callback_instance;
int res;
int del;
switch (type) {
case TOTEM_CALLBACK_TOKEN_RECEIVED:
callback_listhead = &instance->token_callback_received_listhead;
break;
case TOTEM_CALLBACK_TOKEN_SENT:
callback_listhead = &instance->token_callback_sent_listhead;
break;
default:
assert (0);
}
for (list = callback_listhead->next; list != callback_listhead;
list = list_next) {
token_callback_instance = list_entry (list, struct token_callback_instance, list);
list_next = list->next;
del = token_callback_instance->delete;
if (del == 1) {
list_del (list);
}
res = token_callback_instance->callback_fn (
token_callback_instance->callback_type,
token_callback_instance->data);
/*
* This callback failed to execute, try it again on the next token
*/
if (res == -1 && del == 1) {
list_add (list, callback_listhead);
} else if (del) {
free (token_callback_instance);
}
}
}
/*
* Flow control functions
*/
static unsigned int backlog_get (struct totemsrp_instance *instance)
{
unsigned int backlog = 0;
if (instance->memb_state == MEMB_STATE_OPERATIONAL) {
backlog = cs_queue_used (&instance->new_message_queue);
} else
if (instance->memb_state == MEMB_STATE_RECOVERY) {
backlog = cs_queue_used (&instance->retrans_message_queue);
}
instance->stats.token[instance->stats.latest_token].backlog_calc = backlog;
return (backlog);
}
static int fcc_calculate (
struct totemsrp_instance *instance,
struct orf_token *token)
{
unsigned int transmits_allowed;
unsigned int backlog_calc;
transmits_allowed = instance->totem_config->max_messages;
if (transmits_allowed > instance->totem_config->window_size - token->fcc) {
transmits_allowed = instance->totem_config->window_size - token->fcc;
}
instance->my_cbl = backlog_get (instance);
/*
* Only do backlog calculation if there is a backlog otherwise
* we would result in div by zero
*/
if (token->backlog + instance->my_cbl - instance->my_pbl) {
backlog_calc = (instance->totem_config->window_size * instance->my_pbl) /
(token->backlog + instance->my_cbl - instance->my_pbl);
if (backlog_calc > 0 && transmits_allowed > backlog_calc) {
transmits_allowed = backlog_calc;
}
}
return (transmits_allowed);
}
/*
* don't overflow the RTR sort queue
*/
static void fcc_rtr_limit (
struct totemsrp_instance *instance,
struct orf_token *token,
unsigned int *transmits_allowed)
{
int check = QUEUE_RTR_ITEMS_SIZE_MAX;
check -= (*transmits_allowed + instance->totem_config->window_size);
assert (check >= 0);
if (sq_lt_compare (instance->last_released +
QUEUE_RTR_ITEMS_SIZE_MAX - *transmits_allowed -
instance->totem_config->window_size,
token->seq)) {
*transmits_allowed = 0;
}
}
static void fcc_token_update (
struct totemsrp_instance *instance,
struct orf_token *token,
unsigned int msgs_transmitted)
{
token->fcc += msgs_transmitted - instance->my_trc;
token->backlog += instance->my_cbl - instance->my_pbl;
instance->my_trc = msgs_transmitted;
instance->my_pbl = instance->my_cbl;
}
/*
* Message Handlers
*/
unsigned long long int tv_old;
/*
* message handler called when TOKEN message type received
*/
static int message_handler_orf_token (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed)
{
char token_storage[1500];
char token_convert[1500];
struct orf_token *token = NULL;
int forward_token;
unsigned int transmits_allowed;
unsigned int mcasted_retransmit;
unsigned int mcasted_regular;
unsigned int last_aru;
#ifdef GIVEINFO
unsigned long long tv_current;
unsigned long long tv_diff;
tv_current = timerlist_nano_current_get ();
tv_diff = tv_current - tv_old;
tv_old = tv_current;
log_printf (instance->totemsrp_log_level_debug,
"Time since last token %0.4f ms\n", ((float)tv_diff) / 1000000.0);
#endif
#ifdef TEST_DROP_ORF_TOKEN_PERCENTAGE
if (random()%100 < TEST_DROP_ORF_TOKEN_PERCENTAGE) {
return (0);
}
#endif
if (endian_conversion_needed) {
orf_token_endian_convert ((struct orf_token *)msg,
(struct orf_token *)token_convert);
msg = (struct orf_token *)token_convert;
}
/*
* Make copy of token and retransmit list in case we have
* to flush incoming messages from the kernel queue
*/
token = (struct orf_token *)token_storage;
memcpy (token, msg, sizeof (struct orf_token));
memcpy (&token->rtr_list[0], (char *)msg + sizeof (struct orf_token),
sizeof (struct rtr_item) * RETRANSMIT_ENTRIES_MAX);
/*
* Handle merge detection timeout
*/
if (token->seq == instance->my_last_seq) {
start_merge_detect_timeout (instance);
instance->my_seq_unchanged += 1;
} else {
cancel_merge_detect_timeout (instance);
cancel_token_hold_retransmit_timeout (instance);
instance->my_seq_unchanged = 0;
}
instance->my_last_seq = token->seq;
#ifdef TEST_RECOVERY_MSG_COUNT
if (instance->memb_state == MEMB_STATE_OPERATIONAL && token->seq > TEST_RECOVERY_MSG_COUNT) {
return (0);
}
#endif
totemrrp_recv_flush (instance->totemrrp_context);
/*
* Determine if we should hold (in reality drop) the token
*/
instance->my_token_held = 0;
if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0]) &&
instance->my_seq_unchanged > instance->totem_config->seqno_unchanged_const) {
instance->my_token_held = 1;
} else
if (!totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0]) &&
instance->my_seq_unchanged >= instance->totem_config->seqno_unchanged_const) {
instance->my_token_held = 1;
}
/*
* Hold onto token when there is no activity on ring and
* this processor is the ring rep
*/
forward_token = 1;
if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) {
if (instance->my_token_held) {
forward_token = 0;
}
}
token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_RECEIVED);
switch (instance->memb_state) {
case MEMB_STATE_COMMIT:
/* Discard token */
break;
case MEMB_STATE_OPERATIONAL:
messages_free (instance, token->aru);
/*
* Do NOT add break, this case should also execute code in gather case.
*/
case MEMB_STATE_GATHER:
/*
* DO NOT add break, we use different free mechanism in recovery state
*/
case MEMB_STATE_RECOVERY:
/*
* Discard tokens from another configuration
*/
if (memcmp (&token->ring_id, &instance->my_ring_id,
sizeof (struct memb_ring_id)) != 0) {
if ((forward_token)
&& instance->use_heartbeat) {
reset_heartbeat_timeout(instance);
}
else {
cancel_heartbeat_timeout(instance);
}
return (0); /* discard token */
}
/*
* Discard retransmitted tokens
*/
if (sq_lte_compare (token->token_seq, instance->my_token_seq)) {
return (0); /* discard token */
}
last_aru = instance->my_last_aru;
instance->my_last_aru = token->aru;
transmits_allowed = fcc_calculate (instance, token);
mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed);
fcc_rtr_limit (instance, token, &transmits_allowed);
mcasted_regular = orf_token_mcast (instance, token, transmits_allowed);
/*
if (mcasted_regular) {
printf ("mcasted regular %d\n", mcasted_regular);
printf ("token seq %d\n", token->seq);
}
*/
fcc_token_update (instance, token, mcasted_retransmit +
mcasted_regular);
if (sq_lt_compare (instance->my_aru, token->aru) ||
instance->my_id.addr[0].nodeid == token->aru_addr ||
token->aru_addr == 0) {
token->aru = instance->my_aru;
if (token->aru == token->seq) {
token->aru_addr = 0;
} else {
token->aru_addr = instance->my_id.addr[0].nodeid;
}
}
if (token->aru == last_aru && token->aru_addr != 0) {
instance->my_aru_count += 1;
} else {
instance->my_aru_count = 0;
}
if (instance->my_aru_count > instance->totem_config->fail_to_recv_const &&
token->aru_addr == instance->my_id.addr[0].nodeid) {
log_printf (instance->totemsrp_log_level_error,
"FAILED TO RECEIVE\n");
instance->failed_to_recv = 1;
memb_set_merge (&instance->my_id, 1,
instance->my_failed_list,
&instance->my_failed_list_entries);
memb_state_gather_enter (instance, 6);
} else {
instance->my_token_seq = token->token_seq;
token->token_seq += 1;
if (instance->memb_state == MEMB_STATE_RECOVERY) {
/*
* instance->my_aru == instance->my_high_seq_received means this processor
* has recovered all messages it can recover
* (ie: its retrans queue is empty)
*/
if (cs_queue_is_empty (&instance->retrans_message_queue) == 0) {
if (token->retrans_flg == 0) {
token->retrans_flg = 1;
instance->my_set_retrans_flg = 1;
}
} else
if (token->retrans_flg == 1 && instance->my_set_retrans_flg) {
token->retrans_flg = 0;
instance->my_set_retrans_flg = 0;
}
log_printf (instance->totemsrp_log_level_debug,
"token retrans flag is %d my set retrans flag%d retrans queue empty %d count %d, aru %x\n",
token->retrans_flg, instance->my_set_retrans_flg,
cs_queue_is_empty (&instance->retrans_message_queue),
instance->my_retrans_flg_count, token->aru);
if (token->retrans_flg == 0) {
instance->my_retrans_flg_count += 1;
} else {
instance->my_retrans_flg_count = 0;
}
if (instance->my_retrans_flg_count == 2) {
instance->my_install_seq = token->seq;
}
log_printf (instance->totemsrp_log_level_debug,
"install seq %x aru %x high seq received %x\n",
instance->my_install_seq, instance->my_aru, instance->my_high_seq_received);
if (instance->my_retrans_flg_count >= 2 &&
instance->my_received_flg == 0 &&
sq_lte_compare (instance->my_install_seq, instance->my_aru)) {
instance->my_received_flg = 1;
instance->my_deliver_memb_entries = instance->my_trans_memb_entries;
memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list,
sizeof (struct totem_ip_address) * instance->my_trans_memb_entries);
}
if (instance->my_retrans_flg_count >= 3 &&
sq_lte_compare (instance->my_install_seq, token->aru)) {
instance->my_rotation_counter += 1;
} else {
instance->my_rotation_counter = 0;
}
if (instance->my_rotation_counter == 2) {
log_printf (instance->totemsrp_log_level_debug,
"retrans flag count %x token aru %x install seq %x aru %x %x\n",
instance->my_retrans_flg_count, token->aru, instance->my_install_seq,
instance->my_aru, token->seq);
memb_state_operational_enter (instance);
instance->my_rotation_counter = 0;
instance->my_retrans_flg_count = 0;
}
}
totemrrp_send_flush (instance->totemrrp_context);
token_send (instance, token, forward_token);
#ifdef GIVEINFO
tv_current = timerlist_nano_current_get ();
tv_diff = tv_current - tv_old;
tv_old = tv_current;
log_printf (instance->totemsrp_log_level_debug,
"I held %0.4f ms\n",
((float)tv_diff) / 1000000.0);
#endif
if (instance->memb_state == MEMB_STATE_OPERATIONAL) {
messages_deliver_to_app (instance, 0,
instance->my_high_seq_received);
}
/*
* Deliver messages after token has been transmitted
* to improve performance
*/
reset_token_timeout (instance); // REVIEWED
reset_token_retransmit_timeout (instance); // REVIEWED
if (totemip_equal(&instance->my_id.addr[0], &instance->my_ring_id.rep) &&
instance->my_token_held == 1) {
start_token_hold_retransmit_timeout (instance);
}
token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_SENT);
}
break;
}
if ((forward_token)
&& instance->use_heartbeat) {
reset_heartbeat_timeout(instance);
}
else {
cancel_heartbeat_timeout(instance);
}
return (0);
}
static void messages_deliver_to_app (
struct totemsrp_instance *instance,
int skip,
unsigned int end_point)
{
struct sort_queue_item *sort_queue_item_p;
unsigned int i;
int res;
struct mcast *mcast_in;
struct mcast mcast_header;
unsigned int range = 0;
int endian_conversion_required;
unsigned int my_high_delivered_stored = 0;
range = end_point - instance->my_high_delivered;
if (range) {
log_printf (instance->totemsrp_log_level_debug,
"Delivering %x to %x\n", instance->my_high_delivered,
end_point);
}
assert (range < QUEUE_RTR_ITEMS_SIZE_MAX);
my_high_delivered_stored = instance->my_high_delivered;
/*
* Deliver messages in order from rtr queue to pending delivery queue
*/
for (i = 1; i <= range; i++) {
void *ptr = 0;
/*
* If out of range of sort queue, stop assembly
*/
res = sq_in_range (&instance->regular_sort_queue,
my_high_delivered_stored + i);
if (res == 0) {
break;
}
res = sq_item_get (&instance->regular_sort_queue,
my_high_delivered_stored + i, &ptr);
/*
* If hole, stop assembly
*/
if (res != 0 && skip == 0) {
break;
}
instance->my_high_delivered = my_high_delivered_stored + i;
if (res != 0) {
continue;
}
sort_queue_item_p = ptr;
mcast_in = sort_queue_item_p->mcast;
assert (mcast_in != (struct mcast *)0xdeadbeef);
endian_conversion_required = 0;
if (mcast_in->header.endian_detector != ENDIAN_LOCAL) {
endian_conversion_required = 1;
mcast_endian_convert (mcast_in, &mcast_header);
} else {
memcpy (&mcast_header, mcast_in, sizeof (struct mcast));
}
/*
* Skip messages not originated in instance->my_deliver_memb
*/
if (skip &&
memb_set_subset (&mcast_header.system_from,
1,
instance->my_deliver_memb_list,
instance->my_deliver_memb_entries) == 0) {
instance->my_high_delivered = my_high_delivered_stored + i;
continue;
}
/*
* Message found
*/
log_printf (instance->totemsrp_log_level_debug,
"Delivering MCAST message with seq %x to pending delivery queue\n",
mcast_header.seq);
/*
* Message is locally originated multicast
*/
instance->totemsrp_deliver_fn (
mcast_header.header.nodeid,
((char *)sort_queue_item_p->mcast) + sizeof (struct mcast),
sort_queue_item_p->msg_len - sizeof (struct mcast),
endian_conversion_required);
}
}
/*
* recv message handler called when MCAST message type received
*/
static int message_handler_mcast (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed)
{
struct sort_queue_item sort_queue_item;
struct sq *sort_queue;
struct mcast mcast_header;
if (endian_conversion_needed) {
mcast_endian_convert (msg, &mcast_header);
} else {
memcpy (&mcast_header, msg, sizeof (struct mcast));
}
if (mcast_header.header.encapsulated == MESSAGE_ENCAPSULATED) {
sort_queue = &instance->recovery_sort_queue;
} else {
sort_queue = &instance->regular_sort_queue;
}
assert (msg_len <= FRAME_SIZE_MAX);
#ifdef TEST_DROP_MCAST_PERCENTAGE
if (random()%100 < TEST_DROP_MCAST_PERCENTAGE) {
return (0);
}
#endif
/*
* If the message is foreign execute the switch below
*/
if (memcmp (&instance->my_ring_id, &mcast_header.ring_id,
sizeof (struct memb_ring_id)) != 0) {
switch (instance->memb_state) {
case MEMB_STATE_OPERATIONAL:
memb_set_merge (
&mcast_header.system_from, 1,
instance->my_proc_list, &instance->my_proc_list_entries);
memb_state_gather_enter (instance, 7);
break;
case MEMB_STATE_GATHER:
if (!memb_set_subset (
&mcast_header.system_from,
1,
instance->my_proc_list,
instance->my_proc_list_entries)) {
memb_set_merge (&mcast_header.system_from, 1,
instance->my_proc_list, &instance->my_proc_list_entries);
memb_state_gather_enter (instance, 8);
return (0);
}
break;
case MEMB_STATE_COMMIT:
/* discard message */
instance->stats.rx_msg_dropped++;
break;
case MEMB_STATE_RECOVERY:
/* discard message */
instance->stats.rx_msg_dropped++;
break;
}
return (0);
}
log_printf (instance->totemsrp_log_level_debug,
"Received ringid(%s:%lld) seq %x\n",
totemip_print (&mcast_header.ring_id.rep),
mcast_header.ring_id.seq,
mcast_header.seq);
/*
* Add mcast message to rtr queue if not already in rtr queue
* otherwise free io vectors
*/
if (msg_len > 0 && msg_len <= FRAME_SIZE_MAX &&
sq_in_range (sort_queue, mcast_header.seq) &&
sq_item_inuse (sort_queue, mcast_header.seq) == 0) {
/*
* Allocate new multicast memory block
*/
// TODO LEAK
sort_queue_item.mcast = malloc (msg_len);
if (sort_queue_item.mcast == NULL) {
return (-1); /* error here is corrected by the algorithm */
}
memcpy (sort_queue_item.mcast, msg, msg_len);
sort_queue_item.msg_len = msg_len;
if (sq_lt_compare (instance->my_high_seq_received,
mcast_header.seq)) {
instance->my_high_seq_received = mcast_header.seq;
}
sq_item_add (sort_queue, &sort_queue_item, mcast_header.seq);
}
update_aru (instance);
if (instance->memb_state == MEMB_STATE_OPERATIONAL) {
messages_deliver_to_app (instance, 0, instance->my_high_seq_received);
}
/* TODO remove from retrans message queue for old ring in recovery state */
return (0);
}
static int message_handler_memb_merge_detect (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed)
{
struct memb_merge_detect memb_merge_detect;
if (endian_conversion_needed) {
memb_merge_detect_endian_convert (msg, &memb_merge_detect);
} else {
memcpy (&memb_merge_detect, msg,
sizeof (struct memb_merge_detect));
}
/*
* do nothing if this is a merge detect from this configuration
*/
if (memcmp (&instance->my_ring_id, &memb_merge_detect.ring_id,
sizeof (struct memb_ring_id)) == 0) {
return (0);
}
/*
* Execute merge operation
*/
switch (instance->memb_state) {
case MEMB_STATE_OPERATIONAL:
memb_set_merge (&memb_merge_detect.system_from, 1,
instance->my_proc_list, &instance->my_proc_list_entries);
memb_state_gather_enter (instance, 9);
break;
case MEMB_STATE_GATHER:
if (!memb_set_subset (
&memb_merge_detect.system_from,
1,
instance->my_proc_list,
instance->my_proc_list_entries)) {
memb_set_merge (&memb_merge_detect.system_from, 1,
instance->my_proc_list, &instance->my_proc_list_entries);
memb_state_gather_enter (instance, 10);
return (0);
}
break;
case MEMB_STATE_COMMIT:
/* do nothing in commit */
break;
case MEMB_STATE_RECOVERY:
/* do nothing in recovery */
break;
}
return (0);
}
static void memb_join_process (
struct totemsrp_instance *instance,
const struct memb_join *memb_join)
{
struct srp_addr *proc_list;
struct srp_addr *failed_list;
int gather_entered = 0;
int fail_minus_memb_entries = 0;
struct srp_addr fail_minus_memb[PROCESSOR_COUNT_MAX];
proc_list = (struct srp_addr *)memb_join->end_of_memb_join;
failed_list = proc_list + memb_join->proc_list_entries;
/*
memb_set_print ("proclist", proc_list, memb_join->proc_list_entries);
memb_set_print ("faillist", failed_list, memb_join->failed_list_entries);
memb_set_print ("my_proclist", instance->my_proc_list, instance->my_proc_list_entries);
memb_set_print ("my_faillist", instance->my_failed_list, instance->my_failed_list_entries);
-*/
if (memb_set_equal (proc_list,
memb_join->proc_list_entries,
instance->my_proc_list,
instance->my_proc_list_entries) &&
memb_set_equal (failed_list,
memb_join->failed_list_entries,
instance->my_failed_list,
instance->my_failed_list_entries)) {
memb_consensus_set (instance, &memb_join->system_from);
if (memb_consensus_agreed (instance) && instance->failed_to_recv == 1) {
instance->failed_to_recv = 0;
srp_addr_copy (&instance->my_proc_list[0],
&instance->my_id);
instance->my_proc_list_entries = 1;
instance->my_failed_list_entries = 0;
memb_state_commit_token_create (instance);
memb_state_commit_enter (instance);
return;
}
if (memb_consensus_agreed (instance) &&
memb_lowest_in_config (instance)) {
memb_state_commit_token_create (instance);
memb_state_commit_enter (instance);
} else {
return;
}
} else
if (memb_set_subset (proc_list,
memb_join->proc_list_entries,
instance->my_proc_list,
instance->my_proc_list_entries) &&
memb_set_subset (failed_list,
memb_join->failed_list_entries,
instance->my_failed_list,
instance->my_failed_list_entries)) {
return;
} else
if (memb_set_subset (&memb_join->system_from, 1,
instance->my_failed_list, instance->my_failed_list_entries)) {
return;
} else {
memb_set_merge (proc_list,
memb_join->proc_list_entries,
instance->my_proc_list, &instance->my_proc_list_entries);
if (memb_set_subset (
&instance->my_id, 1,
failed_list, memb_join->failed_list_entries)) {
memb_set_merge (
&memb_join->system_from, 1,
instance->my_failed_list, &instance->my_failed_list_entries);
} else {
if (memb_set_subset (
&memb_join->system_from, 1,
instance->my_memb_list,
instance->my_memb_entries)) {
if (memb_set_subset (
&memb_join->system_from, 1,
instance->my_failed_list,
instance->my_failed_list_entries) == 0) {
memb_set_merge (failed_list,
memb_join->failed_list_entries,
instance->my_failed_list, &instance->my_failed_list_entries);
} else {
memb_set_subtract (fail_minus_memb,
&fail_minus_memb_entries,
failed_list,
memb_join->failed_list_entries,
instance->my_memb_list,
instance->my_memb_entries);
memb_set_merge (fail_minus_memb,
fail_minus_memb_entries,
instance->my_failed_list,
&instance->my_failed_list_entries);
}
}
}
memb_state_gather_enter (instance, 11);
gather_entered = 1;
}
if (gather_entered == 0 &&
instance->memb_state == MEMB_STATE_OPERATIONAL) {
memb_state_gather_enter (instance, 12);
}
}
static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out)
{
int i;
struct srp_addr *in_proc_list;
struct srp_addr *in_failed_list;
struct srp_addr *out_proc_list;
struct srp_addr *out_failed_list;
out->header.type = in->header.type;
out->header.endian_detector = ENDIAN_LOCAL;
out->header.nodeid = swab32 (in->header.nodeid);
srp_addr_copy_endian_convert (&out->system_from, &in->system_from);
out->proc_list_entries = swab32 (in->proc_list_entries);
out->failed_list_entries = swab32 (in->failed_list_entries);
out->ring_seq = swab64 (in->ring_seq);
in_proc_list = (struct srp_addr *)in->end_of_memb_join;
in_failed_list = in_proc_list + out->proc_list_entries;
out_proc_list = (struct srp_addr *)out->end_of_memb_join;
out_failed_list = out_proc_list + out->proc_list_entries;
for (i = 0; i < out->proc_list_entries; i++) {
srp_addr_copy_endian_convert (&out_proc_list[i], &in_proc_list[i]);
}
for (i = 0; i < out->failed_list_entries; i++) {
srp_addr_copy_endian_convert (&out_failed_list[i], &in_failed_list[i]);
}
}
static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out)
{
int i;
struct srp_addr *in_addr = (struct srp_addr *)in->end_of_commit_token;
struct srp_addr *out_addr = (struct srp_addr *)out->end_of_commit_token;
struct memb_commit_token_memb_entry *in_memb_list;
struct memb_commit_token_memb_entry *out_memb_list;
out->header.type = in->header.type;
out->header.endian_detector = ENDIAN_LOCAL;
out->header.nodeid = swab32 (in->header.nodeid);
out->token_seq = swab32 (in->token_seq);
totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep);
out->ring_id.seq = swab64 (in->ring_id.seq);
out->retrans_flg = swab32 (in->retrans_flg);
out->memb_index = swab32 (in->memb_index);
out->addr_entries = swab32 (in->addr_entries);
in_memb_list = (struct memb_commit_token_memb_entry *)(in_addr + out->addr_entries);
out_memb_list = (struct memb_commit_token_memb_entry *)(out_addr + out->addr_entries);
for (i = 0; i < out->addr_entries; i++) {
srp_addr_copy_endian_convert (&out_addr[i], &in_addr[i]);
/*
* Only convert the memb entry if it has been set
*/
if (in_memb_list[i].ring_id.rep.family != 0) {
totemip_copy_endian_convert (&out_memb_list[i].ring_id.rep,
&in_memb_list[i].ring_id.rep);
out_memb_list[i].ring_id.seq =
swab64 (in_memb_list[i].ring_id.seq);
out_memb_list[i].aru = swab32 (in_memb_list[i].aru);
out_memb_list[i].high_delivered = swab32 (in_memb_list[i].high_delivered);
out_memb_list[i].received_flg = swab32 (in_memb_list[i].received_flg);
}
}
}
static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out)
{
int i;
out->header.type = in->header.type;
out->header.endian_detector = ENDIAN_LOCAL;
out->header.nodeid = swab32 (in->header.nodeid);
out->seq = swab32 (in->seq);
out->token_seq = swab32 (in->token_seq);
out->aru = swab32 (in->aru);
totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep);
out->aru_addr = swab32(in->aru_addr);
out->ring_id.seq = swab64 (in->ring_id.seq);
out->fcc = swab32 (in->fcc);
out->backlog = swab32 (in->backlog);
out->retrans_flg = swab32 (in->retrans_flg);
out->rtr_list_entries = swab32 (in->rtr_list_entries);
for (i = 0; i < out->rtr_list_entries; i++) {
totemip_copy_endian_convert(&out->rtr_list[i].ring_id.rep, &in->rtr_list[i].ring_id.rep);
out->rtr_list[i].ring_id.seq = swab64 (in->rtr_list[i].ring_id.seq);
out->rtr_list[i].seq = swab32 (in->rtr_list[i].seq);
}
}
static void mcast_endian_convert (const struct mcast *in, struct mcast *out)
{
out->header.type = in->header.type;
out->header.endian_detector = ENDIAN_LOCAL;
out->header.nodeid = swab32 (in->header.nodeid);
out->header.encapsulated = in->header.encapsulated;
out->seq = swab32 (in->seq);
out->this_seqno = swab32 (in->this_seqno);
totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep);
out->ring_id.seq = swab64 (in->ring_id.seq);
out->node_id = swab32 (in->node_id);
out->guarantee = swab32 (in->guarantee);
srp_addr_copy_endian_convert (&out->system_from, &in->system_from);
}
static void memb_merge_detect_endian_convert (
const struct memb_merge_detect *in,
struct memb_merge_detect *out)
{
out->header.type = in->header.type;
out->header.endian_detector = ENDIAN_LOCAL;
out->header.nodeid = swab32 (in->header.nodeid);
totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep);
out->ring_id.seq = swab64 (in->ring_id.seq);
srp_addr_copy_endian_convert (&out->system_from, &in->system_from);
}
static int message_handler_memb_join (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed)
{
const struct memb_join *memb_join;
struct memb_join *memb_join_convert = alloca (msg_len);
if (endian_conversion_needed) {
memb_join = memb_join_convert;
memb_join_endian_convert (msg, memb_join_convert);
} else {
memb_join = msg;
}
/*
* If the process paused because it wasn't scheduled in a timely
* fashion, flush the join messages because they may be queued
* entries
*/
if (pause_flush (instance)) {
return (0);
}
if (instance->token_ring_id_seq < memb_join->ring_seq) {
instance->token_ring_id_seq = memb_join->ring_seq;
}
switch (instance->memb_state) {
case MEMB_STATE_OPERATIONAL:
memb_join_process (instance, memb_join);
break;
case MEMB_STATE_GATHER:
memb_join_process (instance, memb_join);
break;
case MEMB_STATE_COMMIT:
if (memb_set_subset (&memb_join->system_from,
1,
instance->my_new_memb_list,
instance->my_new_memb_entries) &&
memb_join->ring_seq >= instance->my_ring_id.seq) {
memb_join_process (instance, memb_join);
memb_state_gather_enter (instance, 13);
}
break;
case MEMB_STATE_RECOVERY:
if (memb_set_subset (&memb_join->system_from,
1,
instance->my_new_memb_list,
instance->my_new_memb_entries) &&
memb_join->ring_seq >= instance->my_ring_id.seq) {
ring_state_restore (instance);
memb_join_process (instance, memb_join);
memb_state_gather_enter (instance, 14);
}
break;
}
return (0);
}
static int message_handler_memb_commit_token (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed)
{
struct memb_commit_token *memb_commit_token_convert = alloca (msg_len);
struct memb_commit_token *memb_commit_token;
struct srp_addr sub[PROCESSOR_COUNT_MAX];
int sub_entries;
struct srp_addr *addr;
struct memb_commit_token_memb_entry *memb_list;
log_printf (instance->totemsrp_log_level_debug,
"got commit token\n");
if (endian_conversion_needed) {
memb_commit_token_endian_convert (msg, memb_commit_token_convert);
} else {
memcpy (memb_commit_token_convert, msg, msg_len);
}
memb_commit_token = memb_commit_token_convert;
addr = (struct srp_addr *)memb_commit_token->end_of_commit_token;
memb_list = (struct memb_commit_token_memb_entry *)(addr + memb_commit_token->addr_entries);
#ifdef TEST_DROP_COMMIT_TOKEN_PERCENTAGE
if (random()%100 < TEST_DROP_COMMIT_TOKEN_PERCENTAGE) {
return (0);
}
#endif
switch (instance->memb_state) {
case MEMB_STATE_OPERATIONAL:
/* discard token */
break;
case MEMB_STATE_GATHER:
memb_set_subtract (sub, &sub_entries,
instance->my_proc_list, instance->my_proc_list_entries,
instance->my_failed_list, instance->my_failed_list_entries);
if (memb_set_equal (addr,
memb_commit_token->addr_entries,
sub,
sub_entries) &&
memb_commit_token->ring_id.seq > instance->my_ring_id.seq) {
memcpy (instance->commit_token, memb_commit_token, msg_len);
memb_state_commit_enter (instance);
}
break;
case MEMB_STATE_COMMIT:
/*
* If retransmitted commit tokens are sent on this ring
* filter them out and only enter recovery once the
* commit token has traversed the array. This is
* determined by :
* memb_commit_token->memb_index == memb_commit_token->addr_entries) {
*/
if (memb_commit_token->ring_id.seq == instance->my_ring_id.seq &&
memb_commit_token->memb_index == memb_commit_token->addr_entries) {
memb_state_recovery_enter (instance, memb_commit_token);
}
break;
case MEMB_STATE_RECOVERY:
if (totemip_equal (&instance->my_id.addr[0], &instance->my_ring_id.rep)) {
log_printf (instance->totemsrp_log_level_debug,
"Sending initial ORF token\n");
// TODO convert instead of initiate
orf_token_send_initial (instance);
reset_token_timeout (instance); // REVIEWED
reset_token_retransmit_timeout (instance); // REVIEWED
}
break;
}
return (0);
}
static int message_handler_token_hold_cancel (
struct totemsrp_instance *instance,
const void *msg,
size_t msg_len,
int endian_conversion_needed)
{
const struct token_hold_cancel *token_hold_cancel = msg;
if (memcmp (&token_hold_cancel->ring_id, &instance->my_ring_id,
sizeof (struct memb_ring_id)) == 0) {
instance->my_seq_unchanged = 0;
if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) {
timer_function_token_retransmit_timeout (instance);
}
}
return (0);
}
void main_deliver_fn (
void *context,
const void *msg,
unsigned int msg_len)
{
struct totemsrp_instance *instance = context;
const struct message_header *message_header = msg;
if (msg_len < sizeof (struct message_header)) {
log_printf (instance->totemsrp_log_level_security,
"Received message is too short... ignoring %u.\n",
(unsigned int)msg_len);
return;
}
if ((int)message_header->type >= totemsrp_message_handlers.count) {
log_printf (instance->totemsrp_log_level_security, "Type of received message is wrong... ignoring %d.\n", (int)message_header->type);
printf ("wrong message type\n");
instance->stats.rx_msg_dropped++;
return;
}
switch (message_header->type) {
case MESSAGE_TYPE_ORF_TOKEN:
instance->stats.orf_token_rx++;
break;
case MESSAGE_TYPE_MCAST:
instance->stats.mcast_rx++;
break;
case MESSAGE_TYPE_MEMB_MERGE_DETECT:
instance->stats.memb_merge_detect_rx++;
break;
case MESSAGE_TYPE_MEMB_JOIN:
instance->stats.memb_join_rx++;
break;
case MESSAGE_TYPE_MEMB_COMMIT_TOKEN:
instance->stats.memb_commit_token_rx++;
break;
case MESSAGE_TYPE_TOKEN_HOLD_CANCEL:
instance->stats.token_hold_cancel_rx++;
break;
default:
break;
}
/*
* Handle incoming message
*/
totemsrp_message_handlers.handler_functions[(int)message_header->type] (
instance,
msg,
msg_len,
message_header->endian_detector != ENDIAN_LOCAL);
}
void main_iface_change_fn (
void *context,
const struct totem_ip_address *iface_addr,
unsigned int iface_no)
{
struct totemsrp_instance *instance = context;
int i;
totemip_copy (&instance->my_id.addr[iface_no], iface_addr);
assert (instance->my_id.addr[iface_no].nodeid);
totemip_copy (&instance->my_memb_list[0].addr[iface_no], iface_addr);
if (instance->iface_changes++ == 0) {
memb_ring_id_create_or_load (instance, &instance->my_ring_id);
log_printf (
instance->totemsrp_log_level_debug,
"Created or loaded sequence id %lld.%s for this ring.\n",
instance->my_ring_id.seq,
totemip_print (&instance->my_ring_id.rep));
for (i = 0; i < instance->totem_config->interfaces[iface_no].member_count; i++) {
totemsrp_member_add (instance,
&instance->totem_config->interfaces[iface_no].member_list[i],
iface_no);
}
if (instance->totemsrp_service_ready_fn) {
instance->totemsrp_service_ready_fn ();
}
}
if (instance->iface_changes >= instance->totem_config->interface_count) {
memb_state_gather_enter (instance, 15);
}
}
void totemsrp_net_mtu_adjust (struct totem_config *totem_config) {
totem_config->net_mtu -= sizeof (struct mcast);
}
void totemsrp_service_ready_register (
void *context,
void (*totem_service_ready) (void))
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)context;
instance->totemsrp_service_ready_fn = totem_service_ready;
}
int totemsrp_member_add (
void *context,
const struct totem_ip_address *member,
int ring_no)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)context;
int res;
res = totemrrp_member_add (instance->totemrrp_context, member, ring_no);
return (res);
}
int totemsrp_member_remove (
void *context,
const struct totem_ip_address *member,
int ring_no)
{
struct totemsrp_instance *instance = (struct totemsrp_instance *)context;
int res;
res = totemrrp_member_remove (instance->totemrrp_context, member, ring_no);
return (res);
}
diff --git a/include/corosync/sq.h b/include/corosync/sq.h
index aa5bc1e0..ce3b1f86 100644
--- a/include/corosync/sq.h
+++ b/include/corosync/sq.h
@@ -1,293 +1,317 @@
/*
* Copyright (c) 2003-2004 MontaVista Software, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SORTQUEUE_H_DEFINED
#define SORTQUEUE_H_DEFINED
#include <errno.h>
#include <string.h>
struct sq {
unsigned int head;
unsigned int size;
void *items;
unsigned int *items_inuse;
+ unsigned int *items_miss_count;
unsigned int size_per_item;
unsigned int head_seqid;
unsigned int item_count;
unsigned int pos_max;
};
/*
* Compare a unsigned rollover-safe value to an unsigned rollover-safe value
*/
/*
* ADJUST_ROLLOVER_POINT is the value used to determine when a window should be
* used to calculate a less-then or less-then-equal comparison.
*
* ADJUST_ROLLOVER_VALUE is the value by which both values in a comparison are
* adjusted if either value in a comparison is greater then
* ADJUST_ROLLOVER_POINT.
*/
#define ADJUST_ROLLOVER_POINT 0x80000000
#define ADJUST_ROLLOVER_VALUE 0x10000
static inline int sq_lt_compare (unsigned int a, unsigned int b) {
if ((a > ADJUST_ROLLOVER_POINT) || (b > ADJUST_ROLLOVER_POINT)) {
if ((a - ADJUST_ROLLOVER_VALUE) < (b - ADJUST_ROLLOVER_VALUE)) {
return (1);
}
} else {
if (a < b) {
return (1);
}
}
return (0);
}
static inline int sq_lte_compare (unsigned int a, unsigned int b) {
if ((a > ADJUST_ROLLOVER_POINT) || (b > ADJUST_ROLLOVER_POINT)) {
if ((a - ADJUST_ROLLOVER_VALUE) <= (b - ADJUST_ROLLOVER_VALUE)) {
return (1);
}
} else {
if (a <= b) {
return (1);
}
}
return (0);
}
static inline int sq_init (
struct sq *sq,
int item_count,
int size_per_item,
int head_seqid)
{
sq->head = 0;
sq->size = item_count;
sq->size_per_item = size_per_item;
sq->head_seqid = head_seqid;
sq->item_count = item_count;
sq->pos_max = 0;
sq->items = malloc (item_count * size_per_item);
if (sq->items == NULL) {
return (-ENOMEM);
}
memset (sq->items, 0, item_count * size_per_item);
if ((sq->items_inuse = malloc (item_count * sizeof (unsigned int)))
== NULL) {
return (-ENOMEM);
}
+ if ((sq->items_miss_count = malloc (item_count * sizeof (unsigned int)))
+ == NULL) {
+ return (-ENOMEM);
+ }
memset (sq->items_inuse, 0, item_count * sizeof (unsigned int));
+ memset (sq->items_miss_count, 0, item_count * sizeof (unsigned int));
return (0);
}
static inline void sq_reinit (struct sq *sq, unsigned int head_seqid)
{
sq->head = 0;
sq->head_seqid = head_seqid;
sq->pos_max = 0;
memset (sq->items, 0, sq->item_count * sq->size_per_item);
memset (sq->items_inuse, 0, sq->item_count * sizeof (unsigned int));
+ memset (sq->items_miss_count, 0, sq->item_count * sizeof (unsigned int));
}
static inline void sq_assert (const struct sq *sq, unsigned int pos)
{
unsigned int i;
// printf ("Instrument[%d] Asserting from %d to %d\n",
// pos, sq->pos_max, sq->size);
for (i = sq->pos_max + 1; i < sq->size; i++) {
assert (sq->items_inuse[i] == 0);
}
}
static inline void sq_copy (struct sq *sq_dest, const struct sq *sq_src)
{
sq_assert (sq_src, 20);
sq_dest->head = sq_src->head;
sq_dest->size = sq_src->item_count;
sq_dest->size_per_item = sq_src->size_per_item;
sq_dest->head_seqid = sq_src->head_seqid;
sq_dest->item_count = sq_src->item_count;
sq_dest->pos_max = sq_src->pos_max;
memcpy (sq_dest->items, sq_src->items,
sq_src->item_count * sq_src->size_per_item);
memcpy (sq_dest->items_inuse, sq_src->items_inuse,
sq_src->item_count * sizeof (unsigned int));
+ memcpy (sq_dest->items_miss_count, sq_src->items_miss_count,
+ sq_src->item_count * sizeof (unsigned int));
}
static inline void sq_free (struct sq *sq) {
free (sq->items);
free (sq->items_inuse);
+ free (sq->items_miss_count);
}
static inline void *sq_item_add (
struct sq *sq,
void *item,
unsigned int seqid)
{
char *sq_item;
unsigned int sq_position;
sq_position = (sq->head + seqid - sq->head_seqid) % sq->size;
if (sq_position > sq->pos_max) {
sq->pos_max = sq_position;
}
sq_item = sq->items;
sq_item += sq_position * sq->size_per_item;
assert(sq->items_inuse[sq_position] == 0);
memcpy (sq_item, item, sq->size_per_item);
if (seqid == 0) {
sq->items_inuse[sq_position] = 1;
} else {
sq->items_inuse[sq_position] = seqid;
}
+ sq->items_miss_count[sq_position] = 0;
return (sq_item);
}
static inline unsigned int sq_item_inuse (
const struct sq *sq,
unsigned int seq_id) {
unsigned int sq_position;
/*
* We need to say that the seqid is in use if it shouldn't
* be here in the first place.
* To keep old messages from being inserted.
*/
#ifdef COMPILE_OUT
if (seq_id < sq->head_seqid) {
fprintf(stderr, "sq_item_inuse: seqid %d, head %d\n",
seq_id, sq->head_seqid);
return 1;
}
#endif
sq_position = (sq->head - sq->head_seqid + seq_id) % sq->size;
return (sq->items_inuse[sq_position] != 0);
}
+static inline unsigned int sq_item_miss_count (
+ const struct sq *sq,
+ unsigned int seq_id)
+{
+ unsigned int sq_position;
+
+ sq_position = (sq->head - sq->head_seqid + seq_id) % sq->size;
+ sq->items_miss_count[sq_position]++;
+ return (sq->items_miss_count[sq_position]);
+}
+
static inline unsigned int sq_size_get (
const struct sq *sq)
{
return sq->size;
}
static inline unsigned int sq_in_range (
const struct sq *sq,
unsigned int seq_id)
{
int res = 1;
if (sq->head_seqid > ADJUST_ROLLOVER_POINT) {
if (seq_id - ADJUST_ROLLOVER_VALUE <
sq->head_seqid - ADJUST_ROLLOVER_VALUE) {
res = 0;
}
if ((seq_id - ADJUST_ROLLOVER_VALUE) >=
((sq->head_seqid - ADJUST_ROLLOVER_VALUE) + sq->size)) {
res = 0;
}
} else {
if (seq_id < sq->head_seqid) {
res = 0;
}
if ((seq_id) >= ((sq->head_seqid) + sq->size)) {
res = 0;
}
}
return (res);
}
static inline unsigned int sq_item_get (
const struct sq *sq,
unsigned int seq_id,
void **sq_item_out)
{
char *sq_item;
unsigned int sq_position;
if (seq_id > ADJUST_ROLLOVER_POINT) {
assert ((seq_id - ADJUST_ROLLOVER_POINT) <
((sq->head_seqid - ADJUST_ROLLOVER_POINT) + sq->size));
sq_position = ((sq->head - ADJUST_ROLLOVER_VALUE) -
(sq->head_seqid - ADJUST_ROLLOVER_VALUE) + seq_id) % sq->size;
} else {
assert (seq_id < (sq->head_seqid + sq->size));
sq_position = (sq->head - sq->head_seqid + seq_id) % sq->size;
}
//printf ("seqid %x head %x head %x pos %x\n", seq_id, sq->head, sq->head_seqid, sq_position);
// sq_position = (sq->head - sq->head_seqid + seq_id) % sq->size;
//printf ("sq_position = %x\n", sq_position);
//printf ("ITEMGET %d %d %d %d\n", sq_position, sq->head, sq->head_seqid, seq_id);
if (sq->items_inuse[sq_position] == 0) {
return (ENOENT);
}
sq_item = sq->items;
sq_item += sq_position * sq->size_per_item;
*sq_item_out = sq_item;
return (0);
}
static inline void sq_items_release (struct sq *sq, unsigned int seqid)
{
unsigned int oldhead;
oldhead = sq->head;
sq->head = (sq->head + seqid - sq->head_seqid + 1) % sq->size;
if ((oldhead + seqid - sq->head_seqid + 1) > sq->size) {
// printf ("releasing %d for %d\n", oldhead, sq->size - oldhead);
// printf ("releasing %d for %d\n", 0, sq->head);
memset (&sq->items_inuse[oldhead], 0, (sq->size - oldhead) * sizeof (unsigned int));
memset (sq->items_inuse, 0, sq->head * sizeof (unsigned int));
} else {
// printf ("releasing %d for %d\n", oldhead, seqid - sq->head_seqid + 1);
memset (&sq->items_inuse[oldhead], 0,
(seqid - sq->head_seqid + 1) * sizeof (unsigned int));
+ memset (&sq->items_miss_count[oldhead], 0,
+ (seqid - sq->head_seqid + 1) * sizeof (unsigned int));
}
sq->head_seqid = seqid + 1;
}
#endif /* SORTQUEUE_H_DEFINED */
diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h
index b84d9ba5..227745f8 100644
--- a/include/corosync/totem/totem.h
+++ b/include/corosync/totem/totem.h
@@ -1,270 +1,275 @@
/*
* Copyright (c) 2005 MontaVista Software, Inc.
* Copyright (c) 2006-2009 Red Hat, Inc.
*
* Author: Steven Dake (sdake@redhat.com)
*
* All rights reserved.
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef TOTEM_H_DEFINED
#define TOTEM_H_DEFINED
#include "totemip.h"
#include <corosync/hdb.h>
#ifdef HAVE_SMALL_MEMORY_FOOTPRINT
#define PROCESSOR_COUNT_MAX 16
#define MESSAGE_SIZE_MAX 1024*64
#define MESSAGE_QUEUE_MAX 512
#else
#define PROCESSOR_COUNT_MAX 384
#define MESSAGE_SIZE_MAX 1024*1024 /* (1MB) */
#define MESSAGE_QUEUE_MAX MESSAGE_SIZE_MAX / totem_config->net_mtu
#endif /* HAVE_SMALL_MEMORY_FOOTPRINT */
#define FRAME_SIZE_MAX 10000
#define TRANSMITS_ALLOWED 16
#define SEND_THREADS_MAX 16
#define INTERFACE_MAX 2
struct totem_interface {
struct totem_ip_address bindnet;
struct totem_ip_address boundto;
struct totem_ip_address mcast_addr;
uint16_t ip_port;
int member_count;
struct totem_ip_address member_list[PROCESSOR_COUNT_MAX];
};
struct totem_logging_configuration {
void (*log_printf) (
unsigned int rec_ident,
const char *function_name,
const char *file_name,
int file_line,
const char *format,
...) __attribute__((format(printf, 5, 6)));
int log_level_security;
int log_level_error;
int log_level_warning;
int log_level_notice;
int log_level_debug;
int log_subsys_id;
};
enum { TOTEM_PRIVATE_KEY_LEN = 128 };
enum { TOTEM_RRP_MODE_BYTES = 64 };
typedef enum {
TOTEM_TRANSPORT_UDP = 0,
TOTEM_TRANSPORT_UDPU = 1,
TOTEM_TRANSPORT_RDMA = 2
} totem_transport_t;
struct totem_config {
int version;
/*
* network
*/
struct totem_interface *interfaces;
unsigned int interface_count;
unsigned int node_id;
unsigned int clear_node_high_bit;
/*
* key information
*/
unsigned char private_key[TOTEM_PRIVATE_KEY_LEN];
unsigned int private_key_len;
/*
* Totem configuration parameters
*/
unsigned int token_timeout;
unsigned int token_retransmit_timeout;
unsigned int token_hold_timeout;
unsigned int token_retransmits_before_loss_const;
unsigned int join_timeout;
unsigned int send_join_timeout;
unsigned int consensus_timeout;
unsigned int merge_timeout;
unsigned int downcheck_timeout;
unsigned int fail_to_recv_const;
unsigned int seqno_unchanged_const;
unsigned int rrp_token_expired_timeout;
unsigned int rrp_problem_count_timeout;
unsigned int rrp_problem_count_threshold;
char rrp_mode[TOTEM_RRP_MODE_BYTES];
struct totem_logging_configuration totem_logging_configuration;
void (*log_rec) (
int subsysid,
const char *function_name,
const char *file_name,
int file_line,
unsigned int rec_ident,
...);
unsigned int secauth;
unsigned int net_mtu;
unsigned int threads;
unsigned int heartbeat_failures_allowed;
unsigned int max_network_delay;
unsigned int window_size;
unsigned int max_messages;
const char *vsf_type;
unsigned int broadcast_use;
enum { TOTEM_CRYPTO_SOBER=0, TOTEM_CRYPTO_NSS } crypto_type;
enum { TOTEM_CRYPTO_ACCEPT_OLD=0, TOTEM_CRYPTO_ACCEPT_NEW } crypto_accept;
int crypto_crypt_type;
int crypto_sign_type;
totem_transport_t transport_number;
+
+ unsigned int miss_count_const;
};
#define TOTEM_CONFIGURATION_TYPE
enum totem_configuration_type {
TOTEM_CONFIGURATION_REGULAR,
TOTEM_CONFIGURATION_TRANSITIONAL
};
#define TOTEM_CALLBACK_TOKEN_TYPE
enum totem_callback_token_type {
TOTEM_CALLBACK_TOKEN_RECEIVED = 1,
TOTEM_CALLBACK_TOKEN_SENT = 2
};
enum totem_event_type {
TOTEM_EVENT_DELIVERY_CONGESTED,
TOTEM_EVENT_NEW_MSG,
};
#define MEMB_RING_ID
struct memb_ring_id {
struct totem_ip_address rep;
unsigned long long seq;
} __attribute__((packed));
typedef struct {
hdb_handle_t handle;
int is_dirty;
time_t last_updated;
} totem_stats_header_t;
typedef struct {
totem_stats_header_t hdr;
uint32_t iface_changes;
} totemnet_stats_t;
typedef struct {
totem_stats_header_t hdr;
totemnet_stats_t *net;
char *algo_name;
} totemrrp_stats_t;
typedef struct {
uint32_t rx;
uint32_t tx;
int backlog_calc;
} totemsrp_token_stats_t;
typedef struct {
totem_stats_header_t hdr;
totemrrp_stats_t *rrp;
uint64_t orf_token_tx;
uint64_t orf_token_rx;
uint64_t memb_merge_detect_tx;
uint64_t memb_merge_detect_rx;
uint64_t memb_join_tx;
uint64_t memb_join_rx;
uint64_t mcast_tx;
uint64_t mcast_retx;
uint64_t mcast_rx;
uint64_t memb_commit_token_tx;
uint64_t memb_commit_token_rx;
uint64_t token_hold_cancel_tx;
uint64_t token_hold_cancel_rx;
uint64_t operational_entered;
uint64_t operational_token_lost;
uint64_t gather_entered;
uint64_t gather_token_lost;
uint64_t commit_entered;
uint64_t commit_token_lost;
uint64_t recovery_entered;
uint64_t recovery_token_lost;
uint64_t consensus_timeouts;
uint64_t rx_msg_dropped;
int earliest_token;
int latest_token;
#define TOTEM_TOKEN_STATS_MAX 100
totemsrp_token_stats_t token[TOTEM_TOKEN_STATS_MAX];
} totemsrp_stats_t;
+
+ #define TOTEM_CONFIGURATION_TYPE
+
typedef struct {
totem_stats_header_t hdr;
totemsrp_stats_t *srp;
} totemmrp_stats_t;
typedef struct {
totem_stats_header_t hdr;
totemmrp_stats_t *mrp;
} totempg_stats_t;
#endif /* TOTEM_H_DEFINED */
diff --git a/man/corosync.conf.5 b/man/corosync.conf.5
index 8fc3dc6f..2faea0f9 100644
--- a/man/corosync.conf.5
+++ b/man/corosync.conf.5
@@ -1,614 +1,624 @@
.\"/*
.\" * Copyright (c) 2005 MontaVista Software, Inc.
.\" * Copyright (c) 2006-2010 Red Hat, Inc.
.\" *
.\" * All rights reserved.
.\" *
.\" * Author: Steven Dake (sdake@redhat.com)
.\" *
.\" * This software licensed under BSD license, the text of which follows:
.\" *
.\" * Redistribution and use in source and binary forms, with or without
.\" * modification, are permitted provided that the following conditions are met:
.\" *
.\" * - Redistributions of source code must retain the above copyright notice,
.\" * this list of conditions and the following disclaimer.
.\" * - Redistributions in binary form must reproduce the above copyright notice,
.\" * this list of conditions and the following disclaimer in the documentation
.\" * and/or other materials provided with the distribution.
.\" * - Neither the name of the MontaVista Software, Inc. nor the names of its
.\" * contributors may be used to endorse or promote products derived from this
.\" * software without specific prior written permission.
.\" *
.\" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
.\" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
.\" * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
.\" * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
.\" * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
.\" * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
.\" * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
.\" * THE POSSIBILITY OF SUCH DAMAGE.
.\" */
.TH COROSYNC_CONF 5 2006-03-28 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
.SH NAME
corosync.conf - corosync executive configuration file
.SH SYNOPSIS
/etc/corosync.conf
.SH DESCRIPTION
The corosync.conf instructs the corosync executive about various parameters
needed to control the corosync executive. Empty lines and lines starting with
# character are ignored. The configuration file consists of bracketed top level
directives. The possible directive choices are:
.TP
totem { }
This top level directive contains configuration options for the totem protocol.
.TP
logging { }
This top level directive contains configuration options for logging.
.TP
event { }
This top level directive contains configuration options for the event service.
.PP
.PP
It is also possible to specify the top level parameter
.B compatibility.
This directive indicates the level of compatibility requested by the user. The
option whitetank can be specified to remain backward compatable with
openais-0.80.z. The option none can be specified to only be compatable
with corosync-1.Y.Z. Extra processing during configuration changes is
required to remain backward compatable.
The default is whitetank. (backwards compatibility)
.PP
.PP
Within the
.B totem
directive, an interface directive is required. There is also one configuration
option which is required:
.PP
.PP
Within the
.B interface
sub-directive of totem there are four parameters which are required. There is
one parameter which is optional.
.TP
ringnumber
This specifies the ring number for the interface. When using the redundant
ring protocol, each interface should specify separate ring numbers to uniquely
identify to the membership protocol which interface to use for which redundant
ring. The ringnumber must start at 0.
.TP
bindnetaddr
This specifies the network address the corosync executive should bind
to. For example, if the local interface is 192.168.5.92 with netmask
255.255.255.0, set bindnetaddr to 192.168.5.0. If the local interface
is 192.168.5.92 with netmask 255.255.255.192, set bindnetaddr to
192.168.5.64, and so forth.
This may also be an IPV6 address, in which case IPV6 networking will be used.
In this case, the full address must be specified and there is no automatic
selection of the network interface within a specific subnet as with IPv4.
If IPv6 networking is used, the nodeid field must be specified.
.TP
broadcast
This is optional and can be set to yes. If it is set to yes, the broadcast
address will be used for communication. If this option is set, mcastaddr
should not be set.
.TP
mcastaddr
This is the multicast address used by corosync executive. The default
should work for most networks, but the network administrator should be queried
about a multicast address to use. Avoid 224.x.x.x because this is a "config"
multicast address.
This may also be an IPV6 multicast address, in which case IPV6 networking
will be used. If IPv6 networking is used, the nodeid field must be specified.
.TP
mcastport
This specifies the UDP port number. It is possible to use the same multicast
address on a network with the corosync services configured for different
UDP ports.
Please note corosync uses two UDP ports mcastport (for mcast receives) and
mcastport - 1 (for mcast sends).
If you have multiple clusters on the same network using the same mcastaddr
please configure the mcastports with a gap.
.TP
member
This specifies a member on the interface and used with the udpu transport only.
Every node that should be a member of the membership should be specified as
a separate member directive. Within the member directive there is a parameter
memberaddr which specifies the ip address of one of the nodes.
.PP
.PP
Within the
.B totem
directive, there are seven configuration options of which one is required,
five are optional, and one is required when IPV6 is configured in the interface
subdirective. The required directive controls the version of the totem
configuration. The optional option unless using IPV6 directive controls
identification of the processor. The optional options control secrecy and
authentication, the redundant ring mode of operation, maximum network MTU,
and number of sending threads, and the nodeid field.
.TP
version
This specifies the version of the configuration file. Currently the only
valid version for this directive is 2.
.PP
.PP
.TP
nodeid
This configuration option is optional when using IPv4 and required when using
IPv6. This is a 32 bit value specifying the node identifier delivered to the
cluster membership service. If this is not specified with IPv4, the node id
will be determined from the 32 bit IP address the system to which the system
is bound with ring identifier of 0. The node identifier value of zero is
reserved and should not be used.
.TP
clear_node_high_bit
This configuration option is optional and is only relevant when no nodeid is
specified. Some openais clients require a signed 32 bit nodeid that is greater
than zero however by default openais uses all 32 bits of the IPv4 address space
when generating a nodeid. Set this option to yes to force the high bit to be
zero and therefor ensure the nodeid is a positive signed 32 bit integer.
WARNING: The clusters behavior is undefined if this option is enabled on only
a subset of the cluster (for example during a rolling upgrade).
.TP
secauth
This specifies that HMAC/SHA1 authentication should be used to authenticate
all messages. It further specifies that all data should be encrypted with the
sober128 encryption algorithm to protect data from eavesdropping.
Enabling this option adds a 36 byte header to every message sent by totem which
reduces total throughput. Encryption and authentication consume 75% of CPU
cycles in aisexec as measured with gprof when enabled.
For 100mbit networks with 1500 MTU frame transmissions:
A throughput of 9mb/sec is possible with 100% cpu utilization when this
option is enabled on 3ghz cpus.
A throughput of 10mb/sec is possible wth 20% cpu utilization when this
optin is disabled on 3ghz cpus.
For gig-e networks with large frame transmissions:
A throughput of 20mb/sec is possible when this option is enabled on
3ghz cpus.
A throughput of 60mb/sec is possible when this option is disabled on
3ghz cpus.
The default is on.
.TP
rrp_mode
This specifies the mode of redundant ring, which may be none, active, or
passive. Active replication offers slightly lower latency from transmit
to delivery in faulty network environments but with less performance.
Passive replication may nearly double the speed of the totem protocol
if the protocol doesn't become cpu bound. The final option is none, in
which case only one network interface will be used to operate the totem
protocol.
If only one interface directive is specified, none is automatically chosen.
If multiple interface directives are specified, only active or passive may
be chosen.
.TP
netmtu
This specifies the network maximum transmit unit. To set this value beyond
1500, the regular frame MTU, requires ethernet devices that support large, or
also called jumbo, frames. If any device in the network doesn't support large
frames, the protocol will not operate properly. The hosts must also have their
mtu size set from 1500 to whatever frame size is specified here.
Please note while some NICs or switches claim large frame support, they support
9000 MTU as the maximum frame size including the IP header. Setting the netmtu
and host MTUs to 9000 will cause totem to use the full 9000 bytes of the frame.
Then Linux will add a 18 byte header moving the full frame size to 9018. As a
result some hardware will not operate properly with this size of data. A netmtu
of 8982 seems to work for the few large frame devices that have been tested.
Some manufacturers claim large frame support when in fact they support frame
sizes of 4500 bytes.
Increasing the MTU from 1500 to 8982 doubles throughput performance from 30MB/sec
to 60MB/sec as measured with evsbench with 175000 byte messages with the secauth
directive set to off.
When sending multicast traffic, if the network frequently reconfigures, chances are
that some device in the network doesn't support large frames.
Choose hardware carefully if intending to use large frame support.
The default is 1500.
.TP
threads
This directive controls how many threads are used to encrypt and send multicast
messages. If secauth is off, the protocol will never use threaded sending.
If secauth is on, this directive allows systems to be configured to use
multiple threads to encrypt and send multicast messages.
A thread directive of 0 indicates that no threaded send should be used. This
mode offers best performance for non-SMP systems.
The default is 0.
.TP
vsftype
This directive controls the virtual synchrony filter type used to identify
a primary component. The preferred choice is YKD dynamic linear voting,
however, for clusters larger then 32 nodes YKD consumes alot of memory. For
large scale clusters that are created by changing the MAX_PROCESSORS_COUNT
#define in the C code totem.h file, the virtual synchrony filter "none" is
recommended but then AMF and DLCK services (which are currently experimental)
are not safe for use.
The default is ykd. The vsftype can also be set to none.
.TP
transport
This directive controls the transport mechanism used. If the interface to
which corosync is binding is an RDMA interface such as RoCEE or Infiniband, the
"iba" parameter may be specified. To avoid the use of multicast entirely, a
unicast transport parameter "udpu" can be specified. This requires specifying
the list of members that could potentially make up the membership before
deployment.
The default is udp. The transport type can also be set to udpu or iba.
Within the
.B totem
directive, there are several configuration options which are used to control
the operation of the protocol. It is generally not recommended to change any
of these values without proper guidance and sufficient testing. Some networks
may require larger values if suffering from frequent reconfigurations. Some
applications may require faster failure detection times which can be achieved
by reducing the token timeout.
.TP
token
This timeout specifies in milliseconds until a token loss is declared after not
receiving a token. This is the time spent detecting a failure of a processor
in the current configuration. Reforming a new configuration takes about 50
milliseconds in addition to this timeout.
The default is 1000 milliseconds.
.TP
token_retransmit
This timeout specifies in milliseconds after how long before receiving a token
the token is retransmitted. This will be automatically calculated if token
is modified. It is not recommended to alter this value without guidance from
the corosync community.
The default is 238 milliseconds.
.TP
hold
This timeout specifies in milliseconds how long the token should be held by
the representative when the protocol is under low utilization. It is not
recommended to alter this value without guidance from the corosync community.
The default is 180 milliseconds.
.TP
token_retransmits_before_loss_const
This value identifies how many token retransmits should be attempted before
forming a new configuration. If this value is set, retransmit and hold will
be automatically calculated from retransmits_before_loss and token.
The default is 4 retransmissions.
.TP
join
This timeout specifies in milliseconds how long to wait for join messages in
the membership protocol.
The default is 50 milliseconds.
.TP
send_join
This timeout specifies in milliseconds an upper range between 0 and send_join
to wait before sending a join message. For configurations with less then
32 nodes, this parameter is not necessary. For larger rings, this parameter
is necessary to ensure the NIC is not overflowed with join messages on
formation of a new ring. A reasonable value for large rings (128 nodes) would
be 80msec. Other timer values must also change if this value is changed. Seek
advice from the corosync mailing list if trying to run larger configurations.
The default is 0 milliseconds.
.TP
consensus
This timeout specifies in milliseconds how long to wait for consensus to be
achieved before starting a new round of membership configuration. The minimum
value for consensus must be 1.2 * token. This value will be automatically
calculated at 1.2 * token if the user doesn't specify a consensus value.
For two node clusters, a consensus larger then the join timeout but less then
token is safe. For three node or larger clusters, consensus should be larger
then token. There is an increasing risk of odd membership changes, which stil
guarantee virtual synchrony, as node count grows if consensus is less than
token.
The default is 1200 milliseconds.
.TP
merge
This timeout specifies in milliseconds how long to wait before checking for
a partition when no multicast traffic is being sent. If multicast traffic
is being sent, the merge detection happens automatically as a function of
the protocol.
The default is 200 milliseconds.
.TP
downcheck
This timeout specifies in milliseconds how long to wait before checking
that a network interface is back up after it has been downed.
The default is 1000 millseconds.
.TP
fail_recv_const
This constant specifies how many rotations of the token without receiving any
of the messages when messages should be received may occur before a new
configuration is formed.
The default is 50 failures to receive a message.
.TP
seqno_unchanged_const
This constant specifies how many rotations of the token without any multicast
traffic should occur before the merge detection timeout is started.
The default is 30 rotations.
.TP
heartbeat_failures_allowed
[HeartBeating mechanism]
Configures the optional HeartBeating mechanism for faster failure detection. Keep in
mind that engaging this mechanism in lossy networks could cause faulty loss declaration
as the mechanism relies on the network for heartbeating.
So as a rule of thumb use this mechanism if you require improved failure in low to
medium utilized networks.
This constant specifies the number of heartbeat failures the system should tolerate
before declaring heartbeat failure e.g 3. Also if this value is not set or is 0 then the
heartbeat mechanism is not engaged in the system and token rotation is the method
of failure detection
The default is 0 (disabled).
.TP
max_network_delay
[HeartBeating mechanism]
This constant specifies in milliseconds the approximate delay that your network takes
to transport one packet from one machine to another. This value is to be set by system
engineers and please dont change if not sure as this effects the failure detection
mechanism using heartbeat.
The default is 50 milliseconds.
.TP
window_size
This constant specifies the maximum number of messages that may be sent on one
token rotation. If all processors perform equally well, this value could be
large (300), which would introduce higher latency from origination to delivery
for very large rings. To reduce latency in large rings(16+), the defaults are
a safe compromise. If 1 or more slow processor(s) are present among fast
processors, window_size should be no larger then 256000 / netmtu to avoid
overflow of the kernel receive buffers. The user is notified of this by
the display of a retransmit list in the notification logs. There is no loss
of data, but performance is reduced when these errors occur.
The default is 50 messages.
.TP
max_messages
This constant specifies the maximum number of messages that may be sent by one
processor on receipt of the token. The max_messages parameter is limited to
256000 / netmtu to prevent overflow of the kernel transmit buffers.
The default is 17 messages.
+.TP
+miss_count_const
+This constant defines the maximum number of times on receipt of a token
+a message is checked for retransmission before a retransmission occurs. This
+parameter is useful to modify for switches that delay multicast packets
+compared to unicast packets. The default setting works well for nearly all
+modern switches.
+
+The default is 5 messages.
+
.TP
rrp_problem_count_timeout
This specifies the time in milliseconds to wait before decrementing the
problem count by 1 for a particular ring to ensure a link is not marked
faulty for transient network failures.
The default is 2000 milliseconds.
.TP
rrp_problem_count_threshold
This specifies the number of times a problem is detected with a link before
setting the link faulty. Once a link is set faulty, no more data is
transmitted upon it. Also, the problem counter is no longer decremented when
the problem count timeout expires.
A problem is detected whenever all tokens from the proceeding processor have
not been received within the rrp_token_expired_timeout. The
rrp_problem_count_threshold * rrp_token_expired_timeout should be atleast 50
milliseconds less then the token timeout, or a complete reconfiguration
may occur.
The default is 10 problem counts.
.TP
rrp_token_expired_timeout
This specifies the time in milliseconds to increment the problem counter for
the redundant ring protocol after not having received a token from all rings
for a particular processor.
This value will automatically be calculated from the token timeout and
problem_count_threshold but may be overridden. It is not recommended to
override this value without guidance from the corosync community.
The default is 47 milliseconds.
.PP
Within the
.B logging
directive, there are several configuration options which are all optional.
.PP
The following 3 options are valid only for the top level logging directive:
.TP
timestamp
This specifies that a timestamp is placed on all log messages.
The default is off.
.TP
fileline
This specifies that file and line should be printed.
The default is off.
.TP
function_name
This specifies that the code function name should be printed.
The default is off.
.PP
The following options are valid both for top level logging directive
and they can be overriden in logger_subsys entries.
.TP
to_stderr
.TP
to_logfile
.TP
to_syslog
These specify the destination of logging output. Any combination of
these options may be specified. Valid options are
.B yes
and
.B no.
The default is syslog and stderr.
Please note, if you are using to_logfile and want to rotate the file, use logrotate(8)
with the option
.B
copytruncate.
eg.
.IP
.RS
.ne 18
.nf
.ta 4n 30n 33n
/var/log/corosync.log {
missingok
compress
notifempty
daily
rotate 7
copytruncate
}
.ta
.fi
.RE
.IP
.PP
.TP
logfile
If the
.B to_logfile
directive is set to
.B yes
, this option specifies the pathname of the log file.
No default.
.TP
logfile_priority
This specifies the logfile priority for this particular subsystem. Ignored if debug is on.
Possible values are: alert, crit, debug (same as debug = on), emerg, err, info, notice, warning.
The default is: info.
.TP
syslog_facility
This specifies the syslog facility type that will be used for any messages
sent to syslog. options are daemon, local0, local1, local2, local3, local4,
local5, local6 & local7.
The default is daemon.
.TP
syslog_priority
This specifies the syslog level for this particular subsystem. Ignored if debug is on.
Possible values are: alert, crit, debug (same as debug = on), emerg, err, info, notice, warning.
The default is: info.
.TP
debug
This specifies whether debug output is logged for this particular logger.
The default is off.
.TP
tags
This specifies which tags should be traced for this particular logger.
Set debug directive to
.B on
in order to enable tracing using tags.
Values are specified using a vertical bar as a logical OR separator:
enter|leave|trace1|trace2|trace3|...
The default is none.
.PP
Within the
.B logging
directive, logger_subsys directives are optional.
.PP
Within the
.B logger_subsys
sub-directive, all of the above logging configuration options are valid and
can be used to override the default settings.
The subsys entry, described below, is mandatory to identify the subsystem.
.TP
subsys
This specifies the subsystem identity (name) for which logging is specified. This is the
name used by a service in the log_init () call. E.g. 'CKPT'. This directive is
required.
.SH "FILES"
.TP
/etc/corosync.conf
The corosync executive configuration file.
.SH "SEE ALSO"
.BR corosync_overview (8),
.BR logrotate (8)
.PP

File Metadata

Mime Type
text/x-diff
Expires
Mon, Dec 23, 11:37 AM (1 d, 2 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1128260
Default Alt Text
(191 KB)

Event Timeline