diff --git a/exec/totemconfig.c b/exec/totemconfig.c index 224ff316..8116a6fe 100644 --- a/exec/totemconfig.c +++ b/exec/totemconfig.c @@ -1,957 +1,968 @@ /* * Copyright (c) 2002-2005 MontaVista Software, Inc. * Copyright (c) 2006-2010 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LIBNSS #include #include #include #include #endif #include "util.h" #include "totemconfig.h" #define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST 4 #define TOKEN_TIMEOUT 1000 #define TOKEN_RETRANSMIT_TIMEOUT (int)(TOKEN_TIMEOUT / (TOKEN_RETRANSMITS_BEFORE_LOSS_CONST + 0.2)) #define TOKEN_HOLD_TIMEOUT (int)(TOKEN_RETRANSMIT_TIMEOUT * 0.8 - (1000/(int)HZ)) #define JOIN_TIMEOUT 50 #define MERGE_TIMEOUT 200 #define DOWNCHECK_TIMEOUT 1000 #define FAIL_TO_RECV_CONST 2500 #define SEQNO_UNCHANGED_CONST 30 #define MINIMUM_TIMEOUT (int)(1000/HZ)*3 #define MAX_NETWORK_DELAY 50 #define WINDOW_SIZE 50 #define MAX_MESSAGES 17 #define MISS_COUNT_CONST 5 #define RRP_PROBLEM_COUNT_TIMEOUT 2000 #define RRP_PROBLEM_COUNT_THRESHOLD_DEFAULT 10 #define RRP_PROBLEM_COUNT_THRESHOLD_MIN 5 #define RRP_AUTORECOVERY_CHECK_TIMEOUT 1000 static char error_string_response[512]; static struct objdb_iface_ver0 *global_objdb; static void add_totem_config_notification( struct objdb_iface_ver0 *objdb, struct totem_config *totem_config, hdb_handle_t totem_object_handle); /* These just makes the code below a little neater */ static inline int objdb_get_string ( const struct objdb_iface_ver0 *objdb, hdb_handle_t object_service_handle, const char *key, const char **value) { int res; *value = NULL; if ( !(res = objdb->object_key_get (object_service_handle, key, strlen (key), (void *)value, NULL))) { if (*value) { return 0; } } return -1; } static inline void objdb_get_int ( const struct objdb_iface_ver0 *objdb, hdb_handle_t object_service_handle, const char *key, unsigned int *intvalue) { char *value = NULL; if (!objdb->object_key_get (object_service_handle, key, strlen (key), (void *)&value, NULL)) { if (value) { *intvalue = atoi(value); } } } static unsigned int totem_handle_find ( struct objdb_iface_ver0 *objdb, hdb_handle_t *totem_find_handle) { hdb_handle_t object_find_handle; unsigned int res; /* * Find a network section */ objdb->object_find_create ( OBJECT_PARENT_HANDLE, "network", strlen ("network"), &object_find_handle); res = objdb->object_find_next ( object_find_handle, totem_find_handle); objdb->object_find_destroy (object_find_handle); /* * Network section not found in configuration, checking for totem */ if (res == -1) { objdb->object_find_create ( OBJECT_PARENT_HANDLE, "totem", strlen ("totem"), &object_find_handle); res = objdb->object_find_next ( object_find_handle, totem_find_handle); objdb->object_find_destroy (object_find_handle); } if (res == -1) { return (-1); } return (0); } static void totem_volatile_config_read ( struct objdb_iface_ver0 *objdb, struct totem_config *totem_config, hdb_handle_t object_totem_handle) { objdb_get_int (objdb,object_totem_handle, "token", &totem_config->token_timeout); objdb_get_int (objdb,object_totem_handle, "token_retransmit", &totem_config->token_retransmit_timeout); objdb_get_int (objdb,object_totem_handle, "hold", &totem_config->token_hold_timeout); objdb_get_int (objdb,object_totem_handle, "token_retransmits_before_loss_const", &totem_config->token_retransmits_before_loss_const); objdb_get_int (objdb,object_totem_handle, "join", &totem_config->join_timeout); objdb_get_int (objdb,object_totem_handle, "send_join", &totem_config->send_join_timeout); objdb_get_int (objdb,object_totem_handle, "consensus", &totem_config->consensus_timeout); objdb_get_int (objdb,object_totem_handle, "merge", &totem_config->merge_timeout); objdb_get_int (objdb,object_totem_handle, "downcheck", &totem_config->downcheck_timeout); objdb_get_int (objdb,object_totem_handle, "fail_recv_const", &totem_config->fail_to_recv_const); objdb_get_int (objdb,object_totem_handle, "seqno_unchanged_const", &totem_config->seqno_unchanged_const); objdb_get_int (objdb,object_totem_handle, "rrp_token_expired_timeout", &totem_config->rrp_token_expired_timeout); objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_timeout", &totem_config->rrp_problem_count_timeout); objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_threshold", &totem_config->rrp_problem_count_threshold); + objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_mcast_threshold", &totem_config->rrp_problem_count_mcast_threshold); + objdb_get_int (objdb,object_totem_handle, "rrp_autorecovery_check_timeout", &totem_config->rrp_autorecovery_check_timeout); objdb_get_int (objdb,object_totem_handle, "heartbeat_failures_allowed", &totem_config->heartbeat_failures_allowed); objdb_get_int (objdb,object_totem_handle, "max_network_delay", &totem_config->max_network_delay); objdb_get_int (objdb,object_totem_handle, "window_size", &totem_config->window_size); (void)objdb_get_string (objdb, object_totem_handle, "vsftype", &totem_config->vsf_type); objdb_get_int (objdb,object_totem_handle, "max_messages", &totem_config->max_messages); objdb_get_int (objdb,object_totem_handle, "miss_count_const", &totem_config->miss_count_const); } static void totem_get_crypto_type( const struct objdb_iface_ver0 *objdb, hdb_handle_t object_totem_handle, struct totem_config *totem_config) { const char *str; totem_config->crypto_accept = TOTEM_CRYPTO_ACCEPT_OLD; if (!objdb_get_string (objdb, object_totem_handle, "crypto_accept", &str)) { if (strcmp(str, "new") == 0) { totem_config->crypto_accept = TOTEM_CRYPTO_ACCEPT_NEW; } } totem_config->crypto_type = TOTEM_CRYPTO_SOBER; #ifdef HAVE_LIBNSS /* * We must set these even if the key does not exist. * Encryption type can be set on-the-fly using CFG */ totem_config->crypto_crypt_type = CKM_AES_CBC_PAD; totem_config->crypto_sign_type = CKM_SHA256_RSA_PKCS; #endif if (!objdb_get_string (objdb, object_totem_handle, "crypto_type", &str)) { if (strcmp(str, "sober") == 0) { return; } #ifdef HAVE_LIBNSS if (strcmp(str, "nss") == 0) { totem_config->crypto_type = TOTEM_CRYPTO_NSS; } #endif } } extern int totem_config_read ( struct objdb_iface_ver0 *objdb, struct totem_config *totem_config, const char **error_string) { int res = 0; hdb_handle_t object_totem_handle; hdb_handle_t object_interface_handle; hdb_handle_t object_member_handle; const char *str; unsigned int ringnumber = 0; hdb_handle_t object_find_interface_handle; hdb_handle_t object_find_member_handle; const char *transport_type; int member_count = 0; res = totem_handle_find (objdb, &object_totem_handle); if (res == -1) { printf ("couldn't find totem handle\n"); return (-1); } memset (totem_config, 0, sizeof (struct totem_config)); totem_config->interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); if (totem_config->interfaces == 0) { *error_string = "Out of memory trying to allocate ethernet interface storage area"; return -1; } memset (totem_config->interfaces, 0, sizeof (struct totem_interface) * INTERFACE_MAX); totem_config->secauth = 1; strcpy (totem_config->rrp_mode, "none"); if (!objdb_get_string (objdb, object_totem_handle, "version", &str)) { if (strcmp (str, "2") == 0) { totem_config->version = 2; } } if (!objdb_get_string (objdb, object_totem_handle, "secauth", &str)) { if (strcmp (str, "on") == 0) { totem_config->secauth = 1; } if (strcmp (str, "off") == 0) { totem_config->secauth = 0; } } if (totem_config->secauth == 1) { totem_get_crypto_type(objdb, object_totem_handle, totem_config); } if (!objdb_get_string (objdb, object_totem_handle, "rrp_mode", &str)) { strcpy (totem_config->rrp_mode, str); } /* * Get interface node id */ objdb_get_int (objdb, object_totem_handle, "nodeid", &totem_config->node_id); totem_config->clear_node_high_bit = 0; if (!objdb_get_string (objdb,object_totem_handle, "clear_node_high_bit", &str)) { if (strcmp (str, "yes") == 0) { totem_config->clear_node_high_bit = 1; } } objdb_get_int (objdb,object_totem_handle, "threads", &totem_config->threads); objdb_get_int (objdb,object_totem_handle, "netmtu", &totem_config->net_mtu); /* * Get things that might change in the future */ totem_volatile_config_read (objdb, totem_config, object_totem_handle); objdb->object_find_create ( object_totem_handle, "interface", strlen ("interface"), &object_find_interface_handle); while (objdb->object_find_next ( object_find_interface_handle, &object_interface_handle) == 0) { member_count = 0; objdb_get_int (objdb, object_interface_handle, "ringnumber", &ringnumber); /* * Get interface multicast address */ if (!objdb_get_string (objdb, object_interface_handle, "mcastaddr", &str)) { res = totemip_parse (&totem_config->interfaces[ringnumber].mcast_addr, str, 0); } totem_config->broadcast_use = 0; if (!objdb_get_string (objdb, object_interface_handle, "broadcast", &str)) { if (strcmp (str, "yes") == 0) { totem_config->broadcast_use = 1; totemip_parse ( &totem_config->interfaces[ringnumber].mcast_addr, "255.255.255.255", 0); } } /* * Get mcast port */ if (!objdb_get_string (objdb, object_interface_handle, "mcastport", &str)) { totem_config->interfaces[ringnumber].ip_port = atoi (str); } /* * Get the bind net address */ if (!objdb_get_string (objdb, object_interface_handle, "bindnetaddr", &str)) { res = totemip_parse (&totem_config->interfaces[ringnumber].bindnet, str, totem_config->interfaces[ringnumber].mcast_addr.family); } /* * Get the TTL */ totem_config->interfaces[ringnumber].ttl = 1; if (!objdb_get_string (objdb, object_interface_handle, "ttl", &str)) { totem_config->interfaces[ringnumber].ttl = atoi (str); } objdb->object_find_create ( object_interface_handle, "member", strlen ("member"), &object_find_member_handle); while (objdb->object_find_next ( object_find_member_handle, &object_member_handle) == 0) { if (!objdb_get_string (objdb, object_member_handle, "memberaddr", &str)) { res = totemip_parse (&totem_config->interfaces[ringnumber].member_list[member_count++], str, 0); } } totem_config->interfaces[ringnumber].member_count = member_count; totem_config->interface_count++; } objdb->object_find_destroy (object_find_interface_handle); add_totem_config_notification(objdb, totem_config, object_totem_handle); totem_config->transport_number = TOTEM_TRANSPORT_UDP; (void)objdb_get_string (objdb, object_totem_handle, "transport", &transport_type); if (transport_type) { if (strcmp (transport_type, "udpu") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_UDPU; } } if (transport_type) { if (strcmp (transport_type, "iba") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_RDMA; } } return 0; } int totem_config_validate ( struct totem_config *totem_config, const char **error_string) { static char local_error_reason[512]; char parse_error[512]; const char *error_reason = local_error_reason; int i; unsigned int interface_max = INTERFACE_MAX; if (totem_config->interface_count == 0) { error_reason = "No interfaces defined"; goto parse_error; } for (i = 0; i < totem_config->interface_count; i++) { /* * Some error checking of parsed data to make sure its valid */ struct totem_ip_address null_addr; memset (&null_addr, 0, sizeof (struct totem_ip_address)); if ((totem_config->transport_number == 0) && memcmp (&totem_config->interfaces[i].mcast_addr, &null_addr, sizeof (struct totem_ip_address)) == 0) { error_reason = "No multicast address specified"; goto parse_error; } if (totem_config->interfaces[i].ip_port == 0) { error_reason = "No multicast port specified"; goto parse_error; } if (totem_config->interfaces[i].ttl > 255) { error_reason = "Invalid TTL (should be 0..255)"; goto parse_error; } if (totem_config->transport_number != TOTEM_TRANSPORT_UDP && totem_config->interfaces[i].ttl != 1) { error_reason = "Can only set ttl on multicast transport types"; goto parse_error; } if (totem_config->interfaces[i].mcast_addr.family == AF_INET6 && totem_config->node_id == 0) { error_reason = "An IPV6 network requires that a node ID be specified."; goto parse_error; } if (totem_config->broadcast_use == 0 && totem_config->transport_number == 0) { if (totem_config->interfaces[i].mcast_addr.family != totem_config->interfaces[i].bindnet.family) { error_reason = "Multicast address family does not match bind address family"; goto parse_error; } if (totem_config->interfaces[i].mcast_addr.family != totem_config->interfaces[i].bindnet.family) { error_reason = "Not all bind address belong to the same IP family"; goto parse_error; } if (totemip_is_mcast (&totem_config->interfaces[i].mcast_addr) != 0) { error_reason = "mcastaddr is not a correct multicast address."; goto parse_error; } } } if (totem_config->version != 2) { error_reason = "This totem parser can only parse version 2 configurations."; goto parse_error; } if (totem_config->token_retransmits_before_loss_const == 0) { totem_config->token_retransmits_before_loss_const = TOKEN_RETRANSMITS_BEFORE_LOSS_CONST; } /* * Setup timeout values that are not setup by user */ if (totem_config->token_timeout == 0) { totem_config->token_timeout = TOKEN_TIMEOUT; if (totem_config->token_retransmits_before_loss_const == 0) { totem_config->token_retransmits_before_loss_const = TOKEN_RETRANSMITS_BEFORE_LOSS_CONST; } if (totem_config->token_retransmit_timeout == 0) { totem_config->token_retransmit_timeout = (int)(totem_config->token_timeout / (totem_config->token_retransmits_before_loss_const + 0.2)); } if (totem_config->token_hold_timeout == 0) { totem_config->token_hold_timeout = (int)(totem_config->token_retransmit_timeout * 0.8 - (1000/HZ)); } } if (totem_config->max_network_delay == 0) { totem_config->max_network_delay = MAX_NETWORK_DELAY; } if (totem_config->max_network_delay < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The max_network_delay parameter (%d ms) may not be less then (%d ms).", totem_config->max_network_delay, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->window_size == 0) { totem_config->window_size = WINDOW_SIZE; } if (totem_config->max_messages == 0) { totem_config->max_messages = MAX_MESSAGES; } if (totem_config->miss_count_const == 0) { totem_config->miss_count_const = MISS_COUNT_CONST; } if (totem_config->token_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The token timeout parameter (%d ms) may not be less then (%d ms).", totem_config->token_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_retransmit_timeout == 0) { totem_config->token_retransmit_timeout = (int)(totem_config->token_timeout / (totem_config->token_retransmits_before_loss_const + 0.2)); } if (totem_config->token_hold_timeout == 0) { totem_config->token_hold_timeout = (int)(totem_config->token_retransmit_timeout * 0.8 - (1000/HZ)); } if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The token retransmit timeout parameter (%d ms) may not be less then (%d ms).", totem_config->token_retransmit_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_hold_timeout == 0) { totem_config->token_hold_timeout = TOKEN_HOLD_TIMEOUT; } if (totem_config->token_hold_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The token hold timeout parameter (%d ms) may not be less then (%d ms).", totem_config->token_hold_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->join_timeout == 0) { totem_config->join_timeout = JOIN_TIMEOUT; } if (totem_config->join_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The join timeout parameter (%d ms) may not be less then (%d ms).", totem_config->join_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->consensus_timeout == 0) { totem_config->consensus_timeout = (int)(float)(1.2 * totem_config->token_timeout); } if (totem_config->consensus_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The consensus timeout parameter (%d ms) may not be less then (%d ms).", totem_config->consensus_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->merge_timeout == 0) { totem_config->merge_timeout = MERGE_TIMEOUT; } if (totem_config->merge_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The merge timeout parameter (%d ms) may not be less then (%d ms).", totem_config->merge_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->downcheck_timeout == 0) { totem_config->downcheck_timeout = DOWNCHECK_TIMEOUT; } if (totem_config->downcheck_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The downcheck timeout parameter (%d ms) may not be less then (%d ms).", totem_config->downcheck_timeout, MINIMUM_TIMEOUT); goto parse_error; } /* * RRP values validation */ if (strcmp (totem_config->rrp_mode, "none") && strcmp (totem_config->rrp_mode, "active") && strcmp (totem_config->rrp_mode, "passive")) { snprintf (local_error_reason, sizeof(local_error_reason), "The RRP mode \"%s\" specified is invalid. It must be none, active, or passive.\n", totem_config->rrp_mode); goto parse_error; } if (totem_config->rrp_problem_count_timeout == 0) { totem_config->rrp_problem_count_timeout = RRP_PROBLEM_COUNT_TIMEOUT; } if (totem_config->rrp_problem_count_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The RRP problem count timeout parameter (%d ms) may not be less then (%d ms).", totem_config->rrp_problem_count_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->rrp_problem_count_threshold == 0) { totem_config->rrp_problem_count_threshold = RRP_PROBLEM_COUNT_THRESHOLD_DEFAULT; } + if (totem_config->rrp_problem_count_mcast_threshold == 0) { + totem_config->rrp_problem_count_mcast_threshold = totem_config->rrp_problem_count_threshold * 10; + } if (totem_config->rrp_problem_count_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) { snprintf (local_error_reason, sizeof(local_error_reason), "The RRP problem count threshold (%d problem count) may not be less then (%d problem count).", totem_config->rrp_problem_count_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN); goto parse_error; } + if (totem_config->rrp_problem_count_mcast_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) { + snprintf (local_error_reason, sizeof(local_error_reason), + "The RRP multicast problem count threshold (%d problem count) may not be less then (%d problem count).", + totem_config->rrp_problem_count_mcast_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN); + goto parse_error; + } if (totem_config->rrp_token_expired_timeout == 0) { totem_config->rrp_token_expired_timeout = totem_config->token_retransmit_timeout; } if (totem_config->rrp_token_expired_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The RRP token expired timeout parameter (%d ms) may not be less then (%d ms).", totem_config->rrp_token_expired_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->rrp_autorecovery_check_timeout == 0) { totem_config->rrp_autorecovery_check_timeout = RRP_AUTORECOVERY_CHECK_TIMEOUT; } if (strcmp (totem_config->rrp_mode, "none") == 0) { interface_max = 1; } if (interface_max < totem_config->interface_count) { snprintf (parse_error, sizeof(parse_error), "%d is too many configured interfaces for the rrp_mode setting %s.", totem_config->interface_count, totem_config->rrp_mode); error_reason = parse_error; goto parse_error; } if (totem_config->fail_to_recv_const == 0) { totem_config->fail_to_recv_const = FAIL_TO_RECV_CONST; } if (totem_config->seqno_unchanged_const == 0) { totem_config->seqno_unchanged_const = SEQNO_UNCHANGED_CONST; } if (totem_config->net_mtu == 0) { totem_config->net_mtu = 1500; } if ((MESSAGE_QUEUE_MAX) < totem_config->max_messages) { snprintf (local_error_reason, sizeof(local_error_reason), "The max_messages parameter (%d messages) may not be greater then (%d messages).", totem_config->max_messages, MESSAGE_QUEUE_MAX); goto parse_error; } if (totem_config->threads > SEND_THREADS_MAX) { totem_config->threads = SEND_THREADS_MAX; } if (totem_config->secauth == 0) { totem_config->threads = 0; } if (totem_config->net_mtu > FRAME_SIZE_MAX) { error_reason = "This net_mtu parameter is greater then the maximum frame size"; goto parse_error; } if (totem_config->vsf_type == NULL) { totem_config->vsf_type = "none"; } return (0); parse_error: snprintf (error_string_response, sizeof(error_string_response), "parse error in config: %s\n", error_reason); *error_string = error_string_response; return (-1); } static int read_keyfile ( const char *key_location, struct totem_config *totem_config, const char **error_string) { int fd; int res; ssize_t expected_key_len = sizeof (totem_config->private_key); int saved_errno; char error_str[100]; const char *error_ptr; fd = open (key_location, O_RDONLY); if (fd == -1) { error_ptr = qb_strerror_r(errno, error_str, sizeof(error_str)); snprintf (error_string_response, sizeof(error_string_response), "Could not open %s: %s\n", key_location, error_ptr); goto parse_error; } res = read (fd, totem_config->private_key, expected_key_len); saved_errno = errno; close (fd); if (res == -1) { error_ptr = qb_strerror_r (saved_errno, error_str, sizeof(error_str)); snprintf (error_string_response, sizeof(error_string_response), "Could not read %s: %s\n", key_location, error_ptr); goto parse_error; } totem_config->private_key_len = expected_key_len; if (res != expected_key_len) { snprintf (error_string_response, sizeof(error_string_response), "Could only read %d bits of 1024 bits from %s.\n", res * 8, key_location); goto parse_error; } return 0; parse_error: *error_string = error_string_response; return (-1); } int totem_config_keyread ( struct objdb_iface_ver0 *objdb, struct totem_config *totem_config, const char **error_string) { int got_key = 0; const char *key_location = NULL; hdb_handle_t object_totem_handle; int res; memset (totem_config->private_key, 0, 128); totem_config->private_key_len = 128; if (totem_config->secauth == 0) { return (0); } res = totem_handle_find (objdb, &object_totem_handle); if (res == -1) { return (-1); } /* objdb may store the location of the key file */ if (!objdb_get_string (objdb,object_totem_handle, "keyfile", &key_location) && key_location) { res = read_keyfile(key_location, totem_config, error_string); if (res) { goto key_error; } got_key = 1; } else { /* Or the key itself may be in the objdb */ char *key = NULL; size_t key_len; res = objdb->object_key_get (object_totem_handle, "key", strlen ("key"), (void *)&key, &key_len); if (res == 0 && key) { if (key_len > sizeof (totem_config->private_key)) { goto key_error; } memcpy(totem_config->private_key, key, key_len); totem_config->private_key_len = key_len; got_key = 1; } } /* In desperation we read the default filename */ if (!got_key) { const char *filename = getenv("COROSYNC_TOTEM_AUTHKEY_FILE"); if (!filename) filename = COROSYSCONFDIR "/authkey"; res = read_keyfile(filename, totem_config, error_string); if (res) goto key_error; } return (0); key_error: *error_string = error_string_response; return (-1); } static void totem_key_change_notify(object_change_type_t change_type, hdb_handle_t parent_object_handle, hdb_handle_t object_handle, const void *object_name_pt, size_t object_name_len, const void *key_name_pt, size_t key_len, const void *key_value_pt, size_t key_value_len, void *priv_data_pt) { struct totem_config *totem_config = priv_data_pt; if (memcmp(object_name_pt, "totem", object_name_len) == 0) totem_volatile_config_read(global_objdb, totem_config, object_handle); // CHECK } static void totem_objdb_reload_notify(objdb_reload_notify_type_t type, int flush, void *priv_data_pt) { struct totem_config *totem_config = priv_data_pt; hdb_handle_t totem_object_handle; if (totem_config == NULL) return; /* * A new totem {} key might exist, cancel the * existing notification at the start of reload, * and start a new one on the new object when * it's all settled. */ if (type == OBJDB_RELOAD_NOTIFY_START) { global_objdb->object_track_stop( totem_key_change_notify, NULL, NULL, NULL, totem_config); } if (type == OBJDB_RELOAD_NOTIFY_END || type == OBJDB_RELOAD_NOTIFY_FAILED) { if (!totem_handle_find(global_objdb, &totem_object_handle)) { global_objdb->object_track_start(totem_object_handle, 1, totem_key_change_notify, NULL, // object_create_notify, NULL, // object_destroy_notify, NULL, // object_reload_notify totem_config); // priv_data /* * Reload the configuration */ totem_volatile_config_read(global_objdb, totem_config, totem_object_handle); } else { log_printf(LOGSYS_LEVEL_ERROR, "totem objdb tracking stopped, cannot find totem{} handle on objdb\n"); } } } static void add_totem_config_notification( struct objdb_iface_ver0 *objdb, struct totem_config *totem_config, hdb_handle_t totem_object_handle) { global_objdb = objdb; objdb->object_track_start(totem_object_handle, 1, totem_key_change_notify, NULL, // object_create_notify, NULL, // object_destroy_notify, NULL, // object_reload_notify totem_config); // priv_data /* * Reload notify must be on the parent object */ objdb->object_track_start(OBJECT_PARENT_HANDLE, 1, NULL, // key_change_notify, NULL, // object_create_notify, NULL, // object_destroy_notify, totem_objdb_reload_notify, // object_reload_notify totem_config); // priv_data } diff --git a/exec/totemrrp.c b/exec/totemrrp.c index 92399ecb..41f2604c 100644 --- a/exec/totemrrp.c +++ b/exec/totemrrp.c @@ -1,2106 +1,2108 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemnet.h" #include "totemrrp.h" void rrp_deliver_fn ( void *context, const void *msg, unsigned int msg_len); void rrp_iface_change_fn ( void *context, const struct totem_ip_address *iface_addr); struct totemrrp_instance; struct passive_instance { struct totemrrp_instance *rrp_instance; unsigned int *faulty; unsigned int *token_recv_count; unsigned int *mcast_recv_count; unsigned char token[15000]; unsigned int token_len; qb_loop_timer_handle timer_expired_token; qb_loop_timer_handle timer_problem_decrementer; void *totemrrp_context; unsigned int token_xmit_iface; unsigned int msg_xmit_iface; }; struct active_instance { struct totemrrp_instance *rrp_instance; unsigned int *faulty; unsigned int *last_token_recv; unsigned int *counter_problems; unsigned char token[15000]; unsigned int token_len; unsigned int last_token_seq; qb_loop_timer_handle timer_expired_token; qb_loop_timer_handle timer_problem_decrementer; void *totemrrp_context; }; struct rrp_algo { const char *name; void * (*initialize) ( struct totemrrp_instance *rrp_instance, int interface_count); void (*mcast_recv) ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len); void (*mcast_noflush_send) ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); void (*mcast_flush_send) ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); void (*token_recv) ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len, unsigned int token_seqid); void (*token_send) ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); void (*recv_flush) ( struct totemrrp_instance *instance); void (*send_flush) ( struct totemrrp_instance *instance); void (*iface_check) ( struct totemrrp_instance *instance); void (*processor_count_set) ( struct totemrrp_instance *instance, unsigned int processor_count); void (*token_target_set) ( struct totemrrp_instance *instance, struct totem_ip_address *token_target, unsigned int iface_no); void (*ring_reenable) ( struct totemrrp_instance *instance, unsigned int iface_no); int (*mcast_recv_empty) ( struct totemrrp_instance *instance); int (*member_add) ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no); int (*member_remove) ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no); }; struct totemrrp_instance { qb_loop_t *poll_handle; struct totem_interface *interfaces; struct rrp_algo *rrp_algo; void *context; char *status[INTERFACE_MAX]; void (*totemrrp_deliver_fn) ( void *context, const void *msg, unsigned int msg_len); void (*totemrrp_iface_change_fn) ( void *context, const struct totem_ip_address *iface_addr, unsigned int iface_no); void (*totemrrp_token_seqid_get) ( const void *msg, unsigned int *seqid, unsigned int *token_is); void (*totemrrp_target_set_completed) ( void *context); unsigned int (*totemrrp_msgs_missing) (void); /* * Function and data used to log messages */ int totemrrp_log_level_security; int totemrrp_log_level_error; int totemrrp_log_level_warning; int totemrrp_log_level_notice; int totemrrp_log_level_debug; int totemrrp_subsys_id; void (*totemrrp_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7))); void **net_handles; void *rrp_algo_instance; int interface_count; int processor_count; int my_nodeid; struct totem_config *totem_config; void *deliver_fn_context[INTERFACE_MAX]; qb_loop_timer_handle timer_active_test_ring_timeout[INTERFACE_MAX]; }; /* * None Replication Forward Declerations */ static void none_mcast_recv ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len); static void none_mcast_noflush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void none_mcast_flush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void none_token_recv ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len, unsigned int token_seqid); static void none_token_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void none_recv_flush ( struct totemrrp_instance *instance); static void none_send_flush ( struct totemrrp_instance *instance); static void none_iface_check ( struct totemrrp_instance *instance); static void none_processor_count_set ( struct totemrrp_instance *instance, unsigned int processor_count_set); static void none_token_target_set ( struct totemrrp_instance *instance, struct totem_ip_address *token_target, unsigned int iface_no); static void none_ring_reenable ( struct totemrrp_instance *instance, unsigned int iface_no); static int none_mcast_recv_empty ( struct totemrrp_instance *instance); static int none_member_add ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no); static int none_member_remove ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no); /* * Passive Replication Forward Declerations */ static void *passive_instance_initialize ( struct totemrrp_instance *rrp_instance, int interface_count); static void passive_mcast_recv ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len); static void passive_mcast_noflush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void passive_mcast_flush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void passive_monitor ( struct totemrrp_instance *rrp_instance, unsigned int iface_no, int is_token_recv_count); static void passive_token_recv ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len, unsigned int token_seqid); static void passive_token_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void passive_recv_flush ( struct totemrrp_instance *instance); static void passive_send_flush ( struct totemrrp_instance *instance); static void passive_iface_check ( struct totemrrp_instance *instance); static void passive_processor_count_set ( struct totemrrp_instance *instance, unsigned int processor_count_set); static void passive_token_target_set ( struct totemrrp_instance *instance, struct totem_ip_address *token_target, unsigned int iface_no); static void passive_ring_reenable ( struct totemrrp_instance *instance, unsigned int iface_no); static int passive_mcast_recv_empty ( struct totemrrp_instance *instance); static int passive_member_add ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no); static int passive_member_remove ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no); /* * Active Replication Forward Definitions */ static void *active_instance_initialize ( struct totemrrp_instance *rrp_instance, int interface_count); static void active_mcast_recv ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len); static void active_mcast_noflush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void active_mcast_flush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void active_token_recv ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len, unsigned int token_seqid); static void active_token_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len); static void active_recv_flush ( struct totemrrp_instance *instance); static void active_send_flush ( struct totemrrp_instance *instance); static void active_iface_check ( struct totemrrp_instance *instance); static void active_processor_count_set ( struct totemrrp_instance *instance, unsigned int processor_count_set); static void active_token_target_set ( struct totemrrp_instance *instance, struct totem_ip_address *token_target, unsigned int iface_no); static void active_ring_reenable ( struct totemrrp_instance *instance, unsigned int iface_no); static int active_mcast_recv_empty ( struct totemrrp_instance *instance); static int active_member_add ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no); static int active_member_remove ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no); static void active_timer_expired_token_start ( struct active_instance *active_instance); static void active_timer_expired_token_cancel ( struct active_instance *active_instance); static void active_timer_problem_decrementer_start ( struct active_instance *active_instance); static void active_timer_problem_decrementer_cancel ( struct active_instance *active_instance); /* * 0-5 reserved for totemsrp.c */ #define MESSAGE_TYPE_RING_TEST_ACTIVE 6 #define MESSAGE_TYPE_RING_TEST_ACTIVATE 7 #define ENDIAN_LOCAL 0xff22 /* * Rollover handling: * * ARR_SEQNO_START_TOKEN is the starting sequence number of last seen sequence * for a token for active redundand ring. This should remain zero, unless testing * overflow in which case 07fffff00 or 0xffffff00 are good starting values. * It should be same as on defined in totemsrp.c */ #define ARR_SEQNO_START_TOKEN 0x0 /* * These can be used ot test different rollover points * #define ARR_SEQNO_START_MSG 0xfffffe00 */ /* * Threshold value when recv_count for passive rrp should be adjusted. * Set this value to some smaller for testing of adjusting proper * functionality. Also keep in mind that this value must be smaller * then rrp_problem_count_threshold */ #define PASSIVE_RECV_COUNT_THRESHOLD (INT_MAX / 2) struct message_header { char type; char encapsulated; unsigned short endian_detector; int ring_number; int nodeid_activator; } __attribute__((packed)); struct deliver_fn_context { struct totemrrp_instance *instance; void *context; int iface_no; }; struct rrp_algo none_algo = { .name = "none", .initialize = NULL, .mcast_recv = none_mcast_recv, .mcast_noflush_send = none_mcast_noflush_send, .mcast_flush_send = none_mcast_flush_send, .token_recv = none_token_recv, .token_send = none_token_send, .recv_flush = none_recv_flush, .send_flush = none_send_flush, .iface_check = none_iface_check, .processor_count_set = none_processor_count_set, .token_target_set = none_token_target_set, .ring_reenable = none_ring_reenable, .mcast_recv_empty = none_mcast_recv_empty, .member_add = none_member_add, .member_remove = none_member_remove }; struct rrp_algo passive_algo = { .name = "passive", .initialize = passive_instance_initialize, .mcast_recv = passive_mcast_recv, .mcast_noflush_send = passive_mcast_noflush_send, .mcast_flush_send = passive_mcast_flush_send, .token_recv = passive_token_recv, .token_send = passive_token_send, .recv_flush = passive_recv_flush, .send_flush = passive_send_flush, .iface_check = passive_iface_check, .processor_count_set = passive_processor_count_set, .token_target_set = passive_token_target_set, .ring_reenable = passive_ring_reenable, .mcast_recv_empty = passive_mcast_recv_empty, .member_add = passive_member_add, .member_remove = passive_member_remove }; struct rrp_algo active_algo = { .name = "active", .initialize = active_instance_initialize, .mcast_recv = active_mcast_recv, .mcast_noflush_send = active_mcast_noflush_send, .mcast_flush_send = active_mcast_flush_send, .token_recv = active_token_recv, .token_send = active_token_send, .recv_flush = active_recv_flush, .send_flush = active_send_flush, .iface_check = active_iface_check, .processor_count_set = active_processor_count_set, .token_target_set = active_token_target_set, .ring_reenable = active_ring_reenable, .mcast_recv_empty = active_mcast_recv_empty, .member_add = active_member_add, .member_remove = active_member_remove }; struct rrp_algo *rrp_algos[] = { &none_algo, &passive_algo, &active_algo }; #define RRP_ALGOS_COUNT 3 #define log_printf(level, format, args...) \ do { \ rrp_instance->totemrrp_log_printf ( \ level, rrp_instance->totemrrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ format, ##args); \ } while (0); static void test_active_msg_endian_convert(const struct message_header *in, struct message_header *out) { out->type = in->type; out->encapsulated = in->encapsulated; out->endian_detector = ENDIAN_LOCAL; out->ring_number = swab32 (in->ring_number); out->nodeid_activator = swab32(in->nodeid_activator); } static void timer_function_test_ring_timeout (void *context) { struct deliver_fn_context *deliver_fn_context = (struct deliver_fn_context *)context; struct totemrrp_instance *rrp_instance = deliver_fn_context->instance; unsigned int *faulty = NULL; int iface_no = deliver_fn_context->iface_no; struct message_header msg = { .type = MESSAGE_TYPE_RING_TEST_ACTIVE, .endian_detector = ENDIAN_LOCAL, }; if (strcmp(rrp_instance->totem_config->rrp_mode, "active") == 0) faulty = ((struct active_instance *)(rrp_instance->rrp_algo_instance))->faulty; if (strcmp(rrp_instance->totem_config->rrp_mode, "passive") == 0) faulty = ((struct passive_instance *)(rrp_instance->rrp_algo_instance))->faulty; assert (faulty != NULL); if (faulty[iface_no] == 1) { msg.ring_number = iface_no; msg.nodeid_activator = rrp_instance->my_nodeid; totemnet_token_send ( rrp_instance->net_handles[iface_no], &msg, sizeof (struct message_header)); qb_loop_timer_add (rrp_instance->poll_handle, QB_LOOP_MED, rrp_instance->totem_config->rrp_autorecovery_check_timeout*QB_TIME_NS_IN_MSEC, (void *)deliver_fn_context, timer_function_test_ring_timeout, &rrp_instance->timer_active_test_ring_timeout[iface_no]); } } /* * None Replication Implementation */ static void none_mcast_recv ( struct totemrrp_instance *rrp_instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len) { rrp_instance->totemrrp_deliver_fn ( context, msg, msg_len); } static void none_mcast_flush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { totemnet_mcast_flush_send (instance->net_handles[0], msg, msg_len); } static void none_mcast_noflush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { totemnet_mcast_noflush_send (instance->net_handles[0], msg, msg_len); } static void none_token_recv ( struct totemrrp_instance *rrp_instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len, unsigned int token_seq) { rrp_instance->totemrrp_deliver_fn ( context, msg, msg_len); } static void none_token_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { totemnet_token_send ( instance->net_handles[0], msg, msg_len); } static void none_recv_flush (struct totemrrp_instance *instance) { totemnet_recv_flush (instance->net_handles[0]); } static void none_send_flush (struct totemrrp_instance *instance) { totemnet_send_flush (instance->net_handles[0]); } static void none_iface_check (struct totemrrp_instance *instance) { totemnet_iface_check (instance->net_handles[0]); } static void none_processor_count_set ( struct totemrrp_instance *instance, unsigned int processor_count) { totemnet_processor_count_set (instance->net_handles[0], processor_count); } static void none_token_target_set ( struct totemrrp_instance *instance, struct totem_ip_address *token_target, unsigned int iface_no) { totemnet_token_target_set (instance->net_handles[0], token_target); } static void none_ring_reenable ( struct totemrrp_instance *instance, unsigned int iface_no) { /* * No operation */ } static int none_mcast_recv_empty ( struct totemrrp_instance *instance) { int res; res = totemnet_recv_mcast_empty (instance->net_handles[0]); return (res); } static int none_member_add ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no) { int res; res = totemnet_member_add (instance->net_handles[0], member); return (res); } static int none_member_remove ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no) { int res; res = totemnet_member_remove (instance->net_handles[0], member); return (res); } /* * Passive Replication Implementation */ void *passive_instance_initialize ( struct totemrrp_instance *rrp_instance, int interface_count) { struct passive_instance *instance; instance = malloc (sizeof (struct passive_instance)); if (instance == 0) { goto error_exit; } memset (instance, 0, sizeof (struct passive_instance)); instance->faulty = malloc (sizeof (int) * interface_count); if (instance->faulty == 0) { free (instance); instance = 0; goto error_exit; } memset (instance->faulty, 0, sizeof (int) * interface_count); instance->token_recv_count = malloc (sizeof (int) * interface_count); if (instance->token_recv_count == 0) { free (instance->faulty); free (instance); instance = 0; goto error_exit; } memset (instance->token_recv_count, 0, sizeof (int) * interface_count); instance->mcast_recv_count = malloc (sizeof (int) * interface_count); if (instance->mcast_recv_count == 0) { free (instance->token_recv_count); free (instance->faulty); free (instance); instance = 0; goto error_exit; } memset (instance->mcast_recv_count, 0, sizeof (int) * interface_count); error_exit: return ((void *)instance); } static void timer_function_passive_token_expired (void *context) { struct passive_instance *passive_instance = (struct passive_instance *)context; struct totemrrp_instance *rrp_instance = passive_instance->rrp_instance; rrp_instance->totemrrp_deliver_fn ( passive_instance->totemrrp_context, passive_instance->token, passive_instance->token_len); } /* TODO static void timer_function_passive_problem_decrementer (void *context) { // struct passive_instance *passive_instance = (struct passive_instance *)context; // struct totemrrp_instance *rrp_instance = passive_instance->rrp_instance; } */ static void passive_timer_expired_token_start ( struct passive_instance *passive_instance) { qb_loop_timer_add ( passive_instance->rrp_instance->poll_handle, QB_LOOP_MED, passive_instance->rrp_instance->totem_config->rrp_token_expired_timeout*QB_TIME_NS_IN_MSEC, (void *)passive_instance, timer_function_passive_token_expired, &passive_instance->timer_expired_token); } static void passive_timer_expired_token_cancel ( struct passive_instance *passive_instance) { qb_loop_timer_del ( passive_instance->rrp_instance->poll_handle, passive_instance->timer_expired_token); } /* static void passive_timer_problem_decrementer_start ( struct passive_instance *passive_instance) { qb_loop_timer_add ( QB_LOOP_MED, passive_instance->rrp_instance->poll_handle, passive_instance->rrp_instance->totem_config->rrp_problem_count_timeout*QB_TIME_NS_IN_MSEC, (void *)passive_instance, timer_function_passive_problem_decrementer, &passive_instance->timer_problem_decrementer); } static void passive_timer_problem_decrementer_cancel ( struct passive_instance *passive_instance) { qb_loop_timer_del ( passive_instance->rrp_instance->poll_handle, passive_instance->timer_problem_decrementer); } */ /* * Monitor function implementation from rrp paper. * rrp_instance is passive rrp instance, iface_no is interface with received messgae/token and * is_token_recv_count is boolean variable which donates if message is token (>1) or regular * message (= 0) */ static void passive_monitor ( struct totemrrp_instance *rrp_instance, unsigned int iface_no, int is_token_recv_count) { struct passive_instance *passive_instance = (struct passive_instance *)rrp_instance->rrp_algo_instance; unsigned int *recv_count; unsigned int max; unsigned int i; unsigned int min_all, min_active; + unsigned int threshold; /* * Monitor for failures */ if (is_token_recv_count) { recv_count = passive_instance->token_recv_count; + threshold = rrp_instance->totem_config->rrp_problem_count_threshold; } else { recv_count = passive_instance->mcast_recv_count; + threshold = rrp_instance->totem_config->rrp_problem_count_mcast_threshold; } recv_count[iface_no] += 1; max = 0; for (i = 0; i < rrp_instance->interface_count; i++) { if (max < recv_count[i]) { max = recv_count[i]; } } /* * Max is larger then threshold -> start adjusting process */ if (max > PASSIVE_RECV_COUNT_THRESHOLD) { min_all = min_active = recv_count[iface_no]; for (i = 0; i < rrp_instance->interface_count; i++) { if (recv_count[i] < min_all) { min_all = recv_count[i]; } if (passive_instance->faulty[i] == 0 && recv_count[i] < min_active) { min_active = recv_count[i]; } } if (min_all > 0) { /* * There is one or more faulty device with recv_count > 0 */ for (i = 0; i < rrp_instance->interface_count; i++) { recv_count[i] -= min_all; } } else { /* * No faulty device with recv_count > 0, adjust only active * devices */ for (i = 0; i < rrp_instance->interface_count; i++) { if (passive_instance->faulty[i] == 0) { recv_count[i] -= min_active; } } } /* * Find again max */ max = 0; for (i = 0; i < rrp_instance->interface_count; i++) { if (max < recv_count[i]) { max = recv_count[i]; } } } for (i = 0; i < rrp_instance->interface_count; i++) { if ((passive_instance->faulty[i] == 0) && - (max - recv_count[i] > - rrp_instance->totem_config->rrp_problem_count_threshold)) { + (max - recv_count[i] > threshold)) { passive_instance->faulty[i] = 1; qb_loop_timer_add (rrp_instance->poll_handle, QB_LOOP_MED, rrp_instance->totem_config->rrp_autorecovery_check_timeout*QB_TIME_NS_IN_MSEC, rrp_instance->deliver_fn_context[i], timer_function_test_ring_timeout, &rrp_instance->timer_active_test_ring_timeout[i]); sprintf (rrp_instance->status[i], "Marking ringid %u interface %s FAULTY", i, totemnet_iface_print (rrp_instance->net_handles[i])); log_printf ( rrp_instance->totemrrp_log_level_error, "%s", rrp_instance->status[i]); } } } static void passive_mcast_recv ( struct totemrrp_instance *rrp_instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len) { struct passive_instance *passive_instance = (struct passive_instance *)rrp_instance->rrp_algo_instance; rrp_instance->totemrrp_deliver_fn ( context, msg, msg_len); if (rrp_instance->totemrrp_msgs_missing() == 0 && passive_instance->timer_expired_token) { /* * Delivers the last token */ rrp_instance->totemrrp_deliver_fn ( passive_instance->totemrrp_context, passive_instance->token, passive_instance->token_len); passive_timer_expired_token_cancel (passive_instance); } passive_monitor (rrp_instance, iface_no, 0); } static void passive_mcast_flush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { struct passive_instance *passive_instance = (struct passive_instance *)instance->rrp_algo_instance; int i = 0; do { passive_instance->msg_xmit_iface = (passive_instance->msg_xmit_iface + 1) % instance->interface_count; i++; } while ((i <= instance->interface_count) && (passive_instance->faulty[passive_instance->msg_xmit_iface] == 1)); if (i <= instance->interface_count) { totemnet_mcast_flush_send (instance->net_handles[passive_instance->msg_xmit_iface], msg, msg_len); } } static void passive_mcast_noflush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { struct passive_instance *passive_instance = (struct passive_instance *)instance->rrp_algo_instance; int i = 0; do { passive_instance->msg_xmit_iface = (passive_instance->msg_xmit_iface + 1) % instance->interface_count; i++; } while ((i <= instance->interface_count) && (passive_instance->faulty[passive_instance->msg_xmit_iface] == 1)); if (i <= instance->interface_count) { totemnet_mcast_noflush_send (instance->net_handles[passive_instance->msg_xmit_iface], msg, msg_len); } } static void passive_token_recv ( struct totemrrp_instance *rrp_instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len, unsigned int token_seq) { struct passive_instance *passive_instance = (struct passive_instance *)rrp_instance->rrp_algo_instance; passive_instance->totemrrp_context = context; // this should be in totemrrp_instance ? TODO if (rrp_instance->totemrrp_msgs_missing() == 0) { rrp_instance->totemrrp_deliver_fn ( context, msg, msg_len); } else { memcpy (passive_instance->token, msg, msg_len); passive_timer_expired_token_start (passive_instance); } passive_monitor (rrp_instance, iface_no, 1); } static void passive_token_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { struct passive_instance *passive_instance = (struct passive_instance *)instance->rrp_algo_instance; int i = 0; do { passive_instance->token_xmit_iface = (passive_instance->token_xmit_iface + 1) % instance->interface_count; i++; } while ((i <= instance->interface_count) && (passive_instance->faulty[passive_instance->token_xmit_iface] == 1)); if (i <= instance->interface_count) { totemnet_token_send ( instance->net_handles[passive_instance->token_xmit_iface], msg, msg_len); } } static void passive_recv_flush (struct totemrrp_instance *instance) { struct passive_instance *rrp_algo_instance = (struct passive_instance *)instance->rrp_algo_instance; unsigned int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_recv_flush (instance->net_handles[i]); } } } static void passive_send_flush (struct totemrrp_instance *instance) { struct passive_instance *rrp_algo_instance = (struct passive_instance *)instance->rrp_algo_instance; unsigned int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_send_flush (instance->net_handles[i]); } } } static void passive_iface_check (struct totemrrp_instance *instance) { struct passive_instance *rrp_algo_instance = (struct passive_instance *)instance->rrp_algo_instance; unsigned int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_iface_check (instance->net_handles[i]); } } } static void passive_processor_count_set ( struct totemrrp_instance *instance, unsigned int processor_count) { struct passive_instance *rrp_algo_instance = (struct passive_instance *)instance->rrp_algo_instance; unsigned int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_processor_count_set (instance->net_handles[i], processor_count); } } } static void passive_token_target_set ( struct totemrrp_instance *instance, struct totem_ip_address *token_target, unsigned int iface_no) { totemnet_token_target_set (instance->net_handles[iface_no], token_target); } static int passive_mcast_recv_empty ( struct totemrrp_instance *instance) { int res; int msgs_emptied = 0; int i; for (i = 0; i < instance->interface_count; i++) { res = totemnet_recv_mcast_empty (instance->net_handles[i]); if (res == -1) { return (-1); } if (res == 1) { msgs_emptied = 1; } } return (msgs_emptied); } static int passive_member_add ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no) { int res; res = totemnet_member_add (instance->net_handles[iface_no], member); return (res); } static int passive_member_remove ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no) { int res; res = totemnet_member_remove (instance->net_handles[iface_no], member); return (res); } static void passive_ring_reenable ( struct totemrrp_instance *instance, unsigned int iface_no) { struct passive_instance *rrp_algo_instance = (struct passive_instance *)instance->rrp_algo_instance; memset (rrp_algo_instance->mcast_recv_count, 0, sizeof (unsigned int) * instance->interface_count); memset (rrp_algo_instance->token_recv_count, 0, sizeof (unsigned int) * instance->interface_count); if (iface_no == instance->interface_count) { memset (rrp_algo_instance->faulty, 0, sizeof (unsigned int) * instance->interface_count); } else { rrp_algo_instance->faulty[iface_no] = 0; } } /* * Active Replication Implementation */ void *active_instance_initialize ( struct totemrrp_instance *rrp_instance, int interface_count) { struct active_instance *instance; instance = malloc (sizeof (struct active_instance)); if (instance == 0) { goto error_exit; } memset (instance, 0, sizeof (struct active_instance)); instance->faulty = malloc (sizeof (int) * interface_count); if (instance->faulty == 0) { free (instance); instance = 0; goto error_exit; } memset (instance->faulty, 0, sizeof (unsigned int) * interface_count); instance->last_token_recv = malloc (sizeof (int) * interface_count); if (instance->last_token_recv == 0) { free (instance->faulty); free (instance); instance = 0; goto error_exit; } memset (instance->last_token_recv, 0, sizeof (unsigned int) * interface_count); instance->counter_problems = malloc (sizeof (int) * interface_count); if (instance->counter_problems == 0) { free (instance->last_token_recv); free (instance->faulty); free (instance); instance = 0; goto error_exit; } memset (instance->counter_problems, 0, sizeof (unsigned int) * interface_count); instance->timer_expired_token = 0; instance->timer_problem_decrementer = 0; instance->rrp_instance = rrp_instance; instance->last_token_seq = ARR_SEQNO_START_TOKEN - 1; error_exit: return ((void *)instance); } static void timer_function_active_problem_decrementer (void *context) { struct active_instance *active_instance = (struct active_instance *)context; struct totemrrp_instance *rrp_instance = active_instance->rrp_instance; unsigned int problem_found = 0; unsigned int i; for (i = 0; i < rrp_instance->interface_count; i++) { if (active_instance->counter_problems[i] > 0) { problem_found = 1; active_instance->counter_problems[i] -= 1; if (active_instance->counter_problems[i] == 0) { sprintf (rrp_instance->status[i], "ring %d active with no faults", i); } else { sprintf (rrp_instance->status[i], "Decrementing problem counter for iface %s to [%d of %d]", totemnet_iface_print (rrp_instance->net_handles[i]), active_instance->counter_problems[i], rrp_instance->totem_config->rrp_problem_count_threshold); } log_printf ( rrp_instance->totemrrp_log_level_warning, "%s", rrp_instance->status[i]); } } if (problem_found) { active_timer_problem_decrementer_start (active_instance); } else { active_instance->timer_problem_decrementer = 0; } } static void timer_function_active_token_expired (void *context) { struct active_instance *active_instance = (struct active_instance *)context; struct totemrrp_instance *rrp_instance = active_instance->rrp_instance; unsigned int i; for (i = 0; i < rrp_instance->interface_count; i++) { if (active_instance->last_token_recv[i] == 0) { active_instance->counter_problems[i] += 1; if (active_instance->timer_problem_decrementer == 0) { active_timer_problem_decrementer_start (active_instance); } sprintf (rrp_instance->status[i], "Incrementing problem counter for seqid %d iface %s to [%d of %d]", active_instance->last_token_seq, totemnet_iface_print (rrp_instance->net_handles[i]), active_instance->counter_problems[i], rrp_instance->totem_config->rrp_problem_count_threshold); log_printf ( rrp_instance->totemrrp_log_level_warning, "%s", rrp_instance->status[i]); } } for (i = 0; i < rrp_instance->interface_count; i++) { if (active_instance->counter_problems[i] >= rrp_instance->totem_config->rrp_problem_count_threshold) { active_instance->faulty[i] = 1; qb_loop_timer_add (rrp_instance->poll_handle, QB_LOOP_MED, rrp_instance->totem_config->rrp_autorecovery_check_timeout*QB_TIME_NS_IN_MSEC, rrp_instance->deliver_fn_context[i], timer_function_test_ring_timeout, &rrp_instance->timer_active_test_ring_timeout[i]); sprintf (rrp_instance->status[i], "Marking seqid %d ringid %u interface %s FAULTY", active_instance->last_token_seq, i, totemnet_iface_print (rrp_instance->net_handles[i])); log_printf ( rrp_instance->totemrrp_log_level_error, "%s", rrp_instance->status[i]); active_timer_problem_decrementer_cancel (active_instance); } } rrp_instance->totemrrp_deliver_fn ( active_instance->totemrrp_context, active_instance->token, active_instance->token_len); } static void active_timer_expired_token_start ( struct active_instance *active_instance) { qb_loop_timer_add ( active_instance->rrp_instance->poll_handle, QB_LOOP_MED, active_instance->rrp_instance->totem_config->rrp_token_expired_timeout*QB_TIME_NS_IN_MSEC, (void *)active_instance, timer_function_active_token_expired, &active_instance->timer_expired_token); } static void active_timer_expired_token_cancel ( struct active_instance *active_instance) { qb_loop_timer_del ( active_instance->rrp_instance->poll_handle, active_instance->timer_expired_token); } static void active_timer_problem_decrementer_start ( struct active_instance *active_instance) { qb_loop_timer_add ( active_instance->rrp_instance->poll_handle, QB_LOOP_MED, active_instance->rrp_instance->totem_config->rrp_problem_count_timeout*QB_TIME_NS_IN_MSEC, (void *)active_instance, timer_function_active_problem_decrementer, &active_instance->timer_problem_decrementer); } static void active_timer_problem_decrementer_cancel ( struct active_instance *active_instance) { qb_loop_timer_del ( active_instance->rrp_instance->poll_handle, active_instance->timer_problem_decrementer); } /* * active replication */ static void active_mcast_recv ( struct totemrrp_instance *instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len) { instance->totemrrp_deliver_fn ( context, msg, msg_len); } static void active_mcast_flush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { int i; struct active_instance *rrp_algo_instance = (struct active_instance *)instance->rrp_algo_instance; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_mcast_flush_send (instance->net_handles[i], msg, msg_len); } } } static void active_mcast_noflush_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { int i; struct active_instance *rrp_algo_instance = (struct active_instance *)instance->rrp_algo_instance; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_mcast_noflush_send (instance->net_handles[i], msg, msg_len); } } } static void active_token_recv ( struct totemrrp_instance *rrp_instance, unsigned int iface_no, void *context, const void *msg, unsigned int msg_len, unsigned int token_seq) { int i; struct active_instance *active_instance = (struct active_instance *)rrp_instance->rrp_algo_instance; active_instance->totemrrp_context = context; if (sq_lt_compare (active_instance->last_token_seq, token_seq)) { memcpy (active_instance->token, msg, msg_len); active_instance->token_len = msg_len; for (i = 0; i < rrp_instance->interface_count; i++) { active_instance->last_token_recv[i] = 0; } active_instance->last_token_recv[iface_no] = 1; active_timer_expired_token_start (active_instance); } /* * This doesn't follow spec because the spec assumes we will know * when token resets occur. */ active_instance->last_token_seq = token_seq; if (token_seq == active_instance->last_token_seq) { active_instance->last_token_recv[iface_no] = 1; for (i = 0; i < rrp_instance->interface_count; i++) { if ((active_instance->last_token_recv[i] == 0) && active_instance->faulty[i] == 0) { return; /* don't deliver token */ } } active_timer_expired_token_cancel (active_instance); rrp_instance->totemrrp_deliver_fn ( context, msg, msg_len); } } static void active_token_send ( struct totemrrp_instance *instance, const void *msg, unsigned int msg_len) { struct active_instance *rrp_algo_instance = (struct active_instance *)instance->rrp_algo_instance; int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_token_send ( instance->net_handles[i], msg, msg_len); } } } static void active_recv_flush (struct totemrrp_instance *instance) { struct active_instance *rrp_algo_instance = (struct active_instance *)instance->rrp_algo_instance; unsigned int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_recv_flush (instance->net_handles[i]); } } } static void active_send_flush (struct totemrrp_instance *instance) { struct active_instance *rrp_algo_instance = (struct active_instance *)instance->rrp_algo_instance; unsigned int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_send_flush (instance->net_handles[i]); } } } static int active_member_add ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no) { int res; res = totemnet_member_add (instance->net_handles[iface_no], member); return (res); } static int active_member_remove ( struct totemrrp_instance *instance, const struct totem_ip_address *member, unsigned int iface_no) { int res; res = totemnet_member_remove (instance->net_handles[iface_no], member); return (res); } static void active_iface_check (struct totemrrp_instance *instance) { struct active_instance *rrp_algo_instance = (struct active_instance *)instance->rrp_algo_instance; unsigned int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_iface_check (instance->net_handles[i]); } } } static void active_processor_count_set ( struct totemrrp_instance *instance, unsigned int processor_count) { struct active_instance *rrp_algo_instance = (struct active_instance *)instance->rrp_algo_instance; unsigned int i; for (i = 0; i < instance->interface_count; i++) { if (rrp_algo_instance->faulty[i] == 0) { totemnet_processor_count_set (instance->net_handles[i], processor_count); } } } static void active_token_target_set ( struct totemrrp_instance *instance, struct totem_ip_address *token_target, unsigned int iface_no) { totemnet_token_target_set (instance->net_handles[iface_no], token_target); } static int active_mcast_recv_empty ( struct totemrrp_instance *instance) { int res; int msgs_emptied = 0; int i; for (i = 0; i < instance->interface_count; i++) { res = totemnet_recv_mcast_empty (instance->net_handles[i]); if (res == -1) { return (-1); } if (res == 1) { msgs_emptied = 1; } } return (msgs_emptied); } static void active_ring_reenable ( struct totemrrp_instance *instance, unsigned int iface_no) { struct active_instance *rrp_algo_instance = (struct active_instance *)instance->rrp_algo_instance; if (iface_no == instance->interface_count) { memset (rrp_algo_instance->last_token_recv, 0, sizeof (unsigned int) * instance->interface_count); memset (rrp_algo_instance->faulty, 0, sizeof (unsigned int) * instance->interface_count); memset (rrp_algo_instance->counter_problems, 0, sizeof (unsigned int) * instance->interface_count); } else { rrp_algo_instance->last_token_recv[iface_no] = 0; rrp_algo_instance->faulty[iface_no] = 0; rrp_algo_instance->counter_problems[iface_no] = 0; } } static void totemrrp_instance_initialize (struct totemrrp_instance *instance) { memset (instance, 0, sizeof (struct totemrrp_instance)); } static int totemrrp_algorithm_set ( struct totem_config *totem_config, struct totemrrp_instance *instance) { unsigned int res = -1; unsigned int i; for (i = 0; i < RRP_ALGOS_COUNT; i++) { if (strcmp (totem_config->rrp_mode, rrp_algos[i]->name) == 0) { instance->rrp_algo = rrp_algos[i]; if (rrp_algos[i]->initialize) { instance->rrp_algo_instance = rrp_algos[i]->initialize ( instance, totem_config->interface_count); } res = 0; break; } } for (i = 0; i < totem_config->interface_count; i++) { instance->status[i] = malloc (1024); sprintf (instance->status[i], "ring %d active with no faults", i); } return (res); } void rrp_deliver_fn ( void *context, const void *msg, unsigned int msg_len) { unsigned int token_seqid; unsigned int token_is; struct deliver_fn_context *deliver_fn_context = (struct deliver_fn_context *)context; struct totemrrp_instance *rrp_instance = deliver_fn_context->instance; const struct message_header *hdr = msg; struct message_header tmp_msg, activate_msg; memset(&tmp_msg, 0, sizeof(struct message_header)); memset(&activate_msg, 0, sizeof(struct message_header)); rrp_instance->totemrrp_token_seqid_get ( msg, &token_seqid, &token_is); if (hdr->type == MESSAGE_TYPE_RING_TEST_ACTIVE) { log_printf ( rrp_instance->totemrrp_log_level_debug, "received message requesting test of ring now active\n"); if (hdr->endian_detector != ENDIAN_LOCAL) { test_active_msg_endian_convert(hdr, &tmp_msg); hdr = &tmp_msg; } if (hdr->nodeid_activator == rrp_instance->my_nodeid) { /* * Send an activate message */ activate_msg.type = MESSAGE_TYPE_RING_TEST_ACTIVATE; activate_msg.endian_detector = ENDIAN_LOCAL; activate_msg.ring_number = hdr->ring_number; activate_msg.nodeid_activator = rrp_instance->my_nodeid; totemnet_token_send ( rrp_instance->net_handles[deliver_fn_context->iface_no], &activate_msg, sizeof (struct message_header)); } else { /* * Send a ring test message */ totemnet_token_send ( rrp_instance->net_handles[deliver_fn_context->iface_no], msg, msg_len); } } else if (hdr->type == MESSAGE_TYPE_RING_TEST_ACTIVATE) { log_printf ( rrp_instance->totemrrp_log_level_notice, "Automatically recovered ring %d\n", hdr->ring_number); if (hdr->endian_detector != ENDIAN_LOCAL) { test_active_msg_endian_convert(hdr, &tmp_msg); hdr = &tmp_msg; } totemrrp_ring_reenable (rrp_instance, deliver_fn_context->iface_no); if (hdr->nodeid_activator != rrp_instance->my_nodeid) { totemnet_token_send ( rrp_instance->net_handles[deliver_fn_context->iface_no], msg, msg_len); } } else if (token_is) { /* * Deliver to the token receiver for this rrp algorithm */ rrp_instance->rrp_algo->token_recv ( rrp_instance, deliver_fn_context->iface_no, deliver_fn_context->context, msg, msg_len, token_seqid); } else { /* * Deliver to the mcast receiver for this rrp algorithm */ rrp_instance->rrp_algo->mcast_recv ( rrp_instance, deliver_fn_context->iface_no, deliver_fn_context->context, msg, msg_len); } } void rrp_iface_change_fn ( void *context, const struct totem_ip_address *iface_addr) { struct deliver_fn_context *deliver_fn_context = (struct deliver_fn_context *)context; deliver_fn_context->instance->my_nodeid = iface_addr->nodeid; deliver_fn_context->instance->totemrrp_iface_change_fn ( deliver_fn_context->context, iface_addr, deliver_fn_context->iface_no); } int totemrrp_finalize ( void *rrp_context) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; int i; for (i = 0; i < instance->interface_count; i++) { totemnet_finalize (instance->net_handles[i]); } return (0); } static void rrp_target_set_completed (void *context) { struct deliver_fn_context *deliver_fn_context = (struct deliver_fn_context *)context; deliver_fn_context->instance->totemrrp_target_set_completed (deliver_fn_context->context); } /* * Totem Redundant Ring interface * depends on poll abstraction, POSIX, IPV4 */ /* * Create an instance */ int totemrrp_initialize ( qb_loop_t *poll_handle, void **rrp_context, struct totem_config *totem_config, void *context, void (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len), void (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_addr, unsigned int iface_no), void (*token_seqid_get) ( const void *msg, unsigned int *seqid, unsigned int *token_is), unsigned int (*msgs_missing) (void), void (*target_set_completed) (void *context)) { struct totemrrp_instance *instance; unsigned int res; int i; instance = malloc (sizeof (struct totemrrp_instance)); if (instance == 0) { return (-1); } totemrrp_instance_initialize (instance); instance->totem_config = totem_config; res = totemrrp_algorithm_set ( instance->totem_config, instance); if (res == -1) { goto error_destroy; } /* * Configure logging */ instance->totemrrp_log_level_security = totem_config->totem_logging_configuration.log_level_security; instance->totemrrp_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemrrp_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemrrp_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemrrp_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemrrp_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemrrp_log_printf = totem_config->totem_logging_configuration.log_printf; instance->interfaces = totem_config->interfaces; instance->poll_handle = poll_handle; instance->totemrrp_deliver_fn = deliver_fn; instance->totemrrp_iface_change_fn = iface_change_fn; instance->totemrrp_token_seqid_get = token_seqid_get; instance->totemrrp_target_set_completed = target_set_completed; instance->totemrrp_msgs_missing = msgs_missing; instance->interface_count = totem_config->interface_count; instance->net_handles = malloc (sizeof (void *) * totem_config->interface_count); instance->context = context; instance->poll_handle = poll_handle; for (i = 0; i < totem_config->interface_count; i++) { struct deliver_fn_context *deliver_fn_context; deliver_fn_context = malloc (sizeof (struct deliver_fn_context)); assert (deliver_fn_context); deliver_fn_context->instance = instance; deliver_fn_context->context = context; deliver_fn_context->iface_no = i; instance->deliver_fn_context[i] = (void *)deliver_fn_context; totemnet_initialize ( poll_handle, &instance->net_handles[i], totem_config, i, (void *)deliver_fn_context, rrp_deliver_fn, rrp_iface_change_fn, rrp_target_set_completed); totemnet_net_mtu_adjust (instance->net_handles[i], totem_config); } *rrp_context = instance; return (0); error_destroy: free (instance); return (res); } void *totemrrp_buffer_alloc (void *rrp_context) { struct totemrrp_instance *instance = rrp_context; assert (instance != NULL); return totemnet_buffer_alloc (instance->net_handles[0]); } void totemrrp_buffer_release (void *rrp_context, void *ptr) { struct totemrrp_instance *instance = rrp_context; assert (instance != NULL); totemnet_buffer_release (instance->net_handles[0], ptr); } int totemrrp_processor_count_set ( void *rrp_context, unsigned int processor_count) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; instance->rrp_algo->processor_count_set (instance, processor_count); instance->processor_count = processor_count; return (0); } int totemrrp_token_target_set ( void *rrp_context, struct totem_ip_address *addr, unsigned int iface_no) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; instance->rrp_algo->token_target_set (instance, addr, iface_no); return (0); } int totemrrp_recv_flush (void *rrp_context) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; instance->rrp_algo->recv_flush (instance); return (0); } int totemrrp_send_flush (void *rrp_context) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; instance->rrp_algo->send_flush (instance); return (0); } int totemrrp_token_send ( void *rrp_context, const void *msg, unsigned int msg_len) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; instance->rrp_algo->token_send (instance, msg, msg_len); return (0); } int totemrrp_mcast_flush_send ( void *rrp_context, const void *msg, unsigned int msg_len) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; int res = 0; // TODO this needs to return the result instance->rrp_algo->mcast_flush_send (instance, msg, msg_len); return (res); } int totemrrp_mcast_noflush_send ( void *rrp_context, const void *msg, unsigned int msg_len) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; /* * merge detects go out through mcast_flush_send so it is safe to * flush these messages if we are only one processor. This avoids * an encryption/hmac and decryption/hmac */ if (instance->processor_count > 1) { // TODO this needs to return the result instance->rrp_algo->mcast_noflush_send (instance, msg, msg_len); } return (0); } int totemrrp_iface_check (void *rrp_context) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; instance->rrp_algo->iface_check (instance); return (0); } int totemrrp_ifaces_get ( void *rrp_context, char ***status, unsigned int *iface_count) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; *status = instance->status; if (iface_count) { *iface_count = instance->interface_count; } return (0); } int totemrrp_crypto_set ( void *rrp_context, unsigned int type) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; int res; res = totemnet_crypto_set(instance->net_handles[0], type); return (res); } /* * iface_no indicates the interface number [0, ..., interface_count-1] of the * specific ring which will be reenabled. We specify iface_no == interface_count * means reenabling all the rings. */ int totemrrp_ring_reenable ( void *rrp_context, unsigned int iface_no) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; int res = 0; unsigned int i; instance->rrp_algo->ring_reenable (instance, iface_no); if (iface_no == instance->interface_count) { for (i = 0; i < instance->interface_count; i++) { sprintf (instance->status[i], "ring %d active with no faults", i); } } else { sprintf (instance->status[iface_no], "ring %d active with no faults", iface_no); } return (res); } extern int totemrrp_mcast_recv_empty ( void *rrp_context) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; int res; res = instance->rrp_algo->mcast_recv_empty (instance); return (res); } int totemrrp_member_add ( void *rrp_context, const struct totem_ip_address *member, int iface_no) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; int res; res = instance->rrp_algo->member_add (instance, member, iface_no); return (res); } int totemrrp_member_remove ( void *rrp_context, const struct totem_ip_address *member, int iface_no) { struct totemrrp_instance *instance = (struct totemrrp_instance *)rrp_context; int res; res = instance->rrp_algo->member_remove (instance, member, iface_no); return (res); } diff --git a/exec/totemsrp.c b/exec/totemsrp.c index 861c75be..c693b694 100644 --- a/exec/totemsrp.c +++ b/exec/totemsrp.c @@ -1,4501 +1,4504 @@ /* * Copyright (c) 2003-2006 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * The first version of this code was based upon Yair Amir's PhD thesis: * http://www.cs.jhu.edu/~yairamir/phd.ps) (ch4,5). * * The current version of totemsrp implements the Totem protocol specified in: * http://citeseer.ist.psu.edu/amir95totem.html * * The deviations from the above published protocols are: * - encryption of message contents with SOBER128 * - authentication of meessage contents with SHA1/HMAC * - token hold mode where token doesn't rotate on unused ring - reduces cpu * usage on 1.6ghz xeon from 35% to less then .1 % as measured by top */ #include #include #ifdef HAVE_ALLOCA_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemsrp.h" #include "totemrrp.h" #include "totemnet.h" #include "crypto.h" #include "cs_queue.h" #define LOCALHOST_IP inet_addr("127.0.0.1") #define QUEUE_RTR_ITEMS_SIZE_MAX 16384 /* allow 16384 retransmit items */ #define RETRANS_MESSAGE_QUEUE_SIZE_MAX 16384 /* allow 500 messages to be queued */ #define RECEIVED_MESSAGE_QUEUE_SIZE_MAX 500 /* allow 500 messages to be queued */ #define MAXIOVS 5 #define RETRANSMIT_ENTRIES_MAX 30 #define TOKEN_SIZE_MAX 64000 /* bytes */ #define LEAVE_DUMMY_NODEID 0 /* * Rollover handling: * SEQNO_START_MSG is the starting sequence number after a new configuration * This should remain zero, unless testing overflow in which case * 0x7ffff000 and 0xfffff000 are good starting values. * * SEQNO_START_TOKEN is the starting sequence number after a new configuration * for a token. This should remain zero, unless testing overflow in which * case 07fffff00 or 0xffffff00 are good starting values. */ #define SEQNO_START_MSG 0x0 #define SEQNO_START_TOKEN 0x0 /* * These can be used ot test different rollover points * #define SEQNO_START_MSG 0xfffffe00 * #define SEQNO_START_TOKEN 0xfffffe00 */ /* * These can be used to test the error recovery algorithms * #define TEST_DROP_ORF_TOKEN_PERCENTAGE 30 * #define TEST_DROP_COMMIT_TOKEN_PERCENTAGE 30 * #define TEST_DROP_MCAST_PERCENTAGE 50 * #define TEST_RECOVERY_MSG_COUNT 300 */ /* * we compare incoming messages to determine if their endian is * different - if so convert them * * do not change */ #define ENDIAN_LOCAL 0xff22 enum message_type { MESSAGE_TYPE_ORF_TOKEN = 0, /* Ordering, Reliability, Flow (ORF) control Token */ MESSAGE_TYPE_MCAST = 1, /* ring ordered multicast message */ MESSAGE_TYPE_MEMB_MERGE_DETECT = 2, /* merge rings if there are available rings */ MESSAGE_TYPE_MEMB_JOIN = 3, /* membership join message */ MESSAGE_TYPE_MEMB_COMMIT_TOKEN = 4, /* membership commit token */ MESSAGE_TYPE_TOKEN_HOLD_CANCEL = 5, /* cancel the holding of the token */ }; enum encapsulation_type { MESSAGE_ENCAPSULATED = 1, MESSAGE_NOT_ENCAPSULATED = 2 }; /* * New membership algorithm local variables */ struct srp_addr { struct totem_ip_address addr[INTERFACE_MAX]; }; struct consensus_list_item { struct srp_addr addr; int set; }; struct token_callback_instance { struct list_head list; int (*callback_fn) (enum totem_callback_token_type type, const void *); enum totem_callback_token_type callback_type; int delete; void *data; }; struct totemsrp_socket { int mcast; int token; }; struct message_header { char type; char encapsulated; unsigned short endian_detector; unsigned int nodeid; } __attribute__((packed)); struct mcast { struct message_header header; struct srp_addr system_from; unsigned int seq; int this_seqno; struct memb_ring_id ring_id; unsigned int node_id; int guarantee; } __attribute__((packed)); struct rtr_item { struct memb_ring_id ring_id; unsigned int seq; }__attribute__((packed)); struct orf_token { struct message_header header; unsigned int seq; unsigned int token_seq; unsigned int aru; unsigned int aru_addr; struct memb_ring_id ring_id; unsigned int backlog; unsigned int fcc; int retrans_flg; int rtr_list_entries; struct rtr_item rtr_list[0]; }__attribute__((packed)); struct memb_join { struct message_header header; struct srp_addr system_from; unsigned int proc_list_entries; unsigned int failed_list_entries; unsigned long long ring_seq; unsigned char end_of_memb_join[0]; /* * These parts of the data structure are dynamic: * struct srp_addr proc_list[]; * struct srp_addr failed_list[]; */ } __attribute__((packed)); struct memb_merge_detect { struct message_header header; struct srp_addr system_from; struct memb_ring_id ring_id; } __attribute__((packed)); struct token_hold_cancel { struct message_header header; struct memb_ring_id ring_id; } __attribute__((packed)); struct memb_commit_token_memb_entry { struct memb_ring_id ring_id; unsigned int aru; unsigned int high_delivered; unsigned int received_flg; }__attribute__((packed)); struct memb_commit_token { struct message_header header; unsigned int token_seq; struct memb_ring_id ring_id; unsigned int retrans_flg; int memb_index; int addr_entries; unsigned char end_of_commit_token[0]; /* * These parts of the data structure are dynamic: * * struct srp_addr addr[PROCESSOR_COUNT_MAX]; * struct memb_commit_token_memb_entry memb_list[PROCESSOR_COUNT_MAX]; */ }__attribute__((packed)); struct message_item { struct mcast *mcast; unsigned int msg_len; }; struct sort_queue_item { struct mcast *mcast; unsigned int msg_len; }; struct orf_token_mcast_thread_state { char iobuf[9000]; prng_state prng_state; }; enum memb_state { MEMB_STATE_OPERATIONAL = 1, MEMB_STATE_GATHER = 2, MEMB_STATE_COMMIT = 3, MEMB_STATE_RECOVERY = 4 }; struct totemsrp_instance { int iface_changes; int failed_to_recv; /* * Flow control mcasts and remcasts on last and current orf_token */ int fcc_remcast_last; int fcc_mcast_last; int fcc_remcast_current; struct consensus_list_item consensus_list[PROCESSOR_COUNT_MAX]; int consensus_list_entries; struct srp_addr my_id; struct srp_addr my_proc_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_failed_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_new_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_trans_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_deliver_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX]; int my_proc_list_entries; int my_failed_list_entries; int my_new_memb_entries; int my_trans_memb_entries; int my_memb_entries; int my_deliver_memb_entries; int my_left_memb_entries; struct memb_ring_id my_ring_id; struct memb_ring_id my_old_ring_id; int my_aru_count; int my_merge_detect_timeout_outstanding; unsigned int my_last_aru; int my_seq_unchanged; int my_received_flg; unsigned int my_high_seq_received; unsigned int my_install_seq; int my_rotation_counter; int my_set_retrans_flg; int my_retrans_flg_count; unsigned int my_high_ring_delivered; int heartbeat_timeout; /* * Queues used to order, deliver, and recover messages */ struct cs_queue new_message_queue; struct cs_queue retrans_message_queue; struct sq regular_sort_queue; struct sq recovery_sort_queue; /* * Received up to and including */ unsigned int my_aru; unsigned int my_high_delivered; struct list_head token_callback_received_listhead; struct list_head token_callback_sent_listhead; char orf_token_retransmit[TOKEN_SIZE_MAX]; int orf_token_retransmit_size; unsigned int my_token_seq; /* * Timers */ qb_loop_timer_handle timer_pause_timeout; qb_loop_timer_handle timer_orf_token_timeout; qb_loop_timer_handle timer_orf_token_retransmit_timeout; qb_loop_timer_handle timer_orf_token_hold_retransmit_timeout; qb_loop_timer_handle timer_merge_detect_timeout; qb_loop_timer_handle memb_timer_state_gather_join_timeout; qb_loop_timer_handle memb_timer_state_gather_consensus_timeout; qb_loop_timer_handle memb_timer_state_commit_timeout; qb_loop_timer_handle timer_heartbeat_timeout; /* * Function and data used to log messages */ int totemsrp_log_level_security; int totemsrp_log_level_error; int totemsrp_log_level_warning; int totemsrp_log_level_notice; int totemsrp_log_level_debug; int totemsrp_subsys_id; void (*totemsrp_log_printf) ( int level, int sybsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7)));; enum memb_state memb_state; //TODO struct srp_addr next_memb; qb_loop_t *totemsrp_poll_handle; struct totem_ip_address mcast_address; void (*totemsrp_deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required); void (*totemsrp_confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); void (*totemsrp_service_ready_fn) (void); int global_seqno; int my_token_held; unsigned long long token_ring_id_seq; unsigned int last_released; unsigned int set_aru; int old_ring_state_saved; int old_ring_state_aru; unsigned int old_ring_state_high_seq_received; unsigned int my_last_seq; struct timeval tv_old; void *totemrrp_context; struct totem_config *totem_config; unsigned int use_heartbeat; unsigned int my_trc; unsigned int my_pbl; unsigned int my_cbl; uint64_t pause_timestamp; struct memb_commit_token *commit_token; totemsrp_stats_t stats; uint32_t orf_token_discard; uint32_t threaded_mode_enabled; void * token_recv_event_handle; void * token_sent_event_handle; char commit_token_storage[40000]; }; struct message_handlers { int count; int (*handler_functions[6]) ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); }; /* * forward decls */ static int message_handler_orf_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_mcast ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_join ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static void totemsrp_instance_initialize (struct totemsrp_instance *instance); static unsigned int main_msgs_missing (void); static void main_token_seqid_get ( const void *msg, unsigned int *seqid, unsigned int *token_is); static void srp_addr_copy (struct srp_addr *dest, const struct srp_addr *src); static void srp_addr_to_nodeid ( unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries); static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b); static void memb_leave_message_send (struct totemsrp_instance *instance); static void memb_ring_id_create_or_load (struct totemsrp_instance *, struct memb_ring_id *); static void token_callbacks_execute (struct totemsrp_instance *instance, enum totem_callback_token_type type); static void memb_state_gather_enter (struct totemsrp_instance *instance, int gather_from); static void messages_deliver_to_app (struct totemsrp_instance *instance, int skip, unsigned int end_point); static int orf_token_mcast (struct totemsrp_instance *instance, struct orf_token *oken, int fcc_mcasts_allowed); static void messages_free (struct totemsrp_instance *instance, unsigned int token_aru); static void memb_ring_id_set_and_store (struct totemsrp_instance *instance, const struct memb_ring_id *ring_id); static void target_set_completed (void *context); static void memb_state_commit_token_update (struct totemsrp_instance *instance); static void memb_state_commit_token_target_set (struct totemsrp_instance *instance); static int memb_state_commit_token_send (struct totemsrp_instance *instance); static int memb_state_commit_token_send_recovery (struct totemsrp_instance *instance, struct memb_commit_token *memb_commit_token); static void memb_state_commit_token_create (struct totemsrp_instance *instance); static int token_hold_cancel_send (struct totemsrp_instance *instance); static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out); static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out); static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out); static void mcast_endian_convert (const struct mcast *in, struct mcast *out); static void memb_merge_detect_endian_convert ( const struct memb_merge_detect *in, struct memb_merge_detect *out); static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in); static void timer_function_orf_token_timeout (void *data); static void timer_function_pause_timeout (void *data); static void timer_function_heartbeat_timeout (void *data); static void timer_function_token_retransmit_timeout (void *data); static void timer_function_token_hold_retransmit_timeout (void *data); static void timer_function_merge_detect_timeout (void *data); static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance); static void totemsrp_buffer_release (struct totemsrp_instance *instance, void *ptr); void main_deliver_fn ( void *context, const void *msg, unsigned int msg_len); void main_iface_change_fn ( void *context, const struct totem_ip_address *iface_address, unsigned int iface_no); struct message_handlers totemsrp_message_handlers = { 6, { message_handler_orf_token, /* MESSAGE_TYPE_ORF_TOKEN */ message_handler_mcast, /* MESSAGE_TYPE_MCAST */ message_handler_memb_merge_detect, /* MESSAGE_TYPE_MEMB_MERGE_DETECT */ message_handler_memb_join, /* MESSAGE_TYPE_MEMB_JOIN */ message_handler_memb_commit_token, /* MESSAGE_TYPE_MEMB_COMMIT_TOKEN */ message_handler_token_hold_cancel /* MESSAGE_TYPE_TOKEN_HOLD_CANCEL */ } }; static const char *rundir = NULL; #define log_printf(level, format, args...) \ do { \ instance->totemsrp_log_printf ( \ level, instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ format, ##args); \ } while (0); #define LOGSYS_PERROR(err_num, level, fmt, args...) \ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemsrp_log_printf ( \ level, instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)\n", ##args, _error_ptr, err_num); \ } while(0) static void totemsrp_instance_initialize (struct totemsrp_instance *instance) { memset (instance, 0, sizeof (struct totemsrp_instance)); list_init (&instance->token_callback_received_listhead); list_init (&instance->token_callback_sent_listhead); instance->my_received_flg = 1; instance->my_token_seq = SEQNO_START_TOKEN - 1; instance->memb_state = MEMB_STATE_OPERATIONAL; instance->set_aru = -1; instance->my_aru = SEQNO_START_MSG; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_high_delivered = SEQNO_START_MSG; instance->orf_token_discard = 0; instance->commit_token = (struct memb_commit_token *)instance->commit_token_storage; } static void main_token_seqid_get ( const void *msg, unsigned int *seqid, unsigned int *token_is) { const struct orf_token *token = msg; *seqid = 0; *token_is = 0; if (token->header.type == MESSAGE_TYPE_ORF_TOKEN) { *seqid = token->token_seq; *token_is = 1; } } static unsigned int main_msgs_missing (void) { // TODO return (0); } static int pause_flush (struct totemsrp_instance *instance) { uint64_t now_msec; uint64_t timestamp_msec; int res = 0; now_msec = (qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC); timestamp_msec = instance->pause_timestamp / QB_TIME_NS_IN_MSEC; if ((now_msec - timestamp_msec) > (instance->totem_config->token_timeout / 2)) { log_printf (instance->totemsrp_log_level_notice, "Process pause detected for %d ms, flushing membership messages.\n", (unsigned int)(now_msec - timestamp_msec)); /* * -1 indicates an error from recvmsg */ do { res = totemrrp_mcast_recv_empty (instance->totemrrp_context); } while (res == -1); } return (res); } static int token_event_stats_collector (enum totem_callback_token_type type, const void *void_instance) { struct totemsrp_instance *instance = (struct totemsrp_instance *)void_instance; uint32_t time_now; unsigned long long nano_secs = qb_util_nano_current_get (); time_now = (nano_secs / QB_TIME_NS_IN_MSEC); if (type == TOTEM_CALLBACK_TOKEN_RECEIVED) { /* incr latest token the index */ if (instance->stats.latest_token == (TOTEM_TOKEN_STATS_MAX - 1)) instance->stats.latest_token = 0; else instance->stats.latest_token++; if (instance->stats.earliest_token == instance->stats.latest_token) { /* we have filled up the array, start overwriting */ if (instance->stats.earliest_token == (TOTEM_TOKEN_STATS_MAX - 1)) instance->stats.earliest_token = 0; else instance->stats.earliest_token++; instance->stats.token[instance->stats.earliest_token].rx = 0; instance->stats.token[instance->stats.earliest_token].tx = 0; instance->stats.token[instance->stats.earliest_token].backlog_calc = 0; } instance->stats.token[instance->stats.latest_token].rx = time_now; instance->stats.token[instance->stats.latest_token].tx = 0; /* in case we drop the token */ } else { instance->stats.token[instance->stats.latest_token].tx = time_now; } return 0; } /* * Exported interfaces */ int totemsrp_initialize ( qb_loop_t *poll_handle, void **srp_context, struct totem_config *totem_config, totemmrp_stats_t *stats, void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)) { struct totemsrp_instance *instance; unsigned int res; instance = malloc (sizeof (struct totemsrp_instance)); if (instance == NULL) { goto error_exit; } rundir = getenv ("COROSYNC_RUN_DIR"); if (rundir == NULL) { rundir = LOCALSTATEDIR "/lib/corosync"; } res = mkdir (rundir, 0700); if (res == -1 && errno != EEXIST) { goto error_destroy; } res = chdir (rundir); if (res == -1) { goto error_destroy; } totemsrp_instance_initialize (instance); stats->srp = &instance->stats; instance->stats.latest_token = 0; instance->stats.earliest_token = 0; instance->totem_config = totem_config; /* * Configure logging */ instance->totemsrp_log_level_security = totem_config->totem_logging_configuration.log_level_security; instance->totemsrp_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemsrp_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemsrp_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemsrp_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemsrp_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemsrp_log_printf = totem_config->totem_logging_configuration.log_printf; /* * Initialize local variables for totemsrp */ totemip_copy (&instance->mcast_address, &totem_config->interfaces[0].mcast_addr); /* * Display totem configuration */ log_printf (instance->totemsrp_log_level_debug, "Token Timeout (%d ms) retransmit timeout (%d ms)\n", totem_config->token_timeout, totem_config->token_retransmit_timeout); log_printf (instance->totemsrp_log_level_debug, "token hold (%d ms) retransmits before loss (%d retrans)\n", totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const); log_printf (instance->totemsrp_log_level_debug, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)\n", totem_config->join_timeout, totem_config->send_join_timeout, totem_config->consensus_timeout, totem_config->merge_timeout); log_printf (instance->totemsrp_log_level_debug, "downcheck (%d ms) fail to recv const (%d msgs)\n", totem_config->downcheck_timeout, totem_config->fail_to_recv_const); log_printf (instance->totemsrp_log_level_debug, "seqno unchanged const (%d rotations) Maximum network MTU %d\n", totem_config->seqno_unchanged_const, totem_config->net_mtu); log_printf (instance->totemsrp_log_level_debug, "window size per rotation (%d messages) maximum messages per rotation (%d messages)\n", totem_config->window_size, totem_config->max_messages); log_printf (instance->totemsrp_log_level_debug, "missed count const (%d messages)\n", totem_config->miss_count_const); log_printf (instance->totemsrp_log_level_debug, "send threads (%d threads)\n", totem_config->threads); log_printf (instance->totemsrp_log_level_debug, "RRP token expired timeout (%d ms)\n", totem_config->rrp_token_expired_timeout); log_printf (instance->totemsrp_log_level_debug, "RRP token problem counter (%d ms)\n", totem_config->rrp_problem_count_timeout); log_printf (instance->totemsrp_log_level_debug, "RRP threshold (%d problem count)\n", totem_config->rrp_problem_count_threshold); + log_printf (instance->totemsrp_log_level_debug, + "RRP multicast threshold (%d problem count)\n", + totem_config->rrp_problem_count_mcast_threshold); log_printf (instance->totemsrp_log_level_debug, "RRP automatic recovery check timeout (%d ms)\n", totem_config->rrp_autorecovery_check_timeout); log_printf (instance->totemsrp_log_level_debug, "RRP mode set to %s.\n", instance->totem_config->rrp_mode); log_printf (instance->totemsrp_log_level_debug, "heartbeat_failures_allowed (%d)\n", totem_config->heartbeat_failures_allowed); log_printf (instance->totemsrp_log_level_debug, "max_network_delay (%d ms)\n", totem_config->max_network_delay); cs_queue_init (&instance->retrans_message_queue, RETRANS_MESSAGE_QUEUE_SIZE_MAX, sizeof (struct message_item), instance->threaded_mode_enabled); sq_init (&instance->regular_sort_queue, QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0); sq_init (&instance->recovery_sort_queue, QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0); instance->totemsrp_poll_handle = poll_handle; instance->totemsrp_deliver_fn = deliver_fn; instance->totemsrp_confchg_fn = confchg_fn; instance->use_heartbeat = 1; timer_function_pause_timeout (instance); if ( totem_config->heartbeat_failures_allowed == 0 ) { log_printf (instance->totemsrp_log_level_debug, "HeartBeat is Disabled. To enable set heartbeat_failures_allowed > 0\n"); instance->use_heartbeat = 0; } if (instance->use_heartbeat) { instance->heartbeat_timeout = (totem_config->heartbeat_failures_allowed) * totem_config->token_retransmit_timeout + totem_config->max_network_delay; if (instance->heartbeat_timeout >= totem_config->token_timeout) { log_printf (instance->totemsrp_log_level_debug, "total heartbeat_timeout (%d ms) is not less than token timeout (%d ms)\n", instance->heartbeat_timeout, totem_config->token_timeout); log_printf (instance->totemsrp_log_level_debug, "heartbeat_timeout = heartbeat_failures_allowed * token_retransmit_timeout + max_network_delay\n"); log_printf (instance->totemsrp_log_level_debug, "heartbeat timeout should be less than the token timeout. HeartBeat is Diabled !!\n"); instance->use_heartbeat = 0; } else { log_printf (instance->totemsrp_log_level_debug, "total heartbeat_timeout (%d ms)\n", instance->heartbeat_timeout); } } totemrrp_initialize ( poll_handle, &instance->totemrrp_context, totem_config, instance, main_deliver_fn, main_iface_change_fn, main_token_seqid_get, main_msgs_missing, target_set_completed); /* * Must have net_mtu adjusted by totemrrp_initialize first */ cs_queue_init (&instance->new_message_queue, MESSAGE_QUEUE_MAX, sizeof (struct message_item), instance->threaded_mode_enabled); totemsrp_callback_token_create (instance, &instance->token_recv_event_handle, TOTEM_CALLBACK_TOKEN_RECEIVED, 0, token_event_stats_collector, instance); totemsrp_callback_token_create (instance, &instance->token_sent_event_handle, TOTEM_CALLBACK_TOKEN_SENT, 0, token_event_stats_collector, instance); *srp_context = instance; return (0); error_destroy: free (instance); error_exit: return (-1); } void totemsrp_finalize ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; memb_leave_message_send (instance); free (srp_context); } int totemsrp_ifaces_get ( void *srp_context, unsigned int nodeid, struct totem_ip_address *interfaces, char ***status, unsigned int *iface_count) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res = 0; unsigned int found = 0; unsigned int i; for (i = 0; i < instance->my_memb_entries; i++) { if (instance->my_memb_list[i].addr[0].nodeid == nodeid) { found = 1; break; } } if (found) { memcpy (interfaces, &instance->my_memb_list[i], sizeof (struct srp_addr)); *iface_count = instance->totem_config->interface_count; goto finish; } for (i = 0; i < instance->my_left_memb_entries; i++) { if (instance->my_left_memb_list[i].addr[0].nodeid == nodeid) { found = 1; break; } } if (found) { memcpy (interfaces, &instance->my_left_memb_list[i], sizeof (struct srp_addr)); *iface_count = instance->totem_config->interface_count; } else { res = -1; } finish: totemrrp_ifaces_get (instance->totemrrp_context, status, NULL); return (res); } int totemsrp_crypto_set ( void *srp_context, unsigned int type) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res; res = totemrrp_crypto_set(instance->totemrrp_context, type); return (res); } unsigned int totemsrp_my_nodeid_get ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; unsigned int res; res = instance->totem_config->interfaces[0].boundto.nodeid; return (res); } int totemsrp_my_family_get ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res; res = instance->totem_config->interfaces[0].boundto.family; return (res); } int totemsrp_ring_reenable ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; totemrrp_ring_reenable (instance->totemrrp_context, instance->totem_config->interface_count); return (0); } /* * Set operations for use by the membership algorithm */ static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b) { unsigned int i; unsigned int res; for (i = 0; i < 1; i++) { res = totemip_equal (&a->addr[i], &b->addr[i]); if (res == 0) { return (0); } } return (1); } static void srp_addr_copy (struct srp_addr *dest, const struct srp_addr *src) { unsigned int i; for (i = 0; i < INTERFACE_MAX; i++) { totemip_copy (&dest->addr[i], &src->addr[i]); } } static void srp_addr_to_nodeid ( unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries) { unsigned int i; for (i = 0; i < entries; i++) { nodeid_out[i] = srp_addr_in[i].addr[0].nodeid; } } static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in) { int i; for (i = 0; i < INTERFACE_MAX; i++) { totemip_copy_endian_convert (&out->addr[i], &in->addr[i]); } } static void memb_consensus_reset (struct totemsrp_instance *instance) { instance->consensus_list_entries = 0; } static void memb_set_subtract ( struct srp_addr *out_list, int *out_list_entries, struct srp_addr *one_list, int one_list_entries, struct srp_addr *two_list, int two_list_entries) { int found = 0; int i; int j; *out_list_entries = 0; for (i = 0; i < one_list_entries; i++) { for (j = 0; j < two_list_entries; j++) { if (srp_addr_equal (&one_list[i], &two_list[j])) { found = 1; break; } } if (found == 0) { srp_addr_copy (&out_list[*out_list_entries], &one_list[i]); *out_list_entries = *out_list_entries + 1; } found = 0; } } /* * Set consensus for a specific processor */ static void memb_consensus_set ( struct totemsrp_instance *instance, const struct srp_addr *addr) { int found = 0; int i; if (addr->addr[0].nodeid == LEAVE_DUMMY_NODEID) return; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal(addr, &instance->consensus_list[i].addr)) { found = 1; break; /* found entry */ } } srp_addr_copy (&instance->consensus_list[i].addr, addr); instance->consensus_list[i].set = 1; if (found == 0) { instance->consensus_list_entries++; } return; } /* * Is consensus set for a specific processor */ static int memb_consensus_isset ( struct totemsrp_instance *instance, const struct srp_addr *addr) { int i; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal (addr, &instance->consensus_list[i].addr)) { return (instance->consensus_list[i].set); } } return (0); } /* * Is consensus agreed upon based upon consensus database */ static int memb_consensus_agreed ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; int token_memb_entries = 0; int agreed = 1; int i; memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); for (i = 0; i < token_memb_entries; i++) { if (memb_consensus_isset (instance, &token_memb[i]) == 0) { agreed = 0; break; } } assert (token_memb_entries >= 1); return (agreed); } static void memb_consensus_notset ( struct totemsrp_instance *instance, struct srp_addr *no_consensus_list, int *no_consensus_list_entries, struct srp_addr *comparison_list, int comparison_list_entries) { int i; *no_consensus_list_entries = 0; for (i = 0; i < instance->my_proc_list_entries; i++) { if (memb_consensus_isset (instance, &instance->my_proc_list[i]) == 0) { srp_addr_copy (&no_consensus_list[*no_consensus_list_entries], &instance->my_proc_list[i]); *no_consensus_list_entries = *no_consensus_list_entries + 1; } } } /* * Is set1 equal to set2 Entries can be in different orders */ static int memb_set_equal ( struct srp_addr *set1, int set1_entries, struct srp_addr *set2, int set2_entries) { int i; int j; int found = 0; if (set1_entries != set2_entries) { return (0); } for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { found = 1; break; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * Is subset fully contained in fullset */ static int memb_set_subset ( const struct srp_addr *subset, int subset_entries, const struct srp_addr *fullset, int fullset_entries) { int i; int j; int found = 0; if (subset_entries > fullset_entries) { return (0); } for (i = 0; i < subset_entries; i++) { for (j = 0; j < fullset_entries; j++) { if (srp_addr_equal (&subset[i], &fullset[j])) { found = 1; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * merge subset into fullset taking care not to add duplicates */ static void memb_set_merge ( const struct srp_addr *subset, int subset_entries, struct srp_addr *fullset, int *fullset_entries) { int found = 0; int i; int j; for (i = 0; i < subset_entries; i++) { for (j = 0; j < *fullset_entries; j++) { if (srp_addr_equal (&fullset[j], &subset[i])) { found = 1; break; } } if (found == 0) { srp_addr_copy (&fullset[*fullset_entries], &subset[i]); *fullset_entries = *fullset_entries + 1; } found = 0; } return; } static void memb_set_and_with_ring_id ( struct srp_addr *set1, struct memb_ring_id *set1_ring_ids, int set1_entries, struct srp_addr *set2, int set2_entries, struct memb_ring_id *old_ring_id, struct srp_addr *and, int *and_entries) { int i; int j; int found = 0; *and_entries = 0; for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { if (memcmp (&set1_ring_ids[j], old_ring_id, sizeof (struct memb_ring_id)) == 0) { found = 1; } break; } } if (found) { srp_addr_copy (&and[*and_entries], &set1[j]); *and_entries = *and_entries + 1; } found = 0; } return; } #ifdef CODE_COVERAGE static void memb_set_print ( char *string, struct srp_addr *list, int list_entries) { int i; int j; printf ("List '%s' contains %d entries:\n", string, list_entries); for (i = 0; i < list_entries; i++) { for (j = 0; j < INTERFACE_MAX; j++) { printf ("Address %d\n", i); printf ("\tiface %d %s\n", j, totemip_print (&list[i].addr[j])); printf ("family %d\n", list[i].addr[j].family); } } } #endif static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance) { assert (instance != NULL); return totemrrp_buffer_alloc (instance->totemrrp_context); } static void totemsrp_buffer_release (struct totemsrp_instance *instance, void *ptr) { assert (instance != NULL); totemrrp_buffer_release (instance->totemrrp_context, ptr); } static void reset_token_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_retransmit_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_token_retransmit_timeout, &instance->timer_orf_token_retransmit_timeout); } static void start_merge_detect_timeout (struct totemsrp_instance *instance) { if (instance->my_merge_detect_timeout_outstanding == 0) { qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->merge_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); instance->my_merge_detect_timeout_outstanding = 1; } } static void cancel_merge_detect_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_merge_detect_timeout); instance->my_merge_detect_timeout_outstanding = 0; } /* * ring_state_* is used to save and restore the sort queue * state when a recovery operation fails (and enters gather) */ static void old_ring_state_save (struct totemsrp_instance *instance) { if (instance->old_ring_state_saved == 0) { instance->old_ring_state_saved = 1; memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); instance->old_ring_state_aru = instance->my_aru; instance->old_ring_state_high_seq_received = instance->my_high_seq_received; log_printf (instance->totemsrp_log_level_debug, "Saving state aru %x high seq received %x\n", instance->my_aru, instance->my_high_seq_received); } } static void old_ring_state_restore (struct totemsrp_instance *instance) { instance->my_aru = instance->old_ring_state_aru; instance->my_high_seq_received = instance->old_ring_state_high_seq_received; log_printf (instance->totemsrp_log_level_debug, "Restoring instance->my_aru %x my high seq received %x\n", instance->my_aru, instance->my_high_seq_received); } static void old_ring_state_reset (struct totemsrp_instance *instance) { log_printf (instance->totemsrp_log_level_debug, "Resetting old ring state\n"); instance->old_ring_state_saved = 0; } static void reset_pause_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_pause_timeout); qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_timeout * QB_TIME_NS_IN_MSEC / 5, (void *)instance, timer_function_pause_timeout, &instance->timer_pause_timeout); } static void reset_token_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_orf_token_timeout, &instance->timer_orf_token_timeout); } static void reset_heartbeat_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->heartbeat_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_heartbeat_timeout, &instance->timer_heartbeat_timeout); } static void cancel_token_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); } static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); } static void cancel_token_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); } static void start_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_hold_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_token_hold_retransmit_timeout, &instance->timer_orf_token_hold_retransmit_timeout); } static void cancel_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_hold_retransmit_timeout); } static void memb_state_consensus_timeout_expired ( struct totemsrp_instance *instance) { struct srp_addr no_consensus_list[PROCESSOR_COUNT_MAX]; int no_consensus_list_entries; instance->stats.consensus_timeouts++; if (memb_consensus_agreed (instance)) { memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); reset_token_timeout (instance); // REVIEWED } else { memb_consensus_notset ( instance, no_consensus_list, &no_consensus_list_entries, instance->my_proc_list, instance->my_proc_list_entries); memb_set_merge (no_consensus_list, no_consensus_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, 0); } } static void memb_join_message_send (struct totemsrp_instance *instance); static void memb_merge_detect_transmit (struct totemsrp_instance *instance); /* * Timers used for various states of the membership algorithm */ static void timer_function_pause_timeout (void *data) { struct totemsrp_instance *instance = data; instance->pause_timestamp = qb_util_nano_current_get (); reset_pause_timeout (instance); } static void memb_recovery_state_token_loss (struct totemsrp_instance *instance) { old_ring_state_restore (instance); memb_state_gather_enter (instance, 5); instance->stats.recovery_token_lost++; } static void timer_function_orf_token_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the OPERATIONAL state.\n"); log_printf (instance->totemsrp_log_level_notice, "A processor failed, forming new configuration.\n"); totemrrp_iface_check (instance->totemrrp_context); memb_state_gather_enter (instance, 2); instance->stats.operational_token_lost++; break; case MEMB_STATE_GATHER: log_printf (instance->totemsrp_log_level_debug, "The consensus timeout expired.\n"); memb_state_consensus_timeout_expired (instance); memb_state_gather_enter (instance, 3); instance->stats.gather_token_lost++; break; case MEMB_STATE_COMMIT: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the COMMIT state.\n"); memb_state_gather_enter (instance, 4); instance->stats.commit_token_lost++; break; case MEMB_STATE_RECOVERY: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the RECOVERY state.\n"); memb_recovery_state_token_loss (instance); instance->orf_token_discard = 1; break; } } static void timer_function_heartbeat_timeout (void *data) { struct totemsrp_instance *instance = data; log_printf (instance->totemsrp_log_level_debug, "HeartBeat Timer expired Invoking token loss mechanism in state %d \n", instance->memb_state); timer_function_orf_token_timeout(data); } static void memb_timer_function_state_gather (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: assert (0); /* this should never happen */ break; case MEMB_STATE_GATHER: case MEMB_STATE_COMMIT: memb_join_message_send (instance); /* * Restart the join timeout `*/ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->join_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); break; } } static void memb_timer_function_gather_consensus_timeout (void *data) { struct totemsrp_instance *instance = data; memb_state_consensus_timeout_expired (instance); } static void deliver_messages_from_recovery_to_regular (struct totemsrp_instance *instance) { unsigned int i; struct sort_queue_item *recovery_message_item; struct sort_queue_item regular_message_item; unsigned int range = 0; int res; void *ptr; struct mcast *mcast; log_printf (instance->totemsrp_log_level_debug, "recovery to regular %x-%x\n", SEQNO_START_MSG + 1, instance->my_aru); range = instance->my_aru - SEQNO_START_MSG; /* * Move messages from recovery to regular sort queue */ // todo should i be initialized to 0 or 1 ? for (i = 1; i <= range; i++) { res = sq_item_get (&instance->recovery_sort_queue, i + SEQNO_START_MSG, &ptr); if (res != 0) { continue; } recovery_message_item = ptr; /* * Convert recovery message into regular message */ mcast = recovery_message_item->mcast; if (mcast->header.encapsulated == MESSAGE_ENCAPSULATED) { /* * Message is a recovery message encapsulated * in a new ring message */ regular_message_item.mcast = (struct mcast *)(((char *)recovery_message_item->mcast) + sizeof (struct mcast)); regular_message_item.msg_len = recovery_message_item->msg_len - sizeof (struct mcast); mcast = regular_message_item.mcast; } else { /* * TODO this case shouldn't happen */ continue; } log_printf (instance->totemsrp_log_level_debug, "comparing if ring id is for this processors old ring seqno %d\n", mcast->seq); /* * Only add this message to the regular sort * queue if it was originated with the same ring * id as the previous ring */ if (memcmp (&instance->my_old_ring_id, &mcast->ring_id, sizeof (struct memb_ring_id)) == 0) { res = sq_item_inuse (&instance->regular_sort_queue, mcast->seq); if (res == 0) { sq_item_add (&instance->regular_sort_queue, ®ular_message_item, mcast->seq); if (sq_lt_compare (instance->old_ring_state_high_seq_received, mcast->seq)) { instance->old_ring_state_high_seq_received = mcast->seq; } } } else { log_printf (instance->totemsrp_log_level_debug, "-not adding msg with seq no %x\n", mcast->seq); } } } /* * Change states in the state machine of the membership algorithm */ static void memb_state_operational_enter (struct totemsrp_instance *instance) { struct srp_addr joined_list[PROCESSOR_COUNT_MAX]; int joined_list_entries = 0; unsigned int aru_save; unsigned int joined_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int trans_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int new_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int left_list[PROCESSOR_COUNT_MAX]; unsigned int i; unsigned int res; memb_consensus_reset (instance); old_ring_state_reset (instance); deliver_messages_from_recovery_to_regular (instance); log_printf (instance->totemsrp_log_level_debug, "Delivering to app %x to %x\n", instance->my_high_delivered + 1, instance->old_ring_state_high_seq_received); aru_save = instance->my_aru; instance->my_aru = instance->old_ring_state_aru; messages_deliver_to_app (instance, 0, instance->old_ring_state_high_seq_received); /* * Calculate joined and left list */ memb_set_subtract (instance->my_left_memb_list, &instance->my_left_memb_entries, instance->my_memb_list, instance->my_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); memb_set_subtract (joined_list, &joined_list_entries, instance->my_new_memb_list, instance->my_new_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); /* * Install new membership */ instance->my_memb_entries = instance->my_new_memb_entries; memcpy (&instance->my_memb_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->last_released = 0; instance->my_set_retrans_flg = 0; /* * Deliver transitional configuration to application */ srp_addr_to_nodeid (left_list, instance->my_left_memb_list, instance->my_left_memb_entries); srp_addr_to_nodeid (trans_memb_list_totemip, instance->my_trans_memb_list, instance->my_trans_memb_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_TRANSITIONAL, trans_memb_list_totemip, instance->my_trans_memb_entries, left_list, instance->my_left_memb_entries, 0, 0, &instance->my_ring_id); // TODO we need to filter to ensure we only deliver those // messages which are part of instance->my_deliver_memb messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received); instance->my_aru = aru_save; /* * Deliver regular configuration to application */ srp_addr_to_nodeid (new_memb_list_totemip, instance->my_new_memb_list, instance->my_new_memb_entries); srp_addr_to_nodeid (joined_list_totemip, joined_list, joined_list_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_REGULAR, new_memb_list_totemip, instance->my_new_memb_entries, 0, 0, joined_list_totemip, joined_list_entries, &instance->my_ring_id); /* * The recovery sort queue now becomes the regular * sort queue. It is necessary to copy the state * into the regular sort queue. */ sq_copy (&instance->regular_sort_queue, &instance->recovery_sort_queue); instance->my_last_aru = SEQNO_START_MSG; /* When making my_proc_list smaller, ensure that the * now non-used entries are zero-ed out. There are some suspect * assert's that assume that there is always 2 entries in the list. * These fail when my_proc_list is reduced to 1 entry (and the * valid [0] entry is the same as the 'unused' [1] entry). */ memset(instance->my_proc_list, 0, sizeof (struct srp_addr) * instance->my_proc_list_entries); instance->my_proc_list_entries = instance->my_new_memb_entries; memcpy (instance->my_proc_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->my_failed_list_entries = 0; instance->my_high_delivered = instance->my_high_seq_received; for (i = 0; i <= instance->my_high_delivered; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, i, &ptr); if (res == 0) { struct sort_queue_item *regular_message; regular_message = ptr; free (regular_message->mcast); } } sq_items_release (&instance->regular_sort_queue, instance->my_high_delivered); instance->last_released = instance->my_high_delivered; log_printf (instance->totemsrp_log_level_debug, "entering OPERATIONAL state.\n"); log_printf (instance->totemsrp_log_level_notice, "A processor joined or left the membership and a new membership was formed.\n"); instance->memb_state = MEMB_STATE_OPERATIONAL; instance->stats.operational_entered++; instance->stats.continuous_gather = 0; instance->my_received_flg = 1; reset_pause_timeout (instance); /* * Save ring id information from this configuration to determine * which processors are transitioning from old regular configuration * in to new regular configuration on the next configuration change */ memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); return; } static void memb_state_gather_enter ( struct totemsrp_instance *instance, int gather_from) { instance->orf_token_discard = 1; memb_set_merge ( &instance->my_id, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_join_message_send (instance); /* * Restart the join timeout */ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->join_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); /* * Restart the consensus timeout */ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->consensus_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_gather_consensus_timeout, &instance->memb_timer_state_gather_consensus_timeout); /* * Cancel the token loss and token retransmission timeouts */ cancel_token_retransmit_timeout (instance); // REVIEWED cancel_token_timeout (instance); // REVIEWED cancel_merge_detect_timeout (instance); memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); log_printf (instance->totemsrp_log_level_debug, "entering GATHER state from %d.\n", gather_from); instance->memb_state = MEMB_STATE_GATHER; instance->stats.gather_entered++; if (gather_from == 3) { /* * State 3 means gather, so we are continuously gathering. */ instance->stats.continuous_gather++; } if (instance->stats.continuous_gather > MAX_NO_CONT_GATHER) { log_printf (instance->totemsrp_log_level_warning, "Totem is unable to form a cluster because of an " "operating system or network fault. The most common " "cause of this message is that the local firewall is " "configured improperly.\n"); } return; } static void timer_function_token_retransmit_timeout (void *data); static void target_set_completed ( void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; memb_state_commit_token_send (instance); } static void memb_state_commit_enter ( struct totemsrp_instance *instance) { old_ring_state_save (instance); memb_state_commit_token_update (instance); memb_state_commit_token_target_set (instance); qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); instance->memb_timer_state_gather_join_timeout = 0; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); instance->memb_timer_state_gather_consensus_timeout = 0; memb_ring_id_set_and_store (instance, &instance->commit_token->ring_id); instance->token_ring_id_seq = instance->my_ring_id.seq; log_printf (instance->totemsrp_log_level_debug, "entering COMMIT state.\n"); instance->memb_state = MEMB_STATE_COMMIT; reset_token_retransmit_timeout (instance); // REVIEWED reset_token_timeout (instance); // REVIEWED instance->stats.commit_entered++; instance->stats.continuous_gather = 0; /* * reset all flow control variables since we are starting a new ring */ instance->my_trc = 0; instance->my_pbl = 0; instance->my_cbl = 0; /* * commit token sent after callback that token target has been set */ } static void memb_state_recovery_enter ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { int i; int local_received_flg = 1; unsigned int low_ring_aru; unsigned int range = 0; unsigned int messages_originated = 0; const struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; struct memb_ring_id my_new_memb_ring_id_list[PROCESSOR_COUNT_MAX]; addr = (const struct srp_addr *)commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries); log_printf (instance->totemsrp_log_level_debug, "entering RECOVERY state.\n"); instance->orf_token_discard = 0; instance->my_high_ring_delivered = 0; sq_reinit (&instance->recovery_sort_queue, SEQNO_START_MSG); cs_queue_reinit (&instance->retrans_message_queue); low_ring_aru = instance->old_ring_state_high_seq_received; memb_state_commit_token_send_recovery (instance, commit_token); instance->my_token_seq = SEQNO_START_TOKEN - 1; /* * Build regular configuration */ totemrrp_processor_count_set ( instance->totemrrp_context, commit_token->addr_entries); /* * Build transitional configuration */ for (i = 0; i < instance->my_new_memb_entries; i++) { memcpy (&my_new_memb_ring_id_list[i], &memb_list[i].ring_id, sizeof (struct memb_ring_id)); } memb_set_and_with_ring_id ( instance->my_new_memb_list, my_new_memb_ring_id_list, instance->my_new_memb_entries, instance->my_memb_list, instance->my_memb_entries, &instance->my_old_ring_id, instance->my_trans_memb_list, &instance->my_trans_memb_entries); for (i = 0; i < instance->my_trans_memb_entries; i++) { log_printf (instance->totemsrp_log_level_debug, "TRANS [%d] member %s:\n", i, totemip_print (&instance->my_trans_memb_list[i].addr[0])); } for (i = 0; i < instance->my_new_memb_entries; i++) { log_printf (instance->totemsrp_log_level_debug, "position [%d] member %s:\n", i, totemip_print (&addr[i].addr[0])); log_printf (instance->totemsrp_log_level_debug, "previous ring seq %llx rep %s\n", memb_list[i].ring_id.seq, totemip_print (&memb_list[i].ring_id.rep)); log_printf (instance->totemsrp_log_level_debug, "aru %x high delivered %x received flag %d\n", memb_list[i].aru, memb_list[i].high_delivered, memb_list[i].received_flg); // assert (totemip_print (&memb_list[i].ring_id.rep) != 0); } /* * Determine if any received flag is false */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_trans_memb_list, instance->my_trans_memb_entries) && memb_list[i].received_flg == 0) { instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct srp_addr) * instance->my_trans_memb_entries); local_received_flg = 0; break; } } if (local_received_flg == 1) { goto no_originate; } /* Else originate messages if we should */ /* * Calculate my_low_ring_aru, instance->my_high_ring_delivered for the transitional membership */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) && memcmp (&instance->my_old_ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (memb_list[i].aru, low_ring_aru)) { low_ring_aru = memb_list[i].aru; } if (sq_lt_compare (instance->my_high_ring_delivered, memb_list[i].high_delivered)) { instance->my_high_ring_delivered = memb_list[i].high_delivered; } } } /* * Copy all old ring messages to instance->retrans_message_queue */ range = instance->old_ring_state_high_seq_received - low_ring_aru; if (range == 0) { /* * No messages to copy */ goto no_originate; } assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); log_printf (instance->totemsrp_log_level_debug, "copying all old ring messages from %x-%x.\n", low_ring_aru + 1, instance->old_ring_state_high_seq_received); for (i = 1; i <= range; i++) { struct sort_queue_item *sort_queue_item; struct message_item message_item; void *ptr; int res; res = sq_item_get (&instance->regular_sort_queue, low_ring_aru + i, &ptr); if (res != 0) { continue; } sort_queue_item = ptr; messages_originated++; memset (&message_item, 0, sizeof (struct message_item)); // TODO LEAK message_item.mcast = totemsrp_buffer_alloc (instance); assert (message_item.mcast); message_item.mcast->header.type = MESSAGE_TYPE_MCAST; srp_addr_copy (&message_item.mcast->system_from, &instance->my_id); message_item.mcast->header.encapsulated = MESSAGE_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.addr[0].nodeid; assert (message_item.mcast->header.nodeid); message_item.mcast->header.endian_detector = ENDIAN_LOCAL; memcpy (&message_item.mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); message_item.msg_len = sort_queue_item->msg_len + sizeof (struct mcast); memcpy (((char *)message_item.mcast) + sizeof (struct mcast), sort_queue_item->mcast, sort_queue_item->msg_len); cs_queue_item_add (&instance->retrans_message_queue, &message_item); } log_printf (instance->totemsrp_log_level_debug, "Originated %d messages in RECOVERY.\n", messages_originated); goto originated; no_originate: log_printf (instance->totemsrp_log_level_debug, "Did not need to originate any messages in recovery.\n"); originated: instance->my_aru = SEQNO_START_MSG; instance->my_aru_count = 0; instance->my_seq_unchanged = 0; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_install_seq = SEQNO_START_MSG; instance->last_released = SEQNO_START_MSG; reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED instance->memb_state = MEMB_STATE_RECOVERY; instance->stats.recovery_entered++; instance->stats.continuous_gather = 0; return; } void totemsrp_event_signal (void *srp_context, enum totem_event_type type, int value) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; token_hold_cancel_send (instance); return; } int totemsrp_mcast ( void *srp_context, struct iovec *iovec, unsigned int iov_len, int guarantee) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int i; struct message_item message_item; char *addr; unsigned int addr_idx; if (cs_queue_is_full (&instance->new_message_queue)) { log_printf (instance->totemsrp_log_level_debug, "queue full\n"); return (-1); } memset (&message_item, 0, sizeof (struct message_item)); /* * Allocate pending item */ message_item.mcast = totemsrp_buffer_alloc (instance); if (message_item.mcast == 0) { goto error_mcast; } /* * Set mcast header */ memset(message_item.mcast, 0, sizeof (struct mcast)); message_item.mcast->header.type = MESSAGE_TYPE_MCAST; message_item.mcast->header.endian_detector = ENDIAN_LOCAL; message_item.mcast->header.encapsulated = MESSAGE_NOT_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.addr[0].nodeid; assert (message_item.mcast->header.nodeid); message_item.mcast->guarantee = guarantee; srp_addr_copy (&message_item.mcast->system_from, &instance->my_id); addr = (char *)message_item.mcast; addr_idx = sizeof (struct mcast); for (i = 0; i < iov_len; i++) { memcpy (&addr[addr_idx], iovec[i].iov_base, iovec[i].iov_len); addr_idx += iovec[i].iov_len; } message_item.msg_len = addr_idx; log_printf (instance->totemsrp_log_level_debug, "mcasted message added to pending queue\n"); instance->stats.mcast_tx++; cs_queue_item_add (&instance->new_message_queue, &message_item); return (0); error_mcast: return (-1); } /* * Determine if there is room to queue a new message */ int totemsrp_avail (void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int avail; cs_queue_avail (&instance->new_message_queue, &avail); return (avail); } /* * ORF Token Management */ /* * Recast message to mcast group if it is available */ static int orf_token_remcast ( struct totemsrp_instance *instance, int seq) { struct sort_queue_item *sort_queue_item; int res; void *ptr; struct sq *sort_queue; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } res = sq_in_range (sort_queue, seq); if (res == 0) { log_printf (instance->totemsrp_log_level_debug, "sq not in range\n"); return (-1); } /* * Get RTR item at seq, if not available, return */ res = sq_item_get (sort_queue, seq, &ptr); if (res != 0) { return -1; } sort_queue_item = ptr; totemrrp_mcast_noflush_send ( instance->totemrrp_context, sort_queue_item->mcast, sort_queue_item->msg_len); return (0); } /* * Free all freeable messages from ring */ static void messages_free ( struct totemsrp_instance *instance, unsigned int token_aru) { struct sort_queue_item *regular_message; unsigned int i; int res; int log_release = 0; unsigned int release_to; unsigned int range = 0; release_to = token_aru; if (sq_lt_compare (instance->my_last_aru, release_to)) { release_to = instance->my_last_aru; } if (sq_lt_compare (instance->my_high_delivered, release_to)) { release_to = instance->my_high_delivered; } /* * Ensure we dont try release before an already released point */ if (sq_lt_compare (release_to, instance->last_released)) { return; } range = release_to - instance->last_released; assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); /* * Release retransmit list items if group aru indicates they are transmitted */ for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, instance->last_released + i, &ptr); if (res == 0) { regular_message = ptr; totemsrp_buffer_release (instance, regular_message->mcast); } sq_items_release (&instance->regular_sort_queue, instance->last_released + i); log_release = 1; } instance->last_released += range; if (log_release) { log_printf (instance->totemsrp_log_level_debug, "releasing messages up to and including %x\n", release_to); } } static void update_aru ( struct totemsrp_instance *instance) { unsigned int i; int res; struct sq *sort_queue; unsigned int range; unsigned int my_aru_saved = 0; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } range = instance->my_high_seq_received - instance->my_aru; if (range > 1024) { return; } my_aru_saved = instance->my_aru; for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (sort_queue, my_aru_saved + i, &ptr); /* * If hole, stop updating aru */ if (res != 0) { break; } } instance->my_aru += i - 1; } /* * Multicasts pending messages onto the ring (requires orf_token possession) */ static int orf_token_mcast ( struct totemsrp_instance *instance, struct orf_token *token, int fcc_mcasts_allowed) { struct message_item *message_item = 0; struct cs_queue *mcast_queue; struct sq *sort_queue; struct sort_queue_item sort_queue_item; struct mcast *mcast; unsigned int fcc_mcast_current; if (instance->memb_state == MEMB_STATE_RECOVERY) { mcast_queue = &instance->retrans_message_queue; sort_queue = &instance->recovery_sort_queue; reset_token_retransmit_timeout (instance); // REVIEWED } else { mcast_queue = &instance->new_message_queue; sort_queue = &instance->regular_sort_queue; } for (fcc_mcast_current = 0; fcc_mcast_current < fcc_mcasts_allowed; fcc_mcast_current++) { if (cs_queue_is_empty (mcast_queue)) { break; } message_item = (struct message_item *)cs_queue_item_get (mcast_queue); message_item->mcast->seq = ++token->seq; message_item->mcast->this_seqno = instance->global_seqno++; /* * Build IO vector */ memset (&sort_queue_item, 0, sizeof (struct sort_queue_item)); sort_queue_item.mcast = message_item->mcast; sort_queue_item.msg_len = message_item->msg_len; mcast = sort_queue_item.mcast; memcpy (&mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); /* * Add message to retransmit queue */ sq_item_add (sort_queue, &sort_queue_item, message_item->mcast->seq); totemrrp_mcast_noflush_send ( instance->totemrrp_context, message_item->mcast, message_item->msg_len); /* * Delete item from pending queue */ cs_queue_item_remove (mcast_queue); /* * If messages mcasted, deliver any new messages to totempg */ instance->my_high_seq_received = token->seq; } update_aru (instance); /* * Return 1 if more messages are available for single node clusters */ return (fcc_mcast_current); } /* * Remulticasts messages in orf_token's retransmit list (requires orf_token) * Modify's orf_token's rtr to include retransmits required by this process */ static int orf_token_rtr ( struct totemsrp_instance *instance, struct orf_token *orf_token, unsigned int *fcc_allowed) { unsigned int res; unsigned int i, j; unsigned int found; struct sq *sort_queue; struct rtr_item *rtr_list; unsigned int range = 0; char retransmit_msg[1024]; char value[64]; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } rtr_list = &orf_token->rtr_list[0]; strcpy (retransmit_msg, "Retransmit List: "); if (orf_token->rtr_list_entries) { log_printf (instance->totemsrp_log_level_debug, "Retransmit List %d\n", orf_token->rtr_list_entries); for (i = 0; i < orf_token->rtr_list_entries; i++) { sprintf (value, "%x ", rtr_list[i].seq); strcat (retransmit_msg, value); } strcat (retransmit_msg, "\n"); log_printf (instance->totemsrp_log_level_notice, "%s", retransmit_msg); } /* * Retransmit messages on orf_token's RTR list from RTR queue */ for (instance->fcc_remcast_current = 0, i = 0; instance->fcc_remcast_current < *fcc_allowed && i < orf_token->rtr_list_entries;) { /* * If this retransmit request isn't from this configuration, * try next rtr entry */ if (memcmp (&rtr_list[i].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { i += 1; continue; } res = orf_token_remcast (instance, rtr_list[i].seq); if (res == 0) { /* * Multicasted message, so no need to copy to new retransmit list */ orf_token->rtr_list_entries -= 1; assert (orf_token->rtr_list_entries >= 0); memmove (&rtr_list[i], &rtr_list[i + 1], sizeof (struct rtr_item) * (orf_token->rtr_list_entries - i)); instance->stats.mcast_retx++; instance->fcc_remcast_current++; } else { i += 1; } } *fcc_allowed = *fcc_allowed - instance->fcc_remcast_current; /* * Add messages to retransmit to RTR list * but only retry if there is room in the retransmit list */ range = orf_token->seq - instance->my_aru; assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); for (i = 1; (orf_token->rtr_list_entries < RETRANSMIT_ENTRIES_MAX) && (i <= range); i++) { /* * Ensure message is within the sort queue range */ res = sq_in_range (sort_queue, instance->my_aru + i); if (res == 0) { break; } /* * Find if a message is missing from this processor */ res = sq_item_inuse (sort_queue, instance->my_aru + i); if (res == 0) { /* * Determine how many times we have missed receiving * this sequence number. sq_item_miss_count increments * a counter for the sequence number. The miss count * will be returned and compared. This allows time for * delayed multicast messages to be received before * declaring the message is missing and requesting a * retransmit. */ res = sq_item_miss_count (sort_queue, instance->my_aru + i); if (res < instance->totem_config->miss_count_const) { continue; } /* * Determine if missing message is already in retransmit list */ found = 0; for (j = 0; j < orf_token->rtr_list_entries; j++) { if (instance->my_aru + i == rtr_list[j].seq) { found = 1; } } if (found == 0) { /* * Missing message not found in current retransmit list so add it */ memcpy (&rtr_list[orf_token->rtr_list_entries].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); rtr_list[orf_token->rtr_list_entries].seq = instance->my_aru + i; orf_token->rtr_list_entries++; } } } return (instance->fcc_remcast_current); } static void token_retransmit (struct totemsrp_instance *instance) { totemrrp_token_send (instance->totemrrp_context, instance->orf_token_retransmit, instance->orf_token_retransmit_size); } /* * Retransmit the regular token if no mcast or token has * been received in retransmit token period retransmit * the token to the next processor */ static void timer_function_token_retransmit_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); reset_token_retransmit_timeout (instance); // REVIEWED break; } } static void timer_function_token_hold_retransmit_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: break; case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); break; } } static void timer_function_merge_detect_timeout(void *data) { struct totemsrp_instance *instance = data; instance->my_merge_detect_timeout_outstanding = 0; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) { memb_merge_detect_transmit (instance); } break; case MEMB_STATE_GATHER: case MEMB_STATE_COMMIT: case MEMB_STATE_RECOVERY: break; } } /* * Send orf_token to next member (requires orf_token) */ static int token_send ( struct totemsrp_instance *instance, struct orf_token *orf_token, int forward_token) { int res = 0; unsigned int orf_token_size; orf_token_size = sizeof (struct orf_token) + (orf_token->rtr_list_entries * sizeof (struct rtr_item)); memcpy (instance->orf_token_retransmit, orf_token, orf_token_size); instance->orf_token_retransmit_size = orf_token_size; orf_token->header.nodeid = instance->my_id.addr[0].nodeid; assert (orf_token->header.nodeid); if (forward_token == 0) { return (0); } totemrrp_token_send (instance->totemrrp_context, orf_token, orf_token_size); return (res); } static int token_hold_cancel_send (struct totemsrp_instance *instance) { struct token_hold_cancel token_hold_cancel; /* * Only cancel if the token is currently held */ if (instance->my_token_held == 0) { return (0); } instance->my_token_held = 0; /* * Build message */ token_hold_cancel.header.type = MESSAGE_TYPE_TOKEN_HOLD_CANCEL; token_hold_cancel.header.endian_detector = ENDIAN_LOCAL; token_hold_cancel.header.encapsulated = 0; token_hold_cancel.header.nodeid = instance->my_id.addr[0].nodeid; memcpy (&token_hold_cancel.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); assert (token_hold_cancel.header.nodeid); instance->stats.token_hold_cancel_tx++; totemrrp_mcast_flush_send (instance->totemrrp_context, &token_hold_cancel, sizeof (struct token_hold_cancel)); return (0); } static int orf_token_send_initial (struct totemsrp_instance *instance) { struct orf_token orf_token; int res; orf_token.header.type = MESSAGE_TYPE_ORF_TOKEN; orf_token.header.endian_detector = ENDIAN_LOCAL; orf_token.header.encapsulated = 0; orf_token.header.nodeid = instance->my_id.addr[0].nodeid; assert (orf_token.header.nodeid); orf_token.seq = SEQNO_START_MSG; orf_token.token_seq = SEQNO_START_TOKEN; orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; instance->stats.orf_token_tx++; if (cs_queue_is_empty (&instance->retrans_message_queue) == 1) { orf_token.retrans_flg = 0; instance->my_set_retrans_flg = 0; } else { orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; } orf_token.aru = 0; orf_token.aru = SEQNO_START_MSG - 1; orf_token.aru_addr = instance->my_id.addr[0].nodeid; memcpy (&orf_token.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); orf_token.fcc = 0; orf_token.backlog = 0; orf_token.rtr_list_entries = 0; res = token_send (instance, &orf_token, 1); return (res); } static void memb_state_commit_token_update ( struct totemsrp_instance *instance) { struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; unsigned int high_aru; unsigned int i; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries); memcpy (instance->my_new_memb_list, addr, sizeof (struct srp_addr) * instance->commit_token->addr_entries); instance->my_new_memb_entries = instance->commit_token->addr_entries; memcpy (&memb_list[instance->commit_token->memb_index].ring_id, &instance->my_old_ring_id, sizeof (struct memb_ring_id)); memb_list[instance->commit_token->memb_index].aru = instance->old_ring_state_aru; /* * TODO high delivered is really instance->my_aru, but with safe this * could change? */ instance->my_received_flg = (instance->my_aru == instance->my_high_seq_received); memb_list[instance->commit_token->memb_index].received_flg = instance->my_received_flg; memb_list[instance->commit_token->memb_index].high_delivered = instance->my_high_delivered; /* * find high aru up to current memb_index for all matching ring ids * if any ring id matching memb_index has aru less then high aru set * received flag for that entry to false */ high_aru = memb_list[instance->commit_token->memb_index].aru; for (i = 0; i <= instance->commit_token->memb_index; i++) { if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (high_aru, memb_list[i].aru)) { high_aru = memb_list[i].aru; } } } for (i = 0; i <= instance->commit_token->memb_index; i++) { if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (memb_list[i].aru, high_aru)) { memb_list[i].received_flg = 0; if (i == instance->commit_token->memb_index) { instance->my_received_flg = 0; } } } } instance->commit_token->header.nodeid = instance->my_id.addr[0].nodeid; instance->commit_token->memb_index += 1; assert (instance->commit_token->memb_index <= instance->commit_token->addr_entries); assert (instance->commit_token->header.nodeid); } static void memb_state_commit_token_target_set ( struct totemsrp_instance *instance) { struct srp_addr *addr; unsigned int i; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; for (i = 0; i < instance->totem_config->interface_count; i++) { totemrrp_token_target_set ( instance->totemrrp_context, &addr[instance->commit_token->memb_index % instance->commit_token->addr_entries].addr[i], i); } } static int memb_state_commit_token_send_recovery ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { unsigned int commit_token_size; commit_token->token_seq++; commit_token_size = sizeof (struct memb_commit_token) + ((sizeof (struct srp_addr) + sizeof (struct memb_commit_token_memb_entry)) * commit_token->addr_entries); /* * Make a copy for retransmission if necessary */ memcpy (instance->orf_token_retransmit, commit_token, commit_token_size); instance->orf_token_retransmit_size = commit_token_size; instance->stats.memb_commit_token_tx++; totemrrp_token_send (instance->totemrrp_context, commit_token, commit_token_size); /* * Request retransmission of the commit token in case it is lost */ reset_token_retransmit_timeout (instance); return (0); } static int memb_state_commit_token_send ( struct totemsrp_instance *instance) { unsigned int commit_token_size; instance->commit_token->token_seq++; commit_token_size = sizeof (struct memb_commit_token) + ((sizeof (struct srp_addr) + sizeof (struct memb_commit_token_memb_entry)) * instance->commit_token->addr_entries); /* * Make a copy for retransmission if necessary */ memcpy (instance->orf_token_retransmit, instance->commit_token, commit_token_size); instance->orf_token_retransmit_size = commit_token_size; instance->stats.memb_commit_token_tx++; totemrrp_token_send (instance->totemrrp_context, instance->commit_token, commit_token_size); /* * Request retransmission of the commit token in case it is lost */ reset_token_retransmit_timeout (instance); return (0); } static int memb_lowest_in_config (struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; int token_memb_entries = 0; int i; struct totem_ip_address *lowest_addr; memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); /* * find representative by searching for smallest identifier */ lowest_addr = &token_memb[0].addr[0]; for (i = 1; i < token_memb_entries; i++) { if (totemip_compare(lowest_addr, &token_memb[i].addr[0]) > 0) { totemip_copy (lowest_addr, &token_memb[i].addr[0]); } } return (totemip_compare (lowest_addr, &instance->my_id.addr[0]) == 0); } static int srp_addr_compare (const void *a, const void *b) { const struct srp_addr *srp_a = (const struct srp_addr *)a; const struct srp_addr *srp_b = (const struct srp_addr *)b; return (totemip_compare (&srp_a->addr[0], &srp_b->addr[0])); } static void memb_state_commit_token_create ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; int token_memb_entries = 0; log_printf (instance->totemsrp_log_level_debug, "Creating commit token because I am the rep.\n"); memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); memset (instance->commit_token, 0, sizeof (struct memb_commit_token)); instance->commit_token->header.type = MESSAGE_TYPE_MEMB_COMMIT_TOKEN; instance->commit_token->header.endian_detector = ENDIAN_LOCAL; instance->commit_token->header.encapsulated = 0; instance->commit_token->header.nodeid = instance->my_id.addr[0].nodeid; assert (instance->commit_token->header.nodeid); totemip_copy(&instance->commit_token->ring_id.rep, &instance->my_id.addr[0]); instance->commit_token->ring_id.seq = instance->token_ring_id_seq + 4; /* * This qsort is necessary to ensure the commit token traverses * the ring in the proper order */ qsort (token_memb, token_memb_entries, sizeof (struct srp_addr), srp_addr_compare); instance->commit_token->memb_index = 0; instance->commit_token->addr_entries = token_memb_entries; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries); memcpy (addr, token_memb, token_memb_entries * sizeof (struct srp_addr)); memset (memb_list, 0, sizeof (struct memb_commit_token_memb_entry) * token_memb_entries); } static void memb_join_message_send (struct totemsrp_instance *instance) { char memb_join_data[40000]; struct memb_join *memb_join = (struct memb_join *)memb_join_data; char *addr; unsigned int addr_idx; memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join->header.endian_detector = ENDIAN_LOCAL; memb_join->header.encapsulated = 0; memb_join->header.nodeid = instance->my_id.addr[0].nodeid; assert (memb_join->header.nodeid); memb_join->ring_seq = instance->my_ring_id.seq; memb_join->proc_list_entries = instance->my_proc_list_entries; memb_join->failed_list_entries = instance->my_failed_list_entries; srp_addr_copy (&memb_join->system_from, &instance->my_id); /* * This mess adds the joined and failed processor lists into the join * message */ addr = (char *)memb_join; addr_idx = sizeof (struct memb_join); memcpy (&addr[addr_idx], instance->my_proc_list, instance->my_proc_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_proc_list_entries * sizeof (struct srp_addr); memcpy (&addr[addr_idx], instance->my_failed_list, instance->my_failed_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_failed_list_entries * sizeof (struct srp_addr); if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } instance->stats.memb_join_tx++; totemrrp_mcast_flush_send ( instance->totemrrp_context, memb_join, addr_idx); } static void memb_leave_message_send (struct totemsrp_instance *instance) { char memb_join_data[40000]; struct memb_join *memb_join = (struct memb_join *)memb_join_data; char *addr; unsigned int addr_idx; int active_memb_entries; struct srp_addr active_memb[PROCESSOR_COUNT_MAX]; log_printf (instance->totemsrp_log_level_debug, "sending join/leave message\n"); /* * add us to the failed list, and remove us from * the members list */ memb_set_merge( &instance->my_id, 1, instance->my_failed_list, &instance->my_failed_list_entries); memb_set_subtract (active_memb, &active_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, &instance->my_id, 1); memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join->header.endian_detector = ENDIAN_LOCAL; memb_join->header.encapsulated = 0; memb_join->header.nodeid = LEAVE_DUMMY_NODEID; memb_join->ring_seq = instance->my_ring_id.seq; memb_join->proc_list_entries = active_memb_entries; memb_join->failed_list_entries = instance->my_failed_list_entries; srp_addr_copy (&memb_join->system_from, &instance->my_id); memb_join->system_from.addr[0].nodeid = LEAVE_DUMMY_NODEID; // TODO: CC Maybe use the actual join send routine. /* * This mess adds the joined and failed processor lists into the join * message */ addr = (char *)memb_join; addr_idx = sizeof (struct memb_join); memcpy (&addr[addr_idx], active_memb, active_memb_entries * sizeof (struct srp_addr)); addr_idx += active_memb_entries * sizeof (struct srp_addr); memcpy (&addr[addr_idx], instance->my_failed_list, instance->my_failed_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_failed_list_entries * sizeof (struct srp_addr); if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } instance->stats.memb_join_tx++; totemrrp_mcast_flush_send ( instance->totemrrp_context, memb_join, addr_idx); } static void memb_merge_detect_transmit (struct totemsrp_instance *instance) { struct memb_merge_detect memb_merge_detect; memb_merge_detect.header.type = MESSAGE_TYPE_MEMB_MERGE_DETECT; memb_merge_detect.header.endian_detector = ENDIAN_LOCAL; memb_merge_detect.header.encapsulated = 0; memb_merge_detect.header.nodeid = instance->my_id.addr[0].nodeid; srp_addr_copy (&memb_merge_detect.system_from, &instance->my_id); memcpy (&memb_merge_detect.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); assert (memb_merge_detect.header.nodeid); instance->stats.memb_merge_detect_tx++; totemrrp_mcast_flush_send (instance->totemrrp_context, &memb_merge_detect, sizeof (struct memb_merge_detect)); } static void memb_ring_id_create_or_load ( struct totemsrp_instance *instance, struct memb_ring_id *memb_ring_id) { int fd; int res = 0; char filename[PATH_MAX]; snprintf (filename, sizeof(filename), "%s/ringid_%s", rundir, totemip_print (&instance->my_id.addr[0])); fd = open (filename, O_RDONLY, 0700); /* * If file can be opened and read, read the ring id */ if (fd != -1) { res = read (fd, &memb_ring_id->seq, sizeof (uint64_t)); close (fd); } /* * If file could not be opened or read, create a new ring id */ if ((fd == -1) || (res != sizeof (uint64_t))) { memb_ring_id->seq = 0; umask(0); fd = open (filename, O_CREAT|O_RDWR, 0700); if (fd != -1) { res = write (fd, &memb_ring_id->seq, sizeof (uint64_t)); close (fd); if (res == -1) { LOGSYS_PERROR (errno, instance->totemsrp_log_level_warning, "Couldn't write ringid file '%s'", filename); } } else { LOGSYS_PERROR (errno, instance->totemsrp_log_level_warning, "Couldn't create ringid file '%s'", filename); } } totemip_copy(&memb_ring_id->rep, &instance->my_id.addr[0]); assert (!totemip_zero_check(&memb_ring_id->rep)); instance->token_ring_id_seq = memb_ring_id->seq; } static void memb_ring_id_set_and_store ( struct totemsrp_instance *instance, const struct memb_ring_id *ring_id) { char filename[256]; int fd; int res; memcpy (&instance->my_ring_id, ring_id, sizeof (struct memb_ring_id)); snprintf (filename, sizeof(filename), "%s/ringid_%s", rundir, totemip_print (&instance->my_id.addr[0])); fd = open (filename, O_WRONLY, 0777); if (fd == -1) { fd = open (filename, O_CREAT|O_RDWR, 0777); } if (fd == -1) { LOGSYS_PERROR(errno, instance->totemsrp_log_level_warning, "Couldn't store new ring id %llx to stable storage", instance->my_ring_id.seq); assert (0); return; } log_printf (instance->totemsrp_log_level_debug, "Storing new sequence id for ring %llx\n", instance->my_ring_id.seq); //assert (fd > 0); res = write (fd, &instance->my_ring_id.seq, sizeof (unsigned long long)); assert (res == sizeof (unsigned long long)); close (fd); } int totemsrp_callback_token_create ( void *srp_context, void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; struct token_callback_instance *callback_handle; token_hold_cancel_send (instance); callback_handle = malloc (sizeof (struct token_callback_instance)); if (callback_handle == 0) { return (-1); } *handle_out = (void *)callback_handle; list_init (&callback_handle->list); callback_handle->callback_fn = callback_fn; callback_handle->data = (void *) data; callback_handle->callback_type = type; callback_handle->delete = delete; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: list_add (&callback_handle->list, &instance->token_callback_received_listhead); break; case TOTEM_CALLBACK_TOKEN_SENT: list_add (&callback_handle->list, &instance->token_callback_sent_listhead); break; } return (0); } void totemsrp_callback_token_destroy (void *srp_context, void **handle_out) { struct token_callback_instance *h; if (*handle_out) { h = (struct token_callback_instance *)*handle_out; list_del (&h->list); free (h); h = NULL; *handle_out = 0; } } static void token_callbacks_execute ( struct totemsrp_instance *instance, enum totem_callback_token_type type) { struct list_head *list; struct list_head *list_next; struct list_head *callback_listhead = 0; struct token_callback_instance *token_callback_instance; int res; int del; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: callback_listhead = &instance->token_callback_received_listhead; break; case TOTEM_CALLBACK_TOKEN_SENT: callback_listhead = &instance->token_callback_sent_listhead; break; default: assert (0); } for (list = callback_listhead->next; list != callback_listhead; list = list_next) { token_callback_instance = list_entry (list, struct token_callback_instance, list); list_next = list->next; del = token_callback_instance->delete; if (del == 1) { list_del (list); } res = token_callback_instance->callback_fn ( token_callback_instance->callback_type, token_callback_instance->data); /* * This callback failed to execute, try it again on the next token */ if (res == -1 && del == 1) { list_add (list, callback_listhead); } else if (del) { free (token_callback_instance); } } } /* * Flow control functions */ static unsigned int backlog_get (struct totemsrp_instance *instance) { unsigned int backlog = 0; if (instance->memb_state == MEMB_STATE_OPERATIONAL) { backlog = cs_queue_used (&instance->new_message_queue); } else if (instance->memb_state == MEMB_STATE_RECOVERY) { backlog = cs_queue_used (&instance->retrans_message_queue); } instance->stats.token[instance->stats.latest_token].backlog_calc = backlog; return (backlog); } static int fcc_calculate ( struct totemsrp_instance *instance, struct orf_token *token) { unsigned int transmits_allowed; unsigned int backlog_calc; transmits_allowed = instance->totem_config->max_messages; if (transmits_allowed > instance->totem_config->window_size - token->fcc) { transmits_allowed = instance->totem_config->window_size - token->fcc; } instance->my_cbl = backlog_get (instance); /* * Only do backlog calculation if there is a backlog otherwise * we would result in div by zero */ if (token->backlog + instance->my_cbl - instance->my_pbl) { backlog_calc = (instance->totem_config->window_size * instance->my_pbl) / (token->backlog + instance->my_cbl - instance->my_pbl); if (backlog_calc > 0 && transmits_allowed > backlog_calc) { transmits_allowed = backlog_calc; } } return (transmits_allowed); } /* * don't overflow the RTR sort queue */ static void fcc_rtr_limit ( struct totemsrp_instance *instance, struct orf_token *token, unsigned int *transmits_allowed) { int check = QUEUE_RTR_ITEMS_SIZE_MAX; check -= (*transmits_allowed + instance->totem_config->window_size); assert (check >= 0); if (sq_lt_compare (instance->last_released + QUEUE_RTR_ITEMS_SIZE_MAX - *transmits_allowed - instance->totem_config->window_size, token->seq)) { *transmits_allowed = 0; } } static void fcc_token_update ( struct totemsrp_instance *instance, struct orf_token *token, unsigned int msgs_transmitted) { token->fcc += msgs_transmitted - instance->my_trc; token->backlog += instance->my_cbl - instance->my_pbl; instance->my_trc = msgs_transmitted; instance->my_pbl = instance->my_cbl; } /* * Message Handlers */ unsigned long long int tv_old; /* * message handler called when TOKEN message type received */ static int message_handler_orf_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { char token_storage[1500]; char token_convert[1500]; struct orf_token *token = NULL; int forward_token; unsigned int transmits_allowed; unsigned int mcasted_retransmit; unsigned int mcasted_regular; unsigned int last_aru; #ifdef GIVEINFO unsigned long long tv_current; unsigned long long tv_diff; tv_current = qb_util_nano_current_get (); tv_diff = tv_current - tv_old; tv_old = tv_current; log_printf (instance->totemsrp_log_level_debug, "Time since last token %0.4f ms\n", ((float)tv_diff) / 1000000.0); #endif if (instance->orf_token_discard) { return (0); } #ifdef TEST_DROP_ORF_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_ORF_TOKEN_PERCENTAGE) { return (0); } #endif if (endian_conversion_needed) { orf_token_endian_convert ((struct orf_token *)msg, (struct orf_token *)token_convert); msg = (struct orf_token *)token_convert; } /* * Make copy of token and retransmit list in case we have * to flush incoming messages from the kernel queue */ token = (struct orf_token *)token_storage; memcpy (token, msg, sizeof (struct orf_token)); memcpy (&token->rtr_list[0], (char *)msg + sizeof (struct orf_token), sizeof (struct rtr_item) * RETRANSMIT_ENTRIES_MAX); /* * Handle merge detection timeout */ if (token->seq == instance->my_last_seq) { start_merge_detect_timeout (instance); instance->my_seq_unchanged += 1; } else { cancel_merge_detect_timeout (instance); cancel_token_hold_retransmit_timeout (instance); instance->my_seq_unchanged = 0; } instance->my_last_seq = token->seq; #ifdef TEST_RECOVERY_MSG_COUNT if (instance->memb_state == MEMB_STATE_OPERATIONAL && token->seq > TEST_RECOVERY_MSG_COUNT) { return (0); } #endif totemrrp_recv_flush (instance->totemrrp_context); /* * Determine if we should hold (in reality drop) the token */ instance->my_token_held = 0; if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0]) && instance->my_seq_unchanged > instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } else if (!totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0]) && instance->my_seq_unchanged >= instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } /* * Hold onto token when there is no activity on ring and * this processor is the ring rep */ forward_token = 1; if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) { if (instance->my_token_held) { forward_token = 0; } } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_RECEIVED); switch (instance->memb_state) { case MEMB_STATE_COMMIT: /* Discard token */ break; case MEMB_STATE_OPERATIONAL: messages_free (instance, token->aru); /* * Do NOT add break, this case should also execute code in gather case. */ case MEMB_STATE_GATHER: /* * DO NOT add break, we use different free mechanism in recovery state */ case MEMB_STATE_RECOVERY: /* * Discard tokens from another configuration */ if (memcmp (&token->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); /* discard token */ } /* * Discard retransmitted tokens */ if (sq_lte_compare (token->token_seq, instance->my_token_seq)) { return (0); /* discard token */ } last_aru = instance->my_last_aru; instance->my_last_aru = token->aru; transmits_allowed = fcc_calculate (instance, token); mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed); fcc_rtr_limit (instance, token, &transmits_allowed); mcasted_regular = orf_token_mcast (instance, token, transmits_allowed); /* if (mcasted_regular) { printf ("mcasted regular %d\n", mcasted_regular); printf ("token seq %d\n", token->seq); } */ fcc_token_update (instance, token, mcasted_retransmit + mcasted_regular); if (sq_lt_compare (instance->my_aru, token->aru) || instance->my_id.addr[0].nodeid == token->aru_addr || token->aru_addr == 0) { token->aru = instance->my_aru; if (token->aru == token->seq) { token->aru_addr = 0; } else { token->aru_addr = instance->my_id.addr[0].nodeid; } } if (token->aru == last_aru && token->aru_addr != 0) { instance->my_aru_count += 1; } else { instance->my_aru_count = 0; } if (instance->my_aru_count > instance->totem_config->fail_to_recv_const && token->aru_addr == instance->my_id.addr[0].nodeid) { log_printf (instance->totemsrp_log_level_error, "FAILED TO RECEIVE\n"); instance->failed_to_recv = 1; memb_set_merge (&instance->my_id, 1, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, 6); } else { instance->my_token_seq = token->token_seq; token->token_seq += 1; if (instance->memb_state == MEMB_STATE_RECOVERY) { /* * instance->my_aru == instance->my_high_seq_received means this processor * has recovered all messages it can recover * (ie: its retrans queue is empty) */ if (cs_queue_is_empty (&instance->retrans_message_queue) == 0) { if (token->retrans_flg == 0) { token->retrans_flg = 1; instance->my_set_retrans_flg = 1; } } else if (token->retrans_flg == 1 && instance->my_set_retrans_flg) { token->retrans_flg = 0; instance->my_set_retrans_flg = 0; } log_printf (instance->totemsrp_log_level_debug, "token retrans flag is %d my set retrans flag%d retrans queue empty %d count %d, aru %x\n", token->retrans_flg, instance->my_set_retrans_flg, cs_queue_is_empty (&instance->retrans_message_queue), instance->my_retrans_flg_count, token->aru); if (token->retrans_flg == 0) { instance->my_retrans_flg_count += 1; } else { instance->my_retrans_flg_count = 0; } if (instance->my_retrans_flg_count == 2) { instance->my_install_seq = token->seq; } log_printf (instance->totemsrp_log_level_debug, "install seq %x aru %x high seq received %x\n", instance->my_install_seq, instance->my_aru, instance->my_high_seq_received); if (instance->my_retrans_flg_count >= 2 && instance->my_received_flg == 0 && sq_lte_compare (instance->my_install_seq, instance->my_aru)) { instance->my_received_flg = 1; instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct totem_ip_address) * instance->my_trans_memb_entries); } if (instance->my_retrans_flg_count >= 3 && sq_lte_compare (instance->my_install_seq, token->aru)) { instance->my_rotation_counter += 1; } else { instance->my_rotation_counter = 0; } if (instance->my_rotation_counter == 2) { log_printf (instance->totemsrp_log_level_debug, "retrans flag count %x token aru %x install seq %x aru %x %x\n", instance->my_retrans_flg_count, token->aru, instance->my_install_seq, instance->my_aru, token->seq); memb_state_operational_enter (instance); instance->my_rotation_counter = 0; instance->my_retrans_flg_count = 0; } } totemrrp_send_flush (instance->totemrrp_context); token_send (instance, token, forward_token); #ifdef GIVEINFO tv_current = qb_util_nano_current_get (); tv_diff = tv_current - tv_old; tv_old = tv_current; log_printf (instance->totemsrp_log_level_debug, "I held %0.4f ms\n", ((float)tv_diff) / 1000000.0); #endif if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* * Deliver messages after token has been transmitted * to improve performance */ reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED if (totemip_equal(&instance->my_id.addr[0], &instance->my_ring_id.rep) && instance->my_token_held == 1) { start_token_hold_retransmit_timeout (instance); } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_SENT); } break; } if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); } static void messages_deliver_to_app ( struct totemsrp_instance *instance, int skip, unsigned int end_point) { struct sort_queue_item *sort_queue_item_p; unsigned int i; int res; struct mcast *mcast_in; struct mcast mcast_header; unsigned int range = 0; int endian_conversion_required; unsigned int my_high_delivered_stored = 0; range = end_point - instance->my_high_delivered; if (range) { log_printf (instance->totemsrp_log_level_debug, "Delivering %x to %x\n", instance->my_high_delivered, end_point); } assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); my_high_delivered_stored = instance->my_high_delivered; /* * Deliver messages in order from rtr queue to pending delivery queue */ for (i = 1; i <= range; i++) { void *ptr = 0; /* * If out of range of sort queue, stop assembly */ res = sq_in_range (&instance->regular_sort_queue, my_high_delivered_stored + i); if (res == 0) { break; } res = sq_item_get (&instance->regular_sort_queue, my_high_delivered_stored + i, &ptr); /* * If hole, stop assembly */ if (res != 0 && skip == 0) { break; } instance->my_high_delivered = my_high_delivered_stored + i; if (res != 0) { continue; } sort_queue_item_p = ptr; mcast_in = sort_queue_item_p->mcast; assert (mcast_in != (struct mcast *)0xdeadbeef); endian_conversion_required = 0; if (mcast_in->header.endian_detector != ENDIAN_LOCAL) { endian_conversion_required = 1; mcast_endian_convert (mcast_in, &mcast_header); } else { memcpy (&mcast_header, mcast_in, sizeof (struct mcast)); } /* * Skip messages not originated in instance->my_deliver_memb */ if (skip && memb_set_subset (&mcast_header.system_from, 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) == 0) { instance->my_high_delivered = my_high_delivered_stored + i; continue; } /* * Message found */ log_printf (instance->totemsrp_log_level_debug, "Delivering MCAST message with seq %x to pending delivery queue\n", mcast_header.seq); /* * Message is locally originated multicast */ instance->totemsrp_deliver_fn ( mcast_header.header.nodeid, ((char *)sort_queue_item_p->mcast) + sizeof (struct mcast), sort_queue_item_p->msg_len - sizeof (struct mcast), endian_conversion_required); } } /* * recv message handler called when MCAST message type received */ static int message_handler_mcast ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct sort_queue_item sort_queue_item; struct sq *sort_queue; struct mcast mcast_header; if (endian_conversion_needed) { mcast_endian_convert (msg, &mcast_header); } else { memcpy (&mcast_header, msg, sizeof (struct mcast)); } if (mcast_header.header.encapsulated == MESSAGE_ENCAPSULATED) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } assert (msg_len <= FRAME_SIZE_MAX); #ifdef TEST_DROP_MCAST_PERCENTAGE if (random()%100 < TEST_DROP_MCAST_PERCENTAGE) { return (0); } #endif /* * If the message is foreign execute the switch below */ if (memcmp (&instance->my_ring_id, &mcast_header.ring_id, sizeof (struct memb_ring_id)) != 0) { switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge ( &mcast_header.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, 7); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &mcast_header.system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&mcast_header.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, 8); return (0); } break; case MEMB_STATE_COMMIT: /* discard message */ instance->stats.rx_msg_dropped++; break; case MEMB_STATE_RECOVERY: /* discard message */ instance->stats.rx_msg_dropped++; break; } return (0); } log_printf (instance->totemsrp_log_level_debug, "Received ringid(%s:%lld) seq %x\n", totemip_print (&mcast_header.ring_id.rep), mcast_header.ring_id.seq, mcast_header.seq); /* * Add mcast message to rtr queue if not already in rtr queue * otherwise free io vectors */ if (msg_len > 0 && msg_len <= FRAME_SIZE_MAX && sq_in_range (sort_queue, mcast_header.seq) && sq_item_inuse (sort_queue, mcast_header.seq) == 0) { /* * Allocate new multicast memory block */ // TODO LEAK sort_queue_item.mcast = totemsrp_buffer_alloc (instance); if (sort_queue_item.mcast == NULL) { return (-1); /* error here is corrected by the algorithm */ } memcpy (sort_queue_item.mcast, msg, msg_len); sort_queue_item.msg_len = msg_len; if (sq_lt_compare (instance->my_high_seq_received, mcast_header.seq)) { instance->my_high_seq_received = mcast_header.seq; } sq_item_add (sort_queue, &sort_queue_item, mcast_header.seq); } update_aru (instance); if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* TODO remove from retrans message queue for old ring in recovery state */ return (0); } static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct memb_merge_detect memb_merge_detect; if (endian_conversion_needed) { memb_merge_detect_endian_convert (msg, &memb_merge_detect); } else { memcpy (&memb_merge_detect, msg, sizeof (struct memb_merge_detect)); } /* * do nothing if this is a merge detect from this configuration */ if (memcmp (&instance->my_ring_id, &memb_merge_detect.ring_id, sizeof (struct memb_ring_id)) == 0) { return (0); } /* * Execute merge operation */ switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge (&memb_merge_detect.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, 9); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &memb_merge_detect.system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&memb_merge_detect.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, 10); return (0); } break; case MEMB_STATE_COMMIT: /* do nothing in commit */ break; case MEMB_STATE_RECOVERY: /* do nothing in recovery */ break; } return (0); } static void memb_join_process ( struct totemsrp_instance *instance, const struct memb_join *memb_join) { struct srp_addr *proc_list; struct srp_addr *failed_list; int gather_entered = 0; int fail_minus_memb_entries = 0; struct srp_addr fail_minus_memb[PROCESSOR_COUNT_MAX]; proc_list = (struct srp_addr *)memb_join->end_of_memb_join; failed_list = proc_list + memb_join->proc_list_entries; /* memb_set_print ("proclist", proc_list, memb_join->proc_list_entries); memb_set_print ("faillist", failed_list, memb_join->failed_list_entries); memb_set_print ("my_proclist", instance->my_proc_list, instance->my_proc_list_entries); memb_set_print ("my_faillist", instance->my_failed_list, instance->my_failed_list_entries); -*/ if (memb_set_equal (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_equal (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { memb_consensus_set (instance, &memb_join->system_from); if (memb_consensus_agreed (instance) && instance->failed_to_recv == 1) { instance->failed_to_recv = 0; srp_addr_copy (&instance->my_proc_list[0], &instance->my_id); instance->my_proc_list_entries = 1; instance->my_failed_list_entries = 0; memb_state_commit_token_create (instance); memb_state_commit_enter (instance); return; } if (memb_consensus_agreed (instance) && memb_lowest_in_config (instance)) { memb_state_commit_token_create (instance); memb_state_commit_enter (instance); } else { return; } } else if (memb_set_subset (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_subset (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { return; } else if (memb_set_subset (&memb_join->system_from, 1, instance->my_failed_list, instance->my_failed_list_entries)) { return; } else { memb_set_merge (proc_list, memb_join->proc_list_entries, instance->my_proc_list, &instance->my_proc_list_entries); if (memb_set_subset ( &instance->my_id, 1, failed_list, memb_join->failed_list_entries)) { memb_set_merge ( &memb_join->system_from, 1, instance->my_failed_list, &instance->my_failed_list_entries); } else { if (memb_set_subset ( &memb_join->system_from, 1, instance->my_memb_list, instance->my_memb_entries)) { if (memb_set_subset ( &memb_join->system_from, 1, instance->my_failed_list, instance->my_failed_list_entries) == 0) { memb_set_merge (failed_list, memb_join->failed_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); } else { memb_set_subtract (fail_minus_memb, &fail_minus_memb_entries, failed_list, memb_join->failed_list_entries, instance->my_memb_list, instance->my_memb_entries); memb_set_merge (fail_minus_memb, fail_minus_memb_entries, instance->my_failed_list, &instance->my_failed_list_entries); } } } memb_state_gather_enter (instance, 11); gather_entered = 1; } if (gather_entered == 0 && instance->memb_state == MEMB_STATE_OPERATIONAL) { memb_state_gather_enter (instance, 12); } } static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out) { int i; struct srp_addr *in_proc_list; struct srp_addr *in_failed_list; struct srp_addr *out_proc_list; struct srp_addr *out_failed_list; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); out->proc_list_entries = swab32 (in->proc_list_entries); out->failed_list_entries = swab32 (in->failed_list_entries); out->ring_seq = swab64 (in->ring_seq); in_proc_list = (struct srp_addr *)in->end_of_memb_join; in_failed_list = in_proc_list + out->proc_list_entries; out_proc_list = (struct srp_addr *)out->end_of_memb_join; out_failed_list = out_proc_list + out->proc_list_entries; for (i = 0; i < out->proc_list_entries; i++) { srp_addr_copy_endian_convert (&out_proc_list[i], &in_proc_list[i]); } for (i = 0; i < out->failed_list_entries; i++) { srp_addr_copy_endian_convert (&out_failed_list[i], &in_failed_list[i]); } } static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out) { int i; struct srp_addr *in_addr = (struct srp_addr *)in->end_of_commit_token; struct srp_addr *out_addr = (struct srp_addr *)out->end_of_commit_token; struct memb_commit_token_memb_entry *in_memb_list; struct memb_commit_token_memb_entry *out_memb_list; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->token_seq = swab32 (in->token_seq); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->retrans_flg = swab32 (in->retrans_flg); out->memb_index = swab32 (in->memb_index); out->addr_entries = swab32 (in->addr_entries); in_memb_list = (struct memb_commit_token_memb_entry *)(in_addr + out->addr_entries); out_memb_list = (struct memb_commit_token_memb_entry *)(out_addr + out->addr_entries); for (i = 0; i < out->addr_entries; i++) { srp_addr_copy_endian_convert (&out_addr[i], &in_addr[i]); /* * Only convert the memb entry if it has been set */ if (in_memb_list[i].ring_id.rep.family != 0) { totemip_copy_endian_convert (&out_memb_list[i].ring_id.rep, &in_memb_list[i].ring_id.rep); out_memb_list[i].ring_id.seq = swab64 (in_memb_list[i].ring_id.seq); out_memb_list[i].aru = swab32 (in_memb_list[i].aru); out_memb_list[i].high_delivered = swab32 (in_memb_list[i].high_delivered); out_memb_list[i].received_flg = swab32 (in_memb_list[i].received_flg); } } } static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out) { int i; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->seq = swab32 (in->seq); out->token_seq = swab32 (in->token_seq); out->aru = swab32 (in->aru); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->aru_addr = swab32(in->aru_addr); out->ring_id.seq = swab64 (in->ring_id.seq); out->fcc = swab32 (in->fcc); out->backlog = swab32 (in->backlog); out->retrans_flg = swab32 (in->retrans_flg); out->rtr_list_entries = swab32 (in->rtr_list_entries); for (i = 0; i < out->rtr_list_entries; i++) { totemip_copy_endian_convert(&out->rtr_list[i].ring_id.rep, &in->rtr_list[i].ring_id.rep); out->rtr_list[i].ring_id.seq = swab64 (in->rtr_list[i].ring_id.seq); out->rtr_list[i].seq = swab32 (in->rtr_list[i].seq); } } static void mcast_endian_convert (const struct mcast *in, struct mcast *out) { out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->header.encapsulated = in->header.encapsulated; out->seq = swab32 (in->seq); out->this_seqno = swab32 (in->this_seqno); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->node_id = swab32 (in->node_id); out->guarantee = swab32 (in->guarantee); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); } static void memb_merge_detect_endian_convert ( const struct memb_merge_detect *in, struct memb_merge_detect *out) { out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); } static int message_handler_memb_join ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct memb_join *memb_join; struct memb_join *memb_join_convert = alloca (msg_len); if (endian_conversion_needed) { memb_join = memb_join_convert; memb_join_endian_convert (msg, memb_join_convert); } else { memb_join = msg; } /* * If the process paused because it wasn't scheduled in a timely * fashion, flush the join messages because they may be queued * entries */ if (pause_flush (instance)) { return (0); } if (instance->token_ring_id_seq < memb_join->ring_seq) { instance->token_ring_id_seq = memb_join->ring_seq; } switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_join_process (instance, memb_join); break; case MEMB_STATE_GATHER: memb_join_process (instance, memb_join); break; case MEMB_STATE_COMMIT: if (memb_set_subset (&memb_join->system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_state_gather_enter (instance, 13); } break; case MEMB_STATE_RECOVERY: if (memb_set_subset (&memb_join->system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_recovery_state_token_loss (instance); memb_state_gather_enter (instance, 14); } break; } return (0); } static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct memb_commit_token *memb_commit_token_convert = alloca (msg_len); struct memb_commit_token *memb_commit_token; struct srp_addr sub[PROCESSOR_COUNT_MAX]; int sub_entries; struct srp_addr *addr; log_printf (instance->totemsrp_log_level_debug, "got commit token\n"); if (endian_conversion_needed) { memb_commit_token_endian_convert (msg, memb_commit_token_convert); } else { memcpy (memb_commit_token_convert, msg, msg_len); } memb_commit_token = memb_commit_token_convert; addr = (struct srp_addr *)memb_commit_token->end_of_commit_token; #ifdef TEST_DROP_COMMIT_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_COMMIT_TOKEN_PERCENTAGE) { return (0); } #endif switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: /* discard token */ break; case MEMB_STATE_GATHER: memb_set_subtract (sub, &sub_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); if (memb_set_equal (addr, memb_commit_token->addr_entries, sub, sub_entries) && memb_commit_token->ring_id.seq > instance->my_ring_id.seq) { memcpy (instance->commit_token, memb_commit_token, msg_len); memb_state_commit_enter (instance); } break; case MEMB_STATE_COMMIT: /* * If retransmitted commit tokens are sent on this ring * filter them out and only enter recovery once the * commit token has traversed the array. This is * determined by : * memb_commit_token->memb_index == memb_commit_token->addr_entries) { */ if (memb_commit_token->ring_id.seq == instance->my_ring_id.seq && memb_commit_token->memb_index == memb_commit_token->addr_entries) { memb_state_recovery_enter (instance, memb_commit_token); } break; case MEMB_STATE_RECOVERY: if (totemip_equal (&instance->my_id.addr[0], &instance->my_ring_id.rep)) { log_printf (instance->totemsrp_log_level_debug, "Sending initial ORF token\n"); // TODO convert instead of initiate orf_token_send_initial (instance); reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED } break; } return (0); } static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct token_hold_cancel *token_hold_cancel = msg; if (memcmp (&token_hold_cancel->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) == 0) { instance->my_seq_unchanged = 0; if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) { timer_function_token_retransmit_timeout (instance); } } return (0); } void main_deliver_fn ( void *context, const void *msg, unsigned int msg_len) { struct totemsrp_instance *instance = context; const struct message_header *message_header = msg; if (msg_len < sizeof (struct message_header)) { log_printf (instance->totemsrp_log_level_security, "Received message is too short... ignoring %u.\n", (unsigned int)msg_len); return; } switch (message_header->type) { case MESSAGE_TYPE_ORF_TOKEN: instance->stats.orf_token_rx++; break; case MESSAGE_TYPE_MCAST: instance->stats.mcast_rx++; break; case MESSAGE_TYPE_MEMB_MERGE_DETECT: instance->stats.memb_merge_detect_rx++; break; case MESSAGE_TYPE_MEMB_JOIN: instance->stats.memb_join_rx++; break; case MESSAGE_TYPE_MEMB_COMMIT_TOKEN: instance->stats.memb_commit_token_rx++; break; case MESSAGE_TYPE_TOKEN_HOLD_CANCEL: instance->stats.token_hold_cancel_rx++; break; default: log_printf (instance->totemsrp_log_level_security, "Type of received message is wrong... ignoring %d.\n", (int)message_header->type); printf ("wrong message type\n"); instance->stats.rx_msg_dropped++; return; } /* * Handle incoming message */ totemsrp_message_handlers.handler_functions[(int)message_header->type] ( instance, msg, msg_len, message_header->endian_detector != ENDIAN_LOCAL); } void main_iface_change_fn ( void *context, const struct totem_ip_address *iface_addr, unsigned int iface_no) { struct totemsrp_instance *instance = context; int i; totemip_copy (&instance->my_id.addr[iface_no], iface_addr); assert (instance->my_id.addr[iface_no].nodeid); totemip_copy (&instance->my_memb_list[0].addr[iface_no], iface_addr); if (instance->iface_changes++ == 0) { memb_ring_id_create_or_load (instance, &instance->my_ring_id); log_printf ( instance->totemsrp_log_level_debug, "Created or loaded sequence id %llx.%s for this ring.\n", instance->my_ring_id.seq, totemip_print (&instance->my_ring_id.rep)); if (instance->totemsrp_service_ready_fn) { instance->totemsrp_service_ready_fn (); } } for (i = 0; i < instance->totem_config->interfaces[iface_no].member_count; i++) { totemsrp_member_add (instance, &instance->totem_config->interfaces[iface_no].member_list[i], iface_no); } if (instance->iface_changes >= instance->totem_config->interface_count) { memb_state_gather_enter (instance, 15); } } void totemsrp_net_mtu_adjust (struct totem_config *totem_config) { totem_config->net_mtu -= sizeof (struct mcast); } void totemsrp_service_ready_register ( void *context, void (*totem_service_ready) (void)) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->totemsrp_service_ready_fn = totem_service_ready; } int totemsrp_member_add ( void *context, const struct totem_ip_address *member, int ring_no) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemrrp_member_add (instance->totemrrp_context, member, ring_no); return (res); } int totemsrp_member_remove ( void *context, const struct totem_ip_address *member, int ring_no) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemrrp_member_remove (instance->totemrrp_context, member, ring_no); return (res); } void totemsrp_threaded_mode_enable (void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->threaded_mode_enabled = 1; } diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h index a025eab1..2166143a 100644 --- a/include/corosync/totem/totem.h +++ b/include/corosync/totem/totem.h @@ -1,278 +1,280 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * Author: Steven Dake (sdake@redhat.com) * * All rights reserved. * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEM_H_DEFINED #define TOTEM_H_DEFINED #include "totemip.h" #include #ifdef HAVE_SMALL_MEMORY_FOOTPRINT #define PROCESSOR_COUNT_MAX 16 #define MESSAGE_SIZE_MAX 1024*64 #define MESSAGE_QUEUE_MAX 512 #else #define PROCESSOR_COUNT_MAX 384 #define MESSAGE_SIZE_MAX 1024*1024 /* (1MB) */ #define MESSAGE_QUEUE_MAX MESSAGE_SIZE_MAX / totem_config->net_mtu #endif /* HAVE_SMALL_MEMORY_FOOTPRINT */ #define FRAME_SIZE_MAX 10000 #define TRANSMITS_ALLOWED 16 #define SEND_THREADS_MAX 16 #define INTERFACE_MAX 2 /** * Maximum number of continuous gather states */ #define MAX_NO_CONT_GATHER 3 struct totem_interface { struct totem_ip_address bindnet; struct totem_ip_address boundto; struct totem_ip_address mcast_addr; uint16_t ip_port; uint16_t ttl; int member_count; struct totem_ip_address member_list[PROCESSOR_COUNT_MAX]; }; struct totem_logging_configuration { void (*log_printf) ( int level, int subsys, const char *function_name, const char *file_name, int file_line, const char *format, ...) __attribute__((format(printf, 6, 7))); int log_level_security; int log_level_error; int log_level_warning; int log_level_notice; int log_level_debug; int log_subsys_id; }; enum { TOTEM_PRIVATE_KEY_LEN = 128 }; enum { TOTEM_RRP_MODE_BYTES = 64 }; typedef enum { TOTEM_TRANSPORT_UDP = 0, TOTEM_TRANSPORT_UDPU = 1, TOTEM_TRANSPORT_RDMA = 2 } totem_transport_t; struct totem_config { int version; /* * network */ struct totem_interface *interfaces; unsigned int interface_count; unsigned int node_id; unsigned int clear_node_high_bit; /* * key information */ unsigned char private_key[TOTEM_PRIVATE_KEY_LEN]; unsigned int private_key_len; /* * Totem configuration parameters */ unsigned int token_timeout; unsigned int token_retransmit_timeout; unsigned int token_hold_timeout; unsigned int token_retransmits_before_loss_const; unsigned int join_timeout; unsigned int send_join_timeout; unsigned int consensus_timeout; unsigned int merge_timeout; unsigned int downcheck_timeout; unsigned int fail_to_recv_const; unsigned int seqno_unchanged_const; unsigned int rrp_token_expired_timeout; unsigned int rrp_problem_count_timeout; unsigned int rrp_problem_count_threshold; + unsigned int rrp_problem_count_mcast_threshold; + unsigned int rrp_autorecovery_check_timeout; char rrp_mode[TOTEM_RRP_MODE_BYTES]; struct totem_logging_configuration totem_logging_configuration; unsigned int secauth; unsigned int net_mtu; unsigned int threads; unsigned int heartbeat_failures_allowed; unsigned int max_network_delay; unsigned int window_size; unsigned int max_messages; const char *vsf_type; unsigned int broadcast_use; enum { TOTEM_CRYPTO_SOBER=0, TOTEM_CRYPTO_NSS } crypto_type; enum { TOTEM_CRYPTO_ACCEPT_OLD=0, TOTEM_CRYPTO_ACCEPT_NEW } crypto_accept; int crypto_crypt_type; int crypto_sign_type; totem_transport_t transport_number; unsigned int miss_count_const; }; #define TOTEM_CONFIGURATION_TYPE enum totem_configuration_type { TOTEM_CONFIGURATION_REGULAR, TOTEM_CONFIGURATION_TRANSITIONAL }; #define TOTEM_CALLBACK_TOKEN_TYPE enum totem_callback_token_type { TOTEM_CALLBACK_TOKEN_RECEIVED = 1, TOTEM_CALLBACK_TOKEN_SENT = 2 }; enum totem_event_type { TOTEM_EVENT_DELIVERY_CONGESTED, TOTEM_EVENT_NEW_MSG, }; #define MEMB_RING_ID struct memb_ring_id { struct totem_ip_address rep; unsigned long long seq; } __attribute__((packed)); typedef struct { hdb_handle_t handle; int is_dirty; time_t last_updated; } totem_stats_header_t; typedef struct { totem_stats_header_t hdr; uint32_t iface_changes; } totemnet_stats_t; typedef struct { totem_stats_header_t hdr; totemnet_stats_t *net; char *algo_name; } totemrrp_stats_t; typedef struct { uint32_t rx; uint32_t tx; int backlog_calc; } totemsrp_token_stats_t; typedef struct { totem_stats_header_t hdr; totemrrp_stats_t *rrp; uint64_t orf_token_tx; uint64_t orf_token_rx; uint64_t memb_merge_detect_tx; uint64_t memb_merge_detect_rx; uint64_t memb_join_tx; uint64_t memb_join_rx; uint64_t mcast_tx; uint64_t mcast_retx; uint64_t mcast_rx; uint64_t memb_commit_token_tx; uint64_t memb_commit_token_rx; uint64_t token_hold_cancel_tx; uint64_t token_hold_cancel_rx; uint64_t operational_entered; uint64_t operational_token_lost; uint64_t gather_entered; uint64_t gather_token_lost; uint64_t commit_entered; uint64_t commit_token_lost; uint64_t recovery_entered; uint64_t recovery_token_lost; uint64_t consensus_timeouts; uint64_t rx_msg_dropped; uint32_t continuous_gather; int earliest_token; int latest_token; #define TOTEM_TOKEN_STATS_MAX 100 totemsrp_token_stats_t token[TOTEM_TOKEN_STATS_MAX]; } totemsrp_stats_t; #define TOTEM_CONFIGURATION_TYPE typedef struct { totem_stats_header_t hdr; totemsrp_stats_t *srp; } totemmrp_stats_t; typedef struct { totem_stats_header_t hdr; totemmrp_stats_t *mrp; uint32_t msg_reserved; uint32_t msg_queue_avail; } totempg_stats_t; #endif /* TOTEM_H_DEFINED */ diff --git a/man/corosync.conf.5 b/man/corosync.conf.5 index 3f8e90e5..67b76696 100644 --- a/man/corosync.conf.5 +++ b/man/corosync.conf.5 @@ -1,638 +1,646 @@ .\"/* .\" * Copyright (c) 2005 MontaVista Software, Inc. .\" * Copyright (c) 2006-2010 Red Hat, Inc. .\" * .\" * All rights reserved. .\" * .\" * Author: Steven Dake (sdake@redhat.com) .\" * .\" * This software licensed under BSD license, the text of which follows: .\" * .\" * Redistribution and use in source and binary forms, with or without .\" * modification, are permitted provided that the following conditions are met: .\" * .\" * - Redistributions of source code must retain the above copyright notice, .\" * this list of conditions and the following disclaimer. .\" * - Redistributions in binary form must reproduce the above copyright notice, .\" * this list of conditions and the following disclaimer in the documentation .\" * and/or other materials provided with the distribution. .\" * - Neither the name of the MontaVista Software, Inc. nor the names of its .\" * contributors may be used to endorse or promote products derived from this .\" * software without specific prior written permission. .\" * .\" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" .\" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE .\" * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF .\" * THE POSSIBILITY OF SUCH DAMAGE. .\" */ .TH COROSYNC_CONF 5 2006-03-28 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" .SH NAME corosync.conf - corosync executive configuration file .SH SYNOPSIS /etc/corosync/corosync.conf .SH DESCRIPTION The corosync.conf instructs the corosync executive about various parameters needed to control the corosync executive. Empty lines and lines starting with # character are ignored. The configuration file consists of bracketed top level directives. The possible directive choices are: .TP totem { } This top level directive contains configuration options for the totem protocol. .TP logging { } This top level directive contains configuration options for logging. .TP event { } This top level directive contains configuration options for the event service. .PP .PP It is also possible to specify the top level parameter .B compatibility. This directive indicates the level of compatibility requested by the user. The option whitetank can be specified to remain backward compatable with openais-0.80.z. The option none can be specified to only be compatable with corosync-1.Y.Z. Extra processing during configuration changes is required to remain backward compatable. The default is whitetank. (backwards compatibility) .PP .PP Within the .B totem directive, an interface directive is required. There is also one configuration option which is required: .PP .PP Within the .B interface sub-directive of totem there are four parameters which are required. There is one parameter which is optional. .TP ringnumber This specifies the ring number for the interface. When using the redundant ring protocol, each interface should specify separate ring numbers to uniquely identify to the membership protocol which interface to use for which redundant ring. The ringnumber must start at 0. .TP bindnetaddr This specifies the network address the corosync executive should bind to. For example, if the local interface is 192.168.5.92 with netmask 255.255.255.0, set bindnetaddr to 192.168.5.0. If the local interface is 192.168.5.92 with netmask 255.255.255.192, set bindnetaddr to 192.168.5.64, and so forth. This may also be an IPV6 address, in which case IPV6 networking will be used. In this case, the full address must be specified and there is no automatic selection of the network interface within a specific subnet as with IPv4. If IPv6 networking is used, the nodeid field must be specified. .TP broadcast This is optional and can be set to yes. If it is set to yes, the broadcast address will be used for communication. If this option is set, mcastaddr should not be set. .TP mcastaddr This is the multicast address used by corosync executive. The default should work for most networks, but the network administrator should be queried about a multicast address to use. Avoid 224.x.x.x because this is a "config" multicast address. This may also be an IPV6 multicast address, in which case IPV6 networking will be used. If IPv6 networking is used, the nodeid field must be specified. .TP mcastport This specifies the UDP port number. It is possible to use the same multicast address on a network with the corosync services configured for different UDP ports. Please note corosync uses two UDP ports mcastport (for mcast receives) and mcastport - 1 (for mcast sends). If you have multiple clusters on the same network using the same mcastaddr please configure the mcastports with a gap. .TP ttl This specifies the Time To Live (TTL). If you run your cluster on a routed network then the default of "1" will be too small. This option provides a way to increase this up to 255. The valid range is 0..255. Note that this is only valid on multicast transport types. .TP member This specifies a member on the interface and used with the udpu transport only. Every node that should be a member of the membership should be specified as a separate member directive. Within the member directive there is a parameter memberaddr which specifies the ip address of one of the nodes. .PP .PP Within the .B totem directive, there are seven configuration options of which one is required, five are optional, and one is required when IPV6 is configured in the interface subdirective. The required directive controls the version of the totem configuration. The optional option unless using IPV6 directive controls identification of the processor. The optional options control secrecy and authentication, the redundant ring mode of operation, maximum network MTU, and number of sending threads, and the nodeid field. .TP version This specifies the version of the configuration file. Currently the only valid version for this directive is 2. .PP .PP .TP nodeid This configuration option is optional when using IPv4 and required when using IPv6. This is a 32 bit value specifying the node identifier delivered to the cluster membership service. If this is not specified with IPv4, the node id will be determined from the 32 bit IP address the system to which the system is bound with ring identifier of 0. The node identifier value of zero is reserved and should not be used. .TP clear_node_high_bit This configuration option is optional and is only relevant when no nodeid is specified. Some openais clients require a signed 32 bit nodeid that is greater than zero however by default openais uses all 32 bits of the IPv4 address space when generating a nodeid. Set this option to yes to force the high bit to be zero and therefor ensure the nodeid is a positive signed 32 bit integer. WARNING: The clusters behavior is undefined if this option is enabled on only a subset of the cluster (for example during a rolling upgrade). .TP secauth This specifies that HMAC/SHA1 authentication should be used to authenticate all messages. It further specifies that all data should be encrypted with the sober128 encryption algorithm to protect data from eavesdropping. Enabling this option adds a 36 byte header to every message sent by totem which reduces total throughput. Encryption and authentication consume 75% of CPU cycles in aisexec as measured with gprof when enabled. For 100mbit networks with 1500 MTU frame transmissions: A throughput of 9mb/sec is possible with 100% cpu utilization when this option is enabled on 3ghz cpus. A throughput of 10mb/sec is possible wth 20% cpu utilization when this optin is disabled on 3ghz cpus. For gig-e networks with large frame transmissions: A throughput of 20mb/sec is possible when this option is enabled on 3ghz cpus. A throughput of 60mb/sec is possible when this option is disabled on 3ghz cpus. The default is on. .TP rrp_mode This specifies the mode of redundant ring, which may be none, active, or passive. Active replication offers slightly lower latency from transmit to delivery in faulty network environments but with less performance. Passive replication may nearly double the speed of the totem protocol if the protocol doesn't become cpu bound. The final option is none, in which case only one network interface will be used to operate the totem protocol. If only one interface directive is specified, none is automatically chosen. If multiple interface directives are specified, only active or passive may be chosen. .TP netmtu This specifies the network maximum transmit unit. To set this value beyond 1500, the regular frame MTU, requires ethernet devices that support large, or also called jumbo, frames. If any device in the network doesn't support large frames, the protocol will not operate properly. The hosts must also have their mtu size set from 1500 to whatever frame size is specified here. Please note while some NICs or switches claim large frame support, they support 9000 MTU as the maximum frame size including the IP header. Setting the netmtu and host MTUs to 9000 will cause totem to use the full 9000 bytes of the frame. Then Linux will add a 18 byte header moving the full frame size to 9018. As a result some hardware will not operate properly with this size of data. A netmtu of 8982 seems to work for the few large frame devices that have been tested. Some manufacturers claim large frame support when in fact they support frame sizes of 4500 bytes. Increasing the MTU from 1500 to 8982 doubles throughput performance from 30MB/sec to 60MB/sec as measured with evsbench with 175000 byte messages with the secauth directive set to off. When sending multicast traffic, if the network frequently reconfigures, chances are that some device in the network doesn't support large frames. Choose hardware carefully if intending to use large frame support. The default is 1500. .TP threads This directive controls how many threads are used to encrypt and send multicast messages. If secauth is off, the protocol will never use threaded sending. If secauth is on, this directive allows systems to be configured to use multiple threads to encrypt and send multicast messages. A thread directive of 0 indicates that no threaded send should be used. This mode offers best performance for non-SMP systems. The default is 0. .TP vsftype This directive controls the virtual synchrony filter type used to identify a primary component. The preferred choice is YKD dynamic linear voting, however, for clusters larger then 32 nodes YKD consumes alot of memory. For large scale clusters that are created by changing the MAX_PROCESSORS_COUNT #define in the C code totem.h file, the virtual synchrony filter "none" is recommended but then AMF and DLCK services (which are currently experimental) are not safe for use. The default is ykd. The vsftype can also be set to none. .TP transport This directive controls the transport mechanism used. If the interface to which corosync is binding is an RDMA interface such as RoCEE or Infiniband, the "iba" parameter may be specified. To avoid the use of multicast entirely, a unicast transport parameter "udpu" can be specified. This requires specifying the list of members that could potentially make up the membership before deployment. The default is udp. The transport type can also be set to udpu or iba. Within the .B totem directive, there are several configuration options which are used to control the operation of the protocol. It is generally not recommended to change any of these values without proper guidance and sufficient testing. Some networks may require larger values if suffering from frequent reconfigurations. Some applications may require faster failure detection times which can be achieved by reducing the token timeout. .TP token This timeout specifies in milliseconds until a token loss is declared after not receiving a token. This is the time spent detecting a failure of a processor in the current configuration. Reforming a new configuration takes about 50 milliseconds in addition to this timeout. The default is 1000 milliseconds. .TP token_retransmit This timeout specifies in milliseconds after how long before receiving a token the token is retransmitted. This will be automatically calculated if token is modified. It is not recommended to alter this value without guidance from the corosync community. The default is 238 milliseconds. .TP hold This timeout specifies in milliseconds how long the token should be held by the representative when the protocol is under low utilization. It is not recommended to alter this value without guidance from the corosync community. The default is 180 milliseconds. .TP token_retransmits_before_loss_const This value identifies how many token retransmits should be attempted before forming a new configuration. If this value is set, retransmit and hold will be automatically calculated from retransmits_before_loss and token. The default is 4 retransmissions. .TP join This timeout specifies in milliseconds how long to wait for join messages in the membership protocol. The default is 50 milliseconds. .TP send_join This timeout specifies in milliseconds an upper range between 0 and send_join to wait before sending a join message. For configurations with less then 32 nodes, this parameter is not necessary. For larger rings, this parameter is necessary to ensure the NIC is not overflowed with join messages on formation of a new ring. A reasonable value for large rings (128 nodes) would be 80msec. Other timer values must also change if this value is changed. Seek advice from the corosync mailing list if trying to run larger configurations. The default is 0 milliseconds. .TP consensus This timeout specifies in milliseconds how long to wait for consensus to be achieved before starting a new round of membership configuration. The minimum value for consensus must be 1.2 * token. This value will be automatically calculated at 1.2 * token if the user doesn't specify a consensus value. For two node clusters, a consensus larger then the join timeout but less then token is safe. For three node or larger clusters, consensus should be larger then token. There is an increasing risk of odd membership changes, which stil guarantee virtual synchrony, as node count grows if consensus is less than token. The default is 1200 milliseconds. .TP merge This timeout specifies in milliseconds how long to wait before checking for a partition when no multicast traffic is being sent. If multicast traffic is being sent, the merge detection happens automatically as a function of the protocol. The default is 200 milliseconds. .TP downcheck This timeout specifies in milliseconds how long to wait before checking that a network interface is back up after it has been downed. The default is 1000 millseconds. .TP fail_recv_const This constant specifies how many rotations of the token without receiving any of the messages when messages should be received may occur before a new configuration is formed. The default is 2500 failures to receive a message. .TP seqno_unchanged_const This constant specifies how many rotations of the token without any multicast traffic should occur before the merge detection timeout is started. The default is 30 rotations. .TP heartbeat_failures_allowed [HeartBeating mechanism] Configures the optional HeartBeating mechanism for faster failure detection. Keep in mind that engaging this mechanism in lossy networks could cause faulty loss declaration as the mechanism relies on the network for heartbeating. So as a rule of thumb use this mechanism if you require improved failure in low to medium utilized networks. This constant specifies the number of heartbeat failures the system should tolerate before declaring heartbeat failure e.g 3. Also if this value is not set or is 0 then the heartbeat mechanism is not engaged in the system and token rotation is the method of failure detection The default is 0 (disabled). .TP max_network_delay [HeartBeating mechanism] This constant specifies in milliseconds the approximate delay that your network takes to transport one packet from one machine to another. This value is to be set by system engineers and please dont change if not sure as this effects the failure detection mechanism using heartbeat. The default is 50 milliseconds. .TP window_size This constant specifies the maximum number of messages that may be sent on one token rotation. If all processors perform equally well, this value could be large (300), which would introduce higher latency from origination to delivery for very large rings. To reduce latency in large rings(16+), the defaults are a safe compromise. If 1 or more slow processor(s) are present among fast processors, window_size should be no larger then 256000 / netmtu to avoid overflow of the kernel receive buffers. The user is notified of this by the display of a retransmit list in the notification logs. There is no loss of data, but performance is reduced when these errors occur. The default is 50 messages. .TP max_messages This constant specifies the maximum number of messages that may be sent by one processor on receipt of the token. The max_messages parameter is limited to 256000 / netmtu to prevent overflow of the kernel transmit buffers. The default is 17 messages. .TP miss_count_const This constant defines the maximum number of times on receipt of a token a message is checked for retransmission before a retransmission occurs. This parameter is useful to modify for switches that delay multicast packets compared to unicast packets. The default setting works well for nearly all modern switches. The default is 5 messages. .TP rrp_problem_count_timeout This specifies the time in milliseconds to wait before decrementing the problem count by 1 for a particular ring to ensure a link is not marked faulty for transient network failures. The default is 2000 milliseconds. .TP rrp_problem_count_threshold This specifies the number of times a problem is detected with a link before setting the link faulty. Once a link is set faulty, no more data is transmitted upon it. Also, the problem counter is no longer decremented when the problem count timeout expires. A problem is detected whenever all tokens from the proceeding processor have not been received within the rrp_token_expired_timeout. The rrp_problem_count_threshold * rrp_token_expired_timeout should be atleast 50 milliseconds less then the token timeout, or a complete reconfiguration may occur. The default is 10 problem counts. +.TP +rrp_problem_count_mcast_threshold +This specifies the number of times a problem is detected with multicast before +setting the link faulty for passive rrp mode. This variable is unused in active +rrp mode. + +The default is 10 times rrp_problem_count_threshold. + .TP rrp_token_expired_timeout This specifies the time in milliseconds to increment the problem counter for the redundant ring protocol after not having received a token from all rings for a particular processor. This value will automatically be calculated from the token timeout and problem_count_threshold but may be overridden. It is not recommended to override this value without guidance from the corosync community. The default is 47 milliseconds. .TP rrp_autorecovery_check_timeout This specifies the time in milliseconds to check if the failed ring can be auto-recovered. The default is 1000 milliseconds. .PP Within the .B logging directive, there are several configuration options which are all optional. .PP The following 3 options are valid only for the top level logging directive: .TP timestamp This specifies that a timestamp is placed on all log messages. The default is off. .TP fileline This specifies that file and line should be printed. The default is off. .TP function_name This specifies that the code function name should be printed. The default is off. .PP The following options are valid both for top level logging directive and they can be overriden in logger_subsys entries. .TP to_stderr .TP to_logfile .TP to_syslog These specify the destination of logging output. Any combination of these options may be specified. Valid options are .B yes and .B no. The default is syslog and stderr. Please note, if you are using to_logfile and want to rotate the file, use logrotate(8) with the option .B copytruncate. eg. .IP .RS .ne 18 .nf .ta 4n 30n 33n /var/log/corosync.log { missingok compress notifempty daily rotate 7 copytruncate } .ta .fi .RE .IP .PP .TP logfile If the .B to_logfile directive is set to .B yes , this option specifies the pathname of the log file. No default. .TP logfile_priority This specifies the logfile priority for this particular subsystem. Ignored if debug is on. Possible values are: alert, crit, debug (same as debug = on), emerg, err, info, notice, warning. The default is: info. .TP syslog_facility This specifies the syslog facility type that will be used for any messages sent to syslog. options are daemon, local0, local1, local2, local3, local4, local5, local6 & local7. The default is daemon. .TP syslog_priority This specifies the syslog level for this particular subsystem. Ignored if debug is on. Possible values are: alert, crit, debug (same as debug = on), emerg, err, info, notice, warning. The default is: info. .TP debug This specifies whether debug output is logged for this particular logger. The default is off. .TP tags This specifies which tags should be traced for this particular logger. Set debug directive to .B on in order to enable tracing using tags. Values are specified using a vertical bar as a logical OR separator: enter|leave|trace1|trace2|trace3|... The default is none. .PP Within the .B logging directive, logger_subsys directives are optional. .PP Within the .B logger_subsys sub-directive, all of the above logging configuration options are valid and can be used to override the default settings. The subsys entry, described below, is mandatory to identify the subsystem. .TP subsys This specifies the subsystem identity (name) for which logging is specified. This is the name used by a service in the log_init () call. E.g. 'CKPT'. This directive is required. .SH "FILES" .TP /etc/corosync/corosync.conf The corosync executive configuration file. .SH "SEE ALSO" .BR corosync_overview (8), .BR logrotate (8) .PP