diff --git a/exec/totemconfig.c b/exec/totemconfig.c index 568f9737..a6394a2f 100644 --- a/exec/totemconfig.c +++ b/exec/totemconfig.c @@ -1,2448 +1,2454 @@ /* * Copyright (c) 2002-2005 MontaVista Software, Inc. * Copyright (c) 2006-2022 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * Jan Friesse (jfriesse@redhat.com) * Chrissie Caulfield (ccaulfie@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" #include "totemconfig.h" #define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST 4 #define TOKEN_TIMEOUT 3000 #define TOKEN_WARNING 75 #define TOKEN_COEFFICIENT 650 #define JOIN_TIMEOUT 50 #define MERGE_TIMEOUT 200 #define DOWNCHECK_TIMEOUT 1000 #define FAIL_TO_RECV_CONST 2500 #define SEQNO_UNCHANGED_CONST 30 #define MINIMUM_TIMEOUT (int)(1000/HZ)*3 #define MINIMUM_TIMEOUT_HOLD (int)(MINIMUM_TIMEOUT * 0.8 - (1000/HZ)) #define MAX_NETWORK_DELAY 50 #define WINDOW_SIZE 50 #define MAX_MESSAGES 17 #define MISS_COUNT_CONST 5 #define BLOCK_UNLISTED_IPS 1 #define CANCEL_TOKEN_HOLD_ON_RETRANSMIT 0 /* This constant is not used for knet */ #define UDP_NETMTU 1500 /* Currently all but PONG_COUNT match the defaults in libknet.h */ #define KNET_PING_INTERVAL 1000 #define KNET_PING_TIMEOUT 2000 #define KNET_PING_PRECISION 2048 #define KNET_PONG_COUNT 2 #define KNET_PMTUD_INTERVAL 30 #define KNET_MTU 0 #define KNET_DEFAULT_TRANSPORT KNET_TRANSPORT_UDP #define DEFAULT_PORT 5405 static char error_string_response[768]; static void add_totem_config_notification(struct totem_config *totem_config); static void *totem_get_param_by_name(struct totem_config *totem_config, const char *param_name) { if (strcmp(param_name, "totem.token") == 0) return &totem_config->token_timeout; if (strcmp(param_name, "totem.token_warning") == 0) return &totem_config->token_warning; if (strcmp(param_name, "totem.token_retransmit") == 0) return &totem_config->token_retransmit_timeout; if (strcmp(param_name, "totem.hold") == 0) return &totem_config->token_hold_timeout; if (strcmp(param_name, "totem.token_retransmits_before_loss_const") == 0) return &totem_config->token_retransmits_before_loss_const; if (strcmp(param_name, "totem.join") == 0) return &totem_config->join_timeout; if (strcmp(param_name, "totem.send_join") == 0) return &totem_config->send_join_timeout; if (strcmp(param_name, "totem.consensus") == 0) return &totem_config->consensus_timeout; if (strcmp(param_name, "totem.merge") == 0) return &totem_config->merge_timeout; if (strcmp(param_name, "totem.downcheck") == 0) return &totem_config->downcheck_timeout; if (strcmp(param_name, "totem.fail_recv_const") == 0) return &totem_config->fail_to_recv_const; if (strcmp(param_name, "totem.seqno_unchanged_const") == 0) return &totem_config->seqno_unchanged_const; if (strcmp(param_name, "totem.heartbeat_failures_allowed") == 0) return &totem_config->heartbeat_failures_allowed; if (strcmp(param_name, "totem.max_network_delay") == 0) return &totem_config->max_network_delay; if (strcmp(param_name, "totem.window_size") == 0) return &totem_config->window_size; if (strcmp(param_name, "totem.max_messages") == 0) return &totem_config->max_messages; if (strcmp(param_name, "totem.miss_count_const") == 0) return &totem_config->miss_count_const; if (strcmp(param_name, "totem.knet_pmtud_interval") == 0) return &totem_config->knet_pmtud_interval; if (strcmp(param_name, "totem.knet_mtu") == 0) return &totem_config->knet_mtu; if (strcmp(param_name, "totem.knet_compression_threshold") == 0) return &totem_config->knet_compression_threshold; if (strcmp(param_name, "totem.knet_compression_level") == 0) return &totem_config->knet_compression_level; if (strcmp(param_name, "totem.knet_compression_model") == 0) return totem_config->knet_compression_model; if (strcmp(param_name, "totem.block_unlisted_ips") == 0) return &totem_config->block_unlisted_ips; if (strcmp(param_name, "totem.cancel_token_hold_on_retransmit") == 0) return &totem_config->cancel_token_hold_on_retransmit; return NULL; } /* * Read key_name from icmap. If key is not found or key_name == delete_key or if allow_zero is false * and readed value is zero, default value is used and stored into totem_config. */ static void totem_volatile_config_set_uint32_value (struct totem_config *totem_config, icmap_map_t map, const char *key_name, const char *deleted_key, unsigned int default_value, int allow_zero_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; if (icmap_get_uint32_r(map, key_name, totem_get_param_by_name(totem_config, key_name)) != CS_OK || (deleted_key != NULL && strcmp(deleted_key, key_name) == 0) || (!allow_zero_value && *(uint32_t *)totem_get_param_by_name(totem_config, key_name) == 0)) { *(uint32_t *)totem_get_param_by_name(totem_config, key_name) = default_value; } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); icmap_set_uint32_r(map, runtime_key_name, *(uint32_t *)totem_get_param_by_name(totem_config, key_name)); } static void totem_volatile_config_set_int32_value (struct totem_config *totem_config, icmap_map_t map, const char *key_name, const char *deleted_key, int default_value, int allow_zero_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; if (icmap_get_int32_r(map, key_name, totem_get_param_by_name(totem_config, key_name)) != CS_OK || (deleted_key != NULL && strcmp(deleted_key, key_name) == 0) || (!allow_zero_value && *(int32_t *)totem_get_param_by_name(totem_config, key_name) == 0)) { *(int32_t *)totem_get_param_by_name(totem_config, key_name) = default_value; } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); icmap_set_int32_r(map, runtime_key_name, *(int32_t *)totem_get_param_by_name(totem_config, key_name)); } static void totem_volatile_config_set_string_value (struct totem_config *totem_config, icmap_map_t map, const char *key_name, const char *deleted_key, const char *default_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; int res; char *new_config_value; const void *config_value; config_value = totem_get_param_by_name(totem_config, key_name); res = icmap_get_string_r(map, key_name, (char **)&new_config_value); if (res != CS_OK || (deleted_key != NULL && strcmp(deleted_key, key_name) == 0)) { /* Slightly pointless use of strncpy but it keeps coverity happy */ strncpy((char *)config_value, default_value, CONFIG_STRING_LEN_MAX); } else { strncpy((char *)config_value, new_config_value, CONFIG_STRING_LEN_MAX); } if (res == CS_OK) { free(new_config_value); } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); (void)icmap_set_string_r(map, runtime_key_name, (char *)config_value); } /* * Read string value stored in key_name from icmap, use it as a boolean (yes/no) type, convert it * to integer value (1/0) and store into totem_config. * * If key is not found or key_name == delete_key default value is used * and stored into totem_config. */ static void totem_volatile_config_set_boolean_value (struct totem_config *totem_config, icmap_map_t map, const char *key_name, const char *deleted_key, unsigned int default_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; char *str; int val; str = NULL; val = default_value; if ((deleted_key != NULL && strcmp(deleted_key, key_name) == 0) || (icmap_get_string_r(map, key_name, &str) != CS_OK)) { /* * Do nothing. str is NULL (icmap_get_string ether not called or * not changed str). */ } else { if (strcmp(str, "yes") == 0) { val = 1; } else if (strcmp(str, "no") == 0) { val = 0; } free(str); } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); *(uint32_t *)totem_get_param_by_name(totem_config, key_name) = val; icmap_set_uint32_r(map, runtime_key_name, val); } /* * Read and validate config values from cmap and store them into totem_config. If key doesn't exists, * default value is stored. deleted_key is name of key beeing processed by delete operation * from cmap. It is considered as non existing even if it can be read. Can be NULL. */ void totem_volatile_config_read (struct totem_config *totem_config, icmap_map_t temp_map, const char *deleted_key) { uint32_t u32; totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.token_retransmits_before_loss_const", deleted_key, TOKEN_RETRANSMITS_BEFORE_LOSS_CONST, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.token", deleted_key, TOKEN_TIMEOUT, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.token_warning", deleted_key, TOKEN_WARNING, 1); if (totem_config->interfaces[0].member_count > 2) { u32 = TOKEN_COEFFICIENT; icmap_get_uint32_r(temp_map, "totem.token_coefficient", &u32); totem_config->token_timeout += (totem_config->interfaces[0].member_count - 2) * u32; /* * Store totem_config value to cmap runtime section */ icmap_set_uint32_r(temp_map, "runtime.config.totem.token", totem_config->token_timeout); } totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.max_network_delay", deleted_key, MAX_NETWORK_DELAY, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.window_size", deleted_key, WINDOW_SIZE, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.max_messages", deleted_key, MAX_MESSAGES, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.miss_count_const", deleted_key, MISS_COUNT_CONST, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.knet_pmtud_interval", deleted_key, KNET_PMTUD_INTERVAL, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.knet_mtu", deleted_key, KNET_MTU, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.token_retransmit", deleted_key, (int)(totem_config->token_timeout / (totem_config->token_retransmits_before_loss_const + 0.2)), 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.hold", deleted_key, (int)(totem_config->token_retransmit_timeout * 0.8 - (1000/HZ)), 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.join", deleted_key, JOIN_TIMEOUT, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.consensus", deleted_key, (int)(float)(1.2 * totem_config->token_timeout), 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.merge", deleted_key, MERGE_TIMEOUT, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.downcheck", deleted_key, DOWNCHECK_TIMEOUT, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.fail_recv_const", deleted_key, FAIL_TO_RECV_CONST, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.seqno_unchanged_const", deleted_key, SEQNO_UNCHANGED_CONST, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.send_join", deleted_key, 0, 1); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.heartbeat_failures_allowed", deleted_key, 0, 1); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.knet_compression_threshold", deleted_key, 0, 1); totem_volatile_config_set_int32_value(totem_config, temp_map, "totem.knet_compression_level", deleted_key, 0, 1); totem_volatile_config_set_string_value(totem_config, temp_map, "totem.knet_compression_model", deleted_key, "none"); totem_volatile_config_set_boolean_value(totem_config, temp_map, "totem.block_unlisted_ips", deleted_key, BLOCK_UNLISTED_IPS); totem_volatile_config_set_boolean_value(totem_config, temp_map, "totem.cancel_token_hold_on_retransmit", deleted_key, CANCEL_TOKEN_HOLD_ON_RETRANSMIT); } int totem_volatile_config_validate ( struct totem_config *totem_config, icmap_map_t temp_map, const char **error_string) { /* Static just to keep them off the stack */ static char local_error_reason[512]; static char addr_str_buf[INET6_ADDRSTRLEN]; const char *error_reason = local_error_reason; char name_key[ICMAP_KEYNAME_MAXLEN]; char *name_str; int i, j, num_configured, members; uint32_t tmp_config_value; if (totem_config->max_network_delay < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The max_network_delay parameter (%d ms) may not be less than (%d ms).", totem_config->max_network_delay, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The token timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_warning > 100 || totem_config->token_warning < 0) { snprintf (local_error_reason, sizeof(local_error_reason), "The token warning parameter (%d%%) must be between 0 (disabled) and 100.", totem_config->token_warning); goto parse_error; } if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) { if (icmap_get_uint32_r(temp_map, "totem.token_retransmit", &tmp_config_value) == CS_OK) { snprintf (local_error_reason, sizeof(local_error_reason), "The token retransmit timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_retransmit_timeout, MINIMUM_TIMEOUT); goto parse_error; } else { snprintf (local_error_reason, sizeof(local_error_reason), "Not appropriate token or token_retransmits_before_loss_const value set"); goto parse_error; } } if (totem_config->token_hold_timeout < MINIMUM_TIMEOUT_HOLD) { snprintf (local_error_reason, sizeof(local_error_reason), "The token hold timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_hold_timeout, MINIMUM_TIMEOUT_HOLD); goto parse_error; } if (totem_config->join_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The join timeout parameter (%d ms) may not be less than (%d ms).", totem_config->join_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->consensus_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The consensus timeout parameter (%d ms) may not be less than (%d ms).", totem_config->consensus_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->consensus_timeout < totem_config->join_timeout) { snprintf (local_error_reason, sizeof(local_error_reason), "The consensus timeout parameter (%d ms) may not be less than join timeout (%d ms).", totem_config->consensus_timeout, totem_config->join_timeout); goto parse_error; } if (totem_config->merge_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The merge timeout parameter (%d ms) may not be less than (%d ms).", totem_config->merge_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->downcheck_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The downcheck timeout parameter (%d ms) may not be less than (%d ms).", totem_config->downcheck_timeout, MINIMUM_TIMEOUT); goto parse_error; } /* Check that we have nodelist 'name' if there is more than one link */ num_configured = 0; members = -1; for (i = 0; i < INTERFACE_MAX; i++) { if (totem_config->interfaces[i].configured) { if (num_configured == 0) { members = totem_config->interfaces[i].member_count; } num_configured++; } } if (num_configured > 1) { /* * This assert is here just to make compiler happy */ assert(members != -1); for (i=0; i < members; i++) { snprintf(name_key, sizeof(name_key), "nodelist.node.%d.name", i); if (icmap_get_string_r(temp_map, name_key, &name_str) != CS_OK) { snprintf (local_error_reason, sizeof(local_error_reason), "for a multi-link configuration, all nodes must have a 'name' attribute"); goto parse_error; } free(name_str); } for (i=0; i < INTERFACE_MAX; i++) { if (!totem_config->interfaces[i].configured) { continue; } if (totem_config->interfaces[i].member_count != members) { snprintf (local_error_reason, sizeof(local_error_reason), "Not all nodes have the same number of links"); goto parse_error; } } } /* Verify that all nodes on the same link have the same IP family */ for (i=0; i < INTERFACE_MAX; i++) { for (j=1; jinterfaces[i].member_count; j++) { if (totem_config->interfaces[i].configured) { if (totem_config->interfaces[i].member_list[j].family != totem_config->interfaces[i].member_list[0].family) { memcpy(addr_str_buf, totemip_print(&(totem_config->interfaces[i].member_list[j])), sizeof(addr_str_buf)); snprintf (local_error_reason, sizeof(local_error_reason), "Nodes for link %d have different IP families " "(compared %s with %s)", i, addr_str_buf, totemip_print(&(totem_config->interfaces[i].member_list[0]))); goto parse_error; } } } } return 0; parse_error: snprintf (error_string_response, sizeof(error_string_response), "parse error in config: %s\n", error_reason); *error_string = error_string_response; return (-1); } static int totem_get_crypto(struct totem_config *totem_config, icmap_map_t map, const char **error_string) { char *str; const char *tmp_cipher; const char *tmp_hash; const char *tmp_model; char *crypto_model_str; int res = 0; tmp_hash = "none"; tmp_cipher = "none"; tmp_model = "none"; crypto_model_str = NULL; if (icmap_get_string_r(map, "totem.crypto_model", &crypto_model_str) == CS_OK) { tmp_model = crypto_model_str; } else { tmp_model = "nss"; } if (icmap_get_string_r(map, "totem.secauth", &str) == CS_OK) { if (strcmp(str, "on") == 0) { tmp_cipher = "aes256"; tmp_hash = "sha256"; } free(str); } if (icmap_get_string_r(map, "totem.crypto_cipher", &str) == CS_OK) { if (strcmp(str, "none") == 0) { tmp_cipher = "none"; } if (strcmp(str, "aes256") == 0) { tmp_cipher = "aes256"; } if (strcmp(str, "aes192") == 0) { tmp_cipher = "aes192"; } if (strcmp(str, "aes128") == 0) { tmp_cipher = "aes128"; } free(str); } if (icmap_get_string_r(map, "totem.crypto_hash", &str) == CS_OK) { if (strcmp(str, "none") == 0) { tmp_hash = "none"; } if (strcmp(str, "md5") == 0) { tmp_hash = "md5"; } if (strcmp(str, "sha1") == 0) { tmp_hash = "sha1"; } if (strcmp(str, "sha256") == 0) { tmp_hash = "sha256"; } if (strcmp(str, "sha384") == 0) { tmp_hash = "sha384"; } if (strcmp(str, "sha512") == 0) { tmp_hash = "sha512"; } free(str); } if ((strcmp(tmp_cipher, "none") != 0) && (strcmp(tmp_hash, "none") == 0)) { *error_string = "crypto_cipher requires crypto_hash with value other than none"; res = -1; goto out_free_crypto_model_str; } if (strcmp(tmp_model, "none") == 0) { /* * Shouldn't happen because it is handled by coroparse */ *error_string = "invalid crypto_model"; res = -1; goto out_free_crypto_model_str; } if (strcmp(tmp_cipher, totem_config->crypto_cipher_type) || strcmp(tmp_hash, totem_config->crypto_hash_type) || strcmp(tmp_model, totem_config->crypto_model)) { totem_config->crypto_changed = 1; } strncpy(totem_config->crypto_cipher_type, tmp_cipher, CONFIG_STRING_LEN_MAX - 1); totem_config->crypto_cipher_type[CONFIG_STRING_LEN_MAX - 1] = '\0'; strncpy(totem_config->crypto_hash_type, tmp_hash, CONFIG_STRING_LEN_MAX - 1); totem_config->crypto_hash_type[CONFIG_STRING_LEN_MAX - 1] = '\0'; strncpy(totem_config->crypto_model, tmp_model, CONFIG_STRING_LEN_MAX - 1); totem_config->crypto_model[CONFIG_STRING_LEN_MAX - 1] = '\0'; out_free_crypto_model_str: free(crypto_model_str); return (res); } static int nodelist_byname(icmap_map_t map, const char *find_name, int strip_domain) { icmap_iter_t iter; const char *iter_key; char name_str[ICMAP_KEYNAME_MAXLEN]; int res = 0; unsigned int node_pos; char *name; unsigned int namelen; iter = icmap_iter_init_r(map, "nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, name_str); if (res != 2) { continue; } /* ring0_addr is allowed as a fallback */ if (strcmp(name_str, "name") && strcmp(name_str, "ring0_addr")) { continue; } if (icmap_get_string_r(map, iter_key, &name) != CS_OK) { continue; } namelen = strlen(name); if (strip_domain) { char *dot; dot = strchr(name, '.'); if (dot) { namelen = dot - name; } } if (strncmp(find_name, name, namelen) == 0 && strlen(find_name) == namelen) { icmap_iter_finalize(iter); return node_pos; } } icmap_iter_finalize(iter); return -1; } /* Compare two addresses - only address part (sin_addr/sin6_addr) is checked */ static int ipaddr_equal(const struct sockaddr *addr1, const struct sockaddr *addr2) { int addrlen = 0; const void *addr1p, *addr2p; if (addr1->sa_family != addr2->sa_family) return 0; switch (addr1->sa_family) { case AF_INET: addrlen = sizeof(struct in_addr); addr1p = &((struct sockaddr_in *)addr1)->sin_addr; addr2p = &((struct sockaddr_in *)addr2)->sin_addr; break; case AF_INET6: addrlen = sizeof(struct in6_addr); addr1p = &((struct sockaddr_in6 *)addr1)->sin6_addr; addr2p = &((struct sockaddr_in6 *)addr2)->sin6_addr; break; default: assert(0); } return (memcmp(addr1p, addr2p, addrlen) == 0); } /* Finds the local node and returns its position in the nodelist. * Uses nodelist.local_node_pos as a cache to save effort */ static int find_local_node(icmap_map_t map, int use_cache) { char nodename2[PATH_MAX]; char name_str[ICMAP_KEYNAME_MAXLEN]; icmap_iter_t iter; const char *iter_key; unsigned int cached_pos; char *dot = NULL; const char *node; struct ifaddrs *ifa, *ifa_list; struct sockaddr *sa; int found = 0; int node_pos = -1; int res; struct utsname utsname; /* Check for cached value first */ if (use_cache) { if (icmap_get_uint32("nodelist.local_node_pos", &cached_pos) == CS_OK) { return cached_pos; } } res = uname(&utsname); if (res < 0) { return -1; } node = utsname.nodename; /* 1. Exact match */ node_pos = nodelist_byname(map, node, 0); if (node_pos > -1) { found = 1; goto ret_found; } /* 2. Try to match with increasingly more * specific versions of it */ strcpy(nodename2, node); dot = strrchr(nodename2, '.'); while (dot) { *dot = '\0'; node_pos = nodelist_byname(map, nodename2, 0); if (node_pos > -1) { found = 1; goto ret_found; } dot = strrchr(nodename2, '.'); } node_pos = nodelist_byname(map, nodename2, 1); if (node_pos > -1) { found = 1; goto ret_found; } /* * The corosync.conf name may not be related to uname at all, * they may match a hostname on some network interface. */ if (getifaddrs(&ifa_list)) return -1; for (ifa = ifa_list; ifa; ifa = ifa->ifa_next) { socklen_t salen = 0; /* Restore this */ strcpy(nodename2, node); sa = ifa->ifa_addr; if (!sa) { continue; } if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) { continue; } if (sa->sa_family == AF_INET) { salen = sizeof(struct sockaddr_in); } if (sa->sa_family == AF_INET6) { salen = sizeof(struct sockaddr_in6); } if (getnameinfo(sa, salen, nodename2, sizeof(nodename2), NULL, 0, 0) == 0) { node_pos = nodelist_byname(map, nodename2, 0); if (node_pos > -1) { found = 1; goto out; } /* Truncate this name and try again */ dot = strchr(nodename2, '.'); if (dot) { *dot = '\0'; node_pos = nodelist_byname(map, nodename2, 0); if (node_pos > -1) { found = 1; goto out; } } } /* See if it's the IP address that's in corosync.conf */ if (getnameinfo(sa, sizeof(*sa), nodename2, sizeof(nodename2), NULL, 0, NI_NUMERICHOST)) continue; node_pos = nodelist_byname(map, nodename2, 0); if (node_pos > -1) { found = 1; goto out; } } out: if (found) { freeifaddrs(ifa_list); goto ret_found; } /* * This section covers the usecase where the nodename specified in cluster.conf * is an alias specified in /etc/hosts. For example: * hostname alias1 alias2 * and * the above calls use uname and getnameinfo does not return aliases. * here we take the name specified in cluster.conf, resolve it to an address * and then compare against all known local ip addresses. * if we have a match, we found our nodename. In theory this chunk of code * could replace all the checks above, but let's avoid any possible regressions * and use it as last. */ iter = icmap_iter_init_r(map, "nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { char *dbnodename = NULL; struct addrinfo hints; struct addrinfo *result = NULL, *rp = NULL; res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, name_str); if (res != 2) { continue; } /* 'ring0_addr' is allowed as a fallback, but 'name' will be found first * because the names are in alpha order. */ if (strcmp(name_str, "name") && strcmp(name_str, "ring0_addr")) { continue; } if (icmap_get_string_r(map, iter_key, &dbnodename) != CS_OK) { continue; } memset(&hints, 0, sizeof(struct addrinfo)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_DGRAM; hints.ai_flags = 0; hints.ai_protocol = IPPROTO_UDP; if (getaddrinfo(dbnodename, NULL, &hints, &result)) { continue; } for (rp = result; rp != NULL; rp = rp->ai_next) { for (ifa = ifa_list; ifa; ifa = ifa->ifa_next) { if (ifa->ifa_addr && ipaddr_equal(rp->ai_addr, ifa->ifa_addr)) { freeaddrinfo(result); found = 1; goto out2; } } } freeaddrinfo(result); } out2: icmap_iter_finalize(iter); freeifaddrs(ifa_list); ret_found: if (found) { res = icmap_set_uint32_r(map, "nodelist.local_node_pos", node_pos); } return node_pos; } static enum totem_ip_version_enum totem_config_get_ip_version(struct totem_config *totem_config) { enum totem_ip_version_enum res; char *str; res = TOTEM_IP_VERSION_6_4; if (totem_config->transport_number == TOTEM_TRANSPORT_UDP) { res = TOTEM_IP_VERSION_4; } if (icmap_get_string("totem.ip_version", &str) == CS_OK) { if (strcmp(str, "ipv4") == 0) { res = TOTEM_IP_VERSION_4; } if (strcmp(str, "ipv6") == 0) { res = TOTEM_IP_VERSION_6; } if (strcmp(str, "ipv6-4") == 0) { res = TOTEM_IP_VERSION_6_4; } if (strcmp(str, "ipv4-6") == 0) { res = TOTEM_IP_VERSION_4_6; } free(str); } return (res); } static uint16_t generate_cluster_id (const char *cluster_name) { int i; int value = 0; for (i = 0; i < strlen(cluster_name); i++) { value <<= 1; value += cluster_name[i]; } return (value & 0xFFFF); } static int get_cluster_mcast_addr ( const char *cluster_name, unsigned int linknumber, enum totem_ip_version_enum ip_version, struct totem_ip_address *res) { uint16_t clusterid; char addr[INET6_ADDRSTRLEN + 1]; int err; if (cluster_name == NULL) { return (-1); } clusterid = generate_cluster_id(cluster_name) + linknumber; memset (res, 0, sizeof(*res)); switch (ip_version) { case TOTEM_IP_VERSION_4: case TOTEM_IP_VERSION_4_6: snprintf(addr, sizeof(addr), "239.192.%d.%d", clusterid >> 8, clusterid % 0xFF); break; case TOTEM_IP_VERSION_6: case TOTEM_IP_VERSION_6_4: snprintf(addr, sizeof(addr), "ff15::%x", clusterid); break; default: /* * Unknown family */ return (-1); } err = totemip_parse (res, addr, ip_version); return (err); } static unsigned int generate_nodeid( struct totem_config *totem_config, char *addr) { unsigned int nodeid; struct totem_ip_address totemip; /* AF_INET hard-coded here because auto-generated nodeids are only for IPv4 */ if (totemip_parse(&totemip, addr, TOTEM_IP_VERSION_4) != 0) return -1; memcpy (&nodeid, &totemip.addr, sizeof (unsigned int)); #if __BYTE_ORDER == __LITTLE_ENDIAN nodeid = swab32 (nodeid); #endif if (totem_config->clear_node_high_bit) { nodeid &= 0x7FFFFFFF; } return nodeid; } static int check_for_duplicate_nodeids( struct totem_config *totem_config, const char **error_string) { icmap_iter_t iter; icmap_iter_t subiter; const char *iter_key; int res = 0; int retval = 0; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char *ring0_addr=NULL; char *ring0_addr1=NULL; unsigned int node_pos; unsigned int node_pos1; unsigned int last_node_pos = -1; unsigned int nodeid; unsigned int nodeid1; int autogenerated; iter = icmap_iter_init("nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key); if (res != 2) { continue; } /* * This relies on the fact the icmap keys are always returned in order * so all of the keys for a node will be grouped together. We're basically * just running the code below once for each node. */ if (last_node_pos == node_pos) { continue; } last_node_pos = node_pos; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos); autogenerated = 0; /* Generated nodeids are only allowed for UDP/UDPU so ring0_addr is valid here */ if (icmap_get_uint32(tmp_key, &nodeid) != CS_OK) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos); if (icmap_get_string(tmp_key, &ring0_addr) != CS_OK) { continue; } /* Generate nodeid so we can check that auto-generated nodeids don't clash either */ nodeid = generate_nodeid(totem_config, ring0_addr); if (nodeid == -1) { continue; } autogenerated = 1; } node_pos1 = 0; subiter = icmap_iter_init("nodelist.node."); while (((iter_key = icmap_iter_next(subiter, NULL, NULL)) != NULL) && (node_pos1 < node_pos)) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos1, tmp_key); if ((res != 2) || (node_pos1 >= node_pos)) { continue; } if (strcmp(tmp_key, "nodeid") != 0) { continue; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos1); if (icmap_get_uint32(tmp_key, &nodeid1) != CS_OK) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos1); if (icmap_get_string(tmp_key, &ring0_addr1) != CS_OK) { continue; } nodeid1 = generate_nodeid(totem_config, ring0_addr1); if (nodeid1 == -1) { continue; } } if (nodeid == nodeid1) { retval = -1; snprintf (error_string_response, sizeof(error_string_response), "Nodeid %u%s%s%s appears twice in corosync.conf", nodeid, autogenerated?"(autogenerated from ":"", autogenerated?ring0_addr:"", autogenerated?")":""); *error_string = error_string_response; break; } } icmap_iter_finalize(subiter); } icmap_iter_finalize(iter); return retval; } /* * This needs to be done last of all. It would be nice to do it when reading the * interface params, but the totem params need to have them to be read first. We * need both, so this is a way round that circular dependancy. */ static void calc_knet_ping_timers(struct totem_config *totem_config) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; int interface; for (interface = 0; interface < INTERFACE_MAX; interface++) { if (totem_config->interfaces[interface].configured) { if (!totem_config->interfaces[interface].knet_pong_count) { totem_config->interfaces[interface].knet_pong_count = KNET_PONG_COUNT; } if (!totem_config->interfaces[interface].knet_ping_timeout) { totem_config->interfaces[interface].knet_ping_timeout = totem_config->token_timeout / totem_config->interfaces[interface].knet_pong_count; } snprintf(runtime_key_name, sizeof(runtime_key_name), "runtime.config.totem.interface.%d.knet_ping_timeout", interface); icmap_set_uint32(runtime_key_name, totem_config->interfaces[interface].knet_ping_timeout); if (!totem_config->interfaces[interface].knet_ping_interval) { totem_config->interfaces[interface].knet_ping_interval = totem_config->token_timeout / (totem_config->interfaces[interface].knet_pong_count * 2); } snprintf(runtime_key_name, sizeof(runtime_key_name), "runtime.config.totem.interface.%d.knet_ping_interval", interface); icmap_set_uint32(runtime_key_name, totem_config->interfaces[interface].knet_ping_interval); } } } /* * Compute difference between two set of totem interface arrays and commit it. * set1 and set2 * are changed so for same ring, ip existing in both set1 and set2 are cleared * (set to 0), and ips which are only in set1 or set2 remains untouched. * totempg_node_add/remove is called. */ -static void compute_and_set_totempg_interfaces(struct totem_interface *set1, +static int compute_and_set_totempg_interfaces(struct totem_interface *set1, struct totem_interface *set2) { int ring_no, set1_pos, set2_pos; struct totem_ip_address empty_ip_address; + int res = 0; memset(&empty_ip_address, 0, sizeof(empty_ip_address)); for (ring_no = 0; ring_no < INTERFACE_MAX; ring_no++) { if (!set1[ring_no].configured && !set2[ring_no].configured) { continue; } for (set1_pos = 0; set1_pos < set1[ring_no].member_count; set1_pos++) { for (set2_pos = 0; set2_pos < set2[ring_no].member_count; set2_pos++) { /* * For current ring_no remove all set1 items existing * in set2 */ if (memcmp(&set1[ring_no].member_list[set1_pos], &set2[ring_no].member_list[set2_pos], sizeof(struct totem_ip_address)) == 0) { memset(&set1[ring_no].member_list[set1_pos], 0, sizeof(struct totem_ip_address)); memset(&set2[ring_no].member_list[set2_pos], 0, sizeof(struct totem_ip_address)); } } } } for (ring_no = 0; ring_no < INTERFACE_MAX; ring_no++) { for (set1_pos = 0; set1_pos < set1[ring_no].member_count; set1_pos++) { /* * All items which remain in set1 and don't exist in set2 any more * have to be removed. */ if (memcmp(&set1[ring_no].member_list[set1_pos], &empty_ip_address, sizeof(empty_ip_address)) != 0) { log_printf(LOGSYS_LEVEL_DEBUG, "removing dynamic member %s for ring %u", totemip_print(&set1[ring_no].member_list[set1_pos]), ring_no); totempg_member_remove(&set1[ring_no].member_list[set1_pos], ring_no); } } if (!set2[ring_no].configured) { continue; } for (set2_pos = 0; set2_pos < set2[ring_no].member_count; set2_pos++) { /* * All items which remain in set2 and don't exist in set1 are new nodes * and have to be added. */ if (memcmp(&set2[ring_no].member_list[set2_pos], &empty_ip_address, sizeof(empty_ip_address)) != 0) { log_printf(LOGSYS_LEVEL_DEBUG, "adding dynamic member %s for ring %u", totemip_print(&set2[ring_no].member_list[set2_pos]), ring_no); - totempg_member_add(&set2[ring_no].member_list[set2_pos], ring_no); + if (totempg_member_add(&set2[ring_no].member_list[set2_pos], ring_no)) { + res = -1; + } } } } + return res; } /* * Configure parameters for links */ static void configure_link_params(struct totem_config *totem_config, icmap_map_t map) { int i; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char *addr_string; int err; int local_node_pos = find_local_node(map, 0); for (i = 0; iinterfaces[i].configured) { continue; } log_printf(LOGSYS_LEVEL_DEBUG, "Configuring link %d params\n", i); snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring%u_addr", local_node_pos, i); if (icmap_get_string_r(map, tmp_key, &addr_string) != CS_OK) { continue; } err = totemip_parse(&totem_config->interfaces[i].local_ip, addr_string, totem_config->ip_version); if (err != 0) { continue; } totem_config->interfaces[i].local_ip.nodeid = totem_config->node_id; /* In case this is a new link, fill in the defaults if there was no interface{} section for it */ if (!totem_config->interfaces[i].knet_link_priority) totem_config->interfaces[i].knet_link_priority = 1; /* knet_ping_interval & knet_ping_timeout are set later once we know all the other params */ if (!totem_config->interfaces[i].knet_ping_precision) totem_config->interfaces[i].knet_ping_precision = KNET_PING_PRECISION; if (!totem_config->interfaces[i].knet_pong_count) totem_config->interfaces[i].knet_pong_count = KNET_PONG_COUNT; if (!totem_config->interfaces[i].knet_transport) totem_config->interfaces[i].knet_transport = KNET_TRANSPORT_UDP; if (!totem_config->interfaces[i].ip_port) totem_config->interfaces[i].ip_port = DEFAULT_PORT + i; } } static void configure_totem_links(struct totem_config *totem_config, icmap_map_t map) { int i; for (i = 0; iinterfaces[i].configured) { continue; } log_printf(LOGSYS_LEVEL_INFO, "Configuring link %d\n", i); totempg_iface_set(&totem_config->interfaces[i].local_ip, totem_config->interfaces[i].ip_port, i); } } /* Check for differences in config that can't be done on-the-fly and print an error */ static int check_things_have_not_changed(struct totem_config *totem_config, const char **error_string) { int i,j,k; const char *ip_str; char addr_buf[INET6_ADDRSTRLEN]; int changed = 0; for (i = 0; iinterfaces[i].configured && totem_config->orig_interfaces[i].configured) { if (totem_config->interfaces[i].knet_transport != totem_config->orig_interfaces[i].knet_transport) { log_printf(LOGSYS_LEVEL_ERROR, "New config has different knet transport for link %d. Internal value was NOT changed.\n", i); changed = 1; } /* Check each nodeid in the new configuration and make sure its IP address on this link has not changed */ for (j=0; j < totem_config->interfaces[i].member_count; j++) { for (k=0; k < totem_config->orig_interfaces[i].member_count; k++) { if (totem_config->interfaces[i].member_list[j].nodeid == totem_config->orig_interfaces[i].member_list[k].nodeid) { /* Found our nodeid - check the IP address */ if (memcmp(&totem_config->interfaces[i].member_list[j], &totem_config->orig_interfaces[i].member_list[k], sizeof(struct totem_ip_address))) { ip_str = totemip_print(&totem_config->orig_interfaces[i].member_list[k]); /* if ip_str is NULL then the old address was invalid and is allowed to change */ if (ip_str) { strncpy(addr_buf, ip_str, sizeof(addr_buf)); addr_buf[sizeof(addr_buf) - 1] = '\0'; log_printf(LOGSYS_LEVEL_ERROR, "new config has different address for link %d (addr changed from %s to %s). Internal value was NOT changed.\n", i, addr_buf, totemip_print(&totem_config->interfaces[i].member_list[j])); changed = 1; } } } } } } } if (changed) { snprintf (error_string_response, sizeof(error_string_response), "To reconfigure an interface it must be deleted and recreated. A working interface needs to be available to corosync at all times"); *error_string = error_string_response; return -1; } return 0; } static int put_nodelist_members_to_config(struct totem_config *totem_config, icmap_map_t map, int reload, const char **error_string) { icmap_iter_t iter, iter2; const char *iter_key, *iter_key2; int res = 0; unsigned int node_pos; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key2[ICMAP_KEYNAME_MAXLEN]; char *node_addr_str; int member_count; unsigned int linknumber = 0; int i, j; int last_node_pos = -1; /* Clear out nodelist so we can put the new one in if needed */ for (i = 0; i < INTERFACE_MAX; i++) { for (j = 0; j < PROCESSOR_COUNT_MAX; j++) { memset(&totem_config->interfaces[i].member_list[j], 0, sizeof(struct totem_ip_address)); } totem_config->interfaces[i].member_count = 0; } iter = icmap_iter_init_r(map, "nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key); if (res != 2) { continue; } /* If it's the same as the last node_pos then skip it */ if (node_pos == last_node_pos) { continue; } last_node_pos = node_pos; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.", node_pos); iter2 = icmap_iter_init_r(map, tmp_key); while ((iter_key2 = icmap_iter_next(iter2, NULL, NULL)) != NULL) { unsigned int nodeid; char *str; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos); if (icmap_get_uint32_r(map, tmp_key, &nodeid) != CS_OK) { nodeid = 0; } res = sscanf(iter_key2, "nodelist.node.%u.ring%u%s", &node_pos, &linknumber, tmp_key2); if (res != 3 || strcmp(tmp_key2, "_addr") != 0) { continue; } if (linknumber >= INTERFACE_MAX) { snprintf (error_string_response, sizeof(error_string_response), "parse error in config: interface ring number %u is bigger than allowed maximum %u\n", linknumber, INTERFACE_MAX - 1); *error_string = error_string_response; icmap_iter_finalize(iter2); icmap_iter_finalize(iter); return (-1); } if (icmap_get_string_r(map, iter_key2, &node_addr_str) != CS_OK) { continue; } /* Generate nodeids if they are not provided and transport is UDP/U */ if (!nodeid && (totem_config->transport_number == TOTEM_TRANSPORT_UDP || totem_config->transport_number == TOTEM_TRANSPORT_UDPU)) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos); if (icmap_get_string_r(map, tmp_key, &str) == CS_OK) { nodeid = generate_nodeid(totem_config, str); if (nodeid == -1) { sprintf(error_string_response, "An IPV6 network requires that a node ID be specified " "for address '%s'.", node_addr_str); *error_string = error_string_response; free(str); return (-1); } log_printf(LOGSYS_LEVEL_DEBUG, "Generated nodeid = " CS_PRI_NODE_ID " for %s", nodeid, str); free(str); /* * Put nodeid back to nodelist to make cfgtool work */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos); /* * Not critical */ (void)icmap_set_uint32_r(map, tmp_key, nodeid); } } if (!nodeid && totem_config->transport_number == TOTEM_TRANSPORT_KNET) { sprintf(error_string_response, "Knet requires an explicit nodeid to be specified " "for address '%s'.", node_addr_str); *error_string = error_string_response; return (-1); } if (totem_config->transport_number == TOTEM_TRANSPORT_KNET && nodeid >= KNET_MAX_HOST) { sprintf(error_string_response, "Knet requires nodeid to be less than %u " "for address '%s'.", KNET_MAX_HOST, node_addr_str); *error_string = error_string_response; return (-1); } member_count = totem_config->interfaces[linknumber].member_count; res = totemip_parse(&totem_config->interfaces[linknumber].member_list[member_count], node_addr_str, totem_config->ip_version); if (res == 0) { totem_config->interfaces[linknumber].member_list[member_count].nodeid = nodeid; totem_config->interfaces[linknumber].member_count++; totem_config->interfaces[linknumber].configured = 1; } else { sprintf(error_string_response, "failed to parse node address '%s'\n", node_addr_str); *error_string = error_string_response; memset(&totem_config->interfaces[linknumber].member_list[member_count], 0, sizeof(struct totem_ip_address)); free(node_addr_str); icmap_iter_finalize(iter2); icmap_iter_finalize(iter); return -1; } free(node_addr_str); } icmap_iter_finalize(iter2); } icmap_iter_finalize(iter); configure_link_params(totem_config, map); if (reload) { log_printf(LOGSYS_LEVEL_DEBUG, "About to reconfigure links from nodelist.\n"); if (check_things_have_not_changed(totem_config, error_string) == -1) { return -1; } } return 0; } static void config_convert_nodelist_to_interface(icmap_map_t map, struct totem_config *totem_config) { int res = 0; int node_pos; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key2[ICMAP_KEYNAME_MAXLEN]; char *node_addr_str; unsigned int linknumber = 0; icmap_iter_t iter; const char *iter_key; node_pos = find_local_node(map, 1); if (node_pos > -1) { /* * We found node, so create interface section */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.", node_pos); iter = icmap_iter_init_r(map, tmp_key); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.ring%u%s", &node_pos, &linknumber, tmp_key2); if (res != 3 || strcmp(tmp_key2, "_addr") != 0) { continue ; } if (icmap_get_string_r(map, iter_key, &node_addr_str) != CS_OK) { continue; } snprintf(tmp_key2, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.bindnetaddr", linknumber); icmap_set_string_r(map, tmp_key2, node_addr_str); free(node_addr_str); } icmap_iter_finalize(iter); } } static int get_interface_params(struct totem_config *totem_config, icmap_map_t map, const char **error_string, uint64_t *warnings, int reload) { int res = 0; unsigned int linknumber = 0; int member_count = 0; int i; icmap_iter_t iter, member_iter; const char *iter_key; const char *member_iter_key; char linknumber_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key[ICMAP_KEYNAME_MAXLEN]; uint8_t u8; uint32_t u32; char *str; char *cluster_name = NULL; enum totem_ip_version_enum tmp_ip_version = TOTEM_IP_VERSION_4; int ret = 0; if (reload) { for (i=0; iinterfaces[i].configured = 0; totem_config->interfaces[i].knet_ping_timeout = 0; totem_config->interfaces[i].knet_ping_interval = 0; totem_config->interfaces[i].knet_ping_precision = KNET_PING_PRECISION; totem_config->interfaces[i].knet_pong_count = KNET_PONG_COUNT; } } if (icmap_get_string_r(map, "totem.cluster_name", &cluster_name) != CS_OK) { cluster_name = NULL; } iter = icmap_iter_init_r(map, "totem.interface."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "totem.interface.%[^.].%s", linknumber_key, tmp_key); if (res != 2) { continue; } if (strcmp(tmp_key, "bindnetaddr") != 0 && totem_config->transport_number == TOTEM_TRANSPORT_UDP) { continue; } member_count = 0; linknumber = atoi(linknumber_key); if (linknumber >= INTERFACE_MAX) { snprintf (error_string_response, sizeof(error_string_response), "parse error in config: interface ring number %u is bigger than allowed maximum %u\n", linknumber, INTERFACE_MAX - 1); *error_string = error_string_response; ret = -1; goto out; } /* These things are only valid for the initial read */ if (!reload) { /* * Get the bind net address */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.bindnetaddr", linknumber); if (icmap_get_string_r(map, tmp_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].bindnet, str, totem_config->ip_version); if (res) { sprintf(error_string_response, "failed to parse bindnet address '%s'\n", str); *error_string = error_string_response; free(str); ret = -1; goto out; } free(str); } /* * Get interface multicast address */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", linknumber); if (icmap_get_string_r(map, tmp_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].mcast_addr, str, totem_config->ip_version); if (res) { sprintf(error_string_response, "failed to parse mcast address '%s'\n", str); *error_string = error_string_response; free(str); ret = -1; goto out; } free(str); } else if (totem_config->transport_number == TOTEM_TRANSPORT_UDP) { /* * User not specified address -> autogenerate one from cluster_name key * (if available). Return code is intentionally ignored, because * udpu doesn't need mcastaddr and validity of mcastaddr for udp is * checked later anyway. */ if (totem_config->interfaces[0].bindnet.family == AF_INET) { tmp_ip_version = TOTEM_IP_VERSION_4; } else if (totem_config->interfaces[0].bindnet.family == AF_INET6) { tmp_ip_version = TOTEM_IP_VERSION_6; } (void)get_cluster_mcast_addr (cluster_name, linknumber, tmp_ip_version, &totem_config->interfaces[linknumber].mcast_addr); } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.broadcast", linknumber); if (icmap_get_string(tmp_key, &str) == CS_OK) { if (strcmp (str, "yes") == 0) { totem_config->broadcast_use = 1; } free(str); } } /* These things are only valid for the initial read OR a newly-defined link */ if (!reload || (totem_config->interfaces[linknumber].configured == 0)) { /* * Get mcast port */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", linknumber); if (icmap_get_uint16_r(map, tmp_key, &totem_config->interfaces[linknumber].ip_port) != CS_OK) { if (totem_config->broadcast_use) { totem_config->interfaces[linknumber].ip_port = DEFAULT_PORT + (2 * linknumber); } else { totem_config->interfaces[linknumber].ip_port = DEFAULT_PORT + linknumber; } } /* * Get the TTL */ totem_config->interfaces[linknumber].ttl = 1; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.ttl", linknumber); if (icmap_get_uint8_r(map, tmp_key, &u8) == CS_OK) { totem_config->interfaces[linknumber].ttl = u8; } totem_config->interfaces[linknumber].knet_transport = KNET_DEFAULT_TRANSPORT; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_transport", linknumber); if (icmap_get_string_r(map, tmp_key, &str) == CS_OK) { if (strcmp(str, "sctp") == 0) { totem_config->interfaces[linknumber].knet_transport = KNET_TRANSPORT_SCTP; } else if (strcmp(str, "udp") == 0) { totem_config->interfaces[linknumber].knet_transport = KNET_TRANSPORT_UDP; } else { *error_string = "Unrecognised knet_transport. expected 'udp' or 'sctp'"; ret = -1; goto out; } } } totem_config->interfaces[linknumber].configured = 1; /* * Get the knet link params */ totem_config->interfaces[linknumber].knet_link_priority = 1; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_link_priority", linknumber); if (icmap_get_uint8_r(map, tmp_key, &u8) == CS_OK) { totem_config->interfaces[linknumber].knet_link_priority = u8; } totem_config->interfaces[linknumber].knet_ping_interval = 0; /* real default applied later */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_interval", linknumber); if (icmap_get_uint32_r(map, tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_interval = u32; } totem_config->interfaces[linknumber].knet_ping_timeout = 0; /* real default applied later */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_timeout", linknumber); if (icmap_get_uint32_r(map, tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_timeout = u32; } totem_config->interfaces[linknumber].knet_ping_precision = KNET_PING_PRECISION; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_precision", linknumber); if (icmap_get_uint32_r(map, tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_precision = u32; } totem_config->interfaces[linknumber].knet_pong_count = KNET_PONG_COUNT; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_pong_count", linknumber); if (icmap_get_uint32_r(map, tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_pong_count = u32; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.member.", linknumber); member_iter = icmap_iter_init_r(map, tmp_key); while ((member_iter_key = icmap_iter_next(member_iter, NULL, NULL)) != NULL) { if (member_count == 0) { if (icmap_get_string("nodelist.node.0.ring0_addr", &str) == CS_OK) { free(str); *warnings |= TOTEM_CONFIG_WARNING_MEMBERS_IGNORED; break; } else { *warnings |= TOTEM_CONFIG_WARNING_MEMBERS_DEPRECATED; } } if (icmap_get_string_r(map, member_iter_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].member_list[member_count++], str, totem_config->ip_version); if (res) { sprintf(error_string_response, "failed to parse node address '%s'\n", str); *error_string = error_string_response; icmap_iter_finalize(member_iter); free(str); ret = -1; goto out; } free(str); } } icmap_iter_finalize(member_iter); totem_config->interfaces[linknumber].member_count = member_count; } out: icmap_iter_finalize(iter); free(cluster_name); return (ret); } extern int totem_config_read ( struct totem_config *totem_config, const char **error_string, uint64_t *warnings) { int res = 0; char *str, *ring0_addr_str; char tmp_key[ICMAP_KEYNAME_MAXLEN]; uint16_t u16; int i; int local_node_pos; uint32_t u32; *warnings = 0; memset (totem_config, 0, sizeof (struct totem_config)); totem_config->interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); if (totem_config->interfaces == 0) { *error_string = "Out of memory trying to allocate ethernet interface storage area"; return -1; } totem_config->transport_number = TOTEM_TRANSPORT_KNET; if (icmap_get_string("totem.transport", &str) == CS_OK) { if (strcmp (str, "udpu") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_UDPU; } else if (strcmp (str, "udp") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_UDP; } else if (strcmp (str, "knet") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_KNET; } else { *error_string = "Invalid transport type. Should be udpu, udp or knet"; free(str); return -1; } free(str); } memset (totem_config->interfaces, 0, sizeof (struct totem_interface) * INTERFACE_MAX); strcpy (totem_config->link_mode, "passive"); icmap_get_uint32("totem.version", (uint32_t *)&totem_config->version); /* initial crypto load */ if (totem_get_crypto(totem_config, icmap_get_global_map(), error_string) != 0) { return -1; } if (totem_config_keyread(totem_config, icmap_get_global_map(), error_string) != 0) { return -1; } totem_config->crypto_index = 1; totem_config->crypto_changed = 0; if (icmap_get_string("totem.link_mode", &str) == CS_OK) { if (strlen(str) >= TOTEM_LINK_MODE_BYTES) { *error_string = "totem.link_mode is too long"; free(str); return -1; } strcpy (totem_config->link_mode, str); free(str); } if (icmap_get_uint32("totem.nodeid", &u32) == CS_OK) { *warnings |= TOTEM_CONFIG_WARNING_TOTEM_NODEID_SET; } totem_config->clear_node_high_bit = 0; if (icmap_get_string("totem.clear_node_high_bit", &str) == CS_OK) { if (strcmp (str, "yes") == 0) { totem_config->clear_node_high_bit = 1; } free(str); } icmap_get_uint32("totem.threads", &totem_config->threads); icmap_get_uint32("totem.netmtu", &totem_config->net_mtu); totem_config->ip_version = totem_config_get_ip_version(totem_config); if (icmap_get_string("totem.interface.0.bindnetaddr", &str) != CS_OK) { /* * We were not able to find ring 0 bindnet addr. Try to use nodelist informations */ config_convert_nodelist_to_interface(icmap_get_global_map(), totem_config); } else { if (icmap_get_string("nodelist.node.0.ring0_addr", &ring0_addr_str) == CS_OK) { /* * Both bindnetaddr and ring0_addr are set. * Log warning information, and use nodelist instead */ *warnings |= TOTEM_CONFIG_BINDNETADDR_NODELIST_SET; config_convert_nodelist_to_interface(icmap_get_global_map(), totem_config); free(ring0_addr_str); } free(str); } /* * Broadcast option is global but set in interface section, * so reset before processing interfaces. */ totem_config->broadcast_use = 0; res = get_interface_params(totem_config, icmap_get_global_map(), error_string, warnings, 0); if (res < 0) { return res; } /* * Use broadcast is global, so if set, make sure to fill mcast addr correctly * broadcast is only supported for UDP so just do interface 0; */ if (totem_config->broadcast_use) { totemip_parse (&totem_config->interfaces[0].mcast_addr, "255.255.255.255", TOTEM_IP_VERSION_4); } /* * Store automatically generated items back to icmap only for UDP */ if (totem_config->transport_number == TOTEM_TRANSPORT_UDP) { for (i = 0; i < INTERFACE_MAX; i++) { if (!totem_config->interfaces[i].configured) { continue; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", i); if (icmap_get_string(tmp_key, &str) == CS_OK) { free(str); } else { str = (char *)totemip_print(&totem_config->interfaces[i].mcast_addr); icmap_set_string(tmp_key, str); } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", i); if (icmap_get_uint16(tmp_key, &u16) != CS_OK) { icmap_set_uint16(tmp_key, totem_config->interfaces[i].ip_port); } } } /* * Check existence of nodelist */ if ((icmap_get_string("nodelist.node.0.name", &str) == CS_OK) || (icmap_get_string("nodelist.node.0.ring0_addr", &str) == CS_OK)) { free(str); /* * find local node */ local_node_pos = find_local_node(icmap_get_global_map(), 1); if (local_node_pos != -1) { assert(totem_config->node_id == 0); snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", local_node_pos); (void)icmap_get_uint32(tmp_key, &totem_config->node_id); if ((totem_config->transport_number == TOTEM_TRANSPORT_KNET) && (!totem_config->node_id)) { *error_string = "Knet requires an explicit nodeid for the local node"; return -1; } if ((totem_config->transport_number == TOTEM_TRANSPORT_UDP || totem_config->transport_number == TOTEM_TRANSPORT_UDPU) && (!totem_config->node_id)) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", local_node_pos); icmap_get_string(tmp_key, &str); totem_config->node_id = generate_nodeid(totem_config, str); if (totem_config->node_id == -1) { *error_string = "An IPV6 network requires that a node ID be specified"; free(str); return (-1); } totem_config->interfaces[0].member_list[local_node_pos].nodeid = totem_config->node_id; free(str); } /* Users must not change this */ icmap_set_ro_access("nodelist.local_node_pos", 0, 1); } if (put_nodelist_members_to_config(totem_config, icmap_get_global_map(), 0, error_string)) { return -1; } } /* * Get things that might change in the future (and can depend on totem_config->interfaces); */ totem_volatile_config_read(totem_config, icmap_get_global_map(), NULL); calc_knet_ping_timers(totem_config); /* This is now done in the totemknet interface callback */ /* configure_totem_links(totem_config, icmap_get_global_map()); */ add_totem_config_notification(totem_config); return 0; } int totem_config_validate ( struct totem_config *totem_config, const char **error_string) { static char local_error_reason[512]; char parse_error[512]; const char *error_reason = local_error_reason; int i; uint32_t u32; int num_configured = 0; unsigned int interface_max = INTERFACE_MAX; for (i = 0; i < INTERFACE_MAX; i++) { if (totem_config->interfaces[i].configured) { num_configured++; } } if (num_configured == 0) { error_reason = "No interfaces defined"; goto parse_error; } /* Check we found a local node name */ if (icmap_get_uint32("nodelist.local_node_pos", &u32) != CS_OK) { error_reason = "No valid name found for local host"; goto parse_error; } for (i = 0; i < INTERFACE_MAX; i++) { /* * Some error checking of parsed data to make sure its valid */ struct totem_ip_address null_addr; if (!totem_config->interfaces[i].configured) { continue; } memset (&null_addr, 0, sizeof (struct totem_ip_address)); if ((totem_config->transport_number == TOTEM_TRANSPORT_UDP) && memcmp (&totem_config->interfaces[i].mcast_addr, &null_addr, sizeof (struct totem_ip_address)) == 0) { snprintf (local_error_reason, sizeof(local_error_reason), "No multicast address specified for interface %u", i); goto parse_error; } if (totem_config->interfaces[i].ip_port == 0) { snprintf (local_error_reason, sizeof(local_error_reason), "No multicast port specified for interface %u", i); goto parse_error; } if (totem_config->interfaces[i].ttl > 255) { snprintf (local_error_reason, sizeof(local_error_reason), "Invalid TTL (should be 0..255) for interface %u", i); goto parse_error; } if (totem_config->transport_number != TOTEM_TRANSPORT_UDP && totem_config->interfaces[i].ttl != 1) { snprintf (local_error_reason, sizeof(local_error_reason), "Can only set ttl on multicast transport types for interface %u", i); goto parse_error; } if (totem_config->interfaces[i].knet_link_priority > 255) { snprintf (local_error_reason, sizeof(local_error_reason), "Invalid link priority (should be 0..255) for interface %u", i); goto parse_error; } if (totem_config->transport_number != TOTEM_TRANSPORT_KNET && totem_config->interfaces[i].knet_link_priority != 1) { snprintf (local_error_reason, sizeof(local_error_reason), "Can only set link priority on knet transport type for interface %u", i); goto parse_error; } if (totem_config->interfaces[i].mcast_addr.family == AF_INET6 && totem_config->node_id == 0) { snprintf (local_error_reason, sizeof(local_error_reason), "An IPV6 network requires that a node ID be specified for interface %u", i); goto parse_error; } if (totem_config->broadcast_use == 0 && totem_config->transport_number == TOTEM_TRANSPORT_UDP) { if (totem_config->interfaces[i].mcast_addr.family != totem_config->interfaces[i].bindnet.family) { snprintf (local_error_reason, sizeof(local_error_reason), "Multicast address family does not match bind address family for interface %u", i); goto parse_error; } if (totemip_is_mcast (&totem_config->interfaces[i].mcast_addr) != 0) { snprintf (local_error_reason, sizeof(local_error_reason), "mcastaddr is not a correct multicast address for interface %u", i); goto parse_error; } } } if (totem_config->version != 2) { error_reason = "This totem parser can only parse version 2 configurations."; goto parse_error; } if (totem_volatile_config_validate(totem_config, icmap_get_global_map(), error_string) == -1) { return (-1); } if (check_for_duplicate_nodeids(totem_config, error_string) == -1) { return (-1); } /* * KNET Link values validation */ if (strcmp (totem_config->link_mode, "active") && strcmp (totem_config->link_mode, "rr") && strcmp (totem_config->link_mode, "passive")) { snprintf (local_error_reason, sizeof(local_error_reason), "The Knet link mode \"%s\" specified is invalid. It must be active, passive or rr.\n", totem_config->link_mode); goto parse_error; } /* Only Knet does multiple interfaces */ if (totem_config->transport_number != TOTEM_TRANSPORT_KNET) { interface_max = 1; } if (interface_max < num_configured) { snprintf (parse_error, sizeof(parse_error), "%d is too many configured interfaces for non-Knet transport.", num_configured); error_reason = parse_error; goto parse_error; } /* Only knet allows crypto */ if (totem_config->transport_number != TOTEM_TRANSPORT_KNET) { if ((strcmp(totem_config->crypto_cipher_type, "none") != 0) || (strcmp(totem_config->crypto_hash_type, "none") != 0)) { snprintf (parse_error, sizeof(parse_error), "crypto_cipher & crypto_hash are only valid for the Knet transport."); error_reason = parse_error; goto parse_error; } } if (totem_config->net_mtu == 0) { if (totem_config->transport_number == TOTEM_TRANSPORT_KNET) { totem_config->net_mtu = KNET_MAX_PACKET_SIZE; } else { totem_config->net_mtu = UDP_NETMTU; } } return 0; parse_error: snprintf (error_string_response, sizeof(error_string_response), "parse error in config: %s\n", error_reason); *error_string = error_string_response; return (-1); } static int read_keyfile ( const char *key_location, struct totem_config *totem_config, const char **error_string) { int fd; int res; int saved_errno; char error_str[100]; const char *error_ptr; fd = open (key_location, O_RDONLY); if (fd == -1) { error_ptr = qb_strerror_r(errno, error_str, sizeof(error_str)); snprintf (error_string_response, sizeof(error_string_response), "Could not open %s: %s\n", key_location, error_ptr); goto parse_error; } res = read (fd, totem_config->private_key, TOTEM_PRIVATE_KEY_LEN_MAX); saved_errno = errno; close (fd); if (res == -1) { error_ptr = qb_strerror_r (saved_errno, error_str, sizeof(error_str)); snprintf (error_string_response, sizeof(error_string_response), "Could not read %s: %s\n", key_location, error_ptr); goto parse_error; } if (res < TOTEM_PRIVATE_KEY_LEN_MIN) { snprintf (error_string_response, sizeof(error_string_response), "Could only read %d bits of minimum %u bits from %s.\n", res * 8, TOTEM_PRIVATE_KEY_LEN_MIN * 8, key_location); goto parse_error; } totem_config->private_key_len = res; return 0; parse_error: *error_string = error_string_response; return (-1); } int totem_config_keyread ( struct totem_config *totem_config, icmap_map_t map, const char **error_string) { int got_key = 0; char *key_location = NULL; int res; size_t key_len; char old_key[TOTEM_PRIVATE_KEY_LEN_MAX]; size_t old_key_len; /* Take a copy so we can see if it has changed */ memcpy(old_key, totem_config->private_key, sizeof(totem_config->private_key)); old_key_len = totem_config->private_key_len; memset (totem_config->private_key, 0, sizeof(totem_config->private_key)); totem_config->private_key_len = 0; if (strcmp(totem_config->crypto_cipher_type, "none") == 0 && strcmp(totem_config->crypto_hash_type, "none") == 0) { return (0); } /* cmap may store the location of the key file */ if (icmap_get_string_r(map, "totem.keyfile", &key_location) == CS_OK) { res = read_keyfile(key_location, totem_config, error_string); free(key_location); if (res) { goto key_error; } got_key = 1; } else { /* Or the key itself may be in the cmap */ if (icmap_get_r(map, "totem.key", NULL, &key_len, NULL) == CS_OK) { if (key_len > sizeof(totem_config->private_key)) { sprintf(error_string_response, "key is too long"); goto key_error; } if (key_len < TOTEM_PRIVATE_KEY_LEN_MIN) { sprintf(error_string_response, "key is too short"); goto key_error; } if (icmap_get_r(map, "totem.key", totem_config->private_key, &key_len, NULL) == CS_OK) { totem_config->private_key_len = key_len; got_key = 1; } else { sprintf(error_string_response, "can't load private key"); goto key_error; } } } /* In desperation we read the default filename */ if (!got_key) { res = read_keyfile(COROSYSCONFDIR "/authkey", totem_config, error_string); if (res) goto key_error; } if (old_key_len != totem_config->private_key_len || memcmp(old_key, totem_config->private_key, sizeof(totem_config->private_key))) { totem_config->crypto_changed = 1; } return (0); key_error: *error_string = error_string_response; return (-1); } int totem_reread_crypto_config(struct totem_config *totem_config, icmap_map_t map, const char **error_string) { if (totem_get_crypto(totem_config, map, error_string) != 0) { return -1; } if (totem_config_keyread(totem_config, map, error_string) != 0) { return -1; } return 0; } static void debug_dump_totem_config(const struct totem_config *totem_config) { log_printf(LOGSYS_LEVEL_DEBUG, "Token Timeout (%d ms) retransmit timeout (%d ms)", totem_config->token_timeout, totem_config->token_retransmit_timeout); if (totem_config->token_warning) { uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100; log_printf(LOGSYS_LEVEL_DEBUG, "Token warning every %d ms (%d%% of Token Timeout)", token_warning_ms, totem_config->token_warning); if (token_warning_ms < totem_config->token_retransmit_timeout) log_printf (LOGSYS_LEVEL_DEBUG, "The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) " "which can lead to spurious token warnings. Consider increasing the token_warning parameter.", token_warning_ms, totem_config->token_retransmit_timeout); } else log_printf(LOGSYS_LEVEL_DEBUG, "Token warnings disabled"); log_printf(LOGSYS_LEVEL_DEBUG, "token hold (%d ms) retransmits before loss (%d retrans)", totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const); log_printf(LOGSYS_LEVEL_DEBUG, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)", totem_config->join_timeout, totem_config->send_join_timeout, totem_config->consensus_timeout, totem_config->merge_timeout); log_printf(LOGSYS_LEVEL_DEBUG, "downcheck (%d ms) fail to recv const (%d msgs)", totem_config->downcheck_timeout, totem_config->fail_to_recv_const); log_printf(LOGSYS_LEVEL_DEBUG, "seqno unchanged const (%d rotations) Maximum network MTU %d", totem_config->seqno_unchanged_const, totem_config->net_mtu); log_printf(LOGSYS_LEVEL_DEBUG, "window size per rotation (%d messages) maximum messages per rotation (%d messages)", totem_config->window_size, totem_config->max_messages); log_printf(LOGSYS_LEVEL_DEBUG, "missed count const (%d messages)", totem_config->miss_count_const); log_printf(LOGSYS_LEVEL_DEBUG, "heartbeat_failures_allowed (%d)", totem_config->heartbeat_failures_allowed); log_printf(LOGSYS_LEVEL_DEBUG, "max_network_delay (%d ms)", totem_config->max_network_delay); } static void totem_change_notify( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { struct totem_config *totem_config = (struct totem_config *)user_data; uint32_t *param; uint8_t reloading; const char *deleted_key = NULL; const char *error_string; /* * If a full reload is in progress then don't do anything until it's done and * can reconfigure it all atomically */ if (icmap_get_uint8("config.reload_in_progress", &reloading) == CS_OK && reloading) return; param = totem_get_param_by_name((struct totem_config *)user_data, key_name); /* * Process change only if changed key is found in totem_config (-> param is not NULL) * or for special key token_coefficient. token_coefficient key is not stored in * totem_config, but it is used for computation of token timeout. */ if (!param && strcmp(key_name, "totem.token_coefficient") != 0) return; /* * Values other than UINT32 are not supported, or needed (yet) */ switch (event) { case ICMAP_TRACK_DELETE: deleted_key = key_name; break; case ICMAP_TRACK_ADD: case ICMAP_TRACK_MODIFY: deleted_key = NULL; break; default: break; } totem_volatile_config_read (totem_config, icmap_get_global_map(), deleted_key); log_printf(LOGSYS_LEVEL_DEBUG, "Totem related config key changed. Dumping actual totem config."); debug_dump_totem_config(totem_config); if (totem_volatile_config_validate(totem_config, icmap_get_global_map(), &error_string) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); /* * TODO: Consider corosync exit and/or load defaults for volatile * values. For now, log error seems to be enough */ } } int totemconfig_configure_new_params( struct totem_config *totem_config, icmap_map_t map, const char **error_string) { uint64_t warnings = 0LL; get_interface_params(totem_config, map, error_string, &warnings, 1); if (put_nodelist_members_to_config (totem_config, map, 1, error_string)) { return -1; } calc_knet_ping_timers(totem_config); log_printf(LOGSYS_LEVEL_DEBUG, "Configuration reloaded. Dumping actual totem config."); debug_dump_totem_config(totem_config); /* Reinstate the local_node_pos */ (void)find_local_node(map, 0); return 0; } -void totemconfig_commit_new_params( +int totemconfig_commit_new_params( struct totem_config *totem_config, icmap_map_t map) { + int res; struct totem_interface *new_interfaces = NULL; new_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); assert(new_interfaces != NULL); memcpy(new_interfaces, totem_config->interfaces, sizeof (struct totem_interface) * INTERFACE_MAX); /* Set link parameters including local_ip */ configure_totem_links(totem_config, map); - /* Add & remove nodes */ - compute_and_set_totempg_interfaces(totem_config->orig_interfaces, new_interfaces); + /* Add & remove nodes & link properties */ + res = compute_and_set_totempg_interfaces(totem_config->orig_interfaces, new_interfaces); /* Does basic global params (like compression) */ totempg_reconfigure(); free(new_interfaces); + return res; /* On a reload this is ignored */ } static void add_totem_config_notification(struct totem_config *totem_config) { icmap_track_t icmap_track; icmap_track_add("totem.", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX, totem_change_notify, totem_config, &icmap_track); } diff --git a/exec/totemconfig.h b/exec/totemconfig.h index 2ea338fe..a0b2e10d 100644 --- a/exec/totemconfig.h +++ b/exec/totemconfig.h @@ -1,93 +1,93 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2012 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMCONFIG_H_DEFINED #define TOTEMCONFIG_H_DEFINED #include #include #include #include #include "totemsrp.h" #define TOTEM_CONFIG_WARNING_MEMBERS_IGNORED (1<<1) #define TOTEM_CONFIG_WARNING_MEMBERS_DEPRECATED (1<<2) #define TOTEM_CONFIG_WARNING_TOTEM_NODEID_SET (1<<3) #define TOTEM_CONFIG_BINDNETADDR_NODELIST_SET (1<<4) extern int totem_config_read ( struct totem_config *totem_config, const char **error_string, uint64_t *warnings); extern int totem_config_validate ( struct totem_config *totem_config, const char **error_string); extern int totem_config_keyread ( struct totem_config *totem_config, icmap_map_t map, const char **error_string); extern int totem_config_find_local_addr_in_nodelist( struct totem_config *totem_config, const char *ipaddr_key_prefix, unsigned int *node_pos); extern void totem_volatile_config_read( struct totem_config *totem_config, icmap_map_t temp_map, const char *deleted_key); extern int totem_reread_crypto_config( struct totem_config *totem_config, icmap_map_t map, const char **error_string); extern int totem_volatile_config_validate( struct totem_config *totem_config, icmap_map_t temp_map, const char **error_string); extern int totemconfig_configure_new_params( struct totem_config *totem_config, icmap_map_t map, const char **error_string); -extern void totemconfig_commit_new_params( +extern int totemconfig_commit_new_params( struct totem_config *totem_config, icmap_map_t map); #endif /* TOTEMCONFIG_H_DEFINED */ diff --git a/exec/totemknet.c b/exec/totemknet.c index 55905ee5..f280a094 100644 --- a/exec/totemknet.c +++ b/exec/totemknet.c @@ -1,2287 +1,2306 @@ /* * Copyright (c) 2016-2022 Red Hat, Inc. * * All rights reserved. * * Author: Christine Caulfield (ccaulfie@redhat.com) * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LIBNOZZLE #include #include #endif #include #include #include #include #include #include "totemknet.h" #include "main.h" #include "util.h" #include #include #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif #ifdef HAVE_LIBNOZZLE static int setup_nozzle(void *knet_context); #endif /* Should match that used by cfg */ #define CFG_INTERFACE_STATUS_MAX_LEN 512 struct totemknet_instance { struct crypto_instance *crypto_inst; qb_loop_t *poll_handle; knet_handle_t knet_handle; int link_mode; void *context; - void (*totemknet_deliver_fn) ( + int (*totemknet_deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from); - void (*totemknet_iface_change_fn) ( + int (*totemknet_iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int link_no); void (*totemknet_mtu_changed) ( void *context, int net_mtu); void (*totemknet_target_set_completed) (void *context); /* * Function and data used to log messages */ int totemknet_log_level_security; int totemknet_log_level_error; int totemknet_log_level_warning; int totemknet_log_level_notice; int totemknet_log_level_debug; int totemknet_subsys_id; int knet_subsys_id; void (*totemknet_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7))); void *knet_context; char iov_buffer[KNET_MAX_PACKET_SIZE]; char *link_status[INTERFACE_MAX]; struct totem_ip_address my_ids[INTERFACE_MAX]; uint16_t ip_port[INTERFACE_MAX]; int our_nodeid; int loopback_link; struct totem_config *totem_config; struct totem_ip_address token_target; qb_loop_timer_handle timer_netif_check_timeout; qb_loop_timer_handle timer_merge_detect_timeout; int send_merge_detect_message; unsigned int merge_detect_messages_sent_before_timeout; int logpipes[2]; int knet_fd; pthread_mutex_t log_mutex; #ifdef HAVE_LIBNOZZLE char *nozzle_name; char *nozzle_ipaddr; char *nozzle_prefix; char *nozzle_macaddr; nozzle_t nozzle_handle; #endif }; /* Awkward. But needed to get stats from knet */ struct totemknet_instance *global_instance; struct work_item { const void *msg; unsigned int msg_len; struct totemknet_instance *instance; }; int totemknet_member_list_rebind_ip ( void *knet_context); static int totemknet_configure_compression ( struct totemknet_instance *instance, struct totem_config *totem_config); static void totemknet_start_merge_detect_timeout( void *knet_context); static void totemknet_stop_merge_detect_timeout( void *knet_context); static void log_flush_messages ( void *knet_context); static void totemknet_instance_initialize (struct totemknet_instance *instance) { int res; memset (instance, 0, sizeof (struct totemknet_instance)); res = pthread_mutex_init(&instance->log_mutex, NULL); /* * There is not too much else what can be done. */ assert(res == 0); } #define knet_log_printf_lock(level, subsys, function, file, line, format, args...) \ do { \ (void)pthread_mutex_lock(&instance->log_mutex); \ instance->totemknet_log_printf ( \ level, subsys, function, file, line, \ (const char *)format, ##args); \ (void)pthread_mutex_unlock(&instance->log_mutex); \ } while (0); #define knet_log_printf(level, format, args...) \ do { \ knet_log_printf_lock ( \ level, instance->totemknet_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ (const char *)format, ##args); \ } while (0); #define libknet_log_printf(level, format, args...) \ do { \ knet_log_printf_lock ( \ level, instance->knet_subsys_id, \ __FUNCTION__, "libknet.h", __LINE__, \ (const char *)format, ##args); \ } while (0); #define KNET_LOGSYS_PERROR(err_num, level, fmt, args...) \ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemknet_log_printf ( \ level, instance->totemknet_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)", ##args, _error_ptr, err_num); \ } while(0) #ifdef HAVE_LIBNOZZLE static inline int is_ether_addr_multicast(const uint8_t *addr) { return (addr[0] & 0x01); } static inline int is_ether_addr_zero(const uint8_t *addr) { return (!addr[0] && !addr[1] && !addr[2] && !addr[3] && !addr[4] && !addr[5]); } static int ether_host_filter_fn(void *private_data, const unsigned char *outdata, ssize_t outdata_len, uint8_t tx_rx, knet_node_id_t this_host_id, knet_node_id_t src_host_id, int8_t *channel, knet_node_id_t *dst_host_ids, size_t *dst_host_ids_entries) { struct ether_header *eth_h = (struct ether_header *)outdata; uint8_t *dst_mac = (uint8_t *)eth_h->ether_dhost; uint16_t dst_host_id; if (is_ether_addr_zero(dst_mac)) return -1; if (is_ether_addr_multicast(dst_mac)) { return 1; } memmove(&dst_host_id, &dst_mac[4], 2); dst_host_ids[0] = ntohs(dst_host_id); *dst_host_ids_entries = 1; return 0; } #endif static int dst_host_filter_callback_fn(void *private_data, const unsigned char *outdata, ssize_t outdata_len, uint8_t tx_rx, knet_node_id_t this_host_id, knet_node_id_t src_host_id, int8_t *channel, knet_node_id_t *dst_host_ids, size_t *dst_host_ids_entries) { struct totem_message_header *header = (struct totem_message_header *)outdata; int res; #ifdef HAVE_LIBNOZZLE if (*channel != 0) { return ether_host_filter_fn(private_data, outdata, outdata_len, tx_rx, this_host_id, src_host_id, channel, dst_host_ids, dst_host_ids_entries); } #endif if (header->target_nodeid) { dst_host_ids[0] = header->target_nodeid; *dst_host_ids_entries = 1; res = 0; /* unicast message */ } else { *dst_host_ids_entries = 0; res = 1; /* multicast message */ } return res; } static void socket_error_callback_fn(void *private_data, int datafd, int8_t channel, uint8_t tx_rx, int error, int errorno) { struct totemknet_instance *instance = (struct totemknet_instance *)private_data; knet_log_printf (LOGSYS_LEVEL_DEBUG, "Knet socket ERROR notification called: txrx=%d, error=%d, errorno=%d", tx_rx, error, errorno); if ((error == -1 && errorno != EAGAIN) || (error == 0)) { knet_handle_remove_datafd(instance->knet_handle, datafd); } } static void host_change_callback_fn(void *private_data, knet_node_id_t host_id, uint8_t reachable, uint8_t remote, uint8_t external) { struct totemknet_instance *instance = (struct totemknet_instance *)private_data; // TODO: what? if anything. knet_log_printf (LOGSYS_LEVEL_DEBUG, "Knet host change callback. nodeid: " CS_PRI_NODE_ID " reachable: %d", host_id, reachable); } static void pmtu_change_callback_fn(void *private_data, unsigned int data_mtu) { struct totemknet_instance *instance = (struct totemknet_instance *)private_data; knet_log_printf (LOGSYS_LEVEL_DEBUG, "Knet pMTU change: %d", data_mtu); /* We don't need to tell corosync the actual knet MTU */ // instance->totemknet_mtu_changed(instance->context, data_mtu); } int totemknet_crypto_set ( void *knet_context, const char *cipher_type, const char *hash_type) { return (0); } static inline void ucast_sendmsg ( struct totemknet_instance *instance, struct totem_ip_address *system_to, const void *msg, unsigned int msg_len) { int res = 0; struct totem_message_header *header = (struct totem_message_header *)msg; struct msghdr msg_ucast; struct iovec iovec; header->target_nodeid = system_to->nodeid; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; /* * Build unicast message */ memset(&msg_ucast, 0, sizeof(msg_ucast)); msg_ucast.msg_iov = (void *)&iovec; msg_ucast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_ucast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_ucast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_ucast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_ucast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_ucast.msg_accrightslen = 0; #endif /* * Transmit unicast message * An error here is recovered by totemsrp */ res = sendmsg (instance->knet_fd, &msg_ucast, MSG_NOSIGNAL); if (res < 0) { KNET_LOGSYS_PERROR (errno, instance->totemknet_log_level_debug, "sendmsg(ucast) failed (non-critical)"); } } static inline void mcast_sendmsg ( struct totemknet_instance *instance, const void *msg, unsigned int msg_len, int only_active) { int res; struct totem_message_header *header = (struct totem_message_header *)msg; struct msghdr msg_mcast; struct iovec iovec; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; header->target_nodeid = 0; /* * Build multicast message */ memset(&msg_mcast, 0, sizeof(msg_mcast)); msg_mcast.msg_iov = (void *)&iovec; msg_mcast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_mcast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_mcast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_mcast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_mcast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_mcast.msg_accrightslen = 0; #endif // log_printf (LOGSYS_LEVEL_DEBUG, "totemknet: mcast_sendmsg. only_active=%d, len=%d", only_active, msg_len); res = sendmsg (instance->knet_fd, &msg_mcast, MSG_NOSIGNAL); if (res < msg_len) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "totemknet: mcast_send sendmsg returned %d", res); } if (!only_active || instance->send_merge_detect_message) { /* * Current message was sent to all nodes */ instance->merge_detect_messages_sent_before_timeout++; instance->send_merge_detect_message = 0; } } static int node_compare(const void *aptr, const void *bptr) { uint16_t a,b; a = *(uint16_t *)aptr; b = *(uint16_t *)bptr; return a > b; } #ifndef OWN_INDEX_NONE #define OWN_INDEX_NONE -1 #endif int totemknet_nodestatus_get ( void *knet_context, unsigned int nodeid, struct totem_node_status *node_status) { int i; int res = 0; struct knet_link_status link_status; struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; struct knet_host_status knet_host_status; uint8_t link_list[KNET_MAX_LINK]; size_t num_links; if (!instance->knet_handle) { return CS_ERR_NOT_EXIST; /* Not using knet */ } if (!node_status) { return CS_ERR_INVALID_PARAM; } res = knet_host_get_status(instance->knet_handle, nodeid, &knet_host_status); if (res) { knet_log_printf (LOGSYS_LEVEL_WARNING, "knet_handle_get_host_status(%d) failed: %d", nodeid, res); return (-1); } node_status->nodeid = nodeid; node_status->reachable = knet_host_status.reachable; node_status->remote = knet_host_status.remote; node_status->external = knet_host_status.external; #ifdef HAVE_KNET_ONWIRE_VER res = knet_handle_get_onwire_ver(instance->knet_handle, nodeid, &node_status->onwire_min, &node_status->onwire_max, &node_status->onwire_ver); if (res) { knet_log_printf (LOGSYS_LEVEL_WARNING, "knet_handle_get_onwire_ver(%d) failed: %d", nodeid, res); return (-1); } #endif /* Get link info */ res = knet_link_get_link_list(instance->knet_handle, nodeid, link_list, &num_links); if (res) { knet_log_printf (LOGSYS_LEVEL_WARNING, "knet_link_get_link_list(%d) failed: %d", nodeid, res); return (-1); } /* node_status[] has been zeroed for us in totempg.c */ for (i=0; i < num_links; i++) { if (!instance->totem_config->interfaces[link_list[i]].configured) { continue; } res = knet_link_get_status(instance->knet_handle, nodeid, link_list[i], &link_status, sizeof(link_status)); if (res == 0) { node_status->link_status[link_list[i]].enabled = link_status.enabled; node_status->link_status[link_list[i]].connected = link_status.connected; node_status->link_status[link_list[i]].dynconnected = link_status.dynconnected; node_status->link_status[link_list[i]].mtu = link_status.mtu; memcpy(node_status->link_status[link_list[i]].src_ipaddr, link_status.src_ipaddr, KNET_MAX_HOST_LEN); memcpy(node_status->link_status[link_list[i]].dst_ipaddr, link_status.dst_ipaddr, KNET_MAX_HOST_LEN); } else { knet_log_printf (LOGSYS_LEVEL_WARNING, "knet_link_get_link_status(%d, %d) failed: %d", nodeid, link_list[i], res); } } return res; } int totemknet_ifaces_get (void *knet_context, char ***status, unsigned int *iface_count) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; struct knet_link_status link_status; knet_node_id_t host_list[KNET_MAX_HOST]; uint8_t link_list[KNET_MAX_LINK]; size_t num_hosts; size_t num_links; size_t link_idx; int i,j; char *ptr; int res = 0; /* * Don't do the whole 'link_info' bit if the caller just wants * a count of interfaces. */ if (status) { int own_idx = OWN_INDEX_NONE; res = knet_host_get_host_list(instance->knet_handle, host_list, &num_hosts); if (res) { return (-1); } qsort(host_list, num_hosts, sizeof(uint16_t), node_compare); for (j=0; jour_nodeid) { own_idx = j; break; } } for (i=0; ilink_status[i], 'd', CFG_INTERFACE_STATUS_MAX_LEN-1); if (own_idx != OWN_INDEX_NONE) { instance->link_status[i][own_idx] = 'n'; } instance->link_status[i][num_hosts] = '\0'; } /* This is all a bit "inside-out" because "status" is a set of strings per link * and knet orders things by host */ for (j=0; jknet_handle, host_list[j], link_list, &num_links); if (res) { return (-1); } link_idx = 0; for (i=0; i < num_links; i++) { /* * Skip over links that are unconfigured to corosync. This is basically * link0 if corosync isn't using it for comms, as we will still * have it set up for loopback. */ if (!instance->totem_config->interfaces[link_list[i]].configured) { continue; } ptr = instance->link_status[link_idx++]; res = knet_link_get_status(instance->knet_handle, host_list[j], link_list[i], &link_status, sizeof(link_status)); if (res == 0) { ptr[j] = '0' + (link_status.enabled | link_status.connected<<1 | link_status.dynconnected<<2); } else { knet_log_printf (LOGSYS_LEVEL_ERROR, "totemknet_ifaces_get: Cannot get link status: %s", strerror(errno)); ptr[j] = '?'; } } } *status = instance->link_status; } *iface_count = INTERFACE_MAX; return (res); } int totemknet_finalize ( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; int i,j; static knet_node_id_t nodes[KNET_MAX_HOST]; /* static to save stack */ uint8_t links[KNET_MAX_LINK]; size_t num_nodes; size_t num_links; knet_log_printf(LOG_DEBUG, "totemknet: finalize"); qb_loop_poll_del (instance->poll_handle, instance->logpipes[0]); qb_loop_poll_del (instance->poll_handle, instance->knet_fd); /* * Disable forwarding to make knet flush send queue. This ensures that the LEAVE message will be sent. */ res = knet_handle_setfwd(instance->knet_handle, 0); if (res) { knet_log_printf (LOGSYS_LEVEL_CRIT, "totemknet: knet_handle_setfwd failed: %s", strerror(errno)); } res = knet_host_get_host_list(instance->knet_handle, nodes, &num_nodes); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Cannot get knet node list for shutdown: %s", strerror(errno)); /* Crash out anyway */ goto finalise_error; } /* Tidily shut down all nodes & links. */ for (i=0; iknet_handle, nodes[i], links, &num_links); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Cannot get knet link list for node " CS_PRI_NODE_ID ": %s", nodes[i], strerror(errno)); goto finalise_error; } for (j=0; jknet_handle, nodes[i], links[j], 0); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "totemknet: knet_link_set_enable(node " CS_PRI_NODE_ID ", link %d) failed: %s", nodes[i], links[j], strerror(errno)); } res = knet_link_clear_config(instance->knet_handle, nodes[i], links[j]); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "totemknet: knet_link_clear_config(node " CS_PRI_NODE_ID ", link %d) failed: %s", nodes[i], links[j], strerror(errno)); } } res = knet_host_remove(instance->knet_handle, nodes[i]); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "totemknet: knet_host_remove(node " CS_PRI_NODE_ID ") failed: %s", nodes[i], strerror(errno)); } } finalise_error: res = knet_handle_free(instance->knet_handle); if (res) { knet_log_printf (LOGSYS_LEVEL_CRIT, "totemknet: knet_handle_free failed: %s", strerror(errno)); } totemknet_stop_merge_detect_timeout(instance); log_flush_messages(instance); /* * Error is deliberately ignored */ (void)pthread_mutex_destroy(&instance->log_mutex); return (res); } static int log_deliver_fn ( int fd, int revents, void *data) { struct totemknet_instance *instance = (struct totemknet_instance *)data; char buffer[sizeof(struct knet_log_msg)*4]; char *bufptr = buffer; int done = 0; int len; len = read(fd, buffer, sizeof(buffer)); while (done < len) { struct knet_log_msg *msg = (struct knet_log_msg *)bufptr; switch (msg->msglevel) { case KNET_LOG_ERR: libknet_log_printf (LOGSYS_LEVEL_ERROR, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; case KNET_LOG_WARN: libknet_log_printf (LOGSYS_LEVEL_WARNING, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; case KNET_LOG_INFO: libknet_log_printf (LOGSYS_LEVEL_INFO, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; case KNET_LOG_DEBUG: libknet_log_printf (LOGSYS_LEVEL_DEBUG, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; #ifdef KNET_LOG_TRACE case KNET_LOG_TRACE: libknet_log_printf (LOGSYS_LEVEL_TRACE, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; #endif } bufptr += sizeof(struct knet_log_msg); done += sizeof(struct knet_log_msg); } return 0; } static int data_deliver_fn ( int fd, int revents, void *data) { struct totemknet_instance *instance = (struct totemknet_instance *)data; struct msghdr msg_hdr; struct iovec iov_recv; struct sockaddr_storage system_from; ssize_t msg_len; int truncated_packet; iov_recv.iov_base = instance->iov_buffer; iov_recv.iov_len = KNET_MAX_PACKET_SIZE; msg_hdr.msg_name = &system_from; msg_hdr.msg_namelen = sizeof (struct sockaddr_storage); msg_hdr.msg_iov = &iov_recv; msg_hdr.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_hdr.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_hdr.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_hdr.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_hdr.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_hdr.msg_accrightslen = 0; #endif msg_len = recvmsg (fd, &msg_hdr, MSG_NOSIGNAL | MSG_DONTWAIT); if (msg_len <= 0) { return (0); } truncated_packet = 0; #ifdef HAVE_MSGHDR_FLAGS if (msg_hdr.msg_flags & MSG_TRUNC) { truncated_packet = 1; } #else /* * We don't have MSGHDR_FLAGS, but we can (hopefully) safely make assumption that * if bytes_received == KNET_MAX_PACKET_SIZE then packet is truncated */ if (bytes_received == KNET_MAX_PACKET_SIZE) { truncated_packet = 1; } #endif if (truncated_packet) { knet_log_printf(instance->totemknet_log_level_error, "Received too big message. This may be because something bad is happening" "on the network (attack?), or you tried join more nodes than corosync is" "compiled with (%u) or bug in the code (bad estimation of " "the KNET_MAX_PACKET_SIZE). Dropping packet.", PROCESSOR_COUNT_MAX); return (0); } /* * Handle incoming message */ instance->totemknet_deliver_fn ( instance->context, instance->iov_buffer, msg_len, &system_from); return (0); } static void timer_function_netif_check_timeout ( void *data) { struct totemknet_instance *instance = (struct totemknet_instance *)data; int i; + int res = 0; for (i=0; i < INTERFACE_MAX; i++) { if (!instance->totem_config->interfaces[i].configured) { continue; } - instance->totemknet_iface_change_fn (instance->context, - &instance->my_ids[i], - i); + res = instance->totemknet_iface_change_fn (instance->context, + &instance->my_ids[i], + i); + } + if (res != 0) { + /* This is only called at startup, so we can quit here. + Refresh takes a different path */ + corosync_exit_error(COROSYNC_DONE_MAINCONFIGREAD); } } static void knet_set_access_list_config(struct totemknet_instance *instance) { #ifdef HAVE_KNET_ACCESS_LIST uint32_t value; cs_error_t err; value = instance->totem_config->block_unlisted_ips; knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet_enable access list: %d", value); err = knet_handle_enable_access_lists(instance->knet_handle, value); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_enable_access_lists failed"); } #endif } void totemknet_configure_log_level() { int logsys_log_mode; int knet_log_mode = KNET_LOG_INFO; uint8_t s; int err; if (!global_instance || !global_instance->knet_handle) { return; } /* Reconfigure logging level */ logsys_log_mode = logsys_config_debug_get("KNET"); switch (logsys_log_mode) { case LOGSYS_DEBUG_OFF: knet_log_mode = KNET_LOG_INFO; break; case LOGSYS_DEBUG_ON: knet_log_mode = KNET_LOG_DEBUG; break; case LOGSYS_DEBUG_TRACE: #ifdef KNET_LOG_TRACE knet_log_mode = KNET_LOG_TRACE; #else knet_log_mode = KNET_LOG_DEBUG; #endif break; } log_printf (LOGSYS_LEVEL_DEBUG, "totemknet setting log level %s", knet_log_get_loglevel_name(knet_log_mode)); err = 0; for (s = 0; sknet_handle, s, knet_log_mode); } /* If one fails, they all fail. no point in issuing KNET_MAX_SUBSYSTEMS errors */ if (err) { log_printf (LOGSYS_LEVEL_ERROR, "totemknet failed to set log level: %s", strerror(errno)); } } /* NOTE: this relies on the fact that totem_reload_notify() is called first */ static void totemknet_refresh_config( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { uint8_t reloading; int after_reload; uint32_t link_no; size_t num_nodes; knet_node_id_t host_ids[KNET_MAX_HOST]; int i; int err; struct totemknet_instance *instance = (struct totemknet_instance *)user_data; ENTER(); /* * If a full reload is in progress then don't do anything until it's done and * can reconfigure it all atomically */ if (icmap_get_uint8("config.totemconfig_reload_in_progress", &reloading) == CS_OK && reloading) { return; } after_reload = (strcmp(key_name, "config.totemconfig_reload_in_progress") == 0); knet_set_access_list_config(instance); if (strcmp(key_name, "totem.knet_pmtud_interval") == 0 || after_reload) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet_pmtud_interval now %u", instance->totem_config->knet_pmtud_interval); err = knet_handle_pmtud_setfreq(instance->knet_handle, instance->totem_config->knet_pmtud_interval); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_pmtud_setfreq failed"); } } if (strcmp(key_name, "totem.knet_mtu") == 0 || after_reload) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet_mtu now %u", instance->totem_config->knet_mtu); err = knet_handle_pmtud_set(instance->knet_handle, instance->totem_config->knet_mtu); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_pmtud failed"); } } /* Configure link parameters for each node */ err = knet_host_get_host_list(instance->knet_handle, host_ids, &num_nodes); if (err != 0) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_host_get_host_list failed"); } for (i=0; iour_nodeid || !instance->totem_config->interfaces[link_no].configured) { continue; } err = knet_link_set_ping_timers(instance->knet_handle, host_ids[i], link_no, instance->totem_config->interfaces[link_no].knet_ping_interval, instance->totem_config->interfaces[link_no].knet_ping_timeout, instance->totem_config->interfaces[link_no].knet_ping_precision); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_ping_timers for node " CS_PRI_NODE_ID " link %d failed", host_ids[i], link_no); } err = knet_link_set_pong_count(instance->knet_handle, host_ids[i], link_no, instance->totem_config->interfaces[link_no].knet_pong_count); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_pong_count for node " CS_PRI_NODE_ID " link %d failed",host_ids[i], link_no); } err = knet_link_set_priority(instance->knet_handle, host_ids[i], link_no, instance->totem_config->interfaces[link_no].knet_link_priority); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_priority for node " CS_PRI_NODE_ID " link %d failed", host_ids[i], link_no); } } } /* Log levels get reconfigured from logconfig.c as that happens last in the reload */ LEAVE(); } static void totemknet_add_config_notifications(struct totemknet_instance *instance) { icmap_track_t icmap_track_totem = NULL; icmap_track_t icmap_track_reload = NULL; ENTER(); icmap_track_add("totem.", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX, totemknet_refresh_config, instance, &icmap_track_totem); icmap_track_add("config.totemconfig_reload_in_progress", ICMAP_TRACK_ADD | ICMAP_TRACK_MODIFY, totemknet_refresh_config, instance, &icmap_track_reload); LEAVE(); } static int totemknet_is_crypto_enabled(const struct totemknet_instance *instance) { return (!(strcmp(instance->totem_config->crypto_cipher_type, "none") == 0 && strcmp(instance->totem_config->crypto_hash_type, "none") == 0)); } static int totemknet_set_knet_crypto(struct totemknet_instance *instance) { struct knet_handle_crypto_cfg crypto_cfg; int res; /* These have already been validated */ memcpy(crypto_cfg.crypto_model, instance->totem_config->crypto_model, sizeof(crypto_cfg.crypto_model)); memcpy(crypto_cfg.crypto_cipher_type, instance->totem_config->crypto_cipher_type, sizeof(crypto_cfg.crypto_model)); memcpy(crypto_cfg.crypto_hash_type, instance->totem_config->crypto_hash_type, sizeof(crypto_cfg.crypto_model)); memcpy(crypto_cfg.private_key, instance->totem_config->private_key, instance->totem_config->private_key_len); crypto_cfg.private_key_len = instance->totem_config->private_key_len; #ifdef HAVE_KNET_CRYPTO_RECONF knet_log_printf(LOGSYS_LEVEL_DEBUG, "Configuring crypto %s/%s/%s on index %d", crypto_cfg.crypto_model, crypto_cfg.crypto_cipher_type, crypto_cfg.crypto_hash_type, instance->totem_config->crypto_index ); /* If crypto is being disabled we need to explicitly allow cleartext traffic in knet */ if (!totemknet_is_crypto_enabled(instance)) { res = knet_handle_crypto_rx_clear_traffic(instance->knet_handle, KNET_CRYPTO_RX_ALLOW_CLEAR_TRAFFIC); if (res) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_rx_clear_traffic(ALLOW) failed %s", strerror(errno)); } } /* use_config will be called later when all nodes are synced */ res = knet_handle_crypto_set_config(instance->knet_handle, &crypto_cfg, instance->totem_config->crypto_index); if (res == -1) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_set_config (index %d) failed: %s", instance->totem_config->crypto_index, strerror(errno)); goto exit_error; } if (res == -2) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_set_config (index %d) failed: -2", instance->totem_config->crypto_index); goto exit_error; } #else knet_log_printf(LOGSYS_LEVEL_DEBUG, "Configuring crypto %s/%s/%s", crypto_cfg.crypto_model, crypto_cfg.crypto_cipher_type, crypto_cfg.crypto_hash_type ); res = knet_handle_crypto(instance->knet_handle, &crypto_cfg); if (res == -1) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto failed: %s", strerror(errno)); goto exit_error; } if (res == -2) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto failed: -2"); goto exit_error; } #endif exit_error: return res; } /* * Create an instance */ int totemknet_initialize ( qb_loop_t *poll_handle, void **knet_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int link_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)) { struct totemknet_instance *instance; char *tmp_str; int8_t channel=0; int allow_knet_handle_fallback=0; int res; int i; instance = malloc (sizeof (struct totemknet_instance)); if (instance == NULL) { return (-1); } totemknet_instance_initialize (instance); instance->totem_config = totem_config; /* * Configure logging */ instance->totemknet_log_level_security = 1; //totem_config->totem_logging_configuration.log_level_security; instance->totemknet_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemknet_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemknet_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemknet_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemknet_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemknet_log_printf = totem_config->totem_logging_configuration.log_printf; instance->knet_subsys_id = _logsys_subsys_create("KNET", "libknet.h"); /* * Initialize local variables for totemknet */ instance->our_nodeid = instance->totem_config->node_id; for (i=0; i< INTERFACE_MAX; i++) { totemip_copy(&instance->my_ids[i], &totem_config->interfaces[i].bindnet); instance->my_ids[i].nodeid = instance->our_nodeid; instance->ip_port[i] = totem_config->interfaces[i].ip_port; /* Needed for totemsrp */ totem_config->interfaces[i].boundto.nodeid = instance->our_nodeid; } instance->poll_handle = poll_handle; instance->context = context; instance->totemknet_deliver_fn = deliver_fn; instance->totemknet_iface_change_fn = iface_change_fn; instance->totemknet_mtu_changed = mtu_changed; instance->totemknet_target_set_completed = target_set_completed; instance->loopback_link = 0; res = pipe(instance->logpipes); if (res == -1) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_CRIT, "failed to create pipe for instance->logpipes"); goto exit_error; } if (fcntl(instance->logpipes[0], F_SETFL, O_NONBLOCK) == -1 || fcntl(instance->logpipes[1], F_SETFL, O_NONBLOCK) == -1) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_CRIT, "failed to set O_NONBLOCK flag for instance->logpipes"); goto exit_error; } if (icmap_get_string("system.allow_knet_handle_fallback", &tmp_str) == CS_OK) { if (strcmp(tmp_str, "yes") == 0) { allow_knet_handle_fallback = 1; } free(tmp_str); } #if defined(KNET_API_VER) && (KNET_API_VER == 2) instance->knet_handle = knet_handle_new(instance->totem_config->node_id, instance->logpipes[1], KNET_LOG_DEBUG, KNET_HANDLE_FLAG_PRIVILEGED); #else instance->knet_handle = knet_handle_new(instance->totem_config->node_id, instance->logpipes[1], KNET_LOG_DEBUG); #endif if (allow_knet_handle_fallback && !instance->knet_handle && errno == ENAMETOOLONG) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_new failed, trying unprivileged"); #if defined(KNET_API_VER) && (KNET_API_VER == 2) instance->knet_handle = knet_handle_new(instance->totem_config->node_id, instance->logpipes[1], KNET_LOG_DEBUG, 0); #else instance->knet_handle = knet_handle_new_ex(instance->totem_config->node_id, instance->logpipes[1], KNET_LOG_DEBUG, 0); #endif } if (!instance->knet_handle) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_CRIT, "knet_handle_new failed"); goto exit_error; } knet_set_access_list_config(instance); res = knet_handle_pmtud_setfreq(instance->knet_handle, instance->totem_config->knet_pmtud_interval); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_pmtud_setfreq failed"); } res = knet_handle_pmtud_set(instance->knet_handle, instance->totem_config->knet_mtu); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_pmtud_set failed"); } res = knet_handle_enable_filter(instance->knet_handle, instance, dst_host_filter_callback_fn); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_enable_filter failed"); } res = knet_handle_enable_sock_notify(instance->knet_handle, instance, socket_error_callback_fn); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_enable_sock_notify failed"); } res = knet_host_enable_status_change_notify(instance->knet_handle, instance, host_change_callback_fn); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_host_enable_status_change_notify failed"); } res = knet_handle_enable_pmtud_notify(instance->knet_handle, instance, pmtu_change_callback_fn); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_enable_pmtud_notify failed"); } global_instance = instance; /* Setup knet logging level */ totemknet_configure_log_level(); /* Get an fd into knet */ instance->knet_fd = 0; res = knet_handle_add_datafd(instance->knet_handle, &instance->knet_fd, &channel); if (res) { knet_log_printf(LOG_DEBUG, "knet_handle_add_datafd failed: %s", strerror(errno)); goto exit_error; } /* Enable crypto if requested */ #ifdef HAVE_KNET_CRYPTO_RECONF if (totemknet_is_crypto_enabled(instance)) { res = totemknet_set_knet_crypto(instance); if (res == 0) { res = knet_handle_crypto_use_config(instance->knet_handle, totem_config->crypto_index); if (res) { knet_log_printf(LOG_DEBUG, "knet_handle_crypto_use_config failed: %s", strerror(errno)); goto exit_error; } } else { knet_log_printf(LOG_DEBUG, "Failed to set up knet crypto"); goto exit_error; } res = knet_handle_crypto_rx_clear_traffic(instance->knet_handle, KNET_CRYPTO_RX_DISALLOW_CLEAR_TRAFFIC); if (res) { knet_log_printf(LOG_DEBUG, "knet_handle_crypto_rx_clear_traffic (DISALLOW) failed: %s", strerror(errno)); goto exit_error; } } else { res = knet_handle_crypto_rx_clear_traffic(instance->knet_handle, KNET_CRYPTO_RX_ALLOW_CLEAR_TRAFFIC); if (res) { knet_log_printf(LOG_DEBUG, "knet_handle_crypto_rx_clear_traffic (ALLOW) failed: %s", strerror(errno)); goto exit_error; } } #else if (totemknet_is_crypto_enabled(instance)) { res = totemknet_set_knet_crypto(instance); if (res) { knet_log_printf(LOG_DEBUG, "Failed to set up knet crypto"); goto exit_error; } } #endif /* Set up compression */ if (strcmp(totem_config->knet_compression_model, "none") != 0) { /* Not fatal, but will log */ (void)totemknet_configure_compression(instance, totem_config); } knet_handle_setfwd(instance->knet_handle, 1); instance->link_mode = KNET_LINK_POLICY_PASSIVE; if (strcmp(instance->totem_config->link_mode, "active")==0) { instance->link_mode = KNET_LINK_POLICY_ACTIVE; } if (strcmp(instance->totem_config->link_mode, "rr")==0) { instance->link_mode = KNET_LINK_POLICY_RR; } for (i=0; ilink_status[i] = malloc(CFG_INTERFACE_STATUS_MAX_LEN); if (!instance->link_status[i]) { goto exit_error; } } qb_loop_poll_add (instance->poll_handle, QB_LOOP_MED, instance->logpipes[0], POLLIN, instance, log_deliver_fn); qb_loop_poll_add (instance->poll_handle, QB_LOOP_HIGH, instance->knet_fd, POLLIN, instance, data_deliver_fn); /* * Upper layer isn't ready to receive message because it hasn't * initialized yet. Add short timer to check the interfaces. */ qb_loop_timer_add (instance->poll_handle, QB_LOOP_MED, 100*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); totemknet_start_merge_detect_timeout(instance); /* Start listening for config changes */ totemknet_add_config_notifications(instance); /* Add stats keys to icmap */ stats_knet_add_handle(); knet_log_printf (LOGSYS_LEVEL_INFO, "totemknet initialized"); *knet_context = instance; return (0); exit_error: log_flush_messages(instance); free(instance); return (-1); } void *totemknet_buffer_alloc (void) { /* Need to have space for a message AND a struct mcast in case of encapsulated messages */ return malloc(KNET_MAX_PACKET_SIZE + 512); } void totemknet_buffer_release (void *ptr) { return free (ptr); } int totemknet_processor_count_set ( void *knet_context, int processor_count) { return (0); } int totemknet_recv_flush (void *knet_context) { return (0); } int totemknet_send_flush (void *knet_context) { return (0); } int totemknet_token_send ( void *knet_context, const void *msg, unsigned int msg_len) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; ucast_sendmsg (instance, &instance->token_target, msg, msg_len); return (res); } int totemknet_mcast_flush_send ( void *knet_context, const void *msg, unsigned int msg_len) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; mcast_sendmsg (instance, msg, msg_len, 0); return (res); } int totemknet_mcast_noflush_send ( void *knet_context, const void *msg, unsigned int msg_len) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; mcast_sendmsg (instance, msg, msg_len, 1); return (res); } extern int totemknet_iface_check (void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; knet_log_printf(LOG_DEBUG, "totemknet: iface_check"); return (res); } extern void totemknet_net_mtu_adjust (void *knet_context, struct totem_config *totem_config) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; knet_log_printf(LOG_DEBUG, "totemknet: Returning MTU of %d", totem_config->net_mtu); } int totemknet_token_target_set ( void *knet_context, unsigned int nodeid) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; instance->token_target.nodeid = nodeid; instance->totemknet_target_set_completed (instance->context); return (res); } extern int totemknet_recv_mcast_empty ( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; unsigned int res; struct sockaddr_storage system_from; struct msghdr msg_hdr; struct iovec iov_recv; struct pollfd ufd; int nfds; int msg_processed = 0; iov_recv.iov_base = instance->iov_buffer; iov_recv.iov_len = KNET_MAX_PACKET_SIZE; msg_hdr.msg_name = &system_from; msg_hdr.msg_namelen = sizeof (struct sockaddr_storage); msg_hdr.msg_iov = &iov_recv; msg_hdr.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_hdr.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_hdr.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_hdr.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_msg_hdr.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_msg_hdr.msg_accrightslen = 0; #endif do { ufd.fd = instance->knet_fd; ufd.events = POLLIN; nfds = poll (&ufd, 1, 0); if (nfds == 1 && ufd.revents & POLLIN) { res = recvmsg (instance->knet_fd, &msg_hdr, MSG_NOSIGNAL | MSG_DONTWAIT); if (res != -1) { msg_processed = 1; } else { msg_processed = -1; } } } while (nfds == 1); return (msg_processed); } int totemknet_iface_set (void *knet_context, const struct totem_ip_address *local_addr, unsigned short ip_port, unsigned int iface_no) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; totemip_copy(&instance->my_ids[iface_no], local_addr); knet_log_printf(LOG_INFO, "Configured link number %d: local addr: %s, port=%d", iface_no, totemip_print(local_addr), ip_port); instance->ip_port[iface_no] = ip_port; return 0; } int totemknet_member_add ( void *knet_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int link_no) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int err; int port = instance->ip_port[link_no]; struct sockaddr_storage remote_ss; struct sockaddr_storage local_ss; int addrlen; int i; int host_found = 0; knet_node_id_t host_ids[KNET_MAX_HOST]; size_t num_host_ids; /* Only create 1 loopback link and use link 0 */ if (member->nodeid == instance->our_nodeid) { if (!instance->loopback_link) { link_no = 0; instance->loopback_link = 1; } else { /* Already done */ return 0; } } knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: member_add: " CS_PRI_NODE_ID " (%s), link=%d", member->nodeid, totemip_print(member), link_no); knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: local: " CS_PRI_NODE_ID " (%s)", local->nodeid, totemip_print(local)); /* Only add the host if it doesn't already exist in knet */ err = knet_host_get_host_list(instance->knet_handle, host_ids, &num_host_ids); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_host_get_host_list"); return -1; } for (i=0; inodeid) { host_found = 1; } } if (!host_found) { err = knet_host_add(instance->knet_handle, member->nodeid); if (err != 0 && errno != EEXIST) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_host_add"); return -1; } } else { knet_log_printf (LOGSYS_LEVEL_DEBUG, "nodeid " CS_PRI_NODE_ID " already added", member->nodeid); } if (err == 0) { if (knet_host_set_policy(instance->knet_handle, member->nodeid, instance->link_mode)) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_set_policy failed"); return -1; } } memset(&local_ss, 0, sizeof(local_ss)); memset(&remote_ss, 0, sizeof(remote_ss)); /* Casts to remove const */ totemip_totemip_to_sockaddr_convert((struct totem_ip_address *)member, port, &remote_ss, &addrlen); totemip_totemip_to_sockaddr_convert((struct totem_ip_address *)local, port, &local_ss, &addrlen); if (member->nodeid == instance->our_nodeid) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: loopback link is %d\n", link_no); err = knet_link_set_config(instance->knet_handle, member->nodeid, link_no, KNET_TRANSPORT_LOOPBACK, &local_ss, &remote_ss, KNET_LINK_FLAG_TRAFFICHIPRIO); } else { err = knet_link_set_config(instance->knet_handle, member->nodeid, link_no, instance->totem_config->interfaces[link_no].knet_transport, &local_ss, &remote_ss, KNET_LINK_FLAG_TRAFFICHIPRIO); } if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_config failed"); return -1; } knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: member_add: Setting link prio to %d", instance->totem_config->interfaces[link_no].knet_link_priority); err = knet_link_set_priority(instance->knet_handle, member->nodeid, link_no, instance->totem_config->interfaces[link_no].knet_link_priority); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_priority for nodeid " CS_PRI_NODE_ID ", link %d failed", member->nodeid, link_no); } - /* ping timeouts maybe 0 here for a newly added interface so we leave this till later, it will - get done in totemknet_refresh_config */ + /* + * Ping timeouts may be 0 here for a newly added interface (on a reload), + * so we leave this till later, it will get done in totemknet_refresh_config. + * For the initial startup, we are all preset and ready to go from here. + */ if (instance->totem_config->interfaces[link_no].knet_ping_interval != 0) { err = knet_link_set_ping_timers(instance->knet_handle, member->nodeid, link_no, instance->totem_config->interfaces[link_no].knet_ping_interval, instance->totem_config->interfaces[link_no].knet_ping_timeout, instance->totem_config->interfaces[link_no].knet_ping_precision); if (err) { + /* Flush logs before reporting this error so that the knet message prints before ours */ + int saved_errno = errno; + log_flush_messages(instance); + errno = saved_errno; KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_ping_timers for nodeid " CS_PRI_NODE_ID ", link %d failed", member->nodeid, link_no); + return -1; } err = knet_link_set_pong_count(instance->knet_handle, member->nodeid, link_no, instance->totem_config->interfaces[link_no].knet_pong_count); if (err) { + /* Flush logs before reporting this error so that the knet message prints before ours */ + int saved_errno = errno; + log_flush_messages(instance); + errno = saved_errno; KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_pong_count for nodeid " CS_PRI_NODE_ID ", link %d failed", member->nodeid, link_no); + return -1; } } err = knet_link_set_enable(instance->knet_handle, member->nodeid, link_no, 1); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_enable for nodeid " CS_PRI_NODE_ID ", link %d failed", member->nodeid, link_no); return -1; } /* register stats */ stats_knet_add_member(member->nodeid, link_no); return (0); } int totemknet_member_remove ( void *knet_context, const struct totem_ip_address *token_target, int link_no) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res; uint8_t link_list[KNET_MAX_LINK]; size_t num_links; knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: member_remove: " CS_PRI_NODE_ID ", link=%d", token_target->nodeid, link_no); /* Don't remove the link with the loopback on it until we shut down */ if (token_target->nodeid == instance->our_nodeid) { return 0; } /* Tidy stats */ stats_knet_del_member(token_target->nodeid, link_no); /* Remove the link first */ res = knet_link_set_enable(instance->knet_handle, token_target->nodeid, link_no, 0); if (res != 0) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set enable(off) for nodeid " CS_PRI_NODE_ID ", link %d failed", token_target->nodeid, link_no); return res; } res = knet_link_clear_config(instance->knet_handle, token_target->nodeid, link_no); if (res != 0) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_clear_config for nodeid " CS_PRI_NODE_ID ", link %d failed", token_target->nodeid, link_no); return res; } /* If this is the last link, then remove the node */ res = knet_link_get_link_list(instance->knet_handle, token_target->nodeid, link_list, &num_links); if (res) { return (0); /* not really failure */ } if (num_links == 0) { res = knet_host_remove(instance->knet_handle, token_target->nodeid); } return res; } int totemknet_member_list_rebind_ip ( void *knet_context) { return (0); } static int totemknet_configure_compression ( struct totemknet_instance *instance, struct totem_config *totem_config) { struct knet_handle_compress_cfg compress_cfg; int res = 0; assert(strlen(totem_config->knet_compression_model) < sizeof(compress_cfg.compress_model)); strcpy(compress_cfg.compress_model, totem_config->knet_compression_model); compress_cfg.compress_threshold = totem_config->knet_compression_threshold; compress_cfg.compress_level = totem_config->knet_compression_level; res = knet_handle_compress(instance->knet_handle, &compress_cfg); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_handle_compress failed"); } return res; } int totemknet_reconfigure ( void *knet_context, struct totem_config *totem_config) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; (void)totemknet_configure_compression(instance, totem_config); #ifdef HAVE_LIBNOZZLE /* Set up nozzle device(s). Return code is ignored, because inability * configure nozzle is not fatal problem, errors are logged and * there is not much else we can do */ (void)setup_nozzle(instance); #endif if (totem_config->crypto_changed) { /* Flip crypto_index */ totem_config->crypto_index = 3-totem_config->crypto_index; res = totemknet_set_knet_crypto(instance); knet_log_printf(LOG_INFO, "kronosnet crypto reconfigured on index %d: %s/%s/%s", totem_config->crypto_index, totem_config->crypto_model, totem_config->crypto_cipher_type, totem_config->crypto_hash_type); } return (res); } int totemknet_crypto_reconfigure_phase ( void *knet_context, struct totem_config *totem_config, cfg_message_crypto_reconfig_phase_t phase) { #ifdef HAVE_KNET_CRYPTO_RECONF int res; int config_to_use; int config_to_clear; struct knet_handle_crypto_cfg crypto_cfg; struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; knet_log_printf(LOGSYS_LEVEL_DEBUG, "totemknet_crypto_reconfigure_phase %d, index=%d\n", phase, totem_config->crypto_index); switch (phase) { case CRYPTO_RECONFIG_PHASE_ACTIVATE: config_to_use = totem_config->crypto_index; if (!totemknet_is_crypto_enabled(instance)) { config_to_use = 0; /* we are clearing it */ } /* Enable the new config on this node */ res = knet_handle_crypto_use_config(instance->knet_handle, config_to_use); if (res == -1) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_use_config %d failed: %s", config_to_use, strerror(errno)); } break; case CRYPTO_RECONFIG_PHASE_CLEANUP: /* * All nodes should now have the new config. clear the old one out * OR disable crypto entirely if that's what the new config insists on. */ config_to_clear = 3-totem_config->crypto_index; knet_log_printf(LOGSYS_LEVEL_DEBUG, "Clearing old knet crypto config %d\n", config_to_clear); strcpy(crypto_cfg.crypto_model, "none"); strcpy(crypto_cfg.crypto_cipher_type, "none"); strcpy(crypto_cfg.crypto_hash_type, "none"); res = knet_handle_crypto_set_config(instance->knet_handle, &crypto_cfg, config_to_clear); if (res == -1) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_set_config to clear index %d failed: %s", config_to_clear, strerror(errno)); } if (res == -2) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_set_config to clear index %d failed: -2", config_to_clear); } /* If crypto is enabled then disable all cleartext reception */ if (totemknet_is_crypto_enabled(instance)) { res = knet_handle_crypto_rx_clear_traffic(instance->knet_handle, KNET_CRYPTO_RX_DISALLOW_CLEAR_TRAFFIC); if (res) { knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_rx_clear_traffic(DISALLOW) failed %s", strerror(errno)); } } } #endif return 0; } void totemknet_stats_clear ( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; (void) knet_handle_clear_stats(instance->knet_handle, KNET_CLEARSTATS_HANDLE_AND_LINK); } /* For the stats module */ int totemknet_link_get_status ( knet_node_id_t node, uint8_t link_no, struct knet_link_status *status) { int res; int ret = CS_OK; /* We are probably not using knet */ if (!global_instance) { return CS_ERR_NOT_EXIST; } if (link_no >= INTERFACE_MAX) { return CS_ERR_NOT_EXIST; /* Invalid link number */ } res = knet_link_get_status(global_instance->knet_handle, node, link_no, status, sizeof(struct knet_link_status)); if (res) { switch (errno) { case EINVAL: ret = CS_ERR_INVALID_PARAM; break; case EBUSY: ret = CS_ERR_BUSY; break; case EDEADLK: ret = CS_ERR_TRY_AGAIN; break; default: ret = CS_ERR_LIBRARY; break; } } return (ret); } int totemknet_handle_get_stats ( struct knet_handle_stats *stats) { int res; /* We are probably not using knet */ if (!global_instance) { return CS_ERR_NOT_EXIST; } res = knet_handle_get_stats(global_instance->knet_handle, stats, sizeof(struct knet_handle_stats)); if (res != 0) { return (qb_to_cs_error(-errno)); } return CS_OK; } static void timer_function_merge_detect_timeout ( void *data) { struct totemknet_instance *instance = (struct totemknet_instance *)data; if (instance->merge_detect_messages_sent_before_timeout == 0) { instance->send_merge_detect_message = 1; } instance->merge_detect_messages_sent_before_timeout = 0; totemknet_start_merge_detect_timeout(instance); } static void totemknet_start_merge_detect_timeout( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; qb_loop_timer_add(instance->poll_handle, QB_LOOP_MED, instance->totem_config->merge_timeout * 2 * QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); } static void totemknet_stop_merge_detect_timeout( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; qb_loop_timer_del(instance->poll_handle, instance->timer_merge_detect_timeout); } static void log_flush_messages (void *knet_context) { struct pollfd pfd; struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int cont; cont = 1; while (cont) { pfd.fd = instance->logpipes[0]; pfd.events = POLLIN; pfd.revents = 0; if ((poll(&pfd, 1, 0) > 0) && (pfd.revents & POLLIN) && (log_deliver_fn(instance->logpipes[0], POLLIN, instance) == 0)) { cont = 1; } else { cont = 0; } } } #ifdef HAVE_LIBNOZZLE #define NOZZLE_NAME "nozzle.name" #define NOZZLE_IPADDR "nozzle.ipaddr" #define NOZZLE_PREFIX "nozzle.ipprefix" #define NOZZLE_MACADDR "nozzle.macaddr" #define NOZZLE_CHANNEL 1 static char *get_nozzle_script_dir(void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; char filename[PATH_MAX + FILENAME_MAX + 1]; static char updown_dirname[PATH_MAX + FILENAME_MAX + 1]; int res; const char *dirname_res; /* * Build script directory based on corosync.conf file location */ res = snprintf(filename, sizeof(filename), "%s", corosync_get_config_file()); if (res >= sizeof(filename)) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "nozzle up/down path too long"); return NULL; } dirname_res = dirname(filename); res = snprintf(updown_dirname, sizeof(updown_dirname), "%s/%s", dirname_res, "updown.d"); if (res >= sizeof(updown_dirname)) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "nozzle up/down path too long"); return NULL; } return updown_dirname; } /* * Deliberately doesn't return the status as caller doesn't care. * The result will be logged though */ static void run_nozzle_script(struct totemknet_instance *instance, int type, const char *typename) { int res; char *exec_string; res = nozzle_run_updown(instance->nozzle_handle, type, &exec_string); if (res == -1 && errno != ENOENT) { knet_log_printf (LOGSYS_LEVEL_INFO, "exec nozzle %s script failed: %s", typename, strerror(errno)); } else if (res == -2) { knet_log_printf (LOGSYS_LEVEL_INFO, "nozzle %s script failed", typename); knet_log_printf (LOGSYS_LEVEL_INFO, "%s", exec_string); } } /* * Reparse IP address to add in our node ID * IPv6 addresses must end in '::' * IPv4 addresses must just be valid * '/xx' lengths are optional for IPv6, mandatory for IPv4 * * Returns the modified IP address as a string to pass into libnozzle */ static int reparse_nozzle_ip_address(struct totemknet_instance *instance, const char *input_addr, const char *prefix, int nodeid, char *output_addr, size_t output_len) { char *coloncolon; int bits; int max_prefix = 64; uint32_t nodeid_mask; uint32_t addr_mask; uint32_t masked_nodeid; struct in_addr *addr; struct totem_ip_address totemip; coloncolon = strstr(input_addr, "::"); if (!coloncolon) { max_prefix = 30; } bits = atoi(prefix); if (bits < 8 || bits > max_prefix) { knet_log_printf(LOGSYS_LEVEL_ERROR, "nozzle IP address prefix must be >= 8 and <= %d (got %d)", max_prefix, bits); return -1; } /* IPv6 is easy */ if (coloncolon) { memcpy(output_addr, input_addr, coloncolon-input_addr); sprintf(output_addr + (coloncolon-input_addr), "::%x", nodeid); return 0; } /* For IPv4 we need to parse the address into binary, mask off the required bits, * add in the masked_nodeid and 'print' it out again */ nodeid_mask = UINT32_MAX & ((1<<(32 - bits)) - 1); addr_mask = UINT32_MAX ^ nodeid_mask; masked_nodeid = nodeid & nodeid_mask; if (totemip_parse(&totemip, input_addr, AF_INET)) { knet_log_printf(LOGSYS_LEVEL_ERROR, "Failed to parse IPv4 nozzle IP address"); return -1; } addr = (struct in_addr *)&totemip.addr; addr->s_addr &= htonl(addr_mask); addr->s_addr |= htonl(masked_nodeid); inet_ntop(AF_INET, addr, output_addr, output_len); return 0; } static int create_nozzle_device(void *knet_context, const char *name, const char *ipaddr, const char *prefix, const char *macaddr) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; char device_name[IFNAMSIZ+1]; size_t size = IFNAMSIZ; int8_t channel = NOZZLE_CHANNEL; nozzle_t nozzle_dev; int nozzle_fd; int res; char *updown_dir; char parsed_ipaddr[INET6_ADDRSTRLEN]; char mac[19]; memset(device_name, 0, size); memset(&mac, 0, sizeof(mac)); strncpy(device_name, name, size); updown_dir = get_nozzle_script_dir(knet_context); knet_log_printf (LOGSYS_LEVEL_INFO, "nozzle script dir is %s", updown_dir); nozzle_dev = nozzle_open(device_name, size, updown_dir); if (!nozzle_dev) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to init nozzle device %s: %s", device_name, strerror(errno)); return -1; } instance->nozzle_handle = nozzle_dev; if (nozzle_set_mac(nozzle_dev, macaddr) < 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to add set nozzle MAC to %s: %s", mac, strerror(errno)); goto out_clean; } if (reparse_nozzle_ip_address(instance, ipaddr, prefix, instance->our_nodeid, parsed_ipaddr, sizeof(parsed_ipaddr))) { /* Prints its own errors */ goto out_clean; } knet_log_printf (LOGSYS_LEVEL_INFO, "Local nozzle IP address is %s / %d", parsed_ipaddr, atoi(prefix)); if (nozzle_add_ip(nozzle_dev, parsed_ipaddr, prefix) < 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to add set nozzle IP addr to %s/%s: %s", parsed_ipaddr, prefix, strerror(errno)); goto out_clean; } nozzle_fd = nozzle_get_fd(nozzle_dev); knet_log_printf (LOGSYS_LEVEL_INFO, "Opened '%s' on fd %d", device_name, nozzle_fd); res = knet_handle_add_datafd(instance->knet_handle, &nozzle_fd, &channel); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to add nozzle FD to knet: %s", strerror(errno)); goto out_clean; } run_nozzle_script(instance, NOZZLE_PREUP, "pre-up"); res = nozzle_set_up(nozzle_dev); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to set nozzle interface UP: %s", strerror(errno)); goto out_clean; } run_nozzle_script(instance, NOZZLE_UP, "up"); return 0; out_clean: nozzle_close(nozzle_dev); return -1; } static int remove_nozzle_device(void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res; int datafd; res = knet_handle_get_datafd(instance->knet_handle, NOZZLE_CHANNEL, &datafd); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Can't find datafd for channel %d: %s", NOZZLE_CHANNEL, strerror(errno)); return -1; } res = knet_handle_remove_datafd(instance->knet_handle, datafd); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Can't remove datafd for nozzle channel %d: %s", NOZZLE_CHANNEL, strerror(errno)); return -1; } run_nozzle_script(instance, NOZZLE_DOWN, "pre-down"); res = nozzle_set_down(instance->nozzle_handle); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Can't set nozzle device down: %s", strerror(errno)); return -1; } run_nozzle_script(instance, NOZZLE_POSTDOWN, "post-down"); res = nozzle_close(instance->nozzle_handle); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Can't close nozzle device: %s", strerror(errno)); return -1; } knet_log_printf (LOGSYS_LEVEL_INFO, "Removed nozzle device"); return 0; } static void free_nozzle(struct totemknet_instance *instance) { free(instance->nozzle_name); free(instance->nozzle_ipaddr); free(instance->nozzle_prefix); free(instance->nozzle_macaddr); instance->nozzle_name = instance->nozzle_ipaddr = instance->nozzle_prefix = instance->nozzle_macaddr = NULL; } static int setup_nozzle(void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; char *ipaddr_str = NULL; char *name_str = NULL; char *prefix_str = NULL; char *macaddr_str = NULL; char mac[32]; int name_res; int macaddr_res; int res = -1; /* * Return value ignored on purpose. icmap_get_string changes * ipaddr_str/prefix_str only on success. */ (void)icmap_get_string(NOZZLE_IPADDR, &ipaddr_str); (void)icmap_get_string(NOZZLE_PREFIX, &prefix_str); macaddr_res = icmap_get_string(NOZZLE_MACADDR, &macaddr_str); name_res = icmap_get_string(NOZZLE_NAME, &name_str); /* Is is being removed? */ if (name_res == CS_ERR_NOT_EXIST && instance->nozzle_handle) { remove_nozzle_device(instance); free_nozzle(instance); goto out_free; } if (!name_str) { /* no nozzle */ goto out_free; } if (!ipaddr_str) { knet_log_printf (LOGSYS_LEVEL_ERROR, "No IP address supplied for Nozzle device"); goto out_free; } if (!prefix_str) { knet_log_printf (LOGSYS_LEVEL_ERROR, "No prefix supplied for Nozzle IP address"); goto out_free; } if (macaddr_str && strlen(macaddr_str) != 17) { knet_log_printf (LOGSYS_LEVEL_ERROR, "macaddr for nozzle device is not in the correct format '%s'", macaddr_str); goto out_free; } if (!macaddr_str) { macaddr_str = (char*)"54:54:01:00:00:00"; } if (instance->nozzle_name && (strcmp(name_str, instance->nozzle_name) == 0) && (strcmp(ipaddr_str, instance->nozzle_ipaddr) == 0) && (strcmp(prefix_str, instance->nozzle_prefix) == 0) && (instance->nozzle_macaddr == NULL || strcmp(macaddr_str, instance->nozzle_macaddr) == 0)) { /* Nothing has changed */ knet_log_printf (LOGSYS_LEVEL_DEBUG, "Nozzle device info not changed"); goto out_free; } /* Add nodeid into MAC address */ memcpy(mac, macaddr_str, 12); snprintf(mac+12, sizeof(mac) - 13, "%02x:%02x", instance->our_nodeid >> 8, instance->our_nodeid & 0xFF); knet_log_printf (LOGSYS_LEVEL_INFO, "Local nozzle MAC address is %s", mac); if (name_res == CS_OK && name_str) { /* Reconfigure */ if (instance->nozzle_name) { remove_nozzle_device(instance); free_nozzle(instance); } res = create_nozzle_device(knet_context, name_str, ipaddr_str, prefix_str, mac); instance->nozzle_name = strdup(name_str); instance->nozzle_ipaddr = strdup(ipaddr_str); instance->nozzle_prefix = strdup(prefix_str); instance->nozzle_macaddr = strdup(macaddr_str); if (!instance->nozzle_name || !instance->nozzle_ipaddr || !instance->nozzle_prefix) { knet_log_printf (LOGSYS_LEVEL_ERROR, "strdup failed in nozzle allocation"); /* * This 'free' will cause a complete reconfigure of the device next time we reload * but will also let the the current device keep working until then. * remove_nozzle() only needs the, statically-allocated, nozzle_handle */ free_nozzle(instance); } } out_free: free(name_str); free(ipaddr_str); free(prefix_str); if (macaddr_res == CS_OK) { free(macaddr_str); } return res; } #endif // HAVE_LIBNOZZLE diff --git a/exec/totemknet.h b/exec/totemknet.h index 67c0ba6e..4d4f61e2 100644 --- a/exec/totemknet.h +++ b/exec/totemknet.h @@ -1,159 +1,159 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2011 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMKNET_H_DEFINED #define TOTEMKNET_H_DEFINED #include #include #include #include /** * Create an instance */ extern int totemknet_initialize ( qb_loop_t *poll_handle, void **knet_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)); extern void *totemknet_buffer_alloc (void); extern void totemknet_buffer_release (void *ptr); extern int totemknet_processor_count_set ( void *knet_context, int processor_count); extern int totemknet_token_send ( void *knet_context, const void *msg, unsigned int msg_len); extern int totemknet_mcast_flush_send ( void *knet_context, const void *msg, unsigned int msg_len); extern int totemknet_mcast_noflush_send ( void *knet_context, const void *msg, unsigned int msg_len); extern int totemknet_recv_flush (void *knet_context); extern int totemknet_send_flush (void *knet_context); extern int totemknet_iface_check (void *knet_context); extern int totemknet_finalize (void *knet_context); extern void totemknet_net_mtu_adjust (void *knet_context, struct totem_config *totem_config); extern int totemknet_nodestatus_get (void *knet_context, unsigned int nodeid, struct totem_node_status *node_status); extern int totemknet_ifaces_get (void *net_context, char ***status, unsigned int *iface_count); extern int totemknet_iface_set (void *net_context, const struct totem_ip_address *local_addr, unsigned short ip_port, unsigned int iface_no); extern int totemknet_token_target_set ( void *knet_context, unsigned int nodeid); extern int totemknet_crypto_set ( void *knet_context, const char *cipher_type, const char *hash_type); extern int totemknet_recv_mcast_empty ( void *knet_context); extern int totemknet_member_add ( void *knet_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no); extern int totemknet_member_remove ( void *knet_context, const struct totem_ip_address *member, int ring_no); extern int totemknet_member_set_active ( void *knet_context, const struct totem_ip_address *member_ip, int active); extern int totemknet_reconfigure ( void *knet_context, struct totem_config *totem_config); extern int totemknet_crypto_reconfigure_phase ( void *knet_context, struct totem_config *totem_config, cfg_message_crypto_reconfig_phase_t phase); extern void totemknet_stats_clear ( void *knet_context); extern void totemknet_configure_log_level (void); #endif /* TOTEMKNET_H_DEFINED */ diff --git a/exec/totemnet.c b/exec/totemnet.c index a4b90a3d..58992e67 100644 --- a/exec/totemnet.c +++ b/exec/totemnet.c @@ -1,628 +1,628 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include struct transport { const char *name; int (*initialize) ( qb_loop_t *loop_pt, void **transport_instance, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)); void *(*buffer_alloc) (void); void (*buffer_release) (void *ptr); int (*processor_count_set) ( void *transport_context, int processor_count); int (*token_send) ( void *transport_context, const void *msg, unsigned int msg_len); int (*mcast_flush_send) ( void *transport_context, const void *msg, unsigned int msg_len); int (*mcast_noflush_send) ( void *transport_context, const void *msg, unsigned int msg_len); int (*recv_flush) (void *transport_context); int (*send_flush) (void *transport_context); int (*iface_check) (void *transport_context); int (*finalize) (void *transport_context); void (*net_mtu_adjust) (void *transport_context, struct totem_config *totem_config); const char *(*iface_print) (void *transport_context); int (*ifaces_get) ( void *transport_context, char ***status, unsigned int *iface_count); int (*nodestatus_get) ( void *transport_context, unsigned int nodeid, struct totem_node_status *node_status); int (*token_target_set) ( void *transport_context, unsigned int nodeid); int (*crypto_set) ( void *transport_context, const char *cipher_type, const char *hash_type); int (*recv_mcast_empty) ( void *transport_context); int (*iface_set) ( void *transport_context, const struct totem_ip_address *local, unsigned short ip_port, unsigned int ring_no); int (*member_add) ( void *transport_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no); int (*member_remove) ( void *transport_context, const struct totem_ip_address *member, int ring_no); int (*member_set_active) ( void *transport_context, const struct totem_ip_address *member, int active); int (*reconfigure) ( void *net_context, struct totem_config *totem_config); int (*crypto_reconfigure_phase) ( void *net_context, struct totem_config *totem_config, cfg_message_crypto_reconfig_phase_t phase); void (*stats_clear) ( void *net_context); }; struct transport transport_entries[] = { { .name = "UDP/IP Multicast", .initialize = totemudp_initialize, .buffer_alloc = totemudp_buffer_alloc, .buffer_release = totemudp_buffer_release, .processor_count_set = totemudp_processor_count_set, .token_send = totemudp_token_send, .mcast_flush_send = totemudp_mcast_flush_send, .mcast_noflush_send = totemudp_mcast_noflush_send, .recv_flush = totemudp_recv_flush, .send_flush = totemudp_send_flush, .iface_set = totemudp_iface_set, .iface_check = totemudp_iface_check, .finalize = totemudp_finalize, .net_mtu_adjust = totemudp_net_mtu_adjust, .ifaces_get = totemudp_ifaces_get, .nodestatus_get = totemudp_nodestatus_get, .token_target_set = totemudp_token_target_set, .crypto_set = totemudp_crypto_set, .recv_mcast_empty = totemudp_recv_mcast_empty, .member_add = totemudp_member_add, .member_remove = totemudp_member_remove, .reconfigure = totemudp_reconfigure, .crypto_reconfigure_phase = NULL }, { .name = "UDP/IP Unicast", .initialize = totemudpu_initialize, .buffer_alloc = totemudpu_buffer_alloc, .buffer_release = totemudpu_buffer_release, .processor_count_set = totemudpu_processor_count_set, .token_send = totemudpu_token_send, .mcast_flush_send = totemudpu_mcast_flush_send, .mcast_noflush_send = totemudpu_mcast_noflush_send, .recv_flush = totemudpu_recv_flush, .send_flush = totemudpu_send_flush, .iface_set = totemudpu_iface_set, .iface_check = totemudpu_iface_check, .finalize = totemudpu_finalize, .net_mtu_adjust = totemudpu_net_mtu_adjust, .ifaces_get = totemudpu_ifaces_get, .nodestatus_get = totemudpu_nodestatus_get, .token_target_set = totemudpu_token_target_set, .crypto_set = totemudpu_crypto_set, .recv_mcast_empty = totemudpu_recv_mcast_empty, .member_add = totemudpu_member_add, .member_remove = totemudpu_member_remove, .reconfigure = totemudpu_reconfigure, .crypto_reconfigure_phase = NULL }, { .name = "Kronosnet", .initialize = totemknet_initialize, .buffer_alloc = totemknet_buffer_alloc, .buffer_release = totemknet_buffer_release, .processor_count_set = totemknet_processor_count_set, .token_send = totemknet_token_send, .mcast_flush_send = totemknet_mcast_flush_send, .mcast_noflush_send = totemknet_mcast_noflush_send, .recv_flush = totemknet_recv_flush, .send_flush = totemknet_send_flush, .iface_set = totemknet_iface_set, .iface_check = totemknet_iface_check, .finalize = totemknet_finalize, .net_mtu_adjust = totemknet_net_mtu_adjust, .ifaces_get = totemknet_ifaces_get, .nodestatus_get = totemknet_nodestatus_get, .token_target_set = totemknet_token_target_set, .crypto_set = totemknet_crypto_set, .recv_mcast_empty = totemknet_recv_mcast_empty, .member_add = totemknet_member_add, .member_remove = totemknet_member_remove, .reconfigure = totemknet_reconfigure, .crypto_reconfigure_phase = totemknet_crypto_reconfigure_phase, .stats_clear = totemknet_stats_clear } }; struct totemnet_instance { void *transport_context; struct transport *transport; void (*totemnet_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7))); int totemnet_subsys_id; }; #define log_printf(level, format, args...) \ do { \ instance->totemnet_log_printf ( \ level, \ instance->totemnet_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ (const char *)format, ##args); \ } while (0); static void totemnet_instance_initialize ( struct totemnet_instance *instance, struct totem_config *config) { int transport; instance->totemnet_log_printf = config->totem_logging_configuration.log_printf; instance->totemnet_subsys_id = config->totem_logging_configuration.log_subsys_id; transport = config->transport_number; log_printf (LOGSYS_LEVEL_NOTICE, "Initializing transport (%s).", transport_entries[transport].name); instance->transport = &transport_entries[transport]; } int totemnet_crypto_set ( void *net_context, const char *cipher_type, const char *hash_type) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->crypto_set (instance->transport_context, cipher_type, hash_type); return res; } int totemnet_finalize ( void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->finalize (instance->transport_context); return (res); } int totemnet_initialize ( qb_loop_t *loop_pt, void **net_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)) { struct totemnet_instance *instance; unsigned int res; instance = malloc (sizeof (struct totemnet_instance)); if (instance == NULL) { return (-1); } totemnet_instance_initialize (instance, totem_config); res = instance->transport->initialize (loop_pt, &instance->transport_context, totem_config, stats, context, deliver_fn, iface_change_fn, mtu_changed, target_set_completed); if (res == -1) { goto error_destroy; } *net_context = instance; return (0); error_destroy: free (instance); return (-1); } void *totemnet_buffer_alloc (void *net_context) { struct totemnet_instance *instance = net_context; assert (instance != NULL); assert (instance->transport != NULL); return instance->transport->buffer_alloc(); } void totemnet_buffer_release (void *net_context, void *ptr) { struct totemnet_instance *instance = net_context; assert (instance != NULL); assert (instance->transport != NULL); instance->transport->buffer_release (ptr); } int totemnet_processor_count_set ( void *net_context, int processor_count) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->processor_count_set (instance->transport_context, processor_count); return (res); } int totemnet_recv_flush (void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->recv_flush (instance->transport_context); return (res); } int totemnet_send_flush (void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->send_flush (instance->transport_context); return (res); } int totemnet_token_send ( void *net_context, const void *msg, unsigned int msg_len) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->token_send (instance->transport_context, msg, msg_len); return (res); } int totemnet_mcast_flush_send ( void *net_context, const void *msg, unsigned int msg_len) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->mcast_flush_send (instance->transport_context, msg, msg_len); return (res); } int totemnet_mcast_noflush_send ( void *net_context, const void *msg, unsigned int msg_len) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->mcast_noflush_send (instance->transport_context, msg, msg_len); return (res); } extern int totemnet_iface_check (void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->iface_check (instance->transport_context); return (res); } extern int totemnet_net_mtu_adjust (void *net_context, struct totem_config *totem_config) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; instance->transport->net_mtu_adjust (instance->transport_context, totem_config); return (res); } int totemnet_iface_set (void *net_context, const struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res; res = instance->transport->iface_set (instance->transport_context, interface_addr, ip_port, iface_no); return (res); } extern int totemnet_nodestatus_get ( void *net_context, unsigned int nodeid, struct totem_node_status *node_status) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res; res = instance->transport->nodestatus_get (instance->transport_context, nodeid, node_status); return (res); } int totemnet_ifaces_get ( void *net_context, char ***status, unsigned int *iface_count) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res; res = instance->transport->ifaces_get (instance->transport_context, status, iface_count); return (res); } int totemnet_token_target_set ( void *net_context, unsigned int nodeid) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res; res = instance->transport->token_target_set (instance->transport_context, nodeid); return (res); } extern int totemnet_recv_mcast_empty ( void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res; res = instance->transport->recv_mcast_empty (instance->transport_context); return (res); } extern int totemnet_member_add ( void *net_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; if (instance->transport->member_add) { res = instance->transport->member_add ( instance->transport_context, local, member, ring_no); } return (res); } extern int totemnet_member_remove ( void *net_context, const struct totem_ip_address *member, int ring_no) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; if (instance->transport->member_remove) { res = instance->transport->member_remove ( instance->transport_context, member, ring_no); } return (res); } int totemnet_member_set_active ( void *net_context, const struct totem_ip_address *member, int active) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; if (instance->transport->member_set_active) { res = instance->transport->member_set_active ( instance->transport_context, member, active); } return (res); } int totemnet_reconfigure ( void *net_context, struct totem_config *totem_config) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; res = instance->transport->reconfigure ( instance->transport_context, totem_config); return (res); } int totemnet_crypto_reconfigure_phase ( void *net_context, struct totem_config *totem_config, cfg_message_crypto_reconfig_phase_t phase) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; if (instance->transport->crypto_reconfigure_phase) { res = instance->transport->crypto_reconfigure_phase ( instance->transport_context, totem_config, phase); } return (res); } void totemnet_stats_clear ( void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; if (instance->transport->stats_clear) { instance->transport->stats_clear ( instance->transport_context); } } diff --git a/exec/totemnet.h b/exec/totemnet.h index c6a99235..e71d9e04 100644 --- a/exec/totemnet.h +++ b/exec/totemnet.h @@ -1,166 +1,166 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2007, 2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /** * @file * Totem Network interface - also does encryption/decryption * * depends on poll abstraction, POSIX, IPV4 */ #ifndef TOTEMNET_H_DEFINED #define TOTEMNET_H_DEFINED #include #include #include #define TOTEMNET_NOFLUSH 0 #define TOTEMNET_FLUSH 1 /** * Create an instance */ extern int totemnet_initialize ( qb_loop_t *poll_handle, void **net_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int iface_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)); extern void *totemnet_buffer_alloc (void *net_context); extern void totemnet_buffer_release (void *net_context, void *ptr); extern int totemnet_processor_count_set ( void *net_context, int processor_count); extern int totemnet_token_send ( void *net_context, const void *msg, unsigned int msg_len); extern int totemnet_mcast_flush_send ( void *net_context, const void *msg, unsigned int msg_len); extern int totemnet_mcast_noflush_send ( void *net_context, const void *msg, unsigned int msg_len); extern int totemnet_recv_flush (void *net_context); extern int totemnet_send_flush (void *net_context); extern int totemnet_iface_set (void *net_context, const struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no); extern int totemnet_iface_check (void *net_context); extern int totemnet_finalize (void *net_context); extern int totemnet_net_mtu_adjust (void *net_context, struct totem_config *totem_config); extern int totemnet_reconfigure (void *net_context, struct totem_config *totem_config); extern int totemnet_crypto_reconfigure_phase (void *net_context, struct totem_config *totem_config, cfg_message_crypto_reconfig_phase_t phase); extern void totemnet_stats_clear (void *net_context); extern const char *totemnet_iface_print (void *net_context); extern int totemnet_nodestatus_get ( void *net_context, unsigned int nodeid, struct totem_node_status *node_status); extern int totemnet_ifaces_get ( void *net_context, char ***status, unsigned int *iface_count); extern int totemnet_token_target_set ( void *net_context, unsigned int target_nodeid); extern int totemnet_crypto_set ( void *net_context, const char *cipher_type, const char *hash_type); extern int totemnet_recv_mcast_empty ( void *net_context); extern int totemnet_member_add ( void *net_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no); extern int totemnet_member_remove ( void *net_context, const struct totem_ip_address *member, int ring_no); extern int totemnet_member_set_active ( void *net_context, const struct totem_ip_address *member, int active); #endif /* TOTEMNET_H_DEFINED */ diff --git a/exec/totemsrp.c b/exec/totemsrp.c index fd71771b..63a47c19 100644 --- a/exec/totemsrp.c +++ b/exec/totemsrp.c @@ -1,5250 +1,5252 @@ /* * Copyright (c) 2003-2006 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * The first version of this code was based upon Yair Amir's PhD thesis: * http://www.cs.jhu.edu/~yairamir/phd.ps) (ch4,5). * * The current version of totemsrp implements the Totem protocol specified in: * http://citeseer.ist.psu.edu/amir95totem.html * * The deviations from the above published protocols are: * - token hold mode where token doesn't rotate on unused ring - reduces cpu * usage on 1.6ghz xeon from 35% to less then .1 % as measured by top */ #include #include #ifdef HAVE_ALLOCA_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemsrp.h" #include "totemnet.h" #include "icmap.h" #include "totemconfig.h" #include "cs_queue.h" #define LOCALHOST_IP inet_addr("127.0.0.1") #define QUEUE_RTR_ITEMS_SIZE_MAX 16384 /* allow 16384 retransmit items */ #define RETRANS_MESSAGE_QUEUE_SIZE_MAX 16384 /* allow 500 messages to be queued */ #define RECEIVED_MESSAGE_QUEUE_SIZE_MAX 500 /* allow 500 messages to be queued */ #define MAXIOVS 5 #define RETRANSMIT_ENTRIES_MAX 30 #define TOKEN_SIZE_MAX 64000 /* bytes */ #define LEAVE_DUMMY_NODEID 0 /* * SRP address. */ struct srp_addr { unsigned int nodeid; }; /* * Rollover handling: * SEQNO_START_MSG is the starting sequence number after a new configuration * This should remain zero, unless testing overflow in which case * 0x7ffff000 and 0xfffff000 are good starting values. * * SEQNO_START_TOKEN is the starting sequence number after a new configuration * for a token. This should remain zero, unless testing overflow in which * case 07fffff00 or 0xffffff00 are good starting values. */ #define SEQNO_START_MSG 0x0 #define SEQNO_START_TOKEN 0x0 /* * These can be used ot test different rollover points * #define SEQNO_START_MSG 0xfffffe00 * #define SEQNO_START_TOKEN 0xfffffe00 */ /* * These can be used to test the error recovery algorithms * #define TEST_DROP_ORF_TOKEN_PERCENTAGE 30 * #define TEST_DROP_COMMIT_TOKEN_PERCENTAGE 30 * #define TEST_DROP_MCAST_PERCENTAGE 50 * #define TEST_RECOVERY_MSG_COUNT 300 */ /* * we compare incoming messages to determine if their endian is * different - if so convert them * * do not change */ #define ENDIAN_LOCAL 0xff22 enum message_type { MESSAGE_TYPE_ORF_TOKEN = 0, /* Ordering, Reliability, Flow (ORF) control Token */ MESSAGE_TYPE_MCAST = 1, /* ring ordered multicast message */ MESSAGE_TYPE_MEMB_MERGE_DETECT = 2, /* merge rings if there are available rings */ MESSAGE_TYPE_MEMB_JOIN = 3, /* membership join message */ MESSAGE_TYPE_MEMB_COMMIT_TOKEN = 4, /* membership commit token */ MESSAGE_TYPE_TOKEN_HOLD_CANCEL = 5, /* cancel the holding of the token */ }; enum encapsulation_type { MESSAGE_ENCAPSULATED = 1, MESSAGE_NOT_ENCAPSULATED = 2 }; /* * New membership algorithm local variables */ struct consensus_list_item { struct srp_addr addr; int set; }; struct token_callback_instance { struct qb_list_head list; int (*callback_fn) (enum totem_callback_token_type type, const void *); enum totem_callback_token_type callback_type; int delete; void *data; }; struct totemsrp_socket { int mcast; int token; }; struct mcast { struct totem_message_header header; struct srp_addr system_from; unsigned int seq; int this_seqno; struct memb_ring_id ring_id; unsigned int node_id; int guarantee; } __attribute__((packed)); struct rtr_item { struct memb_ring_id ring_id; unsigned int seq; }__attribute__((packed)); struct orf_token { struct totem_message_header header; unsigned int seq; unsigned int token_seq; unsigned int aru; unsigned int aru_addr; struct memb_ring_id ring_id; unsigned int backlog; unsigned int fcc; int retrans_flg; int rtr_list_entries; struct rtr_item rtr_list[0]; }__attribute__((packed)); struct memb_join { struct totem_message_header header; struct srp_addr system_from; unsigned int proc_list_entries; unsigned int failed_list_entries; unsigned long long ring_seq; unsigned char end_of_memb_join[0]; /* * These parts of the data structure are dynamic: * struct srp_addr proc_list[]; * struct srp_addr failed_list[]; */ } __attribute__((packed)); struct memb_merge_detect { struct totem_message_header header; struct srp_addr system_from; struct memb_ring_id ring_id; } __attribute__((packed)); struct token_hold_cancel { struct totem_message_header header; struct memb_ring_id ring_id; } __attribute__((packed)); struct memb_commit_token_memb_entry { struct memb_ring_id ring_id; unsigned int aru; unsigned int high_delivered; unsigned int received_flg; }__attribute__((packed)); struct memb_commit_token { struct totem_message_header header; unsigned int token_seq; struct memb_ring_id ring_id; unsigned int retrans_flg; int memb_index; int addr_entries; unsigned char end_of_commit_token[0]; /* * These parts of the data structure are dynamic: * * struct srp_addr addr[PROCESSOR_COUNT_MAX]; * struct memb_commit_token_memb_entry memb_list[PROCESSOR_COUNT_MAX]; */ }__attribute__((packed)); struct message_item { struct mcast *mcast; unsigned int msg_len; }; struct sort_queue_item { struct mcast *mcast; unsigned int msg_len; }; enum memb_state { MEMB_STATE_OPERATIONAL = 1, MEMB_STATE_GATHER = 2, MEMB_STATE_COMMIT = 3, MEMB_STATE_RECOVERY = 4 }; struct totemsrp_instance { int iface_changes; int failed_to_recv; /* * Flow control mcasts and remcasts on last and current orf_token */ int fcc_remcast_last; int fcc_mcast_last; int fcc_remcast_current; struct consensus_list_item consensus_list[PROCESSOR_COUNT_MAX]; int consensus_list_entries; int lowest_active_if; struct srp_addr my_id; struct totem_ip_address my_addrs[INTERFACE_MAX]; struct srp_addr my_proc_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_failed_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_new_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_trans_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_deliver_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX]; unsigned int my_leave_memb_list[PROCESSOR_COUNT_MAX]; int my_proc_list_entries; int my_failed_list_entries; int my_new_memb_entries; int my_trans_memb_entries; int my_memb_entries; int my_deliver_memb_entries; int my_left_memb_entries; int my_leave_memb_entries; struct memb_ring_id my_ring_id; struct memb_ring_id my_old_ring_id; int my_aru_count; int my_merge_detect_timeout_outstanding; unsigned int my_last_aru; int my_seq_unchanged; int my_received_flg; unsigned int my_high_seq_received; unsigned int my_install_seq; int my_rotation_counter; int my_set_retrans_flg; int my_retrans_flg_count; unsigned int my_high_ring_delivered; int heartbeat_timeout; /* * Queues used to order, deliver, and recover messages */ struct cs_queue new_message_queue; struct cs_queue new_message_queue_trans; struct cs_queue retrans_message_queue; struct sq regular_sort_queue; struct sq recovery_sort_queue; /* * Received up to and including */ unsigned int my_aru; unsigned int my_high_delivered; struct qb_list_head token_callback_received_listhead; struct qb_list_head token_callback_sent_listhead; char orf_token_retransmit[TOKEN_SIZE_MAX]; int orf_token_retransmit_size; unsigned int my_token_seq; /* * Timers */ qb_loop_timer_handle timer_pause_timeout; qb_loop_timer_handle timer_orf_token_timeout; qb_loop_timer_handle timer_orf_token_warning; qb_loop_timer_handle timer_orf_token_retransmit_timeout; qb_loop_timer_handle timer_orf_token_hold_retransmit_timeout; qb_loop_timer_handle timer_merge_detect_timeout; qb_loop_timer_handle memb_timer_state_gather_join_timeout; qb_loop_timer_handle memb_timer_state_gather_consensus_timeout; qb_loop_timer_handle memb_timer_state_commit_timeout; qb_loop_timer_handle timer_heartbeat_timeout; /* * Function and data used to log messages */ int totemsrp_log_level_security; int totemsrp_log_level_error; int totemsrp_log_level_warning; int totemsrp_log_level_notice; int totemsrp_log_level_debug; int totemsrp_log_level_trace; int totemsrp_subsys_id; void (*totemsrp_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7)));; enum memb_state memb_state; //TODO struct srp_addr next_memb; qb_loop_t *totemsrp_poll_handle; struct totem_ip_address mcast_address; void (*totemsrp_deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required); void (*totemsrp_confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); void (*totemsrp_service_ready_fn) (void); void (*totemsrp_waiting_trans_ack_cb_fn) ( int waiting_trans_ack); void (*memb_ring_id_create_or_load) ( struct memb_ring_id *memb_ring_id, unsigned int nodeid); void (*memb_ring_id_store) ( const struct memb_ring_id *memb_ring_id, unsigned int nodeid); int global_seqno; int my_token_held; unsigned long long token_ring_id_seq; unsigned int last_released; unsigned int set_aru; int old_ring_state_saved; int old_ring_state_aru; unsigned int old_ring_state_high_seq_received; unsigned int my_last_seq; struct timeval tv_old; void *totemnet_context; struct totem_config *totem_config; unsigned int use_heartbeat; unsigned int my_trc; unsigned int my_pbl; unsigned int my_cbl; uint64_t pause_timestamp; struct memb_commit_token *commit_token; totemsrp_stats_t stats; uint32_t orf_token_discard; uint32_t originated_orf_token; uint32_t threaded_mode_enabled; uint32_t waiting_trans_ack; int flushing; void * token_recv_event_handle; void * token_sent_event_handle; char commit_token_storage[40000]; }; struct message_handlers { int count; int (*handler_functions[6]) ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); }; enum gather_state_from { TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT = 0, TOTEMSRP_GSFROM_GATHER_MISSING1 = 1, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE = 2, TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED = 3, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE = 4, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE = 5, TOTEMSRP_GSFROM_FAILED_TO_RECEIVE = 6, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE = 7, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE = 8, TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE = 9, TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE = 10, TOTEMSRP_GSFROM_MERGE_DURING_JOIN = 11, TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE = 12, TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE = 13, TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY = 14, TOTEMSRP_GSFROM_INTERFACE_CHANGE = 15, TOTEMSRP_GSFROM_MAX = TOTEMSRP_GSFROM_INTERFACE_CHANGE, }; const char* gather_state_from_desc [] = { [TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT] = "consensus timeout", [TOTEMSRP_GSFROM_GATHER_MISSING1] = "MISSING", [TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE] = "The token was lost in the OPERATIONAL state.", [TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED] = "The consensus timeout expired.", [TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE] = "The token was lost in the COMMIT state.", [TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE] = "The token was lost in the RECOVERY state.", [TOTEMSRP_GSFROM_FAILED_TO_RECEIVE] = "failed to receive", [TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE] = "foreign message in operational state", [TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE] = "foreign message in gather state", [TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE] = "merge during operational state", [TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE] = "merge during gather state", [TOTEMSRP_GSFROM_MERGE_DURING_JOIN] = "merge during join", [TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE] = "join during operational state", [TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE] = "join during commit state", [TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY] = "join during recovery", [TOTEMSRP_GSFROM_INTERFACE_CHANGE] = "interface change", }; /* * forward decls */ static int message_handler_orf_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_mcast ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_join ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static void totemsrp_instance_initialize (struct totemsrp_instance *instance); static void srp_addr_to_nodeid ( struct totemsrp_instance *instance, unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries); static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b); static void memb_leave_message_send (struct totemsrp_instance *instance); static void token_callbacks_execute (struct totemsrp_instance *instance, enum totem_callback_token_type type); static void memb_state_gather_enter (struct totemsrp_instance *instance, enum gather_state_from gather_from); static void messages_deliver_to_app (struct totemsrp_instance *instance, int skip, unsigned int end_point); static int orf_token_mcast (struct totemsrp_instance *instance, struct orf_token *oken, int fcc_mcasts_allowed); static void messages_free (struct totemsrp_instance *instance, unsigned int token_aru); static void memb_ring_id_set (struct totemsrp_instance *instance, const struct memb_ring_id *ring_id); static void target_set_completed (void *context); static void memb_state_commit_token_update (struct totemsrp_instance *instance); static void memb_state_commit_token_target_set (struct totemsrp_instance *instance); static int memb_state_commit_token_send (struct totemsrp_instance *instance); static int memb_state_commit_token_send_recovery (struct totemsrp_instance *instance, struct memb_commit_token *memb_commit_token); static void memb_state_commit_token_create (struct totemsrp_instance *instance); static int token_hold_cancel_send (struct totemsrp_instance *instance); static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out); static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out); static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out); static void mcast_endian_convert (const struct mcast *in, struct mcast *out); static void memb_merge_detect_endian_convert ( const struct memb_merge_detect *in, struct memb_merge_detect *out); static struct srp_addr srp_addr_endian_convert (struct srp_addr in); static void timer_function_orf_token_timeout (void *data); static void timer_function_orf_token_warning (void *data); static void timer_function_pause_timeout (void *data); static void timer_function_heartbeat_timeout (void *data); static void timer_function_token_retransmit_timeout (void *data); static void timer_function_token_hold_retransmit_timeout (void *data); static void timer_function_merge_detect_timeout (void *data); static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance); static void totemsrp_buffer_release (struct totemsrp_instance *instance, void *ptr); static const char* gsfrom_to_msg(enum gather_state_from gsfrom); -void main_deliver_fn ( +int main_deliver_fn ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from); -void main_iface_change_fn ( +int main_iface_change_fn ( void *context, const struct totem_ip_address *iface_address, unsigned int iface_no); struct message_handlers totemsrp_message_handlers = { 6, { message_handler_orf_token, /* MESSAGE_TYPE_ORF_TOKEN */ message_handler_mcast, /* MESSAGE_TYPE_MCAST */ message_handler_memb_merge_detect, /* MESSAGE_TYPE_MEMB_MERGE_DETECT */ message_handler_memb_join, /* MESSAGE_TYPE_MEMB_JOIN */ message_handler_memb_commit_token, /* MESSAGE_TYPE_MEMB_COMMIT_TOKEN */ message_handler_token_hold_cancel /* MESSAGE_TYPE_TOKEN_HOLD_CANCEL */ } }; #define log_printf(level, format, args...) \ do { \ instance->totemsrp_log_printf ( \ level, instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ format, ##args); \ } while (0); #define LOGSYS_PERROR(err_num, level, fmt, args...) \ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemsrp_log_printf ( \ level, instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)\n", ##args, _error_ptr, err_num); \ } while(0) static const char* gsfrom_to_msg(enum gather_state_from gsfrom) { if (gsfrom <= TOTEMSRP_GSFROM_MAX) { return gather_state_from_desc[gsfrom]; } else { return "UNKNOWN"; } } static void totemsrp_instance_initialize (struct totemsrp_instance *instance) { memset (instance, 0, sizeof (struct totemsrp_instance)); qb_list_init (&instance->token_callback_received_listhead); qb_list_init (&instance->token_callback_sent_listhead); instance->my_received_flg = 1; instance->my_token_seq = SEQNO_START_TOKEN - 1; instance->memb_state = MEMB_STATE_OPERATIONAL; instance->set_aru = -1; instance->my_aru = SEQNO_START_MSG; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_high_delivered = SEQNO_START_MSG; instance->orf_token_discard = 0; instance->originated_orf_token = 0; instance->commit_token = (struct memb_commit_token *)instance->commit_token_storage; instance->waiting_trans_ack = 1; } static int pause_flush (struct totemsrp_instance *instance) { uint64_t now_msec; uint64_t timestamp_msec; int res = 0; now_msec = (qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC); timestamp_msec = instance->pause_timestamp / QB_TIME_NS_IN_MSEC; if ((now_msec - timestamp_msec) > (instance->totem_config->token_timeout / 2)) { log_printf (instance->totemsrp_log_level_notice, "Process pause detected for %d ms, flushing membership messages.", (unsigned int)(now_msec - timestamp_msec)); /* * -1 indicates an error from recvmsg */ do { res = totemnet_recv_mcast_empty (instance->totemnet_context); } while (res == -1); } return (res); } static int token_event_stats_collector (enum totem_callback_token_type type, const void *void_instance) { struct totemsrp_instance *instance = (struct totemsrp_instance *)void_instance; uint32_t time_now; unsigned long long nano_secs = qb_util_nano_current_get (); time_now = (nano_secs / QB_TIME_NS_IN_MSEC); if (type == TOTEM_CALLBACK_TOKEN_RECEIVED) { /* incr latest token the index */ if (instance->stats.latest_token == (TOTEM_TOKEN_STATS_MAX - 1)) instance->stats.latest_token = 0; else instance->stats.latest_token++; if (instance->stats.earliest_token == instance->stats.latest_token) { /* we have filled up the array, start overwriting */ if (instance->stats.earliest_token == (TOTEM_TOKEN_STATS_MAX - 1)) instance->stats.earliest_token = 0; else instance->stats.earliest_token++; instance->stats.token[instance->stats.earliest_token].rx = 0; instance->stats.token[instance->stats.earliest_token].tx = 0; instance->stats.token[instance->stats.earliest_token].backlog_calc = 0; } instance->stats.token[instance->stats.latest_token].rx = time_now; instance->stats.token[instance->stats.latest_token].tx = 0; /* in case we drop the token */ } else { instance->stats.token[instance->stats.latest_token].tx = time_now; } return 0; } static void totempg_mtu_changed(void *context, int net_mtu) { struct totemsrp_instance *instance = context; instance->totem_config->net_mtu = net_mtu - 2 * sizeof (struct mcast); log_printf (instance->totemsrp_log_level_debug, "Net MTU changed to %d, new value is %d", net_mtu, instance->totem_config->net_mtu); } /* * Exported interfaces */ int totemsrp_initialize ( qb_loop_t *poll_handle, void **srp_context, struct totem_config *totem_config, totempg_stats_t *stats, void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id), void (*waiting_trans_ack_cb_fn) ( int waiting_trans_ack)) { struct totemsrp_instance *instance; int res; instance = malloc (sizeof (struct totemsrp_instance)); if (instance == NULL) { goto error_exit; } totemsrp_instance_initialize (instance); instance->totemsrp_waiting_trans_ack_cb_fn = waiting_trans_ack_cb_fn; instance->totemsrp_waiting_trans_ack_cb_fn (1); stats->srp = &instance->stats; instance->stats.latest_token = 0; instance->stats.earliest_token = 0; instance->totem_config = totem_config; /* * Configure logging */ instance->totemsrp_log_level_security = totem_config->totem_logging_configuration.log_level_security; instance->totemsrp_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemsrp_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemsrp_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemsrp_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemsrp_log_level_trace = totem_config->totem_logging_configuration.log_level_trace; instance->totemsrp_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemsrp_log_printf = totem_config->totem_logging_configuration.log_printf; /* * Configure totem store and load functions */ instance->memb_ring_id_create_or_load = totem_config->totem_memb_ring_id_create_or_load; instance->memb_ring_id_store = totem_config->totem_memb_ring_id_store; /* * Initialize local variables for totemsrp */ totemip_copy (&instance->mcast_address, &totem_config->interfaces[instance->lowest_active_if].mcast_addr); /* * Display totem configuration */ log_printf (instance->totemsrp_log_level_debug, "Token Timeout (%d ms) retransmit timeout (%d ms)", totem_config->token_timeout, totem_config->token_retransmit_timeout); if (totem_config->token_warning) { uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100; log_printf(instance->totemsrp_log_level_debug, "Token warning every %d ms (%d%% of Token Timeout)", token_warning_ms, totem_config->token_warning); if (token_warning_ms < totem_config->token_retransmit_timeout) log_printf (LOGSYS_LEVEL_DEBUG, "The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) " "which can lead to spurious token warnings. Consider increasing the token_warning parameter.", token_warning_ms, totem_config->token_retransmit_timeout); } else { log_printf(instance->totemsrp_log_level_debug, "Token warnings disabled"); } log_printf (instance->totemsrp_log_level_debug, "token hold (%d ms) retransmits before loss (%d retrans)", totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const); log_printf (instance->totemsrp_log_level_debug, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)", totem_config->join_timeout, totem_config->send_join_timeout, totem_config->consensus_timeout, totem_config->merge_timeout); log_printf (instance->totemsrp_log_level_debug, "downcheck (%d ms) fail to recv const (%d msgs)", totem_config->downcheck_timeout, totem_config->fail_to_recv_const); log_printf (instance->totemsrp_log_level_debug, "seqno unchanged const (%d rotations) Maximum network MTU %d", totem_config->seqno_unchanged_const, totem_config->net_mtu); log_printf (instance->totemsrp_log_level_debug, "window size per rotation (%d messages) maximum messages per rotation (%d messages)", totem_config->window_size, totem_config->max_messages); log_printf (instance->totemsrp_log_level_debug, "missed count const (%d messages)", totem_config->miss_count_const); log_printf (instance->totemsrp_log_level_debug, "send threads (%d threads)", totem_config->threads); log_printf (instance->totemsrp_log_level_debug, "heartbeat_failures_allowed (%d)", totem_config->heartbeat_failures_allowed); log_printf (instance->totemsrp_log_level_debug, "max_network_delay (%d ms)", totem_config->max_network_delay); cs_queue_init (&instance->retrans_message_queue, RETRANS_MESSAGE_QUEUE_SIZE_MAX, sizeof (struct message_item), instance->threaded_mode_enabled); sq_init (&instance->regular_sort_queue, QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0); sq_init (&instance->recovery_sort_queue, QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0); instance->totemsrp_poll_handle = poll_handle; instance->totemsrp_deliver_fn = deliver_fn; instance->totemsrp_confchg_fn = confchg_fn; instance->use_heartbeat = 1; timer_function_pause_timeout (instance); if ( totem_config->heartbeat_failures_allowed == 0 ) { log_printf (instance->totemsrp_log_level_debug, "HeartBeat is Disabled. To enable set heartbeat_failures_allowed > 0"); instance->use_heartbeat = 0; } if (instance->use_heartbeat) { instance->heartbeat_timeout = (totem_config->heartbeat_failures_allowed) * totem_config->token_retransmit_timeout + totem_config->max_network_delay; if (instance->heartbeat_timeout >= totem_config->token_timeout) { log_printf (instance->totemsrp_log_level_debug, "total heartbeat_timeout (%d ms) is not less than token timeout (%d ms)", instance->heartbeat_timeout, totem_config->token_timeout); log_printf (instance->totemsrp_log_level_debug, "heartbeat_timeout = heartbeat_failures_allowed * token_retransmit_timeout + max_network_delay"); log_printf (instance->totemsrp_log_level_debug, "heartbeat timeout should be less than the token timeout. Heartbeat is disabled!!"); instance->use_heartbeat = 0; } else { log_printf (instance->totemsrp_log_level_debug, "total heartbeat_timeout (%d ms)", instance->heartbeat_timeout); } } res = totemnet_initialize ( poll_handle, &instance->totemnet_context, totem_config, stats->srp, instance, main_deliver_fn, main_iface_change_fn, totempg_mtu_changed, target_set_completed); if (res == -1) { goto error_exit; } instance->my_id.nodeid = instance->totem_config->interfaces[instance->lowest_active_if].boundto.nodeid; /* * Must have net_mtu adjusted by totemnet_initialize first */ cs_queue_init (&instance->new_message_queue, MESSAGE_QUEUE_MAX, sizeof (struct message_item), instance->threaded_mode_enabled); cs_queue_init (&instance->new_message_queue_trans, MESSAGE_QUEUE_MAX, sizeof (struct message_item), instance->threaded_mode_enabled); totemsrp_callback_token_create (instance, &instance->token_recv_event_handle, TOTEM_CALLBACK_TOKEN_RECEIVED, 0, token_event_stats_collector, instance); totemsrp_callback_token_create (instance, &instance->token_sent_event_handle, TOTEM_CALLBACK_TOKEN_SENT, 0, token_event_stats_collector, instance); *srp_context = instance; return (0); error_exit: return (-1); } void totemsrp_finalize ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; memb_leave_message_send (instance); totemnet_finalize (instance->totemnet_context); cs_queue_free (&instance->new_message_queue); cs_queue_free (&instance->new_message_queue_trans); cs_queue_free (&instance->retrans_message_queue); sq_free (&instance->regular_sort_queue); sq_free (&instance->recovery_sort_queue); free (instance); } int totemsrp_nodestatus_get ( void *srp_context, unsigned int nodeid, struct totem_node_status *node_status) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int i; node_status->version = TOTEM_NODE_STATUS_STRUCTURE_VERSION; /* Fill in 'reachable' here as the lower level UDP[u] layers don't know */ for (i = 0; i < instance->my_proc_list_entries; i++) { if (instance->my_proc_list[i].nodeid == nodeid) { node_status->reachable = 1; } } return totemnet_nodestatus_get(instance->totemnet_context, nodeid, node_status); } /* * Return configured interfaces. interfaces is array of totem_ip addresses allocated by caller, * with interaces_size number of items. iface_count is final number of interfaces filled by this * function. * * Function returns 0 on success, otherwise if interfaces array is not big enough, -2 is returned, * and if interface was not found, -1 is returned. */ int totemsrp_ifaces_get ( void *srp_context, unsigned int nodeid, unsigned int *interface_id, struct totem_ip_address *interfaces, unsigned int interfaces_size, char ***status, unsigned int *iface_count) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; struct totem_ip_address *iface_ptr = interfaces; int res = 0; int i,n; int num_ifs = 0; memset(interfaces, 0, sizeof(struct totem_ip_address) * interfaces_size); *iface_count = INTERFACE_MAX; for (i=0; itotem_config->interfaces[i].member_count; n++) { if (instance->totem_config->interfaces[i].configured && instance->totem_config->interfaces[i].member_list[n].nodeid == nodeid) { memcpy(iface_ptr, &instance->totem_config->interfaces[i].member_list[n], sizeof(struct totem_ip_address)); interface_id[num_ifs] = i; iface_ptr++; if (++num_ifs > interfaces_size) { res = -2; break; } } } } totemnet_ifaces_get(instance->totemnet_context, status, iface_count); *iface_count = num_ifs; return (res); } int totemsrp_crypto_set ( void *srp_context, const char *cipher_type, const char *hash_type) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res; res = totemnet_crypto_set(instance->totemnet_context, cipher_type, hash_type); return (res); } unsigned int totemsrp_my_nodeid_get ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; unsigned int res; res = instance->my_id.nodeid; return (res); } int totemsrp_my_family_get ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res; res = instance->totem_config->interfaces[instance->lowest_active_if].boundto.family; return (res); } /* * Set operations for use by the membership algorithm */ static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b) { if (a->nodeid == b->nodeid) { return 1; } return 0; } static void srp_addr_to_nodeid ( struct totemsrp_instance *instance, unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries) { unsigned int i; for (i = 0; i < entries; i++) { nodeid_out[i] = srp_addr_in[i].nodeid; } } static struct srp_addr srp_addr_endian_convert (struct srp_addr in) { struct srp_addr res; res.nodeid = swab32 (in.nodeid); return (res); } static void memb_consensus_reset (struct totemsrp_instance *instance) { instance->consensus_list_entries = 0; } static void memb_set_subtract ( struct srp_addr *out_list, int *out_list_entries, struct srp_addr *one_list, int one_list_entries, struct srp_addr *two_list, int two_list_entries) { int found = 0; int i; int j; *out_list_entries = 0; for (i = 0; i < one_list_entries; i++) { for (j = 0; j < two_list_entries; j++) { if (srp_addr_equal (&one_list[i], &two_list[j])) { found = 1; break; } } if (found == 0) { out_list[*out_list_entries] = one_list[i]; *out_list_entries = *out_list_entries + 1; } found = 0; } } /* * Set consensus for a specific processor */ static void memb_consensus_set ( struct totemsrp_instance *instance, const struct srp_addr *addr) { int found = 0; int i; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal(addr, &instance->consensus_list[i].addr)) { found = 1; break; /* found entry */ } } instance->consensus_list[i].addr = *addr; instance->consensus_list[i].set = 1; if (found == 0) { instance->consensus_list_entries++; } return; } /* * Is consensus set for a specific processor */ static int memb_consensus_isset ( struct totemsrp_instance *instance, const struct srp_addr *addr) { int i; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal (addr, &instance->consensus_list[i].addr)) { return (instance->consensus_list[i].set); } } return (0); } /* * Is consensus agreed upon based upon consensus database */ static int memb_consensus_agreed ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; int token_memb_entries = 0; int agreed = 1; int i; memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); for (i = 0; i < token_memb_entries; i++) { if (memb_consensus_isset (instance, &token_memb[i]) == 0) { agreed = 0; break; } } if (agreed && instance->failed_to_recv == 1) { /* * Both nodes agreed on our failure. We don't care how many proc list items left because we * will create single ring anyway. */ return (agreed); } assert (token_memb_entries >= 1); return (agreed); } static void memb_consensus_notset ( struct totemsrp_instance *instance, struct srp_addr *no_consensus_list, int *no_consensus_list_entries, struct srp_addr *comparison_list, int comparison_list_entries) { int i; *no_consensus_list_entries = 0; for (i = 0; i < instance->my_proc_list_entries; i++) { if (memb_consensus_isset (instance, &instance->my_proc_list[i]) == 0) { no_consensus_list[*no_consensus_list_entries] = instance->my_proc_list[i]; *no_consensus_list_entries = *no_consensus_list_entries + 1; } } } /* * Is set1 equal to set2 Entries can be in different orders */ static int memb_set_equal ( struct srp_addr *set1, int set1_entries, struct srp_addr *set2, int set2_entries) { int i; int j; int found = 0; if (set1_entries != set2_entries) { return (0); } for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { found = 1; break; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * Is subset fully contained in fullset */ static int memb_set_subset ( const struct srp_addr *subset, int subset_entries, const struct srp_addr *fullset, int fullset_entries) { int i; int j; int found = 0; if (subset_entries > fullset_entries) { return (0); } for (i = 0; i < subset_entries; i++) { for (j = 0; j < fullset_entries; j++) { if (srp_addr_equal (&subset[i], &fullset[j])) { found = 1; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * merge subset into fullset taking care not to add duplicates */ static void memb_set_merge ( const struct srp_addr *subset, int subset_entries, struct srp_addr *fullset, int *fullset_entries) { int found = 0; int i; int j; for (i = 0; i < subset_entries; i++) { for (j = 0; j < *fullset_entries; j++) { if (srp_addr_equal (&fullset[j], &subset[i])) { found = 1; break; } } if (found == 0) { fullset[*fullset_entries] = subset[i]; *fullset_entries = *fullset_entries + 1; } found = 0; } return; } static void memb_set_and_with_ring_id ( struct srp_addr *set1, struct memb_ring_id *set1_ring_ids, int set1_entries, struct srp_addr *set2, int set2_entries, struct memb_ring_id *old_ring_id, struct srp_addr *and, int *and_entries) { int i; int j; int found = 0; *and_entries = 0; for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { if (memcmp (&set1_ring_ids[j], old_ring_id, sizeof (struct memb_ring_id)) == 0) { found = 1; } break; } } if (found) { and[*and_entries] = set1[j]; *and_entries = *and_entries + 1; } found = 0; } return; } static void memb_set_log( struct totemsrp_instance *instance, int level, const char *string, struct srp_addr *list, int list_entries) { char int_buf[32]; char list_str[512]; int i; memset(list_str, 0, sizeof(list_str)); for (i = 0; i < list_entries; i++) { if (i == 0) { snprintf(int_buf, sizeof(int_buf), CS_PRI_NODE_ID, list[i].nodeid); } else { snprintf(int_buf, sizeof(int_buf), "," CS_PRI_NODE_ID, list[i].nodeid); } if (strlen(list_str) + strlen(int_buf) >= sizeof(list_str)) { break ; } strcat(list_str, int_buf); } log_printf(level, "List '%s' contains %d entries: %s", string, list_entries, list_str); } static void my_leave_memb_clear( struct totemsrp_instance *instance) { memset(instance->my_leave_memb_list, 0, sizeof(instance->my_leave_memb_list)); instance->my_leave_memb_entries = 0; } static unsigned int my_leave_memb_match( struct totemsrp_instance *instance, unsigned int nodeid) { int i; unsigned int ret = 0; for (i = 0; i < instance->my_leave_memb_entries; i++){ if (instance->my_leave_memb_list[i] == nodeid){ ret = nodeid; break; } } return ret; } static void my_leave_memb_set( struct totemsrp_instance *instance, unsigned int nodeid) { int i, found = 0; for (i = 0; i < instance->my_leave_memb_entries; i++){ if (instance->my_leave_memb_list[i] == nodeid){ found = 1; break; } } if (found == 1) { return; } if (instance->my_leave_memb_entries < (PROCESSOR_COUNT_MAX - 1)) { instance->my_leave_memb_list[instance->my_leave_memb_entries] = nodeid; instance->my_leave_memb_entries++; } else { log_printf (instance->totemsrp_log_level_warning, "Cannot set LEAVE nodeid=" CS_PRI_NODE_ID, nodeid); } } static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance) { assert (instance != NULL); return totemnet_buffer_alloc (instance->totemnet_context); } static void totemsrp_buffer_release (struct totemsrp_instance *instance, void *ptr) { assert (instance != NULL); totemnet_buffer_release (instance->totemnet_context, ptr); } static void reset_token_retransmit_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_retransmit_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_token_retransmit_timeout, &instance->timer_orf_token_retransmit_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_token_retransmit_timeout - qb_loop_timer_add error : %d", res); } } static void start_merge_detect_timeout (struct totemsrp_instance *instance) { int32_t res; if (instance->my_merge_detect_timeout_outstanding == 0) { res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->merge_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "start_merge_detect_timeout - qb_loop_timer_add error : %d", res); } instance->my_merge_detect_timeout_outstanding = 1; } } static void cancel_merge_detect_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_merge_detect_timeout); instance->my_merge_detect_timeout_outstanding = 0; } /* * ring_state_* is used to save and restore the sort queue * state when a recovery operation fails (and enters gather) */ static void old_ring_state_save (struct totemsrp_instance *instance) { if (instance->old_ring_state_saved == 0) { instance->old_ring_state_saved = 1; memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); instance->old_ring_state_aru = instance->my_aru; instance->old_ring_state_high_seq_received = instance->my_high_seq_received; log_printf (instance->totemsrp_log_level_debug, "Saving state aru %x high seq received %x", instance->my_aru, instance->my_high_seq_received); } } static void old_ring_state_restore (struct totemsrp_instance *instance) { instance->my_aru = instance->old_ring_state_aru; instance->my_high_seq_received = instance->old_ring_state_high_seq_received; log_printf (instance->totemsrp_log_level_debug, "Restoring instance->my_aru %x my high seq received %x", instance->my_aru, instance->my_high_seq_received); } static void old_ring_state_reset (struct totemsrp_instance *instance) { log_printf (instance->totemsrp_log_level_debug, "Resetting old ring state"); instance->old_ring_state_saved = 0; } static void reset_pause_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_pause_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_timeout * QB_TIME_NS_IN_MSEC / 5, (void *)instance, timer_function_pause_timeout, &instance->timer_pause_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_pause_timeout - qb_loop_timer_add error : %d", res); } } static void reset_token_warning (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_warning * instance->totem_config->token_timeout / 100 * QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_orf_token_warning, &instance->timer_orf_token_warning); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_token_warning - qb_loop_timer_add error : %d", res); } } static void reset_token_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_orf_token_timeout, &instance->timer_orf_token_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_token_timeout - qb_loop_timer_add error : %d", res); } if (instance->totem_config->token_warning) reset_token_warning(instance); } static void reset_heartbeat_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->heartbeat_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_heartbeat_timeout, &instance->timer_heartbeat_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_heartbeat_timeout - qb_loop_timer_add error : %d", res); } } static void cancel_token_warning (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning); } static void cancel_token_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); if (instance->totem_config->token_warning) cancel_token_warning(instance); } static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); } static void cancel_token_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); } static void start_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { int32_t res; res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_hold_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_token_hold_retransmit_timeout, &instance->timer_orf_token_hold_retransmit_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "start_token_hold_retransmit_timeout - qb_loop_timer_add error : %d", res); } } static void cancel_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_hold_retransmit_timeout); } static void memb_state_consensus_timeout_expired ( struct totemsrp_instance *instance) { struct srp_addr no_consensus_list[PROCESSOR_COUNT_MAX]; int no_consensus_list_entries; instance->stats.consensus_timeouts++; if (memb_consensus_agreed (instance)) { memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); reset_token_timeout (instance); // REVIEWED } else { memb_consensus_notset ( instance, no_consensus_list, &no_consensus_list_entries, instance->my_proc_list, instance->my_proc_list_entries); memb_set_merge (no_consensus_list, no_consensus_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT); } } static void memb_join_message_send (struct totemsrp_instance *instance); static void memb_merge_detect_transmit (struct totemsrp_instance *instance); /* * Timers used for various states of the membership algorithm */ static void timer_function_pause_timeout (void *data) { struct totemsrp_instance *instance = data; instance->pause_timestamp = qb_util_nano_current_get (); reset_pause_timeout (instance); } static void memb_recovery_state_token_loss (struct totemsrp_instance *instance) { old_ring_state_restore (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE); instance->stats.recovery_token_lost++; } static void timer_function_orf_token_warning (void *data) { struct totemsrp_instance *instance = data; uint64_t tv_diff; /* need to protect against the case where token_warning is set to 0 dynamically */ if (instance->totem_config->token_warning) { tv_diff = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC - instance->stats.token[instance->stats.latest_token].rx; log_printf (instance->totemsrp_log_level_notice, "Token has not been received in %d ms ", (unsigned int) tv_diff); reset_token_warning(instance); } else { cancel_token_warning(instance); } } static void timer_function_orf_token_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the OPERATIONAL state."); log_printf (instance->totemsrp_log_level_notice, "A processor failed, forming new configuration:" " token timed out (%ums), waiting %ums for consensus.", instance->totem_config->token_timeout, instance->totem_config->consensus_timeout); totemnet_iface_check (instance->totemnet_context); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE); instance->stats.operational_token_lost++; break; case MEMB_STATE_GATHER: log_printf (instance->totemsrp_log_level_debug, "The consensus timeout expired (%ums).", instance->totem_config->consensus_timeout); memb_state_consensus_timeout_expired (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED); instance->stats.gather_token_lost++; break; case MEMB_STATE_COMMIT: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the COMMIT state."); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE); instance->stats.commit_token_lost++; break; case MEMB_STATE_RECOVERY: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the RECOVERY state."); memb_recovery_state_token_loss (instance); instance->orf_token_discard = 1; break; } } static void timer_function_heartbeat_timeout (void *data) { struct totemsrp_instance *instance = data; log_printf (instance->totemsrp_log_level_debug, "HeartBeat Timer expired Invoking token loss mechanism in state %d ", instance->memb_state); timer_function_orf_token_timeout(data); } static void memb_timer_function_state_gather (void *data) { struct totemsrp_instance *instance = data; int32_t res; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: assert (0); /* this should never happen */ break; case MEMB_STATE_GATHER: case MEMB_STATE_COMMIT: memb_join_message_send (instance); /* * Restart the join timeout `*/ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->join_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "memb_timer_function_state_gather - qb_loop_timer_add error : %d", res); } break; } } static void memb_timer_function_gather_consensus_timeout (void *data) { struct totemsrp_instance *instance = data; memb_state_consensus_timeout_expired (instance); } static void deliver_messages_from_recovery_to_regular (struct totemsrp_instance *instance) { unsigned int i; struct sort_queue_item *recovery_message_item; struct sort_queue_item regular_message_item; unsigned int range = 0; int res; void *ptr; struct mcast *mcast; log_printf (instance->totemsrp_log_level_debug, "recovery to regular %x-%x", SEQNO_START_MSG + 1, instance->my_aru); range = instance->my_aru - SEQNO_START_MSG; /* * Move messages from recovery to regular sort queue */ // todo should i be initialized to 0 or 1 ? for (i = 1; i <= range; i++) { res = sq_item_get (&instance->recovery_sort_queue, i + SEQNO_START_MSG, &ptr); if (res != 0) { continue; } recovery_message_item = ptr; /* * Convert recovery message into regular message */ mcast = recovery_message_item->mcast; if (mcast->header.encapsulated == MESSAGE_ENCAPSULATED) { /* * Message is a recovery message encapsulated * in a new ring message */ regular_message_item.mcast = (struct mcast *)(((char *)recovery_message_item->mcast) + sizeof (struct mcast)); regular_message_item.msg_len = recovery_message_item->msg_len - sizeof (struct mcast); mcast = regular_message_item.mcast; } else { /* * TODO this case shouldn't happen */ continue; } log_printf (instance->totemsrp_log_level_debug, "comparing if ring id is for this processors old ring seqno " CS_PRI_RING_ID_SEQ, (uint64_t)mcast->seq); /* * Only add this message to the regular sort * queue if it was originated with the same ring * id as the previous ring */ if (memcmp (&instance->my_old_ring_id, &mcast->ring_id, sizeof (struct memb_ring_id)) == 0) { res = sq_item_inuse (&instance->regular_sort_queue, mcast->seq); if (res == 0) { sq_item_add (&instance->regular_sort_queue, ®ular_message_item, mcast->seq); if (sq_lt_compare (instance->old_ring_state_high_seq_received, mcast->seq)) { instance->old_ring_state_high_seq_received = mcast->seq; } } } else { log_printf (instance->totemsrp_log_level_debug, "-not adding msg with seq no " CS_PRI_RING_ID_SEQ, (uint64_t)mcast->seq); } } } /* * Change states in the state machine of the membership algorithm */ static void memb_state_operational_enter (struct totemsrp_instance *instance) { struct srp_addr joined_list[PROCESSOR_COUNT_MAX]; int joined_list_entries = 0; unsigned int aru_save; unsigned int joined_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int trans_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int new_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int left_list[PROCESSOR_COUNT_MAX]; unsigned int i; unsigned int res; char left_node_msg[1024]; char joined_node_msg[1024]; char failed_node_msg[1024]; instance->originated_orf_token = 0; memb_consensus_reset (instance); old_ring_state_reset (instance); deliver_messages_from_recovery_to_regular (instance); log_printf (instance->totemsrp_log_level_trace, "Delivering to app %x to %x", instance->my_high_delivered + 1, instance->old_ring_state_high_seq_received); aru_save = instance->my_aru; instance->my_aru = instance->old_ring_state_aru; messages_deliver_to_app (instance, 0, instance->old_ring_state_high_seq_received); /* * Calculate joined and left list */ memb_set_subtract (instance->my_left_memb_list, &instance->my_left_memb_entries, instance->my_memb_list, instance->my_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); memb_set_subtract (joined_list, &joined_list_entries, instance->my_new_memb_list, instance->my_new_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); /* * Install new membership */ instance->my_memb_entries = instance->my_new_memb_entries; memcpy (&instance->my_memb_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->last_released = 0; instance->my_set_retrans_flg = 0; /* * Deliver transitional configuration to application */ srp_addr_to_nodeid (instance, left_list, instance->my_left_memb_list, instance->my_left_memb_entries); srp_addr_to_nodeid (instance, trans_memb_list_totemip, instance->my_trans_memb_list, instance->my_trans_memb_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_TRANSITIONAL, trans_memb_list_totemip, instance->my_trans_memb_entries, left_list, instance->my_left_memb_entries, 0, 0, &instance->my_ring_id); /* * Switch new totemsrp messages queue. Messages sent from now on are stored * in different queue so synchronization messages are delivered first. Totempg * buffers will be switched later. */ instance->waiting_trans_ack = 1; // TODO we need to filter to ensure we only deliver those // messages which are part of instance->my_deliver_memb messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received); /* * Switch totempg buffers. This used to be right after * instance->waiting_trans_ack = 1; * line. This was causing problem, because there may be not yet * processed parts of messages in totempg buffers. * So when buffers were switched and recovered messages * got delivered it was not possible to assemble them. */ instance->totemsrp_waiting_trans_ack_cb_fn (1); instance->my_aru = aru_save; /* * Deliver regular configuration to application */ srp_addr_to_nodeid (instance, new_memb_list_totemip, instance->my_new_memb_list, instance->my_new_memb_entries); srp_addr_to_nodeid (instance, joined_list_totemip, joined_list, joined_list_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_REGULAR, new_memb_list_totemip, instance->my_new_memb_entries, 0, 0, joined_list_totemip, joined_list_entries, &instance->my_ring_id); /* * The recovery sort queue now becomes the regular * sort queue. It is necessary to copy the state * into the regular sort queue. */ sq_copy (&instance->regular_sort_queue, &instance->recovery_sort_queue); instance->my_last_aru = SEQNO_START_MSG; /* When making my_proc_list smaller, ensure that the * now non-used entries are zero-ed out. There are some suspect * assert's that assume that there is always 2 entries in the list. * These fail when my_proc_list is reduced to 1 entry (and the * valid [0] entry is the same as the 'unused' [1] entry). */ memset(instance->my_proc_list, 0, sizeof (struct srp_addr) * instance->my_proc_list_entries); instance->my_proc_list_entries = instance->my_new_memb_entries; memcpy (instance->my_proc_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->my_failed_list_entries = 0; /* * TODO Not exactly to spec * * At the entry to this function all messages without a gap are * deliered. * * This code throw away messages from the last gap in the sort queue * to my_high_seq_received * * What should really happen is we should deliver all messages up to * a gap, then delier the transitional configuration, then deliver * the messages between the first gap and my_high_seq_received, then * deliver a regular configuration, then deliver the regular * configuration * * Unfortunately totempg doesn't appear to like this operating mode * which needs more inspection */ i = instance->my_high_seq_received + 1; do { void *ptr; i -= 1; res = sq_item_get (&instance->regular_sort_queue, i, &ptr); if (i == 0) { break; } } while (res); instance->my_high_delivered = i; for (i = 0; i <= instance->my_high_delivered; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, i, &ptr); if (res == 0) { struct sort_queue_item *regular_message; regular_message = ptr; free (regular_message->mcast); } } sq_items_release (&instance->regular_sort_queue, instance->my_high_delivered); instance->last_released = instance->my_high_delivered; if (joined_list_entries) { int sptr = 0; sptr += snprintf(joined_node_msg, sizeof(joined_node_msg)-sptr, " joined:"); for (i=0; i< joined_list_entries; i++) { sptr += snprintf(joined_node_msg+sptr, sizeof(joined_node_msg)-sptr, " " CS_PRI_NODE_ID, joined_list_totemip[i]); } } else { joined_node_msg[0] = '\0'; } if (instance->my_left_memb_entries) { int sptr = 0; int sptr2 = 0; sptr += snprintf(left_node_msg, sizeof(left_node_msg)-sptr, " left:"); for (i=0; i< instance->my_left_memb_entries; i++) { sptr += snprintf(left_node_msg+sptr, sizeof(left_node_msg)-sptr, " " CS_PRI_NODE_ID, left_list[i]); } for (i=0; i< instance->my_left_memb_entries; i++) { if (my_leave_memb_match(instance, left_list[i]) == 0) { if (sptr2 == 0) { sptr2 += snprintf(failed_node_msg, sizeof(failed_node_msg)-sptr2, " failed:"); } sptr2 += snprintf(failed_node_msg+sptr2, sizeof(left_node_msg)-sptr2, " " CS_PRI_NODE_ID, left_list[i]); } } if (sptr2 == 0) { failed_node_msg[0] = '\0'; } } else { left_node_msg[0] = '\0'; failed_node_msg[0] = '\0'; } my_leave_memb_clear(instance); log_printf (instance->totemsrp_log_level_debug, "entering OPERATIONAL state."); log_printf (instance->totemsrp_log_level_notice, "A new membership (" CS_PRI_RING_ID ") was formed. Members%s%s", instance->my_ring_id.rep, (uint64_t)instance->my_ring_id.seq, joined_node_msg, left_node_msg); if (strlen(failed_node_msg)) { log_printf (instance->totemsrp_log_level_notice, "Failed to receive the leave message.%s", failed_node_msg); } instance->memb_state = MEMB_STATE_OPERATIONAL; instance->stats.operational_entered++; instance->stats.continuous_gather = 0; instance->my_received_flg = 1; reset_pause_timeout (instance); /* * Save ring id information from this configuration to determine * which processors are transitioning from old regular configuration * in to new regular configuration on the next configuration change */ memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); return; } static void memb_state_gather_enter ( struct totemsrp_instance *instance, enum gather_state_from gather_from) { int32_t res; instance->orf_token_discard = 1; instance->originated_orf_token = 0; memb_set_merge ( &instance->my_id, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_join_message_send (instance); /* * Restart the join timeout */ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->join_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "memb_state_gather_enter - qb_loop_timer_add error(1) : %d", res); } /* * Restart the consensus timeout */ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->consensus_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_gather_consensus_timeout, &instance->memb_timer_state_gather_consensus_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "memb_state_gather_enter - qb_loop_timer_add error(2) : %d", res); } /* * Cancel the token loss and token retransmission timeouts */ cancel_token_retransmit_timeout (instance); // REVIEWED cancel_token_timeout (instance); // REVIEWED cancel_merge_detect_timeout (instance); memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); log_printf (instance->totemsrp_log_level_debug, "entering GATHER state from %d(%s).", gather_from, gsfrom_to_msg(gather_from)); instance->memb_state = MEMB_STATE_GATHER; instance->stats.gather_entered++; if (gather_from == TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED) { /* * State 3 means gather, so we are continuously gathering. */ instance->stats.continuous_gather++; } return; } static void timer_function_token_retransmit_timeout (void *data); static void target_set_completed ( void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; memb_state_commit_token_send (instance); } static void memb_state_commit_enter ( struct totemsrp_instance *instance) { old_ring_state_save (instance); memb_state_commit_token_update (instance); memb_state_commit_token_target_set (instance); qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); instance->memb_timer_state_gather_join_timeout = 0; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); instance->memb_timer_state_gather_consensus_timeout = 0; memb_ring_id_set (instance, &instance->commit_token->ring_id); instance->memb_ring_id_store (&instance->my_ring_id, instance->my_id.nodeid); instance->token_ring_id_seq = instance->my_ring_id.seq; log_printf (instance->totemsrp_log_level_debug, "entering COMMIT state."); instance->memb_state = MEMB_STATE_COMMIT; reset_token_retransmit_timeout (instance); // REVIEWED reset_token_timeout (instance); // REVIEWED instance->stats.commit_entered++; instance->stats.continuous_gather = 0; /* * reset all flow control variables since we are starting a new ring */ instance->my_trc = 0; instance->my_pbl = 0; instance->my_cbl = 0; /* * commit token sent after callback that token target has been set */ } static void memb_state_recovery_enter ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { int i; int local_received_flg = 1; unsigned int low_ring_aru; unsigned int range = 0; unsigned int messages_originated = 0; const struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; struct memb_ring_id my_new_memb_ring_id_list[PROCESSOR_COUNT_MAX]; addr = (const struct srp_addr *)commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries); log_printf (instance->totemsrp_log_level_debug, "entering RECOVERY state."); instance->orf_token_discard = 0; instance->my_high_ring_delivered = 0; sq_reinit (&instance->recovery_sort_queue, SEQNO_START_MSG); cs_queue_reinit (&instance->retrans_message_queue); low_ring_aru = instance->old_ring_state_high_seq_received; memb_state_commit_token_send_recovery (instance, commit_token); instance->my_token_seq = SEQNO_START_TOKEN - 1; /* * Build regular configuration */ totemnet_processor_count_set ( instance->totemnet_context, commit_token->addr_entries); /* * Build transitional configuration */ for (i = 0; i < instance->my_new_memb_entries; i++) { memcpy (&my_new_memb_ring_id_list[i], &memb_list[i].ring_id, sizeof (struct memb_ring_id)); } memb_set_and_with_ring_id ( instance->my_new_memb_list, my_new_memb_ring_id_list, instance->my_new_memb_entries, instance->my_memb_list, instance->my_memb_entries, &instance->my_old_ring_id, instance->my_trans_memb_list, &instance->my_trans_memb_entries); for (i = 0; i < instance->my_trans_memb_entries; i++) { log_printf (instance->totemsrp_log_level_debug, "TRANS [%d] member " CS_PRI_NODE_ID ":", i, instance->my_trans_memb_list[i].nodeid); } for (i = 0; i < instance->my_new_memb_entries; i++) { log_printf (instance->totemsrp_log_level_debug, "position [%d] member " CS_PRI_NODE_ID ":", i, addr[i].nodeid); log_printf (instance->totemsrp_log_level_debug, "previous ringid (" CS_PRI_RING_ID ")", memb_list[i].ring_id.rep, (uint64_t)memb_list[i].ring_id.seq); log_printf (instance->totemsrp_log_level_debug, "aru %x high delivered %x received flag %d", memb_list[i].aru, memb_list[i].high_delivered, memb_list[i].received_flg); // assert (totemip_print (&memb_list[i].ring_id.rep) != 0); } /* * Determine if any received flag is false */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_trans_memb_list, instance->my_trans_memb_entries) && memb_list[i].received_flg == 0) { instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct srp_addr) * instance->my_trans_memb_entries); local_received_flg = 0; break; } } if (local_received_flg == 1) { goto no_originate; } /* Else originate messages if we should */ /* * Calculate my_low_ring_aru, instance->my_high_ring_delivered for the transitional membership */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) && memcmp (&instance->my_old_ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (memb_list[i].aru, low_ring_aru)) { low_ring_aru = memb_list[i].aru; } if (sq_lt_compare (instance->my_high_ring_delivered, memb_list[i].high_delivered)) { instance->my_high_ring_delivered = memb_list[i].high_delivered; } } } /* * Copy all old ring messages to instance->retrans_message_queue */ range = instance->old_ring_state_high_seq_received - low_ring_aru; if (range == 0) { /* * No messages to copy */ goto no_originate; } assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); log_printf (instance->totemsrp_log_level_debug, "copying all old ring messages from %x-%x.", low_ring_aru + 1, instance->old_ring_state_high_seq_received); for (i = 1; i <= range; i++) { struct sort_queue_item *sort_queue_item; struct message_item message_item; void *ptr; int res; res = sq_item_get (&instance->regular_sort_queue, low_ring_aru + i, &ptr); if (res != 0) { continue; } sort_queue_item = ptr; messages_originated++; memset (&message_item, 0, sizeof (struct message_item)); // TODO LEAK message_item.mcast = totemsrp_buffer_alloc (instance); assert (message_item.mcast); memset(message_item.mcast, 0, sizeof (struct mcast)); message_item.mcast->header.magic = TOTEM_MH_MAGIC; message_item.mcast->header.version = TOTEM_MH_VERSION; message_item.mcast->header.type = MESSAGE_TYPE_MCAST; message_item.mcast->system_from = instance->my_id; message_item.mcast->header.encapsulated = MESSAGE_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.nodeid; assert (message_item.mcast->header.nodeid); memcpy (&message_item.mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); message_item.msg_len = sort_queue_item->msg_len + sizeof (struct mcast); memcpy (((char *)message_item.mcast) + sizeof (struct mcast), sort_queue_item->mcast, sort_queue_item->msg_len); cs_queue_item_add (&instance->retrans_message_queue, &message_item); } log_printf (instance->totemsrp_log_level_debug, "Originated %d messages in RECOVERY.", messages_originated); goto originated; no_originate: log_printf (instance->totemsrp_log_level_debug, "Did not need to originate any messages in recovery."); originated: instance->my_aru = SEQNO_START_MSG; instance->my_aru_count = 0; instance->my_seq_unchanged = 0; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_install_seq = SEQNO_START_MSG; instance->last_released = SEQNO_START_MSG; reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED instance->memb_state = MEMB_STATE_RECOVERY; instance->stats.recovery_entered++; instance->stats.continuous_gather = 0; return; } void totemsrp_event_signal (void *srp_context, enum totem_event_type type, int value) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; token_hold_cancel_send (instance); return; } int totemsrp_mcast ( void *srp_context, struct iovec *iovec, unsigned int iov_len, int guarantee) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int i; struct message_item message_item; char *addr; unsigned int addr_idx; struct cs_queue *queue_use; if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } if (cs_queue_is_full (queue_use)) { log_printf (instance->totemsrp_log_level_debug, "queue full"); return (-1); } memset (&message_item, 0, sizeof (struct message_item)); /* * Allocate pending item */ message_item.mcast = totemsrp_buffer_alloc (instance); if (message_item.mcast == 0) { goto error_mcast; } /* * Set mcast header */ memset(message_item.mcast, 0, sizeof (struct mcast)); message_item.mcast->header.magic = TOTEM_MH_MAGIC; message_item.mcast->header.version = TOTEM_MH_VERSION; message_item.mcast->header.type = MESSAGE_TYPE_MCAST; message_item.mcast->header.encapsulated = MESSAGE_NOT_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.nodeid; assert (message_item.mcast->header.nodeid); message_item.mcast->guarantee = guarantee; message_item.mcast->system_from = instance->my_id; addr = (char *)message_item.mcast; addr_idx = sizeof (struct mcast); for (i = 0; i < iov_len; i++) { memcpy (&addr[addr_idx], iovec[i].iov_base, iovec[i].iov_len); addr_idx += iovec[i].iov_len; } message_item.msg_len = addr_idx; log_printf (instance->totemsrp_log_level_trace, "mcasted message added to pending queue"); instance->stats.mcast_tx++; cs_queue_item_add (queue_use, &message_item); return (0); error_mcast: return (-1); } /* * Determine if there is room to queue a new message */ int totemsrp_avail (void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int avail; struct cs_queue *queue_use; if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } cs_queue_avail (queue_use, &avail); return (avail); } /* * ORF Token Management */ /* * Recast message to mcast group if it is available */ static int orf_token_remcast ( struct totemsrp_instance *instance, int seq) { struct sort_queue_item *sort_queue_item; int res; void *ptr; struct sq *sort_queue; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } res = sq_in_range (sort_queue, seq); if (res == 0) { log_printf (instance->totemsrp_log_level_debug, "sq not in range"); return (-1); } /* * Get RTR item at seq, if not available, return */ res = sq_item_get (sort_queue, seq, &ptr); if (res != 0) { return -1; } sort_queue_item = ptr; totemnet_mcast_noflush_send ( instance->totemnet_context, sort_queue_item->mcast, sort_queue_item->msg_len); return (0); } /* * Free all freeable messages from ring */ static void messages_free ( struct totemsrp_instance *instance, unsigned int token_aru) { struct sort_queue_item *regular_message; unsigned int i; int res; int log_release = 0; unsigned int release_to; unsigned int range = 0; release_to = token_aru; if (sq_lt_compare (instance->my_last_aru, release_to)) { release_to = instance->my_last_aru; } if (sq_lt_compare (instance->my_high_delivered, release_to)) { release_to = instance->my_high_delivered; } /* * Ensure we dont try release before an already released point */ if (sq_lt_compare (release_to, instance->last_released)) { return; } range = release_to - instance->last_released; assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); /* * Release retransmit list items if group aru indicates they are transmitted */ for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, instance->last_released + i, &ptr); if (res == 0) { regular_message = ptr; totemsrp_buffer_release (instance, regular_message->mcast); } sq_items_release (&instance->regular_sort_queue, instance->last_released + i); log_release = 1; } instance->last_released += range; if (log_release) { log_printf (instance->totemsrp_log_level_trace, "releasing messages up to and including %x", release_to); } } static void update_aru ( struct totemsrp_instance *instance) { unsigned int i; int res; struct sq *sort_queue; unsigned int range; unsigned int my_aru_saved = 0; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } range = instance->my_high_seq_received - instance->my_aru; my_aru_saved = instance->my_aru; for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (sort_queue, my_aru_saved + i, &ptr); /* * If hole, stop updating aru */ if (res != 0) { break; } } instance->my_aru += i - 1; } /* * Multicasts pending messages onto the ring (requires orf_token possession) */ static int orf_token_mcast ( struct totemsrp_instance *instance, struct orf_token *token, int fcc_mcasts_allowed) { struct message_item *message_item = 0; struct cs_queue *mcast_queue; struct sq *sort_queue; struct sort_queue_item sort_queue_item; struct mcast *mcast; unsigned int fcc_mcast_current; if (instance->memb_state == MEMB_STATE_RECOVERY) { mcast_queue = &instance->retrans_message_queue; sort_queue = &instance->recovery_sort_queue; reset_token_retransmit_timeout (instance); // REVIEWED } else { if (instance->waiting_trans_ack) { mcast_queue = &instance->new_message_queue_trans; } else { mcast_queue = &instance->new_message_queue; } sort_queue = &instance->regular_sort_queue; } for (fcc_mcast_current = 0; fcc_mcast_current < fcc_mcasts_allowed; fcc_mcast_current++) { if (cs_queue_is_empty (mcast_queue)) { break; } message_item = (struct message_item *)cs_queue_item_get (mcast_queue); message_item->mcast->seq = ++token->seq; message_item->mcast->this_seqno = instance->global_seqno++; /* * Build IO vector */ memset (&sort_queue_item, 0, sizeof (struct sort_queue_item)); sort_queue_item.mcast = message_item->mcast; sort_queue_item.msg_len = message_item->msg_len; mcast = sort_queue_item.mcast; memcpy (&mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); /* * Add message to retransmit queue */ sq_item_add (sort_queue, &sort_queue_item, message_item->mcast->seq); totemnet_mcast_noflush_send ( instance->totemnet_context, message_item->mcast, message_item->msg_len); /* * Delete item from pending queue */ cs_queue_item_remove (mcast_queue); /* * If messages mcasted, deliver any new messages to totempg */ instance->my_high_seq_received = token->seq; } update_aru (instance); /* * Return 1 if more messages are available for single node clusters */ return (fcc_mcast_current); } /* * Remulticasts messages in orf_token's retransmit list (requires orf_token) * Modify's orf_token's rtr to include retransmits required by this process */ static int orf_token_rtr ( struct totemsrp_instance *instance, struct orf_token *orf_token, unsigned int *fcc_allowed) { unsigned int res; unsigned int i, j; unsigned int found; struct sq *sort_queue; struct rtr_item *rtr_list; unsigned int range = 0; char retransmit_msg[1024]; char value[64]; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } rtr_list = &orf_token->rtr_list[0]; strcpy (retransmit_msg, "Retransmit List: "); if (orf_token->rtr_list_entries) { log_printf (instance->totemsrp_log_level_debug, "Retransmit List %d", orf_token->rtr_list_entries); for (i = 0; i < orf_token->rtr_list_entries; i++) { sprintf (value, "%x ", rtr_list[i].seq); strcat (retransmit_msg, value); } strcat (retransmit_msg, ""); log_printf (instance->totemsrp_log_level_notice, "%s", retransmit_msg); } /* * Retransmit messages on orf_token's RTR list from RTR queue */ for (instance->fcc_remcast_current = 0, i = 0; instance->fcc_remcast_current < *fcc_allowed && i < orf_token->rtr_list_entries;) { /* * If this retransmit request isn't from this configuration, * try next rtr entry */ if (memcmp (&rtr_list[i].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { i += 1; continue; } res = orf_token_remcast (instance, rtr_list[i].seq); if (res == 0) { /* * Multicasted message, so no need to copy to new retransmit list */ orf_token->rtr_list_entries -= 1; assert (orf_token->rtr_list_entries >= 0); memmove (&rtr_list[i], &rtr_list[i + 1], sizeof (struct rtr_item) * (orf_token->rtr_list_entries - i)); instance->stats.mcast_retx++; instance->fcc_remcast_current++; } else { i += 1; } } *fcc_allowed = *fcc_allowed - instance->fcc_remcast_current; /* * Add messages to retransmit to RTR list * but only retry if there is room in the retransmit list */ range = orf_token->seq - instance->my_aru; assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); for (i = 1; (orf_token->rtr_list_entries < RETRANSMIT_ENTRIES_MAX) && (i <= range); i++) { /* * Ensure message is within the sort queue range */ res = sq_in_range (sort_queue, instance->my_aru + i); if (res == 0) { break; } /* * Find if a message is missing from this processor */ res = sq_item_inuse (sort_queue, instance->my_aru + i); if (res == 0) { /* * Determine how many times we have missed receiving * this sequence number. sq_item_miss_count increments * a counter for the sequence number. The miss count * will be returned and compared. This allows time for * delayed multicast messages to be received before * declaring the message is missing and requesting a * retransmit. */ res = sq_item_miss_count (sort_queue, instance->my_aru + i); if (res < instance->totem_config->miss_count_const) { continue; } /* * Determine if missing message is already in retransmit list */ found = 0; for (j = 0; j < orf_token->rtr_list_entries; j++) { if (instance->my_aru + i == rtr_list[j].seq) { found = 1; } } if (found == 0) { /* * Missing message not found in current retransmit list so add it */ memcpy (&rtr_list[orf_token->rtr_list_entries].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); rtr_list[orf_token->rtr_list_entries].seq = instance->my_aru + i; orf_token->rtr_list_entries++; } } } return (instance->fcc_remcast_current); } static void token_retransmit (struct totemsrp_instance *instance) { totemnet_token_send (instance->totemnet_context, instance->orf_token_retransmit, instance->orf_token_retransmit_size); } /* * Retransmit the regular token if no mcast or token has * been received in retransmit token period retransmit * the token to the next processor */ static void timer_function_token_retransmit_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); reset_token_retransmit_timeout (instance); // REVIEWED break; } } static void timer_function_token_hold_retransmit_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: break; case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); break; } } static void timer_function_merge_detect_timeout(void *data) { struct totemsrp_instance *instance = data; instance->my_merge_detect_timeout_outstanding = 0; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: if (instance->my_ring_id.rep == instance->my_id.nodeid) { memb_merge_detect_transmit (instance); } break; case MEMB_STATE_GATHER: case MEMB_STATE_COMMIT: case MEMB_STATE_RECOVERY: break; } } /* * Send orf_token to next member (requires orf_token) */ static int token_send ( struct totemsrp_instance *instance, struct orf_token *orf_token, int forward_token) { int res = 0; unsigned int orf_token_size; orf_token_size = sizeof (struct orf_token) + (orf_token->rtr_list_entries * sizeof (struct rtr_item)); orf_token->header.nodeid = instance->my_id.nodeid; memcpy (instance->orf_token_retransmit, orf_token, orf_token_size); instance->orf_token_retransmit_size = orf_token_size; assert (orf_token->header.nodeid); if (forward_token == 0) { return (0); } totemnet_token_send (instance->totemnet_context, orf_token, orf_token_size); return (res); } static int token_hold_cancel_send (struct totemsrp_instance *instance) { struct token_hold_cancel token_hold_cancel; /* * Only cancel if the token is currently held */ if (instance->my_token_held == 0) { return (0); } instance->my_token_held = 0; /* * Build message */ token_hold_cancel.header.magic = TOTEM_MH_MAGIC; token_hold_cancel.header.version = TOTEM_MH_VERSION; token_hold_cancel.header.type = MESSAGE_TYPE_TOKEN_HOLD_CANCEL; token_hold_cancel.header.encapsulated = 0; token_hold_cancel.header.nodeid = instance->my_id.nodeid; memcpy (&token_hold_cancel.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); assert (token_hold_cancel.header.nodeid); instance->stats.token_hold_cancel_tx++; totemnet_mcast_flush_send (instance->totemnet_context, &token_hold_cancel, sizeof (struct token_hold_cancel)); return (0); } static int orf_token_send_initial (struct totemsrp_instance *instance) { struct orf_token orf_token; int res; orf_token.header.magic = TOTEM_MH_MAGIC; orf_token.header.version = TOTEM_MH_VERSION; orf_token.header.type = MESSAGE_TYPE_ORF_TOKEN; orf_token.header.encapsulated = 0; orf_token.header.nodeid = instance->my_id.nodeid; assert (orf_token.header.nodeid); orf_token.seq = SEQNO_START_MSG; orf_token.token_seq = SEQNO_START_TOKEN; orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; instance->stats.orf_token_tx++; if (cs_queue_is_empty (&instance->retrans_message_queue) == 1) { orf_token.retrans_flg = 0; instance->my_set_retrans_flg = 0; } else { orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; } orf_token.aru = 0; orf_token.aru = SEQNO_START_MSG - 1; orf_token.aru_addr = instance->my_id.nodeid; memcpy (&orf_token.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); orf_token.fcc = 0; orf_token.backlog = 0; orf_token.rtr_list_entries = 0; res = token_send (instance, &orf_token, 1); return (res); } static void memb_state_commit_token_update ( struct totemsrp_instance *instance) { struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; unsigned int high_aru; unsigned int i; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries); memcpy (instance->my_new_memb_list, addr, sizeof (struct srp_addr) * instance->commit_token->addr_entries); instance->my_new_memb_entries = instance->commit_token->addr_entries; memcpy (&memb_list[instance->commit_token->memb_index].ring_id, &instance->my_old_ring_id, sizeof (struct memb_ring_id)); memb_list[instance->commit_token->memb_index].aru = instance->old_ring_state_aru; /* * TODO high delivered is really instance->my_aru, but with safe this * could change? */ instance->my_received_flg = (instance->my_aru == instance->my_high_seq_received); memb_list[instance->commit_token->memb_index].received_flg = instance->my_received_flg; memb_list[instance->commit_token->memb_index].high_delivered = instance->my_high_delivered; /* * find high aru up to current memb_index for all matching ring ids * if any ring id matching memb_index has aru less then high aru set * received flag for that entry to false */ high_aru = memb_list[instance->commit_token->memb_index].aru; for (i = 0; i <= instance->commit_token->memb_index; i++) { if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (high_aru, memb_list[i].aru)) { high_aru = memb_list[i].aru; } } } for (i = 0; i <= instance->commit_token->memb_index; i++) { if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (memb_list[i].aru, high_aru)) { memb_list[i].received_flg = 0; if (i == instance->commit_token->memb_index) { instance->my_received_flg = 0; } } } } instance->commit_token->header.nodeid = instance->my_id.nodeid; instance->commit_token->memb_index += 1; assert (instance->commit_token->memb_index <= instance->commit_token->addr_entries); assert (instance->commit_token->header.nodeid); } static void memb_state_commit_token_target_set ( struct totemsrp_instance *instance) { struct srp_addr *addr; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; /* Totemnet just looks at the node id */ totemnet_token_target_set ( instance->totemnet_context, addr[instance->commit_token->memb_index % instance->commit_token->addr_entries].nodeid); } static int memb_state_commit_token_send_recovery ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { unsigned int commit_token_size; commit_token->token_seq++; commit_token->header.nodeid = instance->my_id.nodeid; commit_token_size = sizeof (struct memb_commit_token) + ((sizeof (struct srp_addr) + sizeof (struct memb_commit_token_memb_entry)) * commit_token->addr_entries); /* * Make a copy for retransmission if necessary */ memcpy (instance->orf_token_retransmit, commit_token, commit_token_size); instance->orf_token_retransmit_size = commit_token_size; instance->stats.memb_commit_token_tx++; totemnet_token_send (instance->totemnet_context, commit_token, commit_token_size); /* * Request retransmission of the commit token in case it is lost */ reset_token_retransmit_timeout (instance); return (0); } static int memb_state_commit_token_send ( struct totemsrp_instance *instance) { unsigned int commit_token_size; instance->commit_token->token_seq++; instance->commit_token->header.nodeid = instance->my_id.nodeid; commit_token_size = sizeof (struct memb_commit_token) + ((sizeof (struct srp_addr) + sizeof (struct memb_commit_token_memb_entry)) * instance->commit_token->addr_entries); /* * Make a copy for retransmission if necessary */ memcpy (instance->orf_token_retransmit, instance->commit_token, commit_token_size); instance->orf_token_retransmit_size = commit_token_size; instance->stats.memb_commit_token_tx++; totemnet_token_send (instance->totemnet_context, instance->commit_token, commit_token_size); /* * Request retransmission of the commit token in case it is lost */ reset_token_retransmit_timeout (instance); return (0); } static int memb_lowest_in_config (struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; int token_memb_entries = 0; int i; unsigned int lowest_nodeid; memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); /* * find representative by searching for smallest identifier */ assert(token_memb_entries > 0); lowest_nodeid = token_memb[0].nodeid; for (i = 1; i < token_memb_entries; i++) { if (lowest_nodeid > token_memb[i].nodeid) { lowest_nodeid = token_memb[i].nodeid; } } return (lowest_nodeid == instance->my_id.nodeid); } static int srp_addr_compare (const void *a, const void *b) { const struct srp_addr *srp_a = (const struct srp_addr *)a; const struct srp_addr *srp_b = (const struct srp_addr *)b; if (srp_a->nodeid < srp_b->nodeid) { return -1; } else if (srp_a->nodeid > srp_b->nodeid) { return 1; } else { return 0; } } static void memb_state_commit_token_create ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; int token_memb_entries = 0; log_printf (instance->totemsrp_log_level_debug, "Creating commit token because I am the rep."); memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); memset (instance->commit_token, 0, sizeof (struct memb_commit_token)); instance->commit_token->header.magic = TOTEM_MH_MAGIC; instance->commit_token->header.version = TOTEM_MH_VERSION; instance->commit_token->header.type = MESSAGE_TYPE_MEMB_COMMIT_TOKEN; instance->commit_token->header.encapsulated = 0; instance->commit_token->header.nodeid = instance->my_id.nodeid; assert (instance->commit_token->header.nodeid); instance->commit_token->ring_id.rep = instance->my_id.nodeid; instance->commit_token->ring_id.seq = instance->token_ring_id_seq + 4; /* * This qsort is necessary to ensure the commit token traverses * the ring in the proper order */ qsort (token_memb, token_memb_entries, sizeof (struct srp_addr), srp_addr_compare); instance->commit_token->memb_index = 0; instance->commit_token->addr_entries = token_memb_entries; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries); memcpy (addr, token_memb, token_memb_entries * sizeof (struct srp_addr)); memset (memb_list, 0, sizeof (struct memb_commit_token_memb_entry) * token_memb_entries); } static void memb_join_message_send (struct totemsrp_instance *instance) { char memb_join_data[40000]; struct memb_join *memb_join = (struct memb_join *)memb_join_data; char *addr; unsigned int addr_idx; size_t msg_len; memb_join->header.magic = TOTEM_MH_MAGIC; memb_join->header.version = TOTEM_MH_VERSION; memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join->header.encapsulated = 0; memb_join->header.nodeid = instance->my_id.nodeid; assert (memb_join->header.nodeid); msg_len = sizeof(struct memb_join) + ((instance->my_proc_list_entries + instance->my_failed_list_entries) * sizeof(struct srp_addr)); if (msg_len > sizeof(memb_join_data)) { log_printf (instance->totemsrp_log_level_error, "memb_join_message too long. Ignoring message."); return ; } memb_join->ring_seq = instance->my_ring_id.seq; memb_join->proc_list_entries = instance->my_proc_list_entries; memb_join->failed_list_entries = instance->my_failed_list_entries; memb_join->system_from = instance->my_id; /* * This mess adds the joined and failed processor lists into the join * message */ addr = (char *)memb_join; addr_idx = sizeof (struct memb_join); memcpy (&addr[addr_idx], instance->my_proc_list, instance->my_proc_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_proc_list_entries * sizeof (struct srp_addr); memcpy (&addr[addr_idx], instance->my_failed_list, instance->my_failed_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_failed_list_entries * sizeof (struct srp_addr); if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } instance->stats.memb_join_tx++; totemnet_mcast_flush_send ( instance->totemnet_context, memb_join, addr_idx); } static void memb_leave_message_send (struct totemsrp_instance *instance) { char memb_join_data[40000]; struct memb_join *memb_join = (struct memb_join *)memb_join_data; char *addr; unsigned int addr_idx; int active_memb_entries; struct srp_addr active_memb[PROCESSOR_COUNT_MAX]; size_t msg_len; log_printf (instance->totemsrp_log_level_debug, "sending join/leave message"); /* * add us to the failed list, and remove us from * the members list */ memb_set_merge( &instance->my_id, 1, instance->my_failed_list, &instance->my_failed_list_entries); memb_set_subtract (active_memb, &active_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, &instance->my_id, 1); msg_len = sizeof(struct memb_join) + ((active_memb_entries + instance->my_failed_list_entries) * sizeof(struct srp_addr)); if (msg_len > sizeof(memb_join_data)) { log_printf (instance->totemsrp_log_level_error, "memb_leave message too long. Ignoring message."); return ; } memb_join->header.magic = TOTEM_MH_MAGIC; memb_join->header.version = TOTEM_MH_VERSION; memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join->header.encapsulated = 0; memb_join->header.nodeid = LEAVE_DUMMY_NODEID; memb_join->ring_seq = instance->my_ring_id.seq; memb_join->proc_list_entries = active_memb_entries; memb_join->failed_list_entries = instance->my_failed_list_entries; memb_join->system_from = instance->my_id; // TODO: CC Maybe use the actual join send routine. /* * This mess adds the joined and failed processor lists into the join * message */ addr = (char *)memb_join; addr_idx = sizeof (struct memb_join); memcpy (&addr[addr_idx], active_memb, active_memb_entries * sizeof (struct srp_addr)); addr_idx += active_memb_entries * sizeof (struct srp_addr); memcpy (&addr[addr_idx], instance->my_failed_list, instance->my_failed_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_failed_list_entries * sizeof (struct srp_addr); if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } instance->stats.memb_join_tx++; totemnet_mcast_flush_send ( instance->totemnet_context, memb_join, addr_idx); } static void memb_merge_detect_transmit (struct totemsrp_instance *instance) { struct memb_merge_detect memb_merge_detect; memb_merge_detect.header.magic = TOTEM_MH_MAGIC; memb_merge_detect.header.version = TOTEM_MH_VERSION; memb_merge_detect.header.type = MESSAGE_TYPE_MEMB_MERGE_DETECT; memb_merge_detect.header.encapsulated = 0; memb_merge_detect.header.nodeid = instance->my_id.nodeid; memb_merge_detect.system_from = instance->my_id; memcpy (&memb_merge_detect.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); assert (memb_merge_detect.header.nodeid); instance->stats.memb_merge_detect_tx++; totemnet_mcast_flush_send (instance->totemnet_context, &memb_merge_detect, sizeof (struct memb_merge_detect)); } static void memb_ring_id_set ( struct totemsrp_instance *instance, const struct memb_ring_id *ring_id) { memcpy (&instance->my_ring_id, ring_id, sizeof (struct memb_ring_id)); } int totemsrp_callback_token_create ( void *srp_context, void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; struct token_callback_instance *callback_handle; token_hold_cancel_send (instance); callback_handle = malloc (sizeof (struct token_callback_instance)); if (callback_handle == 0) { return (-1); } *handle_out = (void *)callback_handle; qb_list_init (&callback_handle->list); callback_handle->callback_fn = callback_fn; callback_handle->data = (void *) data; callback_handle->callback_type = type; callback_handle->delete = delete; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: qb_list_add (&callback_handle->list, &instance->token_callback_received_listhead); break; case TOTEM_CALLBACK_TOKEN_SENT: qb_list_add (&callback_handle->list, &instance->token_callback_sent_listhead); break; } return (0); } void totemsrp_callback_token_destroy (void *srp_context, void **handle_out) { struct token_callback_instance *h; if (*handle_out) { h = (struct token_callback_instance *)*handle_out; qb_list_del (&h->list); free (h); h = NULL; *handle_out = 0; } } static void token_callbacks_execute ( struct totemsrp_instance *instance, enum totem_callback_token_type type) { struct qb_list_head *list, *tmp_iter; struct qb_list_head *callback_listhead = 0; struct token_callback_instance *token_callback_instance; int res; int del; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: callback_listhead = &instance->token_callback_received_listhead; break; case TOTEM_CALLBACK_TOKEN_SENT: callback_listhead = &instance->token_callback_sent_listhead; break; default: assert (0); } qb_list_for_each_safe(list, tmp_iter, callback_listhead) { token_callback_instance = qb_list_entry (list, struct token_callback_instance, list); del = token_callback_instance->delete; if (del == 1) { qb_list_del (list); } res = token_callback_instance->callback_fn ( token_callback_instance->callback_type, token_callback_instance->data); /* * This callback failed to execute, try it again on the next token */ if (res == -1 && del == 1) { qb_list_add (list, callback_listhead); } else if (del) { free (token_callback_instance); } } } /* * Flow control functions */ static unsigned int backlog_get (struct totemsrp_instance *instance) { unsigned int backlog = 0; struct cs_queue *queue_use = NULL; if (instance->memb_state == MEMB_STATE_OPERATIONAL) { if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } } else if (instance->memb_state == MEMB_STATE_RECOVERY) { queue_use = &instance->retrans_message_queue; } if (queue_use != NULL) { backlog = cs_queue_used (queue_use); } instance->stats.token[instance->stats.latest_token].backlog_calc = backlog; return (backlog); } static int fcc_calculate ( struct totemsrp_instance *instance, struct orf_token *token) { unsigned int transmits_allowed; unsigned int backlog_calc; transmits_allowed = instance->totem_config->max_messages; if (transmits_allowed > instance->totem_config->window_size - token->fcc) { transmits_allowed = instance->totem_config->window_size - token->fcc; } instance->my_cbl = backlog_get (instance); /* * Only do backlog calculation if there is a backlog otherwise * we would result in div by zero */ if (token->backlog + instance->my_cbl - instance->my_pbl) { backlog_calc = (instance->totem_config->window_size * instance->my_pbl) / (token->backlog + instance->my_cbl - instance->my_pbl); if (backlog_calc > 0 && transmits_allowed > backlog_calc) { transmits_allowed = backlog_calc; } } return (transmits_allowed); } /* * don't overflow the RTR sort queue */ static void fcc_rtr_limit ( struct totemsrp_instance *instance, struct orf_token *token, unsigned int *transmits_allowed) { int check = QUEUE_RTR_ITEMS_SIZE_MAX; check -= (*transmits_allowed + instance->totem_config->window_size); assert (check >= 0); if (sq_lt_compare (instance->last_released + QUEUE_RTR_ITEMS_SIZE_MAX - *transmits_allowed - instance->totem_config->window_size, token->seq)) { *transmits_allowed = 0; } } static void fcc_token_update ( struct totemsrp_instance *instance, struct orf_token *token, unsigned int msgs_transmitted) { token->fcc += msgs_transmitted - instance->my_trc; token->backlog += instance->my_cbl - instance->my_pbl; instance->my_trc = msgs_transmitted; instance->my_pbl = instance->my_cbl; } /* * Sanity checkers */ static int check_orf_token_sanity( const struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { int rtr_entries; const struct orf_token *token = (const struct orf_token *)msg; size_t required_len; if (msg_len < sizeof(struct orf_token)) { log_printf (instance->totemsrp_log_level_security, "Received orf_token message is too short... ignoring."); return (-1); } if (endian_conversion_needed) { rtr_entries = swab32(token->rtr_list_entries); } else { rtr_entries = token->rtr_list_entries; } required_len = sizeof(struct orf_token) + rtr_entries * sizeof(struct rtr_item); if (msg_len < required_len) { log_printf (instance->totemsrp_log_level_security, "Received orf_token message is too short... ignoring."); return (-1); } return (0); } static int check_mcast_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { if (msg_len < sizeof(struct mcast)) { log_printf (instance->totemsrp_log_level_security, "Received mcast message is too short... ignoring."); return (-1); } return (0); } static int check_memb_merge_detect_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { if (msg_len < sizeof(struct memb_merge_detect)) { log_printf (instance->totemsrp_log_level_security, "Received memb_merge_detect message is too short... ignoring."); return (-1); } return (0); } static int check_memb_join_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct memb_join *mj_msg = (const struct memb_join *)msg; unsigned int proc_list_entries; unsigned int failed_list_entries; size_t required_len; if (msg_len < sizeof(struct memb_join)) { log_printf (instance->totemsrp_log_level_security, "Received memb_join message is too short... ignoring."); return (-1); } proc_list_entries = mj_msg->proc_list_entries; failed_list_entries = mj_msg->failed_list_entries; if (endian_conversion_needed) { proc_list_entries = swab32(proc_list_entries); failed_list_entries = swab32(failed_list_entries); } required_len = sizeof(struct memb_join) + ((proc_list_entries + failed_list_entries) * sizeof(struct srp_addr)); if (msg_len < required_len) { log_printf (instance->totemsrp_log_level_security, "Received memb_join message is too short... ignoring."); return (-1); } return (0); } static int check_memb_commit_token_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct memb_commit_token *mct_msg = (const struct memb_commit_token *)msg; unsigned int addr_entries; size_t required_len; if (msg_len < sizeof(struct memb_commit_token)) { log_printf (instance->totemsrp_log_level_security, "Received memb_commit_token message is too short... ignoring."); return (0); } addr_entries= mct_msg->addr_entries; if (endian_conversion_needed) { addr_entries = swab32(addr_entries); } required_len = sizeof(struct memb_commit_token) + (addr_entries * (sizeof(struct srp_addr) + sizeof(struct memb_commit_token_memb_entry))); if (msg_len < required_len) { log_printf (instance->totemsrp_log_level_security, "Received memb_commit_token message is too short... ignoring."); return (-1); } return (0); } static int check_token_hold_cancel_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { if (msg_len < sizeof(struct token_hold_cancel)) { log_printf (instance->totemsrp_log_level_security, "Received token_hold_cancel message is too short... ignoring."); return (-1); } return (0); } /* * Message Handlers */ unsigned long long int tv_old; /* * message handler called when TOKEN message type received */ static int message_handler_orf_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { char token_storage[1500]; char token_convert[1500]; struct orf_token *token = NULL; int forward_token; unsigned int transmits_allowed; unsigned int mcasted_retransmit; unsigned int mcasted_regular; unsigned int last_aru; #ifdef GIVEINFO unsigned long long tv_current; unsigned long long tv_diff; tv_current = qb_util_nano_current_get (); tv_diff = tv_current - tv_old; tv_old = tv_current; log_printf (instance->totemsrp_log_level_debug, "Time since last token %0.4f ms", ((float)tv_diff) / 1000000.0); #endif if (check_orf_token_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (instance->orf_token_discard) { return (0); } #ifdef TEST_DROP_ORF_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_ORF_TOKEN_PERCENTAGE) { return (0); } #endif if (endian_conversion_needed) { orf_token_endian_convert ((struct orf_token *)msg, (struct orf_token *)token_convert); msg = (struct orf_token *)token_convert; } /* * Make copy of token and retransmit list in case we have * to flush incoming messages from the kernel queue */ token = (struct orf_token *)token_storage; memcpy (token, msg, sizeof (struct orf_token)); memcpy (&token->rtr_list[0], (char *)msg + sizeof (struct orf_token), sizeof (struct rtr_item) * RETRANSMIT_ENTRIES_MAX); /* * Handle merge detection timeout */ if (token->seq == instance->my_last_seq) { start_merge_detect_timeout (instance); instance->my_seq_unchanged += 1; } else { cancel_merge_detect_timeout (instance); cancel_token_hold_retransmit_timeout (instance); instance->my_seq_unchanged = 0; } instance->my_last_seq = token->seq; #ifdef TEST_RECOVERY_MSG_COUNT if (instance->memb_state == MEMB_STATE_OPERATIONAL && token->seq > TEST_RECOVERY_MSG_COUNT) { return (0); } #endif instance->flushing = 1; totemnet_recv_flush (instance->totemnet_context); instance->flushing = 0; /* * Determine if we should hold (in reality drop) the token */ instance->my_token_held = 0; if (instance->my_ring_id.rep == instance->my_id.nodeid && instance->my_seq_unchanged > instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } else { if (instance->my_ring_id.rep != instance->my_id.nodeid && instance->my_seq_unchanged >= instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } } /* * Hold onto token when there is no activity on ring and * this processor is the ring rep */ forward_token = 1; if (instance->my_ring_id.rep == instance->my_id.nodeid) { if (instance->my_token_held) { forward_token = 0; } } switch (instance->memb_state) { case MEMB_STATE_COMMIT: /* Discard token */ break; case MEMB_STATE_OPERATIONAL: messages_free (instance, token->aru); /* * Do NOT add break, this case should also execute code in gather case. */ case MEMB_STATE_GATHER: /* * DO NOT add break, we use different free mechanism in recovery state */ case MEMB_STATE_RECOVERY: /* * Discard tokens from another configuration */ if (memcmp (&token->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); /* discard token */ } /* * Discard retransmitted tokens */ if (sq_lte_compare (token->token_seq, instance->my_token_seq)) { return (0); /* discard token */ } /* * Token is valid so trigger callbacks */ token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_RECEIVED); last_aru = instance->my_last_aru; instance->my_last_aru = token->aru; transmits_allowed = fcc_calculate (instance, token); mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed); if (instance->totem_config->cancel_token_hold_on_retransmit && instance->my_token_held == 1 && (token->rtr_list_entries > 0 || mcasted_retransmit > 0)) { instance->my_token_held = 0; forward_token = 1; } fcc_rtr_limit (instance, token, &transmits_allowed); mcasted_regular = orf_token_mcast (instance, token, transmits_allowed); /* if (mcasted_regular) { printf ("mcasted regular %d\n", mcasted_regular); printf ("token seq %d\n", token->seq); } */ fcc_token_update (instance, token, mcasted_retransmit + mcasted_regular); if (sq_lt_compare (instance->my_aru, token->aru) || instance->my_id.nodeid == token->aru_addr || token->aru_addr == 0) { token->aru = instance->my_aru; if (token->aru == token->seq) { token->aru_addr = 0; } else { token->aru_addr = instance->my_id.nodeid; } } if (token->aru == last_aru && token->aru_addr != 0) { instance->my_aru_count += 1; } else { instance->my_aru_count = 0; } /* * We really don't follow specification there. In specification, OTHER nodes * detect failure of one node (based on aru_count) and my_id IS NEVER added * to failed list (so node never mark itself as failed) */ if (instance->my_aru_count > instance->totem_config->fail_to_recv_const && token->aru_addr == instance->my_id.nodeid) { log_printf (instance->totemsrp_log_level_error, "FAILED TO RECEIVE"); instance->failed_to_recv = 1; memb_set_merge (&instance->my_id, 1, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FAILED_TO_RECEIVE); } else { instance->my_token_seq = token->token_seq; token->token_seq += 1; if (instance->memb_state == MEMB_STATE_RECOVERY) { /* * instance->my_aru == instance->my_high_seq_received means this processor * has recovered all messages it can recover * (ie: its retrans queue is empty) */ if (cs_queue_is_empty (&instance->retrans_message_queue) == 0) { if (token->retrans_flg == 0) { token->retrans_flg = 1; instance->my_set_retrans_flg = 1; } } else if (token->retrans_flg == 1 && instance->my_set_retrans_flg) { token->retrans_flg = 0; instance->my_set_retrans_flg = 0; } log_printf (instance->totemsrp_log_level_debug, "token retrans flag is %d my set retrans flag%d retrans queue empty %d count %d, aru %x", token->retrans_flg, instance->my_set_retrans_flg, cs_queue_is_empty (&instance->retrans_message_queue), instance->my_retrans_flg_count, token->aru); if (token->retrans_flg == 0) { instance->my_retrans_flg_count += 1; } else { instance->my_retrans_flg_count = 0; } if (instance->my_retrans_flg_count == 2) { instance->my_install_seq = token->seq; } log_printf (instance->totemsrp_log_level_debug, "install seq %x aru %x high seq received %x", instance->my_install_seq, instance->my_aru, instance->my_high_seq_received); if (instance->my_retrans_flg_count >= 2 && instance->my_received_flg == 0 && sq_lte_compare (instance->my_install_seq, instance->my_aru)) { instance->my_received_flg = 1; instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct totem_ip_address) * instance->my_trans_memb_entries); } if (instance->my_retrans_flg_count >= 3 && sq_lte_compare (instance->my_install_seq, token->aru)) { instance->my_rotation_counter += 1; } else { instance->my_rotation_counter = 0; } if (instance->my_rotation_counter == 2) { log_printf (instance->totemsrp_log_level_debug, "retrans flag count %x token aru %x install seq %x aru %x %x", instance->my_retrans_flg_count, token->aru, instance->my_install_seq, instance->my_aru, token->seq); memb_state_operational_enter (instance); instance->my_rotation_counter = 0; instance->my_retrans_flg_count = 0; } } totemnet_send_flush (instance->totemnet_context); token_send (instance, token, forward_token); #ifdef GIVEINFO tv_current = qb_util_nano_current_get (); tv_diff = tv_current - tv_old; tv_old = tv_current; log_printf (instance->totemsrp_log_level_debug, "I held %0.4f ms", ((float)tv_diff) / 1000000.0); #endif if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* * Deliver messages after token has been transmitted * to improve performance */ reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED if (instance->my_id.nodeid == instance->my_ring_id.rep && instance->my_token_held == 1) { start_token_hold_retransmit_timeout (instance); } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_SENT); } break; } if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); } static void messages_deliver_to_app ( struct totemsrp_instance *instance, int skip, unsigned int end_point) { struct sort_queue_item *sort_queue_item_p; unsigned int i; int res; struct mcast *mcast_in; struct mcast mcast_header; unsigned int range = 0; int endian_conversion_required; unsigned int my_high_delivered_stored = 0; struct srp_addr aligned_system_from; range = end_point - instance->my_high_delivered; if (range) { log_printf (instance->totemsrp_log_level_trace, "Delivering %x to %x", instance->my_high_delivered, end_point); } assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); my_high_delivered_stored = instance->my_high_delivered; /* * Deliver messages in order from rtr queue to pending delivery queue */ for (i = 1; i <= range; i++) { void *ptr = 0; /* * If out of range of sort queue, stop assembly */ res = sq_in_range (&instance->regular_sort_queue, my_high_delivered_stored + i); if (res == 0) { break; } res = sq_item_get (&instance->regular_sort_queue, my_high_delivered_stored + i, &ptr); /* * If hole, stop assembly */ if (res != 0 && skip == 0) { break; } instance->my_high_delivered = my_high_delivered_stored + i; if (res != 0) { continue; } sort_queue_item_p = ptr; mcast_in = sort_queue_item_p->mcast; assert (mcast_in != (struct mcast *)0xdeadbeef); endian_conversion_required = 0; if (mcast_in->header.magic != TOTEM_MH_MAGIC) { endian_conversion_required = 1; mcast_endian_convert (mcast_in, &mcast_header); } else { memcpy (&mcast_header, mcast_in, sizeof (struct mcast)); } aligned_system_from = mcast_header.system_from; /* * Skip messages not originated in instance->my_deliver_memb */ if (skip && memb_set_subset (&aligned_system_from, 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) == 0) { instance->my_high_delivered = my_high_delivered_stored + i; continue; } /* * Message found */ log_printf (instance->totemsrp_log_level_trace, "Delivering MCAST message with seq %x to pending delivery queue", mcast_header.seq); /* * Message is locally originated multicast */ instance->totemsrp_deliver_fn ( mcast_header.header.nodeid, ((char *)sort_queue_item_p->mcast) + sizeof (struct mcast), sort_queue_item_p->msg_len - sizeof (struct mcast), endian_conversion_required); } } /* * recv message handler called when MCAST message type received */ static int message_handler_mcast ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct sort_queue_item sort_queue_item; struct sq *sort_queue; struct mcast mcast_header; struct srp_addr aligned_system_from; if (check_mcast_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (endian_conversion_needed) { mcast_endian_convert (msg, &mcast_header); } else { memcpy (&mcast_header, msg, sizeof (struct mcast)); } if (mcast_header.header.encapsulated == MESSAGE_ENCAPSULATED) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } assert (msg_len <= FRAME_SIZE_MAX); #ifdef TEST_DROP_MCAST_PERCENTAGE if (random()%100 < TEST_DROP_MCAST_PERCENTAGE) { return (0); } #endif /* * If the message is foreign execute the switch below */ if (memcmp (&instance->my_ring_id, &mcast_header.ring_id, sizeof (struct memb_ring_id)) != 0) { aligned_system_from = mcast_header.system_from; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge ( &aligned_system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &aligned_system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&aligned_system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE); return (0); } break; case MEMB_STATE_COMMIT: /* discard message */ instance->stats.rx_msg_dropped++; break; case MEMB_STATE_RECOVERY: /* discard message */ instance->stats.rx_msg_dropped++; break; } return (0); } log_printf (instance->totemsrp_log_level_trace, "Received ringid (" CS_PRI_RING_ID ") seq %x", mcast_header.ring_id.rep, (uint64_t)mcast_header.ring_id.seq, mcast_header.seq); /* * Add mcast message to rtr queue if not already in rtr queue * otherwise free io vectors */ if (msg_len > 0 && msg_len <= FRAME_SIZE_MAX && sq_in_range (sort_queue, mcast_header.seq) && sq_item_inuse (sort_queue, mcast_header.seq) == 0) { /* * Allocate new multicast memory block */ // TODO LEAK sort_queue_item.mcast = totemsrp_buffer_alloc (instance); if (sort_queue_item.mcast == NULL) { return (-1); /* error here is corrected by the algorithm */ } memcpy (sort_queue_item.mcast, msg, msg_len); sort_queue_item.msg_len = msg_len; if (sq_lt_compare (instance->my_high_seq_received, mcast_header.seq)) { instance->my_high_seq_received = mcast_header.seq; } sq_item_add (sort_queue, &sort_queue_item, mcast_header.seq); } update_aru (instance); if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* TODO remove from retrans message queue for old ring in recovery state */ return (0); } static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct memb_merge_detect memb_merge_detect; struct srp_addr aligned_system_from; if (check_memb_merge_detect_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (endian_conversion_needed) { memb_merge_detect_endian_convert (msg, &memb_merge_detect); } else { memcpy (&memb_merge_detect, msg, sizeof (struct memb_merge_detect)); } /* * do nothing if this is a merge detect from this configuration */ if (memcmp (&instance->my_ring_id, &memb_merge_detect.ring_id, sizeof (struct memb_ring_id)) == 0) { return (0); } aligned_system_from = memb_merge_detect.system_from; /* * Execute merge operation */ switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge (&aligned_system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &aligned_system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&aligned_system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE); return (0); } break; case MEMB_STATE_COMMIT: /* do nothing in commit */ break; case MEMB_STATE_RECOVERY: /* do nothing in recovery */ break; } return (0); } static void memb_join_process ( struct totemsrp_instance *instance, const struct memb_join *memb_join) { struct srp_addr *proc_list; struct srp_addr *failed_list; int gather_entered = 0; int fail_minus_memb_entries = 0; struct srp_addr fail_minus_memb[PROCESSOR_COUNT_MAX]; struct srp_addr aligned_system_from; proc_list = (struct srp_addr *)memb_join->end_of_memb_join; failed_list = proc_list + memb_join->proc_list_entries; aligned_system_from = memb_join->system_from; log_printf(instance->totemsrp_log_level_trace, "memb_join_process"); memb_set_log(instance, instance->totemsrp_log_level_trace, "proclist", proc_list, memb_join->proc_list_entries); memb_set_log(instance, instance->totemsrp_log_level_trace, "faillist", failed_list, memb_join->failed_list_entries); memb_set_log(instance, instance->totemsrp_log_level_trace, "my_proclist", instance->my_proc_list, instance->my_proc_list_entries); memb_set_log(instance, instance->totemsrp_log_level_trace, "my_faillist", instance->my_failed_list, instance->my_failed_list_entries); if (memb_join->header.type == MESSAGE_TYPE_MEMB_JOIN) { if (instance->flushing) { if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) { log_printf (instance->totemsrp_log_level_warning, "Discarding LEAVE message during flush, nodeid=" CS_PRI_NODE_ID, memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].nodeid : LEAVE_DUMMY_NODEID); if (memb_join->failed_list_entries > 0) { my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].nodeid); } } else { log_printf (instance->totemsrp_log_level_warning, "Discarding JOIN message during flush, nodeid=" CS_PRI_NODE_ID, memb_join->header.nodeid); } return; } else { if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) { log_printf (instance->totemsrp_log_level_debug, "Received LEAVE message from " CS_PRI_NODE_ID, memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].nodeid : LEAVE_DUMMY_NODEID); if (memb_join->failed_list_entries > 0) { my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].nodeid); } } } } if (memb_set_equal (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_equal (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { if (memb_join->header.nodeid != LEAVE_DUMMY_NODEID) { memb_consensus_set (instance, &aligned_system_from); } if (memb_consensus_agreed (instance) && instance->failed_to_recv == 1) { instance->failed_to_recv = 0; instance->my_proc_list[0] = instance->my_id; instance->my_proc_list_entries = 1; instance->my_failed_list_entries = 0; memb_state_commit_token_create (instance); memb_state_commit_enter (instance); return; } if (memb_consensus_agreed (instance) && memb_lowest_in_config (instance)) { memb_state_commit_token_create (instance); memb_state_commit_enter (instance); } else { goto out; } } else if (memb_set_subset (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_subset (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { goto out; } else if (memb_set_subset (&aligned_system_from, 1, instance->my_failed_list, instance->my_failed_list_entries)) { goto out; } else { memb_set_merge (proc_list, memb_join->proc_list_entries, instance->my_proc_list, &instance->my_proc_list_entries); if (memb_set_subset ( &instance->my_id, 1, failed_list, memb_join->failed_list_entries)) { memb_set_merge ( &aligned_system_from, 1, instance->my_failed_list, &instance->my_failed_list_entries); } else { if (memb_set_subset ( &aligned_system_from, 1, instance->my_memb_list, instance->my_memb_entries)) { if (memb_set_subset ( &aligned_system_from, 1, instance->my_failed_list, instance->my_failed_list_entries) == 0) { memb_set_merge (failed_list, memb_join->failed_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); } else { memb_set_subtract (fail_minus_memb, &fail_minus_memb_entries, failed_list, memb_join->failed_list_entries, instance->my_memb_list, instance->my_memb_entries); memb_set_merge (fail_minus_memb, fail_minus_memb_entries, instance->my_failed_list, &instance->my_failed_list_entries); } } } memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_JOIN); gather_entered = 1; } out: if (gather_entered == 0 && instance->memb_state == MEMB_STATE_OPERATIONAL) { memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE); } } static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out) { int i; struct srp_addr *in_proc_list; struct srp_addr *in_failed_list; struct srp_addr *out_proc_list; struct srp_addr *out_failed_list; out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->system_from = srp_addr_endian_convert(in->system_from); out->proc_list_entries = swab32 (in->proc_list_entries); out->failed_list_entries = swab32 (in->failed_list_entries); out->ring_seq = swab64 (in->ring_seq); in_proc_list = (struct srp_addr *)in->end_of_memb_join; in_failed_list = in_proc_list + out->proc_list_entries; out_proc_list = (struct srp_addr *)out->end_of_memb_join; out_failed_list = out_proc_list + out->proc_list_entries; for (i = 0; i < out->proc_list_entries; i++) { out_proc_list[i] = srp_addr_endian_convert (in_proc_list[i]); } for (i = 0; i < out->failed_list_entries; i++) { out_failed_list[i] = srp_addr_endian_convert (in_failed_list[i]); } } static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out) { int i; struct srp_addr *in_addr = (struct srp_addr *)in->end_of_commit_token; struct srp_addr *out_addr = (struct srp_addr *)out->end_of_commit_token; struct memb_commit_token_memb_entry *in_memb_list; struct memb_commit_token_memb_entry *out_memb_list; out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->token_seq = swab32 (in->token_seq); out->ring_id.rep = swab32(in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->retrans_flg = swab32 (in->retrans_flg); out->memb_index = swab32 (in->memb_index); out->addr_entries = swab32 (in->addr_entries); in_memb_list = (struct memb_commit_token_memb_entry *)(in_addr + out->addr_entries); out_memb_list = (struct memb_commit_token_memb_entry *)(out_addr + out->addr_entries); for (i = 0; i < out->addr_entries; i++) { out_addr[i] = srp_addr_endian_convert (in_addr[i]); /* * Only convert the memb entry if it has been set */ if (in_memb_list[i].ring_id.rep != 0) { out_memb_list[i].ring_id.rep = swab32(in_memb_list[i].ring_id.rep); out_memb_list[i].ring_id.seq = swab64 (in_memb_list[i].ring_id.seq); out_memb_list[i].aru = swab32 (in_memb_list[i].aru); out_memb_list[i].high_delivered = swab32 (in_memb_list[i].high_delivered); out_memb_list[i].received_flg = swab32 (in_memb_list[i].received_flg); } } } static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out) { int i; out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->seq = swab32 (in->seq); out->token_seq = swab32 (in->token_seq); out->aru = swab32 (in->aru); out->ring_id.rep = swab32(in->ring_id.rep); out->aru_addr = swab32(in->aru_addr); out->ring_id.seq = swab64 (in->ring_id.seq); out->fcc = swab32 (in->fcc); out->backlog = swab32 (in->backlog); out->retrans_flg = swab32 (in->retrans_flg); out->rtr_list_entries = swab32 (in->rtr_list_entries); for (i = 0; i < out->rtr_list_entries; i++) { out->rtr_list[i].ring_id.rep = swab32(in->rtr_list[i].ring_id.rep); out->rtr_list[i].ring_id.seq = swab64 (in->rtr_list[i].ring_id.seq); out->rtr_list[i].seq = swab32 (in->rtr_list[i].seq); } } static void mcast_endian_convert (const struct mcast *in, struct mcast *out) { out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->header.encapsulated = in->header.encapsulated; out->seq = swab32 (in->seq); out->this_seqno = swab32 (in->this_seqno); out->ring_id.rep = swab32(in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->node_id = swab32 (in->node_id); out->guarantee = swab32 (in->guarantee); out->system_from = srp_addr_endian_convert(in->system_from); } static void memb_merge_detect_endian_convert ( const struct memb_merge_detect *in, struct memb_merge_detect *out) { out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->ring_id.rep = swab32(in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->system_from = srp_addr_endian_convert (in->system_from); } static int ignore_join_under_operational ( struct totemsrp_instance *instance, const struct memb_join *memb_join) { struct srp_addr *proc_list; struct srp_addr *failed_list; unsigned long long ring_seq; struct srp_addr aligned_system_from; proc_list = (struct srp_addr *)memb_join->end_of_memb_join; failed_list = proc_list + memb_join->proc_list_entries; ring_seq = memb_join->ring_seq; aligned_system_from = memb_join->system_from; if (memb_set_subset (&instance->my_id, 1, failed_list, memb_join->failed_list_entries)) { return (1); } /* * In operational state, my_proc_list is exactly the same as * my_memb_list. */ if ((memb_set_subset (&aligned_system_from, 1, instance->my_memb_list, instance->my_memb_entries)) && (ring_seq < instance->my_ring_id.seq)) { return (1); } return (0); } static int message_handler_memb_join ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct memb_join *memb_join; struct memb_join *memb_join_convert = alloca (msg_len); struct srp_addr aligned_system_from; if (check_memb_join_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (endian_conversion_needed) { memb_join = memb_join_convert; memb_join_endian_convert (msg, memb_join_convert); } else { memb_join = msg; } aligned_system_from = memb_join->system_from; /* * If the process paused because it wasn't scheduled in a timely * fashion, flush the join messages because they may be queued * entries */ if (pause_flush (instance)) { return (0); } if (instance->token_ring_id_seq < memb_join->ring_seq) { instance->token_ring_id_seq = memb_join->ring_seq; } switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: if (!ignore_join_under_operational (instance, memb_join)) { memb_join_process (instance, memb_join); } break; case MEMB_STATE_GATHER: memb_join_process (instance, memb_join); break; case MEMB_STATE_COMMIT: if (memb_set_subset (&aligned_system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE); } break; case MEMB_STATE_RECOVERY: if (memb_set_subset (&aligned_system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_recovery_state_token_loss (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY); } break; } return (0); } static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct memb_commit_token *memb_commit_token_convert = alloca (msg_len); struct memb_commit_token *memb_commit_token; struct srp_addr sub[PROCESSOR_COUNT_MAX]; int sub_entries; struct srp_addr *addr; log_printf (instance->totemsrp_log_level_debug, "got commit token"); if (check_memb_commit_token_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (endian_conversion_needed) { memb_commit_token_endian_convert (msg, memb_commit_token_convert); } else { memcpy (memb_commit_token_convert, msg, msg_len); } memb_commit_token = memb_commit_token_convert; addr = (struct srp_addr *)memb_commit_token->end_of_commit_token; #ifdef TEST_DROP_COMMIT_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_COMMIT_TOKEN_PERCENTAGE) { return (0); } #endif switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: /* discard token */ break; case MEMB_STATE_GATHER: memb_set_subtract (sub, &sub_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); if (memb_set_equal (addr, memb_commit_token->addr_entries, sub, sub_entries) && memb_commit_token->ring_id.seq > instance->my_ring_id.seq) { memcpy (instance->commit_token, memb_commit_token, msg_len); memb_state_commit_enter (instance); } break; case MEMB_STATE_COMMIT: /* * If retransmitted commit tokens are sent on this ring * filter them out and only enter recovery once the * commit token has traversed the array. This is * determined by : * memb_commit_token->memb_index == memb_commit_token->addr_entries) { */ if (memb_commit_token->ring_id.seq == instance->my_ring_id.seq && memb_commit_token->memb_index == memb_commit_token->addr_entries) { memb_state_recovery_enter (instance, memb_commit_token); } break; case MEMB_STATE_RECOVERY: if (instance->my_id.nodeid == instance->my_ring_id.rep) { /* Filter out duplicated tokens */ if (instance->originated_orf_token) { break; } instance->originated_orf_token = 1; log_printf (instance->totemsrp_log_level_debug, "Sending initial ORF token"); // TODO convert instead of initiate orf_token_send_initial (instance); reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED } break; } return (0); } static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct token_hold_cancel *token_hold_cancel = msg; if (check_token_hold_cancel_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (memcmp (&token_hold_cancel->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) == 0) { instance->my_seq_unchanged = 0; if (instance->my_ring_id.rep == instance->my_id.nodeid) { timer_function_token_retransmit_timeout (instance); } } return (0); } static int check_message_header_validity( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from) { struct totemsrp_instance *instance = context; const struct totem_message_header *message_header = msg; const char *guessed_str; const char *msg_byte = msg; if (msg_len < sizeof (struct totem_message_header)) { log_printf (instance->totemsrp_log_level_security, "Message received from %s is too short... Ignoring %u.", totemip_sa_print((struct sockaddr *)system_from), (unsigned int)msg_len); return (-1); } if (message_header->magic != TOTEM_MH_MAGIC && message_header->magic != swab16(TOTEM_MH_MAGIC)) { /* * We've received ether Knet, old version of Corosync, * or something else. Do some guessing to display (hopefully) * helpful message */ guessed_str = NULL; if (message_header->magic == 0xFFFF) { /* * Corosync 2.2 used header with two UINT8_MAX */ guessed_str = "Corosync 2.2"; } else if (message_header->magic == 0xFEFE) { /* * Corosync 2.3+ used header with two UINT8_MAX - 1 */ guessed_str = "Corosync 2.3+"; } else if (msg_byte[0] == 0x01) { /* * Knet has stable1 with first byte of message == 1 */ guessed_str = "unencrypted Kronosnet"; } else if (msg_byte[0] >= 0 && msg_byte[0] <= 5) { /* * Unencrypted Corosync 1.x/OpenAIS has first byte * 0-5. Collision with Knet (but still worth the try) */ guessed_str = "unencrypted Corosync 2.0/2.1/1.x/OpenAIS"; } else { /* * Encrypted Kronosned packet has a hash at the end of * the packet and nothing specific at the beginning of the * packet (just encrypted data). * Encrypted Corosync 1.x/OpenAIS is quite similar but hash_digest * is in the beginning of the packet. * * So it's not possible to reliably detect ether of them. */ guessed_str = "encrypted Kronosnet/Corosync 2.0/2.1/1.x/OpenAIS or unknown"; } log_printf(instance->totemsrp_log_level_security, "Message received from %s has bad magic number (probably sent by %s).. Ignoring", totemip_sa_print((struct sockaddr *)system_from), guessed_str); return (-1); } if (message_header->version != TOTEM_MH_VERSION) { log_printf(instance->totemsrp_log_level_security, "Message received from %s has unsupported version %u... Ignoring", totemip_sa_print((struct sockaddr *)system_from), message_header->version); return (-1); } return (0); } -void main_deliver_fn ( +int main_deliver_fn ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from) { struct totemsrp_instance *instance = context; const struct totem_message_header *message_header = msg; if (check_message_header_validity(context, msg, msg_len, system_from) == -1) { - return ; + return -1; } switch (message_header->type) { case MESSAGE_TYPE_ORF_TOKEN: instance->stats.orf_token_rx++; break; case MESSAGE_TYPE_MCAST: instance->stats.mcast_rx++; break; case MESSAGE_TYPE_MEMB_MERGE_DETECT: instance->stats.memb_merge_detect_rx++; break; case MESSAGE_TYPE_MEMB_JOIN: instance->stats.memb_join_rx++; break; case MESSAGE_TYPE_MEMB_COMMIT_TOKEN: instance->stats.memb_commit_token_rx++; break; case MESSAGE_TYPE_TOKEN_HOLD_CANCEL: instance->stats.token_hold_cancel_rx++; break; default: log_printf (instance->totemsrp_log_level_security, "Message received from %s has wrong type... ignoring %d.\n", totemip_sa_print((struct sockaddr *)system_from), (int)message_header->type); instance->stats.rx_msg_dropped++; - return; + return 0; } /* * Handle incoming message */ - totemsrp_message_handlers.handler_functions[(int)message_header->type] ( + return totemsrp_message_handlers.handler_functions[(int)message_header->type] ( instance, msg, msg_len, message_header->magic != TOTEM_MH_MAGIC); } int totemsrp_iface_set ( void *context, const struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no) { struct totemsrp_instance *instance = context; int res; totemip_copy(&instance->my_addrs[iface_no], interface_addr); res = totemnet_iface_set ( instance->totemnet_context, interface_addr, ip_port, iface_no); return (res); } /* Contrary to its name, this only gets called when the interface is enabled */ -void main_iface_change_fn ( +int main_iface_change_fn ( void *context, const struct totem_ip_address *iface_addr, unsigned int iface_no) { struct totemsrp_instance *instance = context; int num_interfaces; int i; + int res = 0; if (!instance->my_id.nodeid) { instance->my_id.nodeid = iface_addr->nodeid; } totemip_copy (&instance->my_addrs[iface_no], iface_addr); if (instance->iface_changes++ == 0) { instance->memb_ring_id_create_or_load (&instance->my_ring_id, instance->my_id.nodeid); /* * Increase the ring_id sequence number. This doesn't follow specification. * Solves problem with restarted leader node (node with lowest nodeid) before * rest of the cluster forms new membership and guarantees unique ring_id for * new singleton configuration. */ instance->my_ring_id.seq++; instance->token_ring_id_seq = instance->my_ring_id.seq; log_printf ( instance->totemsrp_log_level_debug, "Created or loaded sequence id " CS_PRI_RING_ID " for this ring.", instance->my_ring_id.rep, (uint64_t)instance->my_ring_id.seq); if (instance->totemsrp_service_ready_fn) { instance->totemsrp_service_ready_fn (); } } num_interfaces = 0; for (i = 0; i < INTERFACE_MAX; i++) { if (instance->totem_config->interfaces[i].configured) { num_interfaces++; } } if (instance->iface_changes >= num_interfaces) { /* We need to clear orig_interfaces so that 'commit' diffs against nothing */ instance->totem_config->orig_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); assert(instance->totem_config->orig_interfaces != NULL); memset(instance->totem_config->orig_interfaces, 0, sizeof (struct totem_interface) * INTERFACE_MAX); - totemconfig_commit_new_params(instance->totem_config, icmap_get_global_map()); + res = totemconfig_commit_new_params(instance->totem_config, icmap_get_global_map()); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_INTERFACE_CHANGE); free(instance->totem_config->orig_interfaces); } + return res; } void totemsrp_net_mtu_adjust (struct totem_config *totem_config) { totem_config->net_mtu -= 2 * sizeof (struct mcast); } void totemsrp_service_ready_register ( void *context, void (*totem_service_ready) (void)) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->totemsrp_service_ready_fn = totem_service_ready; } int totemsrp_member_add ( void *context, const struct totem_ip_address *member, int iface_no) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_member_add (instance->totemnet_context, &instance->my_addrs[iface_no], member, iface_no); return (res); } int totemsrp_member_remove ( void *context, const struct totem_ip_address *member, int iface_no) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_member_remove (instance->totemnet_context, member, iface_no); return (res); } void totemsrp_threaded_mode_enable (void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->threaded_mode_enabled = 1; } void totemsrp_trans_ack (void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->waiting_trans_ack = 0; instance->totemsrp_waiting_trans_ack_cb_fn (0); } int totemsrp_reconfigure (void *context, struct totem_config *totem_config) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_reconfigure (instance->totemnet_context, totem_config); return (res); } int totemsrp_crypto_reconfigure_phase (void *context, struct totem_config *totem_config, cfg_message_crypto_reconfig_phase_t phase) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_crypto_reconfigure_phase (instance->totemnet_context, totem_config, phase); return (res); } void totemsrp_stats_clear (void *context, int flags) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; memset(&instance->stats, 0, sizeof(totemsrp_stats_t)); if (flags & TOTEMPG_STATS_CLEAR_TRANSPORT) { totemnet_stats_clear (instance->totemnet_context); } } void totemsrp_force_gather (void *context) { timer_function_orf_token_timeout(context); } diff --git a/exec/totemudp.c b/exec/totemudp.c index fd3215b5..0ebe127a 100644 --- a/exec/totemudp.c +++ b/exec/totemudp.c @@ -1,1549 +1,1549 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemudp.h" #include "util.h" #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif #define MCAST_SOCKET_BUFFER_SIZE (TRANSMITS_ALLOWED * FRAME_SIZE_MAX) #define NETIF_STATE_REPORT_UP 1 #define NETIF_STATE_REPORT_DOWN 2 #define BIND_STATE_UNBOUND 0 #define BIND_STATE_REGULAR 1 #define BIND_STATE_LOOPBACK 2 struct totemudp_member { struct qb_list_head list; struct totem_ip_address member; }; struct totemudp_socket { int mcast_recv; int mcast_send; int token; /* * Socket used for local multicast delivery. We don't rely on multicast * loop and rather this UNIX DGRAM socket is used. Socket is created by * socketpair call and they are used in same way as pipe (so [0] is read * end and [1] is write end) */ int local_mcast_loop[2]; }; struct totemudp_instance { qb_loop_t *totemudp_poll_handle; struct totem_interface *totem_interface; int netif_state_report; int netif_bind_state; void *context; - void (*totemudp_deliver_fn) ( + int (*totemudp_deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from); - void (*totemudp_iface_change_fn) ( + int (*totemudp_iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no); void (*totemudp_target_set_completed) (void *context); /* * Function and data used to log messages */ int totemudp_log_level_security; int totemudp_log_level_error; int totemudp_log_level_warning; int totemudp_log_level_notice; int totemudp_log_level_debug; int totemudp_subsys_id; void (*totemudp_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7))); void *udp_context; struct qb_list_head member_list; char iov_buffer[UDP_RECEIVE_FRAME_SIZE_MAX]; char iov_buffer_flush[UDP_RECEIVE_FRAME_SIZE_MAX]; struct iovec totemudp_iov_recv; struct iovec totemudp_iov_recv_flush; struct totemudp_socket totemudp_sockets; struct totem_ip_address mcast_address; int stats_sent; int stats_recv; int stats_delv; int stats_remcasts; int stats_orf_token; struct timeval stats_tv_start; struct totem_ip_address my_id; int firstrun; qb_loop_timer_handle timer_netif_check_timeout; unsigned int my_memb_entries; int flushing; struct totem_config *totem_config; totemsrp_stats_t *stats; struct totem_ip_address token_target; }; struct work_item { const void *msg; unsigned int msg_len; struct totemudp_instance *instance; }; static int totemudp_build_sockets ( struct totemudp_instance *instance, struct totem_ip_address *bindnet_address, struct totem_ip_address *mcastaddress, struct totemudp_socket *sockets, struct totem_ip_address *bound_to); static struct totem_ip_address localhost; static void totemudp_instance_initialize (struct totemudp_instance *instance) { memset (instance, 0, sizeof (struct totemudp_instance)); instance->netif_state_report = NETIF_STATE_REPORT_UP | NETIF_STATE_REPORT_DOWN; instance->totemudp_iov_recv.iov_base = instance->iov_buffer; instance->totemudp_iov_recv.iov_len = UDP_RECEIVE_FRAME_SIZE_MAX; //sizeof (instance->iov_buffer); instance->totemudp_iov_recv_flush.iov_base = instance->iov_buffer_flush; instance->totemudp_iov_recv_flush.iov_len = UDP_RECEIVE_FRAME_SIZE_MAX; //sizeof (instance->iov_buffer); /* * There is always atleast 1 processor */ instance->my_memb_entries = 1; qb_list_init (&instance->member_list); } #define log_printf(level, format, args...) \ do { \ instance->totemudp_log_printf ( \ level, instance->totemudp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ (const char *)format, ##args); \ } while (0); #define LOGSYS_PERROR(err_num, level, fmt, args...) \ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemudp_log_printf ( \ level, instance->totemudp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)\n", ##args, _error_ptr, err_num); \ } while(0) int totemudp_crypto_set ( void *udp_context, const char *cipher_type, const char *hash_type) { return (0); } static inline void ucast_sendmsg ( struct totemudp_instance *instance, struct totem_ip_address *system_to, const void *msg, unsigned int msg_len) { struct msghdr msg_ucast; int res = 0; struct sockaddr_storage sockaddr; struct iovec iovec; int addrlen; iovec.iov_base = (void*)msg; iovec.iov_len = msg_len; /* * Build unicast message */ memset(&msg_ucast, 0, sizeof(msg_ucast)); totemip_totemip_to_sockaddr_convert(system_to, instance->totem_interface->ip_port, &sockaddr, &addrlen); msg_ucast.msg_name = &sockaddr; msg_ucast.msg_namelen = addrlen; msg_ucast.msg_iov = (void *)&iovec; msg_ucast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_ucast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_ucast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_ucast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_ucast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_ucast.msg_accrightslen = 0; #endif /* * Transmit unicast message * An error here is recovered by totemsrp */ res = sendmsg (instance->totemudp_sockets.mcast_send, &msg_ucast, MSG_NOSIGNAL); if (res < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_debug, "sendmsg(ucast) failed (non-critical)"); } } static inline void mcast_sendmsg ( struct totemudp_instance *instance, const void *msg, unsigned int msg_len) { struct msghdr msg_mcast; int res = 0; struct iovec iovec; struct sockaddr_storage sockaddr; int addrlen; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; /* * Build multicast message */ totemip_totemip_to_sockaddr_convert(&instance->mcast_address, instance->totem_interface->ip_port, &sockaddr, &addrlen); memset(&msg_mcast, 0, sizeof(msg_mcast)); msg_mcast.msg_name = &sockaddr; msg_mcast.msg_namelen = addrlen; msg_mcast.msg_iov = (void *)&iovec; msg_mcast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_mcast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_mcast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_mcast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_mcast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_mcast.msg_accrightslen = 0; #endif /* * Transmit multicast message * An error here is recovered by totemsrp */ res = sendmsg (instance->totemudp_sockets.mcast_send, &msg_mcast, MSG_NOSIGNAL); if (res < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_debug, "sendmsg(mcast) failed (non-critical)"); instance->stats->continuous_sendmsg_failures++; } else { instance->stats->continuous_sendmsg_failures = 0; } /* * Transmit multicast message to local unix mcast loop * An error here is recovered by totemsrp */ msg_mcast.msg_name = NULL; msg_mcast.msg_namelen = 0; res = sendmsg (instance->totemudp_sockets.local_mcast_loop[1], &msg_mcast, MSG_NOSIGNAL); if (res < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_debug, "sendmsg(local mcast loop) failed (non-critical)"); } } int totemudp_finalize ( void *udp_context) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; int res = 0; if (instance->totemudp_sockets.mcast_recv > 0) { qb_loop_poll_del (instance->totemudp_poll_handle, instance->totemudp_sockets.mcast_recv); close (instance->totemudp_sockets.mcast_recv); } if (instance->totemudp_sockets.mcast_send > 0) { close (instance->totemudp_sockets.mcast_send); } if (instance->totemudp_sockets.local_mcast_loop[0] > 0) { qb_loop_poll_del (instance->totemudp_poll_handle, instance->totemudp_sockets.local_mcast_loop[0]); close (instance->totemudp_sockets.local_mcast_loop[0]); close (instance->totemudp_sockets.local_mcast_loop[1]); } if (instance->totemudp_sockets.token > 0) { qb_loop_poll_del (instance->totemudp_poll_handle, instance->totemudp_sockets.token); close (instance->totemudp_sockets.token); } return (res); } /* * Only designed to work with a message with one iov */ static int net_deliver_fn ( int fd, int revents, void *data) { struct totemudp_instance *instance = (struct totemudp_instance *)data; struct msghdr msg_recv; struct iovec *iovec; struct sockaddr_storage system_from; int bytes_received; int truncated_packet; if (instance->flushing == 1) { iovec = &instance->totemudp_iov_recv_flush; } else { iovec = &instance->totemudp_iov_recv; } /* * Receive datagram */ msg_recv.msg_name = &system_from; msg_recv.msg_namelen = sizeof (struct sockaddr_storage); msg_recv.msg_iov = iovec; msg_recv.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_recv.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_recv.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_recv.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_recv.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_recv.msg_accrightslen = 0; #endif bytes_received = recvmsg (fd, &msg_recv, MSG_NOSIGNAL | MSG_DONTWAIT); if (bytes_received == -1) { return (0); } else { instance->stats_recv += bytes_received; } truncated_packet = 0; #ifdef HAVE_MSGHDR_FLAGS if (msg_recv.msg_flags & MSG_TRUNC) { truncated_packet = 1; } #else /* * We don't have MSGHDR_FLAGS, but we can (hopefully) safely make assumption that * if bytes_received == UDP_RECIEVE_FRAME_SIZE_MAX then packet is truncated */ if (bytes_received == UDP_RECEIVE_FRAME_SIZE_MAX) { truncated_packet = 1; } #endif if (truncated_packet) { log_printf (instance->totemudp_log_level_error, "Received too big message. This may be because something bad is happening" "on the network (attack?), or you tried join more nodes than corosync is" "compiled with (%u) or bug in the code (bad estimation of " "the UDP_RECEIVE_FRAME_SIZE_MAX). Dropping packet.", PROCESSOR_COUNT_MAX); return (0); } iovec->iov_len = bytes_received; /* * Handle incoming message */ instance->totemudp_deliver_fn ( instance->context, iovec->iov_base, iovec->iov_len, &system_from); iovec->iov_len = UDP_RECEIVE_FRAME_SIZE_MAX; return (0); } static int netif_determine ( struct totemudp_instance *instance, struct totem_ip_address *bindnet, struct totem_ip_address *bound_to, int *interface_up, int *interface_num) { int res; res = totemip_iface_check (bindnet, bound_to, interface_up, interface_num, instance->totem_config->clear_node_high_bit); return (res); } /* * If the interface is up, the sockets for totem are built. If the interface is down * this function is requeued in the timer list to retry building the sockets later. */ static void timer_function_netif_check_timeout ( void *data) { struct totemudp_instance *instance = (struct totemudp_instance *)data; int interface_up; int interface_num; struct totem_ip_address *bind_address; /* * Build sockets for every interface */ netif_determine (instance, &instance->totem_interface->bindnet, &instance->totem_interface->boundto, &interface_up, &interface_num); /* * If the network interface isn't back up and we are already * in loopback mode, add timer to check again and return */ if ((instance->netif_bind_state == BIND_STATE_LOOPBACK && interface_up == 0) || (instance->my_memb_entries == 1 && instance->netif_bind_state == BIND_STATE_REGULAR && interface_up == 1)) { qb_loop_timer_add (instance->totemudp_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); /* * Add a timer to check for a downed regular interface */ return; } if (instance->totemudp_sockets.mcast_recv > 0) { qb_loop_poll_del (instance->totemudp_poll_handle, instance->totemudp_sockets.mcast_recv); close (instance->totemudp_sockets.mcast_recv); } if (instance->totemudp_sockets.mcast_send > 0) { close (instance->totemudp_sockets.mcast_send); } if (instance->totemudp_sockets.local_mcast_loop[0] > 0) { qb_loop_poll_del (instance->totemudp_poll_handle, instance->totemudp_sockets.local_mcast_loop[0]); close (instance->totemudp_sockets.local_mcast_loop[0]); close (instance->totemudp_sockets.local_mcast_loop[1]); } if (instance->totemudp_sockets.token > 0) { qb_loop_poll_del (instance->totemudp_poll_handle, instance->totemudp_sockets.token); close (instance->totemudp_sockets.token); } if (interface_up == 0) { /* * Interface is not up */ instance->netif_bind_state = BIND_STATE_LOOPBACK; bind_address = &localhost; /* * Add a timer to retry building interfaces and request memb_gather_enter */ qb_loop_timer_add (instance->totemudp_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } else { /* * Interface is up */ instance->netif_bind_state = BIND_STATE_REGULAR; bind_address = &instance->totem_interface->bindnet; } /* * Create and bind the multicast and unicast sockets */ (void)totemudp_build_sockets (instance, &instance->mcast_address, bind_address, &instance->totemudp_sockets, &instance->totem_interface->boundto); qb_loop_poll_add ( instance->totemudp_poll_handle, QB_LOOP_MED, instance->totemudp_sockets.mcast_recv, POLLIN, instance, net_deliver_fn); qb_loop_poll_add ( instance->totemudp_poll_handle, QB_LOOP_MED, instance->totemudp_sockets.local_mcast_loop[0], POLLIN, instance, net_deliver_fn); qb_loop_poll_add ( instance->totemudp_poll_handle, QB_LOOP_MED, instance->totemudp_sockets.token, POLLIN, instance, net_deliver_fn); totemip_copy (&instance->my_id, &instance->totem_interface->boundto); /* * This reports changes in the interface to the user and totemsrp */ if (instance->netif_bind_state == BIND_STATE_REGULAR) { if (instance->netif_state_report & NETIF_STATE_REPORT_UP) { log_printf (instance->totemudp_log_level_notice, "The network interface [%s] is now up.", totemip_print (&instance->totem_interface->boundto)); instance->netif_state_report = NETIF_STATE_REPORT_DOWN; instance->totemudp_iface_change_fn (instance->context, &instance->my_id, 0); } /* * Add a timer to check for interface going down in single membership */ if (instance->my_memb_entries == 1) { qb_loop_timer_add (instance->totemudp_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } } else { if (instance->netif_state_report & NETIF_STATE_REPORT_DOWN) { log_printf (instance->totemudp_log_level_notice, "The network interface is down."); instance->totemudp_iface_change_fn (instance->context, &instance->my_id, 0); } instance->netif_state_report = NETIF_STATE_REPORT_UP; } } /* Set the socket priority to INTERACTIVE to ensure that our messages don't get queued behind anything else */ static void totemudp_traffic_control_set(struct totemudp_instance *instance, int sock) { #ifdef SO_PRIORITY int prio = 6; /* TC_PRIO_INTERACTIVE */ if (setsockopt(sock, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(int))) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Could not set traffic priority"); } #endif } static int totemudp_build_sockets_ip ( struct totemudp_instance *instance, struct totem_ip_address *mcast_address, struct totem_ip_address *bindnet_address, struct totemudp_socket *sockets, struct totem_ip_address *bound_to, int interface_num) { struct sockaddr_storage sockaddr; struct ipv6_mreq mreq6; struct ip_mreq mreq; struct sockaddr_storage mcast_ss, boundto_ss; struct sockaddr_in6 *mcast_sin6 = (struct sockaddr_in6 *)&mcast_ss; struct sockaddr_in *mcast_sin = (struct sockaddr_in *)&mcast_ss; struct sockaddr_in *boundto_sin = (struct sockaddr_in *)&boundto_ss; unsigned int sendbuf_size; unsigned int recvbuf_size; unsigned int optlen = sizeof (sendbuf_size); unsigned int retries; int addrlen; int res; int flag; uint8_t sflag; int i; /* * Create multicast recv socket */ sockets->mcast_recv = socket (bindnet_address->family, SOCK_DGRAM, 0); if (sockets->mcast_recv == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "socket() failed"); return (-1); } totemip_nosigpipe (sockets->mcast_recv); res = fcntl (sockets->mcast_recv, F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Could not set non-blocking operation on multicast socket"); return (-1); } /* * Force reuse */ flag = 1; if ( setsockopt(sockets->mcast_recv, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof (flag)) < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "setsockopt(SO_REUSEADDR) failed"); return (-1); } /* * Create local multicast loop socket */ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets->local_mcast_loop) == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "socket() failed"); return (-1); } for (i = 0; i < 2; i++) { totemip_nosigpipe (sockets->local_mcast_loop[i]); res = fcntl (sockets->local_mcast_loop[i], F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Could not set non-blocking operation on multicast socket"); return (-1); } } /* * Setup mcast send socket */ sockets->mcast_send = socket (bindnet_address->family, SOCK_DGRAM, 0); if (sockets->mcast_send == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "socket() failed"); return (-1); } totemip_nosigpipe (sockets->mcast_send); res = fcntl (sockets->mcast_send, F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Could not set non-blocking operation on multicast socket"); return (-1); } /* * Force reuse */ flag = 1; if ( setsockopt(sockets->mcast_send, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof (flag)) < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "setsockopt(SO_REUSEADDR) failed"); return (-1); } totemip_totemip_to_sockaddr_convert(bound_to, instance->totem_interface->ip_port - 1, &sockaddr, &addrlen); retries = 0; while (1) { res = bind (sockets->mcast_send, (struct sockaddr *)&sockaddr, addrlen); if (res == 0) { break; } LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Unable to bind the socket to send multicast packets"); if (++retries > BIND_MAX_RETRIES) { break; } /* * Wait for a while */ (void)poll(NULL, 0, BIND_RETRIES_INTERVAL * retries); } if (res == -1) { return (-1); } /* * Setup unicast socket */ sockets->token = socket (bindnet_address->family, SOCK_DGRAM, 0); if (sockets->token == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "socket() failed"); return (-1); } totemip_nosigpipe (sockets->token); res = fcntl (sockets->token, F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Could not set non-blocking operation on token socket"); return (-1); } /* * Force reuse */ flag = 1; if ( setsockopt(sockets->token, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof (flag)) < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "setsockopt(SO_REUSEADDR) failed"); return (-1); } /* * Bind to unicast socket used for token send/receives * This has the side effect of binding to the correct interface */ totemip_totemip_to_sockaddr_convert(bound_to, instance->totem_interface->ip_port, &sockaddr, &addrlen); retries = 0; while (1) { res = bind (sockets->token, (struct sockaddr *)&sockaddr, addrlen); if (res == 0) { break; } LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Unable to bind UDP unicast socket"); if (++retries > BIND_MAX_RETRIES) { break; } /* * Wait for a while */ (void)poll(NULL, 0, BIND_RETRIES_INTERVAL * retries); } if (res == -1) { return (-1); } recvbuf_size = MCAST_SOCKET_BUFFER_SIZE; sendbuf_size = MCAST_SOCKET_BUFFER_SIZE; /* * Set buffer sizes to avoid overruns */ res = setsockopt (sockets->mcast_recv, SOL_SOCKET, SO_RCVBUF, &recvbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_debug, "Unable to set SO_RCVBUF size on UDP mcast socket"); return (-1); } res = setsockopt (sockets->mcast_send, SOL_SOCKET, SO_SNDBUF, &sendbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_debug, "Unable to set SO_SNDBUF size on UDP mcast socket"); return (-1); } res = setsockopt (sockets->local_mcast_loop[0], SOL_SOCKET, SO_RCVBUF, &recvbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_debug, "Unable to set SO_RCVBUF size on UDP local mcast loop socket"); return (-1); } res = setsockopt (sockets->local_mcast_loop[1], SOL_SOCKET, SO_SNDBUF, &sendbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_debug, "Unable to set SO_SNDBUF size on UDP local mcast loop socket"); return (-1); } res = getsockopt (sockets->mcast_recv, SOL_SOCKET, SO_RCVBUF, &recvbuf_size, &optlen); if (res == 0) { log_printf (instance->totemudp_log_level_debug, "Receive multicast socket recv buffer size (%d bytes).", recvbuf_size); } res = getsockopt (sockets->mcast_send, SOL_SOCKET, SO_SNDBUF, &sendbuf_size, &optlen); if (res == 0) { log_printf (instance->totemudp_log_level_debug, "Transmit multicast socket send buffer size (%d bytes).", sendbuf_size); } res = getsockopt (sockets->local_mcast_loop[0], SOL_SOCKET, SO_RCVBUF, &recvbuf_size, &optlen); if (res == 0) { log_printf (instance->totemudp_log_level_debug, "Local receive multicast loop socket recv buffer size (%d bytes).", recvbuf_size); } res = getsockopt (sockets->local_mcast_loop[1], SOL_SOCKET, SO_SNDBUF, &sendbuf_size, &optlen); if (res == 0) { log_printf (instance->totemudp_log_level_debug, "Local transmit multicast loop socket send buffer size (%d bytes).", sendbuf_size); } /* * Join group membership on socket */ totemip_totemip_to_sockaddr_convert(mcast_address, instance->totem_interface->ip_port, &mcast_ss, &addrlen); totemip_totemip_to_sockaddr_convert(bound_to, instance->totem_interface->ip_port, &boundto_ss, &addrlen); if (instance->totem_config->broadcast_use == 1) { unsigned int broadcast = 1; if ((setsockopt(sockets->mcast_recv, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof (broadcast))) == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "setting broadcast option failed"); return (-1); } if ((setsockopt(sockets->mcast_send, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof (broadcast))) == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "setting broadcast option failed"); return (-1); } } else { switch (bindnet_address->family) { case AF_INET: memset(&mreq, 0, sizeof(mreq)); mreq.imr_multiaddr.s_addr = mcast_sin->sin_addr.s_addr; mreq.imr_interface.s_addr = boundto_sin->sin_addr.s_addr; res = setsockopt (sockets->mcast_recv, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof (mreq)); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "join ipv4 multicast group failed"); return (-1); } break; case AF_INET6: memset(&mreq6, 0, sizeof(mreq6)); memcpy(&mreq6.ipv6mr_multiaddr, &mcast_sin6->sin6_addr, sizeof(struct in6_addr)); mreq6.ipv6mr_interface = interface_num; res = setsockopt (sockets->mcast_recv, IPPROTO_IPV6, IPV6_JOIN_GROUP, &mreq6, sizeof (mreq6)); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "join ipv6 multicast group failed"); return (-1); } break; } } /* * Turn off multicast loopback */ flag = 0; switch ( bindnet_address->family ) { case AF_INET: sflag = 0; res = setsockopt (sockets->mcast_send, IPPROTO_IP, IP_MULTICAST_LOOP, &sflag, sizeof (sflag)); break; case AF_INET6: res = setsockopt (sockets->mcast_send, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &flag, sizeof (flag)); } if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Unable to turn off multicast loopback"); return (-1); } /* * Set multicast packets TTL */ flag = instance->totem_interface->ttl; if (bindnet_address->family == AF_INET6) { res = setsockopt (sockets->mcast_send, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &flag, sizeof (flag)); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "set mcast v6 TTL failed"); return (-1); } } else { sflag = flag; res = setsockopt(sockets->mcast_send, IPPROTO_IP, IP_MULTICAST_TTL, &sflag, sizeof(sflag)); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "set mcast v4 TTL failed"); return (-1); } } /* * Bind to a specific interface for multicast send and receive */ switch ( bindnet_address->family ) { case AF_INET: if (setsockopt (sockets->mcast_send, IPPROTO_IP, IP_MULTICAST_IF, &boundto_sin->sin_addr, sizeof (boundto_sin->sin_addr)) < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "cannot select interface for multicast packets (send)"); return (-1); } if (setsockopt (sockets->mcast_recv, IPPROTO_IP, IP_MULTICAST_IF, &boundto_sin->sin_addr, sizeof (boundto_sin->sin_addr)) < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "cannot select interface for multicast packets (recv)"); return (-1); } break; case AF_INET6: if (setsockopt (sockets->mcast_send, IPPROTO_IPV6, IPV6_MULTICAST_IF, &interface_num, sizeof (interface_num)) < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "cannot select interface for multicast packets (send v6)"); return (-1); } if (setsockopt (sockets->mcast_recv, IPPROTO_IPV6, IPV6_MULTICAST_IF, &interface_num, sizeof (interface_num)) < 0) { LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "cannot select interface for multicast packets (recv v6)"); return (-1); } break; } /* * Bind to multicast socket used for multicast receives * This needs to happen after all of the multicast setsockopt() calls * as the kernel seems to only put them into effect (for IPV6) when bind() * is called. */ totemip_totemip_to_sockaddr_convert(mcast_address, instance->totem_interface->ip_port, &sockaddr, &addrlen); retries = 0; while (1) { res = bind (sockets->mcast_recv, (struct sockaddr *)&sockaddr, addrlen); if (res == 0) { break; } LOGSYS_PERROR (errno, instance->totemudp_log_level_warning, "Unable to bind the socket to receive multicast packets"); if (++retries > BIND_MAX_RETRIES) { break; } /* * Wait for a while */ (void)poll(NULL, 0, BIND_RETRIES_INTERVAL * retries); } if (res == -1) { return (-1); } return 0; } static int totemudp_build_sockets ( struct totemudp_instance *instance, struct totem_ip_address *mcast_address, struct totem_ip_address *bindnet_address, struct totemudp_socket *sockets, struct totem_ip_address *bound_to) { int interface_num; int interface_up; int res; /* * Determine the ip address bound to and the interface name */ res = netif_determine (instance, bindnet_address, bound_to, &interface_up, &interface_num); if (res == -1) { return (-1); } totemip_copy(&instance->my_id, bound_to); res = totemudp_build_sockets_ip (instance, mcast_address, bindnet_address, sockets, bound_to, interface_num); if (res == -1) { /* if we get here, corosync won't work anyway, so better leaving than faking to work */ LOGSYS_PERROR (errno, instance->totemudp_log_level_error, "Unable to create sockets, exiting"); exit(EXIT_FAILURE); } /* We only send out of the token socket */ totemudp_traffic_control_set(instance, sockets->token); return res; } /* * Totem Network interface * depends on poll abstraction, POSIX, IPV4 */ /* * Create an instance */ int totemudp_initialize ( qb_loop_t *poll_handle, void **udp_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)) { struct totemudp_instance *instance; instance = malloc (sizeof (struct totemudp_instance)); if (instance == NULL) { return (-1); } totemudp_instance_initialize (instance); instance->totem_config = totem_config; instance->stats = stats; /* * Configure logging */ instance->totemudp_log_level_security = 1; //totem_config->totem_logging_configuration.log_level_security; instance->totemudp_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemudp_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemudp_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemudp_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemudp_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemudp_log_printf = totem_config->totem_logging_configuration.log_printf; /* * Initialize local variables for totemudp */ instance->totem_interface = &totem_config->interfaces[0]; totemip_copy (&instance->mcast_address, &instance->totem_interface->mcast_addr); memset (instance->iov_buffer, 0, UDP_RECEIVE_FRAME_SIZE_MAX); instance->totemudp_poll_handle = poll_handle; instance->totem_interface->bindnet.nodeid = instance->totem_config->node_id; instance->context = context; instance->totemudp_deliver_fn = deliver_fn; instance->totemudp_iface_change_fn = iface_change_fn; instance->totemudp_target_set_completed = target_set_completed; totemip_localhost (instance->mcast_address.family, &localhost); localhost.nodeid = instance->totem_config->node_id; /* * RRP layer isn't ready to receive message because it hasn't * initialized yet. Add short timer to check the interfaces. */ qb_loop_timer_add (instance->totemudp_poll_handle, QB_LOOP_MED, 100*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); *udp_context = instance; return (0); } void *totemudp_buffer_alloc (void) { return malloc (FRAME_SIZE_MAX); } void totemudp_buffer_release (void *ptr) { return free (ptr); } int totemudp_processor_count_set ( void *udp_context, int processor_count) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; int res = 0; instance->my_memb_entries = processor_count; qb_loop_timer_del (instance->totemudp_poll_handle, instance->timer_netif_check_timeout); if (processor_count == 1) { qb_loop_timer_add (instance->totemudp_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } return (res); } int totemudp_recv_flush (void *udp_context) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; struct pollfd ufd; int nfds; int res = 0; int i; int sock; instance->flushing = 1; for (i = 0; i < 2; i++) { sock = -1; if (i == 0) { sock = instance->totemudp_sockets.mcast_recv; } if (i == 1) { sock = instance->totemudp_sockets.local_mcast_loop[0]; } assert(sock != -1); do { ufd.fd = sock; ufd.events = POLLIN; nfds = poll (&ufd, 1, 0); if (nfds == 1 && ufd.revents & POLLIN) { net_deliver_fn (sock, ufd.revents, instance); } } while (nfds == 1); } instance->flushing = 0; return (res); } int totemudp_send_flush (void *udp_context) { return 0; } int totemudp_token_send ( void *udp_context, const void *msg, unsigned int msg_len) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; int res = 0; ucast_sendmsg (instance, &instance->token_target, msg, msg_len); return (res); } int totemudp_mcast_flush_send ( void *udp_context, const void *msg, unsigned int msg_len) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; int res = 0; mcast_sendmsg (instance, msg, msg_len); return (res); } int totemudp_mcast_noflush_send ( void *udp_context, const void *msg, unsigned int msg_len) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; int res = 0; mcast_sendmsg (instance, msg, msg_len); return (res); } extern int totemudp_iface_check (void *udp_context) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; int res = 0; timer_function_netif_check_timeout (instance); return (res); } int totemudp_nodestatus_get (void *udp_context, unsigned int nodeid, struct totem_node_status *node_status) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; struct qb_list_head *list; struct totemudp_member *member; qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudp_member, list); if (member->member.nodeid == nodeid) { node_status->nodeid = nodeid; /* reachable is filled in by totemsrp */ node_status->link_status[0].enabled = 1; if (instance->netif_bind_state == BIND_STATE_REGULAR) { node_status->link_status[0].enabled = 1; } else { node_status->link_status[0].enabled = 0; } node_status->link_status[0].connected = node_status->reachable; node_status->link_status[0].mtu = instance->totem_config->net_mtu; strncpy(node_status->link_status[0].src_ipaddr, totemip_print(&member->member), KNET_MAX_HOST_LEN-1); } } return (0); } int totemudp_ifaces_get ( void *net_context, char ***status, unsigned int *iface_count) { static char *statuses[INTERFACE_MAX] = {(char*)"OK"}; if (status) { *status = statuses; } *iface_count = 1; return (0); } extern void totemudp_net_mtu_adjust (void *udp_context, struct totem_config *totem_config) { totem_config->net_mtu -= totemip_udpip_header_size(totem_config->interfaces[0].bindnet.family); } int totemudp_token_target_set ( void *udp_context, unsigned int nodeid) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; struct qb_list_head *list; struct totemudp_member *member; int res = 0; qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudp_member, list); if (member->member.nodeid == nodeid) { memcpy (&instance->token_target, &member->member, sizeof (struct totem_ip_address)); instance->totemudp_target_set_completed (instance->context); break; } } return (res); } extern int totemudp_recv_mcast_empty ( void *udp_context) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; unsigned int res; struct sockaddr_storage system_from; struct msghdr msg_recv; struct pollfd ufd; int nfds; int msg_processed = 0; int i; int sock; /* * Receive datagram */ msg_recv.msg_name = &system_from; msg_recv.msg_namelen = sizeof (struct sockaddr_storage); msg_recv.msg_iov = &instance->totemudp_iov_recv_flush; msg_recv.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_recv.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_recv.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_recv.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_recv.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_recv.msg_accrightslen = 0; #endif for (i = 0; i < 2; i++) { sock = -1; if (i == 0) { sock = instance->totemudp_sockets.mcast_recv; } if (i == 1) { sock = instance->totemudp_sockets.local_mcast_loop[0]; } assert(sock != -1); do { ufd.fd = sock; ufd.events = POLLIN; nfds = poll (&ufd, 1, 0); if (nfds == 1 && ufd.revents & POLLIN) { res = recvmsg (sock, &msg_recv, MSG_NOSIGNAL | MSG_DONTWAIT); if (res != -1) { msg_processed = 1; } else { msg_processed = -1; } } } while (nfds == 1); } return (msg_processed); } int totemudp_member_add ( void *udp_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no) { struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; struct totemudp_member *new_member; new_member = malloc (sizeof (struct totemudp_member)); if (new_member == NULL) { return (-1); } memset(new_member, 0, sizeof(*new_member)); qb_list_init (&new_member->list); qb_list_add_tail (&new_member->list, &instance->member_list); memcpy (&new_member->member, member, sizeof (struct totem_ip_address)); return (0); } int totemudp_member_remove ( void *udp_context, const struct totem_ip_address *token_target, int ring_no) { int found = 0; struct qb_list_head *list; struct totemudp_member *member; struct totemudp_instance *instance = (struct totemudp_instance *)udp_context; /* * Find the member to remove and close its socket */ qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudp_member, list); if (totemip_compare (token_target, &member->member)==0) { found = 1; break; } } /* * Delete the member from the list */ if (found) { qb_list_del (list); } return (0); } int totemudp_iface_set (void *net_context, const struct totem_ip_address *local_addr, unsigned short ip_port, unsigned int iface_no) { /* Not supported */ return (-1); } int totemudp_reconfigure ( void *udp_context, struct totem_config *totem_config) { /* Not supported */ return (-1); } diff --git a/exec/totemudp.h b/exec/totemudp.h index 7d2abcd9..66424724 100644 --- a/exec/totemudp.h +++ b/exec/totemudp.h @@ -1,144 +1,144 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2011 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMUDP_H_DEFINED #define TOTEMUDP_H_DEFINED #include #include #include #include /** * Create an instance */ extern int totemudp_initialize ( qb_loop_t* poll_handle, void **udp_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)); extern void *totemudp_buffer_alloc (void); extern void totemudp_buffer_release (void *ptr); extern int totemudp_processor_count_set ( void *udp_context, int processor_count); extern int totemudp_token_send ( void *udp_context, const void *msg, unsigned int msg_len); extern int totemudp_mcast_flush_send ( void *udp_context, const void *msg, unsigned int msg_len); extern int totemudp_mcast_noflush_send ( void *udp_context, const void *msg, unsigned int msg_len); extern int totemudp_nodestatus_get (void *net_context, unsigned int nodeid, struct totem_node_status *node_status); extern int totemudp_ifaces_get (void *net_context, char ***status, unsigned int *iface_count); extern int totemudp_recv_flush (void *udp_context); extern int totemudp_send_flush (void *udp_context); extern int totemudp_iface_set (void *net_context, const struct totem_ip_address *local_addr, unsigned short ip_port, unsigned int iface_no); extern int totemudp_iface_check (void *udp_context); extern int totemudp_finalize (void *udp_context); extern void totemudp_net_mtu_adjust (void *udp_context, struct totem_config *totem_config); extern int totemudp_token_target_set ( void *udp_context, unsigned int nodeid); extern int totemudp_crypto_set ( void *udp_context, const char *cipher_type, const char *hash_type); extern int totemudp_recv_mcast_empty ( void *udp_context); extern int totemudp_member_add ( void *udpu_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no); extern int totemudp_member_remove ( void *udpu_context, const struct totem_ip_address *member, int ring_no); extern int totemudp_reconfigure ( void *udp_context, struct totem_config *totem_config); #endif /* TOTEMUDP_H_DEFINED */ diff --git a/exec/totemudpu.c b/exec/totemudpu.c index a7029a46..399b47b1 100644 --- a/exec/totemudpu.c +++ b/exec/totemudpu.c @@ -1,1453 +1,1453 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemudpu.h" #include "util.h" #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif #define MCAST_SOCKET_BUFFER_SIZE (TRANSMITS_ALLOWED * UDP_RECEIVE_FRAME_SIZE_MAX) #define NETIF_STATE_REPORT_UP 1 #define NETIF_STATE_REPORT_DOWN 2 #define BIND_STATE_UNBOUND 0 #define BIND_STATE_REGULAR 1 #define BIND_STATE_LOOPBACK 2 struct totemudpu_member { struct qb_list_head list; struct totem_ip_address member; int fd; int active; }; struct totemudpu_instance { qb_loop_t *totemudpu_poll_handle; struct totem_interface *totem_interface; int netif_state_report; int netif_bind_state; void *context; - void (*totemudpu_deliver_fn) ( + int (*totemudpu_deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from); - void (*totemudpu_iface_change_fn) ( + int (*totemudpu_iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no); void (*totemudpu_target_set_completed) (void *context); /* * Function and data used to log messages */ int totemudpu_log_level_security; int totemudpu_log_level_error; int totemudpu_log_level_warning; int totemudpu_log_level_notice; int totemudpu_log_level_debug; int totemudpu_subsys_id; void (*totemudpu_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7))); void *udpu_context; char iov_buffer[UDP_RECEIVE_FRAME_SIZE_MAX]; struct iovec totemudpu_iov_recv; struct qb_list_head member_list; int stats_sent; int stats_recv; int stats_delv; int stats_remcasts; int stats_orf_token; struct timeval stats_tv_start; struct totem_ip_address my_id; int firstrun; qb_loop_timer_handle timer_netif_check_timeout; unsigned int my_memb_entries; struct totem_config *totem_config; totemsrp_stats_t *stats; struct totem_ip_address token_target; int token_socket; int local_loop_sock[2]; qb_loop_timer_handle timer_merge_detect_timeout; int send_merge_detect_message; unsigned int merge_detect_messages_sent_before_timeout; }; struct work_item { const void *msg; unsigned int msg_len; struct totemudpu_instance *instance; }; static int totemudpu_build_sockets ( struct totemudpu_instance *instance, struct totem_ip_address *bindnet_address, struct totem_ip_address *bound_to); static int totemudpu_create_sending_socket( void *udpu_context, const struct totem_ip_address *member); int totemudpu_member_list_rebind_ip ( void *udpu_context); static void totemudpu_start_merge_detect_timeout( void *udpu_context); static void totemudpu_stop_merge_detect_timeout( void *udpu_context); static void totemudpu_instance_initialize (struct totemudpu_instance *instance) { memset (instance, 0, sizeof (struct totemudpu_instance)); instance->netif_state_report = NETIF_STATE_REPORT_UP | NETIF_STATE_REPORT_DOWN; instance->totemudpu_iov_recv.iov_base = instance->iov_buffer; instance->totemudpu_iov_recv.iov_len = UDP_RECEIVE_FRAME_SIZE_MAX; //sizeof (instance->iov_buffer); /* * There is always atleast 1 processor */ instance->my_memb_entries = 1; qb_list_init (&instance->member_list); } #define log_printf(level, format, args...) \ do { \ instance->totemudpu_log_printf ( \ level, instance->totemudpu_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ (const char *)format, ##args); \ } while (0); #define LOGSYS_PERROR(err_num, level, fmt, args...) \ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemudpu_log_printf ( \ level, instance->totemudpu_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)", ##args, _error_ptr, err_num); \ } while(0) int totemudpu_crypto_set ( void *udpu_context, const char *cipher_type, const char *hash_type) { return (0); } static inline void ucast_sendmsg ( struct totemudpu_instance *instance, struct totem_ip_address *system_to, const void *msg, unsigned int msg_len) { struct msghdr msg_ucast; int res = 0; struct sockaddr_storage sockaddr; struct iovec iovec; int addrlen; int send_sock; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; /* * Build unicast message */ totemip_totemip_to_sockaddr_convert(system_to, instance->totem_interface->ip_port, &sockaddr, &addrlen); memset(&msg_ucast, 0, sizeof(msg_ucast)); msg_ucast.msg_name = &sockaddr; msg_ucast.msg_namelen = addrlen; msg_ucast.msg_iov = (void *)&iovec; msg_ucast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_ucast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_ucast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_ucast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_ucast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_ucast.msg_accrightslen = 0; #endif if (instance->netif_bind_state == BIND_STATE_REGULAR) { send_sock = instance->token_socket; } else { send_sock = instance->local_loop_sock[1]; msg_ucast.msg_name = NULL; msg_ucast.msg_namelen = 0; } /* * Transmit unicast message * An error here is recovered by totemsrp */ res = sendmsg (send_sock, &msg_ucast, MSG_NOSIGNAL); if (res < 0) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_debug, "sendmsg(ucast) failed (non-critical)"); } } static inline void mcast_sendmsg ( struct totemudpu_instance *instance, const void *msg, unsigned int msg_len, int only_active) { struct msghdr msg_mcast; int res = 0; struct iovec iovec; struct sockaddr_storage sockaddr; int addrlen; struct qb_list_head *list; struct totemudpu_member *member; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; memset(&msg_mcast, 0, sizeof(msg_mcast)); /* * Build multicast message */ if (instance->netif_bind_state == BIND_STATE_REGULAR) { qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); /* * Do not send multicast message if message is not "flush", member * is inactive and timeout for sending merge message didn't expired. */ if (only_active && !member->active && !instance->send_merge_detect_message) continue ; totemip_totemip_to_sockaddr_convert(&member->member, instance->totem_interface->ip_port, &sockaddr, &addrlen); msg_mcast.msg_name = &sockaddr; msg_mcast.msg_namelen = addrlen; msg_mcast.msg_iov = (void *)&iovec; msg_mcast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_mcast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_mcast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_mcast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_mcast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_mcast.msg_accrightslen = 0; #endif /* * Transmit multicast message * An error here is recovered by totemsrp */ res = sendmsg (member->fd, &msg_mcast, MSG_NOSIGNAL); if (res < 0) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_debug, "sendmsg(mcast) failed (non-critical)"); } } if (!only_active || instance->send_merge_detect_message) { /* * Current message was sent to all nodes */ instance->merge_detect_messages_sent_before_timeout++; instance->send_merge_detect_message = 0; } } else { /* * Transmit multicast message to local unix mcast loop * An error here is recovered by totemsrp */ msg_mcast.msg_name = NULL; msg_mcast.msg_namelen = 0; msg_mcast.msg_iov = (void *)&iovec; msg_mcast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_mcast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_mcast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_mcast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_mcast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_mcast.msg_accrightslen = 0; #endif res = sendmsg (instance->local_loop_sock[1], &msg_mcast, MSG_NOSIGNAL); if (res < 0) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_debug, "sendmsg(local mcast loop) failed (non-critical)"); } } } int totemudpu_finalize ( void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; if (instance->token_socket > 0) { qb_loop_poll_del (instance->totemudpu_poll_handle, instance->token_socket); close (instance->token_socket); } if (instance->local_loop_sock[0] > 0) { qb_loop_poll_del (instance->totemudpu_poll_handle, instance->local_loop_sock[0]); close (instance->local_loop_sock[0]); close (instance->local_loop_sock[1]); } totemudpu_stop_merge_detect_timeout(instance); return (res); } static struct totemudpu_member *find_member_by_sockaddr( const void *udpu_context, const struct sockaddr *sa) { struct qb_list_head *list; struct totemudpu_member *member; struct totemudpu_member *res_member; const struct totemudpu_instance *instance = (const struct totemudpu_instance *)udpu_context; res_member = NULL; qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); if (totemip_sa_equal(&member->member, sa)) { res_member = member; break ; } } return (res_member); } static int net_deliver_fn ( int fd, int revents, void *data) { struct totemudpu_instance *instance = (struct totemudpu_instance *)data; struct msghdr msg_recv; struct iovec *iovec; struct sockaddr_storage system_from; int bytes_received; int truncated_packet; iovec = &instance->totemudpu_iov_recv; /* * Receive datagram */ msg_recv.msg_name = &system_from; msg_recv.msg_namelen = sizeof (struct sockaddr_storage); msg_recv.msg_iov = iovec; msg_recv.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_recv.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_recv.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_recv.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_recv.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_recv.msg_accrightslen = 0; #endif bytes_received = recvmsg (fd, &msg_recv, MSG_NOSIGNAL | MSG_DONTWAIT); if (bytes_received == -1) { return (0); } else { instance->stats_recv += bytes_received; } truncated_packet = 0; #ifdef HAVE_MSGHDR_FLAGS if (msg_recv.msg_flags & MSG_TRUNC) { truncated_packet = 1; } #else /* * We don't have MSGHDR_FLAGS, but we can (hopefully) safely make assumption that * if bytes_received == UDP_RECEIVE_FRAME_SIZE_MAX then packet is truncated */ if (bytes_received == UDP_RECEIVE_FRAME_SIZE_MAX) { truncated_packet = 1; } #endif if (truncated_packet) { log_printf (instance->totemudpu_log_level_error, "Received too big message. This may be because something bad is happening" "on the network (attack?), or you tried join more nodes than corosync is" "compiled with (%u) or bug in the code (bad estimation of " "the UDP_RECEIVE_FRAME_SIZE_MAX). Dropping packet.", PROCESSOR_COUNT_MAX); return (0); } if (instance->totem_config->block_unlisted_ips && instance->netif_bind_state == BIND_STATE_REGULAR && find_member_by_sockaddr(instance, (const struct sockaddr *)&system_from) == NULL) { log_printf(instance->totemudpu_log_level_debug, "Packet rejected from %s", totemip_sa_print((const struct sockaddr *)&system_from)); return (0); } iovec->iov_len = bytes_received; /* * Handle incoming message */ instance->totemudpu_deliver_fn ( instance->context, iovec->iov_base, iovec->iov_len, &system_from); iovec->iov_len = UDP_RECEIVE_FRAME_SIZE_MAX; return (0); } static int netif_determine ( struct totemudpu_instance *instance, struct totem_ip_address *bindnet, struct totem_ip_address *bound_to, int *interface_up, int *interface_num) { int res; res = totemip_iface_check (bindnet, bound_to, interface_up, interface_num, instance->totem_config->clear_node_high_bit); return (res); } /* * If the interface is up, the sockets for totem are built. If the interface is down * this function is requeued in the timer list to retry building the sockets later. */ static void timer_function_netif_check_timeout ( void *data) { struct totemudpu_instance *instance = (struct totemudpu_instance *)data; int interface_up; int interface_num; /* * Build sockets for every interface */ netif_determine (instance, &instance->totem_interface->bindnet, &instance->totem_interface->boundto, &interface_up, &interface_num); /* * If the network interface isn't back up and we are already * in loopback mode, add timer to check again and return */ if ((instance->netif_bind_state == BIND_STATE_LOOPBACK && interface_up == 0) || (instance->my_memb_entries == 1 && instance->netif_bind_state == BIND_STATE_REGULAR && interface_up == 1)) { qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); /* * Add a timer to check for a downed regular interface */ return; } if (instance->token_socket > 0) { qb_loop_poll_del (instance->totemudpu_poll_handle, instance->token_socket); close (instance->token_socket); instance->token_socket = -1; } if (interface_up == 0) { if (instance->netif_bind_state == BIND_STATE_UNBOUND) { log_printf (instance->totemudpu_log_level_error, "One of your ip addresses are now bound to localhost. " "Corosync would not work correctly."); exit(COROSYNC_DONE_FATAL_ERR); } /* * Interface is not up */ instance->netif_bind_state = BIND_STATE_LOOPBACK; /* * Add a timer to retry building interfaces and request memb_gather_enter */ qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } else { /* * Interface is up */ instance->netif_bind_state = BIND_STATE_REGULAR; } /* * Create and bind the multicast and unicast sockets */ totemudpu_build_sockets (instance, &instance->totem_interface->bindnet, &instance->totem_interface->boundto); if (instance->netif_bind_state == BIND_STATE_REGULAR) { qb_loop_poll_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->token_socket, POLLIN, instance, net_deliver_fn); } totemip_copy (&instance->my_id, &instance->totem_interface->boundto); /* * This reports changes in the interface to the user and totemsrp */ if (instance->netif_bind_state == BIND_STATE_REGULAR) { if (instance->netif_state_report & NETIF_STATE_REPORT_UP) { log_printf (instance->totemudpu_log_level_notice, "The network interface [%s] is now up.", totemip_print (&instance->totem_interface->boundto)); instance->netif_state_report = NETIF_STATE_REPORT_DOWN; instance->totemudpu_iface_change_fn (instance->context, &instance->my_id, 0); } /* * Add a timer to check for interface going down in single membership */ if (instance->my_memb_entries == 1) { qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } } else { if (instance->netif_state_report & NETIF_STATE_REPORT_DOWN) { log_printf (instance->totemudpu_log_level_notice, "The network interface is down."); instance->totemudpu_iface_change_fn (instance->context, &instance->my_id, 0); } instance->netif_state_report = NETIF_STATE_REPORT_UP; } } /* Set the socket priority to INTERACTIVE to ensure that our messages don't get queued behind anything else */ static void totemudpu_traffic_control_set(struct totemudpu_instance *instance, int sock) { #ifdef SO_PRIORITY int prio = 6; /* TC_PRIO_INTERACTIVE */ if (setsockopt(sock, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(int))) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not set traffic priority"); } #endif } static int totemudpu_build_sockets_ip ( struct totemudpu_instance *instance, struct totem_ip_address *bindnet_address, struct totem_ip_address *bound_to, int interface_num) { struct sockaddr_storage sockaddr; int addrlen; int res; unsigned int recvbuf_size; unsigned int optlen = sizeof (recvbuf_size); unsigned int retries = 0; /* * Setup unicast socket */ instance->token_socket = socket (bindnet_address->family, SOCK_DGRAM, 0); if (instance->token_socket == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "socket() failed"); return (-1); } totemip_nosigpipe (instance->token_socket); res = fcntl (instance->token_socket, F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not set non-blocking operation on token socket"); return (-1); } /* * Bind to unicast socket used for token send/receives * This has the side effect of binding to the correct interface */ totemip_totemip_to_sockaddr_convert(bound_to, instance->totem_interface->ip_port, &sockaddr, &addrlen); while (1) { res = bind (instance->token_socket, (struct sockaddr *)&sockaddr, addrlen); if (res == 0) { break; } LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "bind token socket failed"); if (++retries > BIND_MAX_RETRIES) { break; } /* * Wait for a while */ (void)poll(NULL, 0, BIND_RETRIES_INTERVAL * retries); } if (res == -1) { return (-1); } /* * the token_socket can receive many messages. Allow a large number * of receive messages on this socket */ recvbuf_size = MCAST_SOCKET_BUFFER_SIZE; res = setsockopt (instance->token_socket, SOL_SOCKET, SO_RCVBUF, &recvbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_notice, "Could not set recvbuf size"); } return 0; } int totemudpu_nodestatus_get (void *udpu_context, unsigned int nodeid, struct totem_node_status *node_status) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; struct qb_list_head *list; struct totemudpu_member *member; qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); if (member->member.nodeid == nodeid) { node_status->nodeid = nodeid; /* reachable is filled in by totemsrp */ if (instance->netif_bind_state == BIND_STATE_REGULAR) { node_status->link_status[0].enabled = 1; } else { node_status->link_status[0].enabled = 0; } node_status->link_status[0].connected = node_status->reachable; node_status->link_status[0].mtu = instance->totem_config->net_mtu; strncpy(node_status->link_status[0].src_ipaddr, totemip_print(&member->member), KNET_MAX_HOST_LEN-1); } } return (0); } int totemudpu_ifaces_get ( void *net_context, char ***status, unsigned int *iface_count) { static char *statuses[INTERFACE_MAX] = {(char*)"OK"}; if (status) { *status = statuses; } *iface_count = 1; return (0); } static int totemudpu_build_local_sockets( struct totemudpu_instance *instance) { int i; unsigned int sendbuf_size; unsigned int recvbuf_size; unsigned int optlen = sizeof (sendbuf_size); int res; /* * Create local multicast loop socket */ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, instance->local_loop_sock) == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "socket() failed"); return (-1); } for (i = 0; i < 2; i++) { totemip_nosigpipe (instance->local_loop_sock[i]); res = fcntl (instance->local_loop_sock[i], F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not set non-blocking operation on multicast socket"); return (-1); } } recvbuf_size = MCAST_SOCKET_BUFFER_SIZE; sendbuf_size = MCAST_SOCKET_BUFFER_SIZE; res = setsockopt (instance->local_loop_sock[0], SOL_SOCKET, SO_RCVBUF, &recvbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_debug, "Unable to set SO_RCVBUF size on UDP local mcast loop socket"); return (-1); } res = setsockopt (instance->local_loop_sock[1], SOL_SOCKET, SO_SNDBUF, &sendbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_debug, "Unable to set SO_SNDBUF size on UDP local mcast loop socket"); return (-1); } res = getsockopt (instance->local_loop_sock[0], SOL_SOCKET, SO_RCVBUF, &recvbuf_size, &optlen); if (res == 0) { log_printf (instance->totemudpu_log_level_debug, "Local receive multicast loop socket recv buffer size (%d bytes).", recvbuf_size); } res = getsockopt (instance->local_loop_sock[1], SOL_SOCKET, SO_SNDBUF, &sendbuf_size, &optlen); if (res == 0) { log_printf (instance->totemudpu_log_level_debug, "Local transmit multicast loop socket send buffer size (%d bytes).", sendbuf_size); } return (0); } static int totemudpu_build_sockets ( struct totemudpu_instance *instance, struct totem_ip_address *bindnet_address, struct totem_ip_address *bound_to) { int interface_num; int interface_up; int res; /* * Determine the ip address bound to and the interface name */ res = netif_determine (instance, bindnet_address, bound_to, &interface_up, &interface_num); if (res == -1) { return (-1); } totemip_copy(&instance->my_id, bound_to); res = totemudpu_build_sockets_ip (instance, bindnet_address, bound_to, interface_num); if (res == -1) { /* if we get here, corosync won't work anyway, so better leaving than faking to work */ LOGSYS_PERROR (errno, instance->totemudpu_log_level_error, "Unable to create sockets, exiting"); exit(EXIT_FAILURE); } /* We only send out of the token socket */ totemudpu_traffic_control_set(instance, instance->token_socket); /* * Rebind all members to new ips */ totemudpu_member_list_rebind_ip(instance); return res; } /* * Totem Network interface * depends on poll abstraction, POSIX, IPV4 */ /* * Create an instance */ int totemudpu_initialize ( qb_loop_t *poll_handle, void **udpu_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)) { struct totemudpu_instance *instance; instance = malloc (sizeof (struct totemudpu_instance)); if (instance == NULL) { return (-1); } totemudpu_instance_initialize (instance); instance->totem_config = totem_config; instance->stats = stats; /* * Configure logging */ instance->totemudpu_log_level_security = 1; //totem_config->totem_logging_configuration.log_level_security; instance->totemudpu_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemudpu_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemudpu_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemudpu_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemudpu_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemudpu_log_printf = totem_config->totem_logging_configuration.log_printf; /* * Initialize local variables for totemudpu */ instance->totem_interface = &totem_config->interfaces[0]; memset (instance->iov_buffer, 0, UDP_RECEIVE_FRAME_SIZE_MAX); instance->totemudpu_poll_handle = poll_handle; instance->totem_interface->bindnet.nodeid = instance->totem_config->node_id; instance->context = context; instance->totemudpu_deliver_fn = deliver_fn; instance->totemudpu_iface_change_fn = iface_change_fn; instance->totemudpu_target_set_completed = target_set_completed; /* * Create static local mcast sockets */ if (totemudpu_build_local_sockets(instance) == -1) { free(instance); return (-1); } qb_loop_poll_add ( instance->totemudpu_poll_handle, QB_LOOP_MED, instance->local_loop_sock[0], POLLIN, instance, net_deliver_fn); /* * RRP layer isn't ready to receive message because it hasn't * initialized yet. Add short timer to check the interfaces. */ qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, 100*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); totemudpu_start_merge_detect_timeout((void*)instance); *udpu_context = instance; return (0); } void *totemudpu_buffer_alloc (void) { return malloc (FRAME_SIZE_MAX); } void totemudpu_buffer_release (void *ptr) { return free (ptr); } int totemudpu_processor_count_set ( void *udpu_context, int processor_count) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; instance->my_memb_entries = processor_count; qb_loop_timer_del (instance->totemudpu_poll_handle, instance->timer_netif_check_timeout); if (processor_count == 1) { qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } return (res); } int totemudpu_recv_flush (void *udpu_context) { int res = 0; return (res); } int totemudpu_send_flush (void *udpu_context) { int res = 0; return (res); } int totemudpu_token_send ( void *udpu_context, const void *msg, unsigned int msg_len) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; ucast_sendmsg (instance, &instance->token_target, msg, msg_len); return (res); } int totemudpu_mcast_flush_send ( void *udpu_context, const void *msg, unsigned int msg_len) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; mcast_sendmsg (instance, msg, msg_len, 0); return (res); } int totemudpu_mcast_noflush_send ( void *udpu_context, const void *msg, unsigned int msg_len) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; mcast_sendmsg (instance, msg, msg_len, 1); return (res); } extern int totemudpu_iface_check (void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; timer_function_netif_check_timeout (instance); return (res); } extern void totemudpu_net_mtu_adjust (void *udpu_context, struct totem_config *totem_config) { totem_config->net_mtu -= totemip_udpip_header_size(totem_config->interfaces[0].bindnet.family); } int totemudpu_token_target_set ( void *udpu_context, unsigned int nodeid) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; struct qb_list_head *list; struct totemudpu_member *member; int res = 0; qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); if (member->member.nodeid == nodeid) { memcpy (&instance->token_target, &member->member, sizeof (struct totem_ip_address)); instance->totemudpu_target_set_completed (instance->context); break; } } return (res); } extern int totemudpu_recv_mcast_empty ( void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; unsigned int res; struct sockaddr_storage system_from; struct msghdr msg_recv; struct pollfd ufd; int nfds, i; int msg_processed = 0; int sock; /* * Receive datagram */ msg_recv.msg_name = &system_from; msg_recv.msg_namelen = sizeof (struct sockaddr_storage); msg_recv.msg_iov = &instance->totemudpu_iov_recv; msg_recv.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_recv.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_recv.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_recv.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_recv.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_recv.msg_accrightslen = 0; #endif for (i = 0; i < 2; i++) { sock = -1; if (i == 0) { if (instance->netif_bind_state == BIND_STATE_REGULAR) { sock = instance->token_socket; } else { continue; } } if (i == 1) { sock = instance->local_loop_sock[0]; } assert(sock != -1); do { ufd.fd = sock; ufd.events = POLLIN; nfds = poll (&ufd, 1, 0); if (nfds == 1 && ufd.revents & POLLIN) { res = recvmsg (sock, &msg_recv, MSG_NOSIGNAL | MSG_DONTWAIT); if (res != -1) { msg_processed = 1; } else { msg_processed = -1; } } } while (nfds == 1); } return (msg_processed); } static int totemudpu_create_sending_socket( void *udpu_context, const struct totem_ip_address *member) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int fd; int res; unsigned int sendbuf_size; unsigned int optlen = sizeof (sendbuf_size); struct sockaddr_storage sockaddr; int addrlen; fd = socket (member->family, SOCK_DGRAM, 0); if (fd == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not create socket for new member"); return (-1); } totemip_nosigpipe (fd); res = fcntl (fd, F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not set non-blocking operation on token socket"); goto error_close_fd; } /* * These sockets are used to send multicast messages, so their buffers * should be large */ sendbuf_size = MCAST_SOCKET_BUFFER_SIZE; res = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &sendbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_notice, "Could not set sendbuf size"); /* * Fail in setting sendbuf size is not fatal -> don't exit */ } /* * Bind to sending interface */ totemip_totemip_to_sockaddr_convert(&instance->my_id, 0, &sockaddr, &addrlen); res = bind (fd, (struct sockaddr *)&sockaddr, addrlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "bind token socket failed"); goto error_close_fd; } return (fd); error_close_fd: close(fd); return (-1); } int totemudpu_iface_set (void *net_context, const struct totem_ip_address *local_addr, unsigned short ip_port, unsigned int iface_no) { /* Not supported */ return (-1); } int totemudpu_member_add ( void *udpu_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; struct totemudpu_member *new_member; new_member = malloc (sizeof (struct totemudpu_member)); if (new_member == NULL) { return (-1); } memset(new_member, 0, sizeof(*new_member)); log_printf (LOGSYS_LEVEL_NOTICE, "adding new UDPU member {%s}", totemip_print(member)); qb_list_init (&new_member->list); qb_list_add_tail (&new_member->list, &instance->member_list); memcpy (&new_member->member, member, sizeof (struct totem_ip_address)); new_member->fd = totemudpu_create_sending_socket(udpu_context, member); new_member->active = 1; return (0); } int totemudpu_member_remove ( void *udpu_context, const struct totem_ip_address *token_target, int ring_no) { int found = 0; struct qb_list_head *list; struct totemudpu_member *member; struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; /* * Find the member to remove and close its socket */ qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); if (totemip_compare (token_target, &member->member)==0) { log_printf(LOGSYS_LEVEL_NOTICE, "removing UDPU member {%s}", totemip_print(&member->member)); if (member->fd > 0) { log_printf(LOGSYS_LEVEL_DEBUG, "Closing socket to: {%s}", totemip_print(&member->member)); qb_loop_poll_del (instance->totemudpu_poll_handle, member->fd); close (member->fd); } found = 1; break; } } /* * Delete the member from the list */ if (found) { qb_list_del (list); } instance = NULL; return (0); } int totemudpu_member_list_rebind_ip ( void *udpu_context) { struct qb_list_head *list; struct totemudpu_member *member; struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); if (member->fd > 0) { close (member->fd); } member->fd = totemudpu_create_sending_socket(udpu_context, &member->member); } return (0); } static void timer_function_merge_detect_timeout ( void *data) { struct totemudpu_instance *instance = (struct totemudpu_instance *)data; if (instance->merge_detect_messages_sent_before_timeout == 0) { instance->send_merge_detect_message = 1; } instance->merge_detect_messages_sent_before_timeout = 0; totemudpu_start_merge_detect_timeout(instance); } static void totemudpu_start_merge_detect_timeout( void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; qb_loop_timer_add(instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->merge_timeout * 2 * QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); } static void totemudpu_stop_merge_detect_timeout( void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; qb_loop_timer_del(instance->totemudpu_poll_handle, instance->timer_merge_detect_timeout); } int totemudpu_reconfigure ( void *udpu_context, struct totem_config *totem_config) { /* Not supported */ return (-1); } diff --git a/exec/totemudpu.h b/exec/totemudpu.h index 07e63459..fe530ca1 100644 --- a/exec/totemudpu.h +++ b/exec/totemudpu.h @@ -1,144 +1,144 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2011 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMUDPU_H_DEFINED #define TOTEMUDPU_H_DEFINED #include #include #include #include /** * Create an instance */ extern int totemudpu_initialize ( qb_loop_t *poll_handle, void **udpu_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, - void (*deliver_fn) ( + int (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), - void (*iface_change_fn) ( + int (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)); extern void *totemudpu_buffer_alloc (void); extern void totemudpu_buffer_release (void *ptr); extern int totemudpu_processor_count_set ( void *udpu_context, int processor_count); extern int totemudpu_token_send ( void *udpu_context, const void *msg, unsigned int msg_len); extern int totemudpu_mcast_flush_send ( void *udpu_context, const void *msg, unsigned int msg_len); extern int totemudpu_mcast_noflush_send ( void *udpu_context, const void *msg, unsigned int msg_len); extern int totemudpu_nodestatus_get (void *net_context, unsigned int nodeid, struct totem_node_status *node_status); extern int totemudpu_ifaces_get (void *net_context, char ***status, unsigned int *iface_count); extern int totemudpu_recv_flush (void *udpu_context); extern int totemudpu_send_flush (void *udpu_context); extern int totemudpu_iface_set (void *net_context, const struct totem_ip_address *local_addr, unsigned short ip_port, unsigned int iface_no); extern int totemudpu_iface_check (void *udpu_context); extern int totemudpu_finalize (void *udpu_context); extern void totemudpu_net_mtu_adjust (void *udpu_context, struct totem_config *totem_config); extern int totemudpu_token_target_set ( void *udpu_context, unsigned int nodeid); extern int totemudpu_crypto_set ( void *udpu_context, const char *cipher_type, const char *hash_type); extern int totemudpu_recv_mcast_empty ( void *udpu_context); extern int totemudpu_member_add ( void *udpu_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no); extern int totemudpu_member_remove ( void *udpu_context, const struct totem_ip_address *member, int ring_no); extern int totemudpu_reconfigure ( void *udpu_context, struct totem_config *totem_config); #endif /* TOTEMUDPU_H_DEFINED */