diff --git a/exec/cfg.c b/exec/cfg.c index 05c9a3a5..c9d98489 100644 --- a/exec/cfg.c +++ b/exec/cfg.c @@ -1,1083 +1,1083 @@ /* * Copyright (c) 2005-2006 MontaVista Software, Inc. * Copyright (c) 2006-2013 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 * NOTE(review): the header names of the #include directives below are
 * missing from this copy of the file; only the two quoted includes survive.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "service.h"
#include "main.h"

LOGSYS_DECLARE_SUBSYS ("CFG");

/*
 * Identifiers of the executive (cluster-wide) messages of this service.
 * The values equal the index comments in cfg_exec_engine[] below.
 */
enum cfg_message_req_types {
	MESSAGE_REQ_EXEC_CFG_RINGREENABLE = 0,
	MESSAGE_REQ_EXEC_CFG_KILLNODE = 1,
	MESSAGE_REQ_EXEC_CFG_SHUTDOWN = 2,
	MESSAGE_REQ_EXEC_CFG_RELOAD_CONFIG = 3
};

/*
 * Fallback for the "cfg.shutdown_timeout" key. The tryshutdown handler
 * multiplies the value by 1000000000 before arming the timer
 * (presumably seconds -> nanoseconds; confirm against timer_add_duration).
 */
#define DEFAULT_SHUTDOWN_TIMEOUT 5

/* List of struct cfg_info entries that are polled when a shutdown is requested */
static struct qb_list_head trackers_list;

/*
 * Variables controlling a requested shutdown
 */
static corosync_timer_handle_t shutdown_timer;	/* timer armed by tryshutdown */
static struct cfg_info *shutdown_con;		/* requester of the shutdown, NULL when none is pending */
static uint32_t shutdown_flags;			/* flags copied from the tryshutdown request */
static int shutdown_yes;			/* number of "yes" replies so far */
static int shutdown_no;				/* number of "no" replies so far */
static int shutdown_expected;			/* number of replies awaited */

/*
 * Per-connection private data (its size is registered through
 * cfg_service_engine.private_data_size).
 */
struct cfg_info {
	struct qb_list_head list;	/* linkage into trackers_list */
	void *conn;			/* connection stored by the tryshutdown handler */
	void *tracker_conn;		/* connection test-shutdown notifications are dispatched to */
	enum {SHUTDOWN_REPLY_UNKNOWN, SHUTDOWN_REPLY_YES, SHUTDOWN_REPLY_NO} shutdown_reply;
};

static void cfg_confchg_fn (
	enum totem_configuration_type configuration_type,
	const unsigned int *member_list, size_t member_list_entries,
	const unsigned int *left_list, size_t left_list_entries,
	const unsigned int *joined_list, size_t joined_list_entries,
	const struct memb_ring_id *ring_id);

static char *cfg_exec_init_fn (struct corosync_api_v1 *corosync_api_v1);

/* Service API table, stored by cfg_exec_init_fn() */
static struct corosync_api_v1 *api;

static int cfg_lib_init_fn (void *conn);

static int cfg_lib_exit_fn (void *conn);

/* Handlers for executive messages */
static void message_handler_req_exec_cfg_ringreenable (
	const void *message,
	unsigned int nodeid);

static void message_handler_req_exec_cfg_killnode (
	const void *message,
	unsigned int nodeid);

static void message_handler_req_exec_cfg_shutdown (
	const void *message,
	unsigned int nodeid);

static void message_handler_req_exec_cfg_reload_config (
	const void *message,
	unsigned int nodeid);

static void exec_cfg_killnode_endian_convert (void *msg);

/* Handlers for library (IPC client) requests */
static void message_handler_req_lib_cfg_ringstatusget (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_ringreenable (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_killnode (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_tryshutdown (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_replytoshutdown (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_get_node_addrs (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_local_get (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_reload_config (
	void *conn,
	const void *msg);

/*
 * Service Handler Definition
 */
static struct corosync_lib_handler cfg_lib_engine[] =
{
	{ /* 0 */
		.lib_handler_fn		= message_handler_req_lib_cfg_ringstatusget,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 1 */
		.lib_handler_fn		= message_handler_req_lib_cfg_ringreenable,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 2 */
		.lib_handler_fn		= message_handler_req_lib_cfg_killnode,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 3 */
		.lib_handler_fn		= message_handler_req_lib_cfg_tryshutdown,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 4 */
		.lib_handler_fn		= message_handler_req_lib_cfg_replytoshutdown,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 5 */
		.lib_handler_fn		= message_handler_req_lib_cfg_get_node_addrs,
		.flow_control		= CS_LIB_FLOW_CONTROL_NOT_REQUIRED
	},
	{ /* 6 */
		.lib_handler_fn		= message_handler_req_lib_cfg_local_get,
		.flow_control		= CS_LIB_FLOW_CONTROL_NOT_REQUIRED
	},
	{ /* 7 */
		.lib_handler_fn		= message_handler_req_lib_cfg_reload_config,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	}
};

/* Executive handlers, in the order of enum cfg_message_req_types */
static struct corosync_exec_handler cfg_exec_engine[] =
{
	{ /* 0 */
		.exec_handler_fn = message_handler_req_exec_cfg_ringreenable,
	},
	{ /* 1 */
		.exec_handler_fn = message_handler_req_exec_cfg_killnode,
		.exec_endian_convert_fn = exec_cfg_killnode_endian_convert
	},
	{ /* 2 */
		.exec_handler_fn = message_handler_req_exec_cfg_shutdown,
	},
	{ /* 3 */
.exec_handler_fn = message_handler_req_exec_cfg_reload_config, } }; /* * Exports the interface for the service */ struct corosync_service_engine cfg_service_engine = { .name = "corosync configuration service", .id = CFG_SERVICE, .priority = 1, .private_data_size = sizeof(struct cfg_info), .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED, .allow_inquorate = CS_LIB_ALLOW_INQUORATE, .lib_init_fn = cfg_lib_init_fn, .lib_exit_fn = cfg_lib_exit_fn, .lib_engine = cfg_lib_engine, .lib_engine_count = sizeof (cfg_lib_engine) / sizeof (struct corosync_lib_handler), .exec_init_fn = cfg_exec_init_fn, .exec_engine = cfg_exec_engine, .exec_engine_count = sizeof (cfg_exec_engine) / sizeof (struct corosync_exec_handler), .confchg_fn = cfg_confchg_fn }; struct corosync_service_engine *cfg_get_service_engine_ver0 (void) { return (&cfg_service_engine); } struct req_exec_cfg_ringreenable { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_message_source_t source __attribute__((aligned(8))); }; struct req_exec_cfg_reload_config { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_message_source_t source __attribute__((aligned(8))); }; struct req_exec_cfg_killnode { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_uint32_t nodeid __attribute__((aligned(8))); mar_name_t reason __attribute__((aligned(8))); }; struct req_exec_cfg_shutdown { struct qb_ipc_request_header header __attribute__((aligned(8))); }; /* IMPL */ static char *cfg_exec_init_fn ( struct corosync_api_v1 *corosync_api_v1) { api = corosync_api_v1; qb_list_init(&trackers_list); return (NULL); } static void cfg_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { } /* * Tell other nodes we are shutting down */ static int send_shutdown(void) { 
struct req_exec_cfg_shutdown req_exec_cfg_shutdown; struct iovec iovec; ENTER(); req_exec_cfg_shutdown.header.size = sizeof (struct req_exec_cfg_shutdown); req_exec_cfg_shutdown.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_SHUTDOWN); iovec.iov_base = (char *)&req_exec_cfg_shutdown; iovec.iov_len = sizeof (struct req_exec_cfg_shutdown); assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); LEAVE(); return 0; } static void send_test_shutdown(void *only_conn, void *exclude_conn, int status) { struct res_lib_cfg_testshutdown res_lib_cfg_testshutdown; struct qb_list_head *iter; ENTER(); res_lib_cfg_testshutdown.header.size = sizeof(struct res_lib_cfg_testshutdown); res_lib_cfg_testshutdown.header.id = MESSAGE_RES_CFG_TESTSHUTDOWN; res_lib_cfg_testshutdown.header.error = status; res_lib_cfg_testshutdown.flags = shutdown_flags; if (only_conn) { TRACE1("sending testshutdown to only %p", only_conn); api->ipc_dispatch_send(only_conn, &res_lib_cfg_testshutdown, sizeof(res_lib_cfg_testshutdown)); } else { qb_list_for_each(iter, &trackers_list) { struct cfg_info *ci = qb_list_entry(iter, struct cfg_info, list); if (ci->conn != exclude_conn) { TRACE1("sending testshutdown to %p", ci->tracker_conn); api->ipc_dispatch_send(ci->tracker_conn, &res_lib_cfg_testshutdown, sizeof(res_lib_cfg_testshutdown)); } } } LEAVE(); } static void check_shutdown_status(void) { ENTER(); /* * Shutdown client might have gone away */ if (!shutdown_con) { LEAVE(); return; } /* * All replies safely gathered in ? 
*/
	if (shutdown_yes + shutdown_no >= shutdown_expected) {
		struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown;

		/* Every awaited reply is in, so the timeout is no longer needed */
		api->timer_delete(shutdown_timer);

		/* Unanimous "yes", or the requester asked to go down regardless */
		if (shutdown_yes >= shutdown_expected ||
		    shutdown_flags == CFG_SHUTDOWN_FLAG_REGARDLESS) {
			TRACE1("shutdown confirmed");

			res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown);
			res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN;
			res_lib_cfg_tryshutdown.header.error = CS_OK;

			/*
			 * Tell originator that shutdown was confirmed
			 */
			api->ipc_response_send(shutdown_con->conn, &res_lib_cfg_tryshutdown,
						    sizeof(res_lib_cfg_tryshutdown));
			shutdown_con = NULL;

			/*
			 * Tell other nodes we are going down
			 */
			send_shutdown();

		}
		else {

			TRACE1("shutdown cancelled");
			res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown);
			res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN;
			res_lib_cfg_tryshutdown.header.error = CS_ERR_BUSY;

			/*
			 * Tell originator that shutdown was cancelled
			 */
			api->ipc_response_send(shutdown_con->conn, &res_lib_cfg_tryshutdown,
						    sizeof(res_lib_cfg_tryshutdown));
			shutdown_con = NULL;
		}

		log_printf(LOGSYS_LEVEL_DEBUG, "shutdown decision is: (yes count: %d, no count: %d) flags=%x",
		       shutdown_yes, shutdown_no, shutdown_flags);
	}
	LEAVE();
}


/*
 * Not all nodes responded to the shutdown (in time)
 *
 * Timer callback: forces a decision, then dispatches a test-shutdown
 * notification carrying CS_ERR_TIMEOUT to the entries of trackers_list.
 */
static void shutdown_timer_fn(void *arg)
{
	ENTER();

	/*
	 * Mark undecideds as "NO"
	 */
	shutdown_no = shutdown_expected;
	check_shutdown_status();

	send_test_shutdown(NULL, NULL, CS_ERR_TIMEOUT);
	LEAVE();
}

/*
 * Detach a connection's cfg_info from the shutdown bookkeeping; called
 * from cfg_lib_exit_fn().
 */
static void remove_ci_from_shutdown(struct cfg_info *ci)
{
	ENTER();

	/*
	 * If the controlling shutdown process has quit, then cancel the
	 * shutdown session
	 */
	if (ci == shutdown_con) {
		shutdown_con = NULL;
		api->timer_delete(shutdown_timer);
	}

	/* Only entries that are linked into a list take part in the vote */
	if (!qb_list_empty(&ci->list)) {
		qb_list_del(&ci->list);
		qb_list_init(&ci->list);

		/*
		 * Remove our option
		 */
		if (shutdown_con) {
			if (ci->shutdown_reply == SHUTDOWN_REPLY_YES)
				shutdown_yes--;
			if (ci->shutdown_reply == SHUTDOWN_REPLY_NO)
				shutdown_no--;
		}

		/*
		 * If we are leaving, then that's an implicit YES to shutdown
		 */
		ci->shutdown_reply = SHUTDOWN_REPLY_YES;
		shutdown_yes++;

		check_shutdown_status();
	}
	LEAVE();
}


/*
 * Connection exit hook: drops the connection's private cfg_info from the
 * shutdown bookkeeping. Always returns 0.
 */
int cfg_lib_exit_fn (void *conn)
{
	struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn);

	ENTER();
	remove_ci_from_shutdown(ci);
	LEAVE();
	return (0);
}

/*
 * Connection init hook: initialises the list head of the connection's
 * private cfg_info. Always returns 0.
 */
static int cfg_lib_init_fn (void *conn)
{
	struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn);

	ENTER();
	qb_list_init(&ci->list);
	LEAVE();

	return (0);
}

/*
 * Executive message handlers
 */

/*
 * Re-enables the totem ring and, when the request source is local,
 * sends the CS_OK response and releases the connection reference.
 */
static void message_handler_req_exec_cfg_ringreenable (
	const void *message,
	unsigned int nodeid)
{
	const struct req_exec_cfg_ringreenable *req_exec_cfg_ringreenable
		= message;
	struct res_lib_cfg_ringreenable res_lib_cfg_ringreenable;

	ENTER();
	api->totem_ring_reenable ();
	if (api->ipc_source_is_local(&req_exec_cfg_ringreenable->source)) {
		res_lib_cfg_ringreenable.header.id = MESSAGE_RES_CFG_RINGREENABLE;
		res_lib_cfg_ringreenable.header.size = sizeof (struct res_lib_cfg_ringreenable);
		res_lib_cfg_ringreenable.header.error = CS_OK;
		api->ipc_response_send (
			req_exec_cfg_ringreenable->source.conn,
			&res_lib_cfg_ringreenable,
			sizeof (struct res_lib_cfg_ringreenable));

		/* Pairs with the ipc_refcnt_inc() done by the library handler */
		api->ipc_refcnt_dec(req_exec_cfg_ringreenable->source.conn);
	}
	LEAVE();
}

/*
 * Endian conversion hook registered for the killnode message.
 *
 * NOTE(review): only `reason` is byte-swapped here; `nodeid` is not
 * converted in this function - confirm whether that is handled elsewhere.
 */
static void exec_cfg_killnode_endian_convert (void *msg)
{
	struct req_exec_cfg_killnode *req_exec_cfg_killnode =
		(struct req_exec_cfg_killnode *)msg;
	ENTER();

	swab_mar_name_t(&req_exec_cfg_killnode->reason);
	LEAVE();
}


/*
 * Handles a kill request: when the target node id is this node's id,
 * logs the sender and reason and exits through corosync_fatal_error().
 */
static void message_handler_req_exec_cfg_killnode (
	const void *message,
	unsigned int nodeid)
{
	const struct req_exec_cfg_killnode *req_exec_cfg_killnode = message;
	cs_name_t reason;

	ENTER();
	log_printf(LOGSYS_LEVEL_DEBUG, "request to kill node %d(us=%d)",
		req_exec_cfg_killnode->nodeid, api->totem_nodeid_get());
	if (req_exec_cfg_killnode->nodeid == api->totem_nodeid_get()) {
		marshall_from_mar_name_t(&reason, &req_exec_cfg_killnode->reason);
		log_printf(LOGSYS_LEVEL_NOTICE, "Killed by node %d: %s",
			   nodeid, reason.value);
		corosync_fatal_error(COROSYNC_FATAL_ERROR_EXIT);
	}
	LEAVE();
}

/*
 * Self shutdown
 *
 * Logs the shutdown of the sending node; requests the local shutdown only
 * when the sender is this node.
 */
static void message_handler_req_exec_cfg_shutdown (
	const void *message,
	unsigned int nodeid)
{
	ENTER();

	log_printf(LOGSYS_LEVEL_NOTICE, "Node %d was shut down by sysadmin", nodeid);
	if (nodeid == api->totem_nodeid_get()) {
		api->shutdown_request();
	}
	LEAVE();
}

/*
 * strcmp replacement that can handle NULLs
 *
 * A NULL string orders before any non-NULL string; two NULLs compare equal.
 */
static int nullcheck_strcmp(const char* left, const char *right)
{
	if (!left && right)
		return -1;
	if (left && !right)
		return 1;

	if (!left && !right)
		return 0;

	return strcmp(left, right);
}

/*
 * If a key has changed value in the new file, then warn the user and remove it from the temp_map
 *
 * The notice is logged only when the removal from temp_map succeeds.
 */
static void delete_and_notify_if_changed(icmap_map_t temp_map, const char *key_name)
{
	if (!(icmap_key_value_eq(temp_map, key_name, icmap_get_global_map(), key_name))) {
		if (icmap_delete_r(temp_map, key_name) == CS_OK) {
			log_printf(LOGSYS_LEVEL_NOTICE, "Modified entry '%s' in corosync.conf cannot be changed at run-time", key_name);
		}
	}
}
/*
 * Remove any keys from the new config file that are in the new corosync.conf
 * but that cannot be changed at run time. A log message will be issued for each
 * entry that the user wants to change but they cannot.
 *
 * Add more here as needed.
*/
/*
 * Runs delete_and_notify_if_changed() on temp_map for each key in the
 * table below, in table order.
 */
static void remove_ro_entries(icmap_map_t temp_map)
{
	static const char *const ro_keys[] = {
		"totem.secauth",
		"totem.crypto_hash",
		"totem.crypto_cipher",
		"totem.version",
		"totem.threads",
		"totem.ip_version",
		"totem.rrp_mode",
		"totem.netmtu",
		"totem.interface.ringnumber",
		"totem.interface.bindnetaddr",
		"totem.interface.mcastaddr",
		"totem.interface.broadcast",
		"totem.interface.mcastport",
		"totem.interface.ttl",
		"totem.vsftype",
		"totem.transport",
		"totem.cluster_name",
		"quorum.provider",
		"qb.ipc_type",
	};
	unsigned int idx;

	for (idx = 0; idx < sizeof (ro_keys) / sizeof (ro_keys[0]); idx++) {
		delete_and_notify_if_changed(temp_map, ro_keys[idx]);
	}
}

/*
 * Remove entries that exist in the global map, but not in the temp_map, this will
 * cause delete notifications to be sent to any listeners.
 *
 * NOTE: This routine depends entirely on the keys returned by the iterators
 * being in alpha-sorted order.
*/ static void remove_deleted_entries(icmap_map_t temp_map, const char *prefix) { icmap_iter_t old_iter; icmap_iter_t new_iter; const char *old_key, *new_key; int ret; old_iter = icmap_iter_init(prefix); new_iter = icmap_iter_init_r(temp_map, prefix); old_key = icmap_iter_next(old_iter, NULL, NULL); new_key = icmap_iter_next(new_iter, NULL, NULL); while (old_key || new_key) { ret = nullcheck_strcmp(old_key, new_key); if ((ret < 0 && old_key) || !new_key) { /* * new_key is greater, a line (or more) has been deleted * Continue until old is >= new */ do { /* Remove it from icmap & send notifications */ icmap_delete(old_key); old_key = icmap_iter_next(old_iter, NULL, NULL); ret = nullcheck_strcmp(old_key, new_key); } while (ret < 0 && old_key); } else if ((ret > 0 && new_key) || !old_key) { /* * old_key is greater, a line (or more) has been added * Continue until new is >= old * * we don't need to do anything special with this like tell * icmap. That will happen when we copy the values over */ do { new_key = icmap_iter_next(new_iter, NULL, NULL); ret = nullcheck_strcmp(old_key, new_key); } while (ret > 0 && new_key); } if (ret == 0) { new_key = icmap_iter_next(new_iter, NULL, NULL); old_key = icmap_iter_next(old_iter, NULL, NULL); } } icmap_iter_finalize(new_iter); icmap_iter_finalize(old_iter); } /* * Reload configuration file */ static void message_handler_req_exec_cfg_reload_config ( const void *message, unsigned int nodeid) { const struct req_exec_cfg_reload_config *req_exec_cfg_reload_config = message; struct res_lib_cfg_reload_config res_lib_cfg_reload_config; icmap_map_t temp_map; const char *error_string; int res = CS_OK; ENTER(); log_printf(LOGSYS_LEVEL_NOTICE, "Config reload requested by node %d", nodeid); /* * Set up a new hashtable as a staging area. */ if ((res = icmap_init_r(&temp_map)) != CS_OK) { log_printf(LOGSYS_LEVEL_ERROR, "Unable to create temporary icmap. 
config file reload cancelled\n"); goto reload_fini; } /* * Load new config into the temporary map */ res = coroparse_configparse(temp_map, &error_string); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Unable to reload config file: %s", error_string); res = CS_ERR_LIBRARY; goto reload_return; } /* Tell interested listeners that we have started a reload */ icmap_set_uint8("config.reload_in_progress", 1); /* Detect deleted entries and remove them from the main icmap hashtable */ remove_deleted_entries(temp_map, "logging."); remove_deleted_entries(temp_map, "totem."); remove_deleted_entries(temp_map, "nodelist."); remove_deleted_entries(temp_map, "quorum."); remove_deleted_entries(temp_map, "uidgid.config."); /* Remove entries that cannot be changed */ remove_ro_entries(temp_map); /* * Copy new keys into live config. * If this fails we will have a partially loaded config because some keys (above) might * have been reset to defaults - I'm not sure what to do here, we might have to quit. */ if ( (res = icmap_copy_map(icmap_get_global_map(), temp_map)) != CS_OK) { log_printf (LOGSYS_LEVEL_ERROR, "Error making new config live. 
cmap database may be inconsistent\n"); } /* All done - let clients know */ icmap_set_uint8("config.reload_in_progress", 0); reload_fini: /* Finished with the temporary storage */ icmap_fini_r(temp_map); reload_return: /* All done, return result to the caller if it was on this system */ if (nodeid == api->totem_nodeid_get()) { res_lib_cfg_reload_config.header.size = sizeof(res_lib_cfg_reload_config); res_lib_cfg_reload_config.header.id = MESSAGE_RES_CFG_RELOAD_CONFIG; res_lib_cfg_reload_config.header.error = res; api->ipc_response_send(req_exec_cfg_reload_config->source.conn, &res_lib_cfg_reload_config, sizeof(res_lib_cfg_reload_config)); api->ipc_refcnt_dec(req_exec_cfg_reload_config->source.conn);; } LEAVE(); } /* * Library Interface Implementation */ static void message_handler_req_lib_cfg_ringstatusget ( void *conn, const void *msg) { struct res_lib_cfg_ringstatusget res_lib_cfg_ringstatusget; struct totem_ip_address interfaces[INTERFACE_MAX]; unsigned int iface_count; char **status; const char *totem_ip_string; unsigned int i; cs_error_t res = CS_OK; ENTER(); res_lib_cfg_ringstatusget.header.id = MESSAGE_RES_CFG_RINGSTATUSGET; res_lib_cfg_ringstatusget.header.size = sizeof (struct res_lib_cfg_ringstatusget); api->totem_ifaces_get ( api->totem_nodeid_get(), interfaces, INTERFACE_MAX, &status, &iface_count); assert(iface_count <= CFG_MAX_INTERFACES); res_lib_cfg_ringstatusget.interface_count = iface_count; for (i = 0; i < iface_count; i++) { totem_ip_string = (const char *)api->totem_ip_print (&interfaces[i]); if (strlen(totem_ip_string) >= CFG_INTERFACE_NAME_MAX_LEN) { log_printf(LOGSYS_LEVEL_ERROR, "String representation of interface %u is too long", i); res = CS_ERR_NAME_TOO_LONG; goto send_response; } if (strlen(status[i]) >= CFG_INTERFACE_STATUS_MAX_LEN) { log_printf(LOGSYS_LEVEL_ERROR, "Status string for interface %u is too long", i); res = CS_ERR_NAME_TOO_LONG; goto send_response; } strcpy ((char *)&res_lib_cfg_ringstatusget.interface_status[i], 
status[i]); strcpy ((char *)&res_lib_cfg_ringstatusget.interface_name[i], totem_ip_string); } send_response: res_lib_cfg_ringstatusget.header.error = res; api->ipc_response_send ( conn, &res_lib_cfg_ringstatusget, sizeof (struct res_lib_cfg_ringstatusget)); LEAVE(); } static void message_handler_req_lib_cfg_ringreenable ( void *conn, const void *msg) { struct req_exec_cfg_ringreenable req_exec_cfg_ringreenable; struct iovec iovec; ENTER(); req_exec_cfg_ringreenable.header.size = sizeof (struct req_exec_cfg_ringreenable); req_exec_cfg_ringreenable.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_RINGREENABLE); api->ipc_source_set (&req_exec_cfg_ringreenable.source, conn); api->ipc_refcnt_inc(conn); iovec.iov_base = (char *)&req_exec_cfg_ringreenable; iovec.iov_len = sizeof (struct req_exec_cfg_ringreenable); assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); LEAVE(); } static void message_handler_req_lib_cfg_killnode ( void *conn, const void *msg) { const struct req_lib_cfg_killnode *req_lib_cfg_killnode = msg; struct res_lib_cfg_killnode res_lib_cfg_killnode; struct req_exec_cfg_killnode req_exec_cfg_killnode; struct iovec iovec; ENTER(); req_exec_cfg_killnode.header.size = sizeof (struct req_exec_cfg_killnode); req_exec_cfg_killnode.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_KILLNODE); req_exec_cfg_killnode.nodeid = req_lib_cfg_killnode->nodeid; marshall_to_mar_name_t(&req_exec_cfg_killnode.reason, &req_lib_cfg_killnode->reason); iovec.iov_base = (char *)&req_exec_cfg_killnode; iovec.iov_len = sizeof (struct req_exec_cfg_killnode); (void)api->totem_mcast (&iovec, 1, TOTEM_SAFE); res_lib_cfg_killnode.header.size = sizeof(struct res_lib_cfg_killnode); res_lib_cfg_killnode.header.id = MESSAGE_RES_CFG_KILLNODE; res_lib_cfg_killnode.header.error = CS_OK; api->ipc_response_send(conn, &res_lib_cfg_killnode, sizeof(res_lib_cfg_killnode)); LEAVE(); } static void message_handler_req_lib_cfg_tryshutdown ( void *conn, const void *msg) { 
struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn); const struct req_lib_cfg_tryshutdown *req_lib_cfg_tryshutdown = msg; struct qb_list_head *iter; ENTER(); if (req_lib_cfg_tryshutdown->flags == CFG_SHUTDOWN_FLAG_IMMEDIATE) { struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown; /* * Tell other nodes */ send_shutdown(); res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown); res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN; res_lib_cfg_tryshutdown.header.error = CS_OK; api->ipc_response_send(conn, &res_lib_cfg_tryshutdown, sizeof(res_lib_cfg_tryshutdown)); LEAVE(); return; } /* * Shutdown in progress, return an error */ if (shutdown_con) { struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown; res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown); res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN; res_lib_cfg_tryshutdown.header.error = CS_ERR_EXIST; api->ipc_response_send(conn, &res_lib_cfg_tryshutdown, sizeof(res_lib_cfg_tryshutdown)); LEAVE(); return; } ci->conn = conn; shutdown_con = (struct cfg_info *)api->ipc_private_data_get (conn); shutdown_flags = req_lib_cfg_tryshutdown->flags; shutdown_yes = 0; shutdown_no = 0; /* * Count the number of listeners */ shutdown_expected = 0; - qb_list_for_each(iter, &trackers_list) { + qb_list_for_each(iter, &trackers_list) { struct cfg_info *testci = qb_list_entry(iter, struct cfg_info, list); /* * It is assumed that we will allow shutdown */ if (testci != ci) { testci->shutdown_reply = SHUTDOWN_REPLY_UNKNOWN; shutdown_expected++; } } /* * If no-one is listening for events then we can just go down now */ if (shutdown_expected == 0) { struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown; res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown); res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN; res_lib_cfg_tryshutdown.header.error = CS_OK; /* * Tell originator that shutdown was confirmed */ 
api->ipc_response_send(conn, &res_lib_cfg_tryshutdown, sizeof(res_lib_cfg_tryshutdown)); send_shutdown(); LEAVE(); return; } else { unsigned int shutdown_timeout = DEFAULT_SHUTDOWN_TIMEOUT; /* * Look for a shutdown timeout in configuration map */ icmap_get_uint32("cfg.shutdown_timeout", &shutdown_timeout); /* * Start the timer. If we don't get a full set of replies before this goes * off we'll cancel the shutdown */ api->timer_add_duration((unsigned long long)shutdown_timeout*1000000000, NULL, shutdown_timer_fn, &shutdown_timer); /* * Tell the users we would like to shut down */ send_test_shutdown(NULL, conn, CS_OK); } /* * We don't sent a reply to the caller here. * We send it when we know if we can shut down or not */ LEAVE(); } static void message_handler_req_lib_cfg_replytoshutdown ( void *conn, const void *msg) { struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn); const struct req_lib_cfg_replytoshutdown *req_lib_cfg_replytoshutdown = msg; struct res_lib_cfg_replytoshutdown res_lib_cfg_replytoshutdown; int status = CS_OK; ENTER(); if (!shutdown_con) { status = CS_ERR_ACCESS; goto exit_fn; } if (req_lib_cfg_replytoshutdown->response) { shutdown_yes++; ci->shutdown_reply = SHUTDOWN_REPLY_YES; } else { shutdown_no++; ci->shutdown_reply = SHUTDOWN_REPLY_NO; } check_shutdown_status(); exit_fn: res_lib_cfg_replytoshutdown.header.error = status; res_lib_cfg_replytoshutdown.header.id = MESSAGE_RES_CFG_REPLYTOSHUTDOWN; res_lib_cfg_replytoshutdown.header.size = sizeof(res_lib_cfg_replytoshutdown); api->ipc_response_send(conn, &res_lib_cfg_replytoshutdown, sizeof(res_lib_cfg_replytoshutdown)); LEAVE(); } static void message_handler_req_lib_cfg_get_node_addrs (void *conn, const void *msg) { struct totem_ip_address node_ifs[INTERFACE_MAX]; char buf[PIPE_BUF]; char **status; unsigned int num_interfaces = 0; int ret = CS_OK; int i; const struct req_lib_cfg_get_node_addrs *req_lib_cfg_get_node_addrs = msg; struct res_lib_cfg_get_node_addrs 
*res_lib_cfg_get_node_addrs = (struct res_lib_cfg_get_node_addrs *)buf; unsigned int nodeid = req_lib_cfg_get_node_addrs->nodeid; char *addr_buf; if (nodeid == 0) nodeid = api->totem_nodeid_get(); api->totem_ifaces_get(nodeid, node_ifs, INTERFACE_MAX, &status, &num_interfaces); res_lib_cfg_get_node_addrs->header.size = sizeof(struct res_lib_cfg_get_node_addrs) + (num_interfaces * TOTEMIP_ADDRLEN); res_lib_cfg_get_node_addrs->header.id = MESSAGE_RES_CFG_GET_NODE_ADDRS; res_lib_cfg_get_node_addrs->header.error = ret; res_lib_cfg_get_node_addrs->num_addrs = num_interfaces; if (num_interfaces) { res_lib_cfg_get_node_addrs->family = node_ifs[0].family; for (i = 0, addr_buf = (char *)res_lib_cfg_get_node_addrs->addrs; i < num_interfaces; i++, addr_buf += TOTEMIP_ADDRLEN) { memcpy(addr_buf, node_ifs[i].addr, TOTEMIP_ADDRLEN); } } else { res_lib_cfg_get_node_addrs->header.error = CS_ERR_NOT_EXIST; } api->ipc_response_send(conn, res_lib_cfg_get_node_addrs, res_lib_cfg_get_node_addrs->header.size); } static void message_handler_req_lib_cfg_local_get (void *conn, const void *msg) { struct res_lib_cfg_local_get res_lib_cfg_local_get; res_lib_cfg_local_get.header.size = sizeof(res_lib_cfg_local_get); res_lib_cfg_local_get.header.id = MESSAGE_RES_CFG_LOCAL_GET; res_lib_cfg_local_get.header.error = CS_OK; res_lib_cfg_local_get.local_nodeid = api->totem_nodeid_get (); api->ipc_response_send(conn, &res_lib_cfg_local_get, sizeof(res_lib_cfg_local_get)); } static void message_handler_req_lib_cfg_reload_config (void *conn, const void *msg) { struct req_exec_cfg_reload_config req_exec_cfg_reload_config; struct iovec iovec; ENTER(); req_exec_cfg_reload_config.header.size = sizeof (struct req_exec_cfg_reload_config); req_exec_cfg_reload_config.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_RELOAD_CONFIG); api->ipc_source_set (&req_exec_cfg_reload_config.source, conn); api->ipc_refcnt_inc(conn); iovec.iov_base = (char *)&req_exec_cfg_reload_config; iovec.iov_len = sizeof 
(struct req_exec_cfg_reload_config); assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); LEAVE(); } diff --git a/exec/coroparse.c b/exec/coroparse.c index 3318a399..f4dea8a6 100644 --- a/exec/coroparse.c +++ b/exec/coroparse.c @@ -1,1392 +1,1392 @@ /* * Copyright (c) 2006-2013 Red Hat, Inc. * * All rights reserved. * * Author: Patrick Caulfield (pcaulfie@redhat.com) * Jan Friesse (jfriesse@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include #include "main.h" #include "util.h" enum parser_cb_type { PARSER_CB_START, PARSER_CB_END, PARSER_CB_SECTION_START, PARSER_CB_SECTION_END, PARSER_CB_ITEM, }; enum main_cp_cb_data_state { MAIN_CP_CB_DATA_STATE_NORMAL, MAIN_CP_CB_DATA_STATE_TOTEM, MAIN_CP_CB_DATA_STATE_INTERFACE, MAIN_CP_CB_DATA_STATE_LOGGER_SUBSYS, MAIN_CP_CB_DATA_STATE_UIDGID, MAIN_CP_CB_DATA_STATE_LOGGING_DAEMON, MAIN_CP_CB_DATA_STATE_MEMBER, MAIN_CP_CB_DATA_STATE_QUORUM, MAIN_CP_CB_DATA_STATE_QDEVICE, MAIN_CP_CB_DATA_STATE_NODELIST, MAIN_CP_CB_DATA_STATE_NODELIST_NODE, MAIN_CP_CB_DATA_STATE_PLOAD, MAIN_CP_CB_DATA_STATE_QB, MAIN_CP_CB_DATA_STATE_RESOURCES, MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM, MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS, MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM_MEMUSED, MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS_MEMUSED }; typedef int (*parser_cb_f)(const char *path, char *key, char *value, enum main_cp_cb_data_state *state, enum parser_cb_type type, const char **error_string, icmap_map_t config_map, void *user_data); struct key_value_list_item { char *key; char *value; struct qb_list_head list; }; struct main_cp_cb_data { int linknumber; char *bindnetaddr; char *mcastaddr; char *broadcast; int mcastport; int ttl; int knet_link_priority; int knet_ping_interval; int knet_ping_timeout; int knet_ping_precision; struct qb_list_head logger_subsys_items_head; char *subsys; char *logging_daemon_name; struct qb_list_head member_items_head; int node_number; int ring0_addr_added; }; static int read_config_file_into_icmap( const char **error_string, icmap_map_t config_map); static char error_string_response[512]; static int uid_determine (const char *req_user) { int pw_uid = 0; struct passwd passwd; struct passwd* pwdptr = &passwd; struct passwd* temp_pwd_pt; char 
*pwdbuffer; int pwdlinelen, rc; long int id; char *ep; id = strtol(req_user, &ep, 10); if (*ep == '\0' && id >= 0 && id <= UINT_MAX) { return (id); } pwdlinelen = sysconf (_SC_GETPW_R_SIZE_MAX); if (pwdlinelen == -1) { pwdlinelen = 256; } pwdbuffer = malloc (pwdlinelen); while ((rc = getpwnam_r (req_user, pwdptr, pwdbuffer, pwdlinelen, &temp_pwd_pt)) == ERANGE) { char *n; pwdlinelen *= 2; if (pwdlinelen <= 32678) { n = realloc (pwdbuffer, pwdlinelen); if (n != NULL) { pwdbuffer = n; continue; } } } if (rc != 0) { free (pwdbuffer); sprintf (error_string_response, "getpwnam_r(): %s", strerror(rc)); return (-1); } if (temp_pwd_pt == NULL) { free (pwdbuffer); sprintf (error_string_response, "The '%s' user is not found in /etc/passwd, please read the documentation.", req_user); return (-1); } pw_uid = passwd.pw_uid; free (pwdbuffer); return pw_uid; } static int gid_determine (const char *req_group) { int corosync_gid = 0; struct group group; struct group * grpptr = &group; struct group * temp_grp_pt; char *grpbuffer; int grplinelen, rc; long int id; char *ep; id = strtol(req_group, &ep, 10); if (*ep == '\0' && id >= 0 && id <= UINT_MAX) { return (id); } grplinelen = sysconf (_SC_GETGR_R_SIZE_MAX); if (grplinelen == -1) { grplinelen = 256; } grpbuffer = malloc (grplinelen); while ((rc = getgrnam_r (req_group, grpptr, grpbuffer, grplinelen, &temp_grp_pt)) == ERANGE) { char *n; grplinelen *= 2; if (grplinelen <= 32678) { n = realloc (grpbuffer, grplinelen); if (n != NULL) { grpbuffer = n; continue; } } } if (rc != 0) { free (grpbuffer); sprintf (error_string_response, "getgrnam_r(): %s", strerror(rc)); return (-1); } if (temp_grp_pt == NULL) { free (grpbuffer); sprintf (error_string_response, "The '%s' group is not found in /etc/group, please read the documentation.", req_group); return (-1); } corosync_gid = group.gr_gid; free (grpbuffer); return corosync_gid; } static char *strchr_rs (const char *haystack, int byte) { const char *end_address = strchr (haystack, byte); if 
(end_address) { end_address += 1; /* skip past { or = */ while (*end_address == ' ' || *end_address == '\t') end_address++; } return ((char *) end_address); } int coroparse_configparse (icmap_map_t config_map, const char **error_string) { if (read_config_file_into_icmap(error_string, config_map)) { return -1; } return 0; } static char *remove_whitespace(char *string, int remove_colon_and_brace) { char *start; char *end; start = string; while (*start == ' ' || *start == '\t') start++; end = start+(strlen(start))-1; while ((*end == ' ' || *end == '\t' || (remove_colon_and_brace && (*end == ':' || *end == '{'))) && end > start) end--; if (end != start) *(end+1) = '\0'; return start; } static int parse_section(FILE *fp, char *path, const char **error_string, int depth, enum main_cp_cb_data_state state, parser_cb_f parser_cb, icmap_map_t config_map, void *user_data) { char line[512]; int i; char *loc; int ignore_line; char new_keyname[ICMAP_KEYNAME_MAXLEN]; if (strcmp(path, "") == 0) { parser_cb("", NULL, NULL, &state, PARSER_CB_START, error_string, config_map, user_data); } while (fgets (line, sizeof (line), fp)) { if (strlen(line) > 0) { if (line[strlen(line) - 1] == '\n') line[strlen(line) - 1] = '\0'; if (strlen (line) > 0 && line[strlen(line) - 1] == '\r') line[strlen(line) - 1] = '\0'; } /* * Clear out white space and tabs */ for (i = strlen (line) - 1; i > -1; i--) { if (line[i] == '\t' || line[i] == ' ') { line[i] = '\0'; } else { break; } } ignore_line = 1; for (i = 0; i < strlen (line); i++) { if (line[i] != '\t' && line[i] != ' ') { if (line[i] != '#') ignore_line = 0; break; } } /* * Clear out comments and empty lines */ if (ignore_line) { continue; } /* New section ? 
*/ if ((loc = strchr_rs (line, '{'))) { char *section = remove_whitespace(line, 1); enum main_cp_cb_data_state newstate; loc--; *loc = '\0'; if (strlen(path) + strlen(section) + 1 >= ICMAP_KEYNAME_MAXLEN) { *error_string = "parser error: Start of section makes total cmap path too long"; return -1; } strcpy(new_keyname, path); if (strcmp(path, "") != 0) { strcat(new_keyname, "."); } strcat(new_keyname, section); /* Only use the new state for items further down the stack */ newstate = state; if (!parser_cb(new_keyname, NULL, NULL, &newstate, PARSER_CB_SECTION_START, error_string, config_map, user_data)) { return -1; } if (parse_section(fp, new_keyname, error_string, depth + 1, newstate, parser_cb, config_map, user_data)) return -1; continue ; } /* New key/value */ if ((loc = strchr_rs (line, ':'))) { char *key; char *value; *(loc-1) = '\0'; key = remove_whitespace(line, 1); value = remove_whitespace(loc, 0); if (strlen(path) + strlen(key) + 1 >= ICMAP_KEYNAME_MAXLEN) { *error_string = "parser error: New key makes total cmap path too long"; return -1; } strcpy(new_keyname, path); if (strcmp(path, "") != 0) { strcat(new_keyname, "."); } strcat(new_keyname, key); if (!parser_cb(new_keyname, key, value, &state, PARSER_CB_ITEM, error_string, config_map, user_data)) { return -1; } continue ; } if (strchr_rs (line, '}')) { if (depth == 0) { *error_string = "parser error: Unexpected closing brace"; return -1; } if (!parser_cb(path, NULL, NULL, &state, PARSER_CB_SECTION_END, error_string, config_map, user_data)) { return -1; } return 0; } } if (strcmp(path, "") != 0) { *error_string = "parser error: Missing closing brace"; return -1; } if (strcmp(path, "") == 0) { parser_cb("", NULL, NULL, &state, PARSER_CB_END, error_string, config_map, user_data); } return 0; } static int safe_atoq_range(icmap_value_types_t value_type, long long int *min_val, long long int *max_val) { switch (value_type) { case ICMAP_VALUETYPE_INT8: *min_val = INT8_MIN; *max_val = INT8_MAX; break; case 
ICMAP_VALUETYPE_UINT8: *min_val = 0; *max_val = UINT8_MAX; break; case ICMAP_VALUETYPE_INT16: *min_val = INT16_MIN; *max_val = INT16_MAX; break; case ICMAP_VALUETYPE_UINT16: *min_val = 0; *max_val = UINT16_MAX; break; case ICMAP_VALUETYPE_INT32: *min_val = INT32_MIN; *max_val = INT32_MAX; break; case ICMAP_VALUETYPE_UINT32: *min_val = 0; *max_val = UINT32_MAX; break; default: return (-1); } return (0); } /* * Convert string str to long long int res. Type of result is target_type and currently only * ICMAP_VALUETYPE_[U]INT[8|16|32] is supported. * Return 0 on success, -1 on failure. */ static int safe_atoq(const char *str, long long int *res, icmap_value_types_t target_type) { long long int val; long long int min_val, max_val; char *endptr; errno = 0; val = strtoll(str, &endptr, 10); if (errno == ERANGE) { return (-1); } if (endptr == str) { return (-1); } if (*endptr != '\0') { return (-1); } if (safe_atoq_range(target_type, &min_val, &max_val) != 0) { return (-1); } if (val < min_val || val > max_val) { return (-1); } *res = val; return (0); } static int str_to_ull(const char *str, unsigned long long int *res) { unsigned long long int val; char *endptr; errno = 0; val = strtoull(str, &endptr, 10); if (errno == ERANGE) { return (-1); } if (endptr == str) { return (-1); } if (*endptr != '\0') { return (-1); } *res = val; return (0); } static int main_config_parser_cb(const char *path, char *key, char *value, enum main_cp_cb_data_state *state, enum parser_cb_type type, const char **error_string, icmap_map_t config_map, void *user_data) { int ii; long long int val; long long int min_val, max_val; icmap_value_types_t val_type = ICMAP_VALUETYPE_BINARY; unsigned long long int ull; int add_as_string; char key_name[ICMAP_KEYNAME_MAXLEN]; static char formated_err[256]; struct main_cp_cb_data *data = (struct main_cp_cb_data *)user_data; struct key_value_list_item *kv_item; - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; int uid, gid; switch (type) { case 
PARSER_CB_START: memset(data, 0, sizeof(struct main_cp_cb_data)); *state = MAIN_CP_CB_DATA_STATE_NORMAL; break; case PARSER_CB_END: break; case PARSER_CB_ITEM: add_as_string = 1; switch (*state) { case MAIN_CP_CB_DATA_STATE_NORMAL: break; case MAIN_CP_CB_DATA_STATE_PLOAD: if ((strcmp(path, "pload.count") == 0) || (strcmp(path, "pload.size") == 0)) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } icmap_set_uint32_r(config_map, path, val); add_as_string = 0; } break; case MAIN_CP_CB_DATA_STATE_QUORUM: if ((strcmp(path, "quorum.expected_votes") == 0) || (strcmp(path, "quorum.votes") == 0) || (strcmp(path, "quorum.last_man_standing_window") == 0) || (strcmp(path, "quorum.leaving_timeout") == 0)) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } icmap_set_uint32_r(config_map, path, val); add_as_string = 0; } if ((strcmp(path, "quorum.two_node") == 0) || (strcmp(path, "quorum.expected_votes_tracking") == 0) || (strcmp(path, "quorum.allow_downscale") == 0) || (strcmp(path, "quorum.wait_for_all") == 0) || (strcmp(path, "quorum.auto_tie_breaker") == 0) || (strcmp(path, "quorum.last_man_standing") == 0)) { val_type = ICMAP_VALUETYPE_UINT8; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } icmap_set_uint8_r(config_map, path, val); add_as_string = 0; } break; case MAIN_CP_CB_DATA_STATE_QDEVICE: if ((strcmp(path, "quorum.device.timeout") == 0) || (strcmp(path, "quorum.device.sync_timeout") == 0) || (strcmp(path, "quorum.device.votes") == 0)) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } icmap_set_uint32_r(config_map, path, val); add_as_string = 0; } if ((strcmp(path, "quorum.device.master_wins") == 0)) { val_type = ICMAP_VALUETYPE_UINT8; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } icmap_set_uint8_r(config_map, path, val); add_as_string = 0; } break; case MAIN_CP_CB_DATA_STATE_TOTEM: if 
((strcmp(path, "totem.version") == 0) || (strcmp(path, "totem.nodeid") == 0) || (strcmp(path, "totem.threads") == 0) || (strcmp(path, "totem.token") == 0) || (strcmp(path, "totem.token_coefficient") == 0) || (strcmp(path, "totem.token_retransmit") == 0) || (strcmp(path, "totem.hold") == 0) || (strcmp(path, "totem.token_retransmits_before_loss_const") == 0) || (strcmp(path, "totem.join") == 0) || (strcmp(path, "totem.send_join") == 0) || (strcmp(path, "totem.consensus") == 0) || (strcmp(path, "totem.merge") == 0) || (strcmp(path, "totem.downcheck") == 0) || (strcmp(path, "totem.fail_recv_const") == 0) || (strcmp(path, "totem.seqno_unchanged_const") == 0) || (strcmp(path, "totem.rrp_token_expired_timeout") == 0) || (strcmp(path, "totem.rrp_problem_count_timeout") == 0) || (strcmp(path, "totem.rrp_problem_count_threshold") == 0) || (strcmp(path, "totem.rrp_problem_count_mcast_threshold") == 0) || (strcmp(path, "totem.rrp_autorecovery_check_timeout") == 0) || (strcmp(path, "totem.heartbeat_failures_allowed") == 0) || (strcmp(path, "totem.max_network_delay") == 0) || (strcmp(path, "totem.window_size") == 0) || (strcmp(path, "totem.max_messages") == 0) || (strcmp(path, "totem.miss_count_const") == 0) || (strcmp(path, "totem.netmtu") == 0)) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } icmap_set_uint32_r(config_map,path, val); add_as_string = 0; } if (strcmp(path, "totem.config_version") == 0) { if (str_to_ull(value, &ull) != 0) { goto atoi_error; } icmap_set_uint64_r(config_map, path, ull); add_as_string = 0; } if (strcmp(path, "totem.ip_version") == 0) { if ((strcmp(value, "ipv4") != 0) && (strcmp(value, "ipv6") != 0)) { *error_string = "Invalid ip_version type"; return (0); } } if (strcmp(path, "totem.crypto_type") == 0) { if ((strcmp(value, "nss") != 0) && (strcmp(value, "aes256") != 0) && (strcmp(value, "aes192") != 0) && (strcmp(value, "aes128") != 0) && (strcmp(value, "3des") != 0)) { *error_string = "Invalid 
crypto type"; return (0); } } if (strcmp(path, "totem.crypto_cipher") == 0) { if ((strcmp(value, "none") != 0) && (strcmp(value, "aes256") != 0) && (strcmp(value, "aes192") != 0) && (strcmp(value, "aes128") != 0) && (strcmp(value, "3des") != 0)) { *error_string = "Invalid cipher type"; return (0); } } if (strcmp(path, "totem.crypto_hash") == 0) { if ((strcmp(value, "none") != 0) && (strcmp(value, "md5") != 0) && (strcmp(value, "sha1") != 0) && (strcmp(value, "sha256") != 0) && (strcmp(value, "sha384") != 0) && (strcmp(value, "sha512") != 0)) { *error_string = "Invalid hash type"; return (0); } } break; case MAIN_CP_CB_DATA_STATE_QB: if (strcmp(path, "qb.ipc_type") == 0) { if ((strcmp(value, "native") != 0) && (strcmp(value, "shm") != 0) && (strcmp(value, "socket") != 0)) { *error_string = "Invalid qb ipc_type"; return (0); } } break; case MAIN_CP_CB_DATA_STATE_INTERFACE: if (strcmp(path, "totem.interface.linknumber") == 0) { val_type = ICMAP_VALUETYPE_UINT8; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } data->linknumber = val; add_as_string = 0; } if (strcmp(path, "totem.interface.bindnetaddr") == 0) { data->bindnetaddr = strdup(value); add_as_string = 0; } if (strcmp(path, "totem.interface.mcastaddr") == 0) { data->mcastaddr = strdup(value); add_as_string = 0; } if (strcmp(path, "totem.interface.broadcast") == 0) { data->broadcast = strdup(value); add_as_string = 0; } if (strcmp(path, "totem.interface.mcastport") == 0) { val_type = ICMAP_VALUETYPE_UINT16; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } data->mcastport = val; add_as_string = 0; } if (strcmp(path, "totem.interface.ttl") == 0) { val_type = ICMAP_VALUETYPE_UINT8; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } data->ttl = val; add_as_string = 0; } if (strcmp(path, "totem.interface.knet_link_priority") == 0) { val_type = ICMAP_VALUETYPE_UINT8; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } data->knet_link_priority = val; add_as_string = 
0; } if (strcmp(path, "totem.interface.knet_ping_interval") == 0) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } data->knet_ping_interval = val; add_as_string = 0; } if (strcmp(path, "totem.interface.knet_ping_timeout") == 0) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } data->knet_ping_timeout = val; add_as_string = 0; } if (strcmp(path, "totem.interface.knet_ping_precision") == 0) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } data->knet_ping_precision = val; add_as_string = 0; } break; case MAIN_CP_CB_DATA_STATE_LOGGER_SUBSYS: if (strcmp(key, "subsys") == 0) { data->subsys = strdup(value); if (data->subsys == NULL) { *error_string = "Can't alloc memory"; return (0); } } else { kv_item = malloc(sizeof(*kv_item)); if (kv_item == NULL) { *error_string = "Can't alloc memory"; return (0); } memset(kv_item, 0, sizeof(*kv_item)); kv_item->key = strdup(key); kv_item->value = strdup(value); if (kv_item->key == NULL || kv_item->value == NULL) { free(kv_item); *error_string = "Can't alloc memory"; return (0); } qb_list_init(&kv_item->list); qb_list_add(&kv_item->list, &data->logger_subsys_items_head); } add_as_string = 0; break; case MAIN_CP_CB_DATA_STATE_LOGGING_DAEMON: if (strcmp(key, "subsys") == 0) { data->subsys = strdup(value); if (data->subsys == NULL) { *error_string = "Can't alloc memory"; return (0); } } else if (strcmp(key, "name") == 0) { data->logging_daemon_name = strdup(value); if (data->logging_daemon_name == NULL) { *error_string = "Can't alloc memory"; return (0); } } else { kv_item = malloc(sizeof(*kv_item)); if (kv_item == NULL) { *error_string = "Can't alloc memory"; return (0); } memset(kv_item, 0, sizeof(*kv_item)); kv_item->key = strdup(key); kv_item->value = strdup(value); if (kv_item->key == NULL || kv_item->value == NULL) { free(kv_item); *error_string = "Can't alloc memory"; return 
(0); } qb_list_init(&kv_item->list); qb_list_add(&kv_item->list, &data->logger_subsys_items_head); } add_as_string = 0; break; case MAIN_CP_CB_DATA_STATE_UIDGID: if (strcmp(key, "uid") == 0) { uid = uid_determine(value); if (uid == -1) { *error_string = error_string_response; return (0); } snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "uidgid.config.uid.%u", uid); icmap_set_uint8_r(config_map, key_name, 1); add_as_string = 0; } else if (strcmp(key, "gid") == 0) { gid = gid_determine(value); if (gid == -1) { *error_string = error_string_response; return (0); } snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "uidgid.config.gid.%u", gid); icmap_set_uint8_r(config_map, key_name, 1); add_as_string = 0; } else { *error_string = "uidgid: Only uid and gid are allowed items"; return (0); } break; case MAIN_CP_CB_DATA_STATE_MEMBER: if (strcmp(key, "memberaddr") != 0) { *error_string = "Only memberaddr is allowed in member section"; return (0); } kv_item = malloc(sizeof(*kv_item)); if (kv_item == NULL) { *error_string = "Can't alloc memory"; return (0); } memset(kv_item, 0, sizeof(*kv_item)); kv_item->key = strdup(key); kv_item->value = strdup(value); if (kv_item->key == NULL || kv_item->value == NULL) { free(kv_item); *error_string = "Can't alloc memory"; return (0); } qb_list_init(&kv_item->list); qb_list_add(&kv_item->list, &data->member_items_head); add_as_string = 0; break; case MAIN_CP_CB_DATA_STATE_NODELIST: break; case MAIN_CP_CB_DATA_STATE_NODELIST_NODE: snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.%s", data->node_number, key); if ((strcmp(key, "nodeid") == 0) || (strcmp(key, "quorum_votes") == 0)) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } icmap_set_uint32_r(config_map, key_name, val); add_as_string = 0; } if (strcmp(key, "ring0_addr") == 0) { data->ring0_addr_added = 1; } if (add_as_string) { icmap_set_string_r(config_map, key_name, value); add_as_string = 0; } break; case MAIN_CP_CB_DATA_STATE_RESOURCES: 
if (strcmp(key, "watchdog_timeout") == 0) { val_type = ICMAP_VALUETYPE_UINT32; if (safe_atoq(value, &val, val_type) != 0) { goto atoi_error; } icmap_set_uint32_r(config_map,path, val); add_as_string = 0; } break; case MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM: case MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM_MEMUSED: if (strcmp(key, "poll_period") == 0) { if (str_to_ull(value, &ull) != 0) { goto atoi_error; } icmap_set_uint64_r(config_map,path, ull); add_as_string = 0; } break; case MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS: case MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS_MEMUSED: if (strcmp(key, "poll_period") == 0) { if (str_to_ull(value, &ull) != 0) { goto atoi_error; } icmap_set_uint64_r(config_map,path, ull); add_as_string = 0; } break; } if (add_as_string) { icmap_set_string_r(config_map, path, value); } break; case PARSER_CB_SECTION_START: if (strcmp(path, "totem.interface") == 0) { *state = MAIN_CP_CB_DATA_STATE_INTERFACE; data->linknumber = 0; data->mcastport = -1; data->ttl = -1; data->knet_link_priority = -1; data->knet_ping_interval = -1; data->knet_ping_timeout = -1; data->knet_ping_precision = -1; qb_list_init(&data->member_items_head); }; if (strcmp(path, "totem") == 0) { *state = MAIN_CP_CB_DATA_STATE_TOTEM; }; if (strcmp(path, "qb") == 0) { *state = MAIN_CP_CB_DATA_STATE_QB; } if (strcmp(path, "logging.logger_subsys") == 0) { *state = MAIN_CP_CB_DATA_STATE_LOGGER_SUBSYS; qb_list_init(&data->logger_subsys_items_head); data->subsys = NULL; } if (strcmp(path, "logging.logging_daemon") == 0) { *state = MAIN_CP_CB_DATA_STATE_LOGGING_DAEMON; qb_list_init(&data->logger_subsys_items_head); data->subsys = NULL; data->logging_daemon_name = NULL; } if (strcmp(path, "uidgid") == 0) { *state = MAIN_CP_CB_DATA_STATE_UIDGID; } if (strcmp(path, "totem.interface.member") == 0) { *state = MAIN_CP_CB_DATA_STATE_MEMBER; } if (strcmp(path, "quorum") == 0) { *state = MAIN_CP_CB_DATA_STATE_QUORUM; } if (strcmp(path, "quorum.device") == 0) { *state = MAIN_CP_CB_DATA_STATE_QDEVICE; } 
if (strcmp(path, "nodelist") == 0) { *state = MAIN_CP_CB_DATA_STATE_NODELIST; data->node_number = 0; } if (strcmp(path, "nodelist.node") == 0) { *state = MAIN_CP_CB_DATA_STATE_NODELIST_NODE; data->ring0_addr_added = 0; } if (strcmp(path, "resources") == 0) { *state = MAIN_CP_CB_DATA_STATE_RESOURCES; } if (strcmp(path, "resources.system") == 0) { *state = MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM; } if (strcmp(path, "resources.system.memory_used") == 0) { *state = MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM_MEMUSED; } if (strcmp(path, "resources.process") == 0) { *state = MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS; } if (strcmp(path, "resources.process.memory_used") == 0) { *state = MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS_MEMUSED; } break; case PARSER_CB_SECTION_END: switch (*state) { case MAIN_CP_CB_DATA_STATE_INTERFACE: /* * Create new interface section */ if (data->bindnetaddr != NULL) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.bindnetaddr", data->linknumber); icmap_set_string_r(config_map, key_name, data->bindnetaddr); free(data->bindnetaddr); data->bindnetaddr = NULL; } if (data->mcastaddr != NULL) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", data->linknumber); icmap_set_string_r(config_map, key_name, data->mcastaddr); free(data->mcastaddr); data->mcastaddr = NULL; } if (data->broadcast != NULL) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.broadcast", data->linknumber); icmap_set_string_r(config_map, key_name, data->broadcast); free(data->broadcast); data->broadcast = NULL; } if (data->mcastport > -1) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", data->linknumber); icmap_set_uint16_r(config_map, key_name, data->mcastport); } if (data->ttl > -1) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.ttl", data->linknumber); icmap_set_uint8_r(config_map, key_name, data->ttl); } if (data->knet_link_priority > -1) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, 
"totem.interface.%u.knet_link_priority", data->linknumber); icmap_set_uint8_r(config_map, key_name, data->knet_link_priority); } if (data->knet_ping_interval > -1) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_interval", data->linknumber); icmap_set_uint32_r(config_map, key_name, data->knet_ping_interval); } if (data->knet_ping_timeout > -1) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_timeout", data->linknumber); icmap_set_uint32_r(config_map, key_name, data->knet_ping_timeout); } if (data->knet_ping_precision > -1) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_precision", data->linknumber); icmap_set_uint32_r(config_map, key_name, data->knet_ping_precision); } ii = 0; - qb_list_for_each(iter, &(data->member_items_head)) { + qb_list_for_each_safe(iter, tmp_iter, &(data->member_items_head)) { kv_item = qb_list_entry(iter, struct key_value_list_item, list); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.member.%u", data->linknumber, ii); icmap_set_string_r(config_map, key_name, kv_item->value); free(kv_item->value); free(kv_item->key); free(kv_item); ii++; } break; case MAIN_CP_CB_DATA_STATE_LOGGER_SUBSYS: if (data->subsys == NULL) { *error_string = "No subsys key in logger_subsys directive"; return (0); } - qb_list_for_each(iter, &(data->logger_subsys_items_head)) { + qb_list_for_each_safe(iter, tmp_iter, &(data->logger_subsys_items_head)) { kv_item = qb_list_entry(iter, struct key_value_list_item, list); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logger_subsys.%s.%s", data->subsys, kv_item->key); icmap_set_string_r(config_map, key_name, kv_item->value); free(kv_item->value); free(kv_item->key); free(kv_item); } snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logger_subsys.%s.subsys", data->subsys); icmap_set_string_r(config_map, key_name, data->subsys); free(data->subsys); break; case MAIN_CP_CB_DATA_STATE_LOGGING_DAEMON: if (data->logging_daemon_name == NULL) 
{ *error_string = "No name key in logging_daemon directive"; return (0); } - qb_list_for_each(iter, &(data->logger_subsys_items_head)) { + qb_list_for_each_safe(iter, tmp_iter, &(data->logger_subsys_items_head)) { kv_item = qb_list_entry(iter, struct key_value_list_item, list); if (data->subsys == NULL) { if (strcmp(data->logging_daemon_name, "corosync") == 0) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.%s", kv_item->key); } else { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logging_daemon.%s.%s", data->logging_daemon_name, kv_item->key); } } else { if (strcmp(data->logging_daemon_name, "corosync") == 0) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logger_subsys.%s.%s", data->subsys, kv_item->key); } else { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logging_daemon.%s.%s.%s", data->logging_daemon_name, data->subsys, kv_item->key); } } icmap_set_string_r(config_map, key_name, kv_item->value); free(kv_item->value); free(kv_item->key); free(kv_item); } if (data->subsys == NULL) { if (strcmp(data->logging_daemon_name, "corosync") != 0) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logging_daemon.%s.name", data->logging_daemon_name); icmap_set_string_r(config_map, key_name, data->logging_daemon_name); } } else { if (strcmp(data->logging_daemon_name, "corosync") == 0) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logger_subsys.%s.subsys", data->subsys); icmap_set_string_r(config_map, key_name, data->subsys); } else { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logging_daemon.%s.%s.subsys", data->logging_daemon_name, data->subsys); icmap_set_string_r(config_map, key_name, data->subsys); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "logging.logging_daemon.%s.%s.name", data->logging_daemon_name, data->subsys); icmap_set_string_r(config_map, key_name, data->logging_daemon_name); } } free(data->subsys); free(data->logging_daemon_name); break; case MAIN_CP_CB_DATA_STATE_NODELIST_NODE: if (!data->ring0_addr_added) { *error_string = "No 
ring0_addr specified for node"; return (0); } data->node_number++; break; case MAIN_CP_CB_DATA_STATE_NORMAL: case MAIN_CP_CB_DATA_STATE_PLOAD: case MAIN_CP_CB_DATA_STATE_UIDGID: case MAIN_CP_CB_DATA_STATE_MEMBER: case MAIN_CP_CB_DATA_STATE_QUORUM: case MAIN_CP_CB_DATA_STATE_QDEVICE: case MAIN_CP_CB_DATA_STATE_NODELIST: case MAIN_CP_CB_DATA_STATE_TOTEM: case MAIN_CP_CB_DATA_STATE_QB: break; case MAIN_CP_CB_DATA_STATE_RESOURCES: *state = MAIN_CP_CB_DATA_STATE_NORMAL; break; case MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM: *state = MAIN_CP_CB_DATA_STATE_RESOURCES; break; case MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM_MEMUSED: *state = MAIN_CP_CB_DATA_STATE_RESOURCES_SYSTEM; break; case MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS: *state = MAIN_CP_CB_DATA_STATE_RESOURCES; break; case MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS_MEMUSED: *state = MAIN_CP_CB_DATA_STATE_RESOURCES_PROCESS; break; } break; } return (1); atoi_error: min_val = max_val = 0; /* * This is really assert, because developer ether doesn't set val_type correctly or * we've got here after some nasty memory overwrite */ assert(safe_atoq_range(val_type, &min_val, &max_val) == 0); snprintf(formated_err, sizeof(formated_err), "Value of key \"%s\" is expected to be integer in range (%lld..%lld), but \"%s\" was given", key, min_val, max_val, value); *error_string = formated_err; return (0); } static int uidgid_config_parser_cb(const char *path, char *key, char *value, enum main_cp_cb_data_state *state, enum parser_cb_type type, const char **error_string, icmap_map_t config_map, void *user_data) { char key_name[ICMAP_KEYNAME_MAXLEN]; int uid, gid; switch (type) { case PARSER_CB_START: break; case PARSER_CB_END: break; case PARSER_CB_ITEM: if (strcmp(path, "uidgid.uid") == 0) { uid = uid_determine(value); if (uid == -1) { *error_string = error_string_response; return (0); } snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "uidgid.config.uid.%u", uid); icmap_set_uint8_r(config_map, key_name, 1); } else if (strcmp(path, "uidgid.gid") 
== 0) { gid = gid_determine(value); if (gid == -1) { *error_string = error_string_response; return (0); } snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "uidgid.config.gid.%u", gid); icmap_set_uint8_r(config_map, key_name, 1); } else { *error_string = "uidgid: Only uid and gid are allowed items"; return (0); } break; case PARSER_CB_SECTION_START: if (strcmp(path, "uidgid") != 0) { *error_string = "uidgid: Can't add subsection different than uidgid"; return (0); }; break; case PARSER_CB_SECTION_END: break; } return (1); } static int read_uidgid_files_into_icmap( const char **error_string, icmap_map_t config_map) { FILE *fp; const char *dirname; DIR *dp; struct dirent *dirent; struct dirent *entry; char filename[PATH_MAX + FILENAME_MAX + 1]; int res = 0; size_t len; int return_code; struct stat stat_buf; enum main_cp_cb_data_state state = MAIN_CP_CB_DATA_STATE_NORMAL; char key_name[ICMAP_KEYNAME_MAXLEN]; dirname = COROSYSCONFDIR "/uidgid.d"; dp = opendir (dirname); if (dp == NULL) return 0; len = offsetof(struct dirent, d_name) + FILENAME_MAX + 1; entry = malloc(len); if (entry == NULL) { res = 0; goto error_exit; } for (return_code = readdir_r(dp, entry, &dirent); dirent != NULL && return_code == 0; return_code = readdir_r(dp, entry, &dirent)) { snprintf(filename, sizeof (filename), "%s/%s", dirname, dirent->d_name); res = stat (filename, &stat_buf); if (res == 0 && S_ISREG(stat_buf.st_mode)) { fp = fopen (filename, "r"); if (fp == NULL) continue; key_name[0] = 0; res = parse_section(fp, key_name, error_string, 0, state, uidgid_config_parser_cb, config_map, NULL); fclose (fp); if (res != 0) { goto error_exit; } } } error_exit: free (entry); closedir(dp); return res; } /* Read config file and load into icmap */ static int read_config_file_into_icmap( const char **error_string, icmap_map_t config_map) { FILE *fp; const char *filename; char *error_reason = error_string_response; int res; char key_name[ICMAP_KEYNAME_MAXLEN]; struct main_cp_cb_data data; enum 
main_cp_cb_data_state state = MAIN_CP_CB_DATA_STATE_NORMAL; filename = getenv ("COROSYNC_MAIN_CONFIG_FILE"); if (!filename) filename = COROSYSCONFDIR "/corosync.conf"; fp = fopen (filename, "r"); if (fp == NULL) { char error_str[100]; const char *error_ptr = qb_strerror_r(errno, error_str, sizeof(error_str)); snprintf (error_reason, sizeof(error_string_response), "Can't read file %s reason = (%s)", filename, error_ptr); *error_string = error_reason; return -1; } key_name[0] = 0; res = parse_section(fp, key_name, error_string, 0, state, main_config_parser_cb, config_map, &data); fclose(fp); if (res == 0) { res = read_uidgid_files_into_icmap(error_string, config_map); } if (res == 0) { snprintf (error_reason, sizeof(error_string_response), "Successfully read main configuration file '%s'.", filename); *error_string = error_reason; } return res; } diff --git a/exec/cpg.c b/exec/cpg.c index 52086881..0e117137 100644 --- a/exec/cpg.c +++ b/exec/cpg.c @@ -1,2385 +1,2384 @@ /* * Copyright (c) 2006-2015 Red Hat, Inc. * * All rights reserved. * * Author: Christine Caulfield (ccaulfie@redhat.com) * Author: Jan Friesse (jfriesse@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #ifdef HAVE_ALLOCA_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif #include "service.h" LOGSYS_DECLARE_SUBSYS ("CPG"); #define GROUP_HASH_SIZE 32 enum cpg_message_req_types { MESSAGE_REQ_EXEC_CPG_PROCJOIN = 0, MESSAGE_REQ_EXEC_CPG_PROCLEAVE = 1, MESSAGE_REQ_EXEC_CPG_JOINLIST = 2, MESSAGE_REQ_EXEC_CPG_MCAST = 3, MESSAGE_REQ_EXEC_CPG_DOWNLIST_OLD = 4, MESSAGE_REQ_EXEC_CPG_DOWNLIST = 5, MESSAGE_REQ_EXEC_CPG_PARTIAL_MCAST = 6, }; struct zcb_mapped { struct qb_list_head list; void *addr; size_t size; }; /* * state` exec deliver * match group name, pid -> if matched deliver for YES: * XXX indicates impossible state * * join leave mcast * UNJOINED XXX XXX NO * LEAVE_STARTED XXX YES(unjoined_enter) YES * JOIN_STARTED YES(join_started_enter) XXX NO * JOIN_COMPLETED XXX NO YES * * join_started_enter * set JOIN_COMPLETED * add entry to process_info list * unjoined_enter * set UNJOINED * delete entry from process_info list * * * library accept 
join error codes * UNJOINED YES(CS_OK) set JOIN_STARTED * LEAVE_STARTED NO(CS_ERR_BUSY) * JOIN_STARTED NO(CS_ERR_EXIST) * JOIN_COMPlETED NO(CS_ERR_EXIST) * * library accept leave error codes * UNJOINED NO(CS_ERR_NOT_EXIST) * LEAVE_STARTED NO(CS_ERR_NOT_EXIST) * JOIN_STARTED NO(CS_ERR_BUSY) * JOIN_COMPLETED YES(CS_OK) set LEAVE_STARTED * * library accept mcast * UNJOINED NO(CS_ERR_NOT_EXIST) * LEAVE_STARTED NO(CS_ERR_NOT_EXIST) * JOIN_STARTED YES(CS_OK) * JOIN_COMPLETED YES(CS_OK) */ enum cpd_state { CPD_STATE_UNJOINED, CPD_STATE_LEAVE_STARTED, CPD_STATE_JOIN_STARTED, CPD_STATE_JOIN_COMPLETED }; enum cpg_sync_state { CPGSYNC_DOWNLIST, CPGSYNC_JOINLIST }; enum cpg_downlist_state_e { CPG_DOWNLIST_NONE, CPG_DOWNLIST_WAITING_FOR_MESSAGES, CPG_DOWNLIST_APPLYING, }; static enum cpg_downlist_state_e downlist_state; static struct qb_list_head downlist_messages_head; static struct qb_list_head joinlist_messages_head; struct cpg_pd { void *conn; mar_cpg_name_t group_name; uint32_t pid; enum cpd_state cpd_state; unsigned int flags; int initial_totem_conf_sent; uint64_t transition_counter; /* These two are used when sending fragmented messages */ uint64_t initial_transition_counter; struct qb_list_head list; struct qb_list_head iteration_instance_list_head; struct qb_list_head zcb_mapped_list_head; }; struct cpg_iteration_instance { hdb_handle_t handle; struct qb_list_head list; struct qb_list_head items_list_head; /* List of process_info */ struct qb_list_head *current_pointer; }; DECLARE_HDB_DATABASE(cpg_iteration_handle_t_db,NULL); QB_LIST_DECLARE (cpg_pd_list_head); static unsigned int my_member_list[PROCESSOR_COUNT_MAX]; static unsigned int my_member_list_entries; static unsigned int my_old_member_list[PROCESSOR_COUNT_MAX]; static unsigned int my_old_member_list_entries = 0; static struct corosync_api_v1 *api = NULL; static enum cpg_sync_state my_sync_state = CPGSYNC_DOWNLIST; static mar_cpg_ring_id_t last_sync_ring_id; struct process_info { unsigned int nodeid; uint32_t 
pid; mar_cpg_name_t group; struct qb_list_head list; /* on the group_info members list */ }; QB_LIST_DECLARE (process_info_list_head); struct join_list_entry { uint32_t pid; mar_cpg_name_t group_name; }; /* * Service Interfaces required by service_message_handler struct */ static char *cpg_exec_init_fn (struct corosync_api_v1 *); static int cpg_lib_init_fn (void *conn); static int cpg_lib_exit_fn (void *conn); static void message_handler_req_exec_cpg_procjoin ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cpg_procleave ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cpg_joinlist ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cpg_mcast ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cpg_partial_mcast ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cpg_downlist_old ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cpg_downlist ( const void *message, unsigned int nodeid); static void exec_cpg_procjoin_endian_convert (void *msg); static void exec_cpg_joinlist_endian_convert (void *msg); static void exec_cpg_mcast_endian_convert (void *msg); static void exec_cpg_partial_mcast_endian_convert (void *msg); static void exec_cpg_downlist_endian_convert_old (void *msg); static void exec_cpg_downlist_endian_convert (void *msg); static void message_handler_req_lib_cpg_join (void *conn, const void *message); static void message_handler_req_lib_cpg_leave (void *conn, const void *message); static void message_handler_req_lib_cpg_finalize (void *conn, const void *message); static void message_handler_req_lib_cpg_mcast (void *conn, const void *message); static void message_handler_req_lib_cpg_partial_mcast (void *conn, const void *message); static void message_handler_req_lib_cpg_membership (void *conn, const void *message); static void message_handler_req_lib_cpg_local_get (void 
*conn, const void *message); static void message_handler_req_lib_cpg_iteration_initialize ( void *conn, const void *message); static void message_handler_req_lib_cpg_iteration_next ( void *conn, const void *message); static void message_handler_req_lib_cpg_iteration_finalize ( void *conn, const void *message); static void message_handler_req_lib_cpg_zc_alloc ( void *conn, const void *message); static void message_handler_req_lib_cpg_zc_free ( void *conn, const void *message); static void message_handler_req_lib_cpg_zc_execute ( void *conn, const void *message); static int cpg_node_joinleave_send (unsigned int pid, const mar_cpg_name_t *group_name, int fn, int reason); static int cpg_exec_send_downlist(void); static int cpg_exec_send_joinlist(void); static void downlist_messages_delete (void); static void downlist_master_choose_and_send (void); static void joinlist_inform_clients (void); static void joinlist_messages_delete (void); static void cpg_sync_init ( const unsigned int *trans_list, size_t trans_list_entries, const unsigned int *member_list, size_t member_list_entries, const struct memb_ring_id *ring_id); static int cpg_sync_process (void); static void cpg_sync_activate (void); static void cpg_sync_abort (void); static void do_proc_join( const mar_cpg_name_t *name, uint32_t pid, unsigned int nodeid, int reason); static void do_proc_leave( const mar_cpg_name_t *name, uint32_t pid, unsigned int nodeid, int reason); static int notify_lib_totem_membership ( void *conn, int member_list_entries, const unsigned int *member_list); static inline int zcb_all_free ( struct cpg_pd *cpd); static char *cpg_print_group_name ( const mar_cpg_name_t *group); /* * Library Handler Definition */ static struct corosync_lib_handler cpg_lib_engine[] = { { /* 0 - MESSAGE_REQ_CPG_JOIN */ .lib_handler_fn = message_handler_req_lib_cpg_join, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 1 - MESSAGE_REQ_CPG_LEAVE */ .lib_handler_fn = message_handler_req_lib_cpg_leave, 
.flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 2 - MESSAGE_REQ_CPG_MCAST */ .lib_handler_fn = message_handler_req_lib_cpg_mcast, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 3 - MESSAGE_REQ_CPG_MEMBERSHIP */ .lib_handler_fn = message_handler_req_lib_cpg_membership, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 4 - MESSAGE_REQ_CPG_LOCAL_GET */ .lib_handler_fn = message_handler_req_lib_cpg_local_get, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 5 - MESSAGE_REQ_CPG_ITERATIONINITIALIZE */ .lib_handler_fn = message_handler_req_lib_cpg_iteration_initialize, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 6 - MESSAGE_REQ_CPG_ITERATIONNEXT */ .lib_handler_fn = message_handler_req_lib_cpg_iteration_next, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 7 - MESSAGE_REQ_CPG_ITERATIONFINALIZE */ .lib_handler_fn = message_handler_req_lib_cpg_iteration_finalize, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 8 - MESSAGE_REQ_CPG_FINALIZE */ .lib_handler_fn = message_handler_req_lib_cpg_finalize, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 9 */ .lib_handler_fn = message_handler_req_lib_cpg_zc_alloc, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 10 */ .lib_handler_fn = message_handler_req_lib_cpg_zc_free, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 11 */ .lib_handler_fn = message_handler_req_lib_cpg_zc_execute, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 12 */ .lib_handler_fn = message_handler_req_lib_cpg_partial_mcast, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, }; static struct corosync_exec_handler cpg_exec_engine[] = { { /* 0 - MESSAGE_REQ_EXEC_CPG_PROCJOIN */ .exec_handler_fn = message_handler_req_exec_cpg_procjoin, .exec_endian_convert_fn = exec_cpg_procjoin_endian_convert }, { /* 1 - MESSAGE_REQ_EXEC_CPG_PROCLEAVE */ .exec_handler_fn = message_handler_req_exec_cpg_procleave, .exec_endian_convert_fn = exec_cpg_procjoin_endian_convert }, { /* 2 - MESSAGE_REQ_EXEC_CPG_JOINLIST 
*/ .exec_handler_fn = message_handler_req_exec_cpg_joinlist, .exec_endian_convert_fn = exec_cpg_joinlist_endian_convert }, { /* 3 - MESSAGE_REQ_EXEC_CPG_MCAST */ .exec_handler_fn = message_handler_req_exec_cpg_mcast, .exec_endian_convert_fn = exec_cpg_mcast_endian_convert }, { /* 4 - MESSAGE_REQ_EXEC_CPG_DOWNLIST_OLD */ .exec_handler_fn = message_handler_req_exec_cpg_downlist_old, .exec_endian_convert_fn = exec_cpg_downlist_endian_convert_old }, { /* 5 - MESSAGE_REQ_EXEC_CPG_DOWNLIST */ .exec_handler_fn = message_handler_req_exec_cpg_downlist, .exec_endian_convert_fn = exec_cpg_downlist_endian_convert }, { /* 6 - MESSAGE_REQ_EXEC_CPG_PARTIAL_MCAST */ .exec_handler_fn = message_handler_req_exec_cpg_partial_mcast, .exec_endian_convert_fn = exec_cpg_partial_mcast_endian_convert }, }; struct corosync_service_engine cpg_service_engine = { .name = "corosync cluster closed process group service v1.01", .id = CPG_SERVICE, .priority = 1, .private_data_size = sizeof (struct cpg_pd), .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED, .allow_inquorate = CS_LIB_ALLOW_INQUORATE, .lib_init_fn = cpg_lib_init_fn, .lib_exit_fn = cpg_lib_exit_fn, .lib_engine = cpg_lib_engine, .lib_engine_count = sizeof (cpg_lib_engine) / sizeof (struct corosync_lib_handler), .exec_init_fn = cpg_exec_init_fn, .exec_dump_fn = NULL, .exec_engine = cpg_exec_engine, .exec_engine_count = sizeof (cpg_exec_engine) / sizeof (struct corosync_exec_handler), .sync_init = cpg_sync_init, .sync_process = cpg_sync_process, .sync_activate = cpg_sync_activate, .sync_abort = cpg_sync_abort }; struct corosync_service_engine *cpg_get_service_engine_ver0 (void) { return (&cpg_service_engine); } struct req_exec_cpg_procjoin { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_cpg_name_t group_name __attribute__((aligned(8))); mar_uint32_t pid __attribute__((aligned(8))); mar_uint32_t reason __attribute__((aligned(8))); }; struct req_exec_cpg_mcast { struct qb_ipc_request_header header 
__attribute__((aligned(8))); mar_cpg_name_t group_name __attribute__((aligned(8))); mar_uint32_t msglen __attribute__((aligned(8))); mar_uint32_t pid __attribute__((aligned(8))); mar_message_source_t source __attribute__((aligned(8))); mar_uint8_t message[] __attribute__((aligned(8))); }; struct req_exec_cpg_partial_mcast { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_cpg_name_t group_name __attribute__((aligned(8))); mar_uint32_t msglen __attribute__((aligned(8))); mar_uint32_t fraglen __attribute__((aligned(8))); mar_uint32_t pid __attribute__((aligned(8))); mar_uint32_t type __attribute__((aligned(8))); mar_message_source_t source __attribute__((aligned(8))); mar_uint8_t message[] __attribute__((aligned(8))); }; struct req_exec_cpg_downlist_old { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_uint32_t left_nodes __attribute__((aligned(8))); mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); }; struct req_exec_cpg_downlist { struct qb_ipc_request_header header __attribute__((aligned(8))); /* merge decisions */ mar_uint32_t old_members __attribute__((aligned(8))); /* downlist below */ mar_uint32_t left_nodes __attribute__((aligned(8))); mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); }; struct downlist_msg { mar_uint32_t sender_nodeid; mar_uint32_t old_members __attribute__((aligned(8))); mar_uint32_t left_nodes __attribute__((aligned(8))); mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); struct qb_list_head list; }; struct joinlist_msg { mar_uint32_t sender_nodeid; uint32_t pid; mar_cpg_name_t group_name; struct qb_list_head list; }; static struct req_exec_cpg_downlist g_req_exec_cpg_downlist; /* * Function print group name. 
It's not reentrant */ static char *cpg_print_group_name(const mar_cpg_name_t *group) { static char res[CPG_MAX_NAME_LENGTH * 4 + 1]; int dest_pos = 0; char c; int i; for (i = 0; i < group->length; i++) { c = group->value[i]; if (c >= ' ' && c < 0x7f && c != '\\') { res[dest_pos++] = c; } else { if (c == '\\') { res[dest_pos++] = '\\'; res[dest_pos++] = '\\'; } else { snprintf(res + dest_pos, sizeof(res) - dest_pos, "\\x%02X", c); dest_pos += 4; } } } res[dest_pos] = 0; return (res); } static void cpg_sync_init ( const unsigned int *trans_list, size_t trans_list_entries, const unsigned int *member_list, size_t member_list_entries, const struct memb_ring_id *ring_id) { int entries; int i, j; int found; my_sync_state = CPGSYNC_DOWNLIST; memcpy (my_member_list, member_list, member_list_entries * sizeof (unsigned int)); my_member_list_entries = member_list_entries; last_sync_ring_id.nodeid = ring_id->rep.nodeid; last_sync_ring_id.seq = ring_id->seq; downlist_state = CPG_DOWNLIST_WAITING_FOR_MESSAGES; entries = 0; /* * Determine list of nodeids for downlist message */ for (i = 0; i < my_old_member_list_entries; i++) { found = 0; for (j = 0; j < trans_list_entries; j++) { if (my_old_member_list[i] == trans_list[j]) { found = 1; break; } } if (found == 0) { g_req_exec_cpg_downlist.nodeids[entries++] = my_old_member_list[i]; } } g_req_exec_cpg_downlist.left_nodes = entries; } static int cpg_sync_process (void) { int res = -1; if (my_sync_state == CPGSYNC_DOWNLIST) { res = cpg_exec_send_downlist(); if (res == -1) { return (-1); } my_sync_state = CPGSYNC_JOINLIST; } if (my_sync_state == CPGSYNC_JOINLIST) { res = cpg_exec_send_joinlist(); } return (res); } static void cpg_sync_activate (void) { memcpy (my_old_member_list, my_member_list, my_member_list_entries * sizeof (unsigned int)); my_old_member_list_entries = my_member_list_entries; if (downlist_state == CPG_DOWNLIST_WAITING_FOR_MESSAGES) { downlist_master_choose_and_send (); } joinlist_inform_clients (); 
downlist_messages_delete (); downlist_state = CPG_DOWNLIST_NONE; joinlist_messages_delete (); notify_lib_totem_membership (NULL, my_member_list_entries, my_member_list); } static void cpg_sync_abort (void) { downlist_state = CPG_DOWNLIST_NONE; downlist_messages_delete (); joinlist_messages_delete (); } static int notify_lib_totem_membership ( void *conn, int member_list_entries, const unsigned int *member_list) { struct qb_list_head *iter; char *buf; int size; struct res_lib_cpg_totem_confchg_callback *res; size = sizeof(struct res_lib_cpg_totem_confchg_callback) + sizeof(mar_uint32_t) * (member_list_entries); buf = alloca(size); if (!buf) return CS_ERR_LIBRARY; res = (struct res_lib_cpg_totem_confchg_callback *)buf; res->member_list_entries = member_list_entries; res->header.size = size; res->header.id = MESSAGE_RES_CPG_TOTEM_CONFCHG_CALLBACK; res->header.error = CS_OK; memcpy (&res->ring_id, &last_sync_ring_id, sizeof (mar_cpg_ring_id_t)); memcpy (res->member_list, member_list, res->member_list_entries * sizeof (mar_uint32_t)); if (conn == NULL) { - qb_list_for_each(iter, &cpg_pd_list_head) { + qb_list_for_each(iter, &cpg_pd_list_head) { struct cpg_pd *cpg_pd = qb_list_entry (iter, struct cpg_pd, list); api->ipc_dispatch_send (cpg_pd->conn, buf, size); } } else { api->ipc_dispatch_send (conn, buf, size); } return CS_OK; } static int notify_lib_joinlist( const mar_cpg_name_t *group_name, void *conn, int joined_list_entries, mar_cpg_address_t *joined_list, int left_list_entries, mar_cpg_address_t *left_list, int id) { int size; char *buf; struct qb_list_head *iter; int count; struct res_lib_cpg_confchg_callback *res; mar_cpg_address_t *retgi; count = 0; - qb_list_for_each(iter, &process_info_list_head) { + qb_list_for_each(iter, &process_info_list_head) { struct process_info *pi = qb_list_entry (iter, struct process_info, list); if (mar_name_compare (&pi->group, group_name) == 0) { int i; int founded = 0; for (i = 0; i < left_list_entries; i++) { if 
(left_list[i].nodeid == pi->nodeid && left_list[i].pid == pi->pid) { founded++; } } if (!founded) count++; } } size = sizeof(struct res_lib_cpg_confchg_callback) + sizeof(mar_cpg_address_t) * (count + left_list_entries + joined_list_entries); buf = alloca(size); if (!buf) return CS_ERR_LIBRARY; res = (struct res_lib_cpg_confchg_callback *)buf; res->joined_list_entries = joined_list_entries; res->left_list_entries = left_list_entries; res->member_list_entries = count; retgi = res->member_list; res->header.size = size; res->header.id = id; res->header.error = CS_OK; memcpy(&res->group_name, group_name, sizeof(mar_cpg_name_t)); - qb_list_for_each(iter, &process_info_list_head) { + qb_list_for_each(iter, &process_info_list_head) { struct process_info *pi=qb_list_entry (iter, struct process_info, list); if (mar_name_compare (&pi->group, group_name) == 0) { int i; int founded = 0; for (i = 0;i < left_list_entries; i++) { if (left_list[i].nodeid == pi->nodeid && left_list[i].pid == pi->pid) { founded++; } } if (!founded) { retgi->nodeid = pi->nodeid; retgi->pid = pi->pid; retgi++; } } } if (left_list_entries) { memcpy (retgi, left_list, left_list_entries * sizeof(mar_cpg_address_t)); retgi += left_list_entries; } if (joined_list_entries) { memcpy (retgi, joined_list, joined_list_entries * sizeof(mar_cpg_address_t)); retgi += joined_list_entries; } if (conn) { api->ipc_dispatch_send (conn, buf, size); } else { - qb_list_for_each(iter, &cpg_pd_list_head) { + qb_list_for_each(iter, &cpg_pd_list_head) { struct cpg_pd *cpd = qb_list_entry (iter, struct cpg_pd, list); if (mar_name_compare (&cpd->group_name, group_name) == 0) { assert (joined_list_entries <= 1); if (joined_list_entries) { if (joined_list[0].pid == cpd->pid && joined_list[0].nodeid == api->totem_nodeid_get()) { cpd->cpd_state = CPD_STATE_JOIN_COMPLETED; } } if (cpd->cpd_state == CPD_STATE_JOIN_COMPLETED || cpd->cpd_state == CPD_STATE_LEAVE_STARTED) { api->ipc_dispatch_send (cpd->conn, buf, size); 
cpd->transition_counter++; } if (left_list_entries) { if (left_list[0].pid == cpd->pid && left_list[0].nodeid == api->totem_nodeid_get() && left_list[0].reason == CONFCHG_CPG_REASON_LEAVE) { cpd->pid = 0; memset (&cpd->group_name, 0, sizeof(cpd->group_name)); cpd->cpd_state = CPD_STATE_UNJOINED; } } } } } /* * Traverse thru cpds and send totem membership for cpd, where it is not send yet */ - qb_list_for_each(iter, &cpg_pd_list_head) { + qb_list_for_each(iter, &cpg_pd_list_head) { struct cpg_pd *cpd = qb_list_entry (iter, struct cpg_pd, list); if ((cpd->flags & CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF) && (cpd->initial_totem_conf_sent == 0)) { cpd->initial_totem_conf_sent = 1; notify_lib_totem_membership (cpd->conn, my_old_member_list_entries, my_old_member_list); } } return CS_OK; } static void downlist_log(const char *msg, struct downlist_msg* dl) { log_printf (LOG_DEBUG, "%s: sender %s; members(old:%d left:%d)", msg, api->totem_ifaces_print(dl->sender_nodeid), dl->old_members, dl->left_nodes); } static struct downlist_msg* downlist_master_choose (void) { struct downlist_msg *cmp; struct downlist_msg *best = NULL; struct qb_list_head *iter; uint32_t cmp_members; uint32_t best_members; uint32_t i; int ignore_msg; - qb_list_for_each(iter, &downlist_messages_head) { + qb_list_for_each(iter, &downlist_messages_head) { cmp = qb_list_entry(iter, struct downlist_msg, list); downlist_log("comparing", cmp); ignore_msg = 0; for (i = 0; i < cmp->left_nodes; i++) { if (cmp->nodeids[i] == api->totem_nodeid_get()) { log_printf (LOG_DEBUG, "Ignoring this entry because I'm in the left list\n"); ignore_msg = 1; break; } } if (ignore_msg) { continue ; } if (best == NULL) { best = cmp; continue; } best_members = best->old_members - best->left_nodes; cmp_members = cmp->old_members - cmp->left_nodes; if (cmp_members > best_members) { best = cmp; } else if (cmp_members == best_members) { if (cmp->old_members > best->old_members) { best = cmp; } else if (cmp->old_members == 
best->old_members) { if (cmp->sender_nodeid < best->sender_nodeid) { best = cmp; } } } } assert (best != NULL); return best; } static void downlist_master_choose_and_send (void) { struct downlist_msg *stored_msg; - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; struct process_info *left_pi; qb_map_t *group_map; struct cpg_name cpg_group; mar_cpg_name_t group; struct confchg_data{ struct cpg_name cpg_group; mar_cpg_address_t left_list[CPG_MEMBERS_MAX]; int left_list_entries; struct qb_list_head list; } *pcd; qb_map_iter_t *miter; int i, size; downlist_state = CPG_DOWNLIST_APPLYING; stored_msg = downlist_master_choose (); if (!stored_msg) { log_printf (LOGSYS_LEVEL_DEBUG, "NO chosen downlist"); return; } downlist_log("chosen downlist", stored_msg); group_map = qb_skiplist_create(); /* * only the cpg groups included in left nodes should receive * confchg event, so we will collect these cpg groups and * relative left_lists here. */ - qb_list_for_each(iter, &process_info_list_head) { - struct process_info *pi = qb_list_entry(iter, struct process_info, list); + qb_list_for_each_safe(iter, tmp_iter, &process_info_list_head) { + struct process_info *pi = qb_list_entry(iter, struct process_info, list); left_pi = NULL; for (i = 0; i < stored_msg->left_nodes; i++) { if (pi->nodeid == stored_msg->nodeids[i]) { left_pi = pi; break; } } if (left_pi) { marshall_from_mar_cpg_name_t(&cpg_group, &left_pi->group); cpg_group.value[cpg_group.length] = 0; pcd = (struct confchg_data *)qb_map_get(group_map, cpg_group.value); if (pcd == NULL) { pcd = (struct confchg_data *)calloc(1, sizeof(struct confchg_data)); memcpy(&pcd->cpg_group, &cpg_group, sizeof(struct cpg_name)); qb_map_put(group_map, pcd->cpg_group.value, pcd); } size = pcd->left_list_entries; pcd->left_list[size].nodeid = left_pi->nodeid; pcd->left_list[size].pid = left_pi->pid; pcd->left_list[size].reason = CONFCHG_CPG_REASON_NODEDOWN; pcd->left_list_entries++; qb_list_del (&left_pi->list); free (left_pi); 
} } /* send only one confchg event per cpg group */ miter = qb_map_iter_create(group_map); while (qb_map_iter_next(miter, (void **)&pcd)) { marshall_to_mar_cpg_name_t(&group, &pcd->cpg_group); log_printf (LOG_DEBUG, "left_list_entries:%d", pcd->left_list_entries); for (i=0; ileft_list_entries; i++) { log_printf (LOG_DEBUG, "left_list[%d] group:%s, ip:%s, pid:%d", i, cpg_print_group_name(&group), (char*)api->totem_ifaces_print(pcd->left_list[i].nodeid), pcd->left_list[i].pid); } /* send confchg event */ notify_lib_joinlist(&group, NULL, 0, NULL, pcd->left_list_entries, pcd->left_list, MESSAGE_RES_CPG_CONFCHG_CALLBACK); free(pcd); } qb_map_iter_free(miter); qb_map_destroy(group_map); } /* * Remove processes that might have left the group while we were suspended. */ static void joinlist_remove_zombie_pi_entries (void) { - struct qb_list_head *pi_iter; + struct qb_list_head *pi_iter, *tmp_iter; struct qb_list_head *jl_iter; struct process_info *pi; struct joinlist_msg *stored_msg; int found; - qb_list_for_each(pi_iter, &process_info_list_head) { - pi = qb_list_entry (pi_iter, struct process_info, list); + qb_list_for_each_safe(pi_iter, tmp_iter, &process_info_list_head) { + pi = qb_list_entry (pi_iter, struct process_info, list); /* * Ignore local node */ if (pi->nodeid == api->totem_nodeid_get()) { continue ; } /* * Try to find message in joinlist messages */ found = 0; - qb_list_for_each(jl_iter, &joinlist_messages_head) { + qb_list_for_each(jl_iter, &joinlist_messages_head) { stored_msg = qb_list_entry(jl_iter, struct joinlist_msg, list); if (stored_msg->sender_nodeid == api->totem_nodeid_get()) { continue ; } if (pi->nodeid == stored_msg->sender_nodeid && pi->pid == stored_msg->pid && mar_name_compare (&pi->group, &stored_msg->group_name) == 0) { found = 1; break ; } } if (!found) { do_proc_leave(&pi->group, pi->pid, pi->nodeid, CONFCHG_CPG_REASON_PROCDOWN); } } } static void joinlist_inform_clients (void) { struct joinlist_msg *stored_msg; struct qb_list_head 
*iter; unsigned int i; i = 0; - qb_list_for_each(iter, &joinlist_messages_head) { + qb_list_for_each(iter, &joinlist_messages_head) { stored_msg = qb_list_entry(iter, struct joinlist_msg, list); log_printf (LOG_DEBUG, "joinlist_messages[%u] group:%s, ip:%s, pid:%d", i++, cpg_print_group_name(&stored_msg->group_name), (char*)api->totem_ifaces_print(stored_msg->sender_nodeid), stored_msg->pid); /* Ignore our own messages */ if (stored_msg->sender_nodeid == api->totem_nodeid_get()) { continue ; } do_proc_join (&stored_msg->group_name, stored_msg->pid, stored_msg->sender_nodeid, CONFCHG_CPG_REASON_NODEUP); } joinlist_remove_zombie_pi_entries (); } static void downlist_messages_delete (void) { struct downlist_msg *stored_msg; - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; - qb_list_for_each(iter, &downlist_messages_head) { + qb_list_for_each_safe(iter, tmp_iter, &downlist_messages_head) { stored_msg = qb_list_entry(iter, struct downlist_msg, list); qb_list_del (&stored_msg->list); free (stored_msg); } } static void joinlist_messages_delete (void) { struct joinlist_msg *stored_msg; - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; - qb_list_for_each(iter, &joinlist_messages_head) { + qb_list_for_each_safe(iter, tmp_iter, &joinlist_messages_head) { stored_msg = qb_list_entry(iter, struct joinlist_msg, list); qb_list_del (&stored_msg->list); free (stored_msg); } qb_list_init (&joinlist_messages_head); } static char *cpg_exec_init_fn (struct corosync_api_v1 *corosync_api) { qb_list_init (&downlist_messages_head); qb_list_init (&joinlist_messages_head); api = corosync_api; return (NULL); } static void cpg_iteration_instance_finalize (struct cpg_iteration_instance *cpg_iteration_instance) { - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; struct process_info *pi; - qb_list_for_each(iter, &(cpg_iteration_instance->items_list_head)) { + qb_list_for_each_safe(iter, tmp_iter, &(cpg_iteration_instance->items_list_head)) 
{ pi = qb_list_entry (iter, struct process_info, list); qb_list_del (&pi->list); free (pi); } qb_list_del (&cpg_iteration_instance->list); hdb_handle_destroy (&cpg_iteration_handle_t_db, cpg_iteration_instance->handle); } static void cpg_pd_finalize (struct cpg_pd *cpd) { - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; struct cpg_iteration_instance *cpii; - zcb_all_free(cpd); - qb_list_for_each(iter, &(cpd->iteration_instance_list_head)) { + zcb_all_free(cpd); + qb_list_for_each_safe(iter, tmp_iter, &(cpd->iteration_instance_list_head)) { cpii = qb_list_entry (iter, struct cpg_iteration_instance, list); cpg_iteration_instance_finalize (cpii); } qb_list_del (&cpd->list); } static int cpg_lib_exit_fn (void *conn) { struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); log_printf(LOGSYS_LEVEL_DEBUG, "exit_fn for conn=%p", conn); if (cpd->group_name.length > 0 && cpd->cpd_state != CPD_STATE_LEAVE_STARTED) { cpg_node_joinleave_send (cpd->pid, &cpd->group_name, MESSAGE_REQ_EXEC_CPG_PROCLEAVE, CONFCHG_CPG_REASON_PROCDOWN); } cpg_pd_finalize (cpd); api->ipc_refcnt_dec (conn); return (0); } static int cpg_node_joinleave_send (unsigned int pid, const mar_cpg_name_t *group_name, int fn, int reason) { struct req_exec_cpg_procjoin req_exec_cpg_procjoin; struct iovec req_exec_cpg_iovec; int result; memcpy(&req_exec_cpg_procjoin.group_name, group_name, sizeof(mar_cpg_name_t)); req_exec_cpg_procjoin.pid = pid; req_exec_cpg_procjoin.reason = reason; req_exec_cpg_procjoin.header.size = sizeof(req_exec_cpg_procjoin); req_exec_cpg_procjoin.header.id = SERVICE_ID_MAKE(CPG_SERVICE, fn); req_exec_cpg_iovec.iov_base = (char *)&req_exec_cpg_procjoin; req_exec_cpg_iovec.iov_len = sizeof(req_exec_cpg_procjoin); result = api->totem_mcast (&req_exec_cpg_iovec, 1, TOTEM_AGREED); return (result); } /* Can byteswap join & leave messages */ static void exec_cpg_procjoin_endian_convert (void *msg) { struct req_exec_cpg_procjoin *req_exec_cpg_procjoin = msg; 
req_exec_cpg_procjoin->pid = swab32(req_exec_cpg_procjoin->pid); swab_mar_cpg_name_t (&req_exec_cpg_procjoin->group_name); req_exec_cpg_procjoin->reason = swab32(req_exec_cpg_procjoin->reason); } static void exec_cpg_joinlist_endian_convert (void *msg_v) { char *msg = msg_v; struct qb_ipc_response_header *res = (struct qb_ipc_response_header *)msg; struct join_list_entry *jle = (struct join_list_entry *)(msg + sizeof(struct qb_ipc_response_header)); swab_mar_int32_t (&res->size); while ((const char*)jle < msg + res->size) { jle->pid = swab32(jle->pid); swab_mar_cpg_name_t (&jle->group_name); jle++; } } static void exec_cpg_downlist_endian_convert_old (void *msg) { } static void exec_cpg_downlist_endian_convert (void *msg) { struct req_exec_cpg_downlist *req_exec_cpg_downlist = msg; unsigned int i; req_exec_cpg_downlist->left_nodes = swab32(req_exec_cpg_downlist->left_nodes); req_exec_cpg_downlist->old_members = swab32(req_exec_cpg_downlist->old_members); for (i = 0; i < req_exec_cpg_downlist->left_nodes; i++) { req_exec_cpg_downlist->nodeids[i] = swab32(req_exec_cpg_downlist->nodeids[i]); } } static void exec_cpg_mcast_endian_convert (void *msg) { struct req_exec_cpg_mcast *req_exec_cpg_mcast = msg; swab_coroipc_request_header_t (&req_exec_cpg_mcast->header); swab_mar_cpg_name_t (&req_exec_cpg_mcast->group_name); req_exec_cpg_mcast->pid = swab32(req_exec_cpg_mcast->pid); req_exec_cpg_mcast->msglen = swab32(req_exec_cpg_mcast->msglen); swab_mar_message_source_t (&req_exec_cpg_mcast->source); } static void exec_cpg_partial_mcast_endian_convert (void *msg) { struct req_exec_cpg_partial_mcast *req_exec_cpg_mcast = msg; swab_coroipc_request_header_t (&req_exec_cpg_mcast->header); swab_mar_cpg_name_t (&req_exec_cpg_mcast->group_name); req_exec_cpg_mcast->pid = swab32(req_exec_cpg_mcast->pid); req_exec_cpg_mcast->msglen = swab32(req_exec_cpg_mcast->msglen); req_exec_cpg_mcast->fraglen = swab32(req_exec_cpg_mcast->fraglen); req_exec_cpg_mcast->type = 
swab32(req_exec_cpg_mcast->type); swab_mar_message_source_t (&req_exec_cpg_mcast->source); } static struct process_info *process_info_find(const mar_cpg_name_t *group_name, uint32_t pid, unsigned int nodeid) { struct qb_list_head *iter; - qb_list_for_each(iter, &process_info_list_head) { - struct process_info *pi = qb_list_entry (iter, struct process_info, list); + qb_list_for_each(iter, &process_info_list_head) { + struct process_info *pi = qb_list_entry (iter, struct process_info, list); + if (pi->pid == pid && pi->nodeid == nodeid && mar_name_compare (&pi->group, group_name) == 0) { return pi; } } return NULL; } static void do_proc_join( const mar_cpg_name_t *name, uint32_t pid, unsigned int nodeid, int reason) { struct process_info *pi; struct process_info *pi_entry; mar_cpg_address_t notify_info; struct qb_list_head *list; struct qb_list_head *list_to_add = NULL; if (process_info_find (name, pid, nodeid) != NULL) { return ; } pi = malloc (sizeof (struct process_info)); if (!pi) { log_printf(LOGSYS_LEVEL_WARNING, "Unable to allocate process_info struct"); return; } pi->nodeid = nodeid; pi->pid = pid; memcpy(&pi->group, name, sizeof(*name)); qb_list_init(&pi->list); /* * Insert new process in sorted order so synchronization works properly */ list_to_add = &process_info_list_head; - qb_list_for_each(list, &process_info_list_head) { + qb_list_for_each(list, &process_info_list_head) { pi_entry = qb_list_entry(list, struct process_info, list); if (pi_entry->nodeid > pi->nodeid || (pi_entry->nodeid == pi->nodeid && pi_entry->pid > pi->pid)) { break; } list_to_add = list; } qb_list_add (&pi->list, list_to_add); notify_info.pid = pi->pid; notify_info.nodeid = nodeid; notify_info.reason = reason; notify_lib_joinlist(&pi->group, NULL, 1, ¬ify_info, 0, NULL, MESSAGE_RES_CPG_CONFCHG_CALLBACK); } static void do_proc_leave( const mar_cpg_name_t *name, uint32_t pid, unsigned int nodeid, int reason) { struct process_info *pi; - struct qb_list_head *iter; + struct qb_list_head 
*iter, *tmp_iter; mar_cpg_address_t notify_info; notify_info.pid = pid; notify_info.nodeid = nodeid; notify_info.reason = reason; notify_lib_joinlist(name, NULL, 0, NULL, 1, ¬ify_info, MESSAGE_RES_CPG_CONFCHG_CALLBACK); - qb_list_for_each(iter, &process_info_list_head) { + qb_list_for_each_safe(iter, tmp_iter, &process_info_list_head) { pi = qb_list_entry(iter, struct process_info, list); - iter = iter->next; if (pi->pid == pid && pi->nodeid == nodeid && mar_name_compare (&pi->group, name)==0) { qb_list_del (&pi->list); free (pi); } } } static void message_handler_req_exec_cpg_downlist_old ( const void *message, unsigned int nodeid) { log_printf (LOGSYS_LEVEL_WARNING, "downlist OLD from node 0x%x", nodeid); } static void message_handler_req_exec_cpg_downlist( const void *message, unsigned int nodeid) { const struct req_exec_cpg_downlist *req_exec_cpg_downlist = message; int i; struct qb_list_head *iter; struct downlist_msg *stored_msg; int found; if (downlist_state != CPG_DOWNLIST_WAITING_FOR_MESSAGES) { log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d received in state %d", req_exec_cpg_downlist->left_nodes, downlist_state); return; } stored_msg = malloc (sizeof (struct downlist_msg)); stored_msg->sender_nodeid = nodeid; stored_msg->old_members = req_exec_cpg_downlist->old_members; stored_msg->left_nodes = req_exec_cpg_downlist->left_nodes; memcpy (stored_msg->nodeids, req_exec_cpg_downlist->nodeids, req_exec_cpg_downlist->left_nodes * sizeof (mar_uint32_t)); qb_list_init (&stored_msg->list); qb_list_add (&stored_msg->list, &downlist_messages_head); for (i = 0; i < my_member_list_entries; i++) { found = 0; - qb_list_for_each(iter, &downlist_messages_head) { - + qb_list_for_each(iter, &downlist_messages_head) { stored_msg = qb_list_entry(iter, struct downlist_msg, list); + if (my_member_list[i] == stored_msg->sender_nodeid) { found = 1; } } if (!found) { return; } } downlist_master_choose_and_send (); } static void message_handler_req_exec_cpg_procjoin ( 
const void *message, unsigned int nodeid) { const struct req_exec_cpg_procjoin *req_exec_cpg_procjoin = message; log_printf(LOGSYS_LEVEL_DEBUG, "got procjoin message from cluster node 0x%x (%s) for pid %u", nodeid, api->totem_ifaces_print(nodeid), (unsigned int)req_exec_cpg_procjoin->pid); do_proc_join (&req_exec_cpg_procjoin->group_name, req_exec_cpg_procjoin->pid, nodeid, CONFCHG_CPG_REASON_JOIN); } static void message_handler_req_exec_cpg_procleave ( const void *message, unsigned int nodeid) { const struct req_exec_cpg_procjoin *req_exec_cpg_procjoin = message; log_printf(LOGSYS_LEVEL_DEBUG, "got procleave message from cluster node 0x%x (%s) for pid %u", nodeid, api->totem_ifaces_print(nodeid), (unsigned int)req_exec_cpg_procjoin->pid); do_proc_leave (&req_exec_cpg_procjoin->group_name, req_exec_cpg_procjoin->pid, nodeid, req_exec_cpg_procjoin->reason); } /* Got a proclist from another node */ static void message_handler_req_exec_cpg_joinlist ( const void *message_v, unsigned int nodeid) { const char *message = message_v; const struct qb_ipc_response_header *res = (const struct qb_ipc_response_header *)message; const struct join_list_entry *jle = (const struct join_list_entry *)(message + sizeof(struct qb_ipc_response_header)); struct joinlist_msg *stored_msg; log_printf(LOGSYS_LEVEL_DEBUG, "got joinlist message from node 0x%x", nodeid); while ((const char*)jle < message + res->size) { stored_msg = malloc (sizeof (struct joinlist_msg)); memset(stored_msg, 0, sizeof (struct joinlist_msg)); stored_msg->sender_nodeid = nodeid; stored_msg->pid = jle->pid; memcpy(&stored_msg->group_name, &jle->group_name, sizeof(mar_cpg_name_t)); qb_list_init (&stored_msg->list); qb_list_add (&stored_msg->list, &joinlist_messages_head); jle++; } } static void message_handler_req_exec_cpg_mcast ( const void *message, unsigned int nodeid) { const struct req_exec_cpg_mcast *req_exec_cpg_mcast = message; struct res_lib_cpg_deliver_callback res_lib_cpg_mcast; int msglen = 
req_exec_cpg_mcast->msglen; - struct qb_list_head *iter, *pi_iter; + struct qb_list_head *iter, *pi_iter, *tmp_iter; struct cpg_pd *cpd; struct iovec iovec[2]; int known_node = 0; res_lib_cpg_mcast.header.id = MESSAGE_RES_CPG_DELIVER_CALLBACK; res_lib_cpg_mcast.header.size = sizeof(res_lib_cpg_mcast) + msglen; res_lib_cpg_mcast.msglen = msglen; res_lib_cpg_mcast.pid = req_exec_cpg_mcast->pid; res_lib_cpg_mcast.nodeid = nodeid; memcpy(&res_lib_cpg_mcast.group_name, &req_exec_cpg_mcast->group_name, sizeof(mar_cpg_name_t)); iovec[0].iov_base = (void *)&res_lib_cpg_mcast; iovec[0].iov_len = sizeof (res_lib_cpg_mcast); iovec[1].iov_base = (char*)message+sizeof(*req_exec_cpg_mcast); iovec[1].iov_len = msglen; - qb_list_for_each(iter, &cpg_pd_list_head) { + qb_list_for_each_safe(iter, tmp_iter, &cpg_pd_list_head) { cpd = qb_list_entry(iter, struct cpg_pd, list); if ((cpd->cpd_state == CPD_STATE_LEAVE_STARTED || cpd->cpd_state == CPD_STATE_JOIN_COMPLETED) && (mar_name_compare (&cpd->group_name, &req_exec_cpg_mcast->group_name) == 0)) { if (!known_node) { /* Try to find, if we know the node */ - qb_list_for_each(pi_iter, &process_info_list_head) { + qb_list_for_each(pi_iter, &process_info_list_head) { struct process_info *pi = qb_list_entry (pi_iter, struct process_info, list); if (pi->nodeid == nodeid && mar_name_compare (&pi->group, &req_exec_cpg_mcast->group_name) == 0) { known_node = 1; break; } } } if (!known_node) { log_printf(LOGSYS_LEVEL_WARNING, "Unknown node -> we will not deliver message"); return ; } api->ipc_dispatch_iov_send (cpd->conn, iovec, 2); } } } static void message_handler_req_exec_cpg_partial_mcast ( const void *message, unsigned int nodeid) { const struct req_exec_cpg_partial_mcast *req_exec_cpg_mcast = message; struct res_lib_cpg_partial_deliver_callback res_lib_cpg_mcast; int msglen = req_exec_cpg_mcast->fraglen; - struct qb_list_head *iter, *pi_iter; + struct qb_list_head *iter, *pi_iter, *tmp_iter; struct cpg_pd *cpd; struct iovec iovec[2]; int 
known_node = 0; log_printf(LOGSYS_LEVEL_DEBUG, "Got fragmented message from node %d, size = %d bytes\n", nodeid, msglen); res_lib_cpg_mcast.header.id = MESSAGE_RES_CPG_PARTIAL_DELIVER_CALLBACK; res_lib_cpg_mcast.header.size = sizeof(res_lib_cpg_mcast) + msglen; res_lib_cpg_mcast.fraglen = msglen; res_lib_cpg_mcast.msglen = req_exec_cpg_mcast->msglen; res_lib_cpg_mcast.pid = req_exec_cpg_mcast->pid; res_lib_cpg_mcast.type = req_exec_cpg_mcast->type; res_lib_cpg_mcast.nodeid = nodeid; memcpy(&res_lib_cpg_mcast.group_name, &req_exec_cpg_mcast->group_name, sizeof(mar_cpg_name_t)); iovec[0].iov_base = (void *)&res_lib_cpg_mcast; iovec[0].iov_len = sizeof (res_lib_cpg_mcast); iovec[1].iov_base = (char*)message+sizeof(*req_exec_cpg_mcast); iovec[1].iov_len = msglen; - qb_list_for_each(iter, &cpg_pd_list_head) { - cpd = qb_list_entry(iter, struct cpg_pd, list); + qb_list_for_each_safe(iter, tmp_iter, &cpg_pd_list_head) { + cpd = qb_list_entry(iter, struct cpg_pd, list); + if ((cpd->cpd_state == CPD_STATE_LEAVE_STARTED || cpd->cpd_state == CPD_STATE_JOIN_COMPLETED) && (mar_name_compare (&cpd->group_name, &req_exec_cpg_mcast->group_name) == 0)) { if (!known_node) { /* Try to find, if we know the node */ - qb_list_for_each(pi_iter, &process_info_list_head) { + qb_list_for_each(pi_iter, &process_info_list_head) { struct process_info *pi = qb_list_entry (pi_iter, struct process_info, list); if (pi->nodeid == nodeid && mar_name_compare (&pi->group, &req_exec_cpg_mcast->group_name) == 0) { known_node = 1; break; } } } if (!known_node) { log_printf(LOGSYS_LEVEL_WARNING, "Unknown node -> we will not deliver message"); return ; } api->ipc_dispatch_iov_send (cpd->conn, iovec, 2); } } } static int cpg_exec_send_downlist(void) { struct iovec iov; g_req_exec_cpg_downlist.header.id = SERVICE_ID_MAKE(CPG_SERVICE, MESSAGE_REQ_EXEC_CPG_DOWNLIST); g_req_exec_cpg_downlist.header.size = sizeof(struct req_exec_cpg_downlist); g_req_exec_cpg_downlist.old_members = my_old_member_list_entries; 
iov.iov_base = (void *)&g_req_exec_cpg_downlist; iov.iov_len = g_req_exec_cpg_downlist.header.size; return (api->totem_mcast (&iov, 1, TOTEM_AGREED)); } static int cpg_exec_send_joinlist(void) { int count = 0; struct qb_list_head *iter; struct qb_ipc_response_header *res; char *buf; struct join_list_entry *jle; struct iovec req_exec_cpg_iovec; - qb_list_for_each(iter, &process_info_list_head) { + qb_list_for_each(iter, &process_info_list_head) { struct process_info *pi = qb_list_entry (iter, struct process_info, list); if (pi->nodeid == api->totem_nodeid_get ()) { count++; } } /* Nothing to send */ if (!count) return 0; buf = alloca(sizeof(struct qb_ipc_response_header) + sizeof(struct join_list_entry) * count); if (!buf) { log_printf(LOGSYS_LEVEL_WARNING, "Unable to allocate joinlist buffer"); return -1; } jle = (struct join_list_entry *)(buf + sizeof(struct qb_ipc_response_header)); res = (struct qb_ipc_response_header *)buf; - qb_list_for_each(iter, &process_info_list_head) { + qb_list_for_each(iter, &process_info_list_head) { struct process_info *pi = qb_list_entry (iter, struct process_info, list); if (pi->nodeid == api->totem_nodeid_get ()) { memcpy (&jle->group_name, &pi->group, sizeof (mar_cpg_name_t)); jle->pid = pi->pid; jle++; } } res->id = SERVICE_ID_MAKE(CPG_SERVICE, MESSAGE_REQ_EXEC_CPG_JOINLIST); res->size = sizeof(struct qb_ipc_response_header)+sizeof(struct join_list_entry) * count; req_exec_cpg_iovec.iov_base = buf; req_exec_cpg_iovec.iov_len = res->size; return (api->totem_mcast (&req_exec_cpg_iovec, 1, TOTEM_AGREED)); } static int cpg_lib_init_fn (void *conn) { struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); memset (cpd, 0, sizeof(struct cpg_pd)); cpd->conn = conn; qb_list_add (&cpd->list, &cpg_pd_list_head); qb_list_init (&cpd->iteration_instance_list_head); qb_list_init (&cpd->zcb_mapped_list_head); api->ipc_refcnt_inc (conn); log_printf(LOGSYS_LEVEL_DEBUG, "lib_init_fn: conn=%p, cpd=%p", conn, cpd); return (0); } /* 
Join message from the library */ static void message_handler_req_lib_cpg_join (void *conn, const void *message) { const struct req_lib_cpg_join *req_lib_cpg_join = message; struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); struct res_lib_cpg_join res_lib_cpg_join; cs_error_t error = CS_OK; struct qb_list_head *iter; /* Test, if we don't have same pid and group name joined */ - qb_list_for_each(iter, &cpg_pd_list_head) { + qb_list_for_each(iter, &cpg_pd_list_head) { struct cpg_pd *cpd_item = qb_list_entry (iter, struct cpg_pd, list); if (cpd_item->pid == req_lib_cpg_join->pid && mar_name_compare(&req_lib_cpg_join->group_name, &cpd_item->group_name) == 0) { /* We have same pid and group name joined -> return error */ error = CS_ERR_EXIST; goto response_send; } } /* * Same check must be done in process info list, because there may be not yet delivered * leave of client. */ - qb_list_for_each(iter, &process_info_list_head) { + qb_list_for_each(iter, &process_info_list_head) { struct process_info *pi = qb_list_entry (iter, struct process_info, list); if (pi->nodeid == api->totem_nodeid_get () && pi->pid == req_lib_cpg_join->pid && mar_name_compare(&req_lib_cpg_join->group_name, &pi->group) == 0) { /* We have same pid and group name joined -> return error */ error = CS_ERR_TRY_AGAIN; goto response_send; } } if (req_lib_cpg_join->group_name.length > CPG_MAX_NAME_LENGTH) { error = CS_ERR_NAME_TOO_LONG; goto response_send; } switch (cpd->cpd_state) { case CPD_STATE_UNJOINED: error = CS_OK; cpd->cpd_state = CPD_STATE_JOIN_STARTED; cpd->pid = req_lib_cpg_join->pid; cpd->flags = req_lib_cpg_join->flags; memcpy (&cpd->group_name, &req_lib_cpg_join->group_name, sizeof (cpd->group_name)); cpg_node_joinleave_send (req_lib_cpg_join->pid, &req_lib_cpg_join->group_name, MESSAGE_REQ_EXEC_CPG_PROCJOIN, CONFCHG_CPG_REASON_JOIN); break; case CPD_STATE_LEAVE_STARTED: error = CS_ERR_BUSY; break; case CPD_STATE_JOIN_STARTED: error = CS_ERR_EXIST; break; case 
CPD_STATE_JOIN_COMPLETED: error = CS_ERR_EXIST; break; } response_send: res_lib_cpg_join.header.size = sizeof(res_lib_cpg_join); res_lib_cpg_join.header.id = MESSAGE_RES_CPG_JOIN; res_lib_cpg_join.header.error = error; api->ipc_response_send (conn, &res_lib_cpg_join, sizeof(res_lib_cpg_join)); } /* Leave message from the library */ static void message_handler_req_lib_cpg_leave (void *conn, const void *message) { struct res_lib_cpg_leave res_lib_cpg_leave; cs_error_t error = CS_OK; struct req_lib_cpg_leave *req_lib_cpg_leave = (struct req_lib_cpg_leave *)message; struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); log_printf(LOGSYS_LEVEL_DEBUG, "got leave request on %p", conn); switch (cpd->cpd_state) { case CPD_STATE_UNJOINED: error = CS_ERR_NOT_EXIST; break; case CPD_STATE_LEAVE_STARTED: error = CS_ERR_NOT_EXIST; break; case CPD_STATE_JOIN_STARTED: error = CS_ERR_BUSY; break; case CPD_STATE_JOIN_COMPLETED: error = CS_OK; cpd->cpd_state = CPD_STATE_LEAVE_STARTED; cpg_node_joinleave_send (req_lib_cpg_leave->pid, &req_lib_cpg_leave->group_name, MESSAGE_REQ_EXEC_CPG_PROCLEAVE, CONFCHG_CPG_REASON_LEAVE); break; } /* send return */ res_lib_cpg_leave.header.size = sizeof(res_lib_cpg_leave); res_lib_cpg_leave.header.id = MESSAGE_RES_CPG_LEAVE; res_lib_cpg_leave.header.error = error; api->ipc_response_send(conn, &res_lib_cpg_leave, sizeof(res_lib_cpg_leave)); } /* Finalize message from library */ static void message_handler_req_lib_cpg_finalize ( void *conn, const void *message) { struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); struct res_lib_cpg_finalize res_lib_cpg_finalize; cs_error_t error = CS_OK; log_printf (LOGSYS_LEVEL_DEBUG, "cpg finalize for conn=%p", conn); /* * We will just remove cpd from list. 
After this call, connection will be * closed on lib side, and cpg_lib_exit_fn will be called */ qb_list_del (&cpd->list); qb_list_init (&cpd->list); res_lib_cpg_finalize.header.size = sizeof (res_lib_cpg_finalize); res_lib_cpg_finalize.header.id = MESSAGE_RES_CPG_FINALIZE; res_lib_cpg_finalize.header.error = error; api->ipc_response_send (conn, &res_lib_cpg_finalize, sizeof (res_lib_cpg_finalize)); } static int memory_map ( const char *path, size_t bytes, void **buf) { int32_t fd; void *addr; int32_t res; fd = open (path, O_RDWR, 0600); unlink (path); if (fd == -1) { return (-1); } res = ftruncate (fd, bytes); if (res == -1) { goto error_close_unlink; } addr = mmap (NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) { goto error_close_unlink; } #ifdef MADV_NOSYNC madvise(addr, bytes, MADV_NOSYNC); #endif res = close (fd); if (res) { munmap (addr, bytes); return (-1); } *buf = addr; return (0); error_close_unlink: close (fd); unlink(path); return -1; } static inline int zcb_alloc ( struct cpg_pd *cpd, const char *path_to_file, size_t size, void **addr) { struct zcb_mapped *zcb_mapped; unsigned int res; zcb_mapped = malloc (sizeof (struct zcb_mapped)); if (zcb_mapped == NULL) { return (-1); } res = memory_map ( path_to_file, size, addr); if (res == -1) { free (zcb_mapped); return (-1); } qb_list_init (&zcb_mapped->list); zcb_mapped->addr = *addr; zcb_mapped->size = size; qb_list_add_tail (&zcb_mapped->list, &cpd->zcb_mapped_list_head); return (0); } static inline int zcb_free (struct zcb_mapped *zcb_mapped) { unsigned int res; res = munmap (zcb_mapped->addr, zcb_mapped->size); qb_list_del (&zcb_mapped->list); free (zcb_mapped); return (res); } static inline int zcb_by_addr_free (struct cpg_pd *cpd, void *addr) { - struct qb_list_head *list; + struct qb_list_head *list, *tmp_iter; struct zcb_mapped *zcb_mapped; unsigned int res = 0; - qb_list_for_each(list, &(cpd->zcb_mapped_list_head)) { + qb_list_for_each_safe(list, tmp_iter, 
&(cpd->zcb_mapped_list_head)) { zcb_mapped = qb_list_entry (list, struct zcb_mapped, list); if (zcb_mapped->addr == addr) { res = zcb_free (zcb_mapped); break; } } return (res); } static inline int zcb_all_free ( struct cpg_pd *cpd) { - struct qb_list_head *list; + struct qb_list_head *list, *tmp_iter; struct zcb_mapped *zcb_mapped; - qb_list_for_each(list, &(cpd->zcb_mapped_list_head)) { + qb_list_for_each_safe(list, tmp_iter, &(cpd->zcb_mapped_list_head)) { zcb_mapped = qb_list_entry (list, struct zcb_mapped, list); - list = list->next; - zcb_free (zcb_mapped); } return (0); } union u { uint64_t server_addr; void *server_ptr; }; static uint64_t void2serveraddr (void *server_ptr) { union u u; u.server_ptr = server_ptr; return (u.server_addr); } static void *serveraddr2void (uint64_t server_addr) { union u u; u.server_addr = server_addr; return (u.server_ptr); }; static void message_handler_req_lib_cpg_zc_alloc ( void *conn, const void *message) { mar_req_coroipcc_zc_alloc_t *hdr = (mar_req_coroipcc_zc_alloc_t *)message; struct qb_ipc_response_header res_header; void *addr = NULL; struct coroipcs_zc_header *zc_header; unsigned int res; struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); log_printf(LOGSYS_LEVEL_DEBUG, "path: %s", hdr->path_to_file); res = zcb_alloc (cpd, hdr->path_to_file, hdr->map_size, &addr); assert(res == 0); zc_header = (struct coroipcs_zc_header *)addr; zc_header->server_address = void2serveraddr(addr); res_header.size = sizeof (struct qb_ipc_response_header); res_header.id = 0; api->ipc_response_send (conn, &res_header, res_header.size); } static void message_handler_req_lib_cpg_zc_free ( void *conn, const void *message) { mar_req_coroipcc_zc_free_t *hdr = (mar_req_coroipcc_zc_free_t *)message; struct qb_ipc_response_header res_header; void *addr = NULL; struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); log_printf(LOGSYS_LEVEL_DEBUG, " free'ing"); addr = serveraddr2void (hdr->server_address); 
zcb_by_addr_free (cpd, addr); res_header.size = sizeof (struct qb_ipc_response_header); res_header.id = 0; api->ipc_response_send ( conn, &res_header, res_header.size); } /* Fragmented mcast message from the library */ static void message_handler_req_lib_cpg_partial_mcast (void *conn, const void *message) { const struct req_lib_cpg_partial_mcast *req_lib_cpg_mcast = message; struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); mar_cpg_name_t group_name = cpd->group_name; struct iovec req_exec_cpg_iovec[2]; struct req_exec_cpg_partial_mcast req_exec_cpg_mcast; struct res_lib_cpg_partial_send res_lib_cpg_partial_send; int msglen = req_lib_cpg_mcast->fraglen; int result; cs_error_t error = CS_ERR_NOT_EXIST; log_printf(LOGSYS_LEVEL_TRACE, "got fragmented mcast request on %p", conn); log_printf(LOGSYS_LEVEL_DEBUG, "Sending fragmented message size = %d bytes\n", msglen); switch (cpd->cpd_state) { case CPD_STATE_UNJOINED: error = CS_ERR_NOT_EXIST; break; case CPD_STATE_LEAVE_STARTED: error = CS_ERR_NOT_EXIST; break; case CPD_STATE_JOIN_STARTED: error = CS_OK; break; case CPD_STATE_JOIN_COMPLETED: error = CS_OK; break; } res_lib_cpg_partial_send.header.size = sizeof(res_lib_cpg_partial_send); res_lib_cpg_partial_send.header.id = MESSAGE_RES_CPG_PARTIAL_SEND; if (req_lib_cpg_mcast->type == LIBCPG_PARTIAL_FIRST) { cpd->initial_transition_counter = cpd->transition_counter; } if (cpd->transition_counter != cpd->initial_transition_counter) { error = CS_ERR_INTERRUPT; } if (error == CS_OK) { req_exec_cpg_mcast.header.size = sizeof(req_exec_cpg_mcast) + msglen; req_exec_cpg_mcast.header.id = SERVICE_ID_MAKE(CPG_SERVICE, MESSAGE_REQ_EXEC_CPG_PARTIAL_MCAST); req_exec_cpg_mcast.pid = cpd->pid; req_exec_cpg_mcast.msglen = req_lib_cpg_mcast->msglen; req_exec_cpg_mcast.type = req_lib_cpg_mcast->type; req_exec_cpg_mcast.fraglen = req_lib_cpg_mcast->fraglen; api->ipc_source_set (&req_exec_cpg_mcast.source, conn); memcpy(&req_exec_cpg_mcast.group_name, &group_name, 
sizeof(mar_cpg_name_t)); req_exec_cpg_iovec[0].iov_base = (char *)&req_exec_cpg_mcast; req_exec_cpg_iovec[0].iov_len = sizeof(req_exec_cpg_mcast); req_exec_cpg_iovec[1].iov_base = (char *)&req_lib_cpg_mcast->message; req_exec_cpg_iovec[1].iov_len = msglen; result = api->totem_mcast (req_exec_cpg_iovec, 2, TOTEM_AGREED); assert(result == 0); } else { log_printf(LOGSYS_LEVEL_ERROR, "*** %p can't mcast to group %s state:%d, error:%d", conn, group_name.value, cpd->cpd_state, error); } res_lib_cpg_partial_send.header.error = error; api->ipc_response_send (conn, &res_lib_cpg_partial_send, sizeof (res_lib_cpg_partial_send)); } /* Mcast message from the library */ static void message_handler_req_lib_cpg_mcast (void *conn, const void *message) { const struct req_lib_cpg_mcast *req_lib_cpg_mcast = message; struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); mar_cpg_name_t group_name = cpd->group_name; struct iovec req_exec_cpg_iovec[2]; struct req_exec_cpg_mcast req_exec_cpg_mcast; int msglen = req_lib_cpg_mcast->msglen; int result; cs_error_t error = CS_ERR_NOT_EXIST; log_printf(LOGSYS_LEVEL_TRACE, "got mcast request on %p", conn); switch (cpd->cpd_state) { case CPD_STATE_UNJOINED: error = CS_ERR_NOT_EXIST; break; case CPD_STATE_LEAVE_STARTED: error = CS_ERR_NOT_EXIST; break; case CPD_STATE_JOIN_STARTED: error = CS_OK; break; case CPD_STATE_JOIN_COMPLETED: error = CS_OK; break; } if (error == CS_OK) { req_exec_cpg_mcast.header.size = sizeof(req_exec_cpg_mcast) + msglen; req_exec_cpg_mcast.header.id = SERVICE_ID_MAKE(CPG_SERVICE, MESSAGE_REQ_EXEC_CPG_MCAST); req_exec_cpg_mcast.pid = cpd->pid; req_exec_cpg_mcast.msglen = msglen; api->ipc_source_set (&req_exec_cpg_mcast.source, conn); memcpy(&req_exec_cpg_mcast.group_name, &group_name, sizeof(mar_cpg_name_t)); req_exec_cpg_iovec[0].iov_base = (char *)&req_exec_cpg_mcast; req_exec_cpg_iovec[0].iov_len = sizeof(req_exec_cpg_mcast); req_exec_cpg_iovec[1].iov_base = (char *)&req_lib_cpg_mcast->message; 
req_exec_cpg_iovec[1].iov_len = msglen; result = api->totem_mcast (req_exec_cpg_iovec, 2, TOTEM_AGREED); assert(result == 0); } else { log_printf(LOGSYS_LEVEL_ERROR, "*** %p can't mcast to group %s state:%d, error:%d", conn, group_name.value, cpd->cpd_state, error); } } static void message_handler_req_lib_cpg_zc_execute ( void *conn, const void *message) { mar_req_coroipcc_zc_execute_t *hdr = (mar_req_coroipcc_zc_execute_t *)message; struct qb_ipc_request_header *header; struct res_lib_cpg_mcast res_lib_cpg_mcast; struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); struct iovec req_exec_cpg_iovec[2]; struct req_exec_cpg_mcast req_exec_cpg_mcast; struct req_lib_cpg_mcast *req_lib_cpg_mcast; int result; cs_error_t error = CS_ERR_NOT_EXIST; log_printf(LOGSYS_LEVEL_TRACE, "got ZC mcast request on %p", conn); header = (struct qb_ipc_request_header *)(((char *)serveraddr2void(hdr->server_address) + sizeof (struct coroipcs_zc_header))); req_lib_cpg_mcast = (struct req_lib_cpg_mcast *)header; switch (cpd->cpd_state) { case CPD_STATE_UNJOINED: error = CS_ERR_NOT_EXIST; break; case CPD_STATE_LEAVE_STARTED: error = CS_ERR_NOT_EXIST; break; case CPD_STATE_JOIN_STARTED: error = CS_OK; break; case CPD_STATE_JOIN_COMPLETED: error = CS_OK; break; } res_lib_cpg_mcast.header.size = sizeof(res_lib_cpg_mcast); res_lib_cpg_mcast.header.id = MESSAGE_RES_CPG_MCAST; if (error == CS_OK) { req_exec_cpg_mcast.header.size = sizeof(req_exec_cpg_mcast) + req_lib_cpg_mcast->msglen; req_exec_cpg_mcast.header.id = SERVICE_ID_MAKE(CPG_SERVICE, MESSAGE_REQ_EXEC_CPG_MCAST); req_exec_cpg_mcast.pid = cpd->pid; req_exec_cpg_mcast.msglen = req_lib_cpg_mcast->msglen; api->ipc_source_set (&req_exec_cpg_mcast.source, conn); memcpy(&req_exec_cpg_mcast.group_name, &cpd->group_name, sizeof(mar_cpg_name_t)); req_exec_cpg_iovec[0].iov_base = (char *)&req_exec_cpg_mcast; req_exec_cpg_iovec[0].iov_len = sizeof(req_exec_cpg_mcast); req_exec_cpg_iovec[1].iov_base = (char *)header + sizeof(struct 
req_lib_cpg_mcast); req_exec_cpg_iovec[1].iov_len = req_exec_cpg_mcast.msglen; result = api->totem_mcast (req_exec_cpg_iovec, 2, TOTEM_AGREED); if (result == 0) { res_lib_cpg_mcast.header.error = CS_OK; } else { res_lib_cpg_mcast.header.error = CS_ERR_TRY_AGAIN; } } else { res_lib_cpg_mcast.header.error = error; } api->ipc_response_send (conn, &res_lib_cpg_mcast, sizeof (res_lib_cpg_mcast)); } static void message_handler_req_lib_cpg_membership (void *conn, const void *message) { struct req_lib_cpg_membership_get *req_lib_cpg_membership_get = (struct req_lib_cpg_membership_get *)message; struct res_lib_cpg_membership_get res_lib_cpg_membership_get; struct qb_list_head *iter; int member_count = 0; res_lib_cpg_membership_get.header.id = MESSAGE_RES_CPG_MEMBERSHIP; res_lib_cpg_membership_get.header.error = CS_OK; res_lib_cpg_membership_get.header.size = sizeof (struct res_lib_cpg_membership_get); - qb_list_for_each(iter, &process_info_list_head) { + qb_list_for_each(iter, &process_info_list_head) { struct process_info *pi = qb_list_entry (iter, struct process_info, list); if (mar_name_compare (&pi->group, &req_lib_cpg_membership_get->group_name) == 0) { res_lib_cpg_membership_get.member_list[member_count].nodeid = pi->nodeid; res_lib_cpg_membership_get.member_list[member_count].pid = pi->pid; member_count += 1; } } res_lib_cpg_membership_get.member_count = member_count; api->ipc_response_send (conn, &res_lib_cpg_membership_get, sizeof (res_lib_cpg_membership_get)); } static void message_handler_req_lib_cpg_local_get (void *conn, const void *message) { struct res_lib_cpg_local_get res_lib_cpg_local_get; res_lib_cpg_local_get.header.size = sizeof (res_lib_cpg_local_get); res_lib_cpg_local_get.header.id = MESSAGE_RES_CPG_LOCAL_GET; res_lib_cpg_local_get.header.error = CS_OK; res_lib_cpg_local_get.local_nodeid = api->totem_nodeid_get (); api->ipc_response_send (conn, &res_lib_cpg_local_get, sizeof (res_lib_cpg_local_get)); } static void 
message_handler_req_lib_cpg_iteration_initialize ( void *conn, const void *message) { const struct req_lib_cpg_iterationinitialize *req_lib_cpg_iterationinitialize = message; struct cpg_pd *cpd = (struct cpg_pd *)api->ipc_private_data_get (conn); hdb_handle_t cpg_iteration_handle = 0; struct res_lib_cpg_iterationinitialize res_lib_cpg_iterationinitialize; struct qb_list_head *iter, *iter2; struct cpg_iteration_instance *cpg_iteration_instance; cs_error_t error = CS_OK; int res; log_printf (LOGSYS_LEVEL_DEBUG, "cpg iteration initialize"); /* Because between calling this function and *next can be some operations which will * change list, we must do full copy. */ /* * Create new iteration instance */ res = hdb_handle_create (&cpg_iteration_handle_t_db, sizeof (struct cpg_iteration_instance), &cpg_iteration_handle); if (res != 0) { error = CS_ERR_NO_MEMORY; goto response_send; } res = hdb_handle_get (&cpg_iteration_handle_t_db, cpg_iteration_handle, (void *)&cpg_iteration_instance); if (res != 0) { error = CS_ERR_BAD_HANDLE; goto error_destroy; } qb_list_init (&cpg_iteration_instance->items_list_head); cpg_iteration_instance->handle = cpg_iteration_handle; /* * Create copy of process_info list "grouped by" group name */ - qb_list_for_each(iter, &process_info_list_head) { + qb_list_for_each(iter, &process_info_list_head) { struct process_info *pi = qb_list_entry (iter, struct process_info, list); struct process_info *new_pi; if (req_lib_cpg_iterationinitialize->iteration_type == CPG_ITERATION_NAME_ONLY) { /* * Try to find processed group name in our list new list */ int found = 0; qb_list_for_each(iter2, &(cpg_iteration_instance->items_list_head)) { struct process_info *pi2 = qb_list_entry (iter2, struct process_info, list); if (mar_name_compare (&pi2->group, &pi->group) == 0) { found = 1; break; } } if (found) { /* * We have this name in list -> don't add */ continue ; } } else if (req_lib_cpg_iterationinitialize->iteration_type == CPG_ITERATION_ONE_GROUP) { /* * Test 
pi group name with request */ if (mar_name_compare (&pi->group, &req_lib_cpg_iterationinitialize->group_name) != 0) /* * Not same -> don't add */ continue ; } new_pi = malloc (sizeof (struct process_info)); if (!new_pi) { log_printf(LOGSYS_LEVEL_WARNING, "Unable to allocate process_info struct"); error = CS_ERR_NO_MEMORY; goto error_put_destroy; } memcpy (new_pi, pi, sizeof (struct process_info)); qb_list_init (&new_pi->list); if (req_lib_cpg_iterationinitialize->iteration_type == CPG_ITERATION_NAME_ONLY) { /* * pid and nodeid -> undefined */ new_pi->pid = new_pi->nodeid = 0; } /* * We will return list "grouped" by "group name", so try to find right place to add */ - qb_list_for_each(iter2, &(cpg_iteration_instance->items_list_head)) { - struct process_info *pi2 = qb_list_entry (iter2, struct process_info, list); + qb_list_for_each(iter2, &(cpg_iteration_instance->items_list_head)) { + struct process_info *pi2 = qb_list_entry (iter2, struct process_info, list); - if (mar_name_compare (&pi2->group, &pi->group) == 0) { + if (mar_name_compare (&pi2->group, &pi->group) == 0) { break; - } + } } qb_list_add (&new_pi->list, iter2); } /* * Now we have a full "grouped by" copy of process_info list */ /* * Add instance to current cpd list */ qb_list_init (&cpg_iteration_instance->list); qb_list_add (&cpg_iteration_instance->list, &cpd->iteration_instance_list_head); cpg_iteration_instance->current_pointer = &cpg_iteration_instance->items_list_head; error_put_destroy: hdb_handle_put (&cpg_iteration_handle_t_db, cpg_iteration_handle); error_destroy: if (error != CS_OK) { hdb_handle_destroy (&cpg_iteration_handle_t_db, cpg_iteration_handle); } response_send: res_lib_cpg_iterationinitialize.header.size = sizeof (res_lib_cpg_iterationinitialize); res_lib_cpg_iterationinitialize.header.id = MESSAGE_RES_CPG_ITERATIONINITIALIZE; res_lib_cpg_iterationinitialize.header.error = error; res_lib_cpg_iterationinitialize.iteration_handle = cpg_iteration_handle; api->ipc_response_send (conn, 
&res_lib_cpg_iterationinitialize, sizeof (res_lib_cpg_iterationinitialize)); } static void message_handler_req_lib_cpg_iteration_next ( void *conn, const void *message) { const struct req_lib_cpg_iterationnext *req_lib_cpg_iterationnext = message; struct res_lib_cpg_iterationnext res_lib_cpg_iterationnext; struct cpg_iteration_instance *cpg_iteration_instance; cs_error_t error = CS_OK; int res; struct process_info *pi; log_printf (LOGSYS_LEVEL_DEBUG, "cpg iteration next"); res = hdb_handle_get (&cpg_iteration_handle_t_db, req_lib_cpg_iterationnext->iteration_handle, (void *)&cpg_iteration_instance); if (res != 0) { error = CS_ERR_LIBRARY; goto error_exit; } assert (cpg_iteration_instance); cpg_iteration_instance->current_pointer = cpg_iteration_instance->current_pointer->next; if (cpg_iteration_instance->current_pointer == &cpg_iteration_instance->items_list_head) { error = CS_ERR_NO_SECTIONS; goto error_put; } pi = qb_list_entry (cpg_iteration_instance->current_pointer, struct process_info, list); /* * Copy iteration data */ res_lib_cpg_iterationnext.description.nodeid = pi->nodeid; res_lib_cpg_iterationnext.description.pid = pi->pid; memcpy (&res_lib_cpg_iterationnext.description.group, &pi->group, sizeof (mar_cpg_name_t)); error_put: hdb_handle_put (&cpg_iteration_handle_t_db, req_lib_cpg_iterationnext->iteration_handle); error_exit: res_lib_cpg_iterationnext.header.size = sizeof (res_lib_cpg_iterationnext); res_lib_cpg_iterationnext.header.id = MESSAGE_RES_CPG_ITERATIONNEXT; res_lib_cpg_iterationnext.header.error = error; api->ipc_response_send (conn, &res_lib_cpg_iterationnext, sizeof (res_lib_cpg_iterationnext)); } static void message_handler_req_lib_cpg_iteration_finalize ( void *conn, const void *message) { const struct req_lib_cpg_iterationfinalize *req_lib_cpg_iterationfinalize = message; struct res_lib_cpg_iterationfinalize res_lib_cpg_iterationfinalize; struct cpg_iteration_instance *cpg_iteration_instance; cs_error_t error = CS_OK; int res; log_printf 
(LOGSYS_LEVEL_DEBUG, "cpg iteration finalize"); res = hdb_handle_get (&cpg_iteration_handle_t_db, req_lib_cpg_iterationfinalize->iteration_handle, (void *)&cpg_iteration_instance); if (res != 0) { error = CS_ERR_LIBRARY; goto error_exit; } assert (cpg_iteration_instance); cpg_iteration_instance_finalize (cpg_iteration_instance); hdb_handle_put (&cpg_iteration_handle_t_db, cpg_iteration_instance->handle); error_exit: res_lib_cpg_iterationfinalize.header.size = sizeof (res_lib_cpg_iterationfinalize); res_lib_cpg_iterationfinalize.header.id = MESSAGE_RES_CPG_ITERATIONFINALIZE; res_lib_cpg_iterationfinalize.header.error = error; api->ipc_response_send (conn, &res_lib_cpg_iterationfinalize, sizeof (res_lib_cpg_iterationfinalize)); } diff --git a/exec/icmap.c b/exec/icmap.c index 6bbe9eaf..bb77ef8b 100644 --- a/exec/icmap.c +++ b/exec/icmap.c @@ -1,1335 +1,1337 @@ /* * Copyright (c) 2011 Red Hat, Inc. * * All rights reserved. * * Author: Jan Friesse (jfriesse@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the Red Hat, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #define ICMAP_MAX_VALUE_LEN (16*1024) struct icmap_item { char *key_name; icmap_value_types_t type; size_t value_len; char value[]; }; struct icmap_map { qb_map_t *qb_map; }; static icmap_map_t icmap_global_map; struct icmap_track { char *key_name; int32_t track_type; icmap_notify_fn_t notify_fn; void *user_data; struct qb_list_head list; }; struct icmap_ro_access_item { char *key_name; int prefix; struct qb_list_head list; }; QB_LIST_DECLARE (icmap_ro_access_item_list_head); QB_LIST_DECLARE (icmap_track_list_head); /* * Static functions declarations */ /* * Check if key_name is valid icmap key name. Returns 0 on success, and -1 on fail */ static int icmap_check_key_name(const char *key_name); /* * Check that value with given type has correct length value_len. Returns 0 on success, * and -1 on fail */ static int icmap_check_value_len(const void *value, size_t value_len, icmap_value_types_t type); /* * Returns length of value of given type, or 0 for string and binary data type */ static size_t icmap_get_valuetype_len(icmap_value_types_t type); /* * Converts track type of icmap to qb */ static int32_t icmap_tt_to_qbtt(int32_t track_type); /* * Convert track type of qb to icmap */ static int32_t icmap_qbtt_to_tt(int32_t track_type); /* * Checks if item has same value as value with value_len and given type. Returns 0 if not, otherwise !0. 
*/ static int icmap_item_eq(const struct icmap_item *item, const void *value, size_t value_len, icmap_value_types_t type); /* * Checks if given character is valid in key name. Returns 0 if not, otherwise !0. */ static int icmap_is_valid_name_char(char c); /* * Helper for getting integer and float value with given type for key key_name and store it in value. */ static cs_error_t icmap_get_int_r( const icmap_map_t map, const char *key_name, void *value, icmap_value_types_t type); /* * Return raw item value data. Internal function used by icmap_get_r which does most * of arguments validity checks but doesn't copy data (it returns raw item data * pointer). It's not very safe tho it's static. */ static cs_error_t icmap_get_ref_r( const icmap_map_t map, const char *key_name, void **value, size_t *value_len, icmap_value_types_t *type); /* * Function implementation */ static int32_t icmap_tt_to_qbtt(int32_t track_type) { int32_t res = 0; if (track_type & ICMAP_TRACK_DELETE) { res |= QB_MAP_NOTIFY_DELETED; } if (track_type & ICMAP_TRACK_MODIFY) { res |= QB_MAP_NOTIFY_REPLACED; } if (track_type & ICMAP_TRACK_ADD) { res |= QB_MAP_NOTIFY_INSERTED; } if (track_type & ICMAP_TRACK_PREFIX) { res |= QB_MAP_NOTIFY_RECURSIVE; } return (res); } static int32_t icmap_qbtt_to_tt(int32_t track_type) { int32_t res = 0; if (track_type & QB_MAP_NOTIFY_DELETED) { res |= ICMAP_TRACK_DELETE; } if (track_type & QB_MAP_NOTIFY_REPLACED) { res |= ICMAP_TRACK_MODIFY; } if (track_type & QB_MAP_NOTIFY_INSERTED) { res |= ICMAP_TRACK_ADD; } if (track_type & QB_MAP_NOTIFY_RECURSIVE) { res |= ICMAP_TRACK_PREFIX; } return (res); } static void icmap_map_free_cb(uint32_t event, char* key, void* old_value, void* value, void* user_data) { struct icmap_item *item = (struct icmap_item *)old_value; /* * value == old_value -> fast_adjust_int was used, don't free data */ if (item != NULL && value != old_value) { free(item->key_name); free(item); } } cs_error_t icmap_init_r(icmap_map_t *result) { int32_t err; 
*result = malloc(sizeof(struct icmap_map)); if (*result == NULL) { return (CS_ERR_NO_MEMORY); } (*result)->qb_map = qb_trie_create(); if ((*result)->qb_map == NULL) return (CS_ERR_INIT); err = qb_map_notify_add((*result)->qb_map, NULL, icmap_map_free_cb, QB_MAP_NOTIFY_FREE, NULL); return (qb_to_cs_error(err)); } cs_error_t icmap_init(void) { return (icmap_init_r(&icmap_global_map)); } static void icmap_set_ro_access_free(void) { - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; struct icmap_ro_access_item *icmap_ro_ai; - qb_list_for_each(iter, &icmap_ro_access_item_list_head) { + qb_list_for_each_safe(iter, tmp_iter, &icmap_ro_access_item_list_head) { icmap_ro_ai = qb_list_entry(iter, struct icmap_ro_access_item, list); qb_list_del(&icmap_ro_ai->list); free(icmap_ro_ai->key_name); - free(icmap_ro_ai); + free(icmap_ro_ai); } } static void icmap_del_all_track(void) { - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; struct icmap_track *icmap_track; - qb_list_for_each(iter, &icmap_track_list_head) { + + qb_list_for_each_safe(iter, tmp_iter, &icmap_track_list_head) { icmap_track = qb_list_entry(iter, struct icmap_track, list); - icmap_track_delete(icmap_track); + + icmap_track_delete(icmap_track); } } void icmap_fini_r(const icmap_map_t map) { qb_map_destroy(map->qb_map); free(map); return; } void icmap_fini(void) { icmap_del_all_track(); /* * catch 22 warning: * We need to drop this notify but we can't because it calls icmap_map_free_cb * while destroying the tree to free icmap_item(s). * -> qb_map_notify_del_2(icmap_map, NULL, icmap_map_free_cb, QB_MAP_NOTIFY_FREE, NULL); * and we cannot call it after map_destroy. joy! :) */ icmap_fini_r(icmap_global_map); icmap_set_ro_access_free(); return ; } icmap_map_t icmap_get_global_map(void) { return (icmap_global_map); } static int icmap_is_valid_name_char(char c) { return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '.' 
|| c == '_' || c == '-' || c == '/' || c == ':'); } void icmap_convert_name_to_valid_name(char *key_name) { int i; for (i = 0; i < strlen(key_name); i++) { if (!icmap_is_valid_name_char(key_name[i])) { key_name[i] = '_'; } } } static int icmap_check_key_name(const char *key_name) { int i; if ((strlen(key_name) < ICMAP_KEYNAME_MINLEN) || strlen(key_name) > ICMAP_KEYNAME_MAXLEN) { return (-1); } for (i = 0; i < strlen(key_name); i++) { if (!icmap_is_valid_name_char(key_name[i])) { return (-1); } } return (0); } static size_t icmap_get_valuetype_len(icmap_value_types_t type) { size_t res = 0; switch (type) { case ICMAP_VALUETYPE_INT8: res = sizeof(int8_t); break; case ICMAP_VALUETYPE_UINT8: res = sizeof(uint8_t); break; case ICMAP_VALUETYPE_INT16: res = sizeof(int16_t); break; case ICMAP_VALUETYPE_UINT16: res = sizeof(uint16_t); break; case ICMAP_VALUETYPE_INT32: res = sizeof(int32_t); break; case ICMAP_VALUETYPE_UINT32: res = sizeof(uint32_t); break; case ICMAP_VALUETYPE_INT64: res = sizeof(int64_t); break; case ICMAP_VALUETYPE_UINT64: res = sizeof(uint64_t); break; case ICMAP_VALUETYPE_FLOAT: res = sizeof(float); break; case ICMAP_VALUETYPE_DOUBLE: res = sizeof(double); break; case ICMAP_VALUETYPE_STRING: case ICMAP_VALUETYPE_BINARY: res = 0; break; } return (res); } static int icmap_check_value_len(const void *value, size_t value_len, icmap_value_types_t type) { if (value_len > ICMAP_MAX_VALUE_LEN) { return (-1); } if (type != ICMAP_VALUETYPE_STRING && type != ICMAP_VALUETYPE_BINARY) { if (icmap_get_valuetype_len(type) == value_len) { return (0); } else { return (-1); } } if (type == ICMAP_VALUETYPE_STRING) { /* * value_len can be shorter then real string length, but never * longer (+ 1 is because of 0 at the end of string) */ if (value_len > strlen((const char *)value) + 1) { return (-1); } else { return (0); } } return (0); } static int icmap_item_eq(const struct icmap_item *item, const void *value, size_t value_len, icmap_value_types_t type) { size_t ptr_len; if 
(item->type != type) { return (0); } if (item->type == ICMAP_VALUETYPE_STRING) { ptr_len = strlen((const char *)value); if (ptr_len > value_len) { ptr_len = value_len; } ptr_len++; } else { ptr_len = value_len; } if (item->value_len == ptr_len) { return (memcmp(item->value, value, value_len) == 0); }; return (0); } int icmap_key_value_eq( const icmap_map_t map1, const char *key_name1, const icmap_map_t map2, const char *key_name2) { struct icmap_item *item1, *item2; if (map1 == NULL || key_name1 == NULL || map2 == NULL || key_name2 == NULL) { return (0); } item1 = qb_map_get(map1->qb_map, key_name1); item2 = qb_map_get(map2->qb_map, key_name2); if (item1 == NULL || item2 == NULL) { return (0); } return (icmap_item_eq(item1, item2->value, item2->value_len, item2->type)); } cs_error_t icmap_set_r( const icmap_map_t map, const char *key_name, const void *value, size_t value_len, icmap_value_types_t type) { struct icmap_item *item; struct icmap_item *new_item; size_t new_value_len; size_t new_item_size; if (value == NULL || key_name == NULL) { return (CS_ERR_INVALID_PARAM); } if (icmap_check_value_len(value, value_len, type) != 0) { return (CS_ERR_INVALID_PARAM); } item = qb_map_get(map->qb_map, key_name); if (item != NULL) { /* * Check that key is really changed */ if (icmap_item_eq(item, value, value_len, type)) { return (CS_OK); } } else { if (icmap_check_key_name(key_name) != 0) { return (CS_ERR_NAME_TOO_LONG); } } if (type == ICMAP_VALUETYPE_BINARY || type == ICMAP_VALUETYPE_STRING) { if (type == ICMAP_VALUETYPE_STRING) { new_value_len = strlen((const char *)value); if (new_value_len > value_len) { new_value_len = value_len; } new_value_len++; } else { new_value_len = value_len; } } else { new_value_len = icmap_get_valuetype_len(type); } new_item_size = sizeof(struct icmap_item) + new_value_len; new_item = malloc(new_item_size); if (new_item == NULL) { return (CS_ERR_NO_MEMORY); } memset(new_item, 0, new_item_size); if (item == NULL) { new_item->key_name = 
strdup(key_name); if (new_item->key_name == NULL) { free(new_item); return (CS_ERR_NO_MEMORY); } } else { new_item->key_name = item->key_name; item->key_name = NULL; } new_item->type = type; new_item->value_len = new_value_len; memcpy(new_item->value, value, new_value_len); if (new_item->type == ICMAP_VALUETYPE_STRING) { ((char *)new_item->value)[new_value_len - 1] = 0; } qb_map_put(map->qb_map, new_item->key_name, new_item); return (CS_OK); } cs_error_t icmap_set( const char *key_name, const void *value, size_t value_len, icmap_value_types_t type) { return (icmap_set_r(icmap_global_map, key_name, value, value_len, type)); } cs_error_t icmap_set_int8_r(const icmap_map_t map, const char *key_name, int8_t value) { return (icmap_set_r(map, key_name, &value, sizeof(value), ICMAP_VALUETYPE_INT8)); } cs_error_t icmap_set_uint8_r(const icmap_map_t map, const char *key_name, uint8_t value) { return (icmap_set_r(map, key_name, &value, sizeof(value), ICMAP_VALUETYPE_UINT8)); } cs_error_t icmap_set_int16_r(const icmap_map_t map, const char *key_name, int16_t value) { return (icmap_set_r(map,key_name, &value, sizeof(value), ICMAP_VALUETYPE_INT16)); } cs_error_t icmap_set_uint16_r(const icmap_map_t map, const char *key_name, uint16_t value) { return (icmap_set_r(map, key_name, &value, sizeof(value), ICMAP_VALUETYPE_UINT16)); } cs_error_t icmap_set_int32_r(const icmap_map_t map, const char *key_name, int32_t value) { return (icmap_set_r(map, key_name, &value, sizeof(value), ICMAP_VALUETYPE_INT32)); } cs_error_t icmap_set_uint32_r(const icmap_map_t map, const char *key_name, uint32_t value) { return (icmap_set_r(map, key_name, &value, sizeof(value), ICMAP_VALUETYPE_UINT32)); } cs_error_t icmap_set_int64_r(const icmap_map_t map, const char *key_name, int64_t value) { return (icmap_set_r(map, key_name, &value, sizeof(value), ICMAP_VALUETYPE_INT64)); } cs_error_t icmap_set_uint64_r(const icmap_map_t map, const char *key_name, uint64_t value) { return (icmap_set_r(map, key_name, 
&value, sizeof(value), ICMAP_VALUETYPE_UINT64)); } cs_error_t icmap_set_float_r(const icmap_map_t map, const char *key_name, float value) { return (icmap_set_r(map, key_name, &value, sizeof(value), ICMAP_VALUETYPE_FLOAT)); } cs_error_t icmap_set_double_r(const icmap_map_t map, const char *key_name, double value) { return (icmap_set_r(map, key_name, &value, sizeof(value), ICMAP_VALUETYPE_DOUBLE)); } cs_error_t icmap_set_string_r(const icmap_map_t map, const char *key_name, const char *value) { if (value == NULL) { return (CS_ERR_INVALID_PARAM); } return (icmap_set_r(map, key_name, value, strlen(value), ICMAP_VALUETYPE_STRING)); } cs_error_t icmap_set_int8(const char *key_name, int8_t value) { return (icmap_set_int8_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_uint8(const char *key_name, uint8_t value) { return (icmap_set_uint8_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_int16(const char *key_name, int16_t value) { return (icmap_set_int16_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_uint16(const char *key_name, uint16_t value) { return (icmap_set_uint16_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_int32(const char *key_name, int32_t value) { return (icmap_set_int32_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_uint32(const char *key_name, uint32_t value) { return (icmap_set_uint32_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_int64(const char *key_name, int64_t value) { return (icmap_set_int64_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_uint64(const char *key_name, uint64_t value) { return (icmap_set_uint64_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_float(const char *key_name, float value) { return (icmap_set_float_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_double(const char *key_name, double value) { return (icmap_set_double_r(icmap_global_map, key_name, value)); } cs_error_t icmap_set_string(const char 
*key_name, const char *value) { return (icmap_set_string_r(icmap_global_map, key_name, value)); } cs_error_t icmap_delete_r(const icmap_map_t map, const char *key_name) { struct icmap_item *item; if (key_name == NULL) { return (CS_ERR_INVALID_PARAM); } item = qb_map_get(map->qb_map, key_name); if (item == NULL) { return (CS_ERR_NOT_EXIST); } if (qb_map_rm(map->qb_map, item->key_name) != QB_TRUE) { return (CS_ERR_NOT_EXIST); } return (CS_OK); } cs_error_t icmap_delete(const char *key_name) { return (icmap_delete_r(icmap_global_map, key_name)); } static cs_error_t icmap_get_ref_r( const icmap_map_t map, const char *key_name, void **value, size_t *value_len, icmap_value_types_t *type) { struct icmap_item *item; if (key_name == NULL) { return (CS_ERR_INVALID_PARAM); } item = qb_map_get(map->qb_map, key_name); if (item == NULL) { return (CS_ERR_NOT_EXIST); } if (type != NULL) { *type = item->type; } if (value_len != NULL) { *value_len = item->value_len; } if (value != NULL) { *value = item->value; } return (CS_OK); } cs_error_t icmap_get_r( const icmap_map_t map, const char *key_name, void *value, size_t *value_len, icmap_value_types_t *type) { cs_error_t res; void *tmp_value; size_t tmp_value_len; res = icmap_get_ref_r(map, key_name, &tmp_value, &tmp_value_len, type); if (res != CS_OK) { return (res); } if (value == NULL) { if (value_len != NULL) { *value_len = tmp_value_len; } } else { if (value_len == NULL || *value_len < tmp_value_len) { return (CS_ERR_INVALID_PARAM); } *value_len = tmp_value_len; memcpy(value, tmp_value, tmp_value_len); } return (CS_OK); } cs_error_t icmap_get( const char *key_name, void *value, size_t *value_len, icmap_value_types_t *type) { return (icmap_get_r(icmap_global_map, key_name, value, value_len, type)); } static cs_error_t icmap_get_int_r( const icmap_map_t map, const char *key_name, void *value, icmap_value_types_t type) { char key_value[16]; size_t key_size; cs_error_t err; icmap_value_types_t key_type; key_size = sizeof(key_value); 
memset(key_value, 0, key_size); err = icmap_get(key_name, key_value, &key_size, &key_type); if (err != CS_OK) return (err); if (key_type != type) { return (CS_ERR_INVALID_PARAM); } memcpy(value, key_value, icmap_get_valuetype_len(key_type)); return (CS_OK); } cs_error_t icmap_get_int8_r(const icmap_map_t map, const char *key_name, int8_t *i8) { return (icmap_get_int_r(map, key_name, i8, ICMAP_VALUETYPE_INT8)); } cs_error_t icmap_get_uint8_r(const icmap_map_t map, const char *key_name, uint8_t *u8) { return (icmap_get_int_r(map, key_name, u8, ICMAP_VALUETYPE_UINT8)); } cs_error_t icmap_get_int16_r(const icmap_map_t map, const char *key_name, int16_t *i16) { return (icmap_get_int_r(map, key_name, i16, ICMAP_VALUETYPE_INT16)); } cs_error_t icmap_get_uint16_r(const icmap_map_t map, const char *key_name, uint16_t *u16) { return (icmap_get_int_r(map, key_name, u16, ICMAP_VALUETYPE_UINT16)); } cs_error_t icmap_get_int32_r(const icmap_map_t map, const char *key_name, int32_t *i32) { return (icmap_get_int_r(map, key_name, i32, ICMAP_VALUETYPE_INT32)); } cs_error_t icmap_get_uint32_r(const icmap_map_t map, const char *key_name, uint32_t *u32) { return (icmap_get_int_r(map, key_name, u32, ICMAP_VALUETYPE_UINT32)); } cs_error_t icmap_get_int64_r(const icmap_map_t map, const char *key_name, int64_t *i64) { return(icmap_get_int_r(map, key_name, i64, ICMAP_VALUETYPE_INT64)); } cs_error_t icmap_get_uint64_r(const icmap_map_t map, const char *key_name, uint64_t *u64) { return (icmap_get_int_r(map, key_name, u64, ICMAP_VALUETYPE_UINT64)); } cs_error_t icmap_get_float_r(const icmap_map_t map, const char *key_name, float *flt) { return (icmap_get_int_r(map, key_name, flt, ICMAP_VALUETYPE_FLOAT)); } cs_error_t icmap_get_double_r(const icmap_map_t map, const char *key_name, double *dbl) { return (icmap_get_int_r(map, key_name, dbl, ICMAP_VALUETYPE_DOUBLE)); } cs_error_t icmap_get_int8(const char *key_name, int8_t *i8) { return (icmap_get_int8_r(icmap_global_map, key_name, i8)); } 
cs_error_t icmap_get_uint8(const char *key_name, uint8_t *u8) { return (icmap_get_uint8_r(icmap_global_map, key_name, u8)); } cs_error_t icmap_get_int16(const char *key_name, int16_t *i16) { return (icmap_get_int16_r(icmap_global_map, key_name, i16)); } cs_error_t icmap_get_uint16(const char *key_name, uint16_t *u16) { return (icmap_get_uint16_r(icmap_global_map, key_name, u16)); } cs_error_t icmap_get_int32(const char *key_name, int32_t *i32) { return (icmap_get_int32_r(icmap_global_map, key_name, i32)); } cs_error_t icmap_get_uint32(const char *key_name, uint32_t *u32) { return (icmap_get_uint32_r(icmap_global_map, key_name, u32)); } cs_error_t icmap_get_int64(const char *key_name, int64_t *i64) { return(icmap_get_int64_r(icmap_global_map, key_name, i64)); } cs_error_t icmap_get_uint64(const char *key_name, uint64_t *u64) { return (icmap_get_uint64_r(icmap_global_map, key_name, u64)); } cs_error_t icmap_get_float(const char *key_name, float *flt) { return (icmap_get_float_r(icmap_global_map, key_name, flt)); } cs_error_t icmap_get_double(const char *key_name, double *dbl) { return (icmap_get_double_r(icmap_global_map, key_name, dbl)); } cs_error_t icmap_get_string(const char *key_name, char **str) { cs_error_t res; size_t str_len; icmap_value_types_t type; res = icmap_get(key_name, NULL, &str_len, &type); if (res != CS_OK || type != ICMAP_VALUETYPE_STRING) { if (res == CS_OK) { res = CS_ERR_INVALID_PARAM; } goto return_error; } *str = malloc(str_len); if (*str == NULL) { res = CS_ERR_NO_MEMORY; goto return_error; } res = icmap_get(key_name, *str, &str_len, &type); if (res != CS_OK) { free(*str); goto return_error; } return (CS_OK); return_error: return (res); } cs_error_t icmap_adjust_int_r( const icmap_map_t map, const char *key_name, int32_t step) { struct icmap_item *item; uint8_t u8; uint16_t u16; uint32_t u32; uint64_t u64; cs_error_t err = CS_OK; if (key_name == NULL) { return (CS_ERR_INVALID_PARAM); } item = qb_map_get(map->qb_map, key_name); if (item == 
NULL) { return (CS_ERR_NOT_EXIST); } switch (item->type) { case ICMAP_VALUETYPE_INT8: case ICMAP_VALUETYPE_UINT8: memcpy(&u8, item->value, sizeof(u8)); u8 += step; err = icmap_set(key_name, &u8, sizeof(u8), item->type); break; case ICMAP_VALUETYPE_INT16: case ICMAP_VALUETYPE_UINT16: memcpy(&u16, item->value, sizeof(u16)); u16 += step; err = icmap_set(key_name, &u16, sizeof(u16), item->type); break; case ICMAP_VALUETYPE_INT32: case ICMAP_VALUETYPE_UINT32: memcpy(&u32, item->value, sizeof(u32)); u32 += step; err = icmap_set(key_name, &u32, sizeof(u32), item->type); break; case ICMAP_VALUETYPE_INT64: case ICMAP_VALUETYPE_UINT64: memcpy(&u64, item->value, sizeof(u64)); u64 += step; err = icmap_set(key_name, &u64, sizeof(u64), item->type); break; case ICMAP_VALUETYPE_FLOAT: case ICMAP_VALUETYPE_DOUBLE: case ICMAP_VALUETYPE_STRING: case ICMAP_VALUETYPE_BINARY: err = CS_ERR_INVALID_PARAM; break; } return (err); } cs_error_t icmap_adjust_int( const char *key_name, int32_t step) { return (icmap_adjust_int_r(icmap_global_map, key_name, step)); } cs_error_t icmap_fast_adjust_int_r( const icmap_map_t map, const char *key_name, int32_t step) { struct icmap_item *item; cs_error_t err = CS_OK; if (key_name == NULL) { return (CS_ERR_INVALID_PARAM); } item = qb_map_get(map->qb_map, key_name); if (item == NULL) { return (CS_ERR_NOT_EXIST); } switch (item->type) { case ICMAP_VALUETYPE_INT8: case ICMAP_VALUETYPE_UINT8: *(uint8_t *)item->value += step; break; case ICMAP_VALUETYPE_INT16: case ICMAP_VALUETYPE_UINT16: *(uint16_t *)item->value += step; break; case ICMAP_VALUETYPE_INT32: case ICMAP_VALUETYPE_UINT32: *(uint32_t *)item->value += step; break; case ICMAP_VALUETYPE_INT64: case ICMAP_VALUETYPE_UINT64: *(uint64_t *)item->value += step; break; case ICMAP_VALUETYPE_FLOAT: case ICMAP_VALUETYPE_DOUBLE: case ICMAP_VALUETYPE_STRING: case ICMAP_VALUETYPE_BINARY: err = CS_ERR_INVALID_PARAM; break; } if (err == CS_OK) { qb_map_put(map->qb_map, item->key_name, item); } return (err); } 
cs_error_t icmap_fast_adjust_int( const char *key_name, int32_t step) { return (icmap_fast_adjust_int_r(icmap_global_map, key_name, step)); } cs_error_t icmap_inc_r(const icmap_map_t map, const char *key_name) { return (icmap_adjust_int_r(map, key_name, 1)); } cs_error_t icmap_inc(const char *key_name) { return (icmap_inc_r(icmap_global_map, key_name)); } cs_error_t icmap_dec_r(const icmap_map_t map, const char *key_name) { return (icmap_adjust_int_r(map, key_name, -1)); } cs_error_t icmap_dec(const char *key_name) { return (icmap_dec_r(icmap_global_map, key_name)); } cs_error_t icmap_fast_inc_r(const icmap_map_t map, const char *key_name) { return (icmap_fast_adjust_int_r(map, key_name, 1)); } cs_error_t icmap_fast_inc(const char *key_name) { return (icmap_fast_inc_r(icmap_global_map, key_name)); } cs_error_t icmap_fast_dec_r(const icmap_map_t map, const char *key_name) { return (icmap_fast_adjust_int_r(map, key_name, -1)); } cs_error_t icmap_fast_dec(const char *key_name) { return (icmap_fast_dec_r(icmap_global_map, key_name)); } icmap_iter_t icmap_iter_init_r(const icmap_map_t map, const char *prefix) { return (qb_map_pref_iter_create(map->qb_map, prefix)); } icmap_iter_t icmap_iter_init(const char *prefix) { return (icmap_iter_init_r(icmap_global_map, prefix)); } const char *icmap_iter_next(icmap_iter_t iter, size_t *value_len, icmap_value_types_t *type) { struct icmap_item *item; const char *res; res = qb_map_iter_next(iter, (void **)&item); if (res == NULL) { return (res); } if (value_len != NULL) { *value_len = item->value_len; } if (type != NULL) { *type = item->type; } return (res); } void icmap_iter_finalize(icmap_iter_t iter) { qb_map_iter_free(iter); } static void icmap_notify_fn(uint32_t event, char *key, void *old_value, void *value, void *user_data) { icmap_track_t icmap_track = (icmap_track_t)user_data; struct icmap_item *new_item = (struct icmap_item *)value; struct icmap_item *old_item = (struct icmap_item *)old_value; struct icmap_notify_value 
new_val; struct icmap_notify_value old_val; if (value == NULL && old_value == NULL) { return ; } if (new_item != NULL) { new_val.type = new_item->type; new_val.len = new_item->value_len; new_val.data = new_item->value; } else { memset(&new_val, 0, sizeof(new_val)); } /* * old_item == new_item if fast functions are used -> don't fill old value */ if (old_item != NULL && old_item != new_item) { old_val.type = old_item->type; old_val.len = old_item->value_len; old_val.data = old_item->value; } else { memset(&old_val, 0, sizeof(old_val)); } icmap_track->notify_fn(icmap_qbtt_to_tt(event), key, new_val, old_val, icmap_track->user_data); } cs_error_t icmap_track_add( const char *key_name, int32_t track_type, icmap_notify_fn_t notify_fn, void *user_data, icmap_track_t *icmap_track) { int32_t err; if (notify_fn == NULL || icmap_track == NULL) { return (CS_ERR_INVALID_PARAM); } if ((track_type & ~(ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX)) != 0) { return (CS_ERR_INVALID_PARAM); } *icmap_track = malloc(sizeof(**icmap_track)); if (*icmap_track == NULL) { return (CS_ERR_NO_MEMORY); } memset(*icmap_track, 0, sizeof(**icmap_track)); if (key_name != NULL) { (*icmap_track)->key_name = strdup(key_name); }; (*icmap_track)->track_type = track_type; (*icmap_track)->notify_fn = notify_fn; (*icmap_track)->user_data = user_data; if ((err = qb_map_notify_add(icmap_global_map->qb_map, (*icmap_track)->key_name, icmap_notify_fn, icmap_tt_to_qbtt(track_type), *icmap_track)) != 0) { free((*icmap_track)->key_name); free(*icmap_track); return (qb_to_cs_error(err)); } qb_list_init(&(*icmap_track)->list); qb_list_add (&(*icmap_track)->list, &icmap_track_list_head); return (CS_OK); } cs_error_t icmap_track_delete(icmap_track_t icmap_track) { int32_t err; if ((err = qb_map_notify_del_2(icmap_global_map->qb_map, icmap_track->key_name, icmap_notify_fn, icmap_tt_to_qbtt(icmap_track->track_type), icmap_track)) != 0) { return (qb_to_cs_error(err)); } 
qb_list_del(&icmap_track->list); free(icmap_track->key_name); free(icmap_track); return (CS_OK); } void *icmap_track_get_user_data(icmap_track_t icmap_track) { return (icmap_track->user_data); } cs_error_t icmap_set_ro_access(const char *key_name, int prefix, int ro_access) { - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; struct icmap_ro_access_item *icmap_ro_ai; - qb_list_for_each(iter, &icmap_ro_access_item_list_head) { + qb_list_for_each_safe(iter, tmp_iter, &icmap_ro_access_item_list_head) { icmap_ro_ai = qb_list_entry(iter, struct icmap_ro_access_item, list); if (icmap_ro_ai->prefix == prefix && strcmp(key_name, icmap_ro_ai->key_name) == 0) { /* * We found item */ if (ro_access) { return (CS_ERR_EXIST); } else { qb_list_del(&icmap_ro_ai->list); free(icmap_ro_ai->key_name); free(icmap_ro_ai); return (CS_OK); } } } if (!ro_access) { return (CS_ERR_NOT_EXIST); } icmap_ro_ai = malloc(sizeof(*icmap_ro_ai)); if (icmap_ro_ai == NULL) { return (CS_ERR_NO_MEMORY); } memset(icmap_ro_ai, 0, sizeof(*icmap_ro_ai)); icmap_ro_ai->key_name = strdup(key_name); if (icmap_ro_ai->key_name == NULL) { free(icmap_ro_ai); return (CS_ERR_NO_MEMORY); } icmap_ro_ai->prefix = prefix; qb_list_init(&icmap_ro_ai->list); qb_list_add (&icmap_ro_ai->list, &icmap_ro_access_item_list_head); return (CS_OK); } int icmap_is_key_ro(const char *key_name) { struct qb_list_head *iter; struct icmap_ro_access_item *icmap_ro_ai; - qb_list_for_each(iter, &icmap_ro_access_item_list_head) { + qb_list_for_each(iter, &icmap_ro_access_item_list_head) { icmap_ro_ai = qb_list_entry(iter, struct icmap_ro_access_item, list); if (icmap_ro_ai->prefix) { if (strlen(icmap_ro_ai->key_name) > strlen(key_name)) continue; if (strncmp(icmap_ro_ai->key_name, key_name, strlen(icmap_ro_ai->key_name)) == 0) { return (CS_TRUE); } } else { if (strcmp(icmap_ro_ai->key_name, key_name) == 0) { return (CS_TRUE); } } } return (CS_FALSE); } cs_error_t icmap_copy_map(icmap_map_t dst_map, const icmap_map_t src_map) 
{ icmap_iter_t iter; size_t value_len; icmap_value_types_t value_type; const char *key_name; cs_error_t err; void *value; iter = icmap_iter_init_r(src_map, NULL); if (iter == NULL) { return (CS_ERR_NO_MEMORY); } err = CS_OK; while ((key_name = icmap_iter_next(iter, &value_len, &value_type)) != NULL) { err = icmap_get_ref_r(src_map, key_name, &value, &value_len, &value_type); if (err != CS_OK) { goto exit_iter_finalize; } err = icmap_set_r(dst_map, key_name, value, value_len, value_type); if (err != CS_OK) { goto exit_iter_finalize; } } exit_iter_finalize: icmap_iter_finalize(iter); return (err); } diff --git a/exec/ipc_glue.c b/exec/ipc_glue.c index 4de1ec02..d0a25d98 100644 --- a/exec/ipc_glue.c +++ b/exec/ipc_glue.c @@ -1,902 +1,902 @@ /* * Copyright (c) 2010-2012 Red Hat, Inc. * * All rights reserved. * * Author: Angus Salkeld * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of Red Hat, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sync.h" #include "timer.h" #include "main.h" #include "util.h" #include "apidef.h" #include "service.h" LOGSYS_DECLARE_SUBSYS ("MAIN"); static struct corosync_api_v1 *api = NULL; static int32_t ipc_not_enough_fds_left = 0; static int32_t ipc_fc_is_quorate; /* boolean */ static int32_t ipc_fc_totem_queue_level; /* percentage used */ static int32_t ipc_fc_sync_in_process; /* boolean */ static int32_t ipc_allow_connections = 0; /* boolean */ #define CS_IPCS_MAPPER_SERV_NAME 256 struct cs_ipcs_mapper { int32_t id; qb_ipcs_service_t *inst; char name[CS_IPCS_MAPPER_SERV_NAME]; }; struct outq_item { void *msg; size_t mlen; struct qb_list_head list; }; static struct cs_ipcs_mapper ipcs_mapper[SERVICES_COUNT_MAX]; static int32_t cs_ipcs_job_add(enum qb_loop_priority p, void *data, qb_loop_job_dispatch_fn fn); static int32_t cs_ipcs_dispatch_add(enum qb_loop_priority p, int32_t fd, int32_t events, void *data, qb_ipcs_dispatch_fn_t fn); static int32_t cs_ipcs_dispatch_mod(enum qb_loop_priority p, int32_t fd, int32_t events, void *data, qb_ipcs_dispatch_fn_t fn); static int32_t cs_ipcs_dispatch_del(int32_t fd); static void outq_flush (void *data); static struct qb_ipcs_poll_handlers corosync_poll_funcs = { .job_add = cs_ipcs_job_add, .dispatch_add = cs_ipcs_dispatch_add, .dispatch_mod = 
cs_ipcs_dispatch_mod, .dispatch_del = cs_ipcs_dispatch_del, }; static int32_t cs_ipcs_connection_accept (qb_ipcs_connection_t *c, uid_t euid, gid_t egid); static void cs_ipcs_connection_created(qb_ipcs_connection_t *c); static int32_t cs_ipcs_msg_process(qb_ipcs_connection_t *c, void *data, size_t size); static int32_t cs_ipcs_connection_closed (qb_ipcs_connection_t *c); static void cs_ipcs_connection_destroyed (qb_ipcs_connection_t *c); static struct qb_ipcs_service_handlers corosync_service_funcs = { .connection_accept = cs_ipcs_connection_accept, .connection_created = cs_ipcs_connection_created, .msg_process = cs_ipcs_msg_process, .connection_closed = cs_ipcs_connection_closed, .connection_destroyed = cs_ipcs_connection_destroyed, }; static const char* cs_ipcs_serv_short_name(int32_t service_id) { const char *name; switch (service_id) { case CFG_SERVICE: name = "cfg"; break; case CPG_SERVICE: name = "cpg"; break; case QUORUM_SERVICE: name = "quorum"; break; case PLOAD_SERVICE: name = "pload"; break; case VOTEQUORUM_SERVICE: name = "votequorum"; break; case MON_SERVICE: name = "mon"; break; case WD_SERVICE: name = "wd"; break; case CMAP_SERVICE: name = "cmap"; break; default: name = NULL; break; } return name; } void cs_ipc_allow_connections(int32_t allow) { ipc_allow_connections = allow; } int32_t cs_ipcs_service_destroy(int32_t service_id) { if (ipcs_mapper[service_id].inst) { qb_ipcs_destroy(ipcs_mapper[service_id].inst); ipcs_mapper[service_id].inst = NULL; } return 0; } static int32_t cs_ipcs_connection_accept (qb_ipcs_connection_t *c, uid_t euid, gid_t egid) { int32_t service = qb_ipcs_service_id_get(c); uint8_t u8; char key_name[ICMAP_KEYNAME_MAXLEN]; if (!ipc_allow_connections) { log_printf(LOGSYS_LEVEL_DEBUG, "Denied connection, corosync is not ready"); return -EAGAIN; } if (corosync_service[service] == NULL || ipcs_mapper[service].inst == NULL) { return -ENOSYS; } if (ipc_not_enough_fds_left) { return -EMFILE; } if (euid == 0 || egid == 0) { return 0; } 
snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "uidgid.uid.%u", euid); if (icmap_get_uint8(key_name, &u8) == CS_OK && u8 == 1) return 0; snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "uidgid.config.uid.%u", euid); if (icmap_get_uint8(key_name, &u8) == CS_OK && u8 == 1) return 0; snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "uidgid.gid.%u", egid); if (icmap_get_uint8(key_name, &u8) == CS_OK && u8 == 1) return 0; snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "uidgid.config.gid.%u", egid); if (icmap_get_uint8(key_name, &u8) == CS_OK && u8 == 1) return 0; log_printf(LOGSYS_LEVEL_ERROR, "Denied connection attempt from %d:%d", euid, egid); return -EACCES; } static char * pid_to_name (pid_t pid, char *out_name, size_t name_len) { char *name; char *rest; FILE *fp; char fname[32]; char buf[256]; snprintf (fname, 32, "/proc/%d/stat", pid); fp = fopen (fname, "r"); if (!fp) { return NULL; } if (fgets (buf, sizeof (buf), fp) == NULL) { fclose (fp); return NULL; } fclose (fp); name = strrchr (buf, '('); if (!name) { return NULL; } /* move past the bracket */ name++; rest = strrchr (buf, ')'); if (rest == NULL || rest[1] != ' ') { return NULL; } *rest = '\0'; /* move past the NULL and space */ rest += 2; /* copy the name */ strncpy (out_name, name, name_len); out_name[name_len - 1] = '\0'; return out_name; } struct cs_ipcs_conn_context { char *icmap_path; struct qb_list_head outq_head; int32_t queuing; uint32_t queued; uint64_t invalid_request; uint64_t overload; uint32_t sent; char data[1]; }; static void cs_ipcs_connection_created(qb_ipcs_connection_t *c) { int32_t service = 0; struct cs_ipcs_conn_context *context; char proc_name[32]; struct qb_ipcs_connection_stats stats; int32_t size = sizeof(struct cs_ipcs_conn_context); char key_name[ICMAP_KEYNAME_MAXLEN]; int set_client_pid = 0; int set_proc_name = 0; log_printf(LOG_DEBUG, "connection created"); service = qb_ipcs_service_id_get(c); size += corosync_service[service]->private_data_size; context = calloc(1, size); if (context == NULL) { 
qb_ipcs_disconnect(c); return; } qb_list_init(&context->outq_head); context->queuing = QB_FALSE; context->queued = 0; context->sent = 0; qb_ipcs_context_set(c, context); if (corosync_service[service]->lib_init_fn(c) != 0) { log_printf(LOG_ERR, "lib_init_fn failed, disconnecting"); qb_ipcs_disconnect(c); return; } icmap_inc("runtime.connections.active"); qb_ipcs_connection_stats_get(c, &stats, QB_FALSE); if (stats.client_pid > 0) { if (pid_to_name (stats.client_pid, proc_name, sizeof(proc_name))) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "runtime.connections.%s:%u:%p", proc_name, stats.client_pid, c); set_proc_name = 1; } else { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "runtime.connections.%u:%p", stats.client_pid, c); } set_client_pid = 1; } else { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "runtime.connections.%p", c); } icmap_convert_name_to_valid_name(key_name); context->icmap_path = strdup(key_name); if (context->icmap_path == NULL) { qb_ipcs_disconnect(c); return; } if (set_proc_name) { snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.name", context->icmap_path); icmap_set_string(key_name, proc_name); } snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.client_pid", context->icmap_path); if (set_client_pid) { icmap_set_uint32(key_name, stats.client_pid); } else { icmap_set_uint32(key_name, 0); } snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.service_id", context->icmap_path); icmap_set_uint32(key_name, service); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.responses", context->icmap_path); icmap_set_uint64(key_name, 0); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.dispatched", context->icmap_path); icmap_set_uint64(key_name, 0); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.requests", context->icmap_path); icmap_set_uint64(key_name, 0); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.send_retries", context->icmap_path); icmap_set_uint64(key_name, 0); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.recv_retries", context->icmap_path); icmap_set_uint64(key_name, 0); 
snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.flow_control", context->icmap_path); icmap_set_uint32(key_name, 0); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.flow_control_count", context->icmap_path); icmap_set_uint64(key_name, 0); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.queue_size", context->icmap_path); icmap_set_uint32(key_name, 0); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.invalid_request", context->icmap_path); icmap_set_uint64(key_name, 0); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.overload", context->icmap_path); icmap_set_uint64(key_name, 0); } void cs_ipc_refcnt_inc(void *conn) { qb_ipcs_connection_ref(conn); } void cs_ipc_refcnt_dec(void *conn) { qb_ipcs_connection_unref(conn); } void *cs_ipcs_private_data_get(void *conn) { struct cs_ipcs_conn_context *cnx; cnx = qb_ipcs_context_get(conn); return &cnx->data[0]; } static void cs_ipcs_connection_destroyed (qb_ipcs_connection_t *c) { struct cs_ipcs_conn_context *context; - struct qb_list_head *list; + struct qb_list_head *list, *tmp_iter; struct outq_item *outq_item; log_printf(LOG_DEBUG, "%s() ", __func__); context = qb_ipcs_context_get(c); if (context) { - qb_list_for_each(list, &(context->outq_head)) { + qb_list_for_each_safe(list, tmp_iter, &(context->outq_head)) { outq_item = qb_list_entry (list, struct outq_item, list); qb_list_del (list); free (outq_item->msg); free (outq_item); } free(context); } } static int32_t cs_ipcs_connection_closed (qb_ipcs_connection_t *c) { int32_t res = 0; int32_t service = qb_ipcs_service_id_get(c); icmap_iter_t iter; char prefix[ICMAP_KEYNAME_MAXLEN]; const char *key_name; struct cs_ipcs_conn_context *cnx; log_printf(LOG_DEBUG, "%s() ", __func__); res = corosync_service[service]->lib_exit_fn(c); if (res != 0) { return res; } qb_loop_job_del(cs_poll_handle_get(), QB_LOOP_HIGH, c, outq_flush); cnx = qb_ipcs_context_get(c); snprintf(prefix, ICMAP_KEYNAME_MAXLEN, "%s.", cnx->icmap_path); iter = icmap_iter_init(prefix); while ((key_name = icmap_iter_next(iter, NULL, 
NULL)) != NULL) { icmap_delete(key_name); } icmap_iter_finalize(iter); free(cnx->icmap_path); icmap_inc("runtime.connections.closed"); icmap_dec("runtime.connections.active"); return 0; } int cs_ipcs_response_iov_send (void *conn, const struct iovec *iov, unsigned int iov_len) { int32_t rc = qb_ipcs_response_sendv(conn, iov, iov_len); if (rc >= 0) { return 0; } return rc; } int cs_ipcs_response_send(void *conn, const void *msg, size_t mlen) { int32_t rc = qb_ipcs_response_send(conn, msg, mlen); if (rc >= 0) { return 0; } return rc; } static void outq_flush (void *data) { qb_ipcs_connection_t *conn = data; - struct qb_list_head *list; + struct qb_list_head *list, *tmp_iter; struct outq_item *outq_item; int32_t rc; struct cs_ipcs_conn_context *context = qb_ipcs_context_get(conn); - qb_list_for_each(list, &(context->outq_head)) { + qb_list_for_each_safe(list, tmp_iter, &(context->outq_head)) { outq_item = qb_list_entry (list, struct outq_item, list); rc = qb_ipcs_event_send(conn, outq_item->msg, outq_item->mlen); if (rc < 0 && rc != -EAGAIN) { errno = -rc; qb_perror(LOG_ERR, "qb_ipcs_event_send"); return; } else if (rc == -EAGAIN) { break; } assert(rc == outq_item->mlen); context->sent++; context->queued--; qb_list_del (list); free (outq_item->msg); free (outq_item); } if (qb_list_empty (&context->outq_head)) { context->queuing = QB_FALSE; log_printf(LOGSYS_LEVEL_INFO, "Q empty, queued:%d sent:%d.", context->queued, context->sent); context->queued = 0; context->sent = 0; } else { qb_loop_job_add(cs_poll_handle_get(), QB_LOOP_HIGH, conn, outq_flush); } } static void msg_send_or_queue(qb_ipcs_connection_t *conn, const struct iovec *iov, uint32_t iov_len) { int32_t rc = 0; int32_t i; int32_t bytes_msg = 0; struct outq_item *outq_item; char *write_buf = 0; struct cs_ipcs_conn_context *context = qb_ipcs_context_get(conn); for (i = 0; i < iov_len; i++) { bytes_msg += iov[i].iov_len; } if (!context->queuing) { assert(qb_list_empty (&context->outq_head)); rc = 
qb_ipcs_event_sendv(conn, iov, iov_len); if (rc == bytes_msg) { context->sent++; return; } if (rc == -EAGAIN) { context->queued = 0; context->sent = 0; context->queuing = QB_TRUE; qb_loop_job_add(cs_poll_handle_get(), QB_LOOP_HIGH, conn, outq_flush); } else { log_printf(LOGSYS_LEVEL_ERROR, "event_send retuned %d, expected %d!", rc, bytes_msg); return; } } outq_item = malloc (sizeof (struct outq_item)); if (outq_item == NULL) { qb_ipcs_disconnect(conn); return; } outq_item->msg = malloc (bytes_msg); if (outq_item->msg == NULL) { free (outq_item); qb_ipcs_disconnect(conn); return; } write_buf = outq_item->msg; for (i = 0; i < iov_len; i++) { memcpy (write_buf, iov[i].iov_base, iov[i].iov_len); write_buf += iov[i].iov_len; } outq_item->mlen = bytes_msg; qb_list_init (&outq_item->list); qb_list_add_tail (&outq_item->list, &context->outq_head); context->queued++; } int cs_ipcs_dispatch_send(void *conn, const void *msg, size_t mlen) { struct iovec iov; iov.iov_base = (void *)msg; iov.iov_len = mlen; msg_send_or_queue (conn, &iov, 1); return 0; } int cs_ipcs_dispatch_iov_send (void *conn, const struct iovec *iov, unsigned int iov_len) { msg_send_or_queue(conn, iov, iov_len); return 0; } static int32_t cs_ipcs_msg_process(qb_ipcs_connection_t *c, void *data, size_t size) { struct qb_ipc_response_header response; struct qb_ipc_request_header *request_pt = (struct qb_ipc_request_header *)data; int32_t service = qb_ipcs_service_id_get(c); int32_t send_ok = 0; int32_t is_async_call = QB_FALSE; ssize_t res = -1; int sending_allowed_private_data; struct cs_ipcs_conn_context *cnx; send_ok = corosync_sending_allowed (service, request_pt->id, request_pt, &sending_allowed_private_data); is_async_call = (service == CPG_SERVICE && request_pt->id == 2); /* * This happens when the message contains some kind of invalid * parameter, such as an invalid size */ if (send_ok == -EINVAL) { response.size = sizeof (response); response.id = 0; response.error = CS_ERR_INVALID_PARAM; cnx = 
qb_ipcs_context_get(c); if (cnx) { cnx->invalid_request++; } if (is_async_call) { log_printf(LOGSYS_LEVEL_INFO, "*** %s() invalid message! size:%d error:%d", __func__, response.size, response.error); } else { qb_ipcs_response_send (c, &response, sizeof (response)); } res = -EINVAL; } else if (send_ok < 0) { cnx = qb_ipcs_context_get(c); if (cnx) { cnx->overload++; } if (!is_async_call) { /* * Overload, tell library to retry */ response.size = sizeof (response); response.id = 0; response.error = CS_ERR_TRY_AGAIN; qb_ipcs_response_send (c, &response, sizeof (response)); } else { log_printf(LOGSYS_LEVEL_WARNING, "*** %s() (%d:%d - %d) %s!", __func__, service, request_pt->id, is_async_call, strerror(-send_ok)); } res = -ENOBUFS; } if (send_ok >= 0) { corosync_service[service]->lib_engine[request_pt->id].lib_handler_fn(c, request_pt); res = 0; } corosync_sending_allowed_release (&sending_allowed_private_data); return res; } static int32_t cs_ipcs_job_add(enum qb_loop_priority p, void *data, qb_loop_job_dispatch_fn fn) { return qb_loop_job_add(cs_poll_handle_get(), p, data, fn); } static int32_t cs_ipcs_dispatch_add(enum qb_loop_priority p, int32_t fd, int32_t events, void *data, qb_ipcs_dispatch_fn_t fn) { return qb_loop_poll_add(cs_poll_handle_get(), p, fd, events, data, fn); } static int32_t cs_ipcs_dispatch_mod(enum qb_loop_priority p, int32_t fd, int32_t events, void *data, qb_ipcs_dispatch_fn_t fn) { return qb_loop_poll_mod(cs_poll_handle_get(), p, fd, events, data, fn); } static int32_t cs_ipcs_dispatch_del(int32_t fd) { return qb_loop_poll_del(cs_poll_handle_get(), fd); } static void cs_ipcs_low_fds_event(int32_t not_enough, int32_t fds_available) { ipc_not_enough_fds_left = not_enough; if (not_enough) { log_printf(LOGSYS_LEVEL_WARNING, "refusing new connections (fds_available:%d)", fds_available); } else { log_printf(LOGSYS_LEVEL_NOTICE, "allowing new connections (fds_available:%d)", fds_available); } } int32_t cs_ipcs_q_level_get(void) { return 
ipc_fc_totem_queue_level; } static qb_loop_timer_handle ipcs_check_for_flow_control_timer; static void cs_ipcs_check_for_flow_control(void) { int32_t i; int32_t fc_enabled; for (i = 0; i < SERVICES_COUNT_MAX; i++) { if (corosync_service[i] == NULL || ipcs_mapper[i].inst == NULL) { continue; } fc_enabled = QB_IPCS_RATE_OFF; if (ipc_fc_is_quorate == 1 || corosync_service[i]->allow_inquorate == CS_LIB_ALLOW_INQUORATE) { /* * we are quorate * now check flow control */ if (ipc_fc_totem_queue_level != TOTEM_Q_LEVEL_CRITICAL && ipc_fc_sync_in_process == 0) { fc_enabled = QB_FALSE; } else if (ipc_fc_totem_queue_level != TOTEM_Q_LEVEL_CRITICAL && i == VOTEQUORUM_SERVICE) { /* * Allow message processing for votequorum service even * in sync phase */ fc_enabled = QB_FALSE; } else { fc_enabled = QB_IPCS_RATE_OFF_2; } } if (fc_enabled) { qb_ipcs_request_rate_limit(ipcs_mapper[i].inst, fc_enabled); qb_loop_timer_add(cs_poll_handle_get(), QB_LOOP_MED, 1*QB_TIME_NS_IN_MSEC, NULL, corosync_recheck_the_q_level, &ipcs_check_for_flow_control_timer); } else if (ipc_fc_totem_queue_level == TOTEM_Q_LEVEL_LOW) { qb_ipcs_request_rate_limit(ipcs_mapper[i].inst, QB_IPCS_RATE_FAST); } else if (ipc_fc_totem_queue_level == TOTEM_Q_LEVEL_GOOD) { qb_ipcs_request_rate_limit(ipcs_mapper[i].inst, QB_IPCS_RATE_NORMAL); } else if (ipc_fc_totem_queue_level == TOTEM_Q_LEVEL_HIGH) { qb_ipcs_request_rate_limit(ipcs_mapper[i].inst, QB_IPCS_RATE_SLOW); } } } static void cs_ipcs_fc_quorum_changed(int quorate, void *context) { ipc_fc_is_quorate = quorate; cs_ipcs_check_for_flow_control(); } static void cs_ipcs_totem_queue_level_changed(enum totem_q_level level) { ipc_fc_totem_queue_level = level; cs_ipcs_check_for_flow_control(); } void cs_ipcs_sync_state_changed(int32_t sync_in_process) { ipc_fc_sync_in_process = sync_in_process; cs_ipcs_check_for_flow_control(); } void cs_ipcs_stats_update(void) { int32_t i; struct qb_ipcs_stats srv_stats; struct qb_ipcs_connection_stats stats; qb_ipcs_connection_t *c, 
*prev; struct cs_ipcs_conn_context *cnx; char key_name[ICMAP_KEYNAME_MAXLEN]; for (i = 0; i < SERVICES_COUNT_MAX; i++) { if (corosync_service[i] == NULL || ipcs_mapper[i].inst == NULL) { continue; } qb_ipcs_stats_get(ipcs_mapper[i].inst, &srv_stats, QB_FALSE); for (c = qb_ipcs_connection_first_get(ipcs_mapper[i].inst); c; prev = c, c = qb_ipcs_connection_next_get(ipcs_mapper[i].inst, prev), qb_ipcs_connection_unref(prev)) { cnx = qb_ipcs_context_get(c); if (cnx == NULL) continue; qb_ipcs_connection_stats_get(c, &stats, QB_FALSE); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.client_pid", cnx->icmap_path); icmap_set_uint32(key_name, stats.client_pid); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.requests", cnx->icmap_path); icmap_set_uint64(key_name, stats.requests); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.responses", cnx->icmap_path); icmap_set_uint64(key_name, stats.responses); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.dispatched", cnx->icmap_path); icmap_set_uint64(key_name, stats.events); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.send_retries", cnx->icmap_path); icmap_set_uint64(key_name, stats.send_retries); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.recv_retries", cnx->icmap_path); icmap_set_uint64(key_name, stats.recv_retries); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.flow_control", cnx->icmap_path); icmap_set_uint32(key_name, stats.flow_control_state); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.flow_control_count", cnx->icmap_path); icmap_set_uint64(key_name, stats.flow_control_count); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.queue_size", cnx->icmap_path); icmap_set_uint32(key_name, cnx->queued); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.invalid_request", cnx->icmap_path); icmap_set_uint64(key_name, cnx->invalid_request); snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s.overload", cnx->icmap_path); icmap_set_uint64(key_name, cnx->overload); } } } static enum qb_ipc_type cs_get_ipc_type (void) { char *str; int found = 0; enum qb_ipc_type 
ret = QB_IPC_NATIVE; if (icmap_get_string("qb.ipc_type", &str) != CS_OK) { log_printf(LOGSYS_LEVEL_DEBUG, "No configured qb.ipc_type. Using native ipc"); return QB_IPC_NATIVE; } if (strcmp(str, "native") == 0) { ret = QB_IPC_NATIVE; found = 1; } if (strcmp(str, "shm") == 0) { ret = QB_IPC_SHM; found = 1; } if (strcmp(str, "socket") == 0) { ret = QB_IPC_SOCKET; found = 1; } if (found) { log_printf(LOGSYS_LEVEL_DEBUG, "Using %s ipc", str); } else { log_printf(LOGSYS_LEVEL_DEBUG, "Unknown ipc type %s", str); } free(str); return ret; } const char *cs_ipcs_service_init(struct corosync_service_engine *service) { const char *serv_short_name; serv_short_name = cs_ipcs_serv_short_name(service->id); if (service->lib_engine_count == 0) { log_printf (LOGSYS_LEVEL_DEBUG, "NOT Initializing IPC on %s [%d]", serv_short_name, service->id); return NULL; } if (strlen(serv_short_name) >= CS_IPCS_MAPPER_SERV_NAME) { log_printf (LOGSYS_LEVEL_ERROR, "service name %s is too long", serv_short_name); return "qb_ipcs_run error"; } ipcs_mapper[service->id].id = service->id; strcpy(ipcs_mapper[service->id].name, serv_short_name); log_printf (LOGSYS_LEVEL_DEBUG, "Initializing IPC on %s [%d]", ipcs_mapper[service->id].name, ipcs_mapper[service->id].id); ipcs_mapper[service->id].inst = qb_ipcs_create(ipcs_mapper[service->id].name, ipcs_mapper[service->id].id, cs_get_ipc_type(), &corosync_service_funcs); assert(ipcs_mapper[service->id].inst); qb_ipcs_poll_handlers_set(ipcs_mapper[service->id].inst, &corosync_poll_funcs); if (qb_ipcs_run(ipcs_mapper[service->id].inst) != 0) { log_printf (LOGSYS_LEVEL_ERROR, "Can't initialize IPC"); return "qb_ipcs_run error"; } return NULL; } void cs_ipcs_init(void) { api = apidef_get (); qb_loop_poll_low_fds_event_set(cs_poll_handle_get(), cs_ipcs_low_fds_event); api->quorum_register_callback (cs_ipcs_fc_quorum_changed, NULL); totempg_queue_level_register_callback (cs_ipcs_totem_queue_level_changed); icmap_set_uint64("runtime.connections.active", 0); 
icmap_set_uint64("runtime.connections.closed", 0); } diff --git a/exec/totemconfig.c b/exec/totemconfig.c index a1e3dabd..5fce2e62 100644 --- a/exec/totemconfig.c +++ b/exec/totemconfig.c @@ -1,1653 +1,1653 @@ /* * Copyright (c) 2002-2005 MontaVista Software, Inc. * Copyright (c) 2006-2013 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * Jan Friesse (jfriesse@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" #include "totemconfig.h" #define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST 4 #define TOKEN_TIMEOUT 1000 #define TOKEN_COEFFICIENT 650 #define JOIN_TIMEOUT 50 #define MERGE_TIMEOUT 200 #define DOWNCHECK_TIMEOUT 1000 #define FAIL_TO_RECV_CONST 2500 #define SEQNO_UNCHANGED_CONST 30 #define MINIMUM_TIMEOUT (int)(1000/HZ)*3 #define MAX_NETWORK_DELAY 50 #define WINDOW_SIZE 50 #define MAX_MESSAGES 17 #define MISS_COUNT_CONST 5 /* These currently match the defaults in libknet.h */ #define KNET_PING_INTERVAL 1000 #define KNET_PING_TIMEOUT 2000 #define KNET_PING_PRECISION 2048 #define DEFAULT_PORT 5405 static char error_string_response[512]; static void add_totem_config_notification(struct totem_config *totem_config); /* All the volatile parameters are uint32s, luckily */ static uint32_t *totem_get_param_by_name(struct totem_config *totem_config, const char *param_name) { if (strcmp(param_name, "totem.token") == 0) return &totem_config->token_timeout; if (strcmp(param_name, "totem.token_retransmit") == 0) return &totem_config->token_retransmit_timeout; if (strcmp(param_name, "totem.hold") == 0) return &totem_config->token_hold_timeout; if (strcmp(param_name, "totem.token_retransmits_before_loss_const") == 0) return &totem_config->token_retransmits_before_loss_const; if (strcmp(param_name, "totem.join") == 0) return &totem_config->join_timeout; if (strcmp(param_name, "totem.send_join") == 0) return &totem_config->send_join_timeout; if (strcmp(param_name, "totem.consensus") == 0) return &totem_config->consensus_timeout; if (strcmp(param_name, "totem.merge") == 0) return &totem_config->merge_timeout; if (strcmp(param_name, "totem.downcheck") == 0) return &totem_config->downcheck_timeout; if (strcmp(param_name, "totem.fail_recv_const") == 0) return &totem_config->fail_to_recv_const; if 
(strcmp(param_name, "totem.seqno_unchanged_const") == 0) return &totem_config->seqno_unchanged_const; if (strcmp(param_name, "totem.heartbeat_failures_allowed") == 0) return &totem_config->heartbeat_failures_allowed; if (strcmp(param_name, "totem.max_network_delay") == 0) return &totem_config->max_network_delay; if (strcmp(param_name, "totem.window_size") == 0) return &totem_config->window_size; if (strcmp(param_name, "totem.max_messages") == 0) return &totem_config->max_messages; if (strcmp(param_name, "totem.miss_count_const") == 0) return &totem_config->miss_count_const; return NULL; } /* * Read key_name from icmap. If key is not found or key_name == delete_key or if allow_zero is false * and readed value is zero, default value is used and stored into totem_config. */ static void totem_volatile_config_set_value (struct totem_config *totem_config, const char *key_name, const char *deleted_key, unsigned int default_value, int allow_zero_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; if (icmap_get_uint32(key_name, totem_get_param_by_name(totem_config, key_name)) != CS_OK || (deleted_key != NULL && strcmp(deleted_key, key_name) == 0) || (!allow_zero_value && *totem_get_param_by_name(totem_config, key_name) == 0)) { *totem_get_param_by_name(totem_config, key_name) = default_value; } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); icmap_set_uint32(runtime_key_name, *totem_get_param_by_name(totem_config, key_name)); } /* * Read and validate config values from cmap and store them into totem_config. If key doesn't exists, * default value is stored. deleted_key is name of key beeing processed by delete operation * from cmap. It is considered as non existing even if it can be read. Can be NULL. 
*/ static void totem_volatile_config_read (struct totem_config *totem_config, const char *deleted_key) { uint32_t u32; totem_volatile_config_set_value(totem_config, "totem.token_retransmits_before_loss_const", deleted_key, TOKEN_RETRANSMITS_BEFORE_LOSS_CONST, 0); totem_volatile_config_set_value(totem_config, "totem.token", deleted_key, TOKEN_TIMEOUT, 0); if (totem_config->interface_count > 0 && totem_config->interfaces[0].member_count > 2) { u32 = TOKEN_COEFFICIENT; icmap_get_uint32("totem.token_coefficient", &u32); totem_config->token_timeout += (totem_config->interfaces[0].member_count - 2) * u32; /* * Store totem_config value to cmap runtime section */ icmap_set_uint32("runtime.config.totem.token", totem_config->token_timeout); } totem_volatile_config_set_value(totem_config, "totem.max_network_delay", deleted_key, MAX_NETWORK_DELAY, 0); totem_volatile_config_set_value(totem_config, "totem.window_size", deleted_key, WINDOW_SIZE, 0); totem_volatile_config_set_value(totem_config, "totem.max_messages", deleted_key, MAX_MESSAGES, 0); totem_volatile_config_set_value(totem_config, "totem.miss_count_const", deleted_key, MISS_COUNT_CONST, 0); totem_volatile_config_set_value(totem_config, "totem.token_retransmit", deleted_key, (int)(totem_config->token_timeout / (totem_config->token_retransmits_before_loss_const + 0.2)), 0); totem_volatile_config_set_value(totem_config, "totem.hold", deleted_key, (int)(totem_config->token_retransmit_timeout * 0.8 - (1000/HZ)), 0); totem_volatile_config_set_value(totem_config, "totem.join", deleted_key, JOIN_TIMEOUT, 0); totem_volatile_config_set_value(totem_config, "totem.consensus", deleted_key, (int)(float)(1.2 * totem_config->token_timeout), 0); totem_volatile_config_set_value(totem_config, "totem.merge", deleted_key, MERGE_TIMEOUT, 0); totem_volatile_config_set_value(totem_config, "totem.downcheck", deleted_key, DOWNCHECK_TIMEOUT, 0); totem_volatile_config_set_value(totem_config, "totem.fail_recv_const", deleted_key, 
FAIL_TO_RECV_CONST, 0); totem_volatile_config_set_value(totem_config, "totem.seqno_unchanged_const", deleted_key, SEQNO_UNCHANGED_CONST, 0); totem_volatile_config_set_value(totem_config, "totem.send_join", deleted_key, 0, 1); totem_volatile_config_set_value(totem_config, "totem.heartbeat_failures_allowed", deleted_key, 0, 1); } static int totem_volatile_config_validate ( struct totem_config *totem_config, const char **error_string) { static char local_error_reason[512]; const char *error_reason = local_error_reason; if (totem_config->max_network_delay < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The max_network_delay parameter (%d ms) may not be less than (%d ms).", totem_config->max_network_delay, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The token timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The token retransmit timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_retransmit_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_hold_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The token hold timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_hold_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->join_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The join timeout parameter (%d ms) may not be less than (%d ms).", totem_config->join_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->consensus_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The consensus timeout parameter (%d ms) may not be less than (%d 
ms).", totem_config->consensus_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->consensus_timeout < totem_config->join_timeout) { snprintf (local_error_reason, sizeof(local_error_reason), "The consensus timeout parameter (%d ms) may not be less than join timeout (%d ms).", totem_config->consensus_timeout, totem_config->join_timeout); goto parse_error; } if (totem_config->merge_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The merge timeout parameter (%d ms) may not be less than (%d ms).", totem_config->merge_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->downcheck_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The downcheck timeout parameter (%d ms) may not be less than (%d ms).", totem_config->downcheck_timeout, MINIMUM_TIMEOUT); goto parse_error; } return 0; parse_error: snprintf (error_string_response, sizeof(error_string_response), "parse error in config: %s\n", error_reason); *error_string = error_string_response; return (-1); } static int totem_get_crypto(struct totem_config *totem_config) { char *str; const char *tmp_cipher; const char *tmp_hash; tmp_hash = "none"; tmp_cipher = "none"; if (icmap_get_string("totem.crypto_cipher", &str) == CS_OK) { if (strcmp(str, "none") == 0) { tmp_cipher = "none"; } if (strcmp(str, "aes256") == 0) { tmp_cipher = "aes256"; } if (strcmp(str, "aes192") == 0) { tmp_cipher = "aes192"; } if (strcmp(str, "aes128") == 0) { tmp_cipher = "aes128"; } if (strcmp(str, "aes256") == 0) { tmp_cipher = "aes256"; } if (strcmp(str, "3des") == 0) { tmp_cipher = "3des"; } free(str); } if (icmap_get_string("totem.crypto_hash", &str) == CS_OK) { if (strcmp(str, "none") == 0) { tmp_hash = "none"; } if (strcmp(str, "md5") == 0) { tmp_hash = "md5"; } if (strcmp(str, "sha1") == 0) { tmp_hash = "sha1"; } if (strcmp(str, "sha256") == 0) { tmp_hash = "sha256"; } if (strcmp(str, "sha384") == 0) { tmp_hash = "sha384"; } if (strcmp(str, 
"sha512") == 0) { tmp_hash = "sha512"; } free(str); } if ((strcmp(tmp_cipher, "none") != 0) && (strcmp(tmp_hash, "none") == 0)) { return -1; } free(totem_config->crypto_cipher_type); free(totem_config->crypto_hash_type); totem_config->crypto_cipher_type = strdup(tmp_cipher); totem_config->crypto_hash_type = strdup(tmp_hash); return 0; } static int totem_config_get_ip_version(void) { int res; char *str; res = AF_INET; if (icmap_get_string("totem.ip_version", &str) == CS_OK) { if (strcmp(str, "ipv4") == 0) { res = AF_INET; } if (strcmp(str, "ipv6") == 0) { res = AF_INET6; } free(str); } return (res); } static uint16_t generate_cluster_id (const char *cluster_name) { int i; int value = 0; for (i = 0; i < strlen(cluster_name); i++) { value <<= 1; value += cluster_name[i]; } return (value & 0xFFFF); } static int get_cluster_mcast_addr ( const char *cluster_name, unsigned int linknumber, int ip_version, struct totem_ip_address *res) { uint16_t clusterid; char addr[INET6_ADDRSTRLEN + 1]; int err; if (cluster_name == NULL) { return (-1); } clusterid = generate_cluster_id(cluster_name) + linknumber; memset (res, 0, sizeof(*res)); switch (ip_version) { case AF_INET: snprintf(addr, sizeof(addr), "239.192.%d.%d", clusterid >> 8, clusterid % 0xFF); break; case AF_INET6: snprintf(addr, sizeof(addr), "ff15::%x", clusterid); break; default: /* * Unknown family */ return (-1); } err = totemip_parse (res, addr, ip_version); return (err); } static unsigned int generate_nodeid_for_duplicate_test( struct totem_config *totem_config, char *addr) { unsigned int nodeid; struct totem_ip_address totemip; /* AF_INET hard-coded here because auto-generated nodeids are only for IPv4 */ if (totemip_parse(&totemip, addr, AF_INET) != 0) return -1; memcpy (&nodeid, &totemip.addr, sizeof (unsigned int)); #if __BYTE_ORDER == __LITTLE_ENDIAN nodeid = swab32 (nodeid); #endif if (totem_config->clear_node_high_bit) { nodeid &= 0x7FFFFFFF; } return nodeid; } static int check_for_duplicate_nodeids( struct 
totem_config *totem_config, const char **error_string) { icmap_iter_t iter; icmap_iter_t subiter; const char *iter_key; int res = 0; int retval = 0; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char *ring0_addr=NULL; char *ring0_addr1=NULL; unsigned int node_pos; unsigned int node_pos1; unsigned int nodeid; unsigned int nodeid1; int autogenerated; iter = icmap_iter_init("nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key); if (res != 2) { continue; } if (strcmp(tmp_key, "ring0_addr") != 0) { continue; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos); autogenerated = 0; if (icmap_get_uint32(tmp_key, &nodeid) != CS_OK) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos); if (icmap_get_string(tmp_key, &ring0_addr) != CS_OK) { continue; } /* Generate nodeid so we can check that auto-generated nodeids don't clash either */ nodeid = generate_nodeid_for_duplicate_test(totem_config, ring0_addr); if (nodeid == -1) { continue; } autogenerated = 1; } node_pos1 = 0; subiter = icmap_iter_init("nodelist.node."); while (((iter_key = icmap_iter_next(subiter, NULL, NULL)) != NULL) && (node_pos1 < node_pos)) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos1, tmp_key); if ((res != 2) || (node_pos1 >= node_pos)) { continue; } if (strcmp(tmp_key, "ring0_addr") != 0) { continue; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos1); if (icmap_get_uint32(tmp_key, &nodeid1) != CS_OK) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos1); if (icmap_get_string(tmp_key, &ring0_addr1) != CS_OK) { continue; } nodeid1 = generate_nodeid_for_duplicate_test(totem_config, ring0_addr1); if (nodeid1 == -1) { continue; } } if (nodeid == nodeid1) { retval = -1; snprintf (error_string_response, sizeof(error_string_response), "Nodeid %u%s%s%s appears twice in corosync.conf", nodeid, 
autogenerated?"(autogenerated from ":"", autogenerated?ring0_addr:"", autogenerated?")":""); log_printf (LOGSYS_LEVEL_ERROR, error_string_response); *error_string = error_string_response; break; } } icmap_iter_finalize(subiter); } icmap_iter_finalize(iter); return retval; } static int find_local_node_in_nodelist(struct totem_config *totem_config) { icmap_iter_t iter; const char *iter_key; int res = 0; unsigned int node_pos; int local_node_pos = -1; struct totem_ip_address bind_addr; int interface_up, interface_num; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char *node_addr_str; struct totem_ip_address node_addr; res = totemip_iface_check(&totem_config->interfaces[0].bindnet, &bind_addr, &interface_up, &interface_num, totem_config->clear_node_high_bit); if (res == -1) { return (-1); } iter = icmap_iter_init("nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key); if (res != 2) { continue; } if (strcmp(tmp_key, "ring0_addr") != 0) { continue; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos); if (icmap_get_string(tmp_key, &node_addr_str) != CS_OK) { continue; } res = totemip_parse (&node_addr, node_addr_str, totem_config->ip_version); free(node_addr_str); if (res == -1) { continue ; } if (totemip_equal(&bind_addr, &node_addr)) { local_node_pos = node_pos; } } icmap_iter_finalize(iter); return (local_node_pos); } /* * Compute difference between two set of totem interface arrays. set1 and set2 * are changed so for same ring, ip existing in both set1 and set2 are cleared * (set to 0), and ips which are only in set1 or set2 remains untouched. * totempg_node_add/remove is called. 
*/ static void compute_interfaces_diff(int interface_count, struct totem_interface *set1, struct totem_interface *set2) { int ring_no, set1_pos, set2_pos; struct totem_ip_address empty_ip_address; memset(&empty_ip_address, 0, sizeof(empty_ip_address)); for (ring_no = 0; ring_no < interface_count; ring_no++) { for (set1_pos = 0; set1_pos < set1[ring_no].member_count; set1_pos++) { for (set2_pos = 0; set2_pos < set2[ring_no].member_count; set2_pos++) { /* * For current ring_no remove all set1 items existing * in set2 */ if (memcmp(&set1[ring_no].member_list[set1_pos], &set2[ring_no].member_list[set2_pos], sizeof(struct totem_ip_address)) == 0) { memset(&set1[ring_no].member_list[set1_pos], 0, sizeof(struct totem_ip_address)); memset(&set2[ring_no].member_list[set2_pos], 0, sizeof(struct totem_ip_address)); } } } } for (ring_no = 0; ring_no < interface_count; ring_no++) { for (set1_pos = 0; set1_pos < set1[ring_no].member_count; set1_pos++) { /* * All items which remained in set1 doesn't exists in set2 any longer so * node has to be removed. */ if (memcmp(&set1[ring_no].member_list[set1_pos], &empty_ip_address, sizeof(empty_ip_address)) != 0) { log_printf(LOGSYS_LEVEL_DEBUG, "removing dynamic member %s for ring %u", totemip_print(&set1[ring_no].member_list[set1_pos]), ring_no); totempg_member_remove(&set1[ring_no].member_list[set1_pos], ring_no); } } for (set2_pos = 0; set2_pos < set2[ring_no].member_count; set2_pos++) { /* * All items which remained in set2 doesn't existed in set1 so this is no node * and has to be added. 
*/ if (memcmp(&set2[ring_no].member_list[set2_pos], &empty_ip_address, sizeof(empty_ip_address)) != 0) { log_printf(LOGSYS_LEVEL_DEBUG, "adding dynamic member %s for ring %u", totemip_print(&set2[ring_no].member_list[set2_pos]), ring_no); totempg_member_add(&set2[ring_no].member_list[set2_pos], ring_no); } } } } static void put_nodelist_members_to_config(struct totem_config *totem_config, int reload) { icmap_iter_t iter, iter2; const char *iter_key, *iter_key2; int res = 0; unsigned int node_pos; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key2[ICMAP_KEYNAME_MAXLEN]; char *node_addr_str; int member_count; unsigned int linknumber = 0; int i, j; struct totem_interface *orig_interfaces = NULL; struct totem_interface *new_interfaces = NULL; if (reload) { /* * We need to compute diff only for reload. Also for initial configuration * not all totem structures are initialized so corosync will crash during * member_add/remove */ orig_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); assert(orig_interfaces != NULL); new_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); assert(new_interfaces != NULL); memcpy(orig_interfaces, totem_config->interfaces, sizeof (struct totem_interface) * INTERFACE_MAX); } /* Clear out nodelist so we can put the new one in if needed */ for (i = 0; i < totem_config->interface_count; i++) { for (j = 0; j < PROCESSOR_COUNT_MAX; j++) { memset(&totem_config->interfaces[i].member_list[j], 0, sizeof(struct totem_ip_address)); } totem_config->interfaces[i].member_count = 0; } iter = icmap_iter_init("nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key); if (res != 2) { continue; } if (strcmp(tmp_key, "ring0_addr") != 0) { continue; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.", node_pos); iter2 = icmap_iter_init(tmp_key); while ((iter_key2 = icmap_iter_next(iter2, NULL, NULL)) != NULL) { unsigned int 
nodeid; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos); if (icmap_get_uint32(tmp_key, &nodeid) != CS_OK) { } res = sscanf(iter_key2, "nodelist.node.%u.ring%u%s", &node_pos, &linknumber, tmp_key2); if (res != 3 || strcmp(tmp_key2, "_addr") != 0) { continue; } if (icmap_get_string(iter_key2, &node_addr_str) != CS_OK) { continue; } member_count = totem_config->interfaces[linknumber].member_count; res = totemip_parse(&totem_config->interfaces[linknumber].member_list[member_count], node_addr_str, totem_config->ip_version); if (res != -1) { totem_config->interfaces[linknumber].member_list[member_count].nodeid = nodeid; totem_config->interfaces[linknumber].member_count++; } free(node_addr_str); } icmap_iter_finalize(iter2); } icmap_iter_finalize(iter); if (reload) { memcpy(new_interfaces, totem_config->interfaces, sizeof (struct totem_interface) * INTERFACE_MAX); compute_interfaces_diff(totem_config->interface_count, orig_interfaces, new_interfaces); free(new_interfaces); free(orig_interfaces); } } static void nodelist_dynamic_notify( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { int res; unsigned int ring_no; unsigned int member_no; char tmp_str[ICMAP_KEYNAME_MAXLEN]; uint8_t reloading; struct totem_config *totem_config = (struct totem_config *)user_data; /* * If a full reload is in progress then don't do anything until it's done and * can reconfigure it all atomically */ if (icmap_get_uint8("config.totemconfig_reload_in_progress", &reloading) == CS_OK && reloading) { return ; } res = sscanf(key_name, "nodelist.node.%u.ring%u%s", &member_no, &ring_no, tmp_str); if (res != 3) return ; if (strcmp(tmp_str, "_addr") != 0) { return; } put_nodelist_members_to_config(totem_config, 1); } /* * Tries to find node (node_pos) in config nodelist which address matches any * local interface. 
Address can be stored in ring0_addr or if ipaddr_key_prefix is not NULL * key with prefix ipaddr_key is used (there can be multiuple of them) * This function differs * from find_local_node_in_nodelist because it doesn't need bindnetaddr, * but doesn't work when bind addr is network address (so IP must be exact * match). * * Returns 1 on success (address was found, node_pos is then correctly set) or 0 on failure. */ int totem_config_find_local_addr_in_nodelist(const char *ipaddr_key_prefix, unsigned int *node_pos) { struct qb_list_head addrs; struct totem_ip_if_address *if_addr; icmap_iter_t iter, iter2; const char *iter_key, *iter_key2; struct qb_list_head *list; const char *ipaddr_key; int ip_version; struct totem_ip_address node_addr; char *node_addr_str; int node_found = 0; int res = 0; char tmp_key[ICMAP_KEYNAME_MAXLEN]; if (totemip_getifaddrs(&addrs) == -1) { return 0; } ip_version = totem_config_get_ip_version(); iter = icmap_iter_init("nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", node_pos, tmp_key); if (res != 2) { continue; } if (strcmp(tmp_key, "ring0_addr") != 0) { continue; } if (icmap_get_string(iter_key, &node_addr_str) != CS_OK) { continue ; } free(node_addr_str); /* * ring0_addr found -> let's iterate thru ipaddr_key_prefix */ snprintf(tmp_key, sizeof(tmp_key), "nodelist.node.%u.%s", *node_pos, (ipaddr_key_prefix != NULL ? ipaddr_key_prefix : "ring0_addr")); iter2 = icmap_iter_init(tmp_key); while ((iter_key2 = icmap_iter_next(iter2, NULL, NULL)) != NULL) { /* * ring0_addr must be exact match, not prefix */ ipaddr_key = (ipaddr_key_prefix != NULL ? 
iter_key2 : tmp_key); if (icmap_get_string(ipaddr_key, &node_addr_str) != CS_OK) { continue ; } if (totemip_parse(&node_addr, node_addr_str, ip_version) == -1) { free(node_addr_str); continue ; } free(node_addr_str); /* * Try to match ip with if_addrs */ node_found = 0; - qb_list_for_each(list, &(addrs)) { + qb_list_for_each(list, &(addrs)) { if_addr = qb_list_entry(list, struct totem_ip_if_address, list); if (totemip_equal(&node_addr, &if_addr->ip_addr)) { node_found = 1; break; } } if (node_found) { break ; } } icmap_iter_finalize(iter2); if (node_found) { break ; } } icmap_iter_finalize(iter); totemip_freeifaddrs(&addrs); return (node_found); } static void config_convert_nodelist_to_interface(struct totem_config *totem_config) { int res = 0; unsigned int node_pos; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key2[ICMAP_KEYNAME_MAXLEN]; char *node_addr_str; unsigned int linknumber = 0; icmap_iter_t iter; const char *iter_key; if (totem_config_find_local_addr_in_nodelist(NULL, &node_pos)) { /* * We found node, so create interface section */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.", node_pos); iter = icmap_iter_init(tmp_key); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.ring%u%s", &node_pos, &linknumber, tmp_key2); if (res != 3 || strcmp(tmp_key2, "_addr") != 0) { continue ; } if (icmap_get_string(iter_key, &node_addr_str) != CS_OK) { continue; } snprintf(tmp_key2, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.bindnetaddr", linknumber); icmap_set_string(tmp_key2, node_addr_str); free(node_addr_str); } icmap_iter_finalize(iter); } } extern int totem_config_read ( struct totem_config *totem_config, const char **error_string, uint64_t *warnings) { int res = 0; char *str; unsigned int linknumber = 0; int member_count = 0; icmap_iter_t iter, member_iter; const char *iter_key; const char *member_iter_key; char linknumber_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key[ICMAP_KEYNAME_MAXLEN]; uint8_t u8; 
uint16_t u16; uint32_t u32; char *cluster_name = NULL; int i; int local_node_pos; int nodeid_set; *warnings = 0; memset (totem_config, 0, sizeof (struct totem_config)); totem_config->interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); if (totem_config->interfaces == 0) { *error_string = "Out of memory trying to allocate ethernet interface storage area"; return -1; } memset (totem_config->interfaces, 0, sizeof (struct totem_interface) * INTERFACE_MAX); strcpy (totem_config->link_mode, "passive"); icmap_get_uint32("totem.version", (uint32_t *)&totem_config->version); if (totem_get_crypto(totem_config) != 0) { *error_string = "crypto_cipher requires crypto_hash with value other than none"; return -1; } if (icmap_get_string("totem.link_mode", &str) == CS_OK) { if (strlen(str) >= TOTEM_LINK_MODE_BYTES) { *error_string = "totem.link_mode is too long"; free(str); return -1; } strcpy (totem_config->link_mode, str); free(str); } icmap_get_uint32("totem.nodeid", &totem_config->node_id); totem_config->clear_node_high_bit = 0; if (icmap_get_string("totem.clear_node_high_bit", &str) == CS_OK) { if (strcmp (str, "yes") == 0) { totem_config->clear_node_high_bit = 1; } free(str); } icmap_get_uint32("totem.threads", &totem_config->threads); icmap_get_uint32("totem.netmtu", &totem_config->net_mtu); if (icmap_get_string("totem.cluster_name", &cluster_name) != CS_OK) { cluster_name = NULL; } totem_config->ip_version = totem_config_get_ip_version(); if (icmap_get_string("totem.interface.0.bindnetaddr", &str) != CS_OK) { /* * We were not able to find ring 0 bindnet addr. Try to use nodelist informations */ config_convert_nodelist_to_interface(totem_config); } else { free(str); } /* * Broadcast option is global but set in interface section, * so reset before processing interfaces. 
*/ totem_config->broadcast_use = 0; iter = icmap_iter_init("totem.interface."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "totem.interface.%[^.].%s", linknumber_key, tmp_key); if (res != 2) { continue; } if (strcmp(tmp_key, "bindnetaddr") != 0) { continue; } member_count = 0; linknumber = atoi(linknumber_key); if (linknumber >= INTERFACE_MAX) { free(cluster_name); snprintf (error_string_response, sizeof(error_string_response), "parse error in config: interface ring number %u is bigger than allowed maximum %u\n", linknumber, INTERFACE_MAX - 1); *error_string = error_string_response; return -1; } /* * Get the bind net address */ if (icmap_get_string(iter_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].bindnet, str, totem_config->ip_version); free(str); } /* * Get interface multicast address */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", linknumber); if (icmap_get_string(tmp_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].mcast_addr, str, totem_config->ip_version); free(str); } else { /* * User not specified address -> autogenerate one from cluster_name key * (if available). Return code is intentionally ignored, because * udpu doesn't need mcastaddr and validity of mcastaddr for udp is * checked later anyway. 
*/ (void)get_cluster_mcast_addr (cluster_name, linknumber, totem_config->ip_version, &totem_config->interfaces[linknumber].mcast_addr); } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.broadcast", linknumber); if (icmap_get_string(tmp_key, &str) == CS_OK) { if (strcmp (str, "yes") == 0) { totem_config->broadcast_use = 1; } free(str); } /* * Get mcast port */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", linknumber); if (icmap_get_uint16(tmp_key, &totem_config->interfaces[linknumber].ip_port) != CS_OK) { if (totem_config->broadcast_use) { totem_config->interfaces[linknumber].ip_port = DEFAULT_PORT + (2 * linknumber); } else { totem_config->interfaces[linknumber].ip_port = DEFAULT_PORT; } } /* * Get the TTL */ totem_config->interfaces[linknumber].ttl = 1; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.ttl", linknumber); if (icmap_get_uint8(tmp_key, &u8) == CS_OK) { totem_config->interfaces[linknumber].ttl = u8; } /* * Get the knet link params */ totem_config->interfaces[linknumber].knet_link_priority = 1; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_link_priority", linknumber); if (icmap_get_uint8(tmp_key, &u8) == CS_OK) { totem_config->interfaces[linknumber].knet_link_priority = u8; } totem_config->interfaces[linknumber].knet_ping_interval = KNET_PING_INTERVAL; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_interval", linknumber); if (icmap_get_uint32(tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_interval = u32; } totem_config->interfaces[linknumber].knet_ping_timeout = KNET_PING_TIMEOUT; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_timeout", linknumber); if (icmap_get_uint32(tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_timeout = u32; } totem_config->interfaces[linknumber].knet_ping_precision = KNET_PING_PRECISION; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, 
"totem.interface.%u.knet_ping_precision", linknumber); if (icmap_get_uint32(tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_precision = u32; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.member.", linknumber); member_iter = icmap_iter_init(tmp_key); while ((member_iter_key = icmap_iter_next(member_iter, NULL, NULL)) != NULL) { if (member_count == 0) { if (icmap_get_string("nodelist.node.0.ring0_addr", &str) == CS_OK) { free(str); *warnings |= TOTEM_CONFIG_WARNING_MEMBERS_IGNORED; break; } else { *warnings |= TOTEM_CONFIG_WARNING_MEMBERS_DEPRECATED; } } if (icmap_get_string(member_iter_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].member_list[member_count++], str, totem_config->ip_version); } } icmap_iter_finalize(member_iter); totem_config->interfaces[linknumber].member_count = member_count; totem_config->interface_count++; } icmap_iter_finalize(iter); /* * Use broadcast is global, so if set, make sure to fill mcast addr correctly */ if (totem_config->broadcast_use) { for (linknumber = 0; linknumber < totem_config->interface_count; linknumber++) { totemip_parse (&totem_config->interfaces[linknumber].mcast_addr, "255.255.255.255", 0); } } /* * Store automatically generated items back to icmap */ for (i = 0; i < totem_config->interface_count; i++) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", i); if (icmap_get_string(tmp_key, &str) == CS_OK) { free(str); } else { str = (char *)totemip_print(&totem_config->interfaces[i].mcast_addr); icmap_set_string(tmp_key, str); } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", i); if (icmap_get_uint16(tmp_key, &u16) != CS_OK) { icmap_set_uint16(tmp_key, totem_config->interfaces[i].ip_port); } } totem_config->transport_number = TOTEM_TRANSPORT_KNET; if (icmap_get_string("totem.transport", &str) == CS_OK) { if (strcmp (str, "udpu") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_UDPU; } if (strcmp 
(str, "udp") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_UDP; } if (strcmp (str, "knet") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_KNET; } free(str); } free(cluster_name); /* * Check existence of nodelist */ if (icmap_get_string("nodelist.node.0.ring0_addr", &str) == CS_OK) { free(str); /* * find local node */ local_node_pos = find_local_node_in_nodelist(totem_config); if (local_node_pos != -1) { icmap_set_uint32("nodelist.local_node_pos", local_node_pos); snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", local_node_pos); nodeid_set = (totem_config->node_id != 0); if (icmap_get_uint32(tmp_key, &totem_config->node_id) == CS_OK && nodeid_set) { *warnings |= TOTEM_CONFIG_WARNING_TOTEM_NODEID_IGNORED; } /* * Make localnode ring0_addr read only, so we can be sure that local * node never changes. If rebinding to other IP would be in future * supported, this must be changed and handled properly! */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", local_node_pos); icmap_set_ro_access(tmp_key, 0, 1); icmap_set_ro_access("nodelist.local_node_pos", 0, 1); } put_nodelist_members_to_config(totem_config, 0); } /* * Get things that might change in the future (and can depend on totem_config->interfaces); */ totem_volatile_config_read(totem_config, NULL); icmap_set_uint8("config.totemconfig_reload_in_progress", 0); add_totem_config_notification(totem_config); return 0; } int totem_config_validate ( struct totem_config *totem_config, const char **error_string) { static char local_error_reason[512]; char parse_error[512]; const char *error_reason = local_error_reason; int i, j; unsigned int interface_max = INTERFACE_MAX; unsigned int port1, port2; if (totem_config->interface_count == 0) { error_reason = "No interfaces defined"; goto parse_error; } for (i = 0; i < totem_config->interface_count; i++) { /* * Some error checking of parsed data to make sure its valid */ struct totem_ip_address null_addr; memset 
(&null_addr, 0, sizeof (struct totem_ip_address));

		/* transport_number 0 needs a multicast address that is not all zeroes. */
		if ((totem_config->transport_number == 0) &&
			memcmp (&totem_config->interfaces[i].mcast_addr, &null_addr,
				sizeof (struct totem_ip_address)) == 0) {
			error_reason = "No multicast address specified";
			goto parse_error;
		}

		if (totem_config->interfaces[i].ip_port == 0) {
			error_reason = "No multicast port specified";
			goto parse_error;
		}

		if (totem_config->interfaces[i].ttl > 255) {
			error_reason = "Invalid TTL (should be 0..255)";
			goto parse_error;
		}

		/* A ttl other than the default of 1 is only accepted for the udp transport. */
		if (totem_config->transport_number != TOTEM_TRANSPORT_UDP &&
		    totem_config->interfaces[i].ttl != 1) {
			error_reason = "Can only set ttl on multicast transport types";
			goto parse_error;
		}

		if (totem_config->interfaces[i].knet_link_priority > 255) {
			error_reason = "Invalid link priority (should be 0..255)";
			goto parse_error;
		}

		/* A link priority other than the default of 1 is only accepted for knet. */
		if (totem_config->transport_number != TOTEM_TRANSPORT_KNET &&
		    totem_config->interfaces[i].knet_link_priority != 1) {
			error_reason = "Can only set link priority on knet transport type";
			goto parse_error;
		}

		if (totem_config->interfaces[i].mcast_addr.family == AF_INET6 &&
			totem_config->node_id == 0) {

			error_reason = "An IPV6 network requires that a node ID be specified.";
			goto parse_error;
		}

		/* The multicast address itself is only checked for udp without broadcast. */
		if (totem_config->broadcast_use == 0 && totem_config->transport_number == TOTEM_TRANSPORT_UDP) {
			if (totem_config->interfaces[i].mcast_addr.family != totem_config->interfaces[i].bindnet.family) {
				error_reason = "Multicast address family does not match bind address family";
				goto parse_error;
			}

			if (totemip_is_mcast (&totem_config->interfaces[i].mcast_addr) != 0) {
				error_reason = "mcastaddr is not a correct multicast address.";
				goto parse_error;
			}
		}

		/* Every interface must use the same address family as interface 0. */
		if (totem_config->interfaces[0].bindnet.family != totem_config->interfaces[i].bindnet.family) {
			error_reason = "Not all bind address belong to the same IP family";
			goto parse_error;
		}

		/*
		 * Ensure mcast address/port differs: two interfaces with the same
		 * multicast address must have ports that are more than 1 apart.
		 */
		if (totem_config->transport_number == TOTEM_TRANSPORT_UDP) {
			for (j = i + 1; j < totem_config->interface_count; j++) {
				port1 = totem_config->interfaces[i].ip_port;
				port2 = totem_config->interfaces[j].ip_port;
				if (totemip_equal(&totem_config->interfaces[i].mcast_addr,
				    &totem_config->interfaces[j].mcast_addr) &&
				    (((port1 > port2 ? port1 : port2)  - (port1 < port2 ? port1 : port2)) <= 1)) {
					error_reason = "Interfaces multicast address/port pair must differ";
					goto parse_error;
				}
			}
		}
	}

	if (totem_config->version != 2) {
		error_reason = "This totem parser can only parse version 2 configurations.";
		goto parse_error;
	}

	/* These helpers set *error_string themselves, hence the direct returns. */
	if (totem_volatile_config_validate(totem_config, error_string) == -1) {
		return (-1);
	}

	if (check_for_duplicate_nodeids(totem_config, error_string) == -1) {
		return (-1);
	}

	/*
	 * KNET Link values validation
	 */
	if (strcmp (totem_config->link_mode, "active") &&
	    strcmp (totem_config->link_mode, "rr") &&
	    strcmp (totem_config->link_mode, "passive")) {
		/* error_reason still points at local_error_reason here. */
		snprintf (local_error_reason, sizeof(local_error_reason),
			"The Knet link mode \"%s\" specified is invalid.  It must be active, passive or rr.\n", totem_config->link_mode);
		goto parse_error;
	}

	/* Only Knet does multiple interfaces */
	if (totem_config->transport_number != TOTEM_TRANSPORT_KNET) {
		interface_max = 1;
	}

	if (interface_max < totem_config->interface_count) {
		snprintf (parse_error, sizeof(parse_error),
			"%d is too many configured interfaces for non-Knet transport.",
			totem_config->interface_count);
		error_reason = parse_error;
		goto parse_error;
	}

	/* Only knet allows crypto */
	if (totem_config->transport_number != TOTEM_TRANSPORT_KNET) {
		if ((strcmp(totem_config->crypto_cipher_type, "none") != 0) ||
		    (strcmp(totem_config->crypto_hash_type, "none") != 0)) {

			snprintf (parse_error, sizeof(parse_error),
				  "crypto_cipher & crypto_hash are only valid for the Knet transport.");
			error_reason = parse_error;
			goto parse_error;
		}
	}

	/* An unset MTU falls back to 1500. */
	if (totem_config->net_mtu == 0) {
		totem_config->net_mtu = 1500;
	}

	return 0;

parse_error:
	/* Wrap the reason into the shared response buffer handed back to the caller. */
	snprintf (error_string_response, sizeof(error_string_response),
		 "parse error in config: %s\n", error_reason);
	*error_string = error_string_response;
	return (-1);
}

/*
 * Read the private key from the file key_location into
 * totem_config->private_key.
 *
 * Exactly sizeof(totem_config->private_key) bytes are expected. Returns 0 on
 * success; on open/read failure or a short read returns -1 with *error_string
 * pointing at a message in error_string_response.
 */
static int read_keyfile (
	const char *key_location,
	struct totem_config *totem_config,
	const char **error_string)
{
	int fd;
	int res;
	ssize_t expected_key_len = sizeof (totem_config->private_key);
	int saved_errno;
	char error_str[100];
	const char *error_ptr;

	fd = open (key_location, O_RDONLY);
	if (fd == -1) {
		error_ptr = qb_strerror_r(errno, error_str, sizeof(error_str));
		snprintf (error_string_response, sizeof(error_string_response),
			"Could not open %s: %s\n",
			 key_location, error_ptr);
		goto parse_error;
	}

	res = read (fd, totem_config->private_key, expected_key_len);
	/* Capture errno before close() gets a chance to overwrite it. */
	saved_errno = errno;
	close (fd);

	if (res == -1) {
		error_ptr = qb_strerror_r (saved_errno, error_str, sizeof(error_str));
		snprintf (error_string_response, sizeof(error_string_response),
			"Could not read %s: %s\n",
			 key_location, error_ptr);
		goto parse_error;
	}

	/* The key length is recorded before the short-read check below. */
	totem_config->private_key_len = expected_key_len;

	if (res != expected_key_len) {
		snprintf (error_string_response, sizeof(error_string_response),
			"Could only read %d bits of 1024 bits from %s.\n",
			 res * 8, key_location);
		goto parse_error;
	}

	return 0;

parse_error:
	*error_string = error_string_response;
	return (-1);
}

/*
 * Load the private key into totem_config.
 *
 * Nothing is read when both crypto_cipher_type and crypto_hash_type are
 * "none". Otherwise the key is taken, in order of preference, from the file
 * named by the totem.keyfile key, from the totem.key value, or from the
 * default key file. Returns 0 on success and -1 with *error_string set on
 * failure.
 */
int totem_config_keyread (
	struct totem_config *totem_config,
	const char **error_string)
{
	int got_key = 0;
	char *key_location = NULL;
	int res;
	size_t key_len;

	memset (totem_config->private_key, 0, 128);
	totem_config->private_key_len = 128;

	if (strcmp(totem_config->crypto_cipher_type, "none") == 0 &&
	    strcmp(totem_config->crypto_hash_type, "none") == 0) {
		return (0);
	}

	/* cmap may store the location of the key file */
	if (icmap_get_string("totem.keyfile", &key_location) == CS_OK) {
		res = read_keyfile(key_location, totem_config, error_string);
		free(key_location);
		if (res)  {
			goto key_error;
		}
		got_key = 1;
	} else { /* Or the key itself may be in the cmap */
		/* First call only queries the stored length (NULL value buffer). */
		if (icmap_get("totem.key", NULL, &key_len, NULL) == CS_OK) {
			if (key_len > sizeof (totem_config->private_key)) {
sprintf(error_string_response, "key is too long");
				goto key_error;
			}
			/* Second call copies the key bytes into private_key. */
			if (icmap_get("totem.key", totem_config->private_key, &key_len, NULL) == CS_OK) {
				totem_config->private_key_len = key_len;
				got_key = 1;
			} else {
				sprintf(error_string_response, "can't store private key");
				goto key_error;
			}
		}
	}

	/* In desperation we read the default filename */
	if (!got_key) {
		/* The environment variable overrides the compiled-in location. */
		const char *filename = getenv("COROSYNC_TOTEM_AUTHKEY_FILE");
		if (!filename)
			filename = COROSYSCONFDIR "/authkey";
		res = read_keyfile(filename, totem_config, error_string);
		if (res)
			goto key_error;

	}

	return (0);

key_error:
	*error_string = error_string_response;
	return (-1);

}

/*
 * Write the timing and sizing values of totem_config to the log at debug
 * level.
 */
static void debug_dump_totem_config(const struct totem_config *totem_config)
{

	log_printf(LOGSYS_LEVEL_DEBUG, "Token Timeout (%d ms) retransmit timeout (%d ms)",
	    totem_config->token_timeout, totem_config->token_retransmit_timeout);
	log_printf(LOGSYS_LEVEL_DEBUG, "token hold (%d ms) retransmits before loss (%d retrans)",
	    totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
	log_printf(LOGSYS_LEVEL_DEBUG, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)",
	    totem_config->join_timeout, totem_config->send_join_timeout, totem_config->consensus_timeout,
	    totem_config->merge_timeout);
	log_printf(LOGSYS_LEVEL_DEBUG, "downcheck (%d ms) fail to recv const (%d msgs)",
	    totem_config->downcheck_timeout, totem_config->fail_to_recv_const);
	log_printf(LOGSYS_LEVEL_DEBUG,
	    "seqno unchanged const (%d rotations) Maximum network MTU %d",
	    totem_config->seqno_unchanged_const, totem_config->net_mtu);
	log_printf(LOGSYS_LEVEL_DEBUG,
	    "window size per rotation (%d messages) maximum messages per rotation (%d messages)",
	    totem_config->window_size, totem_config->max_messages);
	log_printf(LOGSYS_LEVEL_DEBUG, "missed count const (%d messages)", totem_config->miss_count_const);
	log_printf(LOGSYS_LEVEL_DEBUG, "heartbeat_failures_allowed (%d)",
	    totem_config->heartbeat_failures_allowed);
	log_printf(LOGSYS_LEVEL_DEBUG, "max_network_delay (%d ms)", totem_config->max_network_delay);
}

/*
 * icmap tracking callback for keys under "totem.".
 *
 * Re-reads and re-validates the volatile totem settings when a key known to
 * totem_get_param_by_name(), or totem.token_coefficient, changes. user_data
 * is the struct totem_config registered with the tracker.
 */
static void totem_change_notify(
	int32_t event,
	const char *key_name,
	struct icmap_notify_value new_val,
	struct icmap_notify_value old_val,
	void *user_data)
{
	struct totem_config *totem_config = (struct totem_config *)user_data;
	uint32_t *param;
	uint8_t reloading;
	const char *deleted_key = NULL;
	const char *error_string;

	/*
	 * If a full reload is in progress then don't do anything until it's done and
	 * can reconfigure it all atomically
	 */
	if (icmap_get_uint8("config.reload_in_progress", &reloading) == CS_OK && reloading)
		return;

	param = totem_get_param_by_name((struct totem_config *)user_data, key_name);
	/*
	 * Process change only if changed key is found in totem_config (-> param is not NULL)
	 * or for special key token_coefficient. token_coefficient key is not stored in
	 * totem_config, but it is used for computation of token timeout.
	 */
	if (!param && strcmp(key_name, "totem.token_coefficient") != 0)
		return;

	/*
	 * Values other than UINT32 are not supported, or needed (yet)
	 */
	switch (event) {
	case ICMAP_TRACK_DELETE:
		/* Passed on so the re-read can treat this key as removed. */
		deleted_key = key_name;
		break;
	case ICMAP_TRACK_ADD:
	case ICMAP_TRACK_MODIFY:
		deleted_key = NULL;
		break;
	default:
		break;
	}

	totem_volatile_config_read (totem_config, deleted_key);
	log_printf(LOGSYS_LEVEL_DEBUG, "Totem related config key changed. Dumping actual totem config.");
	debug_dump_totem_config(totem_config);
	if (totem_volatile_config_validate(totem_config, &error_string) == -1) {
		log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
		/*
		 * TODO: Consider corosync exit and/or load defaults for volatile
		 * values. For now, log error seems to be enough
		 */
	}
}

/*
 * icmap tracking callback for config.reload_in_progress.
 *
 * A new value of 0 means the reload has finished: the nodelist members and
 * volatile settings are re-read, validated, and the local node position is
 * stored again. Any other value just sets
 * config.totemconfig_reload_in_progress to 1.
 */
static void totem_reload_notify(
	int32_t event,
	const char *key_name,
	struct icmap_notify_value new_val,
	struct icmap_notify_value old_val,
	void *user_data)
{
	struct totem_config *totem_config = (struct totem_config *)user_data;
	uint32_t local_node_pos;
	const char *error_string;

	/* Reload has completed */
	if (*(uint8_t *)new_val.data == 0) {
		put_nodelist_members_to_config (totem_config, 1);
		totem_volatile_config_read (totem_config, NULL);
		log_printf(LOGSYS_LEVEL_DEBUG, "Configuration reloaded. Dumping actual totem config.");
		debug_dump_totem_config(totem_config);
		if (totem_volatile_config_validate(totem_config, &error_string) == -1) {
			log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
			/*
			 * TODO: Consider corosync exit and/or load defaults for volatile
			 * values. For now, log error seems to be enough
			 */
		}

		/* Reinstate the local_node_pos */
		local_node_pos = find_local_node_in_nodelist(totem_config);
		/* The int result is held in a uint32_t, so -1 is compared after conversion. */
		if (local_node_pos != -1) {
			icmap_set_uint32("nodelist.local_node_pos", local_node_pos);
		}

		icmap_set_uint8("config.totemconfig_reload_in_progress", 0);
	} else {
		icmap_set_uint8("config.totemconfig_reload_in_progress", 1);
	}
}

/*
 * Register the icmap trackers that keep totem_config up to date: changes
 * under "totem.", changes of config.reload_in_progress, and changes under
 * "nodelist.node.". The same icmap_track handle variable is reused for all
 * three registrations and is not kept afterwards.
 */
static void add_totem_config_notification(struct totem_config *totem_config)
{
	icmap_track_t icmap_track;

	icmap_track_add("totem.",
		ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX,
		totem_change_notify,
		totem_config,
		&icmap_track);

	icmap_track_add("config.reload_in_progress",
		ICMAP_TRACK_ADD | ICMAP_TRACK_MODIFY,
		totem_reload_notify,
		totem_config,
		&icmap_track);

	icmap_track_add("nodelist.node.",
		ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX,
		nodelist_dynamic_notify,
		(void *)totem_config,
		&icmap_track);
}
diff --git a/exec/totemip.c b/exec/totemip.c
index 013f39cb..6a9b682e 100644
--- a/exec/totemip.c
+++ b/exec/totemip.c
@@ -1,511 +1,511 @@
/*
 * Copyright (c) 2005-2011 Red Hat, Inc.
 *
 * All rights reserved.
* * Author: Patrick Caulfield (pcaulfie@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* IPv4/6 abstraction */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOCALHOST_IPV4 "127.0.0.1" #define LOCALHOST_IPV6 "::1" #define NETLINK_BUFSIZE 16384 #ifdef SO_NOSIGPIPE /* Set SO_NOSIGPIPE on socket s; the setsockopt() result is not checked */ void totemip_nosigpipe(int s) { int on = 1; setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on, sizeof(on)); } #endif /* Compare two addresses: returns 1 when family and address bytes both match, 0 otherwise; asserts when the family is neither AF_INET nor AF_INET6 */ int totemip_equal(const struct totem_ip_address *addr1, const struct totem_ip_address *addr2) { int addrlen = 0; if (addr1->family != addr2->family) return 0; if (addr1->family == AF_INET) { addrlen = sizeof(struct in_addr); } if (addr1->family == AF_INET6) { addrlen = sizeof(struct in6_addr); } assert(addrlen); if (memcmp(addr1->addr, addr2->addr, addrlen) == 0) return 1; else return 0; } /* Copy a totem_ip_address (addr2 into addr1) */ void totemip_copy(struct totem_ip_address *addr1, const struct totem_ip_address *addr2) { memcpy(addr1, addr2, sizeof(struct totem_ip_address)); } /* Copy addr2 into addr1, byte-swapping nodeid and family; the address bytes are copied unchanged */ void totemip_copy_endian_convert(struct totem_ip_address *addr1, const struct totem_ip_address *addr2) { addr1->nodeid = swab32(addr2->nodeid); addr1->family = swab16(addr2->family); memcpy(addr1->addr, addr2->addr, TOTEMIP_ADDRLEN); } /* * Multicast address range is 224.0.0.0 to 239.255.255.255 this * translates to the first 4 bits == 1110 (0xE). * http://en.wikipedia.org/wiki/Multicast_address * * Returns 0 when the address is accepted and -1 for an AF_INET address * outside that range; other families are not range-checked. */ int32_t totemip_is_mcast(struct totem_ip_address *ip_addr) { uint32_t addr = 0; memcpy (&addr, ip_addr->addr, sizeof (uint32_t)); if (ip_addr->family == AF_INET) { addr = ntohl(addr); if ((addr >> 28) != 0xE) { return -1; } } return 0; } /* For sorting etc. 
params are void * for qsort's benefit. Only a's family is read; b's address bytes are interpreted with that same family. */ int totemip_compare(const void *a, const void *b) { int i; const struct totem_ip_address *totemip_a = (const struct totem_ip_address *)a; const struct totem_ip_address *totemip_b = (const struct totem_ip_address *)b; struct in_addr ipv4_a1; struct in_addr ipv4_a2; struct in6_addr ipv6_a1; struct in6_addr ipv6_a2; unsigned short family; /* * Use memcpy to align since totem_ip_address is unaligned on various archs */ memcpy (&family, &totemip_a->family, sizeof (unsigned short)); if (family == AF_INET) { memcpy (&ipv4_a1, totemip_a->addr, sizeof (struct in_addr)); memcpy (&ipv4_a2, totemip_b->addr, sizeof (struct in_addr)); if (ipv4_a1.s_addr == ipv4_a2.s_addr) { return (0); } if (htonl(ipv4_a1.s_addr) < htonl(ipv4_a2.s_addr)) { return -1; } else { return +1; } } else if (family == AF_INET6) { /* * We can only compare 8 bits at time for portability reasons */ memcpy (&ipv6_a1, totemip_a->addr, sizeof (struct in6_addr)); memcpy (&ipv6_a2, totemip_b->addr, sizeof (struct in6_addr)); for (i = 0; i < 16; i++) { int res = ipv6_a1.s6_addr[i] - ipv6_a2.s6_addr[i]; if (res) { return res; } } return 0; } else { /* * Family not set, should be! 
*/ assert (0); } return 0; } /* Build a localhost totem_ip_address: returns 0 on success, -1 if inet_pton() rejects the family or text. For AF_INET the parsed address is also stored in nodeid; any other family is given the IPv6 text. */ int totemip_localhost(int family, struct totem_ip_address *localhost) { const char *addr_text; memset (localhost, 0, sizeof (struct totem_ip_address)); if (family == AF_INET) { addr_text = LOCALHOST_IPV4; if (inet_pton(family, addr_text, (char *)&localhost->nodeid) <= 0) { return -1; } } else { addr_text = LOCALHOST_IPV6; } if (inet_pton(family, addr_text, (char *)localhost->addr) <= 0) return -1; localhost->family = family; return 0; } /* Returns 1 when addr equals the localhost address of its own family, 0 otherwise (also 0 when that localhost address cannot be built) */ int totemip_localhost_check(const struct totem_ip_address *addr) { struct totem_ip_address localhost; if (totemip_localhost(addr->family, &localhost)) return 0; return totemip_equal(addr, &localhost); } /* Format addr as text with inet_ntop(); the result lives in a static buffer that the next call overwrites */ const char *totemip_print(const struct totem_ip_address *addr) { static char buf[INET6_ADDRSTRLEN]; return (inet_ntop(addr->family, addr->addr, buf, sizeof(buf))); } /* Make a totem_ip_address into a usable sockaddr_storage: fills saddr and *addrlen for AF_INET/AF_INET6 and returns 0; returns -1 and leaves both untouched for any other family */ int totemip_totemip_to_sockaddr_convert(struct totem_ip_address *ip_addr, uint16_t port, struct sockaddr_storage *saddr, int *addrlen) { int ret = -1; if (ip_addr->family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)saddr; memset(sin, 0, sizeof(struct sockaddr_in)); #ifdef HAVE_SOCK_SIN_LEN sin->sin_len = sizeof(struct sockaddr_in); #endif sin->sin_family = ip_addr->family; /* sockaddr ports are stored in network byte order */ sin->sin_port = htons(port); memcpy(&sin->sin_addr, ip_addr->addr, sizeof(struct in_addr)); *addrlen = sizeof(struct sockaddr_in); ret = 0; } if (ip_addr->family == AF_INET6) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)saddr; memset(sin, 0, sizeof(struct sockaddr_in6)); #ifdef HAVE_SOCK_SIN6_LEN sin->sin6_len = sizeof(struct sockaddr_in6); #endif sin->sin6_family = ip_addr->family; /* sockaddr ports are stored in network byte order */ sin->sin6_port = htons(port); sin->sin6_scope_id = 2; memcpy(&sin->sin6_addr, ip_addr->addr, sizeof(struct in6_addr)); *addrlen = sizeof(struct sockaddr_in6); ret = 0; } return ret; } /* Converts an address string into a totem_ip_address. 
family can be AF_INET, AF_INET6 or 0 (for "don't care"). Returns 0 on success and -1 when getaddrinfo() fails; only the first result is used. */ int totemip_parse(struct totem_ip_address *totemip, const char *addr, int family) { struct addrinfo *ainfo; struct addrinfo ahints; struct sockaddr_in *sa; struct sockaddr_in6 *sa6; int ret; memset(&ahints, 0, sizeof(ahints)); ahints.ai_socktype = SOCK_DGRAM; ahints.ai_protocol = IPPROTO_UDP; ahints.ai_family = family; /* Lookup the nodename address */ ret = getaddrinfo(addr, NULL, &ahints, &ainfo); if (ret) return -1; sa = (struct sockaddr_in *)ainfo->ai_addr; sa6 = (struct sockaddr_in6 *)ainfo->ai_addr; totemip->family = ainfo->ai_family; if (ainfo->ai_family == AF_INET) memcpy(totemip->addr, &sa->sin_addr, sizeof(struct in_addr)); else memcpy(totemip->addr, &sa6->sin6_addr, sizeof(struct in6_addr)); freeaddrinfo(ainfo); return 0; } /* Make a sockaddr_* into a totem_ip_address: family is copied and nodeid cleared in every case; returns 0 for AF_INET/AF_INET6 and -1 (address bytes not written) otherwise */ int totemip_sockaddr_to_totemip_convert(const struct sockaddr_storage *saddr, struct totem_ip_address *ip_addr) { int ret = -1; ip_addr->family = saddr->ss_family; ip_addr->nodeid = 0; if (saddr->ss_family == AF_INET) { const struct sockaddr_in *sin = (const struct sockaddr_in *)saddr; memcpy(ip_addr->addr, &sin->sin_addr, sizeof(struct in_addr)); ret = 0; } if (saddr->ss_family == AF_INET6) { const struct sockaddr_in6 *sin = (const struct sockaddr_in6 *)saddr; memcpy(ip_addr->addr, &sin->sin6_addr, sizeof(struct in6_addr)); ret = 0; } return ret; } /* Build in addrs a list of totem_ip_if_address entries, one per AF_INET/AF_INET6 address reported by getifaddrs(); returns 0 on success and -1 on failure */ int totemip_getifaddrs(struct qb_list_head *addrs) { struct ifaddrs *ifap, *ifa; struct totem_ip_if_address *if_addr; if (getifaddrs(&ifap) != 0) return (-1); qb_list_init(addrs); for (ifa = ifap; ifa; ifa = ifa->ifa_next) { if (ifa->ifa_addr == NULL || ifa->ifa_netmask == NULL) continue ; if ((ifa->ifa_addr->sa_family != AF_INET && ifa->ifa_addr->sa_family != AF_INET6) || (ifa->ifa_netmask->sa_family != AF_INET && ifa->ifa_netmask->sa_family != AF_INET6 && ifa->ifa_netmask->sa_family != 0)) continue ; /* A netmask with no family inherits the family of its address */ if (ifa->ifa_netmask->sa_family == 0) { ifa->ifa_netmask->sa_family = 
ifa->ifa_addr->sa_family; } if_addr = malloc(sizeof(struct totem_ip_if_address)); if (if_addr == NULL) { goto error_free_ifaddrs; } qb_list_init(&if_addr->list); memset(if_addr, 0, sizeof(struct totem_ip_if_address)); if_addr->interface_up = ifa->ifa_flags & IFF_UP; if_addr->interface_num = if_nametoindex(ifa->ifa_name); if_addr->name = strdup(ifa->ifa_name); if (if_addr->name == NULL) { goto error_free_addr; } if (totemip_sockaddr_to_totemip_convert((const struct sockaddr_storage *)ifa->ifa_addr, &if_addr->ip_addr) == -1) { goto error_free_addr_name; } if (totemip_sockaddr_to_totemip_convert((const struct sockaddr_storage *)ifa->ifa_netmask, &if_addr->mask_addr) == -1) { goto error_free_addr_name; } qb_list_add_tail(&if_addr->list, addrs); } freeifaddrs(ifap); return (0); error_free_addr_name: free(if_addr->name); error_free_addr: free(if_addr); error_free_ifaddrs: totemip_freeifaddrs(addrs); freeifaddrs(ifap); return (-1); } void totemip_freeifaddrs(struct qb_list_head *addrs) { struct totem_ip_if_address *if_addr; - struct qb_list_head *list; + struct qb_list_head *list, *tmp_iter; - qb_list_for_each(list, addrs) { + qb_list_for_each_safe(list, tmp_iter, addrs) { if_addr = qb_list_entry(list, struct totem_ip_if_address, list); free(if_addr->name); qb_list_del(&if_addr->list); free(if_addr); } qb_list_init(addrs); } int totemip_iface_check(struct totem_ip_address *bindnet, struct totem_ip_address *boundto, int *interface_up, int *interface_num, int mask_high_bit) { struct qb_list_head addrs; struct qb_list_head *list; struct totem_ip_if_address *if_addr; struct totem_ip_address bn_netaddr, if_netaddr; socklen_t addr_len; socklen_t si; int res = -1; int exact_match_found = 0; int net_match_found = 0; *interface_up = 0; *interface_num = 0; if (totemip_getifaddrs(&addrs) == -1) { return (-1); } - qb_list_for_each(list, &addrs) { + qb_list_for_each(list, &addrs) { if_addr = qb_list_entry(list, struct totem_ip_if_address, list); if (bindnet->family != 
if_addr->ip_addr.family) continue ; addr_len = 0; switch (bindnet->family) { case AF_INET: addr_len = sizeof(struct in_addr); break; case AF_INET6: addr_len = sizeof(struct in6_addr); break; } if (addr_len == 0) continue ; totemip_copy(&bn_netaddr, bindnet); totemip_copy(&if_netaddr, &if_addr->ip_addr); if (totemip_equal(&bn_netaddr, &if_netaddr)) { exact_match_found = 1; } /* Reduce both addresses to their network part using this interface's netmask */ for (si = 0; si < addr_len; si++) { bn_netaddr.addr[si] = bn_netaddr.addr[si] & if_addr->mask_addr.addr[si]; if_netaddr.addr[si] = if_netaddr.addr[si] & if_addr->mask_addr.addr[si]; } /* An exact address match always wins; otherwise only the first network match is taken */ if (exact_match_found || (!net_match_found && totemip_equal(&bn_netaddr, &if_netaddr))) { totemip_copy(boundto, &if_addr->ip_addr); boundto->nodeid = bindnet->nodeid; *interface_up = if_addr->interface_up; *interface_num = if_addr->interface_num; /* No nodeid supplied for an IPv4 address: derive one from the address bytes */ if (boundto->family == AF_INET && boundto->nodeid == 0) { unsigned int nodeid = 0; memcpy (&nodeid, boundto->addr, sizeof (int)); #if __BYTE_ORDER == __LITTLE_ENDIAN nodeid = swab32 (nodeid); #endif if (mask_high_bit) { nodeid &= 0x7FFFFFFF; } boundto->nodeid = nodeid; } net_match_found = 1; res = 0; if (exact_match_found) { goto finished; } } } finished: totemip_freeifaddrs(&addrs); return (res); } /* Fixed header sizes, in bytes, summed by totemip_udpip_header_size() */ #define TOTEMIP_UDP_HEADER_SIZE 8 #define TOTEMIP_IPV4_HEADER_SIZE 20 #define TOTEMIP_IPV6_HEADER_SIZE 40 /* Return the combined UDP + IP header size for family, or 0 for a family other than AF_INET/AF_INET6 */ size_t totemip_udpip_header_size(int family) { size_t header_size; header_size = 0; switch (family) { case AF_INET: header_size = TOTEMIP_UDP_HEADER_SIZE + TOTEMIP_IPV4_HEADER_SIZE; break; case AF_INET6: header_size = TOTEMIP_UDP_HEADER_SIZE + TOTEMIP_IPV6_HEADER_SIZE; break; } return (header_size); } diff --git a/exec/totempg.c b/exec/totempg.c index b1be3233..861e8b03 100644 --- a/exec/totempg.c +++ b/exec/totempg.c @@ -1,1523 +1,1523 @@ /* * Copyright (c) 2003-2005 MontaVista Software, Inc. * Copyright (c) 2005 OSDL. * Copyright (c) 2006-2012 Red Hat, Inc. * * All rights reserved. 
* * Author: Steven Dake (sdake@redhat.com) * Author: Mark Haverkamp (markh@osdl.org) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * FRAGMENTATION AND PACKING ALGORITHM: * * Assemble the entire message into one buffer * if full fragment * store fragment into lengths list * for each full fragment * multicast fragment * set length and fragment fields of pg message * store remaining multicast into head of fragmentation data and set lens field * * If a message exceeds the maximum packet size allowed by the totem * single ring protocol, the protocol could lose forward progress. * Statically calculating the allowed data amount doesn't work because * the amount of data allowed depends on the number of fragments in * each message. In this implementation, the maximum fragment size * is dynamically calculated for each fragment added to the message. * It is possible for a message to be two bytes short of the maximum * packet size. This occurs when a message or collection of * messages + the mcast header + the lens are two bytes short of the * end of the packet. Since another len field consumes two bytes, the * len field would consume the rest of the packet without room for data. * * One optimization would be to forgo the final len field and determine * it from the size of the udp datagram. Then this condition would no * longer occur. */ /* * ASSEMBLY AND UNPACKING ALGORITHM: * * copy incoming packet into assembly data buffer indexed by current * location of end of fragment * * if not fragmented * deliver all messages in assembly data buffer * else * if msg_count > 1 and fragmented * deliver all messages except last message in assembly data buffer * copy last fragmented section to start of assembly data buffer * else * if msg_count = 1 and fragmented * do nothing * */ #include #ifdef HAVE_ALLOCA_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemsrp.h" /* Smaller of a and b. Arguments and result are fully parenthesised so the macro is safe inside larger expressions; each argument may still be evaluated twice, so avoid side effects. */ #define min(a,b) (((a) < (b)) ? (a) : (b))
struct totempg_mcast_header { short version; short type; }; #if !(defined(__i386__) || defined(__x86_64__)) /* * Need align on architectures other than i386 or x86_64 */ #define TOTEMPG_NEED_ALIGN 1 #endif /* * totempg_mcast structure * * header: Identify the mcast. * fragmented: Set if this message continues into next message * continuation: Set if this message is a continuation from last message * msg_count Indicates how many packed messages are contained * in the mcast. * Also, the size of each packed message and the messages themselves are * appended to the end of this structure when sent. */ struct totempg_mcast { struct totempg_mcast_header header; unsigned char fragmented; unsigned char continuation; unsigned short msg_count; /* * short msg_len[msg_count]; */ /* * data for messages */ }; /* * Maximum packet size for totem pg messages */ #define TOTEMPG_PACKET_SIZE (totempg_totem_config->net_mtu - \ sizeof (struct totempg_mcast)) /* * Local variables used for packing small messages */ static unsigned short mcast_packed_msg_lens[FRAME_SIZE_MAX]; static int mcast_packed_msg_count = 0; static int totempg_reserved = 1; static unsigned int totempg_size_limit; static totem_queue_level_changed_fn totem_queue_level_changed = NULL; static uint32_t totempg_threaded_mode = 0; static void *totemsrp_context; /* * Function and data used to log messages */ static int totempg_log_level_security; static int totempg_log_level_error; static int totempg_log_level_warning; static int totempg_log_level_notice; static int totempg_log_level_debug; static int totempg_subsys_id; static void (*totempg_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...) 
__attribute__((format(printf, 6, 7))); struct totem_config *totempg_totem_config; static totempg_stats_t totempg_stats; enum throw_away_mode { THROW_AWAY_INACTIVE, THROW_AWAY_ACTIVE }; /* Per-node reassembly state: data accumulates received message bytes, index is the current write offset into data, and list links the entry into one of the inuse lists or the free list */ struct assembly { unsigned int nodeid; unsigned char data[MESSAGE_SIZE_MAX]; int index; unsigned char last_frag_num; enum throw_away_mode throw_away_mode; struct qb_list_head list; }; static void assembly_deref (struct assembly *assembly); static int callback_token_received_fn (enum totem_callback_token_type type, const void *data); QB_LIST_DECLARE(assembly_list_inuse); /* * Free list is used both for transitional and operational assemblies */ QB_LIST_DECLARE(assembly_list_free); QB_LIST_DECLARE(assembly_list_inuse_trans); QB_LIST_DECLARE(totempg_groups_list); /* * Staging buffer for packed messages. Messages are staged in this buffer * before sending. Multiple messages may fit which cuts down on the * number of mcasts sent. If a message doesn't completely fit, then * the mcast header has a fragment bit set that says that there are more * data to follow. fragment_size is an index into the buffer. It indicates * the size of message data and where to place new message data. * fragment_continuation indicates whether the first packed message in * the buffer is a continuation of a previously packed fragment. 
*/ static unsigned char *fragmentation_data; static int fragment_size = 0; static int fragment_continuation = 0; static int totempg_waiting_transack = 0; /* One registered group user: its delivery and configuration-change callbacks plus the groups it matches on */ struct totempg_group_instance { void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required); void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); struct totempg_group *groups; int groups_cnt; int32_t q_level; struct qb_list_head list; }; static unsigned char next_fragment = 1; static pthread_mutex_t totempg_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t callback_token_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t mcast_msg_mutex = PTHREAD_MUTEX_INITIALIZER; /* Forward to totempg_log_printf, adding the subsystem id and the call-site function, file and line. NOTE(review): the expansion ends in "while (0);", so the macro already supplies its own statement terminator - confirm no caller uses it in an unbraced if/else. */ #define log_printf(level, format, args...) \ do { \ totempg_log_printf(level, \ totempg_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ format, ##args); \ } while (0); static int msg_count_send_ok (int msg_count); static int byte_count_send_ok (int byte_count); /* Record the new waiting_trans_ack state; assembly_ref() reads it to choose between the normal and the transitional inuse list */ static void totempg_waiting_trans_ack_cb (int waiting_trans_ack) { log_printf(LOG_DEBUG, "waiting_trans_ack changed to %u", waiting_trans_ack); totempg_waiting_transack = waiting_trans_ack; } /* Return the assembly buffer for nodeid from the active inuse list; when none exists, reuse an entry from the free list or allocate a new one and add it to the active inuse list. Asserts if malloc fails. */ static struct assembly *assembly_ref (unsigned int nodeid) { struct assembly *assembly; struct qb_list_head *list; struct qb_list_head *active_assembly_list_inuse; if (totempg_waiting_transack) { active_assembly_list_inuse = &assembly_list_inuse_trans; } else { active_assembly_list_inuse = &assembly_list_inuse; } /* * Search inuse list for node id and return assembly buffer if found */ - qb_list_for_each(list, active_assembly_list_inuse) { + qb_list_for_each(list, active_assembly_list_inuse) { assembly = qb_list_entry (list, struct assembly, list); if (nodeid == assembly->nodeid) { return (assembly); } } /* * 
Nothing found in inuse list get one from free list if available */ if (qb_list_empty (&assembly_list_free) == 0) { assembly = qb_list_entry (assembly_list_free.next, struct assembly, list); qb_list_del (&assembly->list); qb_list_add (&assembly->list, active_assembly_list_inuse); assembly->nodeid = nodeid; assembly->index = 0; assembly->last_frag_num = 0; assembly->throw_away_mode = THROW_AWAY_INACTIVE; return (assembly); } /* * Nothing available in inuse or free list, so allocate a new one */ assembly = malloc (sizeof (struct assembly)); /* * TODO handle memory allocation failure here */ assert (assembly); assembly->nodeid = nodeid; assembly->data[0] = 0; assembly->index = 0; assembly->last_frag_num = 0; assembly->throw_away_mode = THROW_AWAY_INACTIVE; qb_list_init (&assembly->list); qb_list_add (&assembly->list, active_assembly_list_inuse); return (assembly); } /* Move an assembly from whichever list holds it to the free list */ static void assembly_deref (struct assembly *assembly) { qb_list_del (&assembly->list); qb_list_add (&assembly->list, &assembly_list_free); } /* Move every assembly that belongs to nodeid from both the normal and the transitional inuse list to the free list */ static void assembly_deref_from_normal_and_trans (int nodeid) { int j; - struct qb_list_head *list; + struct qb_list_head *list, *tmp_iter; struct qb_list_head *active_assembly_list_inuse; struct assembly *assembly; for (j = 0; j < 2; j++) { if (j == 0) { active_assembly_list_inuse = &assembly_list_inuse; } else { active_assembly_list_inuse = &assembly_list_inuse_trans; } - qb_list_for_each(list, active_assembly_list_inuse) { + qb_list_for_each_safe(list, tmp_iter, active_assembly_list_inuse) { assembly = qb_list_entry (list, struct assembly, list); if (nodeid == assembly->nodeid) { qb_list_del (&assembly->list); qb_list_add (&assembly->list, &assembly_list_free); } } } } /* Release the assemblies of every node in left_list, then pass the configuration change to each instance on totempg_groups_list that has a confchg_fn */ static inline void app_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) 
{ int i; struct totempg_group_instance *instance; struct qb_list_head *list; /* * For every leaving processor, add to free list * This also has the side effect of clearing out the dataset * In the leaving processor's assembly buffer. */ for (i = 0; i < left_list_entries; i++) { assembly_deref_from_normal_and_trans (left_list[i]); } - qb_list_for_each(list, &totempg_groups_list) { + qb_list_for_each(list, &totempg_groups_list) { instance = qb_list_entry (list, struct totempg_group_instance, list); if (instance->confchg_fn) { instance->confchg_fn ( configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } } } /* Byte-swap, in place, the table of unsigned shorts at the start of msg: the group count in group_len[0] followed by that many group lengths. When TOTEMPG_NEED_ALIGN is defined and msg is not 4-byte aligned, the swap is done on an alloca() copy that is then copied back. */ static inline void group_endian_convert ( void *msg, int msg_len) { unsigned short *group_len; int i; char *aligned_msg; #ifdef TOTEMPG_NEED_ALIGN /* * Align data structure for not i386 or x86_64 */ if ((size_t)msg % 4 != 0) { aligned_msg = alloca(msg_len); memcpy(aligned_msg, msg, msg_len); } else { aligned_msg = msg; } #else aligned_msg = msg; #endif group_len = (unsigned short *)aligned_msg; group_len[0] = swab16(group_len[0]); for (i = 1; i < group_len[0] + 1; i++) { group_len[i] = swab16(group_len[i]); } if (aligned_msg != msg) { memcpy(msg, aligned_msg, msg_len); } } /* Return 1 if any group name carried at the start of the message equals one of groups_b, else 0. *adjust_iovec receives the size of that leading group header (length table plus names). Asserts that iov_len is 1. */ static inline int group_matches ( struct iovec *iovec, unsigned int iov_len, struct totempg_group *groups_b, unsigned int group_b_cnt, unsigned int *adjust_iovec) { unsigned short *group_len; char *group_name; int i; int j; #ifdef TOTEMPG_NEED_ALIGN struct iovec iovec_aligned = { NULL, 0 }; #endif assert (iov_len == 1); #ifdef TOTEMPG_NEED_ALIGN /* * Align data structure for not i386 or x86_64 */ if ((size_t)iovec->iov_base % 4 != 0) { iovec_aligned.iov_base = alloca(iovec->iov_len); memcpy(iovec_aligned.iov_base, iovec->iov_base, iovec->iov_len); iovec_aligned.iov_len = iovec->iov_len; iovec = &iovec_aligned; } #endif group_len = (unsigned short *)iovec->iov_base; /* The group names start right after the count and the per-group length entries */ group_name = ((char *)iovec->iov_base) + sizeof (unsigned 
short) * (group_len[0] + 1); /* * Calculate amount to adjust the iovec by before delivering to app */ *adjust_iovec = sizeof (unsigned short) * (group_len[0] + 1); for (i = 1; i < group_len[0] + 1; i++) { *adjust_iovec += group_len[i]; } /* * Determine if this message should be delivered to this instance */ for (i = 1; i < group_len[0] + 1; i++) { for (j = 0; j < group_b_cnt; j++) { if ((group_len[i] == groups_b[j].group_len) && (memcmp (groups_b[j].group, group_name, group_len[i]) == 0)) { return (1); } } group_name += group_len[i]; } return (0); } /* Deliver one assembled message to every instance on totempg_groups_list whose groups match the message's group header; the header is stripped before deliver_fn is called. msg is byte-swapped in place first when endian_conversion_required is set. */ static inline void app_deliver_fn ( unsigned int nodeid, void *msg, unsigned int msg_len, int endian_conversion_required) { struct totempg_group_instance *instance; struct iovec stripped_iovec; unsigned int adjust_iovec; struct iovec *iovec; struct qb_list_head *list; struct iovec aligned_iovec = { NULL, 0 }; if (endian_conversion_required) { group_endian_convert (msg, msg_len); } /* * TODO: segmentation/assembly need to be redesigned to provide aligned access * in all cases to avoid memory copies on non386 archs. 
Probably broke backwards * compatibility */ #ifdef TOTEMPG_NEED_ALIGN /* * Align data structure for not i386 or x86_64 */ aligned_iovec.iov_base = alloca(msg_len); aligned_iovec.iov_len = msg_len; memcpy(aligned_iovec.iov_base, msg, msg_len); #else aligned_iovec.iov_base = msg; aligned_iovec.iov_len = msg_len; #endif iovec = &aligned_iovec; - qb_list_for_each(list, &totempg_groups_list) { + qb_list_for_each(list, &totempg_groups_list) { instance = qb_list_entry (list, struct totempg_group_instance, list); if (group_matches (iovec, 1, instance->groups, instance->groups_cnt, &adjust_iovec)) { stripped_iovec.iov_len = iovec->iov_len - adjust_iovec; stripped_iovec.iov_base = (char *)iovec->iov_base + adjust_iovec; #ifdef TOTEMPG_NEED_ALIGN /* * Align data structure for not i386 or x86_64. * Test the alignment of the stripped payload pointer itself: writing * "base + adjust % 4 != 0" applies % before +, which only compares a * non-null pointer with 0 and is therefore always true. */ if ((size_t)stripped_iovec.iov_base % 4 != 0) { /* * Deal with misalignment */ stripped_iovec.iov_base = alloca (stripped_iovec.iov_len); memcpy (stripped_iovec.iov_base, (char *)iovec->iov_base + adjust_iovec, stripped_iovec.iov_len); } #endif instance->deliver_fn ( nodeid, stripped_iovec.iov_base, stripped_iovec.iov_len, endian_conversion_required); } } } /* Configuration-change callback handed to totemsrp_initialize(); forwards everything to app_confchg_fn */ static void totempg_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { /* TODO optimize this */ app_confchg_fn (configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } /* Delivery callback handed to totemsrp_initialize(): unpack one received mcast into the sender's assembly buffer and deliver the complete messages it contains */ static void totempg_deliver_fn ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required) { struct totempg_mcast *mcast; unsigned short *msg_lens; int i; struct assembly *assembly; char header[FRAME_SIZE_MAX]; int msg_count; int continuation; int start; const char *data; int datasize; struct iovec iov_delv; assembly = assembly_ref 
(nodeid); assert (assembly); /* * Assemble the header into one block of data and * assemble the packet contents into one block of data to simplify delivery */ mcast = (struct totempg_mcast *)msg; if (endian_conversion_required) { mcast->msg_count = swab16 (mcast->msg_count); } msg_count = mcast->msg_count; datasize = sizeof (struct totempg_mcast) + msg_count * sizeof (unsigned short); /* NOTE(review): datasize comes from the received msg_count and is not checked here against sizeof(header) or msg_len - confirm a lower layer bounds it */ memcpy (header, msg, datasize); data = msg; msg_lens = (unsigned short *) (header + sizeof (struct totempg_mcast)); if (endian_conversion_required) { for (i = 0; i < mcast->msg_count; i++) { msg_lens[i] = swab16 (msg_lens[i]); } } /* Append the packed payload (everything after the header and length table) at the current assembly offset */ memcpy (&assembly->data[assembly->index], &data[datasize], msg_len - datasize); /* * If the last message in the buffer is a fragment, then we * can't deliver it. We'll first deliver the full messages * then adjust the assembly buffer so we can add the rest of the * fragment when it arrives. */ msg_count = mcast->fragmented ? mcast->msg_count - 1 : mcast->msg_count; continuation = mcast->continuation; /* The first deliverable message starts at offset 0 and includes whatever earlier fragment bytes are already in the buffer */ iov_delv.iov_base = (void *)&assembly->data[0]; iov_delv.iov_len = assembly->index + msg_lens[0]; /* * Make sure that if this message is a continuation, that it * matches the sequence number of the previous fragment. * Also, if the first packed message is a continuation * of a previous message, but the assembly buffer * is empty, then we need to discard it since we can't * assemble a complete message. Likewise, if this message isn't a * continuation and the assembly buffer is empty, we have to discard * the continued message. 
*/ start = 0; if (assembly->throw_away_mode == THROW_AWAY_ACTIVE) { /* Throw away the first msg block */ if (mcast->fragmented == 0 || mcast->fragmented == 1) { assembly->throw_away_mode = THROW_AWAY_INACTIVE; assembly->index += msg_lens[0]; iov_delv.iov_base = (void *)&assembly->data[assembly->index]; iov_delv.iov_len = msg_lens[1]; start = 1; } } else if (assembly->throw_away_mode == THROW_AWAY_INACTIVE) { if (continuation == assembly->last_frag_num) { assembly->last_frag_num = mcast->fragmented; /* Deliver each complete message in turn, advancing index past it */ for (i = start; i < msg_count; i++) { app_deliver_fn(nodeid, iov_delv.iov_base, iov_delv.iov_len, endian_conversion_required); assembly->index += msg_lens[i]; iov_delv.iov_base = (void *)&assembly->data[assembly->index]; if (i < (msg_count - 1)) { iov_delv.iov_len = msg_lens[i + 1]; } } } else { log_printf (LOG_DEBUG, "fragmented continuation %u is not equal to assembly last_frag_num %u", continuation, assembly->last_frag_num); assembly->throw_away_mode = THROW_AWAY_ACTIVE; } } if (mcast->fragmented == 0) { /* * End of messages, dereference assembly struct */ assembly->last_frag_num = 0; assembly->index = 0; assembly_deref (assembly); } else { /* * Message is fragmented, keep around assembly list */ if (mcast->msg_count > 1) { /* Slide the trailing partial message to the front of the buffer */ memmove (&assembly->data[0], &assembly->data[assembly->index], msg_lens[msg_count]); assembly->index = 0; } assembly->index += msg_lens[msg_count]; } } /* * Totem Process Group Abstraction * depends on poll abstraction, POSIX, IPV4 */ void *callback_token_received_handle; /* Token-received callback: when packed messages are staged and totemsrp_avail() is non-zero, multicast the staging buffer as a single unfragmented mcast and reset the staging counters. Takes mcast_msg_mutex in threaded mode. Always returns 0. */ int callback_token_received_fn (enum totem_callback_token_type type, const void *data) { struct totempg_mcast mcast; struct iovec iovecs[3]; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&mcast_msg_mutex); } if (mcast_packed_msg_count == 0) { if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return (0); } if (totemsrp_avail(totemsrp_context) == 0) { if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return (0); } 
mcast.header.version = 0; mcast.header.type = 0; mcast.fragmented = 0; /* * Was the first message in this buffer a continuation of a * fragmented message? */ mcast.continuation = fragment_continuation; fragment_continuation = 0; mcast.msg_count = mcast_packed_msg_count; /* Three parts go out together: the mcast header, the table of message lengths, and the staged message data */ iovecs[0].iov_base = (void *)&mcast; iovecs[0].iov_len = sizeof (struct totempg_mcast); iovecs[1].iov_base = (void *)mcast_packed_msg_lens; iovecs[1].iov_len = mcast_packed_msg_count * sizeof (unsigned short); iovecs[2].iov_base = (void *)&fragmentation_data[0]; iovecs[2].iov_len = fragment_size; /* The result of the send is deliberately discarded */ (void)totemsrp_mcast (totemsrp_context, iovecs, 3, 0); mcast_packed_msg_count = 0; fragment_size = 0; if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return (0); } /* * Initialize the totem process group abstraction: cache the logging * configuration, allocate the staging buffer, initialize totemsrp and * register the token-received callback. * Returns -1 if the staging buffer cannot be allocated, otherwise the * result of totemsrp_initialize(). */ int totempg_initialize ( qb_loop_t *poll_handle, struct totem_config *totem_config) { int res; totempg_totem_config = totem_config; totempg_log_level_security = totem_config->totem_logging_configuration.log_level_security; totempg_log_level_error = totem_config->totem_logging_configuration.log_level_error; totempg_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; totempg_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; totempg_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; totempg_log_printf = totem_config->totem_logging_configuration.log_printf; totempg_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; fragmentation_data = malloc (TOTEMPG_PACKET_SIZE); if (fragmentation_data == 0) { return (-1); } /* NOTE(review): the staging buffer above was sized from net_mtu as it stood before this call - confirm the adjustment cannot enlarge net_mtu */ totemsrp_net_mtu_adjust (totem_config); res = totemsrp_initialize ( poll_handle, &totemsrp_context, totem_config, &totempg_stats, totempg_deliver_fn, totempg_confchg_fn, totempg_waiting_trans_ack_cb); /* NOTE(review): res is not examined before the callback is registered */ totemsrp_callback_token_create ( totemsrp_context, &callback_token_received_handle, TOTEM_CALLBACK_TOKEN_RECEIVED, 0, callback_token_received_fn, 0); 
totempg_size_limit = (totemsrp_avail(totemsrp_context) - 1) * (totempg_totem_config->net_mtu - sizeof (struct totempg_mcast) - 16); qb_list_init (&totempg_groups_list); return (res); } /* Shut down totemsrp, holding totempg_mutex while doing so when threaded mode is enabled */ void totempg_finalize (void) { if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } totemsrp_finalize (totemsrp_context); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } } /* * Multicast a message: pack the non-empty iovecs into the staging buffer, * sending a packet each time it fills. Data that does not fill a packet * stays staged for a later send. * Returns 0 on success, -1 when byte_count_send_ok() reports no room or * totemsrp_mcast() fails. Holds mcast_msg_mutex in threaded mode. */ static int mcast_msg ( struct iovec *iovec_in, unsigned int iov_len, int guarantee) { int res = 0; struct totempg_mcast mcast; struct iovec iovecs[3]; struct iovec iovec[64]; int i; int dest, src; int max_packet_size = 0; int copy_len = 0; int copy_base = 0; int total_size = 0; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&mcast_msg_mutex); } totemsrp_event_signal (totemsrp_context, TOTEM_EVENT_NEW_MSG, 1); /* * Remove zero length iovectors from the list */ assert (iov_len < 64); for (dest = 0, src = 0; src < iov_len; src++) { if (iovec_in[src].iov_len) { memcpy (&iovec[dest++], &iovec_in[src], sizeof (struct iovec)); } } iov_len = dest; max_packet_size = TOTEMPG_PACKET_SIZE - (sizeof (unsigned short) * (mcast_packed_msg_count + 1)); mcast_packed_msg_lens[mcast_packed_msg_count] = 0; /* * Check if we would overwrite new message queue */ for (i = 0; i < iov_len; i++) { total_size += iovec[i].iov_len; } if (byte_count_send_ok (total_size + sizeof(unsigned short) * (mcast_packed_msg_count)) == 0) { if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return(-1); } mcast.header.version = 0; for (i = 0; i < iov_len; ) { mcast.fragmented = 0; mcast.continuation = fragment_continuation; copy_len = iovec[i].iov_len - copy_base; /* * If it all fits with room left over, copy it in. * We need to leave at least sizeof(short) + 1 bytes in the * fragment_buffer on exit so that max_packet_size + fragment_size * doesn't exceed the size of the fragment_buffer on the next call. 
*/ if ((copy_len + fragment_size) < (max_packet_size - sizeof (unsigned short))) { memcpy (&fragmentation_data[fragment_size], (char *)iovec[i].iov_base + copy_base, copy_len); fragment_size += copy_len; mcast_packed_msg_lens[mcast_packed_msg_count] += copy_len; next_fragment = 1; copy_len = 0; copy_base = 0; i++; continue; /* * If it just fits or is too big, then send out what fits. */ } else { unsigned char *data_ptr; copy_len = min(copy_len, max_packet_size - fragment_size); /* A chunk that fills a whole packet is sent straight from the caller's iovec; anything smaller is sent from the staging buffer */ if( copy_len == max_packet_size ) data_ptr = (unsigned char *)iovec[i].iov_base + copy_base; else { data_ptr = fragmentation_data; } /* Single copy into the staging buffer; it runs for both cases above, so no separate copy is needed in the else branch */ memcpy (&fragmentation_data[fragment_size], (unsigned char *)iovec[i].iov_base + copy_base, copy_len); mcast_packed_msg_lens[mcast_packed_msg_count] += copy_len; /* * if we're not on the last iovec or the iovec is too large to * fit, then indicate a fragment. This also means that the next * message will have the continuation of this one. */ if ((i < (iov_len - 1)) || ((copy_base + copy_len) < iovec[i].iov_len)) { /* next_fragment is an unsigned char; skip 0 when it wraps because 0 means "not fragmented" */ if (!next_fragment) { next_fragment++; } fragment_continuation = next_fragment; mcast.fragmented = next_fragment++; assert(fragment_continuation != 0); assert(mcast.fragmented != 0); } else { fragment_continuation = 0; } /* * assemble the message and send it */ mcast.msg_count = ++mcast_packed_msg_count; iovecs[0].iov_base = (void *)&mcast; iovecs[0].iov_len = sizeof(struct totempg_mcast); iovecs[1].iov_base = (void *)mcast_packed_msg_lens; iovecs[1].iov_len = mcast_packed_msg_count * sizeof(unsigned short); iovecs[2].iov_base = (void *)data_ptr; iovecs[2].iov_len = max_packet_size; assert (totemsrp_avail(totemsrp_context) > 0); res = totemsrp_mcast (totemsrp_context, iovecs, 3, guarantee); if (res == -1) { goto error_exit; } /* * Recalculate counts and indexes for the next. 
*/ mcast_packed_msg_lens[0] = 0; mcast_packed_msg_count = 0; fragment_size = 0; max_packet_size = TOTEMPG_PACKET_SIZE - (sizeof(unsigned short)); /* * If the iovec all fit, go to the next iovec */ if ((copy_base + copy_len) == iovec[i].iov_len) { copy_len = 0; copy_base = 0; i++; /* * Continue with the rest of the current iovec. */ } else { copy_base += copy_len; } } } /* * Bump only if we added message data. This may be zero if * the last buffer just fit into the fragmentation_data buffer * and we were at the last iovec. */ if (mcast_packed_msg_lens[mcast_packed_msg_count]) { mcast_packed_msg_count++; } error_exit: if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return (res); } /* * Determine if msg_count more messages could be queued on top of the * slots already reserved; also records the current queue availability * in totempg_stats. */ static int msg_count_send_ok ( int msg_count) { int avail = 0; avail = totemsrp_avail (totemsrp_context); totempg_stats.msg_queue_avail = avail; return ((avail - totempg_reserved) > msg_count); } /* Return non-zero when totemsrp has at least as many free slots as the packets estimated for byte_count bytes (net_mtu minus the mcast header and 16 bytes per packet, plus one) */ static int byte_count_send_ok ( int byte_count) { unsigned int msg_count = 0; int avail = 0; avail = totemsrp_avail (totemsrp_context); msg_count = (byte_count / (totempg_totem_config->net_mtu - sizeof (struct totempg_mcast) - 16)) + 1; return (avail >= msg_count); } /* Reserve queue slots for a message of msg_size bytes and return how many were reserved */ static int send_reserve ( int msg_size) { unsigned int msg_count = 0; msg_count = (msg_size / (totempg_totem_config->net_mtu - sizeof (struct totempg_mcast) - 16)) + 1; totempg_reserved += msg_count; totempg_stats.msg_reserved = totempg_reserved; return (msg_count); } /* Give back msg_count previously reserved queue slots */ static void send_release ( int msg_count) { totempg_reserved -= msg_count; totempg_stats.msg_reserved = totempg_reserved; } #ifndef HAVE_SMALL_MEMORY_FOOTPRINT #undef MESSAGE_QUEUE_MAX #define MESSAGE_QUEUE_MAX ((4 * MESSAGE_SIZE_MAX) / totempg_totem_config->net_mtu) #endif /* HAVE_SMALL_MEMORY_FOOTPRINT */ /* Percentage of MESSAGE_QUEUE_MAX that is currently in use or reserved */ static uint32_t q_level_precent_used(void) { return (100 - (((totemsrp_avail(totemsrp_context) - totempg_reserved) * 100) / MESSAGE_QUEUE_MAX)); } int totempg_callback_token_create 
( void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { unsigned int res; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&callback_token_mutex); } res = totemsrp_callback_token_create (totemsrp_context, handle_out, type, delete, callback_fn, data); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&callback_token_mutex); } return (res); } void totempg_callback_token_destroy ( void *handle_out) { if (totempg_threaded_mode == 1) { pthread_mutex_lock (&callback_token_mutex); } totemsrp_callback_token_destroy (totemsrp_context, handle_out); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&callback_token_mutex); } } /* * vi: set autoindent tabstop=4 shiftwidth=4 : */ int totempg_groups_initialize ( void **totempg_groups_instance, void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)) { struct totempg_group_instance *instance; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } instance = malloc (sizeof (struct totempg_group_instance)); if (instance == NULL) { goto error_exit; } instance->deliver_fn = deliver_fn; instance->confchg_fn = confchg_fn; instance->groups = 0; instance->groups_cnt = 0; instance->q_level = QB_LOOP_MED; qb_list_init (&instance->list); qb_list_add (&instance->list, &totempg_groups_list); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } *totempg_groups_instance = instance; return (0); error_exit: if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (-1); } int totempg_groups_join ( void 
*totempg_groups_instance, const struct totempg_group *groups, size_t group_cnt) { struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance; struct totempg_group *new_groups; unsigned int res = 0; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } new_groups = realloc (instance->groups, sizeof (struct totempg_group) * (instance->groups_cnt + group_cnt)); if (new_groups == 0) { res = ENOMEM; goto error_exit; } memcpy (&new_groups[instance->groups_cnt], groups, group_cnt * sizeof (struct totempg_group)); instance->groups = new_groups; instance->groups_cnt += group_cnt; error_exit: if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (res); } int totempg_groups_leave ( void *totempg_groups_instance, const struct totempg_group *groups, size_t group_cnt) { if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (0); } #define MAX_IOVECS_FROM_APP 32 #define MAX_GROUPS_PER_MSG 32 int totempg_groups_mcast_joined ( void *totempg_groups_instance, const struct iovec *iovec, unsigned int iov_len, int guarantee) { struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance; unsigned short group_len[MAX_GROUPS_PER_MSG + 1]; struct iovec iovec_mcast[MAX_GROUPS_PER_MSG + 1 + MAX_IOVECS_FROM_APP]; int i; unsigned int res; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } /* * Build group_len structure and the iovec_mcast structure */ group_len[0] = instance->groups_cnt; for (i = 0; i < instance->groups_cnt; i++) { group_len[i + 1] = instance->groups[i].group_len; iovec_mcast[i + 1].iov_len = instance->groups[i].group_len; iovec_mcast[i + 1].iov_base = (void *) instance->groups[i].group; } iovec_mcast[0].iov_len = (instance->groups_cnt + 1) * sizeof (unsigned short); iovec_mcast[0].iov_base = group_len; for (i = 0; i < iov_len; i++) { 
iovec_mcast[i + instance->groups_cnt + 1].iov_len = iovec[i].iov_len;
		iovec_mcast[i + instance->groups_cnt + 1].iov_base = iovec[i].iov_base;
	}

	/* one header entry + one entry per group + the caller's entries */
	res = mcast_msg (iovec_mcast, iov_len + instance->groups_cnt + 1, guarantee);

	if (totempg_threaded_mode == 1) {
		pthread_mutex_unlock (&totempg_mutex);
	}

	return (res);
}

/*
 * Re-evaluate the queue level of a group instance from the value of
 * q_level_precent_used() and, if a callback is registered in
 * totem_queue_level_changed and the level changed, invoke it with the
 * new level.
 *
 * Percentages that fall between the tested ranges (30..40, 50..60,
 * 70..74) match no branch and leave the current level untouched.
 */
static void check_q_level(
	void *totempg_groups_instance)
{
	struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;
	int32_t old_level = instance->q_level;
	int32_t percent_used = q_level_precent_used();

	if (percent_used >= 75 && instance->q_level != TOTEM_Q_LEVEL_CRITICAL) {
		instance->q_level = TOTEM_Q_LEVEL_CRITICAL;
	} else if (percent_used < 30 && instance->q_level != TOTEM_Q_LEVEL_LOW) {
		instance->q_level = TOTEM_Q_LEVEL_LOW;
	} else if (percent_used > 40 && percent_used < 50 && instance->q_level != TOTEM_Q_LEVEL_GOOD) {
		instance->q_level = TOTEM_Q_LEVEL_GOOD;
	} else if (percent_used > 60 && percent_used < 70 && instance->q_level != TOTEM_Q_LEVEL_HIGH) {
		instance->q_level = TOTEM_Q_LEVEL_HIGH;
	}
	/* notify only on an actual transition */
	if (totem_queue_level_changed && old_level != instance->q_level) {
		totem_queue_level_changed(instance->q_level);
	}
}

/*
 * Public entry point that forwards to check_q_level for the given
 * group instance.
 */
void totempg_check_q_level(
	void *totempg_groups_instance)
{
	struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;

	check_q_level(instance);
}

/*
 * Try to reserve queue space for a message made of the instance's group
 * names plus iovec[0..iov_len).
 *
 * In threaded mode totempg_mutex is taken first, then mcast_msg_mutex.
 * The size is the sum of all group name lengths and all iovec lengths;
 * a size at or above totempg_size_limit stores -1 in `reserved`
 * (unsigned, returned through the int return type) and skips the
 * reservation.
 */
int totempg_groups_joined_reserve (
	void *totempg_groups_instance,
	const struct iovec *iovec,
	unsigned int iov_len)
{
	struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;
	unsigned int size = 0;
	unsigned int i;
	unsigned int reserved = 0;

	if (totempg_threaded_mode == 1) {
		pthread_mutex_lock (&totempg_mutex);
		pthread_mutex_lock (&mcast_msg_mutex);
	}

	for (i = 0; i < instance->groups_cnt; i++) {
		size += instance->groups[i].group_len;
	}
	for (i = 0; i < iov_len; i++) {
		size += iovec[i].iov_len;
	}

	if (size >= totempg_size_limit) {
		reserved = -1;
		goto error_exit;
	}

	if
(byte_count_send_ok (size)) { reserved = send_reserve (size); } else { reserved = 0; } error_exit: check_q_level(instance); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); pthread_mutex_unlock (&totempg_mutex); } return (reserved); } int totempg_groups_joined_release (int msg_count) { if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); pthread_mutex_lock (&mcast_msg_mutex); } send_release (msg_count); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); pthread_mutex_unlock (&totempg_mutex); } return 0; } int totempg_groups_mcast_groups ( void *totempg_groups_instance, int guarantee, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, unsigned int iov_len) { unsigned short group_len[MAX_GROUPS_PER_MSG + 1]; struct iovec iovec_mcast[MAX_GROUPS_PER_MSG + 1 + MAX_IOVECS_FROM_APP]; int i; unsigned int res; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } /* * Build group_len structure and the iovec_mcast structure */ group_len[0] = groups_cnt; for (i = 0; i < groups_cnt; i++) { group_len[i + 1] = groups[i].group_len; iovec_mcast[i + 1].iov_len = groups[i].group_len; iovec_mcast[i + 1].iov_base = (void *) groups[i].group; } iovec_mcast[0].iov_len = (groups_cnt + 1) * sizeof (unsigned short); iovec_mcast[0].iov_base = group_len; for (i = 0; i < iov_len; i++) { iovec_mcast[i + groups_cnt + 1].iov_len = iovec[i].iov_len; iovec_mcast[i + groups_cnt + 1].iov_base = iovec[i].iov_base; } res = mcast_msg (iovec_mcast, iov_len + groups_cnt + 1, guarantee); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (res); } /* * Returns -1 if error, 0 if can't send, 1 if can send the message */ int totempg_groups_send_ok_groups ( void *totempg_groups_instance, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, unsigned int iov_len) { unsigned int size = 0; unsigned int i; unsigned int res; if 
(totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } for (i = 0; i < groups_cnt; i++) { size += groups[i].group_len; } for (i = 0; i < iov_len; i++) { size += iovec[i].iov_len; } res = msg_count_send_ok (size); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (res); } int totempg_ifaces_get ( unsigned int nodeid, struct totem_ip_address *interfaces, unsigned int interfaces_size, char ***status, unsigned int *iface_count) { int res; res = totemsrp_ifaces_get ( totemsrp_context, nodeid, interfaces, interfaces_size, status, iface_count); return (res); } void totempg_event_signal (enum totem_event_type type, int value) { totemsrp_event_signal (totemsrp_context, type, value); } void* totempg_get_stats (void) { return &totempg_stats; } int totempg_crypto_set ( const char *cipher_type, const char *hash_type) { int res; res = totemsrp_crypto_set (totemsrp_context, cipher_type, hash_type); return (res); } int totempg_ring_reenable (void) { int res; res = totemsrp_ring_reenable (totemsrp_context); return (res); } #define ONE_IFACE_LEN 63 const char *totempg_ifaces_print (unsigned int nodeid) { static char iface_string[256 * INTERFACE_MAX]; char one_iface[ONE_IFACE_LEN+1]; struct totem_ip_address interfaces[INTERFACE_MAX]; unsigned int iface_count; unsigned int i; int res; iface_string[0] = '\0'; res = totempg_ifaces_get (nodeid, interfaces, INTERFACE_MAX, NULL, &iface_count); if (res == -1) { return ("no interface found for nodeid"); } res = totempg_ifaces_get (nodeid, interfaces, INTERFACE_MAX, NULL, &iface_count); for (i = 0; i < iface_count; i++) { snprintf (one_iface, ONE_IFACE_LEN, "r(%d) ip(%s) ", i, totemip_print (&interfaces[i])); strcat (iface_string, one_iface); } return (iface_string); } unsigned int totempg_my_nodeid_get (void) { return (totemsrp_my_nodeid_get(totemsrp_context)); } int totempg_my_family_get (void) { return (totemsrp_my_family_get(totemsrp_context)); } extern void totempg_service_ready_register ( 
void (*totem_service_ready) (void)) { totemsrp_service_ready_register (totemsrp_context, totem_service_ready); } void totempg_queue_level_register_callback (totem_queue_level_changed_fn fn) { totem_queue_level_changed = fn; } extern int totempg_member_add ( const struct totem_ip_address *member, int ring_no) { return totemsrp_member_add (totemsrp_context, member, ring_no); } extern int totempg_member_remove ( const struct totem_ip_address *member, int ring_no) { return totemsrp_member_remove (totemsrp_context, member, ring_no); } void totempg_threaded_mode_enable (void) { totempg_threaded_mode = 1; totemsrp_threaded_mode_enable (totemsrp_context); } void totempg_trans_ack (void) { totemsrp_trans_ack (totemsrp_context); } diff --git a/exec/totemsrp.c b/exec/totemsrp.c index 34a2105d..261325d5 100644 --- a/exec/totemsrp.c +++ b/exec/totemsrp.c @@ -1,4779 +1,4779 @@ /* * Copyright (c) 2003-2006 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * The first version of this code was based upon Yair Amir's PhD thesis: * http://www.cs.jhu.edu/~yairamir/phd.ps) (ch4,5). * * The current version of totemsrp implements the Totem protocol specified in: * http://citeseer.ist.psu.edu/amir95totem.html * * The deviations from the above published protocols are: * - encryption of message contents with nss * - authentication of meessage contents with SHA1/HMAC * - token hold mode where token doesn't rotate on unused ring - reduces cpu * usage on 1.6ghz xeon from 35% to less then .1 % as measured by top */ #include #include #ifdef HAVE_ALLOCA_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemsrp.h" #include "totemnet.h" #include "cs_queue.h" #define LOCALHOST_IP inet_addr("127.0.0.1") #define QUEUE_RTR_ITEMS_SIZE_MAX 16384 /* allow 16384 retransmit items */ #define RETRANS_MESSAGE_QUEUE_SIZE_MAX 16384 /* allow 500 messages to be queued */ #define RECEIVED_MESSAGE_QUEUE_SIZE_MAX 500 /* allow 500 
messages to be queued */ #define MAXIOVS 5 #define RETRANSMIT_ENTRIES_MAX 30 #define TOKEN_SIZE_MAX 64000 /* bytes */ #define LEAVE_DUMMY_NODEID 0 /* * SRP address. * CC: TODO: Can we remove IP address from this and just use nodeids? */ struct srp_addr { uint8_t no_addrs; struct totem_ip_address addr[INTERFACE_MAX]; }; /* * Rollover handling: * SEQNO_START_MSG is the starting sequence number after a new configuration * This should remain zero, unless testing overflow in which case * 0x7ffff000 and 0xfffff000 are good starting values. * * SEQNO_START_TOKEN is the starting sequence number after a new configuration * for a token. This should remain zero, unless testing overflow in which * case 07fffff00 or 0xffffff00 are good starting values. */ #define SEQNO_START_MSG 0x0 #define SEQNO_START_TOKEN 0x0 /* * These can be used ot test different rollover points * #define SEQNO_START_MSG 0xfffffe00 * #define SEQNO_START_TOKEN 0xfffffe00 */ /* * These can be used to test the error recovery algorithms * #define TEST_DROP_ORF_TOKEN_PERCENTAGE 30 * #define TEST_DROP_COMMIT_TOKEN_PERCENTAGE 30 * #define TEST_DROP_MCAST_PERCENTAGE 50 * #define TEST_RECOVERY_MSG_COUNT 300 */ /* * we compare incoming messages to determine if their endian is * different - if so convert them * * do not change */ #define ENDIAN_LOCAL 0xff22 enum message_type { MESSAGE_TYPE_ORF_TOKEN = 0, /* Ordering, Reliability, Flow (ORF) control Token */ MESSAGE_TYPE_MCAST = 1, /* ring ordered multicast message */ MESSAGE_TYPE_MEMB_MERGE_DETECT = 2, /* merge rings if there are available rings */ MESSAGE_TYPE_MEMB_JOIN = 3, /* membership join message */ MESSAGE_TYPE_MEMB_COMMIT_TOKEN = 4, /* membership commit token */ MESSAGE_TYPE_TOKEN_HOLD_CANCEL = 5, /* cancel the holding of the token */ }; enum encapsulation_type { MESSAGE_ENCAPSULATED = 1, MESSAGE_NOT_ENCAPSULATED = 2 }; /* * New membership algorithm local variables */ struct consensus_list_item { struct srp_addr addr; int set; }; struct 
token_callback_instance { struct qb_list_head list; int (*callback_fn) (enum totem_callback_token_type type, const void *); enum totem_callback_token_type callback_type; int delete; void *data; }; struct totemsrp_socket { int mcast; int token; }; struct mcast { struct totem_message_header header; struct srp_addr system_from; unsigned int seq; int this_seqno; struct memb_ring_id ring_id; unsigned int node_id; int guarantee; } __attribute__((packed)); struct rtr_item { struct memb_ring_id ring_id; unsigned int seq; }__attribute__((packed)); struct orf_token { struct totem_message_header header; unsigned int seq; unsigned int token_seq; unsigned int aru; unsigned int aru_addr; struct memb_ring_id ring_id; unsigned int backlog; unsigned int fcc; int retrans_flg; int rtr_list_entries; struct rtr_item rtr_list[0]; }__attribute__((packed)); struct memb_join { struct totem_message_header header; struct srp_addr system_from; unsigned int proc_list_entries; unsigned int failed_list_entries; unsigned long long ring_seq; unsigned char end_of_memb_join[0]; /* * These parts of the data structure are dynamic: * struct srp_addr proc_list[]; * struct srp_addr failed_list[]; */ } __attribute__((packed)); struct memb_merge_detect { struct totem_message_header header; struct srp_addr system_from; struct memb_ring_id ring_id; } __attribute__((packed)); struct token_hold_cancel { struct totem_message_header header; struct memb_ring_id ring_id; } __attribute__((packed)); struct memb_commit_token_memb_entry { struct memb_ring_id ring_id; unsigned int aru; unsigned int high_delivered; unsigned int received_flg; }__attribute__((packed)); struct memb_commit_token { struct totem_message_header header; unsigned int token_seq; struct memb_ring_id ring_id; unsigned int retrans_flg; int memb_index; int addr_entries; unsigned char end_of_commit_token[0]; /* * These parts of the data structure are dynamic: * * struct srp_addr addr[PROCESSOR_COUNT_MAX]; * struct memb_commit_token_memb_entry 
memb_list[PROCESSOR_COUNT_MAX]; */ }__attribute__((packed)); struct message_item { struct mcast *mcast; unsigned int msg_len; }; struct sort_queue_item { struct mcast *mcast; unsigned int msg_len; }; enum memb_state { MEMB_STATE_OPERATIONAL = 1, MEMB_STATE_GATHER = 2, MEMB_STATE_COMMIT = 3, MEMB_STATE_RECOVERY = 4 }; struct totemsrp_instance { int iface_changes; int failed_to_recv; /* * Flow control mcasts and remcasts on last and current orf_token */ int fcc_remcast_last; int fcc_mcast_last; int fcc_remcast_current; struct consensus_list_item consensus_list[PROCESSOR_COUNT_MAX]; int consensus_list_entries; struct srp_addr my_id; struct srp_addr my_proc_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_failed_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_new_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_trans_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_deliver_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX]; unsigned int my_leave_memb_list[PROCESSOR_COUNT_MAX]; int my_proc_list_entries; int my_failed_list_entries; int my_new_memb_entries; int my_trans_memb_entries; int my_memb_entries; int my_deliver_memb_entries; int my_left_memb_entries; int my_leave_memb_entries; struct memb_ring_id my_ring_id; struct memb_ring_id my_old_ring_id; int my_aru_count; int my_merge_detect_timeout_outstanding; unsigned int my_last_aru; int my_seq_unchanged; int my_received_flg; unsigned int my_high_seq_received; unsigned int my_install_seq; int my_rotation_counter; int my_set_retrans_flg; int my_retrans_flg_count; unsigned int my_high_ring_delivered; int heartbeat_timeout; /* * Queues used to order, deliver, and recover messages */ struct cs_queue new_message_queue; struct cs_queue new_message_queue_trans; struct cs_queue retrans_message_queue; struct sq regular_sort_queue; struct sq recovery_sort_queue; /* * Received up to and including */ unsigned int my_aru; unsigned int 
my_high_delivered; struct qb_list_head token_callback_received_listhead; struct qb_list_head token_callback_sent_listhead; char orf_token_retransmit[TOKEN_SIZE_MAX]; int orf_token_retransmit_size; unsigned int my_token_seq; /* * Timers */ qb_loop_timer_handle timer_pause_timeout; qb_loop_timer_handle timer_orf_token_timeout; qb_loop_timer_handle timer_orf_token_retransmit_timeout; qb_loop_timer_handle timer_orf_token_hold_retransmit_timeout; qb_loop_timer_handle timer_merge_detect_timeout; qb_loop_timer_handle memb_timer_state_gather_join_timeout; qb_loop_timer_handle memb_timer_state_gather_consensus_timeout; qb_loop_timer_handle memb_timer_state_commit_timeout; qb_loop_timer_handle timer_heartbeat_timeout; /* * Function and data used to log messages */ int totemsrp_log_level_security; int totemsrp_log_level_error; int totemsrp_log_level_warning; int totemsrp_log_level_notice; int totemsrp_log_level_debug; int totemsrp_log_level_trace; int totemsrp_subsys_id; void (*totemsrp_log_printf) ( int level, int sybsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7)));; enum memb_state memb_state; //TODO struct srp_addr next_memb; qb_loop_t *totemsrp_poll_handle; struct totem_ip_address mcast_address; void (*totemsrp_deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required); void (*totemsrp_confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); void (*totemsrp_service_ready_fn) (void); void (*totemsrp_waiting_trans_ack_cb_fn) ( int waiting_trans_ack); void (*memb_ring_id_create_or_load) ( struct memb_ring_id *memb_ring_id, const struct totem_ip_address *addr); void (*memb_ring_id_store) ( const struct memb_ring_id *memb_ring_id, const struct 
totem_ip_address *addr); int global_seqno; int my_token_held; unsigned long long token_ring_id_seq; unsigned int last_released; unsigned int set_aru; int old_ring_state_saved; int old_ring_state_aru; unsigned int old_ring_state_high_seq_received; unsigned int my_last_seq; struct timeval tv_old; void *totemnet_context; struct totem_config *totem_config; unsigned int use_heartbeat; unsigned int my_trc; unsigned int my_pbl; unsigned int my_cbl; uint64_t pause_timestamp; struct memb_commit_token *commit_token; totemsrp_stats_t stats; uint32_t orf_token_discard; uint32_t originated_orf_token; uint32_t threaded_mode_enabled; uint32_t waiting_trans_ack; int flushing; void * token_recv_event_handle; void * token_sent_event_handle; char commit_token_storage[40000]; }; struct message_handlers { int count; int (*handler_functions[6]) ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); }; enum gather_state_from { TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT = 0, TOTEMSRP_GSFROM_GATHER_MISSING1 = 1, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE = 2, TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED = 3, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE = 4, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE = 5, TOTEMSRP_GSFROM_FAILED_TO_RECEIVE = 6, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE = 7, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE = 8, TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE = 9, TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE = 10, TOTEMSRP_GSFROM_MERGE_DURING_JOIN = 11, TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE = 12, TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE = 13, TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY = 14, TOTEMSRP_GSFROM_INTERFACE_CHANGE = 15, TOTEMSRP_GSFROM_MAX = TOTEMSRP_GSFROM_INTERFACE_CHANGE, }; const char* gather_state_from_desc [] = { [TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT] = "consensus timeout", [TOTEMSRP_GSFROM_GATHER_MISSING1] = "MISSING", 
[TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE] = "The token was lost in the OPERATIONAL state.", [TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED] = "The consensus timeout expired.", [TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE] = "The token was lost in the COMMIT state.", [TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE] = "The token was lost in the RECOVERY state.", [TOTEMSRP_GSFROM_FAILED_TO_RECEIVE] = "failed to receive", [TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE] = "foreign message in operational state", [TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE] = "foreign message in gather state", [TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE] = "merge during operational state", [TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE] = "merge during gather state", [TOTEMSRP_GSFROM_MERGE_DURING_JOIN] = "merge during join", [TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE] = "join during operational state", [TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE] = "join during commit state", [TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY] = "join during recovery", [TOTEMSRP_GSFROM_INTERFACE_CHANGE] = "interface change", }; /* * forward decls */ static int message_handler_orf_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_mcast ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_join ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int 
endian_conversion_needed); static void totemsrp_instance_initialize (struct totemsrp_instance *instance); static void srp_addr_copy (struct srp_addr *dest, const struct srp_addr *src); static void srp_addr_to_nodeid ( unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries); static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b); static void memb_leave_message_send (struct totemsrp_instance *instance); static void token_callbacks_execute (struct totemsrp_instance *instance, enum totem_callback_token_type type); static void memb_state_gather_enter (struct totemsrp_instance *instance, enum gather_state_from gather_from); static void messages_deliver_to_app (struct totemsrp_instance *instance, int skip, unsigned int end_point); static int orf_token_mcast (struct totemsrp_instance *instance, struct orf_token *oken, int fcc_mcasts_allowed); static void messages_free (struct totemsrp_instance *instance, unsigned int token_aru); static void memb_ring_id_set (struct totemsrp_instance *instance, const struct memb_ring_id *ring_id); static void target_set_completed (void *context); static void memb_state_commit_token_update (struct totemsrp_instance *instance); static void memb_state_commit_token_target_set (struct totemsrp_instance *instance); static int memb_state_commit_token_send (struct totemsrp_instance *instance); static int memb_state_commit_token_send_recovery (struct totemsrp_instance *instance, struct memb_commit_token *memb_commit_token); static void memb_state_commit_token_create (struct totemsrp_instance *instance); static int token_hold_cancel_send (struct totemsrp_instance *instance); static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out); static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out); static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out); static void mcast_endian_convert (const 
struct mcast *in, struct mcast *out); static void memb_merge_detect_endian_convert ( const struct memb_merge_detect *in, struct memb_merge_detect *out); static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in); static void timer_function_orf_token_timeout (void *data); static void timer_function_pause_timeout (void *data); static void timer_function_heartbeat_timeout (void *data); static void timer_function_token_retransmit_timeout (void *data); static void timer_function_token_hold_retransmit_timeout (void *data); static void timer_function_merge_detect_timeout (void *data); static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance); static void totemsrp_buffer_release (struct totemsrp_instance *instance, void *ptr); static const char* gsfrom_to_msg(enum gather_state_from gsfrom); void main_deliver_fn ( void *context, const void *msg, unsigned int msg_len); void main_iface_change_fn ( void *context, const struct totem_ip_address *iface_address, unsigned int iface_no); struct message_handlers totemsrp_message_handlers = { 6, { message_handler_orf_token, /* MESSAGE_TYPE_ORF_TOKEN */ message_handler_mcast, /* MESSAGE_TYPE_MCAST */ message_handler_memb_merge_detect, /* MESSAGE_TYPE_MEMB_MERGE_DETECT */ message_handler_memb_join, /* MESSAGE_TYPE_MEMB_JOIN */ message_handler_memb_commit_token, /* MESSAGE_TYPE_MEMB_COMMIT_TOKEN */ message_handler_token_hold_cancel /* MESSAGE_TYPE_TOKEN_HOLD_CANCEL */ } }; #define log_printf(level, format, args...) \ do { \ instance->totemsrp_log_printf ( \ level, instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ format, ##args); \ } while (0); #define LOGSYS_PERROR(err_num, level, fmt, args...) 
\ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemsrp_log_printf ( \ level, instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)\n", ##args, _error_ptr, err_num); \ } while(0) static const char* gsfrom_to_msg(enum gather_state_from gsfrom) { if (gsfrom <= TOTEMSRP_GSFROM_MAX) { return gather_state_from_desc[gsfrom]; } else { return "UNKNOWN"; } } static void totemsrp_instance_initialize (struct totemsrp_instance *instance) { memset (instance, 0, sizeof (struct totemsrp_instance)); qb_list_init (&instance->token_callback_received_listhead); qb_list_init (&instance->token_callback_sent_listhead); instance->my_received_flg = 1; instance->my_token_seq = SEQNO_START_TOKEN - 1; instance->memb_state = MEMB_STATE_OPERATIONAL; instance->set_aru = -1; instance->my_aru = SEQNO_START_MSG; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_high_delivered = SEQNO_START_MSG; instance->orf_token_discard = 0; instance->originated_orf_token = 0; instance->commit_token = (struct memb_commit_token *)instance->commit_token_storage; instance->my_id.no_addrs = INTERFACE_MAX; instance->waiting_trans_ack = 1; } static int pause_flush (struct totemsrp_instance *instance) { uint64_t now_msec; uint64_t timestamp_msec; int res = 0; now_msec = (qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC); timestamp_msec = instance->pause_timestamp / QB_TIME_NS_IN_MSEC; if ((now_msec - timestamp_msec) > (instance->totem_config->token_timeout / 2)) { log_printf (instance->totemsrp_log_level_notice, "Process pause detected for %d ms, flushing membership messages.", (unsigned int)(now_msec - timestamp_msec)); /* * -1 indicates an error from recvmsg */ do { res = totemnet_recv_mcast_empty (instance->totemnet_context); } while (res == -1); } return (res); } static int token_event_stats_collector (enum totem_callback_token_type type, const void *void_instance) { struct 
totemsrp_instance *instance = (struct totemsrp_instance *)void_instance;
	uint32_t time_now;
	unsigned long long nano_secs = qb_util_nano_current_get ();

	/* Timestamps are stored in milliseconds */
	time_now = (nano_secs / QB_TIME_NS_IN_MSEC);

	if (type == TOTEM_CALLBACK_TOKEN_RECEIVED) {
		/* incr latest token the index */
		if (instance->stats.latest_token == (TOTEM_TOKEN_STATS_MAX - 1))
			instance->stats.latest_token = 0;
		else
			instance->stats.latest_token++;

		if (instance->stats.earliest_token == instance->stats.latest_token) {
			/* we have filled up the array, start overwriting */
			if (instance->stats.earliest_token == (TOTEM_TOKEN_STATS_MAX - 1))
				instance->stats.earliest_token = 0;
			else
				instance->stats.earliest_token++;

			/* Clear the slot that has just become the earliest entry */
			instance->stats.token[instance->stats.earliest_token].rx = 0;
			instance->stats.token[instance->stats.earliest_token].tx = 0;
			instance->stats.token[instance->stats.earliest_token].backlog_calc = 0;
		}

		instance->stats.token[instance->stats.latest_token].rx = time_now;
		instance->stats.token[instance->stats.latest_token].tx = 0; /* in case we drop the token */
	} else {
		/* Any other callback type records the transmit time of the latest entry */
		instance->stats.token[instance->stats.latest_token].tx = time_now;
	}
	return 0;
}

/*
 * MTU-change callback handed to totemnet_initialize. Stores the new MTU
 * minus sizeof (struct mcast) in the totem configuration and logs both
 * values.
 */
static void totempg_mtu_changed(void *context, int net_mtu)
{
	struct totemsrp_instance *instance = context;

	instance->totem_config->net_mtu = net_mtu - sizeof (struct mcast);

	log_printf (instance->totemsrp_log_level_debug,
		    "Net MTU changed to %d, new value is %d",
		    net_mtu, instance->totem_config->net_mtu);
}

/*
 * Exported interfaces
 */

/*
 * Allocate and set up a totemsrp instance.
 *
 * On success the new instance is stored in *srp_context and 0 is returned.
 * If the instance cannot be allocated, -1 is returned and *srp_context is
 * left untouched.
 *
 * waiting_trans_ack_cb_fn is invoked with 1 during set-up, before the
 * network layer is initialized, so it must not be NULL.
 */
int totemsrp_initialize (
	qb_loop_t *poll_handle,
	void **srp_context,
	struct totem_config *totem_config,
	totempg_stats_t *stats,

	void (*deliver_fn) (
		unsigned int nodeid,
		const void *msg,
		unsigned int msg_len,
		int endian_conversion_required),

	void (*confchg_fn) (
		enum totem_configuration_type configuration_type,
		const unsigned int *member_list, size_t member_list_entries,
		const unsigned int *left_list, size_t left_list_entries,
		const unsigned int *joined_list, size_t joined_list_entries,
		const struct memb_ring_id *ring_id),
	void (*waiting_trans_ack_cb_fn) (
		int waiting_trans_ack))
{
	struct totemsrp_instance *instance;

	instance = malloc (sizeof (struct totemsrp_instance));
	if (instance == NULL) {
		goto error_exit;
	}

	totemsrp_instance_initialize (instance);

	instance->totemsrp_waiting_trans_ack_cb_fn = waiting_trans_ack_cb_fn;
	instance->totemsrp_waiting_trans_ack_cb_fn (1);

	/* Publish the instance statistics through the caller's stats structure */
	stats->srp = &instance->stats;
	instance->stats.latest_token = 0;
	instance->stats.earliest_token = 0;

	instance->totem_config = totem_config;

	/*
	 * Configure logging
	 */
	instance->totemsrp_log_level_security = totem_config->totem_logging_configuration.log_level_security;
	instance->totemsrp_log_level_error = totem_config->totem_logging_configuration.log_level_error;
	instance->totemsrp_log_level_warning = totem_config->totem_logging_configuration.log_level_warning;
	instance->totemsrp_log_level_notice = totem_config->totem_logging_configuration.log_level_notice;
	instance->totemsrp_log_level_debug = totem_config->totem_logging_configuration.log_level_debug;
	instance->totemsrp_log_level_trace = totem_config->totem_logging_configuration.log_level_trace;
	instance->totemsrp_subsys_id = totem_config->totem_logging_configuration.log_subsys_id;
	instance->totemsrp_log_printf = totem_config->totem_logging_configuration.log_printf;

	/*
	 * Configure totem store and load functions
	 */
	instance->memb_ring_id_create_or_load = totem_config->totem_memb_ring_id_create_or_load;
	instance->memb_ring_id_store = totem_config->totem_memb_ring_id_store;

	/*
	 * Initialize local variables for totemsrp
	 */
	totemip_copy (&instance->mcast_address, &totem_config->interfaces[0].mcast_addr);

	/*
	 * Display totem configuration
	 */
	log_printf (instance->totemsrp_log_level_debug,
		"Token Timeout (%d ms) retransmit timeout (%d ms)",
		totem_config->token_timeout, totem_config->token_retransmit_timeout);
	log_printf (instance->totemsrp_log_level_debug,
		"token hold (%d ms) retransmits before loss (%d retrans)",
		totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
	log_printf (instance->totemsrp_log_level_debug,
		"join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)",
		totem_config->join_timeout,
		totem_config->send_join_timeout,
		totem_config->consensus_timeout,

		totem_config->merge_timeout);
	log_printf (instance->totemsrp_log_level_debug,
		"downcheck (%d ms) fail to recv const (%d msgs)",
		totem_config->downcheck_timeout, totem_config->fail_to_recv_const);
	log_printf (instance->totemsrp_log_level_debug,
		"seqno unchanged const (%d rotations) Maximum network MTU %d", totem_config->seqno_unchanged_const, totem_config->net_mtu);

	log_printf (instance->totemsrp_log_level_debug,
		"window size per rotation (%d messages) maximum messages per rotation (%d messages)",
		totem_config->window_size, totem_config->max_messages);

	log_printf (instance->totemsrp_log_level_debug,
		"missed count const (%d messages)",
		totem_config->miss_count_const);

	log_printf (instance->totemsrp_log_level_debug,
		"send threads (%d threads)", totem_config->threads);
	log_printf (instance->totemsrp_log_level_debug,
		"heartbeat_failures_allowed (%d)", totem_config->heartbeat_failures_allowed);
	log_printf (instance->totemsrp_log_level_debug,
		"max_network_delay (%d ms)", totem_config->max_network_delay);


	cs_queue_init (&instance->retrans_message_queue, RETRANS_MESSAGE_QUEUE_SIZE_MAX,
		sizeof (struct message_item), instance->threaded_mode_enabled);

	sq_init (&instance->regular_sort_queue,
		QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0);

	sq_init (&instance->recovery_sort_queue,
		QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0);

	instance->totemsrp_poll_handle = poll_handle;

	instance->totemsrp_deliver_fn = deliver_fn;

	instance->totemsrp_confchg_fn = confchg_fn;

	instance->use_heartbeat = 1;

	/* Records the current time as pause_timestamp and arms the pause timer */
	timer_function_pause_timeout (instance);

	if ( totem_config->heartbeat_failures_allowed == 0 ) {
		log_printf (instance->totemsrp_log_level_debug,
			"HeartBeat is Disabled. To enable set heartbeat_failures_allowed > 0");
		instance->use_heartbeat = 0;
	}

	if (instance->use_heartbeat) {
		instance->heartbeat_timeout
			= (totem_config->heartbeat_failures_allowed) * totem_config->token_retransmit_timeout
				+ totem_config->max_network_delay;

		/* A heartbeat timeout that is not below the token timeout disables the heartbeat */
		if (instance->heartbeat_timeout >= totem_config->token_timeout) {
			log_printf (instance->totemsrp_log_level_debug,
				"total heartbeat_timeout (%d ms) is not less than token timeout (%d ms)",
				instance->heartbeat_timeout,
				totem_config->token_timeout);
			log_printf (instance->totemsrp_log_level_debug,
				"heartbeat_timeout = heartbeat_failures_allowed * token_retransmit_timeout + max_network_delay");
			log_printf (instance->totemsrp_log_level_debug,
				"heartbeat timeout should be less than the token timeout. Heartbeat is disabled!!");
			instance->use_heartbeat = 0;
		}
		else {
			log_printf (instance->totemsrp_log_level_debug,
				"total heartbeat_timeout (%d ms)", instance->heartbeat_timeout);
		}
	}

	/* NOTE(review): any result of totemnet_initialize is not examined here - confirm whether it can fail */
	totemnet_initialize (
		poll_handle,
		&instance->totemnet_context,
		totem_config,
		stats->srp,
		instance,
		main_deliver_fn,
		main_iface_change_fn,
		totempg_mtu_changed,
		target_set_completed);

	/*
	 * Must have net_mtu adjusted by totemnet_initialize first
	 */
	cs_queue_init (&instance->new_message_queue,
		MESSAGE_QUEUE_MAX,
		sizeof (struct message_item), instance->threaded_mode_enabled);

	cs_queue_init (&instance->new_message_queue_trans,
		MESSAGE_QUEUE_MAX,
		sizeof (struct message_item), instance->threaded_mode_enabled);

	/* Register the statistics collector for both token-received and token-sent events */
	totemsrp_callback_token_create (instance,
		&instance->token_recv_event_handle,
		TOTEM_CALLBACK_TOKEN_RECEIVED,
		0,
		token_event_stats_collector,
		instance);
	totemsrp_callback_token_create (instance,
		&instance->token_sent_event_handle,
		TOTEM_CALLBACK_TOKEN_SENT,
		0,
		token_event_stats_collector,
		instance);
	*srp_context = instance;
	return (0);

error_exit:
	return (-1);
}

/*
 * Send a leave message, shut down the network layer and release the
 * queues and the instance itself.
 */
void totemsrp_finalize (
	void *srp_context)
{
	struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context;


	memb_leave_message_send (instance);
	totemnet_finalize
(instance->totemnet_context); cs_queue_free (&instance->new_message_queue); cs_queue_free (&instance->new_message_queue_trans); cs_queue_free (&instance->retrans_message_queue); sq_free (&instance->regular_sort_queue); sq_free (&instance->recovery_sort_queue); free (instance); } /* * Return configured interfaces. interfaces is array of totem_ip addresses allocated by caller, * with interaces_size number of items. iface_count is final number of interfaces filled by this * function. * * Function returns 0 on success, otherwise if interfaces array is not big enough, -2 is returned, * and if interface was not found, -1 is returned. */ int totemsrp_ifaces_get ( void *srp_context, unsigned int nodeid, struct totem_ip_address *interfaces, unsigned int interfaces_size, char ***status, unsigned int *iface_count) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res = 0; unsigned int found = 0; unsigned int i; for (i = 0; i < instance->my_memb_entries; i++) { if (instance->my_memb_list[i].addr[0].nodeid == nodeid) { found = 1; break; } } if (found) { *iface_count = instance->totem_config->interface_count; if (interfaces_size >= *iface_count) { memcpy (interfaces, instance->my_memb_list[i].addr, sizeof (struct totem_ip_address) * *iface_count); } else { res = -2; } goto finish; } for (i = 0; i < instance->my_left_memb_entries; i++) { if (instance->my_left_memb_list[i].addr[0].nodeid == nodeid) { found = 1; break; } } if (found) { *iface_count = instance->totem_config->interface_count; if (interfaces_size >= *iface_count) { memcpy (interfaces, instance->my_left_memb_list[i].addr, sizeof (struct totem_ip_address) * *iface_count); } else { res = -2; } } else { res = -1; } finish: totemnet_ifaces_get(instance->totemnet_context, status, iface_count); return (res); } int totemsrp_crypto_set ( void *srp_context, const char *cipher_type, const char *hash_type) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res; 
res = totemnet_crypto_set(instance->totemnet_context, cipher_type, hash_type); return (res); } unsigned int totemsrp_my_nodeid_get ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; unsigned int res; res = instance->totem_config->interfaces[0].boundto.nodeid; return (res); } int totemsrp_my_family_get ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res; res = instance->totem_config->interfaces[0].boundto.family; return (res); } int totemsrp_ring_reenable ( void *srp_context) { return (0); } /* * Set operations for use by the membership algorithm */ static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b) { unsigned int i; unsigned int res; for (i = 0; i < 1; i++) { res = totemip_equal (&a->addr[i], &b->addr[i]); if (res == 0) { return (0); } } return (1); } static void srp_addr_copy (struct srp_addr *dest, const struct srp_addr *src) { unsigned int i; dest->no_addrs = src->no_addrs; for (i = 0; i < INTERFACE_MAX; i++) { totemip_copy (&dest->addr[i], &src->addr[i]); } } static void srp_addr_to_nodeid ( unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries) { unsigned int i; for (i = 0; i < entries; i++) { nodeid_out[i] = srp_addr_in[i].addr[0].nodeid; } } static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in) { int i; for (i = 0; i < INTERFACE_MAX; i++) { totemip_copy_endian_convert (&out->addr[i], &in->addr[i]); } } static void memb_consensus_reset (struct totemsrp_instance *instance) { instance->consensus_list_entries = 0; } static void memb_set_subtract ( struct srp_addr *out_list, int *out_list_entries, struct srp_addr *one_list, int one_list_entries, struct srp_addr *two_list, int two_list_entries) { int found = 0; int i; int j; *out_list_entries = 0; for (i = 0; i < one_list_entries; i++) { for (j = 0; j < two_list_entries; j++) { if (srp_addr_equal (&one_list[i], 
&two_list[j])) { found = 1; break; } } if (found == 0) { srp_addr_copy (&out_list[*out_list_entries], &one_list[i]); *out_list_entries = *out_list_entries + 1; } found = 0; } } /* * Set consensus for a specific processor */ static void memb_consensus_set ( struct totemsrp_instance *instance, const struct srp_addr *addr) { int found = 0; int i; if (addr->addr[0].nodeid == LEAVE_DUMMY_NODEID) return; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal(addr, &instance->consensus_list[i].addr)) { found = 1; break; /* found entry */ } } srp_addr_copy (&instance->consensus_list[i].addr, addr); instance->consensus_list[i].set = 1; if (found == 0) { instance->consensus_list_entries++; } return; } /* * Is consensus set for a specific processor */ static int memb_consensus_isset ( struct totemsrp_instance *instance, const struct srp_addr *addr) { int i; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal (addr, &instance->consensus_list[i].addr)) { return (instance->consensus_list[i].set); } } return (0); } /* * Is consensus agreed upon based upon consensus database */ static int memb_consensus_agreed ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; int token_memb_entries = 0; int agreed = 1; int i; memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); for (i = 0; i < token_memb_entries; i++) { if (memb_consensus_isset (instance, &token_memb[i]) == 0) { agreed = 0; break; } } if (agreed && instance->failed_to_recv == 1) { /* * Both nodes agreed on our failure. We don't care how many proc list items left because we * will create single ring anyway. 
*/ return (agreed); } assert (token_memb_entries >= 1); return (agreed); } static void memb_consensus_notset ( struct totemsrp_instance *instance, struct srp_addr *no_consensus_list, int *no_consensus_list_entries, struct srp_addr *comparison_list, int comparison_list_entries) { int i; *no_consensus_list_entries = 0; for (i = 0; i < instance->my_proc_list_entries; i++) { if (memb_consensus_isset (instance, &instance->my_proc_list[i]) == 0) { srp_addr_copy (&no_consensus_list[*no_consensus_list_entries], &instance->my_proc_list[i]); *no_consensus_list_entries = *no_consensus_list_entries + 1; } } } /* * Is set1 equal to set2 Entries can be in different orders */ static int memb_set_equal ( struct srp_addr *set1, int set1_entries, struct srp_addr *set2, int set2_entries) { int i; int j; int found = 0; if (set1_entries != set2_entries) { return (0); } for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { found = 1; break; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * Is subset fully contained in fullset */ static int memb_set_subset ( const struct srp_addr *subset, int subset_entries, const struct srp_addr *fullset, int fullset_entries) { int i; int j; int found = 0; if (subset_entries > fullset_entries) { return (0); } for (i = 0; i < subset_entries; i++) { for (j = 0; j < fullset_entries; j++) { if (srp_addr_equal (&subset[i], &fullset[j])) { found = 1; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * merge subset into fullset taking care not to add duplicates */ static void memb_set_merge ( const struct srp_addr *subset, int subset_entries, struct srp_addr *fullset, int *fullset_entries) { int found = 0; int i; int j; for (i = 0; i < subset_entries; i++) { for (j = 0; j < *fullset_entries; j++) { if (srp_addr_equal (&fullset[j], &subset[i])) { found = 1; break; } } if (found == 0) { srp_addr_copy (&fullset[*fullset_entries], &subset[i]); *fullset_entries = 
*fullset_entries + 1; } found = 0; } return; } static void memb_set_and_with_ring_id ( struct srp_addr *set1, struct memb_ring_id *set1_ring_ids, int set1_entries, struct srp_addr *set2, int set2_entries, struct memb_ring_id *old_ring_id, struct srp_addr *and, int *and_entries) { int i; int j; int found = 0; *and_entries = 0; for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { if (memcmp (&set1_ring_ids[j], old_ring_id, sizeof (struct memb_ring_id)) == 0) { found = 1; } break; } } if (found) { srp_addr_copy (&and[*and_entries], &set1[j]); *and_entries = *and_entries + 1; } found = 0; } return; } #ifdef CODE_COVERAGE static void memb_set_print ( char *string, struct srp_addr *list, int list_entries) { int i; int j; printf ("List '%s' contains %d entries:\n", string, list_entries); for (i = 0; i < list_entries; i++) { printf ("Address %d with %d rings\n", i, list[i].no_addrs); for (j = 0; j < list[i].no_addrs; j++) { printf ("\tiface %d %s\n", j, totemip_print (&list[i].addr[j])); printf ("\tfamily %d\n", list[i].addr[j].family); } } } #endif static void my_leave_memb_clear( struct totemsrp_instance *instance) { memset(instance->my_leave_memb_list, 0, sizeof(instance->my_leave_memb_list)); instance->my_leave_memb_entries = 0; } static unsigned int my_leave_memb_match( struct totemsrp_instance *instance, unsigned int nodeid) { int i; unsigned int ret = 0; for (i = 0; i < instance->my_leave_memb_entries; i++){ if (instance->my_leave_memb_list[i] == nodeid){ ret = nodeid; break; } } return ret; } static void my_leave_memb_set( struct totemsrp_instance *instance, unsigned int nodeid) { int i, found = 0; for (i = 0; i < instance->my_leave_memb_entries; i++){ if (instance->my_leave_memb_list[i] == nodeid){ found = 1; break; } } if (found == 1) { return; } if (instance->my_leave_memb_entries < (PROCESSOR_COUNT_MAX - 1)) { instance->my_leave_memb_list[instance->my_leave_memb_entries] = nodeid; 
instance->my_leave_memb_entries++; } else { log_printf (instance->totemsrp_log_level_warning, "Cannot set LEAVE nodeid=%d", nodeid); } } static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance) { assert (instance != NULL); return totemnet_buffer_alloc (instance->totemnet_context); } static void totemsrp_buffer_release (struct totemsrp_instance *instance, void *ptr) { assert (instance != NULL); totemnet_buffer_release (instance->totemnet_context, ptr); } static void reset_token_retransmit_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_retransmit_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_token_retransmit_timeout, &instance->timer_orf_token_retransmit_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_token_retransmit_timeout - qb_loop_timer_add error : %d", res); } } static void start_merge_detect_timeout (struct totemsrp_instance *instance) { int32_t res; if (instance->my_merge_detect_timeout_outstanding == 0) { res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->merge_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "start_merge_detect_timeout - qb_loop_timer_add error : %d", res); } instance->my_merge_detect_timeout_outstanding = 1; } } static void cancel_merge_detect_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_merge_detect_timeout); instance->my_merge_detect_timeout_outstanding = 0; } /* * ring_state_* is used to save and restore the sort queue * state when a recovery operation fails (and enters gather) */ static void old_ring_state_save (struct 
totemsrp_instance *instance) { if (instance->old_ring_state_saved == 0) { instance->old_ring_state_saved = 1; memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); instance->old_ring_state_aru = instance->my_aru; instance->old_ring_state_high_seq_received = instance->my_high_seq_received; log_printf (instance->totemsrp_log_level_debug, "Saving state aru %x high seq received %x", instance->my_aru, instance->my_high_seq_received); } } static void old_ring_state_restore (struct totemsrp_instance *instance) { instance->my_aru = instance->old_ring_state_aru; instance->my_high_seq_received = instance->old_ring_state_high_seq_received; log_printf (instance->totemsrp_log_level_debug, "Restoring instance->my_aru %x my high seq received %x", instance->my_aru, instance->my_high_seq_received); } static void old_ring_state_reset (struct totemsrp_instance *instance) { log_printf (instance->totemsrp_log_level_debug, "Resetting old ring state"); instance->old_ring_state_saved = 0; } static void reset_pause_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_pause_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_timeout * QB_TIME_NS_IN_MSEC / 5, (void *)instance, timer_function_pause_timeout, &instance->timer_pause_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_pause_timeout - qb_loop_timer_add error : %d", res); } } static void reset_token_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_orf_token_timeout, &instance->timer_orf_token_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_token_timeout - 
qb_loop_timer_add error : %d", res); } } static void reset_heartbeat_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->heartbeat_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_heartbeat_timeout, &instance->timer_heartbeat_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_heartbeat_timeout - qb_loop_timer_add error : %d", res); } } static void cancel_token_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); } static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); } static void cancel_token_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); } static void start_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { int32_t res; res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_hold_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_token_hold_retransmit_timeout, &instance->timer_orf_token_hold_retransmit_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "start_token_hold_retransmit_timeout - qb_loop_timer_add error : %d", res); } } static void cancel_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_hold_retransmit_timeout); } static void memb_state_consensus_timeout_expired ( struct totemsrp_instance *instance) { struct srp_addr no_consensus_list[PROCESSOR_COUNT_MAX]; int no_consensus_list_entries; instance->stats.consensus_timeouts++; if (memb_consensus_agreed 
(instance)) { memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); reset_token_timeout (instance); // REVIEWED } else { memb_consensus_notset ( instance, no_consensus_list, &no_consensus_list_entries, instance->my_proc_list, instance->my_proc_list_entries); memb_set_merge (no_consensus_list, no_consensus_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT); } } static void memb_join_message_send (struct totemsrp_instance *instance); static void memb_merge_detect_transmit (struct totemsrp_instance *instance); /* * Timers used for various states of the membership algorithm */ static void timer_function_pause_timeout (void *data) { struct totemsrp_instance *instance = data; instance->pause_timestamp = qb_util_nano_current_get (); reset_pause_timeout (instance); } static void memb_recovery_state_token_loss (struct totemsrp_instance *instance) { old_ring_state_restore (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE); instance->stats.recovery_token_lost++; } static void timer_function_orf_token_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the OPERATIONAL state."); log_printf (instance->totemsrp_log_level_notice, "A processor failed, forming new configuration."); totemnet_iface_check (instance->totemnet_context); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE); instance->stats.operational_token_lost++; break; case MEMB_STATE_GATHER: log_printf (instance->totemsrp_log_level_debug, "The consensus timeout expired."); memb_state_consensus_timeout_expired (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED); instance->stats.gather_token_lost++; break; case 
MEMB_STATE_COMMIT:
			log_printf (instance->totemsrp_log_level_debug,
				"The token was lost in the COMMIT state.");
			memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE);
			instance->stats.commit_token_lost++;
			break;

		case MEMB_STATE_RECOVERY:
			log_printf (instance->totemsrp_log_level_debug,
				"The token was lost in the RECOVERY state.");
			memb_recovery_state_token_loss (instance);
			/* Set only on this path, after gather has been entered */
			instance->orf_token_discard = 1;
			break;
	}
}

/*
 * Heartbeat timer callback: log the expiry and handle it exactly like a
 * token loss.
 */
static void timer_function_heartbeat_timeout (void *data)
{
	struct totemsrp_instance *instance = data;
	log_printf (instance->totemsrp_log_level_debug,
		"HeartBeat Timer expired Invoking token loss mechanism in state %d ", instance->memb_state);
	timer_function_orf_token_timeout(data);
}

/*
 * Join timer callback. In the GATHER and COMMIT states it sends a join
 * message and re-arms itself for join_timeout milliseconds. It must not
 * fire in the OPERATIONAL or RECOVERY states (asserted).
 */
static void memb_timer_function_state_gather (void *data)
{
	struct totemsrp_instance *instance = data;
	int32_t res;

	switch (instance->memb_state) {
	case MEMB_STATE_OPERATIONAL:
	case MEMB_STATE_RECOVERY:
		assert (0); /* this should never happen */
		break;
	case MEMB_STATE_GATHER:
	case MEMB_STATE_COMMIT:
		memb_join_message_send (instance);

		/*
		 * Restart the join timeout
		 */
		qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout);

		res = qb_loop_timer_add (instance->totemsrp_poll_handle,
			QB_LOOP_MED,
			instance->totem_config->join_timeout*QB_TIME_NS_IN_MSEC,
			(void *)instance,
			memb_timer_function_state_gather,
			&instance->memb_timer_state_gather_join_timeout);
		if (res != 0) {
			log_printf(instance->totemsrp_log_level_error, "memb_timer_function_state_gather - qb_loop_timer_add error : %d", res);
		}
		break;
	}
}

/*
 * Consensus timer callback; forwards to
 * memb_state_consensus_timeout_expired.
 */
static void memb_timer_function_gather_consensus_timeout (void *data)
{
	struct totemsrp_instance *instance = data;
	memb_state_consensus_timeout_expired (instance);
}

/*
 * Copy messages held in the recovery sort queue into the regular sort
 * queue when they belong to the old ring.
 */
static void deliver_messages_from_recovery_to_regular (struct totemsrp_instance *instance)
{
	unsigned int i;
	struct sort_queue_item *recovery_message_item;
	struct sort_queue_item regular_message_item;
	unsigned int range = 0;
	int res;
	void *ptr;
struct mcast *mcast; log_printf (instance->totemsrp_log_level_debug, "recovery to regular %x-%x", SEQNO_START_MSG + 1, instance->my_aru); range = instance->my_aru - SEQNO_START_MSG; /* * Move messages from recovery to regular sort queue */ // todo should i be initialized to 0 or 1 ? for (i = 1; i <= range; i++) { res = sq_item_get (&instance->recovery_sort_queue, i + SEQNO_START_MSG, &ptr); if (res != 0) { continue; } recovery_message_item = ptr; /* * Convert recovery message into regular message */ mcast = recovery_message_item->mcast; if (mcast->header.encapsulated == MESSAGE_ENCAPSULATED) { /* * Message is a recovery message encapsulated * in a new ring message */ regular_message_item.mcast = (struct mcast *)(((char *)recovery_message_item->mcast) + sizeof (struct mcast)); regular_message_item.msg_len = recovery_message_item->msg_len - sizeof (struct mcast); mcast = regular_message_item.mcast; } else { /* * TODO this case shouldn't happen */ continue; } log_printf (instance->totemsrp_log_level_debug, "comparing if ring id is for this processors old ring seqno %d", mcast->seq); /* * Only add this message to the regular sort * queue if it was originated with the same ring * id as the previous ring */ if (memcmp (&instance->my_old_ring_id, &mcast->ring_id, sizeof (struct memb_ring_id)) == 0) { res = sq_item_inuse (&instance->regular_sort_queue, mcast->seq); if (res == 0) { sq_item_add (&instance->regular_sort_queue, ®ular_message_item, mcast->seq); if (sq_lt_compare (instance->old_ring_state_high_seq_received, mcast->seq)) { instance->old_ring_state_high_seq_received = mcast->seq; } } } else { log_printf (instance->totemsrp_log_level_debug, "-not adding msg with seq no %x", mcast->seq); } } } /* * Change states in the state machine of the membership algorithm */ static void memb_state_operational_enter (struct totemsrp_instance *instance) { struct srp_addr joined_list[PROCESSOR_COUNT_MAX]; int joined_list_entries = 0; unsigned int aru_save; unsigned int 
joined_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int trans_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int new_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int left_list[PROCESSOR_COUNT_MAX]; unsigned int i; unsigned int res; char left_node_msg[1024]; char joined_node_msg[1024]; char failed_node_msg[1024]; instance->originated_orf_token = 0; memb_consensus_reset (instance); old_ring_state_reset (instance); deliver_messages_from_recovery_to_regular (instance); log_printf (instance->totemsrp_log_level_trace, "Delivering to app %x to %x", instance->my_high_delivered + 1, instance->old_ring_state_high_seq_received); aru_save = instance->my_aru; instance->my_aru = instance->old_ring_state_aru; messages_deliver_to_app (instance, 0, instance->old_ring_state_high_seq_received); /* * Calculate joined and left list */ memb_set_subtract (instance->my_left_memb_list, &instance->my_left_memb_entries, instance->my_memb_list, instance->my_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); memb_set_subtract (joined_list, &joined_list_entries, instance->my_new_memb_list, instance->my_new_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); /* * Install new membership */ instance->my_memb_entries = instance->my_new_memb_entries; memcpy (&instance->my_memb_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->last_released = 0; instance->my_set_retrans_flg = 0; /* * Deliver transitional configuration to application */ srp_addr_to_nodeid (left_list, instance->my_left_memb_list, instance->my_left_memb_entries); srp_addr_to_nodeid (trans_memb_list_totemip, instance->my_trans_memb_list, instance->my_trans_memb_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_TRANSITIONAL, trans_memb_list_totemip, instance->my_trans_memb_entries, left_list, instance->my_left_memb_entries, 0, 0, &instance->my_ring_id); instance->waiting_trans_ack = 1; 
instance->totemsrp_waiting_trans_ack_cb_fn (1); // TODO we need to filter to ensure we only deliver those // messages which are part of instance->my_deliver_memb messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received); instance->my_aru = aru_save; /* * Deliver regular configuration to application */ srp_addr_to_nodeid (new_memb_list_totemip, instance->my_new_memb_list, instance->my_new_memb_entries); srp_addr_to_nodeid (joined_list_totemip, joined_list, joined_list_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_REGULAR, new_memb_list_totemip, instance->my_new_memb_entries, 0, 0, joined_list_totemip, joined_list_entries, &instance->my_ring_id); /* * The recovery sort queue now becomes the regular * sort queue. It is necessary to copy the state * into the regular sort queue. */ sq_copy (&instance->regular_sort_queue, &instance->recovery_sort_queue); instance->my_last_aru = SEQNO_START_MSG; /* When making my_proc_list smaller, ensure that the * now non-used entries are zero-ed out. There are some suspect * assert's that assume that there is always 2 entries in the list. * These fail when my_proc_list is reduced to 1 entry (and the * valid [0] entry is the same as the 'unused' [1] entry). */ memset(instance->my_proc_list, 0, sizeof (struct srp_addr) * instance->my_proc_list_entries); instance->my_proc_list_entries = instance->my_new_memb_entries; memcpy (instance->my_proc_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->my_failed_list_entries = 0; /* * TODO Not exactly to spec * * At the entry to this function all messages without a gap are * deliered. 
* * This code throw away messages from the last gap in the sort queue * to my_high_seq_received * * What should really happen is we should deliver all messages up to * a gap, then delier the transitional configuration, then deliver * the messages between the first gap and my_high_seq_received, then * deliver a regular configuration, then deliver the regular * configuration * * Unfortunately totempg doesn't appear to like this operating mode * which needs more inspection */ i = instance->my_high_seq_received + 1; do { void *ptr; i -= 1; res = sq_item_get (&instance->regular_sort_queue, i, &ptr); if (i == 0) { break; } } while (res); instance->my_high_delivered = i; for (i = 0; i <= instance->my_high_delivered; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, i, &ptr); if (res == 0) { struct sort_queue_item *regular_message; regular_message = ptr; free (regular_message->mcast); } } sq_items_release (&instance->regular_sort_queue, instance->my_high_delivered); instance->last_released = instance->my_high_delivered; if (joined_list_entries) { int sptr = 0; sptr += snprintf(joined_node_msg, sizeof(joined_node_msg)-sptr, " joined:"); for (i=0; i< joined_list_entries; i++) { sptr += snprintf(joined_node_msg+sptr, sizeof(joined_node_msg)-sptr, " %u", joined_list_totemip[i]); } } else { joined_node_msg[0] = '\0'; } if (instance->my_left_memb_entries) { int sptr = 0; int sptr2 = 0; sptr += snprintf(left_node_msg, sizeof(left_node_msg)-sptr, " left:"); for (i=0; i< instance->my_left_memb_entries; i++) { sptr += snprintf(left_node_msg+sptr, sizeof(left_node_msg)-sptr, " %u", left_list[i]); } for (i=0; i< instance->my_left_memb_entries; i++) { if (my_leave_memb_match(instance, left_list[i]) == 0) { if (sptr2 == 0) { sptr2 += snprintf(failed_node_msg, sizeof(failed_node_msg)-sptr2, " failed:"); } sptr2 += snprintf(failed_node_msg+sptr2, sizeof(left_node_msg)-sptr2, " %u", left_list[i]); } } if (sptr2 == 0) { failed_node_msg[0] = '\0'; } } else { 
left_node_msg[0] = '\0'; failed_node_msg[0] = '\0'; } my_leave_memb_clear(instance); log_printf (instance->totemsrp_log_level_debug, "entering OPERATIONAL state."); log_printf (instance->totemsrp_log_level_notice, "A new membership (%s:%lld) was formed. Members%s%s", totemip_print (&instance->my_ring_id.rep), instance->my_ring_id.seq, joined_node_msg, left_node_msg); if (strlen(failed_node_msg)) { log_printf (instance->totemsrp_log_level_notice, "Failed to receive the leave message.%s", failed_node_msg); } instance->memb_state = MEMB_STATE_OPERATIONAL; instance->stats.operational_entered++; instance->stats.continuous_gather = 0; instance->my_received_flg = 1; reset_pause_timeout (instance); /* * Save ring id information from this configuration to determine * which processors are transitioning from old regular configuration * in to new regular configuration on the next configuration change */ memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); return; } static void memb_state_gather_enter ( struct totemsrp_instance *instance, enum gather_state_from gather_from) { int32_t res; instance->orf_token_discard = 1; instance->originated_orf_token = 0; memb_set_merge ( &instance->my_id, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_join_message_send (instance); /* * Restart the join timeout */ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->join_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "memb_state_gather_enter - qb_loop_timer_add error(1) : %d", res); } /* * Restart the consensus timeout */ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); res = qb_loop_timer_add 
(instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->consensus_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_gather_consensus_timeout, &instance->memb_timer_state_gather_consensus_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "memb_state_gather_enter - qb_loop_timer_add error(2) : %d", res); } /* * Cancel the token loss and token retransmission timeouts */ cancel_token_retransmit_timeout (instance); // REVIEWED cancel_token_timeout (instance); // REVIEWED cancel_merge_detect_timeout (instance); memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); log_printf (instance->totemsrp_log_level_debug, "entering GATHER state from %d(%s).", gather_from, gsfrom_to_msg(gather_from)); instance->memb_state = MEMB_STATE_GATHER; instance->stats.gather_entered++; if (gather_from == TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED) { /* * State 3 means gather, so we are continuously gathering. */ instance->stats.continuous_gather++; } return; } static void timer_function_token_retransmit_timeout (void *data); static void target_set_completed ( void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; memb_state_commit_token_send (instance); } static void memb_state_commit_enter ( struct totemsrp_instance *instance) { old_ring_state_save (instance); memb_state_commit_token_update (instance); memb_state_commit_token_target_set (instance); qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); instance->memb_timer_state_gather_join_timeout = 0; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); instance->memb_timer_state_gather_consensus_timeout = 0; memb_ring_id_set (instance, &instance->commit_token->ring_id); instance->memb_ring_id_store (&instance->my_ring_id, &instance->my_id.addr[0]); instance->token_ring_id_seq = instance->my_ring_id.seq; log_printf 
(instance->totemsrp_log_level_debug, "entering COMMIT state."); instance->memb_state = MEMB_STATE_COMMIT; reset_token_retransmit_timeout (instance); // REVIEWED reset_token_timeout (instance); // REVIEWED instance->stats.commit_entered++; instance->stats.continuous_gather = 0; /* * reset all flow control variables since we are starting a new ring */ instance->my_trc = 0; instance->my_pbl = 0; instance->my_cbl = 0; /* * commit token sent after callback that token target has been set */ } static void memb_state_recovery_enter ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { int i; int local_received_flg = 1; unsigned int low_ring_aru; unsigned int range = 0; unsigned int messages_originated = 0; const struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; struct memb_ring_id my_new_memb_ring_id_list[PROCESSOR_COUNT_MAX]; addr = (const struct srp_addr *)commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries); log_printf (instance->totemsrp_log_level_debug, "entering RECOVERY state."); instance->orf_token_discard = 0; instance->my_high_ring_delivered = 0; sq_reinit (&instance->recovery_sort_queue, SEQNO_START_MSG); cs_queue_reinit (&instance->retrans_message_queue); low_ring_aru = instance->old_ring_state_high_seq_received; memb_state_commit_token_send_recovery (instance, commit_token); instance->my_token_seq = SEQNO_START_TOKEN - 1; /* * Build regular configuration */ totemnet_processor_count_set ( instance->totemnet_context, commit_token->addr_entries); /* * Build transitional configuration */ for (i = 0; i < instance->my_new_memb_entries; i++) { memcpy (&my_new_memb_ring_id_list[i], &memb_list[i].ring_id, sizeof (struct memb_ring_id)); } memb_set_and_with_ring_id ( instance->my_new_memb_list, my_new_memb_ring_id_list, instance->my_new_memb_entries, instance->my_memb_list, instance->my_memb_entries, &instance->my_old_ring_id, instance->my_trans_memb_list, 
&instance->my_trans_memb_entries); for (i = 0; i < instance->my_trans_memb_entries; i++) { log_printf (instance->totemsrp_log_level_debug, "TRANS [%d] member %s:", i, totemip_print (&instance->my_trans_memb_list[i].addr[0])); } for (i = 0; i < instance->my_new_memb_entries; i++) { log_printf (instance->totemsrp_log_level_debug, "position [%d] member %s:", i, totemip_print (&addr[i].addr[0])); log_printf (instance->totemsrp_log_level_debug, "previous ring seq %llx rep %s", memb_list[i].ring_id.seq, totemip_print (&memb_list[i].ring_id.rep)); log_printf (instance->totemsrp_log_level_debug, "aru %x high delivered %x received flag %d", memb_list[i].aru, memb_list[i].high_delivered, memb_list[i].received_flg); // assert (totemip_print (&memb_list[i].ring_id.rep) != 0); } /* * Determine if any received flag is false */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_trans_memb_list, instance->my_trans_memb_entries) && memb_list[i].received_flg == 0) { instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct srp_addr) * instance->my_trans_memb_entries); local_received_flg = 0; break; } } if (local_received_flg == 1) { goto no_originate; } /* Else originate messages if we should */ /* * Calculate my_low_ring_aru, instance->my_high_ring_delivered for the transitional membership */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) && memcmp (&instance->my_old_ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (memb_list[i].aru, low_ring_aru)) { low_ring_aru = memb_list[i].aru; } if (sq_lt_compare (instance->my_high_ring_delivered, memb_list[i].high_delivered)) { instance->my_high_ring_delivered = memb_list[i].high_delivered; } } } /* * Copy all old 
ring messages to instance->retrans_message_queue */ range = instance->old_ring_state_high_seq_received - low_ring_aru; if (range == 0) { /* * No messages to copy */ goto no_originate; } assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); log_printf (instance->totemsrp_log_level_debug, "copying all old ring messages from %x-%x.", low_ring_aru + 1, instance->old_ring_state_high_seq_received); for (i = 1; i <= range; i++) { struct sort_queue_item *sort_queue_item; struct message_item message_item; void *ptr; int res; res = sq_item_get (&instance->regular_sort_queue, low_ring_aru + i, &ptr); if (res != 0) { continue; } sort_queue_item = ptr; messages_originated++; memset (&message_item, 0, sizeof (struct message_item)); // TODO LEAK message_item.mcast = totemsrp_buffer_alloc (instance); assert (message_item.mcast); message_item.mcast->header.type = MESSAGE_TYPE_MCAST; srp_addr_copy (&message_item.mcast->system_from, &instance->my_id); message_item.mcast->header.encapsulated = MESSAGE_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.addr[0].nodeid; assert (message_item.mcast->header.nodeid); message_item.mcast->header.endian_detector = ENDIAN_LOCAL; memcpy (&message_item.mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); message_item.msg_len = sort_queue_item->msg_len + sizeof (struct mcast); memcpy (((char *)message_item.mcast) + sizeof (struct mcast), sort_queue_item->mcast, sort_queue_item->msg_len); cs_queue_item_add (&instance->retrans_message_queue, &message_item); } log_printf (instance->totemsrp_log_level_debug, "Originated %d messages in RECOVERY.", messages_originated); goto originated; no_originate: log_printf (instance->totemsrp_log_level_debug, "Did not need to originate any messages in recovery."); originated: instance->my_aru = SEQNO_START_MSG; instance->my_aru_count = 0; instance->my_seq_unchanged = 0; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_install_seq = SEQNO_START_MSG; instance->last_released = 
SEQNO_START_MSG; reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED instance->memb_state = MEMB_STATE_RECOVERY; instance->stats.recovery_entered++; instance->stats.continuous_gather = 0; return; } void totemsrp_event_signal (void *srp_context, enum totem_event_type type, int value) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; token_hold_cancel_send (instance); return; } int totemsrp_mcast ( void *srp_context, struct iovec *iovec, unsigned int iov_len, int guarantee) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int i; struct message_item message_item; char *addr; unsigned int addr_idx; struct cs_queue *queue_use; if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } if (cs_queue_is_full (queue_use)) { log_printf (instance->totemsrp_log_level_debug, "queue full"); return (-1); } memset (&message_item, 0, sizeof (struct message_item)); /* * Allocate pending item */ message_item.mcast = totemsrp_buffer_alloc (instance); if (message_item.mcast == 0) { goto error_mcast; } /* * Set mcast header */ memset(message_item.mcast, 0, sizeof (struct mcast)); message_item.mcast->header.type = MESSAGE_TYPE_MCAST; message_item.mcast->header.endian_detector = ENDIAN_LOCAL; message_item.mcast->header.encapsulated = MESSAGE_NOT_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.addr[0].nodeid; assert (message_item.mcast->header.nodeid); message_item.mcast->guarantee = guarantee; srp_addr_copy (&message_item.mcast->system_from, &instance->my_id); addr = (char *)message_item.mcast; addr_idx = sizeof (struct mcast); for (i = 0; i < iov_len; i++) { memcpy (&addr[addr_idx], iovec[i].iov_base, iovec[i].iov_len); addr_idx += iovec[i].iov_len; } message_item.msg_len = addr_idx; log_printf (instance->totemsrp_log_level_trace, "mcasted message added to pending queue"); 
instance->stats.mcast_tx++; cs_queue_item_add (queue_use, &message_item); return (0); error_mcast: return (-1); } /* * Determine if there is room to queue a new message */ int totemsrp_avail (void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int avail; struct cs_queue *queue_use; if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } cs_queue_avail (queue_use, &avail); return (avail); } /* * ORF Token Management */ /* * Recast message to mcast group if it is available */ static int orf_token_remcast ( struct totemsrp_instance *instance, int seq) { struct sort_queue_item *sort_queue_item; int res; void *ptr; struct sq *sort_queue; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } res = sq_in_range (sort_queue, seq); if (res == 0) { log_printf (instance->totemsrp_log_level_debug, "sq not in range"); return (-1); } /* * Get RTR item at seq, if not available, return */ res = sq_item_get (sort_queue, seq, &ptr); if (res != 0) { return -1; } sort_queue_item = ptr; totemnet_mcast_noflush_send ( instance->totemnet_context, sort_queue_item->mcast, sort_queue_item->msg_len); return (0); } /* * Free all freeable messages from ring */ static void messages_free ( struct totemsrp_instance *instance, unsigned int token_aru) { struct sort_queue_item *regular_message; unsigned int i; int res; int log_release = 0; unsigned int release_to; unsigned int range = 0; release_to = token_aru; if (sq_lt_compare (instance->my_last_aru, release_to)) { release_to = instance->my_last_aru; } if (sq_lt_compare (instance->my_high_delivered, release_to)) { release_to = instance->my_high_delivered; } /* * Ensure we dont try release before an already released point */ if (sq_lt_compare (release_to, instance->last_released)) { return; } range = release_to - 
instance->last_released; assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); /* * Release retransmit list items if group aru indicates they are transmitted */ for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, instance->last_released + i, &ptr); if (res == 0) { regular_message = ptr; totemsrp_buffer_release (instance, regular_message->mcast); } sq_items_release (&instance->regular_sort_queue, instance->last_released + i); log_release = 1; } instance->last_released += range; if (log_release) { log_printf (instance->totemsrp_log_level_trace, "releasing messages up to and including %x", release_to); } } static void update_aru ( struct totemsrp_instance *instance) { unsigned int i; int res; struct sq *sort_queue; unsigned int range; unsigned int my_aru_saved = 0; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } range = instance->my_high_seq_received - instance->my_aru; my_aru_saved = instance->my_aru; for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (sort_queue, my_aru_saved + i, &ptr); /* * If hole, stop updating aru */ if (res != 0) { break; } } instance->my_aru += i - 1; } /* * Multicasts pending messages onto the ring (requires orf_token possession) */ static int orf_token_mcast ( struct totemsrp_instance *instance, struct orf_token *token, int fcc_mcasts_allowed) { struct message_item *message_item = 0; struct cs_queue *mcast_queue; struct sq *sort_queue; struct sort_queue_item sort_queue_item; struct mcast *mcast; unsigned int fcc_mcast_current; if (instance->memb_state == MEMB_STATE_RECOVERY) { mcast_queue = &instance->retrans_message_queue; sort_queue = &instance->recovery_sort_queue; reset_token_retransmit_timeout (instance); // REVIEWED } else { if (instance->waiting_trans_ack) { mcast_queue = &instance->new_message_queue_trans; } else { mcast_queue = &instance->new_message_queue; } sort_queue = 
&instance->regular_sort_queue; } for (fcc_mcast_current = 0; fcc_mcast_current < fcc_mcasts_allowed; fcc_mcast_current++) { if (cs_queue_is_empty (mcast_queue)) { break; } message_item = (struct message_item *)cs_queue_item_get (mcast_queue); message_item->mcast->seq = ++token->seq; message_item->mcast->this_seqno = instance->global_seqno++; /* * Build IO vector */ memset (&sort_queue_item, 0, sizeof (struct sort_queue_item)); sort_queue_item.mcast = message_item->mcast; sort_queue_item.msg_len = message_item->msg_len; mcast = sort_queue_item.mcast; memcpy (&mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); /* * Add message to retransmit queue */ sq_item_add (sort_queue, &sort_queue_item, message_item->mcast->seq); totemnet_mcast_noflush_send ( instance->totemnet_context, message_item->mcast, message_item->msg_len); /* * Delete item from pending queue */ cs_queue_item_remove (mcast_queue); /* * If messages mcasted, deliver any new messages to totempg */ instance->my_high_seq_received = token->seq; } update_aru (instance); /* * Return 1 if more messages are available for single node clusters */ return (fcc_mcast_current); } /* * Remulticasts messages in orf_token's retransmit list (requires orf_token) * Modify's orf_token's rtr to include retransmits required by this process */ static int orf_token_rtr ( struct totemsrp_instance *instance, struct orf_token *orf_token, unsigned int *fcc_allowed) { unsigned int res; unsigned int i, j; unsigned int found; struct sq *sort_queue; struct rtr_item *rtr_list; unsigned int range = 0; char retransmit_msg[1024]; char value[64]; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } rtr_list = &orf_token->rtr_list[0]; strcpy (retransmit_msg, "Retransmit List: "); if (orf_token->rtr_list_entries) { log_printf (instance->totemsrp_log_level_debug, "Retransmit List %d", orf_token->rtr_list_entries); for (i = 0; i < 
orf_token->rtr_list_entries; i++) { sprintf (value, "%x ", rtr_list[i].seq); strcat (retransmit_msg, value); } strcat (retransmit_msg, ""); log_printf (instance->totemsrp_log_level_notice, "%s", retransmit_msg); } /* * Retransmit messages on orf_token's RTR list from RTR queue */ for (instance->fcc_remcast_current = 0, i = 0; instance->fcc_remcast_current < *fcc_allowed && i < orf_token->rtr_list_entries;) { /* * If this retransmit request isn't from this configuration, * try next rtr entry */ if (memcmp (&rtr_list[i].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { i += 1; continue; } res = orf_token_remcast (instance, rtr_list[i].seq); if (res == 0) { /* * Multicasted message, so no need to copy to new retransmit list */ orf_token->rtr_list_entries -= 1; assert (orf_token->rtr_list_entries >= 0); memmove (&rtr_list[i], &rtr_list[i + 1], sizeof (struct rtr_item) * (orf_token->rtr_list_entries - i)); instance->stats.mcast_retx++; instance->fcc_remcast_current++; } else { i += 1; } } *fcc_allowed = *fcc_allowed - instance->fcc_remcast_current; /* * Add messages to retransmit to RTR list * but only retry if there is room in the retransmit list */ range = orf_token->seq - instance->my_aru; assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); for (i = 1; (orf_token->rtr_list_entries < RETRANSMIT_ENTRIES_MAX) && (i <= range); i++) { /* * Ensure message is within the sort queue range */ res = sq_in_range (sort_queue, instance->my_aru + i); if (res == 0) { break; } /* * Find if a message is missing from this processor */ res = sq_item_inuse (sort_queue, instance->my_aru + i); if (res == 0) { /* * Determine how many times we have missed receiving * this sequence number. sq_item_miss_count increments * a counter for the sequence number. The miss count * will be returned and compared. This allows time for * delayed multicast messages to be received before * declaring the message is missing and requesting a * retransmit. 
*/ res = sq_item_miss_count (sort_queue, instance->my_aru + i); if (res < instance->totem_config->miss_count_const) { continue; } /* * Determine if missing message is already in retransmit list */ found = 0; for (j = 0; j < orf_token->rtr_list_entries; j++) { if (instance->my_aru + i == rtr_list[j].seq) { found = 1; } } if (found == 0) { /* * Missing message not found in current retransmit list so add it */ memcpy (&rtr_list[orf_token->rtr_list_entries].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); rtr_list[orf_token->rtr_list_entries].seq = instance->my_aru + i; orf_token->rtr_list_entries++; } } } return (instance->fcc_remcast_current); } static void token_retransmit (struct totemsrp_instance *instance) { totemnet_token_send (instance->totemnet_context, instance->orf_token_retransmit, instance->orf_token_retransmit_size); } /* * Retransmit the regular token if no mcast or token has * been received in retransmit token period retransmit * the token to the next processor */ static void timer_function_token_retransmit_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); reset_token_retransmit_timeout (instance); // REVIEWED break; } } static void timer_function_token_hold_retransmit_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: break; case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); break; } } static void timer_function_merge_detect_timeout(void *data) { struct totemsrp_instance *instance = data; instance->my_merge_detect_timeout_outstanding = 0; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) { memb_merge_detect_transmit (instance); } break; case 
MEMB_STATE_GATHER: case MEMB_STATE_COMMIT: case MEMB_STATE_RECOVERY: break; } } /* * Send orf_token to next member (requires orf_token) */ static int token_send ( struct totemsrp_instance *instance, struct orf_token *orf_token, int forward_token) { int res = 0; unsigned int orf_token_size; orf_token_size = sizeof (struct orf_token) + (orf_token->rtr_list_entries * sizeof (struct rtr_item)); orf_token->header.nodeid = instance->my_id.addr[0].nodeid; memcpy (instance->orf_token_retransmit, orf_token, orf_token_size); instance->orf_token_retransmit_size = orf_token_size; assert (orf_token->header.nodeid); if (forward_token == 0) { return (0); } totemnet_token_send (instance->totemnet_context, orf_token, orf_token_size); return (res); } static int token_hold_cancel_send (struct totemsrp_instance *instance) { struct token_hold_cancel token_hold_cancel; /* * Only cancel if the token is currently held */ if (instance->my_token_held == 0) { return (0); } instance->my_token_held = 0; /* * Build message */ token_hold_cancel.header.type = MESSAGE_TYPE_TOKEN_HOLD_CANCEL; token_hold_cancel.header.endian_detector = ENDIAN_LOCAL; token_hold_cancel.header.encapsulated = 0; token_hold_cancel.header.nodeid = instance->my_id.addr[0].nodeid; memcpy (&token_hold_cancel.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); assert (token_hold_cancel.header.nodeid); instance->stats.token_hold_cancel_tx++; totemnet_mcast_flush_send (instance->totemnet_context, &token_hold_cancel, sizeof (struct token_hold_cancel)); return (0); } static int orf_token_send_initial (struct totemsrp_instance *instance) { struct orf_token orf_token; int res; orf_token.header.type = MESSAGE_TYPE_ORF_TOKEN; orf_token.header.endian_detector = ENDIAN_LOCAL; orf_token.header.encapsulated = 0; orf_token.header.nodeid = instance->my_id.addr[0].nodeid; assert (orf_token.header.nodeid); orf_token.seq = SEQNO_START_MSG; orf_token.token_seq = SEQNO_START_TOKEN; orf_token.retrans_flg = 1; 
instance->my_set_retrans_flg = 1; instance->stats.orf_token_tx++; if (cs_queue_is_empty (&instance->retrans_message_queue) == 1) { orf_token.retrans_flg = 0; instance->my_set_retrans_flg = 0; } else { orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; } orf_token.aru = 0; orf_token.aru = SEQNO_START_MSG - 1; orf_token.aru_addr = instance->my_id.addr[0].nodeid; memcpy (&orf_token.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); orf_token.fcc = 0; orf_token.backlog = 0; orf_token.rtr_list_entries = 0; res = token_send (instance, &orf_token, 1); return (res); } static void memb_state_commit_token_update ( struct totemsrp_instance *instance) { struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; unsigned int high_aru; unsigned int i; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries); memcpy (instance->my_new_memb_list, addr, sizeof (struct srp_addr) * instance->commit_token->addr_entries); instance->my_new_memb_entries = instance->commit_token->addr_entries; memcpy (&memb_list[instance->commit_token->memb_index].ring_id, &instance->my_old_ring_id, sizeof (struct memb_ring_id)); memb_list[instance->commit_token->memb_index].aru = instance->old_ring_state_aru; /* * TODO high delivered is really instance->my_aru, but with safe this * could change? 
*/
	instance->my_received_flg =
		(instance->my_aru == instance->my_high_seq_received);

	memb_list[instance->commit_token->memb_index].received_flg = instance->my_received_flg;

	memb_list[instance->commit_token->memb_index].high_delivered = instance->my_high_delivered;
	/*
	 * find high aru up to current memb_index for all matching ring ids
	 * if any ring id matching memb_index has aru less than high aru set
	 * received flag for that entry to false
	 */
	high_aru = memb_list[instance->commit_token->memb_index].aru;
	/* first pass: highest aru among entries sharing our old ring id */
	for (i = 0; i <= instance->commit_token->memb_index; i++) {
		if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id,
			&memb_list[i].ring_id,
			sizeof (struct memb_ring_id)) == 0) {

			if (sq_lt_compare (high_aru, memb_list[i].aru)) {
				high_aru = memb_list[i].aru;
			}
		}
	}

	/* second pass: clear received_flg on every such entry below high_aru */
	for (i = 0; i <= instance->commit_token->memb_index; i++) {
		if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id,
			&memb_list[i].ring_id,
			sizeof (struct memb_ring_id)) == 0) {

			if (sq_lt_compare (memb_list[i].aru, high_aru)) {
				memb_list[i].received_flg = 0;
				if (i == instance->commit_token->memb_index) {
					/* keep the local flag in step with our own entry */
					instance->my_received_flg = 0;
				}
			}
		}
	}

	instance->commit_token->header.nodeid = instance->my_id.addr[0].nodeid;
	instance->commit_token->memb_index += 1;
	assert (instance->commit_token->memb_index <= instance->commit_token->addr_entries);
	assert (instance->commit_token->header.nodeid);
}

/*
 * Point the token target at the address list entry selected by
 * memb_index, wrapping around at addr_entries.
 */
static void memb_state_commit_token_target_set (
	struct totemsrp_instance *instance)
{
	struct srp_addr *addr;

	addr = (struct srp_addr *)instance->commit_token->end_of_commit_token;

	/* Totemnet just looks at the node id */
	totemnet_token_target_set (
		instance->totemnet_context,
		&addr[instance->commit_token->memb_index %
			instance->commit_token->addr_entries].addr[0]);
}

/*
 * Send the caller-supplied commit token: bump its token_seq, stamp it
 * with this node's id, keep a copy in instance->orf_token_retransmit and
 * restart the token retransmit timeout. Always returns 0.
 */
static int memb_state_commit_token_send_recovery (
	struct totemsrp_instance *instance,
	struct memb_commit_token *commit_token)
{
	unsigned int commit_token_size;

	commit_token->token_seq++;
	commit_token->header.nodeid = instance->my_id.addr[0].nodeid;
	/* header plus one address and one member entry per addr_entries */
	commit_token_size = sizeof (struct memb_commit_token) +
		((sizeof (struct srp_addr) +
			sizeof (struct memb_commit_token_memb_entry)) * commit_token->addr_entries);
	/*
	 * Make a copy for retransmission if necessary
	 */
	memcpy (instance->orf_token_retransmit, commit_token, commit_token_size);
	instance->orf_token_retransmit_size = commit_token_size;

	instance->stats.memb_commit_token_tx++;

	totemnet_token_send (instance->totemnet_context,
		commit_token,
		commit_token_size);

	/*
	 * Request retransmission of the commit token in case it is lost
	 */
	reset_token_retransmit_timeout (instance);
	return (0);
}

/*
 * Send instance->commit_token: bump its token_seq, stamp it with this
 * node's id, keep a copy in instance->orf_token_retransmit and restart
 * the token retransmit timeout. Always returns 0.
 */
static int memb_state_commit_token_send (
	struct totemsrp_instance *instance)
{
	unsigned int commit_token_size;

	instance->commit_token->token_seq++;
	instance->commit_token->header.nodeid = instance->my_id.addr[0].nodeid;
	/* header plus one address and one member entry per addr_entries */
	commit_token_size = sizeof (struct memb_commit_token) +
		((sizeof (struct srp_addr) +
			sizeof (struct memb_commit_token_memb_entry)) * instance->commit_token->addr_entries);
	/*
	 * Make a copy for retransmission if necessary
	 */
	memcpy (instance->orf_token_retransmit, instance->commit_token, commit_token_size);
	instance->orf_token_retransmit_size = commit_token_size;

	instance->stats.memb_commit_token_tx++;

	totemnet_token_send (instance->totemnet_context,
		instance->commit_token,
		commit_token_size);

	/*
	 * Request retransmission of the commit token in case it is lost
	 */
	reset_token_retransmit_timeout (instance);
	return (0);
}

/*
 * Return non-zero when this node has the lowest address among the
 * processors in my_proc_list that are not in my_failed_list.
 */
static int memb_lowest_in_config (struct totemsrp_instance *instance)
{
	struct srp_addr token_memb[PROCESSOR_COUNT_MAX];
	int token_memb_entries = 0;
	int i;
	struct totem_ip_address *lowest_addr;

	memb_set_subtract (token_memb, &token_memb_entries,
		instance->my_proc_list, instance->my_proc_list_entries,
		instance->my_failed_list, instance->my_failed_list_entries);

	/*
	 * find representative by searching for smallest identifier
	 */

	/* NOTE(review): token_memb[0] is read even if the set is empty - confirm callers */
	lowest_addr = &token_memb[0].addr[0];
	for (i = 1; i < token_memb_entries; i++) {
		if
(totemip_compare(lowest_addr, &token_memb[i].addr[0]) > 0) { totemip_copy (lowest_addr, &token_memb[i].addr[0]); } } return (totemip_compare (lowest_addr, &instance->my_id.addr[0]) == 0); } static int srp_addr_compare (const void *a, const void *b) { const struct srp_addr *srp_a = (const struct srp_addr *)a; const struct srp_addr *srp_b = (const struct srp_addr *)b; return (totemip_compare (&srp_a->addr[0], &srp_b->addr[0])); } static void memb_state_commit_token_create ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; int token_memb_entries = 0; log_printf (instance->totemsrp_log_level_debug, "Creating commit token because I am the rep."); memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); memset (instance->commit_token, 0, sizeof (struct memb_commit_token)); instance->commit_token->header.type = MESSAGE_TYPE_MEMB_COMMIT_TOKEN; instance->commit_token->header.endian_detector = ENDIAN_LOCAL; instance->commit_token->header.encapsulated = 0; instance->commit_token->header.nodeid = instance->my_id.addr[0].nodeid; assert (instance->commit_token->header.nodeid); totemip_copy(&instance->commit_token->ring_id.rep, &instance->my_id.addr[0]); instance->commit_token->ring_id.seq = instance->token_ring_id_seq + 4; /* * This qsort is necessary to ensure the commit token traverses * the ring in the proper order */ qsort (token_memb, token_memb_entries, sizeof (struct srp_addr), srp_addr_compare); instance->commit_token->memb_index = 0; instance->commit_token->addr_entries = token_memb_entries; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries); memcpy (addr, token_memb, token_memb_entries * sizeof (struct srp_addr)); memset (memb_list, 0, 
sizeof (struct memb_commit_token_memb_entry) * token_memb_entries); } static void memb_join_message_send (struct totemsrp_instance *instance) { char memb_join_data[40000]; struct memb_join *memb_join = (struct memb_join *)memb_join_data; char *addr; unsigned int addr_idx; memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join->header.endian_detector = ENDIAN_LOCAL; memb_join->header.encapsulated = 0; memb_join->header.nodeid = instance->my_id.addr[0].nodeid; assert (memb_join->header.nodeid); memb_join->ring_seq = instance->my_ring_id.seq; memb_join->proc_list_entries = instance->my_proc_list_entries; memb_join->failed_list_entries = instance->my_failed_list_entries; srp_addr_copy (&memb_join->system_from, &instance->my_id); /* * This mess adds the joined and failed processor lists into the join * message */ addr = (char *)memb_join; addr_idx = sizeof (struct memb_join); memcpy (&addr[addr_idx], instance->my_proc_list, instance->my_proc_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_proc_list_entries * sizeof (struct srp_addr); memcpy (&addr[addr_idx], instance->my_failed_list, instance->my_failed_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_failed_list_entries * sizeof (struct srp_addr); if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } instance->stats.memb_join_tx++; totemnet_mcast_flush_send ( instance->totemnet_context, memb_join, addr_idx); } static void memb_leave_message_send (struct totemsrp_instance *instance) { char memb_join_data[40000]; struct memb_join *memb_join = (struct memb_join *)memb_join_data; char *addr; unsigned int addr_idx; int active_memb_entries; struct srp_addr active_memb[PROCESSOR_COUNT_MAX]; log_printf (instance->totemsrp_log_level_debug, "sending join/leave message"); /* * add us to the failed list, and remove us from * the members list */ memb_set_merge( &instance->my_id, 1, instance->my_failed_list, 
&instance->my_failed_list_entries); memb_set_subtract (active_memb, &active_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, &instance->my_id, 1); memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join->header.endian_detector = ENDIAN_LOCAL; memb_join->header.encapsulated = 0; memb_join->header.nodeid = LEAVE_DUMMY_NODEID; memb_join->ring_seq = instance->my_ring_id.seq; memb_join->proc_list_entries = active_memb_entries; memb_join->failed_list_entries = instance->my_failed_list_entries; srp_addr_copy (&memb_join->system_from, &instance->my_id); memb_join->system_from.addr[0].nodeid = LEAVE_DUMMY_NODEID; // TODO: CC Maybe use the actual join send routine. /* * This mess adds the joined and failed processor lists into the join * message */ addr = (char *)memb_join; addr_idx = sizeof (struct memb_join); memcpy (&addr[addr_idx], active_memb, active_memb_entries * sizeof (struct srp_addr)); addr_idx += active_memb_entries * sizeof (struct srp_addr); memcpy (&addr[addr_idx], instance->my_failed_list, instance->my_failed_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_failed_list_entries * sizeof (struct srp_addr); if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } instance->stats.memb_join_tx++; totemnet_mcast_flush_send ( instance->totemnet_context, memb_join, addr_idx); } static void memb_merge_detect_transmit (struct totemsrp_instance *instance) { struct memb_merge_detect memb_merge_detect; memb_merge_detect.header.type = MESSAGE_TYPE_MEMB_MERGE_DETECT; memb_merge_detect.header.endian_detector = ENDIAN_LOCAL; memb_merge_detect.header.encapsulated = 0; memb_merge_detect.header.nodeid = instance->my_id.addr[0].nodeid; srp_addr_copy (&memb_merge_detect.system_from, &instance->my_id); memcpy (&memb_merge_detect.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); assert (memb_merge_detect.header.nodeid); instance->stats.memb_merge_detect_tx++; 
totemnet_mcast_flush_send (instance->totemnet_context, &memb_merge_detect, sizeof (struct memb_merge_detect)); } static void memb_ring_id_set ( struct totemsrp_instance *instance, const struct memb_ring_id *ring_id) { memcpy (&instance->my_ring_id, ring_id, sizeof (struct memb_ring_id)); } int totemsrp_callback_token_create ( void *srp_context, void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; struct token_callback_instance *callback_handle; token_hold_cancel_send (instance); callback_handle = malloc (sizeof (struct token_callback_instance)); if (callback_handle == 0) { return (-1); } *handle_out = (void *)callback_handle; qb_list_init (&callback_handle->list); callback_handle->callback_fn = callback_fn; callback_handle->data = (void *) data; callback_handle->callback_type = type; callback_handle->delete = delete; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: qb_list_add (&callback_handle->list, &instance->token_callback_received_listhead); break; case TOTEM_CALLBACK_TOKEN_SENT: qb_list_add (&callback_handle->list, &instance->token_callback_sent_listhead); break; } return (0); } void totemsrp_callback_token_destroy (void *srp_context, void **handle_out) { struct token_callback_instance *h; if (*handle_out) { h = (struct token_callback_instance *)*handle_out; qb_list_del (&h->list); free (h); h = NULL; *handle_out = 0; } } static void token_callbacks_execute ( struct totemsrp_instance *instance, enum totem_callback_token_type type) { - struct qb_list_head *list; + struct qb_list_head *list, *tmp_iter; struct qb_list_head *callback_listhead = 0; struct token_callback_instance *token_callback_instance; int res; int del; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: callback_listhead = &instance->token_callback_received_listhead; break; case TOTEM_CALLBACK_TOKEN_SENT: 
callback_listhead = &instance->token_callback_sent_listhead; break; default: assert (0); } - qb_list_for_each(list, callback_listhead) { + qb_list_for_each_safe(list, tmp_iter, callback_listhead) { token_callback_instance = qb_list_entry (list, struct token_callback_instance, list); del = token_callback_instance->delete; if (del == 1) { qb_list_del (list); } res = token_callback_instance->callback_fn ( token_callback_instance->callback_type, token_callback_instance->data); /* * This callback failed to execute, try it again on the next token */ if (res == -1 && del == 1) { qb_list_add (list, callback_listhead); } else if (del) { free (token_callback_instance); } } } /* * Flow control functions */ static unsigned int backlog_get (struct totemsrp_instance *instance) { unsigned int backlog = 0; struct cs_queue *queue_use = NULL; if (instance->memb_state == MEMB_STATE_OPERATIONAL) { if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } } else if (instance->memb_state == MEMB_STATE_RECOVERY) { queue_use = &instance->retrans_message_queue; } if (queue_use != NULL) { backlog = cs_queue_used (queue_use); } instance->stats.token[instance->stats.latest_token].backlog_calc = backlog; return (backlog); } static int fcc_calculate ( struct totemsrp_instance *instance, struct orf_token *token) { unsigned int transmits_allowed; unsigned int backlog_calc; transmits_allowed = instance->totem_config->max_messages; if (transmits_allowed > instance->totem_config->window_size - token->fcc) { transmits_allowed = instance->totem_config->window_size - token->fcc; } instance->my_cbl = backlog_get (instance); /* * Only do backlog calculation if there is a backlog otherwise * we would result in div by zero */ if (token->backlog + instance->my_cbl - instance->my_pbl) { backlog_calc = (instance->totem_config->window_size * instance->my_pbl) / (token->backlog + instance->my_cbl - instance->my_pbl); if (backlog_calc 
> 0 && transmits_allowed > backlog_calc) {
			transmits_allowed = backlog_calc;
		}
	}

	return (transmits_allowed);
}

/*
 * don't overflow the RTR sort queue
 *
 * Sets *transmits_allowed to 0 when token->seq is already past
 * last_released + QUEUE_RTR_ITEMS_SIZE_MAX - *transmits_allowed - window_size.
 */
static void fcc_rtr_limit (
	struct totemsrp_instance *instance,
	struct orf_token *token,
	unsigned int *transmits_allowed)
{
	int check = QUEUE_RTR_ITEMS_SIZE_MAX;
	check -= (*transmits_allowed + instance->totem_config->window_size);
	assert (check >= 0);
	if (sq_lt_compare (instance->last_released +
		QUEUE_RTR_ITEMS_SIZE_MAX - *transmits_allowed -
		instance->totem_config->window_size,

			token->seq)) {

			*transmits_allowed = 0;
	}
}

/*
 * Update the flow control fields of the token: replace this node's
 * previous contribution (my_trc, my_pbl) with the current one
 * (msgs_transmitted, my_cbl) and remember the current values.
 */
static void fcc_token_update (
	struct totemsrp_instance *instance,
	struct orf_token *token,
	unsigned int msgs_transmitted)
{
	token->fcc += msgs_transmitted - instance->my_trc;
	token->backlog += instance->my_cbl - instance->my_pbl;
	instance->my_trc = msgs_transmitted;
	instance->my_pbl = instance->my_cbl;
}

/*
 * Message Handlers
 */

/* timestamp of the previous measurement; only used by the GIVEINFO code */
unsigned long long int tv_old;
/*
 * message handler called when TOKEN message type received
 */
static int message_handler_orf_token (
	struct totemsrp_instance *instance,
	const void *msg,
	size_t msg_len,
	int endian_conversion_needed)
{
	char token_storage[1500];
	char token_convert[1500];
	struct orf_token *token = NULL;
	int forward_token;
	unsigned int transmits_allowed;
	unsigned int mcasted_retransmit;
	unsigned int mcasted_regular;
	unsigned int last_aru;

#ifdef GIVEINFO
	unsigned long long tv_current;
	unsigned long long tv_diff;

	tv_current = qb_util_nano_current_get ();
	tv_diff = tv_current - tv_old;
	tv_old = tv_current;

	log_printf (instance->totemsrp_log_level_debug,
	"Time since last token %0.4f ms", ((float)tv_diff) / 1000000.0);
#endif

	/* tokens are ignored entirely while orf_token_discard is set */
	if (instance->orf_token_discard) {
		return (0);
	}
#ifdef TEST_DROP_ORF_TOKEN_PERCENTAGE
	if (random()%100 < TEST_DROP_ORF_TOKEN_PERCENTAGE) {
		return (0);
	}
#endif

	if (endian_conversion_needed) {
		/* convert into local scratch storage and continue with the converted copy */
		orf_token_endian_convert ((struct orf_token *)msg,
			(struct orf_token *)token_convert);
		msg = (struct orf_token *)token_convert;
	}

	/*
	 * Make copy of
token and retransmit list in case we have * to flush incoming messages from the kernel queue */ token = (struct orf_token *)token_storage; memcpy (token, msg, sizeof (struct orf_token)); memcpy (&token->rtr_list[0], (char *)msg + sizeof (struct orf_token), sizeof (struct rtr_item) * RETRANSMIT_ENTRIES_MAX); /* * Handle merge detection timeout */ if (token->seq == instance->my_last_seq) { start_merge_detect_timeout (instance); instance->my_seq_unchanged += 1; } else { cancel_merge_detect_timeout (instance); cancel_token_hold_retransmit_timeout (instance); instance->my_seq_unchanged = 0; } instance->my_last_seq = token->seq; #ifdef TEST_RECOVERY_MSG_COUNT if (instance->memb_state == MEMB_STATE_OPERATIONAL && token->seq > TEST_RECOVERY_MSG_COUNT) { return (0); } #endif instance->flushing = 1; totemnet_recv_flush (instance->totemnet_context); instance->flushing = 0; /* * Determine if we should hold (in reality drop) the token */ instance->my_token_held = 0; if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0]) && instance->my_seq_unchanged > instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } else if (!totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0]) && instance->my_seq_unchanged >= instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } /* * Hold onto token when there is no activity on ring and * this processor is the ring rep */ forward_token = 1; if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) { if (instance->my_token_held) { forward_token = 0; } } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_RECEIVED); switch (instance->memb_state) { case MEMB_STATE_COMMIT: /* Discard token */ break; case MEMB_STATE_OPERATIONAL: messages_free (instance, token->aru); /* * Do NOT add break, this case should also execute code in gather case. 
*/ case MEMB_STATE_GATHER: /* * DO NOT add break, we use different free mechanism in recovery state */ case MEMB_STATE_RECOVERY: /* * Discard tokens from another configuration */ if (memcmp (&token->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); /* discard token */ } /* * Discard retransmitted tokens */ if (sq_lte_compare (token->token_seq, instance->my_token_seq)) { return (0); /* discard token */ } last_aru = instance->my_last_aru; instance->my_last_aru = token->aru; transmits_allowed = fcc_calculate (instance, token); mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed); if (instance->my_token_held == 1 && (token->rtr_list_entries > 0 || mcasted_retransmit > 0)) { instance->my_token_held = 0; forward_token = 1; } fcc_rtr_limit (instance, token, &transmits_allowed); mcasted_regular = orf_token_mcast (instance, token, transmits_allowed); /* if (mcasted_regular) { printf ("mcasted regular %d\n", mcasted_regular); printf ("token seq %d\n", token->seq); } */ fcc_token_update (instance, token, mcasted_retransmit + mcasted_regular); if (sq_lt_compare (instance->my_aru, token->aru) || instance->my_id.addr[0].nodeid == token->aru_addr || token->aru_addr == 0) { token->aru = instance->my_aru; if (token->aru == token->seq) { token->aru_addr = 0; } else { token->aru_addr = instance->my_id.addr[0].nodeid; } } if (token->aru == last_aru && token->aru_addr != 0) { instance->my_aru_count += 1; } else { instance->my_aru_count = 0; } /* * We really don't follow specification there. 
In specification, OTHER nodes * detect failure of one node (based on aru_count) and my_id IS NEVER added * to failed list (so node never mark itself as failed) */ if (instance->my_aru_count > instance->totem_config->fail_to_recv_const && token->aru_addr == instance->my_id.addr[0].nodeid) { log_printf (instance->totemsrp_log_level_error, "FAILED TO RECEIVE"); instance->failed_to_recv = 1; memb_set_merge (&instance->my_id, 1, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FAILED_TO_RECEIVE); } else { instance->my_token_seq = token->token_seq; token->token_seq += 1; if (instance->memb_state == MEMB_STATE_RECOVERY) { /* * instance->my_aru == instance->my_high_seq_received means this processor * has recovered all messages it can recover * (ie: its retrans queue is empty) */ if (cs_queue_is_empty (&instance->retrans_message_queue) == 0) { if (token->retrans_flg == 0) { token->retrans_flg = 1; instance->my_set_retrans_flg = 1; } } else if (token->retrans_flg == 1 && instance->my_set_retrans_flg) { token->retrans_flg = 0; instance->my_set_retrans_flg = 0; } log_printf (instance->totemsrp_log_level_debug, "token retrans flag is %d my set retrans flag%d retrans queue empty %d count %d, aru %x", token->retrans_flg, instance->my_set_retrans_flg, cs_queue_is_empty (&instance->retrans_message_queue), instance->my_retrans_flg_count, token->aru); if (token->retrans_flg == 0) { instance->my_retrans_flg_count += 1; } else { instance->my_retrans_flg_count = 0; } if (instance->my_retrans_flg_count == 2) { instance->my_install_seq = token->seq; } log_printf (instance->totemsrp_log_level_debug, "install seq %x aru %x high seq received %x", instance->my_install_seq, instance->my_aru, instance->my_high_seq_received); if (instance->my_retrans_flg_count >= 2 && instance->my_received_flg == 0 && sq_lte_compare (instance->my_install_seq, instance->my_aru)) { instance->my_received_flg = 1; instance->my_deliver_memb_entries = 
instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct totem_ip_address) * instance->my_trans_memb_entries); } if (instance->my_retrans_flg_count >= 3 && sq_lte_compare (instance->my_install_seq, token->aru)) { instance->my_rotation_counter += 1; } else { instance->my_rotation_counter = 0; } if (instance->my_rotation_counter == 2) { log_printf (instance->totemsrp_log_level_debug, "retrans flag count %x token aru %x install seq %x aru %x %x", instance->my_retrans_flg_count, token->aru, instance->my_install_seq, instance->my_aru, token->seq); memb_state_operational_enter (instance); instance->my_rotation_counter = 0; instance->my_retrans_flg_count = 0; } } totemnet_send_flush (instance->totemnet_context); token_send (instance, token, forward_token); #ifdef GIVEINFO tv_current = qb_util_nano_current_get (); tv_diff = tv_current - tv_old; tv_old = tv_current; log_printf (instance->totemsrp_log_level_debug, "I held %0.4f ms", ((float)tv_diff) / 1000000.0); #endif if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* * Deliver messages after token has been transmitted * to improve performance */ reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED if (totemip_equal(&instance->my_id.addr[0], &instance->my_ring_id.rep) && instance->my_token_held == 1) { start_token_hold_retransmit_timeout (instance); } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_SENT); } break; } if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); } static void messages_deliver_to_app ( struct totemsrp_instance *instance, int skip, unsigned int end_point) { struct sort_queue_item *sort_queue_item_p; unsigned int i; int res; struct mcast *mcast_in; struct mcast mcast_header; unsigned int range = 0; int 
endian_conversion_required; unsigned int my_high_delivered_stored = 0; range = end_point - instance->my_high_delivered; if (range) { log_printf (instance->totemsrp_log_level_trace, "Delivering %x to %x", instance->my_high_delivered, end_point); } assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); my_high_delivered_stored = instance->my_high_delivered; /* * Deliver messages in order from rtr queue to pending delivery queue */ for (i = 1; i <= range; i++) { void *ptr = 0; /* * If out of range of sort queue, stop assembly */ res = sq_in_range (&instance->regular_sort_queue, my_high_delivered_stored + i); if (res == 0) { break; } res = sq_item_get (&instance->regular_sort_queue, my_high_delivered_stored + i, &ptr); /* * If hole, stop assembly */ if (res != 0 && skip == 0) { break; } instance->my_high_delivered = my_high_delivered_stored + i; if (res != 0) { continue; } sort_queue_item_p = ptr; mcast_in = sort_queue_item_p->mcast; assert (mcast_in != (struct mcast *)0xdeadbeef); endian_conversion_required = 0; if (mcast_in->header.endian_detector != ENDIAN_LOCAL) { endian_conversion_required = 1; mcast_endian_convert (mcast_in, &mcast_header); } else { memcpy (&mcast_header, mcast_in, sizeof (struct mcast)); } /* * Skip messages not originated in instance->my_deliver_memb */ if (skip && memb_set_subset (&mcast_header.system_from, 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) == 0) { instance->my_high_delivered = my_high_delivered_stored + i; continue; } /* * Message found */ log_printf (instance->totemsrp_log_level_trace, "Delivering MCAST message with seq %x to pending delivery queue", mcast_header.seq); /* * Message is locally originated multicast */ instance->totemsrp_deliver_fn ( mcast_header.header.nodeid, ((char *)sort_queue_item_p->mcast) + sizeof (struct mcast), sort_queue_item_p->msg_len - sizeof (struct mcast), endian_conversion_required); } } /* * recv message handler called when MCAST message type received */ static int message_handler_mcast 
( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct sort_queue_item sort_queue_item; struct sq *sort_queue; struct mcast mcast_header; if (endian_conversion_needed) { mcast_endian_convert (msg, &mcast_header); } else { memcpy (&mcast_header, msg, sizeof (struct mcast)); } if (mcast_header.header.encapsulated == MESSAGE_ENCAPSULATED) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } assert (msg_len <= FRAME_SIZE_MAX); #ifdef TEST_DROP_MCAST_PERCENTAGE if (random()%100 < TEST_DROP_MCAST_PERCENTAGE) { return (0); } #endif /* * If the message is foreign execute the switch below */ if (memcmp (&instance->my_ring_id, &mcast_header.ring_id, sizeof (struct memb_ring_id)) != 0) { switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge ( &mcast_header.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &mcast_header.system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&mcast_header.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE); return (0); } break; case MEMB_STATE_COMMIT: /* discard message */ instance->stats.rx_msg_dropped++; break; case MEMB_STATE_RECOVERY: /* discard message */ instance->stats.rx_msg_dropped++; break; } return (0); } log_printf (instance->totemsrp_log_level_trace, "Received ringid(%s:%lld) seq %x", totemip_print (&mcast_header.ring_id.rep), mcast_header.ring_id.seq, mcast_header.seq); /* * Add mcast message to rtr queue if not already in rtr queue * otherwise free io vectors */ if (msg_len > 0 && msg_len <= FRAME_SIZE_MAX && sq_in_range (sort_queue, mcast_header.seq) && sq_item_inuse (sort_queue, mcast_header.seq) == 
0) { /* * Allocate new multicast memory block */ // TODO LEAK sort_queue_item.mcast = totemsrp_buffer_alloc (instance); if (sort_queue_item.mcast == NULL) { return (-1); /* error here is corrected by the algorithm */ } memcpy (sort_queue_item.mcast, msg, msg_len); sort_queue_item.msg_len = msg_len; if (sq_lt_compare (instance->my_high_seq_received, mcast_header.seq)) { instance->my_high_seq_received = mcast_header.seq; } sq_item_add (sort_queue, &sort_queue_item, mcast_header.seq); } update_aru (instance); if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* TODO remove from retrans message queue for old ring in recovery state */ return (0); } static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct memb_merge_detect memb_merge_detect; if (endian_conversion_needed) { memb_merge_detect_endian_convert (msg, &memb_merge_detect); } else { memcpy (&memb_merge_detect, msg, sizeof (struct memb_merge_detect)); } /* * do nothing if this is a merge detect from this configuration */ if (memcmp (&instance->my_ring_id, &memb_merge_detect.ring_id, sizeof (struct memb_ring_id)) == 0) { return (0); } /* * Execute merge operation */ switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge (&memb_merge_detect.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &memb_merge_detect.system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&memb_merge_detect.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE); return (0); } break; case MEMB_STATE_COMMIT: /* do nothing in commit */ break; case 
MEMB_STATE_RECOVERY: /* do nothing in recovery */ break; } return (0); } static void memb_join_process ( struct totemsrp_instance *instance, const struct memb_join *memb_join) { struct srp_addr *proc_list; struct srp_addr *failed_list; int gather_entered = 0; int fail_minus_memb_entries = 0; struct srp_addr fail_minus_memb[PROCESSOR_COUNT_MAX]; proc_list = (struct srp_addr *)memb_join->end_of_memb_join; failed_list = proc_list + memb_join->proc_list_entries; /* memb_set_print ("proclist", proc_list, memb_join->proc_list_entries); memb_set_print ("faillist", failed_list, memb_join->failed_list_entries); memb_set_print ("my_proclist", instance->my_proc_list, instance->my_proc_list_entries); memb_set_print ("my_faillist", instance->my_failed_list, instance->my_failed_list_entries); -*/ if (memb_join->header.type == MESSAGE_TYPE_MEMB_JOIN) { if (instance->flushing) { if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) { log_printf (instance->totemsrp_log_level_warning, "Discarding LEAVE message during flush, nodeid=%u", memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid : LEAVE_DUMMY_NODEID); if (memb_join->failed_list_entries > 0) { my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid); } } else { log_printf (instance->totemsrp_log_level_warning, "Discarding JOIN message during flush, nodeid=%d", memb_join->header.nodeid); } return; } else { if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) { log_printf (instance->totemsrp_log_level_debug, "Received LEAVE message from %u", memb_join->failed_list_entries > 0 ? 
failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid : LEAVE_DUMMY_NODEID); if (memb_join->failed_list_entries > 0) { my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid); } } } } if (memb_set_equal (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_equal (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { memb_consensus_set (instance, &memb_join->system_from); if (memb_consensus_agreed (instance) && instance->failed_to_recv == 1) { instance->failed_to_recv = 0; srp_addr_copy (&instance->my_proc_list[0], &instance->my_id); instance->my_proc_list_entries = 1; instance->my_failed_list_entries = 0; memb_state_commit_token_create (instance); memb_state_commit_enter (instance); return; } if (memb_consensus_agreed (instance) && memb_lowest_in_config (instance)) { memb_state_commit_token_create (instance); memb_state_commit_enter (instance); } else { goto out; } } else if (memb_set_subset (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_subset (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { goto out; } else if (memb_set_subset (&memb_join->system_from, 1, instance->my_failed_list, instance->my_failed_list_entries)) { goto out; } else { memb_set_merge (proc_list, memb_join->proc_list_entries, instance->my_proc_list, &instance->my_proc_list_entries); if (memb_set_subset ( &instance->my_id, 1, failed_list, memb_join->failed_list_entries)) { memb_set_merge ( &memb_join->system_from, 1, instance->my_failed_list, &instance->my_failed_list_entries); } else { if (memb_set_subset ( &memb_join->system_from, 1, instance->my_memb_list, instance->my_memb_entries)) { if (memb_set_subset ( &memb_join->system_from, 1, instance->my_failed_list, instance->my_failed_list_entries) == 0) { memb_set_merge 
(failed_list, memb_join->failed_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); } else { memb_set_subtract (fail_minus_memb, &fail_minus_memb_entries, failed_list, memb_join->failed_list_entries, instance->my_memb_list, instance->my_memb_entries); memb_set_merge (fail_minus_memb, fail_minus_memb_entries, instance->my_failed_list, &instance->my_failed_list_entries); } } } memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_JOIN); gather_entered = 1; } out: if (gather_entered == 0 && instance->memb_state == MEMB_STATE_OPERATIONAL) { memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE); } } static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out) { int i; struct srp_addr *in_proc_list; struct srp_addr *in_failed_list; struct srp_addr *out_proc_list; struct srp_addr *out_failed_list; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); out->proc_list_entries = swab32 (in->proc_list_entries); out->failed_list_entries = swab32 (in->failed_list_entries); out->ring_seq = swab64 (in->ring_seq); in_proc_list = (struct srp_addr *)in->end_of_memb_join; in_failed_list = in_proc_list + out->proc_list_entries; out_proc_list = (struct srp_addr *)out->end_of_memb_join; out_failed_list = out_proc_list + out->proc_list_entries; for (i = 0; i < out->proc_list_entries; i++) { srp_addr_copy_endian_convert (&out_proc_list[i], &in_proc_list[i]); } for (i = 0; i < out->failed_list_entries; i++) { srp_addr_copy_endian_convert (&out_failed_list[i], &in_failed_list[i]); } } static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out) { int i; struct srp_addr *in_addr = (struct srp_addr *)in->end_of_commit_token; struct srp_addr *out_addr = (struct srp_addr *)out->end_of_commit_token; struct 
memb_commit_token_memb_entry *in_memb_list; struct memb_commit_token_memb_entry *out_memb_list; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->token_seq = swab32 (in->token_seq); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->retrans_flg = swab32 (in->retrans_flg); out->memb_index = swab32 (in->memb_index); out->addr_entries = swab32 (in->addr_entries); in_memb_list = (struct memb_commit_token_memb_entry *)(in_addr + out->addr_entries); out_memb_list = (struct memb_commit_token_memb_entry *)(out_addr + out->addr_entries); for (i = 0; i < out->addr_entries; i++) { srp_addr_copy_endian_convert (&out_addr[i], &in_addr[i]); /* * Only convert the memb entry if it has been set */ if (in_memb_list[i].ring_id.rep.family != 0) { totemip_copy_endian_convert (&out_memb_list[i].ring_id.rep, &in_memb_list[i].ring_id.rep); out_memb_list[i].ring_id.seq = swab64 (in_memb_list[i].ring_id.seq); out_memb_list[i].aru = swab32 (in_memb_list[i].aru); out_memb_list[i].high_delivered = swab32 (in_memb_list[i].high_delivered); out_memb_list[i].received_flg = swab32 (in_memb_list[i].received_flg); } } } static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out) { int i; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->seq = swab32 (in->seq); out->token_seq = swab32 (in->token_seq); out->aru = swab32 (in->aru); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->aru_addr = swab32(in->aru_addr); out->ring_id.seq = swab64 (in->ring_id.seq); out->fcc = swab32 (in->fcc); out->backlog = swab32 (in->backlog); out->retrans_flg = swab32 (in->retrans_flg); out->rtr_list_entries = swab32 (in->rtr_list_entries); for (i = 0; i < out->rtr_list_entries; i++) { totemip_copy_endian_convert(&out->rtr_list[i].ring_id.rep, 
&in->rtr_list[i].ring_id.rep); out->rtr_list[i].ring_id.seq = swab64 (in->rtr_list[i].ring_id.seq); out->rtr_list[i].seq = swab32 (in->rtr_list[i].seq); } } static void mcast_endian_convert (const struct mcast *in, struct mcast *out) { out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->header.encapsulated = in->header.encapsulated; out->seq = swab32 (in->seq); out->this_seqno = swab32 (in->this_seqno); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->node_id = swab32 (in->node_id); out->guarantee = swab32 (in->guarantee); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); } static void memb_merge_detect_endian_convert ( const struct memb_merge_detect *in, struct memb_merge_detect *out) { out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); } static int ignore_join_under_operational ( struct totemsrp_instance *instance, const struct memb_join *memb_join) { struct srp_addr *proc_list; struct srp_addr *failed_list; unsigned long long ring_seq; proc_list = (struct srp_addr *)memb_join->end_of_memb_join; failed_list = proc_list + memb_join->proc_list_entries; ring_seq = memb_join->ring_seq; if (memb_set_subset (&instance->my_id, 1, failed_list, memb_join->failed_list_entries)) { return (1); } /* * In operational state, my_proc_list is exactly the same as * my_memb_list. 
*/ if ((memb_set_subset (&memb_join->system_from, 1, instance->my_memb_list, instance->my_memb_entries)) && (ring_seq < instance->my_ring_id.seq)) { return (1); } return (0); } static int message_handler_memb_join ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct memb_join *memb_join; struct memb_join *memb_join_convert = alloca (msg_len); if (endian_conversion_needed) { memb_join = memb_join_convert; memb_join_endian_convert (msg, memb_join_convert); } else { memb_join = msg; } /* * If the process paused because it wasn't scheduled in a timely * fashion, flush the join messages because they may be queued * entries */ if (pause_flush (instance)) { return (0); } if (instance->token_ring_id_seq < memb_join->ring_seq) { instance->token_ring_id_seq = memb_join->ring_seq; } switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: if (!ignore_join_under_operational (instance, memb_join)) { memb_join_process (instance, memb_join); } break; case MEMB_STATE_GATHER: memb_join_process (instance, memb_join); break; case MEMB_STATE_COMMIT: if (memb_set_subset (&memb_join->system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE); } break; case MEMB_STATE_RECOVERY: if (memb_set_subset (&memb_join->system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_recovery_state_token_loss (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY); } break; } return (0); } static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct memb_commit_token *memb_commit_token_convert = alloca (msg_len); 
struct memb_commit_token *memb_commit_token; struct srp_addr sub[PROCESSOR_COUNT_MAX]; int sub_entries; struct srp_addr *addr; log_printf (instance->totemsrp_log_level_debug, "got commit token"); if (endian_conversion_needed) { memb_commit_token_endian_convert (msg, memb_commit_token_convert); } else { memcpy (memb_commit_token_convert, msg, msg_len); } memb_commit_token = memb_commit_token_convert; addr = (struct srp_addr *)memb_commit_token->end_of_commit_token; #ifdef TEST_DROP_COMMIT_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_COMMIT_TOKEN_PERCENTAGE) { return (0); } #endif switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: /* discard token */ break; case MEMB_STATE_GATHER: memb_set_subtract (sub, &sub_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); if (memb_set_equal (addr, memb_commit_token->addr_entries, sub, sub_entries) && memb_commit_token->ring_id.seq > instance->my_ring_id.seq) { memcpy (instance->commit_token, memb_commit_token, msg_len); memb_state_commit_enter (instance); } break; case MEMB_STATE_COMMIT: /* * If retransmitted commit tokens are sent on this ring * filter them out and only enter recovery once the * commit token has traversed the array. 
This is * determined by : * memb_commit_token->memb_index == memb_commit_token->addr_entries) { */ if (memb_commit_token->ring_id.seq == instance->my_ring_id.seq && memb_commit_token->memb_index == memb_commit_token->addr_entries) { memb_state_recovery_enter (instance, memb_commit_token); } break; case MEMB_STATE_RECOVERY: if (totemip_equal (&instance->my_id.addr[0], &instance->my_ring_id.rep)) { /* Filter out duplicated tokens */ if (instance->originated_orf_token) { break; } instance->originated_orf_token = 1; log_printf (instance->totemsrp_log_level_debug, "Sending initial ORF token"); // TODO convert instead of initiate orf_token_send_initial (instance); reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED } break; } return (0); } static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct token_hold_cancel *token_hold_cancel = msg; if (memcmp (&token_hold_cancel->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) == 0) { instance->my_seq_unchanged = 0; if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) { timer_function_token_retransmit_timeout (instance); } } return (0); } void main_deliver_fn ( void *context, const void *msg, unsigned int msg_len) { struct totemsrp_instance *instance = context; const struct totem_message_header *message_header = msg; if (msg_len < sizeof (struct totem_message_header)) { log_printf (instance->totemsrp_log_level_security, "Received message is too short... 
ignoring %u.", (unsigned int)msg_len); return; } switch (message_header->type) { case MESSAGE_TYPE_ORF_TOKEN: instance->stats.orf_token_rx++; break; case MESSAGE_TYPE_MCAST: instance->stats.mcast_rx++; break; case MESSAGE_TYPE_MEMB_MERGE_DETECT: instance->stats.memb_merge_detect_rx++; break; case MESSAGE_TYPE_MEMB_JOIN: instance->stats.memb_join_rx++; break; case MESSAGE_TYPE_MEMB_COMMIT_TOKEN: instance->stats.memb_commit_token_rx++; break; case MESSAGE_TYPE_TOKEN_HOLD_CANCEL: instance->stats.token_hold_cancel_rx++; break; default: log_printf (instance->totemsrp_log_level_security, "Type of received message is wrong... ignoring %d.\n", (int)message_header->type); printf ("wrong message type\n"); instance->stats.rx_msg_dropped++; return; } /* * Handle incoming message */ totemsrp_message_handlers.handler_functions[(int)message_header->type] ( instance, msg, msg_len, message_header->endian_detector != ENDIAN_LOCAL); } void main_iface_change_fn ( void *context, const struct totem_ip_address *iface_addr, unsigned int iface_no) { struct totemsrp_instance *instance = context; int i; totemip_copy (&instance->my_id.addr[iface_no], iface_addr); assert (instance->my_id.addr[iface_no].nodeid); totemip_copy (&instance->my_memb_list[0].addr[iface_no], iface_addr); if (instance->iface_changes++ == 0) { instance->memb_ring_id_create_or_load (&instance->my_ring_id, &instance->my_id.addr[0]); instance->token_ring_id_seq = instance->my_ring_id.seq; log_printf ( instance->totemsrp_log_level_debug, "Created or loaded sequence id %llx.%s for this ring.", instance->my_ring_id.seq, totemip_print (&instance->my_ring_id.rep)); if (instance->totemsrp_service_ready_fn) { instance->totemsrp_service_ready_fn (); } } for (i = 0; i < instance->totem_config->interfaces[iface_no].member_count; i++) { totemsrp_member_add (instance, &instance->totem_config->interfaces[iface_no].member_list[i], iface_no); } if (instance->iface_changes >= instance->totem_config->interface_count) { 
memb_state_gather_enter (instance, TOTEMSRP_GSFROM_INTERFACE_CHANGE); } } void totemsrp_net_mtu_adjust (struct totem_config *totem_config) { totem_config->net_mtu -= sizeof (struct mcast); } void totemsrp_service_ready_register ( void *context, void (*totem_service_ready) (void)) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->totemsrp_service_ready_fn = totem_service_ready; } int totemsrp_member_add ( void *context, const struct totem_ip_address *member, int link_no) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_member_add (instance->totemnet_context, &instance->my_id.addr[link_no], member, link_no); return (res); } int totemsrp_member_remove ( void *context, const struct totem_ip_address *member, int link_no) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_member_remove (instance->totemnet_context, member, link_no); return (res); } void totemsrp_threaded_mode_enable (void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->threaded_mode_enabled = 1; } void totemsrp_trans_ack (void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->waiting_trans_ack = 0; instance->totemsrp_waiting_trans_ack_cb_fn (0); } diff --git a/exec/totemudpu.c b/exec/totemudpu.c index 9e4864f9..8e625179 100644 --- a/exec/totemudpu.c +++ b/exec/totemudpu.c @@ -1,1178 +1,1178 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2012 Red Hat, Inc. * * All rights reserved. 
* * Author: Steven Dake (sdake@redhat.com) * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemudpu.h" #include "util.h" #include #include #include #include #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif #define MCAST_SOCKET_BUFFER_SIZE (TRANSMITS_ALLOWED * FRAME_SIZE_MAX) #define NETIF_STATE_REPORT_UP 1 #define NETIF_STATE_REPORT_DOWN 2 #define BIND_STATE_UNBOUND 0 #define BIND_STATE_REGULAR 1 #define BIND_STATE_LOOPBACK 2 struct totemudpu_member { struct qb_list_head list; struct totem_ip_address member; int fd; int active; }; struct totemudpu_instance { qb_loop_t *totemudpu_poll_handle; struct totem_interface *totem_interface; int netif_state_report; int netif_bind_state; void *context; void (*totemudpu_deliver_fn) ( void *context, const void *msg, unsigned int msg_len); void (*totemudpu_iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no); void (*totemudpu_target_set_completed) (void *context); /* * Function and data used to log messages */ int totemudpu_log_level_security; int totemudpu_log_level_error; int totemudpu_log_level_warning; int totemudpu_log_level_notice; int totemudpu_log_level_debug; int totemudpu_subsys_id; void (*totemudpu_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7))); void *udpu_context; char iov_buffer[FRAME_SIZE_MAX]; struct iovec totemudpu_iov_recv; struct qb_list_head member_list; int stats_sent; int stats_recv; int stats_delv; int stats_remcasts; int stats_orf_token; struct timeval stats_tv_start; struct totem_ip_address my_id; int firstrun; qb_loop_timer_handle timer_netif_check_timeout; unsigned int my_memb_entries; struct totem_config *totem_config; totemsrp_stats_t *stats; 
struct totem_ip_address token_target; int token_socket; qb_loop_timer_handle timer_merge_detect_timeout; int send_merge_detect_message; unsigned int merge_detect_messages_sent_before_timeout; }; struct work_item { const void *msg; unsigned int msg_len; struct totemudpu_instance *instance; }; static int totemudpu_build_sockets ( struct totemudpu_instance *instance, struct totem_ip_address *bindnet_address, struct totem_ip_address *bound_to); static int totemudpu_create_sending_socket( void *udpu_context, const struct totem_ip_address *member); int totemudpu_member_list_rebind_ip ( void *udpu_context); static void totemudpu_start_merge_detect_timeout( void *udpu_context); static void totemudpu_stop_merge_detect_timeout( void *udpu_context); static struct totem_ip_address localhost; static void totemudpu_instance_initialize (struct totemudpu_instance *instance) { memset (instance, 0, sizeof (struct totemudpu_instance)); instance->netif_state_report = NETIF_STATE_REPORT_UP | NETIF_STATE_REPORT_DOWN; instance->totemudpu_iov_recv.iov_base = instance->iov_buffer; instance->totemudpu_iov_recv.iov_len = FRAME_SIZE_MAX; //sizeof (instance->iov_buffer); /* * There is always atleast 1 processor */ instance->my_memb_entries = 1; qb_list_init (&instance->member_list); } #define log_printf(level, format, args...) \ do { \ instance->totemudpu_log_printf ( \ level, instance->totemudpu_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ (const char *)format, ##args); \ } while (0); #define LOGSYS_PERROR(err_num, level, fmt, args...) 
\ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemudpu_log_printf ( \ level, instance->totemudpu_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)", ##args, _error_ptr, err_num); \ } while(0) int totemudpu_crypto_set ( void *udpu_context, const char *cipher_type, const char *hash_type) { return (0); } static inline void ucast_sendmsg ( struct totemudpu_instance *instance, struct totem_ip_address *system_to, const void *msg, unsigned int msg_len) { struct msghdr msg_ucast; int res = 0; struct sockaddr_storage sockaddr; struct iovec iovec; int addrlen; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; /* * Build unicast message */ totemip_totemip_to_sockaddr_convert(system_to, instance->totem_interface->ip_port, &sockaddr, &addrlen); memset(&msg_ucast, 0, sizeof(msg_ucast)); msg_ucast.msg_name = &sockaddr; msg_ucast.msg_namelen = addrlen; msg_ucast.msg_iov = (void *)&iovec; msg_ucast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_ucast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_ucast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_ucast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_ucast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_ucast.msg_accrightslen = 0; #endif /* * Transmit unicast message * An error here is recovered by totemsrp */ res = sendmsg (instance->token_socket, &msg_ucast, MSG_NOSIGNAL); if (res < 0) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_debug, "sendmsg(ucast) failed (non-critical)"); } } static inline void mcast_sendmsg ( struct totemudpu_instance *instance, const void *msg, unsigned int msg_len, int only_active) { struct msghdr msg_mcast; int res = 0; struct iovec iovec; struct sockaddr_storage sockaddr; int addrlen; - struct qb_list_head *list; + struct qb_list_head *list; struct totemudpu_member *member; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; 
memset(&msg_mcast, 0, sizeof(msg_mcast)); /* * Build multicast message */ - qb_list_for_each(list, &(instance->member_list)) { + qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); /* * Do not send multicast message if message is not "flush", member * is inactive and timeout for sending merge message didn't expired. */ if (only_active && !member->active && !instance->send_merge_detect_message) continue ; totemip_totemip_to_sockaddr_convert(&member->member, instance->totem_interface->ip_port, &sockaddr, &addrlen); msg_mcast.msg_name = &sockaddr; msg_mcast.msg_namelen = addrlen; msg_mcast.msg_iov = (void *)&iovec; msg_mcast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_mcast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_mcast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_mcast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_mcast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_mcast.msg_accrightslen = 0; #endif /* * Transmit multicast message * An error here is recovered by totemsrp */ res = sendmsg (member->fd, &msg_mcast, MSG_NOSIGNAL); if (res < 0) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_debug, "sendmsg(mcast) failed (non-critical)"); } } if (!only_active || instance->send_merge_detect_message) { /* * Current message was sent to all nodes */ instance->merge_detect_messages_sent_before_timeout++; instance->send_merge_detect_message = 0; } } int totemudpu_finalize ( void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; if (instance->token_socket > 0) { qb_loop_poll_del (instance->totemudpu_poll_handle, instance->token_socket); close (instance->token_socket); } totemudpu_stop_merge_detect_timeout(instance); return (res); } static int net_deliver_fn ( int fd, int revents, void *data) { struct totemudpu_instance *instance = (struct totemudpu_instance *)data; struct msghdr msg_recv; struct 
iovec *iovec; struct sockaddr_storage system_from; int bytes_received; iovec = &instance->totemudpu_iov_recv; /* * Receive datagram */ msg_recv.msg_name = &system_from; msg_recv.msg_namelen = sizeof (struct sockaddr_storage); msg_recv.msg_iov = iovec; msg_recv.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_recv.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_recv.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_recv.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_recv.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_recv.msg_accrightslen = 0; #endif bytes_received = recvmsg (fd, &msg_recv, MSG_NOSIGNAL | MSG_DONTWAIT); if (bytes_received == -1) { return (0); } else { instance->stats_recv += bytes_received; } iovec->iov_len = bytes_received; /* * Handle incoming message */ instance->totemudpu_deliver_fn ( instance->context, iovec->iov_base, iovec->iov_len); iovec->iov_len = FRAME_SIZE_MAX; return (0); } static int netif_determine ( struct totemudpu_instance *instance, struct totem_ip_address *bindnet, struct totem_ip_address *bound_to, int *interface_up, int *interface_num) { int res; res = totemip_iface_check (bindnet, bound_to, interface_up, interface_num, instance->totem_config->clear_node_high_bit); return (res); } /* * If the interface is up, the sockets for totem are built. If the interface is down * this function is requeued in the timer list to retry building the sockets later. 
*/ static void timer_function_netif_check_timeout ( void *data) { struct totemudpu_instance *instance = (struct totemudpu_instance *)data; int interface_up; int interface_num; struct totem_ip_address *bind_address; /* * Build sockets for every interface */ netif_determine (instance, &instance->totem_interface->bindnet, &instance->totem_interface->boundto, &interface_up, &interface_num); /* * If the network interface isn't back up and we are already * in loopback mode, add timer to check again and return */ if ((instance->netif_bind_state == BIND_STATE_LOOPBACK && interface_up == 0) || (instance->my_memb_entries == 1 && instance->netif_bind_state == BIND_STATE_REGULAR && interface_up == 1)) { qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); /* * Add a timer to check for a downed regular interface */ return; } if (instance->token_socket > 0) { qb_loop_poll_del (instance->totemudpu_poll_handle, instance->token_socket); close (instance->token_socket); } if (interface_up == 0) { /* * Interface is not up */ instance->netif_bind_state = BIND_STATE_LOOPBACK; bind_address = &localhost; /* * Add a timer to retry building interfaces and request memb_gather_enter */ qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } else { /* * Interface is up */ instance->netif_bind_state = BIND_STATE_REGULAR; bind_address = &instance->totem_interface->bindnet; } /* * Create and bind the multicast and unicast sockets */ totemudpu_build_sockets (instance, bind_address, &instance->totem_interface->boundto); qb_loop_poll_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->token_socket, POLLIN, instance, net_deliver_fn); totemip_copy (&instance->my_id, 
&instance->totem_interface->boundto); /* * This reports changes in the interface to the user and totemsrp */ if (instance->netif_bind_state == BIND_STATE_REGULAR) { if (instance->netif_state_report & NETIF_STATE_REPORT_UP) { log_printf (instance->totemudpu_log_level_notice, "The network interface [%s] is now up.", totemip_print (&instance->totem_interface->boundto)); instance->netif_state_report = NETIF_STATE_REPORT_DOWN; instance->totemudpu_iface_change_fn (instance->context, &instance->my_id, 0); } /* * Add a timer to check for interface going down in single membership */ if (instance->my_memb_entries == 1) { qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } } else { if (instance->netif_state_report & NETIF_STATE_REPORT_DOWN) { log_printf (instance->totemudpu_log_level_notice, "The network interface is down."); instance->totemudpu_iface_change_fn (instance->context, &instance->my_id, 0); } instance->netif_state_report = NETIF_STATE_REPORT_UP; } } /* Set the socket priority to INTERACTIVE to ensure that our messages don't get queued behind anything else */ static void totemudpu_traffic_control_set(struct totemudpu_instance *instance, int sock) { #ifdef SO_PRIORITY int prio = 6; /* TC_PRIO_INTERACTIVE */ if (setsockopt(sock, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(int))) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not set traffic priority"); } #endif } static int totemudpu_build_sockets_ip ( struct totemudpu_instance *instance, struct totem_ip_address *bindnet_address, struct totem_ip_address *bound_to, int interface_num) { struct sockaddr_storage sockaddr; int addrlen; int res; unsigned int recvbuf_size; unsigned int optlen = sizeof (recvbuf_size); /* * Setup unicast socket */ instance->token_socket = socket (bindnet_address->family, SOCK_DGRAM, 0); if 
(instance->token_socket == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "socket() failed"); return (-1); } totemip_nosigpipe (instance->token_socket); res = fcntl (instance->token_socket, F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not set non-blocking operation on token socket"); return (-1); } /* * Bind to unicast socket used for token send/receives * This has the side effect of binding to the correct interface */ totemip_totemip_to_sockaddr_convert(bound_to, instance->totem_interface->ip_port, &sockaddr, &addrlen); res = bind (instance->token_socket, (struct sockaddr *)&sockaddr, addrlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "bind token socket failed"); return (-1); } /* * the token_socket can receive many messages. Allow a large number * of receive messages on this socket */ recvbuf_size = MCAST_SOCKET_BUFFER_SIZE; res = setsockopt (instance->token_socket, SOL_SOCKET, SO_RCVBUF, &recvbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_notice, "Could not set recvbuf size"); } return 0; } int totemudpu_ifaces_get ( void *net_context, char ***status, unsigned int *iface_count) { static char *statuses[INTERFACE_MAX] = {(char*)"OK"}; if (status) { *status = statuses; } *iface_count = 1; return (0); } static int totemudpu_build_sockets ( struct totemudpu_instance *instance, struct totem_ip_address *bindnet_address, struct totem_ip_address *bound_to) { int interface_num; int interface_up; int res; /* * Determine the ip address bound to and the interface name */ res = netif_determine (instance, bindnet_address, bound_to, &interface_up, &interface_num); if (res == -1) { return (-1); } totemip_copy(&instance->my_id, bound_to); res = totemudpu_build_sockets_ip (instance, bindnet_address, bound_to, interface_num); /* We only send out of the token socket */ totemudpu_traffic_control_set(instance, 
instance->token_socket); /* * Rebind all members to new ips */ totemudpu_member_list_rebind_ip(instance); return res; } /* * Totem Network interface * depends on poll abstraction, POSIX, IPV4 */ /* * Create an instance */ int totemudpu_initialize ( qb_loop_t *poll_handle, void **udpu_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, void (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len), void (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)) { struct totemudpu_instance *instance; instance = malloc (sizeof (struct totemudpu_instance)); if (instance == NULL) { return (-1); } totemudpu_instance_initialize (instance); instance->totem_config = totem_config; instance->stats = stats; /* * Configure logging */ instance->totemudpu_log_level_security = 1; //totem_config->totem_logging_configuration.log_level_security; instance->totemudpu_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemudpu_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemudpu_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemudpu_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemudpu_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemudpu_log_printf = totem_config->totem_logging_configuration.log_printf; /* * Initialize local variables for totemudpu */ instance->totem_interface = &totem_config->interfaces[0]; memset (instance->iov_buffer, 0, FRAME_SIZE_MAX); instance->totemudpu_poll_handle = poll_handle; instance->totem_interface->bindnet.nodeid = instance->totem_config->node_id; instance->context = context; instance->totemudpu_deliver_fn = deliver_fn; instance->totemudpu_iface_change_fn = 
iface_change_fn; instance->totemudpu_target_set_completed = target_set_completed; totemip_localhost (AF_INET, &localhost); localhost.nodeid = instance->totem_config->node_id; /* * RRP layer isn't ready to receive message because it hasn't * initialized yet. Add short timer to check the interfaces. */ qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, 100*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); totemudpu_start_merge_detect_timeout(instance); *udpu_context = instance; return (0); } void *totemudpu_buffer_alloc (void) { return malloc (FRAME_SIZE_MAX); } void totemudpu_buffer_release (void *ptr) { return free (ptr); } int totemudpu_processor_count_set ( void *udpu_context, int processor_count) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; instance->my_memb_entries = processor_count; qb_loop_timer_del (instance->totemudpu_poll_handle, instance->timer_netif_check_timeout); if (processor_count == 1) { qb_loop_timer_add (instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->downcheck_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } return (res); } int totemudpu_recv_flush (void *udpu_context) { int res = 0; return (res); } int totemudpu_send_flush (void *udpu_context) { int res = 0; return (res); } int totemudpu_token_send ( void *udpu_context, const void *msg, unsigned int msg_len) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; ucast_sendmsg (instance, &instance->token_target, msg, msg_len); return (res); } int totemudpu_mcast_flush_send ( void *udpu_context, const void *msg, unsigned int msg_len) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; mcast_sendmsg (instance, msg, msg_len, 0); return (res); } int totemudpu_mcast_noflush_send ( void 
*udpu_context, const void *msg, unsigned int msg_len) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; mcast_sendmsg (instance, msg, msg_len, 1); return (res); } extern int totemudpu_iface_check (void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; timer_function_netif_check_timeout (instance); return (res); } extern void totemudpu_net_mtu_adjust (void *udpu_context, struct totem_config *totem_config) { assert(totem_config->interface_count > 0); totem_config->net_mtu -= totemip_udpip_header_size(totem_config->interfaces[0].bindnet.family); } int totemudpu_token_target_set ( void *udpu_context, const struct totem_ip_address *token_target) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int res = 0; memcpy (&instance->token_target, token_target, sizeof (struct totem_ip_address)); instance->totemudpu_target_set_completed (instance->context); return (res); } extern int totemudpu_recv_mcast_empty ( void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; unsigned int res; struct sockaddr_storage system_from; struct msghdr msg_recv; struct pollfd ufd; int nfds; int msg_processed = 0; /* * Receive datagram */ msg_recv.msg_name = &system_from; msg_recv.msg_namelen = sizeof (struct sockaddr_storage); msg_recv.msg_iov = &instance->totemudpu_iov_recv; msg_recv.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_recv.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_recv.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_recv.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_recv.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_recv.msg_accrightslen = 0; #endif do { ufd.fd = instance->token_socket; ufd.events = POLLIN; nfds = poll (&ufd, 1, 0); if (nfds == 1 && ufd.revents & POLLIN) { res = recvmsg (instance->token_socket, &msg_recv, MSG_NOSIGNAL | 
MSG_DONTWAIT); if (res != -1) { msg_processed = 1; } else { msg_processed = -1; } } } while (nfds == 1); return (msg_processed); } static int totemudpu_create_sending_socket( void *udpu_context, const struct totem_ip_address *member) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; int fd; int res; unsigned int sendbuf_size; unsigned int optlen = sizeof (sendbuf_size); struct sockaddr_storage sockaddr; int addrlen; fd = socket (member->family, SOCK_DGRAM, 0); if (fd == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not create socket for new member"); return (-1); } totemip_nosigpipe (fd); res = fcntl (fd, F_SETFL, O_NONBLOCK); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "Could not set non-blocking operation on token socket"); goto error_close_fd; } /* * These sockets are used to send multicast messages, so their buffers * should be large */ sendbuf_size = MCAST_SOCKET_BUFFER_SIZE; res = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &sendbuf_size, optlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_notice, "Could not set sendbuf size"); /* * Fail in setting sendbuf size is not fatal -> don't exit */ } /* * Bind to sending interface */ totemip_totemip_to_sockaddr_convert(&instance->my_id, 0, &sockaddr, &addrlen); res = bind (fd, (struct sockaddr *)&sockaddr, addrlen); if (res == -1) { LOGSYS_PERROR (errno, instance->totemudpu_log_level_warning, "bind token socket failed"); goto error_close_fd; } return (fd); error_close_fd: close(fd); return (-1); } int totemudpu_member_add ( void *udpu_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; struct totemudpu_member *new_member; new_member = malloc (sizeof (struct totemudpu_member)); if (new_member == NULL) { return (-1); } memset(new_member, 0, sizeof(*new_member)); log_printf 
(LOGSYS_LEVEL_NOTICE, "adding new UDPU member {%s}", totemip_print(member)); qb_list_init (&new_member->list); qb_list_add_tail (&new_member->list, &instance->member_list); memcpy (&new_member->member, member, sizeof (struct totem_ip_address)); new_member->fd = totemudpu_create_sending_socket(udpu_context, member); new_member->active = 1; return (0); } int totemudpu_member_remove ( void *udpu_context, const struct totem_ip_address *token_target, int ring_no) { int found = 0; struct qb_list_head *list; struct totemudpu_member *member; struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; /* * Find the member to remove and close its socket */ - qb_list_for_each(list, &(instance->member_list)) { + qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); if (totemip_compare (token_target, &member->member)==0) { log_printf(LOGSYS_LEVEL_NOTICE, "removing UDPU member {%s}", totemip_print(&member->member)); if (member->fd > 0) { log_printf(LOGSYS_LEVEL_DEBUG, "Closing socket to: {%s}", totemip_print(&member->member)); qb_loop_poll_del (instance->totemudpu_poll_handle, member->fd); close (member->fd); } found = 1; break; } } /* * Delete the member from the list */ if (found) { qb_list_del (list); } instance = NULL; return (0); } int totemudpu_member_list_rebind_ip ( void *udpu_context) { struct qb_list_head *list; struct totemudpu_member *member; struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; - qb_list_for_each(list, &(instance->member_list)) { + qb_list_for_each(list, &(instance->member_list)) { member = qb_list_entry (list, struct totemudpu_member, list); if (member->fd > 0) { close (member->fd); } member->fd = totemudpu_create_sending_socket(udpu_context, &member->member); } return (0); } static void timer_function_merge_detect_timeout ( void *data) { struct totemudpu_instance *instance = (struct totemudpu_instance *)data; if 
(instance->merge_detect_messages_sent_before_timeout == 0) { instance->send_merge_detect_message = 1; } instance->merge_detect_messages_sent_before_timeout = 0; totemudpu_start_merge_detect_timeout(instance); } static void totemudpu_start_merge_detect_timeout( void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; qb_loop_timer_add(instance->totemudpu_poll_handle, QB_LOOP_MED, instance->totem_config->merge_timeout * 2 * QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); } static void totemudpu_stop_merge_detect_timeout( void *udpu_context) { struct totemudpu_instance *instance = (struct totemudpu_instance *)udpu_context; qb_loop_timer_del(instance->totemudpu_poll_handle, instance->timer_merge_detect_timeout); } diff --git a/exec/vsf_quorum.c b/exec/vsf_quorum.c index 4a2ab48e..425c21d3 100644 --- a/exec/vsf_quorum.c +++ b/exec/vsf_quorum.c @@ -1,484 +1,485 @@ /* * Copyright (c) 2008-2015 Red Hat, Inc. * * All rights reserved. * * Author: Christine Caulfield (ccaulfie@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of Red Hat Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "quorum.h" #include #include #include #include #include #include #include #include #include #include #include #include "service.h" #include "votequorum.h" #include "vsf_ykd.h" LOGSYS_DECLARE_SUBSYS ("QUORUM"); struct quorum_pd { unsigned char track_flags; int tracking_enabled; struct qb_list_head list; void *conn; }; struct internal_callback_pd { struct qb_list_head list; quorum_callback_fn_t callback; void *context; }; static void message_handler_req_lib_quorum_getquorate (void *conn, const void *msg); static void message_handler_req_lib_quorum_trackstart (void *conn, const void *msg); static void message_handler_req_lib_quorum_trackstop (void *conn, const void *msg); static void message_handler_req_lib_quorum_gettype (void *conn, const void *msg); static void send_library_notification(void *conn); static void send_internal_notification(void); static char *quorum_exec_init_fn (struct corosync_api_v1 *api); static int quorum_lib_init_fn (void *conn); static int quorum_lib_exit_fn (void *conn); static int 
primary_designated = 0; static int quorum_type = 0; static struct corosync_api_v1 *corosync_api; static struct qb_list_head lib_trackers_list; static struct qb_list_head internal_trackers_list; static struct memb_ring_id quorum_ring_id; static size_t quorum_view_list_entries = 0; static int quorum_view_list[PROCESSOR_COUNT_MAX]; struct quorum_services_api_ver1 *quorum_iface = NULL; static char view_buf[64]; static void log_view_list(const unsigned int *view_list, size_t view_list_entries) { int total = (int)view_list_entries; int len, pos, ret; int i = 0; while (1) { len = sizeof(view_buf); pos = 0; memset(view_buf, 0, len); for (; i < total; i++) { ret = snprintf(view_buf + pos, len - pos, " %u", view_list[i]); if (ret >= len - pos) break; pos += ret; } log_printf (LOGSYS_LEVEL_NOTICE, "Members[%d]:%s%s", total, view_buf, i < total ? "\\" : ""); if (i == total) break; } } /* Internal quorum API function */ static void quorum_api_set_quorum(const unsigned int *view_list, size_t view_list_entries, int quorum, struct memb_ring_id *ring_id) { int old_quorum = primary_designated; primary_designated = quorum; if (primary_designated && !old_quorum) { log_printf (LOGSYS_LEVEL_NOTICE, "This node is within the primary component and will provide service."); } else if (!primary_designated && old_quorum) { log_printf (LOGSYS_LEVEL_NOTICE, "This node is within the non-primary component and will NOT provide any services."); } quorum_view_list_entries = view_list_entries; memcpy(&quorum_ring_id, ring_id, sizeof (quorum_ring_id)); memcpy(quorum_view_list, view_list, sizeof(unsigned int)*view_list_entries); log_view_list(view_list, view_list_entries); /* Tell internal listeners */ send_internal_notification(); /* Tell IPC listeners */ send_library_notification(NULL); } static struct corosync_lib_handler quorum_lib_service[] = { { /* 0 */ .lib_handler_fn = message_handler_req_lib_quorum_getquorate, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 1 */ .lib_handler_fn = 
message_handler_req_lib_quorum_trackstart, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 2 */ .lib_handler_fn = message_handler_req_lib_quorum_trackstop, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 3 */ .lib_handler_fn = message_handler_req_lib_quorum_gettype, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED } }; static struct corosync_service_engine quorum_service_handler = { .name = "corosync cluster quorum service v0.1", .id = QUORUM_SERVICE, .priority = 1, .private_data_size = sizeof (struct quorum_pd), .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED, .allow_inquorate = CS_LIB_ALLOW_INQUORATE, .lib_init_fn = quorum_lib_init_fn, .lib_exit_fn = quorum_lib_exit_fn, .lib_engine = quorum_lib_service, .exec_init_fn = quorum_exec_init_fn, .lib_engine_count = sizeof (quorum_lib_service) / sizeof (struct corosync_lib_handler) }; struct corosync_service_engine *vsf_quorum_get_service_engine_ver0 (void) { return (&quorum_service_handler); } /* -------------------------------------------------- */ /* * Internal API functions for corosync */ static int quorum_quorate(void) { return primary_designated; } static int quorum_register_callback(quorum_callback_fn_t function, void *context) { struct internal_callback_pd *pd = malloc(sizeof(struct internal_callback_pd)); if (!pd) return -1; pd->context = context; pd->callback = function; qb_list_add (&pd->list, &internal_trackers_list); return 0; } static int quorum_unregister_callback(quorum_callback_fn_t function, void *context) { struct internal_callback_pd *pd; - struct qb_list_head *tmp; + struct qb_list_head *tmp, *tmp_iter; - qb_list_for_each(tmp, &internal_trackers_list) { + qb_list_for_each_safe(tmp, tmp_iter, &internal_trackers_list) { pd = qb_list_entry(tmp, struct internal_callback_pd, list); if (pd->callback == function && pd->context == context) { qb_list_del(&pd->list); free(pd); return 0; } } return -1; } static struct quorum_callin_functions callins = { .quorate = quorum_quorate, 
.register_callback = quorum_register_callback, .unregister_callback = quorum_unregister_callback }; /* --------------------------------------------------------------------- */ static char *quorum_exec_init_fn (struct corosync_api_v1 *api) { char *quorum_module = NULL; char *error; corosync_api = api; qb_list_init (&lib_trackers_list); qb_list_init (&internal_trackers_list); /* * Tell corosync we have a quorum engine. */ api->quorum_initialize(&callins); /* * Look for a quorum provider */ if (icmap_get_string("quorum.provider", &quorum_module) == CS_OK) { log_printf (LOGSYS_LEVEL_NOTICE, "Using quorum provider %s", quorum_module); error = (char *)"Invalid quorum provider"; if (strcmp (quorum_module, "corosync_votequorum") == 0) { error = votequorum_init (api, quorum_api_set_quorum); quorum_type = 1; } if (strcmp (quorum_module, "corosync_ykd") == 0) { error = ykd_init (api, quorum_api_set_quorum); quorum_type = 1; } if (error) { log_printf (LOGSYS_LEVEL_CRIT, "Quorum provider: %s failed to initialize.", quorum_module); free(quorum_module); return (error); } } if (quorum_module) { free(quorum_module); quorum_module = NULL; } /* * setting quorum_type and primary_designated in the right order is important * always try to lookup/init a quorum module, then revert back to be quorate */ if (quorum_type == 0) { primary_designated = 1; } return (NULL); } static int quorum_lib_init_fn (void *conn) { struct quorum_pd *pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn); log_printf(LOGSYS_LEVEL_DEBUG, "lib_init_fn: conn=%p", conn); qb_list_init (&pd->list); pd->conn = conn; return (0); } static int quorum_lib_exit_fn (void *conn) { struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn); log_printf(LOGSYS_LEVEL_DEBUG, "lib_exit_fn: conn=%p", conn); if (quorum_pd->tracking_enabled) { qb_list_del (&quorum_pd->list); qb_list_init (&quorum_pd->list); } return (0); } static void send_internal_notification(void) { struct qb_list_head 
*tmp; struct internal_callback_pd *pd; - qb_list_for_each(tmp, &internal_trackers_list) { + + qb_list_for_each(tmp, &internal_trackers_list) { pd = qb_list_entry(tmp, struct internal_callback_pd, list); pd->callback(primary_designated, pd->context); } } static void send_library_notification(void *conn) { int size = sizeof(struct res_lib_quorum_notification) + sizeof(unsigned int)*quorum_view_list_entries; char buf[size]; struct res_lib_quorum_notification *res_lib_quorum_notification = (struct res_lib_quorum_notification *)buf; struct qb_list_head *tmp; int i; log_printf(LOGSYS_LEVEL_DEBUG, "sending quorum notification to %p, length = %d", conn, size); res_lib_quorum_notification->quorate = primary_designated; res_lib_quorum_notification->ring_seq = quorum_ring_id.seq; res_lib_quorum_notification->view_list_entries = quorum_view_list_entries; for (i=0; iview_list[i] = quorum_view_list[i]; } res_lib_quorum_notification->header.id = MESSAGE_RES_QUORUM_NOTIFICATION; res_lib_quorum_notification->header.size = size; res_lib_quorum_notification->header.error = CS_OK; /* Send it to all interested parties */ if (conn) { corosync_api->ipc_dispatch_send(conn, res_lib_quorum_notification, size); } else { struct quorum_pd *qpd; - qb_list_for_each(tmp, &lib_trackers_list) { + qb_list_for_each(tmp, &lib_trackers_list) { qpd = qb_list_entry(tmp, struct quorum_pd, list); corosync_api->ipc_dispatch_send(qpd->conn, res_lib_quorum_notification, size); } } return; } static void message_handler_req_lib_quorum_getquorate (void *conn, const void *msg) { struct res_lib_quorum_getquorate res_lib_quorum_getquorate; log_printf(LOGSYS_LEVEL_DEBUG, "got quorate request on %p", conn); /* send status */ res_lib_quorum_getquorate.quorate = primary_designated; res_lib_quorum_getquorate.header.size = sizeof(res_lib_quorum_getquorate); res_lib_quorum_getquorate.header.id = MESSAGE_RES_QUORUM_GETQUORATE; res_lib_quorum_getquorate.header.error = CS_OK; corosync_api->ipc_response_send(conn, 
&res_lib_quorum_getquorate, sizeof(res_lib_quorum_getquorate)); } static void message_handler_req_lib_quorum_trackstart (void *conn, const void *msg) { const struct req_lib_quorum_trackstart *req_lib_quorum_trackstart = msg; struct qb_ipc_response_header res; struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn); cs_error_t error = CS_OK; log_printf(LOGSYS_LEVEL_DEBUG, "got trackstart request on %p", conn); /* * If an immediate listing of the current cluster membership * is requested, generate membership list */ if (req_lib_quorum_trackstart->track_flags & CS_TRACK_CURRENT || req_lib_quorum_trackstart->track_flags & CS_TRACK_CHANGES) { log_printf(LOGSYS_LEVEL_DEBUG, "sending initial status to %p", conn); send_library_notification(conn); } if (quorum_pd->tracking_enabled) { error = CS_ERR_EXIST; goto response_send; } /* * Record requests for tracking */ if (req_lib_quorum_trackstart->track_flags & CS_TRACK_CHANGES || req_lib_quorum_trackstart->track_flags & CS_TRACK_CHANGES_ONLY) { quorum_pd->track_flags = req_lib_quorum_trackstart->track_flags; quorum_pd->tracking_enabled = 1; qb_list_add (&quorum_pd->list, &lib_trackers_list); } response_send: /* send status */ res.size = sizeof(res); res.id = MESSAGE_RES_QUORUM_TRACKSTART; res.error = error; corosync_api->ipc_response_send(conn, &res, sizeof(struct qb_ipc_response_header)); } static void message_handler_req_lib_quorum_trackstop (void *conn, const void *msg) { struct qb_ipc_response_header res; struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn); log_printf(LOGSYS_LEVEL_DEBUG, "got trackstop request on %p", conn); if (quorum_pd->tracking_enabled) { res.error = CS_OK; quorum_pd->tracking_enabled = 0; qb_list_del (&quorum_pd->list); qb_list_init (&quorum_pd->list); } else { res.error = CS_ERR_NOT_EXIST; } /* send status */ res.size = sizeof(res); res.id = MESSAGE_RES_QUORUM_TRACKSTOP; res.error = CS_OK; 
corosync_api->ipc_response_send(conn, &res, sizeof(struct qb_ipc_response_header)); } static void message_handler_req_lib_quorum_gettype (void *conn, const void *msg) { struct res_lib_quorum_gettype res_lib_quorum_gettype; log_printf(LOGSYS_LEVEL_DEBUG, "got quorum_type request on %p", conn); /* send status */ res_lib_quorum_gettype.quorum_type = quorum_type; res_lib_quorum_gettype.header.size = sizeof(res_lib_quorum_gettype); res_lib_quorum_gettype.header.id = MESSAGE_RES_QUORUM_GETTYPE; res_lib_quorum_gettype.header.error = CS_OK; corosync_api->ipc_response_send(conn, &res_lib_quorum_gettype, sizeof(res_lib_quorum_gettype)); } diff --git a/lib/cpg.c b/lib/cpg.c index bdff4756..7adf6e1b 100644 --- a/lib/cpg.c +++ b/lib/cpg.c @@ -1,1369 +1,1369 @@ /* * vi: set autoindent tabstop=4 shiftwidth=4 : * * Copyright (c) 2006-2015 Red Hat, Inc. * * All rights reserved. * * Author: Christine Caulfield (ccaulfi@redhat.com) * Author: Jan Friesse (jfriesse@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * Provides a closed process group API using the coroipcc executive */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif /* * Maximum number of times to retry a send when transmitting * a large message fragment */ #define MAX_RETRIES 100 /* * ZCB files have following umask (umask is same as used in libqb) */ #define CPG_MEMORY_MAP_UMASK 077 struct cpg_inst { qb_ipcc_connection_t *c; int finalize; void *context; union { cpg_model_data_t model_data; cpg_model_v1_data_t model_v1_data; }; struct qb_list_head iteration_list_head; uint32_t max_msg_size; char *assembly_buf; uint32_t assembly_buf_ptr; int assembling; /* Flag that says we have started assembling a message. * It's here to catch the situation where a node joins * the cluster/group in the middle of a CPG message send * so we don't pass on a partial message to the client. 
*/ }; static void cpg_inst_free (void *inst); DECLARE_HDB_DATABASE(cpg_handle_t_db, cpg_inst_free); struct cpg_iteration_instance_t { cpg_iteration_handle_t cpg_iteration_handle; qb_ipcc_connection_t *conn; hdb_handle_t executive_iteration_handle; struct qb_list_head list; }; DECLARE_HDB_DATABASE(cpg_iteration_handle_t_db,NULL); /* * Internal (not visible by API) functions */ static cs_error_t coroipcc_msg_send_reply_receive ( qb_ipcc_connection_t *c, const struct iovec *iov, unsigned int iov_len, void *res_msg, size_t res_len) { return qb_to_cs_error(qb_ipcc_sendv_recv(c, iov, iov_len, res_msg, res_len, CS_IPC_TIMEOUT_MS)); } static void cpg_iteration_instance_finalize (struct cpg_iteration_instance_t *cpg_iteration_instance) { qb_list_del (&cpg_iteration_instance->list); hdb_handle_destroy (&cpg_iteration_handle_t_db, cpg_iteration_instance->cpg_iteration_handle); } static void cpg_inst_free (void *inst) { struct cpg_inst *cpg_inst = (struct cpg_inst *)inst; qb_ipcc_disconnect(cpg_inst->c); } static void cpg_inst_finalize (struct cpg_inst *cpg_inst, hdb_handle_t handle) { - struct qb_list_head *iter; + struct qb_list_head *iter, *tmp_iter; struct cpg_iteration_instance_t *cpg_iteration_instance; /* * Traverse thru iteration instances and delete them */ - qb_list_for_each(iter, &(cpg_inst->iteration_list_head)) { + qb_list_for_each_safe(iter, tmp_iter, &(cpg_inst->iteration_list_head)) { cpg_iteration_instance = qb_list_entry (iter, struct cpg_iteration_instance_t, list); cpg_iteration_instance_finalize (cpg_iteration_instance); } hdb_handle_destroy (&cpg_handle_t_db, handle); } /** * @defgroup cpg_coroipcc The closed process group API * @ingroup coroipcc * * @{ */ cs_error_t cpg_initialize ( cpg_handle_t *handle, cpg_callbacks_t *callbacks) { cpg_model_v1_data_t model_v1_data; memset (&model_v1_data, 0, sizeof (cpg_model_v1_data_t)); if (callbacks) { model_v1_data.cpg_deliver_fn = callbacks->cpg_deliver_fn; model_v1_data.cpg_confchg_fn = 
callbacks->cpg_confchg_fn; } return (cpg_model_initialize (handle, CPG_MODEL_V1, (cpg_model_data_t *)&model_v1_data, NULL)); } cs_error_t cpg_model_initialize ( cpg_handle_t *handle, cpg_model_t model, cpg_model_data_t *model_data, void *context) { cs_error_t error; struct cpg_inst *cpg_inst; if (model != CPG_MODEL_V1) { error = CS_ERR_INVALID_PARAM; goto error_no_destroy; } error = hdb_error_to_cs (hdb_handle_create (&cpg_handle_t_db, sizeof (struct cpg_inst), handle)); if (error != CS_OK) { goto error_no_destroy; } error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, *handle, (void *)&cpg_inst)); if (error != CS_OK) { goto error_destroy; } cpg_inst->c = qb_ipcc_connect ("cpg", IPC_REQUEST_SIZE); if (cpg_inst->c == NULL) { error = qb_to_cs_error(-errno); goto error_put_destroy; } if (model_data != NULL) { switch (model) { case CPG_MODEL_V1: memcpy (&cpg_inst->model_v1_data, model_data, sizeof (cpg_model_v1_data_t)); if ((cpg_inst->model_v1_data.flags & ~(CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF)) != 0) { error = CS_ERR_INVALID_PARAM; goto error_destroy; } break; } } /* Allow space for corosync internal headers */ cpg_inst->max_msg_size = IPC_REQUEST_SIZE - 1024; cpg_inst->model_data.model = model; cpg_inst->context = context; qb_list_init(&cpg_inst->iteration_list_head); hdb_handle_put (&cpg_handle_t_db, *handle); return (CS_OK); error_put_destroy: hdb_handle_put (&cpg_handle_t_db, *handle); error_destroy: hdb_handle_destroy (&cpg_handle_t_db, *handle); error_no_destroy: return (error); } cs_error_t cpg_finalize ( cpg_handle_t handle) { struct cpg_inst *cpg_inst; struct iovec iov; struct req_lib_cpg_finalize req_lib_cpg_finalize; struct res_lib_cpg_finalize res_lib_cpg_finalize; cs_error_t error; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } /* * Another thread has already started finalizing */ if (cpg_inst->finalize) { hdb_handle_put (&cpg_handle_t_db, handle); return 
(CS_ERR_BAD_HANDLE); } cpg_inst->finalize = 1; /* * Send service request */ req_lib_cpg_finalize.header.size = sizeof (struct req_lib_cpg_finalize); req_lib_cpg_finalize.header.id = MESSAGE_REQ_CPG_FINALIZE; iov.iov_base = (void *)&req_lib_cpg_finalize; iov.iov_len = sizeof (struct req_lib_cpg_finalize); error = coroipcc_msg_send_reply_receive (cpg_inst->c, &iov, 1, &res_lib_cpg_finalize, sizeof (struct res_lib_cpg_finalize)); cpg_inst_finalize (cpg_inst, handle); hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_fd_get ( cpg_handle_t handle, int *fd) { cs_error_t error; struct cpg_inst *cpg_inst; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } error = qb_to_cs_error (qb_ipcc_fd_get (cpg_inst->c, fd)); hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_max_atomic_msgsize_get ( cpg_handle_t handle, uint32_t *size) { cs_error_t error; struct cpg_inst *cpg_inst; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } *size = cpg_inst->max_msg_size; hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_context_get ( cpg_handle_t handle, void **context) { cs_error_t error; struct cpg_inst *cpg_inst; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } *context = cpg_inst->context; hdb_handle_put (&cpg_handle_t_db, handle); return (CS_OK); } cs_error_t cpg_context_set ( cpg_handle_t handle, void *context) { cs_error_t error; struct cpg_inst *cpg_inst; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } cpg_inst->context = context; hdb_handle_put (&cpg_handle_t_db, handle); return (CS_OK); } cs_error_t cpg_dispatch ( cpg_handle_t handle, cs_dispatch_flags_t dispatch_types) { int timeout = -1; cs_error_t 
error; int cont = 1; /* always continue do loop except when set to 0 */ struct cpg_inst *cpg_inst; struct res_lib_cpg_confchg_callback *res_cpg_confchg_callback; struct res_lib_cpg_deliver_callback *res_cpg_deliver_callback; struct res_lib_cpg_partial_deliver_callback *res_cpg_partial_deliver_callback; struct res_lib_cpg_totem_confchg_callback *res_cpg_totem_confchg_callback; struct cpg_inst cpg_inst_copy; struct qb_ipc_response_header *dispatch_data; struct cpg_address member_list[CPG_MEMBERS_MAX]; struct cpg_address left_list[CPG_MEMBERS_MAX]; struct cpg_address joined_list[CPG_MEMBERS_MAX]; struct cpg_name group_name; mar_cpg_address_t *left_list_start; mar_cpg_address_t *joined_list_start; unsigned int i; struct cpg_ring_id ring_id; uint32_t totem_member_list[CPG_MEMBERS_MAX]; int32_t errno_res; char dispatch_buf[IPC_DISPATCH_SIZE]; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } /* * Timeout instantly for CS_DISPATCH_ONE_NONBLOCKING or CS_DISPATCH_ALL and * wait indefinitely for CS_DISPATCH_ONE or CS_DISPATCH_BLOCKING */ if (dispatch_types == CS_DISPATCH_ALL || dispatch_types == CS_DISPATCH_ONE_NONBLOCKING) { timeout = 0; } dispatch_data = (struct qb_ipc_response_header *)dispatch_buf; do { errno_res = qb_ipcc_event_recv ( cpg_inst->c, dispatch_buf, IPC_DISPATCH_SIZE, timeout); error = qb_to_cs_error (errno_res); if (error == CS_ERR_BAD_HANDLE) { error = CS_OK; goto error_put; } if (error == CS_ERR_TRY_AGAIN) { if (dispatch_types == CS_DISPATCH_ONE_NONBLOCKING) { /* * Don't mask error */ goto error_put; } error = CS_OK; if (dispatch_types == CS_DISPATCH_ALL) { break; /* exit do while cont is 1 loop */ } else { continue; /* next poll */ } } if (error != CS_OK) { goto error_put; } /* * Make copy of callbacks, message data, unlock instance, and call callback * A risk of this dispatch method is that the callback routines may * operate at the same time that cpgFinalize has been called. 
*/ memcpy (&cpg_inst_copy, cpg_inst, sizeof (struct cpg_inst)); switch (cpg_inst_copy.model_data.model) { case CPG_MODEL_V1: /* * Dispatch incoming message */ switch (dispatch_data->id) { case MESSAGE_RES_CPG_DELIVER_CALLBACK: if (cpg_inst_copy.model_v1_data.cpg_deliver_fn == NULL) { break; } res_cpg_deliver_callback = (struct res_lib_cpg_deliver_callback *)dispatch_data; marshall_from_mar_cpg_name_t ( &group_name, &res_cpg_deliver_callback->group_name); cpg_inst_copy.model_v1_data.cpg_deliver_fn (handle, &group_name, res_cpg_deliver_callback->nodeid, res_cpg_deliver_callback->pid, &res_cpg_deliver_callback->message, res_cpg_deliver_callback->msglen); break; case MESSAGE_RES_CPG_PARTIAL_DELIVER_CALLBACK: res_cpg_partial_deliver_callback = (struct res_lib_cpg_partial_deliver_callback *)dispatch_data; marshall_from_mar_cpg_name_t ( &group_name, &res_cpg_partial_deliver_callback->group_name); if (res_cpg_partial_deliver_callback->type == LIBCPG_PARTIAL_FIRST) { /* * Allocate a buffer to contain a full message. 
*/ cpg_inst->assembly_buf = malloc(res_cpg_partial_deliver_callback->msglen); if (!cpg_inst->assembly_buf) { error = CS_ERR_NO_MEMORY; goto error_put; } cpg_inst->assembling = 1; cpg_inst->assembly_buf_ptr = 0; } if (cpg_inst->assembling) { memcpy(cpg_inst->assembly_buf + cpg_inst->assembly_buf_ptr, res_cpg_partial_deliver_callback->message, res_cpg_partial_deliver_callback->fraglen); cpg_inst->assembly_buf_ptr += res_cpg_partial_deliver_callback->fraglen; if (res_cpg_partial_deliver_callback->type == LIBCPG_PARTIAL_LAST) { cpg_inst_copy.model_v1_data.cpg_deliver_fn (handle, &group_name, res_cpg_partial_deliver_callback->nodeid, res_cpg_partial_deliver_callback->pid, cpg_inst->assembly_buf, res_cpg_partial_deliver_callback->msglen); free(cpg_inst->assembly_buf); cpg_inst->assembling = 0; } } break; case MESSAGE_RES_CPG_CONFCHG_CALLBACK: if (cpg_inst_copy.model_v1_data.cpg_confchg_fn == NULL) { break; } res_cpg_confchg_callback = (struct res_lib_cpg_confchg_callback *)dispatch_data; for (i = 0; i < res_cpg_confchg_callback->member_list_entries; i++) { marshall_from_mar_cpg_address_t (&member_list[i], &res_cpg_confchg_callback->member_list[i]); } left_list_start = res_cpg_confchg_callback->member_list + res_cpg_confchg_callback->member_list_entries; for (i = 0; i < res_cpg_confchg_callback->left_list_entries; i++) { marshall_from_mar_cpg_address_t (&left_list[i], &left_list_start[i]); } joined_list_start = res_cpg_confchg_callback->member_list + res_cpg_confchg_callback->member_list_entries + res_cpg_confchg_callback->left_list_entries; for (i = 0; i < res_cpg_confchg_callback->joined_list_entries; i++) { marshall_from_mar_cpg_address_t (&joined_list[i], &joined_list_start[i]); } marshall_from_mar_cpg_name_t ( &group_name, &res_cpg_confchg_callback->group_name); cpg_inst_copy.model_v1_data.cpg_confchg_fn (handle, &group_name, member_list, res_cpg_confchg_callback->member_list_entries, left_list, res_cpg_confchg_callback->left_list_entries, joined_list, 
res_cpg_confchg_callback->joined_list_entries); break; case MESSAGE_RES_CPG_TOTEM_CONFCHG_CALLBACK: if (cpg_inst_copy.model_v1_data.cpg_totem_confchg_fn == NULL) { break; } res_cpg_totem_confchg_callback = (struct res_lib_cpg_totem_confchg_callback *)dispatch_data; marshall_from_mar_cpg_ring_id_t (&ring_id, &res_cpg_totem_confchg_callback->ring_id); for (i = 0; i < res_cpg_totem_confchg_callback->member_list_entries; i++) { totem_member_list[i] = res_cpg_totem_confchg_callback->member_list[i]; } cpg_inst_copy.model_v1_data.cpg_totem_confchg_fn (handle, ring_id, res_cpg_totem_confchg_callback->member_list_entries, totem_member_list); break; default: error = CS_ERR_LIBRARY; goto error_put; break; } /* - switch (dispatch_data->id) */ break; /* case CPG_MODEL_V1 */ } /* - switch (cpg_inst_copy.model_data.model) */ if (cpg_inst_copy.finalize || cpg_inst->finalize) { /* * If the finalize has been called then get out of the dispatch. */ cpg_inst->finalize = 1; error = CS_ERR_BAD_HANDLE; goto error_put; } /* * Determine if more messages should be processed */ if (dispatch_types == CS_DISPATCH_ONE || dispatch_types == CS_DISPATCH_ONE_NONBLOCKING) { cont = 0; } } while (cont); error_put: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_join ( cpg_handle_t handle, const struct cpg_name *group) { cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov[2]; struct req_lib_cpg_join req_lib_cpg_join; struct res_lib_cpg_join response; if (group->length > CPG_MAX_NAME_LENGTH) { return (CS_ERR_NAME_TOO_LONG); } error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } /* Now join */ req_lib_cpg_join.header.size = sizeof (struct req_lib_cpg_join); req_lib_cpg_join.header.id = MESSAGE_REQ_CPG_JOIN; req_lib_cpg_join.pid = getpid(); req_lib_cpg_join.flags = 0; switch (cpg_inst->model_data.model) { case CPG_MODEL_V1: req_lib_cpg_join.flags = cpg_inst->model_v1_data.flags; break; } 
marshall_to_mar_cpg_name_t (&req_lib_cpg_join.group_name, group); iov[0].iov_base = (void *)&req_lib_cpg_join; iov[0].iov_len = sizeof (struct req_lib_cpg_join); do { error = coroipcc_msg_send_reply_receive (cpg_inst->c, iov, 1, &response, sizeof (struct res_lib_cpg_join)); if (error != CS_OK) { goto error_exit; } } while (response.header.error == CS_ERR_BUSY); error = response.header.error; error_exit: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_leave ( cpg_handle_t handle, const struct cpg_name *group) { cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov[2]; struct req_lib_cpg_leave req_lib_cpg_leave; struct res_lib_cpg_leave res_lib_cpg_leave; if (group->length > CPG_MAX_NAME_LENGTH) { return (CS_ERR_NAME_TOO_LONG); } error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } req_lib_cpg_leave.header.size = sizeof (struct req_lib_cpg_leave); req_lib_cpg_leave.header.id = MESSAGE_REQ_CPG_LEAVE; req_lib_cpg_leave.pid = getpid(); marshall_to_mar_cpg_name_t (&req_lib_cpg_leave.group_name, group); iov[0].iov_base = (void *)&req_lib_cpg_leave; iov[0].iov_len = sizeof (struct req_lib_cpg_leave); do { error = coroipcc_msg_send_reply_receive (cpg_inst->c, iov, 1, &res_lib_cpg_leave, sizeof (struct res_lib_cpg_leave)); if (error != CS_OK) { goto error_exit; } } while (res_lib_cpg_leave.header.error == CS_ERR_BUSY); error = res_lib_cpg_leave.header.error; error_exit: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_membership_get ( cpg_handle_t handle, struct cpg_name *group_name, struct cpg_address *member_list, int *member_list_entries) { cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov; struct req_lib_cpg_membership_get req_lib_cpg_membership_get; struct res_lib_cpg_membership_get res_lib_cpg_membership_get; unsigned int i; if (group_name->length > CPG_MAX_NAME_LENGTH) { return (CS_ERR_NAME_TOO_LONG); } if (member_list 
== NULL) { return (CS_ERR_INVALID_PARAM); } if (member_list_entries == NULL) { return (CS_ERR_INVALID_PARAM); } error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } req_lib_cpg_membership_get.header.size = sizeof (struct req_lib_cpg_membership_get); req_lib_cpg_membership_get.header.id = MESSAGE_REQ_CPG_MEMBERSHIP; marshall_to_mar_cpg_name_t (&req_lib_cpg_membership_get.group_name, group_name); iov.iov_base = (void *)&req_lib_cpg_membership_get; iov.iov_len = sizeof (struct req_lib_cpg_membership_get); error = coroipcc_msg_send_reply_receive (cpg_inst->c, &iov, 1, &res_lib_cpg_membership_get, sizeof (res_lib_cpg_membership_get)); if (error != CS_OK) { goto error_exit; } error = res_lib_cpg_membership_get.header.error; /* * Copy results to caller */ *member_list_entries = res_lib_cpg_membership_get.member_count; if (member_list) { for (i = 0; i < res_lib_cpg_membership_get.member_count; i++) { marshall_from_mar_cpg_address_t (&member_list[i], &res_lib_cpg_membership_get.member_list[i]); } } error_exit: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_local_get ( cpg_handle_t handle, unsigned int *local_nodeid) { cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov; struct req_lib_cpg_local_get req_lib_cpg_local_get; struct res_lib_cpg_local_get res_lib_cpg_local_get; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } req_lib_cpg_local_get.header.size = sizeof (struct qb_ipc_request_header); req_lib_cpg_local_get.header.id = MESSAGE_REQ_CPG_LOCAL_GET; iov.iov_base = (void *)&req_lib_cpg_local_get; iov.iov_len = sizeof (struct req_lib_cpg_local_get); error = coroipcc_msg_send_reply_receive (cpg_inst->c, &iov, 1, &res_lib_cpg_local_get, sizeof (res_lib_cpg_local_get)); if (error != CS_OK) { goto error_exit; } error = res_lib_cpg_local_get.header.error; *local_nodeid = 
res_lib_cpg_local_get.local_nodeid; error_exit: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_flow_control_state_get ( cpg_handle_t handle, cpg_flow_control_state_t *flow_control_state) { cs_error_t error; struct cpg_inst *cpg_inst; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } *flow_control_state = CPG_FLOW_CONTROL_DISABLED; error = CS_OK; hdb_handle_put (&cpg_handle_t_db, handle); return (error); } static int memory_map (char *path, const char *file, void **buf, size_t bytes) { int32_t fd; void *addr; int32_t res; char *buffer; int32_t i; size_t written; size_t page_size; long int sysconf_page_size; mode_t old_umask; snprintf (path, PATH_MAX, "/dev/shm/%s", file); old_umask = umask(CPG_MEMORY_MAP_UMASK); fd = mkstemp (path); (void)umask(old_umask); if (fd == -1) { snprintf (path, PATH_MAX, LOCALSTATEDIR "/run/%s", file); old_umask = umask(CPG_MEMORY_MAP_UMASK); fd = mkstemp (path); (void)umask(old_umask); if (fd == -1) { return (-1); } } res = ftruncate (fd, bytes); if (res == -1) { goto error_close_unlink; } sysconf_page_size = sysconf(_SC_PAGESIZE); if (sysconf_page_size <= 0) { goto error_close_unlink; } page_size = sysconf_page_size; buffer = malloc (page_size); if (buffer == NULL) { goto error_close_unlink; } memset (buffer, 0, page_size); for (i = 0; i < (bytes / page_size); i++) { retry_write: written = write (fd, buffer, page_size); if (written == -1 && errno == EINTR) { goto retry_write; } if (written != page_size) { free (buffer); goto error_close_unlink; } } free (buffer); addr = mmap (NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) { goto error_close_unlink; } #ifdef MADV_NOSYNC madvise(addr, bytes, MADV_NOSYNC); #endif res = close (fd); if (res) { munmap(addr, bytes); return (-1); } *buf = addr; return 0; error_close_unlink: close (fd); unlink(path); return -1; } cs_error_t cpg_zcb_alloc ( cpg_handle_t 
handle, size_t size, void **buffer) { void *buf = NULL; char path[PATH_MAX]; mar_req_coroipcc_zc_alloc_t req_coroipcc_zc_alloc; struct qb_ipc_response_header res_coroipcs_zc_alloc; size_t map_size; struct iovec iovec; struct coroipcs_zc_header *hdr; cs_error_t error; struct cpg_inst *cpg_inst; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } map_size = size + sizeof (struct req_lib_cpg_mcast) + sizeof (struct coroipcs_zc_header); assert(memory_map (path, "corosync_zerocopy-XXXXXX", &buf, map_size) != -1); if (strlen(path) >= CPG_ZC_PATH_LEN) { unlink(path); munmap (buf, map_size); return (CS_ERR_NAME_TOO_LONG); } req_coroipcc_zc_alloc.header.size = sizeof (mar_req_coroipcc_zc_alloc_t); req_coroipcc_zc_alloc.header.id = MESSAGE_REQ_CPG_ZC_ALLOC; req_coroipcc_zc_alloc.map_size = map_size; strcpy (req_coroipcc_zc_alloc.path_to_file, path); iovec.iov_base = (void *)&req_coroipcc_zc_alloc; iovec.iov_len = sizeof (mar_req_coroipcc_zc_alloc_t); error = coroipcc_msg_send_reply_receive ( cpg_inst->c, &iovec, 1, &res_coroipcs_zc_alloc, sizeof (struct qb_ipc_response_header)); if (error != CS_OK) { goto error_exit; } hdr = (struct coroipcs_zc_header *)buf; hdr->map_size = map_size; *buffer = ((char *)buf) + sizeof (struct coroipcs_zc_header) + sizeof (struct req_lib_cpg_mcast); error_exit: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_zcb_free ( cpg_handle_t handle, void *buffer) { cs_error_t error; unsigned int res; struct cpg_inst *cpg_inst; mar_req_coroipcc_zc_free_t req_coroipcc_zc_free; struct qb_ipc_response_header res_coroipcs_zc_free; struct iovec iovec; struct coroipcs_zc_header *header = (struct coroipcs_zc_header *)((char *)buffer - sizeof (struct coroipcs_zc_header) - sizeof (struct req_lib_cpg_mcast)); error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } 
req_coroipcc_zc_free.header.size = sizeof (mar_req_coroipcc_zc_free_t); req_coroipcc_zc_free.header.id = MESSAGE_REQ_CPG_ZC_FREE; req_coroipcc_zc_free.map_size = header->map_size; req_coroipcc_zc_free.server_address = header->server_address; iovec.iov_base = (void *)&req_coroipcc_zc_free; iovec.iov_len = sizeof (mar_req_coroipcc_zc_free_t); error = coroipcc_msg_send_reply_receive ( cpg_inst->c, &iovec, 1, &res_coroipcs_zc_free, sizeof (struct qb_ipc_response_header)); if (error != CS_OK) { goto error_exit; } res = munmap ((void *)header, header->map_size); if (res == -1) { error = qb_to_cs_error(-errno); goto error_exit; } error_exit: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_zcb_mcast_joined ( cpg_handle_t handle, cpg_guarantee_t guarantee, void *msg, size_t msg_len) { cs_error_t error; struct cpg_inst *cpg_inst; struct req_lib_cpg_mcast *req_lib_cpg_mcast; struct res_lib_cpg_mcast res_lib_cpg_mcast; mar_req_coroipcc_zc_execute_t req_coroipcc_zc_execute; struct coroipcs_zc_header *hdr; struct iovec iovec; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } if (msg_len > IPC_REQUEST_SIZE) { error = CS_ERR_TOO_BIG; goto error_exit; } req_lib_cpg_mcast = (struct req_lib_cpg_mcast *)(((char *)msg) - sizeof (struct req_lib_cpg_mcast)); req_lib_cpg_mcast->header.size = sizeof (struct req_lib_cpg_mcast) + msg_len; req_lib_cpg_mcast->header.id = MESSAGE_REQ_CPG_MCAST; req_lib_cpg_mcast->guarantee = guarantee; req_lib_cpg_mcast->msglen = msg_len; hdr = (struct coroipcs_zc_header *)(((char *)req_lib_cpg_mcast) - sizeof (struct coroipcs_zc_header)); req_coroipcc_zc_execute.header.size = sizeof (mar_req_coroipcc_zc_execute_t); req_coroipcc_zc_execute.header.id = MESSAGE_REQ_CPG_ZC_EXECUTE; req_coroipcc_zc_execute.server_address = hdr->server_address; iovec.iov_base = (void *)&req_coroipcc_zc_execute; iovec.iov_len = sizeof (mar_req_coroipcc_zc_execute_t); error = 
coroipcc_msg_send_reply_receive ( cpg_inst->c, &iovec, 1, &res_lib_cpg_mcast, sizeof(res_lib_cpg_mcast)); if (error != CS_OK) { goto error_exit; } error = res_lib_cpg_mcast.header.error; error_exit: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } static cs_error_t send_fragments ( struct cpg_inst *cpg_inst, cpg_guarantee_t guarantee, size_t msg_len, const struct iovec *iovec, unsigned int iov_len) { int i; cs_error_t error = CS_OK; struct iovec iov[2]; struct req_lib_cpg_partial_mcast req_lib_cpg_mcast; struct res_lib_cpg_partial_send res_lib_cpg_partial_send; size_t sent = 0; size_t iov_sent = 0; int retry_count; req_lib_cpg_mcast.header.id = MESSAGE_REQ_CPG_PARTIAL_MCAST; req_lib_cpg_mcast.guarantee = guarantee; req_lib_cpg_mcast.msglen = msg_len; iov[0].iov_base = (void *)&req_lib_cpg_mcast; iov[0].iov_len = sizeof (struct req_lib_cpg_partial_mcast); i=0; iov_sent = 0 ; qb_ipcc_fc_enable_max_set(cpg_inst->c, 2); while (error == CS_OK && sent < msg_len) { retry_count = 0; if ( (iovec[i].iov_len - iov_sent) > cpg_inst->max_msg_size) { iov[1].iov_len = cpg_inst->max_msg_size; } else { iov[1].iov_len = iovec[i].iov_len - iov_sent; } if (sent == 0) { req_lib_cpg_mcast.type = LIBCPG_PARTIAL_FIRST; } else if ((sent + iov[1].iov_len) == msg_len) { req_lib_cpg_mcast.type = LIBCPG_PARTIAL_LAST; } else { req_lib_cpg_mcast.type = LIBCPG_PARTIAL_CONTINUED; } req_lib_cpg_mcast.fraglen = iov[1].iov_len; req_lib_cpg_mcast.header.size = sizeof (struct req_lib_cpg_partial_mcast) + iov[1].iov_len; iov[1].iov_base = (char *)iovec[i].iov_base + iov_sent; resend: error = coroipcc_msg_send_reply_receive (cpg_inst->c, iov, 2, &res_lib_cpg_partial_send, sizeof (res_lib_cpg_partial_send)); if (error == CS_ERR_TRY_AGAIN) { fprintf(stderr, "sleep. 
counter=%d\n", retry_count); if (++retry_count > MAX_RETRIES) { goto error_exit; } usleep(10000); goto resend; } iov_sent += iov[1].iov_len; sent += iov[1].iov_len; /* Next iovec */ if (iov_sent >= iovec[i].iov_len) { i++; iov_sent = 0; } error = res_lib_cpg_partial_send.header.error; } error_exit: qb_ipcc_fc_enable_max_set(cpg_inst->c, 1); return error; } cs_error_t cpg_mcast_joined ( cpg_handle_t handle, cpg_guarantee_t guarantee, const struct iovec *iovec, unsigned int iov_len) { int i; cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov[64]; struct req_lib_cpg_mcast req_lib_cpg_mcast; size_t msg_len = 0; error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } for (i = 0; i < iov_len; i++ ) { msg_len += iovec[i].iov_len; } if (msg_len > cpg_inst->max_msg_size) { error = send_fragments(cpg_inst, guarantee, msg_len, iovec, iov_len); goto error_exit; } req_lib_cpg_mcast.header.size = sizeof (struct req_lib_cpg_mcast) + msg_len; req_lib_cpg_mcast.header.id = MESSAGE_REQ_CPG_MCAST; req_lib_cpg_mcast.guarantee = guarantee; req_lib_cpg_mcast.msglen = msg_len; iov[0].iov_base = (void *)&req_lib_cpg_mcast; iov[0].iov_len = sizeof (struct req_lib_cpg_mcast); memcpy (&iov[1], iovec, iov_len * sizeof (struct iovec)); qb_ipcc_fc_enable_max_set(cpg_inst->c, 2); error = qb_to_cs_error(qb_ipcc_sendv(cpg_inst->c, iov, iov_len + 1)); qb_ipcc_fc_enable_max_set(cpg_inst->c, 1); error_exit: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_iteration_initialize( cpg_handle_t handle, cpg_iteration_type_t iteration_type, const struct cpg_name *group, cpg_iteration_handle_t *cpg_iteration_handle) { cs_error_t error; struct iovec iov; struct cpg_inst *cpg_inst; struct cpg_iteration_instance_t *cpg_iteration_instance; struct req_lib_cpg_iterationinitialize req_lib_cpg_iterationinitialize; struct res_lib_cpg_iterationinitialize res_lib_cpg_iterationinitialize; if (group && 
group->length > CPG_MAX_NAME_LENGTH) { return (CS_ERR_NAME_TOO_LONG); } if (cpg_iteration_handle == NULL) { return (CS_ERR_INVALID_PARAM); } if ((iteration_type == CPG_ITERATION_ONE_GROUP && group == NULL) || (iteration_type != CPG_ITERATION_ONE_GROUP && group != NULL)) { return (CS_ERR_INVALID_PARAM); } if (iteration_type != CPG_ITERATION_NAME_ONLY && iteration_type != CPG_ITERATION_ONE_GROUP && iteration_type != CPG_ITERATION_ALL) { return (CS_ERR_INVALID_PARAM); } error = hdb_error_to_cs (hdb_handle_get (&cpg_handle_t_db, handle, (void *)&cpg_inst)); if (error != CS_OK) { return (error); } error = hdb_error_to_cs (hdb_handle_create (&cpg_iteration_handle_t_db, sizeof (struct cpg_iteration_instance_t), cpg_iteration_handle)); if (error != CS_OK) { goto error_put_cpg_db; } error = hdb_error_to_cs (hdb_handle_get (&cpg_iteration_handle_t_db, *cpg_iteration_handle, (void *)&cpg_iteration_instance)); if (error != CS_OK) { goto error_destroy; } cpg_iteration_instance->conn = cpg_inst->c; qb_list_init (&cpg_iteration_instance->list); req_lib_cpg_iterationinitialize.header.size = sizeof (struct req_lib_cpg_iterationinitialize); req_lib_cpg_iterationinitialize.header.id = MESSAGE_REQ_CPG_ITERATIONINITIALIZE; req_lib_cpg_iterationinitialize.iteration_type = iteration_type; if (group) { marshall_to_mar_cpg_name_t (&req_lib_cpg_iterationinitialize.group_name, group); } iov.iov_base = (void *)&req_lib_cpg_iterationinitialize; iov.iov_len = sizeof (struct req_lib_cpg_iterationinitialize); error = coroipcc_msg_send_reply_receive (cpg_inst->c, &iov, 1, &res_lib_cpg_iterationinitialize, sizeof (struct res_lib_cpg_iterationinitialize)); if (error != CS_OK) { goto error_put_destroy; } cpg_iteration_instance->executive_iteration_handle = res_lib_cpg_iterationinitialize.iteration_handle; cpg_iteration_instance->cpg_iteration_handle = *cpg_iteration_handle; qb_list_add (&cpg_iteration_instance->list, &cpg_inst->iteration_list_head); hdb_handle_put (&cpg_iteration_handle_t_db, 
*cpg_iteration_handle); hdb_handle_put (&cpg_handle_t_db, handle); return (res_lib_cpg_iterationinitialize.header.error); error_put_destroy: hdb_handle_put (&cpg_iteration_handle_t_db, *cpg_iteration_handle); error_destroy: hdb_handle_destroy (&cpg_iteration_handle_t_db, *cpg_iteration_handle); error_put_cpg_db: hdb_handle_put (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_iteration_next( cpg_iteration_handle_t handle, struct cpg_iteration_description_t *description) { cs_error_t error; struct cpg_iteration_instance_t *cpg_iteration_instance; struct req_lib_cpg_iterationnext req_lib_cpg_iterationnext; struct res_lib_cpg_iterationnext res_lib_cpg_iterationnext; if (description == NULL) { return CS_ERR_INVALID_PARAM; } error = hdb_error_to_cs (hdb_handle_get (&cpg_iteration_handle_t_db, handle, (void *)&cpg_iteration_instance)); if (error != CS_OK) { goto error_exit; } req_lib_cpg_iterationnext.header.size = sizeof (struct req_lib_cpg_iterationnext); req_lib_cpg_iterationnext.header.id = MESSAGE_REQ_CPG_ITERATIONNEXT; req_lib_cpg_iterationnext.iteration_handle = cpg_iteration_instance->executive_iteration_handle; error = qb_to_cs_error (qb_ipcc_send (cpg_iteration_instance->conn, &req_lib_cpg_iterationnext, req_lib_cpg_iterationnext.header.size)); if (error != CS_OK) { goto error_put; } error = qb_to_cs_error (qb_ipcc_recv (cpg_iteration_instance->conn, &res_lib_cpg_iterationnext, sizeof(struct res_lib_cpg_iterationnext), -1)); if (error != CS_OK) { goto error_put; } marshall_from_mar_cpg_iteration_description_t( description, &res_lib_cpg_iterationnext.description); error = res_lib_cpg_iterationnext.header.error; error_put: hdb_handle_put (&cpg_iteration_handle_t_db, handle); error_exit: return (error); } cs_error_t cpg_iteration_finalize ( cpg_iteration_handle_t handle) { cs_error_t error; struct iovec iov; struct cpg_iteration_instance_t *cpg_iteration_instance; struct req_lib_cpg_iterationfinalize req_lib_cpg_iterationfinalize; struct 
res_lib_cpg_iterationfinalize res_lib_cpg_iterationfinalize; error = hdb_error_to_cs (hdb_handle_get (&cpg_iteration_handle_t_db, handle, (void *)&cpg_iteration_instance)); if (error != CS_OK) { goto error_exit; } req_lib_cpg_iterationfinalize.header.size = sizeof (struct req_lib_cpg_iterationfinalize); req_lib_cpg_iterationfinalize.header.id = MESSAGE_REQ_CPG_ITERATIONFINALIZE; req_lib_cpg_iterationfinalize.iteration_handle = cpg_iteration_instance->executive_iteration_handle; iov.iov_base = (void *)&req_lib_cpg_iterationfinalize; iov.iov_len = sizeof (struct req_lib_cpg_iterationfinalize); error = coroipcc_msg_send_reply_receive (cpg_iteration_instance->conn, &iov, 1, &res_lib_cpg_iterationfinalize, sizeof (struct req_lib_cpg_iterationfinalize)); if (error != CS_OK) { goto error_put; } cpg_iteration_instance_finalize (cpg_iteration_instance); hdb_handle_put (&cpg_iteration_handle_t_db, cpg_iteration_instance->cpg_iteration_handle); return (res_lib_cpg_iterationfinalize.header.error); error_put: hdb_handle_put (&cpg_iteration_handle_t_db, handle); error_exit: return (error); } /** @} */