diff --git a/exec/cfg.c b/exec/cfg.c index 19b819eb..dec7dbf8 100644 --- a/exec/cfg.c +++ b/exec/cfg.c @@ -1,1115 +1,1115 @@ /* * Copyright (c) 2005-2006 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/
/* NOTE(review): the header names of the bare #include directives below are absent from this copy of the patch. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "service.h"
#include "main.h"

LOGSYS_DECLARE_SUBSYS ("CFG");

/*
 * Identifiers of the executive messages of this service. Each value matches
 * the position of the corresponding handler in cfg_exec_engine[].
 */
enum cfg_message_req_types {
	MESSAGE_REQ_EXEC_CFG_RINGREENABLE = 0,
	MESSAGE_REQ_EXEC_CFG_KILLNODE = 1,
	MESSAGE_REQ_EXEC_CFG_SHUTDOWN = 2,
	MESSAGE_REQ_EXEC_CFG_RELOAD_CONFIG = 3
};

/*
 * Fallback for the "cfg.shutdown_timeout" key. The value is multiplied by
 * 1000000000 before it is handed to timer_add_duration()
 * (presumably seconds converted to nanoseconds - TODO confirm).
 */
#define DEFAULT_SHUTDOWN_TIMEOUT 5

/*
 * List of struct cfg_info entries (linked through cfg_info.list) that is
 * walked when counting and notifying shutdown listeners.
 */
static struct qb_list_head trackers_list;

/*
 * Variables controlling a requested shutdown
 */
/* Timer armed by the tryshutdown handler; expiry calls shutdown_timer_fn() */
static corosync_timer_handle_t shutdown_timer;
/* cfg_info of the connection that requested the shutdown, NULL when none is pending */
static struct cfg_info *shutdown_con;
/* Flags copied from the tryshutdown request */
static uint32_t shutdown_flags;
/* Number of YES replies counted so far */
static int shutdown_yes;
/* Number of NO replies counted so far */
static int shutdown_no;
/* Number of trackers_list entries (excluding the requester) counted at request time */
static int shutdown_expected;

/*
 * Per-connection private data of this service
 * (cfg_service_engine.private_data_size is sizeof(struct cfg_info)).
 */
struct cfg_info {
	/* Link used for membership of trackers_list */
	struct qb_list_head list;
	/* Connection handle; assigned in the tryshutdown handler */
	void *conn;
	/* Connection handle that testshutdown dispatches are sent to */
	void *tracker_conn;
	/* Last answer this connection gave to a shutdown request */
	enum {SHUTDOWN_REPLY_UNKNOWN, SHUTDOWN_REPLY_YES, SHUTDOWN_REPLY_NO} shutdown_reply;
};

static void cfg_confchg_fn (
	enum totem_configuration_type configuration_type,
	const unsigned int *member_list, size_t member_list_entries,
	const unsigned int *left_list, size_t left_list_entries,
	const unsigned int *joined_list, size_t joined_list_entries,
	const struct memb_ring_id *ring_id);

static char *cfg_exec_init_fn (struct corosync_api_v1 *corosync_api_v1);

/* API table handed to cfg_exec_init_fn(); used for all calls into the executive */
static struct corosync_api_v1 *api;

static int cfg_lib_init_fn (void *conn);

static int cfg_lib_exit_fn (void *conn);

static void message_handler_req_exec_cfg_ringreenable (
	const void *message,
	unsigned int nodeid);

static void message_handler_req_exec_cfg_killnode (
	const void *message,
	unsigned int nodeid);

static void message_handler_req_exec_cfg_shutdown (
	const void *message,
	unsigned int nodeid);

static void message_handler_req_exec_cfg_reload_config (
	const void *message,
	unsigned int nodeid);

static void exec_cfg_killnode_endian_convert (void *msg);

static void message_handler_req_lib_cfg_ringstatusget (
	void *conn,
	const void
*msg);

static void message_handler_req_lib_cfg_ringreenable (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_killnode (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_tryshutdown (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_replytoshutdown (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_get_node_addrs (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_local_get (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_reload_config (
	void *conn,
	const void *msg);

static void message_handler_req_lib_cfg_reopen_log_files (
	void *conn,
	const void *msg);

/*
 * Service Handler Definition
 *
 * Table of library request handlers. The number in each entry's comment is
 * the entry's index in the table; each entry also states whether flow
 * control is required for that request.
 */
static struct corosync_lib_handler cfg_lib_engine[] =
{
	{ /* 0 */
		.lib_handler_fn		= message_handler_req_lib_cfg_ringstatusget,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 1 */
		.lib_handler_fn		= message_handler_req_lib_cfg_ringreenable,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 2 */
		.lib_handler_fn		= message_handler_req_lib_cfg_killnode,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 3 */
		.lib_handler_fn		= message_handler_req_lib_cfg_tryshutdown,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 4 */
		.lib_handler_fn		= message_handler_req_lib_cfg_replytoshutdown,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 5 */
		.lib_handler_fn		= message_handler_req_lib_cfg_get_node_addrs,
		.flow_control		= CS_LIB_FLOW_CONTROL_NOT_REQUIRED
	},
	{ /* 6 */
		.lib_handler_fn		= message_handler_req_lib_cfg_local_get,
		.flow_control		= CS_LIB_FLOW_CONTROL_NOT_REQUIRED
	},
	{ /* 7 */
		.lib_handler_fn		= message_handler_req_lib_cfg_reload_config,
		.flow_control		= CS_LIB_FLOW_CONTROL_REQUIRED
	},
	{ /* 8 */
		.lib_handler_fn		= message_handler_req_lib_cfg_reopen_log_files,
		.flow_control		= CS_LIB_FLOW_CONTROL_NOT_REQUIRED
	}
};

/*
 * Table of executive message handlers, in the order of
 * enum cfg_message_req_types.
 */
static struct corosync_exec_handler cfg_exec_engine[] =
{
	{ /* 0 */
		.exec_handler_fn = message_handler_req_exec_cfg_ringreenable,
	},
	{ /* 1
*/
		.exec_handler_fn = message_handler_req_exec_cfg_killnode,
		.exec_endian_convert_fn = exec_cfg_killnode_endian_convert
	},
	{ /* 2 */
		.exec_handler_fn = message_handler_req_exec_cfg_shutdown,
	},
	{ /* 3 */
		.exec_handler_fn = message_handler_req_exec_cfg_reload_config,
	}
};

/*
 * Exports the interface for the service
 *
 * Ties together the init/exit callbacks, both handler tables (with their
 * element counts computed by sizeof) and the configuration-change callback.
 */
struct corosync_service_engine cfg_service_engine = {
	.name					= "corosync configuration service",
	.id					= CFG_SERVICE,
	.priority				= 1,
	.private_data_size			= sizeof(struct cfg_info),
	.flow_control				= CS_LIB_FLOW_CONTROL_NOT_REQUIRED,
	.allow_inquorate			= CS_LIB_ALLOW_INQUORATE,
	.lib_init_fn				= cfg_lib_init_fn,
	.lib_exit_fn				= cfg_lib_exit_fn,
	.lib_engine				= cfg_lib_engine,
	.lib_engine_count			= sizeof (cfg_lib_engine) / sizeof (struct corosync_lib_handler),
	.exec_init_fn				= cfg_exec_init_fn,
	.exec_engine				= cfg_exec_engine,
	.exec_engine_count			= sizeof (cfg_exec_engine) / sizeof (struct corosync_exec_handler),
	.confchg_fn				= cfg_confchg_fn
};

/*
 * Return the address of the service engine description above.
 */
struct corosync_service_engine *cfg_get_service_engine_ver0 (void)
{
	return (&cfg_service_engine);
}

/*
 * Executive message layouts. Every member is aligned to 8 bytes.
 */

/* Ring re-enable request: header plus the source of the request */
struct req_exec_cfg_ringreenable {
	struct qb_ipc_request_header header __attribute__((aligned(8)));
	mar_message_source_t source __attribute__((aligned(8)));
};

/* Config reload request: header plus the source (connection) of the request */
struct req_exec_cfg_reload_config {
	struct qb_ipc_request_header header __attribute__((aligned(8)));
	mar_message_source_t source __attribute__((aligned(8)));
};

/* Kill-node request: id of the node to kill and the reason given */
struct req_exec_cfg_killnode {
	struct qb_ipc_request_header header __attribute__((aligned(8)));
	mar_uint32_t nodeid __attribute__((aligned(8)));
	mar_name_t reason __attribute__((aligned(8)));
};

/* Shutdown announcement: header only */
struct req_exec_cfg_shutdown {
	struct qb_ipc_request_header header __attribute__((aligned(8)));
};

/* IMPL */

/*
 * Service initialisation: remember the API table in the file-scope `api`
 * pointer and initialise trackers_list as an empty list.
 * Always returns NULL.
 */
static char *cfg_exec_init_fn (
	struct corosync_api_v1 *corosync_api_v1)
{
	api = corosync_api_v1;

	qb_list_init(&trackers_list);
	return (NULL);
}

/*
 * Configuration-change callback of the service; the body is empty.
 */
static void cfg_confchg_fn (
	enum totem_configuration_type configuration_type,
	const unsigned int *member_list, size_t member_list_entries,
	const unsigned int *left_list, size_t
left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { } /* * Tell other nodes we are shutting down */ static int send_shutdown(void) { struct req_exec_cfg_shutdown req_exec_cfg_shutdown; struct iovec iovec; ENTER(); req_exec_cfg_shutdown.header.size = sizeof (struct req_exec_cfg_shutdown); req_exec_cfg_shutdown.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_SHUTDOWN); iovec.iov_base = (char *)&req_exec_cfg_shutdown; iovec.iov_len = sizeof (struct req_exec_cfg_shutdown); assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); LEAVE(); return 0; } static void send_test_shutdown(void *only_conn, void *exclude_conn, int status) { struct res_lib_cfg_testshutdown res_lib_cfg_testshutdown; struct qb_list_head *iter; ENTER(); res_lib_cfg_testshutdown.header.size = sizeof(struct res_lib_cfg_testshutdown); res_lib_cfg_testshutdown.header.id = MESSAGE_RES_CFG_TESTSHUTDOWN; res_lib_cfg_testshutdown.header.error = status; res_lib_cfg_testshutdown.flags = shutdown_flags; if (only_conn) { TRACE1("sending testshutdown to only %p", only_conn); api->ipc_dispatch_send(only_conn, &res_lib_cfg_testshutdown, sizeof(res_lib_cfg_testshutdown)); } else { qb_list_for_each(iter, &trackers_list) { struct cfg_info *ci = qb_list_entry(iter, struct cfg_info, list); if (ci->conn != exclude_conn) { TRACE1("sending testshutdown to %p", ci->tracker_conn); api->ipc_dispatch_send(ci->tracker_conn, &res_lib_cfg_testshutdown, sizeof(res_lib_cfg_testshutdown)); } } } LEAVE(); } static void check_shutdown_status(void) { ENTER(); /* * Shutdown client might have gone away */ if (!shutdown_con) { LEAVE(); return; } /* * All replies safely gathered in ? 
*/
	if (shutdown_yes + shutdown_no >= shutdown_expected) {
		struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown;

		/* The decision is being made now, so the timeout is no longer needed */
		api->timer_delete(shutdown_timer);

		/* Confirmed when everybody said YES, or when the request flag is REGARDLESS */
		if (shutdown_yes >= shutdown_expected ||
		    shutdown_flags == CFG_SHUTDOWN_FLAG_REGARDLESS) {
			TRACE1("shutdown confirmed");

			res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown);
			res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN;
			res_lib_cfg_tryshutdown.header.error = CS_OK;

			/*
			 * Tell originator that shutdown was confirmed
			 */
			api->ipc_response_send(shutdown_con->conn, &res_lib_cfg_tryshutdown,
						    sizeof(res_lib_cfg_tryshutdown));
			shutdown_con = NULL;

			/*
			 * Tell other nodes we are going down
			 */
			send_shutdown();

		}
		else {

			TRACE1("shutdown cancelled");
			res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown);
			res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN;
			res_lib_cfg_tryshutdown.header.error = CS_ERR_BUSY;

			/*
			 * Tell originator that shutdown was cancelled
			 */
			api->ipc_response_send(shutdown_con->conn, &res_lib_cfg_tryshutdown,
						    sizeof(res_lib_cfg_tryshutdown));
			shutdown_con = NULL;
		}

		log_printf(LOGSYS_LEVEL_DEBUG, "shutdown decision is: (yes count: %d, no count: %d) flags=%x",
		       shutdown_yes, shutdown_no, shutdown_flags);
	}
	LEAVE();
}


/*
 * Not all nodes responded to the shutdown (in time)
 *
 * Timer callback: forces a decision by setting shutdown_no to
 * shutdown_expected, then sends a testshutdown notification with
 * CS_ERR_TIMEOUT to every trackers_list entry. `arg` is not used.
 */
static void shutdown_timer_fn(void *arg)
{
	ENTER();

	/*
	 * Mark undecideds as "NO"
	 */
	shutdown_no = shutdown_expected;
	check_shutdown_status();

	send_test_shutdown(NULL, NULL, CS_ERR_TIMEOUT);
	LEAVE();
}

/*
 * Detach a connection's cfg_info from the shutdown bookkeeping.
 * If `ci` is the shutdown requester the pending request is dropped and its
 * timer deleted. If `ci` is linked into a list it is unlinked and counted
 * as a YES reply.
 */
static void remove_ci_from_shutdown(struct cfg_info *ci)
{
	ENTER();

	/*
	 * If the controlling shutdown process has quit, then cancel the
	 * shutdown session
	 */
	if (ci == shutdown_con) {
		shutdown_con = NULL;
		api->timer_delete(shutdown_timer);
	}

	if (!qb_list_empty(&ci->list)) {
		qb_list_del(&ci->list);
		/* Re-initialise so a later qb_list_empty() on this entry is true */
		qb_list_init(&ci->list);

		/*
		 * Remove our option
		 */
		if (shutdown_con) {
			if (ci->shutdown_reply == SHUTDOWN_REPLY_YES)
				shutdown_yes--;
			if (ci->shutdown_reply == SHUTDOWN_REPLY_NO)
shutdown_no--;
		}

		/*
		 * If we are leaving, then that's an implicit YES to shutdown
		 */
		ci->shutdown_reply = SHUTDOWN_REPLY_YES;
		shutdown_yes++;

		check_shutdown_status();
	}
	LEAVE();
}


/*
 * Connection-exit callback: removes the connection's private cfg_info
 * from the shutdown bookkeeping. Always returns 0.
 */
int cfg_lib_exit_fn (void *conn)
{
	struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn);

	ENTER();
	remove_ci_from_shutdown(ci);
	LEAVE();
	return (0);
}

/*
 * Connection-init callback: initialises the list link of the connection's
 * private cfg_info. Always returns 0.
 */
static int cfg_lib_init_fn (void *conn)
{
	struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn);

	ENTER();
	qb_list_init(&ci->list);
	LEAVE();

	return (0);
}

/*
 * Executive message handlers
 */

/*
 * Handler for MESSAGE_REQ_EXEC_CFG_RINGREENABLE; takes no action.
 */
static void message_handler_req_exec_cfg_ringreenable (
	const void *message,
	unsigned int nodeid)
{
	ENTER();

	LEAVE();
}

/*
 * Endian conversion for a kill-node message.
 * NOTE(review): only `reason` is converted here; `nodeid` is left as
 * received - confirm that this is intended.
 */
static void exec_cfg_killnode_endian_convert (void *msg)
{
	struct req_exec_cfg_killnode *req_exec_cfg_killnode =
		(struct req_exec_cfg_killnode *)msg;
	ENTER();

	swab_mar_name_t(&req_exec_cfg_killnode->reason);
	LEAVE();
}


/*
 * Handler for MESSAGE_REQ_EXEC_CFG_KILLNODE.
 * When the target node id in the message is the local node id, the reason
 * is logged and corosync_fatal_error(COROSYNC_FATAL_ERROR_EXIT) is called.
 * `nodeid` is the sender and is only used in the log message.
 */
static void message_handler_req_exec_cfg_killnode (
	const void *message,
	unsigned int nodeid)
{
	const struct req_exec_cfg_killnode *req_exec_cfg_killnode = message;
	cs_name_t reason;

	ENTER();
	log_printf(LOGSYS_LEVEL_DEBUG, "request to kill node %d(us=%d)",
		req_exec_cfg_killnode->nodeid, api->totem_nodeid_get());
	if (req_exec_cfg_killnode->nodeid == api->totem_nodeid_get()) {
		marshall_from_mar_name_t(&reason, &req_exec_cfg_killnode->reason);
		log_printf(LOGSYS_LEVEL_NOTICE, "Killed by node %d: %s",
			nodeid, reason.value);
		corosync_fatal_error(COROSYNC_FATAL_ERROR_EXIT);
	}
	LEAVE();
}

/*
 * Self shutdown
 *
 * Handler for MESSAGE_REQ_EXEC_CFG_SHUTDOWN: logs which node was shut down
 * and, when the sender is the local node, calls api->shutdown_request().
 */
static void message_handler_req_exec_cfg_shutdown (
	const void *message,
	unsigned int nodeid)
{
	ENTER();

	log_printf(LOGSYS_LEVEL_NOTICE, "Node %d was shut down by sysadmin", nodeid);
	if (nodeid == api->totem_nodeid_get()) {
		api->shutdown_request();
	}
	LEAVE();
}

/*
 * strcmp replacement that can handle NULLs
 *
 * A NULL argument orders before any non-NULL string; two NULLs compare
 * equal. Returns -1, 1 or 0 in those cases, otherwise the strcmp() result.
 */
static int nullcheck_strcmp(const char* left, const char *right)
{
	if (!left && right)
		return -1;
	if (left && !right)
		return 1;

	if (!left && !right)
		return 0;

	return strcmp(left,
right);
}

/*
 * If a key has changed value in the new file, then warn the user and remove it from the temp_map
 *
 * temp_map - map holding the newly parsed configuration
 * key_name - key compared between temp_map and the global map
 *
 * The notice is logged only when the key was actually deleted from temp_map
 * (icmap_delete_r() returned CS_OK).
 */
static void delete_and_notify_if_changed(icmap_map_t temp_map, const char *key_name)
{
	if (!(icmap_key_value_eq(temp_map, key_name, icmap_get_global_map(), key_name))) {
		if (icmap_delete_r(temp_map, key_name) == CS_OK) {
			log_printf(LOGSYS_LEVEL_NOTICE, "Modified entry '%s' in corosync.conf cannot be changed at run-time", key_name);
		}
	}
}
/*
 * Remove from temp_map any keys whose value in the new corosync.conf differs
 * from the running configuration but that cannot be changed at run time.
 * A log message will be issued for each entry that the user wants to change
 * but cannot.
 *
 * Add more here as needed.
 */
static void remove_ro_entries(icmap_map_t temp_map)
{
	delete_and_notify_if_changed(temp_map, "totem.secauth");
	delete_and_notify_if_changed(temp_map, "totem.crypto_hash");
	delete_and_notify_if_changed(temp_map, "totem.crypto_cipher");
	delete_and_notify_if_changed(temp_map, "totem.version");
	delete_and_notify_if_changed(temp_map, "totem.threads");
	delete_and_notify_if_changed(temp_map, "totem.ip_version");
	delete_and_notify_if_changed(temp_map, "totem.rrp_mode");
	delete_and_notify_if_changed(temp_map, "totem.netmtu");
	delete_and_notify_if_changed(temp_map, "totem.interface.ringnumber");
	delete_and_notify_if_changed(temp_map, "totem.interface.bindnetaddr");
	delete_and_notify_if_changed(temp_map, "totem.interface.mcastaddr");
	delete_and_notify_if_changed(temp_map, "totem.interface.broadcast");
	delete_and_notify_if_changed(temp_map, "totem.interface.mcastport");
	delete_and_notify_if_changed(temp_map, "totem.interface.ttl");
	delete_and_notify_if_changed(temp_map, "totem.vsftype");
	delete_and_notify_if_changed(temp_map, "totem.transport");
	delete_and_notify_if_changed(temp_map, "totem.cluster_name");
	delete_and_notify_if_changed(temp_map, "quorum.provider");
	delete_and_notify_if_changed(temp_map, "system.move_to_root_cgroup");
	delete_and_notify_if_changed(temp_map, "system.sched_rr");
delete_and_notify_if_changed(temp_map, "system.priority");
	delete_and_notify_if_changed(temp_map, "system.qb_ipc_type");
-	delete_and_notify_if_changed(temp_map, "system.run_dir");
+	delete_and_notify_if_changed(temp_map, "system.state_dir");
}

/*
 * Remove entries that exist in the global map, but not in the temp_map, this will
 * cause delete notifications to be sent to any listeners.
 *
 * temp_map - map holding the newly parsed configuration
 * prefix   - key prefix handed to both iterators
 *
 * The two iterators are advanced in step, like a merge of two sorted lists;
 * a NULL key means that iterator is exhausted.
 *
 * NOTE: This routine depends entirely on the keys returned by the iterators
 * being in alpha-sorted order.
 */
static void remove_deleted_entries(icmap_map_t temp_map, const char *prefix)
{
	icmap_iter_t old_iter;
	icmap_iter_t new_iter;
	const char *old_key, *new_key;
	int ret;

	old_iter = icmap_iter_init(prefix);
	new_iter = icmap_iter_init_r(temp_map, prefix);

	old_key = icmap_iter_next(old_iter, NULL, NULL);
	new_key = icmap_iter_next(new_iter, NULL, NULL);

	while (old_key || new_key) {
		ret = nullcheck_strcmp(old_key, new_key);
		if ((ret < 0 && old_key) || !new_key) {
			/*
			 * new_key is greater, a line (or more) has been deleted
			 * Continue until old is >= new
			 */
			do {
				/* Remove it from icmap & send notifications */
				icmap_delete(old_key);

				old_key = icmap_iter_next(old_iter, NULL, NULL);
				ret = nullcheck_strcmp(old_key, new_key);
			} while (ret < 0 && old_key);
		}
		else if ((ret > 0 && new_key) || !old_key) {
			/*
			 * old_key is greater, a line (or more) has been added
			 * Continue until new is >= old
			 *
			 * we don't need to do anything special with this like tell
			 * icmap.
That will happen when we copy the values over */ do { new_key = icmap_iter_next(new_iter, NULL, NULL); ret = nullcheck_strcmp(old_key, new_key); } while (ret > 0 && new_key); } if (ret == 0) { new_key = icmap_iter_next(new_iter, NULL, NULL); old_key = icmap_iter_next(old_iter, NULL, NULL); } } icmap_iter_finalize(new_iter); icmap_iter_finalize(old_iter); } /* * Reload configuration file */ static void message_handler_req_exec_cfg_reload_config ( const void *message, unsigned int nodeid) { const struct req_exec_cfg_reload_config *req_exec_cfg_reload_config = message; struct res_lib_cfg_reload_config res_lib_cfg_reload_config; icmap_map_t temp_map; const char *error_string; int res = CS_OK; ENTER(); log_printf(LOGSYS_LEVEL_NOTICE, "Config reload requested by node %d", nodeid); /* * Set up a new hashtable as a staging area. */ if ((res = icmap_init_r(&temp_map)) != CS_OK) { log_printf(LOGSYS_LEVEL_ERROR, "Unable to create temporary icmap. config file reload cancelled\n"); goto reload_fini; } /* * Load new config into the temporary map */ res = coroparse_configparse(temp_map, &error_string); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Unable to reload config file: %s", error_string); res = CS_ERR_LIBRARY; goto reload_return; } /* Tell interested listeners that we have started a reload */ icmap_set_uint8("config.reload_in_progress", 1); /* Detect deleted entries and remove them from the main icmap hashtable */ remove_deleted_entries(temp_map, "logging."); remove_deleted_entries(temp_map, "totem."); remove_deleted_entries(temp_map, "nodelist."); remove_deleted_entries(temp_map, "quorum."); remove_deleted_entries(temp_map, "uidgid.config."); /* Remove entries that cannot be changed */ remove_ro_entries(temp_map); /* * Copy new keys into live config. * If this fails we will have a partially loaded config because some keys (above) might * have been reset to defaults - I'm not sure what to do here, we might have to quit. 
*/ if ( (res = icmap_copy_map(icmap_get_global_map(), temp_map)) != CS_OK) { log_printf (LOGSYS_LEVEL_ERROR, "Error making new config live. cmap database may be inconsistent\n"); } /* All done - let clients know */ icmap_set_uint8("config.reload_in_progress", 0); reload_fini: /* Finished with the temporary storage */ icmap_fini_r(temp_map); reload_return: /* All done, return result to the caller if it was on this system */ if (nodeid == api->totem_nodeid_get()) { res_lib_cfg_reload_config.header.size = sizeof(res_lib_cfg_reload_config); res_lib_cfg_reload_config.header.id = MESSAGE_RES_CFG_RELOAD_CONFIG; res_lib_cfg_reload_config.header.error = res; api->ipc_response_send(req_exec_cfg_reload_config->source.conn, &res_lib_cfg_reload_config, sizeof(res_lib_cfg_reload_config)); api->ipc_refcnt_dec(req_exec_cfg_reload_config->source.conn);; } LEAVE(); } /* * Library Interface Implementation */ static void message_handler_req_lib_cfg_ringstatusget ( void *conn, const void *msg) { struct res_lib_cfg_ringstatusget res_lib_cfg_ringstatusget; struct totem_ip_address interfaces[INTERFACE_MAX]; unsigned int iface_count; char **status; const char *totem_ip_string; char ifname[CFG_INTERFACE_NAME_MAX_LEN]; unsigned int iface_ids[INTERFACE_MAX]; unsigned int i; cs_error_t res = CS_OK; ENTER(); res_lib_cfg_ringstatusget.header.id = MESSAGE_RES_CFG_RINGSTATUSGET; res_lib_cfg_ringstatusget.header.size = sizeof (struct res_lib_cfg_ringstatusget); api->totem_ifaces_get ( api->totem_nodeid_get(), iface_ids, interfaces, INTERFACE_MAX, &status, &iface_count); assert(iface_count <= CFG_MAX_INTERFACES); res_lib_cfg_ringstatusget.interface_count = iface_count; for (i = 0; i < iface_count; i++) { totem_ip_string = (const char *)api->totem_ip_print (&interfaces[i]); if (!totem_ip_string) { totem_ip_string=""; } /* Allow for i/f number at the start */ if (strlen(totem_ip_string) >= CFG_INTERFACE_NAME_MAX_LEN-3) { log_printf(LOGSYS_LEVEL_ERROR, "String representation of interface %u is too 
long", i);
			res = CS_ERR_NAME_TOO_LONG;
			goto send_response;
		}
		/* NOTE(review): iface_ids[i] is unsigned int but is formatted with %d */
		snprintf(ifname, sizeof(ifname), "%d %s", iface_ids[i], totem_ip_string);

		/* The length check makes the strcpy() of status[i] below fit its slot */
		if (strlen(status[i]) >= CFG_INTERFACE_STATUS_MAX_LEN) {
			log_printf(LOGSYS_LEVEL_ERROR, "Status string for interface %u is too long", i);
			res = CS_ERR_NAME_TOO_LONG;
			goto send_response;
		}

		strcpy ((char *)&res_lib_cfg_ringstatusget.interface_status[i],
			status[i]);
		strcpy ((char *)&res_lib_cfg_ringstatusget.interface_name[i],
			ifname);
	}

send_response:
	res_lib_cfg_ringstatusget.header.error = res;
	api->ipc_response_send (
		conn,
		&res_lib_cfg_ringstatusget,
		sizeof (struct res_lib_cfg_ringstatusget));

	LEAVE();
}


/*
 * Library handler for ring re-enable: always answers
 * CS_ERR_NOT_SUPPORTED. `msg` is not read.
 */
static void message_handler_req_lib_cfg_ringreenable (
	void *conn,
	const void *msg)
{
	struct res_lib_cfg_ringreenable res_lib_cfg_ringreenable;
	ENTER();

	res_lib_cfg_ringreenable.header.id = MESSAGE_RES_CFG_RINGREENABLE;
	res_lib_cfg_ringreenable.header.size = sizeof (struct res_lib_cfg_ringreenable);
	res_lib_cfg_ringreenable.header.error = CS_ERR_NOT_SUPPORTED;
	api->ipc_response_send (
		conn, &res_lib_cfg_ringreenable,
		sizeof (struct res_lib_cfg_ringreenable));

	LEAVE();
}

/*
 * Library handler for kill-node: multicasts a
 * MESSAGE_REQ_EXEC_CFG_KILLNODE message built from the request's node id and
 * reason, then answers CS_OK. The multicast result is deliberately
 * discarded, so the reply is CS_OK regardless of it.
 */
static void message_handler_req_lib_cfg_killnode (
	void *conn,
	const void *msg)
{
	const struct req_lib_cfg_killnode *req_lib_cfg_killnode = msg;
	struct res_lib_cfg_killnode res_lib_cfg_killnode;
	struct req_exec_cfg_killnode req_exec_cfg_killnode;
	struct iovec iovec;

	ENTER();
	req_exec_cfg_killnode.header.size =
		sizeof (struct req_exec_cfg_killnode);
	req_exec_cfg_killnode.header.id = SERVICE_ID_MAKE (CFG_SERVICE,
		MESSAGE_REQ_EXEC_CFG_KILLNODE);
	req_exec_cfg_killnode.nodeid = req_lib_cfg_killnode->nodeid;
	marshall_to_mar_name_t(&req_exec_cfg_killnode.reason, &req_lib_cfg_killnode->reason);

	iovec.iov_base = (char *)&req_exec_cfg_killnode;
	iovec.iov_len = sizeof (struct req_exec_cfg_killnode);

	(void)api->totem_mcast (&iovec, 1, TOTEM_SAFE);

	res_lib_cfg_killnode.header.size = sizeof(struct res_lib_cfg_killnode);
	res_lib_cfg_killnode.header.id = MESSAGE_RES_CFG_KILLNODE;
res_lib_cfg_killnode.header.error = CS_OK;

	api->ipc_response_send(conn, &res_lib_cfg_killnode,
				    sizeof(res_lib_cfg_killnode));

	LEAVE();
}


/*
 * Library handler for a shutdown request.
 *
 * - flags == CFG_SHUTDOWN_FLAG_IMMEDIATE: announce the shutdown at once and
 *   answer CS_OK.
 * - a request is already pending (shutdown_con set): answer CS_ERR_EXIST.
 * - otherwise record the requester, count the other trackers_list entries
 *   and either shut down straight away (nobody to ask) or start the reply
 *   timer and notify the listeners; in the latter case the answer is sent
 *   later from check_shutdown_status().
 */
static void message_handler_req_lib_cfg_tryshutdown (
	void *conn,
	const void *msg)
{
	struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn);
	const struct req_lib_cfg_tryshutdown *req_lib_cfg_tryshutdown = msg;
	struct qb_list_head *iter;

	ENTER();

	if (req_lib_cfg_tryshutdown->flags == CFG_SHUTDOWN_FLAG_IMMEDIATE) {
		struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown;

		/*
		 * Tell other nodes
		 */
		send_shutdown();

		res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown);
		res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN;
		res_lib_cfg_tryshutdown.header.error = CS_OK;
		api->ipc_response_send(conn, &res_lib_cfg_tryshutdown,
					    sizeof(res_lib_cfg_tryshutdown));

		LEAVE();
		return;
	}

	/*
	 * Shutdown in progress, return an error
	 */
	if (shutdown_con) {
		struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown;

		res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown);
		res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN;
		res_lib_cfg_tryshutdown.header.error = CS_ERR_EXIST;

		api->ipc_response_send(conn, &res_lib_cfg_tryshutdown,
					    sizeof(res_lib_cfg_tryshutdown));


		LEAVE();

		return;
	}

	/* Start a new shutdown session with this connection as the originator */
	ci->conn = conn;
	shutdown_con = (struct cfg_info *)api->ipc_private_data_get (conn);
	shutdown_flags = req_lib_cfg_tryshutdown->flags;
	shutdown_yes = 0;
	shutdown_no = 0;

	/*
	 * Count the number of listeners
	 */
	shutdown_expected = 0;

	qb_list_for_each(iter, &trackers_list) {
		struct cfg_info *testci = qb_list_entry(iter, struct cfg_info, list);
		/*
		 * It is assumed that we will allow shutdown
		 */
		if (testci != ci) {
			testci->shutdown_reply = SHUTDOWN_REPLY_UNKNOWN;
			shutdown_expected++;
		}
	}

	/*
	 * If no-one is listening for events then we can just go down now
	 */
	if (shutdown_expected == 0) {
		struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown;

		res_lib_cfg_tryshutdown.header.size = sizeof(struct
res_lib_cfg_tryshutdown);
		res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN;
		res_lib_cfg_tryshutdown.header.error = CS_OK;

		/*
		 * Tell originator that shutdown was confirmed
		 */
		api->ipc_response_send(conn, &res_lib_cfg_tryshutdown,
				       sizeof(res_lib_cfg_tryshutdown));

		send_shutdown();
		LEAVE();
		return;
	}
	else {
		unsigned int shutdown_timeout = DEFAULT_SHUTDOWN_TIMEOUT;

		/*
		 * Look for a shutdown timeout in configuration map
		 * (shutdown_timeout keeps its default when the lookup result
		 * is not usable; the return value is not checked)
		 */
		icmap_get_uint32("cfg.shutdown_timeout", &shutdown_timeout);

		/*
		 * Start the timer. If we don't get a full set of replies before this goes
		 * off we'll cancel the shutdown
		 */
		api->timer_add_duration((unsigned long long)shutdown_timeout*1000000000, NULL,
					shutdown_timer_fn, &shutdown_timer);

		/*
		 * Tell the users we would like to shut down
		 */
		send_test_shutdown(NULL, conn, CS_OK);
	}

	/*
	 * We don't send a reply to the caller here.
	 * We send it when we know if we can shut down or not
	 */

	LEAVE();
}

/*
 * Library handler for a listener's answer to a shutdown notification.
 * Without a pending request the answer is CS_ERR_ACCESS. Otherwise a
 * non-zero `response` counts as YES and zero as NO, the connection's
 * shutdown_reply is recorded and the overall status is re-evaluated.
 */
static void message_handler_req_lib_cfg_replytoshutdown (
	void *conn,
	const void *msg)
{
	struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn);
	const struct req_lib_cfg_replytoshutdown *req_lib_cfg_replytoshutdown = msg;
	struct res_lib_cfg_replytoshutdown res_lib_cfg_replytoshutdown;
	int status = CS_OK;

	ENTER();
	if (!shutdown_con) {
		status = CS_ERR_ACCESS;
		goto exit_fn;
	}

	if (req_lib_cfg_replytoshutdown->response) {
		shutdown_yes++;
		ci->shutdown_reply = SHUTDOWN_REPLY_YES;
	}
	else {
		shutdown_no++;
		ci->shutdown_reply = SHUTDOWN_REPLY_NO;
	}
	check_shutdown_status();

exit_fn:
	res_lib_cfg_replytoshutdown.header.error = status;
	res_lib_cfg_replytoshutdown.header.id = MESSAGE_RES_CFG_REPLYTOSHUTDOWN;
	res_lib_cfg_replytoshutdown.header.size = sizeof(res_lib_cfg_replytoshutdown);

	api->ipc_response_send(conn, &res_lib_cfg_replytoshutdown,
			       sizeof(res_lib_cfg_replytoshutdown));

	LEAVE();
}

/*
 * Library handler: return the addresses of a node's interfaces.
 * A request node id of 0 selects the local node. The response is built in
 * a stack buffer and carries one TOTEMIP_ADDRLEN-sized slot per address.
 */
static void message_handler_req_lib_cfg_get_node_addrs (void *conn,
	const void *msg)
{
	struct totem_ip_address node_ifs[INTERFACE_MAX];
	unsigned int
iface_ids[INTERFACE_MAX];
	char buf[PIPE_BUF];
	char **status;
	unsigned int num_interfaces = 0;
	struct sockaddr_storage *ss;
	int ret = CS_OK;
	int i;
	int live_addrs = 0;
	const struct req_lib_cfg_get_node_addrs *req_lib_cfg_get_node_addrs = msg;
	/* The response structure is overlaid on the stack buffer `buf` */
	struct res_lib_cfg_get_node_addrs *res_lib_cfg_get_node_addrs = (struct res_lib_cfg_get_node_addrs *)buf;
	unsigned int nodeid = req_lib_cfg_get_node_addrs->nodeid;
	char *addr_buf;

	if (nodeid == 0)
		nodeid = api->totem_nodeid_get();

	/*
	 * On failure ret becomes CS_ERR_EXIST, but with num_interfaces forced
	 * to 0 the else branch further down replaces it by CS_ERR_NOT_EXIST.
	 */
	if (api->totem_ifaces_get(nodeid, iface_ids, node_ifs, INTERFACE_MAX, &status, &num_interfaces)) {
		ret = CS_ERR_EXIST;
		num_interfaces = 0;
	}

	/*
	 * NOTE(review): the size is computed from num_interfaces, while only
	 * entries with a non-zero family are copied below (live_addrs) -
	 * confirm that the trailing, unwritten part of buf may be sent.
	 */
	res_lib_cfg_get_node_addrs->header.size = sizeof(struct res_lib_cfg_get_node_addrs) + (num_interfaces * TOTEMIP_ADDRLEN);
	res_lib_cfg_get_node_addrs->header.id = MESSAGE_RES_CFG_GET_NODE_ADDRS;
	res_lib_cfg_get_node_addrs->header.error = ret;
	if (num_interfaces) {
		res_lib_cfg_get_node_addrs->family = node_ifs[0].family;
		for (i = 0, addr_buf = (char *)res_lib_cfg_get_node_addrs->addrs;
		    i < num_interfaces; i++) {
			/* Entries whose family field reads as zero are skipped */
			ss = (struct sockaddr_storage *)&node_ifs[i].addr;
			if (ss->ss_family) {
				memcpy(addr_buf, node_ifs[i].addr, TOTEMIP_ADDRLEN);
				live_addrs++;
				addr_buf += TOTEMIP_ADDRLEN;
			}
		}
		res_lib_cfg_get_node_addrs->num_addrs = live_addrs;
	} else {
		res_lib_cfg_get_node_addrs->header.error = CS_ERR_NOT_EXIST;
	}
	api->ipc_response_send(conn, res_lib_cfg_get_node_addrs, res_lib_cfg_get_node_addrs->header.size);
}

/*
 * Library handler: answer with the local node id and CS_OK.
 * `msg` is not read.
 */
static void message_handler_req_lib_cfg_local_get (void *conn, const void *msg)
{
	struct res_lib_cfg_local_get res_lib_cfg_local_get;

	res_lib_cfg_local_get.header.size = sizeof(res_lib_cfg_local_get);
	res_lib_cfg_local_get.header.id = MESSAGE_RES_CFG_LOCAL_GET;
	res_lib_cfg_local_get.header.error = CS_OK;
	res_lib_cfg_local_get.local_nodeid = api->totem_nodeid_get ();

	api->ipc_response_send(conn, &res_lib_cfg_local_get,
		sizeof(res_lib_cfg_local_get));
}

/*
 * Library handler for a config reload request: multicasts a
 * MESSAGE_REQ_EXEC_CFG_RELOAD_CONFIG message that records `conn` as its
 * source. No reply is sent from here; the executive handler answers.
 */
static void message_handler_req_lib_cfg_reload_config (void *conn, const void *msg)
{
	struct
req_exec_cfg_reload_config req_exec_cfg_reload_config; struct iovec iovec; ENTER(); req_exec_cfg_reload_config.header.size = sizeof (struct req_exec_cfg_reload_config); req_exec_cfg_reload_config.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_RELOAD_CONFIG); api->ipc_source_set (&req_exec_cfg_reload_config.source, conn); api->ipc_refcnt_inc(conn); iovec.iov_base = (char *)&req_exec_cfg_reload_config; iovec.iov_len = sizeof (struct req_exec_cfg_reload_config); assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); LEAVE(); } static void message_handler_req_lib_cfg_reopen_log_files (void *conn, const void *msg) { struct res_lib_cfg_reopen_log_files res_lib_cfg_reopen_log_files; cs_error_t res; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "Reopening logging files\n"); res = logsys_reopen_log_files(); res_lib_cfg_reopen_log_files.header.size = sizeof(res_lib_cfg_reopen_log_files); res_lib_cfg_reopen_log_files.header.id = MESSAGE_RES_CFG_REOPEN_LOG_FILES; res_lib_cfg_reopen_log_files.header.error = res; api->ipc_response_send(conn, &res_lib_cfg_reopen_log_files, sizeof(res_lib_cfg_reopen_log_files)); LEAVE(); } diff --git a/exec/main.c b/exec/main.c index 9c422694..3ba54f44 100644 --- a/exec/main.c +++ b/exec/main.c @@ -1,1586 +1,1586 @@ /* * Copyright (c) 2002-2006 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /** * \mainpage Corosync * * This is the doxygen generated developer documentation for the Corosync * project. For more information about Corosync, please see the project * web site, corosync.org. * * \section license License * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LIBSYSTEMD #include #endif #include #include #include #include #include #include #include #include #include #include #include #include "quorum.h" #include "totemsrp.h" #include "logconfig.h" #include "totemconfig.h" #include "main.h" #include "sync.h" #include "timer.h" #include "util.h" #include "apidef.h" #include "service.h" #include "schedwrk.h" #include "ipcs_stats.h" #include "stats.h" #ifdef HAVE_SMALL_MEMORY_FOOTPRINT #define IPC_LOGSYS_SIZE 1024*64 #else #define IPC_LOGSYS_SIZE 8192*128 #endif /* * LibQB adds default "*" syslog filter so we have to set syslog_priority as low * as possible so filters applied later in _logsys_config_apply_per_file takes * effect. 
*/ LOGSYS_DECLARE_SYSTEM ("corosync", LOGSYS_MODE_OUTPUT_STDERR | LOGSYS_MODE_OUTPUT_SYSLOG, LOG_DAEMON, LOG_EMERG); LOGSYS_DECLARE_SUBSYS ("MAIN"); #define SERVER_BACKLOG 5 static int sched_priority = 0; static unsigned int service_count = 32; static struct totem_logging_configuration totem_logging_configuration; static struct corosync_api_v1 *api = NULL; static int sync_in_process = 1; static qb_loop_t *corosync_poll_handle; struct sched_param global_sched_param; static corosync_timer_handle_t corosync_stats_timer_handle; static const char *corosync_lock_file = LOCALSTATEDIR"/run/corosync.pid"; static char corosync_config_file[PATH_MAX + 1] = COROSYSCONFDIR "/corosync.conf"; qb_loop_t *cs_poll_handle_get (void) { return (corosync_poll_handle); } int cs_poll_dispatch_add (qb_loop_t * handle, int fd, int events, void *data, int (*dispatch_fn) (int fd, int revents, void *data)) { return qb_loop_poll_add(handle, QB_LOOP_MED, fd, events, data, dispatch_fn); } int cs_poll_dispatch_delete(qb_loop_t * handle, int fd) { return qb_loop_poll_del(handle, fd); } void corosync_state_dump (void) { int i; for (i = 0; i < SERVICES_COUNT_MAX; i++) { if (corosync_service[i] && corosync_service[i]->exec_dump_fn) { corosync_service[i]->exec_dump_fn (); } } } const char *corosync_get_config_file(void) { return (corosync_config_file); } static void corosync_blackbox_write_to_file (void) { char fname[PATH_MAX]; char fdata_fname[PATH_MAX]; char time_str[PATH_MAX]; struct tm cur_time_tm; time_t cur_time_t; ssize_t res; cur_time_t = time(NULL); localtime_r(&cur_time_t, &cur_time_tm); strftime(time_str, PATH_MAX, "%Y-%m-%dT%H:%M:%S", &cur_time_tm); if (snprintf(fname, PATH_MAX, "%s/fdata-%s-%lld", - get_run_dir(), + get_state_dir(), time_str, (long long int)getpid()) >= PATH_MAX) { log_printf(LOGSYS_LEVEL_ERROR, "Can't snprintf blackbox file name"); return ; } if ((res = qb_log_blackbox_write_to_file(fname)) < 0) { LOGSYS_PERROR(-res, LOGSYS_LEVEL_ERROR, "Can't store blackbox file"); return 
/*
 * Signal handler registered in main() for SIGSEGV and SIGABRT.
 *
 * Writes the blackbox to a file, shuts down the qb logging layer and then
 * raises the same signal again.
 *
 * num - number of the signal being handled
 */
static void sigsegv_handler (int num)
{
	/*
	 * Restore the default disposition first, so the raise() below is not
	 * delivered to this handler again.
	 */
	(void)signal (num, SIG_DFL);
	corosync_blackbox_write_to_file ();
	qb_log_fini();
	raise (num);
}
callbacks->sync_activate = corosync_service[service_id]->sync_activate; callbacks->sync_abort = corosync_service[service_id]->sync_abort; return (0); } static struct memb_ring_id corosync_ring_id; static void member_object_joined (unsigned int nodeid) { char member_ip[ICMAP_KEYNAME_MAXLEN]; char member_join_count[ICMAP_KEYNAME_MAXLEN]; char member_status[ICMAP_KEYNAME_MAXLEN]; snprintf(member_ip, ICMAP_KEYNAME_MAXLEN, "runtime.members.%u.ip", nodeid); snprintf(member_join_count, ICMAP_KEYNAME_MAXLEN, "runtime.members.%u.join_count", nodeid); snprintf(member_status, ICMAP_KEYNAME_MAXLEN, "runtime.members.%u.status", nodeid); if (icmap_get(member_ip, NULL, NULL, NULL) == CS_OK) { icmap_inc(member_join_count); icmap_set_string(member_status, "joined"); } else { icmap_set_string(member_ip, (char*)api->totem_ifaces_print (nodeid)); icmap_set_uint32(member_join_count, 1); icmap_set_string(member_status, "joined"); } log_printf (LOGSYS_LEVEL_DEBUG, "Member joined: %s", api->totem_ifaces_print (nodeid)); } static void member_object_left (unsigned int nodeid) { char member_status[ICMAP_KEYNAME_MAXLEN]; snprintf(member_status, ICMAP_KEYNAME_MAXLEN, "runtime.members.%u.status", nodeid); icmap_set_string(member_status, "left"); log_printf (LOGSYS_LEVEL_DEBUG, "Member left: %s", api->totem_ifaces_print (nodeid)); } static void confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { int i; int abort_activate = 0; if (sync_in_process == 1) { abort_activate = 1; } sync_in_process = 1; cs_ipcs_sync_state_changed(sync_in_process); memcpy (&corosync_ring_id, ring_id, sizeof (struct memb_ring_id)); for (i = 0; i < left_list_entries; i++) { member_object_left (left_list[i]); } for (i = 0; i < joined_list_entries; i++) { member_object_joined (joined_list[i]); } /* * 
Call configuration change for all services */ for (i = 0; i < service_count; i++) { if (corosync_service[i] && corosync_service[i]->confchg_fn) { corosync_service[i]->confchg_fn (configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } } if (abort_activate) { sync_abort (); } if (configuration_type == TOTEM_CONFIGURATION_TRANSITIONAL) { sync_save_transitional (member_list, member_list_entries, ring_id); } if (configuration_type == TOTEM_CONFIGURATION_REGULAR) { sync_start (member_list, member_list_entries, ring_id); } } static void priv_drop (void) { return; /* TODO: we are still not dropping privs */ } static void corosync_tty_detach (void) { int devnull; /* * Disconnect from TTY if this is not a debug run */ switch (fork ()) { case -1: corosync_exit_error (COROSYNC_DONE_FORK); break; case 0: /* * child which is disconnected, run this process */ break; default: exit (0); break; } /* Create new session */ (void)setsid(); /* * Map stdin/out/err to /dev/null. 
*/ devnull = open("/dev/null", O_RDWR); if (devnull == -1) { corosync_exit_error (COROSYNC_DONE_STD_TO_NULL_REDIR); } if (dup2(devnull, 0) < 0 || dup2(devnull, 1) < 0 || dup2(devnull, 2) < 0) { close(devnull); corosync_exit_error (COROSYNC_DONE_STD_TO_NULL_REDIR); } close(devnull); } static void corosync_mlockall (void) { int res; struct rlimit rlimit; rlimit.rlim_cur = RLIM_INFINITY; rlimit.rlim_max = RLIM_INFINITY; #ifndef RLIMIT_MEMLOCK #define RLIMIT_MEMLOCK RLIMIT_VMEM #endif setrlimit (RLIMIT_MEMLOCK, &rlimit); res = mlockall (MCL_CURRENT | MCL_FUTURE); if (res == -1) { LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING, "Could not lock memory of service to avoid page faults"); }; } static void corosync_totem_stats_updater (void *data) { totempg_stats_t * stats; uint32_t total_mtt_rx_token; uint32_t total_backlog_calc; uint32_t total_token_holdtime; int t, prev; int32_t token_count; const char *cstr; stats = api->totem_get_stats(); stats->srp->firewall_enabled_or_nic_failure = stats->srp->continuous_gather > MAX_NO_CONT_GATHER ? 1 : 0; if (stats->srp->continuous_gather > MAX_NO_CONT_GATHER || stats->srp->continuous_sendmsg_failures > MAX_NO_CONT_SENDMSG_FAILURES) { cstr = ""; if (stats->srp->continuous_sendmsg_failures > MAX_NO_CONT_SENDMSG_FAILURES) { cstr = "number of multicast sendmsg failures is above threshold"; } if (stats->srp->continuous_gather > MAX_NO_CONT_GATHER) { cstr = "totem is continuously in gather state"; } log_printf (LOGSYS_LEVEL_WARNING, "Totem is unable to form a cluster because of an " "operating system or network fault (reason: %s). 
The most common " "cause of this message is that the local firewall is " "configured improperly.", cstr); stats->srp->firewall_enabled_or_nic_failure = 1; } else { stats->srp->firewall_enabled_or_nic_failure = 0; } total_mtt_rx_token = 0; total_token_holdtime = 0; total_backlog_calc = 0; token_count = 0; t = stats->srp->latest_token; while (1) { if (t == 0) prev = TOTEM_TOKEN_STATS_MAX - 1; else prev = t - 1; if (prev == stats->srp->earliest_token) break; /* if tx == 0, then dropped token (not ours) */ if (stats->srp->token[t].tx != 0 || (stats->srp->token[t].rx - stats->srp->token[prev].rx) > 0 ) { total_mtt_rx_token += (stats->srp->token[t].rx - stats->srp->token[prev].rx); total_token_holdtime += (stats->srp->token[t].tx - stats->srp->token[t].rx); total_backlog_calc += stats->srp->token[t].backlog_calc; token_count++; } t = prev; } if (token_count) { stats->srp->mtt_rx_token = (total_mtt_rx_token / token_count); stats->srp->avg_token_workload = (total_token_holdtime / token_count); stats->srp->avg_backlog_calc = (total_backlog_calc / token_count); } stats->srp->time_since_token_last_received = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC - stats->srp->token[stats->srp->latest_token].rx; stats_trigger_trackers(); api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL, corosync_totem_stats_updater, &corosync_stats_timer_handle); } static void corosync_totem_stats_init (void) { /* start stats timer */ api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL, corosync_totem_stats_updater, &corosync_stats_timer_handle); } static void deliver_fn ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required) { const struct qb_ipc_request_header *header; int32_t service; int32_t fn_id; uint32_t id; header = msg; if (endian_conversion_required) { id = swab32 (header->id); } else { id = header->id; } /* * Call the proper executive handler */ service = id >> 16; fn_id = id & 0xffff; if (!corosync_service[service]) { return; } if 
(fn_id >= corosync_service[service]->exec_engine_count) { log_printf(LOGSYS_LEVEL_WARNING, "discarded unknown message %d for service %d (max id %d)", fn_id, service, corosync_service[service]->exec_engine_count); return; } icmap_fast_inc(service_stats_rx[service][fn_id]); if (endian_conversion_required) { assert(corosync_service[service]->exec_engine[fn_id].exec_endian_convert_fn != NULL); corosync_service[service]->exec_engine[fn_id].exec_endian_convert_fn ((void *)msg); } corosync_service[service]->exec_engine[fn_id].exec_handler_fn (msg, nodeid); } int main_mcast ( const struct iovec *iovec, unsigned int iov_len, unsigned int guarantee) { const struct qb_ipc_request_header *req = iovec->iov_base; int32_t service; int32_t fn_id; service = req->id >> 16; fn_id = req->id & 0xffff; if (corosync_service[service]) { icmap_fast_inc(service_stats_tx[service][fn_id]); } return (totempg_groups_mcast_joined (corosync_group_handle, iovec, iov_len, guarantee)); } static void corosync_ring_id_create_or_load ( struct memb_ring_id *memb_ring_id, unsigned int nodeid) { int fd; int res = 0; char filename[PATH_MAX]; snprintf (filename, sizeof(filename), "%s/ringid_%u", - get_run_dir(), nodeid); + get_state_dir(), nodeid); fd = open (filename, O_RDONLY, 0700); /* * If file can be opened and read, read the ring id */ if (fd != -1) { res = read (fd, &memb_ring_id->seq, sizeof (uint64_t)); close (fd); } /* * If file could not be opened or read, create a new ring id */ if ((fd == -1) || (res != sizeof (uint64_t))) { memb_ring_id->seq = 0; umask(0); fd = open (filename, O_CREAT|O_RDWR, 0700); if (fd != -1) { res = write (fd, &memb_ring_id->seq, sizeof (uint64_t)); close (fd); if (res == -1) { LOGSYS_PERROR (errno, LOGSYS_LEVEL_ERROR, "Couldn't write ringid file '%s'", filename); corosync_exit_error (COROSYNC_DONE_STORE_RINGID); } } else { LOGSYS_PERROR (errno, LOGSYS_LEVEL_ERROR, "Couldn't create ringid file '%s'", filename); corosync_exit_error (COROSYNC_DONE_STORE_RINGID); } } 
memb_ring_id->rep = nodeid; } static void corosync_ring_id_store ( const struct memb_ring_id *memb_ring_id, unsigned int nodeid) { char filename[PATH_MAX]; int fd; int res; snprintf (filename, sizeof(filename), "%s/ringid_%u", - get_run_dir(), nodeid); + get_state_dir(), nodeid); fd = open (filename, O_WRONLY, 0700); if (fd == -1) { fd = open (filename, O_CREAT|O_RDWR, 0700); } if (fd == -1) { LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "Couldn't store new ring id %llx to stable storage", memb_ring_id->seq); corosync_exit_error (COROSYNC_DONE_STORE_RINGID); } log_printf (LOGSYS_LEVEL_DEBUG, "Storing new sequence id for ring %llx", memb_ring_id->seq); res = write (fd, &memb_ring_id->seq, sizeof(memb_ring_id->seq)); close (fd); if (res != sizeof(memb_ring_id->seq)) { LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "Couldn't store new ring id %llx to stable storage", memb_ring_id->seq); corosync_exit_error (COROSYNC_DONE_STORE_RINGID); } } static qb_loop_timer_handle recheck_the_q_level_timer; void corosync_recheck_the_q_level(void *data) { totempg_check_q_level(corosync_group_handle); if (cs_ipcs_q_level_get() == TOTEM_Q_LEVEL_CRITICAL) { qb_loop_timer_add(cs_poll_handle_get(), QB_LOOP_MED, 1*QB_TIME_NS_IN_MSEC, NULL, corosync_recheck_the_q_level, &recheck_the_q_level_timer); } } struct sending_allowed_private_data_struct { int reserved_msgs; }; int corosync_sending_allowed ( unsigned int service, unsigned int id, const void *msg, void *sending_allowed_private_data) { struct sending_allowed_private_data_struct *pd = (struct sending_allowed_private_data_struct *)sending_allowed_private_data; struct iovec reserve_iovec; struct qb_ipc_request_header *header = (struct qb_ipc_request_header *)msg; int sending_allowed; reserve_iovec.iov_base = (char *)header; reserve_iovec.iov_len = header->size; pd->reserved_msgs = totempg_groups_joined_reserve ( corosync_group_handle, &reserve_iovec, 1); if (pd->reserved_msgs == -1) { return -EINVAL; } /* Message ID out of range */ if (id >= 
corosync_service[service]->lib_engine_count) { return -EINVAL; } sending_allowed = QB_FALSE; if (corosync_quorum_is_quorate() == 1 || corosync_service[service]->allow_inquorate == CS_LIB_ALLOW_INQUORATE) { // we are quorate // now check flow control if (corosync_service[service]->lib_engine[id].flow_control == CS_LIB_FLOW_CONTROL_NOT_REQUIRED) { sending_allowed = QB_TRUE; } else if (pd->reserved_msgs && sync_in_process == 0) { sending_allowed = QB_TRUE; } else if (pd->reserved_msgs == 0) { return -ENOBUFS; } else /* (sync_in_process) */ { return -EINPROGRESS; } } else { return -EHOSTUNREACH; } return (sending_allowed); } void corosync_sending_allowed_release (void *sending_allowed_private_data) { struct sending_allowed_private_data_struct *pd = (struct sending_allowed_private_data_struct *)sending_allowed_private_data; if (pd->reserved_msgs == -1) { return; } totempg_groups_joined_release (pd->reserved_msgs); } int message_source_is_local (const mar_message_source_t *source) { int ret = 0; assert (source != NULL); if (source->nodeid == totempg_my_nodeid_get ()) { ret = 1; } return ret; } void message_source_set ( mar_message_source_t *source, void *conn) { assert ((source != NULL) && (conn != NULL)); memset (source, 0, sizeof (mar_message_source_t)); source->nodeid = totempg_my_nodeid_get (); source->conn = conn; } struct scheduler_pause_timeout_data { struct totem_config *totem_config; qb_loop_timer_handle handle; unsigned long long tv_prev; unsigned long long max_tv_diff; }; static void timer_function_scheduler_timeout (void *data) { struct scheduler_pause_timeout_data *timeout_data = (struct scheduler_pause_timeout_data *)data; unsigned long long tv_current; unsigned long long tv_diff; tv_current = qb_util_nano_current_get (); if (timeout_data->tv_prev == 0) { /* * Initial call -> just pretent everything is ok */ timeout_data->tv_prev = tv_current; timeout_data->max_tv_diff = 0; } tv_diff = tv_current - timeout_data->tv_prev; timeout_data->tv_prev = tv_current; 
if (tv_diff > timeout_data->max_tv_diff) { log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms " "(threshold is %0.4f ms). Consider token timeout increase.", (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC); } /* * Set next threshold, because token_timeout can change */ timeout_data->max_tv_diff = timeout_data->totem_config->token_timeout * QB_TIME_NS_IN_MSEC * 0.8; qb_loop_timer_add (corosync_poll_handle, QB_LOOP_MED, timeout_data->totem_config->token_timeout * QB_TIME_NS_IN_MSEC / 3, timeout_data, timer_function_scheduler_timeout, &timeout_data->handle); } static int corosync_set_rr_scheduler (void) { int ret_val = 0; #if defined(HAVE_PTHREAD_SETSCHEDPARAM) && defined(HAVE_SCHED_GET_PRIORITY_MAX) && defined(HAVE_SCHED_SETSCHEDULER) int res; sched_priority = sched_get_priority_max (SCHED_RR); if (sched_priority != -1) { global_sched_param.sched_priority = sched_priority; res = sched_setscheduler (0, SCHED_RR, &global_sched_param); if (res == -1) { LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "Could not set SCHED_RR at priority %d", global_sched_param.sched_priority); global_sched_param.sched_priority = 0; #ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET qb_log_thread_priority_set (SCHED_OTHER, 0); #endif ret_val = -1; } else { /* * Turn on SCHED_RR in logsys system */ #ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET res = qb_log_thread_priority_set (SCHED_RR, sched_priority); #else res = -1; #endif if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Could not set logsys thread priority." " Can't continue because of priority inversions."); corosync_exit_error (COROSYNC_DONE_LOGSETUP); } } } else { LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING, "Could not get maximum scheduler priority"); sched_priority = 0; ret_val = -1; } #else log_printf(LOGSYS_LEVEL_WARNING, "The Platform is missing process priority setting features. 
Leaving at default."); ret_val = -1; #endif return (ret_val); } /* The basename man page contains scary warnings about thread-safety and portability, hence this */ static const char *corosync_basename(const char *file_name) { char *base; base = strrchr (file_name, '/'); if (base) { return base + 1; } return file_name; } static void _logsys_log_printf(int level, int subsys, const char *function_name, const char *file_name, int file_line, const char *format, ...) __attribute__((format(printf, 6, 7))); static void _logsys_log_printf(int level, int subsys, const char *function_name, const char *file_name, int file_line, const char *format, ...) { va_list ap; va_start(ap, format); qb_log_from_external_source_va(function_name, corosync_basename(file_name), format, level, file_line, subsys, ap); va_end(ap); } static void fplay_key_change_notify_fn ( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { if (strcmp(key_name, "runtime.blackbox.dump_flight_data") == 0) { fprintf(stderr,"Writetofile\n"); corosync_blackbox_write_to_file (); } if (strcmp(key_name, "runtime.blackbox.dump_state") == 0) { fprintf(stderr,"statefump\n"); corosync_state_dump (); } } static void corosync_fplay_control_init (void) { icmap_track_t track = NULL; icmap_set_string("runtime.blackbox.dump_flight_data", "no"); icmap_set_string("runtime.blackbox.dump_state", "no"); icmap_track_add("runtime.blackbox.dump_flight_data", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY, fplay_key_change_notify_fn, NULL, &track); icmap_track_add("runtime.blackbox.dump_state", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY, fplay_key_change_notify_fn, NULL, &track); } static void force_gather_notify_fn( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { char *key_val; if (icmap_get_string(key_name, &key_val) == CS_OK && strcmp(key_val, "no") == 0) goto 
out; icmap_set_string("runtime.force_gather", "no"); if (strcmp(key_name, "runtime.force_gather") == 0) { log_printf(LOGSYS_LEVEL_ERROR, "Forcing into GATHER state\n"); totempg_force_gather(); } out: free(key_val); } static void corosync_force_gather_init (void) { icmap_track_t track = NULL; icmap_set_string("runtime.force_gather", "no"); icmap_track_add("runtime.force_gather", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY, force_gather_notify_fn, NULL, &track); } /* * Set RO flag for keys, which ether doesn't make sense to change by user (statistic) * or which when changed are not reflected by runtime (totem.crypto_cipher, ...). * * Also some RO keys cannot be determined in this stage, so they are set later in * other functions (like nodelist.local_node_pos, ...) */ static void set_icmap_ro_keys_flag (void) { /* * Set RO flag for all keys of internal configuration and runtime statistics */ icmap_set_ro_access("internal_configuration.", CS_TRUE, CS_TRUE); icmap_set_ro_access("runtime.services.", CS_TRUE, CS_TRUE); icmap_set_ro_access("runtime.config.", CS_TRUE, CS_TRUE); icmap_set_ro_access("runtime.totem.", CS_TRUE, CS_TRUE); icmap_set_ro_access("uidgid.config.", CS_TRUE, CS_TRUE); icmap_set_ro_access("system.", CS_TRUE, CS_TRUE); icmap_set_ro_access("nodelist.", CS_TRUE, CS_TRUE); /* * Set RO flag for constrete keys of configuration which can't be changed * during runtime */ icmap_set_ro_access("totem.crypto_cipher", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.crypto_hash", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.secauth", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.ip_version", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.rrp_mode", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.transport", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.cluster_name", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.netmtu", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.threads", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.version", CS_FALSE, 
CS_TRUE); icmap_set_ro_access("totem.nodeid", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.clear_node_high_bit", CS_FALSE, CS_TRUE); icmap_set_ro_access("config.reload_in_progress", CS_FALSE, CS_TRUE); icmap_set_ro_access("config.totemconfig_reload_in_progress", CS_FALSE, CS_TRUE); } static void main_service_ready (void) { int res; /* * This must occur after totempg is initialized because "this_ip" must be set */ res = corosync_service_defaults_link_and_init (api); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Could not initialize default services"); corosync_exit_error (COROSYNC_DONE_INIT_SERVICES); } cs_ipcs_init(); corosync_totem_stats_init (); corosync_fplay_control_init (); corosync_force_gather_init (); sync_init ( corosync_sync_callbacks_retrieve, corosync_sync_completed); } static enum e_corosync_done corosync_flock (const char *lockfile, pid_t pid) { struct flock lock; enum e_corosync_done err; char pid_s[17]; int fd_flag; int lf; err = COROSYNC_DONE_EXIT; lf = open (lockfile, O_WRONLY | O_CREAT, 0640); if (lf == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't create lock file."); return (COROSYNC_DONE_ACQUIRE_LOCK); } retry_fcntl: lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 0; if (fcntl (lf, F_SETLK, &lock) == -1) { switch (errno) { case EINTR: goto retry_fcntl; break; case EAGAIN: case EACCES: log_printf (LOGSYS_LEVEL_ERROR, "Another Corosync instance is already running."); err = COROSYNC_DONE_ALREADY_RUNNING; goto error_close; break; default: log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't acquire lock. Error was %s", strerror(errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close; break; } } if (ftruncate (lf, 0) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't truncate lock file. 
Error was %s", strerror (errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close_unlink; } memset (pid_s, 0, sizeof (pid_s)); snprintf (pid_s, sizeof (pid_s) - 1, "%u\n", pid); retry_write: if (write (lf, pid_s, strlen (pid_s)) != strlen (pid_s)) { if (errno == EINTR) { goto retry_write; } else { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't write pid to lock file. " "Error was %s", strerror (errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close_unlink; } } if ((fd_flag = fcntl (lf, F_GETFD, 0)) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't get close-on-exec flag from lock file. " "Error was %s", strerror (errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close_unlink; } fd_flag |= FD_CLOEXEC; if (fcntl (lf, F_SETFD, fd_flag) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't set close-on-exec flag to lock file. " "Error was %s", strerror (errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close_unlink; } return (err); error_close_unlink: unlink (lockfile); error_close: close (lf); return (err); } static int corosync_move_to_root_cgroup(void) { FILE *f; int res = -1; /* * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now * using systemd and systemd uses hardcoded path of cgroup mount point. * * This feature is expected to be removed as soon as systemd gets support * for managing RT configuration. 
*/ f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt"); if (f == NULL) { log_printf(LOGSYS_LEVEL_DEBUG, "cpu.rt_runtime_us doesn't exists -> " "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); res = 0; goto exit_res; } (void)fclose(f); f = fopen("/sys/fs/cgroup/cpu/tasks", "w"); if (f == NULL) { log_printf(LOGSYS_LEVEL_WARNING, "Can't open cgroups tasks file for writing"); goto exit_res; } if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) { log_printf(LOGSYS_LEVEL_WARNING, "Can't write corosync pid into cgroups tasks file"); goto close_and_exit_res; } close_and_exit_res: if (fclose(f) != 0) { log_printf(LOGSYS_LEVEL_WARNING, "Can't close cgroups tasks file"); goto exit_res; } exit_res: return (res); } int main (int argc, char **argv, char **envp) { const char *error_string; struct totem_config totem_config; int res, ch; int background, sched_rr, prio, testonly, move_to_root_cgroup; struct stat stat_out; enum e_corosync_done flock_err; uint64_t totem_config_warnings; struct scheduler_pause_timeout_data scheduler_pause_timeout_data; long int tmpli; char *ep; char *tmp_str; int log_subsys_id_totem; /* default configuration */ background = 1; testonly = 0; while ((ch = getopt (argc, argv, "c:ftv")) != EOF) { switch (ch) { case 'c': res = snprintf(corosync_config_file, sizeof(corosync_config_file), "%s", optarg); if (res >= sizeof(corosync_config_file)) { fprintf (stderr, "Config file path too long.\n"); syslog (LOGSYS_LEVEL_ERROR, "Config file path too long."); logsys_system_fini(); return EXIT_FAILURE; } break; case 'f': background = 0; break; case 't': testonly = 1; break; case 'v': printf ("Corosync Cluster Engine, version '%s'\n", VERSION); printf ("Copyright (c) 2006-2018 Red Hat, Inc.\n"); logsys_system_fini(); return EXIT_SUCCESS; break; default: fprintf(stderr, \ "usage:\n"\ " -c : Corosync config file path.\n"\ " -f : Start application in foreground.\n"\ " -t : Test configuration and exit.\n"\ " -v : Display version and SVN revision of 
Corosync and exit.\n"); logsys_system_fini(); return EXIT_FAILURE; } } /* * Other signals are registered later via qb_loop_signal_add */ (void)signal (SIGSEGV, sigsegv_handler); (void)signal (SIGABRT, sigsegv_handler); #if MSG_NOSIGNAL != 0 (void)signal (SIGPIPE, SIG_IGN); #endif if (icmap_init() != CS_OK) { fprintf (stderr, "Corosync Executive couldn't initialize configuration component.\n"); syslog (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't initialize configuration component."); corosync_exit_error (COROSYNC_DONE_ICMAP); } set_icmap_ro_keys_flag(); /* * Initialize the corosync_api_v1 definition */ api = apidef_get (); res = coroparse_configparse(icmap_get_global_map(), &error_string); if (res == -1) { /* * Logsys can't log properly at this early stage, and we need to get this message out * */ fprintf (stderr, "%s\n", error_string); syslog (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } if (stats_map_init(api) != CS_OK) { fprintf (stderr, "Corosync Executive couldn't initialize statistics component.\n"); syslog (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't initialize statistics component."); corosync_exit_error (COROSYNC_DONE_STATS); } res = corosync_log_config_read (&error_string); if (res == -1) { /* * if we are here, we _must_ flush the logsys queue * and try to inform that we couldn't read the config. * this is a desperate attempt before certain death * and there is no guarantee that we can print to stderr * nor that logsys is sending the messages where we expect. 
*/ log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); fprintf(stderr, "%s", error_string); syslog (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_LOGCONFIGREAD); } if (!testonly) { log_printf (LOGSYS_LEVEL_NOTICE, "Corosync Cluster Engine ('%s'): started and ready to provide service.", VERSION); log_printf (LOGSYS_LEVEL_INFO, "Corosync built-in features:" PACKAGE_FEATURES ""); } /* * Create totem logsys subsys before totem_config_read so log functions can be used */ log_subsys_id_totem = _logsys_subsys_create("TOTEM", "totem," "totemip.c,totemconfig.c,totemcrypto.c,totemsrp.c," "totempg.c,totemudp.c,totemudpu.c,totemnet.c,totemknet.c"); /* * Make sure required directory is present */ - res = stat (get_run_dir(), &stat_out); + res = stat (get_state_dir(), &stat_out); if ((res == -1) || (res == 0 && !S_ISDIR(stat_out.st_mode))) { - log_printf (LOGSYS_LEVEL_ERROR, "Required directory not present %s. Please create it.", get_run_dir()); + log_printf (LOGSYS_LEVEL_ERROR, "Required directory not present %s. Please create it.", get_state_dir()); corosync_exit_error (COROSYNC_DONE_DIR_NOT_PRESENT); } - res = chdir(get_run_dir()); + res = chdir(get_state_dir()); if (res == -1) { - log_printf (LOGSYS_LEVEL_ERROR, "Cannot chdir to run directory %s. " - "Please make sure it has correct context and rights.", get_run_dir()); + log_printf (LOGSYS_LEVEL_ERROR, "Cannot chdir to state directory %s. " + "Please make sure it has correct context and rights.", get_state_dir()); corosync_exit_error (COROSYNC_DONE_DIR_NOT_PRESENT); } res = totem_config_read (&totem_config, &error_string, &totem_config_warnings); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } if (totem_config_warnings & TOTEM_CONFIG_WARNING_MEMBERS_IGNORED) { log_printf (LOGSYS_LEVEL_WARNING, "member section is used together with nodelist. 
Members ignored."); } if (totem_config_warnings & TOTEM_CONFIG_WARNING_MEMBERS_DEPRECATED) { log_printf (LOGSYS_LEVEL_WARNING, "member section is deprecated."); } if (totem_config_warnings & TOTEM_CONFIG_WARNING_TOTEM_NODEID_IGNORED) { log_printf (LOGSYS_LEVEL_WARNING, "nodeid appears both in totem section and nodelist. Nodelist one is used."); } if (totem_config_warnings & TOTEM_CONFIG_BINDNETADDR_NODELIST_SET) { log_printf (LOGSYS_LEVEL_WARNING, "interface section bindnetaddr is used together with nodelist. " "Nodelist one is going to be used."); } if (totem_config_warnings != 0) { log_printf (LOGSYS_LEVEL_WARNING, "Please migrate config file to nodelist."); } res = totem_config_keyread (&totem_config, &error_string); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } res = totem_config_validate (&totem_config, &error_string); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } if (testonly) { corosync_exit_error (COROSYNC_DONE_EXIT); } move_to_root_cgroup = 1; if (icmap_get_string("system.move_to_root_cgroup", &tmp_str) == CS_OK) { if (strcmp(tmp_str, "yes") != 0) { move_to_root_cgroup = 0; } free(tmp_str); } /* * Try to move corosync into root cpu cgroup. Failure is not fatal and * error is deliberately ignored. 
*/ if (move_to_root_cgroup) { (void)corosync_move_to_root_cgroup(); } sched_rr = 1; if (icmap_get_string("system.sched_rr", &tmp_str) == CS_OK) { if (strcmp(tmp_str, "yes") != 0) { sched_rr = 0; } free(tmp_str); } prio = 0; if (icmap_get_string("system.priority", &tmp_str) == CS_OK) { if (strcmp(tmp_str, "max") == 0) { prio = INT_MIN; } else if (strcmp(tmp_str, "min") == 0) { prio = INT_MAX; } else { errno = 0; tmpli = strtol(tmp_str, &ep, 10); if (errno != 0 || *ep != '\0' || tmpli > INT_MAX || tmpli < INT_MIN) { log_printf (LOGSYS_LEVEL_ERROR, "Priority value %s is invalid", tmp_str); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } prio = tmpli; } free(tmp_str); } /* * Set round robin realtime scheduling with priority 99 */ if (sched_rr) { if (corosync_set_rr_scheduler () != 0) { prio = INT_MIN; } else { prio = 0; } } if (prio != 0) { if (setpriority(PRIO_PGRP, 0, prio) != 0) { LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "Could not set priority %d", prio); } } totem_config.totem_memb_ring_id_create_or_load = corosync_ring_id_create_or_load; totem_config.totem_memb_ring_id_store = corosync_ring_id_store; totem_config.totem_logging_configuration = totem_logging_configuration; totem_config.totem_logging_configuration.log_subsys_id = log_subsys_id_totem; totem_config.totem_logging_configuration.log_level_security = LOGSYS_LEVEL_WARNING; totem_config.totem_logging_configuration.log_level_error = LOGSYS_LEVEL_ERROR; totem_config.totem_logging_configuration.log_level_warning = LOGSYS_LEVEL_WARNING; totem_config.totem_logging_configuration.log_level_notice = LOGSYS_LEVEL_NOTICE; totem_config.totem_logging_configuration.log_level_debug = LOGSYS_LEVEL_DEBUG; totem_config.totem_logging_configuration.log_level_trace = LOGSYS_LEVEL_TRACE; totem_config.totem_logging_configuration.log_printf = _logsys_log_printf; logsys_config_apply(); /* * Now we are fully initialized. 
*/ if (background) { logsys_blackbox_prefork(); corosync_tty_detach (); logsys_blackbox_postfork(); log_printf (LOGSYS_LEVEL_DEBUG, "Corosync TTY detached"); } /* * Lock all memory to avoid page faults which may interrupt * application healthchecking */ corosync_mlockall (); corosync_poll_handle = qb_loop_create (); memset(&scheduler_pause_timeout_data, 0, sizeof(scheduler_pause_timeout_data)); scheduler_pause_timeout_data.totem_config = &totem_config; timer_function_scheduler_timeout (&scheduler_pause_timeout_data); qb_loop_signal_add(corosync_poll_handle, QB_LOOP_LOW, SIGUSR2, NULL, sig_diag_handler, NULL); qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH, SIGINT, NULL, sig_exit_handler, NULL); qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH, SIGQUIT, NULL, sig_exit_handler, NULL); qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH, SIGTERM, NULL, sig_exit_handler, NULL); if (logsys_thread_start() != 0) { log_printf (LOGSYS_LEVEL_ERROR, "Can't initialize log thread"); corosync_exit_error (COROSYNC_DONE_LOGCONFIGREAD); } if ((flock_err = corosync_flock (corosync_lock_file, getpid ())) != COROSYNC_DONE_EXIT) { corosync_exit_error (flock_err); } /* * if totempg_initialize doesn't have root priveleges, it cannot * bind to a specific interface. 
This only matters if * there is more then one interface in a system, so * in this case, only a warning is printed */ /* * Join multicast group and setup delivery * and configuration change functions */ if (totempg_initialize ( corosync_poll_handle, &totem_config) != 0) { log_printf (LOGSYS_LEVEL_ERROR, "Can't initialize TOTEM layer"); corosync_exit_error (COROSYNC_DONE_FATAL_ERR); } totempg_service_ready_register ( main_service_ready); totempg_groups_initialize ( &corosync_group_handle, deliver_fn, confchg_fn); totempg_groups_join ( corosync_group_handle, &corosync_group, 1); /* * Drop root privleges to user 'corosync' * TODO: Don't really need full root capabilities; * needed capabilities are: * CAP_NET_RAW (bindtodevice) * CAP_SYS_NICE (setscheduler) * CAP_IPC_LOCK (mlockall) */ priv_drop (); schedwrk_init ( serialize_lock, serialize_unlock); /* * Start main processing loop */ qb_loop_run (corosync_poll_handle); /* * Exit was requested */ totempg_finalize (); /* * free the loop resources */ qb_loop_destroy (corosync_poll_handle); /* * free up the icmap */ /* * Remove pid lock file */ unlink (corosync_lock_file); corosync_exit_error (COROSYNC_DONE_EXIT); return EXIT_SUCCESS; } diff --git a/exec/util.c b/exec/util.c index 57b9fa54..7ef3362f 100644 --- a/exec/util.c +++ b/exec/util.c @@ -1,190 +1,190 @@ /* * Copyright (c) 2002-2004 MontaVista Software, Inc. * Copyright (c) 2004 Open Source Development Lab * Copyright (c) 2006-2012 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com), Mark Haverkamp (markh@osdl.org) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. 
* - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include "util.h" LOGSYS_DECLARE_SUBSYS ("MAIN"); struct service_names { const char *c_name; int32_t c_val; }; static struct service_names servicenames[] = { { "CFG", CFG_SERVICE }, { "CPG", CPG_SERVICE }, { "QUORUM", QUORUM_SERVICE }, { "PLOAD", PLOAD_SERVICE }, { "VOTEQUORUM", VOTEQUORUM_SERVICE }, { "MON", MON_SERVICE }, { "WD", WD_SERVICE }, { "CMAP", CMAP_SERVICE }, { NULL, -1 } }; const char * short_service_name_get(uint32_t service_id, char *buf, size_t buf_size) { uint32_t i; for (i = 0; servicenames[i].c_name != NULL; i++) { if (service_id == servicenames[i].c_val) { return (servicenames[i].c_name); } } snprintf(buf, buf_size, "%d", service_id); return buf; } /* * Compare two names. 
returns non-zero on match. */ int name_match(cs_name_t *name1, cs_name_t *name2) { if (name1->length == name2->length) { return ((strncmp ((char *)name1->value, (char *)name2->value, name1->length)) == 0); } return 0; } /* * Get the time of day and convert to nanoseconds */ cs_time_t clust_time_now(void) { struct timeval tv; cs_time_t time_now; if (gettimeofday(&tv, 0)) { return 0ULL; } time_now = (cs_time_t)(tv.tv_sec) * 1000000000ULL; time_now += (cs_time_t)(tv.tv_usec) * 1000ULL; return time_now; } void _corosync_out_of_memory_error (void) __attribute__((noreturn)); void _corosync_out_of_memory_error (void) { assert (0==1); exit (EXIT_FAILURE); } void _corosync_exit_error ( enum e_corosync_done err, const char *file, unsigned int line) __attribute__((noreturn)); void _corosync_exit_error ( enum e_corosync_done err, const char *file, unsigned int line) { if (err == COROSYNC_DONE_EXIT) { log_printf (LOGSYS_LEVEL_NOTICE, "Corosync Cluster Engine exiting normally"); } else { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Cluster Engine exiting " "with status %d at %s:%u.", err, file, line); } logsys_system_fini (); exit (err); } char *getcs_name_t (cs_name_t *name) { static char ret_name[CS_MAX_NAME_LENGTH]; /* if string is corrupt (non-terminated), ensure it's displayed safely */ if (name->length >= CS_MAX_NAME_LENGTH || name->value[name->length] != '\0') { memset (ret_name, 0, sizeof (ret_name)); memcpy (ret_name, name->value, min(name->length, CS_MAX_NAME_LENGTH -1)); return (ret_name); } return ((char *)name->value); } void setcs_name_t (cs_name_t *name, char *str) { strncpy ((char *)name->value, str, sizeof (name->value) - 1); ((char *)name->value)[sizeof (name->value) - 1] = '\0'; if (strlen ((char *)name->value) > CS_MAX_NAME_LENGTH) { name->length = CS_MAX_NAME_LENGTH; } else { name->length = strlen (str); } } int cs_name_tisEqual (cs_name_t *str1, char *str2) { if (str1->length == strlen (str2)) { return ((strncmp ((char *)str1->value, (char *)str2, 
str1->length)) == 0); } else { return 0; } } -const char *get_run_dir(void) +const char *get_state_dir(void) { static char path[PATH_MAX] = {'\0'}; - char *cmap_run_dir; + char *cmap_state_dir; int res; if (path[0] == '\0') { - if (icmap_get_string("system.run_dir", &cmap_run_dir) == CS_OK) { - res = snprintf(path, PATH_MAX, "%s", cmap_run_dir); - free(cmap_run_dir); + if (icmap_get_string("system.state_dir", &cmap_state_dir) == CS_OK) { + res = snprintf(path, PATH_MAX, "%s", cmap_state_dir); + free(cmap_state_dir); } else { res = snprintf(path, PATH_MAX, "%s/%s", LOCALSTATEDIR, "lib/corosync"); } assert(res < PATH_MAX); } return (path); } diff --git a/exec/util.h b/exec/util.h index 9e90cc80..e493b9a9 100644 --- a/exec/util.h +++ b/exec/util.h @@ -1,90 +1,90 @@ /* * Copyright (c) 2002-2004 MontaVista Software, Inc. * Copyright (c) 2004 Open Source Development Lab * Copyright (c) 2006-2017 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com), Mark Haverkamp (markh@osdl.org) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef UTIL_H_DEFINED #define UTIL_H_DEFINED #include #include /** * Get the time of day and convert to nanoseconds */ extern cs_time_t clust_time_now(void); enum e_corosync_done { COROSYNC_DONE_EXIT = 0, COROSYNC_DONE_FORK = 4, COROSYNC_DONE_LOGCONFIGREAD = 7, COROSYNC_DONE_MAINCONFIGREAD = 8, COROSYNC_DONE_LOGSETUP = 9, COROSYNC_DONE_ICMAP = 12, COROSYNC_DONE_INIT_SERVICES = 13, COROSYNC_DONE_FATAL_ERR = 15, COROSYNC_DONE_DIR_NOT_PRESENT = 16, COROSYNC_DONE_ACQUIRE_LOCK = 17, COROSYNC_DONE_ALREADY_RUNNING = 18, COROSYNC_DONE_STD_TO_NULL_REDIR = 19, COROSYNC_DONE_SERVICE_ENGINE_INIT = 20, COROSYNC_DONE_STORE_RINGID = 21, COROSYNC_DONE_STATS = 22, COROSYNC_DONE_PLOAD = 99 }; #define min(a,b) ((a) < (b) ? (a) : (b)) /** * Compare two names. returns non-zero on match. 
*/ extern int name_match(cs_name_t *name1, cs_name_t *name2); #define corosync_exit_error(err) _corosync_exit_error ((err), __FILE__, __LINE__) extern void _corosync_exit_error (enum e_corosync_done err, const char *file, unsigned int line) __attribute__((noreturn)); void _corosync_out_of_memory_error (void) __attribute__((noreturn)); extern char *getcs_name_t (cs_name_t *name); extern void setcs_name_t (cs_name_t *name, char *str); extern int cs_name_tisEqual (cs_name_t *str1, char *str2); /** * Get the short name of a service from the service_id. */ const char * short_service_name_get(uint32_t service_id, char *buf, size_t buf_size); /* - * Return run directory (ether icmap system.run_dir or LOCALSTATEDIR/lib/corosync) + * Return state directory (ether icmap system.state_dir or LOCALSTATEDIR/lib/corosync) */ -const char *get_run_dir(void); +const char *get_state_dir(void); #endif /* UTIL_H_DEFINED */ diff --git a/exec/votequorum.c b/exec/votequorum.c index b105e7cc..6c6e3b12 100644 --- a/exec/votequorum.c +++ b/exec/votequorum.c @@ -1,3047 +1,3047 @@ /* * Copyright (c) 2009-2015 Red Hat, Inc. * * All rights reserved. * * Authors: Christine Caulfield (ccaulfie@redhat.com) * Fabio M. Di Nitto (fdinitto@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. 
nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include "quorum.h" #include #include #include #include #include #include #include "service.h" #include "util.h" LOGSYS_DECLARE_SUBSYS ("VOTEQ"); /* * interface with corosync */ static struct corosync_api_v1 *corosync_api; /* * votequorum global config vars */ static char qdevice_name[VOTEQUORUM_QDEVICE_MAX_NAME_LEN]; static struct cluster_node *qdevice = NULL; static unsigned int qdevice_timeout = VOTEQUORUM_QDEVICE_DEFAULT_TIMEOUT; static unsigned int qdevice_sync_timeout = VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT; static uint8_t qdevice_can_operate = 1; static void *qdevice_reg_conn = NULL; static uint8_t qdevice_master_wins = 0; static uint8_t two_node = 0; static uint8_t wait_for_all = 0; static uint8_t wait_for_all_status = 0; static enum {ATB_NONE, ATB_LOWEST, ATB_HIGHEST, ATB_LIST} auto_tie_breaker = ATB_NONE, initial_auto_tie_breaker = ATB_NONE; static int lowest_node_id = -1; static int highest_node_id = -1; #define DEFAULT_LMS_WIN 10000 static uint8_t last_man_standing = 
0; static uint32_t last_man_standing_window = DEFAULT_LMS_WIN; static uint8_t allow_downscale = 0; static uint32_t ev_barrier = 0; static uint8_t ev_tracking = 0; static uint32_t ev_tracking_barrier = 0; static int ev_tracking_fd = -1; /* * votequorum_exec defines/structs/forward definitions */ struct req_exec_quorum_nodeinfo { struct qb_ipc_request_header header __attribute__((aligned(8))); uint32_t nodeid; uint32_t votes; uint32_t expected_votes; uint32_t flags; } __attribute__((packed)); struct req_exec_quorum_reconfigure { struct qb_ipc_request_header header __attribute__((aligned(8))); uint32_t nodeid; uint32_t value; uint8_t param; uint8_t _pad0; uint8_t _pad1; uint8_t _pad2; } __attribute__((packed)); struct req_exec_quorum_qdevice_reg { struct qb_ipc_request_header header __attribute__((aligned(8))); uint32_t operation; char qdevice_name[VOTEQUORUM_QDEVICE_MAX_NAME_LEN]; } __attribute__((packed)); struct req_exec_quorum_qdevice_reconfigure { struct qb_ipc_request_header header __attribute__((aligned(8))); char oldname[VOTEQUORUM_QDEVICE_MAX_NAME_LEN]; char newname[VOTEQUORUM_QDEVICE_MAX_NAME_LEN]; } __attribute__((packed)); /* * votequorum_exec onwire version (via totem) */ #include "votequorum.h" /* * votequorum_exec onwire messages (via totem) */ #define MESSAGE_REQ_EXEC_VOTEQUORUM_NODEINFO 0 #define MESSAGE_REQ_EXEC_VOTEQUORUM_RECONFIGURE 1 #define MESSAGE_REQ_EXEC_VOTEQUORUM_QDEVICE_REG 2 #define MESSAGE_REQ_EXEC_VOTEQUORUM_QDEVICE_RECONFIGURE 3 static void votequorum_exec_send_expectedvotes_notification(void); static int votequorum_exec_send_quorum_notification(void *conn, uint64_t context); static int votequorum_exec_send_nodelist_notification(void *conn, uint64_t context); #define VOTEQUORUM_RECONFIG_PARAM_EXPECTED_VOTES 1 #define VOTEQUORUM_RECONFIG_PARAM_NODE_VOTES 2 #define VOTEQUORUM_RECONFIG_PARAM_CANCEL_WFA 3 static int votequorum_exec_send_reconfigure(uint8_t param, unsigned int nodeid, uint32_t value); /* * used by req_exec_quorum_qdevice_reg 
*/ #define VOTEQUORUM_QDEVICE_OPERATION_UNREGISTER 0 #define VOTEQUORUM_QDEVICE_OPERATION_REGISTER 1 /* * votequorum internal node status/view */ #define NODE_FLAGS_QUORATE 1 #define NODE_FLAGS_LEAVING 2 #define NODE_FLAGS_WFASTATUS 4 #define NODE_FLAGS_FIRST 8 #define NODE_FLAGS_QDEVICE_REGISTERED 16 #define NODE_FLAGS_QDEVICE_ALIVE 32 #define NODE_FLAGS_QDEVICE_CAST_VOTE 64 #define NODE_FLAGS_QDEVICE_MASTER_WINS 128 typedef enum { NODESTATE_MEMBER=1, NODESTATE_DEAD, NODESTATE_LEAVING } nodestate_t; struct cluster_node { int node_id; nodestate_t state; uint32_t votes; uint32_t expected_votes; uint32_t flags; struct qb_list_head list; }; /* * votequorum internal quorum status */ static uint8_t quorum; static uint8_t cluster_is_quorate; /* * votequorum membership data */ static struct cluster_node *us; static struct qb_list_head cluster_members_list; static unsigned int quorum_members[PROCESSOR_COUNT_MAX]; static unsigned int previous_quorum_members[PROCESSOR_COUNT_MAX]; static unsigned int atb_nodelist[PROCESSOR_COUNT_MAX]; static int quorum_members_entries = 0; static int previous_quorum_members_entries = 0; static int atb_nodelist_entries = 0; static struct memb_ring_id quorum_ringid; /* * pre allocate all cluster_nodes + one for qdevice */ static struct cluster_node cluster_nodes[PROCESSOR_COUNT_MAX+2]; static int cluster_nodes_entries = 0; /* * votequorum tracking */ struct quorum_pd { unsigned char track_flags; int tracking_enabled; uint64_t tracking_context; struct qb_list_head list; void *conn; }; static struct qb_list_head trackers_list; /* * votequorum timers */ static corosync_timer_handle_t qdevice_timer; static int qdevice_timer_set = 0; static corosync_timer_handle_t last_man_standing_timer; static int last_man_standing_timer_set = 0; static int sync_nodeinfo_sent = 0; static int sync_wait_for_poll_or_timeout = 0; /* * Service Interfaces required by service_message_handler struct */ static int sync_in_progress = 0; static void votequorum_sync_init ( 
const unsigned int *trans_list, size_t trans_list_entries, const unsigned int *member_list, size_t member_list_entries, const struct memb_ring_id *ring_id); static int votequorum_sync_process (void); static void votequorum_sync_activate (void); static void votequorum_sync_abort (void); static quorum_set_quorate_fn_t quorum_callback; /* * votequorum_exec handler and definitions */ static char *votequorum_exec_init_fn (struct corosync_api_v1 *api); static int votequorum_exec_exit_fn (void); static int votequorum_exec_send_nodeinfo(uint32_t nodeid); static void message_handler_req_exec_votequorum_nodeinfo ( const void *message, unsigned int nodeid); static void exec_votequorum_nodeinfo_endian_convert (void *message); static void message_handler_req_exec_votequorum_reconfigure ( const void *message, unsigned int nodeid); static void exec_votequorum_reconfigure_endian_convert (void *message); static void message_handler_req_exec_votequorum_qdevice_reg ( const void *message, unsigned int nodeid); static void exec_votequorum_qdevice_reg_endian_convert (void *message); static void message_handler_req_exec_votequorum_qdevice_reconfigure ( const void *message, unsigned int nodeid); static void exec_votequorum_qdevice_reconfigure_endian_convert (void *message); static struct corosync_exec_handler votequorum_exec_engine[] = { { /* 0 */ .exec_handler_fn = message_handler_req_exec_votequorum_nodeinfo, .exec_endian_convert_fn = exec_votequorum_nodeinfo_endian_convert }, { /* 1 */ .exec_handler_fn = message_handler_req_exec_votequorum_reconfigure, .exec_endian_convert_fn = exec_votequorum_reconfigure_endian_convert }, { /* 2 */ .exec_handler_fn = message_handler_req_exec_votequorum_qdevice_reg, .exec_endian_convert_fn = exec_votequorum_qdevice_reg_endian_convert }, { /* 3 */ .exec_handler_fn = message_handler_req_exec_votequorum_qdevice_reconfigure, .exec_endian_convert_fn = exec_votequorum_qdevice_reconfigure_endian_convert }, }; /* * Library Handler and Functions Definitions */ 
static int quorum_lib_init_fn (void *conn); static int quorum_lib_exit_fn (void *conn); static void qdevice_timer_fn(void *arg); static void message_handler_req_lib_votequorum_getinfo (void *conn, const void *message); static void message_handler_req_lib_votequorum_setexpected (void *conn, const void *message); static void message_handler_req_lib_votequorum_setvotes (void *conn, const void *message); static void message_handler_req_lib_votequorum_trackstart (void *conn, const void *message); static void message_handler_req_lib_votequorum_trackstop (void *conn, const void *message); static void message_handler_req_lib_votequorum_qdevice_register (void *conn, const void *message); static void message_handler_req_lib_votequorum_qdevice_unregister (void *conn, const void *message); static void message_handler_req_lib_votequorum_qdevice_update (void *conn, const void *message); static void message_handler_req_lib_votequorum_qdevice_poll (void *conn, const void *message); static void message_handler_req_lib_votequorum_qdevice_master_wins (void *conn, const void *message); static struct corosync_lib_handler quorum_lib_service[] = { { /* 0 */ .lib_handler_fn = message_handler_req_lib_votequorum_getinfo, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 1 */ .lib_handler_fn = message_handler_req_lib_votequorum_setexpected, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 2 */ .lib_handler_fn = message_handler_req_lib_votequorum_setvotes, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 3 */ .lib_handler_fn = message_handler_req_lib_votequorum_trackstart, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 4 */ .lib_handler_fn = message_handler_req_lib_votequorum_trackstop, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 5 */ .lib_handler_fn = message_handler_req_lib_votequorum_qdevice_register, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 6 */ .lib_handler_fn = 
message_handler_req_lib_votequorum_qdevice_unregister, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 7 */ .lib_handler_fn = message_handler_req_lib_votequorum_qdevice_update, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 8 */ .lib_handler_fn = message_handler_req_lib_votequorum_qdevice_poll, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 9 */ .lib_handler_fn = message_handler_req_lib_votequorum_qdevice_master_wins, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED } }; static struct corosync_service_engine votequorum_service_engine = { .name = "corosync vote quorum service v1.0", .id = VOTEQUORUM_SERVICE, .priority = 2, .private_data_size = sizeof (struct quorum_pd), .allow_inquorate = CS_LIB_ALLOW_INQUORATE, .flow_control = COROSYNC_LIB_FLOW_CONTROL_REQUIRED, .lib_init_fn = quorum_lib_init_fn, .lib_exit_fn = quorum_lib_exit_fn, .lib_engine = quorum_lib_service, .lib_engine_count = sizeof (quorum_lib_service) / sizeof (struct corosync_lib_handler), .exec_init_fn = votequorum_exec_init_fn, .exec_exit_fn = votequorum_exec_exit_fn, .exec_engine = votequorum_exec_engine, .exec_engine_count = sizeof (votequorum_exec_engine) / sizeof (struct corosync_exec_handler), .sync_init = votequorum_sync_init, .sync_process = votequorum_sync_process, .sync_activate = votequorum_sync_activate, .sync_abort = votequorum_sync_abort }; struct corosync_service_engine *votequorum_get_service_engine_ver0 (void) { return (&votequorum_service_engine); } static struct default_service votequorum_service[] = { { .name = "corosync_votequorum", .ver = 0, .loader = votequorum_get_service_engine_ver0 }, }; /* * common/utility macros/functions */ #define max(a,b) (((a) > (b)) ? 
(a) : (b)) static void node_add_ordered(struct cluster_node *newnode) { struct cluster_node *node = NULL; struct qb_list_head *tmp; ENTER(); qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); if (newnode->node_id < node->node_id) { break; } } if (!node) { qb_list_add(&newnode->list, &cluster_members_list); } else { qb_list_add_tail(&newnode->list, &node->list); } LEAVE(); } static struct cluster_node *allocate_node(unsigned int nodeid) { struct cluster_node *cl = NULL; struct qb_list_head *tmp; ENTER(); if (cluster_nodes_entries <= PROCESSOR_COUNT_MAX + 1) { cl = (struct cluster_node *)&cluster_nodes[cluster_nodes_entries]; cluster_nodes_entries++; } else { qb_list_for_each(tmp, &cluster_members_list) { cl = qb_list_entry(tmp, struct cluster_node, list); if (cl->state == NODESTATE_DEAD) { break; } } /* * this should never happen */ if (!cl) { log_printf(LOGSYS_LEVEL_CRIT, "Unable to find memory for node %u data!!", nodeid); goto out; } qb_list_del(tmp); } memset(cl, 0, sizeof(struct cluster_node)); cl->node_id = nodeid; if (nodeid != VOTEQUORUM_QDEVICE_NODEID) { node_add_ordered(cl); } out: LEAVE(); return cl; } static struct cluster_node *find_node_by_nodeid(unsigned int nodeid) { struct cluster_node *node; struct qb_list_head *tmp; ENTER(); if (nodeid == us->node_id) { LEAVE(); return us; } if (nodeid == VOTEQUORUM_QDEVICE_NODEID) { LEAVE(); return qdevice; } qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); if (node->node_id == nodeid) { LEAVE(); return node; } } LEAVE(); return NULL; } static void get_lowest_node_id(void) { struct cluster_node *node = NULL; struct qb_list_head *tmp; ENTER(); lowest_node_id = us->node_id; qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); if ((node->state == NODESTATE_MEMBER) && (node->node_id < lowest_node_id)) { lowest_node_id = node->node_id; } } log_printf(LOGSYS_LEVEL_DEBUG, 
"lowest node id: %d us: %d", lowest_node_id, us->node_id); icmap_set_uint32("runtime.votequorum.lowest_node_id", lowest_node_id); LEAVE(); } static void get_highest_node_id(void) { struct cluster_node *node = NULL; struct qb_list_head *tmp; ENTER(); highest_node_id = us->node_id; qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); if ((node->state == NODESTATE_MEMBER) && (node->node_id > highest_node_id)) { highest_node_id = node->node_id; } } log_printf(LOGSYS_LEVEL_DEBUG, "highest node id: %d us: %d", highest_node_id, us->node_id); icmap_set_uint32("runtime.votequorum.highest_node_id", highest_node_id); LEAVE(); } static int check_low_node_id_partition(void) { struct cluster_node *node = NULL; struct qb_list_head *tmp; int found = 0; ENTER(); qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); if ((node->state == NODESTATE_MEMBER) && (node->node_id == lowest_node_id)) { found = 1; } } LEAVE(); return found; } static int check_high_node_id_partition(void) { struct cluster_node *node = NULL; struct qb_list_head *tmp; int found = 0; ENTER(); qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); if ((node->state == NODESTATE_MEMBER) && (node->node_id == highest_node_id)) { found = 1; } } LEAVE(); return found; } static int is_in_nodelist(int nodeid, unsigned int *members, int entries) { int i; ENTER(); for (i=0; istate == NODESTATE_MEMBER) && (node->flags & NODE_FLAGS_QDEVICE_MASTER_WINS) && (node->flags & NODE_FLAGS_QDEVICE_CAST_VOTE)) { found = 1; } } LEAVE(); return found; } static void decode_flags(uint32_t flags) { ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "flags: quorate: %s Leaving: %s WFA Status: %s First: %s Qdevice: %s QdeviceAlive: %s QdeviceCastVote: %s QdeviceMasterWins: %s", (flags & NODE_FLAGS_QUORATE)?"Yes":"No", (flags & NODE_FLAGS_LEAVING)?"Yes":"No", (flags & NODE_FLAGS_WFASTATUS)?"Yes":"No", (flags & 
NODE_FLAGS_FIRST)?"Yes":"No", (flags & NODE_FLAGS_QDEVICE_REGISTERED)?"Yes":"No", (flags & NODE_FLAGS_QDEVICE_ALIVE)?"Yes":"No", (flags & NODE_FLAGS_QDEVICE_CAST_VOTE)?"Yes":"No", (flags & NODE_FLAGS_QDEVICE_MASTER_WINS)?"Yes":"No"); LEAVE(); } /* * load/save are copied almost pristine from totemsrp,c */ static int load_ev_tracking_barrier(void) { int res = 0; char filename[PATH_MAX]; ENTER(); - snprintf(filename, sizeof(filename) - 1, "%s/ev_tracking", get_run_dir()); + snprintf(filename, sizeof(filename) - 1, "%s/ev_tracking", get_state_dir()); ev_tracking_fd = open(filename, O_RDWR, 0700); if (ev_tracking_fd != -1) { res = read (ev_tracking_fd, &ev_tracking_barrier, sizeof(uint32_t)); close(ev_tracking_fd); if (res == sizeof (uint32_t)) { LEAVE(); return 0; } } ev_tracking_barrier = 0; umask(0); ev_tracking_fd = open (filename, O_CREAT|O_RDWR, 0700); if (ev_tracking_fd != -1) { res = write (ev_tracking_fd, &ev_tracking_barrier, sizeof (uint32_t)); if ((res == -1) || (res != sizeof (uint32_t))) { log_printf(LOGSYS_LEVEL_WARNING, "Unable to write to %s", filename); } close(ev_tracking_fd); LEAVE(); return 0; } log_printf(LOGSYS_LEVEL_WARNING, "Unable to create %s file", filename); LEAVE(); return -1; } static void update_wait_for_all_status(uint8_t wfa_status) { ENTER(); wait_for_all_status = wfa_status; if (wait_for_all_status) { us->flags |= NODE_FLAGS_WFASTATUS; } else { us->flags &= ~NODE_FLAGS_WFASTATUS; } icmap_set_uint8("runtime.votequorum.wait_for_all_status", wait_for_all_status); LEAVE(); } static void update_two_node(void) { ENTER(); icmap_set_uint8("runtime.votequorum.two_node", two_node); LEAVE(); } static void update_ev_barrier(uint32_t expected_votes) { ENTER(); ev_barrier = expected_votes; icmap_set_uint32("runtime.votequorum.ev_barrier", ev_barrier); LEAVE(); } static void update_qdevice_can_operate(uint8_t status) { ENTER(); qdevice_can_operate = status; icmap_set_uint8("runtime.votequorum.qdevice_can_operate", qdevice_can_operate); LEAVE(); } 
static void update_qdevice_master_wins(uint8_t allow) { ENTER(); qdevice_master_wins = allow; icmap_set_uint8("runtime.votequorum.qdevice_master_wins", qdevice_master_wins); LEAVE(); } static void update_ev_tracking_barrier(uint32_t ev_t_barrier) { int res; ENTER(); ev_tracking_barrier = ev_t_barrier; icmap_set_uint32("runtime.votequorum.ev_tracking_barrier", ev_tracking_barrier); if (lseek (ev_tracking_fd, 0, SEEK_SET) != 0) { log_printf(LOGSYS_LEVEL_WARNING, "Unable to update ev_tracking_barrier on disk data!!!"); LEAVE(); return; } res = write (ev_tracking_fd, &ev_tracking_barrier, sizeof (uint32_t)); if (res != sizeof (uint32_t)) { log_printf(LOGSYS_LEVEL_WARNING, "Unable to update ev_tracking_barrier on disk data!!!"); } #ifdef HAVE_FDATASYNC fdatasync(ev_tracking_fd); #else fsync(ev_tracking_fd); #endif LEAVE(); } /* * quorum calculation core bits */ static int calculate_quorum(int allow_decrease, unsigned int max_expected, unsigned int *ret_total_votes) { struct qb_list_head *nodelist; struct cluster_node *node; unsigned int total_votes = 0; unsigned int highest_expected = 0; unsigned int newquorum, q1, q2; unsigned int total_nodes = 0; ENTER(); if ((allow_downscale) && (allow_decrease) && (max_expected)) { max_expected = max(ev_barrier, max_expected); } qb_list_for_each(nodelist, &cluster_members_list) { node = qb_list_entry(nodelist, struct cluster_node, list); log_printf(LOGSYS_LEVEL_DEBUG, "node %u state=%d, votes=%u, expected=%u", node->node_id, node->state, node->votes, node->expected_votes); if (node->state == NODESTATE_MEMBER) { highest_expected = max(highest_expected, node->expected_votes); total_votes += node->votes; total_nodes++; } } if (us->flags & NODE_FLAGS_QDEVICE_CAST_VOTE) { log_printf(LOGSYS_LEVEL_DEBUG, "node 0 state=1, votes=%u", qdevice->votes); total_votes += qdevice->votes; total_nodes++; } if (max_expected > 0) { highest_expected = max_expected; } /* * This quorum calculation is taken from the OpenVMS Cluster Systems * manual, but, 
then, you guessed that didn't you */ q1 = (highest_expected + 2) / 2; q2 = (total_votes + 2) / 2; newquorum = max(q1, q2); /* * Normally quorum never decreases but the system administrator can * force it down by setting expected votes to a maximum value */ if (!allow_decrease) { newquorum = max(quorum, newquorum); } /* * The special two_node mode allows each of the two nodes to retain * quorum if the other fails. Only one of the two should live past * fencing (as both nodes try to fence each other in split-brain.) * Also: if there are more than two nodes, force us inquorate to avoid * any damage or confusion. */ if (two_node && total_nodes <= 2) { newquorum = 1; } if (ret_total_votes) { *ret_total_votes = total_votes; } LEAVE(); return newquorum; } static void update_node_expected_votes(int new_expected_votes) { struct qb_list_head *nodelist; struct cluster_node *node; if (new_expected_votes) { qb_list_for_each(nodelist, &cluster_members_list) { node = qb_list_entry(nodelist, struct cluster_node, list); if (node->state == NODESTATE_MEMBER) { node->expected_votes = new_expected_votes; } } } } static void are_we_quorate(unsigned int total_votes) { int quorate; int quorum_change = 0; ENTER(); /* * wait for all nodes to show up before granting quorum */ if ((wait_for_all) && (wait_for_all_status)) { if (total_votes != us->expected_votes) { log_printf(LOGSYS_LEVEL_NOTICE, "Waiting for all cluster members. 
" "Current votes: %d expected_votes: %d", total_votes, us->expected_votes); cluster_is_quorate = 0; return; } update_wait_for_all_status(0); } if (quorum > total_votes) { quorate = 0; } else { quorate = 1; get_lowest_node_id(); get_highest_node_id(); } if ((auto_tie_breaker != ATB_NONE) && /* Must be a half (or half-1) split */ (total_votes == (us->expected_votes / 2)) && /* If the 'other' partition in a split might have quorum then we can't run ATB */ (previous_quorum_members_entries - quorum_members_entries < quorum) && (check_auto_tie_breaker() == 1)) { quorate = 1; } if ((qdevice_master_wins) && (!quorate) && (check_qdevice_master() == 1)) { log_printf(LOGSYS_LEVEL_DEBUG, "node is quorate as part of master_wins partition"); quorate = 1; } if (cluster_is_quorate && !quorate) { quorum_change = 1; log_printf(LOGSYS_LEVEL_DEBUG, "quorum lost, blocking activity"); } if (!cluster_is_quorate && quorate) { quorum_change = 1; log_printf(LOGSYS_LEVEL_DEBUG, "quorum regained, resuming activity"); } cluster_is_quorate = quorate; if (cluster_is_quorate) { us->flags |= NODE_FLAGS_QUORATE; } else { us->flags &= ~NODE_FLAGS_QUORATE; } if (wait_for_all) { if (quorate) { update_wait_for_all_status(0); } else { update_wait_for_all_status(1); } } if ((quorum_change) && (sync_in_progress == 0)) { quorum_callback(quorum_members, quorum_members_entries, cluster_is_quorate, &quorum_ringid); votequorum_exec_send_quorum_notification(NULL, 0L); } LEAVE(); } static void get_total_votes(unsigned int *totalvotes, unsigned int *current_members) { unsigned int total_votes = 0; unsigned int cluster_members = 0; struct qb_list_head *nodelist; struct cluster_node *node; ENTER(); qb_list_for_each(nodelist, &cluster_members_list) { node = qb_list_entry(nodelist, struct cluster_node, list); if (node->state == NODESTATE_MEMBER) { cluster_members++; total_votes += node->votes; } } if (qdevice->votes) { total_votes += qdevice->votes; cluster_members++; } *totalvotes = total_votes; *current_members = 
cluster_members; LEAVE(); } /* * Recalculate cluster quorum, set quorate and notify changes */ static void recalculate_quorum(int allow_decrease, int by_current_nodes) { unsigned int total_votes = 0; unsigned int cluster_members = 0; ENTER(); get_total_votes(&total_votes, &cluster_members); if (!by_current_nodes) { cluster_members = 0; } /* * Keep expected_votes at the highest number of votes in the cluster */ log_printf(LOGSYS_LEVEL_DEBUG, "total_votes=%d, expected_votes=%d", total_votes, us->expected_votes); if (total_votes > us->expected_votes) { us->expected_votes = total_votes; votequorum_exec_send_expectedvotes_notification(); } if ((ev_tracking) && (us->expected_votes > ev_tracking_barrier)) { update_ev_tracking_barrier(us->expected_votes); } quorum = calculate_quorum(allow_decrease, cluster_members, &total_votes); update_node_expected_votes(cluster_members); are_we_quorate(total_votes); LEAVE(); } /* * configuration bits and pieces */ static int votequorum_read_nodelist_configuration(uint32_t *votes, uint32_t *nodes, uint32_t *expected_votes) { icmap_iter_t iter; const char *iter_key; char tmp_key[ICMAP_KEYNAME_MAXLEN]; uint32_t our_pos, node_pos, last_node_pos=-1; uint32_t nodecount = 0; uint32_t nodelist_expected_votes = 0; uint32_t node_votes = 0; int res = 0; ENTER(); if (icmap_get_uint32("nodelist.local_node_pos", &our_pos) != CS_OK) { log_printf(LOGSYS_LEVEL_DEBUG, "No nodelist defined or our node is not in the nodelist"); return 0; } iter = icmap_iter_init("nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key); if (res != 2) { continue; } /* * If current node_pos is the same as the last_node_pos then skip it * so we only do the code below once per node. 
* (icmap keys are always in order) */ if (last_node_pos == node_pos) { continue; } last_node_pos = node_pos; nodecount++; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.quorum_votes", node_pos); if (icmap_get_uint32(tmp_key, &node_votes) != CS_OK) { node_votes = 1; } nodelist_expected_votes = nodelist_expected_votes + node_votes; if (node_pos == our_pos) { *votes = node_votes; } } *expected_votes = nodelist_expected_votes; *nodes = nodecount; icmap_iter_finalize(iter); LEAVE(); return 1; } static int votequorum_qdevice_is_configured(uint32_t *qdevice_votes) { char *qdevice_model = NULL; int ret = 0; ENTER(); if (icmap_get_string("quorum.device.model", &qdevice_model) == CS_OK) { if (strlen(qdevice_model)) { if (icmap_get_uint32("quorum.device.votes", qdevice_votes) != CS_OK) { *qdevice_votes = -1; } if (icmap_get_uint32("quorum.device.timeout", &qdevice_timeout) != CS_OK) { qdevice_timeout = VOTEQUORUM_QDEVICE_DEFAULT_TIMEOUT; } if (icmap_get_uint32("quorum.device.sync_timeout", &qdevice_sync_timeout) != CS_OK) { qdevice_sync_timeout = VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT; } update_qdevice_can_operate(1); ret = 1; } free(qdevice_model); } LEAVE(); return ret; } #define VOTEQUORUM_READCONFIG_STARTUP 0 #define VOTEQUORUM_READCONFIG_RUNTIME 1 static char *votequorum_readconfig(int runtime) { uint32_t node_votes = 0, qdevice_votes = 0; uint32_t node_expected_votes = 0, expected_votes = 0; uint32_t node_count = 0; uint8_t atb = 0; int have_nodelist, have_qdevice; char *atb_string = NULL; char *error = NULL; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "Reading configuration (runtime: %d)", runtime); /* * Set the few things we re-read at runtime back to their defaults */ if (runtime) { two_node = 0; expected_votes = 0; /* auto_tie_breaker cannot be changed by config reload, but * we automatically disable it on odd-sized clusters without * wait_for_all. 
* We may need to re-enable it when membership changes to ensure * that auto_tie_breaker is consistent across all nodes */ auto_tie_breaker = initial_auto_tie_breaker; icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker); } /* * gather basic data here */ icmap_get_uint32("quorum.expected_votes", &expected_votes); have_nodelist = votequorum_read_nodelist_configuration(&node_votes, &node_count, &node_expected_votes); have_qdevice = votequorum_qdevice_is_configured(&qdevice_votes); icmap_get_uint8("quorum.two_node", &two_node); /* * do config verification and enablement */ if ((!have_nodelist) && (!expected_votes)) { if (!runtime) { error = (char *)"configuration error: nodelist or quorum.expected_votes must be configured!"; } else { log_printf(LOGSYS_LEVEL_CRIT, "configuration error: nodelist or quorum.expected_votes must be configured!"); log_printf(LOGSYS_LEVEL_CRIT, "will continue with current runtime data"); } goto out; } /* * two_node and qdevice are not compatible in the same config. 
* try to make an educated guess of what to do */ if ((two_node) && (have_qdevice)) { if (!runtime) { error = (char *)"configuration error: two_node and quorum device cannot be configured at the same time!"; goto out; } else { log_printf(LOGSYS_LEVEL_CRIT, "configuration error: two_node and quorum device cannot be configured at the same time!"); if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { log_printf(LOGSYS_LEVEL_CRIT, "quorum device is registered, disabling two_node"); two_node = 0; } else { log_printf(LOGSYS_LEVEL_CRIT, "quorum device is not registered, allowing two_node"); update_qdevice_can_operate(0); } } } /* * Enable special features */ if (!runtime) { if (two_node) { wait_for_all = 1; } icmap_get_uint8("quorum.allow_downscale", &allow_downscale); icmap_get_uint8("quorum.wait_for_all", &wait_for_all); icmap_get_uint8("quorum.last_man_standing", &last_man_standing); icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window); icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking); icmap_get_uint8("quorum.auto_tie_breaker", &atb); icmap_get_string("quorum.auto_tie_breaker_node", &atb_string); /* auto_tie_breaker defaults to LOWEST */ if (atb) { auto_tie_breaker = ATB_LOWEST; icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker); } else { auto_tie_breaker = ATB_NONE; if (atb_string) { log_printf(LOGSYS_LEVEL_WARNING, "auto_tie_breaker_node: is meaningless if auto_tie_breaker is set to 0"); } } if (atb && atb_string) { parse_atb_string(atb_string); } free(atb_string); initial_auto_tie_breaker = auto_tie_breaker; /* allow_downscale requires ev_tracking */ if (allow_downscale) { ev_tracking = 1; } if (ev_tracking) { if (load_ev_tracking_barrier() < 0) { LEAVE(); return ((char *)"Unable to load ev_tracking file!"); } update_ev_tracking_barrier(ev_tracking_barrier); } } /* two_node and auto_tie_breaker are not compatible as two_node uses * a fence race to decide quorum whereas ATB decides based on node id */ if (two_node && 
auto_tie_breaker != ATB_NONE) { log_printf(LOGSYS_LEVEL_CRIT, "two_node and auto_tie_breaker are both specified but are not compatible."); log_printf(LOGSYS_LEVEL_CRIT, "two_node has been disabled, please fix your corosync.conf"); two_node = 0; } /* If ATB is set and the cluster has an odd number of nodes then wait_for_all needs * to be set so that an isolated half+1 without the tie breaker node * does not have quorum on reboot. */ if ((auto_tie_breaker != ATB_NONE) && (node_expected_votes % 2) && (!wait_for_all)) { if (last_man_standing) { /* if LMS is set too, it's a fatal configuration error. We can't dictate to the user what * they might want so we'll just quit. */ log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set, the cluster has an odd number of nodes\n"); log_printf(LOGSYS_LEVEL_CRIT, "and last_man_standing is also set. With this situation a better\n"); log_printf(LOGSYS_LEVEL_CRIT, "solution would be to disable LMS, leave ATB enabled, and also\n"); log_printf(LOGSYS_LEVEL_CRIT, "enable wait_for_all (mandatory for ATB in odd-numbered clusters).\n"); log_printf(LOGSYS_LEVEL_CRIT, "Due to this ambiguity, corosync will fail to start. Please fix your corosync.conf\n"); error = (char *)"configuration error: auto_tie_breaker & last_man_standing not available in odd sized cluster"; goto out; } else { log_printf(LOGSYS_LEVEL_CRIT, "auto_tie_breaker is set and the cluster has an odd number of nodes.\n"); log_printf(LOGSYS_LEVEL_CRIT, "wait_for_all needs to be set for this configuration but it is missing\n"); log_printf(LOGSYS_LEVEL_CRIT, "Therefore auto_tie_breaker has been disabled. 
Please fix your corosync.conf\n"); auto_tie_breaker = ATB_NONE; icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker); } } /* * quorum device is not compatible with last_man_standing and auto_tie_breaker * neither lms or atb can be set at runtime, so there is no need to check for * runtime incompatibilities, but qdevice can be configured _after_ LMS and ATB have * been enabled at startup. */ if ((have_qdevice) && (last_man_standing)) { if (!runtime) { error = (char *)"configuration error: quorum.device is not compatible with last_man_standing"; goto out; } else { log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device is not compatible with last_man_standing"); log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations"); update_qdevice_can_operate(0); } } if ((have_qdevice) && (auto_tie_breaker != ATB_NONE)) { if (!runtime) { error = (char *)"configuration error: quorum.device is not compatible with auto_tie_breaker"; goto out; } else { log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device is not compatible with auto_tie_breaker"); log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations"); update_qdevice_can_operate(0); } } if ((have_qdevice) && (allow_downscale)) { if (!runtime) { error = (char *)"configuration error: quorum.device is not compatible with allow_downscale"; goto out; } else { log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device is not compatible with allow_downscale"); log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations"); update_qdevice_can_operate(0); } } /* * if user specifies quorum.expected_votes + quorum.device but NOT the device.votes * we don't know what the quorum device should vote. 
*/ if ((expected_votes) && (have_qdevice) && (qdevice_votes == -1)) { if (!runtime) { error = (char *)"configuration error: quorum.device.votes must be specified when quorum.expected_votes is set"; goto out; } else { log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device.votes must be specified when quorum.expected_votes is set"); log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations"); update_qdevice_can_operate(0); } } /* * if user specifies a node list with uneven votes and no device.votes * we cannot autocalculate the votes */ if ((have_qdevice) && (qdevice_votes == -1) && (have_nodelist) && (node_count != node_expected_votes)) { if (!runtime) { error = (char *)"configuration error: quorum.device.votes must be specified when not all nodes votes 1"; goto out; } else { log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device.votes must be specified when not all nodes votes 1"); log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations"); update_qdevice_can_operate(0); } } /* * validate quorum device votes vs expected_votes */ if ((qdevice_votes > 0) && (expected_votes)) { int delta = expected_votes - qdevice_votes; if (delta < 2) { if (!runtime) { error = (char *)"configuration error: quorum.device.votes is too high or expected_votes is too low"; goto out; } else { log_printf(LOGSYS_LEVEL_CRIT, "configuration error: quorum.device.votes is too high or expected_votes is too low"); log_printf(LOGSYS_LEVEL_CRIT, "disabling quorum device operations"); update_qdevice_can_operate(0); } } } /* * automatically calculate device votes and adjust expected_votes from nodelist */ if ((have_qdevice) && (qdevice_votes == -1) && (!expected_votes) && (have_nodelist) && (node_count == node_expected_votes)) { qdevice_votes = node_expected_votes - 1; node_expected_votes = node_expected_votes + qdevice_votes; } /* * set this node votes and expected_votes */ log_printf(LOGSYS_LEVEL_DEBUG, "ev_tracking=%d, ev_tracking_barrier = %d: expected_votes 
= %d\n", ev_tracking, ev_tracking_barrier, expected_votes); if (ev_tracking) { expected_votes = ev_tracking_barrier; } if (have_nodelist) { us->votes = node_votes; us->expected_votes = node_expected_votes; } else { us->votes = 1; icmap_get_uint32("quorum.votes", &us->votes); } if (expected_votes) { us->expected_votes = expected_votes; } /* * set qdevice votes */ if (!have_qdevice) { qdevice->votes = 0; } if (qdevice_votes != -1) { qdevice->votes = qdevice_votes; } update_ev_barrier(us->expected_votes); update_two_node(); if (wait_for_all) { update_wait_for_all_status(1); } out: LEAVE(); return error; } static void votequorum_refresh_config( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { int old_votes, old_expected_votes; uint8_t reloading; uint8_t cancel_wfa; ENTER(); /* * If a full reload is in progress then don't do anything until it's done and * can reconfigure it all atomically */ if (icmap_get_uint8("config.totemconfig_reload_in_progress", &reloading) == CS_OK && reloading) { return ; } icmap_get_uint8("quorum.cancel_wait_for_all", &cancel_wfa); if (strcmp(key_name, "quorum.cancel_wait_for_all") == 0 && cancel_wfa >= 1) { icmap_set_uint8("quorum.cancel_wait_for_all", 0); if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_CANCEL_WFA, us->node_id, 0)) { log_printf(LOGSYS_LEVEL_ERROR, "Failed to send Cancel WFA message to other nodes"); } return; } old_votes = us->votes; old_expected_votes = us->expected_votes; /* * Reload the configuration */ votequorum_readconfig(VOTEQUORUM_READCONFIG_RUNTIME); /* * activate new config */ votequorum_exec_send_nodeinfo(us->node_id); votequorum_exec_send_nodeinfo(VOTEQUORUM_QDEVICE_NODEID); if (us->votes != old_votes) { if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_NODE_VOTES, us->node_id, us->votes)) { log_printf(LOGSYS_LEVEL_ERROR, "Failed to send new votes message to other nodes"); } } if (us->expected_votes != 
old_expected_votes) { if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_EXPECTED_VOTES, us->node_id, us->expected_votes)) { log_printf(LOGSYS_LEVEL_ERROR, "Failed to send expected votes message to other nodes"); } } LEAVE(); } static void votequorum_exec_add_config_notification(void) { icmap_track_t icmap_track_nodelist = NULL; icmap_track_t icmap_track_quorum = NULL; icmap_track_t icmap_track_reload = NULL; ENTER(); icmap_track_add("nodelist.", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX, votequorum_refresh_config, NULL, &icmap_track_nodelist); icmap_track_add("quorum.", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX, votequorum_refresh_config, NULL, &icmap_track_quorum); icmap_track_add("config.totemconfig_reload_in_progress", ICMAP_TRACK_ADD | ICMAP_TRACK_MODIFY, votequorum_refresh_config, NULL, &icmap_track_reload); LEAVE(); } /* * votequorum_exec core */ static int votequorum_exec_send_reconfigure(uint8_t param, unsigned int nodeid, uint32_t value) { struct req_exec_quorum_reconfigure req_exec_quorum_reconfigure; struct iovec iov[1]; int ret; ENTER(); req_exec_quorum_reconfigure.nodeid = nodeid; req_exec_quorum_reconfigure.value = value; req_exec_quorum_reconfigure.param = param; req_exec_quorum_reconfigure._pad0 = 0; req_exec_quorum_reconfigure._pad1 = 0; req_exec_quorum_reconfigure._pad2 = 0; req_exec_quorum_reconfigure.header.id = SERVICE_ID_MAKE(VOTEQUORUM_SERVICE, MESSAGE_REQ_EXEC_VOTEQUORUM_RECONFIGURE); req_exec_quorum_reconfigure.header.size = sizeof(req_exec_quorum_reconfigure); iov[0].iov_base = (void *)&req_exec_quorum_reconfigure; iov[0].iov_len = sizeof(req_exec_quorum_reconfigure); ret = corosync_api->totem_mcast (iov, 1, TOTEM_AGREED); LEAVE(); return ret; } static int votequorum_exec_send_nodeinfo(uint32_t nodeid) { struct req_exec_quorum_nodeinfo req_exec_quorum_nodeinfo; struct iovec iov[1]; struct cluster_node *node; int ret; ENTER(); node = 
find_node_by_nodeid(nodeid); if (!node) { return -1; } req_exec_quorum_nodeinfo.nodeid = nodeid; req_exec_quorum_nodeinfo.votes = node->votes; req_exec_quorum_nodeinfo.expected_votes = node->expected_votes; req_exec_quorum_nodeinfo.flags = node->flags; if (nodeid != VOTEQUORUM_QDEVICE_NODEID) { decode_flags(node->flags); } req_exec_quorum_nodeinfo.header.id = SERVICE_ID_MAKE(VOTEQUORUM_SERVICE, MESSAGE_REQ_EXEC_VOTEQUORUM_NODEINFO); req_exec_quorum_nodeinfo.header.size = sizeof(req_exec_quorum_nodeinfo); iov[0].iov_base = (void *)&req_exec_quorum_nodeinfo; iov[0].iov_len = sizeof(req_exec_quorum_nodeinfo); ret = corosync_api->totem_mcast (iov, 1, TOTEM_AGREED); LEAVE(); return ret; } static int votequorum_exec_send_qdevice_reconfigure(const char *oldname, const char *newname) { struct req_exec_quorum_qdevice_reconfigure req_exec_quorum_qdevice_reconfigure; struct iovec iov[1]; int ret; ENTER(); req_exec_quorum_qdevice_reconfigure.header.id = SERVICE_ID_MAKE(VOTEQUORUM_SERVICE, MESSAGE_REQ_EXEC_VOTEQUORUM_QDEVICE_RECONFIGURE); req_exec_quorum_qdevice_reconfigure.header.size = sizeof(req_exec_quorum_qdevice_reconfigure); strcpy(req_exec_quorum_qdevice_reconfigure.oldname, oldname); strcpy(req_exec_quorum_qdevice_reconfigure.newname, newname); iov[0].iov_base = (void *)&req_exec_quorum_qdevice_reconfigure; iov[0].iov_len = sizeof(req_exec_quorum_qdevice_reconfigure); ret = corosync_api->totem_mcast (iov, 1, TOTEM_AGREED); LEAVE(); return ret; } static int votequorum_exec_send_qdevice_reg(uint32_t operation, const char *qdevice_name_req) { struct req_exec_quorum_qdevice_reg req_exec_quorum_qdevice_reg; struct iovec iov[1]; int ret; ENTER(); req_exec_quorum_qdevice_reg.header.id = SERVICE_ID_MAKE(VOTEQUORUM_SERVICE, MESSAGE_REQ_EXEC_VOTEQUORUM_QDEVICE_REG); req_exec_quorum_qdevice_reg.header.size = sizeof(req_exec_quorum_qdevice_reg); req_exec_quorum_qdevice_reg.operation = operation; strcpy(req_exec_quorum_qdevice_reg.qdevice_name, qdevice_name_req); iov[0].iov_base = 
(void *)&req_exec_quorum_qdevice_reg; iov[0].iov_len = sizeof(req_exec_quorum_qdevice_reg); ret = corosync_api->totem_mcast (iov, 1, TOTEM_AGREED); LEAVE(); return ret; } static int votequorum_exec_send_quorum_notification(void *conn, uint64_t context) { struct res_lib_votequorum_quorum_notification *res_lib_votequorum_notification; struct qb_list_head *tmp; struct cluster_node *node; int i = 0; int cluster_members = 0; int size; char buf[sizeof(struct res_lib_votequorum_quorum_notification) + sizeof(struct votequorum_node) * (PROCESSOR_COUNT_MAX + 2)]; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "Sending quorum callback, quorate = %d", cluster_is_quorate); qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); cluster_members++; } if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { cluster_members++; } size = sizeof(struct res_lib_votequorum_quorum_notification) + sizeof(struct votequorum_node) * cluster_members; res_lib_votequorum_notification = (struct res_lib_votequorum_quorum_notification *)&buf; res_lib_votequorum_notification->quorate = cluster_is_quorate; res_lib_votequorum_notification->context = context; res_lib_votequorum_notification->node_list_entries = cluster_members; res_lib_votequorum_notification->header.id = MESSAGE_RES_VOTEQUORUM_QUORUM_NOTIFICATION; res_lib_votequorum_notification->header.size = size; res_lib_votequorum_notification->header.error = CS_OK; /* Send all known nodes and their states */ qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); res_lib_votequorum_notification->node_list[i].nodeid = node->node_id; res_lib_votequorum_notification->node_list[i++].state = node->state; } if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { res_lib_votequorum_notification->node_list[i].nodeid = VOTEQUORUM_QDEVICE_NODEID; res_lib_votequorum_notification->node_list[i++].state = qdevice->state; } /* Send it to all interested parties */ if (conn) { int ret = 
corosync_api->ipc_dispatch_send(conn, &buf, size); LEAVE(); return ret; } else { struct quorum_pd *qpd; qb_list_for_each(tmp, &trackers_list) { qpd = qb_list_entry(tmp, struct quorum_pd, list); res_lib_votequorum_notification->context = qpd->tracking_context; corosync_api->ipc_dispatch_send(qpd->conn, &buf, size); } } LEAVE(); return 0; } static int votequorum_exec_send_nodelist_notification(void *conn, uint64_t context) { struct res_lib_votequorum_nodelist_notification *res_lib_votequorum_notification; int i = 0; int size; struct qb_list_head *tmp; char buf[sizeof(struct res_lib_votequorum_nodelist_notification) + sizeof(uint32_t) * quorum_members_entries]; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "Sending nodelist callback. ring_id = %d/%lld", quorum_ringid.nodeid, quorum_ringid.seq); size = sizeof(struct res_lib_votequorum_nodelist_notification) + sizeof(uint32_t) * quorum_members_entries; res_lib_votequorum_notification = (struct res_lib_votequorum_nodelist_notification *)&buf; res_lib_votequorum_notification->node_list_entries = quorum_members_entries; res_lib_votequorum_notification->ring_id.nodeid = quorum_ringid.nodeid; res_lib_votequorum_notification->ring_id.seq = quorum_ringid.seq; res_lib_votequorum_notification->context = context; for (i=0; inode_list[i] = quorum_members[i]; } res_lib_votequorum_notification->header.id = MESSAGE_RES_VOTEQUORUM_NODELIST_NOTIFICATION; res_lib_votequorum_notification->header.size = size; res_lib_votequorum_notification->header.error = CS_OK; /* Send it to all interested parties */ if (conn) { int ret = corosync_api->ipc_dispatch_send(conn, &buf, size); LEAVE(); return ret; } else { struct quorum_pd *qpd; qb_list_for_each(tmp, &trackers_list) { qpd = qb_list_entry(tmp, struct quorum_pd, list); res_lib_votequorum_notification->context = qpd->tracking_context; corosync_api->ipc_dispatch_send(qpd->conn, &buf, size); } } LEAVE(); return 0; } static void votequorum_exec_send_expectedvotes_notification(void) { struct 
res_lib_votequorum_expectedvotes_notification res_lib_votequorum_expectedvotes_notification; struct quorum_pd *qpd; struct qb_list_head *tmp; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "Sending expected votes callback"); res_lib_votequorum_expectedvotes_notification.header.id = MESSAGE_RES_VOTEQUORUM_EXPECTEDVOTES_NOTIFICATION; res_lib_votequorum_expectedvotes_notification.header.size = sizeof(res_lib_votequorum_expectedvotes_notification); res_lib_votequorum_expectedvotes_notification.header.error = CS_OK; res_lib_votequorum_expectedvotes_notification.expected_votes = us->expected_votes; qb_list_for_each(tmp, &trackers_list) { qpd = qb_list_entry(tmp, struct quorum_pd, list); res_lib_votequorum_expectedvotes_notification.context = qpd->tracking_context; corosync_api->ipc_dispatch_send(qpd->conn, &res_lib_votequorum_expectedvotes_notification, sizeof(struct res_lib_votequorum_expectedvotes_notification)); } LEAVE(); } static void exec_votequorum_qdevice_reconfigure_endian_convert (void *message) { ENTER(); LEAVE(); } static void message_handler_req_exec_votequorum_qdevice_reconfigure ( const void *message, unsigned int nodeid) { const struct req_exec_quorum_qdevice_reconfigure *req_exec_quorum_qdevice_reconfigure = message; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "Received qdevice name change req from node %u [from: %s to: %s]", nodeid, req_exec_quorum_qdevice_reconfigure->oldname, req_exec_quorum_qdevice_reconfigure->newname); if (!strcmp(req_exec_quorum_qdevice_reconfigure->oldname, qdevice_name)) { log_printf(LOGSYS_LEVEL_DEBUG, "Allowing qdevice rename"); memset(qdevice_name, 0, VOTEQUORUM_QDEVICE_MAX_NAME_LEN); strcpy(qdevice_name, req_exec_quorum_qdevice_reconfigure->newname); /* * TODO: notify qdevices about name change? 
* this is not relevant for now and can wait later on since * qdevices are local only and libvotequorum is not final */ } LEAVE(); } static void exec_votequorum_qdevice_reg_endian_convert (void *message) { struct req_exec_quorum_qdevice_reg *req_exec_quorum_qdevice_reg = message; ENTER(); req_exec_quorum_qdevice_reg->operation = swab32(req_exec_quorum_qdevice_reg->operation); LEAVE(); } static void message_handler_req_exec_votequorum_qdevice_reg ( const void *message, unsigned int nodeid) { const struct req_exec_quorum_qdevice_reg *req_exec_quorum_qdevice_reg = message; struct res_lib_votequorum_status res_lib_votequorum_status; int wipe_qdevice_name = 1; struct cluster_node *node = NULL; struct qb_list_head *tmp; cs_error_t error = CS_OK; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "Received qdevice op %u req from node %u [%s]", req_exec_quorum_qdevice_reg->operation, nodeid, req_exec_quorum_qdevice_reg->qdevice_name); switch(req_exec_quorum_qdevice_reg->operation) { case VOTEQUORUM_QDEVICE_OPERATION_REGISTER: if (nodeid != us->node_id) { if (!strlen(qdevice_name)) { log_printf(LOGSYS_LEVEL_DEBUG, "Remote qdevice name recorded"); strcpy(qdevice_name, req_exec_quorum_qdevice_reg->qdevice_name); } LEAVE(); return; } /* * protect against the case where we broadcast qdevice registration * to new memebers, we receive the message back, but there is no registration * connection in progress */ if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { LEAVE(); return; } /* * this should NEVER happen */ if (!qdevice_reg_conn) { log_printf(LOGSYS_LEVEL_WARNING, "Unable to determine origin of the qdevice register call!"); LEAVE(); return; } /* * registering our own device in this case */ if (!strlen(qdevice_name)) { strcpy(qdevice_name, req_exec_quorum_qdevice_reg->qdevice_name); } /* * check if it is our device or something else */ if ((!strncmp(req_exec_quorum_qdevice_reg->qdevice_name, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN))) { us->flags |= NODE_FLAGS_QDEVICE_REGISTERED; 
votequorum_exec_send_nodeinfo(VOTEQUORUM_QDEVICE_NODEID); votequorum_exec_send_nodeinfo(us->node_id); } else { log_printf(LOGSYS_LEVEL_WARNING, "A new qdevice with different name (new: %s old: %s) is trying to register!", req_exec_quorum_qdevice_reg->qdevice_name, qdevice_name); error = CS_ERR_EXIST; } res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(qdevice_reg_conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); qdevice_reg_conn = NULL; break; case VOTEQUORUM_QDEVICE_OPERATION_UNREGISTER: qb_list_for_each(tmp, &cluster_members_list) { node = qb_list_entry(tmp, struct cluster_node, list); if ((node->state == NODESTATE_MEMBER) && (node->flags & NODE_FLAGS_QDEVICE_REGISTERED)) { wipe_qdevice_name = 0; } } if (wipe_qdevice_name) { memset(qdevice_name, 0, VOTEQUORUM_QDEVICE_MAX_NAME_LEN); } break; } LEAVE(); } static void exec_votequorum_nodeinfo_endian_convert (void *message) { struct req_exec_quorum_nodeinfo *nodeinfo = message; ENTER(); nodeinfo->nodeid = swab32(nodeinfo->nodeid); nodeinfo->votes = swab32(nodeinfo->votes); nodeinfo->expected_votes = swab32(nodeinfo->expected_votes); nodeinfo->flags = swab32(nodeinfo->flags); LEAVE(); } static void message_handler_req_exec_votequorum_nodeinfo ( const void *message, unsigned int sender_nodeid) { const struct req_exec_quorum_nodeinfo *req_exec_quorum_nodeinfo = message; struct cluster_node *node = NULL; int old_votes; int old_expected; uint32_t old_flags; nodestate_t old_state; int new_node = 0; int allow_downgrade = 0; int by_node = 0; unsigned int nodeid = req_exec_quorum_nodeinfo->nodeid; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "got nodeinfo message from cluster node %u", sender_nodeid); log_printf(LOGSYS_LEVEL_DEBUG, "nodeinfo message[%u]: votes: %d, expected: %d flags: %d", nodeid, req_exec_quorum_nodeinfo->votes, 
req_exec_quorum_nodeinfo->expected_votes, req_exec_quorum_nodeinfo->flags); if (nodeid != VOTEQUORUM_QDEVICE_NODEID) { decode_flags(req_exec_quorum_nodeinfo->flags); } node = find_node_by_nodeid(nodeid); if (!node) { node = allocate_node(nodeid); new_node = 1; } if (!node) { corosync_api->error_memory_failure(); LEAVE(); return; } if (new_node) { old_votes = 0; old_expected = 0; old_state = NODESTATE_DEAD; old_flags = 0; } else { old_votes = node->votes; old_expected = node->expected_votes; old_state = node->state; old_flags = node->flags; } if (nodeid == VOTEQUORUM_QDEVICE_NODEID) { struct cluster_node *sender_node = find_node_by_nodeid(sender_nodeid); assert(sender_node != NULL); if ((!cluster_is_quorate) && (sender_node->flags & NODE_FLAGS_QUORATE)) { node->votes = req_exec_quorum_nodeinfo->votes; } else { node->votes = max(node->votes, req_exec_quorum_nodeinfo->votes); } goto recalculate; } /* Update node state */ node->flags = req_exec_quorum_nodeinfo->flags; node->votes = req_exec_quorum_nodeinfo->votes; node->state = NODESTATE_MEMBER; if (node->flags & NODE_FLAGS_LEAVING) { node->state = NODESTATE_LEAVING; allow_downgrade = 1; by_node = 1; } if ((!cluster_is_quorate) && (node->flags & NODE_FLAGS_QUORATE)) { allow_downgrade = 1; us->expected_votes = req_exec_quorum_nodeinfo->expected_votes; } if (node->flags & NODE_FLAGS_QUORATE || (ev_tracking)) { node->expected_votes = req_exec_quorum_nodeinfo->expected_votes; } else { node->expected_votes = us->expected_votes; } if ((last_man_standing) && (node->votes > 1)) { log_printf(LOGSYS_LEVEL_WARNING, "Last Man Standing feature is supported only when all" "cluster nodes votes are set to 1. 
Disabling LMS."); last_man_standing = 0; if (last_man_standing_timer_set) { corosync_api->timer_delete(last_man_standing_timer); last_man_standing_timer_set = 0; } } recalculate: if ((new_node) || (nodeid == us->node_id) || (node->flags & NODE_FLAGS_FIRST) || (old_votes != node->votes) || (old_expected != node->expected_votes) || (old_flags != node->flags) || (old_state != node->state)) { recalculate_quorum(allow_downgrade, by_node); } if ((wait_for_all) && (!(node->flags & NODE_FLAGS_WFASTATUS)) && (node->flags & NODE_FLAGS_QUORATE)) { update_wait_for_all_status(0); } LEAVE(); } static void exec_votequorum_reconfigure_endian_convert (void *message) { struct req_exec_quorum_reconfigure *reconfigure = message; ENTER(); reconfigure->nodeid = swab32(reconfigure->nodeid); reconfigure->value = swab32(reconfigure->value); LEAVE(); } static void message_handler_req_exec_votequorum_reconfigure ( const void *message, unsigned int nodeid) { const struct req_exec_quorum_reconfigure *req_exec_quorum_reconfigure = message; struct cluster_node *node; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "got reconfigure message from cluster node %u for %u", nodeid, req_exec_quorum_reconfigure->nodeid); switch(req_exec_quorum_reconfigure->param) { case VOTEQUORUM_RECONFIG_PARAM_EXPECTED_VOTES: update_node_expected_votes(req_exec_quorum_reconfigure->value); votequorum_exec_send_expectedvotes_notification(); update_ev_barrier(req_exec_quorum_reconfigure->value); if (ev_tracking) { us->expected_votes = max(us->expected_votes, ev_tracking_barrier); } recalculate_quorum(1, 0); /* Allow decrease */ break; case VOTEQUORUM_RECONFIG_PARAM_NODE_VOTES: node = find_node_by_nodeid(req_exec_quorum_reconfigure->nodeid); if (!node) { LEAVE(); return; } node->votes = req_exec_quorum_reconfigure->value; recalculate_quorum(1, 0); /* Allow decrease */ break; case VOTEQUORUM_RECONFIG_PARAM_CANCEL_WFA: update_wait_for_all_status(0); log_printf(LOGSYS_LEVEL_INFO, "wait_for_all_status reset by user on node %d.", 
req_exec_quorum_reconfigure->nodeid); recalculate_quorum(0, 0); break; } LEAVE(); } static int votequorum_exec_exit_fn (void) { int ret = 0; ENTER(); /* * tell the other nodes we are leaving */ if (allow_downscale) { us->flags |= NODE_FLAGS_LEAVING; ret = votequorum_exec_send_nodeinfo(us->node_id); } if ((ev_tracking) && (ev_tracking_fd != -1)) { close(ev_tracking_fd); } LEAVE(); return ret; } static void votequorum_set_icmap_ro_keys(void) { icmap_set_ro_access("quorum.allow_downscale", CS_FALSE, CS_TRUE); icmap_set_ro_access("quorum.wait_for_all", CS_FALSE, CS_TRUE); icmap_set_ro_access("quorum.last_man_standing", CS_FALSE, CS_TRUE); icmap_set_ro_access("quorum.last_man_standing_window", CS_FALSE, CS_TRUE); icmap_set_ro_access("quorum.expected_votes_tracking", CS_FALSE, CS_TRUE); icmap_set_ro_access("quorum.auto_tie_breaker", CS_FALSE, CS_TRUE); icmap_set_ro_access("quorum.auto_tie_breaker_node", CS_FALSE, CS_TRUE); } static char *votequorum_exec_init_fn (struct corosync_api_v1 *api) { char *error = NULL; ENTER(); /* * make sure we start clean */ qb_list_init(&cluster_members_list); qb_list_init(&trackers_list); qdevice = NULL; us = NULL; memset(cluster_nodes, 0, sizeof(cluster_nodes)); /* * Allocate a cluster_node for qdevice */ qdevice = allocate_node(VOTEQUORUM_QDEVICE_NODEID); if (!qdevice) { LEAVE(); return ((char *)"Could not allocate node."); } qdevice->votes = 0; memset(qdevice_name, 0, VOTEQUORUM_QDEVICE_MAX_NAME_LEN); /* * Allocate a cluster_node for us */ us = allocate_node(corosync_api->totem_nodeid_get()); if (!us) { LEAVE(); return ((char *)"Could not allocate node."); } icmap_set_uint32("runtime.votequorum.this_node_id", us->node_id); us->state = NODESTATE_MEMBER; us->votes = 1; us->flags |= NODE_FLAGS_FIRST; error = votequorum_readconfig(VOTEQUORUM_READCONFIG_STARTUP); if (error) { return error; } recalculate_quorum(0, 0); /* * Set RO keys in icmap */ votequorum_set_icmap_ro_keys(); /* * Listen for changes */ 
votequorum_exec_add_config_notification(); /* * Start us off with one node */ votequorum_exec_send_nodeinfo(us->node_id); LEAVE(); return (NULL); } /* * votequorum service core */ static void votequorum_last_man_standing_timer_fn(void *arg) { ENTER(); last_man_standing_timer_set = 0; if (cluster_is_quorate) { recalculate_quorum(1,1); } LEAVE(); } static void votequorum_sync_init ( const unsigned int *trans_list, size_t trans_list_entries, const unsigned int *member_list, size_t member_list_entries, const struct memb_ring_id *ring_id) { int i, j; int found; int left_nodes; struct cluster_node *node; ENTER(); sync_in_progress = 1; sync_nodeinfo_sent = 0; sync_wait_for_poll_or_timeout = 0; if (member_list_entries > 1) { us->flags &= ~NODE_FLAGS_FIRST; } /* * we don't need to track which nodes have left directly, * since that info is in the node db, but we need to know * if somebody has left for last_man_standing */ left_nodes = 0; for (i = 0; i < quorum_members_entries; i++) { found = 0; for (j = 0; j < member_list_entries; j++) { if (quorum_members[i] == member_list[j]) { found = 1; break; } } if (found == 0) { left_nodes = 1; node = find_node_by_nodeid(quorum_members[i]); if (node) { node->state = NODESTATE_DEAD; } } } if (last_man_standing) { if (((member_list_entries >= quorum) && (left_nodes)) || ((member_list_entries <= quorum) && (auto_tie_breaker != ATB_NONE) && (check_low_node_id_partition() == 1))) { if (last_man_standing_timer_set) { corosync_api->timer_delete(last_man_standing_timer); last_man_standing_timer_set = 0; } corosync_api->timer_add_duration((unsigned long long)last_man_standing_window*1000000, NULL, votequorum_last_man_standing_timer_fn, &last_man_standing_timer); last_man_standing_timer_set = 1; } } memcpy(previous_quorum_members, quorum_members, sizeof(unsigned int) * quorum_members_entries); previous_quorum_members_entries = quorum_members_entries; memcpy(quorum_members, member_list, sizeof(unsigned int) * member_list_entries); 
quorum_members_entries = member_list_entries; memcpy(&quorum_ringid, ring_id, sizeof(*ring_id)); if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED && us->flags & NODE_FLAGS_QDEVICE_ALIVE) { /* * Reset poll timer. Sync waiting is interrupted on valid qdevice poll or after timeout */ if (qdevice_timer_set) { corosync_api->timer_delete(qdevice_timer); } corosync_api->timer_add_duration((unsigned long long)qdevice_sync_timeout*1000000, qdevice, qdevice_timer_fn, &qdevice_timer); qdevice_timer_set = 1; sync_wait_for_poll_or_timeout = 1; log_printf(LOGSYS_LEVEL_INFO, "waiting for quorum device %s poll (but maximum for %u ms)", qdevice_name, qdevice_sync_timeout); } LEAVE(); } static int votequorum_sync_process (void) { if (!sync_nodeinfo_sent) { votequorum_exec_send_nodeinfo(us->node_id); votequorum_exec_send_nodeinfo(VOTEQUORUM_QDEVICE_NODEID); if (strlen(qdevice_name)) { votequorum_exec_send_qdevice_reg(VOTEQUORUM_QDEVICE_OPERATION_REGISTER, qdevice_name); } votequorum_exec_send_nodelist_notification(NULL, 0LL); sync_nodeinfo_sent = 1; } if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED && sync_wait_for_poll_or_timeout) { /* * Waiting for qdevice to poll with new ringid or timeout */ return (-1); } return 0; } static void votequorum_sync_activate (void) { recalculate_quorum(0, 0); quorum_callback(quorum_members, quorum_members_entries, cluster_is_quorate, &quorum_ringid); votequorum_exec_send_quorum_notification(NULL, 0L); sync_in_progress = 0; } static void votequorum_sync_abort (void) { } char *votequorum_init(struct corosync_api_v1 *api, quorum_set_quorate_fn_t q_set_quorate_fn) { char *error; ENTER(); if (q_set_quorate_fn == NULL) { return ((char *)"Quorate function not set"); } corosync_api = api; quorum_callback = q_set_quorate_fn; error = corosync_service_link_and_init(corosync_api, &votequorum_service[0]); if (error) { return (error); } LEAVE(); return (NULL); } /* * Library Handler init/fini */ static int quorum_lib_init_fn (void *conn) { struct quorum_pd *pd = 
(struct quorum_pd *)corosync_api->ipc_private_data_get (conn); ENTER(); qb_list_init (&pd->list); pd->conn = conn; LEAVE(); return (0); } static int quorum_lib_exit_fn (void *conn) { struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn); ENTER(); if (quorum_pd->tracking_enabled) { qb_list_del (&quorum_pd->list); qb_list_init (&quorum_pd->list); } LEAVE(); return (0); } /* * library internal functions */ static void qdevice_timer_fn(void *arg) { ENTER(); if ((!(us->flags & NODE_FLAGS_QDEVICE_ALIVE)) || (!qdevice_timer_set)) { LEAVE(); return; } us->flags &= ~NODE_FLAGS_QDEVICE_ALIVE; us->flags &= ~NODE_FLAGS_QDEVICE_CAST_VOTE; log_printf(LOGSYS_LEVEL_INFO, "lost contact with quorum device %s", qdevice_name); votequorum_exec_send_nodeinfo(us->node_id); qdevice_timer_set = 0; sync_wait_for_poll_or_timeout = 0; LEAVE(); } /* * Library Handler Functions */ static void message_handler_req_lib_votequorum_getinfo (void *conn, const void *message) { const struct req_lib_votequorum_getinfo *req_lib_votequorum_getinfo = message; struct res_lib_votequorum_getinfo res_lib_votequorum_getinfo; struct cluster_node *node; unsigned int highest_expected = 0; unsigned int total_votes = 0; cs_error_t error = CS_OK; uint32_t nodeid = req_lib_votequorum_getinfo->nodeid; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "got getinfo request on %p for node %u", conn, req_lib_votequorum_getinfo->nodeid); if (nodeid == VOTEQUORUM_QDEVICE_NODEID) { nodeid = us->node_id; } node = find_node_by_nodeid(nodeid); if (node) { struct cluster_node *iternode; struct qb_list_head *nodelist; qb_list_for_each(nodelist, &cluster_members_list) { iternode = qb_list_entry(nodelist, struct cluster_node, list); if (iternode->state == NODESTATE_MEMBER) { highest_expected = max(highest_expected, iternode->expected_votes); total_votes += iternode->votes; } } if (node->flags & NODE_FLAGS_QDEVICE_CAST_VOTE) { total_votes += qdevice->votes; } switch(node->state) { case NODESTATE_MEMBER: 
res_lib_votequorum_getinfo.state = VOTEQUORUM_NODESTATE_MEMBER; break; case NODESTATE_DEAD: res_lib_votequorum_getinfo.state = VOTEQUORUM_NODESTATE_DEAD; break; case NODESTATE_LEAVING: res_lib_votequorum_getinfo.state = VOTEQUORUM_NODESTATE_LEAVING; break; default: res_lib_votequorum_getinfo.state = node->state; break; } res_lib_votequorum_getinfo.state = node->state; res_lib_votequorum_getinfo.votes = node->votes; res_lib_votequorum_getinfo.expected_votes = node->expected_votes; res_lib_votequorum_getinfo.highest_expected = highest_expected; res_lib_votequorum_getinfo.quorum = quorum; res_lib_votequorum_getinfo.total_votes = total_votes; res_lib_votequorum_getinfo.flags = 0; res_lib_votequorum_getinfo.nodeid = node->node_id; if (two_node) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_TWONODE; } if (cluster_is_quorate) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QUORATE; } if (wait_for_all) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_WAIT_FOR_ALL; } if (last_man_standing) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_LAST_MAN_STANDING; } if (auto_tie_breaker != ATB_NONE) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_AUTO_TIE_BREAKER; } if (allow_downscale) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_ALLOW_DOWNSCALE; } memset(res_lib_votequorum_getinfo.qdevice_name, 0, VOTEQUORUM_QDEVICE_MAX_NAME_LEN); strcpy(res_lib_votequorum_getinfo.qdevice_name, qdevice_name); res_lib_votequorum_getinfo.qdevice_votes = qdevice->votes; if (node->flags & NODE_FLAGS_QDEVICE_REGISTERED) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QDEVICE_REGISTERED; } if (node->flags & NODE_FLAGS_QDEVICE_ALIVE) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QDEVICE_ALIVE; } if (node->flags & NODE_FLAGS_QDEVICE_CAST_VOTE) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QDEVICE_CAST_VOTE; } if (node->flags & NODE_FLAGS_QDEVICE_MASTER_WINS) { res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_QDEVICE_MASTER_WINS; } } else { 
error = CS_ERR_NOT_EXIST; } res_lib_votequorum_getinfo.header.size = sizeof(res_lib_votequorum_getinfo); res_lib_votequorum_getinfo.header.id = MESSAGE_RES_VOTEQUORUM_GETINFO; res_lib_votequorum_getinfo.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_getinfo, sizeof(res_lib_votequorum_getinfo)); log_printf(LOGSYS_LEVEL_DEBUG, "getinfo response error: %d", error); LEAVE(); } static void message_handler_req_lib_votequorum_setexpected (void *conn, const void *message) { const struct req_lib_votequorum_setexpected *req_lib_votequorum_setexpected = message; struct res_lib_votequorum_status res_lib_votequorum_status; cs_error_t error = CS_OK; unsigned int newquorum; unsigned int total_votes; uint8_t allow_downscale_status = 0; ENTER(); allow_downscale_status = allow_downscale; allow_downscale = 0; /* * Validate new expected votes */ newquorum = calculate_quorum(1, req_lib_votequorum_setexpected->expected_votes, &total_votes); allow_downscale = allow_downscale_status; if (newquorum < total_votes / 2 || newquorum > total_votes) { error = CS_ERR_INVALID_PARAM; goto error_exit; } update_node_expected_votes(req_lib_votequorum_setexpected->expected_votes); if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_EXPECTED_VOTES, us->node_id, req_lib_votequorum_setexpected->expected_votes)) { error = CS_ERR_NO_RESOURCES; } error_exit: res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } static void message_handler_req_lib_votequorum_setvotes (void *conn, const void *message) { const struct req_lib_votequorum_setvotes *req_lib_votequorum_setvotes = message; struct res_lib_votequorum_status res_lib_votequorum_status; struct cluster_node *node; unsigned int newquorum; unsigned int total_votes; 
unsigned int saved_votes; cs_error_t error = CS_OK; unsigned int nodeid; ENTER(); nodeid = req_lib_votequorum_setvotes->nodeid; node = find_node_by_nodeid(nodeid); if (!node) { error = CS_ERR_NAME_NOT_FOUND; goto error_exit; } /* * Check votes is valid */ saved_votes = node->votes; node->votes = req_lib_votequorum_setvotes->votes; newquorum = calculate_quorum(1, 0, &total_votes); if (newquorum < total_votes / 2 || newquorum > total_votes) { node->votes = saved_votes; error = CS_ERR_INVALID_PARAM; goto error_exit; } if (votequorum_exec_send_reconfigure(VOTEQUORUM_RECONFIG_PARAM_NODE_VOTES, nodeid, req_lib_votequorum_setvotes->votes)) { error = CS_ERR_NO_RESOURCES; } error_exit: res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } static void message_handler_req_lib_votequorum_trackstart (void *conn, const void *message) { const struct req_lib_votequorum_trackstart *req_lib_votequorum_trackstart = message; struct res_lib_votequorum_status res_lib_votequorum_status; struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn); cs_error_t error = CS_OK; ENTER(); /* * If an immediate listing of the current cluster membership * is requested, generate membership list */ if (req_lib_votequorum_trackstart->track_flags & CS_TRACK_CURRENT || req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES) { log_printf(LOGSYS_LEVEL_DEBUG, "sending initial status to %p", conn); votequorum_exec_send_nodelist_notification(conn, req_lib_votequorum_trackstart->context); votequorum_exec_send_quorum_notification(conn, req_lib_votequorum_trackstart->context); } if (quorum_pd->tracking_enabled) { error = CS_ERR_EXIST; goto response_send; } /* * Record requests for tracking */ if 
(req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES || req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES_ONLY) { quorum_pd->track_flags = req_lib_votequorum_trackstart->track_flags; quorum_pd->tracking_enabled = 1; quorum_pd->tracking_context = req_lib_votequorum_trackstart->context; qb_list_add (&quorum_pd->list, &trackers_list); } response_send: res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } static void message_handler_req_lib_votequorum_trackstop (void *conn, const void *message) { struct res_lib_votequorum_status res_lib_votequorum_status; struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn); int error = CS_OK; ENTER(); if (quorum_pd->tracking_enabled) { error = CS_OK; quorum_pd->tracking_enabled = 0; qb_list_del (&quorum_pd->list); qb_list_init (&quorum_pd->list); } else { error = CS_ERR_NOT_EXIST; } res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } static void message_handler_req_lib_votequorum_qdevice_register (void *conn, const void *message) { const struct req_lib_votequorum_qdevice_register *req_lib_votequorum_qdevice_register = message; struct res_lib_votequorum_status res_lib_votequorum_status; cs_error_t error = CS_OK; ENTER(); if (!qdevice_can_operate) { log_printf(LOGSYS_LEVEL_INFO, "Registration of quorum device is disabled by incorrect corosync.conf. 
See logs for more information"); error = CS_ERR_ACCESS; goto out; } if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { if ((!strncmp(req_lib_votequorum_qdevice_register->name, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN))) { goto out; } else { log_printf(LOGSYS_LEVEL_WARNING, "A new qdevice with different name (new: %s old: %s) is trying to re-register!", req_lib_votequorum_qdevice_register->name, qdevice_name); error = CS_ERR_EXIST; goto out; } } else { if (qdevice_reg_conn != NULL) { log_printf(LOGSYS_LEVEL_WARNING, "Registration request already in progress"); error = CS_ERR_TRY_AGAIN; goto out; } qdevice_reg_conn = conn; if (votequorum_exec_send_qdevice_reg(VOTEQUORUM_QDEVICE_OPERATION_REGISTER, req_lib_votequorum_qdevice_register->name) != 0) { log_printf(LOGSYS_LEVEL_WARNING, "Unable to send qdevice registration request to cluster"); error = CS_ERR_TRY_AGAIN; qdevice_reg_conn = NULL; } else { LEAVE(); return; } } out: res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } static void message_handler_req_lib_votequorum_qdevice_unregister (void *conn, const void *message) { const struct req_lib_votequorum_qdevice_unregister *req_lib_votequorum_qdevice_unregister = message; struct res_lib_votequorum_status res_lib_votequorum_status; cs_error_t error = CS_OK; ENTER(); if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { if (strncmp(req_lib_votequorum_qdevice_unregister->name, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) { error = CS_ERR_INVALID_PARAM; goto out; } if (qdevice_timer_set) { corosync_api->timer_delete(qdevice_timer); qdevice_timer_set = 0; sync_wait_for_poll_or_timeout = 0; } us->flags &= ~NODE_FLAGS_QDEVICE_REGISTERED; us->flags &= ~NODE_FLAGS_QDEVICE_ALIVE; us->flags &= ~NODE_FLAGS_QDEVICE_CAST_VOTE; 
us->flags &= ~NODE_FLAGS_QDEVICE_MASTER_WINS; votequorum_exec_send_nodeinfo(us->node_id); votequorum_exec_send_qdevice_reg(VOTEQUORUM_QDEVICE_OPERATION_UNREGISTER, req_lib_votequorum_qdevice_unregister->name); } else { error = CS_ERR_NOT_EXIST; } out: res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } static void message_handler_req_lib_votequorum_qdevice_update (void *conn, const void *message) { const struct req_lib_votequorum_qdevice_update *req_lib_votequorum_qdevice_update = message; struct res_lib_votequorum_status res_lib_votequorum_status; cs_error_t error = CS_OK; ENTER(); if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { if (strncmp(req_lib_votequorum_qdevice_update->oldname, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) { error = CS_ERR_INVALID_PARAM; goto out; } votequorum_exec_send_qdevice_reconfigure(req_lib_votequorum_qdevice_update->oldname, req_lib_votequorum_qdevice_update->newname); } else { error = CS_ERR_NOT_EXIST; } out: res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } static void message_handler_req_lib_votequorum_qdevice_poll (void *conn, const void *message) { const struct req_lib_votequorum_qdevice_poll *req_lib_votequorum_qdevice_poll = message; struct res_lib_votequorum_status res_lib_votequorum_status; cs_error_t error = CS_OK; uint32_t oldflags; ENTER(); if (!qdevice_can_operate) { error = CS_ERR_ACCESS; goto out; } if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { if (!(req_lib_votequorum_qdevice_poll->ring_id.nodeid == 
quorum_ringid.nodeid && req_lib_votequorum_qdevice_poll->ring_id.seq == quorum_ringid.seq)) { log_printf(LOGSYS_LEVEL_DEBUG, "Received poll ring id (%u.%"PRIu64") != last sync " "ring id (%u.%"PRIu64"). Ignoring poll call.", req_lib_votequorum_qdevice_poll->ring_id.nodeid, req_lib_votequorum_qdevice_poll->ring_id.seq, quorum_ringid.nodeid, quorum_ringid.seq); error = CS_ERR_MESSAGE_ERROR; goto out; } if (strncmp(req_lib_votequorum_qdevice_poll->name, qdevice_name, VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) { error = CS_ERR_INVALID_PARAM; goto out; } if (qdevice_timer_set) { corosync_api->timer_delete(qdevice_timer); qdevice_timer_set = 0; } oldflags = us->flags; us->flags |= NODE_FLAGS_QDEVICE_ALIVE; if (req_lib_votequorum_qdevice_poll->cast_vote) { us->flags |= NODE_FLAGS_QDEVICE_CAST_VOTE; } else { us->flags &= ~NODE_FLAGS_QDEVICE_CAST_VOTE; } if (us->flags != oldflags) { votequorum_exec_send_nodeinfo(us->node_id); } corosync_api->timer_add_duration((unsigned long long)qdevice_timeout*1000000, qdevice, qdevice_timer_fn, &qdevice_timer); qdevice_timer_set = 1; sync_wait_for_poll_or_timeout = 0; } else { error = CS_ERR_NOT_EXIST; } out: res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } static void message_handler_req_lib_votequorum_qdevice_master_wins (void *conn, const void *message) { const struct req_lib_votequorum_qdevice_master_wins *req_lib_votequorum_qdevice_master_wins = message; struct res_lib_votequorum_status res_lib_votequorum_status; cs_error_t error = CS_OK; uint32_t oldflags = us->flags; ENTER(); if (!qdevice_can_operate) { error = CS_ERR_ACCESS; goto out; } if (us->flags & NODE_FLAGS_QDEVICE_REGISTERED) { if (strncmp(req_lib_votequorum_qdevice_master_wins->name, qdevice_name, 
VOTEQUORUM_QDEVICE_MAX_NAME_LEN)) { error = CS_ERR_INVALID_PARAM; goto out; } if (req_lib_votequorum_qdevice_master_wins->allow) { us->flags |= NODE_FLAGS_QDEVICE_MASTER_WINS; } else { us->flags &= ~NODE_FLAGS_QDEVICE_MASTER_WINS; } if (us->flags != oldflags) { votequorum_exec_send_nodeinfo(us->node_id); } update_qdevice_master_wins(req_lib_votequorum_qdevice_master_wins->allow); } else { error = CS_ERR_NOT_EXIST; } out: res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status); res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS; res_lib_votequorum_status.header.error = error; corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status)); LEAVE(); } diff --git a/man/corosync.conf.5 b/man/corosync.conf.5 index 1b8b7cdc..790f4347 100644 --- a/man/corosync.conf.5 +++ b/man/corosync.conf.5 @@ -1,908 +1,908 @@ .\"/* .\" * Copyright (c) 2005 MontaVista Software, Inc. .\" * Copyright (c) 2006-2018 Red Hat, Inc. .\" * .\" * All rights reserved. .\" * .\" * Author: Steven Dake (sdake@redhat.com) .\" * .\" * This software licensed under BSD license, the text of which follows: .\" * .\" * Redistribution and use in source and binary forms, with or without .\" * modification, are permitted provided that the following conditions are met: .\" * .\" * - Redistributions of source code must retain the above copyright notice, .\" * this list of conditions and the following disclaimer. .\" * - Redistributions in binary form must reproduce the above copyright notice, .\" * this list of conditions and the following disclaimer in the documentation .\" * and/or other materials provided with the distribution. .\" * - Neither the name of the MontaVista Software, Inc. nor the names of its .\" * contributors may be used to endorse or promote products derived from this .\" * software without specific prior written permission. 
.\" * .\" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" .\" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE .\" * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF .\" * THE POSSIBILITY OF SUCH DAMAGE. .\" */ -.TH COROSYNC_CONF 5 2018-11-13 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" +.TH COROSYNC_CONF 5 2018-12-14 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" .SH NAME corosync.conf - corosync executive configuration file .SH SYNOPSIS /etc/corosync/corosync.conf .SH DESCRIPTION The corosync.conf instructs the corosync executive about various parameters needed to control the corosync executive. Empty lines and lines starting with # character are ignored. The configuration file consists of bracketed top level directives. The possible directive choices are: .TP totem { } This top level directive contains configuration options for the totem protocol. .TP logging { } This top level directive contains configuration options for logging. .TP quorum { } This top level directive contains configuration options for quorum. .TP nodelist { } This top level directive contains configuration options for nodes in cluster. .TP system { } This top level directive contains configuration options related to system. .TP resources { } This top level directive contains configuration options for resources. 
.PP The .B interface sub-directive of totem is optional for UDP and knet transports. For knet, multiple interface subsections define parameters for each knet link on the system. For UDPU an interface section is not needed and it is recommended that the nodelist is used to define cluster nodes. .TP linknumber This specifies the link number for the interface. When using the knet protocol, each interface should specify separate link numbers to uniquely identify to the membership protocol which interface to use for which link. The linknumber must start at 0. For UDP the only supported linknumber is 0. .TP knet_link_priority This specifies the priority for the link when knet is used in 'passive' mode. (see link_mode below) .TP knet_ping_interval This specifies the interval between knet link pings. knet_ping_interval and knet_ping_timeout are a pair: if one is specified the other should be too, otherwise one will be calculated from the token timeout and one will be taken from the config file. (default is token timeout / (knet_pong_count*2)) .TP knet_ping_timeout If no ping is received within this time, the knet link is declared dead. knet_ping_interval and knet_ping_timeout are a pair: if one is specified the other should be too, otherwise one will be calculated from the token timeout and one will be taken from the config file. (default is token timeout / knet_pong_count) .TP knet_ping_precision How many values of latency are used to calculate the average link latency. (default 2048 samples) .TP knet_pong_count How many valid ping/pongs before a link is marked UP. (default 5) .TP knet_transport Which IP transport knet should use. Valid values are "sctp" or "udp". (default: udp) .TP bindnetaddr (udp only) This specifies the network address the corosync executive should bind to when using udp. bindnetaddr should be an IP address configured on the system, or a network address.
For example, if the local interface is 192.168.5.92 with netmask 255.255.255.0, you should set bindnetaddr to 192.168.5.92 or 192.168.5.0. If the local interface is 192.168.5.92 with netmask 255.255.255.192, set bindnetaddr to 192.168.5.92 or 192.168.5.64, and so forth. This may also be an IPV6 address, in which case IPV6 networking will be used. In this case, the exact address must be specified and there is no automatic selection of the network interface within a specific subnet as with IPv4. If IPv6 networking is used, the nodeid field in nodelist must be specified. .TP broadcast (udp only) This is optional and can be set to yes. If it is set to yes, the broadcast address will be used for communication. If this option is set, mcastaddr should not be set. .TP mcastaddr (udp only) This is the multicast address used by corosync executive. The default should work for most networks, but the network administrator should be queried about a multicast address to use. Avoid 224.x.x.x because this is a "config" multicast address. This may also be an IPV6 multicast address, in which case IPV6 networking will be used. If IPv6 networking is used, the nodeid field in nodelist must be specified. It's not necessary to use this option if cluster_name option is used. If both options are used, mcastaddr has higher priority. .TP mcastport (udp only) This specifies the UDP port number. It is possible to use the same multicast address on a network with the corosync services configured for different UDP ports. Please note corosync uses two UDP ports mcastport (for mcast receives) and mcastport - 1 (for mcast sends). If you have multiple clusters on the same network using the same mcastaddr please configure the mcastports with a gap. .TP ttl (udp only) This specifies the Time To Live (TTL). If you run your cluster on a routed network then the default of "1" will be too small. This option provides a way to increase this up to 255. The valid range is 0..255. 
.PP .PP Within the .B totem directive, there are seven configuration options of which one is required, five are optional, and one is required when IPV6 is configured in the interface subdirective. The required directive controls the version of the totem configuration. The optional option unless using IPV6 directive controls identification of the processor. The optional options control secrecy and authentication, the network mode of operation and maximum network MTU field. .TP version This specifies the version of the configuration file. Currently the only valid version for this directive is 2. .PP clear_node_high_bit This configuration option is optional and is only relevant when no nodeid is specified. Some corosync clients require a signed 32 bit nodeid that is greater than zero however by default corosync uses all 32 bits of the IPv4 address space when generating a nodeid. Set this option to yes to force the high bit to be zero and therefore ensure the nodeid is a positive signed 32 bit integer. WARNING: Cluster behavior is undefined if this option is enabled on only a subset of the cluster (for example during a rolling upgrade). .TP crypto_model This specifies which cryptographic library should be used by knet. Options are nss and openssl. The default is nss. .TP crypto_hash This specifies which HMAC authentication should be used to authenticate all messages. Valid values are none (no authentication), md5, sha1, sha256, sha384 and sha512. Encrypted transmission is only supported for the knet transport. The default is none. .TP crypto_cipher This specifies which cipher should be used to encrypt all messages. Valid values are none (no encryption), aes256, aes192, aes128 and 3des. Enabling crypto_cipher, requires also enabling of crypto_hash. Encrypted transmission is only supported for the knet transport. The default is none. .TP keyfile This specifies the fully qualified path to the shared key used to authenticate and encrypt data used within the Totem protocol. 
The default is /etc/corosync/authkey. .TP key Shared key stored in configuration instead of authkey file. This option has lower precedence than keyfile option so it's used only when keyfile is not specified. Using this option is not recommended for security reasons. .TP link_mode This specifies the Kronosnet mode, which may be passive, active, or rr (round-robin). .B passive: the active link with the lowest priority will be used. If one or more links share the same priority the one with the lowest link ID will be used. .B active: All active links will be used simultaneously to send traffic. link priority is ignored. .B rr: Round-Robin policy. Each packet will be sent to the next active link in order. If only one interface directive is specified, passive is automatically chosen. The maximum number of interface directives that is allowed with Kronosnet is 8. For other transports it is 1. .TP netmtu This specifies the network maximum transmit unit. To set this value beyond 1500, the regular frame MTU, requires ethernet devices that support large, or also called jumbo, frames. If any device in the network doesn't support large frames, the protocol will not operate properly. The hosts must also have their mtu size set from 1500 to whatever frame size is specified here. Please note while some NICs or switches claim large frame support, they support 9000 MTU as the maximum frame size including the IP header. Setting the netmtu and host MTUs to 9000 will cause totem to use the full 9000 bytes of the frame. Then Linux will add a 18 byte header moving the full frame size to 9018. As a result some hardware will not operate properly with this size of data. A netmtu of 8982 seems to work for the few large frame devices that have been tested. Some manufacturers claim large frame support when in fact they support frame sizes of 4500 bytes. 
When sending multicast traffic, if the network frequently reconfigures, chances are that some device in the network doesn't support large frames. Choose hardware carefully if intending to use large frame support. The default is 1500. .TP transport This directive controls the transport mechanism used. The default is knet. The transport type can also be set to udpu or udp. Only knet allows crypto or multiple interfaces per node. .TP cluster_name This specifies the name of cluster and it's used for automatic generating of multicast address. .TP config_version This specifies version of config file. This is converted to unsigned 64-bit int. By default it's 0. Option is used to prevent joining old nodes with not up-to-date configuration. If value is not 0, and node is going for first time (only for first time, join after split doesn't follow this rules) from single-node membership to multiple nodes membership, other nodes config_versions are collected. If current node config_version is not equal to highest of collected versions, corosync is terminated. .TP ip_version This specifies version of IP to ask DNS resolver for. The value can be one of .B ipv4 (look only for an IPv4 address) , .B ipv6 (check only IPv6 address) , .B ipv4-6 (first check IPv4 address, if that fails then look for an IPv6 address) and .B ipv6-4 (first check IPv6 address, if that fails then look for an IPv4 address). Default (if unspecified) is ipv6-4 for knet and udpu transports and ipv4 for udp. Knet transport allows to have a both ipv4 and ipv6 address, provided they are consistent on each link. Within the .B totem directive, there are several configuration options which are used to control the operation of the protocol. It is generally not recommended to change any of these values without proper guidance and sufficient testing. Some networks may require larger values if suffering from frequent reconfigurations. 
Some applications may require faster failure detection times which can be achieved by reducing the token timeout. .TP token This timeout is used directly or as a base for real token timeout calculation (explained in .B token_coefficient section). The token timeout specifies the time in milliseconds until a token loss is declared after not receiving a token. This is the time spent detecting a failure of a processor in the current configuration. Reforming a new configuration takes about 50 milliseconds in addition to this timeout. For real token timeout used by totem it's possible to read cmap value of .B runtime.config.totem.token key. The default is 1000 milliseconds. .TP token_warning Specifies the interval between warnings that the token has not been received. The value is a percentage of the token timeout and can be set to 0 to disable warnings. The default is 75%. .TP token_coefficient This value is used only when .B nodelist section is specified and contains at least 3 nodes. If so, real token timeout is then computed as token + (number_of_nodes - 2) * token_coefficient. This allows cluster to scale without manually changing token timeout every time new node is added. This value can be set to 0 resulting in effective removal of this feature. The default is 650 milliseconds. .TP token_retransmit This timeout specifies in milliseconds after how long before receiving a token the token is retransmitted. This will be automatically calculated if token is modified. It is not recommended to alter this value without guidance from the corosync community. The default is 238 milliseconds. .TP knet_compression_model The (optional) type of compression used by Kronosnet. The values available depend on the build and also on the available libraries. Typically zlib and lz4 will be available but bzip2 and others could also be allowed. The default is 'none'. .TP knet_compression_threshold Tells knet to NOT compress any packets that are smaller than the value indicated. Default 100 bytes. 
Set to 0 to reset to the default. Set to 1 to compress everything. .TP knet_compression_level Many compression libraries allow tuning of compression parameters. For example 0 or 1 ... 9 are commonly used to determine the level of compression. This value is passed unmodified to the compression library so it is recommended to consult the library's documentation for more detailed information. .TP hold This timeout specifies in milliseconds how long the token should be held by the representative when the protocol is under low utilization. It is not recommended to alter this value without guidance from the corosync community. The default is 180 milliseconds. .TP token_retransmits_before_loss_const This value identifies how many token retransmits should be attempted before forming a new configuration. If this value is set, retransmit and hold will be automatically calculated from retransmits_before_loss and token. The default is 4 retransmissions. .TP join This timeout specifies in milliseconds how long to wait for join messages in the membership protocol. The default is 50 milliseconds. .TP send_join This timeout specifies in milliseconds an upper range between 0 and send_join to wait before sending a join message. For configurations with less than 32 nodes, this parameter is not necessary. For larger rings, this parameter is necessary to ensure the NIC is not overflowed with join messages on formation of a new ring. A reasonable value for large rings (128 nodes) would be 80msec. Other timer values must also change if this value is changed. Seek advice from the corosync mailing list if trying to run larger configurations. The default is 0 milliseconds. .TP consensus This timeout specifies in milliseconds how long to wait for consensus to be achieved before starting a new round of membership configuration. The minimum value for consensus must be 1.2 * token. This value will be automatically calculated at 1.2 * token if the user doesn't specify a consensus value. 
For two node clusters, a consensus larger than the join timeout but less than token is safe. For three node or larger clusters, consensus should be larger than token. There is an increasing risk of odd membership changes, which still guarantee virtual synchrony, as node count grows if consensus is less than token. The default is 1200 milliseconds. .TP merge This timeout specifies in milliseconds how long to wait before checking for a partition when no multicast traffic is being sent. If multicast traffic is being sent, the merge detection happens automatically as a function of the protocol. The default is 200 milliseconds. .TP downcheck This timeout specifies in milliseconds how long to wait before checking that a network interface is back up after it has been downed. The default is 1000 milliseconds. .TP fail_recv_const This constant specifies how many rotations of the token without receiving any of the messages when messages should be received may occur before a new configuration is formed. The default is 2500 failures to receive a message. .TP seqno_unchanged_const This constant specifies how many rotations of the token without any multicast traffic should occur before the hold timer is started. The default is 30 rotations. .TP heartbeat_failures_allowed [HeartBeating mechanism] Configures the optional HeartBeating mechanism for faster failure detection. Keep in mind that engaging this mechanism in lossy networks could cause faulty loss declaration as the mechanism relies on the network for heartbeating. So as a rule of thumb use this mechanism if you require improved failure detection in low to medium utilized networks. This constant specifies the number of heartbeat failures the system should tolerate before declaring heartbeat failure e.g. 3. Also if this value is not set or is 0 then the heartbeat mechanism is not engaged in the system and token rotation is the method of failure detection. The default is 0 (disabled). 
.TP max_network_delay [HeartBeating mechanism] This constant specifies in milliseconds the approximate delay that your network takes to transport one packet from one machine to another. This value is to be set by system engineers and please don't change if not sure as this affects the failure detection mechanism using heartbeat. The default is 50 milliseconds. .TP window_size This constant specifies the maximum number of messages that may be sent on one token rotation. If all processors perform equally well, this value could be large (300), which would introduce higher latency from origination to delivery for very large rings. To reduce latency in large rings (16+), the defaults are a safe compromise. If 1 or more slow processor(s) are present among fast processors, window_size should be no larger than 256000 / netmtu to avoid overflow of the kernel receive buffers. The user is notified of this by the display of a retransmit list in the notification logs. There is no loss of data, but performance is reduced when these errors occur. The default is 50 messages. .TP max_messages This constant specifies the maximum number of messages that may be sent by one processor on receipt of the token. The max_messages parameter is limited to 256000 / netmtu to prevent overflow of the kernel transmit buffers. The default is 17 messages. .TP miss_count_const This constant defines the maximum number of times on receipt of a token a message is checked for retransmission before a retransmission occurs. This parameter is useful to modify for switches that delay multicast packets compared to unicast packets. The default setting works well for nearly all modern switches. The default is 5 messages. .TP knet_pmtud_interval How often the knet PMTUd runs to look for network MTU changes. Value in seconds, default: 30 .PP Within the .B logging directive, there are several configuration options which are all optional. 
.PP The following 3 options are valid only for the top level logging directive: .TP timestamp This specifies that a timestamp is placed on all log messages. It can be one of off (no timestamp), on (second precision timestamp) or hires (millisecond precision timestamp - only when supported by LibQB). The default is hires (or on if hires is not supported). .TP fileline This specifies that file and line should be printed. The default is off. .TP function_name This specifies that the code function name should be printed. The default is off. .TP blackbox This specifies that blackbox functionality should be enabled. The default is on. .PP The following options are valid both for top level logging directive and they can be overridden in logger_subsys entries. .TP to_stderr .TP to_logfile .TP to_syslog These specify the destination of logging output. Any combination of these options may be specified. Valid options are .B yes and .B no. The default is syslog and stderr. Please note, if you are using to_logfile and want to rotate the file, use logrotate(8) with the option .B copytruncate. eg. .ne 18 .RS .nf .ft CW /var/log/corosync.log { missingok compress notifempty daily rotate 7 copytruncate } .ft .fi .RE .TP logfile If the .B to_logfile directive is set to .B yes , this option specifies the pathname of the log file. No default. .TP logfile_priority This specifies the logfile priority for this particular subsystem. Ignored if debug is on. Possible values are: alert, crit, debug (same as debug = on), emerg, err, info, notice, warning. The default is: info. .TP syslog_facility This specifies the syslog facility type that will be used for any messages sent to syslog. options are daemon, local0, local1, local2, local3, local4, local5, local6 & local7. The default is daemon. .TP syslog_priority This specifies the syslog level for this particular subsystem. Ignored if debug is on. Possible values are: alert, crit, debug (same as debug = on), emerg, err, info, notice, warning. 
The default is: info. .TP debug This specifies whether debug output is logged for this particular logger. Also can contain value trace, what is highest level of debug information. The default is off. .PP Within the .B logging directive, logger_subsys directives are optional. .PP Within the .B logger_subsys sub-directive, all of the above logging configuration options are valid and can be used to override the default settings. The subsys entry, described below, is mandatory to identify the subsystem. .TP subsys This specifies the subsystem identity (name) for which logging is specified. This is the name used by a service in the log_init() call. E.g. 'CPG'. This directive is required. .PP Within the .B quorum directive it is possible to specify the quorum algorithm to use with the .TP provider directive. At the time of writing only corosync_votequorum is supported. See votequorum(5) for configuration options. .PP Within the .B nodelist directive it is possible to specify specific information about nodes in cluster. Directive can contain only .B node sub-directive, which specifies every node that should be a member of the membership, and where non-default options are needed. Every node must have at least ring0_addr field filled. Every node that should be a member of the membership must be specified. Possible options are: .TP ringX_addr This specifies IP or network hostname address of the particular node. X is a link number. .TP nodeid This configuration option is required for each node for Kronosnet mode. It is a 32 bit value specifying the node identifier delivered to the cluster membership service. The node identifier value of zero is reserved and should not be used. If knet is set, this field must be set. .TP name This option is used mainly with knet transport to identify local node. It's also used by client software (pacemaker). Algorithm for identifying local node is following: .RS .IP 1. Looks up $HOSTNAME in the nodelist .IP 2. 
If this fails strip the domain name from $HOSTNAME and looks up that in the nodelist .IP 3. If this fails look in the nodelist for a fully-qualified name whose short version matches the short version of $HOSTNAME .IP 4. If all this fails then search the interfaces list for an address that matches a name in the nodelist .RE .PP Within the .B system directive it is possible to specify system options. Possible options are: .TP qb_ipc_type This specifies type of IPC to use. Can be one of native (default), shm and socket. Native means one of shm or socket, depending on what is supported by OS. On systems with support for both, SHM is selected. SHM is generally faster, but needs to allocate a ring buffer file in /dev/shm. .TP sched_rr Should be set to yes (default) if corosync should try to set round robin realtime scheduling with maximal priority to itself. When setting of scheduler fails, fallback to set maximal priority. .TP priority Set priority of corosync process. Valid only when sched_rr is set to no. Can be either numeric value with similar meaning as .BR nice (1) or .B max / .B min meaning maximal / minimal priority (so minimal / maximal nice value). .TP move_to_root_cgroup Should be set to yes (default) if corosync should try to move itself to root cgroup. This feature is available only for systems with cgroups with RT sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option). .TP -run_dir +state_dir Existing directory where corosync should chdir into. Corosync stores important state files and blackboxes there. The default is /var/lib/corosync. .PP Within the .B resources directive it is possible to specify options for resources. Possible option is: .TP watchdog_device (Valid only if Corosync was compiled with watchdog support.) .br Watchdog device to use, for example /dev/watchdog. If unset, empty or "off", no watchdog is used. .IP In a cluster with properly configured power fencing a watchdog provides no additional value. 
On the other hand, slow watchdog communication may incur multi-second delays in the Corosync main loop, potentially breaking down membership. IPMI watchdogs are particularly notorious in this regard: read about kipmid_max_busy_us in IPMI.txt in the Linux kernel documentation. .SH "TO ADD A NEW NODE TO THE CLUSTER" For example to add a node with address 10.24.38.108 with nodeid 3. The node has the name NEW (in DNS or /etc/hosts) and is not currently running corosync. The current corosync.conf nodelist looks like this: .PP .nf .RS nodelist { node { nodeid: 1 ring0_addr: 10.24.38.101 name: node1 } node { nodeid: 2 ring0_addr: 10.24.38.102 name: node2 } } .RE .fi .PP Add a new entry for the node below the existing nodes. Node entries don't have to be in nodeid order, but it will help keep you sane. So the nodelist now looks like this: .PP .nf .RS nodelist { node { nodeid: 1 ring0_addr: 10.24.38.101 name: node1 } node { nodeid: 2 ring0_addr: 10.24.38.102 name: node2 } node { nodeid: 3 ring0_addr: 10.24.38.108 name: NEW } } .RE .fi .PP .PP This file must then be copied onto all three nodes - the existing two nodes, and the new one. On one of the existing corosync nodes, tell corosync to re-read the updated config file into memory: .PP .nf .RS corosync-cfgtool -R .RE .fi .PP This command only needs to be run on one node in the cluster. You may then start corosync on the NEW node and it should join the cluster. If this doesn't work as expected then check the communications between all three nodes is working, and check the syslog files on all nodes for more information. It's important to note that the key bit of information about a node failing to join might be on a different node than you expect. .SH "TO REMOVE A NODE FROM THE CLUSTER" This is the reverse procedure to 'Adding a node' above. First you need to shut down the node you will be removing from the cluster. 
.PP .nf .RS corosync-cfgtool -H .RE .fi .PP Then delete the nodelist stanza from corosync.conf and finally update corosync on the remaining nodes by running .PP .nf .RS corosync-cfgtool -R .RE .fi .TP on one of them. .SH "ADDRESS RESOLUTION" corosync resolves ringX_addr names/IP addresses using the getaddrinfo(3) call with respect of totem.ip_version setting. getaddrinfo() function uses a sophisticated algorithm to sort node addresses into a preferred order and corosync always chooses the first address in that list of the required family. As such it is essential that your DNS or /etc/hosts files are correctly configured so that all addresses for ringX appear on the same network (or are reachable with minimal hops) and over the same IP protocol. If this is not the case then some nodes might not be able to join the cluster. It is possible to override the search order used by getaddrinfo() using the configuration file /etc/gai.conf(5) if necessary, but this is not recommended. If there is any doubt about the order of addresses returned from getaddrinfo() then it might be simpler to use IP addresses (v4 or v6) in the ringX_addr field. .SH "FILES" .TP /etc/corosync/corosync.conf The corosync executive configuration file. .SH "SEE ALSO" .BR corosync_overview (7), .BR votequorum (5), .BR corosync-qdevice (8), .BR logrotate (8) .BR getaddrinfo (3) .BR gai.conf (5) .PP diff --git a/man/votequorum.5 b/man/votequorum.5 index 0700228d..23dcc6de 100644 --- a/man/votequorum.5 +++ b/man/votequorum.5 @@ -1,409 +1,410 @@ .\"/* .\" * Copyright (c) 2012-2014 Red Hat, Inc. .\" * .\" * All rights reserved. .\" * .\" * Authors: Christine Caulfield .\" * Fabio M. 
Di Nitto .\" * .\" * This software licensed under BSD license, the text of which follows: .\" * .\" * Redistribution and use in source and binary forms, with or without .\" * modification, are permitted provided that the following conditions are met: .\" * .\" * - Redistributions of source code must retain the above copyright notice, .\" * this list of conditions and the following disclaimer. .\" * - Redistributions in binary form must reproduce the above copyright notice, .\" * this list of conditions and the following disclaimer in the documentation .\" * and/or other materials provided with the distribution. .\" * - Neither the name of the MontaVista Software, Inc. nor the names of its .\" * contributors may be used to endorse or promote products derived from this .\" * software without specific prior written permission. .\" * .\" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" .\" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE .\" * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF .\" * THE POSSIBILITY OF SUCH DAMAGE. .\" */ -.TH VOTEQUORUM 5 2012-01-24 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" +.TH VOTEQUORUM 5 2018-12-14 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" .SH NAME votequorum \- Votequorum Configuration Overview .SH OVERVIEW The votequorum service is part of the corosync project. 
This service can be optionally loaded into the nodes of a corosync cluster to avoid split-brain situations. It does this by having a number of votes assigned to each system in the cluster and ensuring that only when a majority of the votes are present, cluster operations are allowed to proceed. The service must be loaded into all nodes or none. If it is loaded into a subset of cluster nodes the results will be unpredictable. .PP The following corosync.conf extract will enable votequorum service within corosync: .PP .nf quorum { provider: corosync_votequorum } .fi .PP votequorum reads its configuration from corosync.conf. Some values can be changed at runtime, others are only read at corosync startup. It is very important that those values are consistent across all the nodes participating in the cluster or votequorum behavior will be unpredictable. .PP votequorum requires an expected_votes value to function, this can be provided in two ways. The number of expected votes will be automatically calculated when the nodelist { } section is present in corosync.conf or expected_votes can be specified in the quorum { } section. Lack of both will disable votequorum. If both are present at the same time, the quorum.expected_votes value will override the one calculated from the nodelist. .PP Example (no nodelist) of an 8 node cluster (each node has 1 vote): .nf quorum { provider: corosync_votequorum expected_votes: 8 } .fi .PP Example (with nodelist) of a 3 node cluster (each node has 1 vote): .nf quorum { provider: corosync_votequorum } nodelist { node { ring0_addr: 192.168.1.1 } node { ring0_addr: 192.168.1.2 } node { ring0_addr: 192.168.1.3 } } .fi .SH SPECIAL FEATURES .PP .B two_node: 1 .PP Enables two node cluster operations (default: 0). .PP The "two node cluster" is a use case that requires special consideration. With a standard two node cluster, each node with a single vote, there are 2 votes in the cluster. 
Using the simple majority calculation (50% of the votes + 1) to calculate quorum, the quorum would be 2. This means that both nodes would always have to be alive for the cluster to be quorate and operate. .PP Enabling two_node: 1, quorum is set artificially to 1. .PP Example configuration 1: .nf quorum { provider: corosync_votequorum expected_votes: 2 two_node: 1 } .fi .PP Example configuration 2: .nf quorum { provider: corosync_votequorum two_node: 1 } nodelist { node { ring0_addr: 192.168.1.1 } node { ring0_addr: 192.168.1.2 } } .fi .PP NOTES: enabling two_node: 1 automatically enables wait_for_all. It is still possible to override wait_for_all by explicitly setting it to 0. If more than 2 nodes join the cluster, the two_node option is automatically disabled. .PP .B wait_for_all: 1 .PP Enables Wait For All (WFA) feature (default: 0). .PP The general behaviour of votequorum is to switch a cluster from inquorate to quorate as soon as possible. For example, in an 8 node cluster, where every node has 1 vote, expected_votes is set to 8 and quorum is (50% + 1) 5. As soon as 5 (or more) nodes are visible to each other, the partition of 5 (or more) becomes quorate and can start operating. .PP When WFA is enabled, the cluster will be quorate for the first time only after all nodes have been visible at least once at the same time. .PP This feature has the advantage of avoiding some startup race conditions, with the cost that all nodes need to be up at the same time at least once before the cluster can operate. .PP A common startup race condition based on the above example is that as soon as 5 nodes become quorate, with the other 3 still offline, the remaining 3 nodes will be fenced. .PP It is very useful when combined with last_man_standing (see below). 
.PP Example configuration: .nf quorum { provider: corosync_votequorum expected_votes: 8 wait_for_all: 1 } .fi .PP .B last_man_standing: 1 / .B last_man_standing_window: 10000 .PP Enables Last Man Standing (LMS) feature (default: 0). Tunable last_man_standing_window (default: 10 seconds, expressed in ms). .PP The general behaviour of votequorum is to set expected_votes and quorum at startup (unless modified by the user at runtime, see below) and use those values during the whole lifetime of the cluster. .PP Using for example an 8 node cluster where each node has 1 vote, expected_votes is set to 8 and quorum to 5. This condition allows a total failure of 3 nodes. If a 4th node fails, the cluster becomes inquorate and it will stop providing services. .PP Enabling LMS allows the cluster to dynamically recalculate expected_votes and quorum under specific circumstances. It is essential to enable WFA when using LMS in High Availability clusters. .PP Using the above 8 node cluster example, with LMS enabled the cluster can retain quorum and continue operating by losing, in a cascade fashion, up to 6 nodes with only 2 remaining active. .PP Example chain of events: .nf 1) cluster is fully operational with 8 nodes. (expected_votes: 8 quorum: 5) 2) 3 nodes die, cluster is quorate with 5 nodes. 3) after last_man_standing_window timer expires, expected_votes and quorum are recalculated. (expected_votes: 5 quorum: 3) 4) at this point, 2 more nodes can die and cluster will still be quorate with 3. 5) once again, after last_man_standing_window timer expires expected_votes and quorum are recalculated. (expected_votes: 3 quorum: 2) 6) at this point, 1 more node can die and cluster will still be quorate with 2. 7) one more last_man_standing_window timer (expected_votes: 2 quorum: 2) .fi .PP NOTES: In order for the cluster to downgrade automatically from 2 nodes to a 1 node cluster, the auto_tie_breaker feature must also be enabled (see below). 
If auto_tie_breaker is not enabled, and one more failure occurs, the remaining node will not be quorate. LMS does not work with asymmetric voting schemes, each node must vote 1. LMS is also incompatible with quorum devices, if last_man_standing is specified in corosync.conf then the quorum device will be disabled. .PP Example configuration 1: .nf quorum { provider: corosync_votequorum expected_votes: 8 last_man_standing: 1 } .fi .PP Example configuration 2 (increase timeout to 20 seconds): .nf quorum { provider: corosync_votequorum expected_votes: 8 last_man_standing: 1 last_man_standing_window: 20000 } .fi .PP .B auto_tie_breaker: 1 .PP Enables Auto Tie Breaker (ATB) feature (default: 0). .PP The general behaviour of votequorum allows a simultaneous node failure up to 50% - 1 node, assuming each node has 1 vote. .PP When ATB is enabled, the cluster can suffer up to 50% of the nodes failing at the same time, in a deterministic fashion. By default the cluster partition, or the set of nodes that are still in contact with the node that has the lowest nodeid will remain quorate. The other nodes will be inquorate. This behaviour can be changed by also specifying .PP .B auto_tie_breaker_node: lowest|highest| .PP \'lowest' is the default, 'highest' is similar in that if the current set of nodes contains the highest nodeid then it will remain quorate. Alternatively it is possible to specify a particular node ID or list of node IDs that will be required to maintain quorum. If a (space-separated) list is given, the nodes are evaluated in order, so if the first node is present then it will be used to determine the quorate partition, if that node is not in either half (ie was not in the cluster before the split) then the second node ID will be checked for and so on. ATB is incompatible with quorum devices - if auto_tie_breaker is specified in corosync.conf then the quorum device will be disabled. 
.PP Example configuration 1: .nf quorum { provider: corosync_votequorum expected_votes: 8 auto_tie_breaker: 1 auto_tie_breaker_node: lowest } .fi .PP Example configuration 2: .nf quorum { provider: corosync_votequorum expected_votes: 8 auto_tie_breaker: 1 auto_tie_breaker_node: 1 3 5 } .PP .fi .PP .B allow_downscale: 1 .PP Enables allow downscale (AD) feature (default: 0). .PP THIS FEATURE IS INCOMPLETE AND CURRENTLY UNSUPPORTED. .PP The general behaviour of votequorum is to never decrease expected votes or quorum. .PP When AD is enabled, both expected votes and quorum are recalculated when a node leaves the cluster in a clean state (normal corosync shutdown process) down to the configured expected_votes. .PP Example use case: .PP .nf 1) N node cluster (where N is any value higher than 3) 2) expected_votes set to 3 in corosync.conf 3) only 3 nodes are running 4) admin needs to increase processing power and adds 10 nodes 5) internal expected_votes is automatically set to 13 6) minimum expected_votes is 3 (from configuration) - up to this point this is standard votequorum behaviour - 7) once the work is done, admin wants to remove nodes from the cluster 8) using an ordered shutdown the admin can reduce the cluster size automatically back to 3, but not below 3, where normal quorum operation will work as usual. .fi .PP Example configuration: .nf quorum { provider: corosync_votequorum expected_votes: 3 allow_downscale: 1 } .fi allow_downscale implicitly enables EVT (see below). .PP .B expected_votes_tracking: 1 .PP Enables Expected Votes Tracking (EVT) feature (default: 0). .PP Expected Votes Tracking stores the highest-seen value of expected votes on disk and uses that as the minimum value for expected votes in the absence of any higher authority (e.g. a current quorate cluster). This is useful for when a group of nodes becomes detached from the main cluster and after a restart could have enough votes to provide quorum, which can happen after using allow_downscale.
.PP Note that even if the in-memory version of expected_votes is reduced, e.g. by removing nodes or using corosync-quorumtool, the stored value will still be the highest value seen - it never gets reduced. .PP -The value is held in the file /var/lib/corosync/ev_tracking which can be deleted if you +The value is held in the file ev_tracking (stored in the directory configured in system.state_dir +or /var/lib/corosync/ when unset) which can be deleted if you really do need to reduce the expected votes for any reason, such as the node having been moved to a different cluster. .PP .fi .PP .SH VARIOUS NOTES .PP * WFA / LMS / ATB / AD can be used in combination. .PP * In order to change the default votes for a node there are two options: .nf 1) nodelist: nodelist { node { ring0_addr: 192.168.1.1 quorum_votes: 3 } .... } 2) quorum section (deprecated): quorum { provider: corosync_votequorum expected_votes: 2 votes: 2 } .fi In the event that both nodelist and quorum { votes: } are defined, the value from the nodelist will be used. .PP * Only votes, quorum_votes, expected_votes and two_node can be changed at runtime. Everything else requires a cluster restart. .SH BUGS No known bugs at the time of writing. The authors are from outer space. Deal with it. .SH "SEE ALSO" .BR corosync (8), .BR corosync.conf (5), .BR corosync-quorumtool (8), .BR corosync-qdevice (8), .BR votequorum_overview (3) .PP