diff --git a/configure.ac b/configure.ac index f37506f..3756126 100644 --- a/configure.ac +++ b/configure.ac @@ -1,99 +1,77 @@ dnl dnl autoconf for Agents dnl dnl License: GNU General Public License (GPL) dnl =============================================== dnl Bootstrap dnl =============================================== AC_PREREQ(2.63) dnl Suggested structure: dnl information on the package dnl checks for programs dnl checks for libraries dnl checks for header files dnl checks for types dnl checks for structures dnl checks for compiler characteristics dnl checks for library functions dnl checks for system services AC_INIT([sbd], [1.0], [lmb@suse.com]) AC_CANONICAL_HOST AC_CONFIG_AUX_DIR(.) AC_CONFIG_HEADERS(config.h) AM_INIT_AUTOMAKE AM_PROG_CC_C_O PKG_CHECK_MODULES(glib, [glib-2.0]) PKG_CHECK_MODULES(libcoroipcc, [libcoroipcc]) dnl pacemaker > 1.1.8 PKG_CHECK_MODULES(pacemaker, [pacemaker, pacemaker-cib], HAVE_pacemaker=1, HAVE_pacemaker=0) dnl pacemaker <= 1.1.8 PKG_CHECK_MODULES(pcmk, [pcmk, pcmk-cib], HAVE_pcmk=1, HAVE_pcmk=0) if test $HAVE_pacemaker = 0 -a $HAVE_pcmk = 0; then AC_MSG_ERROR(No package 'pacemaker' found) elif test $HAVE_pacemaker = 1; then CFLAGS="$CFLAGS $glib_CFLAGS $pacemaker_CFLAGS" else dnl Deal with the wrong 'includedir' in pcmk.pc from pacemaker < 1.1.8 pcmk_CFLAGS="-I${prefix}/include/pacemaker -I${prefix}/include/heartbeat" CFLAGS="$CFLAGS $glib_CFLAGS $pcmk_CFLAGS" fi PKG_CHECK_MODULES(libxml, [libxml-2.0]) dnl checks for libraries AC_CHECK_LIB(aio, io_setup, , missing="yes") AC_CHECK_LIB(plumbgpl, init_set_proc_title, , missing="yes") AC_CHECK_LIB(crmcommon, set_crm_log_level, , missing="yes") AC_CHECK_LIB(cib, cib_new, , missing="yes") AC_CHECK_LIB(pe_status, pe_find_node, , missing="yes") AC_CHECK_LIB(pe_rules, test_rule, , missing="yes") AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes") -if test "$missing" = "yes"; then - AC_MSG_ERROR([Missing required libraries or functions.]) -fi - -dnl pacemaker < 1.1.8 
-AC_CHECK_HEADERS(pacemaker/crm/cluster/stack.h) - -dnl pacemaker < 1.1.8 -AC_CHECK_HEADERS(pacemaker/crm/common/cluster.h) - dnl pacemaker >= 1.1.8 AC_CHECK_HEADERS(pacemaker/crm/cluster.h) +AC_CHECK_LIB(crmcommon, pcmk_strerror, , missing="yes") +AC_CHECK_LIB(cib, cib_apply_patch_event, , missing="yes") -dnl pacemaker >= 1.1.8 -AC_CHECK_LIB(crmcommon, pcmk_strerror) -if test $ac_cv_lib_crmcommon_pcmk_strerror = yes; then - AC_DEFINE_UNQUOTED(HAVE_PCMK_STRERROR, 1, pacemaker has pcmk_strerror) -fi - -dnl pacemaker >= 1.1.8 -AC_CHECK_LIB(cib, cib_apply_patch_event) -if test $ac_cv_lib_cib_cib_apply_patch_event = yes; then - AC_DEFINE_UNQUOTED(HAVE_CIB_APPLY_PATCH_EVENT, 1, pacemaker has cib_apply_patch_event) -fi - -dnl pacemaker < 1.1.8 -AC_CHECK_LIB(crmcluster, init_ais_connection_once) -if test $ac_cv_lib_crmcluster_init_ais_connection_once = yes; then - AC_DEFINE_UNQUOTED(HAVE_INIT_AIS_CONNECTION_ONCE, 1, pacemaker has init_ais_connection_once) +if test "$missing" = "yes"; then + AC_MSG_ERROR([Missing required libraries or functions.]) fi AC_PATH_PROGS(POD2MAN, pod2man, pod2man) dnl The Makefiles and shell scripts we output AC_CONFIG_FILES([Makefile src/Makefile agent/Makefile man/Makefile]) dnl Now process the entire list of files added by previous dnl calls to AC_CONFIG_FILES() AC_OUTPUT() diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c index 3d61f78..160c304 100644 --- a/src/sbd-pacemaker.c +++ b/src/sbd-pacemaker.c @@ -1,573 +1,518 @@ /* * Copyright (C) 2012 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. 
* * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* TODO list: * * - Trying to shutdown a node if no devices are up will fail, since SBD * currently uses a message via the disk to achieve this. * * - Shutting down cluster nodes while the majority of devices is down * will eventually take the cluster below the quorum threshold, at which * time the remaining cluster nodes will all immediately suicide. * * - With the CIB refreshed every timeout_loop seconds, do we still need * to watch for CIB update notifications or can that be removed? * */ #include "sbd.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CHECK_AIS # include #endif #include #include static void clean_up(int rc); static void crm_diff_update(const char *event, xmlNode * msg); static gboolean mon_refresh_state(gpointer user_data); static int cib_connect(gboolean full); static void set_pcmk_health(int healthy); static void notify_parent(void); static GMainLoop *mainloop = NULL; static guint timer_id_reconnect = 0; static guint timer_id_notify = 0; static int reconnect_msec = 1000; static int pcmk_healthy = 0; static int cib_connected = 0; #ifdef CHECK_AIS static guint timer_id_ais = 0; static enum cluster_type_e cluster_stack = pcmk_cluster_unknown; static int local_id = 0; static struct timespec t_last_quorum; #endif #define LOGONCE(state, lvl, fmt, args...) 
do { \ if (last_state != state) { \ cl_log(lvl, fmt, ##args); \ last_state = state; \ } \ } while(0) static cib_t *cib = NULL; static xmlNode *current_cib = NULL; static long last_refresh = 0; static crm_trigger_t *refresh_trigger = NULL; static gboolean mon_timer_reconnect(gpointer data) { int rc = 0; if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); } rc = cib_connect(TRUE); if (rc != 0) { cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); /* set_pcmk_health(0); */ } else { cl_log(LOG_INFO, "CIB reconnect successful"); } return FALSE; } static void mon_cib_connection_destroy(gpointer user_data) { if (cib) { cl_log(LOG_WARNING, "Disconnected from CIB"); /* set_pcmk_health(0); */ cib->cmds->signoff(cib); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } cib_connected = 0; return; } static gboolean mon_timer_notify(gpointer data) { if (timer_id_notify > 0) { g_source_remove(timer_id_notify); } if (cib_connected) { /* TODO - do we really want to do this every loop interval? Lets * check how much CPU that takes ... */ if (1) { free_xml(current_cib); current_cib = get_cib_copy(cib); mon_refresh_state(NULL); } else { notify_parent(); } } timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); return FALSE; } /* * Mainloop signal handler. 
*/ static void mon_shutdown(int nsig) { clean_up(0); } static int cib_connect(gboolean full) { int rc = 0; -#if !HAVE_PCMK_STRERROR - CRM_CHECK(cib != NULL, return cib_missing); -#else CRM_CHECK(cib != NULL, return -EINVAL); -#endif cib_connected = 0; if (cib->state != cib_connected_query && cib->state != cib_connected_command) { rc = cib->cmds->signon(cib, crm_system_name, cib_query); if (rc != 0) { return rc; } current_cib = get_cib_copy(cib); mon_refresh_state(NULL); if (full) { if (rc == 0) { rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy); -#if !HAVE_PCMK_STRERROR - if (rc == cib_NOTSUPPORTED) { -#else if (rc == -EPROTONOSUPPORT) { -#endif /* Notification setup failed, won't be able to reconnect after failure */ rc = 0; } } if (rc == 0) { cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); } if (rc != 0) { /* Notification setup failed, could not monitor CIB actions */ clean_up(-rc); } } } if (!rc) { cib_connected = 1; } return rc; } #ifdef CHECK_AIS static gboolean mon_timer_ais(gpointer data) { if (timer_id_ais > 0) { g_source_remove(timer_id_ais); } send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais); /* The timer is set in the response processing */ return FALSE; } static void ais_membership_destroy(gpointer user_data) { cl_log(LOG_ERR, "AIS connection terminated - corosync down?"); ais_fd_sync = -1; /* TODO: Is recovery even worth it here? After all, this means * that corosync died ... 
*/ exit(1); } static void ais_membership_dispatch(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { uint32_t kind = 0; const char *from = NULL; char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from); if (!data) { return; } free(data); data = NULL; switch (kind) { case crm_class_quorum: break; default: return; break; } DBGLOG(LOG_INFO, "AIS quorum state: %d", (int)crm_have_quorum); clock_gettime(CLOCK_MONOTONIC, &t_last_quorum); timer_id_ais = g_timeout_add(timeout_loop * 1000, mon_timer_ais, NULL); return; } #endif static int compute_status(pe_working_set_t * data_set) { static int updates = 0; static int last_state = 0; int healthy = 0; node_t *dc = NULL; struct timespec t_now; updates++; dc = data_set->dc_node; clock_gettime(CLOCK_MONOTONIC, &t_now); if (dc == NULL) { /* Means we don't know if we have quorum. Hrm. Probably needs to * allow for this state for a period of time and then decide * that we don't have quorum - TODO - should we skip * notifying the parent? */ LOGONCE(1, LOG_INFO, "We don't have a DC right now."); goto out; } else { const char *cib_quorum = crm_element_value(data_set->input, XML_ATTR_HAVE_QUORUM); if (crm_is_true(cib_quorum)) { DBGLOG(LOG_INFO, "CIB: We have quorum!"); } else { LOGONCE(3, LOG_WARNING, "CIB: We do NOT have quorum!"); goto out; } } #ifdef CHECK_AIS int quorum_age = t_now.tv_sec - t_last_quorum.tv_sec; if (quorum_age > (int)(timeout_io+timeout_loop)) { if (t_last_quorum.tv_sec != 0) LOGONCE(2, LOG_WARNING, "AIS: Quorum outdated!"); goto out; } if (crm_have_quorum) { DBGLOG(LOG_INFO, "AIS: We have quorum!"); } else { LOGONCE(8, LOG_WARNING, "AIS: We do NOT have quorum!"); goto out; } #endif node_t *node = pe_find_node(data_set->nodes, local_uname); if (node->details->unclean) { LOGONCE(4, LOG_WARNING, "Node state: UNCLEAN"); goto out; } else if (node->details->pending) { LOGONCE(5, LOG_WARNING, "Node state: pending"); /* TODO ? 
*/ } else if (node->details->online) { LOGONCE(6, LOG_INFO, "Node state: online"); healthy = 1; } else { LOGONCE(7, LOG_WARNING, "Node state: UNKNOWN"); goto out; } out: set_pcmk_health(healthy); return 0; } static void set_pcmk_health(int healthy) { pcmk_healthy = healthy; notify_parent(); } static void notify_parent(void) { pid_t ppid; union sigval signal_value; memset(&signal_value, 0, sizeof(signal_value)); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ cl_log(LOG_WARNING, "Our parent is dead."); do_reset(); } if (pcmk_healthy) { DBGLOG(LOG_INFO, "Notifying parent: healthy"); sigqueue(ppid, SIG_LIVENESS, signal_value); } else { DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY"); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); } } static void crm_diff_update(const char *event, xmlNode * msg) { int rc = -1; long now = time(NULL); const char *op = NULL; -#if !HAVE_CIB_APPLY_PATCH_EVENT - unsigned int log_level = LOG_INFO; - - xmlNode *diff = NULL; - xmlNode *cib_last = NULL; - - if (msg == NULL) { - crm_err("NULL update"); - return; - } - - crm_element_value_int(msg, F_CIB_RC, &rc); - op = crm_element_value(msg, F_CIB_OPERATION); - diff = get_message_xml(msg, F_CIB_UPDATE_RESULT); - - if (rc < 0) { - log_level = LOG_WARNING; -# if !HAVE_PCMK_STRERROR - cl_log(log_level, "[%s] %s ABORTED: %s", event, op, cib_error2string(rc)); -# else - cl_log(log_level, "[%s] %s ABORTED: %s", event, op, pcmk_strerror(rc)); -# endif - return; - } - - if (current_cib != NULL) { - cib_last = current_cib; - current_cib = NULL; - rc = cib_process_diff(op, cib_force_diff, NULL, NULL, diff, cib_last, &current_cib, NULL); - - if (rc != 0) { -# if !HAVE_PCMK_STRERROR - crm_debug("Update didn't apply, requesting full copy: %s", cib_error2string(rc)); -# else - crm_debug("Update didn't apply, requesting full copy: %s", pcmk_strerror(rc)); -# endif - free_xml(current_cib); - current_cib = NULL; - } - } - free_xml(cib_last); -#else if (current_cib 
!= NULL) { xmlNode *cib_last = current_cib; current_cib = NULL; rc = cib_apply_patch_event(msg, cib_last, &current_cib, LOG_DEBUG); free_xml(cib_last); switch(rc) { case pcmk_err_diff_resync: case pcmk_err_diff_failed: crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc); case pcmk_ok: break; default: crm_warn("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc); return; } } -#endif if (current_cib == NULL) { current_cib = get_cib_copy(cib); } if ((now - last_refresh) > (reconnect_msec / 1000)) { /* Force a refresh */ mon_refresh_state(NULL); } else { mainloop_set_trigger(refresh_trigger); } } static gboolean mon_refresh_state(gpointer user_data) { xmlNode *cib_copy = copy_xml(current_cib); pe_working_set_t data_set; last_refresh = time(NULL); if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) { cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB"); if (cib) { cib->cmds->signoff(cib); } } else { set_working_set_defaults(&data_set); data_set.input = cib_copy; cluster_status(&data_set); compute_status(&data_set); cleanup_calculations(&data_set); } return TRUE; } static void clean_up(int rc) { if (cib != NULL) { cib->cmds->signoff(cib); cib_delete(cib); cib = NULL; } if (rc >= 0) { exit(rc); } return; } int servant_pcmk(const char *diskname, const void* argp) { int exit_code = 0; crm_cluster_t crm_cluster; cl_log(LOG_INFO, "Monitoring Pacemaker health"); set_proc_title("sbd: watcher: Pacemaker"); /* We don't want any noisy crm messages */ set_crm_log_level(LOG_CRIT); #ifdef CHECK_AIS cluster_stack = get_cluster_type(); if (cluster_stack != pcmk_cluster_classic_ais) { cl_log(LOG_ERR, "SBD currently only supports legacy AIS for quorum state poll"); /* TODO: Wonder if that's still true with the new code? * Should be merged completely, right? 
*/ } if(is_openais_cluster()) { crm_cluster.destroy = ais_membership_destroy; crm_cluster.cpg.cpg_deliver_fn = ais_membership_dispatch; /* crm_cluster.cpg.cpg_confchg_fn = pcmk_cpg_membership; TODO? */ crm_cluster.cpg.cpg_confchg_fn = NULL; } while (!crm_cluster_connect(&crm_cluster)) { cl_log(LOG_INFO, "Waiting to sign in with cluster ..."); sleep(reconnect_msec / 1000); } #endif if (current_cib == NULL) { cib = cib_new(); do { exit_code = cib_connect(TRUE); if (exit_code != 0) { sleep(reconnect_msec / 1000); } -#if !HAVE_PCMK_STRERROR - } while (exit_code == cib_connection); -#else } while (exit_code == -ENOTCONN); -#endif if (exit_code != 0) { clean_up(-exit_code); } } mainloop = g_main_new(FALSE); mainloop_add_signal(SIGTERM, mon_shutdown); mainloop_add_signal(SIGINT, mon_shutdown); refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, NULL); timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); #ifdef CHECK_AIS timer_id_ais = g_timeout_add(timeout_loop * 1000, mon_timer_ais, NULL); #endif g_main_run(mainloop); g_main_destroy(mainloop); clean_up(0); return 0; /* never reached */ }