diff --git a/configure.ac b/configure.ac index a88d8af8..6675cee1 100644 --- a/configure.ac +++ b/configure.ac @@ -1,795 +1,797 @@ # -*- Autoconf -*- # Process this file with autoconf to produce a configure script. # bootstrap / init AC_PREREQ([2.61]) AC_INIT([corosync], m4_esyscmd([build-aux/git-version-gen .tarball-version .gitarchivever]), [users@clusterlabs.org]) AC_USE_SYSTEM_EXTENSIONS AM_INIT_AUTOMAKE([foreign 1.11]) LT_PREREQ([2.2.6]) LT_INIT AM_SILENT_RULES([yes]) AC_CONFIG_SRCDIR([lib/cpg.c]) AC_CONFIG_HEADER([include/corosync/config.h]) AC_CONFIG_MACRO_DIR([m4]) AC_CANONICAL_HOST AC_LANG([C]) AC_SUBST(WITH_LIST, [""]) #Enable inter-library dependencies AC_ARG_ENABLE(interlib-deps, [AC_HELP_STRING([--disable-interlib-deps ],[disable inter-library dependencies (might break builds)])], [enable_interlib_deps="$enableval"], [enable_interlib_deps="yes"]) AC_MSG_NOTICE([enable inter-library dependencies: $enable_interlib_deps]) if test "${enable_interlib_deps}" == "yes"; then link_all_deplibs=yes link_all_deplibs_CXX=yes else link_all_deplibs=no link_all_deplibs_CXX=no fi dnl Fix default variables - "prefix" variable if not specified if test "$prefix" = "NONE"; then prefix="/usr" dnl Fix "localstatedir" variable if not specified if test "$localstatedir" = "\${prefix}/var"; then localstatedir="/var" fi dnl Fix "sysconfdir" variable if not specified if test "$sysconfdir" = "\${prefix}/etc"; then sysconfdir="/etc" fi dnl Fix "libdir" variable if not specified if test "$libdir" = "\${exec_prefix}/lib"; then if test -e /usr/lib64; then libdir="/usr/lib64" else libdir="/usr/lib" fi fi fi if test "$srcdir" = "."; then AC_MSG_NOTICE([building in place srcdir:$srcdir]) AC_DEFINE([BUILDING_IN_PLACE], 1, [building in place]) else AC_MSG_NOTICE([building out of tree srcdir:$srcdir]) fi # Checks for programs. # check stolen from gnulib/m4/gnu-make.m4 if ! ${MAKE-make} --version /cannot/make/this >/dev/null 2>&1; then AC_MSG_ERROR([you don't seem to have GNU make; it is required]) fi sinclude(corosync-default.m4) AC_PROG_CC AC_PROG_CC_C99 if test "x$ac_cv_prog_cc_c99" = "xno"; then AC_MSG_ERROR(["C99 support is required"]) fi AC_PROG_INSTALL AC_PROG_LN_S AC_PROG_MAKE_SET AC_PROG_SED AC_CHECK_PROGS([GROFF], [groff]) AC_CHECK_PROGS([PKGCONFIG], [pkg-config]) AC_CHECK_PROGS([AUGTOOL], [augtool]) AC_CHECK_PROGS([DOT], [dot]) AC_CHECK_PROGS([DOXYGEN], [doxygen]) AC_CHECK_PROGS([AWK], [awk]) AC_PATH_PROG([BASHPATH], [bash]) # Checks for compiler characteristics. AC_PROG_GCC_TRADITIONAL AC_C_CONST AC_C_INLINE AC_C_VOLATILE # Checks for header files. AC_HEADER_DIRENT AC_HEADER_STDC AC_HEADER_SYS_WAIT AC_CHECK_HEADERS([arpa/inet.h fcntl.h limits.h netdb.h netinet/in.h stdint.h \ stdlib.h string.h sys/ioctl.h sys/param.h sys/socket.h \ sys/time.h syslog.h unistd.h sys/types.h getopt.h malloc.h \ utmpx.h ifaddrs.h stddef.h sys/file.h sys/uio.h]) # Check entries in specific structs AC_CHECK_MEMBER([struct sockaddr_in.sin_len], [AC_DEFINE_UNQUOTED([HAVE_SOCK_SIN_LEN], [1], [sockaddr_in needs sin_len])], [], [[#include ]]) AC_CHECK_MEMBER([struct sockaddr_in6.sin6_len], [AC_DEFINE_UNQUOTED([HAVE_SOCK_SIN6_LEN], [1], [sockaddr_in6 needs sin6_len])], [], [[#include ]]) AC_CHECK_MEMBER([struct msghdr.msg_control], [AC_DEFINE_UNQUOTED([HAVE_MSGHDR_CONTROL], [1], [msghdr has msg_control])], [], [[#include ]]) AC_CHECK_MEMBER([struct msghdr.msg_controllen], [AC_DEFINE_UNQUOTED([HAVE_MSGHDR_CONTROLLEN], [1], [msghdr has msg_controllen])], [], [[#include ]]) AC_CHECK_MEMBER([struct msghdr.msg_flags], [AC_DEFINE_UNQUOTED([HAVE_MSGHDR_FLAGS], [1], [msghdr has msg_flags])], [], [[#include ]]) AC_CHECK_MEMBER([struct msghdr.msg_accrights], [AC_DEFINE_UNQUOTED([HAVE_MSGHDR_ACCRIGHTS], [1], [msghdr has msg_accrights])], [], [[#include ]]) AC_CHECK_MEMBER([struct msghdr.msg_accrightslen], [AC_DEFINE_UNQUOTED([HAVE_MSGHDR_ACCRIGHTSLEN], [1], [msghdr has msg_accrightslen])], [], [[#include ]]) # Checks for typedefs. AC_TYPE_UID_T AC_TYPE_INT16_T AC_TYPE_INT32_T AC_TYPE_INT64_T AC_TYPE_INT8_T AC_TYPE_UINT16_T AC_TYPE_UINT32_T AC_TYPE_UINT64_T AC_TYPE_UINT8_T AC_TYPE_SIZE_T AC_TYPE_SSIZE_T # Checks for libraries. SAVE_CPPFLAGS="$CPPFLAGS" SAVE_LIBS="$LIBS" PKG_CHECK_MODULES([LIBQB], [libqb]) CPPFLAGS="$CPPFLAGS $LIBQB_CFLAGS" LIBS="$LIBS $LIBQB_LIBS" AC_CHECK_LIB([qb], [qb_log_thread_priority_set], \ have_qb_log_thread_priority_set="yes", \ have_qb_log_thread_priority_set="no") if test "x${have_qb_log_thread_priority_set}" = xyes; then AC_DEFINE_UNQUOTED([HAVE_QB_LOG_THREAD_PRIORITY_SET], 1, [have qb_log_thread_priority_set]) fi AC_CHECK_LIB([qb], [qb_log_file_reopen], \ have_qb_log_file_reopen="yes", \ have_qb_log_file_reopen="no") if test "x${have_qb_log_file_reopen}" = xyes; then AC_DEFINE_UNQUOTED([HAVE_QB_LOG_FILE_REOPEN], 1, [have qb_log_file_reopen]) fi AM_CONDITIONAL(HAVE_QB_LOG_FILE_REOPEN, test x$have_qb_log_file_reopen = xyes) CPPFLAGS="$SAVE_CPPFLAGS" LIBS="$SAVE_LIBS" AC_CHECK_LIB([pthread], [pthread_create]) AC_CHECK_LIB([socket], [socket]) PKG_CHECK_MODULES([knet],[libknet]) AC_CHECK_LIB([nsl], [t_open]) AC_CHECK_LIB([rt], [sched_getscheduler]) AC_CHECK_LIB([z], [crc32], AM_CONDITIONAL([HAVE_CRC32], true), AM_CONDITIONAL([HAVE_CRC32], false)) # this hack is necessary to check for symbols on out of tree builds # but it is as horrible as it gets and in theory users should be # invoking ./configure with proper LIBRARY_PATH set. OLDLIBS="$LIBS" LIBS="$knet_LIBS $LIBS" AC_CHECK_LIB([knet],[knet_handle_enable_access_lists], [AC_DEFINE_UNQUOTED([HAVE_KNET_ACCESS_LIST], 1, [have knet access list])]) +AC_CHECK_LIB([knet],[knet_handle_crypto_set_config], + [AC_DEFINE_UNQUOTED([HAVE_KNET_CRYPTO_RECONF], 1, [have knet crypto reconfig support])]) LIBS="$OLDLIBS" # Checks for library functions. AC_FUNC_ALLOCA AC_FUNC_CLOSEDIR_VOID AC_FUNC_ERROR_AT_LINE AC_FUNC_FORK AC_FUNC_MALLOC AC_FUNC_MEMCMP AC_FUNC_MMAP AC_FUNC_REALLOC AC_FUNC_SELECT_ARGTYPES AC_FUNC_VPRINTF AC_CHECK_FUNCS([alarm alphasort atexit bzero dup2 endgrent endpwent fdatasync \ fcntl getcwd getpeerucred getpeereid gettimeofday inet_ntoa \ memmove memset mkdir scandir select socket strcasecmp strchr \ strdup strerror strrchr strspn strstr pthread_setschedparam \ sched_get_priority_max sched_setscheduler getifaddrs \ clock_gettime ftruncate gethostname localtime_r munmap strtol]) AC_CONFIG_FILES([Makefile exec/Makefile include/Makefile init/Makefile lib/Makefile common_lib/Makefile man/Makefile pkgconfig/Makefile test/Makefile tools/Makefile conf/Makefile vqsim/Makefile Doxyfile conf/logrotate/Makefile]) ### Local business dnl =============================================== dnl Functions / global M4 variables dnl =============================================== dnl Global list of LIB names m4_define([local_soname_list], [])dnl dnl Upcase parameter m4_define([local_upcase], [translit([$*], [a-z], [A-Z])])dnl dnl M4 macro for include lib/lib$1.soname and subst that m4_define([LIB_SONAME_IMPORT],[dnl m4_define([local_libname], local_upcase($1)[_SONAME])dnl m4_define([local_soname], translit(m4_sinclude(lib/lib$1.verso), [ ], []))dnl local_libname="local_soname"dnl m4_define([local_soname_list], m4_defn([local_soname_list])[,]local_libname[,]local_upcase($1))dnl AC_SUBST(local_libname)dnl ])dnl dnl M4 macro for print padspaces (used in LIB_MSG_RESULT). It takes 2 arguments, length of string to pad and desired dnl (padded) length m4_define([m4_printpadspace],[ifelse(m4_eval([$2 - $1 < 1]),[1],,[ ][m4_printpadspace([$1],m4_eval([$2 - 1]))])])dnl dnl Show AC_MSG_RESULT for specific libraries m4_define([LIB_MSG_RESULT], [ifelse([$#], [1], ,[dnl AC_MSG_RESULT([ $2 Library SONAME m4_printpadspace(len($2),8) = ${$1}]) LIB_MSG_RESULT(m4_shift(m4_shift($@)))dnl ])])dnl # =============================================== # Helpers # =============================================== ## check if the compiler supports -Werror -Wunknown-warning-option AC_MSG_CHECKING([whether $CC supports -Wunknown-warning-option -Werror]) BACKUP="$CPPFLAGS" CPPFLAGS="$CPPFLAGS -Werror -Wunknown-warning-option" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([])], [unknown_warnings_as_errors='-Wunknown-warning-option -Werror'; AC_MSG_RESULT([yes])], [unknown_warnings_as_errors=''; AC_MSG_RESULT([no])]) CPPFLAGS="$BACKUP" ## helper for CC stuff cc_supports_flag() { BACKUP="$CPPFLAGS" CPPFLAGS="$CPPFLAGS $@ $unknown_warnings_as_errors" AC_MSG_CHECKING([whether $CC supports "$@"]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([])], [RC=0; AC_MSG_RESULT([yes])], [RC=1; AC_MSG_RESULT([no])]) CPPFLAGS="$BACKUP" return $RC } ## cleanup AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr/local;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Corosync, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac ## local defines PACKAGE_FEATURES="" LINT_FLAGS="-weak -unrecog +posixlib +ignoresigns -fcnuse \ -badflag -D__gnuc_va_list=va_list -D__attribute\(x\)=" # default libraries SONAME SOMAJOR="5" SOMINOR="0" SOMICRO="0" SONAME="${SOMAJOR}.${SOMINOR}.${SOMICRO}" # specific libraries SONAME LIB_SONAME_IMPORT([cfg]) LIB_SONAME_IMPORT([cpg]) LIB_SONAME_IMPORT([quorum]) LIB_SONAME_IMPORT([sam]) LIB_SONAME_IMPORT([votequorum]) LIB_SONAME_IMPORT([cmap]) # local options AC_ARG_ENABLE([ansi], [ --enable-ansi : force to build with ANSI standards. ], [ default="no" ]) AC_ARG_ENABLE([fatal-warnings], [ --enable-fatal-warnings : enable fatal warnings. ], [ default="no" ]) AC_ARG_ENABLE([debug], [ --enable-debug : enable debug build. ], [ default="no" ]) AC_ARG_WITH([sanitizers], [AS_HELP_STRING([--with-sanitizers=...,...], [enable SANitizer build, do *NOT* use for production. Only ASAN/UBSAN/TSAN are currently supported])], [ SANITIZERS="$withval" ], [ SANITIZERS="" ]) AC_ARG_ENABLE([secure-build], [ --enable-secure-build : enable PIE/RELRO build. ], [], [enable_secure_build="yes"]) AC_ARG_ENABLE([user-flags], [ --enable-user-flags : rely on user environment. ], [ default="no" ]) AC_ARG_ENABLE([coverage], [ --enable-coverage : coverage analysis of the codebase. ], [ default="no" ]) AC_ARG_ENABLE([small-memory-footprint], [ --enable-small-memory-footprint : Use small message queues and small messages sizes. ], [ default="no" ]) AC_ARG_ENABLE([dbus], [ --enable-dbus : dbus events. ],, [ enable_dbus="no" ]) AC_ARG_ENABLE([monitoring], [ --enable-monitoring : resource monitoring ],, [ default="no" ]) AM_CONDITIONAL(BUILD_MONITORING, test x$enable_monitoring = xyes) AC_ARG_ENABLE([watchdog], [ --enable-watchdog : Watchdog support ],, [ default="no" ]) AM_CONDITIONAL(BUILD_WATCHDOG, test x$enable_watchdog = xyes) AC_ARG_ENABLE([augeas], [ --enable-augeas : Install the augeas lens for corosync.conf ],, [ enable_augeas="no" ]) AM_CONDITIONAL(INSTALL_AUGEAS, test x$enable_augeas = xyes) AC_ARG_ENABLE([systemd], [ --enable-systemd : Install systemd service files],, [ enable_systemd="no" ]) AM_CONDITIONAL(INSTALL_SYSTEMD, test x$enable_systemd = xyes) AC_ARG_WITH([initconfigdir], [AS_HELP_STRING([--with-initconfigdir=DIR], [configuration directory @<:@SYSCONFDIR/sysconfig@:>@])], [INITCONFIGDIR="$withval"], [INITCONFIGDIR='${sysconfdir}/sysconfig']) AC_SUBST([INITCONFIGDIR]) AC_ARG_WITH([initddir], [ --with-initddir=DIR : path to init script directory. ], [ INITDDIR="$withval" ], [ INITDDIR="$sysconfdir/init.d" ]) AC_ARG_WITH([systemddir], [ --with-systemddir=DIR : path to systemd unit files directory. ], [ SYSTEMDDIR="$withval" ], [ SYSTEMDDIR="/lib/systemd/system" ]) AC_ARG_WITH([logdir], [ --with-logdir=DIR : the base directory for corosync logging files. ], [ LOGDIR="$withval" ], [ LOGDIR="$localstatedir/log/cluster" ]) AC_ARG_WITH([logrotatedir], [ --with-logrotatedir=DIR : the base directory for logrorate.d files. ], [ LOGROTATEDIR="$withval" ], [ LOGROTATEDIR="$sysconfdir/logrotate.d" ]) AC_ARG_ENABLE([snmp], [ --enable-snmp : SNMP protocol support ], [ default="no" ]) AC_ARG_ENABLE([xmlconf], [ --enable-xmlconf : XML configuration support ],, [ enable_xmlconf="no" ]) AM_CONDITIONAL(INSTALL_XMLCONF, test x$enable_xmlconf = xyes) AC_ARG_ENABLE([vqsim], [ --enable-vqsim : Quorum simulator support ],, [ enable_vqsim="no" ]) AM_CONDITIONAL(BUILD_VQSIM, test x$enable_vqsim = xyes) AC_ARG_ENABLE([nozzle], [ --enable-nozzle : Support for nozzle ],, [ enable_nozzle="no" ]) # *FLAGS handling goes here ENV_CFLAGS="$CFLAGS" ENV_CPPFLAGS="$CPPFLAGS" ENV_LDFLAGS="$LDFLAGS" # debug build stuff if test "x${enable_debug}" = xyes; then AC_DEFINE_UNQUOTED([DEBUG], [1], [Compiling Debugging code]) OPT_CFLAGS="-O0" PACKAGE_FEATURES="$PACKAGE_FEATURES debug" else OPT_CFLAGS="-O3" fi # gdb flags if test "x${GCC}" = xyes; then GDB_FLAGS="-ggdb3" else GDB_FLAGS="-g" fi # --- ASAN/UBSAN/TSAN (see man gcc) --- # when using SANitizers, we need to pass the -fsanitize.. # to both CFLAGS and LDFLAGS. The CFLAGS/LDFLAGS must be # specified as first in the list or there will be runtime # issues (for example user has to LD_PRELOAD asan for it to work # properly). if test -n "${SANITIZERS}"; then SANITIZERS=$(echo $SANITIZERS | sed -e 's/,/ /g') for SANITIZER in $SANITIZERS; do case $SANITIZER in asan|ASAN) SANITIZERS_CFLAGS="$SANITIZERS_CFLAGS -fsanitize=address" SANITIZERS_LDFLAGS="$SANITIZERS_LDFLAGS -fsanitize=address -lasan" AC_CHECK_LIB([asan],[main],,AC_MSG_ERROR([Unable to find libasan])) ;; ubsan|UBSAN) SANITIZERS_CFLAGS="$SANITIZERS_CFLAGS -fsanitize=undefined" SANITIZERS_LDFLAGS="$SANITIZERS_LDFLAGS -fsanitize=undefined -lubsan" AC_CHECK_LIB([ubsan],[main],,AC_MSG_ERROR([Unable to find libubsan])) ;; tsan|TSAN) SANITIZERS_CFLAGS="$SANITIZERS_CFLAGS -fsanitize=thread" SANITIZERS_LDFLAGS="$SANITIZERS_LDFLAGS -fsanitize=thread -ltsan" AC_CHECK_LIB([tsan],[main],,AC_MSG_ERROR([Unable to find libtsan])) ;; esac done fi # Look for dbus-1 if test "x${enable_dbus}" = xyes; then PKG_CHECK_MODULES([DBUS],[dbus-1]) AC_DEFINE_UNQUOTED([HAVE_DBUS], 1, [have dbus]) PACKAGE_FEATURES="$PACKAGE_FEATURES dbus" WITH_LIST="$WITH_LIST --with dbus" fi if test "x${enable_monitoring}" = xyes; then PKG_CHECK_MODULES([statgrab], [libstatgrab]) PKG_CHECK_MODULES([statgrabge090], [libstatgrab >= 0.90], AC_DEFINE_UNQUOTED([HAVE_LIBSTATGRAB_GE_090], 1, [have libstatgrab >= 0.90]), TMP_VARIABLE=1) AC_DEFINE_UNQUOTED([HAVE_MONITORING], 1, [have resource monitoring]) PACKAGE_FEATURES="$PACKAGE_FEATURES monitoring" WITH_LIST="$WITH_LIST --with monitoring" fi if test "x${enable_watchdog}" = xyes; then AC_CHECK_HEADER([linux/watchdog.h], [], [AC_MSG_ERROR([watchdog requires linux/watchdog.h])]) AC_CHECK_HEADER([linux/reboot.h], [], [AC_MSG_ERROR([watchdog requires linux/reboot.h])]) AC_DEFINE_UNQUOTED([HAVE_WATCHDOG], 1, [have watchdog]) PACKAGE_FEATURES="$PACKAGE_FEATURES watchdog" WITH_LIST="$WITH_LIST --with watchdog" fi if test "x${enable_augeas}" = xyes; then PACKAGE_FEATURES="$PACKAGE_FEATURES augeas" fi if test "x${enable_systemd}" = xyes; then PKG_CHECK_MODULES([libsystemd], [libsystemd]) AC_DEFINE([HAVE_LIBSYSTEMD], [1], [have systemd interface library]) PACKAGE_FEATURES="$PACKAGE_FEATURES systemd" WITH_LIST="$WITH_LIST --with systemd" fi if test "x${enable_xmlconf}" = xyes; then PACKAGE_FEATURES="$PACKAGE_FEATURES xmlconf" WITH_LIST="$WITH_LIST --with xmlconf" fi if test "x${enable_vqsim}" = xyes; then vqsim_readline=no AC_CHECK_HEADERS([readline/readline.h readline/history.h], [], AC_MSG_WARN([vqsim will lack readline support])) PACKAGE_FEATURES="$PACKAGE_FEATURES vqsim" WITH_LIST="$WITH_LIST --with vqsim" fi AM_CONDITIONAL(VQSIM_READLINE, [test "x${ac_cv_header_readline_readline_h}" = xyes]) # Look for nozzle if test "x${enable_nozzle}" = xyes; then PKG_CHECK_MODULES([nozzle],[libnozzle]) AC_DEFINE_UNQUOTED([HAVE_LIBNOZZLE], 1, [have nozzle]) PACKAGE_FEATURES="$PACKAGE_FEATURES nozzle" WITH_LIST="$WITH_LIST --with nozzle" fi do_snmp=0 if test "x${enable_snmp}" = xyes; then AC_PATH_PROGS([SNMPCONFIG], [net-snmp-config]) if test "x${SNMPCONFIG}" != "x"; then AC_MSG_CHECKING([for snmp includes]) SNMP_PREFIX=`$SNMPCONFIG --prefix` SNMP_INCLUDES="-I$SNMP_PREFIX/include" AC_MSG_RESULT([$SNMP_INCLUDES]) AC_MSG_CHECKING([for snmp libraries]) SNMP_LIBS=`$SNMPCONFIG --libs` AC_MSG_RESULT([$SNMP_LIBS]) AC_SUBST([SNMP_LIBS]) saveCFLAGS="$CFLAGS" CFLAGS="$CFLAGS $SNMP_INCLUDES" AC_CHECK_HEADERS([net-snmp/net-snmp-config.h]) CFLAGS="$saveCFLAGS" if test "x${ac_cv_header_net_snmp_net_snmp_config_h}" != "xyes"; then AC_MSG_ERROR([Unable to use net-snmp/net-snmp-config.h]) fi savedLibs=$LIBS LIBS="$LIBS $SNMP_LIBS" AC_CHECK_FUNCS([netsnmp_transport_open_client]) if test $ac_cv_func_netsnmp_transport_open_client != yes; then AC_CHECK_FUNCS([netsnmp_tdomain_transport]) if test $ac_cv_func_netsnmp_tdomain_transport != yes; then AC_MSG_ERROR([No usable SNMP client transport implementation found]) fi else AC_DEFINE_UNQUOTED([NETSNMPV54], $NETSNMP_NEW_SUPPORT, [have net-snmp5.4 over]) fi LIBS=$savedLibs do_snmp=1 PACKAGE_FEATURES="$PACKAGE_FEATURES snmp" WITH_LIST="$WITH_LIST --with snmp" AC_DEFINE_UNQUOTED([ENABLE_SNMP], $do_snmp, [Build in support for sending SNMP traps]) else AC_MSG_ERROR([You need the net_snmp development package to continue.]) fi fi AM_CONDITIONAL(BUILD_SNMP, test "${do_snmp}" = "1") # extra warnings EXTRA_WARNINGS="" WARNLIST=" all shadow missing-prototypes missing-declarations strict-prototypes pointer-arith write-strings cast-align bad-function-cast missing-format-attribute format=2 format-security format-nonliteral no-long-long unsigned-char no-strict-aliasing " for j in $WARNLIST; do if cc_supports_flag -W$j; then EXTRA_WARNINGS="$EXTRA_WARNINGS -W$j"; fi done if test "x${enable_coverage}" = xyes && \ cc_supports_flag -ftest-coverage && \ cc_supports_flag -fprofile-arcs ; then AC_MSG_NOTICE([Enabling Coverage (enable -O0 by default)]) OPT_CFLAGS="-O0" COVERAGE_CFLAGS="-ftest-coverage -fprofile-arcs" COVERAGE_LDFLAGS="-ftest-coverage -fprofile-arcs" PACKAGE_FEATURES="$PACKAGE_FEATURES coverage" else COVERAGE_CFLAGS="" COVERAGE_LDFLAGS="" fi if test "x${enable_small_memory_footprint}" = xyes ; then AC_DEFINE_UNQUOTED([HAVE_SMALL_MEMORY_FOOTPRINT], 1, [have small_memory_footprint]) PACKAGE_FEATURES="$PACKAGE_FEATURES small-memory-footprint" fi if test "x${enable_ansi}" = xyes && \ cc_supports_flag -std=iso9899:199409 ; then AC_MSG_NOTICE([Enabling ANSI Compatibility]) ANSI_CPPFLAGS="-ansi -DANSI_ONLY" PACKAGE_FEATURES="$PACKAGE_FEATURES ansi" else ANSI_CPPFLAGS="" fi if test "x${enable_fatal_warnings}" = xyes && \ cc_supports_flag -Werror ; then AC_MSG_NOTICE([Enabling Fatal Warnings (-Werror)]) WERROR_CFLAGS="-Werror" PACKAGE_FEATURES="$PACKAGE_FEATURES fatal-warnings" else WERROR_CFLAGS="" fi # don't add addtional cflags if test "x${enable_user_flags}" = xyes; then OPT_CFLAGS="" GDB_FLAGS="" EXTRA_WARNINGS="" fi if test "x${enable_secure_build}" = xyes; then # stolen from apache configure snippet AC_CACHE_CHECK([whether $CC accepts PIE flags], [ap_cv_cc_pie], [ save_CFLAGS=$CFLAGS save_LDFLAGS=$LDFLAGS CFLAGS="$CFLAGS -fPIE" LDFLAGS="$LDFLAGS -pie" AC_TRY_RUN([static int foo[30000]; int main () { return 0; }], [ap_cv_cc_pie=yes], [ap_cv_cc_pie=no], [ap_cv_cc_pie=yes]) CFLAGS=$save_CFLAGS LDFLAGS=$save_LDFLAGS ]) if test "$ap_cv_cc_pie" = "yes"; then SEC_FLAGS="$SEC_FLAGS -fPIE" SEC_LDFLAGS="$SEC_LDFLAGS -pie" PACKAGE_FEATURES="$PACKAGE_FEATURES pie" fi # similar to above AC_CACHE_CHECK([whether $CC accepts RELRO flags], [ap_cv_cc_relro], [ save_LDFLAGS=$LDFLAGS LDFLAGS="$LDFLAGS -Wl,-z,relro" AC_TRY_RUN([static int foo[30000]; int main () { return 0; }], [ap_cv_cc_relro=yes], [ap_cv_cc_relro=no], [ap_cv_cc_relro=yes]) LDFLAGS=$save_LDFLAGS ]) if test "$ap_cv_cc_relro" = "yes"; then SEC_LDFLAGS="$SEC_LDFLAGS -Wl,-z,relro" PACKAGE_FEATURES="$PACKAGE_FEATURES relro" fi AC_CACHE_CHECK([whether $CC accepts BINDNOW flags], [ap_cv_cc_bindnow], [ save_LDFLAGS=$LDFLAGS LDFLAGS="$LDFLAGS -Wl,-z,now" AC_TRY_RUN([static int foo[30000]; int main () { return 0; }], [ap_cv_cc_bindnow=yes], [ap_cv_cc_bindnow=no], [ap_cv_cc_bindnow=yes]) LDFLAGS=$save_LDFLAGS ]) if test "$ap_cv_cc_bindnow" = "yes"; then SEC_LDFLAGS="$SEC_LDFLAGS -Wl,-z,now" PACKAGE_FEATURES="$PACKAGE_FEATURES bindnow" fi fi AC_CACHE_CHECK([whether $CC accepts "--as-needed"], [ap_cv_cc_as_needed], [ save_LDFLAGS=$LDFLAGS LDFLAGS="$LDFLAGS -Wl,--as-needed" AC_TRY_RUN([static int foo[30000]; int main () { return 0; }], [ap_cv_cc_as_needed=yes], [ap_cv_cc_as_needed=no], [ap_cv_cc_as_needed=yes]) LDFLAGS=$save_LDFLAGS ]) AC_CACHE_CHECK([whether $CC accepts "--version-script"], [ap_cv_cc_version_script], [ save_LDFLAGS=$LDFLAGS LDFLAGS="$LDFLAGS -Wl,--version-script=conftest.versions" echo "CONFTEST { };" >conftest.versions AC_TRY_RUN([static int foo[30000]; int main () { return 0; }], [ap_cv_cc_version_script=yes], [ap_cv_cc_version_script=no], [ap_cv_cc_version_script=yes]) rm -f conftest.versions LDFLAGS=$save_LDFLAGS ]) if test "$ap_cv_cc_version_script" = "yes"; then AC_SUBST(VERSCRIPT_LDFLAGS, ["-Wl,--version-script=\$(srcdir)/lib\$(call get_libname,\$<).versions"]) else AC_SUBST(VERSCRIPT_LDFLAGS, [""]) fi # define global include dirs INCLUDE_DIRS="$INCLUDE_DIRS -I\$(top_builddir)/include -I\$(top_srcdir)/include" INCLUDE_DIRS="$INCLUDE_DIRS -I\$(top_builddir)/include/corosync -I\$(top_srcdir)/include/corosync" # final build of *FLAGS CFLAGS="$SANITIZERS_CFLAGS $ENV_CFLAGS $lt_prog_compiler_pic $SEC_FLAGS $OPT_CFLAGS $GDB_FLAGS \ $COVERAGE_CFLAGS $EXTRA_WARNINGS \ $WERROR_CFLAGS $LIBQB_CFLAGS \ $SNMP_INCLUDES" CPPFLAGS="$ENV_CPPFLAGS $ANSI_CPPFLAGS $INCLUDE_DIRS" LDFLAGS="$SANITIZERS_LDFLAGS $ENV_LDFLAGS $lt_prog_compiler_pic $SEC_LDFLAGS $COVERAGE_LDFLAGS" if test "$ap_cv_cc_as_needed" = "yes"; then LDFLAGS="$LDFLAGS -Wl,--as-needed" fi # substitute what we need: AC_SUBST([BASHPATH]) AC_SUBST([INITDDIR]) AC_SUBST([SYSTEMDDIR]) AC_SUBST([LOGDIR]) AC_SUBST([LOGROTATEDIR]) AC_SUBST([SOMAJOR]) AC_SUBST([SOMINOR]) AC_SUBST([SOMICRO]) AC_SUBST([SONAME]) AM_CONDITIONAL(INSTALL_MIB, test "${do_snmp}" = "1") AM_CONDITIONAL(INSTALL_DBUSCONF, test "${enable_dbus}" = "yes") AM_CONDITIONAL(AUGTOOL, test -n "${AUGTOOL}") AM_CONDITIONAL(BUILD_HTML_DOCS, test -n "${GROFF}") AC_SUBST([LINT_FLAGS]) AC_DEFINE_UNQUOTED([LOCALSTATEDIR], "$(eval echo ${localstatedir})", [localstate directory]) COROSYSCONFDIR=${sysconfdir}/corosync AC_SUBST([COROSYSCONFDIR]) AC_DEFINE_UNQUOTED([COROSYSCONFDIR], "$(eval echo ${COROSYSCONFDIR})", [corosync config directory]) AC_DEFINE_UNQUOTED([PACKAGE_FEATURES], "${PACKAGE_FEATURES}", [corosync built-in features]) AC_OUTPUT AC_MSG_RESULT([]) AC_MSG_RESULT([$PACKAGE configuration:]) AC_MSG_RESULT([ Version = ${VERSION}]) AC_MSG_RESULT([ Prefix = ${prefix}]) AC_MSG_RESULT([ Executables = ${sbindir}]) AC_MSG_RESULT([ Man pages = ${mandir}]) AC_MSG_RESULT([ Doc dir = ${docdir}]) AC_MSG_RESULT([ Libraries = ${libdir}]) AC_MSG_RESULT([ Header files = ${includedir}]) AC_MSG_RESULT([ Arch-independent files = ${datadir}]) AC_MSG_RESULT([ State information = ${localstatedir}]) AC_MSG_RESULT([ System configuration = ${sysconfdir}]) AC_MSG_RESULT([ System init.d directory = ${INITDDIR}]) AC_MSG_RESULT([ System systemd directory = ${SYSTEMDDIR}]) AC_MSG_RESULT([ Log directory = ${LOGDIR}]) AC_MSG_RESULT([ Log rotate directory = ${LOGROTATEDIR}]) AC_MSG_RESULT([ corosync config dir = ${COROSYSCONFDIR}]) AC_MSG_RESULT([ init config directory = ${INITCONFIGDIR}]) AC_MSG_RESULT([ Features =${PACKAGE_FEATURES}]) AC_MSG_RESULT([]) AC_MSG_RESULT([$PACKAGE build info:]) AC_MSG_RESULT([ Library SONAME = ${SONAME}]) LIB_MSG_RESULT(m4_shift(local_soname_list))dnl AC_MSG_RESULT([ Default optimization = ${OPT_CFLAGS}]) AC_MSG_RESULT([ Default debug options = ${GDB_FLAGS}]) AC_MSG_RESULT([ Extra compiler warnings = ${EXTRA_WARNING}]) AC_MSG_RESULT([ Env. defined CFLAG = ${ENV_CFLAGS}]) AC_MSG_RESULT([ Env. defined CPPFLAGS = ${ENV_CPPFLAGS}]) AC_MSG_RESULT([ Env. defined LDFLAGS = ${ENV_LDFLAGS}]) AC_MSG_RESULT([ ANSI defined CPPFLAGS = ${ANSI_CPPFLAGS}]) AC_MSG_RESULT([ Coverage CFLAGS = ${COVERAGE_CFLAGS}]) AC_MSG_RESULT([ Coverage LDFLAGS = ${COVERAGE_LDFLAGS}]) AC_MSG_RESULT([ Fatal War. CFLAGS = ${WERROR_CFLAGS}]) AC_MSG_RESULT([ Final CFLAGS = ${CFLAGS}]) AC_MSG_RESULT([ Final CPPFLAGS = ${CPPFLAGS}]) AC_MSG_RESULT([ Final LDFLAGS = ${LDFLAGS}]) diff --git a/exec/cfg.c b/exec/cfg.c index 75b07989..d5b50fcd 100644 --- a/exec/cfg.c +++ b/exec/cfg.c @@ -1,1167 +1,1264 @@ /* * Copyright (c) 2005-2006 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "totemconfig.h" #include "service.h" #include "main.h" LOGSYS_DECLARE_SUBSYS ("CFG"); enum cfg_message_req_types { MESSAGE_REQ_EXEC_CFG_RINGREENABLE = 0, MESSAGE_REQ_EXEC_CFG_KILLNODE = 1, MESSAGE_REQ_EXEC_CFG_SHUTDOWN = 2, - MESSAGE_REQ_EXEC_CFG_RELOAD_CONFIG = 3 + MESSAGE_REQ_EXEC_CFG_RELOAD_CONFIG = 3, + MESSAGE_REQ_EXEC_CFG_CRYPTO_RECONFIG = 4 }; #define DEFAULT_SHUTDOWN_TIMEOUT 5 static struct qb_list_head trackers_list; /* * Variables controlling a requested shutdown */ static corosync_timer_handle_t shutdown_timer; static struct cfg_info *shutdown_con; static uint32_t shutdown_flags; static int shutdown_yes; static int shutdown_no; static int shutdown_expected; struct cfg_info { struct qb_list_head list; void *conn; void *tracker_conn; enum {SHUTDOWN_REPLY_UNKNOWN, SHUTDOWN_REPLY_YES, SHUTDOWN_REPLY_NO} shutdown_reply; }; static void cfg_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); static char *cfg_exec_init_fn (struct corosync_api_v1 *corosync_api_v1); static struct corosync_api_v1 *api; static int cfg_lib_init_fn (void *conn); static int cfg_lib_exit_fn (void *conn); static void message_handler_req_exec_cfg_ringreenable ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cfg_killnode ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cfg_shutdown ( const void *message, unsigned int nodeid); static void message_handler_req_exec_cfg_reload_config ( const void *message, unsigned int nodeid); +static void message_handler_req_exec_cfg_reconfig_crypto ( + const void *message, + unsigned int nodeid); + static void exec_cfg_killnode_endian_convert (void *msg); static void message_handler_req_lib_cfg_ringstatusget ( void *conn, const void *msg); static void message_handler_req_lib_cfg_ringreenable ( void *conn, const void *msg); static void message_handler_req_lib_cfg_killnode ( void *conn, const void *msg); static void message_handler_req_lib_cfg_tryshutdown ( void *conn, const void *msg); static void message_handler_req_lib_cfg_replytoshutdown ( void *conn, const void *msg); static void message_handler_req_lib_cfg_get_node_addrs ( void *conn, const void *msg); static void message_handler_req_lib_cfg_local_get ( void *conn, const void *msg); static void message_handler_req_lib_cfg_reload_config ( void *conn, const void *msg); static void message_handler_req_lib_cfg_reopen_log_files ( void *conn, const void *msg); /* * Service Handler Definition */ static struct corosync_lib_handler cfg_lib_engine[] = { { /* 0 */ .lib_handler_fn = message_handler_req_lib_cfg_ringstatusget, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 1 */ .lib_handler_fn = message_handler_req_lib_cfg_ringreenable, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 2 */ .lib_handler_fn = message_handler_req_lib_cfg_killnode, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 3 */ .lib_handler_fn = message_handler_req_lib_cfg_tryshutdown, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 4 */ .lib_handler_fn = message_handler_req_lib_cfg_replytoshutdown, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 5 */ .lib_handler_fn = message_handler_req_lib_cfg_get_node_addrs, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 6 */ .lib_handler_fn = message_handler_req_lib_cfg_local_get, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 7 */ .lib_handler_fn = message_handler_req_lib_cfg_reload_config, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED }, { /* 8 */ .lib_handler_fn = message_handler_req_lib_cfg_reopen_log_files, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED } }; static struct corosync_exec_handler cfg_exec_engine[] = { { /* 0 */ .exec_handler_fn = message_handler_req_exec_cfg_ringreenable, }, { /* 1 */ .exec_handler_fn = message_handler_req_exec_cfg_killnode, .exec_endian_convert_fn = exec_cfg_killnode_endian_convert }, { /* 2 */ .exec_handler_fn = message_handler_req_exec_cfg_shutdown, }, { /* 3 */ .exec_handler_fn = message_handler_req_exec_cfg_reload_config, + }, + { /* 4 */ + .exec_handler_fn = message_handler_req_exec_cfg_reconfig_crypto, } }; /* * Exports the interface for the service */ struct corosync_service_engine cfg_service_engine = { .name = "corosync configuration service", .id = CFG_SERVICE, .priority = 1, .private_data_size = sizeof(struct cfg_info), .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED, .allow_inquorate = CS_LIB_ALLOW_INQUORATE, .lib_init_fn = cfg_lib_init_fn, .lib_exit_fn = cfg_lib_exit_fn, .lib_engine = cfg_lib_engine, .lib_engine_count = sizeof (cfg_lib_engine) / sizeof (struct corosync_lib_handler), .exec_init_fn = cfg_exec_init_fn, .exec_engine = cfg_exec_engine, .exec_engine_count = sizeof (cfg_exec_engine) / sizeof (struct corosync_exec_handler), .confchg_fn = cfg_confchg_fn }; struct corosync_service_engine *cfg_get_service_engine_ver0 (void) { return (&cfg_service_engine); } struct req_exec_cfg_ringreenable { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_message_source_t source __attribute__((aligned(8))); }; struct req_exec_cfg_reload_config { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_message_source_t source __attribute__((aligned(8))); }; +struct req_exec_cfg_crypto_reconfig { + struct qb_ipc_request_header header __attribute__((aligned(8))); + mar_uint32_t phase __attribute__((aligned(8))); +}; + struct req_exec_cfg_killnode { struct qb_ipc_request_header header __attribute__((aligned(8))); mar_uint32_t nodeid __attribute__((aligned(8))); mar_name_t reason __attribute__((aligned(8))); }; struct req_exec_cfg_shutdown { struct qb_ipc_request_header header __attribute__((aligned(8))); }; /* IMPL */ static char *cfg_exec_init_fn ( struct corosync_api_v1 *corosync_api_v1) { api = corosync_api_v1; qb_list_init(&trackers_list); return (NULL); } static void cfg_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { } /* * Tell other nodes we are shutting down */ static int send_shutdown(void) { struct req_exec_cfg_shutdown req_exec_cfg_shutdown; struct iovec iovec; ENTER(); req_exec_cfg_shutdown.header.size = sizeof (struct req_exec_cfg_shutdown); req_exec_cfg_shutdown.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_SHUTDOWN); iovec.iov_base = (char *)&req_exec_cfg_shutdown; iovec.iov_len = sizeof (struct req_exec_cfg_shutdown); assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); LEAVE(); return 0; } static void send_test_shutdown(void *only_conn, void *exclude_conn, int status) { struct res_lib_cfg_testshutdown res_lib_cfg_testshutdown; struct qb_list_head *iter; ENTER(); res_lib_cfg_testshutdown.header.size = sizeof(struct res_lib_cfg_testshutdown); res_lib_cfg_testshutdown.header.id = MESSAGE_RES_CFG_TESTSHUTDOWN; res_lib_cfg_testshutdown.header.error = status; res_lib_cfg_testshutdown.flags = shutdown_flags; if (only_conn) { TRACE1("sending testshutdown to only %p", only_conn); api->ipc_dispatch_send(only_conn, &res_lib_cfg_testshutdown, sizeof(res_lib_cfg_testshutdown)); } else { qb_list_for_each(iter, &trackers_list) { struct cfg_info *ci = qb_list_entry(iter, struct cfg_info, list); if (ci->conn != exclude_conn) { TRACE1("sending testshutdown to %p", ci->tracker_conn); api->ipc_dispatch_send(ci->tracker_conn, &res_lib_cfg_testshutdown, sizeof(res_lib_cfg_testshutdown)); } } } LEAVE(); } static void check_shutdown_status(void) { ENTER(); /* * Shutdown client might have gone away */ if (!shutdown_con) { LEAVE(); return; } /* * All replies safely gathered in ? */ if (shutdown_yes + shutdown_no >= shutdown_expected) { struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown; api->timer_delete(shutdown_timer); if (shutdown_yes >= shutdown_expected || shutdown_flags == CFG_SHUTDOWN_FLAG_REGARDLESS) { TRACE1("shutdown confirmed"); res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown); res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN; res_lib_cfg_tryshutdown.header.error = CS_OK; /* * Tell originator that shutdown was confirmed */ api->ipc_response_send(shutdown_con->conn, &res_lib_cfg_tryshutdown, sizeof(res_lib_cfg_tryshutdown)); shutdown_con = NULL; /* * Tell other nodes we are going down */ send_shutdown(); } else { TRACE1("shutdown cancelled"); res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown); res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN; res_lib_cfg_tryshutdown.header.error = CS_ERR_BUSY; /* * Tell originator that shutdown was cancelled */ api->ipc_response_send(shutdown_con->conn, &res_lib_cfg_tryshutdown, sizeof(res_lib_cfg_tryshutdown)); shutdown_con = NULL; } log_printf(LOGSYS_LEVEL_DEBUG, "shutdown decision is: (yes count: %d, no count: %d) flags=%x", shutdown_yes, shutdown_no, shutdown_flags); } LEAVE(); } /* * Not all nodes responded to the shutdown (in time) */ static void shutdown_timer_fn(void *arg) { ENTER(); /* * Mark undecideds as "NO" */ shutdown_no = shutdown_expected; check_shutdown_status(); send_test_shutdown(NULL, NULL, CS_ERR_TIMEOUT); LEAVE(); } static void remove_ci_from_shutdown(struct cfg_info *ci) { ENTER(); /* * If the controlling shutdown process has quit, then cancel the * shutdown session */ if (ci == shutdown_con) { shutdown_con = NULL; api->timer_delete(shutdown_timer); } if (!qb_list_empty(&ci->list)) { qb_list_del(&ci->list); qb_list_init(&ci->list); /* * Remove our option */ if (shutdown_con) { if (ci->shutdown_reply == SHUTDOWN_REPLY_YES) shutdown_yes--; if (ci->shutdown_reply == SHUTDOWN_REPLY_NO) shutdown_no--; } /* * If we are leaving, then that's an implicit YES to shutdown */ ci->shutdown_reply = SHUTDOWN_REPLY_YES; shutdown_yes++; check_shutdown_status(); } LEAVE(); } int cfg_lib_exit_fn (void *conn) { struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn); ENTER(); remove_ci_from_shutdown(ci); LEAVE(); return (0); } static int cfg_lib_init_fn (void *conn) { struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn); ENTER(); qb_list_init(&ci->list); LEAVE(); return (0); } /* * Executive message handlers */ static void message_handler_req_exec_cfg_ringreenable ( const void *message, unsigned int nodeid) { ENTER(); LEAVE(); } static void exec_cfg_killnode_endian_convert (void *msg) { struct req_exec_cfg_killnode *req_exec_cfg_killnode = (struct req_exec_cfg_killnode *)msg; ENTER(); swab_mar_name_t(&req_exec_cfg_killnode->reason); LEAVE(); } static void message_handler_req_exec_cfg_killnode ( const void *message, unsigned int nodeid) { const struct req_exec_cfg_killnode *req_exec_cfg_killnode = message; cs_name_t reason; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "request to kill node " CS_PRI_NODE_ID " (us=" CS_PRI_NODE_ID ")", req_exec_cfg_killnode->nodeid, api->totem_nodeid_get()); if (req_exec_cfg_killnode->nodeid == api->totem_nodeid_get()) { marshall_from_mar_name_t(&reason, &req_exec_cfg_killnode->reason); log_printf(LOGSYS_LEVEL_NOTICE, "Killed by node " CS_PRI_NODE_ID " : %s", nodeid, reason.value); corosync_fatal_error(COROSYNC_FATAL_ERROR_EXIT); } LEAVE(); } /* * Self shutdown */ static void message_handler_req_exec_cfg_shutdown ( const void *message, unsigned int nodeid) { ENTER(); log_printf(LOGSYS_LEVEL_NOTICE, "Node " CS_PRI_NODE_ID " was shut down by sysadmin", nodeid); if (nodeid == api->totem_nodeid_get()) { api->shutdown_request(); } LEAVE(); } /* strcmp replacement that can handle NULLs */ static int nullcheck_strcmp(const char* left, const char *right) { if (!left && right) return -1; if (left && !right) return 1; if (!left && !right) return 0; return strcmp(left, right); } /* * If a key has changed value in the new file, then warn the user and remove it from the temp_map */ static void delete_and_notify_if_changed(icmap_map_t temp_map, const char *key_name) { if (!(icmap_key_value_eq(temp_map, key_name, icmap_get_global_map(), key_name))) { if (icmap_delete_r(temp_map, key_name) == CS_OK) { log_printf(LOGSYS_LEVEL_NOTICE, "Modified entry '%s' in corosync.conf cannot be changed at run-time", key_name); } } } /* * Remove any keys from the new config file that in the new corosync.conf but that * cannot be changed at run time. A log message will be issued for each * entry that the user wants to change but they cannot. * * Add more here as needed. */ static void remove_ro_entries(icmap_map_t temp_map) { +#ifndef HAVE_KNET_CRYPTO_RECONF delete_and_notify_if_changed(temp_map, "totem.secauth"); delete_and_notify_if_changed(temp_map, "totem.crypto_hash"); delete_and_notify_if_changed(temp_map, "totem.crypto_cipher"); delete_and_notify_if_changed(temp_map, "totem.keyfile"); delete_and_notify_if_changed(temp_map, "totem.key"); +#endif delete_and_notify_if_changed(temp_map, "totem.version"); delete_and_notify_if_changed(temp_map, "totem.threads"); delete_and_notify_if_changed(temp_map, "totem.ip_version"); delete_and_notify_if_changed(temp_map, "totem.rrp_mode"); delete_and_notify_if_changed(temp_map, "totem.netmtu"); delete_and_notify_if_changed(temp_map, "totem.interface.ringnumber"); delete_and_notify_if_changed(temp_map, "totem.interface.bindnetaddr"); delete_and_notify_if_changed(temp_map, "totem.interface.mcastaddr"); delete_and_notify_if_changed(temp_map, "totem.interface.broadcast"); delete_and_notify_if_changed(temp_map, "totem.interface.mcastport"); delete_and_notify_if_changed(temp_map, "totem.interface.ttl"); delete_and_notify_if_changed(temp_map, "totem.transport"); delete_and_notify_if_changed(temp_map, "totem.cluster_name"); delete_and_notify_if_changed(temp_map, "quorum.provider"); delete_and_notify_if_changed(temp_map, "system.move_to_root_cgroup"); delete_and_notify_if_changed(temp_map, "system.sched_rr"); delete_and_notify_if_changed(temp_map, "system.priority"); delete_and_notify_if_changed(temp_map, "system.qb_ipc_type"); delete_and_notify_if_changed(temp_map, "system.state_dir"); } /* * Remove entries that exist in the global map, but not in the temp_map, this will * cause delete notifications to be sent to any listeners. * * NOTE: This routine depends entirely on the keys returned by the iterators * being in alpha-sorted order. */ static void remove_deleted_entries(icmap_map_t temp_map, const char *prefix) { icmap_iter_t old_iter; icmap_iter_t new_iter; const char *old_key, *new_key; int ret; old_iter = icmap_iter_init(prefix); new_iter = icmap_iter_init_r(temp_map, prefix); old_key = icmap_iter_next(old_iter, NULL, NULL); new_key = icmap_iter_next(new_iter, NULL, NULL); while (old_key || new_key) { ret = nullcheck_strcmp(old_key, new_key); if ((ret < 0 && old_key) || !new_key) { /* * new_key is greater, a line (or more) has been deleted * Continue until old is >= new */ do { /* Remove it from icmap & send notifications */ icmap_delete(old_key); old_key = icmap_iter_next(old_iter, NULL, NULL); ret = nullcheck_strcmp(old_key, new_key); } while (ret < 0 && old_key); } else if ((ret > 0 && new_key) || !old_key) { /* * old_key is greater, a line (or more) has been added * Continue until new is >= old * * we don't need to do anything special with this like tell * icmap. That will happen when we copy the values over */ do { new_key = icmap_iter_next(new_iter, NULL, NULL); ret = nullcheck_strcmp(old_key, new_key); } while (ret > 0 && new_key); } if (ret == 0) { new_key = icmap_iter_next(new_iter, NULL, NULL); old_key = icmap_iter_next(old_iter, NULL, NULL); } } icmap_iter_finalize(new_iter); icmap_iter_finalize(old_iter); } /* * Reload configuration file */ static void message_handler_req_exec_cfg_reload_config ( const void *message, unsigned int nodeid) { const struct req_exec_cfg_reload_config *req_exec_cfg_reload_config = message; struct res_lib_cfg_reload_config res_lib_cfg_reload_config; struct totem_config new_config; icmap_map_t temp_map; const char *error_string; int res = CS_OK; ENTER(); log_printf(LOGSYS_LEVEL_NOTICE, "Config reload requested by node " CS_PRI_NODE_ID, nodeid); icmap_set_uint8("config.totemconfig_reload_in_progress", 1); + + /* Make sure there is no rubbish in this that might be checked, even on error */ + memset(&new_config, 0, sizeof(new_config)); /* * Set up a new hashtable as a staging area. */ if ((res = icmap_init_r(&temp_map)) != CS_OK) { log_printf(LOGSYS_LEVEL_ERROR, "Unable to create temporary icmap. config file reload cancelled\n"); goto reload_fini_nomap; } /* * Load new config into the temporary map */ res = coroparse_configparse(temp_map, &error_string); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Unable to reload config file: %s", error_string); res = CS_ERR_INVALID_PARAM; goto reload_fini_nofree; } /* Signal start of the reload process */ icmap_set_uint8("config.reload_in_progress", 1); /* Detect deleted entries and remove them from the main icmap hashtable */ remove_deleted_entries(temp_map, "logging."); remove_deleted_entries(temp_map, "totem."); remove_deleted_entries(temp_map, "nodelist."); remove_deleted_entries(temp_map, "quorum."); remove_deleted_entries(temp_map, "uidgid.config."); remove_deleted_entries(temp_map, "nozzle."); /* Remove entries that cannot be changed */ remove_ro_entries(temp_map); /* Take a copy of the current setup so we can check what has changed */ memset(&new_config, 0, sizeof(new_config)); new_config.orig_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); assert(new_config.orig_interfaces != NULL); totempg_get_config(&new_config); + new_config.crypto_changed = 0; new_config.interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); assert(new_config.interfaces != NULL); memset(new_config.interfaces, 0, sizeof (struct totem_interface) * INTERFACE_MAX); /* For UDP[U] the configuration on link0 is static (apart from the nodelist) and only read at startup. So preserve it here */ if ( (new_config.transport_number == TOTEM_TRANSPORT_UDP) || (new_config.transport_number == TOTEM_TRANSPORT_UDPU)) { memcpy(&new_config.interfaces[0], &new_config.orig_interfaces[0], sizeof(struct totem_interface)); } /* Calculate new node and interface definitions */ if (totemconfig_configure_new_params(&new_config, temp_map, &error_string) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Cannot configure new interface definitions: %s\n", error_string); res = CS_ERR_INVALID_PARAM; goto reload_fini; } /* Read from temp_map into new_config */ totem_volatile_config_read(&new_config, temp_map, NULL); + /* Get updated crypto parameters. Will set a flag in new_config if things have changed */ + if (totem_reread_crypto_config(&new_config, temp_map, &error_string) == -1) { + log_printf (LOGSYS_LEVEL_ERROR, "Crypto configuration is not valid: %s\n", error_string); + res = CS_ERR_INVALID_PARAM; + goto reload_fini; + } + /* Validate dynamic parameters */ if (totem_volatile_config_validate(&new_config, temp_map, &error_string) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Configuration is not valid: %s\n", error_string); res = CS_ERR_INVALID_PARAM; goto reload_fini; } + /* Save this here so we can get at it for the later phases of crypto change */ + if (new_config.crypto_changed) { +#ifndef HAVE_KNET_CRYPTO_RECONF + new_config.crypto_changed = 0; + log_printf (LOGSYS_LEVEL_ERROR, "Crypto reconfiguration is not supported by the linked version of knet\n"); + res = CS_ERR_INVALID_PARAM; + goto reload_fini; +#endif + } + /* * Copy new keys into live config. */ if ( (res = icmap_copy_map(icmap_get_global_map(), temp_map)) != CS_OK) { log_printf (LOGSYS_LEVEL_ERROR, "Error making new config live. cmap database may be inconsistent\n"); /* Return res from icmap */ goto reload_fini; } /* Copy into live system */ totempg_put_config(&new_config); totemconfig_commit_new_params(&new_config, temp_map); free(new_config.interfaces); reload_fini: /* All done - let clients know */ icmap_set_int32("config.reload_status", res); icmap_set_uint8("config.totemconfig_reload_in_progress", 0); icmap_set_uint8("config.reload_in_progress", 0); /* Finished with the temporary storage */ free(new_config.orig_interfaces); reload_fini_nofree: icmap_fini_r(temp_map); reload_fini_nomap: + + /* If crypto was changed, now it's loaded on all nodes we can enable it. + * Each node sends its own PHASE message so we're not relying on the leader + * node to survive the transition + */ + if (new_config.crypto_changed) { + struct req_exec_cfg_crypto_reconfig req_exec_cfg_crypto_reconfig; + struct iovec iovec; + + req_exec_cfg_crypto_reconfig.header.size = + sizeof (struct req_exec_cfg_crypto_reconfig); + req_exec_cfg_crypto_reconfig.header.id = SERVICE_ID_MAKE (CFG_SERVICE, + MESSAGE_REQ_EXEC_CFG_CRYPTO_RECONFIG); + req_exec_cfg_crypto_reconfig.phase = CRYPTO_RECONFIG_PHASE_ACTIVATE; + + iovec.iov_base = (char *)&req_exec_cfg_crypto_reconfig; + iovec.iov_len = sizeof (struct req_exec_cfg_crypto_reconfig); + + assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); + } + /* All done, return result to the caller if it was on this system */ if (nodeid == api->totem_nodeid_get()) { res_lib_cfg_reload_config.header.size = sizeof(res_lib_cfg_reload_config); res_lib_cfg_reload_config.header.id = MESSAGE_RES_CFG_RELOAD_CONFIG; res_lib_cfg_reload_config.header.error = res; api->ipc_response_send(req_exec_cfg_reload_config->source.conn, &res_lib_cfg_reload_config, sizeof(res_lib_cfg_reload_config)); api->ipc_refcnt_dec(req_exec_cfg_reload_config->source.conn);; } LEAVE(); } +/* Handle the phases of crypto reload + * The first time we are called is after the new crypto config has been loaded + * but not activated. + * + * 1 - activate the new crypto configuration + * 2 - clear out the old configuration + */ +static void message_handler_req_exec_cfg_reconfig_crypto ( + const void *message, + unsigned int nodeid) +{ + const struct req_exec_cfg_crypto_reconfig *req_exec_cfg_crypto_reconfig = message; + + /* Got our own reconfig message */ + if (nodeid == api->totem_nodeid_get()) { + log_printf (LOGSYS_LEVEL_DEBUG, "Crypto reconfiguration phase %d", req_exec_cfg_crypto_reconfig->phase); + + /* Do the deed */ + totempg_crypto_reconfigure_phase(req_exec_cfg_crypto_reconfig->phase); + + /* Move to the next phase if not finished */ + if (req_exec_cfg_crypto_reconfig->phase < CRYPTO_RECONFIG_PHASE_CLEANUP) { + struct req_exec_cfg_crypto_reconfig req_exec_cfg_crypto_reconfig2; + struct iovec iovec; + + req_exec_cfg_crypto_reconfig2.header.size = + sizeof (struct req_exec_cfg_crypto_reconfig); + req_exec_cfg_crypto_reconfig2.header.id = SERVICE_ID_MAKE (CFG_SERVICE, + MESSAGE_REQ_EXEC_CFG_CRYPTO_RECONFIG); + req_exec_cfg_crypto_reconfig2.phase = CRYPTO_RECONFIG_PHASE_CLEANUP; + + iovec.iov_base = (char *)&req_exec_cfg_crypto_reconfig2; + iovec.iov_len = sizeof (struct req_exec_cfg_crypto_reconfig); + + assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); + } + } +} + + /* * Library Interface Implementation */ static void message_handler_req_lib_cfg_ringstatusget ( void *conn, const void *msg) { struct res_lib_cfg_ringstatusget res_lib_cfg_ringstatusget; struct totem_ip_address interfaces[INTERFACE_MAX]; unsigned int iface_count; char **status; const char *totem_ip_string; char ifname[CFG_INTERFACE_NAME_MAX_LEN]; unsigned int iface_ids[INTERFACE_MAX]; unsigned int i; cs_error_t res = CS_OK; ENTER(); res_lib_cfg_ringstatusget.header.id = MESSAGE_RES_CFG_RINGSTATUSGET; res_lib_cfg_ringstatusget.header.size = sizeof (struct res_lib_cfg_ringstatusget); api->totem_ifaces_get ( api->totem_nodeid_get(), iface_ids, interfaces, INTERFACE_MAX, &status, &iface_count); assert(iface_count <= CFG_MAX_INTERFACES); res_lib_cfg_ringstatusget.interface_count = iface_count; for (i = 0; i < iface_count; i++) { totem_ip_string = (const char *)api->totem_ip_print (&interfaces[i]); if (!totem_ip_string) { totem_ip_string=""; } /* Allow for i/f number at the start */ if (strlen(totem_ip_string) >= CFG_INTERFACE_NAME_MAX_LEN-3) { log_printf(LOGSYS_LEVEL_ERROR, "String representation of interface %u is too long", i); res = CS_ERR_NAME_TOO_LONG; goto send_response; } snprintf(ifname, sizeof(ifname), "%d %s", iface_ids[i], totem_ip_string); if (strlen(status[i]) >= CFG_INTERFACE_STATUS_MAX_LEN) { log_printf(LOGSYS_LEVEL_ERROR, "Status string for interface %u is too long", i); res = CS_ERR_NAME_TOO_LONG; goto send_response; } strcpy ((char *)&res_lib_cfg_ringstatusget.interface_status[i], status[i]); strcpy ((char *)&res_lib_cfg_ringstatusget.interface_name[i], ifname); } send_response: res_lib_cfg_ringstatusget.header.error = res; api->ipc_response_send ( conn, &res_lib_cfg_ringstatusget, sizeof (struct res_lib_cfg_ringstatusget)); LEAVE(); } static void message_handler_req_lib_cfg_ringreenable ( void *conn, const void *msg) { struct res_lib_cfg_ringreenable res_lib_cfg_ringreenable; ENTER(); res_lib_cfg_ringreenable.header.id = MESSAGE_RES_CFG_RINGREENABLE; res_lib_cfg_ringreenable.header.size = sizeof (struct res_lib_cfg_ringreenable); res_lib_cfg_ringreenable.header.error = CS_ERR_NOT_SUPPORTED; api->ipc_response_send ( conn, &res_lib_cfg_ringreenable, sizeof (struct res_lib_cfg_ringreenable)); LEAVE(); } static void message_handler_req_lib_cfg_killnode ( void *conn, const void *msg) { const struct req_lib_cfg_killnode *req_lib_cfg_killnode = msg; struct res_lib_cfg_killnode res_lib_cfg_killnode; struct req_exec_cfg_killnode req_exec_cfg_killnode; struct iovec iovec; ENTER(); req_exec_cfg_killnode.header.size = sizeof (struct req_exec_cfg_killnode); req_exec_cfg_killnode.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_KILLNODE); req_exec_cfg_killnode.nodeid = req_lib_cfg_killnode->nodeid; marshall_to_mar_name_t(&req_exec_cfg_killnode.reason, &req_lib_cfg_killnode->reason); iovec.iov_base = (char *)&req_exec_cfg_killnode; iovec.iov_len = sizeof (struct req_exec_cfg_killnode); (void)api->totem_mcast (&iovec, 1, TOTEM_SAFE); res_lib_cfg_killnode.header.size = sizeof(struct res_lib_cfg_killnode); res_lib_cfg_killnode.header.id = MESSAGE_RES_CFG_KILLNODE; res_lib_cfg_killnode.header.error = CS_OK; api->ipc_response_send(conn, &res_lib_cfg_killnode, sizeof(res_lib_cfg_killnode)); LEAVE(); } static void message_handler_req_lib_cfg_tryshutdown ( void *conn, const void *msg) { struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn); const struct req_lib_cfg_tryshutdown *req_lib_cfg_tryshutdown = msg; struct qb_list_head *iter; ENTER(); if (req_lib_cfg_tryshutdown->flags == CFG_SHUTDOWN_FLAG_IMMEDIATE) { struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown; /* * Tell other nodes */ send_shutdown(); res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown); res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN; res_lib_cfg_tryshutdown.header.error = CS_OK; api->ipc_response_send(conn, &res_lib_cfg_tryshutdown, sizeof(res_lib_cfg_tryshutdown)); LEAVE(); return; } /* * Shutdown in progress, return an error */ if (shutdown_con) { struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown; res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown); res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN; res_lib_cfg_tryshutdown.header.error = CS_ERR_EXIST; api->ipc_response_send(conn, &res_lib_cfg_tryshutdown, sizeof(res_lib_cfg_tryshutdown)); LEAVE(); return; } ci->conn = conn; shutdown_con = (struct cfg_info *)api->ipc_private_data_get (conn); shutdown_flags = req_lib_cfg_tryshutdown->flags; shutdown_yes = 0; shutdown_no = 0; /* * Count the number of listeners */ shutdown_expected = 0; qb_list_for_each(iter, &trackers_list) { struct cfg_info *testci = qb_list_entry(iter, struct cfg_info, list); /* * It is assumed that we will allow shutdown */ if (testci != ci) { testci->shutdown_reply = SHUTDOWN_REPLY_UNKNOWN; shutdown_expected++; } } /* * If no-one is listening for events then we can just go down now */ if (shutdown_expected == 0) { struct res_lib_cfg_tryshutdown res_lib_cfg_tryshutdown; res_lib_cfg_tryshutdown.header.size = sizeof(struct res_lib_cfg_tryshutdown); res_lib_cfg_tryshutdown.header.id = MESSAGE_RES_CFG_TRYSHUTDOWN; res_lib_cfg_tryshutdown.header.error = CS_OK; /* * Tell originator that shutdown was confirmed */ api->ipc_response_send(conn, &res_lib_cfg_tryshutdown, sizeof(res_lib_cfg_tryshutdown)); send_shutdown(); LEAVE(); return; } else { unsigned int shutdown_timeout = DEFAULT_SHUTDOWN_TIMEOUT; /* * Look for a shutdown timeout in configuration map */ icmap_get_uint32("cfg.shutdown_timeout", &shutdown_timeout); /* * Start the timer. If we don't get a full set of replies before this goes * off we'll cancel the shutdown */ api->timer_add_duration((unsigned long long)shutdown_timeout*1000000000, NULL, shutdown_timer_fn, &shutdown_timer); /* * Tell the users we would like to shut down */ send_test_shutdown(NULL, conn, CS_OK); } /* * We don't sent a reply to the caller here. * We send it when we know if we can shut down or not */ LEAVE(); } static void message_handler_req_lib_cfg_replytoshutdown ( void *conn, const void *msg) { struct cfg_info *ci = (struct cfg_info *)api->ipc_private_data_get (conn); const struct req_lib_cfg_replytoshutdown *req_lib_cfg_replytoshutdown = msg; struct res_lib_cfg_replytoshutdown res_lib_cfg_replytoshutdown; int status = CS_OK; ENTER(); if (!shutdown_con) { status = CS_ERR_ACCESS; goto exit_fn; } if (req_lib_cfg_replytoshutdown->response) { shutdown_yes++; ci->shutdown_reply = SHUTDOWN_REPLY_YES; } else { shutdown_no++; ci->shutdown_reply = SHUTDOWN_REPLY_NO; } check_shutdown_status(); exit_fn: res_lib_cfg_replytoshutdown.header.error = status; res_lib_cfg_replytoshutdown.header.id = MESSAGE_RES_CFG_REPLYTOSHUTDOWN; res_lib_cfg_replytoshutdown.header.size = sizeof(res_lib_cfg_replytoshutdown); api->ipc_response_send(conn, &res_lib_cfg_replytoshutdown, sizeof(res_lib_cfg_replytoshutdown)); LEAVE(); } static void message_handler_req_lib_cfg_get_node_addrs (void *conn, const void *msg) { struct totem_ip_address node_ifs[INTERFACE_MAX]; unsigned int iface_ids[INTERFACE_MAX]; char buf[PIPE_BUF]; char **status; unsigned int num_interfaces = 0; struct sockaddr_storage *ss; int ret = CS_OK; int i; int live_addrs = 0; const struct req_lib_cfg_get_node_addrs *req_lib_cfg_get_node_addrs = msg; struct res_lib_cfg_get_node_addrs *res_lib_cfg_get_node_addrs = (struct res_lib_cfg_get_node_addrs *)buf; unsigned int nodeid = req_lib_cfg_get_node_addrs->nodeid; char *addr_buf; if (nodeid == 0) nodeid = api->totem_nodeid_get(); if (api->totem_ifaces_get(nodeid, iface_ids, node_ifs, INTERFACE_MAX, &status, &num_interfaces)) { ret = CS_ERR_EXIST; num_interfaces = 0; } res_lib_cfg_get_node_addrs->header.size = sizeof(struct res_lib_cfg_get_node_addrs) + (num_interfaces * TOTEMIP_ADDRLEN); res_lib_cfg_get_node_addrs->header.id = MESSAGE_RES_CFG_GET_NODE_ADDRS; res_lib_cfg_get_node_addrs->header.error = ret; if (num_interfaces) { res_lib_cfg_get_node_addrs->family = node_ifs[0].family; for (i = 0, addr_buf = (char *)res_lib_cfg_get_node_addrs->addrs; i < num_interfaces; i++) { ss = (struct sockaddr_storage *)&node_ifs[i].addr; if (ss->ss_family) { memcpy(addr_buf, node_ifs[i].addr, TOTEMIP_ADDRLEN); live_addrs++; addr_buf += TOTEMIP_ADDRLEN; } } res_lib_cfg_get_node_addrs->num_addrs = live_addrs; } else { res_lib_cfg_get_node_addrs->header.error = CS_ERR_NOT_EXIST; } api->ipc_response_send(conn, res_lib_cfg_get_node_addrs, res_lib_cfg_get_node_addrs->header.size); } static void message_handler_req_lib_cfg_local_get (void *conn, const void *msg) { struct res_lib_cfg_local_get res_lib_cfg_local_get; res_lib_cfg_local_get.header.size = sizeof(res_lib_cfg_local_get); res_lib_cfg_local_get.header.id = MESSAGE_RES_CFG_LOCAL_GET; res_lib_cfg_local_get.header.error = CS_OK; res_lib_cfg_local_get.local_nodeid = api->totem_nodeid_get (); api->ipc_response_send(conn, &res_lib_cfg_local_get, sizeof(res_lib_cfg_local_get)); } static void message_handler_req_lib_cfg_reload_config (void *conn, const void *msg) { struct req_exec_cfg_reload_config req_exec_cfg_reload_config; struct iovec iovec; ENTER(); req_exec_cfg_reload_config.header.size = sizeof (struct req_exec_cfg_reload_config); req_exec_cfg_reload_config.header.id = SERVICE_ID_MAKE (CFG_SERVICE, MESSAGE_REQ_EXEC_CFG_RELOAD_CONFIG); api->ipc_source_set (&req_exec_cfg_reload_config.source, conn); api->ipc_refcnt_inc(conn); iovec.iov_base = (char *)&req_exec_cfg_reload_config; iovec.iov_len = sizeof (struct req_exec_cfg_reload_config); assert (api->totem_mcast (&iovec, 1, TOTEM_SAFE) == 0); LEAVE(); } static void message_handler_req_lib_cfg_reopen_log_files (void *conn, const void *msg) { struct res_lib_cfg_reopen_log_files res_lib_cfg_reopen_log_files; cs_error_t res; ENTER(); log_printf(LOGSYS_LEVEL_DEBUG, "Reopening logging files\n"); res = logsys_reopen_log_files(); res_lib_cfg_reopen_log_files.header.size = sizeof(res_lib_cfg_reopen_log_files); res_lib_cfg_reopen_log_files.header.id = MESSAGE_RES_CFG_REOPEN_LOG_FILES; res_lib_cfg_reopen_log_files.header.error = res; api->ipc_response_send(conn, &res_lib_cfg_reopen_log_files, sizeof(res_lib_cfg_reopen_log_files)); LEAVE(); } diff --git a/exec/main.c b/exec/main.c index 8c3df79b..f1496eb9 100644 --- a/exec/main.c +++ b/exec/main.c @@ -1,1594 +1,1588 @@ /* * Copyright (c) 2002-2006 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /** * \mainpage Corosync * * This is the doxygen generated developer documentation for the Corosync * project. For more information about Corosync, please see the project * web site, corosync.org. * * \section license License * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LIBSYSTEMD #include #endif #include #include #include #include #include #include #include #include #include #include #include #include "quorum.h" #include "totemsrp.h" #include "logconfig.h" #include "totemconfig.h" #include "main.h" #include "sync.h" #include "timer.h" #include "util.h" #include "apidef.h" #include "service.h" #include "schedwrk.h" #include "ipcs_stats.h" #include "stats.h" #ifdef HAVE_SMALL_MEMORY_FOOTPRINT #define IPC_LOGSYS_SIZE 1024*64 #else #define IPC_LOGSYS_SIZE 8192*128 #endif /* * LibQB adds default "*" syslog filter so we have to set syslog_priority as low * as possible so filters applied later in _logsys_config_apply_per_file takes * effect. */ LOGSYS_DECLARE_SYSTEM ("corosync", LOGSYS_MODE_OUTPUT_STDERR | LOGSYS_MODE_OUTPUT_SYSLOG, LOG_DAEMON, LOG_EMERG); LOGSYS_DECLARE_SUBSYS ("MAIN"); #define SERVER_BACKLOG 5 static int sched_priority = 0; static unsigned int service_count = 32; static struct totem_logging_configuration totem_logging_configuration; static struct corosync_api_v1 *api = NULL; static int sync_in_process = 1; static qb_loop_t *corosync_poll_handle; struct sched_param global_sched_param; static corosync_timer_handle_t corosync_stats_timer_handle; static const char *corosync_lock_file = LOCALSTATEDIR"/run/corosync.pid"; static char corosync_config_file[PATH_MAX + 1] = COROSYSCONFDIR "/corosync.conf"; qb_loop_t *cs_poll_handle_get (void) { return (corosync_poll_handle); } int cs_poll_dispatch_add (qb_loop_t * handle, int fd, int events, void *data, int (*dispatch_fn) (int fd, int revents, void *data)) { return qb_loop_poll_add(handle, QB_LOOP_MED, fd, events, data, dispatch_fn); } int cs_poll_dispatch_delete(qb_loop_t * handle, int fd) { return qb_loop_poll_del(handle, fd); } void corosync_state_dump (void) { int i; for (i = 0; i < SERVICES_COUNT_MAX; i++) { if (corosync_service[i] && corosync_service[i]->exec_dump_fn) { corosync_service[i]->exec_dump_fn (); } } } const char *corosync_get_config_file(void) { return (corosync_config_file); } static void corosync_blackbox_write_to_file (void) { char fname[PATH_MAX]; char fdata_fname[PATH_MAX]; char time_str[PATH_MAX]; struct tm cur_time_tm; time_t cur_time_t; ssize_t res; cur_time_t = time(NULL); localtime_r(&cur_time_t, &cur_time_tm); strftime(time_str, PATH_MAX, "%Y-%m-%dT%H:%M:%S", &cur_time_tm); if (snprintf(fname, PATH_MAX, "%s/fdata-%s-%lld", get_state_dir(), time_str, (long long int)getpid()) >= PATH_MAX) { log_printf(LOGSYS_LEVEL_ERROR, "Can't snprintf blackbox file name"); return ; } if ((res = qb_log_blackbox_write_to_file(fname)) < 0) { LOGSYS_PERROR(-res, LOGSYS_LEVEL_ERROR, "Can't store blackbox file"); return ; } snprintf(fdata_fname, sizeof(fdata_fname), "%s/fdata", get_state_dir()); unlink(fdata_fname); if (symlink(fname, fdata_fname) == -1) { log_printf(LOGSYS_LEVEL_ERROR, "Can't create symlink to '%s' for corosync blackbox file '%s'", fname, fdata_fname); } } static void unlink_all_completed (void) { api->timer_delete (corosync_stats_timer_handle); qb_loop_stop (corosync_poll_handle); icmap_fini(); } void corosync_shutdown_request (void) { corosync_service_unlink_all (api, unlink_all_completed); } static int32_t sig_diag_handler (int num, void *data) { corosync_state_dump (); return 0; } static int32_t sig_exit_handler (int num, void *data) { log_printf(LOGSYS_LEVEL_NOTICE, "Node was shut down by a signal"); corosync_service_unlink_all (api, unlink_all_completed); return 0; } static void sigsegv_handler (int num) { (void)signal (num, SIG_DFL); corosync_blackbox_write_to_file (); qb_log_fini(); raise (num); } #define LOCALHOST_IP inet_addr("127.0.0.1") static void *corosync_group_handle; static struct totempg_group corosync_group = { .group = "a", .group_len = 1 }; static void serialize_lock (void) { } static void serialize_unlock (void) { } static void corosync_sync_completed (void) { log_printf (LOGSYS_LEVEL_NOTICE, "Completed service synchronization, ready to provide service."); sync_in_process = 0; cs_ipcs_sync_state_changed(sync_in_process); cs_ipc_allow_connections(1); /* * Inform totem to start using new message queue again */ totempg_trans_ack(); #ifdef HAVE_LIBSYSTEMD sd_notify (0, "READY=1"); #endif } static int corosync_sync_callbacks_retrieve ( int service_id, struct sync_callbacks *callbacks) { if (corosync_service[service_id] == NULL) { return (-1); } if (callbacks == NULL) { return (0); } callbacks->name = corosync_service[service_id]->name; callbacks->sync_init = corosync_service[service_id]->sync_init; callbacks->sync_process = corosync_service[service_id]->sync_process; callbacks->sync_activate = corosync_service[service_id]->sync_activate; callbacks->sync_abort = corosync_service[service_id]->sync_abort; return (0); } static struct memb_ring_id corosync_ring_id; static void member_object_joined (unsigned int nodeid) { char member_ip[ICMAP_KEYNAME_MAXLEN]; char member_join_count[ICMAP_KEYNAME_MAXLEN]; char member_status[ICMAP_KEYNAME_MAXLEN]; snprintf(member_ip, ICMAP_KEYNAME_MAXLEN, "runtime.members.%u.ip", nodeid); snprintf(member_join_count, ICMAP_KEYNAME_MAXLEN, "runtime.members.%u.join_count", nodeid); snprintf(member_status, ICMAP_KEYNAME_MAXLEN, "runtime.members.%u.status", nodeid); if (icmap_get(member_ip, NULL, NULL, NULL) == CS_OK) { icmap_inc(member_join_count); icmap_set_string(member_status, "joined"); } else { icmap_set_string(member_ip, (char*)api->totem_ifaces_print (nodeid)); icmap_set_uint32(member_join_count, 1); icmap_set_string(member_status, "joined"); } log_printf (LOGSYS_LEVEL_DEBUG, "Member joined: %s", api->totem_ifaces_print (nodeid)); } static void member_object_left (unsigned int nodeid) { char member_status[ICMAP_KEYNAME_MAXLEN]; snprintf(member_status, ICMAP_KEYNAME_MAXLEN, "runtime.members.%u.status", nodeid); icmap_set_string(member_status, "left"); log_printf (LOGSYS_LEVEL_DEBUG, "Member left: %s", api->totem_ifaces_print (nodeid)); } static void confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { int i; int abort_activate = 0; if (sync_in_process == 1) { abort_activate = 1; } sync_in_process = 1; cs_ipcs_sync_state_changed(sync_in_process); memcpy (&corosync_ring_id, ring_id, sizeof (struct memb_ring_id)); for (i = 0; i < left_list_entries; i++) { member_object_left (left_list[i]); } for (i = 0; i < joined_list_entries; i++) { member_object_joined (joined_list[i]); } /* * Call configuration change for all services */ for (i = 0; i < service_count; i++) { if (corosync_service[i] && corosync_service[i]->confchg_fn) { corosync_service[i]->confchg_fn (configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } } if (abort_activate) { sync_abort (); } if (configuration_type == TOTEM_CONFIGURATION_TRANSITIONAL) { sync_save_transitional (member_list, member_list_entries, ring_id); } if (configuration_type == TOTEM_CONFIGURATION_REGULAR) { sync_start (member_list, member_list_entries, ring_id); } } static void priv_drop (void) { return; /* TODO: we are still not dropping privs */ } static void corosync_tty_detach (void) { int devnull; /* * Disconnect from TTY if this is not a debug run */ switch (fork ()) { case -1: corosync_exit_error (COROSYNC_DONE_FORK); break; case 0: /* * child which is disconnected, run this process */ break; default: exit (0); break; } /* Create new session */ (void)setsid(); /* * Map stdin/out/err to /dev/null. */ devnull = open("/dev/null", O_RDWR); if (devnull == -1) { corosync_exit_error (COROSYNC_DONE_STD_TO_NULL_REDIR); } if (dup2(devnull, 0) < 0 || dup2(devnull, 1) < 0 || dup2(devnull, 2) < 0) { close(devnull); corosync_exit_error (COROSYNC_DONE_STD_TO_NULL_REDIR); } close(devnull); } static void corosync_mlockall (void) { int res; struct rlimit rlimit; rlimit.rlim_cur = RLIM_INFINITY; rlimit.rlim_max = RLIM_INFINITY; #ifndef RLIMIT_MEMLOCK #define RLIMIT_MEMLOCK RLIMIT_VMEM #endif setrlimit (RLIMIT_MEMLOCK, &rlimit); res = mlockall (MCL_CURRENT | MCL_FUTURE); if (res == -1) { LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING, "Could not lock memory of service to avoid page faults"); }; } static void corosync_totem_stats_updater (void *data) { totempg_stats_t * stats; uint32_t total_mtt_rx_token; uint32_t total_backlog_calc; uint32_t total_token_holdtime; int t, prev; int32_t token_count; const char *cstr; stats = api->totem_get_stats(); stats->srp->firewall_enabled_or_nic_failure = stats->srp->continuous_gather > MAX_NO_CONT_GATHER ? 1 : 0; if (stats->srp->continuous_gather > MAX_NO_CONT_GATHER || stats->srp->continuous_sendmsg_failures > MAX_NO_CONT_SENDMSG_FAILURES) { cstr = ""; if (stats->srp->continuous_sendmsg_failures > MAX_NO_CONT_SENDMSG_FAILURES) { cstr = "number of multicast sendmsg failures is above threshold"; } if (stats->srp->continuous_gather > MAX_NO_CONT_GATHER) { cstr = "totem is continuously in gather state"; } log_printf (LOGSYS_LEVEL_WARNING, "Totem is unable to form a cluster because of an " "operating system or network fault (reason: %s). The most common " "cause of this message is that the local firewall is " "configured improperly.", cstr); stats->srp->firewall_enabled_or_nic_failure = 1; } else { stats->srp->firewall_enabled_or_nic_failure = 0; } total_mtt_rx_token = 0; total_token_holdtime = 0; total_backlog_calc = 0; token_count = 0; t = stats->srp->latest_token; while (1) { if (t == 0) prev = TOTEM_TOKEN_STATS_MAX - 1; else prev = t - 1; if (prev == stats->srp->earliest_token) break; /* if tx == 0, then dropped token (not ours) */ if (stats->srp->token[t].tx != 0 || (stats->srp->token[t].rx - stats->srp->token[prev].rx) > 0 ) { total_mtt_rx_token += (stats->srp->token[t].rx - stats->srp->token[prev].rx); total_token_holdtime += (stats->srp->token[t].tx - stats->srp->token[t].rx); total_backlog_calc += stats->srp->token[t].backlog_calc; token_count++; } t = prev; } if (token_count) { stats->srp->mtt_rx_token = (total_mtt_rx_token / token_count); stats->srp->avg_token_workload = (total_token_holdtime / token_count); stats->srp->avg_backlog_calc = (total_backlog_calc / token_count); } stats->srp->time_since_token_last_received = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC - stats->srp->token[stats->srp->latest_token].rx; stats_trigger_trackers(); api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL, corosync_totem_stats_updater, &corosync_stats_timer_handle); } static void corosync_totem_stats_init (void) { /* start stats timer */ api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL, corosync_totem_stats_updater, &corosync_stats_timer_handle); } static void deliver_fn ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required) { const struct qb_ipc_request_header *header; int32_t service; int32_t fn_id; uint32_t id; header = msg; if (endian_conversion_required) { id = swab32 (header->id); } else { id = header->id; } /* * Call the proper executive handler */ service = id >> 16; fn_id = id & 0xffff; if (!corosync_service[service]) { return; } if (fn_id >= corosync_service[service]->exec_engine_count) { log_printf(LOGSYS_LEVEL_WARNING, "discarded unknown message %d for service %d (max id %d)", fn_id, service, corosync_service[service]->exec_engine_count); return; } icmap_fast_inc(service_stats_rx[service][fn_id]); if (endian_conversion_required) { assert(corosync_service[service]->exec_engine[fn_id].exec_endian_convert_fn != NULL); corosync_service[service]->exec_engine[fn_id].exec_endian_convert_fn ((void *)msg); } corosync_service[service]->exec_engine[fn_id].exec_handler_fn (msg, nodeid); } int main_mcast ( const struct iovec *iovec, unsigned int iov_len, unsigned int guarantee) { const struct qb_ipc_request_header *req = iovec->iov_base; int32_t service; int32_t fn_id; service = req->id >> 16; fn_id = req->id & 0xffff; if (corosync_service[service]) { icmap_fast_inc(service_stats_tx[service][fn_id]); } return (totempg_groups_mcast_joined (corosync_group_handle, iovec, iov_len, guarantee)); } static void corosync_ring_id_create_or_load ( struct memb_ring_id *memb_ring_id, unsigned int nodeid) { int fd; int res = 0; char filename[PATH_MAX]; snprintf (filename, sizeof(filename), "%s/ringid_%u", get_state_dir(), nodeid); fd = open (filename, O_RDONLY, 0700); /* * If file can be opened and read, read the ring id */ if (fd != -1) { res = read (fd, &memb_ring_id->seq, sizeof (uint64_t)); close (fd); } /* * If file could not be opened or read, create a new ring id */ if ((fd == -1) || (res != sizeof (uint64_t))) { memb_ring_id->seq = 0; umask(0); fd = open (filename, O_CREAT|O_RDWR, 0700); if (fd != -1) { res = write (fd, &memb_ring_id->seq, sizeof (uint64_t)); close (fd); if (res == -1) { LOGSYS_PERROR (errno, LOGSYS_LEVEL_ERROR, "Couldn't write ringid file '%s'", filename); corosync_exit_error (COROSYNC_DONE_STORE_RINGID); } } else { LOGSYS_PERROR (errno, LOGSYS_LEVEL_ERROR, "Couldn't create ringid file '%s'", filename); corosync_exit_error (COROSYNC_DONE_STORE_RINGID); } } memb_ring_id->rep = nodeid; } static void corosync_ring_id_store ( const struct memb_ring_id *memb_ring_id, unsigned int nodeid) { char filename[PATH_MAX]; int fd; int res; snprintf (filename, sizeof(filename), "%s/ringid_%u", get_state_dir(), nodeid); fd = open (filename, O_WRONLY, 0700); if (fd == -1) { fd = open (filename, O_CREAT|O_RDWR, 0700); } if (fd == -1) { LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "Couldn't store new ring id " CS_PRI_RING_ID_SEQ " to stable storage", memb_ring_id->seq); corosync_exit_error (COROSYNC_DONE_STORE_RINGID); } log_printf (LOGSYS_LEVEL_DEBUG, "Storing new sequence id for ring " CS_PRI_RING_ID_SEQ, memb_ring_id->seq); res = write (fd, &memb_ring_id->seq, sizeof(memb_ring_id->seq)); close (fd); if (res != sizeof(memb_ring_id->seq)) { LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "Couldn't store new ring id " CS_PRI_RING_ID_SEQ " to stable storage", memb_ring_id->seq); corosync_exit_error (COROSYNC_DONE_STORE_RINGID); } } static qb_loop_timer_handle recheck_the_q_level_timer; void corosync_recheck_the_q_level(void *data) { totempg_check_q_level(corosync_group_handle); if (cs_ipcs_q_level_get() == TOTEM_Q_LEVEL_CRITICAL) { qb_loop_timer_add(cs_poll_handle_get(), QB_LOOP_MED, 1*QB_TIME_NS_IN_MSEC, NULL, corosync_recheck_the_q_level, &recheck_the_q_level_timer); } } struct sending_allowed_private_data_struct { int reserved_msgs; }; int corosync_sending_allowed ( unsigned int service, unsigned int id, const void *msg, void *sending_allowed_private_data) { struct sending_allowed_private_data_struct *pd = (struct sending_allowed_private_data_struct *)sending_allowed_private_data; struct iovec reserve_iovec; struct qb_ipc_request_header *header = (struct qb_ipc_request_header *)msg; int sending_allowed; reserve_iovec.iov_base = (char *)header; reserve_iovec.iov_len = header->size; pd->reserved_msgs = totempg_groups_joined_reserve ( corosync_group_handle, &reserve_iovec, 1); if (pd->reserved_msgs == -1) { return -EINVAL; } /* Message ID out of range */ if (id >= corosync_service[service]->lib_engine_count) { return -EINVAL; } sending_allowed = QB_FALSE; if (corosync_quorum_is_quorate() == 1 || corosync_service[service]->allow_inquorate == CS_LIB_ALLOW_INQUORATE) { // we are quorate // now check flow control if (corosync_service[service]->lib_engine[id].flow_control == CS_LIB_FLOW_CONTROL_NOT_REQUIRED) { sending_allowed = QB_TRUE; } else if (pd->reserved_msgs && sync_in_process == 0) { sending_allowed = QB_TRUE; } else if (pd->reserved_msgs == 0) { return -ENOBUFS; } else /* (sync_in_process) */ { return -EINPROGRESS; } } else { return -EHOSTUNREACH; } return (sending_allowed); } void corosync_sending_allowed_release (void *sending_allowed_private_data) { struct sending_allowed_private_data_struct *pd = (struct sending_allowed_private_data_struct *)sending_allowed_private_data; if (pd->reserved_msgs == -1) { return; } totempg_groups_joined_release (pd->reserved_msgs); } int message_source_is_local (const mar_message_source_t *source) { int ret = 0; assert (source != NULL); if (source->nodeid == totempg_my_nodeid_get ()) { ret = 1; } return ret; } void message_source_set ( mar_message_source_t *source, void *conn) { assert ((source != NULL) && (conn != NULL)); memset (source, 0, sizeof (mar_message_source_t)); source->nodeid = totempg_my_nodeid_get (); source->conn = conn; } struct scheduler_pause_timeout_data { struct totem_config *totem_config; qb_loop_timer_handle handle; unsigned long long tv_prev; unsigned long long max_tv_diff; }; static void timer_function_scheduler_timeout (void *data) { struct scheduler_pause_timeout_data *timeout_data = (struct scheduler_pause_timeout_data *)data; unsigned long long tv_current; unsigned long long tv_diff; uint64_t schedmiss_event_tstamp; tv_current = qb_util_nano_current_get (); if (timeout_data->tv_prev == 0) { /* * Initial call -> just pretent everything is ok */ timeout_data->tv_prev = tv_current; timeout_data->max_tv_diff = 0; } tv_diff = tv_current - timeout_data->tv_prev; timeout_data->tv_prev = tv_current; if (tv_diff > timeout_data->max_tv_diff) { schedmiss_event_tstamp = qb_util_nano_from_epoch_get() / QB_TIME_NS_IN_MSEC; log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled (@%" PRIu64 ") for %0.4f ms " "(threshold is %0.4f ms). Consider token timeout increase.", schedmiss_event_tstamp, (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC); stats_add_schedmiss_event(schedmiss_event_tstamp, (float)tv_diff / QB_TIME_NS_IN_MSEC); } /* * Set next threshold, because token_timeout can change */ timeout_data->max_tv_diff = timeout_data->totem_config->token_timeout * QB_TIME_NS_IN_MSEC * 0.8; qb_loop_timer_add (corosync_poll_handle, QB_LOOP_MED, timeout_data->totem_config->token_timeout * QB_TIME_NS_IN_MSEC / 3, timeout_data, timer_function_scheduler_timeout, &timeout_data->handle); } static int corosync_set_rr_scheduler (void) { int ret_val = 0; #if defined(HAVE_PTHREAD_SETSCHEDPARAM) && defined(HAVE_SCHED_GET_PRIORITY_MAX) && defined(HAVE_SCHED_SETSCHEDULER) int res; sched_priority = sched_get_priority_max (SCHED_RR); if (sched_priority != -1) { global_sched_param.sched_priority = sched_priority; res = sched_setscheduler (0, SCHED_RR, &global_sched_param); if (res == -1) { LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "Could not set SCHED_RR at priority %d", global_sched_param.sched_priority); global_sched_param.sched_priority = 0; #ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET qb_log_thread_priority_set (SCHED_OTHER, 0); #endif ret_val = -1; } else { /* * Turn on SCHED_RR in logsys system */ #ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET res = qb_log_thread_priority_set (SCHED_RR, sched_priority); #else res = -1; #endif if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Could not set logsys thread priority." " Can't continue because of priority inversions."); corosync_exit_error (COROSYNC_DONE_LOGSETUP); } } } else { LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING, "Could not get maximum scheduler priority"); sched_priority = 0; ret_val = -1; } #else log_printf(LOGSYS_LEVEL_WARNING, "The Platform is missing process priority setting features. Leaving at default."); ret_val = -1; #endif return (ret_val); } /* The basename man page contains scary warnings about thread-safety and portability, hence this */ static const char *corosync_basename(const char *file_name) { char *base; base = strrchr (file_name, '/'); if (base) { return base + 1; } return file_name; } static void _logsys_log_printf(int level, int subsys, const char *function_name, const char *file_name, int file_line, const char *format, ...) __attribute__((format(printf, 6, 7))); static void _logsys_log_printf(int level, int subsys, const char *function_name, const char *file_name, int file_line, const char *format, ...) { va_list ap; va_start(ap, format); qb_log_from_external_source_va(function_name, corosync_basename(file_name), format, level, file_line, subsys, ap); va_end(ap); } static void fplay_key_change_notify_fn ( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { if (strcmp(key_name, "runtime.blackbox.dump_flight_data") == 0) { fprintf(stderr,"Writetofile\n"); corosync_blackbox_write_to_file (); } if (strcmp(key_name, "runtime.blackbox.dump_state") == 0) { fprintf(stderr,"statefump\n"); corosync_state_dump (); } } static void corosync_fplay_control_init (void) { icmap_track_t track = NULL; icmap_set_string("runtime.blackbox.dump_flight_data", "no"); icmap_set_string("runtime.blackbox.dump_state", "no"); icmap_track_add("runtime.blackbox.dump_flight_data", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY, fplay_key_change_notify_fn, NULL, &track); icmap_track_add("runtime.blackbox.dump_state", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY, fplay_key_change_notify_fn, NULL, &track); } static void force_gather_notify_fn( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { char *key_val; if (icmap_get_string(key_name, &key_val) == CS_OK && strcmp(key_val, "no") == 0) goto out; icmap_set_string("runtime.force_gather", "no"); if (strcmp(key_name, "runtime.force_gather") == 0) { log_printf(LOGSYS_LEVEL_ERROR, "Forcing into GATHER state\n"); totempg_force_gather(); } out: free(key_val); } static void corosync_force_gather_init (void) { icmap_track_t track = NULL; icmap_set_string("runtime.force_gather", "no"); icmap_track_add("runtime.force_gather", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY, force_gather_notify_fn, NULL, &track); } /* * Set RO flag for keys, which ether doesn't make sense to change by user (statistic) * or which when changed are not reflected by runtime (totem.crypto_cipher, ...). * * Also some RO keys cannot be determined in this stage, so they are set later in * other functions (like nodelist.local_node_pos, ...) */ static void set_icmap_ro_keys_flag (void) { /* * Set RO flag for all keys of internal configuration and runtime statistics */ icmap_set_ro_access("internal_configuration.", CS_TRUE, CS_TRUE); icmap_set_ro_access("runtime.services.", CS_TRUE, CS_TRUE); icmap_set_ro_access("runtime.config.", CS_TRUE, CS_TRUE); icmap_set_ro_access("runtime.totem.", CS_TRUE, CS_TRUE); icmap_set_ro_access("uidgid.config.", CS_TRUE, CS_TRUE); icmap_set_ro_access("system.", CS_TRUE, CS_TRUE); icmap_set_ro_access("nodelist.", CS_TRUE, CS_TRUE); /* * Set RO flag for constrete keys of configuration which can't be changed * during runtime */ icmap_set_ro_access("totem.crypto_cipher", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.crypto_hash", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.keyfile", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.key", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.secauth", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.ip_version", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.rrp_mode", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.transport", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.cluster_name", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.netmtu", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.threads", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.version", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.nodeid", CS_FALSE, CS_TRUE); icmap_set_ro_access("totem.clear_node_high_bit", CS_FALSE, CS_TRUE); icmap_set_ro_access("config.reload_in_progress", CS_FALSE, CS_TRUE); icmap_set_ro_access("config.totemconfig_reload_in_progress", CS_FALSE, CS_TRUE); } static void main_service_ready (void) { int res; /* * This must occur after totempg is initialized because "this_ip" must be set */ res = corosync_service_defaults_link_and_init (api); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Could not initialize default services"); corosync_exit_error (COROSYNC_DONE_INIT_SERVICES); } cs_ipcs_init(); corosync_totem_stats_init (); corosync_fplay_control_init (); corosync_force_gather_init (); sync_init ( corosync_sync_callbacks_retrieve, corosync_sync_completed); } static enum e_corosync_done corosync_flock (const char *lockfile, pid_t pid) { struct flock lock; enum e_corosync_done err; char pid_s[17]; int fd_flag; int lf; err = COROSYNC_DONE_EXIT; lf = open (lockfile, O_WRONLY | O_CREAT, 0640); if (lf == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't create lock file."); return (COROSYNC_DONE_ACQUIRE_LOCK); } retry_fcntl: lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 0; if (fcntl (lf, F_SETLK, &lock) == -1) { switch (errno) { case EINTR: goto retry_fcntl; break; case EAGAIN: case EACCES: log_printf (LOGSYS_LEVEL_ERROR, "Another Corosync instance is already running."); err = COROSYNC_DONE_ALREADY_RUNNING; goto error_close; break; default: log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't acquire lock. Error was %s", strerror(errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close; break; } } if (ftruncate (lf, 0) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't truncate lock file. Error was %s", strerror (errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close_unlink; } memset (pid_s, 0, sizeof (pid_s)); snprintf (pid_s, sizeof (pid_s) - 1, "%u\n", pid); retry_write: if (write (lf, pid_s, strlen (pid_s)) != strlen (pid_s)) { if (errno == EINTR) { goto retry_write; } else { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't write pid to lock file. " "Error was %s", strerror (errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close_unlink; } } if ((fd_flag = fcntl (lf, F_GETFD, 0)) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't get close-on-exec flag from lock file. " "Error was %s", strerror (errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close_unlink; } fd_flag |= FD_CLOEXEC; if (fcntl (lf, F_SETFD, fd_flag) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't set close-on-exec flag to lock file. " "Error was %s", strerror (errno)); err = COROSYNC_DONE_ACQUIRE_LOCK; goto error_close_unlink; } return (err); error_close_unlink: unlink (lockfile); error_close: close (lf); return (err); } static int corosync_move_to_root_cgroup(void) { FILE *f; int res = -1; /* * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now * using systemd and systemd uses hardcoded path of cgroup mount point. * * This feature is expected to be removed as soon as systemd gets support * for managing RT configuration. */ f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt"); if (f == NULL) { log_printf(LOGSYS_LEVEL_DEBUG, "cpu.rt_runtime_us doesn't exists -> " "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); res = 0; goto exit_res; } (void)fclose(f); f = fopen("/sys/fs/cgroup/cpu/tasks", "w"); if (f == NULL) { log_printf(LOGSYS_LEVEL_WARNING, "Can't open cgroups tasks file for writing"); goto exit_res; } if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) { log_printf(LOGSYS_LEVEL_WARNING, "Can't write corosync pid into cgroups tasks file"); goto close_and_exit_res; } close_and_exit_res: if (fclose(f) != 0) { log_printf(LOGSYS_LEVEL_WARNING, "Can't close cgroups tasks file"); goto exit_res; } exit_res: return (res); } int main (int argc, char **argv, char **envp) { const char *error_string; struct totem_config totem_config; int res, ch; int background, sched_rr, prio, testonly, move_to_root_cgroup; struct stat stat_out; enum e_corosync_done flock_err; uint64_t totem_config_warnings; struct scheduler_pause_timeout_data scheduler_pause_timeout_data; long int tmpli; char *ep; char *tmp_str; int log_subsys_id_totem; /* default configuration */ background = 1; testonly = 0; while ((ch = getopt (argc, argv, "c:ftv")) != EOF) { switch (ch) { case 'c': res = snprintf(corosync_config_file, sizeof(corosync_config_file), "%s", optarg); if (res >= sizeof(corosync_config_file)) { fprintf (stderr, "Config file path too long.\n"); syslog (LOGSYS_LEVEL_ERROR, "Config file path too long."); logsys_system_fini(); return EXIT_FAILURE; } break; case 'f': background = 0; break; case 't': testonly = 1; break; case 'v': printf ("Corosync Cluster Engine, version '%s'\n", VERSION); printf ("Copyright (c) 2006-2018 Red Hat, Inc.\n"); logsys_system_fini(); return EXIT_SUCCESS; break; default: fprintf(stderr, \ "usage:\n"\ " -c : Corosync config file path.\n"\ " -f : Start application in foreground.\n"\ " -t : Test configuration and exit.\n"\ " -v : Display version and SVN revision of Corosync and exit.\n"); logsys_system_fini(); return EXIT_FAILURE; } } /* * Other signals are registered later via qb_loop_signal_add */ (void)signal (SIGSEGV, sigsegv_handler); (void)signal (SIGABRT, sigsegv_handler); #if MSG_NOSIGNAL != 0 (void)signal (SIGPIPE, SIG_IGN); #endif if (icmap_init() != CS_OK) { fprintf (stderr, "Corosync Executive couldn't initialize configuration component.\n"); syslog (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't initialize configuration component."); corosync_exit_error (COROSYNC_DONE_ICMAP); } set_icmap_ro_keys_flag(); /* * Initialize the corosync_api_v1 definition */ api = apidef_get (); res = coroparse_configparse(icmap_get_global_map(), &error_string); if (res == -1) { /* * Logsys can't log properly at this early stage, and we need to get this message out * */ fprintf (stderr, "%s\n", error_string); syslog (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } if (stats_map_init(api) != CS_OK) { fprintf (stderr, "Corosync Executive couldn't initialize statistics component.\n"); syslog (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't initialize statistics component."); corosync_exit_error (COROSYNC_DONE_STATS); } res = corosync_log_config_read (&error_string); if (res == -1) { /* * if we are here, we _must_ flush the logsys queue * and try to inform that we couldn't read the config. * this is a desperate attempt before certain death * and there is no guarantee that we can print to stderr * nor that logsys is sending the messages where we expect. */ log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); fprintf(stderr, "%s", error_string); syslog (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_LOGCONFIGREAD); } if (!testonly) { log_printf (LOGSYS_LEVEL_NOTICE, "Corosync Cluster Engine %s starting up", VERSION); log_printf (LOGSYS_LEVEL_INFO, "Corosync built-in features:" PACKAGE_FEATURES ""); } /* * Create totem logsys subsys before totem_config_read so log functions can be used */ log_subsys_id_totem = _logsys_subsys_create("TOTEM", "totem," "totemip.c,totemconfig.c,totemcrypto.c,totemsrp.c," "totempg.c,totemudp.c,totemudpu.c,totemnet.c,totemknet.c"); /* * Make sure required directory is present */ res = stat (get_state_dir(), &stat_out); if ((res == -1) || (res == 0 && !S_ISDIR(stat_out.st_mode))) { log_printf (LOGSYS_LEVEL_ERROR, "State directory %s not present. Please create it.", get_state_dir()); corosync_exit_error (COROSYNC_DONE_DIR_NOT_PRESENT); } res = chdir(get_state_dir()); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "Cannot chdir to state directory %s. " "Please make sure it has correct context and rights.", get_state_dir()); corosync_exit_error (COROSYNC_DONE_DIR_NOT_PRESENT); } res = totem_config_read (&totem_config, &error_string, &totem_config_warnings); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } if (totem_config_warnings & TOTEM_CONFIG_WARNING_MEMBERS_IGNORED) { log_printf (LOGSYS_LEVEL_WARNING, "member section is used together with nodelist. Members ignored."); } if (totem_config_warnings & TOTEM_CONFIG_WARNING_MEMBERS_DEPRECATED) { log_printf (LOGSYS_LEVEL_WARNING, "member section is deprecated."); } if (totem_config_warnings & TOTEM_CONFIG_WARNING_TOTEM_NODEID_IGNORED) { log_printf (LOGSYS_LEVEL_WARNING, "nodeid appears both in totem section and nodelist. Nodelist one is used."); } if (totem_config_warnings & TOTEM_CONFIG_BINDNETADDR_NODELIST_SET) { log_printf (LOGSYS_LEVEL_WARNING, "interface section bindnetaddr is used together with nodelist. " "Nodelist one is going to be used."); } if (totem_config_warnings != 0) { log_printf (LOGSYS_LEVEL_WARNING, "Please migrate config file to nodelist."); } - res = totem_config_keyread (&totem_config, &error_string); - if (res == -1) { - log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); - corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); - } - res = totem_config_validate (&totem_config, &error_string); if (res == -1) { log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } if (testonly) { corosync_exit_error (COROSYNC_DONE_EXIT); } move_to_root_cgroup = 1; if (icmap_get_string("system.move_to_root_cgroup", &tmp_str) == CS_OK) { if (strcmp(tmp_str, "yes") != 0) { move_to_root_cgroup = 0; } free(tmp_str); } /* * Try to move corosync into root cpu cgroup. Failure is not fatal and * error is deliberately ignored. */ if (move_to_root_cgroup) { (void)corosync_move_to_root_cgroup(); } sched_rr = 1; if (icmap_get_string("system.sched_rr", &tmp_str) == CS_OK) { if (strcmp(tmp_str, "yes") != 0) { sched_rr = 0; } free(tmp_str); } prio = 0; if (icmap_get_string("system.priority", &tmp_str) == CS_OK) { if (strcmp(tmp_str, "max") == 0) { prio = INT_MIN; } else if (strcmp(tmp_str, "min") == 0) { prio = INT_MAX; } else { errno = 0; tmpli = strtol(tmp_str, &ep, 10); if (errno != 0 || *ep != '\0' || tmpli > INT_MAX || tmpli < INT_MIN) { log_printf (LOGSYS_LEVEL_ERROR, "Priority value %s is invalid", tmp_str); corosync_exit_error (COROSYNC_DONE_MAINCONFIGREAD); } prio = tmpli; } free(tmp_str); } /* * Set round robin realtime scheduling with priority 99 */ if (sched_rr) { if (corosync_set_rr_scheduler () != 0) { prio = INT_MIN; } else { prio = 0; } } if (prio != 0) { if (setpriority(PRIO_PGRP, 0, prio) != 0) { LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "Could not set priority %d", prio); } } totem_config.totem_memb_ring_id_create_or_load = corosync_ring_id_create_or_load; totem_config.totem_memb_ring_id_store = corosync_ring_id_store; totem_config.totem_logging_configuration = totem_logging_configuration; totem_config.totem_logging_configuration.log_subsys_id = log_subsys_id_totem; totem_config.totem_logging_configuration.log_level_security = LOGSYS_LEVEL_WARNING; totem_config.totem_logging_configuration.log_level_error = LOGSYS_LEVEL_ERROR; totem_config.totem_logging_configuration.log_level_warning = LOGSYS_LEVEL_WARNING; totem_config.totem_logging_configuration.log_level_notice = LOGSYS_LEVEL_NOTICE; totem_config.totem_logging_configuration.log_level_debug = LOGSYS_LEVEL_DEBUG; totem_config.totem_logging_configuration.log_level_trace = LOGSYS_LEVEL_TRACE; totem_config.totem_logging_configuration.log_printf = _logsys_log_printf; logsys_config_apply(); /* * Now we are fully initialized. */ if (background) { logsys_blackbox_prefork(); corosync_tty_detach (); logsys_blackbox_postfork(); log_printf (LOGSYS_LEVEL_DEBUG, "Corosync TTY detached"); } /* * Lock all memory to avoid page faults which may interrupt * application healthchecking */ corosync_mlockall (); corosync_poll_handle = qb_loop_create (); memset(&scheduler_pause_timeout_data, 0, sizeof(scheduler_pause_timeout_data)); scheduler_pause_timeout_data.totem_config = &totem_config; timer_function_scheduler_timeout (&scheduler_pause_timeout_data); qb_loop_signal_add(corosync_poll_handle, QB_LOOP_LOW, SIGUSR2, NULL, sig_diag_handler, NULL); qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH, SIGINT, NULL, sig_exit_handler, NULL); qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH, SIGQUIT, NULL, sig_exit_handler, NULL); qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH, SIGTERM, NULL, sig_exit_handler, NULL); if (logsys_thread_start() != 0) { log_printf (LOGSYS_LEVEL_ERROR, "Can't initialize log thread"); corosync_exit_error (COROSYNC_DONE_LOGCONFIGREAD); } if ((flock_err = corosync_flock (corosync_lock_file, getpid ())) != COROSYNC_DONE_EXIT) { corosync_exit_error (flock_err); } /* * if totempg_initialize doesn't have root priveleges, it cannot * bind to a specific interface. This only matters if * there is more then one interface in a system, so * in this case, only a warning is printed */ /* * Join multicast group and setup delivery * and configuration change functions */ if (totempg_initialize ( corosync_poll_handle, &totem_config) != 0) { log_printf (LOGSYS_LEVEL_ERROR, "Can't initialize TOTEM layer"); corosync_exit_error (COROSYNC_DONE_FATAL_ERR); } totempg_service_ready_register ( main_service_ready); totempg_groups_initialize ( &corosync_group_handle, deliver_fn, confchg_fn); totempg_groups_join ( corosync_group_handle, &corosync_group, 1); /* * Drop root privleges to user 'corosync' * TODO: Don't really need full root capabilities; * needed capabilities are: * CAP_NET_RAW (bindtodevice) * CAP_SYS_NICE (setscheduler) * CAP_IPC_LOCK (mlockall) */ priv_drop (); schedwrk_init ( serialize_lock, serialize_unlock); /* * Start main processing loop */ qb_loop_run (corosync_poll_handle); /* * Exit was requested */ totempg_finalize (); /* * free the loop resources */ qb_loop_destroy (corosync_poll_handle); /* * free up the icmap */ /* * Remove pid lock file */ unlink (corosync_lock_file); corosync_exit_error (COROSYNC_DONE_EXIT); return EXIT_SUCCESS; } diff --git a/exec/totemconfig.c b/exec/totemconfig.c index fadb9620..3927da9c 100644 --- a/exec/totemconfig.c +++ b/exec/totemconfig.c @@ -1,2343 +1,2378 @@ /* * Copyright (c) 2002-2005 MontaVista Software, Inc. * Copyright (c) 2006-2019 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * Jan Friesse (jfriesse@redhat.com) * Chrissie Caulfield (ccaulfie@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" #include "totemconfig.h" #define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST 4 #define TOKEN_TIMEOUT 1000 #define TOKEN_WARNING 75 #define TOKEN_COEFFICIENT 650 #define JOIN_TIMEOUT 50 #define MERGE_TIMEOUT 200 #define DOWNCHECK_TIMEOUT 1000 #define FAIL_TO_RECV_CONST 2500 #define SEQNO_UNCHANGED_CONST 30 #define MINIMUM_TIMEOUT (int)(1000/HZ)*3 #define MINIMUM_TIMEOUT_HOLD (int)(MINIMUM_TIMEOUT * 0.8 - (1000/HZ)) #define MAX_NETWORK_DELAY 50 #define WINDOW_SIZE 50 #define MAX_MESSAGES 17 #define MISS_COUNT_CONST 5 #define BLOCK_UNLISTED_IPS 1 /* Currently all but PONG_COUNT match the defaults in libknet.h */ #define KNET_PING_INTERVAL 1000 #define KNET_PING_TIMEOUT 2000 #define KNET_PING_PRECISION 2048 #define KNET_PONG_COUNT 2 #define KNET_PMTUD_INTERVAL 30 #define KNET_DEFAULT_TRANSPORT KNET_TRANSPORT_UDP #define DEFAULT_PORT 5405 static char error_string_response[768]; static void add_totem_config_notification(struct totem_config *totem_config); static void *totem_get_param_by_name(struct totem_config *totem_config, const char *param_name) { if (strcmp(param_name, "totem.token") == 0) return &totem_config->token_timeout; if (strcmp(param_name, "totem.token_warning") == 0) return &totem_config->token_warning; if (strcmp(param_name, "totem.token_retransmit") == 0) return &totem_config->token_retransmit_timeout; if (strcmp(param_name, "totem.hold") == 0) return &totem_config->token_hold_timeout; if (strcmp(param_name, "totem.token_retransmits_before_loss_const") == 0) return &totem_config->token_retransmits_before_loss_const; if (strcmp(param_name, "totem.join") == 0) return &totem_config->join_timeout; if (strcmp(param_name, "totem.send_join") == 0) return &totem_config->send_join_timeout; if (strcmp(param_name, "totem.consensus") == 0) return &totem_config->consensus_timeout; if (strcmp(param_name, "totem.merge") == 0) return &totem_config->merge_timeout; if (strcmp(param_name, "totem.downcheck") == 0) return &totem_config->downcheck_timeout; if (strcmp(param_name, "totem.fail_recv_const") == 0) return &totem_config->fail_to_recv_const; if (strcmp(param_name, "totem.seqno_unchanged_const") == 0) return &totem_config->seqno_unchanged_const; if (strcmp(param_name, "totem.heartbeat_failures_allowed") == 0) return &totem_config->heartbeat_failures_allowed; if (strcmp(param_name, "totem.max_network_delay") == 0) return &totem_config->max_network_delay; if (strcmp(param_name, "totem.window_size") == 0) return &totem_config->window_size; if (strcmp(param_name, "totem.max_messages") == 0) return &totem_config->max_messages; if (strcmp(param_name, "totem.miss_count_const") == 0) return &totem_config->miss_count_const; if (strcmp(param_name, "totem.knet_pmtud_interval") == 0) return &totem_config->knet_pmtud_interval; if (strcmp(param_name, "totem.knet_compression_threshold") == 0) return &totem_config->knet_compression_threshold; if (strcmp(param_name, "totem.knet_compression_level") == 0) return &totem_config->knet_compression_level; if (strcmp(param_name, "totem.knet_compression_model") == 0) return totem_config->knet_compression_model; if (strcmp(param_name, "totem.block_unlisted_ips") == 0) return &totem_config->block_unlisted_ips; return NULL; } /* * Read key_name from icmap. If key is not found or key_name == delete_key or if allow_zero is false * and readed value is zero, default value is used and stored into totem_config. */ static void totem_volatile_config_set_uint32_value (struct totem_config *totem_config, icmap_map_t map, const char *key_name, const char *deleted_key, unsigned int default_value, int allow_zero_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; if (icmap_get_uint32_r(map, key_name, totem_get_param_by_name(totem_config, key_name)) != CS_OK || (deleted_key != NULL && strcmp(deleted_key, key_name) == 0) || (!allow_zero_value && *(uint32_t *)totem_get_param_by_name(totem_config, key_name) == 0)) { *(uint32_t *)totem_get_param_by_name(totem_config, key_name) = default_value; } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); icmap_set_uint32_r(map, runtime_key_name, *(uint32_t *)totem_get_param_by_name(totem_config, key_name)); } static void totem_volatile_config_set_int32_value (struct totem_config *totem_config, icmap_map_t map, const char *key_name, const char *deleted_key, int default_value, int allow_zero_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; if (icmap_get_int32_r(map, key_name, totem_get_param_by_name(totem_config, key_name)) != CS_OK || (deleted_key != NULL && strcmp(deleted_key, key_name) == 0) || (!allow_zero_value && *(int32_t *)totem_get_param_by_name(totem_config, key_name) == 0)) { *(int32_t *)totem_get_param_by_name(totem_config, key_name) = default_value; } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); icmap_set_int32_r(map, runtime_key_name, *(int32_t *)totem_get_param_by_name(totem_config, key_name)); } static void totem_volatile_config_set_string_value (struct totem_config *totem_config, icmap_map_t map, const char *key_name, const char *deleted_key, const char *default_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; int res; char *new_config_value; const void *config_value; config_value = totem_get_param_by_name(totem_config, key_name); res = icmap_get_string_r(map, key_name, (char **)&new_config_value); if (res != CS_OK || (deleted_key != NULL && strcmp(deleted_key, key_name) == 0)) { /* Slightly pointless use of strncpy but it keeps coverity happy */ strncpy((char *)config_value, default_value, CONFIG_STRING_LEN_MAX); } else { strncpy((char *)config_value, new_config_value, CONFIG_STRING_LEN_MAX); } if (res == CS_OK) { free(new_config_value); } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); (void)icmap_set_string_r(map, runtime_key_name, (char *)config_value); } /* * Read string value stored in key_name from icmap, use it as a boolean (yes/no) type, convert it * to integer value (1/0) and store into totem_config. * * If key is not found or key_name == delete_key default value is used * and stored into totem_config. */ static void totem_volatile_config_set_boolean_value (struct totem_config *totem_config, icmap_map_t map, const char *key_name, const char *deleted_key, unsigned int default_value) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; char *str; int val; str = NULL; val = default_value; if ((deleted_key != NULL && strcmp(deleted_key, key_name) == 0) || (icmap_get_string_r(map, key_name, &str) != CS_OK)) { /* * Do nothing. str is NULL (icmap_get_string ether not called or * not changed str). */ } else { if (strcmp(str, "yes") == 0) { val = 1; } else if (strcmp(str, "no") == 0) { val = 0; } free(str); } /* * Store totem_config value to cmap runtime section */ if (strlen("runtime.config.") + strlen(key_name) >= ICMAP_KEYNAME_MAXLEN) { /* * This shouldn't happen */ return ; } strcpy(runtime_key_name, "runtime.config."); strcat(runtime_key_name, key_name); *(uint32_t *)totem_get_param_by_name(totem_config, key_name) = val; icmap_set_uint32_r(map, runtime_key_name, val); } /* * Read and validate config values from cmap and store them into totem_config. If key doesn't exists, * default value is stored. deleted_key is name of key beeing processed by delete operation * from cmap. It is considered as non existing even if it can be read. Can be NULL. */ void totem_volatile_config_read (struct totem_config *totem_config, icmap_map_t temp_map, const char *deleted_key) { uint32_t u32; totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.token_retransmits_before_loss_const", deleted_key, TOKEN_RETRANSMITS_BEFORE_LOSS_CONST, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.token", deleted_key, TOKEN_TIMEOUT, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.token_warning", deleted_key, TOKEN_WARNING, 1); if (totem_config->interfaces[0].member_count > 2) { u32 = TOKEN_COEFFICIENT; icmap_get_uint32_r(temp_map, "totem.token_coefficient", &u32); totem_config->token_timeout += (totem_config->interfaces[0].member_count - 2) * u32; /* * Store totem_config value to cmap runtime section */ icmap_set_uint32_r(temp_map, "runtime.config.totem.token", totem_config->token_timeout); } totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.max_network_delay", deleted_key, MAX_NETWORK_DELAY, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.window_size", deleted_key, WINDOW_SIZE, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.max_messages", deleted_key, MAX_MESSAGES, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.miss_count_const", deleted_key, MISS_COUNT_CONST, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.knet_pmtud_interval", deleted_key, KNET_PMTUD_INTERVAL, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.token_retransmit", deleted_key, (int)(totem_config->token_timeout / (totem_config->token_retransmits_before_loss_const + 0.2)), 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.hold", deleted_key, (int)(totem_config->token_retransmit_timeout * 0.8 - (1000/HZ)), 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.join", deleted_key, JOIN_TIMEOUT, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.consensus", deleted_key, (int)(float)(1.2 * totem_config->token_timeout), 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.merge", deleted_key, MERGE_TIMEOUT, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.downcheck", deleted_key, DOWNCHECK_TIMEOUT, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.fail_recv_const", deleted_key, FAIL_TO_RECV_CONST, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.seqno_unchanged_const", deleted_key, SEQNO_UNCHANGED_CONST, 0); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.send_join", deleted_key, 0, 1); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.heartbeat_failures_allowed", deleted_key, 0, 1); totem_volatile_config_set_uint32_value(totem_config, temp_map, "totem.knet_compression_threshold", deleted_key, 0, 1); totem_volatile_config_set_int32_value(totem_config, temp_map, "totem.knet_compression_level", deleted_key, 0, 1); totem_volatile_config_set_string_value(totem_config, temp_map, "totem.knet_compression_model", deleted_key, "none"); totem_volatile_config_set_boolean_value(totem_config, temp_map, "totem.block_unlisted_ips", deleted_key, BLOCK_UNLISTED_IPS); } int totem_volatile_config_validate ( struct totem_config *totem_config, icmap_map_t temp_map, const char **error_string) { /* Static just to keep them off the stack */ static char local_error_reason[512]; static char addr_str_buf[INET6_ADDRSTRLEN]; const char *error_reason = local_error_reason; char name_key[ICMAP_KEYNAME_MAXLEN]; char *name_str; int i, j, num_configured, members; uint32_t tmp_config_value; if (totem_config->max_network_delay < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The max_network_delay parameter (%d ms) may not be less than (%d ms).", totem_config->max_network_delay, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The token timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->token_warning > 100 || totem_config->token_warning < 0) { snprintf (local_error_reason, sizeof(local_error_reason), "The token warning parameter (%d%%) must be between 0 (disabled) and 100.", totem_config->token_warning); goto parse_error; } if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) { if (icmap_get_uint32_r(temp_map, "totem.token_retransmit", &tmp_config_value) == CS_OK) { snprintf (local_error_reason, sizeof(local_error_reason), "The token retransmit timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_retransmit_timeout, MINIMUM_TIMEOUT); goto parse_error; } else { snprintf (local_error_reason, sizeof(local_error_reason), "Not appropriate token or token_retransmits_before_loss_const value set"); goto parse_error; } } if (totem_config->token_hold_timeout < MINIMUM_TIMEOUT_HOLD) { snprintf (local_error_reason, sizeof(local_error_reason), "The token hold timeout parameter (%d ms) may not be less than (%d ms).", totem_config->token_hold_timeout, MINIMUM_TIMEOUT_HOLD); goto parse_error; } if (totem_config->join_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The join timeout parameter (%d ms) may not be less than (%d ms).", totem_config->join_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->consensus_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The consensus timeout parameter (%d ms) may not be less than (%d ms).", totem_config->consensus_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->consensus_timeout < totem_config->join_timeout) { snprintf (local_error_reason, sizeof(local_error_reason), "The consensus timeout parameter (%d ms) may not be less than join timeout (%d ms).", totem_config->consensus_timeout, totem_config->join_timeout); goto parse_error; } if (totem_config->merge_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The merge timeout parameter (%d ms) may not be less than (%d ms).", totem_config->merge_timeout, MINIMUM_TIMEOUT); goto parse_error; } if (totem_config->downcheck_timeout < MINIMUM_TIMEOUT) { snprintf (local_error_reason, sizeof(local_error_reason), "The downcheck timeout parameter (%d ms) may not be less than (%d ms).", totem_config->downcheck_timeout, MINIMUM_TIMEOUT); goto parse_error; } /* Check that we have nodelist 'name' if there is more than one link */ num_configured = 0; members = -1; for (i = 0; i < INTERFACE_MAX; i++) { if (totem_config->interfaces[i].configured) { if (num_configured == 0) { members = totem_config->interfaces[i].member_count; } num_configured++; } } if (num_configured > 1) { /* * This assert is here just to make compiler happy */ assert(members != -1); for (i=0; i < members; i++) { snprintf(name_key, sizeof(name_key), "nodelist.node.%d.name", i); if (icmap_get_string_r(temp_map, name_key, &name_str) != CS_OK) { snprintf (local_error_reason, sizeof(local_error_reason), "for a multi-link configuration, all nodes must have a 'name' attribute"); goto parse_error; } free(name_str); } for (i=0; i < INTERFACE_MAX; i++) { if (!totem_config->interfaces[i].configured) { continue; } if (totem_config->interfaces[i].member_count != members) { snprintf (local_error_reason, sizeof(local_error_reason), "Not all nodes have the same number of links"); goto parse_error; } } } /* Verify that all nodes on the same link have the same IP family */ for (i=0; i < INTERFACE_MAX; i++) { for (j=1; jinterfaces[i].member_count; j++) { if (totem_config->interfaces[i].configured) { if (totem_config->interfaces[i].member_list[j].family != totem_config->interfaces[i].member_list[0].family) { memcpy(addr_str_buf, totemip_print(&(totem_config->interfaces[i].member_list[j])), sizeof(addr_str_buf)); snprintf (local_error_reason, sizeof(local_error_reason), "Nodes for link %d have different IP families " "(compared %s with %s)", i, addr_str_buf, totemip_print(&(totem_config->interfaces[i].member_list[0]))); goto parse_error; } } } } return 0; parse_error: snprintf (error_string_response, sizeof(error_string_response), "parse error in config: %s\n", error_reason); *error_string = error_string_response; return (-1); } -static int totem_get_crypto(struct totem_config *totem_config, const char **error_string) +static int totem_get_crypto(struct totem_config *totem_config, icmap_map_t map, const char **error_string) { char *str; const char *tmp_cipher; const char *tmp_hash; const char *tmp_model; tmp_hash = "none"; tmp_cipher = "none"; tmp_model = "none"; - if (icmap_get_string("totem.crypto_model", &str) == CS_OK) { + if (icmap_get_string_r(map, "totem.crypto_model", &str) == CS_OK) { if (strcmp(str, "nss") == 0) { tmp_model = "nss"; } if (strcmp(str, "openssl") == 0) { tmp_model = "openssl"; } free(str); } else { tmp_model = "nss"; } - if (icmap_get_string("totem.secauth", &str) == CS_OK) { + if (icmap_get_string_r(map, "totem.secauth", &str) == CS_OK) { if (strcmp(str, "on") == 0) { tmp_cipher = "aes256"; tmp_hash = "sha256"; } free(str); } - if (icmap_get_string("totem.crypto_cipher", &str) == CS_OK) { + if (icmap_get_string_r(map, "totem.crypto_cipher", &str) == CS_OK) { if (strcmp(str, "none") == 0) { tmp_cipher = "none"; } if (strcmp(str, "aes256") == 0) { tmp_cipher = "aes256"; } if (strcmp(str, "aes192") == 0) { tmp_cipher = "aes192"; } if (strcmp(str, "aes128") == 0) { tmp_cipher = "aes128"; } free(str); } - if (icmap_get_string("totem.crypto_hash", &str) == CS_OK) { + if (icmap_get_string_r(map, "totem.crypto_hash", &str) == CS_OK) { if (strcmp(str, "none") == 0) { tmp_hash = "none"; } if (strcmp(str, "md5") == 0) { tmp_hash = "md5"; } if (strcmp(str, "sha1") == 0) { tmp_hash = "sha1"; } if (strcmp(str, "sha256") == 0) { tmp_hash = "sha256"; } if (strcmp(str, "sha384") == 0) { tmp_hash = "sha384"; } if (strcmp(str, "sha512") == 0) { tmp_hash = "sha512"; } free(str); } if ((strcmp(tmp_cipher, "none") != 0) && (strcmp(tmp_hash, "none") == 0)) { *error_string = "crypto_cipher requires crypto_hash with value other than none"; return -1; } if (strcmp(tmp_model, "none") == 0) { *error_string = "crypto_model should be 'nss' or 'openssl'"; return -1; } + if (strcmp(tmp_cipher, totem_config->crypto_cipher_type) || + strcmp(tmp_hash, totem_config->crypto_hash_type) || + strcmp(tmp_model, totem_config->crypto_model)) { + totem_config->crypto_changed = 1; + } + strncpy(totem_config->crypto_cipher_type, tmp_cipher, CONFIG_STRING_LEN_MAX); strncpy(totem_config->crypto_hash_type, tmp_hash, CONFIG_STRING_LEN_MAX); strncpy(totem_config->crypto_model, tmp_model, CONFIG_STRING_LEN_MAX); return 0; } static int nodelist_byname(icmap_map_t map, const char *find_name, int strip_domain) { icmap_iter_t iter; const char *iter_key; char name_str[ICMAP_KEYNAME_MAXLEN]; int res = 0; unsigned int node_pos; char *name; unsigned int namelen; iter = icmap_iter_init_r(map, "nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, name_str); if (res != 2) { continue; } /* ring0_addr is allowed as a fallback */ if (strcmp(name_str, "name") && strcmp(name_str, "ring0_addr")) { continue; } if (icmap_get_string_r(map, iter_key, &name) != CS_OK) { continue; } namelen = strlen(name); if (strip_domain) { char *dot; dot = strchr(name, '.'); if (dot) { namelen = name - dot - 1; } } if (strncmp(find_name, name, namelen) == 0 && strlen(find_name) == strlen(name)) { icmap_iter_finalize(iter); return node_pos; } } icmap_iter_finalize(iter); return -1; } /* Compare two addresses - only address part (sin_addr/sin6_addr) is checked */ static int ipaddr_equal(const struct sockaddr *addr1, const struct sockaddr *addr2) { int addrlen = 0; const void *addr1p, *addr2p; if (addr1->sa_family != addr2->sa_family) return 0; switch (addr1->sa_family) { case AF_INET: addrlen = sizeof(struct in_addr); addr1p = &((struct sockaddr_in *)addr1)->sin_addr; addr2p = &((struct sockaddr_in *)addr2)->sin_addr; break; case AF_INET6: addrlen = sizeof(struct in6_addr); addr1p = &((struct sockaddr_in6 *)addr1)->sin6_addr; addr2p = &((struct sockaddr_in6 *)addr2)->sin6_addr; break; default: assert(0); } return (memcmp(addr1p, addr2p, addrlen) == 0); } /* Finds the local node and returns its position in the nodelist. * Uses nodelist.local_node_pos as a cache to save effort */ static int find_local_node(icmap_map_t map, int use_cache) { char nodename2[PATH_MAX]; char name_str[ICMAP_KEYNAME_MAXLEN]; icmap_iter_t iter; const char *iter_key; unsigned int cached_pos; char *dot = NULL; const char *node; struct ifaddrs *ifa, *ifa_list; struct sockaddr *sa; int found = 0; int node_pos = -1; int res; struct utsname utsname; /* Check for cached value first */ if (use_cache) { if (icmap_get_uint32("nodelist.local_node_pos", &cached_pos) == CS_OK) { return cached_pos; } } res = uname(&utsname); if (res) { return -1; } node = utsname.nodename; /* 1. Exact match */ node_pos = nodelist_byname(map, node, 0); if (node_pos > -1) { found = 1; goto ret_found; } /* 2. Try to match with increasingly more * specific versions of it */ strcpy(nodename2, node); dot = strrchr(nodename2, '.'); while (dot) { *dot = '\0'; node_pos = nodelist_byname(map, nodename2, 0); if (node_pos > -1) { found = 1; goto ret_found; } dot = strrchr(nodename2, '.'); } node_pos = nodelist_byname(map, nodename2, 1); if (node_pos > -1) { found = 1; goto ret_found; } /* * The corosync.conf name may not be related to uname at all, * they may match a hostname on some network interface. */ if (getifaddrs(&ifa_list)) return -1; for (ifa = ifa_list; ifa; ifa = ifa->ifa_next) { socklen_t salen = 0; /* Restore this */ strcpy(nodename2, node); sa = ifa->ifa_addr; if (!sa) { continue; } if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) { continue; } if (sa->sa_family == AF_INET) { salen = sizeof(struct sockaddr_in); } if (sa->sa_family == AF_INET6) { salen = sizeof(struct sockaddr_in6); } if (getnameinfo(sa, salen, nodename2, sizeof(nodename2), NULL, 0, 0) == 0) { node_pos = nodelist_byname(map, nodename2, 0); if (node_pos > -1) { found = 1; goto out; } /* Truncate this name and try again */ dot = strchr(nodename2, '.'); if (dot) { *dot = '\0'; node_pos = nodelist_byname(map, nodename2, 0); if (node_pos > -1) { found = 1; goto out; } } } /* See if it's the IP address that's in corosync.conf */ if (getnameinfo(sa, sizeof(*sa), nodename2, sizeof(nodename2), NULL, 0, NI_NUMERICHOST)) continue; node_pos = nodelist_byname(map, nodename2, 0); if (node_pos > -1) { found = 1; goto out; } } out: if (found) { freeifaddrs(ifa_list); goto ret_found; } /* * This section covers the usecase where the nodename specified in cluster.conf * is an alias specified in /etc/hosts. For example: * hostname alias1 alias2 * and * the above calls use uname and getnameinfo does not return aliases. * here we take the name specified in cluster.conf, resolve it to an address * and then compare against all known local ip addresses. * if we have a match, we found our nodename. In theory this chunk of code * could replace all the checks above, but let's avoid any possible regressions * and use it as last. */ iter = icmap_iter_init_r(map, "nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { char *dbnodename = NULL; struct addrinfo hints; struct addrinfo *result = NULL, *rp = NULL; res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, name_str); if (res != 2) { continue; } /* 'ring0_addr' is allowed as a fallback, but 'name' will be found first * because the names are in alpha order. */ if (strcmp(name_str, "name") && strcmp(name_str, "ring0_addr")) { continue; } if (icmap_get_string_r(map, iter_key, &dbnodename) != CS_OK) { continue; } memset(&hints, 0, sizeof(struct addrinfo)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_DGRAM; hints.ai_flags = 0; hints.ai_protocol = IPPROTO_UDP; if (getaddrinfo(dbnodename, NULL, &hints, &result)) { continue; } for (rp = result; rp != NULL; rp = rp->ai_next) { for (ifa = ifa_list; ifa; ifa = ifa->ifa_next) { if (ifa->ifa_addr && ipaddr_equal(rp->ai_addr, ifa->ifa_addr)) { freeaddrinfo(result); found = 1; goto out2; } } } freeaddrinfo(result); } out2: icmap_iter_finalize(iter); freeifaddrs(ifa_list); ret_found: if (found) { res = icmap_set_uint32_r(map, "nodelist.local_node_pos", node_pos); } return node_pos; } static enum totem_ip_version_enum totem_config_get_ip_version(struct totem_config *totem_config) { enum totem_ip_version_enum res; char *str; res = TOTEM_IP_VERSION_6_4; if (totem_config->transport_number == TOTEM_TRANSPORT_UDP) { res = TOTEM_IP_VERSION_4; } if (icmap_get_string("totem.ip_version", &str) == CS_OK) { if (strcmp(str, "ipv4") == 0) { res = TOTEM_IP_VERSION_4; } if (strcmp(str, "ipv6") == 0) { res = TOTEM_IP_VERSION_6; } if (strcmp(str, "ipv6-4") == 0) { res = TOTEM_IP_VERSION_6_4; } if (strcmp(str, "ipv4-6") == 0) { res = TOTEM_IP_VERSION_4_6; } free(str); } return (res); } static uint16_t generate_cluster_id (const char *cluster_name) { int i; int value = 0; for (i = 0; i < strlen(cluster_name); i++) { value <<= 1; value += cluster_name[i]; } return (value & 0xFFFF); } static int get_cluster_mcast_addr ( const char *cluster_name, unsigned int linknumber, enum totem_ip_version_enum ip_version, struct totem_ip_address *res) { uint16_t clusterid; char addr[INET6_ADDRSTRLEN + 1]; int err; if (cluster_name == NULL) { return (-1); } clusterid = generate_cluster_id(cluster_name) + linknumber; memset (res, 0, sizeof(*res)); switch (ip_version) { case TOTEM_IP_VERSION_4: case TOTEM_IP_VERSION_4_6: snprintf(addr, sizeof(addr), "239.192.%d.%d", clusterid >> 8, clusterid % 0xFF); break; case TOTEM_IP_VERSION_6: case TOTEM_IP_VERSION_6_4: snprintf(addr, sizeof(addr), "ff15::%x", clusterid); break; default: /* * Unknown family */ return (-1); } err = totemip_parse (res, addr, ip_version); return (err); } static unsigned int generate_nodeid( struct totem_config *totem_config, char *addr) { unsigned int nodeid; struct totem_ip_address totemip; /* AF_INET hard-coded here because auto-generated nodeids are only for IPv4 */ if (totemip_parse(&totemip, addr, TOTEM_IP_VERSION_4) != 0) return -1; memcpy (&nodeid, &totemip.addr, sizeof (unsigned int)); #if __BYTE_ORDER == __LITTLE_ENDIAN nodeid = swab32 (nodeid); #endif if (totem_config->clear_node_high_bit) { nodeid &= 0x7FFFFFFF; } return nodeid; } static int check_for_duplicate_nodeids( struct totem_config *totem_config, const char **error_string) { icmap_iter_t iter; icmap_iter_t subiter; const char *iter_key; int res = 0; int retval = 0; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char *ring0_addr=NULL; char *ring0_addr1=NULL; unsigned int node_pos; unsigned int node_pos1; unsigned int last_node_pos = -1; unsigned int nodeid; unsigned int nodeid1; int autogenerated; iter = icmap_iter_init("nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key); if (res != 2) { continue; } /* * This relies on the fact the icmap keys are always returned in order * so all of the keys for a node will be grouped together. We're basically * just running the code below once for each node. */ if (last_node_pos == node_pos) { continue; } last_node_pos = node_pos; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos); autogenerated = 0; /* Generated nodeids are only allowed for UDP/UDPU so ring0_addr is valid here */ if (icmap_get_uint32(tmp_key, &nodeid) != CS_OK) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos); if (icmap_get_string(tmp_key, &ring0_addr) != CS_OK) { continue; } /* Generate nodeid so we can check that auto-generated nodeids don't clash either */ nodeid = generate_nodeid(totem_config, ring0_addr); if (nodeid == -1) { continue; } autogenerated = 1; } node_pos1 = 0; subiter = icmap_iter_init("nodelist.node."); while (((iter_key = icmap_iter_next(subiter, NULL, NULL)) != NULL) && (node_pos1 < node_pos)) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos1, tmp_key); if ((res != 2) || (node_pos1 >= node_pos)) { continue; } if (strcmp(tmp_key, "nodeid") != 0) { continue; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos1); if (icmap_get_uint32(tmp_key, &nodeid1) != CS_OK) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos1); if (icmap_get_string(tmp_key, &ring0_addr1) != CS_OK) { continue; } nodeid1 = generate_nodeid(totem_config, ring0_addr1); if (nodeid1 == -1) { continue; } } if (nodeid == nodeid1) { retval = -1; snprintf (error_string_response, sizeof(error_string_response), "Nodeid %u%s%s%s appears twice in corosync.conf", nodeid, autogenerated?"(autogenerated from ":"", autogenerated?ring0_addr:"", autogenerated?")":""); log_printf (LOGSYS_LEVEL_ERROR, error_string_response); *error_string = error_string_response; break; } } icmap_iter_finalize(subiter); } icmap_iter_finalize(iter); return retval; } /* * This needs to be done last of all. It would be nice to do it when reading the * interface params, but the totem params need to have them to be read first. We * need both, so this is a way round that circular dependancy. */ static void calc_knet_ping_timers(struct totem_config *totem_config) { char runtime_key_name[ICMAP_KEYNAME_MAXLEN]; int interface; for (interface = 0; interface < INTERFACE_MAX; interface++) { if (totem_config->interfaces[interface].configured) { if (!totem_config->interfaces[interface].knet_pong_count) { totem_config->interfaces[interface].knet_pong_count = KNET_PONG_COUNT; } if (!totem_config->interfaces[interface].knet_ping_timeout) { totem_config->interfaces[interface].knet_ping_timeout = totem_config->token_timeout / totem_config->interfaces[interface].knet_pong_count; } snprintf(runtime_key_name, sizeof(runtime_key_name), "runtime.config.totem.interface.%d.knet_ping_timeout", interface); icmap_set_uint32(runtime_key_name, totem_config->interfaces[interface].knet_ping_timeout); if (!totem_config->interfaces[interface].knet_ping_interval) { totem_config->interfaces[interface].knet_ping_interval = totem_config->token_timeout / (totem_config->interfaces[interface].knet_pong_count * 2); } snprintf(runtime_key_name, sizeof(runtime_key_name), "runtime.config.totem.interface.%d.knet_ping_interval", interface); icmap_set_uint32(runtime_key_name, totem_config->interfaces[interface].knet_ping_interval); } } } /* * Compute difference between two set of totem interface arrays and commit it. * set1 and set2 * are changed so for same ring, ip existing in both set1 and set2 are cleared * (set to 0), and ips which are only in set1 or set2 remains untouched. * totempg_node_add/remove is called. */ static void compute_and_set_totempg_interfaces(struct totem_interface *set1, struct totem_interface *set2) { int ring_no, set1_pos, set2_pos; struct totem_ip_address empty_ip_address; memset(&empty_ip_address, 0, sizeof(empty_ip_address)); for (ring_no = 0; ring_no < INTERFACE_MAX; ring_no++) { if (!set1[ring_no].configured && !set2[ring_no].configured) { continue; } for (set1_pos = 0; set1_pos < set1[ring_no].member_count; set1_pos++) { for (set2_pos = 0; set2_pos < set2[ring_no].member_count; set2_pos++) { /* * For current ring_no remove all set1 items existing * in set2 */ if (memcmp(&set1[ring_no].member_list[set1_pos], &set2[ring_no].member_list[set2_pos], sizeof(struct totem_ip_address)) == 0) { memset(&set1[ring_no].member_list[set1_pos], 0, sizeof(struct totem_ip_address)); memset(&set2[ring_no].member_list[set2_pos], 0, sizeof(struct totem_ip_address)); } } } } for (ring_no = 0; ring_no < INTERFACE_MAX; ring_no++) { for (set1_pos = 0; set1_pos < set1[ring_no].member_count; set1_pos++) { /* * All items which remain in set1 and don't exist in set2 any more * have to be removed. */ if (memcmp(&set1[ring_no].member_list[set1_pos], &empty_ip_address, sizeof(empty_ip_address)) != 0) { log_printf(LOGSYS_LEVEL_DEBUG, "removing dynamic member %s for ring %u", totemip_print(&set1[ring_no].member_list[set1_pos]), ring_no); totempg_member_remove(&set1[ring_no].member_list[set1_pos], ring_no); } } if (!set2[ring_no].configured) { continue; } for (set2_pos = 0; set2_pos < set2[ring_no].member_count; set2_pos++) { /* * All items which remain in set2 and don't exist in set1 are new nodes * and have to be added. */ if (memcmp(&set2[ring_no].member_list[set2_pos], &empty_ip_address, sizeof(empty_ip_address)) != 0) { log_printf(LOGSYS_LEVEL_DEBUG, "adding dynamic member %s for ring %u", totemip_print(&set2[ring_no].member_list[set2_pos]), ring_no); totempg_member_add(&set2[ring_no].member_list[set2_pos], ring_no); } } } } /* * Configure parameters for links */ static void configure_link_params(struct totem_config *totem_config, icmap_map_t map) { int i; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char *addr_string; int err; int local_node_pos = find_local_node(map, 0); for (i = 0; iinterfaces[i].configured) { continue; } log_printf(LOGSYS_LEVEL_DEBUG, "Configuring link %d params\n", i); snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring%u_addr", local_node_pos, i); if (icmap_get_string_r(map, tmp_key, &addr_string) != CS_OK) { continue; } err = totemip_parse(&totem_config->interfaces[i].local_ip, addr_string, totem_config->ip_version); if (err != 0) { continue; } totem_config->interfaces[i].local_ip.nodeid = totem_config->node_id; /* In case this is a new link, fill in the defaults if there was no interface{} section for it */ if (!totem_config->interfaces[i].knet_link_priority) totem_config->interfaces[i].knet_link_priority = 1; /* knet_ping_interval & knet_ping_timeout are set later once we know all the other params */ if (!totem_config->interfaces[i].knet_ping_precision) totem_config->interfaces[i].knet_ping_precision = KNET_PING_PRECISION; if (!totem_config->interfaces[i].knet_pong_count) totem_config->interfaces[i].knet_pong_count = KNET_PONG_COUNT; if (!totem_config->interfaces[i].knet_transport) totem_config->interfaces[i].knet_transport = KNET_TRANSPORT_UDP; if (!totem_config->interfaces[i].ip_port) totem_config->interfaces[i].ip_port = DEFAULT_PORT + i; } } static void configure_totem_links(struct totem_config *totem_config, icmap_map_t map) { int i; for (i = 0; iinterfaces[i].configured) { continue; } log_printf(LOGSYS_LEVEL_INFO, "Configuring link %d\n", i); totempg_iface_set(&totem_config->interfaces[i].local_ip, totem_config->interfaces[i].ip_port, i); } } /* Check for differences in config that can't be done on-the-fly and print an error */ static int check_things_have_not_changed(struct totem_config *totem_config, const char **error_string) { int i,j,k; const char *ip_str; char addr_buf[INET6_ADDRSTRLEN]; int changed = 0; for (i = 0; iinterfaces[i].configured && totem_config->orig_interfaces[i].configured) { if (totem_config->interfaces[i].knet_transport != totem_config->orig_interfaces[i].knet_transport) { log_printf(LOGSYS_LEVEL_ERROR, "New config has different knet transport for link %d. Internal value was NOT changed.\n", i); changed = 1; } /* Check each nodeid in the new configuration and make sure its IP address on this link has not changed */ for (j=0; j < totem_config->interfaces[i].member_count; j++) { for (k=0; k < totem_config->orig_interfaces[i].member_count; k++) { if (totem_config->interfaces[i].member_list[j].nodeid == totem_config->orig_interfaces[i].member_list[k].nodeid) { /* Found our nodeid - check the IP address */ if (memcmp(&totem_config->interfaces[i].member_list[j], &totem_config->orig_interfaces[i].member_list[k], sizeof(struct totem_ip_address))) { ip_str = totemip_print(&totem_config->orig_interfaces[i].member_list[k]); /* if ip_str is NULL then the old address was invalid and is allowed to change */ if (ip_str) { strncpy(addr_buf, ip_str, sizeof(addr_buf)); addr_buf[sizeof(addr_buf) - 1] = '\0'; log_printf(LOGSYS_LEVEL_ERROR, "new config has different address for link %d (addr changed from %s to %s). Internal value was NOT changed.\n", i, addr_buf, totemip_print(&totem_config->interfaces[i].member_list[j])); changed = 1; } } } } } } } if (changed) { snprintf (error_string_response, sizeof(error_string_response), "To reconfigure an interface it must be deleted and recreated. A working interface needs to be available to corosync at all times"); *error_string = error_string_response; return -1; } return 0; } static int put_nodelist_members_to_config(struct totem_config *totem_config, icmap_map_t map, int reload, const char **error_string) { icmap_iter_t iter, iter2; const char *iter_key, *iter_key2; int res = 0; unsigned int node_pos; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key2[ICMAP_KEYNAME_MAXLEN]; char *node_addr_str; int member_count; unsigned int linknumber = 0; int i, j; int last_node_pos = -1; /* Clear out nodelist so we can put the new one in if needed */ for (i = 0; i < INTERFACE_MAX; i++) { for (j = 0; j < PROCESSOR_COUNT_MAX; j++) { memset(&totem_config->interfaces[i].member_list[j], 0, sizeof(struct totem_ip_address)); } totem_config->interfaces[i].member_count = 0; } iter = icmap_iter_init_r(map, "nodelist.node."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.%s", &node_pos, tmp_key); if (res != 2) { continue; } /* If it's the same as the last node_pos then skip it */ if (node_pos == last_node_pos) { continue; } last_node_pos = node_pos; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.", node_pos); iter2 = icmap_iter_init_r(map, tmp_key); while ((iter_key2 = icmap_iter_next(iter2, NULL, NULL)) != NULL) { unsigned int nodeid; char *str; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", node_pos); if (icmap_get_uint32_r(map, tmp_key, &nodeid) != CS_OK) { nodeid = 0; } res = sscanf(iter_key2, "nodelist.node.%u.ring%u%s", &node_pos, &linknumber, tmp_key2); if (res != 3 || strcmp(tmp_key2, "_addr") != 0) { continue; } if (icmap_get_string_r(map, iter_key2, &node_addr_str) != CS_OK) { continue; } /* Generate nodeids if they are not provided and transport is UDP/U */ if (!nodeid && (totem_config->transport_number == TOTEM_TRANSPORT_UDP || totem_config->transport_number == TOTEM_TRANSPORT_UDPU)) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", node_pos); if (icmap_get_string_r(map, tmp_key, &str) == CS_OK) { nodeid = generate_nodeid(totem_config, str); if (nodeid == -1) { sprintf(error_string_response, "An IPV6 network requires that a node ID be specified " "for address '%s'.", node_addr_str); *error_string = error_string_response; free(str); return (-1); } log_printf(LOGSYS_LEVEL_DEBUG, "Generated nodeid = " CS_PRI_NODE_ID " for %s", nodeid, str); free(str); } } member_count = totem_config->interfaces[linknumber].member_count; res = totemip_parse(&totem_config->interfaces[linknumber].member_list[member_count], node_addr_str, totem_config->ip_version); if (res == 0) { totem_config->interfaces[linknumber].member_list[member_count].nodeid = nodeid; totem_config->interfaces[linknumber].member_count++; totem_config->interfaces[linknumber].configured = 1; } else { sprintf(error_string_response, "failed to parse node address '%s'\n", node_addr_str); *error_string = error_string_response; memset(&totem_config->interfaces[linknumber].member_list[member_count], 0, sizeof(struct totem_ip_address)); free(node_addr_str); icmap_iter_finalize(iter2); icmap_iter_finalize(iter); return -1; } free(node_addr_str); } icmap_iter_finalize(iter2); } icmap_iter_finalize(iter); configure_link_params(totem_config, map); if (reload) { log_printf(LOGSYS_LEVEL_DEBUG, "About to reconfigure links from nodelist.\n"); if (check_things_have_not_changed(totem_config, error_string) == -1) { return -1; } } return 0; } static void config_convert_nodelist_to_interface(icmap_map_t map, struct totem_config *totem_config) { int res = 0; int node_pos; char tmp_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key2[ICMAP_KEYNAME_MAXLEN]; char *node_addr_str; unsigned int linknumber = 0; icmap_iter_t iter; const char *iter_key; node_pos = find_local_node(map, 1); if (node_pos > -1) { /* * We found node, so create interface section */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.", node_pos); iter = icmap_iter_init_r(map, tmp_key); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "nodelist.node.%u.ring%u%s", &node_pos, &linknumber, tmp_key2); if (res != 3 || strcmp(tmp_key2, "_addr") != 0) { continue ; } if (icmap_get_string_r(map, iter_key, &node_addr_str) != CS_OK) { continue; } snprintf(tmp_key2, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.bindnetaddr", linknumber); icmap_set_string_r(map, tmp_key2, node_addr_str); free(node_addr_str); } icmap_iter_finalize(iter); } } static int get_interface_params(struct totem_config *totem_config, icmap_map_t map, const char **error_string, uint64_t *warnings, int reload) { int res = 0; unsigned int linknumber = 0; int member_count = 0; int i; icmap_iter_t iter, member_iter; const char *iter_key; const char *member_iter_key; char linknumber_key[ICMAP_KEYNAME_MAXLEN]; char tmp_key[ICMAP_KEYNAME_MAXLEN]; uint8_t u8; uint32_t u32; char *str; char *cluster_name = NULL; enum totem_ip_version_enum tmp_ip_version = TOTEM_IP_VERSION_4; int ret = 0; if (reload) { for (i=0; iinterfaces[i].configured = 0; totem_config->interfaces[i].knet_ping_timeout = 0; totem_config->interfaces[i].knet_ping_interval = 0; totem_config->interfaces[i].knet_ping_precision = KNET_PING_PRECISION; totem_config->interfaces[i].knet_pong_count = KNET_PONG_COUNT; } } if (icmap_get_string_r(map, "totem.cluster_name", &cluster_name) != CS_OK) { cluster_name = NULL; } iter = icmap_iter_init_r(map, "totem.interface."); while ((iter_key = icmap_iter_next(iter, NULL, NULL)) != NULL) { res = sscanf(iter_key, "totem.interface.%[^.].%s", linknumber_key, tmp_key); if (res != 2) { continue; } if (strcmp(tmp_key, "bindnetaddr") != 0 && totem_config->transport_number == TOTEM_TRANSPORT_UDP) { continue; } member_count = 0; linknumber = atoi(linknumber_key); if (linknumber >= INTERFACE_MAX) { snprintf (error_string_response, sizeof(error_string_response), "parse error in config: interface ring number %u is bigger than allowed maximum %u\n", linknumber, INTERFACE_MAX - 1); *error_string = error_string_response; ret = -1; goto out; } /* These things are only valid for the initial read */ if (!reload) { /* * Get the bind net address */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.bindnetaddr", linknumber); if (icmap_get_string_r(map, tmp_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].bindnet, str, totem_config->ip_version); if (res) { sprintf(error_string_response, "failed to parse bindnet address '%s'\n", str); *error_string = error_string_response; free(str); ret = -1; goto out; } free(str); } /* * Get interface multicast address */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", linknumber); if (icmap_get_string_r(map, tmp_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].mcast_addr, str, totem_config->ip_version); if (res) { sprintf(error_string_response, "failed to parse mcast address '%s'\n", str); *error_string = error_string_response; free(str); ret = -1; goto out; } free(str); } else if (totem_config->transport_number == TOTEM_TRANSPORT_UDP) { /* * User not specified address -> autogenerate one from cluster_name key * (if available). Return code is intentionally ignored, because * udpu doesn't need mcastaddr and validity of mcastaddr for udp is * checked later anyway. */ if (totem_config->interfaces[0].bindnet.family == AF_INET) { tmp_ip_version = TOTEM_IP_VERSION_4; } else if (totem_config->interfaces[0].bindnet.family == AF_INET6) { tmp_ip_version = TOTEM_IP_VERSION_6; } (void)get_cluster_mcast_addr (cluster_name, linknumber, tmp_ip_version, &totem_config->interfaces[linknumber].mcast_addr); } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.broadcast", linknumber); if (icmap_get_string(tmp_key, &str) == CS_OK) { if (strcmp (str, "yes") == 0) { totem_config->broadcast_use = 1; } free(str); } } /* These things are only valid for the initial read OR a newly-defined link */ if (!reload || (totem_config->interfaces[linknumber].configured == 0)) { /* * Get mcast port */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", linknumber); if (icmap_get_uint16_r(map, tmp_key, &totem_config->interfaces[linknumber].ip_port) != CS_OK) { if (totem_config->broadcast_use) { totem_config->interfaces[linknumber].ip_port = DEFAULT_PORT + (2 * linknumber); } else { totem_config->interfaces[linknumber].ip_port = DEFAULT_PORT + linknumber; } } /* * Get the TTL */ totem_config->interfaces[linknumber].ttl = 1; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.ttl", linknumber); if (icmap_get_uint8_r(map, tmp_key, &u8) == CS_OK) { totem_config->interfaces[linknumber].ttl = u8; } totem_config->interfaces[linknumber].knet_transport = KNET_DEFAULT_TRANSPORT; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_transport", linknumber); if (icmap_get_string_r(map, tmp_key, &str) == CS_OK) { if (strcmp(str, "sctp") == 0) { totem_config->interfaces[linknumber].knet_transport = KNET_TRANSPORT_SCTP; } else if (strcmp(str, "udp") == 0) { totem_config->interfaces[linknumber].knet_transport = KNET_TRANSPORT_UDP; } else { *error_string = "Unrecognised knet_transport. expected 'udp' or 'sctp'"; ret = -1; goto out; } } } totem_config->interfaces[linknumber].configured = 1; /* * Get the knet link params */ totem_config->interfaces[linknumber].knet_link_priority = 1; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_link_priority", linknumber); if (icmap_get_uint8_r(map, tmp_key, &u8) == CS_OK) { totem_config->interfaces[linknumber].knet_link_priority = u8; } totem_config->interfaces[linknumber].knet_ping_interval = 0; /* real default applied later */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_interval", linknumber); if (icmap_get_uint32_r(map, tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_interval = u32; } totem_config->interfaces[linknumber].knet_ping_timeout = 0; /* real default applied later */ snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_timeout", linknumber); if (icmap_get_uint32_r(map, tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_timeout = u32; } totem_config->interfaces[linknumber].knet_ping_precision = KNET_PING_PRECISION; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_ping_precision", linknumber); if (icmap_get_uint32_r(map, tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_ping_precision = u32; } totem_config->interfaces[linknumber].knet_pong_count = KNET_PONG_COUNT; snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.knet_pong_count", linknumber); if (icmap_get_uint32_r(map, tmp_key, &u32) == CS_OK) { totem_config->interfaces[linknumber].knet_pong_count = u32; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.member.", linknumber); member_iter = icmap_iter_init_r(map, tmp_key); while ((member_iter_key = icmap_iter_next(member_iter, NULL, NULL)) != NULL) { if (member_count == 0) { if (icmap_get_string("nodelist.node.0.ring0_addr", &str) == CS_OK) { free(str); *warnings |= TOTEM_CONFIG_WARNING_MEMBERS_IGNORED; break; } else { *warnings |= TOTEM_CONFIG_WARNING_MEMBERS_DEPRECATED; } } if (icmap_get_string_r(map, member_iter_key, &str) == CS_OK) { res = totemip_parse (&totem_config->interfaces[linknumber].member_list[member_count++], str, totem_config->ip_version); if (res) { sprintf(error_string_response, "failed to parse node address '%s'\n", str); *error_string = error_string_response; icmap_iter_finalize(member_iter); free(str); ret = -1; goto out; } free(str); } } icmap_iter_finalize(member_iter); totem_config->interfaces[linknumber].member_count = member_count; } out: icmap_iter_finalize(iter); free(cluster_name); return (ret); } extern int totem_config_read ( struct totem_config *totem_config, const char **error_string, uint64_t *warnings) { int res = 0; char *str, *ring0_addr_str; char tmp_key[ICMAP_KEYNAME_MAXLEN]; uint16_t u16; int i; int local_node_pos; int nodeid_set; *warnings = 0; memset (totem_config, 0, sizeof (struct totem_config)); totem_config->interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); if (totem_config->interfaces == 0) { *error_string = "Out of memory trying to allocate ethernet interface storage area"; return -1; } totem_config->transport_number = TOTEM_TRANSPORT_KNET; if (icmap_get_string("totem.transport", &str) == CS_OK) { if (strcmp (str, "udpu") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_UDPU; } if (strcmp (str, "udp") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_UDP; } if (strcmp (str, "knet") == 0) { totem_config->transport_number = TOTEM_TRANSPORT_KNET; } free(str); } memset (totem_config->interfaces, 0, sizeof (struct totem_interface) * INTERFACE_MAX); strcpy (totem_config->link_mode, "passive"); icmap_get_uint32("totem.version", (uint32_t *)&totem_config->version); - if (totem_get_crypto(totem_config, error_string) != 0) { + /* initial crypto load */ + if (totem_get_crypto(totem_config, icmap_get_global_map(), error_string) != 0) { + return -1; + } + if (totem_config_keyread(totem_config, icmap_get_global_map(), error_string) != 0) { return -1; } + totem_config->crypto_index = 1; + totem_config->crypto_changed = 0; if (icmap_get_string("totem.link_mode", &str) == CS_OK) { if (strlen(str) >= TOTEM_LINK_MODE_BYTES) { *error_string = "totem.link_mode is too long"; free(str); return -1; } strcpy (totem_config->link_mode, str); free(str); } icmap_get_uint32("totem.nodeid", &totem_config->node_id); totem_config->clear_node_high_bit = 0; if (icmap_get_string("totem.clear_node_high_bit", &str) == CS_OK) { if (strcmp (str, "yes") == 0) { totem_config->clear_node_high_bit = 1; } free(str); } icmap_get_uint32("totem.threads", &totem_config->threads); icmap_get_uint32("totem.netmtu", &totem_config->net_mtu); totem_config->ip_version = totem_config_get_ip_version(totem_config); if (icmap_get_string("totem.interface.0.bindnetaddr", &str) != CS_OK) { /* * We were not able to find ring 0 bindnet addr. Try to use nodelist informations */ config_convert_nodelist_to_interface(icmap_get_global_map(), totem_config); } else { if (icmap_get_string("nodelist.node.0.ring0_addr", &ring0_addr_str) == CS_OK) { /* * Both bindnetaddr and ring0_addr are set. * Log warning information, and use nodelist instead */ *warnings |= TOTEM_CONFIG_BINDNETADDR_NODELIST_SET; config_convert_nodelist_to_interface(icmap_get_global_map(), totem_config); free(ring0_addr_str); } free(str); } /* * Broadcast option is global but set in interface section, * so reset before processing interfaces. */ totem_config->broadcast_use = 0; res = get_interface_params(totem_config, icmap_get_global_map(), error_string, warnings, 0); if (res < 0) { return res; } /* * Use broadcast is global, so if set, make sure to fill mcast addr correctly * broadcast is only supported for UDP so just do interface 0; */ if (totem_config->broadcast_use) { totemip_parse (&totem_config->interfaces[0].mcast_addr, "255.255.255.255", TOTEM_IP_VERSION_4); } /* * Store automatically generated items back to icmap only for UDP */ if (totem_config->transport_number == TOTEM_TRANSPORT_UDP) { for (i = 0; i < INTERFACE_MAX; i++) { if (!totem_config->interfaces[i].configured) { continue; } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastaddr", i); if (icmap_get_string(tmp_key, &str) == CS_OK) { free(str); } else { str = (char *)totemip_print(&totem_config->interfaces[i].mcast_addr); icmap_set_string(tmp_key, str); } snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "totem.interface.%u.mcastport", i); if (icmap_get_uint16(tmp_key, &u16) != CS_OK) { icmap_set_uint16(tmp_key, totem_config->interfaces[i].ip_port); } } } /* * Check existence of nodelist */ if ((icmap_get_string("nodelist.node.0.name", &str) == CS_OK) || (icmap_get_string("nodelist.node.0.ring0_addr", &str) == CS_OK)) { free(str); /* * find local node */ local_node_pos = find_local_node(icmap_get_global_map(), 1); if (local_node_pos != -1) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.nodeid", local_node_pos); nodeid_set = (totem_config->node_id != 0); if (icmap_get_uint32(tmp_key, &totem_config->node_id) == CS_OK && nodeid_set) { *warnings |= TOTEM_CONFIG_WARNING_TOTEM_NODEID_IGNORED; } if ((totem_config->transport_number == TOTEM_TRANSPORT_KNET) && (!totem_config->node_id)) { *error_string = "Knet requires an explicit nodeid for the local node"; return -1; } if ((totem_config->transport_number == TOTEM_TRANSPORT_UDP || totem_config->transport_number == TOTEM_TRANSPORT_UDPU) && (!totem_config->node_id)) { snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "nodelist.node.%u.ring0_addr", local_node_pos); icmap_get_string(tmp_key, &str); totem_config->node_id = generate_nodeid(totem_config, str); if (totem_config->node_id == -1) { *error_string = "An IPV6 network requires that a node ID be specified"; free(str); return (-1); } totem_config->interfaces[0].member_list[local_node_pos].nodeid = totem_config->node_id; free(str); } /* Users must not change this */ icmap_set_ro_access("nodelist.local_node_pos", 0, 1); } if (put_nodelist_members_to_config(totem_config, icmap_get_global_map(), 0, error_string)) { return -1; } } /* * Get things that might change in the future (and can depend on totem_config->interfaces); */ totem_volatile_config_read(totem_config, icmap_get_global_map(), NULL); calc_knet_ping_timers(totem_config); /* This is now done in the totemknet interface callback */ /* configure_totem_links(totem_config, icmap_get_global_map()); */ add_totem_config_notification(totem_config); return 0; } int totem_config_validate ( struct totem_config *totem_config, const char **error_string) { static char local_error_reason[512]; char parse_error[512]; const char *error_reason = local_error_reason; int i; uint32_t u32; int num_configured = 0; unsigned int interface_max = INTERFACE_MAX; for (i = 0; i < INTERFACE_MAX; i++) { if (totem_config->interfaces[i].configured) { num_configured++; } } if (num_configured == 0) { error_reason = "No interfaces defined"; goto parse_error; } /* Check we found a local node name */ if (icmap_get_uint32("nodelist.local_node_pos", &u32) != CS_OK) { error_reason = "No valid name found for local host"; goto parse_error; } for (i = 0; i < INTERFACE_MAX; i++) { /* * Some error checking of parsed data to make sure its valid */ struct totem_ip_address null_addr; if (!totem_config->interfaces[i].configured) { continue; } memset (&null_addr, 0, sizeof (struct totem_ip_address)); if ((totem_config->transport_number == TOTEM_TRANSPORT_UDP) && memcmp (&totem_config->interfaces[i].mcast_addr, &null_addr, sizeof (struct totem_ip_address)) == 0) { error_reason = "No multicast address specified"; goto parse_error; } if (totem_config->interfaces[i].ip_port == 0) { error_reason = "No multicast port specified"; goto parse_error; } if (totem_config->interfaces[i].ttl > 255) { error_reason = "Invalid TTL (should be 0..255)"; goto parse_error; } if (totem_config->transport_number != TOTEM_TRANSPORT_UDP && totem_config->interfaces[i].ttl != 1) { error_reason = "Can only set ttl on multicast transport types"; goto parse_error; } if (totem_config->interfaces[i].knet_link_priority > 255) { error_reason = "Invalid link priority (should be 0..255)"; goto parse_error; } if (totem_config->transport_number != TOTEM_TRANSPORT_KNET && totem_config->interfaces[i].knet_link_priority != 1) { error_reason = "Can only set link priority on knet transport type"; goto parse_error; } if (totem_config->interfaces[i].mcast_addr.family == AF_INET6 && totem_config->node_id == 0) { error_reason = "An IPV6 network requires that a node ID be specified."; goto parse_error; } if (totem_config->broadcast_use == 0 && totem_config->transport_number == TOTEM_TRANSPORT_UDP) { if (totem_config->interfaces[i].mcast_addr.family != totem_config->interfaces[i].bindnet.family) { error_reason = "Multicast address family does not match bind address family"; goto parse_error; } if (totemip_is_mcast (&totem_config->interfaces[i].mcast_addr) != 0) { error_reason = "mcastaddr is not a correct multicast address."; goto parse_error; } } } if (totem_config->version != 2) { error_reason = "This totem parser can only parse version 2 configurations."; goto parse_error; } if (totem_volatile_config_validate(totem_config, icmap_get_global_map(), error_string) == -1) { return (-1); } if (check_for_duplicate_nodeids(totem_config, error_string) == -1) { return (-1); } /* * KNET Link values validation */ if (strcmp (totem_config->link_mode, "active") && strcmp (totem_config->link_mode, "rr") && strcmp (totem_config->link_mode, "passive")) { snprintf (local_error_reason, sizeof(local_error_reason), "The Knet link mode \"%s\" specified is invalid. It must be active, passive or rr.\n", totem_config->link_mode); goto parse_error; } /* Only Knet does multiple interfaces */ if (totem_config->transport_number != TOTEM_TRANSPORT_KNET) { interface_max = 1; } if (interface_max < num_configured) { snprintf (parse_error, sizeof(parse_error), "%d is too many configured interfaces for non-Knet transport.", num_configured); error_reason = parse_error; goto parse_error; } /* Only knet allows crypto */ if (totem_config->transport_number != TOTEM_TRANSPORT_KNET) { if ((strcmp(totem_config->crypto_cipher_type, "none") != 0) || (strcmp(totem_config->crypto_hash_type, "none") != 0)) { snprintf (parse_error, sizeof(parse_error), "crypto_cipher & crypto_hash are only valid for the Knet transport."); error_reason = parse_error; goto parse_error; } } if (totem_config->net_mtu == 0) { if (totem_config->transport_number == TOTEM_TRANSPORT_KNET) { totem_config->net_mtu = KNET_MAX_PACKET_SIZE; } else { totem_config->net_mtu = 1500; } } return 0; parse_error: snprintf (error_string_response, sizeof(error_string_response), "parse error in config: %s\n", error_reason); *error_string = error_string_response; return (-1); } static int read_keyfile ( const char *key_location, struct totem_config *totem_config, const char **error_string) { int fd; int res; int saved_errno; char error_str[100]; const char *error_ptr; fd = open (key_location, O_RDONLY); if (fd == -1) { error_ptr = qb_strerror_r(errno, error_str, sizeof(error_str)); snprintf (error_string_response, sizeof(error_string_response), "Could not open %s: %s\n", key_location, error_ptr); goto parse_error; } res = read (fd, totem_config->private_key, TOTEM_PRIVATE_KEY_LEN_MAX); saved_errno = errno; close (fd); if (res == -1) { error_ptr = qb_strerror_r (saved_errno, error_str, sizeof(error_str)); snprintf (error_string_response, sizeof(error_string_response), "Could not read %s: %s\n", key_location, error_ptr); goto parse_error; } if (res < TOTEM_PRIVATE_KEY_LEN_MIN) { snprintf (error_string_response, sizeof(error_string_response), "Could only read %d bits of minimum %u bits from %s.\n", res * 8, TOTEM_PRIVATE_KEY_LEN_MIN * 8, key_location); goto parse_error; } totem_config->private_key_len = res; return 0; parse_error: *error_string = error_string_response; return (-1); } int totem_config_keyread ( struct totem_config *totem_config, + icmap_map_t map, const char **error_string) { int got_key = 0; char *key_location = NULL; int res; size_t key_len; + char old_key[TOTEM_PRIVATE_KEY_LEN_MAX]; + size_t old_key_len; + + /* Take a copy so we can see if it has changed */ + memcpy(old_key, totem_config->private_key, sizeof(totem_config->private_key)); + old_key_len = totem_config->private_key_len; memset (totem_config->private_key, 0, sizeof(totem_config->private_key)); totem_config->private_key_len = 0; if (strcmp(totem_config->crypto_cipher_type, "none") == 0 && strcmp(totem_config->crypto_hash_type, "none") == 0) { return (0); } /* cmap may store the location of the key file */ - if (icmap_get_string("totem.keyfile", &key_location) == CS_OK) { + if (icmap_get_string_r(map, "totem.keyfile", &key_location) == CS_OK) { res = read_keyfile(key_location, totem_config, error_string); free(key_location); if (res) { goto key_error; } got_key = 1; } else { /* Or the key itself may be in the cmap */ - if (icmap_get("totem.key", NULL, &key_len, NULL) == CS_OK) { + if (icmap_get_r(map, "totem.key", NULL, &key_len, NULL) == CS_OK) { if (key_len > sizeof(totem_config->private_key)) { sprintf(error_string_response, "key is too long"); goto key_error; } if (key_len < TOTEM_PRIVATE_KEY_LEN_MIN) { sprintf(error_string_response, "key is too short"); goto key_error; } - if (icmap_get("totem.key", totem_config->private_key, &key_len, NULL) == CS_OK) { + if (icmap_get_r(map, "totem.key", totem_config->private_key, &key_len, NULL) == CS_OK) { totem_config->private_key_len = key_len; got_key = 1; } else { sprintf(error_string_response, "can't load private key"); goto key_error; } } } /* In desperation we read the default filename */ if (!got_key) { res = read_keyfile(COROSYSCONFDIR "/authkey", totem_config, error_string); if (res) goto key_error; } + if (old_key_len != totem_config->private_key_len || + memcmp(old_key, totem_config->private_key, sizeof(totem_config->private_key))) { + totem_config->crypto_changed = 1; + } + return (0); key_error: *error_string = error_string_response; return (-1); } +int totem_reread_crypto_config(struct totem_config *totem_config, icmap_map_t map, const char **error_string) +{ + if (totem_get_crypto(totem_config, map, error_string) != 0) { + return -1; + } + if (totem_config_keyread(totem_config, map, error_string) != 0) { + return -1; + } + return 0; +} + static void debug_dump_totem_config(const struct totem_config *totem_config) { log_printf(LOGSYS_LEVEL_DEBUG, "Token Timeout (%d ms) retransmit timeout (%d ms)", totem_config->token_timeout, totem_config->token_retransmit_timeout); if (totem_config->token_warning) { uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100; log_printf(LOGSYS_LEVEL_DEBUG, "Token warning every %d ms (%d%% of Token Timeout)", token_warning_ms, totem_config->token_warning); if (token_warning_ms < totem_config->token_retransmit_timeout) log_printf (LOGSYS_LEVEL_DEBUG, "The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) " "which can lead to spurious token warnings. Consider increasing the token_warning parameter.", token_warning_ms, totem_config->token_retransmit_timeout); } else log_printf(LOGSYS_LEVEL_DEBUG, "Token warnings disabled"); log_printf(LOGSYS_LEVEL_DEBUG, "token hold (%d ms) retransmits before loss (%d retrans)", totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const); log_printf(LOGSYS_LEVEL_DEBUG, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)", totem_config->join_timeout, totem_config->send_join_timeout, totem_config->consensus_timeout, totem_config->merge_timeout); log_printf(LOGSYS_LEVEL_DEBUG, "downcheck (%d ms) fail to recv const (%d msgs)", totem_config->downcheck_timeout, totem_config->fail_to_recv_const); log_printf(LOGSYS_LEVEL_DEBUG, "seqno unchanged const (%d rotations) Maximum network MTU %d", totem_config->seqno_unchanged_const, totem_config->net_mtu); log_printf(LOGSYS_LEVEL_DEBUG, "window size per rotation (%d messages) maximum messages per rotation (%d messages)", totem_config->window_size, totem_config->max_messages); log_printf(LOGSYS_LEVEL_DEBUG, "missed count const (%d messages)", totem_config->miss_count_const); log_printf(LOGSYS_LEVEL_DEBUG, "heartbeat_failures_allowed (%d)", totem_config->heartbeat_failures_allowed); log_printf(LOGSYS_LEVEL_DEBUG, "max_network_delay (%d ms)", totem_config->max_network_delay); } static void totem_change_notify( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { struct totem_config *totem_config = (struct totem_config *)user_data; uint32_t *param; uint8_t reloading; const char *deleted_key = NULL; const char *error_string; /* * If a full reload is in progress then don't do anything until it's done and * can reconfigure it all atomically */ if (icmap_get_uint8("config.reload_in_progress", &reloading) == CS_OK && reloading) return; param = totem_get_param_by_name((struct totem_config *)user_data, key_name); /* * Process change only if changed key is found in totem_config (-> param is not NULL) * or for special key token_coefficient. token_coefficient key is not stored in * totem_config, but it is used for computation of token timeout. */ if (!param && strcmp(key_name, "totem.token_coefficient") != 0) return; /* * Values other than UINT32 are not supported, or needed (yet) */ switch (event) { case ICMAP_TRACK_DELETE: deleted_key = key_name; break; case ICMAP_TRACK_ADD: case ICMAP_TRACK_MODIFY: deleted_key = NULL; break; default: break; } totem_volatile_config_read (totem_config, icmap_get_global_map(), deleted_key); log_printf(LOGSYS_LEVEL_DEBUG, "Totem related config key changed. Dumping actual totem config."); debug_dump_totem_config(totem_config); if (totem_volatile_config_validate(totem_config, icmap_get_global_map(), &error_string) == -1) { log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string); /* * TODO: Consider corosync exit and/or load defaults for volatile * values. For now, log error seems to be enough */ } } int totemconfig_configure_new_params( struct totem_config *totem_config, icmap_map_t map, const char **error_string) { uint64_t warnings = 0LL; get_interface_params(totem_config, map, error_string, &warnings, 1); if (put_nodelist_members_to_config (totem_config, map, 1, error_string)) { return -1; } calc_knet_ping_timers(totem_config); log_printf(LOGSYS_LEVEL_DEBUG, "Configuration reloaded. Dumping actual totem config."); debug_dump_totem_config(totem_config); /* Reinstate the local_node_pos */ (void)find_local_node(map, 0); return 0; } void totemconfig_commit_new_params( struct totem_config *totem_config, icmap_map_t map) { struct totem_interface *new_interfaces = NULL; new_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); assert(new_interfaces != NULL); memcpy(new_interfaces, totem_config->interfaces, sizeof (struct totem_interface) * INTERFACE_MAX); /* Set link parameters including local_ip */ configure_totem_links(totem_config, map); /* Add & remove nodes */ compute_and_set_totempg_interfaces(totem_config->orig_interfaces, new_interfaces); /* Does basic global params (like compression) */ totempg_reconfigure(); free(new_interfaces); } static void add_totem_config_notification(struct totem_config *totem_config) { icmap_track_t icmap_track; icmap_track_add("totem.", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX, totem_change_notify, totem_config, &icmap_track); } diff --git a/exec/totemconfig.h b/exec/totemconfig.h index bb4579ea..58621765 100644 --- a/exec/totemconfig.h +++ b/exec/totemconfig.h @@ -1,88 +1,93 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2012 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMCONFIG_H_DEFINED #define TOTEMCONFIG_H_DEFINED #include #include #include #include #include "totemsrp.h" #define TOTEM_CONFIG_WARNING_MEMBERS_IGNORED (1<<1) #define TOTEM_CONFIG_WARNING_MEMBERS_DEPRECATED (1<<2) #define TOTEM_CONFIG_WARNING_TOTEM_NODEID_IGNORED (1<<3) #define TOTEM_CONFIG_BINDNETADDR_NODELIST_SET (1<<4) extern int totem_config_read ( struct totem_config *totem_config, const char **error_string, uint64_t *warnings); extern int totem_config_validate ( struct totem_config *totem_config, const char **error_string); extern int totem_config_keyread ( struct totem_config *totem_config, + icmap_map_t map, const char **error_string); extern int totem_config_find_local_addr_in_nodelist( struct totem_config *totem_config, const char *ipaddr_key_prefix, unsigned int *node_pos); - extern void totem_volatile_config_read( struct totem_config *totem_config, icmap_map_t temp_map, const char *deleted_key); +extern int totem_reread_crypto_config( + struct totem_config *totem_config, + icmap_map_t map, + const char **error_string); + extern int totem_volatile_config_validate( struct totem_config *totem_config, icmap_map_t temp_map, const char **error_string); extern int totemconfig_configure_new_params( struct totem_config *totem_config, icmap_map_t map, const char **error_string); extern void totemconfig_commit_new_params( struct totem_config *totem_config, icmap_map_t map); #endif /* TOTEMCONFIG_H_DEFINED */ diff --git a/exec/totemknet.c b/exec/totemknet.c index 5101c4c9..c6a1649d 100644 --- a/exec/totemknet.c +++ b/exec/totemknet.c @@ -1,1961 +1,2121 @@ /* * Copyright (c) 2016-2020 Red Hat, Inc. * * All rights reserved. * * Author: Christine Caulfield (ccaulfie@redhat.com) * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LIBNOZZLE #include #include #endif #include #include #include #include #include #include "totemknet.h" #include "main.h" #include "util.h" #include #include #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif #ifdef HAVE_LIBNOZZLE static int setup_nozzle(void *knet_context); #endif /* Should match that used by cfg */ #define CFG_INTERFACE_STATUS_MAX_LEN 512 struct totemknet_instance { struct crypto_instance *crypto_inst; qb_loop_t *poll_handle; knet_handle_t knet_handle; int link_mode; void *context; void (*totemknet_deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from); void (*totemknet_iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int link_no); void (*totemknet_mtu_changed) ( void *context, int net_mtu); void (*totemknet_target_set_completed) (void *context); /* * Function and data used to log messages */ int totemknet_log_level_security; int totemknet_log_level_error; int totemknet_log_level_warning; int totemknet_log_level_notice; int totemknet_log_level_debug; int totemknet_subsys_id; int knet_subsys_id; void (*totemknet_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7))); void *knet_context; char iov_buffer[KNET_MAX_PACKET_SIZE]; char *link_status[INTERFACE_MAX]; struct totem_ip_address my_ids[INTERFACE_MAX]; uint16_t ip_port[INTERFACE_MAX]; int our_nodeid; int loopback_link; struct totem_config *totem_config; struct totem_ip_address token_target; qb_loop_timer_handle timer_netif_check_timeout; qb_loop_timer_handle timer_merge_detect_timeout; int send_merge_detect_message; unsigned int merge_detect_messages_sent_before_timeout; int logpipes[2]; int knet_fd; pthread_mutex_t log_mutex; #ifdef HAVE_LIBNOZZLE char *nozzle_name; char *nozzle_ipaddr; char *nozzle_prefix; char *nozzle_macaddr; nozzle_t nozzle_handle; #endif }; /* Awkward. But needed to get stats from knet */ struct totemknet_instance *global_instance; struct work_item { const void *msg; unsigned int msg_len; struct totemknet_instance *instance; }; int totemknet_member_list_rebind_ip ( void *knet_context); + +static int totemknet_configure_compression ( + void *knet_context, + struct totem_config *totem_config); + static void totemknet_start_merge_detect_timeout( void *knet_context); static void totemknet_stop_merge_detect_timeout( void *knet_context); static void log_flush_messages ( void *knet_context); static void totemknet_instance_initialize (struct totemknet_instance *instance) { int res; memset (instance, 0, sizeof (struct totemknet_instance)); res = pthread_mutex_init(&instance->log_mutex, NULL); /* * There is not too much else what can be done. */ assert(res == 0); } #define knet_log_printf_lock(level, subsys, function, file, line, format, args...) \ do { \ (void)pthread_mutex_lock(&instance->log_mutex); \ instance->totemknet_log_printf ( \ level, subsys, function, file, line, \ (const char *)format, ##args); \ (void)pthread_mutex_unlock(&instance->log_mutex); \ } while (0); #define knet_log_printf(level, format, args...) \ do { \ knet_log_printf_lock ( \ level, instance->totemknet_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ (const char *)format, ##args); \ } while (0); #define libknet_log_printf(level, format, args...) \ do { \ knet_log_printf_lock ( \ level, instance->knet_subsys_id, \ __FUNCTION__, "libknet.h", __LINE__, \ (const char *)format, ##args); \ } while (0); #define KNET_LOGSYS_PERROR(err_num, level, fmt, args...) \ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemknet_log_printf ( \ level, instance->totemknet_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)", ##args, _error_ptr, err_num); \ } while(0) #ifdef HAVE_LIBNOZZLE static inline int is_ether_addr_multicast(const uint8_t *addr) { return (addr[0] & 0x01); } static inline int is_ether_addr_zero(const uint8_t *addr) { return (!addr[0] && !addr[1] && !addr[2] && !addr[3] && !addr[4] && !addr[5]); } static int ether_host_filter_fn(void *private_data, const unsigned char *outdata, ssize_t outdata_len, uint8_t tx_rx, knet_node_id_t this_host_id, knet_node_id_t src_host_id, int8_t *channel, knet_node_id_t *dst_host_ids, size_t *dst_host_ids_entries) { struct ether_header *eth_h = (struct ether_header *)outdata; uint8_t *dst_mac = (uint8_t *)eth_h->ether_dhost; uint16_t dst_host_id; if (is_ether_addr_zero(dst_mac)) return -1; if (is_ether_addr_multicast(dst_mac)) { return 1; } memmove(&dst_host_id, &dst_mac[4], 2); dst_host_ids[0] = ntohs(dst_host_id); *dst_host_ids_entries = 1; return 0; } #endif static int dst_host_filter_callback_fn(void *private_data, const unsigned char *outdata, ssize_t outdata_len, uint8_t tx_rx, knet_node_id_t this_host_id, knet_node_id_t src_host_id, int8_t *channel, knet_node_id_t *dst_host_ids, size_t *dst_host_ids_entries) { struct totem_message_header *header = (struct totem_message_header *)outdata; int res; #ifdef HAVE_LIBNOZZLE if (*channel != 0) { return ether_host_filter_fn(private_data, outdata, outdata_len, tx_rx, this_host_id, src_host_id, channel, dst_host_ids, dst_host_ids_entries); } #endif if (header->target_nodeid) { dst_host_ids[0] = header->target_nodeid; *dst_host_ids_entries = 1; res = 0; /* unicast message */ } else { *dst_host_ids_entries = 0; res = 1; /* multicast message */ } return res; } static void socket_error_callback_fn(void *private_data, int datafd, int8_t channel, uint8_t tx_rx, int error, int errorno) { struct totemknet_instance *instance = (struct totemknet_instance *)private_data; knet_log_printf (LOGSYS_LEVEL_DEBUG, "Knet socket ERROR notification called: txrx=%d, error=%d, errorno=%d", tx_rx, error, errorno); if ((error == -1 && errorno != EAGAIN) || (error == 0)) { knet_handle_remove_datafd(instance->knet_handle, datafd); } } static void host_change_callback_fn(void *private_data, knet_node_id_t host_id, uint8_t reachable, uint8_t remote, uint8_t external) { struct totemknet_instance *instance = (struct totemknet_instance *)private_data; // TODO: what? if anything. knet_log_printf (LOGSYS_LEVEL_DEBUG, "Knet host change callback. nodeid: " CS_PRI_NODE_ID " reachable: %d", host_id, reachable); } static void pmtu_change_callback_fn(void *private_data, unsigned int data_mtu) { struct totemknet_instance *instance = (struct totemknet_instance *)private_data; knet_log_printf (LOGSYS_LEVEL_DEBUG, "Knet pMTU change: %d", data_mtu); /* We don't need to tell corosync the actual knet MTU */ // instance->totemknet_mtu_changed(instance->context, data_mtu); } int totemknet_crypto_set ( void *knet_context, const char *cipher_type, const char *hash_type) { return (0); } static inline void ucast_sendmsg ( struct totemknet_instance *instance, struct totem_ip_address *system_to, const void *msg, unsigned int msg_len) { int res = 0; struct totem_message_header *header = (struct totem_message_header *)msg; struct msghdr msg_ucast; struct iovec iovec; header->target_nodeid = system_to->nodeid; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; /* * Build unicast message */ memset(&msg_ucast, 0, sizeof(msg_ucast)); msg_ucast.msg_iov = (void *)&iovec; msg_ucast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_ucast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_ucast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_ucast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_ucast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_ucast.msg_accrightslen = 0; #endif /* * Transmit unicast message * An error here is recovered by totemsrp */ res = sendmsg (instance->knet_fd, &msg_ucast, MSG_NOSIGNAL); if (res < 0) { KNET_LOGSYS_PERROR (errno, instance->totemknet_log_level_debug, "sendmsg(ucast) failed (non-critical)"); } } static inline void mcast_sendmsg ( struct totemknet_instance *instance, const void *msg, unsigned int msg_len, int only_active) { int res; struct totem_message_header *header = (struct totem_message_header *)msg; struct msghdr msg_mcast; struct iovec iovec; iovec.iov_base = (void *)msg; iovec.iov_len = msg_len; header->target_nodeid = 0; /* * Build multicast message */ memset(&msg_mcast, 0, sizeof(msg_mcast)); msg_mcast.msg_iov = (void *)&iovec; msg_mcast.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_mcast.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_mcast.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_mcast.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_mcast.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_mcast.msg_accrightslen = 0; #endif // log_printf (LOGSYS_LEVEL_DEBUG, "totemknet: mcast_sendmsg. only_active=%d, len=%d", only_active, msg_len); res = sendmsg (instance->knet_fd, &msg_mcast, MSG_NOSIGNAL); if (res < msg_len) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "totemknet: mcast_send sendmsg returned %d", res); } if (!only_active || instance->send_merge_detect_message) { /* * Current message was sent to all nodes */ instance->merge_detect_messages_sent_before_timeout++; instance->send_merge_detect_message = 0; } } static int node_compare(const void *aptr, const void *bptr) { uint16_t a,b; a = *(uint16_t *)aptr; b = *(uint16_t *)bptr; return a > b; } #ifndef OWN_INDEX_NONE #define OWN_INDEX_NONE -1 #endif int totemknet_ifaces_get (void *knet_context, char ***status, unsigned int *iface_count) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; struct knet_link_status link_status; knet_node_id_t host_list[KNET_MAX_HOST]; uint8_t link_list[KNET_MAX_LINK]; size_t num_hosts; size_t num_links; size_t link_idx; int i,j; char *ptr; int res = 0; /* * Don't do the whole 'link_info' bit if the caller just wants * a count of interfaces. */ if (status) { int own_idx = OWN_INDEX_NONE; res = knet_host_get_host_list(instance->knet_handle, host_list, &num_hosts); if (res) { return (-1); } qsort(host_list, num_hosts, sizeof(uint16_t), node_compare); for (j=0; jour_nodeid) { own_idx = j; break; } } for (i=0; ilink_status[i], 'd', CFG_INTERFACE_STATUS_MAX_LEN-1); if (own_idx != OWN_INDEX_NONE) { instance->link_status[i][own_idx] = 'n'; } instance->link_status[i][num_hosts] = '\0'; } /* This is all a bit "inside-out" because "status" is a set of strings per link * and knet orders things by host */ for (j=0; jknet_handle, host_list[j], link_list, &num_links); if (res) { return (-1); } link_idx = 0; for (i=0; i < num_links; i++) { /* * Skip over links that are unconfigured to corosync. This is basically * link0 if corosync isn't using it for comms, as we will still * have it set up for loopback. */ if (!instance->totem_config->interfaces[link_list[i]].configured) { continue; } ptr = instance->link_status[link_idx++]; res = knet_link_get_status(instance->knet_handle, host_list[j], link_list[i], &link_status, sizeof(link_status)); if (res == 0) { ptr[j] = '0' + (link_status.enabled | link_status.connected<<1 | link_status.dynconnected<<2); } else { knet_log_printf (LOGSYS_LEVEL_ERROR, "totemknet_ifaces_get: Cannot get link status: %s", strerror(errno)); ptr[j] = '?'; } } } *status = instance->link_status; } *iface_count = INTERFACE_MAX; return (res); } int totemknet_finalize ( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; int i,j; static knet_node_id_t nodes[KNET_MAX_HOST]; /* static to save stack */ uint8_t links[KNET_MAX_LINK]; size_t num_nodes; size_t num_links; knet_log_printf(LOG_DEBUG, "totemknet: finalize"); qb_loop_poll_del (instance->poll_handle, instance->logpipes[0]); qb_loop_poll_del (instance->poll_handle, instance->knet_fd); /* * Disable forwarding to make knet flush send queue. This ensures that the LEAVE message will be sent. */ res = knet_handle_setfwd(instance->knet_handle, 0); if (res) { knet_log_printf (LOGSYS_LEVEL_CRIT, "totemknet: knet_handle_setfwd failed: %s", strerror(errno)); } res = knet_host_get_host_list(instance->knet_handle, nodes, &num_nodes); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Cannot get knet node list for shutdown: %s", strerror(errno)); /* Crash out anyway */ goto finalise_error; } /* Tidily shut down all nodes & links. */ for (i=0; iknet_handle, nodes[i], links, &num_links); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Cannot get knet link list for node " CS_PRI_NODE_ID ": %s", nodes[i], strerror(errno)); goto finalise_error; } for (j=0; jknet_handle, nodes[i], links[j], 0); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "totemknet: knet_link_set_enable(node " CS_PRI_NODE_ID ", link %d) failed: %s", nodes[i], links[j], strerror(errno)); } res = knet_link_clear_config(instance->knet_handle, nodes[i], links[j]); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "totemknet: knet_link_clear_config(node " CS_PRI_NODE_ID ", link %d) failed: %s", nodes[i], links[j], strerror(errno)); } } res = knet_host_remove(instance->knet_handle, nodes[i]); if (res) { knet_log_printf (LOGSYS_LEVEL_ERROR, "totemknet: knet_host_remove(node " CS_PRI_NODE_ID ") failed: %s", nodes[i], strerror(errno)); } } finalise_error: res = knet_handle_free(instance->knet_handle); if (res) { knet_log_printf (LOGSYS_LEVEL_CRIT, "totemknet: knet_handle_free failed: %s", strerror(errno)); } totemknet_stop_merge_detect_timeout(instance); log_flush_messages(instance); /* * Error is deliberately ignored */ (void)pthread_mutex_destroy(&instance->log_mutex); return (res); } static int log_deliver_fn ( int fd, int revents, void *data) { struct totemknet_instance *instance = (struct totemknet_instance *)data; char buffer[sizeof(struct knet_log_msg)*4]; char *bufptr = buffer; int done = 0; int len; len = read(fd, buffer, sizeof(buffer)); while (done < len) { struct knet_log_msg *msg = (struct knet_log_msg *)bufptr; switch (msg->msglevel) { case KNET_LOG_ERR: libknet_log_printf (LOGSYS_LEVEL_ERROR, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; case KNET_LOG_WARN: libknet_log_printf (LOGSYS_LEVEL_WARNING, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; case KNET_LOG_INFO: libknet_log_printf (LOGSYS_LEVEL_INFO, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; case KNET_LOG_DEBUG: libknet_log_printf (LOGSYS_LEVEL_DEBUG, "%s: %s", knet_log_get_subsystem_name(msg->subsystem), msg->msg); break; } bufptr += sizeof(struct knet_log_msg); done += sizeof(struct knet_log_msg); } return 0; } static int data_deliver_fn ( int fd, int revents, void *data) { struct totemknet_instance *instance = (struct totemknet_instance *)data; struct msghdr msg_hdr; struct iovec iov_recv; struct sockaddr_storage system_from; ssize_t msg_len; int truncated_packet; iov_recv.iov_base = instance->iov_buffer; iov_recv.iov_len = KNET_MAX_PACKET_SIZE; msg_hdr.msg_name = &system_from; msg_hdr.msg_namelen = sizeof (struct sockaddr_storage); msg_hdr.msg_iov = &iov_recv; msg_hdr.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_hdr.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_hdr.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_hdr.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_hdr.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_hdr.msg_accrightslen = 0; #endif msg_len = recvmsg (fd, &msg_hdr, MSG_NOSIGNAL | MSG_DONTWAIT); if (msg_len <= 0) { return (0); } truncated_packet = 0; #ifdef HAVE_MSGHDR_FLAGS if (msg_hdr.msg_flags & MSG_TRUNC) { truncated_packet = 1; } #else /* * We don't have MSGHDR_FLAGS, but we can (hopefully) safely make assumption that * if bytes_received == KNET_MAX_PACKET_SIZE then packet is truncated */ if (bytes_received == KNET_MAX_PACKET_SIZE) { truncated_packet = 1; } #endif if (truncated_packet) { knet_log_printf(instance->totemknet_log_level_error, "Received too big message. This may be because something bad is happening" "on the network (attack?), or you tried join more nodes than corosync is" "compiled with (%u) or bug in the code (bad estimation of " "the KNET_MAX_PACKET_SIZE). Dropping packet.", PROCESSOR_COUNT_MAX); return (0); } /* * Handle incoming message */ instance->totemknet_deliver_fn ( instance->context, instance->iov_buffer, msg_len, &system_from); return (0); } static void timer_function_netif_check_timeout ( void *data) { struct totemknet_instance *instance = (struct totemknet_instance *)data; int i; for (i=0; i < INTERFACE_MAX; i++) { if (!instance->totem_config->interfaces[i].configured) { continue; } instance->totemknet_iface_change_fn (instance->context, &instance->my_ids[i], i); } } static void knet_set_access_list_config(struct totemknet_instance *instance) { #ifdef HAVE_KNET_ACCESS_LIST uint32_t value; cs_error_t err; value = instance->totem_config->block_unlisted_ips; knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet_enable access list: %d", value); err = knet_handle_enable_access_lists(instance->knet_handle, value); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_enable_access_lists failed"); } #endif } /* NOTE: this relies on the fact that totem_reload_notify() is called first */ static void totemknet_refresh_config( int32_t event, const char *key_name, struct icmap_notify_value new_val, struct icmap_notify_value old_val, void *user_data) { uint8_t reloading; uint32_t value; uint32_t link_no; size_t num_nodes; knet_node_id_t host_ids[KNET_MAX_HOST]; int i; int err; struct totemknet_instance *instance = (struct totemknet_instance *)user_data; ENTER(); /* * If a full reload is in progress then don't do anything until it's done and * can reconfigure it all atomically */ if (icmap_get_uint8("config.totemconfig_reload_in_progress", &reloading) == CS_OK && reloading) { return; } knet_set_access_list_config(instance); if (icmap_get_uint32("totem.knet_pmtud_interval", &value) == CS_OK) { instance->totem_config->knet_pmtud_interval = value; knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet_pmtud_interval now %d", value); err = knet_handle_pmtud_setfreq(instance->knet_handle, instance->totem_config->knet_pmtud_interval); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_pmtud_setfreq failed"); } } /* Configure link parameters for each node */ err = knet_host_get_host_list(instance->knet_handle, host_ids, &num_nodes); if (err != 0) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_host_get_host_list failed"); } for (i=0; iour_nodeid || !instance->totem_config->interfaces[link_no].configured) { continue; } err = knet_link_set_ping_timers(instance->knet_handle, host_ids[i], link_no, instance->totem_config->interfaces[link_no].knet_ping_interval, instance->totem_config->interfaces[link_no].knet_ping_timeout, instance->totem_config->interfaces[link_no].knet_ping_precision); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_ping_timers for node " CS_PRI_NODE_ID " link %d failed", host_ids[i], link_no); } err = knet_link_set_pong_count(instance->knet_handle, host_ids[i], link_no, instance->totem_config->interfaces[link_no].knet_pong_count); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_pong_count for node " CS_PRI_NODE_ID " link %d failed",host_ids[i], link_no); } err = knet_link_set_priority(instance->knet_handle, host_ids[i], link_no, instance->totem_config->interfaces[link_no].knet_link_priority); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_priority for node " CS_PRI_NODE_ID " link %d failed", host_ids[i], link_no); } } } LEAVE(); } static void totemknet_add_config_notifications(struct totemknet_instance *instance) { icmap_track_t icmap_track_totem = NULL; icmap_track_t icmap_track_reload = NULL; ENTER(); icmap_track_add("totem.", ICMAP_TRACK_ADD | ICMAP_TRACK_DELETE | ICMAP_TRACK_MODIFY | ICMAP_TRACK_PREFIX, totemknet_refresh_config, instance, &icmap_track_totem); icmap_track_add("config.totemconfig_reload_in_progress", ICMAP_TRACK_ADD | ICMAP_TRACK_MODIFY, totemknet_refresh_config, instance, &icmap_track_reload); LEAVE(); } +static int totemknet_set_knet_crypto(struct totemknet_instance *instance) +{ + struct knet_handle_crypto_cfg crypto_cfg; + int res; + + /* These have already been validated */ + memcpy(crypto_cfg.crypto_model, instance->totem_config->crypto_model, sizeof(crypto_cfg.crypto_model)); + memcpy(crypto_cfg.crypto_cipher_type, instance->totem_config->crypto_cipher_type, sizeof(crypto_cfg.crypto_model)); + memcpy(crypto_cfg.crypto_hash_type, instance->totem_config->crypto_hash_type, sizeof(crypto_cfg.crypto_model)); + memcpy(crypto_cfg.private_key, instance->totem_config->private_key, instance->totem_config->private_key_len); + crypto_cfg.private_key_len = instance->totem_config->private_key_len; + +#ifdef HAVE_KNET_CRYPTO_RECONF + + knet_log_printf(LOGSYS_LEVEL_DEBUG, "Configuring crypto %s/%s/%s on index %d", + crypto_cfg.crypto_model, + crypto_cfg.crypto_cipher_type, + crypto_cfg.crypto_hash_type, + instance->totem_config->crypto_index + ); + + /* If crypto is being disabled we need to explicitly allow cleartext traffic in knet */ + if (strcmp(instance->totem_config->crypto_cipher_type, "none") == 0) { + res = knet_handle_crypto_rx_clear_traffic(instance->knet_handle, KNET_CRYPTO_RX_ALLOW_CLEAR_TRAFFIC); + if (res) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_rx_clear_traffic(ALLOW) failed %s", strerror(errno)); + } + } + + /* use_config will be called later when all nodes are synced */ + res = knet_handle_crypto_set_config(instance->knet_handle, &crypto_cfg, instance->totem_config->crypto_index); + if (res == -1) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_set_config (index %d) failed: %s", instance->totem_config->crypto_index, strerror(errno)); + goto exit_error; + } + if (res == -2) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_set_config (index %d) failed: -2", instance->totem_config->crypto_index); + goto exit_error; + } +#else + knet_log_printf(LOGSYS_LEVEL_DEBUG, "Configuring crypto %s/%s/%s", + crypto_cfg.crypto_model, + crypto_cfg.crypto_cipher_type, + crypto_cfg.crypto_hash_type + ); + + res = knet_handle_crypto(instance->knet_handle, &crypto_cfg); + if (res == -1) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto failed: %s", strerror(errno)); + goto exit_error; + } + if (res == -2) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto failed: -2"); + goto exit_error; + } +#endif + + +exit_error: + return res; +} + /* * Create an instance */ int totemknet_initialize ( qb_loop_t *poll_handle, void **knet_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, void (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), void (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int link_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)) { struct totemknet_instance *instance; int8_t channel=0; int res; int i; instance = malloc (sizeof (struct totemknet_instance)); if (instance == NULL) { return (-1); } totemknet_instance_initialize (instance); instance->totem_config = totem_config; /* * Configure logging */ instance->totemknet_log_level_security = 1; //totem_config->totem_logging_configuration.log_level_security; instance->totemknet_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemknet_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemknet_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemknet_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemknet_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemknet_log_printf = totem_config->totem_logging_configuration.log_printf; instance->knet_subsys_id = _logsys_subsys_create("KNET", "libknet.h"); /* * Initialize local variables for totemknet */ instance->our_nodeid = instance->totem_config->node_id; for (i=0; i< INTERFACE_MAX; i++) { totemip_copy(&instance->my_ids[i], &totem_config->interfaces[i].bindnet); instance->my_ids[i].nodeid = instance->our_nodeid; instance->ip_port[i] = totem_config->interfaces[i].ip_port; /* Needed for totemsrp */ totem_config->interfaces[i].boundto.nodeid = instance->our_nodeid; } instance->poll_handle = poll_handle; instance->context = context; instance->totemknet_deliver_fn = deliver_fn; instance->totemknet_iface_change_fn = iface_change_fn; instance->totemknet_mtu_changed = mtu_changed; instance->totemknet_target_set_completed = target_set_completed; instance->loopback_link = 0; res = pipe(instance->logpipes); if (res == -1) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_CRIT, "failed to create pipe for instance->logpipes"); goto exit_error; } if (fcntl(instance->logpipes[0], F_SETFL, O_NONBLOCK) == -1 || fcntl(instance->logpipes[1], F_SETFL, O_NONBLOCK) == -1) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_CRIT, "failed to set O_NONBLOCK flag for instance->logpipes"); goto exit_error; } #if !defined(KNET_API_VER) || (KNET_API_VER == 1) instance->knet_handle = knet_handle_new(instance->totem_config->node_id, instance->logpipes[1], KNET_LOG_DEBUG); #endif #if KNET_API_VER == 2 instance->knet_handle = knet_handle_new(instance->totem_config->node_id, instance->logpipes[1], KNET_LOG_DEBUG, KNET_HANDLE_FLAG_PRIVILEGED); #endif if (!instance->knet_handle) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_CRIT, "knet_handle_new failed"); goto exit_error; } knet_set_access_list_config(instance); res = knet_handle_pmtud_setfreq(instance->knet_handle, instance->totem_config->knet_pmtud_interval); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_pmtud_setfreq failed"); } res = knet_handle_enable_filter(instance->knet_handle, instance, dst_host_filter_callback_fn); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_enable_filter failed"); } res = knet_handle_enable_sock_notify(instance->knet_handle, instance, socket_error_callback_fn); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_enable_sock_notify failed"); } res = knet_host_enable_status_change_notify(instance->knet_handle, instance, host_change_callback_fn); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_host_enable_status_change_notify failed"); } res = knet_handle_enable_pmtud_notify(instance->knet_handle, instance, pmtu_change_callback_fn); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, "knet_handle_enable_pmtud_notify failed"); } global_instance = instance; /* Get an fd into knet */ instance->knet_fd = 0; res = knet_handle_add_datafd(instance->knet_handle, &instance->knet_fd, &channel); if (res) { knet_log_printf(LOG_DEBUG, "knet_handle_add_datafd failed: %s", strerror(errno)); goto exit_error; } /* Enable crypto if requested */ +#ifdef HAVE_KNET_CRYPTO_RECONF if (strcmp(instance->totem_config->crypto_cipher_type, "none") != 0) { - struct knet_handle_crypto_cfg crypto_cfg; - - assert(strlen(instance->totem_config->crypto_model) < sizeof(crypto_cfg.crypto_model)); - strcpy(crypto_cfg.crypto_model, instance->totem_config->crypto_model); - - assert(strlen(instance->totem_config->crypto_cipher_type) < sizeof(crypto_cfg.crypto_cipher_type)); - strcpy(crypto_cfg.crypto_cipher_type, instance->totem_config->crypto_cipher_type); - - assert(strlen(instance->totem_config->crypto_hash_type) < sizeof(crypto_cfg.crypto_hash_type)); - strcpy(crypto_cfg.crypto_hash_type, instance->totem_config->crypto_hash_type); - - assert(instance->totem_config->private_key_len <= sizeof(crypto_cfg.private_key)); - memcpy(crypto_cfg.private_key, instance->totem_config->private_key, instance->totem_config->private_key_len); - crypto_cfg.private_key_len = instance->totem_config->private_key_len; + res = totemknet_set_knet_crypto(instance); + if (res == 0) { + res = knet_handle_crypto_use_config(instance->knet_handle, totem_config->crypto_index); + if (res) { + knet_log_printf(LOG_DEBUG, "knet_handle_crypto_use_config failed: %s", strerror(errno)); + goto exit_error; + } + } else { + knet_log_printf(LOG_DEBUG, "Failed to set up knet crypto"); + goto exit_error; + } + res = knet_handle_crypto_rx_clear_traffic(instance->knet_handle, KNET_CRYPTO_RX_DISALLOW_CLEAR_TRAFFIC); + if (res) { + knet_log_printf(LOG_DEBUG, "knet_handle_crypto_rx_clear_traffic (DISALLOW) failed: %s", strerror(errno)); + goto exit_error; + } - res = knet_handle_crypto(instance->knet_handle, &crypto_cfg); - if (res == -1) { - knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto failed: %s", strerror(errno)); + } else { + res = knet_handle_crypto_rx_clear_traffic(instance->knet_handle, KNET_CRYPTO_RX_ALLOW_CLEAR_TRAFFIC); + if (res) { + knet_log_printf(LOG_DEBUG, "knet_handle_crypto_rx_clear_traffic (ALLOW) failed: %s", strerror(errno)); goto exit_error; } - if (res == -2) { - knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto failed: -2"); + } +#else + if (strcmp(instance->totem_config->crypto_cipher_type, "none") != 0) { + res = totemknet_set_knet_crypto(instance); + if (res) { + knet_log_printf(LOG_DEBUG, "Failed to set up knet crypto"); goto exit_error; } - knet_log_printf(LOG_INFO, "kronosnet crypto initialized: %s/%s", crypto_cfg.crypto_cipher_type, crypto_cfg.crypto_hash_type); } +#endif /* Set up compression */ - totemknet_reconfigure(instance, instance->totem_config); + if (strcmp(totem_config->knet_compression_model, "none") != 0) { + /* Not fatal, but will log */ + (void)totemknet_configure_compression(knet_context, totem_config); + } knet_handle_setfwd(instance->knet_handle, 1); instance->link_mode = KNET_LINK_POLICY_PASSIVE; if (strcmp(instance->totem_config->link_mode, "active")==0) { instance->link_mode = KNET_LINK_POLICY_ACTIVE; } if (strcmp(instance->totem_config->link_mode, "rr")==0) { instance->link_mode = KNET_LINK_POLICY_RR; } for (i=0; ilink_status[i] = malloc(CFG_INTERFACE_STATUS_MAX_LEN); if (!instance->link_status[i]) { goto exit_error; } } qb_loop_poll_add (instance->poll_handle, QB_LOOP_MED, instance->logpipes[0], POLLIN, instance, log_deliver_fn); qb_loop_poll_add (instance->poll_handle, QB_LOOP_HIGH, instance->knet_fd, POLLIN, instance, data_deliver_fn); /* * Upper layer isn't ready to receive message because it hasn't * initialized yet. Add short timer to check the interfaces. */ qb_loop_timer_add (instance->poll_handle, QB_LOOP_MED, 100*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); totemknet_start_merge_detect_timeout(instance); /* Start listening for config changes */ totemknet_add_config_notifications(instance); /* Add stats keys to icmap */ stats_knet_add_handle(); knet_log_printf (LOGSYS_LEVEL_INFO, "totemknet initialized"); *knet_context = instance; return (0); exit_error: log_flush_messages(instance); free(instance); return (-1); } void *totemknet_buffer_alloc (void) { /* Need to have space for a message AND a struct mcast in case of encapsulated messages */ return malloc(KNET_MAX_PACKET_SIZE + 512); } void totemknet_buffer_release (void *ptr) { return free (ptr); } int totemknet_processor_count_set ( void *knet_context, int processor_count) { return (0); } int totemknet_recv_flush (void *knet_context) { return (0); } int totemknet_send_flush (void *knet_context) { return (0); } int totemknet_token_send ( void *knet_context, const void *msg, unsigned int msg_len) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; ucast_sendmsg (instance, &instance->token_target, msg, msg_len); return (res); } int totemknet_mcast_flush_send ( void *knet_context, const void *msg, unsigned int msg_len) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; mcast_sendmsg (instance, msg, msg_len, 0); return (res); } int totemknet_mcast_noflush_send ( void *knet_context, const void *msg, unsigned int msg_len) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; mcast_sendmsg (instance, msg, msg_len, 1); return (res); } extern int totemknet_iface_check (void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; knet_log_printf(LOG_DEBUG, "totemknet: iface_check"); return (res); } extern void totemknet_net_mtu_adjust (void *knet_context, struct totem_config *totem_config) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; knet_log_printf(LOG_DEBUG, "totemknet: Returning MTU of %d", totem_config->net_mtu); } int totemknet_token_target_set ( void *knet_context, unsigned int nodeid) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res = 0; instance->token_target.nodeid = nodeid; instance->totemknet_target_set_completed (instance->context); return (res); } extern int totemknet_recv_mcast_empty ( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; unsigned int res; struct sockaddr_storage system_from; struct msghdr msg_hdr; struct iovec iov_recv; struct pollfd ufd; int nfds; int msg_processed = 0; iov_recv.iov_base = instance->iov_buffer; iov_recv.iov_len = KNET_MAX_PACKET_SIZE; msg_hdr.msg_name = &system_from; msg_hdr.msg_namelen = sizeof (struct sockaddr_storage); msg_hdr.msg_iov = &iov_recv; msg_hdr.msg_iovlen = 1; #ifdef HAVE_MSGHDR_CONTROL msg_hdr.msg_control = 0; #endif #ifdef HAVE_MSGHDR_CONTROLLEN msg_hdr.msg_controllen = 0; #endif #ifdef HAVE_MSGHDR_FLAGS msg_hdr.msg_flags = 0; #endif #ifdef HAVE_MSGHDR_ACCRIGHTS msg_msg_hdr.msg_accrights = NULL; #endif #ifdef HAVE_MSGHDR_ACCRIGHTSLEN msg_msg_hdr.msg_accrightslen = 0; #endif do { ufd.fd = instance->knet_fd; ufd.events = POLLIN; nfds = poll (&ufd, 1, 0); if (nfds == 1 && ufd.revents & POLLIN) { res = recvmsg (instance->knet_fd, &msg_hdr, MSG_NOSIGNAL | MSG_DONTWAIT); if (res != -1) { msg_processed = 1; } else { msg_processed = -1; } } } while (nfds == 1); return (msg_processed); } int totemknet_iface_set (void *knet_context, const struct totem_ip_address *local_addr, unsigned short ip_port, unsigned int iface_no) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; totemip_copy(&instance->my_ids[iface_no], local_addr); knet_log_printf(LOG_INFO, "Configured link number %d: local addr: %s, port=%d", iface_no, totemip_print(local_addr), ip_port); instance->ip_port[iface_no] = ip_port; return 0; } int totemknet_member_add ( void *knet_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int link_no) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int err; int port = instance->ip_port[link_no]; struct sockaddr_storage remote_ss; struct sockaddr_storage local_ss; int addrlen; int i; int host_found = 0; knet_node_id_t host_ids[KNET_MAX_HOST]; size_t num_host_ids; /* Only create 1 loopback link and use link 0 */ if (member->nodeid == instance->our_nodeid) { if (!instance->loopback_link) { link_no = 0; instance->loopback_link = 1; } else { /* Already done */ return 0; } } knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: member_add: " CS_PRI_NODE_ID " (%s), link=%d", member->nodeid, totemip_print(member), link_no); knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: local: " CS_PRI_NODE_ID " (%s)", local->nodeid, totemip_print(local)); /* Only add the host if it doesn't already exist in knet */ err = knet_host_get_host_list(instance->knet_handle, host_ids, &num_host_ids); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_host_get_host_list"); return -1; } for (i=0; inodeid) { host_found = 1; } } if (!host_found) { err = knet_host_add(instance->knet_handle, member->nodeid); if (err != 0 && errno != EEXIST) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_host_add"); return -1; } } else { knet_log_printf (LOGSYS_LEVEL_DEBUG, "nodeid " CS_PRI_NODE_ID " already added", member->nodeid); } if (err == 0) { if (knet_host_set_policy(instance->knet_handle, member->nodeid, instance->link_mode)) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_set_policy failed"); return -1; } } memset(&local_ss, 0, sizeof(local_ss)); memset(&remote_ss, 0, sizeof(remote_ss)); /* Casts to remove const */ totemip_totemip_to_sockaddr_convert((struct totem_ip_address *)member, port, &remote_ss, &addrlen); totemip_totemip_to_sockaddr_convert((struct totem_ip_address *)local, port, &local_ss, &addrlen); if (member->nodeid == instance->our_nodeid) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: loopback link is %d\n", link_no); err = knet_link_set_config(instance->knet_handle, member->nodeid, link_no, KNET_TRANSPORT_LOOPBACK, &local_ss, &remote_ss, KNET_LINK_FLAG_TRAFFICHIPRIO); } else { err = knet_link_set_config(instance->knet_handle, member->nodeid, link_no, instance->totem_config->interfaces[link_no].knet_transport, &local_ss, &remote_ss, KNET_LINK_FLAG_TRAFFICHIPRIO); } if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_config failed"); return -1; } knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: member_add: Setting link prio to %d", instance->totem_config->interfaces[link_no].knet_link_priority); err = knet_link_set_priority(instance->knet_handle, member->nodeid, link_no, instance->totem_config->interfaces[link_no].knet_link_priority); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_priority for nodeid " CS_PRI_NODE_ID ", link %d failed", member->nodeid, link_no); } /* ping timeouts maybe 0 here for a newly added interface so we leave this till later, it will get done in totemknet_refresh_config */ if (instance->totem_config->interfaces[link_no].knet_ping_interval != 0) { err = knet_link_set_ping_timers(instance->knet_handle, member->nodeid, link_no, instance->totem_config->interfaces[link_no].knet_ping_interval, instance->totem_config->interfaces[link_no].knet_ping_timeout, instance->totem_config->interfaces[link_no].knet_ping_precision); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_ping_timers for nodeid " CS_PRI_NODE_ID ", link %d failed", member->nodeid, link_no); } err = knet_link_set_pong_count(instance->knet_handle, member->nodeid, link_no, instance->totem_config->interfaces[link_no].knet_pong_count); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_pong_count for nodeid " CS_PRI_NODE_ID ", link %d failed", member->nodeid, link_no); } } err = knet_link_set_enable(instance->knet_handle, member->nodeid, link_no, 1); if (err) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set_enable for nodeid " CS_PRI_NODE_ID ", link %d failed", member->nodeid, link_no); return -1; } /* register stats */ stats_knet_add_member(member->nodeid, link_no); return (0); } int totemknet_member_remove ( void *knet_context, const struct totem_ip_address *token_target, int link_no) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res; uint8_t link_list[KNET_MAX_LINK]; size_t num_links; knet_log_printf (LOGSYS_LEVEL_DEBUG, "knet: member_remove: " CS_PRI_NODE_ID ", link=%d", token_target->nodeid, link_no); /* Don't remove the link with the loopback on it until we shut down */ if (token_target->nodeid == instance->our_nodeid) { return 0; } /* Tidy stats */ stats_knet_del_member(token_target->nodeid, link_no); /* Remove the link first */ res = knet_link_set_enable(instance->knet_handle, token_target->nodeid, link_no, 0); if (res != 0) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_set enable(off) for nodeid " CS_PRI_NODE_ID ", link %d failed", token_target->nodeid, link_no); return res; } res = knet_link_clear_config(instance->knet_handle, token_target->nodeid, link_no); if (res != 0) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_link_clear_config for nodeid " CS_PRI_NODE_ID ", link %d failed", token_target->nodeid, link_no); return res; } /* If this is the last link, then remove the node */ res = knet_link_get_link_list(instance->knet_handle, token_target->nodeid, link_list, &num_links); if (res) { return (0); /* not really failure */ } if (num_links == 0) { res = knet_host_remove(instance->knet_handle, token_target->nodeid); } return res; } int totemknet_member_list_rebind_ip ( void *knet_context) { return (0); } -int totemknet_reconfigure ( + +static int totemknet_configure_compression ( void *knet_context, struct totem_config *totem_config) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; struct knet_handle_compress_cfg compress_cfg; int res = 0; assert(strlen(totem_config->knet_compression_model) < sizeof(compress_cfg.compress_model)); strcpy(compress_cfg.compress_model, totem_config->knet_compression_model); compress_cfg.compress_threshold = totem_config->knet_compression_threshold; compress_cfg.compress_level = totem_config->knet_compression_level; res = knet_handle_compress(instance->knet_handle, &compress_cfg); if (res) { KNET_LOGSYS_PERROR(errno, LOGSYS_LEVEL_ERROR, "knet_handle_compress failed"); } + return res; +} + +int totemknet_reconfigure ( + void *knet_context, + struct totem_config *totem_config) +{ + struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; + int res = 0; + + (void)totemknet_configure_compression(knet_context, totem_config); #ifdef HAVE_LIBNOZZLE - /* Set up nozzle device(s). Return code is ignored, because unability + /* Set up nozzle device(s). Return code is ignored, because inability * configure nozzle is not fatal problem, errors are logged and * there is not much else we can do */ (void)setup_nozzle(instance); #endif + + if (totem_config->crypto_changed) { + /* Flip crypto_index */ + totem_config->crypto_index = 3-totem_config->crypto_index; + res = totemknet_set_knet_crypto(instance); + + knet_log_printf(LOG_INFO, "kronosnet crypto reconfigured on index %d: %s/%s/%s", totem_config->crypto_index, + totem_config->crypto_model, + totem_config->crypto_cipher_type, + totem_config->crypto_hash_type); + } return (res); } +int totemknet_crypto_reconfigure_phase ( + void *knet_context, + struct totem_config *totem_config, + cfg_message_crypto_reconfig_phase_t phase) +{ +#ifdef HAVE_KNET_CRYPTO_RECONF + int res; + int config_to_use; + int config_to_clear; + struct knet_handle_crypto_cfg crypto_cfg; + struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; + + knet_log_printf(LOGSYS_LEVEL_DEBUG, "totemknet_crypto_reconfigure_phase %d, index=%d\n", phase, totem_config->crypto_index); + + switch (phase) { + case CRYPTO_RECONFIG_PHASE_ACTIVATE: + config_to_use = totem_config->crypto_index; + if (strcmp(instance->totem_config->crypto_cipher_type, "none") == 0) { + config_to_use = 0; /* we are clearing it */ + } + + /* Enable the new config on this node */ + res = knet_handle_crypto_use_config(instance->knet_handle, config_to_use); + if (res == -1) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_use_config %d failed: %s", config_to_use, strerror(errno)); + } + break; + + case CRYPTO_RECONFIG_PHASE_CLEANUP: + /* + * All nodes should now have the new config. clear the old one out + * OR disable crypto entirely if that's what the new config insists on. + */ + config_to_clear = 3-totem_config->crypto_index; + knet_log_printf(LOGSYS_LEVEL_DEBUG, "Clearing old knet crypto config %d\n", config_to_clear); + + strcpy(crypto_cfg.crypto_model, "none"); + strcpy(crypto_cfg.crypto_cipher_type, "none"); + strcpy(crypto_cfg.crypto_hash_type, "none"); + res = knet_handle_crypto_set_config(instance->knet_handle, &crypto_cfg, config_to_clear); + if (res == -1) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_set_config to clear index %d failed: %s", config_to_clear, strerror(errno)); + } + if (res == -2) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_set_config to clear index %d failed: -2", config_to_clear); + } + + /* If crypto is enabled then disable all cleartext reception */ + if (strcmp(instance->totem_config->crypto_cipher_type, "none") != 0) { + res = knet_handle_crypto_rx_clear_traffic(instance->knet_handle, KNET_CRYPTO_RX_DISALLOW_CLEAR_TRAFFIC); + if (res) { + knet_log_printf(LOGSYS_LEVEL_ERROR, "knet_handle_crypto_rx_clear_traffic(DISALLOW) failed %s", strerror(errno)); + } + } + } +#endif + return 0; +} + void totemknet_stats_clear ( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; (void) knet_handle_clear_stats(instance->knet_handle, KNET_CLEARSTATS_HANDLE_AND_LINK); } /* For the stats module */ int totemknet_link_get_status ( knet_node_id_t node, uint8_t link_no, struct knet_link_status *status) { int res; int ret = CS_OK; /* We are probably not using knet */ if (!global_instance) { return CS_ERR_NOT_EXIST; } if (link_no >= INTERFACE_MAX) { return CS_ERR_NOT_EXIST; /* Invalid link number */ } res = knet_link_get_status(global_instance->knet_handle, node, link_no, status, sizeof(struct knet_link_status)); if (res) { switch (errno) { case EINVAL: ret = CS_ERR_INVALID_PARAM; break; case EBUSY: ret = CS_ERR_BUSY; break; case EDEADLK: ret = CS_ERR_TRY_AGAIN; break; default: ret = CS_ERR_LIBRARY; break; } } return (ret); } int totemknet_handle_get_stats ( struct knet_handle_stats *stats) { int res; /* We are probably not using knet */ if (!global_instance) { return CS_ERR_NOT_EXIST; } res = knet_handle_get_stats(global_instance->knet_handle, stats, sizeof(struct knet_handle_stats)); if (res != 0) { return (qb_to_cs_error(-errno)); } return CS_OK; } static void timer_function_merge_detect_timeout ( void *data) { struct totemknet_instance *instance = (struct totemknet_instance *)data; if (instance->merge_detect_messages_sent_before_timeout == 0) { instance->send_merge_detect_message = 1; } instance->merge_detect_messages_sent_before_timeout = 0; totemknet_start_merge_detect_timeout(instance); } static void totemknet_start_merge_detect_timeout( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; qb_loop_timer_add(instance->poll_handle, QB_LOOP_MED, instance->totem_config->merge_timeout * 2 * QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); } static void totemknet_stop_merge_detect_timeout( void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; qb_loop_timer_del(instance->poll_handle, instance->timer_merge_detect_timeout); } static void log_flush_messages (void *knet_context) { struct pollfd pfd; struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int cont; cont = 1; while (cont) { pfd.fd = instance->logpipes[0]; pfd.events = POLLIN; pfd.revents = 0; if ((poll(&pfd, 1, 0) > 0) && (pfd.revents & POLLIN) && (log_deliver_fn(instance->logpipes[0], POLLIN, instance) == 0)) { cont = 1; } else { cont = 0; } } } #ifdef HAVE_LIBNOZZLE #define NOZZLE_NAME "nozzle.name" #define NOZZLE_IPADDR "nozzle.ipaddr" #define NOZZLE_PREFIX "nozzle.ipprefix" #define NOZZLE_MACADDR "nozzle.macaddr" #define NOZZLE_CHANNEL 1 static char *get_nozzle_script_dir(void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; char filename[PATH_MAX + FILENAME_MAX + 1]; static char updown_dirname[PATH_MAX + FILENAME_MAX + 1]; int res; const char *dirname_res; /* * Build script directory based on corosync.conf file location */ res = snprintf(filename, sizeof(filename), "%s", corosync_get_config_file()); if (res >= sizeof(filename)) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "nozzle up/down path too long"); return NULL; } dirname_res = dirname(filename); res = snprintf(updown_dirname, sizeof(updown_dirname), "%s/%s", dirname_res, "updown.d"); if (res >= sizeof(updown_dirname)) { knet_log_printf (LOGSYS_LEVEL_DEBUG, "nozzle up/down path too long"); return NULL; } return updown_dirname; } /* * Deliberately doesn't return the status as caller doesn't care. * The result will be logged though */ static void run_nozzle_script(struct totemknet_instance *instance, int type, const char *typename) { int res; char *exec_string; res = nozzle_run_updown(instance->nozzle_handle, type, &exec_string); if (res == -1 && errno != ENOENT) { knet_log_printf (LOGSYS_LEVEL_INFO, "exec nozzle %s script failed: %s", typename, strerror(errno)); } else if (res == -2) { knet_log_printf (LOGSYS_LEVEL_INFO, "nozzle %s script failed", typename); knet_log_printf (LOGSYS_LEVEL_INFO, "%s", exec_string); } } /* * Reparse IP address to add in our node ID * IPv6 addresses must end in '::' * IPv4 addresses must just be valid * '/xx' lengths are optional for IPv6, mandatory for IPv4 * * Returns the modified IP address as a string to pass into libnozzle */ static int reparse_nozzle_ip_address(struct totemknet_instance *instance, const char *input_addr, const char *prefix, int nodeid, char *output_addr, size_t output_len) { char *coloncolon; int bits; int max_prefix = 64; uint32_t nodeid_mask; uint32_t addr_mask; uint32_t masked_nodeid; struct in_addr *addr; struct totem_ip_address totemip; coloncolon = strstr(input_addr, "::"); if (!coloncolon) { max_prefix = 30; } bits = atoi(prefix); if (bits < 8 || bits > max_prefix) { knet_log_printf(LOGSYS_LEVEL_ERROR, "nozzle IP address prefix must be >= 8 and <= %d (got %d)", max_prefix, bits); return -1; } /* IPv6 is easy */ if (coloncolon) { memcpy(output_addr, input_addr, coloncolon-input_addr); sprintf(output_addr + (coloncolon-input_addr), "::%x", nodeid); return 0; } /* For IPv4 we need to parse the address into binary, mask off the required bits, * add in the masked_nodeid and 'print' it out again */ nodeid_mask = UINT32_MAX & ((1<<(32 - bits)) - 1); addr_mask = UINT32_MAX ^ nodeid_mask; masked_nodeid = nodeid & nodeid_mask; if (totemip_parse(&totemip, input_addr, AF_INET)) { knet_log_printf(LOGSYS_LEVEL_ERROR, "Failed to parse IPv4 nozzle IP address"); return -1; } addr = (struct in_addr *)&totemip.addr; addr->s_addr &= htonl(addr_mask); addr->s_addr |= htonl(masked_nodeid); inet_ntop(AF_INET, addr, output_addr, output_len); return 0; } static int create_nozzle_device(void *knet_context, const char *name, const char *ipaddr, const char *prefix, const char *macaddr) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; char device_name[IFNAMSIZ+1]; size_t size = IFNAMSIZ; int8_t channel = NOZZLE_CHANNEL; nozzle_t nozzle_dev; int nozzle_fd; int res; char *updown_dir; char parsed_ipaddr[INET6_ADDRSTRLEN]; char mac[19]; memset(device_name, 0, size); memset(&mac, 0, sizeof(mac)); strncpy(device_name, name, size); updown_dir = get_nozzle_script_dir(knet_context); knet_log_printf (LOGSYS_LEVEL_INFO, "nozzle script dir is %s", updown_dir); nozzle_dev = nozzle_open(device_name, size, updown_dir); if (!nozzle_dev) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to init nozzle device %s: %s", device_name, strerror(errno)); return -1; } instance->nozzle_handle = nozzle_dev; if (nozzle_set_mac(nozzle_dev, macaddr) < 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to add set nozzle MAC to %s: %s", mac, strerror(errno)); goto out_clean; } if (reparse_nozzle_ip_address(instance, ipaddr, prefix, instance->our_nodeid, parsed_ipaddr, sizeof(parsed_ipaddr))) { /* Prints its own errors */ goto out_clean; } knet_log_printf (LOGSYS_LEVEL_INFO, "Local nozzle IP address is %s / %d", parsed_ipaddr, atoi(prefix)); if (nozzle_add_ip(nozzle_dev, parsed_ipaddr, prefix) < 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to add set nozzle IP addr to %s/%s: %s", parsed_ipaddr, prefix, strerror(errno)); goto out_clean; } nozzle_fd = nozzle_get_fd(nozzle_dev); knet_log_printf (LOGSYS_LEVEL_INFO, "Opened '%s' on fd %d", device_name, nozzle_fd); res = knet_handle_add_datafd(instance->knet_handle, &nozzle_fd, &channel); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to add nozzle FD to knet: %s", strerror(errno)); goto out_clean; } run_nozzle_script(instance, NOZZLE_PREUP, "pre-up"); res = nozzle_set_up(nozzle_dev); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Unable to set nozzle interface UP: %s", strerror(errno)); goto out_clean; } run_nozzle_script(instance, NOZZLE_UP, "up"); return 0; out_clean: nozzle_close(nozzle_dev); return -1; } static int remove_nozzle_device(void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; int res; int datafd; res = knet_handle_get_datafd(instance->knet_handle, NOZZLE_CHANNEL, &datafd); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Can't find datafd for channel %d: %s", NOZZLE_CHANNEL, strerror(errno)); return -1; } res = knet_handle_remove_datafd(instance->knet_handle, datafd); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Can't remove datafd for nozzle channel %d: %s", NOZZLE_CHANNEL, strerror(errno)); return -1; } run_nozzle_script(instance, NOZZLE_DOWN, "pre-down"); res = nozzle_set_down(instance->nozzle_handle); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Can't set nozzle device down: %s", strerror(errno)); return -1; } run_nozzle_script(instance, NOZZLE_POSTDOWN, "post-down"); res = nozzle_close(instance->nozzle_handle); if (res != 0) { knet_log_printf (LOGSYS_LEVEL_ERROR, "Can't close nozzle device: %s", strerror(errno)); return -1; } knet_log_printf (LOGSYS_LEVEL_INFO, "Removed nozzle device"); return 0; } static void free_nozzle(struct totemknet_instance *instance) { free(instance->nozzle_name); free(instance->nozzle_ipaddr); free(instance->nozzle_prefix); free(instance->nozzle_macaddr); instance->nozzle_name = instance->nozzle_ipaddr = instance->nozzle_prefix = instance->nozzle_macaddr = NULL; } static int setup_nozzle(void *knet_context) { struct totemknet_instance *instance = (struct totemknet_instance *)knet_context; char *ipaddr_str = NULL; char *name_str = NULL; char *prefix_str = NULL; char *macaddr_str = NULL; char mac[32]; int name_res; int macaddr_res; int res = -1; /* * Return value ignored on purpose. icmap_get_string changes * ipaddr_str/prefix_str only on success. */ (void)icmap_get_string(NOZZLE_IPADDR, &ipaddr_str); (void)icmap_get_string(NOZZLE_PREFIX, &prefix_str); macaddr_res = icmap_get_string(NOZZLE_MACADDR, &macaddr_str); name_res = icmap_get_string(NOZZLE_NAME, &name_str); /* Is is being removed? */ if (name_res == CS_ERR_NOT_EXIST && instance->nozzle_handle) { remove_nozzle_device(instance); free_nozzle(instance); goto out_free; } if (!name_str) { /* no nozzle */ goto out_free; } if (!ipaddr_str) { knet_log_printf (LOGSYS_LEVEL_ERROR, "No IP address supplied for Nozzle device"); goto out_free; } if (!prefix_str) { knet_log_printf (LOGSYS_LEVEL_ERROR, "No prefix supplied for Nozzle IP address"); goto out_free; } if (macaddr_str && strlen(macaddr_str) != 17) { knet_log_printf (LOGSYS_LEVEL_ERROR, "macaddr for nozzle device is not in the correct format '%s'", macaddr_str); goto out_free; } if (!macaddr_str) { macaddr_str = (char*)"54:54:01:00:00:00"; } if (instance->nozzle_name && (strcmp(name_str, instance->nozzle_name) == 0) && (strcmp(ipaddr_str, instance->nozzle_ipaddr) == 0) && (strcmp(prefix_str, instance->nozzle_prefix) == 0) && (instance->nozzle_macaddr == NULL || strcmp(macaddr_str, instance->nozzle_macaddr) == 0)) { /* Nothing has changed */ knet_log_printf (LOGSYS_LEVEL_DEBUG, "Nozzle device info not changed"); goto out_free; } /* Add nodeid into MAC address */ memcpy(mac, macaddr_str, 12); snprintf(mac+12, sizeof(mac) - 13, "%02x:%02x", instance->our_nodeid >> 8, instance->our_nodeid & 0xFF); knet_log_printf (LOGSYS_LEVEL_INFO, "Local nozzle MAC address is %s", mac); if (name_res == CS_OK && name_str) { /* Reconfigure */ if (instance->nozzle_name) { remove_nozzle_device(instance); free_nozzle(instance); } res = create_nozzle_device(knet_context, name_str, ipaddr_str, prefix_str, mac); instance->nozzle_name = strdup(name_str); instance->nozzle_ipaddr = strdup(ipaddr_str); instance->nozzle_prefix = strdup(prefix_str); instance->nozzle_macaddr = strdup(macaddr_str); if (!instance->nozzle_name || !instance->nozzle_ipaddr || !instance->nozzle_prefix) { knet_log_printf (LOGSYS_LEVEL_ERROR, "strdup failed in nozzle allocation"); /* * This 'free' will cause a complete reconfigure of the device next time we reload * but will also let the the current device keep working until then. * remove_nozzle() only needs the, statically-allocated, nozzle_handle */ free_nozzle(instance); } } out_free: free(name_str); free(ipaddr_str); free(prefix_str); if (macaddr_res == CS_OK) { free(macaddr_str); } return res; } #endif // HAVE_LIBNOZZLE diff --git a/exec/totemknet.h b/exec/totemknet.h index 5a4dac7b..3957b7f2 100644 --- a/exec/totemknet.h +++ b/exec/totemknet.h @@ -1,149 +1,154 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2011 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMKNET_H_DEFINED #define TOTEMKNET_H_DEFINED #include #include #include #include /** * Create an instance */ extern int totemknet_initialize ( qb_loop_t *poll_handle, void **knet_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, void (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), void (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)); extern void *totemknet_buffer_alloc (void); extern void totemknet_buffer_release (void *ptr); extern int totemknet_processor_count_set ( void *knet_context, int processor_count); extern int totemknet_token_send ( void *knet_context, const void *msg, unsigned int msg_len); extern int totemknet_mcast_flush_send ( void *knet_context, const void *msg, unsigned int msg_len); extern int totemknet_mcast_noflush_send ( void *knet_context, const void *msg, unsigned int msg_len); extern int totemknet_recv_flush (void *knet_context); extern int totemknet_send_flush (void *knet_context); extern int totemknet_iface_check (void *knet_context); extern int totemknet_finalize (void *knet_context); extern void totemknet_net_mtu_adjust (void *knet_context, struct totem_config *totem_config); extern int totemknet_ifaces_get (void *net_context, char ***status, unsigned int *iface_count); extern int totemknet_iface_set (void *net_context, const struct totem_ip_address *local_addr, unsigned short ip_port, unsigned int iface_no); extern int totemknet_token_target_set ( void *knet_context, unsigned int nodeid); extern int totemknet_crypto_set ( void *knet_context, const char *cipher_type, const char *hash_type); extern int totemknet_recv_mcast_empty ( void *knet_context); extern int totemknet_member_add ( void *knet_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no); extern int totemknet_member_remove ( void *knet_context, const struct totem_ip_address *member, int ring_no); extern int totemknet_member_set_active ( void *knet_context, const struct totem_ip_address *member_ip, int active); extern int totemknet_reconfigure ( void *knet_context, struct totem_config *totem_config); +extern int totemknet_crypto_reconfigure_phase ( + void *knet_context, + struct totem_config *totem_config, + cfg_message_crypto_reconfig_phase_t phase); + extern void totemknet_stats_clear ( void *knet_context); #endif /* TOTEMKNET_H_DEFINED */ diff --git a/exec/totemnet.c b/exec/totemnet.c index a16604d0..ae44dbf8 100644 --- a/exec/totemnet.c +++ b/exec/totemnet.c @@ -1,583 +1,607 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include struct transport { const char *name; int (*initialize) ( qb_loop_t *loop_pt, void **transport_instance, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, void (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), void (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)); void *(*buffer_alloc) (void); void (*buffer_release) (void *ptr); int (*processor_count_set) ( void *transport_context, int processor_count); int (*token_send) ( void *transport_context, const void *msg, unsigned int msg_len); int (*mcast_flush_send) ( void *transport_context, const void *msg, unsigned int msg_len); int (*mcast_noflush_send) ( void *transport_context, const void *msg, unsigned int msg_len); int (*recv_flush) (void *transport_context); int (*send_flush) (void *transport_context); int (*iface_check) (void *transport_context); int (*finalize) (void *transport_context); void (*net_mtu_adjust) (void *transport_context, struct totem_config *totem_config); const char *(*iface_print) (void *transport_context); int (*ifaces_get) ( void *transport_context, char ***status, unsigned int *iface_count); int (*token_target_set) ( void *transport_context, unsigned int nodeid); int (*crypto_set) ( void *transport_context, const char *cipher_type, const char *hash_type); int (*recv_mcast_empty) ( void *transport_context); int (*iface_set) ( void *transport_context, const struct totem_ip_address *local, unsigned short ip_port, unsigned int ring_no); int (*member_add) ( void *transport_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no); int (*member_remove) ( void *transport_context, const struct totem_ip_address *member, int ring_no); int (*member_set_active) ( void *transport_context, const struct totem_ip_address *member, int active); int (*reconfigure) ( void *net_context, struct totem_config *totem_config); + int (*crypto_reconfigure_phase) ( + void *net_context, + struct totem_config *totem_config, + cfg_message_crypto_reconfig_phase_t phase); + void (*stats_clear) ( void *net_context); }; struct transport transport_entries[] = { { .name = "UDP/IP Multicast", .initialize = totemudp_initialize, .buffer_alloc = totemudp_buffer_alloc, .buffer_release = totemudp_buffer_release, .processor_count_set = totemudp_processor_count_set, .token_send = totemudp_token_send, .mcast_flush_send = totemudp_mcast_flush_send, .mcast_noflush_send = totemudp_mcast_noflush_send, .recv_flush = totemudp_recv_flush, .send_flush = totemudp_send_flush, .iface_set = totemudp_iface_set, .iface_check = totemudp_iface_check, .finalize = totemudp_finalize, .net_mtu_adjust = totemudp_net_mtu_adjust, .ifaces_get = totemudp_ifaces_get, .token_target_set = totemudp_token_target_set, .crypto_set = totemudp_crypto_set, .recv_mcast_empty = totemudp_recv_mcast_empty, .member_add = totemudp_member_add, .member_remove = totemudp_member_remove, - .reconfigure = totemudp_reconfigure + .reconfigure = totemudp_reconfigure, + .crypto_reconfigure_phase = NULL }, { .name = "UDP/IP Unicast", .initialize = totemudpu_initialize, .buffer_alloc = totemudpu_buffer_alloc, .buffer_release = totemudpu_buffer_release, .processor_count_set = totemudpu_processor_count_set, .token_send = totemudpu_token_send, .mcast_flush_send = totemudpu_mcast_flush_send, .mcast_noflush_send = totemudpu_mcast_noflush_send, .recv_flush = totemudpu_recv_flush, .send_flush = totemudpu_send_flush, .iface_set = totemudpu_iface_set, .iface_check = totemudpu_iface_check, .finalize = totemudpu_finalize, .net_mtu_adjust = totemudpu_net_mtu_adjust, .ifaces_get = totemudpu_ifaces_get, .token_target_set = totemudpu_token_target_set, .crypto_set = totemudpu_crypto_set, .recv_mcast_empty = totemudpu_recv_mcast_empty, .member_add = totemudpu_member_add, .member_remove = totemudpu_member_remove, - .reconfigure = totemudpu_reconfigure + .reconfigure = totemudpu_reconfigure, + .crypto_reconfigure_phase = NULL }, { .name = "Kronosnet", .initialize = totemknet_initialize, .buffer_alloc = totemknet_buffer_alloc, .buffer_release = totemknet_buffer_release, .processor_count_set = totemknet_processor_count_set, .token_send = totemknet_token_send, .mcast_flush_send = totemknet_mcast_flush_send, .mcast_noflush_send = totemknet_mcast_noflush_send, .recv_flush = totemknet_recv_flush, .send_flush = totemknet_send_flush, .iface_set = totemknet_iface_set, .iface_check = totemknet_iface_check, .finalize = totemknet_finalize, .net_mtu_adjust = totemknet_net_mtu_adjust, .ifaces_get = totemknet_ifaces_get, .token_target_set = totemknet_token_target_set, .crypto_set = totemknet_crypto_set, .recv_mcast_empty = totemknet_recv_mcast_empty, .member_add = totemknet_member_add, .member_remove = totemknet_member_remove, .reconfigure = totemknet_reconfigure, + .crypto_reconfigure_phase = totemknet_crypto_reconfigure_phase, .stats_clear = totemknet_stats_clear } }; struct totemnet_instance { void *transport_context; struct transport *transport; void (*totemnet_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7))); int totemnet_subsys_id; }; #define log_printf(level, format, args...) \ do { \ instance->totemnet_log_printf ( \ level, \ instance->totemnet_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ (const char *)format, ##args); \ } while (0); static void totemnet_instance_initialize ( struct totemnet_instance *instance, struct totem_config *config) { int transport; instance->totemnet_log_printf = config->totem_logging_configuration.log_printf; instance->totemnet_subsys_id = config->totem_logging_configuration.log_subsys_id; transport = config->transport_number; log_printf (LOGSYS_LEVEL_NOTICE, "Initializing transport (%s).", transport_entries[transport].name); instance->transport = &transport_entries[transport]; } int totemnet_crypto_set ( void *net_context, const char *cipher_type, const char *hash_type) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->crypto_set (instance->transport_context, cipher_type, hash_type); return res; } int totemnet_finalize ( void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->finalize (instance->transport_context); return (res); } int totemnet_initialize ( qb_loop_t *loop_pt, void **net_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, void (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), void (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int ring_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)) { struct totemnet_instance *instance; unsigned int res; instance = malloc (sizeof (struct totemnet_instance)); if (instance == NULL) { return (-1); } totemnet_instance_initialize (instance, totem_config); res = instance->transport->initialize (loop_pt, &instance->transport_context, totem_config, stats, context, deliver_fn, iface_change_fn, mtu_changed, target_set_completed); if (res == -1) { goto error_destroy; } *net_context = instance; return (0); error_destroy: free (instance); return (-1); } void *totemnet_buffer_alloc (void *net_context) { struct totemnet_instance *instance = net_context; assert (instance != NULL); assert (instance->transport != NULL); return instance->transport->buffer_alloc(); } void totemnet_buffer_release (void *net_context, void *ptr) { struct totemnet_instance *instance = net_context; assert (instance != NULL); assert (instance->transport != NULL); instance->transport->buffer_release (ptr); } int totemnet_processor_count_set ( void *net_context, int processor_count) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->processor_count_set (instance->transport_context, processor_count); return (res); } int totemnet_recv_flush (void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->recv_flush (instance->transport_context); return (res); } int totemnet_send_flush (void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->send_flush (instance->transport_context); return (res); } int totemnet_token_send ( void *net_context, const void *msg, unsigned int msg_len) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->token_send (instance->transport_context, msg, msg_len); return (res); } int totemnet_mcast_flush_send ( void *net_context, const void *msg, unsigned int msg_len) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->mcast_flush_send (instance->transport_context, msg, msg_len); return (res); } int totemnet_mcast_noflush_send ( void *net_context, const void *msg, unsigned int msg_len) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->mcast_noflush_send (instance->transport_context, msg, msg_len); return (res); } extern int totemnet_iface_check (void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; res = instance->transport->iface_check (instance->transport_context); return (res); } extern int totemnet_net_mtu_adjust (void *net_context, struct totem_config *totem_config) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res = 0; instance->transport->net_mtu_adjust (instance->transport_context, totem_config); return (res); } int totemnet_iface_set (void *net_context, const struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; int res; res = instance->transport->iface_set (instance->transport_context, interface_addr, ip_port, iface_no); return (res); } int totemnet_ifaces_get ( void *net_context, char ***status, unsigned int *iface_count) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res; res = instance->transport->ifaces_get (instance->transport_context, status, iface_count); return (res); } int totemnet_token_target_set ( void *net_context, unsigned int nodeid) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res; res = instance->transport->token_target_set (instance->transport_context, nodeid); return (res); } extern int totemnet_recv_mcast_empty ( void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res; res = instance->transport->recv_mcast_empty (instance->transport_context); return (res); } extern int totemnet_member_add ( void *net_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; if (instance->transport->member_add) { res = instance->transport->member_add ( instance->transport_context, local, member, ring_no); } return (res); } extern int totemnet_member_remove ( void *net_context, const struct totem_ip_address *member, int ring_no) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; if (instance->transport->member_remove) { res = instance->transport->member_remove ( instance->transport_context, member, ring_no); } return (res); } int totemnet_member_set_active ( void *net_context, const struct totem_ip_address *member, int active) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; if (instance->transport->member_set_active) { res = instance->transport->member_set_active ( instance->transport_context, member, active); } return (res); } int totemnet_reconfigure ( void *net_context, struct totem_config *totem_config) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; unsigned int res = 0; res = instance->transport->reconfigure ( instance->transport_context, totem_config); return (res); } +int totemnet_crypto_reconfigure_phase ( + void *net_context, + struct totem_config *totem_config, + cfg_message_crypto_reconfig_phase_t phase) +{ + struct totemnet_instance *instance = (struct totemnet_instance *)net_context; + unsigned int res = 0; + + if (instance->transport->crypto_reconfigure_phase) { + res = instance->transport->crypto_reconfigure_phase ( + instance->transport_context, + totem_config, phase); + } + return (res); +} + void totemnet_stats_clear ( void *net_context) { struct totemnet_instance *instance = (struct totemnet_instance *)net_context; if (instance->transport->stats_clear) { instance->transport->stats_clear ( instance->transport_context); } } diff --git a/exec/totemnet.h b/exec/totemnet.h index e552c1f6..46c1dd8d 100644 --- a/exec/totemnet.h +++ b/exec/totemnet.h @@ -1,159 +1,161 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2007, 2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /** * @file * Totem Network interface - also does encryption/decryption * * depends on poll abstraction, POSIX, IPV4 */ #ifndef TOTEMNET_H_DEFINED #define TOTEMNET_H_DEFINED #include #include #include #define TOTEMNET_NOFLUSH 0 #define TOTEMNET_FLUSH 1 /** * Create an instance */ extern int totemnet_initialize ( qb_loop_t *poll_handle, void **net_context, struct totem_config *totem_config, totemsrp_stats_t *stats, void *context, void (*deliver_fn) ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from), void (*iface_change_fn) ( void *context, const struct totem_ip_address *iface_address, unsigned int iface_no), void (*mtu_changed) ( void *context, int net_mtu), void (*target_set_completed) ( void *context)); extern void *totemnet_buffer_alloc (void *net_context); extern void totemnet_buffer_release (void *net_context, void *ptr); extern int totemnet_processor_count_set ( void *net_context, int processor_count); extern int totemnet_token_send ( void *net_context, const void *msg, unsigned int msg_len); extern int totemnet_mcast_flush_send ( void *net_context, const void *msg, unsigned int msg_len); extern int totemnet_mcast_noflush_send ( void *net_context, const void *msg, unsigned int msg_len); extern int totemnet_recv_flush (void *net_context); extern int totemnet_send_flush (void *net_context); extern int totemnet_iface_set (void *net_context, const struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no); extern int totemnet_iface_check (void *net_context); extern int totemnet_finalize (void *net_context); extern int totemnet_net_mtu_adjust (void *net_context, struct totem_config *totem_config); extern int totemnet_reconfigure (void *net_context, struct totem_config *totem_config); +extern int totemnet_crypto_reconfigure_phase (void *net_context, struct totem_config *totem_config, cfg_message_crypto_reconfig_phase_t phase); + extern void totemnet_stats_clear (void *net_context); extern const char *totemnet_iface_print (void *net_context); extern int totemnet_ifaces_get ( void *net_context, char ***status, unsigned int *iface_count); extern int totemnet_token_target_set ( void *net_context, unsigned int target_nodeid); extern int totemnet_crypto_set ( void *net_context, const char *cipher_type, const char *hash_type); extern int totemnet_recv_mcast_empty ( void *net_context); extern int totemnet_member_add ( void *net_context, const struct totem_ip_address *local, const struct totem_ip_address *member, int ring_no); extern int totemnet_member_remove ( void *net_context, const struct totem_ip_address *member, int ring_no); extern int totemnet_member_set_active ( void *net_context, const struct totem_ip_address *member, int active); #endif /* TOTEMNET_H_DEFINED */ diff --git a/exec/totempg.c b/exec/totempg.c index 96d69372..7b1f755e 100644 --- a/exec/totempg.c +++ b/exec/totempg.c @@ -1,1608 +1,1613 @@ /* * Copyright (c) 2003-2005 MontaVista Software, Inc. * Copyright (c) 2005 OSDL. * Copyright (c) 2006-2012 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * Author: Mark Haverkamp (markh@osdl.org) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * FRAGMENTATION AND PACKING ALGORITHM: * * Assemble the entire message into one buffer * if full fragment * store fragment into lengths list * for each full fragment * multicast fragment * set length and fragment fields of pg mesage * store remaining multicast into head of fragmentation data and set lens field * * If a message exceeds the maximum packet size allowed by the totem * single ring protocol, the protocol could lose forward progress. * Statically calculating the allowed data amount doesn't work because * the amount of data allowed depends on the number of fragments in * each message. In this implementation, the maximum fragment size * is dynamically calculated for each fragment added to the message. * It is possible for a message to be two bytes short of the maximum * packet size. This occurs when a message or collection of * messages + the mcast header + the lens are two bytes short of the * end of the packet. Since another len field consumes two bytes, the * len field would consume the rest of the packet without room for data. * * One optimization would be to forgo the final len field and determine * it from the size of the udp datagram. Then this condition would no * longer occur. */ /* * ASSEMBLY AND UNPACKING ALGORITHM: * * copy incoming packet into assembly data buffer indexed by current * location of end of fragment * * if not fragmented * deliver all messages in assembly data buffer * else * if msg_count > 1 and fragmented * deliver all messages except last message in assembly data buffer * copy last fragmented section to start of assembly data buffer * else * if msg_count = 1 and fragmented * do nothing * */ #include #ifdef HAVE_ALLOCA_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "util.h" #include "totemsrp.h" struct totempg_mcast_header { short version; short type; }; #if !(defined(__i386__) || defined(__x86_64__)) /* * Need align on architectures different then i386 or x86_64 */ #define TOTEMPG_NEED_ALIGN 1 #endif /* * totempg_mcast structure * * header: Identify the mcast. * fragmented: Set if this message continues into next message * continuation: Set if this message is a continuation from last message * msg_count Indicates how many packed messages are contained * in the mcast. * Also, the size of each packed message and the messages themselves are * appended to the end of this structure when sent. */ struct totempg_mcast { struct totempg_mcast_header header; unsigned char fragmented; unsigned char continuation; unsigned short msg_count; /* * short msg_len[msg_count]; */ /* * data for messages */ }; /* * Maximum packet size for totem pg messages */ #define TOTEMPG_PACKET_SIZE (totempg_totem_config->net_mtu - \ sizeof (struct totempg_mcast)) /* * Local variables used for packing small messages */ static unsigned short mcast_packed_msg_lens[FRAME_SIZE_MAX]; static int mcast_packed_msg_count = 0; static int totempg_reserved = 1; static unsigned int totempg_size_limit; static totem_queue_level_changed_fn totem_queue_level_changed = NULL; static uint32_t totempg_threaded_mode = 0; static void *totemsrp_context; /* * Function and data used to log messages */ static int totempg_log_level_security; static int totempg_log_level_error; static int totempg_log_level_warning; static int totempg_log_level_notice; static int totempg_log_level_debug; static int totempg_subsys_id; static void (*totempg_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...) __attribute__((format(printf, 6, 7))); struct totem_config *totempg_totem_config; static totempg_stats_t totempg_stats; enum throw_away_mode { THROW_AWAY_INACTIVE, THROW_AWAY_ACTIVE }; struct assembly { unsigned int nodeid; unsigned char data[MESSAGE_SIZE_MAX+KNET_MAX_PACKET_SIZE]; int index; unsigned char last_frag_num; enum throw_away_mode throw_away_mode; struct qb_list_head list; }; static void assembly_deref (struct assembly *assembly); static int callback_token_received_fn (enum totem_callback_token_type type, const void *data); QB_LIST_DECLARE(assembly_list_inuse); /* * Free list is used both for transitional and operational assemblies */ QB_LIST_DECLARE(assembly_list_free); QB_LIST_DECLARE(assembly_list_inuse_trans); QB_LIST_DECLARE(totempg_groups_list); /* * Staging buffer for packed messages. Messages are staged in this buffer * before sending. Multiple messages may fit which cuts down on the * number of mcasts sent. If a message doesn't completely fit, then * the mcast header has a fragment bit set that says that there are more * data to follow. fragment_size is an index into the buffer. It indicates * the size of message data and where to place new message data. * fragment_contuation indicates whether the first packed message in * the buffer is a continuation of a previously packed fragment. */ static unsigned char *fragmentation_data; static int fragment_size = 0; static int fragment_continuation = 0; static int totempg_waiting_transack = 0; struct totempg_group_instance { void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required); void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); struct totempg_group *groups; int groups_cnt; int32_t q_level; struct qb_list_head list; }; static unsigned char next_fragment = 1; static pthread_mutex_t totempg_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t callback_token_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t mcast_msg_mutex = PTHREAD_MUTEX_INITIALIZER; #define log_printf(level, format, args...) \ do { \ totempg_log_printf(level, \ totempg_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ format, ##args); \ } while (0); static int msg_count_send_ok (int msg_count); static int byte_count_send_ok (int byte_count); static void totempg_waiting_trans_ack_cb (int waiting_trans_ack) { log_printf(LOG_DEBUG, "waiting_trans_ack changed to %u", waiting_trans_ack); totempg_waiting_transack = waiting_trans_ack; } static struct assembly *assembly_ref (unsigned int nodeid) { struct assembly *assembly; struct qb_list_head *list; struct qb_list_head *active_assembly_list_inuse; if (totempg_waiting_transack) { active_assembly_list_inuse = &assembly_list_inuse_trans; } else { active_assembly_list_inuse = &assembly_list_inuse; } /* * Search inuse list for node id and return assembly buffer if found */ qb_list_for_each(list, active_assembly_list_inuse) { assembly = qb_list_entry (list, struct assembly, list); if (nodeid == assembly->nodeid) { return (assembly); } } /* * Nothing found in inuse list get one from free list if available */ if (qb_list_empty (&assembly_list_free) == 0) { assembly = qb_list_first_entry (&assembly_list_free, struct assembly, list); qb_list_del (&assembly->list); qb_list_add (&assembly->list, active_assembly_list_inuse); assembly->nodeid = nodeid; assembly->index = 0; assembly->last_frag_num = 0; assembly->throw_away_mode = THROW_AWAY_INACTIVE; return (assembly); } /* * Nothing available in inuse or free list, so allocate a new one */ assembly = malloc (sizeof (struct assembly)); /* * TODO handle memory allocation failure here */ assert (assembly); assembly->nodeid = nodeid; assembly->data[0] = 0; assembly->index = 0; assembly->last_frag_num = 0; assembly->throw_away_mode = THROW_AWAY_INACTIVE; qb_list_init (&assembly->list); qb_list_add (&assembly->list, active_assembly_list_inuse); return (assembly); } static void assembly_deref (struct assembly *assembly) { qb_list_del (&assembly->list); qb_list_add (&assembly->list, &assembly_list_free); } static void assembly_deref_from_normal_and_trans (int nodeid) { int j; struct qb_list_head *list, *tmp_iter; struct qb_list_head *active_assembly_list_inuse; struct assembly *assembly; for (j = 0; j < 2; j++) { if (j == 0) { active_assembly_list_inuse = &assembly_list_inuse; } else { active_assembly_list_inuse = &assembly_list_inuse_trans; } qb_list_for_each_safe(list, tmp_iter, active_assembly_list_inuse) { assembly = qb_list_entry (list, struct assembly, list); if (nodeid == assembly->nodeid) { qb_list_del (&assembly->list); qb_list_add (&assembly->list, &assembly_list_free); } } } } static inline void app_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { int i; struct totempg_group_instance *instance; struct qb_list_head *list; /* * For every leaving processor, add to free list * This also has the side effect of clearing out the dataset * In the leaving processor's assembly buffer. */ for (i = 0; i < left_list_entries; i++) { assembly_deref_from_normal_and_trans (left_list[i]); } qb_list_for_each(list, &totempg_groups_list) { instance = qb_list_entry (list, struct totempg_group_instance, list); if (instance->confchg_fn) { instance->confchg_fn ( configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } } } static inline void group_endian_convert ( void *msg, int msg_len) { unsigned short *group_len; int i; char *aligned_msg; #ifdef TOTEMPG_NEED_ALIGN /* * Align data structure for not i386 or x86_64 */ if ((size_t)msg % 4 != 0) { aligned_msg = alloca(msg_len); memcpy(aligned_msg, msg, msg_len); } else { aligned_msg = msg; } #else aligned_msg = msg; #endif group_len = (unsigned short *)aligned_msg; group_len[0] = swab16(group_len[0]); for (i = 1; i < group_len[0] + 1; i++) { group_len[i] = swab16(group_len[i]); } if (aligned_msg != msg) { memcpy(msg, aligned_msg, msg_len); } } static inline int group_matches ( struct iovec *iovec, unsigned int iov_len, struct totempg_group *groups_b, unsigned int group_b_cnt, unsigned int *adjust_iovec) { unsigned short *group_len; char *group_name; int i; int j; #ifdef TOTEMPG_NEED_ALIGN struct iovec iovec_aligned = { NULL, 0 }; #endif assert (iov_len == 1); #ifdef TOTEMPG_NEED_ALIGN /* * Align data structure for not i386 or x86_64 */ if ((size_t)iovec->iov_base % 4 != 0) { iovec_aligned.iov_base = alloca(iovec->iov_len); memcpy(iovec_aligned.iov_base, iovec->iov_base, iovec->iov_len); iovec_aligned.iov_len = iovec->iov_len; iovec = &iovec_aligned; } #endif group_len = (unsigned short *)iovec->iov_base; group_name = ((char *)iovec->iov_base) + sizeof (unsigned short) * (group_len[0] + 1); /* * Calculate amount to adjust the iovec by before delivering to app */ *adjust_iovec = sizeof (unsigned short) * (group_len[0] + 1); for (i = 1; i < group_len[0] + 1; i++) { *adjust_iovec += group_len[i]; } /* * Determine if this message should be delivered to this instance */ for (i = 1; i < group_len[0] + 1; i++) { for (j = 0; j < group_b_cnt; j++) { if ((group_len[i] == groups_b[j].group_len) && (memcmp (groups_b[j].group, group_name, group_len[i]) == 0)) { return (1); } } group_name += group_len[i]; } return (0); } static inline void app_deliver_fn ( unsigned int nodeid, void *msg, unsigned int msg_len, int endian_conversion_required) { struct totempg_group_instance *instance; struct iovec stripped_iovec; unsigned int adjust_iovec; struct iovec *iovec; struct qb_list_head *list; struct iovec aligned_iovec = { NULL, 0 }; if (endian_conversion_required) { group_endian_convert (msg, msg_len); } /* * TODO: segmentation/assembly need to be redesigned to provide aligned access * in all cases to avoid memory copies on non386 archs. Probably broke backwars * compatibility */ #ifdef TOTEMPG_NEED_ALIGN /* * Align data structure for not i386 or x86_64 */ aligned_iovec.iov_base = alloca(msg_len); aligned_iovec.iov_len = msg_len; memcpy(aligned_iovec.iov_base, msg, msg_len); #else aligned_iovec.iov_base = msg; aligned_iovec.iov_len = msg_len; #endif iovec = &aligned_iovec; qb_list_for_each(list, &totempg_groups_list) { instance = qb_list_entry (list, struct totempg_group_instance, list); if (group_matches (iovec, 1, instance->groups, instance->groups_cnt, &adjust_iovec)) { stripped_iovec.iov_len = iovec->iov_len - adjust_iovec; stripped_iovec.iov_base = (char *)iovec->iov_base + adjust_iovec; #ifdef TOTEMPG_NEED_ALIGN /* * Align data structure for not i386 or x86_64 */ if ((char *)iovec->iov_base + adjust_iovec % 4 != 0) { /* * Deal with misalignment */ stripped_iovec.iov_base = alloca (stripped_iovec.iov_len); memcpy (stripped_iovec.iov_base, (char *)iovec->iov_base + adjust_iovec, stripped_iovec.iov_len); } #endif instance->deliver_fn ( nodeid, stripped_iovec.iov_base, stripped_iovec.iov_len, endian_conversion_required); } } } static void totempg_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { // TODO optimize this app_confchg_fn (configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } static void totempg_deliver_fn ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required) { struct totempg_mcast *mcast; unsigned short *msg_lens; int i; struct assembly *assembly; char header[FRAME_SIZE_MAX]; int msg_count; int continuation; int start; const char *data; int datasize; struct iovec iov_delv; size_t expected_msg_len; assembly = assembly_ref (nodeid); assert (assembly); if (msg_len < sizeof(struct totempg_mcast)) { log_printf(LOG_WARNING, "Message (totempg_mcast) received from node " CS_PRI_NODE_ID " is too short... Ignoring.", nodeid); return ; } /* * Assemble the header into one block of data and * assemble the packet contents into one block of data to simplify delivery */ mcast = (struct totempg_mcast *)msg; if (endian_conversion_required) { mcast->msg_count = swab16 (mcast->msg_count); } msg_count = mcast->msg_count; datasize = sizeof (struct totempg_mcast) + msg_count * sizeof (unsigned short); if (msg_len < datasize) { log_printf(LOG_WARNING, "Message (totempg_mcast datasize) received from node " CS_PRI_NODE_ID " is too short... Ignoring.", nodeid); return ; } memcpy (header, msg, datasize); data = msg; msg_lens = (unsigned short *) (header + sizeof (struct totempg_mcast)); expected_msg_len = datasize; for (i = 0; i < mcast->msg_count; i++) { if (endian_conversion_required) { msg_lens[i] = swab16 (msg_lens[i]); } expected_msg_len += msg_lens[i]; } if (msg_len != expected_msg_len) { log_printf(LOG_WARNING, "Message (totempg_mcast) received from node " CS_PRI_NODE_ID " doesn't have expected length of %zu (has %u) bytes... Ignoring.", nodeid, expected_msg_len, msg_len); return ; } assert((assembly->index+msg_len) < sizeof(assembly->data)); memcpy (&assembly->data[assembly->index], &data[datasize], msg_len - datasize); /* * If the last message in the buffer is a fragment, then we * can't deliver it. We'll first deliver the full messages * then adjust the assembly buffer so we can add the rest of the * fragment when it arrives. */ msg_count = mcast->fragmented ? mcast->msg_count - 1 : mcast->msg_count; continuation = mcast->continuation; iov_delv.iov_base = (void *)&assembly->data[0]; iov_delv.iov_len = assembly->index + msg_lens[0]; /* * Make sure that if this message is a continuation, that it * matches the sequence number of the previous fragment. * Also, if the first packed message is a continuation * of a previous message, but the assembly buffer * is empty, then we need to discard it since we can't * assemble a complete message. Likewise, if this message isn't a * continuation and the assembly buffer is empty, we have to discard * the continued message. */ start = 0; if (assembly->throw_away_mode == THROW_AWAY_ACTIVE) { /* Throw away the first msg block */ if (mcast->fragmented == 0 || mcast->fragmented == 1) { assembly->throw_away_mode = THROW_AWAY_INACTIVE; assembly->index += msg_lens[0]; iov_delv.iov_base = (void *)&assembly->data[assembly->index]; iov_delv.iov_len = msg_lens[1]; start = 1; } } else if (assembly->throw_away_mode == THROW_AWAY_INACTIVE) { if (continuation == assembly->last_frag_num) { assembly->last_frag_num = mcast->fragmented; for (i = start; i < msg_count; i++) { app_deliver_fn(nodeid, iov_delv.iov_base, iov_delv.iov_len, endian_conversion_required); assembly->index += msg_lens[i]; iov_delv.iov_base = (void *)&assembly->data[assembly->index]; if (i < (msg_count - 1)) { iov_delv.iov_len = msg_lens[i + 1]; } } } else { log_printf (LOG_DEBUG, "fragmented continuation %u is not equal to assembly last_frag_num %u", continuation, assembly->last_frag_num); assembly->throw_away_mode = THROW_AWAY_ACTIVE; } } if (mcast->fragmented == 0) { /* * End of messages, dereference assembly struct */ assembly->last_frag_num = 0; assembly->index = 0; assembly_deref (assembly); } else { /* * Message is fragmented, keep around assembly list */ if (mcast->msg_count > 1) { memmove (&assembly->data[0], &assembly->data[assembly->index], msg_lens[msg_count]); assembly->index = 0; } assembly->index += msg_lens[msg_count]; } } /* * Totem Process Group Abstraction * depends on poll abstraction, POSIX, IPV4 */ void *callback_token_received_handle; int callback_token_received_fn (enum totem_callback_token_type type, const void *data) { struct totempg_mcast mcast; struct iovec iovecs[3]; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&mcast_msg_mutex); } if (mcast_packed_msg_count == 0) { if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return (0); } if (totemsrp_avail(totemsrp_context) == 0) { if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return (0); } mcast.header.version = 0; mcast.header.type = 0; mcast.fragmented = 0; /* * Was the first message in this buffer a continuation of a * fragmented message? */ mcast.continuation = fragment_continuation; fragment_continuation = 0; mcast.msg_count = mcast_packed_msg_count; iovecs[0].iov_base = (void *)&mcast; iovecs[0].iov_len = sizeof (struct totempg_mcast); iovecs[1].iov_base = (void *)mcast_packed_msg_lens; iovecs[1].iov_len = mcast_packed_msg_count * sizeof (unsigned short); iovecs[2].iov_base = (void *)&fragmentation_data[0]; iovecs[2].iov_len = fragment_size; (void)totemsrp_mcast (totemsrp_context, iovecs, 3, 0); mcast_packed_msg_count = 0; fragment_size = 0; if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return (0); } /* * Initialize the totem process group abstraction */ int totempg_initialize ( qb_loop_t *poll_handle, struct totem_config *totem_config) { int res; totempg_totem_config = totem_config; totempg_log_level_security = totem_config->totem_logging_configuration.log_level_security; totempg_log_level_error = totem_config->totem_logging_configuration.log_level_error; totempg_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; totempg_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; totempg_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; totempg_log_printf = totem_config->totem_logging_configuration.log_printf; totempg_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; fragmentation_data = malloc (TOTEMPG_PACKET_SIZE); if (fragmentation_data == 0) { return (-1); } totemsrp_net_mtu_adjust (totem_config); res = totemsrp_initialize ( poll_handle, &totemsrp_context, totem_config, &totempg_stats, totempg_deliver_fn, totempg_confchg_fn, totempg_waiting_trans_ack_cb); if (res == -1) { goto error_exit; } totemsrp_callback_token_create ( totemsrp_context, &callback_token_received_handle, TOTEM_CALLBACK_TOKEN_RECEIVED, 0, callback_token_received_fn, 0); totempg_size_limit = (totemsrp_avail(totemsrp_context) - 1) * (totempg_totem_config->net_mtu - sizeof (struct totempg_mcast) - 16); qb_list_init (&totempg_groups_list); error_exit: return (res); } void totempg_finalize (void) { if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } totemsrp_finalize (totemsrp_context); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } } /* * Multicast a message */ static int mcast_msg ( struct iovec *iovec_in, unsigned int iov_len, int guarantee) { int res = 0; struct totempg_mcast mcast; struct iovec iovecs[3]; struct iovec iovec[64]; int i; int dest, src; int max_packet_size = 0; int copy_len = 0; int copy_base = 0; int total_size = 0; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&mcast_msg_mutex); } totemsrp_event_signal (totemsrp_context, TOTEM_EVENT_NEW_MSG, 1); /* * Remove zero length iovectors from the list */ assert (iov_len < 64); for (dest = 0, src = 0; src < iov_len; src++) { if (iovec_in[src].iov_len) { memcpy (&iovec[dest++], &iovec_in[src], sizeof (struct iovec)); } } iov_len = dest; max_packet_size = TOTEMPG_PACKET_SIZE - (sizeof (unsigned short) * (mcast_packed_msg_count + 1)); mcast_packed_msg_lens[mcast_packed_msg_count] = 0; /* * Check if we would overwrite new message queue */ for (i = 0; i < iov_len; i++) { total_size += iovec[i].iov_len; } if (byte_count_send_ok (total_size + sizeof(unsigned short) * (mcast_packed_msg_count)) == 0) { if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return(-1); } memset(&mcast, 0, sizeof(mcast)); mcast.header.version = 0; for (i = 0; i < iov_len; ) { mcast.fragmented = 0; mcast.continuation = fragment_continuation; copy_len = iovec[i].iov_len - copy_base; /* * If it all fits with room left over, copy it in. * We need to leave at least sizeof(short) + 1 bytes in the * fragment_buffer on exit so that max_packet_size + fragment_size * doesn't exceed the size of the fragment_buffer on the next call. */ if ((iovec[i].iov_len + fragment_size) < (max_packet_size - sizeof (unsigned short))) { memcpy (&fragmentation_data[fragment_size], (char *)iovec[i].iov_base + copy_base, copy_len); fragment_size += copy_len; mcast_packed_msg_lens[mcast_packed_msg_count] += copy_len; next_fragment = 1; copy_len = 0; copy_base = 0; i++; continue; /* * If it just fits or is too big, then send out what fits. */ } else { unsigned char *data_ptr; copy_len = min(copy_len, max_packet_size - fragment_size); if( copy_len == max_packet_size ) data_ptr = (unsigned char *)iovec[i].iov_base + copy_base; else { data_ptr = fragmentation_data; } memcpy (&fragmentation_data[fragment_size], (unsigned char *)iovec[i].iov_base + copy_base, copy_len); mcast_packed_msg_lens[mcast_packed_msg_count] += copy_len; /* * if we're not on the last iovec or the iovec is too large to * fit, then indicate a fragment. This also means that the next * message will have the continuation of this one. */ if ((i < (iov_len - 1)) || ((copy_base + copy_len) < iovec[i].iov_len)) { if (!next_fragment) { next_fragment++; } fragment_continuation = next_fragment; mcast.fragmented = next_fragment++; assert(fragment_continuation != 0); assert(mcast.fragmented != 0); } else { fragment_continuation = 0; } /* * assemble the message and send it */ mcast.msg_count = ++mcast_packed_msg_count; iovecs[0].iov_base = (void *)&mcast; iovecs[0].iov_len = sizeof(struct totempg_mcast); iovecs[1].iov_base = (void *)mcast_packed_msg_lens; iovecs[1].iov_len = mcast_packed_msg_count * sizeof(unsigned short); iovecs[2].iov_base = (void *)data_ptr; iovecs[2].iov_len = fragment_size + copy_len; assert (totemsrp_avail(totemsrp_context) > 0); res = totemsrp_mcast (totemsrp_context, iovecs, 3, guarantee); if (res == -1) { goto error_exit; } /* * Recalculate counts and indexes for the next. */ mcast_packed_msg_lens[0] = 0; mcast_packed_msg_count = 0; fragment_size = 0; max_packet_size = TOTEMPG_PACKET_SIZE - (sizeof(unsigned short)); /* * If the iovec all fit, go to the next iovec */ if ((copy_base + copy_len) == iovec[i].iov_len) { copy_len = 0; copy_base = 0; i++; /* * Continue with the rest of the current iovec. */ } else { copy_base += copy_len; } } } /* * Bump only if we added message data. This may be zero if * the last buffer just fit into the fragmentation_data buffer * and we were at the last iovec. */ if (mcast_packed_msg_lens[mcast_packed_msg_count]) { mcast_packed_msg_count++; } error_exit: if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); } return (res); } /* * Determine if a message of msg_size could be queued */ static int msg_count_send_ok ( int msg_count) { int avail = 0; avail = totemsrp_avail (totemsrp_context); totempg_stats.msg_queue_avail = avail; return ((avail - totempg_reserved) > msg_count); } static int byte_count_send_ok ( int byte_count) { unsigned int msg_count = 0; int avail = 0; avail = totemsrp_avail (totemsrp_context); msg_count = (byte_count / (totempg_totem_config->net_mtu - sizeof (struct totempg_mcast) - 16)) + 1; return (avail >= msg_count); } static int send_reserve ( int msg_size) { unsigned int msg_count = 0; msg_count = (msg_size / (totempg_totem_config->net_mtu - sizeof (struct totempg_mcast) - 16)) + 1; totempg_reserved += msg_count; totempg_stats.msg_reserved = totempg_reserved; return (msg_count); } static void send_release ( int msg_count) { totempg_reserved -= msg_count; totempg_stats.msg_reserved = totempg_reserved; } #ifndef HAVE_SMALL_MEMORY_FOOTPRINT #undef MESSAGE_QUEUE_MAX #define MESSAGE_QUEUE_MAX ((4 * MESSAGE_SIZE_MAX) / totempg_totem_config->net_mtu) #endif /* HAVE_SMALL_MEMORY_FOOTPRINT */ static uint32_t q_level_precent_used(void) { return (100 - (((totemsrp_avail(totemsrp_context) - totempg_reserved) * 100) / MESSAGE_QUEUE_MAX)); } int totempg_callback_token_create ( void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { unsigned int res; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&callback_token_mutex); } res = totemsrp_callback_token_create (totemsrp_context, handle_out, type, delete, callback_fn, data); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&callback_token_mutex); } return (res); } void totempg_callback_token_destroy ( void *handle_out) { if (totempg_threaded_mode == 1) { pthread_mutex_lock (&callback_token_mutex); } totemsrp_callback_token_destroy (totemsrp_context, handle_out); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&callback_token_mutex); } } /* * vi: set autoindent tabstop=4 shiftwidth=4 : */ int totempg_groups_initialize ( void **totempg_groups_instance, void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)) { struct totempg_group_instance *instance; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } instance = malloc (sizeof (struct totempg_group_instance)); if (instance == NULL) { goto error_exit; } instance->deliver_fn = deliver_fn; instance->confchg_fn = confchg_fn; instance->groups = 0; instance->groups_cnt = 0; instance->q_level = QB_LOOP_MED; qb_list_init (&instance->list); qb_list_add (&instance->list, &totempg_groups_list); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } *totempg_groups_instance = instance; return (0); error_exit: if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (-1); } int totempg_groups_join ( void *totempg_groups_instance, const struct totempg_group *groups, size_t group_cnt) { struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance; struct totempg_group *new_groups; int res = 0; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } new_groups = realloc (instance->groups, sizeof (struct totempg_group) * (instance->groups_cnt + group_cnt)); if (new_groups == 0) { res = -1; goto error_exit; } memcpy (&new_groups[instance->groups_cnt], groups, group_cnt * sizeof (struct totempg_group)); instance->groups = new_groups; instance->groups_cnt += group_cnt; error_exit: if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (res); } int totempg_groups_leave ( void *totempg_groups_instance, const struct totempg_group *groups, size_t group_cnt) { if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (0); } #define MAX_IOVECS_FROM_APP 32 #define MAX_GROUPS_PER_MSG 32 int totempg_groups_mcast_joined ( void *totempg_groups_instance, const struct iovec *iovec, unsigned int iov_len, int guarantee) { struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance; unsigned short group_len[MAX_GROUPS_PER_MSG + 1]; struct iovec iovec_mcast[MAX_GROUPS_PER_MSG + 1 + MAX_IOVECS_FROM_APP]; int i; unsigned int res; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } /* * Build group_len structure and the iovec_mcast structure */ group_len[0] = instance->groups_cnt; for (i = 0; i < instance->groups_cnt; i++) { group_len[i + 1] = instance->groups[i].group_len; iovec_mcast[i + 1].iov_len = instance->groups[i].group_len; iovec_mcast[i + 1].iov_base = (void *) instance->groups[i].group; } iovec_mcast[0].iov_len = (instance->groups_cnt + 1) * sizeof (unsigned short); iovec_mcast[0].iov_base = group_len; for (i = 0; i < iov_len; i++) { iovec_mcast[i + instance->groups_cnt + 1].iov_len = iovec[i].iov_len; iovec_mcast[i + instance->groups_cnt + 1].iov_base = iovec[i].iov_base; } res = mcast_msg (iovec_mcast, iov_len + instance->groups_cnt + 1, guarantee); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (res); } static void check_q_level( void *totempg_groups_instance) { struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance; int32_t old_level = instance->q_level; int32_t percent_used = q_level_precent_used(); if (percent_used >= 75 && instance->q_level != TOTEM_Q_LEVEL_CRITICAL) { instance->q_level = TOTEM_Q_LEVEL_CRITICAL; } else if (percent_used < 30 && instance->q_level != TOTEM_Q_LEVEL_LOW) { instance->q_level = TOTEM_Q_LEVEL_LOW; } else if (percent_used > 40 && percent_used < 50 && instance->q_level != TOTEM_Q_LEVEL_GOOD) { instance->q_level = TOTEM_Q_LEVEL_GOOD; } else if (percent_used > 60 && percent_used < 70 && instance->q_level != TOTEM_Q_LEVEL_HIGH) { instance->q_level = TOTEM_Q_LEVEL_HIGH; } if (totem_queue_level_changed && old_level != instance->q_level) { totem_queue_level_changed(instance->q_level); } } void totempg_check_q_level( void *totempg_groups_instance) { struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance; check_q_level(instance); } int totempg_groups_joined_reserve ( void *totempg_groups_instance, const struct iovec *iovec, unsigned int iov_len) { struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance; unsigned int size = 0; unsigned int i; unsigned int reserved = 0; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); pthread_mutex_lock (&mcast_msg_mutex); } for (i = 0; i < instance->groups_cnt; i++) { size += instance->groups[i].group_len; } for (i = 0; i < iov_len; i++) { size += iovec[i].iov_len; } if (size >= totempg_size_limit) { reserved = -1; goto error_exit; } if (byte_count_send_ok (size)) { reserved = send_reserve (size); } else { reserved = 0; } error_exit: check_q_level(instance); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); pthread_mutex_unlock (&totempg_mutex); } return (reserved); } int totempg_groups_joined_release (int msg_count) { if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); pthread_mutex_lock (&mcast_msg_mutex); } send_release (msg_count); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&mcast_msg_mutex); pthread_mutex_unlock (&totempg_mutex); } return 0; } int totempg_groups_mcast_groups ( void *totempg_groups_instance, int guarantee, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, unsigned int iov_len) { unsigned short group_len[MAX_GROUPS_PER_MSG + 1]; struct iovec iovec_mcast[MAX_GROUPS_PER_MSG + 1 + MAX_IOVECS_FROM_APP]; int i; unsigned int res; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } /* * Build group_len structure and the iovec_mcast structure */ group_len[0] = groups_cnt; for (i = 0; i < groups_cnt; i++) { group_len[i + 1] = groups[i].group_len; iovec_mcast[i + 1].iov_len = groups[i].group_len; iovec_mcast[i + 1].iov_base = (void *) groups[i].group; } iovec_mcast[0].iov_len = (groups_cnt + 1) * sizeof (unsigned short); iovec_mcast[0].iov_base = group_len; for (i = 0; i < iov_len; i++) { iovec_mcast[i + groups_cnt + 1].iov_len = iovec[i].iov_len; iovec_mcast[i + groups_cnt + 1].iov_base = iovec[i].iov_base; } res = mcast_msg (iovec_mcast, iov_len + groups_cnt + 1, guarantee); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (res); } /* * Returns -1 if error, 0 if can't send, 1 if can send the message */ int totempg_groups_send_ok_groups ( void *totempg_groups_instance, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, unsigned int iov_len) { unsigned int size = 0; unsigned int i; unsigned int res; if (totempg_threaded_mode == 1) { pthread_mutex_lock (&totempg_mutex); } for (i = 0; i < groups_cnt; i++) { size += groups[i].group_len; } for (i = 0; i < iov_len; i++) { size += iovec[i].iov_len; } res = msg_count_send_ok (size); if (totempg_threaded_mode == 1) { pthread_mutex_unlock (&totempg_mutex); } return (res); } int totempg_iface_set ( struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no) { int res; res = totemsrp_iface_set ( totemsrp_context, interface_addr, ip_port, iface_no); return (res); } int totempg_ifaces_get ( unsigned int nodeid, unsigned int *interface_id, struct totem_ip_address *interfaces, unsigned int interfaces_size, char ***status, unsigned int *iface_count) { int res; res = totemsrp_ifaces_get ( totemsrp_context, nodeid, interface_id, interfaces, interfaces_size, status, iface_count); return (res); } void totempg_event_signal (enum totem_event_type type, int value) { totemsrp_event_signal (totemsrp_context, type, value); } void* totempg_get_stats (void) { return &totempg_stats; } int totempg_crypto_set ( const char *cipher_type, const char *hash_type) { int res; res = totemsrp_crypto_set (totemsrp_context, cipher_type, hash_type); return (res); } #define ONE_IFACE_LEN 63 const char *totempg_ifaces_print (unsigned int nodeid) { static char iface_string[256 * INTERFACE_MAX]; char one_iface[ONE_IFACE_LEN+1]; struct totem_ip_address interfaces[INTERFACE_MAX]; unsigned int iface_count; unsigned int iface_ids[INTERFACE_MAX]; unsigned int i; int res; iface_string[0] = '\0'; res = totempg_ifaces_get (nodeid, iface_ids, interfaces, INTERFACE_MAX, NULL, &iface_count); if (res == -1) { return ("no interface found for nodeid"); } res = totempg_ifaces_get (nodeid, iface_ids, interfaces, INTERFACE_MAX, NULL, &iface_count); for (i = 0; i < iface_count; i++) { if (!interfaces[i].family) { continue; } snprintf (one_iface, ONE_IFACE_LEN, "r(%d) ip(%s) ", i, totemip_print (&interfaces[i])); strcat (iface_string, one_iface); } return (iface_string); } unsigned int totempg_my_nodeid_get (void) { return (totemsrp_my_nodeid_get(totemsrp_context)); } int totempg_my_family_get (void) { return (totemsrp_my_family_get(totemsrp_context)); } extern void totempg_service_ready_register ( void (*totem_service_ready) (void)) { totemsrp_service_ready_register (totemsrp_context, totem_service_ready); } void totempg_queue_level_register_callback (totem_queue_level_changed_fn fn) { totem_queue_level_changed = fn; } extern int totempg_member_add ( const struct totem_ip_address *member, int ring_no) { return totemsrp_member_add (totemsrp_context, member, ring_no); } extern int totempg_member_remove ( const struct totem_ip_address *member, int ring_no) { return totemsrp_member_remove (totemsrp_context, member, ring_no); } extern int totempg_reconfigure (void) { return totemsrp_reconfigure (totemsrp_context, totempg_totem_config); } +extern int totempg_crypto_reconfigure_phase (cfg_message_crypto_reconfig_phase_t phase) +{ + return totemsrp_crypto_reconfigure_phase (totemsrp_context, totempg_totem_config, phase); +} + extern void totempg_stats_clear (int flags) { if (flags & TOTEMPG_STATS_CLEAR_TOTEM) { totempg_stats.msg_reserved = 0; totempg_stats.msg_queue_avail = 0; } return totemsrp_stats_clear (totemsrp_context, flags); } void totempg_threaded_mode_enable (void) { totempg_threaded_mode = 1; totemsrp_threaded_mode_enable (totemsrp_context); } void totempg_trans_ack (void) { totemsrp_trans_ack (totemsrp_context); } void totempg_force_gather (void) { totemsrp_force_gather(totemsrp_context); } /* Assumes ->orig_interfaces is already allocated */ void totempg_get_config(struct totem_config *config) { struct totem_interface *temp_if = config->orig_interfaces; memcpy(config, totempg_totem_config, sizeof(struct totem_config)); config->orig_interfaces = temp_if; memcpy(config->orig_interfaces, totempg_totem_config->interfaces, sizeof(struct totem_interface) * INTERFACE_MAX); config->interfaces = NULL; } void totempg_put_config(struct totem_config *config) { struct totem_interface *temp_if = totempg_totem_config->interfaces; /* Preseve the existing interfaces[] array as transports might have pointers saved */ memcpy(totempg_totem_config->interfaces, config->interfaces, sizeof(struct totem_interface) * INTERFACE_MAX); memcpy(totempg_totem_config, config, sizeof(struct totem_config)); totempg_totem_config->interfaces = temp_if; } diff --git a/exec/totemsrp.c b/exec/totemsrp.c index a825cca7..19cfb361 100644 --- a/exec/totemsrp.c +++ b/exec/totemsrp.c @@ -1,5197 +1,5206 @@ /* * Copyright (c) 2003-2006 MontaVista Software, Inc. * Copyright (c) 2006-2018 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * The first version of this code was based upon Yair Amir's PhD thesis: * http://www.cs.jhu.edu/~yairamir/phd.ps) (ch4,5). * * The current version of totemsrp implements the Totem protocol specified in: * http://citeseer.ist.psu.edu/amir95totem.html * * The deviations from the above published protocols are: * - token hold mode where token doesn't rotate on unused ring - reduces cpu * usage on 1.6ghz xeon from 35% to less then .1 % as measured by top */ #include #include #ifdef HAVE_ALLOCA_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOGSYS_UTILS_ONLY 1 #include #include "totemsrp.h" #include "totemnet.h" #include "icmap.h" #include "totemconfig.h" #include "cs_queue.h" #define LOCALHOST_IP inet_addr("127.0.0.1") #define QUEUE_RTR_ITEMS_SIZE_MAX 16384 /* allow 16384 retransmit items */ #define RETRANS_MESSAGE_QUEUE_SIZE_MAX 16384 /* allow 500 messages to be queued */ #define RECEIVED_MESSAGE_QUEUE_SIZE_MAX 500 /* allow 500 messages to be queued */ #define MAXIOVS 5 #define RETRANSMIT_ENTRIES_MAX 30 #define TOKEN_SIZE_MAX 64000 /* bytes */ #define LEAVE_DUMMY_NODEID 0 /* * SRP address. */ struct srp_addr { unsigned int nodeid; }; /* * Rollover handling: * SEQNO_START_MSG is the starting sequence number after a new configuration * This should remain zero, unless testing overflow in which case * 0x7ffff000 and 0xfffff000 are good starting values. * * SEQNO_START_TOKEN is the starting sequence number after a new configuration * for a token. This should remain zero, unless testing overflow in which * case 07fffff00 or 0xffffff00 are good starting values. */ #define SEQNO_START_MSG 0x0 #define SEQNO_START_TOKEN 0x0 /* * These can be used ot test different rollover points * #define SEQNO_START_MSG 0xfffffe00 * #define SEQNO_START_TOKEN 0xfffffe00 */ /* * These can be used to test the error recovery algorithms * #define TEST_DROP_ORF_TOKEN_PERCENTAGE 30 * #define TEST_DROP_COMMIT_TOKEN_PERCENTAGE 30 * #define TEST_DROP_MCAST_PERCENTAGE 50 * #define TEST_RECOVERY_MSG_COUNT 300 */ /* * we compare incoming messages to determine if their endian is * different - if so convert them * * do not change */ #define ENDIAN_LOCAL 0xff22 enum message_type { MESSAGE_TYPE_ORF_TOKEN = 0, /* Ordering, Reliability, Flow (ORF) control Token */ MESSAGE_TYPE_MCAST = 1, /* ring ordered multicast message */ MESSAGE_TYPE_MEMB_MERGE_DETECT = 2, /* merge rings if there are available rings */ MESSAGE_TYPE_MEMB_JOIN = 3, /* membership join message */ MESSAGE_TYPE_MEMB_COMMIT_TOKEN = 4, /* membership commit token */ MESSAGE_TYPE_TOKEN_HOLD_CANCEL = 5, /* cancel the holding of the token */ }; enum encapsulation_type { MESSAGE_ENCAPSULATED = 1, MESSAGE_NOT_ENCAPSULATED = 2 }; /* * New membership algorithm local variables */ struct consensus_list_item { struct srp_addr addr; int set; }; struct token_callback_instance { struct qb_list_head list; int (*callback_fn) (enum totem_callback_token_type type, const void *); enum totem_callback_token_type callback_type; int delete; void *data; }; struct totemsrp_socket { int mcast; int token; }; struct mcast { struct totem_message_header header; struct srp_addr system_from; unsigned int seq; int this_seqno; struct memb_ring_id ring_id; unsigned int node_id; int guarantee; } __attribute__((packed)); struct rtr_item { struct memb_ring_id ring_id; unsigned int seq; }__attribute__((packed)); struct orf_token { struct totem_message_header header; unsigned int seq; unsigned int token_seq; unsigned int aru; unsigned int aru_addr; struct memb_ring_id ring_id; unsigned int backlog; unsigned int fcc; int retrans_flg; int rtr_list_entries; struct rtr_item rtr_list[0]; }__attribute__((packed)); struct memb_join { struct totem_message_header header; struct srp_addr system_from; unsigned int proc_list_entries; unsigned int failed_list_entries; unsigned long long ring_seq; unsigned char end_of_memb_join[0]; /* * These parts of the data structure are dynamic: * struct srp_addr proc_list[]; * struct srp_addr failed_list[]; */ } __attribute__((packed)); struct memb_merge_detect { struct totem_message_header header; struct srp_addr system_from; struct memb_ring_id ring_id; } __attribute__((packed)); struct token_hold_cancel { struct totem_message_header header; struct memb_ring_id ring_id; } __attribute__((packed)); struct memb_commit_token_memb_entry { struct memb_ring_id ring_id; unsigned int aru; unsigned int high_delivered; unsigned int received_flg; }__attribute__((packed)); struct memb_commit_token { struct totem_message_header header; unsigned int token_seq; struct memb_ring_id ring_id; unsigned int retrans_flg; int memb_index; int addr_entries; unsigned char end_of_commit_token[0]; /* * These parts of the data structure are dynamic: * * struct srp_addr addr[PROCESSOR_COUNT_MAX]; * struct memb_commit_token_memb_entry memb_list[PROCESSOR_COUNT_MAX]; */ }__attribute__((packed)); struct message_item { struct mcast *mcast; unsigned int msg_len; }; struct sort_queue_item { struct mcast *mcast; unsigned int msg_len; }; enum memb_state { MEMB_STATE_OPERATIONAL = 1, MEMB_STATE_GATHER = 2, MEMB_STATE_COMMIT = 3, MEMB_STATE_RECOVERY = 4 }; struct totemsrp_instance { int iface_changes; int failed_to_recv; /* * Flow control mcasts and remcasts on last and current orf_token */ int fcc_remcast_last; int fcc_mcast_last; int fcc_remcast_current; struct consensus_list_item consensus_list[PROCESSOR_COUNT_MAX]; int consensus_list_entries; int lowest_active_if; struct srp_addr my_id; struct totem_ip_address my_addrs[INTERFACE_MAX]; struct srp_addr my_proc_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_failed_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_new_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_trans_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_deliver_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX]; unsigned int my_leave_memb_list[PROCESSOR_COUNT_MAX]; int my_proc_list_entries; int my_failed_list_entries; int my_new_memb_entries; int my_trans_memb_entries; int my_memb_entries; int my_deliver_memb_entries; int my_left_memb_entries; int my_leave_memb_entries; struct memb_ring_id my_ring_id; struct memb_ring_id my_old_ring_id; int my_aru_count; int my_merge_detect_timeout_outstanding; unsigned int my_last_aru; int my_seq_unchanged; int my_received_flg; unsigned int my_high_seq_received; unsigned int my_install_seq; int my_rotation_counter; int my_set_retrans_flg; int my_retrans_flg_count; unsigned int my_high_ring_delivered; int heartbeat_timeout; /* * Queues used to order, deliver, and recover messages */ struct cs_queue new_message_queue; struct cs_queue new_message_queue_trans; struct cs_queue retrans_message_queue; struct sq regular_sort_queue; struct sq recovery_sort_queue; /* * Received up to and including */ unsigned int my_aru; unsigned int my_high_delivered; struct qb_list_head token_callback_received_listhead; struct qb_list_head token_callback_sent_listhead; char orf_token_retransmit[TOKEN_SIZE_MAX]; int orf_token_retransmit_size; unsigned int my_token_seq; /* * Timers */ qb_loop_timer_handle timer_pause_timeout; qb_loop_timer_handle timer_orf_token_timeout; qb_loop_timer_handle timer_orf_token_warning; qb_loop_timer_handle timer_orf_token_retransmit_timeout; qb_loop_timer_handle timer_orf_token_hold_retransmit_timeout; qb_loop_timer_handle timer_merge_detect_timeout; qb_loop_timer_handle memb_timer_state_gather_join_timeout; qb_loop_timer_handle memb_timer_state_gather_consensus_timeout; qb_loop_timer_handle memb_timer_state_commit_timeout; qb_loop_timer_handle timer_heartbeat_timeout; /* * Function and data used to log messages */ int totemsrp_log_level_security; int totemsrp_log_level_error; int totemsrp_log_level_warning; int totemsrp_log_level_notice; int totemsrp_log_level_debug; int totemsrp_log_level_trace; int totemsrp_subsys_id; void (*totemsrp_log_printf) ( int level, int subsys, const char *function, const char *file, int line, const char *format, ...)__attribute__((format(printf, 6, 7)));; enum memb_state memb_state; //TODO struct srp_addr next_memb; qb_loop_t *totemsrp_poll_handle; struct totem_ip_address mcast_address; void (*totemsrp_deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required); void (*totemsrp_confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); void (*totemsrp_service_ready_fn) (void); void (*totemsrp_waiting_trans_ack_cb_fn) ( int waiting_trans_ack); void (*memb_ring_id_create_or_load) ( struct memb_ring_id *memb_ring_id, unsigned int nodeid); void (*memb_ring_id_store) ( const struct memb_ring_id *memb_ring_id, unsigned int nodeid); int global_seqno; int my_token_held; unsigned long long token_ring_id_seq; unsigned int last_released; unsigned int set_aru; int old_ring_state_saved; int old_ring_state_aru; unsigned int old_ring_state_high_seq_received; unsigned int my_last_seq; struct timeval tv_old; void *totemnet_context; struct totem_config *totem_config; unsigned int use_heartbeat; unsigned int my_trc; unsigned int my_pbl; unsigned int my_cbl; uint64_t pause_timestamp; struct memb_commit_token *commit_token; totemsrp_stats_t stats; uint32_t orf_token_discard; uint32_t originated_orf_token; uint32_t threaded_mode_enabled; uint32_t waiting_trans_ack; int flushing; void * token_recv_event_handle; void * token_sent_event_handle; char commit_token_storage[40000]; }; struct message_handlers { int count; int (*handler_functions[6]) ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); }; enum gather_state_from { TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT = 0, TOTEMSRP_GSFROM_GATHER_MISSING1 = 1, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE = 2, TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED = 3, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE = 4, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE = 5, TOTEMSRP_GSFROM_FAILED_TO_RECEIVE = 6, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE = 7, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE = 8, TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE = 9, TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE = 10, TOTEMSRP_GSFROM_MERGE_DURING_JOIN = 11, TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE = 12, TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE = 13, TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY = 14, TOTEMSRP_GSFROM_INTERFACE_CHANGE = 15, TOTEMSRP_GSFROM_MAX = TOTEMSRP_GSFROM_INTERFACE_CHANGE, }; const char* gather_state_from_desc [] = { [TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT] = "consensus timeout", [TOTEMSRP_GSFROM_GATHER_MISSING1] = "MISSING", [TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE] = "The token was lost in the OPERATIONAL state.", [TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED] = "The consensus timeout expired.", [TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE] = "The token was lost in the COMMIT state.", [TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE] = "The token was lost in the RECOVERY state.", [TOTEMSRP_GSFROM_FAILED_TO_RECEIVE] = "failed to receive", [TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE] = "foreign message in operational state", [TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE] = "foreign message in gather state", [TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE] = "merge during operational state", [TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE] = "merge during gather state", [TOTEMSRP_GSFROM_MERGE_DURING_JOIN] = "merge during join", [TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE] = "join during operational state", [TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE] = "join during commit state", [TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY] = "join during recovery", [TOTEMSRP_GSFROM_INTERFACE_CHANGE] = "interface change", }; /* * forward decls */ static int message_handler_orf_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_mcast ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_join ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed); static void totemsrp_instance_initialize (struct totemsrp_instance *instance); static void srp_addr_to_nodeid ( struct totemsrp_instance *instance, unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries); static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b); static void memb_leave_message_send (struct totemsrp_instance *instance); static void token_callbacks_execute (struct totemsrp_instance *instance, enum totem_callback_token_type type); static void memb_state_gather_enter (struct totemsrp_instance *instance, enum gather_state_from gather_from); static void messages_deliver_to_app (struct totemsrp_instance *instance, int skip, unsigned int end_point); static int orf_token_mcast (struct totemsrp_instance *instance, struct orf_token *oken, int fcc_mcasts_allowed); static void messages_free (struct totemsrp_instance *instance, unsigned int token_aru); static void memb_ring_id_set (struct totemsrp_instance *instance, const struct memb_ring_id *ring_id); static void target_set_completed (void *context); static void memb_state_commit_token_update (struct totemsrp_instance *instance); static void memb_state_commit_token_target_set (struct totemsrp_instance *instance); static int memb_state_commit_token_send (struct totemsrp_instance *instance); static int memb_state_commit_token_send_recovery (struct totemsrp_instance *instance, struct memb_commit_token *memb_commit_token); static void memb_state_commit_token_create (struct totemsrp_instance *instance); static int token_hold_cancel_send (struct totemsrp_instance *instance); static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out); static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out); static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out); static void mcast_endian_convert (const struct mcast *in, struct mcast *out); static void memb_merge_detect_endian_convert ( const struct memb_merge_detect *in, struct memb_merge_detect *out); static struct srp_addr srp_addr_endian_convert (struct srp_addr in); static void timer_function_orf_token_timeout (void *data); static void timer_function_orf_token_warning (void *data); static void timer_function_pause_timeout (void *data); static void timer_function_heartbeat_timeout (void *data); static void timer_function_token_retransmit_timeout (void *data); static void timer_function_token_hold_retransmit_timeout (void *data); static void timer_function_merge_detect_timeout (void *data); static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance); static void totemsrp_buffer_release (struct totemsrp_instance *instance, void *ptr); static const char* gsfrom_to_msg(enum gather_state_from gsfrom); void main_deliver_fn ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from); void main_iface_change_fn ( void *context, const struct totem_ip_address *iface_address, unsigned int iface_no); struct message_handlers totemsrp_message_handlers = { 6, { message_handler_orf_token, /* MESSAGE_TYPE_ORF_TOKEN */ message_handler_mcast, /* MESSAGE_TYPE_MCAST */ message_handler_memb_merge_detect, /* MESSAGE_TYPE_MEMB_MERGE_DETECT */ message_handler_memb_join, /* MESSAGE_TYPE_MEMB_JOIN */ message_handler_memb_commit_token, /* MESSAGE_TYPE_MEMB_COMMIT_TOKEN */ message_handler_token_hold_cancel /* MESSAGE_TYPE_TOKEN_HOLD_CANCEL */ } }; #define log_printf(level, format, args...) \ do { \ instance->totemsrp_log_printf ( \ level, instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ format, ##args); \ } while (0); #define LOGSYS_PERROR(err_num, level, fmt, args...) \ do { \ char _error_str[LOGSYS_MAX_PERROR_MSG_LEN]; \ const char *_error_ptr = qb_strerror_r(err_num, _error_str, sizeof(_error_str)); \ instance->totemsrp_log_printf ( \ level, instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ fmt ": %s (%d)\n", ##args, _error_ptr, err_num); \ } while(0) static const char* gsfrom_to_msg(enum gather_state_from gsfrom) { if (gsfrom <= TOTEMSRP_GSFROM_MAX) { return gather_state_from_desc[gsfrom]; } else { return "UNKNOWN"; } } static void totemsrp_instance_initialize (struct totemsrp_instance *instance) { memset (instance, 0, sizeof (struct totemsrp_instance)); qb_list_init (&instance->token_callback_received_listhead); qb_list_init (&instance->token_callback_sent_listhead); instance->my_received_flg = 1; instance->my_token_seq = SEQNO_START_TOKEN - 1; instance->memb_state = MEMB_STATE_OPERATIONAL; instance->set_aru = -1; instance->my_aru = SEQNO_START_MSG; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_high_delivered = SEQNO_START_MSG; instance->orf_token_discard = 0; instance->originated_orf_token = 0; instance->commit_token = (struct memb_commit_token *)instance->commit_token_storage; instance->waiting_trans_ack = 1; } static int pause_flush (struct totemsrp_instance *instance) { uint64_t now_msec; uint64_t timestamp_msec; int res = 0; now_msec = (qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC); timestamp_msec = instance->pause_timestamp / QB_TIME_NS_IN_MSEC; if ((now_msec - timestamp_msec) > (instance->totem_config->token_timeout / 2)) { log_printf (instance->totemsrp_log_level_notice, "Process pause detected for %d ms, flushing membership messages.", (unsigned int)(now_msec - timestamp_msec)); /* * -1 indicates an error from recvmsg */ do { res = totemnet_recv_mcast_empty (instance->totemnet_context); } while (res == -1); } return (res); } static int token_event_stats_collector (enum totem_callback_token_type type, const void *void_instance) { struct totemsrp_instance *instance = (struct totemsrp_instance *)void_instance; uint32_t time_now; unsigned long long nano_secs = qb_util_nano_current_get (); time_now = (nano_secs / QB_TIME_NS_IN_MSEC); if (type == TOTEM_CALLBACK_TOKEN_RECEIVED) { /* incr latest token the index */ if (instance->stats.latest_token == (TOTEM_TOKEN_STATS_MAX - 1)) instance->stats.latest_token = 0; else instance->stats.latest_token++; if (instance->stats.earliest_token == instance->stats.latest_token) { /* we have filled up the array, start overwriting */ if (instance->stats.earliest_token == (TOTEM_TOKEN_STATS_MAX - 1)) instance->stats.earliest_token = 0; else instance->stats.earliest_token++; instance->stats.token[instance->stats.earliest_token].rx = 0; instance->stats.token[instance->stats.earliest_token].tx = 0; instance->stats.token[instance->stats.earliest_token].backlog_calc = 0; } instance->stats.token[instance->stats.latest_token].rx = time_now; instance->stats.token[instance->stats.latest_token].tx = 0; /* in case we drop the token */ } else { instance->stats.token[instance->stats.latest_token].tx = time_now; } return 0; } static void totempg_mtu_changed(void *context, int net_mtu) { struct totemsrp_instance *instance = context; instance->totem_config->net_mtu = net_mtu - 2 * sizeof (struct mcast); log_printf (instance->totemsrp_log_level_debug, "Net MTU changed to %d, new value is %d", net_mtu, instance->totem_config->net_mtu); } /* * Exported interfaces */ int totemsrp_initialize ( qb_loop_t *poll_handle, void **srp_context, struct totem_config *totem_config, totempg_stats_t *stats, void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id), void (*waiting_trans_ack_cb_fn) ( int waiting_trans_ack)) { struct totemsrp_instance *instance; int res; instance = malloc (sizeof (struct totemsrp_instance)); if (instance == NULL) { goto error_exit; } totemsrp_instance_initialize (instance); instance->totemsrp_waiting_trans_ack_cb_fn = waiting_trans_ack_cb_fn; instance->totemsrp_waiting_trans_ack_cb_fn (1); stats->srp = &instance->stats; instance->stats.latest_token = 0; instance->stats.earliest_token = 0; instance->totem_config = totem_config; /* * Configure logging */ instance->totemsrp_log_level_security = totem_config->totem_logging_configuration.log_level_security; instance->totemsrp_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemsrp_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemsrp_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemsrp_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemsrp_log_level_trace = totem_config->totem_logging_configuration.log_level_trace; instance->totemsrp_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemsrp_log_printf = totem_config->totem_logging_configuration.log_printf; /* * Configure totem store and load functions */ instance->memb_ring_id_create_or_load = totem_config->totem_memb_ring_id_create_or_load; instance->memb_ring_id_store = totem_config->totem_memb_ring_id_store; /* * Initialize local variables for totemsrp */ totemip_copy (&instance->mcast_address, &totem_config->interfaces[instance->lowest_active_if].mcast_addr); /* * Display totem configuration */ log_printf (instance->totemsrp_log_level_debug, "Token Timeout (%d ms) retransmit timeout (%d ms)", totem_config->token_timeout, totem_config->token_retransmit_timeout); if (totem_config->token_warning) { uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100; log_printf(instance->totemsrp_log_level_debug, "Token warning every %d ms (%d%% of Token Timeout)", token_warning_ms, totem_config->token_warning); if (token_warning_ms < totem_config->token_retransmit_timeout) log_printf (LOGSYS_LEVEL_DEBUG, "The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) " "which can lead to spurious token warnings. Consider increasing the token_warning parameter.", token_warning_ms, totem_config->token_retransmit_timeout); } else { log_printf(instance->totemsrp_log_level_debug, "Token warnings disabled"); } log_printf (instance->totemsrp_log_level_debug, "token hold (%d ms) retransmits before loss (%d retrans)", totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const); log_printf (instance->totemsrp_log_level_debug, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)", totem_config->join_timeout, totem_config->send_join_timeout, totem_config->consensus_timeout, totem_config->merge_timeout); log_printf (instance->totemsrp_log_level_debug, "downcheck (%d ms) fail to recv const (%d msgs)", totem_config->downcheck_timeout, totem_config->fail_to_recv_const); log_printf (instance->totemsrp_log_level_debug, "seqno unchanged const (%d rotations) Maximum network MTU %d", totem_config->seqno_unchanged_const, totem_config->net_mtu); log_printf (instance->totemsrp_log_level_debug, "window size per rotation (%d messages) maximum messages per rotation (%d messages)", totem_config->window_size, totem_config->max_messages); log_printf (instance->totemsrp_log_level_debug, "missed count const (%d messages)", totem_config->miss_count_const); log_printf (instance->totemsrp_log_level_debug, "send threads (%d threads)", totem_config->threads); log_printf (instance->totemsrp_log_level_debug, "heartbeat_failures_allowed (%d)", totem_config->heartbeat_failures_allowed); log_printf (instance->totemsrp_log_level_debug, "max_network_delay (%d ms)", totem_config->max_network_delay); cs_queue_init (&instance->retrans_message_queue, RETRANS_MESSAGE_QUEUE_SIZE_MAX, sizeof (struct message_item), instance->threaded_mode_enabled); sq_init (&instance->regular_sort_queue, QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0); sq_init (&instance->recovery_sort_queue, QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0); instance->totemsrp_poll_handle = poll_handle; instance->totemsrp_deliver_fn = deliver_fn; instance->totemsrp_confchg_fn = confchg_fn; instance->use_heartbeat = 1; timer_function_pause_timeout (instance); if ( totem_config->heartbeat_failures_allowed == 0 ) { log_printf (instance->totemsrp_log_level_debug, "HeartBeat is Disabled. To enable set heartbeat_failures_allowed > 0"); instance->use_heartbeat = 0; } if (instance->use_heartbeat) { instance->heartbeat_timeout = (totem_config->heartbeat_failures_allowed) * totem_config->token_retransmit_timeout + totem_config->max_network_delay; if (instance->heartbeat_timeout >= totem_config->token_timeout) { log_printf (instance->totemsrp_log_level_debug, "total heartbeat_timeout (%d ms) is not less than token timeout (%d ms)", instance->heartbeat_timeout, totem_config->token_timeout); log_printf (instance->totemsrp_log_level_debug, "heartbeat_timeout = heartbeat_failures_allowed * token_retransmit_timeout + max_network_delay"); log_printf (instance->totemsrp_log_level_debug, "heartbeat timeout should be less than the token timeout. Heartbeat is disabled!!"); instance->use_heartbeat = 0; } else { log_printf (instance->totemsrp_log_level_debug, "total heartbeat_timeout (%d ms)", instance->heartbeat_timeout); } } res = totemnet_initialize ( poll_handle, &instance->totemnet_context, totem_config, stats->srp, instance, main_deliver_fn, main_iface_change_fn, totempg_mtu_changed, target_set_completed); if (res == -1) { goto error_exit; } instance->my_id.nodeid = instance->totem_config->interfaces[instance->lowest_active_if].boundto.nodeid; /* * Must have net_mtu adjusted by totemnet_initialize first */ cs_queue_init (&instance->new_message_queue, MESSAGE_QUEUE_MAX, sizeof (struct message_item), instance->threaded_mode_enabled); cs_queue_init (&instance->new_message_queue_trans, MESSAGE_QUEUE_MAX, sizeof (struct message_item), instance->threaded_mode_enabled); totemsrp_callback_token_create (instance, &instance->token_recv_event_handle, TOTEM_CALLBACK_TOKEN_RECEIVED, 0, token_event_stats_collector, instance); totemsrp_callback_token_create (instance, &instance->token_sent_event_handle, TOTEM_CALLBACK_TOKEN_SENT, 0, token_event_stats_collector, instance); *srp_context = instance; return (0); error_exit: return (-1); } void totemsrp_finalize ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; memb_leave_message_send (instance); totemnet_finalize (instance->totemnet_context); cs_queue_free (&instance->new_message_queue); cs_queue_free (&instance->new_message_queue_trans); cs_queue_free (&instance->retrans_message_queue); sq_free (&instance->regular_sort_queue); sq_free (&instance->recovery_sort_queue); free (instance); } /* * Return configured interfaces. interfaces is array of totem_ip addresses allocated by caller, * with interaces_size number of items. iface_count is final number of interfaces filled by this * function. * * Function returns 0 on success, otherwise if interfaces array is not big enough, -2 is returned, * and if interface was not found, -1 is returned. */ int totemsrp_ifaces_get ( void *srp_context, unsigned int nodeid, unsigned int *interface_id, struct totem_ip_address *interfaces, unsigned int interfaces_size, char ***status, unsigned int *iface_count) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; struct totem_ip_address *iface_ptr = interfaces; int res = 0; int i,n; int num_ifs = 0; memset(interfaces, 0, sizeof(struct totem_ip_address) * interfaces_size); *iface_count = INTERFACE_MAX; for (i=0; itotem_config->interfaces[i].member_count; n++) { if (instance->totem_config->interfaces[i].configured && instance->totem_config->interfaces[i].member_list[n].nodeid == nodeid) { memcpy(iface_ptr, &instance->totem_config->interfaces[i].member_list[n], sizeof(struct totem_ip_address)); interface_id[num_ifs] = i; iface_ptr++; if (++num_ifs > interfaces_size) { res = -2; break; } } } } totemnet_ifaces_get(instance->totemnet_context, status, iface_count); *iface_count = num_ifs; return (res); } int totemsrp_crypto_set ( void *srp_context, const char *cipher_type, const char *hash_type) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res; res = totemnet_crypto_set(instance->totemnet_context, cipher_type, hash_type); return (res); } unsigned int totemsrp_my_nodeid_get ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; unsigned int res; res = instance->my_id.nodeid; return (res); } int totemsrp_my_family_get ( void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int res; res = instance->totem_config->interfaces[instance->lowest_active_if].boundto.family; return (res); } /* * Set operations for use by the membership algorithm */ static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b) { if (a->nodeid == b->nodeid) { return 1; } return 0; } static void srp_addr_to_nodeid ( struct totemsrp_instance *instance, unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries) { unsigned int i; for (i = 0; i < entries; i++) { nodeid_out[i] = srp_addr_in[i].nodeid; } } static struct srp_addr srp_addr_endian_convert (struct srp_addr in) { struct srp_addr res; res.nodeid = swab32 (in.nodeid); return (res); } static void memb_consensus_reset (struct totemsrp_instance *instance) { instance->consensus_list_entries = 0; } static void memb_set_subtract ( struct srp_addr *out_list, int *out_list_entries, struct srp_addr *one_list, int one_list_entries, struct srp_addr *two_list, int two_list_entries) { int found = 0; int i; int j; *out_list_entries = 0; for (i = 0; i < one_list_entries; i++) { for (j = 0; j < two_list_entries; j++) { if (srp_addr_equal (&one_list[i], &two_list[j])) { found = 1; break; } } if (found == 0) { out_list[*out_list_entries] = one_list[i]; *out_list_entries = *out_list_entries + 1; } found = 0; } } /* * Set consensus for a specific processor */ static void memb_consensus_set ( struct totemsrp_instance *instance, const struct srp_addr *addr) { int found = 0; int i; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal(addr, &instance->consensus_list[i].addr)) { found = 1; break; /* found entry */ } } instance->consensus_list[i].addr = *addr; instance->consensus_list[i].set = 1; if (found == 0) { instance->consensus_list_entries++; } return; } /* * Is consensus set for a specific processor */ static int memb_consensus_isset ( struct totemsrp_instance *instance, const struct srp_addr *addr) { int i; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal (addr, &instance->consensus_list[i].addr)) { return (instance->consensus_list[i].set); } } return (0); } /* * Is consensus agreed upon based upon consensus database */ static int memb_consensus_agreed ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; int token_memb_entries = 0; int agreed = 1; int i; memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); for (i = 0; i < token_memb_entries; i++) { if (memb_consensus_isset (instance, &token_memb[i]) == 0) { agreed = 0; break; } } if (agreed && instance->failed_to_recv == 1) { /* * Both nodes agreed on our failure. We don't care how many proc list items left because we * will create single ring anyway. */ return (agreed); } assert (token_memb_entries >= 1); return (agreed); } static void memb_consensus_notset ( struct totemsrp_instance *instance, struct srp_addr *no_consensus_list, int *no_consensus_list_entries, struct srp_addr *comparison_list, int comparison_list_entries) { int i; *no_consensus_list_entries = 0; for (i = 0; i < instance->my_proc_list_entries; i++) { if (memb_consensus_isset (instance, &instance->my_proc_list[i]) == 0) { no_consensus_list[*no_consensus_list_entries] = instance->my_proc_list[i]; *no_consensus_list_entries = *no_consensus_list_entries + 1; } } } /* * Is set1 equal to set2 Entries can be in different orders */ static int memb_set_equal ( struct srp_addr *set1, int set1_entries, struct srp_addr *set2, int set2_entries) { int i; int j; int found = 0; if (set1_entries != set2_entries) { return (0); } for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { found = 1; break; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * Is subset fully contained in fullset */ static int memb_set_subset ( const struct srp_addr *subset, int subset_entries, const struct srp_addr *fullset, int fullset_entries) { int i; int j; int found = 0; if (subset_entries > fullset_entries) { return (0); } for (i = 0; i < subset_entries; i++) { for (j = 0; j < fullset_entries; j++) { if (srp_addr_equal (&subset[i], &fullset[j])) { found = 1; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * merge subset into fullset taking care not to add duplicates */ static void memb_set_merge ( const struct srp_addr *subset, int subset_entries, struct srp_addr *fullset, int *fullset_entries) { int found = 0; int i; int j; for (i = 0; i < subset_entries; i++) { for (j = 0; j < *fullset_entries; j++) { if (srp_addr_equal (&fullset[j], &subset[i])) { found = 1; break; } } if (found == 0) { fullset[*fullset_entries] = subset[i]; *fullset_entries = *fullset_entries + 1; } found = 0; } return; } static void memb_set_and_with_ring_id ( struct srp_addr *set1, struct memb_ring_id *set1_ring_ids, int set1_entries, struct srp_addr *set2, int set2_entries, struct memb_ring_id *old_ring_id, struct srp_addr *and, int *and_entries) { int i; int j; int found = 0; *and_entries = 0; for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { if (memcmp (&set1_ring_ids[j], old_ring_id, sizeof (struct memb_ring_id)) == 0) { found = 1; } break; } } if (found) { and[*and_entries] = set1[j]; *and_entries = *and_entries + 1; } found = 0; } return; } static void memb_set_log( struct totemsrp_instance *instance, int level, const char *string, struct srp_addr *list, int list_entries) { char int_buf[32]; char list_str[512]; int i; memset(list_str, 0, sizeof(list_str)); for (i = 0; i < list_entries; i++) { if (i == 0) { snprintf(int_buf, sizeof(int_buf), CS_PRI_NODE_ID, list[i].nodeid); } else { snprintf(int_buf, sizeof(int_buf), "," CS_PRI_NODE_ID, list[i].nodeid); } if (strlen(list_str) + strlen(int_buf) >= sizeof(list_str)) { break ; } strcat(list_str, int_buf); } log_printf(level, "List '%s' contains %d entries: %s", string, list_entries, list_str); } static void my_leave_memb_clear( struct totemsrp_instance *instance) { memset(instance->my_leave_memb_list, 0, sizeof(instance->my_leave_memb_list)); instance->my_leave_memb_entries = 0; } static unsigned int my_leave_memb_match( struct totemsrp_instance *instance, unsigned int nodeid) { int i; unsigned int ret = 0; for (i = 0; i < instance->my_leave_memb_entries; i++){ if (instance->my_leave_memb_list[i] == nodeid){ ret = nodeid; break; } } return ret; } static void my_leave_memb_set( struct totemsrp_instance *instance, unsigned int nodeid) { int i, found = 0; for (i = 0; i < instance->my_leave_memb_entries; i++){ if (instance->my_leave_memb_list[i] == nodeid){ found = 1; break; } } if (found == 1) { return; } if (instance->my_leave_memb_entries < (PROCESSOR_COUNT_MAX - 1)) { instance->my_leave_memb_list[instance->my_leave_memb_entries] = nodeid; instance->my_leave_memb_entries++; } else { log_printf (instance->totemsrp_log_level_warning, "Cannot set LEAVE nodeid=" CS_PRI_NODE_ID, nodeid); } } static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance) { assert (instance != NULL); return totemnet_buffer_alloc (instance->totemnet_context); } static void totemsrp_buffer_release (struct totemsrp_instance *instance, void *ptr) { assert (instance != NULL); totemnet_buffer_release (instance->totemnet_context, ptr); } static void reset_token_retransmit_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_retransmit_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_token_retransmit_timeout, &instance->timer_orf_token_retransmit_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_token_retransmit_timeout - qb_loop_timer_add error : %d", res); } } static void start_merge_detect_timeout (struct totemsrp_instance *instance) { int32_t res; if (instance->my_merge_detect_timeout_outstanding == 0) { res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->merge_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "start_merge_detect_timeout - qb_loop_timer_add error : %d", res); } instance->my_merge_detect_timeout_outstanding = 1; } } static void cancel_merge_detect_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_merge_detect_timeout); instance->my_merge_detect_timeout_outstanding = 0; } /* * ring_state_* is used to save and restore the sort queue * state when a recovery operation fails (and enters gather) */ static void old_ring_state_save (struct totemsrp_instance *instance) { if (instance->old_ring_state_saved == 0) { instance->old_ring_state_saved = 1; memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); instance->old_ring_state_aru = instance->my_aru; instance->old_ring_state_high_seq_received = instance->my_high_seq_received; log_printf (instance->totemsrp_log_level_debug, "Saving state aru %x high seq received %x", instance->my_aru, instance->my_high_seq_received); } } static void old_ring_state_restore (struct totemsrp_instance *instance) { instance->my_aru = instance->old_ring_state_aru; instance->my_high_seq_received = instance->old_ring_state_high_seq_received; log_printf (instance->totemsrp_log_level_debug, "Restoring instance->my_aru %x my high seq received %x", instance->my_aru, instance->my_high_seq_received); } static void old_ring_state_reset (struct totemsrp_instance *instance) { log_printf (instance->totemsrp_log_level_debug, "Resetting old ring state"); instance->old_ring_state_saved = 0; } static void reset_pause_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_pause_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_timeout * QB_TIME_NS_IN_MSEC / 5, (void *)instance, timer_function_pause_timeout, &instance->timer_pause_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_pause_timeout - qb_loop_timer_add error : %d", res); } } static void reset_token_warning (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_warning * instance->totem_config->token_timeout / 100 * QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_orf_token_warning, &instance->timer_orf_token_warning); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_token_warning - qb_loop_timer_add error : %d", res); } } static void reset_token_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_orf_token_timeout, &instance->timer_orf_token_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_token_timeout - qb_loop_timer_add error : %d", res); } if (instance->totem_config->token_warning) reset_token_warning(instance); } static void reset_heartbeat_timeout (struct totemsrp_instance *instance) { int32_t res; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->heartbeat_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_heartbeat_timeout, &instance->timer_heartbeat_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "reset_heartbeat_timeout - qb_loop_timer_add error : %d", res); } } static void cancel_token_warning (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning); } static void cancel_token_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); if (instance->totem_config->token_warning) cancel_token_warning(instance); } static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); } static void cancel_token_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); } static void start_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { int32_t res; res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->token_hold_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, timer_function_token_hold_retransmit_timeout, &instance->timer_orf_token_hold_retransmit_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "start_token_hold_retransmit_timeout - qb_loop_timer_add error : %d", res); } } static void cancel_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_hold_retransmit_timeout); } static void memb_state_consensus_timeout_expired ( struct totemsrp_instance *instance) { struct srp_addr no_consensus_list[PROCESSOR_COUNT_MAX]; int no_consensus_list_entries; instance->stats.consensus_timeouts++; if (memb_consensus_agreed (instance)) { memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); reset_token_timeout (instance); // REVIEWED } else { memb_consensus_notset ( instance, no_consensus_list, &no_consensus_list_entries, instance->my_proc_list, instance->my_proc_list_entries); memb_set_merge (no_consensus_list, no_consensus_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_CONSENSUS_TIMEOUT); } } static void memb_join_message_send (struct totemsrp_instance *instance); static void memb_merge_detect_transmit (struct totemsrp_instance *instance); /* * Timers used for various states of the membership algorithm */ static void timer_function_pause_timeout (void *data) { struct totemsrp_instance *instance = data; instance->pause_timestamp = qb_util_nano_current_get (); reset_pause_timeout (instance); } static void memb_recovery_state_token_loss (struct totemsrp_instance *instance) { old_ring_state_restore (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_RECOVERY_STATE); instance->stats.recovery_token_lost++; } static void timer_function_orf_token_warning (void *data) { struct totemsrp_instance *instance = data; uint64_t tv_diff; /* need to protect against the case where token_warning is set to 0 dynamically */ if (instance->totem_config->token_warning) { tv_diff = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC - instance->stats.token[instance->stats.latest_token].rx; log_printf (instance->totemsrp_log_level_notice, "Token has not been received in %d ms ", (unsigned int) tv_diff); reset_token_warning(instance); } else { cancel_token_warning(instance); } } static void timer_function_orf_token_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the OPERATIONAL state."); log_printf (instance->totemsrp_log_level_notice, "A processor failed, forming new configuration."); totemnet_iface_check (instance->totemnet_context); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_OPERATIONAL_STATE); instance->stats.operational_token_lost++; break; case MEMB_STATE_GATHER: log_printf (instance->totemsrp_log_level_debug, "The consensus timeout expired."); memb_state_consensus_timeout_expired (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED); instance->stats.gather_token_lost++; break; case MEMB_STATE_COMMIT: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the COMMIT state."); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_THE_TOKEN_WAS_LOST_IN_THE_COMMIT_STATE); instance->stats.commit_token_lost++; break; case MEMB_STATE_RECOVERY: log_printf (instance->totemsrp_log_level_debug, "The token was lost in the RECOVERY state."); memb_recovery_state_token_loss (instance); instance->orf_token_discard = 1; break; } } static void timer_function_heartbeat_timeout (void *data) { struct totemsrp_instance *instance = data; log_printf (instance->totemsrp_log_level_debug, "HeartBeat Timer expired Invoking token loss mechanism in state %d ", instance->memb_state); timer_function_orf_token_timeout(data); } static void memb_timer_function_state_gather (void *data) { struct totemsrp_instance *instance = data; int32_t res; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: assert (0); /* this should never happen */ break; case MEMB_STATE_GATHER: case MEMB_STATE_COMMIT: memb_join_message_send (instance); /* * Restart the join timeout `*/ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->join_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "memb_timer_function_state_gather - qb_loop_timer_add error : %d", res); } break; } } static void memb_timer_function_gather_consensus_timeout (void *data) { struct totemsrp_instance *instance = data; memb_state_consensus_timeout_expired (instance); } static void deliver_messages_from_recovery_to_regular (struct totemsrp_instance *instance) { unsigned int i; struct sort_queue_item *recovery_message_item; struct sort_queue_item regular_message_item; unsigned int range = 0; int res; void *ptr; struct mcast *mcast; log_printf (instance->totemsrp_log_level_debug, "recovery to regular %x-%x", SEQNO_START_MSG + 1, instance->my_aru); range = instance->my_aru - SEQNO_START_MSG; /* * Move messages from recovery to regular sort queue */ // todo should i be initialized to 0 or 1 ? for (i = 1; i <= range; i++) { res = sq_item_get (&instance->recovery_sort_queue, i + SEQNO_START_MSG, &ptr); if (res != 0) { continue; } recovery_message_item = ptr; /* * Convert recovery message into regular message */ mcast = recovery_message_item->mcast; if (mcast->header.encapsulated == MESSAGE_ENCAPSULATED) { /* * Message is a recovery message encapsulated * in a new ring message */ regular_message_item.mcast = (struct mcast *)(((char *)recovery_message_item->mcast) + sizeof (struct mcast)); regular_message_item.msg_len = recovery_message_item->msg_len - sizeof (struct mcast); mcast = regular_message_item.mcast; } else { /* * TODO this case shouldn't happen */ continue; } log_printf (instance->totemsrp_log_level_debug, "comparing if ring id is for this processors old ring seqno " CS_PRI_RING_ID_SEQ, (uint64_t)mcast->seq); /* * Only add this message to the regular sort * queue if it was originated with the same ring * id as the previous ring */ if (memcmp (&instance->my_old_ring_id, &mcast->ring_id, sizeof (struct memb_ring_id)) == 0) { res = sq_item_inuse (&instance->regular_sort_queue, mcast->seq); if (res == 0) { sq_item_add (&instance->regular_sort_queue, ®ular_message_item, mcast->seq); if (sq_lt_compare (instance->old_ring_state_high_seq_received, mcast->seq)) { instance->old_ring_state_high_seq_received = mcast->seq; } } } else { log_printf (instance->totemsrp_log_level_debug, "-not adding msg with seq no " CS_PRI_RING_ID_SEQ, (uint64_t)mcast->seq); } } } /* * Change states in the state machine of the membership algorithm */ static void memb_state_operational_enter (struct totemsrp_instance *instance) { struct srp_addr joined_list[PROCESSOR_COUNT_MAX]; int joined_list_entries = 0; unsigned int aru_save; unsigned int joined_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int trans_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int new_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int left_list[PROCESSOR_COUNT_MAX]; unsigned int i; unsigned int res; char left_node_msg[1024]; char joined_node_msg[1024]; char failed_node_msg[1024]; instance->originated_orf_token = 0; memb_consensus_reset (instance); old_ring_state_reset (instance); deliver_messages_from_recovery_to_regular (instance); log_printf (instance->totemsrp_log_level_trace, "Delivering to app %x to %x", instance->my_high_delivered + 1, instance->old_ring_state_high_seq_received); aru_save = instance->my_aru; instance->my_aru = instance->old_ring_state_aru; messages_deliver_to_app (instance, 0, instance->old_ring_state_high_seq_received); /* * Calculate joined and left list */ memb_set_subtract (instance->my_left_memb_list, &instance->my_left_memb_entries, instance->my_memb_list, instance->my_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); memb_set_subtract (joined_list, &joined_list_entries, instance->my_new_memb_list, instance->my_new_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); /* * Install new membership */ instance->my_memb_entries = instance->my_new_memb_entries; memcpy (&instance->my_memb_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->last_released = 0; instance->my_set_retrans_flg = 0; /* * Deliver transitional configuration to application */ srp_addr_to_nodeid (instance, left_list, instance->my_left_memb_list, instance->my_left_memb_entries); srp_addr_to_nodeid (instance, trans_memb_list_totemip, instance->my_trans_memb_list, instance->my_trans_memb_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_TRANSITIONAL, trans_memb_list_totemip, instance->my_trans_memb_entries, left_list, instance->my_left_memb_entries, 0, 0, &instance->my_ring_id); instance->waiting_trans_ack = 1; instance->totemsrp_waiting_trans_ack_cb_fn (1); // TODO we need to filter to ensure we only deliver those // messages which are part of instance->my_deliver_memb messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received); instance->my_aru = aru_save; /* * Deliver regular configuration to application */ srp_addr_to_nodeid (instance, new_memb_list_totemip, instance->my_new_memb_list, instance->my_new_memb_entries); srp_addr_to_nodeid (instance, joined_list_totemip, joined_list, joined_list_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_REGULAR, new_memb_list_totemip, instance->my_new_memb_entries, 0, 0, joined_list_totemip, joined_list_entries, &instance->my_ring_id); /* * The recovery sort queue now becomes the regular * sort queue. It is necessary to copy the state * into the regular sort queue. */ sq_copy (&instance->regular_sort_queue, &instance->recovery_sort_queue); instance->my_last_aru = SEQNO_START_MSG; /* When making my_proc_list smaller, ensure that the * now non-used entries are zero-ed out. There are some suspect * assert's that assume that there is always 2 entries in the list. * These fail when my_proc_list is reduced to 1 entry (and the * valid [0] entry is the same as the 'unused' [1] entry). */ memset(instance->my_proc_list, 0, sizeof (struct srp_addr) * instance->my_proc_list_entries); instance->my_proc_list_entries = instance->my_new_memb_entries; memcpy (instance->my_proc_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->my_failed_list_entries = 0; /* * TODO Not exactly to spec * * At the entry to this function all messages without a gap are * deliered. * * This code throw away messages from the last gap in the sort queue * to my_high_seq_received * * What should really happen is we should deliver all messages up to * a gap, then delier the transitional configuration, then deliver * the messages between the first gap and my_high_seq_received, then * deliver a regular configuration, then deliver the regular * configuration * * Unfortunately totempg doesn't appear to like this operating mode * which needs more inspection */ i = instance->my_high_seq_received + 1; do { void *ptr; i -= 1; res = sq_item_get (&instance->regular_sort_queue, i, &ptr); if (i == 0) { break; } } while (res); instance->my_high_delivered = i; for (i = 0; i <= instance->my_high_delivered; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, i, &ptr); if (res == 0) { struct sort_queue_item *regular_message; regular_message = ptr; free (regular_message->mcast); } } sq_items_release (&instance->regular_sort_queue, instance->my_high_delivered); instance->last_released = instance->my_high_delivered; if (joined_list_entries) { int sptr = 0; sptr += snprintf(joined_node_msg, sizeof(joined_node_msg)-sptr, " joined:"); for (i=0; i< joined_list_entries; i++) { sptr += snprintf(joined_node_msg+sptr, sizeof(joined_node_msg)-sptr, " " CS_PRI_NODE_ID, joined_list_totemip[i]); } } else { joined_node_msg[0] = '\0'; } if (instance->my_left_memb_entries) { int sptr = 0; int sptr2 = 0; sptr += snprintf(left_node_msg, sizeof(left_node_msg)-sptr, " left:"); for (i=0; i< instance->my_left_memb_entries; i++) { sptr += snprintf(left_node_msg+sptr, sizeof(left_node_msg)-sptr, " " CS_PRI_NODE_ID, left_list[i]); } for (i=0; i< instance->my_left_memb_entries; i++) { if (my_leave_memb_match(instance, left_list[i]) == 0) { if (sptr2 == 0) { sptr2 += snprintf(failed_node_msg, sizeof(failed_node_msg)-sptr2, " failed:"); } sptr2 += snprintf(failed_node_msg+sptr2, sizeof(left_node_msg)-sptr2, " " CS_PRI_NODE_ID, left_list[i]); } } if (sptr2 == 0) { failed_node_msg[0] = '\0'; } } else { left_node_msg[0] = '\0'; failed_node_msg[0] = '\0'; } my_leave_memb_clear(instance); log_printf (instance->totemsrp_log_level_debug, "entering OPERATIONAL state."); log_printf (instance->totemsrp_log_level_notice, "A new membership (" CS_PRI_RING_ID ") was formed. Members%s%s", instance->my_ring_id.rep, (uint64_t)instance->my_ring_id.seq, joined_node_msg, left_node_msg); if (strlen(failed_node_msg)) { log_printf (instance->totemsrp_log_level_notice, "Failed to receive the leave message.%s", failed_node_msg); } instance->memb_state = MEMB_STATE_OPERATIONAL; instance->stats.operational_entered++; instance->stats.continuous_gather = 0; instance->my_received_flg = 1; reset_pause_timeout (instance); /* * Save ring id information from this configuration to determine * which processors are transitioning from old regular configuration * in to new regular configuration on the next configuration change */ memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); return; } static void memb_state_gather_enter ( struct totemsrp_instance *instance, enum gather_state_from gather_from) { int32_t res; instance->orf_token_discard = 1; instance->originated_orf_token = 0; memb_set_merge ( &instance->my_id, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_join_message_send (instance); /* * Restart the join timeout */ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->join_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "memb_state_gather_enter - qb_loop_timer_add error(1) : %d", res); } /* * Restart the consensus timeout */ qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED, instance->totem_config->consensus_timeout*QB_TIME_NS_IN_MSEC, (void *)instance, memb_timer_function_gather_consensus_timeout, &instance->memb_timer_state_gather_consensus_timeout); if (res != 0) { log_printf(instance->totemsrp_log_level_error, "memb_state_gather_enter - qb_loop_timer_add error(2) : %d", res); } /* * Cancel the token loss and token retransmission timeouts */ cancel_token_retransmit_timeout (instance); // REVIEWED cancel_token_timeout (instance); // REVIEWED cancel_merge_detect_timeout (instance); memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); log_printf (instance->totemsrp_log_level_debug, "entering GATHER state from %d(%s).", gather_from, gsfrom_to_msg(gather_from)); instance->memb_state = MEMB_STATE_GATHER; instance->stats.gather_entered++; if (gather_from == TOTEMSRP_GSFROM_THE_CONSENSUS_TIMEOUT_EXPIRED) { /* * State 3 means gather, so we are continuously gathering. */ instance->stats.continuous_gather++; } return; } static void timer_function_token_retransmit_timeout (void *data); static void target_set_completed ( void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; memb_state_commit_token_send (instance); } static void memb_state_commit_enter ( struct totemsrp_instance *instance) { old_ring_state_save (instance); memb_state_commit_token_update (instance); memb_state_commit_token_target_set (instance); qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); instance->memb_timer_state_gather_join_timeout = 0; qb_loop_timer_del (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); instance->memb_timer_state_gather_consensus_timeout = 0; memb_ring_id_set (instance, &instance->commit_token->ring_id); instance->memb_ring_id_store (&instance->my_ring_id, instance->my_id.nodeid); instance->token_ring_id_seq = instance->my_ring_id.seq; log_printf (instance->totemsrp_log_level_debug, "entering COMMIT state."); instance->memb_state = MEMB_STATE_COMMIT; reset_token_retransmit_timeout (instance); // REVIEWED reset_token_timeout (instance); // REVIEWED instance->stats.commit_entered++; instance->stats.continuous_gather = 0; /* * reset all flow control variables since we are starting a new ring */ instance->my_trc = 0; instance->my_pbl = 0; instance->my_cbl = 0; /* * commit token sent after callback that token target has been set */ } static void memb_state_recovery_enter ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { int i; int local_received_flg = 1; unsigned int low_ring_aru; unsigned int range = 0; unsigned int messages_originated = 0; const struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; struct memb_ring_id my_new_memb_ring_id_list[PROCESSOR_COUNT_MAX]; addr = (const struct srp_addr *)commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries); log_printf (instance->totemsrp_log_level_debug, "entering RECOVERY state."); instance->orf_token_discard = 0; instance->my_high_ring_delivered = 0; sq_reinit (&instance->recovery_sort_queue, SEQNO_START_MSG); cs_queue_reinit (&instance->retrans_message_queue); low_ring_aru = instance->old_ring_state_high_seq_received; memb_state_commit_token_send_recovery (instance, commit_token); instance->my_token_seq = SEQNO_START_TOKEN - 1; /* * Build regular configuration */ totemnet_processor_count_set ( instance->totemnet_context, commit_token->addr_entries); /* * Build transitional configuration */ for (i = 0; i < instance->my_new_memb_entries; i++) { memcpy (&my_new_memb_ring_id_list[i], &memb_list[i].ring_id, sizeof (struct memb_ring_id)); } memb_set_and_with_ring_id ( instance->my_new_memb_list, my_new_memb_ring_id_list, instance->my_new_memb_entries, instance->my_memb_list, instance->my_memb_entries, &instance->my_old_ring_id, instance->my_trans_memb_list, &instance->my_trans_memb_entries); for (i = 0; i < instance->my_trans_memb_entries; i++) { log_printf (instance->totemsrp_log_level_debug, "TRANS [%d] member " CS_PRI_NODE_ID ":", i, instance->my_trans_memb_list[i].nodeid); } for (i = 0; i < instance->my_new_memb_entries; i++) { log_printf (instance->totemsrp_log_level_debug, "position [%d] member " CS_PRI_NODE_ID ":", i, addr[i].nodeid); log_printf (instance->totemsrp_log_level_debug, "previous ringid (" CS_PRI_RING_ID ")", memb_list[i].ring_id.rep, (uint64_t)memb_list[i].ring_id.seq); log_printf (instance->totemsrp_log_level_debug, "aru %x high delivered %x received flag %d", memb_list[i].aru, memb_list[i].high_delivered, memb_list[i].received_flg); // assert (totemip_print (&memb_list[i].ring_id.rep) != 0); } /* * Determine if any received flag is false */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_trans_memb_list, instance->my_trans_memb_entries) && memb_list[i].received_flg == 0) { instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct srp_addr) * instance->my_trans_memb_entries); local_received_flg = 0; break; } } if (local_received_flg == 1) { goto no_originate; } /* Else originate messages if we should */ /* * Calculate my_low_ring_aru, instance->my_high_ring_delivered for the transitional membership */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) && memcmp (&instance->my_old_ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (memb_list[i].aru, low_ring_aru)) { low_ring_aru = memb_list[i].aru; } if (sq_lt_compare (instance->my_high_ring_delivered, memb_list[i].high_delivered)) { instance->my_high_ring_delivered = memb_list[i].high_delivered; } } } /* * Copy all old ring messages to instance->retrans_message_queue */ range = instance->old_ring_state_high_seq_received - low_ring_aru; if (range == 0) { /* * No messages to copy */ goto no_originate; } assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); log_printf (instance->totemsrp_log_level_debug, "copying all old ring messages from %x-%x.", low_ring_aru + 1, instance->old_ring_state_high_seq_received); for (i = 1; i <= range; i++) { struct sort_queue_item *sort_queue_item; struct message_item message_item; void *ptr; int res; res = sq_item_get (&instance->regular_sort_queue, low_ring_aru + i, &ptr); if (res != 0) { continue; } sort_queue_item = ptr; messages_originated++; memset (&message_item, 0, sizeof (struct message_item)); // TODO LEAK message_item.mcast = totemsrp_buffer_alloc (instance); assert (message_item.mcast); memset(message_item.mcast, 0, sizeof (struct mcast)); message_item.mcast->header.magic = TOTEM_MH_MAGIC; message_item.mcast->header.version = TOTEM_MH_VERSION; message_item.mcast->header.type = MESSAGE_TYPE_MCAST; message_item.mcast->system_from = instance->my_id; message_item.mcast->header.encapsulated = MESSAGE_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.nodeid; assert (message_item.mcast->header.nodeid); memcpy (&message_item.mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); message_item.msg_len = sort_queue_item->msg_len + sizeof (struct mcast); memcpy (((char *)message_item.mcast) + sizeof (struct mcast), sort_queue_item->mcast, sort_queue_item->msg_len); cs_queue_item_add (&instance->retrans_message_queue, &message_item); } log_printf (instance->totemsrp_log_level_debug, "Originated %d messages in RECOVERY.", messages_originated); goto originated; no_originate: log_printf (instance->totemsrp_log_level_debug, "Did not need to originate any messages in recovery."); originated: instance->my_aru = SEQNO_START_MSG; instance->my_aru_count = 0; instance->my_seq_unchanged = 0; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_install_seq = SEQNO_START_MSG; instance->last_released = SEQNO_START_MSG; reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED instance->memb_state = MEMB_STATE_RECOVERY; instance->stats.recovery_entered++; instance->stats.continuous_gather = 0; return; } void totemsrp_event_signal (void *srp_context, enum totem_event_type type, int value) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; token_hold_cancel_send (instance); return; } int totemsrp_mcast ( void *srp_context, struct iovec *iovec, unsigned int iov_len, int guarantee) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int i; struct message_item message_item; char *addr; unsigned int addr_idx; struct cs_queue *queue_use; if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } if (cs_queue_is_full (queue_use)) { log_printf (instance->totemsrp_log_level_debug, "queue full"); return (-1); } memset (&message_item, 0, sizeof (struct message_item)); /* * Allocate pending item */ message_item.mcast = totemsrp_buffer_alloc (instance); if (message_item.mcast == 0) { goto error_mcast; } /* * Set mcast header */ memset(message_item.mcast, 0, sizeof (struct mcast)); message_item.mcast->header.magic = TOTEM_MH_MAGIC; message_item.mcast->header.version = TOTEM_MH_VERSION; message_item.mcast->header.type = MESSAGE_TYPE_MCAST; message_item.mcast->header.encapsulated = MESSAGE_NOT_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.nodeid; assert (message_item.mcast->header.nodeid); message_item.mcast->guarantee = guarantee; message_item.mcast->system_from = instance->my_id; addr = (char *)message_item.mcast; addr_idx = sizeof (struct mcast); for (i = 0; i < iov_len; i++) { memcpy (&addr[addr_idx], iovec[i].iov_base, iovec[i].iov_len); addr_idx += iovec[i].iov_len; } message_item.msg_len = addr_idx; log_printf (instance->totemsrp_log_level_trace, "mcasted message added to pending queue"); instance->stats.mcast_tx++; cs_queue_item_add (queue_use, &message_item); return (0); error_mcast: return (-1); } /* * Determine if there is room to queue a new message */ int totemsrp_avail (void *srp_context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; int avail; struct cs_queue *queue_use; if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } cs_queue_avail (queue_use, &avail); return (avail); } /* * ORF Token Management */ /* * Recast message to mcast group if it is available */ static int orf_token_remcast ( struct totemsrp_instance *instance, int seq) { struct sort_queue_item *sort_queue_item; int res; void *ptr; struct sq *sort_queue; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } res = sq_in_range (sort_queue, seq); if (res == 0) { log_printf (instance->totemsrp_log_level_debug, "sq not in range"); return (-1); } /* * Get RTR item at seq, if not available, return */ res = sq_item_get (sort_queue, seq, &ptr); if (res != 0) { return -1; } sort_queue_item = ptr; totemnet_mcast_noflush_send ( instance->totemnet_context, sort_queue_item->mcast, sort_queue_item->msg_len); return (0); } /* * Free all freeable messages from ring */ static void messages_free ( struct totemsrp_instance *instance, unsigned int token_aru) { struct sort_queue_item *regular_message; unsigned int i; int res; int log_release = 0; unsigned int release_to; unsigned int range = 0; release_to = token_aru; if (sq_lt_compare (instance->my_last_aru, release_to)) { release_to = instance->my_last_aru; } if (sq_lt_compare (instance->my_high_delivered, release_to)) { release_to = instance->my_high_delivered; } /* * Ensure we dont try release before an already released point */ if (sq_lt_compare (release_to, instance->last_released)) { return; } range = release_to - instance->last_released; assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); /* * Release retransmit list items if group aru indicates they are transmitted */ for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, instance->last_released + i, &ptr); if (res == 0) { regular_message = ptr; totemsrp_buffer_release (instance, regular_message->mcast); } sq_items_release (&instance->regular_sort_queue, instance->last_released + i); log_release = 1; } instance->last_released += range; if (log_release) { log_printf (instance->totemsrp_log_level_trace, "releasing messages up to and including %x", release_to); } } static void update_aru ( struct totemsrp_instance *instance) { unsigned int i; int res; struct sq *sort_queue; unsigned int range; unsigned int my_aru_saved = 0; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } range = instance->my_high_seq_received - instance->my_aru; my_aru_saved = instance->my_aru; for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (sort_queue, my_aru_saved + i, &ptr); /* * If hole, stop updating aru */ if (res != 0) { break; } } instance->my_aru += i - 1; } /* * Multicasts pending messages onto the ring (requires orf_token possession) */ static int orf_token_mcast ( struct totemsrp_instance *instance, struct orf_token *token, int fcc_mcasts_allowed) { struct message_item *message_item = 0; struct cs_queue *mcast_queue; struct sq *sort_queue; struct sort_queue_item sort_queue_item; struct mcast *mcast; unsigned int fcc_mcast_current; if (instance->memb_state == MEMB_STATE_RECOVERY) { mcast_queue = &instance->retrans_message_queue; sort_queue = &instance->recovery_sort_queue; reset_token_retransmit_timeout (instance); // REVIEWED } else { if (instance->waiting_trans_ack) { mcast_queue = &instance->new_message_queue_trans; } else { mcast_queue = &instance->new_message_queue; } sort_queue = &instance->regular_sort_queue; } for (fcc_mcast_current = 0; fcc_mcast_current < fcc_mcasts_allowed; fcc_mcast_current++) { if (cs_queue_is_empty (mcast_queue)) { break; } message_item = (struct message_item *)cs_queue_item_get (mcast_queue); message_item->mcast->seq = ++token->seq; message_item->mcast->this_seqno = instance->global_seqno++; /* * Build IO vector */ memset (&sort_queue_item, 0, sizeof (struct sort_queue_item)); sort_queue_item.mcast = message_item->mcast; sort_queue_item.msg_len = message_item->msg_len; mcast = sort_queue_item.mcast; memcpy (&mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); /* * Add message to retransmit queue */ sq_item_add (sort_queue, &sort_queue_item, message_item->mcast->seq); totemnet_mcast_noflush_send ( instance->totemnet_context, message_item->mcast, message_item->msg_len); /* * Delete item from pending queue */ cs_queue_item_remove (mcast_queue); /* * If messages mcasted, deliver any new messages to totempg */ instance->my_high_seq_received = token->seq; } update_aru (instance); /* * Return 1 if more messages are available for single node clusters */ return (fcc_mcast_current); } /* * Remulticasts messages in orf_token's retransmit list (requires orf_token) * Modify's orf_token's rtr to include retransmits required by this process */ static int orf_token_rtr ( struct totemsrp_instance *instance, struct orf_token *orf_token, unsigned int *fcc_allowed) { unsigned int res; unsigned int i, j; unsigned int found; struct sq *sort_queue; struct rtr_item *rtr_list; unsigned int range = 0; char retransmit_msg[1024]; char value[64]; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } rtr_list = &orf_token->rtr_list[0]; strcpy (retransmit_msg, "Retransmit List: "); if (orf_token->rtr_list_entries) { log_printf (instance->totemsrp_log_level_debug, "Retransmit List %d", orf_token->rtr_list_entries); for (i = 0; i < orf_token->rtr_list_entries; i++) { sprintf (value, "%x ", rtr_list[i].seq); strcat (retransmit_msg, value); } strcat (retransmit_msg, ""); log_printf (instance->totemsrp_log_level_notice, "%s", retransmit_msg); } /* * Retransmit messages on orf_token's RTR list from RTR queue */ for (instance->fcc_remcast_current = 0, i = 0; instance->fcc_remcast_current < *fcc_allowed && i < orf_token->rtr_list_entries;) { /* * If this retransmit request isn't from this configuration, * try next rtr entry */ if (memcmp (&rtr_list[i].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { i += 1; continue; } res = orf_token_remcast (instance, rtr_list[i].seq); if (res == 0) { /* * Multicasted message, so no need to copy to new retransmit list */ orf_token->rtr_list_entries -= 1; assert (orf_token->rtr_list_entries >= 0); memmove (&rtr_list[i], &rtr_list[i + 1], sizeof (struct rtr_item) * (orf_token->rtr_list_entries - i)); instance->stats.mcast_retx++; instance->fcc_remcast_current++; } else { i += 1; } } *fcc_allowed = *fcc_allowed - instance->fcc_remcast_current; /* * Add messages to retransmit to RTR list * but only retry if there is room in the retransmit list */ range = orf_token->seq - instance->my_aru; assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); for (i = 1; (orf_token->rtr_list_entries < RETRANSMIT_ENTRIES_MAX) && (i <= range); i++) { /* * Ensure message is within the sort queue range */ res = sq_in_range (sort_queue, instance->my_aru + i); if (res == 0) { break; } /* * Find if a message is missing from this processor */ res = sq_item_inuse (sort_queue, instance->my_aru + i); if (res == 0) { /* * Determine how many times we have missed receiving * this sequence number. sq_item_miss_count increments * a counter for the sequence number. The miss count * will be returned and compared. This allows time for * delayed multicast messages to be received before * declaring the message is missing and requesting a * retransmit. */ res = sq_item_miss_count (sort_queue, instance->my_aru + i); if (res < instance->totem_config->miss_count_const) { continue; } /* * Determine if missing message is already in retransmit list */ found = 0; for (j = 0; j < orf_token->rtr_list_entries; j++) { if (instance->my_aru + i == rtr_list[j].seq) { found = 1; } } if (found == 0) { /* * Missing message not found in current retransmit list so add it */ memcpy (&rtr_list[orf_token->rtr_list_entries].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); rtr_list[orf_token->rtr_list_entries].seq = instance->my_aru + i; orf_token->rtr_list_entries++; } } } return (instance->fcc_remcast_current); } static void token_retransmit (struct totemsrp_instance *instance) { totemnet_token_send (instance->totemnet_context, instance->orf_token_retransmit, instance->orf_token_retransmit_size); } /* * Retransmit the regular token if no mcast or token has * been received in retransmit token period retransmit * the token to the next processor */ static void timer_function_token_retransmit_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); reset_token_retransmit_timeout (instance); // REVIEWED break; } } static void timer_function_token_hold_retransmit_timeout (void *data) { struct totemsrp_instance *instance = data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: break; case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); break; } } static void timer_function_merge_detect_timeout(void *data) { struct totemsrp_instance *instance = data; instance->my_merge_detect_timeout_outstanding = 0; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: if (instance->my_ring_id.rep == instance->my_id.nodeid) { memb_merge_detect_transmit (instance); } break; case MEMB_STATE_GATHER: case MEMB_STATE_COMMIT: case MEMB_STATE_RECOVERY: break; } } /* * Send orf_token to next member (requires orf_token) */ static int token_send ( struct totemsrp_instance *instance, struct orf_token *orf_token, int forward_token) { int res = 0; unsigned int orf_token_size; orf_token_size = sizeof (struct orf_token) + (orf_token->rtr_list_entries * sizeof (struct rtr_item)); orf_token->header.nodeid = instance->my_id.nodeid; memcpy (instance->orf_token_retransmit, orf_token, orf_token_size); instance->orf_token_retransmit_size = orf_token_size; assert (orf_token->header.nodeid); if (forward_token == 0) { return (0); } totemnet_token_send (instance->totemnet_context, orf_token, orf_token_size); return (res); } static int token_hold_cancel_send (struct totemsrp_instance *instance) { struct token_hold_cancel token_hold_cancel; /* * Only cancel if the token is currently held */ if (instance->my_token_held == 0) { return (0); } instance->my_token_held = 0; /* * Build message */ token_hold_cancel.header.magic = TOTEM_MH_MAGIC; token_hold_cancel.header.version = TOTEM_MH_VERSION; token_hold_cancel.header.type = MESSAGE_TYPE_TOKEN_HOLD_CANCEL; token_hold_cancel.header.encapsulated = 0; token_hold_cancel.header.nodeid = instance->my_id.nodeid; memcpy (&token_hold_cancel.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); assert (token_hold_cancel.header.nodeid); instance->stats.token_hold_cancel_tx++; totemnet_mcast_flush_send (instance->totemnet_context, &token_hold_cancel, sizeof (struct token_hold_cancel)); return (0); } static int orf_token_send_initial (struct totemsrp_instance *instance) { struct orf_token orf_token; int res; orf_token.header.magic = TOTEM_MH_MAGIC; orf_token.header.version = TOTEM_MH_VERSION; orf_token.header.type = MESSAGE_TYPE_ORF_TOKEN; orf_token.header.encapsulated = 0; orf_token.header.nodeid = instance->my_id.nodeid; assert (orf_token.header.nodeid); orf_token.seq = SEQNO_START_MSG; orf_token.token_seq = SEQNO_START_TOKEN; orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; instance->stats.orf_token_tx++; if (cs_queue_is_empty (&instance->retrans_message_queue) == 1) { orf_token.retrans_flg = 0; instance->my_set_retrans_flg = 0; } else { orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; } orf_token.aru = 0; orf_token.aru = SEQNO_START_MSG - 1; orf_token.aru_addr = instance->my_id.nodeid; memcpy (&orf_token.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); orf_token.fcc = 0; orf_token.backlog = 0; orf_token.rtr_list_entries = 0; res = token_send (instance, &orf_token, 1); return (res); } static void memb_state_commit_token_update ( struct totemsrp_instance *instance) { struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; unsigned int high_aru; unsigned int i; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries); memcpy (instance->my_new_memb_list, addr, sizeof (struct srp_addr) * instance->commit_token->addr_entries); instance->my_new_memb_entries = instance->commit_token->addr_entries; memcpy (&memb_list[instance->commit_token->memb_index].ring_id, &instance->my_old_ring_id, sizeof (struct memb_ring_id)); memb_list[instance->commit_token->memb_index].aru = instance->old_ring_state_aru; /* * TODO high delivered is really instance->my_aru, but with safe this * could change? */ instance->my_received_flg = (instance->my_aru == instance->my_high_seq_received); memb_list[instance->commit_token->memb_index].received_flg = instance->my_received_flg; memb_list[instance->commit_token->memb_index].high_delivered = instance->my_high_delivered; /* * find high aru up to current memb_index for all matching ring ids * if any ring id matching memb_index has aru less then high aru set * received flag for that entry to false */ high_aru = memb_list[instance->commit_token->memb_index].aru; for (i = 0; i <= instance->commit_token->memb_index; i++) { if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (high_aru, memb_list[i].aru)) { high_aru = memb_list[i].aru; } } } for (i = 0; i <= instance->commit_token->memb_index; i++) { if (memcmp (&memb_list[instance->commit_token->memb_index].ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (memb_list[i].aru, high_aru)) { memb_list[i].received_flg = 0; if (i == instance->commit_token->memb_index) { instance->my_received_flg = 0; } } } } instance->commit_token->header.nodeid = instance->my_id.nodeid; instance->commit_token->memb_index += 1; assert (instance->commit_token->memb_index <= instance->commit_token->addr_entries); assert (instance->commit_token->header.nodeid); } static void memb_state_commit_token_target_set ( struct totemsrp_instance *instance) { struct srp_addr *addr; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; /* Totemnet just looks at the node id */ totemnet_token_target_set ( instance->totemnet_context, addr[instance->commit_token->memb_index % instance->commit_token->addr_entries].nodeid); } static int memb_state_commit_token_send_recovery ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { unsigned int commit_token_size; commit_token->token_seq++; commit_token->header.nodeid = instance->my_id.nodeid; commit_token_size = sizeof (struct memb_commit_token) + ((sizeof (struct srp_addr) + sizeof (struct memb_commit_token_memb_entry)) * commit_token->addr_entries); /* * Make a copy for retransmission if necessary */ memcpy (instance->orf_token_retransmit, commit_token, commit_token_size); instance->orf_token_retransmit_size = commit_token_size; instance->stats.memb_commit_token_tx++; totemnet_token_send (instance->totemnet_context, commit_token, commit_token_size); /* * Request retransmission of the commit token in case it is lost */ reset_token_retransmit_timeout (instance); return (0); } static int memb_state_commit_token_send ( struct totemsrp_instance *instance) { unsigned int commit_token_size; instance->commit_token->token_seq++; instance->commit_token->header.nodeid = instance->my_id.nodeid; commit_token_size = sizeof (struct memb_commit_token) + ((sizeof (struct srp_addr) + sizeof (struct memb_commit_token_memb_entry)) * instance->commit_token->addr_entries); /* * Make a copy for retransmission if necessary */ memcpy (instance->orf_token_retransmit, instance->commit_token, commit_token_size); instance->orf_token_retransmit_size = commit_token_size; instance->stats.memb_commit_token_tx++; totemnet_token_send (instance->totemnet_context, instance->commit_token, commit_token_size); /* * Request retransmission of the commit token in case it is lost */ reset_token_retransmit_timeout (instance); return (0); } static int memb_lowest_in_config (struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; int token_memb_entries = 0; int i; unsigned int lowest_nodeid; memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); /* * find representative by searching for smallest identifier */ assert(token_memb_entries > 0); lowest_nodeid = token_memb[0].nodeid; for (i = 1; i < token_memb_entries; i++) { if (lowest_nodeid > token_memb[i].nodeid) { lowest_nodeid = token_memb[i].nodeid; } } return (lowest_nodeid == instance->my_id.nodeid); } static int srp_addr_compare (const void *a, const void *b) { const struct srp_addr *srp_a = (const struct srp_addr *)a; const struct srp_addr *srp_b = (const struct srp_addr *)b; if (srp_a->nodeid < srp_b->nodeid) { return -1; } else if (srp_a->nodeid > srp_b->nodeid) { return 1; } else { return 0; } } static void memb_state_commit_token_create ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; int token_memb_entries = 0; log_printf (instance->totemsrp_log_level_debug, "Creating commit token because I am the rep."); memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); memset (instance->commit_token, 0, sizeof (struct memb_commit_token)); instance->commit_token->header.magic = TOTEM_MH_MAGIC; instance->commit_token->header.version = TOTEM_MH_VERSION; instance->commit_token->header.type = MESSAGE_TYPE_MEMB_COMMIT_TOKEN; instance->commit_token->header.encapsulated = 0; instance->commit_token->header.nodeid = instance->my_id.nodeid; assert (instance->commit_token->header.nodeid); instance->commit_token->ring_id.rep = instance->my_id.nodeid; instance->commit_token->ring_id.seq = instance->token_ring_id_seq + 4; /* * This qsort is necessary to ensure the commit token traverses * the ring in the proper order */ qsort (token_memb, token_memb_entries, sizeof (struct srp_addr), srp_addr_compare); instance->commit_token->memb_index = 0; instance->commit_token->addr_entries = token_memb_entries; addr = (struct srp_addr *)instance->commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + instance->commit_token->addr_entries); memcpy (addr, token_memb, token_memb_entries * sizeof (struct srp_addr)); memset (memb_list, 0, sizeof (struct memb_commit_token_memb_entry) * token_memb_entries); } static void memb_join_message_send (struct totemsrp_instance *instance) { char memb_join_data[40000]; struct memb_join *memb_join = (struct memb_join *)memb_join_data; char *addr; unsigned int addr_idx; size_t msg_len; memb_join->header.magic = TOTEM_MH_MAGIC; memb_join->header.version = TOTEM_MH_VERSION; memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join->header.encapsulated = 0; memb_join->header.nodeid = instance->my_id.nodeid; assert (memb_join->header.nodeid); msg_len = sizeof(struct memb_join) + ((instance->my_proc_list_entries + instance->my_failed_list_entries) * sizeof(struct srp_addr)); if (msg_len > sizeof(memb_join_data)) { log_printf (instance->totemsrp_log_level_error, "memb_join_message too long. Ignoring message."); return ; } memb_join->ring_seq = instance->my_ring_id.seq; memb_join->proc_list_entries = instance->my_proc_list_entries; memb_join->failed_list_entries = instance->my_failed_list_entries; memb_join->system_from = instance->my_id; /* * This mess adds the joined and failed processor lists into the join * message */ addr = (char *)memb_join; addr_idx = sizeof (struct memb_join); memcpy (&addr[addr_idx], instance->my_proc_list, instance->my_proc_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_proc_list_entries * sizeof (struct srp_addr); memcpy (&addr[addr_idx], instance->my_failed_list, instance->my_failed_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_failed_list_entries * sizeof (struct srp_addr); if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } instance->stats.memb_join_tx++; totemnet_mcast_flush_send ( instance->totemnet_context, memb_join, addr_idx); } static void memb_leave_message_send (struct totemsrp_instance *instance) { char memb_join_data[40000]; struct memb_join *memb_join = (struct memb_join *)memb_join_data; char *addr; unsigned int addr_idx; int active_memb_entries; struct srp_addr active_memb[PROCESSOR_COUNT_MAX]; size_t msg_len; log_printf (instance->totemsrp_log_level_debug, "sending join/leave message"); /* * add us to the failed list, and remove us from * the members list */ memb_set_merge( &instance->my_id, 1, instance->my_failed_list, &instance->my_failed_list_entries); memb_set_subtract (active_memb, &active_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, &instance->my_id, 1); msg_len = sizeof(struct memb_join) + ((active_memb_entries + instance->my_failed_list_entries) * sizeof(struct srp_addr)); if (msg_len > sizeof(memb_join_data)) { log_printf (instance->totemsrp_log_level_error, "memb_leave message too long. Ignoring message."); return ; } memb_join->header.magic = TOTEM_MH_MAGIC; memb_join->header.version = TOTEM_MH_VERSION; memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join->header.encapsulated = 0; memb_join->header.nodeid = LEAVE_DUMMY_NODEID; memb_join->ring_seq = instance->my_ring_id.seq; memb_join->proc_list_entries = active_memb_entries; memb_join->failed_list_entries = instance->my_failed_list_entries; memb_join->system_from = instance->my_id; // TODO: CC Maybe use the actual join send routine. /* * This mess adds the joined and failed processor lists into the join * message */ addr = (char *)memb_join; addr_idx = sizeof (struct memb_join); memcpy (&addr[addr_idx], active_memb, active_memb_entries * sizeof (struct srp_addr)); addr_idx += active_memb_entries * sizeof (struct srp_addr); memcpy (&addr[addr_idx], instance->my_failed_list, instance->my_failed_list_entries * sizeof (struct srp_addr)); addr_idx += instance->my_failed_list_entries * sizeof (struct srp_addr); if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } instance->stats.memb_join_tx++; totemnet_mcast_flush_send ( instance->totemnet_context, memb_join, addr_idx); } static void memb_merge_detect_transmit (struct totemsrp_instance *instance) { struct memb_merge_detect memb_merge_detect; memb_merge_detect.header.magic = TOTEM_MH_MAGIC; memb_merge_detect.header.version = TOTEM_MH_VERSION; memb_merge_detect.header.type = MESSAGE_TYPE_MEMB_MERGE_DETECT; memb_merge_detect.header.encapsulated = 0; memb_merge_detect.header.nodeid = instance->my_id.nodeid; memb_merge_detect.system_from = instance->my_id; memcpy (&memb_merge_detect.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); assert (memb_merge_detect.header.nodeid); instance->stats.memb_merge_detect_tx++; totemnet_mcast_flush_send (instance->totemnet_context, &memb_merge_detect, sizeof (struct memb_merge_detect)); } static void memb_ring_id_set ( struct totemsrp_instance *instance, const struct memb_ring_id *ring_id) { memcpy (&instance->my_ring_id, ring_id, sizeof (struct memb_ring_id)); } int totemsrp_callback_token_create ( void *srp_context, void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)srp_context; struct token_callback_instance *callback_handle; token_hold_cancel_send (instance); callback_handle = malloc (sizeof (struct token_callback_instance)); if (callback_handle == 0) { return (-1); } *handle_out = (void *)callback_handle; qb_list_init (&callback_handle->list); callback_handle->callback_fn = callback_fn; callback_handle->data = (void *) data; callback_handle->callback_type = type; callback_handle->delete = delete; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: qb_list_add (&callback_handle->list, &instance->token_callback_received_listhead); break; case TOTEM_CALLBACK_TOKEN_SENT: qb_list_add (&callback_handle->list, &instance->token_callback_sent_listhead); break; } return (0); } void totemsrp_callback_token_destroy (void *srp_context, void **handle_out) { struct token_callback_instance *h; if (*handle_out) { h = (struct token_callback_instance *)*handle_out; qb_list_del (&h->list); free (h); h = NULL; *handle_out = 0; } } static void token_callbacks_execute ( struct totemsrp_instance *instance, enum totem_callback_token_type type) { struct qb_list_head *list, *tmp_iter; struct qb_list_head *callback_listhead = 0; struct token_callback_instance *token_callback_instance; int res; int del; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: callback_listhead = &instance->token_callback_received_listhead; break; case TOTEM_CALLBACK_TOKEN_SENT: callback_listhead = &instance->token_callback_sent_listhead; break; default: assert (0); } qb_list_for_each_safe(list, tmp_iter, callback_listhead) { token_callback_instance = qb_list_entry (list, struct token_callback_instance, list); del = token_callback_instance->delete; if (del == 1) { qb_list_del (list); } res = token_callback_instance->callback_fn ( token_callback_instance->callback_type, token_callback_instance->data); /* * This callback failed to execute, try it again on the next token */ if (res == -1 && del == 1) { qb_list_add (list, callback_listhead); } else if (del) { free (token_callback_instance); } } } /* * Flow control functions */ static unsigned int backlog_get (struct totemsrp_instance *instance) { unsigned int backlog = 0; struct cs_queue *queue_use = NULL; if (instance->memb_state == MEMB_STATE_OPERATIONAL) { if (instance->waiting_trans_ack) { queue_use = &instance->new_message_queue_trans; } else { queue_use = &instance->new_message_queue; } } else if (instance->memb_state == MEMB_STATE_RECOVERY) { queue_use = &instance->retrans_message_queue; } if (queue_use != NULL) { backlog = cs_queue_used (queue_use); } instance->stats.token[instance->stats.latest_token].backlog_calc = backlog; return (backlog); } static int fcc_calculate ( struct totemsrp_instance *instance, struct orf_token *token) { unsigned int transmits_allowed; unsigned int backlog_calc; transmits_allowed = instance->totem_config->max_messages; if (transmits_allowed > instance->totem_config->window_size - token->fcc) { transmits_allowed = instance->totem_config->window_size - token->fcc; } instance->my_cbl = backlog_get (instance); /* * Only do backlog calculation if there is a backlog otherwise * we would result in div by zero */ if (token->backlog + instance->my_cbl - instance->my_pbl) { backlog_calc = (instance->totem_config->window_size * instance->my_pbl) / (token->backlog + instance->my_cbl - instance->my_pbl); if (backlog_calc > 0 && transmits_allowed > backlog_calc) { transmits_allowed = backlog_calc; } } return (transmits_allowed); } /* * don't overflow the RTR sort queue */ static void fcc_rtr_limit ( struct totemsrp_instance *instance, struct orf_token *token, unsigned int *transmits_allowed) { int check = QUEUE_RTR_ITEMS_SIZE_MAX; check -= (*transmits_allowed + instance->totem_config->window_size); assert (check >= 0); if (sq_lt_compare (instance->last_released + QUEUE_RTR_ITEMS_SIZE_MAX - *transmits_allowed - instance->totem_config->window_size, token->seq)) { *transmits_allowed = 0; } } static void fcc_token_update ( struct totemsrp_instance *instance, struct orf_token *token, unsigned int msgs_transmitted) { token->fcc += msgs_transmitted - instance->my_trc; token->backlog += instance->my_cbl - instance->my_pbl; instance->my_trc = msgs_transmitted; instance->my_pbl = instance->my_cbl; } /* * Sanity checkers */ static int check_orf_token_sanity( const struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { int rtr_entries; const struct orf_token *token = (const struct orf_token *)msg; size_t required_len; if (msg_len < sizeof(struct orf_token)) { log_printf (instance->totemsrp_log_level_security, "Received orf_token message is too short... ignoring."); return (-1); } if (endian_conversion_needed) { rtr_entries = swab32(token->rtr_list_entries); } else { rtr_entries = token->rtr_list_entries; } required_len = sizeof(struct orf_token) + rtr_entries * sizeof(struct rtr_item); if (msg_len < required_len) { log_printf (instance->totemsrp_log_level_security, "Received orf_token message is too short... ignoring."); return (-1); } return (0); } static int check_mcast_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { if (msg_len < sizeof(struct mcast)) { log_printf (instance->totemsrp_log_level_security, "Received mcast message is too short... ignoring."); return (-1); } return (0); } static int check_memb_merge_detect_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { if (msg_len < sizeof(struct memb_merge_detect)) { log_printf (instance->totemsrp_log_level_security, "Received memb_merge_detect message is too short... ignoring."); return (-1); } return (0); } static int check_memb_join_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct memb_join *mj_msg = (const struct memb_join *)msg; unsigned int proc_list_entries; unsigned int failed_list_entries; size_t required_len; if (msg_len < sizeof(struct memb_join)) { log_printf (instance->totemsrp_log_level_security, "Received memb_join message is too short... ignoring."); return (-1); } proc_list_entries = mj_msg->proc_list_entries; failed_list_entries = mj_msg->failed_list_entries; if (endian_conversion_needed) { proc_list_entries = swab32(proc_list_entries); failed_list_entries = swab32(failed_list_entries); } required_len = sizeof(struct memb_join) + ((proc_list_entries + failed_list_entries) * sizeof(struct srp_addr)); if (msg_len < required_len) { log_printf (instance->totemsrp_log_level_security, "Received memb_join message is too short... ignoring."); return (-1); } return (0); } static int check_memb_commit_token_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct memb_commit_token *mct_msg = (const struct memb_commit_token *)msg; unsigned int addr_entries; size_t required_len; if (msg_len < sizeof(struct memb_commit_token)) { log_printf (instance->totemsrp_log_level_security, "Received memb_commit_token message is too short... ignoring."); return (0); } addr_entries= mct_msg->addr_entries; if (endian_conversion_needed) { addr_entries = swab32(addr_entries); } required_len = sizeof(struct memb_commit_token) + (addr_entries * (sizeof(struct srp_addr) + sizeof(struct memb_commit_token_memb_entry))); if (msg_len < required_len) { log_printf (instance->totemsrp_log_level_security, "Received memb_commit_token message is too short... ignoring."); return (-1); } return (0); } static int check_token_hold_cancel_sanity( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { if (msg_len < sizeof(struct token_hold_cancel)) { log_printf (instance->totemsrp_log_level_security, "Received token_hold_cancel message is too short... ignoring."); return (-1); } return (0); } /* * Message Handlers */ unsigned long long int tv_old; /* * message handler called when TOKEN message type received */ static int message_handler_orf_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { char token_storage[1500]; char token_convert[1500]; struct orf_token *token = NULL; int forward_token; unsigned int transmits_allowed; unsigned int mcasted_retransmit; unsigned int mcasted_regular; unsigned int last_aru; #ifdef GIVEINFO unsigned long long tv_current; unsigned long long tv_diff; tv_current = qb_util_nano_current_get (); tv_diff = tv_current - tv_old; tv_old = tv_current; log_printf (instance->totemsrp_log_level_debug, "Time since last token %0.4f ms", ((float)tv_diff) / 1000000.0); #endif if (check_orf_token_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (instance->orf_token_discard) { return (0); } #ifdef TEST_DROP_ORF_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_ORF_TOKEN_PERCENTAGE) { return (0); } #endif if (endian_conversion_needed) { orf_token_endian_convert ((struct orf_token *)msg, (struct orf_token *)token_convert); msg = (struct orf_token *)token_convert; } /* * Make copy of token and retransmit list in case we have * to flush incoming messages from the kernel queue */ token = (struct orf_token *)token_storage; memcpy (token, msg, sizeof (struct orf_token)); memcpy (&token->rtr_list[0], (char *)msg + sizeof (struct orf_token), sizeof (struct rtr_item) * RETRANSMIT_ENTRIES_MAX); /* * Handle merge detection timeout */ if (token->seq == instance->my_last_seq) { start_merge_detect_timeout (instance); instance->my_seq_unchanged += 1; } else { cancel_merge_detect_timeout (instance); cancel_token_hold_retransmit_timeout (instance); instance->my_seq_unchanged = 0; } instance->my_last_seq = token->seq; #ifdef TEST_RECOVERY_MSG_COUNT if (instance->memb_state == MEMB_STATE_OPERATIONAL && token->seq > TEST_RECOVERY_MSG_COUNT) { return (0); } #endif instance->flushing = 1; totemnet_recv_flush (instance->totemnet_context); instance->flushing = 0; /* * Determine if we should hold (in reality drop) the token */ instance->my_token_held = 0; if (instance->my_ring_id.rep == instance->my_id.nodeid && instance->my_seq_unchanged > instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } else { if (instance->my_ring_id.rep != instance->my_id.nodeid && instance->my_seq_unchanged >= instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } } /* * Hold onto token when there is no activity on ring and * this processor is the ring rep */ forward_token = 1; if (instance->my_ring_id.rep == instance->my_id.nodeid) { if (instance->my_token_held) { forward_token = 0; } } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_RECEIVED); switch (instance->memb_state) { case MEMB_STATE_COMMIT: /* Discard token */ break; case MEMB_STATE_OPERATIONAL: messages_free (instance, token->aru); /* * Do NOT add break, this case should also execute code in gather case. */ case MEMB_STATE_GATHER: /* * DO NOT add break, we use different free mechanism in recovery state */ case MEMB_STATE_RECOVERY: /* * Discard tokens from another configuration */ if (memcmp (&token->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); /* discard token */ } /* * Discard retransmitted tokens */ if (sq_lte_compare (token->token_seq, instance->my_token_seq)) { return (0); /* discard token */ } last_aru = instance->my_last_aru; instance->my_last_aru = token->aru; transmits_allowed = fcc_calculate (instance, token); mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed); if (instance->my_token_held == 1 && (token->rtr_list_entries > 0 || mcasted_retransmit > 0)) { instance->my_token_held = 0; forward_token = 1; } fcc_rtr_limit (instance, token, &transmits_allowed); mcasted_regular = orf_token_mcast (instance, token, transmits_allowed); /* if (mcasted_regular) { printf ("mcasted regular %d\n", mcasted_regular); printf ("token seq %d\n", token->seq); } */ fcc_token_update (instance, token, mcasted_retransmit + mcasted_regular); if (sq_lt_compare (instance->my_aru, token->aru) || instance->my_id.nodeid == token->aru_addr || token->aru_addr == 0) { token->aru = instance->my_aru; if (token->aru == token->seq) { token->aru_addr = 0; } else { token->aru_addr = instance->my_id.nodeid; } } if (token->aru == last_aru && token->aru_addr != 0) { instance->my_aru_count += 1; } else { instance->my_aru_count = 0; } /* * We really don't follow specification there. In specification, OTHER nodes * detect failure of one node (based on aru_count) and my_id IS NEVER added * to failed list (so node never mark itself as failed) */ if (instance->my_aru_count > instance->totem_config->fail_to_recv_const && token->aru_addr == instance->my_id.nodeid) { log_printf (instance->totemsrp_log_level_error, "FAILED TO RECEIVE"); instance->failed_to_recv = 1; memb_set_merge (&instance->my_id, 1, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FAILED_TO_RECEIVE); } else { instance->my_token_seq = token->token_seq; token->token_seq += 1; if (instance->memb_state == MEMB_STATE_RECOVERY) { /* * instance->my_aru == instance->my_high_seq_received means this processor * has recovered all messages it can recover * (ie: its retrans queue is empty) */ if (cs_queue_is_empty (&instance->retrans_message_queue) == 0) { if (token->retrans_flg == 0) { token->retrans_flg = 1; instance->my_set_retrans_flg = 1; } } else if (token->retrans_flg == 1 && instance->my_set_retrans_flg) { token->retrans_flg = 0; instance->my_set_retrans_flg = 0; } log_printf (instance->totemsrp_log_level_debug, "token retrans flag is %d my set retrans flag%d retrans queue empty %d count %d, aru %x", token->retrans_flg, instance->my_set_retrans_flg, cs_queue_is_empty (&instance->retrans_message_queue), instance->my_retrans_flg_count, token->aru); if (token->retrans_flg == 0) { instance->my_retrans_flg_count += 1; } else { instance->my_retrans_flg_count = 0; } if (instance->my_retrans_flg_count == 2) { instance->my_install_seq = token->seq; } log_printf (instance->totemsrp_log_level_debug, "install seq %x aru %x high seq received %x", instance->my_install_seq, instance->my_aru, instance->my_high_seq_received); if (instance->my_retrans_flg_count >= 2 && instance->my_received_flg == 0 && sq_lte_compare (instance->my_install_seq, instance->my_aru)) { instance->my_received_flg = 1; instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct totem_ip_address) * instance->my_trans_memb_entries); } if (instance->my_retrans_flg_count >= 3 && sq_lte_compare (instance->my_install_seq, token->aru)) { instance->my_rotation_counter += 1; } else { instance->my_rotation_counter = 0; } if (instance->my_rotation_counter == 2) { log_printf (instance->totemsrp_log_level_debug, "retrans flag count %x token aru %x install seq %x aru %x %x", instance->my_retrans_flg_count, token->aru, instance->my_install_seq, instance->my_aru, token->seq); memb_state_operational_enter (instance); instance->my_rotation_counter = 0; instance->my_retrans_flg_count = 0; } } totemnet_send_flush (instance->totemnet_context); token_send (instance, token, forward_token); #ifdef GIVEINFO tv_current = qb_util_nano_current_get (); tv_diff = tv_current - tv_old; tv_old = tv_current; log_printf (instance->totemsrp_log_level_debug, "I held %0.4f ms", ((float)tv_diff) / 1000000.0); #endif if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* * Deliver messages after token has been transmitted * to improve performance */ reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED if (instance->my_id.nodeid == instance->my_ring_id.rep && instance->my_token_held == 1) { start_token_hold_retransmit_timeout (instance); } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_SENT); } break; } if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); } static void messages_deliver_to_app ( struct totemsrp_instance *instance, int skip, unsigned int end_point) { struct sort_queue_item *sort_queue_item_p; unsigned int i; int res; struct mcast *mcast_in; struct mcast mcast_header; unsigned int range = 0; int endian_conversion_required; unsigned int my_high_delivered_stored = 0; struct srp_addr aligned_system_from; range = end_point - instance->my_high_delivered; if (range) { log_printf (instance->totemsrp_log_level_trace, "Delivering %x to %x", instance->my_high_delivered, end_point); } assert (range < QUEUE_RTR_ITEMS_SIZE_MAX); my_high_delivered_stored = instance->my_high_delivered; /* * Deliver messages in order from rtr queue to pending delivery queue */ for (i = 1; i <= range; i++) { void *ptr = 0; /* * If out of range of sort queue, stop assembly */ res = sq_in_range (&instance->regular_sort_queue, my_high_delivered_stored + i); if (res == 0) { break; } res = sq_item_get (&instance->regular_sort_queue, my_high_delivered_stored + i, &ptr); /* * If hole, stop assembly */ if (res != 0 && skip == 0) { break; } instance->my_high_delivered = my_high_delivered_stored + i; if (res != 0) { continue; } sort_queue_item_p = ptr; mcast_in = sort_queue_item_p->mcast; assert (mcast_in != (struct mcast *)0xdeadbeef); endian_conversion_required = 0; if (mcast_in->header.magic != TOTEM_MH_MAGIC) { endian_conversion_required = 1; mcast_endian_convert (mcast_in, &mcast_header); } else { memcpy (&mcast_header, mcast_in, sizeof (struct mcast)); } aligned_system_from = mcast_header.system_from; /* * Skip messages not originated in instance->my_deliver_memb */ if (skip && memb_set_subset (&aligned_system_from, 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) == 0) { instance->my_high_delivered = my_high_delivered_stored + i; continue; } /* * Message found */ log_printf (instance->totemsrp_log_level_trace, "Delivering MCAST message with seq %x to pending delivery queue", mcast_header.seq); /* * Message is locally originated multicast */ instance->totemsrp_deliver_fn ( mcast_header.header.nodeid, ((char *)sort_queue_item_p->mcast) + sizeof (struct mcast), sort_queue_item_p->msg_len - sizeof (struct mcast), endian_conversion_required); } } /* * recv message handler called when MCAST message type received */ static int message_handler_mcast ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct sort_queue_item sort_queue_item; struct sq *sort_queue; struct mcast mcast_header; struct srp_addr aligned_system_from; if (check_mcast_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (endian_conversion_needed) { mcast_endian_convert (msg, &mcast_header); } else { memcpy (&mcast_header, msg, sizeof (struct mcast)); } if (mcast_header.header.encapsulated == MESSAGE_ENCAPSULATED) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } assert (msg_len <= FRAME_SIZE_MAX); #ifdef TEST_DROP_MCAST_PERCENTAGE if (random()%100 < TEST_DROP_MCAST_PERCENTAGE) { return (0); } #endif /* * If the message is foreign execute the switch below */ if (memcmp (&instance->my_ring_id, &mcast_header.ring_id, sizeof (struct memb_ring_id)) != 0) { aligned_system_from = mcast_header.system_from; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge ( &aligned_system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_OPERATIONAL_STATE); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &aligned_system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&aligned_system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_FOREIGN_MESSAGE_IN_GATHER_STATE); return (0); } break; case MEMB_STATE_COMMIT: /* discard message */ instance->stats.rx_msg_dropped++; break; case MEMB_STATE_RECOVERY: /* discard message */ instance->stats.rx_msg_dropped++; break; } return (0); } log_printf (instance->totemsrp_log_level_trace, "Received ringid (" CS_PRI_RING_ID ") seq %x", mcast_header.ring_id.rep, (uint64_t)mcast_header.ring_id.seq, mcast_header.seq); /* * Add mcast message to rtr queue if not already in rtr queue * otherwise free io vectors */ if (msg_len > 0 && msg_len <= FRAME_SIZE_MAX && sq_in_range (sort_queue, mcast_header.seq) && sq_item_inuse (sort_queue, mcast_header.seq) == 0) { /* * Allocate new multicast memory block */ // TODO LEAK sort_queue_item.mcast = totemsrp_buffer_alloc (instance); if (sort_queue_item.mcast == NULL) { return (-1); /* error here is corrected by the algorithm */ } memcpy (sort_queue_item.mcast, msg, msg_len); sort_queue_item.msg_len = msg_len; if (sq_lt_compare (instance->my_high_seq_received, mcast_header.seq)) { instance->my_high_seq_received = mcast_header.seq; } sq_item_add (sort_queue, &sort_queue_item, mcast_header.seq); } update_aru (instance); if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* TODO remove from retrans message queue for old ring in recovery state */ return (0); } static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct memb_merge_detect memb_merge_detect; struct srp_addr aligned_system_from; if (check_memb_merge_detect_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (endian_conversion_needed) { memb_merge_detect_endian_convert (msg, &memb_merge_detect); } else { memcpy (&memb_merge_detect, msg, sizeof (struct memb_merge_detect)); } /* * do nothing if this is a merge detect from this configuration */ if (memcmp (&instance->my_ring_id, &memb_merge_detect.ring_id, sizeof (struct memb_ring_id)) == 0) { return (0); } aligned_system_from = memb_merge_detect.system_from; /* * Execute merge operation */ switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge (&aligned_system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_OPERATIONAL_STATE); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &aligned_system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&aligned_system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_GATHER_STATE); return (0); } break; case MEMB_STATE_COMMIT: /* do nothing in commit */ break; case MEMB_STATE_RECOVERY: /* do nothing in recovery */ break; } return (0); } static void memb_join_process ( struct totemsrp_instance *instance, const struct memb_join *memb_join) { struct srp_addr *proc_list; struct srp_addr *failed_list; int gather_entered = 0; int fail_minus_memb_entries = 0; struct srp_addr fail_minus_memb[PROCESSOR_COUNT_MAX]; struct srp_addr aligned_system_from; proc_list = (struct srp_addr *)memb_join->end_of_memb_join; failed_list = proc_list + memb_join->proc_list_entries; aligned_system_from = memb_join->system_from; log_printf(instance->totemsrp_log_level_trace, "memb_join_process"); memb_set_log(instance, instance->totemsrp_log_level_trace, "proclist", proc_list, memb_join->proc_list_entries); memb_set_log(instance, instance->totemsrp_log_level_trace, "faillist", failed_list, memb_join->failed_list_entries); memb_set_log(instance, instance->totemsrp_log_level_trace, "my_proclist", instance->my_proc_list, instance->my_proc_list_entries); memb_set_log(instance, instance->totemsrp_log_level_trace, "my_faillist", instance->my_failed_list, instance->my_failed_list_entries); if (memb_join->header.type == MESSAGE_TYPE_MEMB_JOIN) { if (instance->flushing) { if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) { log_printf (instance->totemsrp_log_level_warning, "Discarding LEAVE message during flush, nodeid=" CS_PRI_NODE_ID, memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].nodeid : LEAVE_DUMMY_NODEID); if (memb_join->failed_list_entries > 0) { my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].nodeid); } } else { log_printf (instance->totemsrp_log_level_warning, "Discarding JOIN message during flush, nodeid=" CS_PRI_NODE_ID, memb_join->header.nodeid); } return; } else { if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) { log_printf (instance->totemsrp_log_level_debug, "Received LEAVE message from " CS_PRI_NODE_ID, memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].nodeid : LEAVE_DUMMY_NODEID); if (memb_join->failed_list_entries > 0) { my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].nodeid); } } } } if (memb_set_equal (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_equal (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { if (memb_join->header.nodeid != LEAVE_DUMMY_NODEID) { memb_consensus_set (instance, &aligned_system_from); } if (memb_consensus_agreed (instance) && instance->failed_to_recv == 1) { instance->failed_to_recv = 0; instance->my_proc_list[0] = instance->my_id; instance->my_proc_list_entries = 1; instance->my_failed_list_entries = 0; memb_state_commit_token_create (instance); memb_state_commit_enter (instance); return; } if (memb_consensus_agreed (instance) && memb_lowest_in_config (instance)) { memb_state_commit_token_create (instance); memb_state_commit_enter (instance); } else { goto out; } } else if (memb_set_subset (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_subset (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { goto out; } else if (memb_set_subset (&aligned_system_from, 1, instance->my_failed_list, instance->my_failed_list_entries)) { goto out; } else { memb_set_merge (proc_list, memb_join->proc_list_entries, instance->my_proc_list, &instance->my_proc_list_entries); if (memb_set_subset ( &instance->my_id, 1, failed_list, memb_join->failed_list_entries)) { memb_set_merge ( &aligned_system_from, 1, instance->my_failed_list, &instance->my_failed_list_entries); } else { if (memb_set_subset ( &aligned_system_from, 1, instance->my_memb_list, instance->my_memb_entries)) { if (memb_set_subset ( &aligned_system_from, 1, instance->my_failed_list, instance->my_failed_list_entries) == 0) { memb_set_merge (failed_list, memb_join->failed_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); } else { memb_set_subtract (fail_minus_memb, &fail_minus_memb_entries, failed_list, memb_join->failed_list_entries, instance->my_memb_list, instance->my_memb_entries); memb_set_merge (fail_minus_memb, fail_minus_memb_entries, instance->my_failed_list, &instance->my_failed_list_entries); } } } memb_state_gather_enter (instance, TOTEMSRP_GSFROM_MERGE_DURING_JOIN); gather_entered = 1; } out: if (gather_entered == 0 && instance->memb_state == MEMB_STATE_OPERATIONAL) { memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_OPERATIONAL_STATE); } } static void memb_join_endian_convert (const struct memb_join *in, struct memb_join *out) { int i; struct srp_addr *in_proc_list; struct srp_addr *in_failed_list; struct srp_addr *out_proc_list; struct srp_addr *out_failed_list; out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->system_from = srp_addr_endian_convert(in->system_from); out->proc_list_entries = swab32 (in->proc_list_entries); out->failed_list_entries = swab32 (in->failed_list_entries); out->ring_seq = swab64 (in->ring_seq); in_proc_list = (struct srp_addr *)in->end_of_memb_join; in_failed_list = in_proc_list + out->proc_list_entries; out_proc_list = (struct srp_addr *)out->end_of_memb_join; out_failed_list = out_proc_list + out->proc_list_entries; for (i = 0; i < out->proc_list_entries; i++) { out_proc_list[i] = srp_addr_endian_convert (in_proc_list[i]); } for (i = 0; i < out->failed_list_entries; i++) { out_failed_list[i] = srp_addr_endian_convert (in_failed_list[i]); } } static void memb_commit_token_endian_convert (const struct memb_commit_token *in, struct memb_commit_token *out) { int i; struct srp_addr *in_addr = (struct srp_addr *)in->end_of_commit_token; struct srp_addr *out_addr = (struct srp_addr *)out->end_of_commit_token; struct memb_commit_token_memb_entry *in_memb_list; struct memb_commit_token_memb_entry *out_memb_list; out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->token_seq = swab32 (in->token_seq); out->ring_id.rep = swab32(in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->retrans_flg = swab32 (in->retrans_flg); out->memb_index = swab32 (in->memb_index); out->addr_entries = swab32 (in->addr_entries); in_memb_list = (struct memb_commit_token_memb_entry *)(in_addr + out->addr_entries); out_memb_list = (struct memb_commit_token_memb_entry *)(out_addr + out->addr_entries); for (i = 0; i < out->addr_entries; i++) { out_addr[i] = srp_addr_endian_convert (in_addr[i]); /* * Only convert the memb entry if it has been set */ if (in_memb_list[i].ring_id.rep != 0) { out_memb_list[i].ring_id.rep = swab32(in_memb_list[i].ring_id.rep); out_memb_list[i].ring_id.seq = swab64 (in_memb_list[i].ring_id.seq); out_memb_list[i].aru = swab32 (in_memb_list[i].aru); out_memb_list[i].high_delivered = swab32 (in_memb_list[i].high_delivered); out_memb_list[i].received_flg = swab32 (in_memb_list[i].received_flg); } } } static void orf_token_endian_convert (const struct orf_token *in, struct orf_token *out) { int i; out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->seq = swab32 (in->seq); out->token_seq = swab32 (in->token_seq); out->aru = swab32 (in->aru); out->ring_id.rep = swab32(in->ring_id.rep); out->aru_addr = swab32(in->aru_addr); out->ring_id.seq = swab64 (in->ring_id.seq); out->fcc = swab32 (in->fcc); out->backlog = swab32 (in->backlog); out->retrans_flg = swab32 (in->retrans_flg); out->rtr_list_entries = swab32 (in->rtr_list_entries); for (i = 0; i < out->rtr_list_entries; i++) { out->rtr_list[i].ring_id.rep = swab32(in->rtr_list[i].ring_id.rep); out->rtr_list[i].ring_id.seq = swab64 (in->rtr_list[i].ring_id.seq); out->rtr_list[i].seq = swab32 (in->rtr_list[i].seq); } } static void mcast_endian_convert (const struct mcast *in, struct mcast *out) { out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->header.encapsulated = in->header.encapsulated; out->seq = swab32 (in->seq); out->this_seqno = swab32 (in->this_seqno); out->ring_id.rep = swab32(in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->node_id = swab32 (in->node_id); out->guarantee = swab32 (in->guarantee); out->system_from = srp_addr_endian_convert(in->system_from); } static void memb_merge_detect_endian_convert ( const struct memb_merge_detect *in, struct memb_merge_detect *out) { out->header.magic = TOTEM_MH_MAGIC; out->header.version = TOTEM_MH_VERSION; out->header.type = in->header.type; out->header.nodeid = swab32 (in->header.nodeid); out->ring_id.rep = swab32(in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->system_from = srp_addr_endian_convert (in->system_from); } static int ignore_join_under_operational ( struct totemsrp_instance *instance, const struct memb_join *memb_join) { struct srp_addr *proc_list; struct srp_addr *failed_list; unsigned long long ring_seq; struct srp_addr aligned_system_from; proc_list = (struct srp_addr *)memb_join->end_of_memb_join; failed_list = proc_list + memb_join->proc_list_entries; ring_seq = memb_join->ring_seq; aligned_system_from = memb_join->system_from; if (memb_set_subset (&instance->my_id, 1, failed_list, memb_join->failed_list_entries)) { return (1); } /* * In operational state, my_proc_list is exactly the same as * my_memb_list. */ if ((memb_set_subset (&aligned_system_from, 1, instance->my_memb_list, instance->my_memb_entries)) && (ring_seq < instance->my_ring_id.seq)) { return (1); } return (0); } static int message_handler_memb_join ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct memb_join *memb_join; struct memb_join *memb_join_convert = alloca (msg_len); struct srp_addr aligned_system_from; if (check_memb_join_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (endian_conversion_needed) { memb_join = memb_join_convert; memb_join_endian_convert (msg, memb_join_convert); } else { memb_join = msg; } aligned_system_from = memb_join->system_from; /* * If the process paused because it wasn't scheduled in a timely * fashion, flush the join messages because they may be queued * entries */ if (pause_flush (instance)) { return (0); } if (instance->token_ring_id_seq < memb_join->ring_seq) { instance->token_ring_id_seq = memb_join->ring_seq; } switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: if (!ignore_join_under_operational (instance, memb_join)) { memb_join_process (instance, memb_join); } break; case MEMB_STATE_GATHER: memb_join_process (instance, memb_join); break; case MEMB_STATE_COMMIT: if (memb_set_subset (&aligned_system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_COMMIT_STATE); } break; case MEMB_STATE_RECOVERY: if (memb_set_subset (&aligned_system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_recovery_state_token_loss (instance); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_JOIN_DURING_RECOVERY); } break; } return (0); } static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { struct memb_commit_token *memb_commit_token_convert = alloca (msg_len); struct memb_commit_token *memb_commit_token; struct srp_addr sub[PROCESSOR_COUNT_MAX]; int sub_entries; struct srp_addr *addr; log_printf (instance->totemsrp_log_level_debug, "got commit token"); if (check_memb_commit_token_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (endian_conversion_needed) { memb_commit_token_endian_convert (msg, memb_commit_token_convert); } else { memcpy (memb_commit_token_convert, msg, msg_len); } memb_commit_token = memb_commit_token_convert; addr = (struct srp_addr *)memb_commit_token->end_of_commit_token; #ifdef TEST_DROP_COMMIT_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_COMMIT_TOKEN_PERCENTAGE) { return (0); } #endif switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: /* discard token */ break; case MEMB_STATE_GATHER: memb_set_subtract (sub, &sub_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); if (memb_set_equal (addr, memb_commit_token->addr_entries, sub, sub_entries) && memb_commit_token->ring_id.seq > instance->my_ring_id.seq) { memcpy (instance->commit_token, memb_commit_token, msg_len); memb_state_commit_enter (instance); } break; case MEMB_STATE_COMMIT: /* * If retransmitted commit tokens are sent on this ring * filter them out and only enter recovery once the * commit token has traversed the array. This is * determined by : * memb_commit_token->memb_index == memb_commit_token->addr_entries) { */ if (memb_commit_token->ring_id.seq == instance->my_ring_id.seq && memb_commit_token->memb_index == memb_commit_token->addr_entries) { memb_state_recovery_enter (instance, memb_commit_token); } break; case MEMB_STATE_RECOVERY: if (instance->my_id.nodeid == instance->my_ring_id.rep) { /* Filter out duplicated tokens */ if (instance->originated_orf_token) { break; } instance->originated_orf_token = 1; log_printf (instance->totemsrp_log_level_debug, "Sending initial ORF token"); // TODO convert instead of initiate orf_token_send_initial (instance); reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED } break; } return (0); } static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, const void *msg, size_t msg_len, int endian_conversion_needed) { const struct token_hold_cancel *token_hold_cancel = msg; if (check_token_hold_cancel_sanity(instance, msg, msg_len, endian_conversion_needed) == -1) { return (0); } if (memcmp (&token_hold_cancel->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) == 0) { instance->my_seq_unchanged = 0; if (instance->my_ring_id.rep == instance->my_id.nodeid) { timer_function_token_retransmit_timeout (instance); } } return (0); } static int check_message_header_validity( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from) { struct totemsrp_instance *instance = context; const struct totem_message_header *message_header = msg; const char *guessed_str; const char *msg_byte = msg; if (msg_len < sizeof (struct totem_message_header)) { log_printf (instance->totemsrp_log_level_security, "Message received from %s is too short... Ignoring %u.", totemip_sa_print((struct sockaddr *)system_from), (unsigned int)msg_len); return (-1); } if (message_header->magic != TOTEM_MH_MAGIC && message_header->magic != swab16(TOTEM_MH_MAGIC)) { /* * We've received ether Knet, old version of Corosync, * or something else. Do some guessing to display (hopefully) * helpful message */ guessed_str = NULL; if (message_header->magic == 0xFFFF) { /* * Corosync 2.2 used header with two UINT8_MAX */ guessed_str = "Corosync 2.2"; } else if (message_header->magic == 0xFEFE) { /* * Corosync 2.3+ used header with two UINT8_MAX - 1 */ guessed_str = "Corosync 2.3+"; } else if (msg_byte[0] == 0x01) { /* * Knet has stable1 with first byte of message == 1 */ guessed_str = "unencrypted Kronosnet"; } else if (msg_byte[0] >= 0 && msg_byte[0] <= 5) { /* * Unencrypted Corosync 1.x/OpenAIS has first byte * 0-5. Collision with Knet (but still worth the try) */ guessed_str = "unencrypted Corosync 2.0/2.1/1.x/OpenAIS"; } else { /* * Encrypted Kronosned packet has a hash at the end of * the packet and nothing specific at the beginning of the * packet (just encrypted data). * Encrypted Corosync 1.x/OpenAIS is quite similar but hash_digest * is in the beginning of the packet. * * So it's not possible to reliably detect ether of them. */ guessed_str = "encrypted Kronosnet/Corosync 2.0/2.1/1.x/OpenAIS or unknown"; } log_printf(instance->totemsrp_log_level_security, "Message received from %s has bad magic number (probably sent by %s).. Ignoring", totemip_sa_print((struct sockaddr *)system_from), guessed_str); return (-1); } if (message_header->version != TOTEM_MH_VERSION) { log_printf(instance->totemsrp_log_level_security, "Message received from %s has unsupported version %u... Ignoring", totemip_sa_print((struct sockaddr *)system_from), message_header->version); return (-1); } return (0); } void main_deliver_fn ( void *context, const void *msg, unsigned int msg_len, const struct sockaddr_storage *system_from) { struct totemsrp_instance *instance = context; const struct totem_message_header *message_header = msg; if (check_message_header_validity(context, msg, msg_len, system_from) == -1) { return ; } switch (message_header->type) { case MESSAGE_TYPE_ORF_TOKEN: instance->stats.orf_token_rx++; break; case MESSAGE_TYPE_MCAST: instance->stats.mcast_rx++; break; case MESSAGE_TYPE_MEMB_MERGE_DETECT: instance->stats.memb_merge_detect_rx++; break; case MESSAGE_TYPE_MEMB_JOIN: instance->stats.memb_join_rx++; break; case MESSAGE_TYPE_MEMB_COMMIT_TOKEN: instance->stats.memb_commit_token_rx++; break; case MESSAGE_TYPE_TOKEN_HOLD_CANCEL: instance->stats.token_hold_cancel_rx++; break; default: log_printf (instance->totemsrp_log_level_security, "Message received from %s has wrong type... ignoring %d.\n", totemip_sa_print((struct sockaddr *)system_from), (int)message_header->type); instance->stats.rx_msg_dropped++; return; } /* * Handle incoming message */ totemsrp_message_handlers.handler_functions[(int)message_header->type] ( instance, msg, msg_len, message_header->magic != TOTEM_MH_MAGIC); } int totemsrp_iface_set ( void *context, const struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no) { struct totemsrp_instance *instance = context; int res; totemip_copy(&instance->my_addrs[iface_no], interface_addr); res = totemnet_iface_set ( instance->totemnet_context, interface_addr, ip_port, iface_no); return (res); } /* Contrary to its name, this only gets called when the interface is enabled */ void main_iface_change_fn ( void *context, const struct totem_ip_address *iface_addr, unsigned int iface_no) { struct totemsrp_instance *instance = context; int num_interfaces; int i; if (!instance->my_id.nodeid) { instance->my_id.nodeid = iface_addr->nodeid; } totemip_copy (&instance->my_addrs[iface_no], iface_addr); if (instance->iface_changes++ == 0) { instance->memb_ring_id_create_or_load (&instance->my_ring_id, instance->my_id.nodeid); /* * Increase the ring_id sequence number. This doesn't follow specification. * Solves problem with restarted leader node (node with lowest nodeid) before * rest of the cluster forms new membership and guarantees unique ring_id for * new singleton configuration. */ instance->my_ring_id.seq++; instance->token_ring_id_seq = instance->my_ring_id.seq; log_printf ( instance->totemsrp_log_level_debug, "Created or loaded sequence id " CS_PRI_RING_ID " for this ring.", instance->my_ring_id.rep, (uint64_t)instance->my_ring_id.seq); if (instance->totemsrp_service_ready_fn) { instance->totemsrp_service_ready_fn (); } } num_interfaces = 0; for (i = 0; i < INTERFACE_MAX; i++) { if (instance->totem_config->interfaces[i].configured) { num_interfaces++; } } if (instance->iface_changes >= num_interfaces) { /* We need to clear orig_interfaces so that 'commit' diffs against nothing */ instance->totem_config->orig_interfaces = malloc (sizeof (struct totem_interface) * INTERFACE_MAX); assert(instance->totem_config->orig_interfaces != NULL); memset(instance->totem_config->orig_interfaces, 0, sizeof (struct totem_interface) * INTERFACE_MAX); totemconfig_commit_new_params(instance->totem_config, icmap_get_global_map()); memb_state_gather_enter (instance, TOTEMSRP_GSFROM_INTERFACE_CHANGE); free(instance->totem_config->orig_interfaces); } } void totemsrp_net_mtu_adjust (struct totem_config *totem_config) { totem_config->net_mtu -= 2 * sizeof (struct mcast); } void totemsrp_service_ready_register ( void *context, void (*totem_service_ready) (void)) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->totemsrp_service_ready_fn = totem_service_ready; } int totemsrp_member_add ( void *context, const struct totem_ip_address *member, int iface_no) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_member_add (instance->totemnet_context, &instance->my_addrs[iface_no], member, iface_no); return (res); } int totemsrp_member_remove ( void *context, const struct totem_ip_address *member, int iface_no) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_member_remove (instance->totemnet_context, member, iface_no); return (res); } void totemsrp_threaded_mode_enable (void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->threaded_mode_enabled = 1; } void totemsrp_trans_ack (void *context) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; instance->waiting_trans_ack = 0; instance->totemsrp_waiting_trans_ack_cb_fn (0); } int totemsrp_reconfigure (void *context, struct totem_config *totem_config) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; int res; res = totemnet_reconfigure (instance->totemnet_context, totem_config); return (res); } +int totemsrp_crypto_reconfigure_phase (void *context, struct totem_config *totem_config, cfg_message_crypto_reconfig_phase_t phase) +{ + struct totemsrp_instance *instance = (struct totemsrp_instance *)context; + int res; + + res = totemnet_crypto_reconfigure_phase (instance->totemnet_context, totem_config, phase); + return (res); +} + void totemsrp_stats_clear (void *context, int flags) { struct totemsrp_instance *instance = (struct totemsrp_instance *)context; memset(&instance->stats, 0, sizeof(totemsrp_stats_t)); if (flags & TOTEMPG_STATS_CLEAR_TRANSPORT) { totemnet_stats_clear (instance->totemnet_context); } } void totemsrp_force_gather (void *context) { timer_function_orf_token_timeout(context); } diff --git a/exec/totemsrp.h b/exec/totemsrp.h index 8477e85d..c8c1c45c 100644 --- a/exec/totemsrp.h +++ b/exec/totemsrp.h @@ -1,160 +1,165 @@ /* * Copyright (c) 2003-2005 MontaVista Software, Inc. * Copyright (c) 2006-2011 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /** * @file * Totem Single Ring Protocol * * depends on poll abstraction, POSIX, IPV4 */ #ifndef TOTEMSRP_H_DEFINED #define TOTEMSRP_H_DEFINED #include #include /** * Create a protocol instance */ int totemsrp_initialize ( qb_loop_t *poll_handle, void **srp_context, struct totem_config *totem_config, totempg_stats_t *stats, void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id), void (*waiting_trans_ack_cb_fn) ( int waiting_trans_ack)); void totemsrp_finalize (void *srp_context); /** * Multicast a message */ int totemsrp_mcast ( void *srp_context, struct iovec *iovec, unsigned int iov_len, int priority); /** * Return number of available messages that can be queued */ int totemsrp_avail (void *srp_context); int totemsrp_callback_token_create ( void *srp_context, void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data); void totemsrp_callback_token_destroy ( void *srp_context, void **handle_out); void totemsrp_event_signal (void *srp_context, enum totem_event_type type, int value); extern void totemsrp_net_mtu_adjust (struct totem_config *totem_config); extern int totemsrp_ifaces_get ( void *srp_context, unsigned int nodeid, unsigned int *interface_id, struct totem_ip_address *interfaces, unsigned int interfaces_size, char ***status, unsigned int *iface_count); extern unsigned int totemsrp_my_nodeid_get ( void *srp_context); extern int totemsrp_my_family_get ( void *srp_context); extern int totemsrp_crypto_set ( void *srp_context, const char *cipher_type, const char *hash_type); void totemsrp_service_ready_register ( void *srp_context, void (*totem_service_ready) (void)); extern int totemsrp_iface_set ( void *srp_context, const struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no); extern int totemsrp_member_add ( void *srp_context, const struct totem_ip_address *member, int ring_no); extern int totemsrp_member_remove ( void *srp_context, const struct totem_ip_address *member, int ring_no); void totemsrp_threaded_mode_enable ( void *srp_context); void totemsrp_trans_ack ( void *srp_context); int totemsrp_reconfigure ( void *context, struct totem_config *totem_config); +int totemsrp_crypto_reconfigure_phase ( + void *context, + struct totem_config *totem_config, + cfg_message_crypto_reconfig_phase_t phase); + void totemsrp_stats_clear ( void *srp_context, int flags); void totemsrp_force_gather ( void *context); #endif /* TOTEMSRP_H_DEFINED */ diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h index 4ec480e7..6f43527a 100644 --- a/include/corosync/totem/totem.h +++ b/include/corosync/totem/totem.h @@ -1,264 +1,273 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2012 Red Hat, Inc. * * Author: Steven Dake (sdake@redhat.com) * * All rights reserved. * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEM_H_DEFINED #define TOTEM_H_DEFINED #include "totemip.h" #include #include #include #ifdef HAVE_SMALL_MEMORY_FOOTPRINT #define PROCESSOR_COUNT_MAX 16 #define MESSAGE_SIZE_MAX 1024*64 #define MESSAGE_QUEUE_MAX 512 #else #define PROCESSOR_COUNT_MAX 384 #define MESSAGE_SIZE_MAX 1024*1024 /* (1MB) */ #define MESSAGE_QUEUE_MAX ((4 * MESSAGE_SIZE_MAX) / totem_config->net_mtu) #endif /* HAVE_SMALL_MEMORY_FOOTPRINT */ #define FRAME_SIZE_MAX KNET_MAX_PACKET_SIZE #define CONFIG_STRING_LEN_MAX 128 /* * Estimation of required buffer size for totemudp and totemudpu - it should be at least * sizeof(memb_join) + PROCESSOR_MAX * 2 * sizeof(srp_addr)) * if we want to support PROCESSOR_MAX nodes, but because we don't have * srp_addr and memb_join, we have to use estimation. * TODO: Consider moving srp_addr/memb_join into totem headers instead of totemsrp.c */ #define UDP_RECEIVE_FRAME_SIZE_MAX (PROCESSOR_COUNT_MAX * (INTERFACE_MAX * 2 * sizeof(struct totem_ip_address)) + 1024) #define TRANSMITS_ALLOWED 16 #define SEND_THREADS_MAX 16 /* This must be <= KNET_MAX_LINK */ #define INTERFACE_MAX 8 #define BIND_MAX_RETRIES 10 #define BIND_RETRIES_INTERVAL 100 /** * Maximum number of continuous gather states */ #define MAX_NO_CONT_GATHER 3 /* * Maximum number of continuous failures get from sendmsg call */ #define MAX_NO_CONT_SENDMSG_FAILURES 30 struct totem_interface { struct totem_ip_address bindnet; struct totem_ip_address boundto; struct totem_ip_address mcast_addr; struct totem_ip_address local_ip; uint16_t ip_port; uint16_t ttl; uint8_t configured; int member_count; int knet_link_priority; int knet_ping_interval; int knet_ping_timeout; int knet_ping_precision; int knet_pong_count; int knet_transport; struct totem_ip_address member_list[PROCESSOR_COUNT_MAX]; }; struct totem_logging_configuration { void (*log_printf) ( int level, int subsys, const char *function_name, const char *file_name, int file_line, const char *format, ...) __attribute__((format(printf, 6, 7))); int log_level_security; int log_level_error; int log_level_warning; int log_level_notice; int log_level_debug; int log_level_trace; int log_subsys_id; }; /* * COrosync TOtem. Also used as an endian_detector. */ #define TOTEM_MH_MAGIC 0xC070 #define TOTEM_MH_VERSION 0x03 struct totem_message_header { unsigned short magic; char version; char type; char encapsulated; unsigned int nodeid; unsigned int target_nodeid; } __attribute__((packed)); enum { TOTEM_PRIVATE_KEY_LEN_MIN = KNET_MIN_KEY_LEN, TOTEM_PRIVATE_KEY_LEN_MAX = KNET_MAX_KEY_LEN }; enum { TOTEM_LINK_MODE_BYTES = 64 }; typedef enum { TOTEM_TRANSPORT_UDP = 0, TOTEM_TRANSPORT_UDPU = 1, TOTEM_TRANSPORT_KNET = 2 } totem_transport_t; #define MEMB_RING_ID struct memb_ring_id { unsigned int rep; unsigned long long seq; } __attribute__((packed)); +typedef enum { + CRYPTO_RECONFIG_PHASE_ACTIVATE = 1, + CRYPTO_RECONFIG_PHASE_CLEANUP = 2, +} cfg_message_crypto_reconfig_phase_t; + struct totem_config { int version; /* * network */ struct totem_interface *interfaces; struct totem_interface *orig_interfaces; /* for reload */ unsigned int node_id; unsigned int clear_node_high_bit; unsigned int knet_pmtud_interval; /* * key information */ unsigned char private_key[TOTEM_PRIVATE_KEY_LEN_MAX]; unsigned int private_key_len; /* * Totem configuration parameters */ unsigned int token_timeout; unsigned int token_warning; unsigned int token_retransmit_timeout; unsigned int token_hold_timeout; unsigned int token_retransmits_before_loss_const; unsigned int join_timeout; unsigned int send_join_timeout; unsigned int consensus_timeout; unsigned int merge_timeout; unsigned int downcheck_timeout; unsigned int fail_to_recv_const; unsigned int seqno_unchanged_const; char link_mode[TOTEM_LINK_MODE_BYTES]; struct totem_logging_configuration totem_logging_configuration; unsigned int net_mtu; unsigned int threads; unsigned int heartbeat_failures_allowed; unsigned int max_network_delay; unsigned int window_size; unsigned int max_messages; unsigned int broadcast_use; char crypto_model[CONFIG_STRING_LEN_MAX]; char crypto_cipher_type[CONFIG_STRING_LEN_MAX]; char crypto_hash_type[CONFIG_STRING_LEN_MAX]; + int crypto_index; /* Num of crypto config currently loaded into knet ( 1 or 2 ) */ + + int crypto_changed; /* Has crypto changed since last time? (it's expensive to reload) */ + char knet_compression_model[CONFIG_STRING_LEN_MAX]; uint32_t knet_compression_threshold; int knet_compression_level; totem_transport_t transport_number; unsigned int miss_count_const; enum totem_ip_version_enum ip_version; unsigned int block_unlisted_ips; void (*totem_memb_ring_id_create_or_load) ( struct memb_ring_id *memb_ring_id, unsigned int nodeid); void (*totem_memb_ring_id_store) ( const struct memb_ring_id *memb_ring_id, unsigned int nodeid); }; #define TOTEM_CONFIGURATION_TYPE enum totem_configuration_type { TOTEM_CONFIGURATION_REGULAR, TOTEM_CONFIGURATION_TRANSITIONAL }; #define TOTEM_CALLBACK_TOKEN_TYPE enum totem_callback_token_type { TOTEM_CALLBACK_TOKEN_RECEIVED = 1, TOTEM_CALLBACK_TOKEN_SENT = 2 }; enum totem_event_type { TOTEM_EVENT_DELIVERY_CONGESTED, TOTEM_EVENT_NEW_MSG, }; #endif /* TOTEM_H_DEFINED */ diff --git a/include/corosync/totem/totempg.h b/include/corosync/totem/totempg.h index 7f550368..af9bf71f 100644 --- a/include/corosync/totem/totempg.h +++ b/include/corosync/totem/totempg.h @@ -1,205 +1,207 @@ /* * Copyright (c) 2003-2005 MontaVista Software, Inc. * Copyright (c) 2006-2011 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /** * @file * Totem Single Ring Protocol * * depends on poll abstraction, POSIX, IPV4 */ #ifndef TOTEMPG_H_DEFINED #define TOTEMPG_H_DEFINED #ifdef __cplusplus extern "C" { #endif #include #include #include "totem.h" #include struct totempg_group { const void *group; size_t group_len; }; #define TOTEMPG_AGREED 0 #define TOTEMPG_SAFE 1 /** * Initialize the totem process groups abstraction */ extern int totempg_initialize ( qb_loop_t* poll_handle, struct totem_config *totem_config ); extern void totempg_finalize (void); extern int totempg_callback_token_create (void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data); extern void totempg_callback_token_destroy (void *handle); /** * Initialize a groups instance */ extern int totempg_groups_initialize ( void **instance, void (*deliver_fn) ( unsigned int nodeid, const void *msg, unsigned int msg_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)); extern int totempg_groups_finalize (void *instance); extern int totempg_groups_join ( void *instance, const struct totempg_group *groups, size_t group_cnt); extern int totempg_groups_leave ( void *instance, const struct totempg_group *groups, size_t group_cnt); extern int totempg_groups_mcast_joined ( void *instance, const struct iovec *iovec, unsigned int iov_len, int guarantee); extern int totempg_groups_joined_reserve ( void *instance, const struct iovec *iovec, unsigned int iov_len); extern int totempg_groups_joined_release ( int msg_count); extern int totempg_groups_mcast_groups ( void *instance, int guarantee, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, unsigned int iov_len); extern int totempg_groups_send_ok_groups ( void *instance, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, unsigned int iov_len); extern int totempg_ifaces_get ( unsigned int nodeid, unsigned int *interface_id, struct totem_ip_address *interfaces, unsigned int interfaces_size, char ***status, unsigned int *iface_count); extern void* totempg_get_stats (void); void totempg_event_signal (enum totem_event_type type, int value); extern const char *totempg_ifaces_print (unsigned int nodeid); extern unsigned int totempg_my_nodeid_get (void); extern int totempg_my_family_get (void); extern int totempg_crypto_set (const char *cipher_type, const char *hash_type); extern void totempg_service_ready_register ( void (*totem_service_ready) (void)); extern int totempg_iface_set ( struct totem_ip_address *interface_addr, unsigned short ip_port, unsigned int iface_no); extern int totempg_member_add ( const struct totem_ip_address *member, int ring_no); extern int totempg_member_remove ( const struct totem_ip_address *member, int ring_no); enum totem_q_level { TOTEM_Q_LEVEL_LOW, TOTEM_Q_LEVEL_GOOD, TOTEM_Q_LEVEL_HIGH, TOTEM_Q_LEVEL_CRITICAL }; void totempg_check_q_level(void *instance); typedef void (*totem_queue_level_changed_fn) (enum totem_q_level level); extern void totempg_queue_level_register_callback (totem_queue_level_changed_fn); extern void totempg_threaded_mode_enable (void); extern void totempg_trans_ack (void); extern int totempg_reconfigure (void); +extern int totempg_crypto_reconfigure_phase (cfg_message_crypto_reconfig_phase_t phase); + extern void totempg_force_gather (void); extern void totempg_get_config(struct totem_config *config); extern void totempg_put_config(struct totem_config *config); #ifdef __cplusplus } #endif #endif /* TOTEMPG_H_DEFINED */