diff --git a/ChangeLog b/ChangeLog index 1960f75ac..8d1d7a0f5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,288 +1,392 @@ +* Fri May 25 2012 Linux-HA contributors +- stable release 3.9.3 +- dhcpd: new RA to manage ISC DHCP servers +- Filesystem: add nfs4 to the list of well known types +- IPaddr2: fix regression introduced in d93b5fd, nic=lo always + "stopped" +- iSCSILogicalUnit: correctly match for target IQN and backing + device name (iet and tgt) +- jboss: implement rotating of console log +- mysql: improve handling of reset slave +- oracle, oralsnr: get rid of eval +- slapd: pass bind_dn correctly to ldapsearch +* Wed May 16 2012 Linux-HA contributors +- release candidate 3.9.3 rc1 +- asterisk: new resource agent +- named: new RA to manage bind servers +- pound: new RA for Pound HTTP/HTTPS reverse-proxy and + load-balancer +- rsyslog: new RA to manage rsyslog servers +- slapd: new RA to manage OpenLDAP servers +- varnish: new resource agent +- apache: add support for IPv6 in monitor +- apache: create /var/run/apache2 if it doesn't exist +- apache: fix sysconfig includes & enable status for default SUSE + conf +- conntrackd: test for socket existence in monitor instead of + process grep +- conntrackd: rename parameter "conntrackd" to "binary" +- CTDB: Add smb_fileid_algorithm parameter (bnc#696978) +- CTDB: Improve monitor op (check output of ctdb status, + bnc#712192) +- CTDB: Set ctdb_start_as_disabled=no by default (bnc#712410, + required by samba 3.6) +- exportfs: allow expanding the fsid parameter to produce correct + exportfs options +- exportfs: don't grow /var/lib/nfs/rmtab indefinitely +- exportfs: fix monitor action for special characters and common + suffixes +- Filesystem: add support for glusterfs (lf#2620) +- Filesystem: add tmpfs to the list of supported filesystems +- Filesystem: allow to force cloning for local mounts +- Filesystem: don't use direct dd option in monitor depth 20 for + non-blockdevice fs +- Filesystem: fix determining if the 
device is a block device +- Filesystem: improve read/write checks for CHECK_LEVEL 10, 20 +- Filesystem: repair the fast_stop parameter use (its value was + always false) +- Filesystem: support ceph +- Filesystem: remove a status file only when OCF_CHECK_LEVEL is + set to 20 +- IPaddr: add back the local_start/stop_script code +- IPaddr: remove colon at the end of the interface name +- IPv6addr: always use the provided nic and cidr_netmask when + specified +- IPv6addr: handle a link-local address properly in send_ua +- iscsi: do not rely on iscsid.startup being set correctly + (bnc#751783) +- iscsi: proceed if iscsid is not running if iscsid.startup is + present in iscsid.conf +- iSCSILogicalUnit: fix default for scsi_sn +- iSCSITarget: treat an empty "implementation" parameter + specially +- jboss: add the java_opts parameter for java options +- ldirectord: precedence error with perl v5.8.8 in IPv6 code +- LVM: drop vgck(8) from monitor +- LVM: force dmevent monitoring for clones +- LVM: use ls instead of vgdisplay in status +- lxc: fix LXC_status to work with lxc-0.7.5 or later +- mysql: improve replication support +- mysql: check mysql status more thoroughly before stopping +- mysql: fix validation return codes +- mysql: support 5.5 slave status message format +- nfsserver: Support of multiple IP addresses (bnc#684143) +- nfsserver: don't run sm-notify in foreground (bnc#759616) +- ocf-shellfuncs: fix loglevel variable scope in ha_log +- ocft: new tests for named, IPv6addr, oracle, Xinetd +- ocft: several improvements +- oracle: improve managing IPC objects +- oracle: improve matching instance specific files and processes +- pgsql: support for replication +- postfix: multiple fixes +- Raid1: support for multiple MD arrays, as specified in raidconf +- SAPDatabase: add support for Sybase ASE and SAP HANA database +- SAPDatabase: correcting the unique values of RAs parameters +- SAPDatabase: replace method for checking responsiveness of + saphostexec +- SAPDatabase: 
version 2.00 make use of saphostagent +- SAPInstance : correcting the unique values of RAs parameters +- slapd: always set the exit code correctly in monitor +- tomcat: remove pidfile before start, it may prevent some tomcat + releases from starting +- VirtualDomain: add a functionality that modifies utilization of + resource automatically +- VirtualDomain: if the configuration file is missing on stop + exit with success +- VirtualDomain: honor virsh "in shutdown" state +- Xen: add support for HVM ACPI graceful shutdown +- Xen: wait in migrate_from for the migration to finish instead + of bailing out immediately +- Tools: findif: Use most specific matching route (bnc#740738) +- Tools: send_arp.libnet: fix for big endian platforms + (bnc#721334) +- doc: add the RA developer's guide + * Wed Jun 29 2011 Dejan Muhamedagic and others - stable release 3.9.2 - ethermon: new resource agent - iscsi: fix regression in 3.9.1 for open-iscsi version 2.0-872 (lf#2562) - pgsql: fix regression in 3.9.1 in directories on probes - VirtualDomain: if there's no config exit on stop with success - doc: add sfex_init(8) man page * Wed Jun 15 2011 Dejan Muhamedagic and others - stable release 3.9.1 - ocf-tester: tolerate OCF_ERR_INSTALLED on probes and missing binaries - pgsql: improve configuration check and probe handling * Wed Jun 01 2011 Dejan Muhamedagic and others - release candidate 3.9.1 rc1 - first release since establishing joined repository with RHCS agents - build: new spec file and autoconf to support both agents' sets - build: use ./configure --with-ras-set=linux-ha to configure for heartbeat RA set - build: create compatibility symlinks in autofoo not in spec - build: GNUmakefile removed - lxc: new RA to manage lxc linux containers - symlink: new RA to manage symbolic links - db2: new implementation with master/slave mode - oracle: improve oracle process list test (bnc#673027) - exportfs: backup and restore rmtab to ensure smooth client failover on node failures - CTDB: 
Allow stop to succeed when using pkill on ctdbd (bnc#695829) - mysql: --skip-slave-start option is default now - mysql: set connect timeout to 10 seconds rather than 1 second - mysql: keep replication state (prevents data loss on master reset) - mysql: don't rely on state information from pacemaker, but check if the instance is in the read-only mode - mysql: if test parameters are all set, assume OCF_CHECK_LEVEL=10 - mysql: support for master/slave for more than two nodes - mysql: don't wait for replication to finish, when not replicating - mysql: store replication state in separate attributes for each master - VirtualDomain: correctly create migration URI when target is an FQDN - VirtualDomain: properly wait until domain_name is non-empty - ldirectord: add a support of "netmask" directive for IPv6 - ldirectord: fix fwmark behavior for IPv6 - ldirectord: ignore children in Net::DNS - iscsi: add support for open-iscsi version 2.0-872 (lf#2562) - postfix: issue error if 'postfix abort' failed - postfix: improve exit codes on installation problems - postfix: use monitor to test if postfix works after the start action - ocft: fix make command for compatibility with mawk/Debian (lf#2600) - ocft: test case for pgsql - ocft: test case for postfix - ocft: test case for iscsi - doc: improve man pages output - doc: add examples for master/slave resource agents * Wed Feb 16 2011 Dejan Muhamedagic and others - stable release 1.0.4 - ocft: testcases for db2, LVM, and Filesystem * Fri Feb 11 2011 Dejan Muhamedagic and others - release candidate 1.0.4 - add GPLv3 license file (bnc#655700) - ocf-shellfuncs: allow ocf_run to return the actual exit code - ocf-shellfuncs: handle properly syslog facility set to none (bnc#621818) - ocf-shellfuncs: correctly identify root by id only (bnc#602312) - RA: add OCF_ROOT/lib/heartbeat directory (development) - RA: set the HA_RSCTMP directory to /var/run/resource-agents (lf#2378) - build: install jboss - conntrackd: new RA - exportfs: new RA - 
nginx: new RA - fio: new RA for IO load simulation - Filesystem: allow cloning of some filesystems as read-only (lf#2440) - Filesystem: add fast_stop parameter (lf#2402) - Filesystem: Clarify metadata and improve non-clone warning - Filesystem: new run_fsck parameter - LVM: add partial_activation parameter (lf#2490) - IPaddr2: fix reference to Infiniband arping binary (bnc#668447) - IPaddr2: optionally flush kernel routing table on interface stop - IPaddr2: exit with the right code when not properly configured - IPaddr2: exit early and with the right code if the ip parameter is not set - IPaddr2: unique_clone_address should work without CIP (lf#2442) - IPaddr: return the correct code if interface delete failed - IPv6addr: allow link-local addresses in case the interface name is provided - IPv6addr: interface index in /proc/net/if_inet6 may be longer than 2 chars (lf#2462) - IPsrcaddr: exit with the right code when not properly configured - IPsrcaddr: add the cidr_netmask parameter - Tools: findif: differentiate between error conditions - nfsserver: fix the default string for the notification parameter - nfsserver: don't use -v in the notify cmd with rpc.statd - iSCSITarget: fix race for target IDs when using IET (lf#2432) - iSCSITarget: follow changed IET access policy - Raid1: Support attempting to re-add mirrors on deep monitor action (bnc#619121) - Raid1: Fix graceful stop code path - Raid1: Handle stop for failed arrays properly (bnc#618775) - sfex: output log messages also to stderr in sfex_init - sfex: add the sfex_stat command - sfex: wait in the start and stop actions until sfex_daemon starts/exits - Xen: implement stop of a migrating domain (bnc#656227) - Xen: check the allow_mem_management boolean properly (bnc#637525) - Xen: Always run destroy in stop sequence. - Xen: use xen-list command for status check if available (bnc#628735) - Xen: use xen-destroy for stop, if available. 
- Xen: Allow node configurable attribute to specify which IP to use for live migration (bnc#628735) - VirtualDomain: fix spurious stop failures - VirtualDomain: don't timeout in stop before escalating to "forced stop" - ManageVE: add migration capability - MailTo: don't check if user exists for email address (might be an alias or remote) - CTDB: Remove hard-coded timeout on start op - CTDB: Don't manage Samba and Winbind by default - CTDB: Deprecate (and make optional) smb_private_dir param (bnc#623788) - tomcat: Ensure name of tomcat resource is only used on start operation and expose JAVA_OPTS variable for use - tomcat: Fix to ensure default OCF_RESKEY_xx values are observed - tomcat: Add CATALINA_BASE parameter, defaults to CATALINA_HOME, permits multiple tomcat instances - tomcat: Use Tomcat stop TIMEOUT -force to improve stop - Dummy: migrate_from/to: correct OCF_RESKEY_CRM_meta_migrate_xxx variable names - Dummy: make method reload work - anything: add the workdir parameter - mysql: clone and master-slave functionality - mysql: add replication monitoring - mysql: check for write permissions after creating pid and socket directory - mysql: make client binary path configurable - pgsql: cd to pgdata before running commands (fixes permission error) - pgsql: add optional username, password, and sqlcode parameters for monitor - pgsql: add new "config" parameter - pgsql: properly implement pghost parameter - pgsql: socketdir parameter to manage non-default UNIX socket directories - oracle: reduce output from sqlplus to the last line for queries (bnc#567815) - db2: Replace call to db2_local_ps with db2nps - db2: guard against a hanging db2stop by spawning this into the background. Use db2_kill after grace period. 
- db2: add multi partition support - db2: improve behaviour on probes - db2: support for v9.x instances (bnc#608952) - SAPDatabase,SAPInstance: improve LD_LIBRARY_PATH processing (bnc#640026) - SAPInstance: prevent premature expansion of [:upper:] [:lower:] when producing sidadm uid - SAPInstance: Moved testing of SAP profile directory and START profile to a later stage (only when needed), for more robustness - SAPInstance: fix return codes in probes - SAPInstance: New parameter: SHUTDOWN_METHOD - SAPInstance: ensure enqueue failover in monitor_clone on process failure - SAPInstance: don't rely on op target rc when monitoring clones (lf#2371) - SAPDatabase: prevent premature expansion of [:upper:] and [:lower:] when producing sidadm/orasid/db2sid uids - SAPdatabase: Changed Oracle recovery method from "recover automatic database" to "end backup" - SAPDatabase: Adapt process search pattern for DB/2 9.5 - SAPDatabase: start listener only if database processes are found - SAPDatabase: avoid continuous output to syslog in monitor with SAP 7.20 and J2EE_ONLY=1 - ldirectord: http: connect to server instead of protocol (Debian#594958) - ldirectord: add implicit support for submission RFC4409 - ldirectord: example configuration for a submission virtual service - ldirectord: Shutdown write-side of client connection after writing has finished - ldirectord: port number mismatch of imaps and pops - ldirectord: Oracle compatibility - ldirectord: don't exit on timeout in HTTP/HTTPS check - ldirectord: allow underscore in service name - ldirectord: use $1 instead of \1 in pattern replace (bnc#605086) - Tools: ocf-tester: Extend to cover initial probe (monitor_0) test. - Tools: ocf-tester: set and export some common meta variables (lf#2524) - Tools: ocf-tester: meta-data also should never be affected by missing binaries. 
- Tools: ocf-tester: show output from the agent in case of error * Tue Apr 13 2010 Dejan Muhamedagic and others - stable release 1.0.3 - meta-data: improve timeouts in most resource agents (reduce the number of warnings by the shell) - RA: log messages to stderr if attached to a terminal - ocf-shellfuncs: tests to check for clone/ms resources - ocf-shellfuncs: don't output to stderr if using syslog (prevents double logging from the RA and lrmd) - make sure that OCF_RESKEY_CRM_meta_interval is always defined (lf#2284) - ocft: new RA test suite - VirtualDomain: bail out early if config file can't be read during probe (nbc#593988) - VirtualDomain: spin on define until we definitely have a domain name - VirtualDomain: fix incorrect use of __OCF_ACTION (the stop operation may timeout otherwise) - Filesystem: prefer /proc/mounts to /etc/mtab for non-bind mounts (lf#2388) - IPaddr2: don't bring the interface down on stop (otherwise IPv6 addresses may be removed) - oracle/oralsnr: improve exit codes if the environment isn't valid - oracle/oralsnr: improve logging - Route: don't assume that OCF_RESKEY_CRM_meta_clone_node_max is set to a number (lf#2375) - Route: add route table parameter (lf#2335) - sfex: don't use pid file (lf#2363,bnc#585416) - SFEX daemon: fix logging - ldirectord: fix the configfile default (bnc#589457) - drbd: fix metadata (bnc#588684) - IPsrcaddr: modify the interface route (lf#2367) - ldirectord: Allow multiple email addresses (lf#2168) - vmware: fix set_environment() invocation (lf#2342) - vmware: updated to version 0.2 - apache: return the right exit code from monitor (bnc#578628) - iSCSILogicalUnit: fix monitor for STGT * Mon Feb 01 2010 Dejan Muhamedagic and others - stable release 1.0.2 - EvmsSCC, Evmsd, LinuxSCSI, drbd, pingd: marked as deprecated (lf#2244) - CTDB: new resource agent for clustered samba - postfix: new resource agent - proftpd: new resource agent - AoEtarget: new resource agent to export ATA-over-Ethernet (AoE) targets - Squid: 
new resource agent - VirtualDomain: new resource agent (manage virtual domains using libvirt/virsh) - anything: new resource agent for arbitrary daemons - mysql-proxy: new resource agent - iSCSITarget/iSCSILogicalUnit: two new resource agents - portblock: fast reconnect/tickle ACK (new feature) - IPv6addr: new nic and cidr_netmask parameters - mysql-proxy: log_level and keepalive parameters - Filesystem: implement deep monitor operation - apache: monitor operation of depth 10 for web applications (lf#2234) - SAPDatabase + SAPInstance: New versions from SAP - CTDB: auto-generate cluster-specific part of smb.conf (lf#2308) - ClusterMon: don't fail in stop if the process is missing (bnc#569957) - Filesystem: allow configuring smbfs mounts as clones - IPaddr2: CLUSTERIP/iptables rule not always inserted on failed monitor (lf#2281) - IPaddr2: behave if the interface is down (lf#2147) - IPaddr2: check binaries when it makes sense - IPaddr2: fix invalid default value for OCF_RESKEY_clusterip_hash (bnc#553753) - IPaddr2: include netmask in search for the right interface - IPaddr2: remove all colons from the mac address before passing it to send_arp (lf#2165) - IPsrcaddr: replace 0/0 with proper ip prefix - IPv6addr: recognize network masks properly - IPv6addr: supply checksum for ICMPv6 packets - IPv6addr: ifdef out the ip offset hack for libnet v1.1.4 (lf#2034) - IPv6addr: supply checksum for ICMPv6 packets - LVM: Make monitor operation quiet in logs (bnc#546353) - MailTo: Provide a default for MAILCMD (bnc#534803, bnc#556366) - MailTo: allow multiple word subject line - Raid1: improve monitor function (bnc#546551) - Route: improve validate (lf#2232) - Squid: make the regexp match more precisely output of netstat - VIParip: Pathname needed to be configurable (lf#1331) - VirtualDomain: avoid needlessly invoking "virsh define" - VirtualDomain: destroy domain shortly before timeout expiry - VirtualDomain: fix forceful stop (lf#2283) - VirtualDomain: loop on status if 
libvirtd is unreachable - Xen: Remove instance_attribute "allow_migrate" (bnc#539968) - apache: make sure that proxies are not used for monitor - iSCSILogicalUnit: add support for SCSI ID, SCSI SN, Vendor ID, and Product ID - iSCSILogicalUnit: add support for per-LU parameters - iSCSILogicalUnit: set default for SCSI SN, truncate SCSI ID default to 24 bytes - iSCSILogicalUnit: use a 16-byte default SCSI ID - iSCSITarget, iSCSILogicalUnit: add support for tgt - iSCSITarget: reintroduce "tid" parameter - iSCSITarget, iSCSILogicalUnit: identify targets by IQN, not by tid - iSCSITarget, iSCSILogicalUnit: support LIO - iSCSITarget: add support for CHAP authentication - iSCSITarget: add support for restricting target access - iSCSITarget: be more persistent deleting targets on stop - include ldirectord (formerly known as heartbeat-ldirectord) - iscsi: replace wrong variable reference (bnc#499291) - jboss: Added JBoss support - ldirectord: fix setting defaults for configfile and ldirectord (lf#2328) - ldirectord: fix various bugs in OCF RA (lf#1949) - mysql: escalate stop to KILL if regular shutdown doesn't work - mysql: handle monitor and stop properly on invalid environment - nfsserver: use default values (lf#2321) - nfsserver: validate should not check if nfs_shared_infodir exists (lf#2219) - nfsserver: use check_binary properly in validate (lf#2211) - nfsserver: exit properly in nfsserver_validate (lf#2173) - oracle/oralsnr: export variables properly - oracle: drop spurious output from sqlplus - pgsql: remove the previous backup_label if it exists - portblock: add per-IP filtering capability - portblock: fix invalid exit codes on monitor - postfix: fix double stop - scsi2reservation: fix wrong logic in check for scsi_reserve - vmware: make meta-data work and several cleanups (lf#2212) - shellfuncs: make the mktemp wrappers work - ocf-shellfuncs: add mercurial repository version information - ocf-shellfuncs: add ocf_is_probe function - doc: add resource agents' man 
pages including examples * Thu Oct 23 2008 Lars Marowsky-Bree and MANY others - beta release 2.99.2 - LVM: stop correctly in case vol group does not exist * Tue Sep 23 2008 Lars Marowsky-Bree and MANY others - beta release 2.99.1 * Tue Aug 19 2008 Andrew Beekhof and MANY others - beta release 2.99.0 diff --git a/configure.ac b/configure.ac index 97485a275..fa417eecc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,917 +1,894 @@ dnl dnl autoconf for Agents dnl dnl License: GNU General Public License (GPL) dnl =============================================== dnl Bootstrap dnl =============================================== AC_PREREQ(2.63) dnl Suggested structure: dnl information on the package dnl checks for programs dnl checks for libraries dnl checks for header files dnl checks for types dnl checks for structures dnl checks for compiler characteristics dnl checks for library functions dnl checks for system services AC_INIT([resource-agents], m4_esyscmd([make/git-version-gen .tarball-version]), [to_be_defined@foobar.org]) AC_USE_SYSTEM_EXTENSIONS CRM_DTD_VERSION="1.0" PKG_FEATURES="" AC_CONFIG_AUX_DIR(.) AC_CANONICAL_HOST dnl Where #defines go (e.g. 
`AC_CHECK_HEADERS' below) dnl dnl Internal header: include/config.h dnl - Contains ALL defines dnl - include/config.h.in is generated automatically by autoheader dnl - NOT to be included in any header files except lha_internal.h dnl (which is also not to be included in any other header files) dnl dnl External header: include/agent_config.h dnl - Contains a subset of defines checked here dnl - Manually edit include/agent_config.h.in to have configure include new defines dnl - Should not include HAVE_* defines dnl - Safe to include anywhere AM_CONFIG_HEADER(include/config.h include/agent_config.h) ALL_LINGUAS="en fr" AC_ARG_WITH(version, [ --with-version=version Override package version (if you're a packager needing to pretend) ], [ PACKAGE_VERSION="$withval" ]) AC_ARG_WITH(pkg-name, [ --with-pkg-name=name Override package name (if you're a packager needing to pretend) ], [ PACKAGE_NAME="$withval" ]) dnl dnl AM_INIT_AUTOMAKE([1.11.1 foreign dist-bzip2 dist-xz]) dnl AM_INIT_AUTOMAKE([1.10.1 foreign dist-bzip2]) AC_DEFINE_UNQUOTED(AGENTS_VERSION, "$PACKAGE_VERSION", Current agents version) CC_IN_CONFIGURE=yes export CC_IN_CONFIGURE LDD=ldd dnl ======================================================================== dnl Compiler characteristics dnl ======================================================================== # check stolen from gnulib/m4/gnu-make.m4 if ! ${MAKE-make} --version /cannot/make/this >/dev/null 2>&1; then AC_MSG_ERROR([you don't seem to have GNU make; it is required]) fi AC_PROG_CC dnl Can force other with environment variable "CC". 
AM_PROG_CC_C_O
AC_PROG_CC_STDC

AC_PROG_AWK
AC_PROG_LN_S
AC_PROG_INSTALL
AC_PROG_MAKE_SET

AC_C_STRINGIZE
AC_C_INLINE

AC_TYPE_SIZE_T
AC_TYPE_SSIZE_T
AC_TYPE_UID_T
AC_TYPE_UINT16_T
AC_TYPE_UINT8_T
AC_TYPE_UINT32_T

AC_CHECK_SIZEOF(char)
AC_CHECK_SIZEOF(short)
AC_CHECK_SIZEOF(int)
AC_CHECK_SIZEOF(long)
AC_CHECK_SIZEOF(long long)
AC_STRUCT_TIMEZONE

dnl ===============================================
dnl Helpers
dnl ===============================================

dnl cc_supports_flag FLAGS...
dnl   Test-compiles a trivial program with CFLAGS set to the given flags.
dnl   Reports "yes" and returns 0 when the compile succeeds, otherwise
dnl   reports "no" and returns 1.
cc_supports_flag() {
    local CFLAGS="$@"
    AC_MSG_CHECKING(whether $CC supports "$@")
    AC_COMPILE_IFELSE([int main(){return 0;}] ,[RC=0; AC_MSG_RESULT(yes)],[RC=1; AC_MSG_RESULT(no)])
    return $RC
}

dnl extract_header_define HEADER MACRO
dnl   Generates a small C program that includes HEADER and prints MACRO
dnl   as a string, compiles it with $CC $CFLAGS and runs it.  The value is
dnl   reported through AC_MSG_RESULT and also written to stdout, so the
dnl   function can be used inside a command substitution.
dnl   The temporary source and binary are created under $srcdir, tagged
dnl   with the macro name and the shell PID, and removed before returning.
extract_header_define() {
    AC_MSG_CHECKING(for $2 in $1)
    Cfile=$srcdir/extract_define.$2.${$}
    dnl The probe calls printf, so stdio.h has to be included explicitly;
    dnl a bare include directive without a header name does not compile.
    printf "#include <stdio.h>\n" > ${Cfile}.c
    printf "#include <%s>\n" $1 >> ${Cfile}.c
    printf "int main(int argc, char **argv) { printf(\"%%s\", %s); return 0; }\n" $2 >> ${Cfile}.c
    $CC $CFLAGS ${Cfile}.c -o ${Cfile}
    value=`${Cfile}`
    AC_MSG_RESULT($value)
    dnl Emit the value as data, never as a format string: a percent sign or
    dnl whitespace in it must not change what is printed, and an empty value
    dnl must not make printf fail for lack of an operand.
    printf '%s' "$value"
    rm -f ${Cfile}.c ${Cfile}
}

dnl ===============================================
dnl Configure Options
dnl ===============================================

dnl Some systems, like Solaris require a custom package name
AC_ARG_WITH(pkgname, [ --with-pkgname=name name for pkg (typically for Solaris) ], [ PKGNAME="$withval" ], [ PKGNAME="LXHAhb" ], )
AC_SUBST(PKGNAME)

AC_ARG_ENABLE([ansi], [ --enable-ansi force GCC to compile to ANSI/ANSI standard for older compilers.
[default=yes]]) AC_ARG_ENABLE([fatal-warnings], [ --enable-fatal-warnings very pedantic and fatal warnings for gcc [default=yes]]) INITDIR="" AC_ARG_WITH(initdir, [ --with-initdir=DIR directory for init (rc) scripts [${INITDIR}]], [ INITDIR="$withval" ]) OCF_ROOT_DIR="/usr/lib/ocf" AC_ARG_WITH(ocf-root, [ --with-ocf-root=DIR directory for OCF scripts [${OCF_ROOT_DIR}]], [ if test x"$withval" = xprefix; then OCF_ROOT_DIR=${prefix}; else OCF_ROOT_DIR="$withval"; fi ]) HA_RSCTMPDIR=${localstatedir}/run/resource-agents AC_ARG_WITH(rsctmpdir, [ --with-rsctmpdir=DIR directory for resource agents state files [${HA_RSCTMPDIR}]], [ if test x"$withval" = xprefix; then HA_RSCTMPDIR=${prefix}; else HA_RSCTMPDIR="$withval"; fi ]) AC_ARG_ENABLE([libnet], [ --enable-libnet Use libnet for ARP based funcationality, [default=try]], [enable_libnet="$enableval"], [enable_libnet=try]) BUILD_RGMANAGER=0 BUILD_LINUX_HA=0 RASSET=all AC_ARG_WITH(ras-set, [ --with-ras-set=SET build/install only linux-ha or rgmanager resource-agents [default: all]], [ RASSET="$withval" ]) if test x$RASSET = xyes || test x$RASSET = xall ; then BUILD_RGMANAGER=1 BUILD_LINUX_HA=1 fi if test x$RASSET = xlinux-ha; then BUILD_LINUX_HA=1 fi if test x$RASSET = xrgmanager; then BUILD_RGMANAGER=1 fi if test $BUILD_LINUX_HA -eq 0 && test $BUILD_RGMANAGER -eq 0; then AC_MSG_ERROR([Are you really sure you want this package?]) exit 1 fi AM_CONDITIONAL(BUILD_LINUX_HA, test $BUILD_LINUX_HA -eq 1) AM_CONDITIONAL(BUILD_RGMANAGER, test $BUILD_RGMANAGER -eq 1) dnl =============================================== dnl General Processing dnl =============================================== INIT_EXT="" echo Our Host OS: $host_os/$host AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Heartbeat, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac AC_MSG_NOTICE(Sanitizing 
INITDIR: ${INITDIR}) case $INITDIR in prefix) INITDIR=$prefix;; "") AC_MSG_CHECKING(which init (rc) directory to use) for initdir in /etc/init.d /etc/rc.d/init.d /sbin/init.d \ /usr/local/etc/rc.d /etc/rc.d do if test -d $initdir then INITDIR=$initdir break fi done if test -z $INITDIR then INITDIR=${sysconfdir}/init.d fi AC_MSG_RESULT($INITDIR);; esac AC_SUBST(INITDIR) if test "${prefix}" = "/usr"; then INITDIRPREFIX="$INITDIR" else INITDIRPREFIX="${prefix}/$INITDIR" fi AC_SUBST(INITDIRPREFIX) AC_MSG_NOTICE(Sanitizing libdir: ${libdir}) case $libdir in dnl For consistency with Heartbeat, map NONE->$prefix *prefix*|NONE) AC_MSG_CHECKING(which lib directory to use) for aDir in lib64 lib do trydir="${exec_prefix}/${aDir}" if test -d ${trydir} then libdir=${trydir} break fi done AC_MSG_RESULT($libdir); ;; esac dnl Expand autoconf variables so that we dont end up with '${prefix}' dnl in #defines and python scripts dnl NOTE: Autoconf deliberately leaves them unexpanded to allow dnl make exec_prefix=/foo install dnl No longer being able to do this seems like no great loss to me... 
eval prefix="`eval echo ${prefix}`" eval exec_prefix="`eval echo ${exec_prefix}`" eval bindir="`eval echo ${bindir}`" eval sbindir="`eval echo ${sbindir}`" eval libexecdir="`eval echo ${libexecdir}`" eval datadir="`eval echo ${datadir}`" eval sysconfdir="`eval echo ${sysconfdir}`" eval sharedstatedir="`eval echo ${sharedstatedir}`" eval localstatedir="`eval echo ${localstatedir}`" eval libdir="`eval echo ${libdir}`" eval includedir="`eval echo ${includedir}`" eval oldincludedir="`eval echo ${oldincludedir}`" eval infodir="`eval echo ${infodir}`" eval mandir="`eval echo ${mandir}`" dnl docdir is a recent addition to autotools eval docdir="`eval echo ${docdir}`" if test "x$docdir" = "x"; then docdir="`eval echo ${datadir}/doc`" fi AC_SUBST(docdir) dnl Home-grown variables eval INITDIR="${INITDIR}" for j in prefix exec_prefix bindir sbindir libexecdir datadir sysconfdir \ sharedstatedir localstatedir libdir includedir oldincludedir infodir \ mandir INITDIR docdir do dirname=`eval echo '${'${j}'}'` if test ! -d "$dirname" then AC_MSG_WARN([$j directory ($dirname) does not exist!]) fi done dnl This OS-based decision-making is poor autotools practice; dnl feature-based mechanisms are strongly preferred. dnl dnl So keep this section to a bare minimum; regard as a "necessary evil". 
REBOOT_OPTIONS="-f" POWEROFF_OPTIONS="-f" case "$host_os" in *bsd*) LIBS="-L/usr/local/lib" CPPFLAGS="$CPPFLAGS -I/usr/local/include" INIT_EXT=".sh" ;; *solaris*) REBOOT_OPTIONS="-n" POWEROFF_OPTIONS="-n" ;; *linux*) AC_DEFINE_UNQUOTED(ON_LINUX, 1, Compiling for Linux platform) POWEROFF_OPTIONS="-nf" REBOOT_OPTIONS="-nf" ;; darwin*) AC_DEFINE_UNQUOTED(ON_DARWIN, 1, Compiling for Darwin platform) LIBS="$LIBS -L${prefix}/lib" CFLAGS="$CFLAGS -I${prefix}/include" ;; esac AC_SUBST(INIT_EXT) AC_DEFINE_UNQUOTED(HA_LOG_FACILITY, LOG_DAEMON, Default logging facility) AC_MSG_NOTICE(Host CPU: $host_cpu) case "$host_cpu" in ppc64|powerpc64) case $CFLAGS in *powerpc64*) ;; *) if test "$GCC" = yes; then CFLAGS="$CFLAGS -m64" fi ;; esac esac AC_MSG_CHECKING(which format is needed to print uint64_t) case "$host_cpu" in s390x)U64T="%lu";; *64*) U64T="%lu";; *) U64T="%llu";; esac AC_MSG_RESULT($U64T) AC_DEFINE_UNQUOTED(U64T, "$U64T", Correct printf format for logging uint64_t) dnl Variables needed for substitution AC_CHECK_HEADERS(heartbeat/glue_config.h) if test "$ac_cv_header_heartbeat_glue_config_h" = "yes"; then OCF_ROOT_DIR=`extract_header_define heartbeat/glue_config.h OCF_ROOT_DIR` else enable_libnet=no fi AC_DEFINE_UNQUOTED(OCF_ROOT_DIR,"$OCF_ROOT_DIR", OCF root directory - specified by the OCF standard) AC_SUBST(OCF_ROOT_DIR) GLUE_STATE_DIR=${localstatedir}/run AC_DEFINE_UNQUOTED(GLUE_STATE_DIR,"$GLUE_STATE_DIR", Where to keep state files and sockets) AC_SUBST(GLUE_STATE_DIR) AC_DEFINE_UNQUOTED(HA_VARRUNDIR,"$GLUE_STATE_DIR", Where Heartbeat keeps state files and sockets - old name) HA_VARRUNDIR="$GLUE_STATE_DIR" AC_SUBST(HA_VARRUNDIR) # Expand $prefix eval HA_RSCTMPDIR="`eval echo ${HA_RSCTMPDIR}`" AC_DEFINE_UNQUOTED(HA_RSCTMPDIR,"$HA_RSCTMPDIR", Where Resouce agents keep state files) AC_SUBST(HA_RSCTMPDIR) dnl Eventually move out of the heartbeat dir tree and create symlinks when needed HA_VARLIBHBDIR=${localstatedir}/lib/heartbeat 
AC_DEFINE_UNQUOTED(HA_VARLIBHBDIR,"$HA_VARLIBHBDIR", Whatever this used to mean) AC_SUBST(HA_VARLIBHBDIR) OCF_RA_DIR="${OCF_ROOT_DIR}/resource.d/" AC_DEFINE_UNQUOTED(OCF_RA_DIR,"$OCF_RA_DIR", Location for OCF RAs) AC_SUBST(OCF_RA_DIR) if test "${prefix}" = "/usr"; then OCF_RA_DIR_PREFIX="$OCF_RA_DIR" else OCF_RA_DIR_PREFIX="${prefix}/$OCF_RA_DIR" fi AC_SUBST(OCF_RA_DIR_PREFIX) OCF_LIB_DIR="${OCF_ROOT_DIR}/lib/" AC_DEFINE_UNQUOTED(OCF_LIB_DIR,"$OCF_LIB_DIR", Location for shared code for OCF RAs) AC_SUBST(OCF_LIB_DIR) if test "${prefix}" = "/usr"; then OCF_LIB_DIR_PREFIX="$OCF_LIB_DIR" else OCF_LIB_DIR_PREFIX="${prefix}/$OCF_LIB_DIR" fi AC_SUBST(OCF_LIB_DIR_PREFIX) dnl =============================================== dnl rgmanager ras bits dnl =============================================== LOGDIR=${localstatedir}/log/cluster CLUSTERDATA=${datadir}/cluster AC_SUBST([LOGDIR]) AC_SUBST([CLUSTERDATA]) dnl =============================================== dnl Program Paths dnl =============================================== PATH="$PATH:/sbin:/usr/sbin:/usr/local/sbin:/usr/local/bin" export PATH AM_PATH_PYTHON AC_CHECK_PROGS(MAKE, gmake make) AC_PATH_PROGS(SSH, ssh, /usr/bin/ssh) AC_PATH_PROGS(SCP, scp, /usr/bin/scp) AC_PATH_PROGS(TAR, tar) AC_PATH_PROGS(MD5, md5) AC_PATH_PROGS(TEST, test) AC_PATH_PROGS(PING, ping, /bin/ping) AC_PATH_PROGS(IFCONFIG, ifconfig, /sbin/ifconfig) AC_PATH_PROGS(MAILCMD, mailx mail, mail) AC_PATH_PROGS(EGREP, egrep) AC_PATH_PROGS(PKGCONFIG, pkg-config) AC_SUBST(MAILCMD) AC_SUBST(EGREP) AC_SUBST(SHELL) AC_SUBST(PING) AC_SUBST(TEST) AC_PATH_PROGS(ROUTE, route) AC_DEFINE_UNQUOTED(ROUTE, "$ROUTE", path to route command) AC_MSG_CHECKING(ifconfig option to list interfaces) for IFCONFIG_A_OPT in "-A" "-a" "" do $IFCONFIG $IFCONFIG_A_OPT > /dev/null 2>&1 if test "$?" 
= 0 then AC_DEFINE_UNQUOTED(IFCONFIG_A_OPT, "$IFCONFIG_A_OPT", option for ifconfig command) AC_MSG_RESULT($IFCONFIG_A_OPT) break fi done AC_SUBST(IFCONFIG_A_OPT) if test x"${MAKE}" = x""; then AC_MSG_ERROR(You need (g)make installed in order to build ${PACKAGE}) fi dnl =============================================== dnl Libraries dnl =============================================== AC_CHECK_LIB(socket, socket) AC_CHECK_LIB(gnugetopt, getopt_long) dnl if available if test x"${PKGCONFIG}" = x""; then AC_MSG_ERROR(You need pkgconfig installed in order to build ${PACKAGE}) fi if test "x${enable_thread_safe}" = "xyes"; then GPKGNAME="gthread-2.0" else GPKGNAME="glib-2.0" fi if $PKGCONFIG --exists $GPKGNAME then GLIBCONFIG="$PKGCONFIG $GPKGNAME" else set -x echo PKG_CONFIG_PATH=$PKG_CONFIG_PATH $PKGCONFIG --exists $GPKGNAME; echo $? $PKGCONFIG --cflags $GPKGNAME; echo $? $PKGCONFIG $GPKGNAME; echo $? set +x AC_MSG_ERROR(You need glib2-devel installed in order to build ${PACKAGE}) fi AC_MSG_RESULT(using $GLIBCONFIG) if test "X$GLIBCONFIG" != X; then AC_MSG_CHECKING(for special glib includes: ) GLIBHEAD=`$GLIBCONFIG --cflags` AC_MSG_RESULT($GLIBHEAD) CPPFLAGS="$CPPFLAGS $GLIBHEAD" AC_MSG_CHECKING(for glib library flags) GLIBLIB=`$GLIBCONFIG --libs` AC_MSG_RESULT($GLIBLIB) LIBS="$LIBS $GLIBLIB" fi dnl ======================================================================== dnl Headers dnl ======================================================================== AC_HEADER_STDC AC_CHECK_HEADERS(sys/socket.h) AC_CHECK_HEADERS(sys/sockio.h) AC_CHECK_HEADERS([arpa/inet.h]) AC_CHECK_HEADERS([fcntl.h]) AC_CHECK_HEADERS([limits.h]) AC_CHECK_HEADERS([malloc.h]) AC_CHECK_HEADERS([netdb.h]) AC_CHECK_HEADERS([netinet/in.h]) AC_CHECK_HEADERS([sys/file.h]) AC_CHECK_HEADERS([sys/ioctl.h]) AC_CHECK_HEADERS([sys/param.h]) AC_CHECK_HEADERS([sys/time.h]) AC_CHECK_HEADERS([syslog.h]) dnl ======================================================================== dnl Functions dnl 
======================================================================== AC_FUNC_FORK AC_FUNC_STRNLEN AC_CHECK_FUNCS([alarm gettimeofday inet_ntoa memset mkdir socket uname]) AC_CHECK_FUNCS([strcasecmp strchr strdup strerror strrchr strspn strstr strtol strtoul]) -dnl 'reboot()' system call: one argument (e.g. Linux) or two (e.g. Solaris)? -dnl -AC_CACHE_CHECK([number of arguments in reboot system call], - ac_cv_REBOOT_ARGS,[ - AC_TRY_COMPILE( - [#include ], - [(void)reboot(0);], - ac_cv_REBOOT_ARGS=1, - [AC_TRY_COMPILE( - [#include ], - [(void)reboot(0,(void *)0);], - ac_cv_REBOOT_ARGS=2, - ac_cv_REBOOT_ARGS=0 - )], - ac_cv_REBOOT_ARGS=0 - ) - ] -) -dnl Argument count of 0 suggests no known 'reboot()' call. -if test "$ac_cv_REBOOT_ARGS" -ge "1"; then - AC_DEFINE_UNQUOTED(REBOOT_ARGS,$ac_cv_REBOOT_ARGS,[number of arguments for reboot system call]) -fi - AC_PATH_PROGS(REBOOT, reboot, /sbin/reboot) AC_SUBST(REBOOT) AC_SUBST(REBOOT_OPTIONS) AC_DEFINE_UNQUOTED(REBOOT, "$REBOOT", path to the reboot command) AC_DEFINE_UNQUOTED(REBOOT_OPTIONS, "$REBOOT_OPTIONS", reboot options) AC_PATH_PROGS(POWEROFF_CMD, poweroff, /sbin/poweroff) AC_SUBST(POWEROFF_CMD) AC_SUBST(POWEROFF_OPTIONS) AC_DEFINE_UNQUOTED(POWEROFF_CMD, "$POWEROFF_CMD", path to the poweroff command) AC_DEFINE_UNQUOTED(POWEROFF_OPTIONS, "$POWEROFF_OPTIONS", poweroff options) AC_PATH_PROGS(XSLTPROC, xsltproc) AM_CONDITIONAL(BUILD_DOC, test "x$XSLTPROC" != "x" ) if test "x$XSLTPROC" = "x"; then AC_MSG_WARN([xsltproc not installed, unable to (re-)build manual pages]) fi AC_SUBST(XSLTPROC) AC_PATH_PROGS(POD2MAN, pod2man) AM_CONDITIONAL(BUILD_POD_DOC, test "x$POD2MAN" != "x" ) if test "x$POD2MAN" = "x"; then AC_MSG_WARN([pod2man not installed, unable to (re-)build ldirector manual page]) fi AC_SUBST(POD2MAN) dnl ======================================================================== dnl Functions dnl ======================================================================== AC_CHECK_FUNCS(getopt, 
AC_DEFINE(HAVE_DECL_GETOPT, 1, [Have getopt function])) dnl ======================================================================== dnl sfex dnl ======================================================================== build_sfex=no case $host_os in *Linux*|*linux*) if test "$ac_cv_header_heartbeat_glue_config_h" = "yes"; then build_sfex=yes fi ;; esac AM_CONDITIONAL(BUILD_SFEX, test "$build_sfex" = "yes" ) dnl ======================================================================== dnl tickle (needs port to BSD platforms) dnl ======================================================================== AC_CHECK_MEMBERS([struct iphdr.saddr],,,[[#include ]]) AM_CONDITIONAL(BUILD_TICKLE, test "$ac_cv_member_struct_iphdr_saddr" = "yes" ) dnl ======================================================================== dnl libnet dnl ======================================================================== libnet="" libnet_version="none" LIBNETLIBS="" LIBNETDEFINES="" AC_MSG_CHECKING(if libnet is required) libnet_fatal=$enable_libnet case $enable_libnet in no) ;; yes|libnet10|libnet11|10|11) libnet_fatal=yes;; try) case $host_os in *Linux*|*linux*) libnet_fatal=no;; *) libnet_fatal=yes;; dnl legacy behavior esac ;; *) libnet_fatal=yes; enable_libnet=try;; esac AC_MSG_RESULT($libnet_fatal) if test "x$enable_libnet" != "xno"; then AC_PATH_PROGS(LIBNETCONFIG, libnet-config) AC_CHECK_LIB(nsl, t_open) dnl -lnsl AC_CHECK_LIB(socket, socket) dnl -lsocket AC_CHECK_LIB(net, libnet_get_hwaddr, LIBNETLIBS=" -lnet", []) fi AC_MSG_CHECKING(for libnet) if test "x$LIBNETLIBS" != "x" -o "x$enable_libnet" = "xlibnet11"; then LIBNETDEFINES="" if test "$ac_cv_lib_nsl_t_open" = yes; then LIBNETLIBS="-lnsl $LIBNETLIBS" fi if test "$ac_cv_lib_socket_socket" = yes; then LIBNETLIBS="-lsocket $LIBNETLIBS" fi libnet=net libnet_version="libnet1.1" fi if test "x$enable_libnet" = "xtry" -o "x$enable_libnet" = "xlibnet10"; then if test "x$LIBNETLIBS" = x -a "x${LIBNETCONFIG}" != "x" ; then 
LIBNETDEFINES="`$LIBNETCONFIG --defines` `$LIBNETCONFIG --cflags`"; LIBNETLIBS="`$LIBNETCONFIG --libs`"; libnet_version="libnet1.0 (old)" case $LIBNETLIBS in *-l*) libnet=`echo $LIBNETLIBS | sed 's%.*-l%%'`;; *) libnet_version=none;; esac CPPFLAGS="$CPPFLAGS $LIBNETDEFINES" AC_CHECK_HEADERS(libnet.h) if test "$ac_cv_header_libnet_h" = no; then libnet_version=none fi fi fi AC_MSG_RESULT(found $libnet_version) if test "$libnet_version" = none; then LIBNETLIBS="" LIBNETDEFINES="" if test $libnet_fatal = yes; then AC_MSG_ERROR(libnet not found) fi else AC_CHECK_LIB($libnet,libnet_init, [new_libnet=yes; AC_DEFINE(HAVE_LIBNET_1_1_API, 1, Libnet 1.1 API)], [new_libnet=no; AC_DEFINE(HAVE_LIBNET_1_0_API, 1, Libnet 1.0 API)],$LIBNETLIBS) AC_SUBST(LIBNETLIBS) fi if test "$new_libnet" = yes; then AC_MSG_CHECKING(for libnet API 1.1.4: ) save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -fgnu89-inline -Wall -Werror" AC_COMPILE_IFELSE([#include int main(){libnet_t *l=NULL; libnet_pblock_record_ip_offset(l, l->total_size); return(0); }], [AC_MSG_RESULT(no)], [AC_DEFINE(HAVE_LIBNET_1_1_4_API, 1, Libnet 1.1.4 API) AC_MSG_RESULT(yes)]) CFLAGS="$save_CFLAGS" fi sendarp_linux=0 case $host_os in *Linux*|*linux*) sendarp_linux=1;; esac AC_SUBST(LIBNETLIBS) AC_SUBST(LIBNETDEFINES) AM_CONDITIONAL(SENDARP_LINUX, test $sendarp_linux = 1 ) AM_CONDITIONAL(USE_LIBNET, test "x$libnet_version" != "xnone" ) dnl ************************************************************************ dnl * Check for netinet/icmp6.h to enable the IPv6addr resource agent AC_CHECK_HEADERS(netinet/icmp6.h,[],[],[#include ]) AM_CONDITIONAL(USE_IPV6ADDR, test "$ac_cv_header_netinet_icmp6_h" = yes ) dnl ======================================================================== dnl Compiler flags dnl ======================================================================== dnl Make sure that CFLAGS is not exported. If the user did dnl not have CFLAGS in their environment then this should have dnl no effect. 
However if CFLAGS was exported from the user's dnl environment, then the new CFLAGS will also be exported dnl to sub processes. CC_ERRORS="" CC_EXTRAS="" if export | fgrep " CFLAGS=" > /dev/null; then export -n CFLAGS || true # We don't want to bomb out if this fails SAVED_CFLAGS="$CFLAGS" unset CFLAGS CFLAGS="$SAVED_CFLAGS" unset SAVED_CFLAGS fi if test "$GCC" != yes; then CFLAGS="$CFLAGS -g" enable_fatal_warnings=no else CFLAGS="$CFLAGS -ggdb3" # We had to eliminate -Wnested-externs because of libtool changes # Also remove -Waggregate-return because we use one libnet # call which returns a struct EXTRA_FLAGS="-fgnu89-inline -fstack-protector-all -Wall -Wbad-function-cast -Wcast-qual -Wcast-align -Wdeclaration-after-statement -Wendif-labels -Wfloat-equal -Wformat=2 -Wformat-security -Wformat-nonliteral -Winline -Wmissing-prototypes -Wmissing-declarations -Wmissing-format-attribute -Wnested-externs -Wno-long-long -Wno-strict-aliasing -Wpointer-arith -Wstrict-prototypes -Wunsigned-char -Wwrite-strings" # Additional warnings it might be nice to enable one day # -Wshadow # -Wunreachable-code for j in $EXTRA_FLAGS do if cc_supports_flag $j then CC_EXTRAS="$CC_EXTRAS $j" fi done dnl In lib/ais/Makefile.am there's a gcc option available as of v4.x GCC_MAJOR=`gcc -v 2>&1 | awk 'END{print $3}' | sed 's/[.].*//'` AM_CONDITIONAL(GCC_4, test "${GCC_MAJOR}" = 4) dnl System specific options case "$host_os" in *linux*|*bsd*) if test "${enable_fatal_warnings}" = "unknown"; then enable_fatal_warnings=yes fi ;; esac if test "x${enable_fatal_warnings}" != xno && cc_supports_flag -Werror ; then enable_fatal_warnings=yes else enable_fatal_warnings=no fi if test "x${enable_ansi}" != xno && cc_supports_flag -std=iso9899:199409 ; then AC_MSG_NOTICE(Enabling ANSI Compatibility) CC_EXTRAS="$CC_EXTRAS -ansi -D_GNU_SOURCE -DANSI_ONLY" fi AC_MSG_NOTICE(Activated additional gcc flags: ${CC_EXTRAS}) fi CFLAGS="$CFLAGS $CC_EXTRAS" NON_FATAL_CFLAGS="$CFLAGS" AC_SUBST(NON_FATAL_CFLAGS) dnl dnl We 
reset CFLAGS to include our warnings *after* all function dnl checking goes on, so that our warning flags don't keep the dnl AC_*FUNCS() calls above from working. In particular, -Werror will dnl *always* cause us troubles if we set it before here. dnl dnl if test "x${enable_fatal_warnings}" = xyes ; then AC_MSG_NOTICE(Enabling Fatal Warnings) CFLAGS="$CFLAGS -Werror" fi AC_SUBST(CFLAGS) dnl This is useful for use in Makefiles that need to remove one specific flag CFLAGS_COPY="$CFLAGS" AC_SUBST(CFLAGS_COPY) AC_SUBST(LOCALE) AC_SUBST(CC) AC_SUBST(MAKE) dnl The Makefiles and shell scripts we output AC_CONFIG_FILES(Makefile \ include/Makefile \ heartbeat/Makefile \ heartbeat/ocf-binaries \ heartbeat/ocf-directories \ heartbeat/ocf-shellfuncs \ heartbeat/shellfuncs \ tools/Makefile \ tools/ocf-tester \ tools/ocft/Makefile \ tools/ocft/ocft \ tools/ocft/caselib \ tools/ocft/README \ tools/ocft/README.zh_CN \ ldirectord/Makefile \ ldirectord/ldirectord \ ldirectord/init.d/Makefile \ ldirectord/init.d/ldirectord \ ldirectord/init.d/ldirectord.debian \ ldirectord/init.d/ldirectord.debian.default \ ldirectord/logrotate.d/Makefile \ ldirectord/OCF/Makefile \ ldirectord/OCF/ldirectord \ doc/Makefile \ doc/man/Makefile \ rgmanager/Makefile \ rgmanager/src/Makefile \ rgmanager/src/resources/Makefile \ rgmanager/src/resources/utils/Makefile \ ) dnl Now process the entire list of files added by previous dnl calls to AC_CONFIG_FILES() AC_OUTPUT() dnl ***************** dnl Configure summary dnl ***************** AC_MSG_RESULT([]) AC_MSG_RESULT([$PACKAGE configuration:]) AC_MSG_RESULT([ Version = ${VERSION}]) AC_MSG_RESULT([ Build Version = $Format:%H$]) AC_MSG_RESULT([ Features =${PKG_FEATURES}]) AC_MSG_RESULT([]) AC_MSG_RESULT([ Prefix = ${prefix}]) AC_MSG_RESULT([ Executables = ${sbindir}]) AC_MSG_RESULT([ Man pages = ${mandir}]) AC_MSG_RESULT([ Libraries = ${libdir}]) AC_MSG_RESULT([ Header files = ${includedir}]) AC_MSG_RESULT([ Arch-independent files = ${datadir}]) 
AC_MSG_RESULT([ Documentation = ${docdir}]) AC_MSG_RESULT([ State information = ${localstatedir}]) AC_MSG_RESULT([ System configuration = ${sysconfdir}]) AC_MSG_RESULT([ RA state files = ${HA_RSCTMPDIR}]) AC_MSG_RESULT([ AIS Plugins = ${LCRSODIR}]) AC_MSG_RESULT([]) AC_MSG_RESULT([ CFLAGS = ${CFLAGS}]) AC_MSG_RESULT([ Libraries = ${LIBS}]) AC_MSG_RESULT([ Stack Libraries = ${CLUSTERLIBS}]) diff --git a/doc/dev-guides/ra-dev-guide.txt b/doc/dev-guides/ra-dev-guide.txt index 26421382e..af5e3b1b0 100644 --- a/doc/dev-guides/ra-dev-guide.txt +++ b/doc/dev-guides/ra-dev-guide.txt @@ -1,1985 +1,2015 @@ = The OCF Resource Agent Developer's Guide == Introduction This document is to serve as a guide and reference for all developers, maintainers, and contributors working on OCF (Open Cluster Framework) compliant cluster resource agents. It explains the anatomy and general functionality of a resource agent, illustrates the resource agent API, and provides valuable hints and tips to resource agent authors. === What is a resource agent? A resource agent is an executable that manages a cluster resource. No formal definition of a cluster resource exists, other than "anything a cluster manages is a resource." Cluster resources can be as diverse as IP addresses, file systems, database services, and entire virtual machines -- to name just a few examples. === Who or what uses a resource agent? Any Open Cluster Framework (OCF) compliant cluster management application is capable of managing resources using the resource agents described in this document. At the time of writing, two OCF compliant cluster management applications exist for the Linux platform: * _Pacemaker_, a cluster manager supporting both the Corosync and Heartbeat cluster messaging frameworks. Pacemaker evolved out of the Linux-HA project. * _RGmanager_, the cluster manager bundled in Red Hat Cluster Suite. It supports the Corosync cluster messaging framework exclusively. 
=== Which language is a resource agent written in? An OCF compliant resource agent can be implemented in _any_ programming language. The API is not language specific. However, most resource agents are implemented as shell scripts, which is why this guide primarily uses example code written in shell language. == API definitions === Environment variables A resource agent receives all configuration information about the resource it manages via environment variables. The names of these environment variables are always the name of the resource parameter, prefixed with +OCF_RESKEY_+. For example, if the resource has an +ip+ parameter set to +192.168.1.1+, then the resource agent will have access to an environment variable +OCF_RESKEY_ip+ holding that value. For any resource parameter that is not required to be set by the user -- that is, its parameter definition in the resource agent metadata does not specify +required="true"+ -- then the resource agent must * Provide a reasonable default. This should be advertised in the metadata. By convention, the resource agent uses a variable named +OCF_RESKEY_<parametername>_default+ that holds this default. * Alternatively, cater correctly for the value being empty. In addition, the cluster manager may also support _meta_ resource parameters. These do not apply directly to the resource configuration, but rather specify _how_ the cluster resource manager is expected to manage the resource. For example, the Pacemaker cluster manager uses the +target-role+ meta parameter to specify whether the resource should be started or stopped. Meta parameters are passed into the resource agent in the +OCF_RESKEY_CRM_meta_+ namespace, with any hyphens converted to underscores. Thus, the +target-role+ attribute maps to an environment variable named +OCF_RESKEY_CRM_meta_target_role+. === Actions Any resource agent must support one command-line argument which specifies the action the resource agent is about to execute.
The following actions must be supported by any resource agent: * +start+ -- starts the resource. * +stop+ -- shuts down the resource. * +monitor+ -- queries the resource for its state. * +meta-data+ -- dumps the resource agent metadata. In addition, resource agents may optionally support the following actions: * +promote+ -- turns a resource into the +Master+ role (Master/Slave resources only). * +demote+ -- turns a resource into the +Slave+ role (Master/Slave resources only). * +migrate_to+ and +migrate_from+ -- implement live migration of resources. * +validate-all+ -- validates a resource's configuration. * +usage+ or +help+ -- displays a usage message when the resource agent is invoked from the command line, rather than by the cluster manager. * +status+ -- historical (deprecated) synonym for +monitor+. === Timeouts Action timeouts are enforced outside the resource agent proper. It is the cluster manager's responsibility to monitor how long a resource agent action has been running, and terminate it if it does not meet its completion deadline. Thus, resource agents need not themselves check for any timeout expiry. Resource agents can, however, _advise_ the user of sensible timeout values (which, when correctly set, will be duly enforced by the cluster manager). See <<_metadata,the following section>> for details on how a resource agent advertises its suggested timeouts. === Metadata Every resource agent must describe its own purpose and supported parameters in a set of XML metadata. This metadata is used by cluster management applications for on-line help, and resource agent man pages are generated from it as well. The following is a fictitious set of metadata from an imaginary resource agent: [source,xml] -------------------------------------------------------------------------- 0.1 This is a fictitious example resource agent written for the OCF Resource Agent Developers Guide. 
Example resource agent for budding OCF RA developers Number of eggs, an example numeric parameter Number of eggs Enable superfrobnication, an example boolean parameter Enable superfrobnication Data directory, an example string parameter Data directory -------------------------------------------------------------------------- The +resource-agent+ element, of which there must only be one per resource agent, defines the resource agent +name+ and +version+. The +longdesc+ and +shortdesc+ elements in +resource-agent+ provide a long and short description of the resource agent's functionality. While +shortdesc+ is a one-line description of what the resource agent does and is usually used in terse listings, +longdesc+ should give a full-blown description of the resource agent in as much detail as possible. The +parameters+ element describes the resource agent parameters, and should hold any number of +parameter+ children -- one for each parameter that the resource agent supports. Every +parameter+ should, like the +resource-agent+ as a whole, come with a +shortdesc+ and a +longdesc+, and also a +content+ child that describes the parameter's expected content. On the +content+ element, there may be four different attributes: * +type+ describes the parameter type (+string+, +integer+, or +boolean+). If unset, +type+ defaults to +string+. * +required+ indicates whether setting the parameter is mandatory (+required="true"+) or optional (+required="false"+). * For optional parameters, it is customary to provide a sensible default via the +default+ attribute. * Finally, the +unique+ attribute (allowed values: +true+ or +false+) indicates that a specific value must be unique across the cluster, for this parameter of this particular resource type. For example, a highly available floating IP address is declared +unique+ -- as that one IP address should run only once throughout the cluster, avoiding duplicates. 
The +actions+ list defines the actions that the resource agent advertises as supported. Every +action+ should list its own +timeout+ value. This is a hint to the user what _minimal_ timeout should be configured for the action. This is meant to cater for the fact that some resources are quick to start and stop (IP addresses or filesystems, for example), while others may take several minutes to do so (such as databases). In addition, recurring actions (such as +monitor+) should also specify a recommended minimum +interval+, which is the time between two consecutive invocations of the same action. Like +timeout+, this value does not constitute a default -- it is merely a hint for the user which action interval to configure, at minimum. == Return codes For any invocation, resource agents must exit with a defined return code that informs the caller of the outcome of the invoked action. The return codes are explained in detail in the following subsections. === +OCF_SUCCESS+ (0) The action completed successfully. This is the expected return code for any successful +start+, +stop+, +promote+, +demote+, +migrate_from+, +migrate_to+, +meta-data+, +help+, and +usage+ action. For +monitor+ (and its deprecated alias, +status+), however, a modified convention applies: * For primitive (stateless) resources, +OCF_SUCCESS+ from +monitor+ means that the resource is running. Non-running and gracefully shut-down resources must instead return +OCF_NOT_RUNNING+. * For master/slave (stateful) resources, +OCF_SUCCESS+ from +monitor+ means that the resource is running _in Slave mode_. Resources running in Master mode must instead return +OCF_RUNNING_MASTER+, and gracefully shut-down resources must instead return +OCF_NOT_RUNNING+. === +OCF_ERR_GENERIC+ (1) The action returned a generic error. A resource agent should use this exit code only when none of the more specific error codes, defined below, accurately describes the problem.
The cluster resource manager interprets this exit code as a _soft_ error. This means that unless specifically configured otherwise, the resource manager will attempt to recover a resource which failed with +OCF_ERR_GENERIC+ in-place -- usually by restarting the resource on the same node. === +OCF_ERR_ARGS+ (2) The resource agent was invoked with incorrect arguments. This is a safety net "can't happen" error which the resource agent should only return when invoked with, for example, an incorrect number of command line arguments. NOTE: The resource agent should not return this error when instructed to perform an action that it does not support. Instead, under those circumstances, it should return +OCF_ERR_UNIMPLEMENTED+. === +OCF_ERR_UNIMPLEMENTED+ (3) The resource agent was instructed to execute an action that the agent does not implement. Not all resource agent actions are mandatory. +promote+, +demote+, +migrate_to+, +migrate_from+, and +notify+, are all optional actions which the resource agent may or may not implement. When a non-stateful resource agent is misconfigured as a master/slave resource, for example, then the resource agent should alert the user about this misconfiguration by returning +OCF_ERR_UNIMPLEMENTED+ on the +promote+ and +demote+ actions. === +OCF_ERR_PERM+ (4) The action failed due to insufficient permissions. This may be due to the agent not being able to open a certain file, to listen on a specific socket, to write to a directory, or similar. The cluster resource manager interprets this exit code as a _hard_ error. This means that unless specifically configured otherwise, the resource manager will attempt to recover a resource which failed with this error by restarting the resource on a different node (where the permission problem may not exist). === +OCF_ERR_INSTALLED+ (5) The action failed because a required component is missing on the node where the action was executed. 
This may be due to a required binary not being executable, or a vital configuration file being unreadable. The cluster resource manager interprets this exit code as a _hard_ error. This means that unless specifically configured otherwise, the resource manager will attempt to recover a resource which failed with this error by restarting the resource on a different node (where the required files or binaries may be present). === +OCF_ERR_CONFIGURED+ (6) The action failed because the user misconfigured the resource. For example, the user may have configured an alphanumeric string for a parameter that really should be an integer. The cluster resource manager interprets this exit code as a _fatal_ error. Since this is a configuration error that is present cluster-wide, it would make no sense to recover such a resource on a different node, let alone in-place. When a resource fails with this error, the cluster manager will attempt to shut down the resource, and wait for administrator intervention. === +OCF_NOT_RUNNING+ (7) The resource was found not to be running. This is an exit code that may be returned by the +monitor+ action exclusively. Note that this implies that the resource has either _gracefully_ shut down, or has never been started. If the resource is not running due to an error condition, the +monitor+ action should instead return one of the +OCF_ERR_+ exit codes or +OCF_FAILED_MASTER+. === +OCF_RUNNING_MASTER+ (8) The resource was found to be running in the +Master+ role. This applies only to stateful (Master/Slave) resources, and only to their +monitor+ action. Note that there is no specific exit code for "running in slave mode". This is because there is no functional distinction between a primitive resource running normally, and a stateful resource running as a slave. The +monitor+ action of a stateful resource running normally in the +Slave+ role should simply return +OCF_SUCCESS+.
=== +OCF_FAILED_MASTER+ (9) The resource was found to have failed in the +Master+ role. This applies only to stateful (Master/Slave) resources, and only to their +monitor+ action. The cluster resource manager interprets this exit code as a _soft_ error. This means that unless specifically configured otherwise, the resource manager will attempt to recover a resource which failed with +$OCF_FAILED_MASTER+ in-place -- usually by demoting, stopping, starting and then promoting the resource on the same node. == Resource agent structure A typical (shell-based) resource agent contains standard structural items, in the order as listed in this section. It describes the expected behavior of a resource agent with respect to the various actions it supports, using a fictitious resource agent named +foobar+ as an example. === Resource agent interpreter Any resource agent implemented as a script must specify its interpreter using standard "shebang" (+#!+) header syntax. [source,bash] -------------------------------------------------------------------------- #!/bin/sh -------------------------------------------------------------------------- If a resource agent is written in shell, specifying the generic shell interpreter (+#!/bin/sh+) is generally preferred, though not required. Resource agents declared as +/bin/sh+ compatible must not use constructs native to a specific shell (such as, for example, +${!variable}+ syntax native to +bash+). It is advisable to occasionally run such resource agents through a sanitization utility such as +checkbashisms+. It is considered a regression to introduce a patch that will make a previously +sh+ compatible resource agent suitable only for +bash+, +ksh+, or any other non-generic shell. It is, however, perfectly acceptable for a new resource agent to explicitly define a specific shell, such as +/bin/bash+, as its interpreter.
=== Author and license information The resource agent should contain a comment listing the resource agent author(s) and/or copyright holder(s), and stating the license that applies to the resource agent: [source,bash] -------------------------------------------------------------------------- # # Resource Agent for managing foobar resources. # # License: GNU General Public License (GPL) # (c) 2008-2010 John Doe, Jane Roe, # and Linux-HA contributors -------------------------------------------------------------------------- When a resource agent refers to a license for which multiple versions exist, it is assumed that the current version applies. === Initialization -Any shell resource agent should source the +.ocf-shellfuncs+ function +Any shell resource agent should source the +ocf-shellfuncs+ function library. With the syntax below, this is done in terms of +$OCF_FUNCTIONS_DIR+, which -- for testing purposes, and also for generating documentation -- may be overridden from the command line. [source,bash] -------------------------------------------------------------------------- # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs -------------------------------------------------------------------------- === Functions implementing resource agent actions What follows next are the functions implementing the resource agent's advertised actions. The individual actions are described in detail in <<_resource_agent_actions>>. === Execution block This is the part of the resource agent that actually executes when the resource agent is invoked. 
It typically follows a fairly standard structure: [source,bash] -------------------------------------------------------------------------- # Make sure meta-data and usage always succeed case $__OCF_ACTION in meta-data) foobar_meta_data exit $OCF_SUCCESS ;; usage|help) foobar_usage exit $OCF_SUCCESS ;; esac # Anything other than meta-data and usage must pass validation foobar_validate_all || exit $? # Translate each action into the appropriate function call case $__OCF_ACTION in start) foobar_start;; stop) foobar_stop;; status|monitor) foobar_monitor;; promote) foobar_promote;; demote) foobar_demote;; reload) ocf_log info "Reloading..." foobar_start ;; validate-all) ;; *) foobar_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? # The resource agent may optionally log a debug message ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc" exit $rc -------------------------------------------------------------------------- == Resource agent actions Each action is typically implemented in a separate function or method in the resource agent. By convention, these are usually named +<agent>_<action>+, so the function implementing the +start+ action in +foobar+ would be named +foobar_start()+. As a general rule, whenever the resource agent encounters an error that it is not able to recover from, it is permitted to immediately exit, throw an exception, or otherwise cease execution. Examples for this include configuration issues, missing binaries, permission problems, etc. It is not necessary to pass these errors up the call stack. It is the cluster manager's responsibility to initiate the appropriate recovery action based on the user's configuration. The resource agent should not guess at said configuration. === +start+ action When invoked with the +start+ action, the resource agent must start the resource if it is not yet running. This means that the agent must verify the resource's configuration, query its state, and then start it only if it is not running.
A common way of doing this would be to invoke the +validate_all+ and +monitor+ functions first, as in the following example: [source,bash] -------------------------------------------------------------------------- foobar_start() { # exit immediately if configuration is not valid foobar_validate_all || exit $? # if resource is already running, bail out early if foobar_monitor; then ocf_log info "Resource is already running" return $OCF_SUCCESS fi # actually start up the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ... # After the resource has been started, check whether it started up # correctly. If the resource starts asynchronously, the agent may # spin on the monitor function here -- if the resource does not # start up within the defined timeout, the cluster manager will # consider the start action failed while ! foobar_monitor; do ocf_log debug "Resource has not started yet, waiting" sleep 1 done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- === +stop+ action When invoked with the +stop+ action, the resource agent must stop the resource, if it is running. This means that the agent must verify the resource configuration, query its state, and then stop it only if it is currently running. A common way of doing this would be to invoke the +validate_all+ and +monitor+ functions first. It is important to understand that +stop+ is a force operation -- the resource agent must do everything in its power to shut down the resource, short of rebooting the node or shutting it off. Consider the following example: [source,bash] -------------------------------------------------------------------------- foobar_stop() { local rc # exit immediately if configuration is not valid foobar_validate_all || exit $? foobar_monitor rc=$? case "$rc" in "$OCF_SUCCESS") # Currently running.
Normal, expected behavior. ocf_log debug "Resource is currently running" ;; "$OCF_RUNNING_MASTER") # Running as a Master. Need to demote before stopping. ocf_log info "Resource is currently running as Master" foobar_demote || \ ocf_log warn "Demote failed, trying to stop anyway" ;; "$OCF_NOT_RUNNING") # Currently not running. Nothing to do. ocf_log info "Resource is already stopped" return $OCF_SUCCESS ;; esac # actually shut down the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ... # After the resource has been stopped, check whether it shut down # correctly. If the resource stops asynchronously, the agent may # spin on the monitor function here -- if the resource does not # shut down within the defined timeout, the cluster manager will # consider the stop action failed while foobar_monitor; do ocf_log debug "Resource has not stopped yet, waiting" sleep 1 done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- NOTE: The expected exit code for a successful stop operation is +$OCF_SUCCESS+, _not_ +$OCF_NOT_RUNNING+. IMPORTANT: A failed stop operation is a potentially dangerous situation which the cluster manager will almost invariably try to resolve by means of node fencing. In other words, the cluster manager will forcibly evict from the cluster a node on which a stop operation has failed. While this measure serves ultimately to protect data, it does cause disruption to applications and their users. Thus, a resource agent should make sure that it exits with an error only if all avenues for proper resource shutdown have been exhausted. === +monitor+ action The +monitor+ action queries the current status of a resource. 
It must discern between three different states: * resource is currently running (return +$OCF_SUCCESS+); * resource has stopped gracefully (return +$OCF_NOT_RUNNING+); * resource has run into a problem and must be considered failed (return the appropriate +$OCF_ERR_+ code to indicate the nature of the problem). [source,bash] -------------------------------------------------------------------------- foobar_monitor() { local rc # exit immediately if configuration is not valid foobar_validate_all || exit $? ocf_run frobnicate --test # This example assumes the following exit code convention # for frobnicate: # 0: running, and fully caught up with master # 1: gracefully stopped # any other: error case "$?" in 0) rc=$OCF_SUCCESS ocf_log debug "Resource is running" ;; 1) rc=$OCF_NOT_RUNNING ocf_log debug "Resource is not running" ;; *) ocf_log err "Resource has failed" exit $OCF_ERR_GENERIC esac return $rc } -------------------------------------------------------------------------- Stateful (master/slave) resource agents may use a more elaborate monitoring scheme where they can provide "hints" to the cluster manager identifying which instance is best suited to assume the +Master+ role. <<_specifying_a_master_preference>> explains the details. NOTE: The cluster manager may invoke the +monitor+ action for a _probe_, which is a test whether the resource is currently running. Normally, the monitor operation would behave exactly the same during a probe and a "real" monitor action. If a specific resource does require special treatment for probes, however, the +ocf_is_probe+ convenience function is available in the OCF shell functions library for that purpose. === +validate-all+ action The +validate-all+ action tests for correct resource agent configuration and a working environment. +validate-all+ should exit with one of the following return codes: * +$OCF_SUCCESS+ -- all is well, the configuration is valid and usable. 
* +$OCF_ERR_CONFIGURED+ -- the user has misconfigured the resource. * +$OCF_ERR_INSTALLED+ -- the resource has possibly been configured correctly, but a vital component is missing on the node where +validate-all+ is being executed. * +$OCF_ERR_PERM+ -- the resource is configured correctly and is not missing any required components, but is suffering from a permission issue (such as not being able to create a necessary file). +validate-all+ is usually wrapped in a function that is not only called when explicitly invoking the corresponding action, but also -- as a sanity check -- from just about any other function. Therefore, the resource agent author must keep in mind that the function may be invoked during the +start+, +stop+, and +monitor+ operations, and also during probes. Probes pose a separate challenge for validation. During a probe (when the cluster manager may expect the resource _not_ to be running on the node where the probe is executed), some required components may be _expected_ to not be available on the affected node. For example, this includes any shared data on storage devices not available for reading during the probe. The +validate-all+ function may thus need to treat probes specially, using the +ocf_is_probe+ convenience function: [source,bash] -------------------------------------------------------------------------- foobar_validate_all() { # Test for configuration errors first if ! ocf_is_decimal $OCF_RESKEY_eggs; then ocf_log err "eggs is not numeric!" exit $OCF_ERR_CONFIGURED fi # Test for required binaries check_binary frobnicate # Check for data directory (this may be on shared storage, so # disable this test during probes) if ! ocf_is_probe; then if ! [ -d $OCF_RESKEY_datadir ]; then ocf_log err "$OCF_RESKEY_datadir does not exist or is not a directory!" 
exit $OCF_ERR_INSTALLED fi fi return $OCF_SUCCESS } -------------------------------------------------------------------------- === +meta-data+ action The +meta-data+ action dumps the resource agent metadata to standard output. The output must follow the metadata format as specified in <<_metadata>>. [source,bash] -------------------------------------------------------------------------- foobar_meta_data() { cat <<EOF <?xml version="1.0"?> <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> <resource-agent name="foobar" version="0.1"> <version>0.1</version> ... </resource-agent> EOF } -------------------------------------------------------------------------- === +promote+ action The +promote+ action is optional. It must only be supported by _stateful_ resource agents, which means agents that discern between two distinct _roles_: +Master+ and +Slave+. +Slave+ is functionally identical to the +Started+ state in a stateless resource agent. Thus, while a regular (stateless) resource agent only needs to implement +start+ and +stop+, a stateful resource agent must also support the +promote+ action to be able to make a transition between the +Started+ (+Slave+) and +Master+ roles. [source,bash] -------------------------------------------------------------------------- foobar_promote() { local rc # exit immediately if configuration is not valid foobar_validate_all || exit $? # test the resource's current state foobar_monitor rc=$? case "$rc" in "$OCF_SUCCESS") # Running as slave. Normal, expected behavior. ocf_log debug "Resource is currently running as Slave" ;; "$OCF_RUNNING_MASTER") # Already a master. Unexpected, but not a problem. ocf_log info "Resource is already running as Master" return $OCF_SUCCESS ;; "$OCF_NOT_RUNNING") # Currently not running. Need to start before promoting. ocf_log info "Resource is currently not running" foobar_start ;; *) # Failed resource. Let the cluster manager recover.
ocf_log err "Unexpected error, cannot promote" exit $rc ;; esac # actually promote the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ocf_run frobnicate --master-mode || exit $OCF_ERR_GENERIC # After the resource has been promoted, check whether the # promotion worked. If the resource promotion is asynchronous, the # agent may spin on the monitor function here -- if the resource # does not assume the Master role within the defined timeout, the # cluster manager will consider the promote action failed. while true; do foobar_monitor if [ $? -eq $OCF_RUNNING_MASTER ]; then ocf_log debug "Resource promoted" break else ocf_log debug "Resource still awaiting promotion" sleep 1 fi done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- === +demote+ action The +demote+ action is optional. It must only be supported by _stateful_ resource agents, which means agents that discern between two distinct _roles_: +Master+ and +Slave+. +Slave+ is functionally identical to the +Started+ state in a stateless resource agent. Thus, while a regular (stateless) resource agent only needs to implement +start+ and +stop+, a stateful resource agent must also support the +demote+ action to be able to make a transition between the +Master+ and +Started+ (+Slave+) roles. [source,bash] -------------------------------------------------------------------------- foobar_demote() { local rc # exit immediately if configuration is not valid foobar_validate_all || exit $? # test the resource's current state foobar_monitor rc=$? case "$rc" in "$OCF_RUNNING_MASTER") # Running as master. Normal, expected behavior. ocf_log debug "Resource is currently running as Master" ;; "$OCF_SUCCESS") # Already running as slave. Nothing to do.
ocf_log debug "Resource is currently running as Slave" return $OCF_SUCCESS ;; "$OCF_NOT_RUNNING") # Currently not running. Getting a demote action # in this state is unexpected. Exit with an error # and let the cluster manager recover. ocf_log err "Resource is currently not running" exit $OCF_ERR_GENERIC ;; *) # Failed resource. Let the cluster manager recover. ocf_log err "Unexpected error, cannot demote" exit $rc ;; esac # actually demote the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ocf_run frobnicate --unset-master-mode || exit $OCF_ERR_GENERIC # After the resource has been demoted, check whether the # demotion worked. If the resource demotion is asynchronous, the # agent may spin on the monitor function here -- if the resource # does not assume the Slave role within the defined timeout, the # cluster manager will consider the demote action failed. while true; do foobar_monitor if [ $? -eq $OCF_RUNNING_MASTER ]; then ocf_log debug "Resource still awaiting demotion" sleep 1 else ocf_log debug "Resource demoted" break fi done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- === +migrate_to+ action The +migrate_to+ action can serve one of two purposes: * Initiate a native _push_ type migration for the resource. In other words, instruct the resource to move _to_ a specific node from the node it is currently running on. The resource agent knows about its destination node via the +$OCF_RESKEY_CRM_meta_migrate_target+ environment variable. * Freeze the resource in a _freeze/thaw_ (also known as _suspend/resume_) type migration. In this mode, the resource does not need any information about its destination node at this point.
The example below illustrates a push type migration: [source,bash] -------------------------------------------------------------------------- foobar_migrate_to() { # exit immediately if configuration is not valid foobar_validate_all || exit $? # if resource is not running, bail out early if ! foobar_monitor; then ocf_log err "Resource is not running" exit $OCF_ERR_GENERIC fi # actually migrate the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ocf_run frobnicate --migrate \ --dest=$OCF_RESKEY_CRM_meta_migrate_target \ || exit $OCF_ERR_GENERIC ... # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- In contrast, a freeze/thaw type migration may implement its freeze operation like this: [source,bash] -------------------------------------------------------------------------- foobar_migrate_to() { # exit immediately if configuration is not valid foobar_validate_all || exit $? # if resource is not running, bail out early if ! foobar_monitor; then ocf_log err "Resource is not running" exit $OCF_ERR_GENERIC fi # actually freeze the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ocf_run frobnicate --freeze || exit $OCF_ERR_GENERIC ... # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- === +migrate_from+ action The +migrate_from+ action can serve one of two purposes: * Complete a native _push_ type migration for the resource. In other words, check whether the migration has succeeded properly, and the resource is running on the local node. The resource agent knows about the migration source via the +$OCF_RESKEY_CRM_meta_migrate_source+ environment variable.
* Thaw the resource in a _freeze/thaw_ (also known as _suspend/resume_) type migration. In this mode, the resource usually does not need any information about its source node at this point. The example below illustrates a push type migration: [source,bash] -------------------------------------------------------------------------- foobar_migrate_from() { # exit immediately if configuration is not valid foobar_validate_all || exit $? # After the resource has been migrated, check whether it resumed # correctly. If the resource starts asynchronously, the agent may # spin on the monitor function here -- if the resource does not # run within the defined timeout, the cluster manager will # consider the migrate_from action failed while ! foobar_monitor; do ocf_log debug "Resource has not yet migrated, waiting" sleep 1 done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- In contrast, a freeze/thaw type migration may implement its thaw operation like this: [source,bash] -------------------------------------------------------------------------- foobar_migrate_from() { # exit immediately if configuration is not valid foobar_validate_all || exit $? # actually thaw the resource here (make sure to immediately # exit with an $OCF_ERR_ error code if anything goes seriously # wrong) ocf_run frobnicate --thaw || exit $OCF_ERR_GENERIC # After the resource has been migrated, check whether it resumed # correctly. If the resource starts asynchronously, the agent may # spin on the monitor function here -- if the resource does not # run within the defined timeout, the cluster manager will # consider the migrate_from action failed while !
foobar_monitor; do ocf_log debug "Resource has not yet migrated, waiting" sleep 1 done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- === +notify+ action With notifications, instances of clones (and of master/slave resources, which are an extended kind of clones) can inform each other about their state. When notifications are enabled, any action on any instance of a clone carries a +pre+ and +post+ notification. Then, the cluster manager invokes the +notify+ operation on _all_ clone instances. For +notify+ operations, additional environment variables are passed into the resource agent during execution: * +$OCF_RESKEY_CRM_meta_notify_type+ -- the notification type (+pre+ or +post+) * +$OCF_RESKEY_CRM_meta_notify_operation+ -- the operation (action) that the notification is about (+start+, +stop+, +promote+, +demote+ etc.) * +$OCF_RESKEY_CRM_meta_notify_start_uname+ -- node name of the node where the resource is being started (+start+ notifications only) * +$OCF_RESKEY_CRM_meta_notify_stop_uname+ -- node name of the node where the resource is being stopped (+stop+ notifications only) * +$OCF_RESKEY_CRM_meta_notify_master_uname+ -- node name of the node where the resource currently _is in_ the Master role * +$OCF_RESKEY_CRM_meta_notify_promote_uname+ -- node name of the node where the resource currently _is being promoted to_ the Master role (+promote+ notifications only) * +$OCF_RESKEY_CRM_meta_notify_demote_uname+ -- node name of the node where the resource currently _is being demoted to_ the Slave role (+demote+ notifications only) Notifications come in particularly handy for master/slave resources using a "pull" scheme, where the master is a publisher and the slave a subscriber. 
Since the master is obviously only available as such when a promotion has occurred, the slaves can use a "pre-promote" notification to configure themselves to subscribe to the right publisher. Likewise, the subscribers may want to unsubscribe from the publisher after it has relinquished its master status, and a "post-demote" notification can be used for that purpose. Consider the example below to illustrate the concept. [source,bash] -------------------------------------------------------------------------- foobar_notify() { local type_op type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" ocf_log debug "Received $type_op notification." case "$type_op" in 'pre-promote') ocf_run frobnicate --slave-mode \ --master=$OCF_RESKEY_CRM_meta_notify_promote_uname \ || exit $OCF_ERR_GENERIC ;; 'post-demote') ocf_run frobnicate --unset-slave-mode || exit $OCF_ERR_GENERIC ;; esac return $OCF_SUCCESS } -------------------------------------------------------------------------- NOTE: A master/slave resource agent may support a _multi-master_ configuration, where there is possibly more than one master at any given time. If that is the case, then the +$OCF_RESKEY_CRM_meta_notify_*_uname+ variables may each contain a space-separated list of hostnames, rather than a single host name as shown in the example. Under those circumstances the resource agent would have to properly iterate over this list. == Script variables This section outlines variables typically available to resource agents, primarily for convenience purposes. For additional variables available while the agent is being executed, refer to <<_environment_variables>> and <<_return_codes>>. === +$OCF_ROOT+ The root of the OCF resource agent hierarchy. This should never be changed by a resource agent. This is usually +/usr/lib/ocf+. === +$OCF_FUNCTIONS_DIR+ The directory where the resource agents shell function library, -+.ocf-shellfuncs+, resides.
This is usually defined in terms of ++ocf-shellfuncs+, resides. This is usually defined in terms of +$OCF_ROOT+ and should never be changed by a resource agent. This variable may, however, be overridden from the command line while testing a new or modified resource agent. === +$OCF_RESOURCE_INSTANCE+ The resource instance name. For primitive (non-clone, non-stateful) resources, this is simply the resource name. For clones and stateful resources, this is the primitive name, followed by a colon and the clone instance number (such as +p_foobar:0+). === +$__OCF_ACTION+ The currently invoked action. This is exactly the first command-line argument that the cluster manager specifies when it invokes the resource agent. === +$__SCRIPT_NAME+ The name of the resource agent. This is exactly the base name of the resource agent script, with leading directory names removed. === +$HA_RSCTMP+ A temporary directory for use by resource agents. The system startup sequence (on any LSB compliant Linux distribution) guarantees that this directory is emptied on system startup, so this directory will not contain any stale data after a node reboot. == Convenience functions === Logging: +ocf_log+ Resource agents should use the +ocf_log+ function for logging purposes. This convenient logging wrapper is invoked as follows: [source,bash] -------------------------------------------------------------------------- ocf_log <severity> "Log message" -------------------------------------------------------------------------- It supports the following severity levels: * +debug+ -- for debugging messages. Most logging configurations suppress this level by default. * +info+ -- for informational messages about the agent's behavior or status. * +warn+ -- for warnings. This is for any messages which reflect unexpected behavior that does _not_ constitute an unrecoverable error. * +err+ -- for errors.
As a general rule, this logging level should only be used immediately prior to an +exit+ with the appropriate error code. * +crit+ -- for critical errors. As with +err+, this logging level should not be used unless the resource agent also exits with an error code. Very rarely used. === Testing for binaries: +have_binary+ and +check_binary+ A resource agent may need to test for the availability of a specific executable. The +have_binary+ convenience function comes in handy here: [source,bash] -------------------------------------------------------------------------- if ! have_binary frobnicate; then ocf_log warn "Missing frobnicate binary, frobnication disabled!" fi -------------------------------------------------------------------------- If a missing binary is a fatal problem for the resource, then the +check_binary+ function should be used: [source,bash] -------------------------------------------------------------------------- check_binary frobnicate -------------------------------------------------------------------------- Using +check_binary+ is a shorthand method for testing for the existence (and executability) of the specified binary, and exiting with +$OCF_ERR_INSTALLED+ if it cannot be found or executed. NOTE: Both +have_binary+ and +check_binary+ honor +$PATH+ when the binary to test for is not specified as a full path. It is usually wise to _not_ test for a full path, as binary installation paths may vary by distribution or user policy.
=== Executing commands and capturing their output: +ocf_run+ Whenever a resource agent needs to execute a command and capture its output, it should use the +ocf_run+ convenience function, invoked as in this example: [source,bash] -------------------------------------------------------------------------- ocf_run "frobnicate --spam=eggs" || exit $OCF_ERR_GENERIC -------------------------------------------------------------------------- With the command specified above, the resource agent will invoke +frobnicate --spam=eggs+ and capture its output and exit code. If the exit code is nonzero (indicating an error), +ocf_run+ logs the command output with the +err+ logging severity, and the resource agent subsequently exits. If the exit code is zero (indicating success), any command output will be logged with the +info+ logging severity. If the resource agent wishes to ignore the output of a successful command execution, it can use the +-q+ flag with +ocf_run+. In the example below, +ocf_run+ will only log output if the command exit code is nonzero. [source,bash] -------------------------------------------------------------------------- ocf_run -q "frobnicate --spam=eggs" || exit $OCF_ERR_GENERIC -------------------------------------------------------------------------- Finally, if the resource agent wants to log the output of a command with a nonzero exit code with a severity _other_ than error, it may do so by adding the +-info+ or +-warn+ option to +ocf_run+: [source,bash] -------------------------------------------------------------------------- ocf_run -warn "frobnicate --spam=eggs" -------------------------------------------------------------------------- === Locks: +ocf_take_lock+ and +ocf_release_lock_on_exit+ Occasionally, there may be different resources of the same type in a cluster configuration that should not execute actions in parallel. 
When a resource agent needs to guard against parallel execution on the same machine, it can use the +ocf_take_lock+ and +ocf_release_lock_on_exit+ convenience functions: [source,bash] -------------------------------------------------------------------------- LOCKFILE=${HA_RSCTMP}/foobar ocf_release_lock_on_exit $LOCKFILE foobar_start() { ... ocf_take_lock $LOCKFILE ... } -------------------------------------------------------------------------- +ocf_take_lock+ attempts to acquire the designated +$LOCKFILE+. When it is unavailable, it sleeps a random amount of time between 0 and 1 seconds, and retries. +ocf_release_lock_on_exit+ releases the lock file when the agent exits (for any reason). === Testing for numerical values: +ocf_is_decimal+ Specifically for parameter validation, it can be helpful to test whether a given value is numeric. The +ocf_is_decimal+ function exists for that purpose: -------------------------------------------------------------------------- foobar_validate_all() { if ! ocf_is_decimal $OCF_RESKEY_eggs; then ocf_log err "eggs is not numeric!" exit $OCF_ERR_CONFIGURED fi ... } -------------------------------------------------------------------------- === Testing for boolean values: +ocf_is_true+ When a resource agent defines a boolean parameter, the value for this parameter may be specified by the user as +0+/+1+, +true+/+false+, or +on+/+off+. Since it is tedious to test for all these values from within the resource agent, the agent should instead use the +ocf_is_true+ convenience function: [source,bash] -------------------------------------------------------------------------- if ocf_is_true $OCF_RESKEY_superfrobnicate; then ocf_run "frobnicate --super" fi -------------------------------------------------------------------------- NOTE: If +ocf_is_true+ is used against an empty or non-existent variable, it always returns an exit code of +1+, which is equivalent to +false+.
+=== Version comparison: +ocf_version_cmp+ + +A resource agent may want to check the version of software +installed. +ocf_version_cmp+ takes care of all the necessary +details. + +The return codes are + +* +0+ -- the first version is smaller (earlier) than the second +* +1+ -- the two versions are equal +* +2+ -- the first version is greater (later) than the second +* +3+ -- one of arguments is not recognized as a version string + +The versions are allowed to contain digits, dots, and dashes. + +[source,bash] +-------------------------------------------------------------------------- +local v=`gooey --version` +ocf_version_cmp "$v" 12.0.8-1 +case $? in + 0) ocf_log err "we do not support version $v, it is too old" + exit $OCF_ERR_INSTALLED + ;; + [12]) ;; # we can work with versions >= 12.0.8-1 + 3) ocf_log err "gooey produced version <$v>, too funky for me" + exit $OCF_ERR_INSTALLED + ;; +esac +-------------------------------------------------------------------------- + === Pseudo resources: +ha_pseudo_resource+ "Pseudo resources" are those where the resource agent in fact does not actually start or stop something akin to a runnable process, but merely executes a single action and then needs some form of tracing whether that action has been executed or not. The +portblock+ resource agent is an example of this. Resource agents for pseudo resources can use a convenience function, +ha_pseudo_resource+, which makes use of _tracking files_ to keep tabs on the status of a resource. If +foobar+ was designed to manage a pseudo resource, then its +start+ action could look like this: [source,bash] -------------------------------------------------------------------------- foobar_start() { # exit immediately if configuration is not valid foobar_validate_all || exit $? 
# if resource is already running, bail out early if foobar_monitor; then ocf_log info "Resource is already running" return $OCF_SUCCESS fi # start the pseudo resource ha_pseudo_resource ${OCF_RESOURCE_INSTANCE} start # After the resource has been started, check whether it started up # correctly. If the resource starts asynchronously, the agent may # spin on the monitor function here -- if the resource does not # start up within the defined timeout, the cluster manager will # consider the start action failed while ! foobar_monitor; do ocf_log debug "Resource has not started yet, waiting" sleep 1 done # only return $OCF_SUCCESS if _everything_ succeeded as expected return $OCF_SUCCESS } -------------------------------------------------------------------------- == Conventions This section contains a collection of conventions that have emerged in the resource agent repositories over the years. Following these conventions is by no means mandatory for resource agent authors, but it is a good idea based on the http://en.wikipedia.org/wiki/Principle_of_least_surprise[Principle of Least Surprise] -- resource agents following these conventions will be easier to understand, review, and use than those that do not. === Well-known parameter names Several parameter names are supported by a number of resource agents. For new resource agents, following these examples is generally a good idea: * +binary+ -- the name of a binary that principally manages the resource, such as a server daemon * +config+ -- the full path to a configuration file * +pid+ -- the full path to a file holding a process ID (PID) * +log+ -- the full path to a log file * +socket+ -- the full path to a UNIX socket that the resource manages * +ip+ -- an IP address that a daemon binds to * +port+ -- a TCP or UDP port that a daemon binds to Needless to say, resource agents should only implement any of these parameters if they are sensible to use in the agent's context. 
=== Parameter defaults Defaults for resource agent parameters should be set by initializing variables with the suffix +_default+: [source,bash] -------------------------------------------------------------------------- # Defaults OCF_RESKEY_superfrobnicate_default=0 : ${OCF_RESKEY_superfrobnicate=${OCF_RESKEY_superfrobnicate_default}} -------------------------------------------------------------------------- NOTE: The resource agent should make sure that it sets a default for any parameter not marked as +required+ in the metadata. === Honoring +PATH+ for binaries When a resource agent supports a parameter designed to hold the name of a binary (such as a daemon, or a client utility for querying status), then that parameter should honor the +PATH+ environment variable. Do not supply full paths. Thus, the following approach: [source,bash] -------------------------------------------------------------------------- # Good example -- do it this way OCF_RESKEY_frobnicate_default="frobnicate" : ${OCF_RESKEY_frobnicate="${OCF_RESKEY_frobnicate_default}"} -------------------------------------------------------------------------- is much preferred over specifying a full path, as shown here: [source,bash] -------------------------------------------------------------------------- # Bad example -- avoid if you can OCF_RESKEY_frobnicate_default="/usr/local/sbin/frobnicate" : ${OCF_RESKEY_frobnicate="${OCF_RESKEY_frobnicate_default}"} -------------------------------------------------------------------------- This rule holds for defaults, as well. == Special considerations === Licensing Whenever possible, resource agent contributors are _encouraged_ to use the GNU General Public License (GPL), version 2 and later, for any new resource agents. The shell functions library does not strictly mandate this, however, as it is licensed under the GNU Lesser General Public License (LGPL), version 2.1 and later (so it can be used by non-GPL agents). 
The resource agent _must_ explicitly state its own license in the agent source code. === Locale settings -When sourcing +.ocf-shellfuncs+ as explained in <<_initialization>>, +When sourcing +ocf-shellfuncs+ as explained in <<_initialization>>, any resource agent automatically sets +LANG+ and +LC_ALL+ to the +C+ locale. Resource agents can thus expect to always operate in the +C+ locale, and need not reset +LANG+ or any of the +LC_+ environment variables themselves. === Testing for running processes For testing whether a particular process (with a known process ID) is currently running, a frequently found method is to send it a +0+ signal and catch errors, similar to this example: [source,bash] -------------------------------------------------------------------------- if kill -s 0 `cat $daemon_pid_file`; then ocf_log debug "Process is currently running" else ocf_log warn "Process is dead, removing pid file" rm -f $daemon_pid_file fi -------------------------------------------------------------------------- IMPORTANT: An approach far superior to this example is to instead test the _functionality_ of the daemon by connecting to it with a client process, as shown in the example in <<_literal_monitor_literal_action>>. === Specifying a master preference Stateful (master/slave) resources must set their own _master preference_ -- they can thus provide hints to the cluster manager which is the best instance to promote to the +Master+ role. IMPORTANT: It is acceptable for multiple instances to have identical positive master preferences. In that case, the cluster resource manager will automatically select a resource agent to promote. However, if _all_ instances have the (default) master score of zero, the cluster manager will not promote any instance at all. Thus, it is crucial that at least one instance has a positive master score. For this purpose, +crm_master+ comes in handy.
This convenience wrapper around the +crm_attribute+ sets a node attribute named +master-<<_literal_ocf_resource_instance_literal,$OCF_RESOURCE_INSTANCE>>+ for the node it is being executed on, and fills this attribute with the specified value. The cluster manager is then expected to translate this into a promotion score for the corresponding instance, and base its promotion preference on that score. Stateful resource agents typically execute +crm_master+ during the <<_literal_monitor_literal_action,+monitor+>> and/or <<_literal_notify_literal_action,+notify+>> action. The following example assumes that the +foobar+ resource agent can test the application's status by executing a binary that returns certain exit codes based on whether * the resource is either in the master role, or is a slave that is fully caught up with the master (at any rate, it has current data), or * the resource is in the slave role, but through some form of asynchronous replication has "fallen behind" the master, or * the resource has gracefully stopped, or * the resource has unexpectedly failed. [source,bash] -------------------------------------------------------------------------- foobar_monitor() { local rc # exit immediately if configuration is not valid foobar_validate_all || exit $? ocf_run frobnicate --test # This example assumes the following exit code convention # for frobnicate: # 0: running, and fully caught up with master # 1: gracefully stopped # 2: running, but lagging behind master # any other: error case "$?" in 0) rc=$OCF_SUCCESS ocf_log debug "Resource is running" # Set a high master preference. The current master # will always get this, plus 1. Any current slaves # will get a high preference so that if the master # fails, they are next in line to take over. 
crm_master -l reboot -v 100 ;; 1) rc=$OCF_NOT_RUNNING ocf_log debug "Resource is not running" # Remove the master preference for this node crm_master -l reboot -D ;; 2) rc=$OCF_SUCCESS ocf_log debug "Resource is lagging behind master" # Set a low master preference: if the master fails # right now, and there is another slave that does # not lag behind the master, its higher master # preference will win and that slave will become # the new master crm_master -l reboot -v 5 ;; *) ocf_log err "Resource has failed" exit $OCF_ERR_GENERIC esac return $rc } -------------------------------------------------------------------------- == Testing resource agents This section discusses automated testing for resource agents. Testing is a vital aspect of development; it is crucial both for creating new resource agents, and for modifying existing ones. === Testing with +ocf-tester+ The resource agents repository (and hence, any installed resource agents package) contains a utility named +ocf-tester+. This shell script allows you to conveniently and easily test the functionality of your resource agent. +ocf-tester+ is commonly invoked, as +root+, like this: -------------------------------------------------------------------------- ocf-tester -n <name> [-o <param>=<value> ... ] <resource agent> -------------------------------------------------------------------------- * +<name>+ is an arbitrary resource name. * You may set any number of +<param>=<value>+ with the +-o+ option, corresponding to any resource parameters you wish to set for testing. * +<resource agent>+ is the full path to your resource agent. When invoked, +ocf-tester+ executes all mandatory actions and enforces action behavior as explained in <<_resource_agent_actions>>. It also tests for optional actions. Optional actions must behave as expected when advertised, but do not cause +ocf-tester+ to flag an error if not implemented. IMPORTANT: +ocf-tester+ does not initiate "dry runs" of actions, nor does it create resource dummies of any kind.
Instead, it exercises the actual resource agent as-is, whether that may include opening and closing databases, mounting file systems, starting or stopping virtual machines, etc. Use with care. For example, you could run +ocf-tester+ on the +foobar+ resource agent as follows: -------------------------------------------------------------------------- # ocf-tester -n foobartest \ -o superfrobnicate=true \ -o datadir=/tmp \ /home/johndoe/ra-dev/foobar Beginning tests for /home/johndoe/ra-dev/foobar... * Your agent does not support the notify action (optional) * Your agent does not support the reload action (optional) /home/johndoe/ra-dev/foobar passed all tests -------------------------------------------------------------------------- === Testing with +ocft+ +ocft+ is a testing tool for resource agents. The main difference to +ocf-tester+ is that +ocft+ can automate creating complex testing environments. That includes package installation and arbitrary shell scripting. ==== +ocft+ components +ocft+ consists of the following components: * A test case generator (+/usr/sbin/ocft+) -- generates shell scripts from test case configuration files * Configuration files (+/usr/share/resource-agents/ocft/configs/+) -- a configuration file contains environment setup and test cases for one resource agent * The testing scripts are stored in +/var/lib/resource-agents/ocft/cases/+, but normally there is no need to inspect them ==== Customizing the testing environment +ocft+ modifies the runtime environment of the resource agent either by changing environment variables (through the interface defined by OCF) or by running ad-hoc shell scripts which can for instance change permissions of a file or unmount a file system. ==== How to test You need to know the software (resource) you want to test. Draw a sketch of all interesting scenarios, with all expected and unexpected conditions and how the resource agent should react to them. 
Then you need to encode these conditions and the expected outcomes as +ocft+ test cases. Running ocft is then simple: --------------------------------------- # ocft make # ocft test --------------------------------------- The first subcommand generates the scripts for your test cases whereas the second runs them and checks the outcome. ==== +ocft+ configuration file syntax There are four top level options each of which can contain one or more sub-options. ===== +CONFIG+ (top level option) This option is global and influences every test case. ** +AgentRoot+ (sub-option) --------------------------------------- AgentRoot /usr/lib/ocf/resource.d/xxx --------------------------------------- Normally, we assume that the resource agent lives under the +heartbeat+ provider. Use `AgentRoot` to test agent which is distributed by another vendor. ** +InstallPackage+ (sub-option) --------------------------------------- InstallPackage package [package2 [...]] --------------------------------------- Install packages necessary for testing. The installation is skipped if the packages have already been installed. ** 'HangTimeout' (sub-option) --------------------------------------- HangTimeout secs --------------------------------------- The maximum time allowed for a single RA action. If this timer expires, the action is considered as failed. ===== +SETUP-AGENT+ (top level option) --------------------------------------- SETUP-AGENT bash commands --------------------------------------- If the RA needs to be initialized before testing, you can put bash code here for that purpose. The initialization is done only once. If you need to reinitialize then delete the +/tmp/.[AGENT_NAME]_set+ stamp file. ===== +CASE+ (top level option) --------------------------------------- CASE "description" --------------------------------------- This is the main building block of the test suite. Each test case is to be described in one +CASE+ top level option. 
One case consists of several suboptions typically followed by the +RunAgent+ suboption. ** +Var+ (sub-option) --------------------------------------- Var VARIABLE=value --------------------------------------- Sets an environment variable for the resource agent. These variables are usually of the form OCF_RESKEY_xxx. Note that there must be no blanks on either side of the "=". ** +Unvar+ (sub-option) --------------------------------------- Unvar VARIABLE [VARIABLE2 [...]] --------------------------------------- Removes the given environment variables. ** +Include+ (sub-option) --------------------------------------- Include macro_name --------------------------------------- Includes the statements defined in 'macro_name'. See below for a description of +CASE-BLOCK+. ** +Bash+ (sub-option) --------------------------------------- Bash bash_codes --------------------------------------- This option sets up the OS environment: you can insert arbitrary bash code here to customize the system. Take care not to cause unrecoverable damage to the system. ** +BashAtExit+ (sub-option) --------------------------------------- BashAtExit bash_codes --------------------------------------- This option restores the OS environment so that the next test case can run correctly. Of course, you can also use the 'Bash' option to restore it. However, if an error occurs while the test case is running, the script quits immediately instead of running your recovery code. To cover that case, use 'BashAtExit', which restores the system environment before the script exits. ** +RunAgent+ (sub-option) --------------------------------------- RunAgent cmd [ret_value] --------------------------------------- This option runs the resource agent. "cmd" is the action passed to the resource agent, such as "start", "status" or "stop". The second parameter is optional: if it is given, the value actually returned by the resource agent is compared with the expected value. If they differ, a bug has been found.
It is also possible to execute a suboption on a remote host instead of locally. The protocol used is ssh and the command is run in the background. Just add the +@<ipaddr>+ suffix to the suboption name. For instance: --------------------------------------- Bash@192.168.1.100 date --------------------------------------- would run the date program on the remote host. Remote commands are run in the background. NB: It is not clear how ssh can be automated, as we don't know the environment in advance. Perhaps use "well-known" host names such as "node2"? Also, if the command runs in the background, it's not clear how the exit code is checked. Finally, does Var@node make sense? Or is the current environment somehow copied over? We probably need an example here. Need examples in general. ===== +CASE-BLOCK+ (top level option) --------------------------------------- CASE-BLOCK macro_name --------------------------------------- The +CASE-BLOCK+ option defines a macro which can be +Include+d in any +CASE+. All +CASE+ suboptions are valid in +CASE-BLOCK+. == Installing and packaging resource agents This section discusses what to do with your resource agent once it is done and tested -- where to install it, and how to include it in either your own application package or in the Linux-HA resource agents repository. === Installing resource agents If you choose to include your resource agent in your own project, make sure it installs into the correct location. Resource agents should install into the +/usr/lib/ocf/resource.d/<provider>+ directory, where +<provider>+ is the name of your project or any other name you wish to identify the resource agent with. For example, if your +foobar+ resource agent is being packaged as part of a project named +fortytwo+, then the correct full path to your resource agent would be +/usr/lib/ocf/resource.d/fortytwo/foobar+. Make sure your resource agent installs with +0755+ (+-rwxr-xr-x+) permission bits.
When installed this way, OCF-compliant cluster resource managers will be able to properly identify, parse, and execute your resource agent. The Pacemaker cluster manager, for example, would map the above-mentioned installation path to the +ocf:fortytwo:foobar+ resource type identifier. === Packaging resource agents When you package resource agents as part of your own project, you should apply the considerations outlined in this section. NOTE: If you instead prefer to submit your resource agent to the Linux-HA resource agents repository, see <<_submitting_resource_agents>> for information on doing so. ==== RPM packaging It is recommended to put your OCF resource agent(s) in an RPM sub-package, with the name +<toppackage>-resource-agents+. Ensure that the package owns its provider directory, and depends on the upstream +resource-agents+ package which lays out the directory hierarchy and provides convenience shell functions. An example RPM spec snippet is given below: -------------------------------------------------------------------------- %package resource-agents Summary: OCF resource agent for Foobar Group: System Environment/Base Requires: %{name} = %{version}-%{release}, resource-agents %description resource-agents This package contains the OCF-compliant resource agents for Foobar. %files resource-agents %defattr(755,root,root,-) %dir %{_prefix}/lib/ocf/resource.d/fortytwo %{_prefix}/lib/ocf/resource.d/fortytwo/foobar -------------------------------------------------------------------------- NOTE: If an RPM spec file contains a +%package+ declaration, then RPM considers this a sub-package which inherits top-level fields such as +Name+, +Version+, +License+, etc. Sub-packages have the top-level package name automatically prepended to their own name. Thus the snippet above would create a sub-package named +foobar-resource-agents+ (presuming the package +Name+ is +foobar+).
==== Debian packaging For Debian packages, like for <<_rpm_packaging,RPMs>>, it is recommended to create a separate package holding your resource agents, which then should depend on the +cluster-agents+ package. NOTE: This section assumes that you are packaging with +debhelper+. An example +debian/control+ snippet is given below: -------------------------------------------------------------------------- Package: foobar-cluster-agents Priority: extra Architecture: all Depends: cluster-agents Description: OCF-compliant resource agents for Foobar -------------------------------------------------------------------------- You will also create a separate +.install+ file. Sticking with the example of installing the +foobar+ resource agent as a sub-package of +fortytwo+, the +debian/fortytwo-cluster-agents.install+ file could consist of the following content: -------------------------------------------------------------------------- usr/lib/ocf/resource.d/fortytwo/foobar -------------------------------------------------------------------------- === Submitting resource agents If you choose not to bundle your resource agent with your own package, but instead wish to submit it to the upstream resource agent repository hosted on https://github.com/ClusterLabs/resource-agents[the ClusterLabs repository on GitHub], please follow the steps outlined in this section. Create a working copy (a Git _clone_) of the upstream repository with the following command: -------------------------------------------------------------------------- git clone git://github.com/ClusterLabs/resource-agents -------------------------------------------------------------------------- Then, copy your resource agent into the +heartbeat+ subdirectory: -------------------------------------------------------------------------- cd resource-agents/heartbeat cp /path/to/your/local/copy/of/foobar . chmod 0755 foobar cd .. 
-------------------------------------------------------------------------- Next, modify the +Makefile.am+ file in +resource-agents/heartbeat+ and add your new resource agent to the +ocf_SCRIPTS+ list. This will make sure the agent is properly installed. Lastly, open +Makefile.am+ in +resource-agents/doc/man+ and add +ocf_heartbeat_<name>.7+ to the +man_MANS+ variable. This will automatically generate a resource agent manual page from its metadata, and then install that man page into the correct location. Now, add your new resource agent, and the two modifications to the Makefiles, to your changeset: -------------------------------------------------------------------------- git add heartbeat/foobar git add heartbeat/Makefile.am git add doc/man/Makefile.am git commit -------------------------------------------------------------------------- In your commit message, be sure to include a meaningful description, for example: -------------------------------------------------------------------------- High: foobar: new resource agent This new resource agent adds functionality to manage a foobar service. It supports being configured as a primitive or as a master/slave set, and also optionally supports superfrobnication. -------------------------------------------------------------------------- Now the patch set is good for review on the mailing list: -------------------------------------------------------------------------- git send-email --to=linux-ha-dev@lists.linux-ha.org -------------------------------------------------------------------------- +git send-email+ will now roll all local commits not in the upstream repository into a nicely formatted email, and submit that to the mailing list. Please consult +man git-send-email+ for details on configuring and using +git send-email+. Once your new resource agent has been accepted for merging, one of the upstream developers will push your patch into the upstream repository.
At that point, you can update your checkout from upstream, and remove your own patch set. -------------------------------------------------------------------------- git reset --hard origin/master git pull -------------------------------------------------------------------------- === Maintaining resource agents If you maintain a specific resource agent, or you are making repeated contributions to the codebase, it's usually a good idea to maintain your own _fork_ of the +ClusterLabs/resource-agents+ repository on GitHub. To do so, * https://github.com/signup[Create a GitHub account] if you do not have one already. * http://help.github.com/fork-a-repo/[Fork] the https://github.com/ClusterLabs/resource-agents[+resource-agents+ repository]. * Clone your personal fork into a local working copy. As you work on resource agents, *please* commit early, and commit often. You can always fold commits later with +git rebase -i+. Once you have made a number of changes that you would like others to review, push them to your GitHub fork and send a post to the +linux-ha-dev+ mailing list pointing people to it. After the review is done, fix up your tree with any requested changes, and then issue a pull request. There are two ways of doing so: * You can use the +git request-pull+ utility to get a pre-populated email skeleton summarizing your changesets. Add any information you see fit, and send it to the list. It is a good idea to prefix your email subject with +[GIT PULL]+ so upstream maintainers can pick the message out easily. * You can also issue a pull request directly on GitHub. GitHub automatically notifies upstream maintainers about new pull requests by email. Please refer to http://help.github.com/send-pull-requests/[github:help] for details on initiating pull requests. 
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am index a7dae80d9..c4747fa8f 100644 --- a/doc/man/Makefile.am +++ b/doc/man/Makefile.am @@ -1,162 +1,163 @@ # # doc: Linux-HA resource agents # # Copyright (C) 2009 Florian Haas # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = $(doc_DATA) $(REFENTRY_STYLESHEET) \ mkappendix.sh ralist.sh CLEANFILES = $(man_MANS) $(xmlfiles) metadata-*.xml STYLESHEET_PREFIX ?= http://docbook.sourceforge.net/release/xsl/current MANPAGES_STYLESHEET ?= $(STYLESHEET_PREFIX)/manpages/docbook.xsl HTML_STYLESHEET ?= $(STYLESHEET_PREFIX)/xhtml/docbook.xsl FO_STYLESHEET ?= $(STYLESHEET_PREFIX)/fo/docbook.xsl REFENTRY_STYLESHEET ?= ra2refentry.xsl XSLTPROC_OPTIONS ?= --xinclude XSLTPROC_MANPAGES_OPTIONS ?= $(XSLTPROC_OPTIONS) XSLTPROC_HTML_OPTIONS ?= $(XSLTPROC_OPTIONS) XSLTPROC_FO_OPTIONS ?= $(XSLTPROC_OPTIONS) radir = $(top_srcdir)/heartbeat # OCF_ROOT=. is necessary due to a sanity check in ocf-shellfuncs # (which tests whether $OCF_ROOT points to a directory metadata-%.xml: $(radir)/% OCF_ROOT=. OCF_FUNCTIONS_DIR=$(radir) $< meta-data > $@ metadata-IPv6addr.xml: ../../heartbeat/IPv6addr OCF_ROOT=. OCF_FUNCTIONS_DIR=$(radir) $< meta-data > $@ # Please note: we can't name the man pages # ocf:heartbeat:. Believe me, I've tried. 
It looks like it # works, but then it doesn't. While make can deal correctly with # colons in target names (when properly escaped), it royally messes up # when it deals with _dependencies_ that contain colons. See Bug # 12126 on savannah.gnu.org. But, maybe it gets fixed soon, it was # first reported in 1995 and added to Savannah in 2005... if BUILD_DOC man_MANS = ocf_heartbeat_AoEtarget.7 \ ocf_heartbeat_AudibleAlarm.7 \ ocf_heartbeat_ClusterMon.7 \ ocf_heartbeat_CTDB.7 \ ocf_heartbeat_Delay.7 \ ocf_heartbeat_Dummy.7 \ ocf_heartbeat_EvmsSCC.7 \ ocf_heartbeat_Evmsd.7 \ ocf_heartbeat_Filesystem.7 \ ocf_heartbeat_ICP.7 \ ocf_heartbeat_IPaddr.7 \ ocf_heartbeat_IPaddr2.7 \ ocf_heartbeat_IPsrcaddr.7 \ ocf_heartbeat_LVM.7 \ ocf_heartbeat_LinuxSCSI.7 \ ocf_heartbeat_MailTo.7 \ ocf_heartbeat_ManageRAID.7 \ ocf_heartbeat_ManageVE.7 \ ocf_heartbeat_Pure-FTPd.7 \ ocf_heartbeat_Raid1.7 \ ocf_heartbeat_Route.7 \ ocf_heartbeat_SAPDatabase.7 \ ocf_heartbeat_SAPInstance.7 \ ocf_heartbeat_SendArp.7 \ ocf_heartbeat_ServeRAID.7 \ ocf_heartbeat_SphinxSearchDaemon.7 \ ocf_heartbeat_Squid.7 \ ocf_heartbeat_Stateful.7 \ ocf_heartbeat_SysInfo.7 \ ocf_heartbeat_VIPArip.7 \ ocf_heartbeat_VirtualDomain.7 \ ocf_heartbeat_WAS.7 \ ocf_heartbeat_WAS6.7 \ ocf_heartbeat_WinPopup.7 \ ocf_heartbeat_Xen.7 \ ocf_heartbeat_Xinetd.7 \ ocf_heartbeat_anything.7 \ ocf_heartbeat_apache.7 \ ocf_heartbeat_asterisk.7 \ ocf_heartbeat_conntrackd.7 \ ocf_heartbeat_db2.7 \ + ocf_heartbeat_dhcpd.7 \ ocf_heartbeat_drbd.7 \ ocf_heartbeat_eDir88.7 \ ocf_heartbeat_ethmonitor.7 \ ocf_heartbeat_exportfs.7 \ ocf_heartbeat_fio.7 \ ocf_heartbeat_iSCSILogicalUnit.7 \ ocf_heartbeat_iSCSITarget.7 \ ocf_heartbeat_ids.7 \ ocf_heartbeat_iscsi.7 \ ocf_heartbeat_jboss.7 \ ocf_heartbeat_lxc.7 \ ocf_heartbeat_mysql.7 \ ocf_heartbeat_mysql-proxy.7 \ ocf_heartbeat_named.7 \ ocf_heartbeat_nfsserver.7 \ ocf_heartbeat_nginx.7 \ ocf_heartbeat_oracle.7 \ ocf_heartbeat_oralsnr.7 \ ocf_heartbeat_pgsql.7 \ ocf_heartbeat_pingd.7 \
ocf_heartbeat_portblock.7 \ ocf_heartbeat_postfix.7 \ ocf_heartbeat_pound.7 \ ocf_heartbeat_proftpd.7 \ ocf_heartbeat_rsyncd.7 \ ocf_heartbeat_rsyslog.7 \ ocf_heartbeat_scsi2reservation.7 \ ocf_heartbeat_sfex.7 \ ocf_heartbeat_slapd.7 \ ocf_heartbeat_symlink.7 \ ocf_heartbeat_syslog-ng.7 \ ocf_heartbeat_tomcat.7 \ ocf_heartbeat_varnish.7 \ ocf_heartbeat_vmware.7 if USE_IPV6ADDR man_MANS += ocf_heartbeat_IPv6addr.7 endif xmlfiles = $(man_MANS:.7=.xml) %.1 %.5 %.7 %.8: %.xml $(XSLTPROC) \ $(XSLTPROC_MANPAGES_OPTIONS) \ $(MANPAGES_STYLESHEET) $< ocf_heartbeat_%.xml: metadata-%.xml $(srcdir)/$(REFENTRY_STYLESHEET) $(XSLTPROC) --novalid \ --stringparam package $(PACKAGE_NAME) \ --stringparam version $(VERSION) \ --output $@ \ $(srcdir)/$(REFENTRY_STYLESHEET) $< ocf_resource_agents.xml: $(xmlfiles) mkappendix.sh ./mkappendix.sh $(xmlfiles) > $@ %.html: %.xml $(XSLTPROC) \ $(XSLTPROC_HTML_OPTIONS) \ --output $@ \ $(HTML_STYLESHEET) $< xml: ocf_resource_agents.xml endif diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem index 9c9a66483..e44e1aa98 100755 --- a/heartbeat/Filesystem +++ b/heartbeat/Filesystem @@ -1,1122 +1,1126 @@ #!/bin/sh # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # Filesystem # Description: Manages a Filesystem on a shared storage medium. # Original Author: Eric Z. Ayers (eric.ayers@compgen.com) # Original Release: 25 Oct 2000 # # usage: ./Filesystem {start|stop|status|monitor|validate-all|meta-data} # # OCF parameters are as below: # OCF_RESKEY_device # OCF_RESKEY_directory # OCF_RESKEY_fstype # OCF_RESKEY_options # OCF_RESKEY_statusfile_prefix # OCF_RESKEY_run_fsck # OCF_RESKEY_fast_stop # OCF_RESKEY_force_clones # #OCF_RESKEY_device : name of block device for the filesystem. e.g. /dev/sda1, /dev/md0 # Or a -U or -L option for mount, or an NFS mount specification #OCF_RESKEY_directory : the mount point for the filesystem #OCF_RESKEY_fstype : optional name of the filesystem type. e.g. 
ext2 #OCF_RESKEY_options : options to be given to the mount command via -o #OCF_RESKEY_statusfile_prefix : the prefix used for a status file for monitoring #OCF_RESKEY_run_fsck : fsck execution mode: auto(default)/force/no #OCF_RESKEY_fast_stop : fast stop: yes(default)/no #OCF_RESKEY_force_clones : allow running the resource as clone. e.g. local xfs mounts # for each brick in a glusterfs setup # # # This assumes you want to manage a filesystem on a shared (SCSI) bus, # on a replicated device (such as DRBD), or a network filesystem (such # as NFS or Samba). # # Do not put this filesystem in /etc/fstab. This script manages all of # that for you. # # NOTE: If 2 or more nodes mount the same file system read-write, and # that file system is not designed for that specific purpose # (such as GFS or OCFS2), and is not a network file system like # NFS or Samba, then the filesystem is going to become # corrupted. # # As a result, you should use this together with the stonith # option and redundant, independent communications paths. # # If you don't do this, don't blame us when you scramble your # disk. ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults DFLT_STATUSDIR=".Filesystem_status/" # Variables used by multiple methods HOSTOS=`uname` # The status file is going to an extra directory, by default # prefix=${OCF_RESKEY_statusfile_prefix} : ${prefix:=$DFLT_STATUSDIR} suffix="${OCF_RESOURCE_INSTANCE}" [ "$OCF_RESKEY_CRM_meta_clone" ] && suffix="${suffix}_$OCF_RESKEY_CRM_meta_clone" suffix="${suffix}_`uname -n`" STATUSFILE=${OCF_RESKEY_directory}/$prefix$suffix ####################################################################### usage() { cat <<-EOT usage: $0 {start|stop|status|monitor|validate-all|meta-data} EOT } meta_data() { cat < 1.1 Resource script for Filesystem. It manages a Filesystem on a shared storage medium. 
The standard monitor operation of depth 0 (also known as probe) checks if the filesystem is mounted. If you want deeper tests, set OCF_CHECK_LEVEL to one of the following values: 10: read first 16 blocks of the device (raw read) This doesn't exercise the filesystem at all, but the device on which the filesystem lives. This is noop for non-block devices such as NFS, SMBFS, or bind mounts. 20: test if a status file can be written and read The status file must be writable by root. This is not always the case with an NFS mount, as NFS exports usually have the "root_squash" option set. In such a setup, you must either use read-only monitoring (depth=10), export with "no_root_squash" on your NFS server, or grant world write permissions on the directory where the status file is to be placed. Manages filesystem mounts The name of block device for the filesystem, or -U, -L options for mount, or NFS mount specification. block device The mount point for the filesystem. mount point The type of filesystem to be mounted. filesystem type Any extra options to be given as -o options to mount. For bind mounts, add "bind" here and set fstype to "none". We will do the right thing for options such as "bind,ro". options The prefix to be used for a status file for resource monitoring with depth 20. If you don't specify this parameter, all status files will be created in a separate directory. status file prefix Specify how to decide whether to run fsck or not. "auto" : decide to run fsck depending on the fstype(default) "force" : always run fsck regardless of the fstype "no" : do not run fsck ever. run_fsck Normally, we expect no users of the filesystem and the stop operation to finish quickly. If you cannot control the filesystem users easily and want to prevent the stop action from failing, then set this parameter to "no" and add an appropriate timeout for the stop operation. fast stop The usage of a clone setup for local filesystems is forbidden by default. 
For special setups like glusterfs, cloning a mount of a local device with a filesystem like ext4 or xfs, independently on several nodes is a valid use-case. Only set this to "true" if you know what you are doing! allow running as a clone, regardless of filesystem type END } # # Make sure the kernel does the right thing with the FS buffers # This function should be called after unmounting and before mounting # It may not be necessary in 2.4 and later kernels, but it shouldn't hurt # anything either... # # It's really a bug that you have to do this at all... # flushbufs() { if have_binary $BLOCKDEV ; then if [ "$blockdevice" = "yes" ] ; then $BLOCKDEV --flushbufs $1 return $? fi fi return 0 } # Take advantage of /etc/mtab if present, use portable mount command # otherwise. Normalize format to "dev mountpoint fstype". is_bind_mount() { echo "$options" | grep -w bind >/dev/null 2>&1 } list_mounts() { local inpf="" if [ -e "/proc/mounts" ] && ! is_bind_mount; then inpf=/proc/mounts elif [ -f "/etc/mtab" -a -r "/etc/mtab" ]; then inpf=/etc/mtab fi if [ "$inpf" ]; then cut -d' ' -f1,2,3 < $inpf else $MOUNT | cut -d' ' -f1,3,5 fi } determine_blockdevice() { if [ $blockdevice = "yes" ]; then return fi # Get the current real device name, if possible. # (specified devname could be -L or -U...) case "$FSTYPE" in - nfs|smbfs|cifs|glusterfs|ceph|tmpfs|none) ;; + nfs4|nfs|smbfs|cifs|glusterfs|ceph|tmpfs|none) ;; *) DEVICE=`list_mounts | grep " $MOUNTPOINT " | cut -d' ' -f1` if [ -b "$DEVICE" ]; then blockdevice=yes fi ;; esac } # Lists all filesystems potentially mounted under a given path, # excluding the path itself. list_submounts() { list_mounts | grep " $1/" | cut -d' ' -f2 | sort -r } ocfs2_del_cache() { if [ -e "$_OCFS2_uuid_cache" ]; then rm -f $_OCFS2_uuid_cache fi } ocfs2_cleanup() { # We'll never see the post-stop notification. We're gone now, # have unmounted, and thus should remove the membership. 
# # (Do so regardless of whether we were unmounted already, # because the admin might have manually unmounted but not # cleared up the membership directory. Bad admin, no cookie.) # if [ ! -d "$OCFS2_FS_ROOT" ]; then ocf_log info "$OCFS2_FS_ROOT: Filesystem membership already gone." else ocf_log info "$OCFS2_FS_ROOT: Removing membership directory." rm -rf $OCFS2_FS_ROOT/ fi ocfs2_del_cache } ocfs2_fetch_uuid() { mounted.ocfs2 -d $DEVICE|tail -1|awk '{print $3}'|tr -d -- -|tr '[a-z]' '[A-Z]' } ocfs2_set_uuid() { _OCFS2_uuid_cache="$HA_RSCTMP/Filesystem.ocfs2_uuid.$(echo $DEVICE|tr / .)" if [ "$OP" != "start" -a -e "$_OCFS2_uuid_cache" ]; then # Trust the cache. OCFS2_UUID=$(cat $_OCFS2_uuid_cache 2>/dev/null) return 0 fi OCFS2_UUID=$(ocfs2_fetch_uuid) if [ -n "$OCFS2_UUID" -a "$OCFS2_UUID" != "UUID" ]; then # UUID valid: echo $OCFS2_UUID > $_OCFS2_uuid_cache return 0 fi # Ok, no UUID still, but that's alright for stop, because it # very likely means we never got started - if [ "$OP" = "stop" ]; then ocf_log warn "$DEVICE: No UUID; assuming never started!" OCFS2_UUID="UUID_NOT_SET" return 0 fi # Everything else - wrong: ocf_log err "$DEVICE: Could not determine ocfs2 UUID for device." exit $OCF_ERR_GENERIC } ocfs2_init() { # Check & initialize the OCFS2 specific variables. # This check detects whether the special/legacy hooks to # integrate OCFS2 with user-space clustering on SLES10 need to # be activated. # Newer kernels >= 2.6.28, with OCFS2+openAIS+Pacemaker, do # not need this: OCFS2_SLES10="" if [ "X$HA_cluster_type" = "Xcman" ]; then return elif [ "X$HA_cluster_type" != "Xopenais" ]; then if grep -q "SUSE Linux Enterprise Server 10" /etc/SuSE-release >/dev/null 2>&1 ; then OCFS2_SLES10="yes" ocf_log info "$DEVICE: Enabling SLES10 compatibility mode for OCFS2." else ocf_log err "$DEVICE: ocfs2 is not compatible with your environment." 
exit $OCF_ERR_CONFIGURED fi else return fi if [ $OP != "stop" ]; then if [ -z "$OCF_RESKEY_CRM_meta_clone" ]; then ocf_log err "ocfs2 must be run as a clone." exit $OCF_ERR_GENERIC fi fi if [ $blockdevice = "no" ]; then ocf_log err "$DEVICE: ocfs2 needs a block device instead." exit $OCF_ERR_GENERIC fi for f in "$OCF_RESKEY_ocfs2_configfs" /sys/kernel/config/cluster /configfs/cluster ; do if [ -n "$f" -a -d "$f" ]; then OCFS2_CONFIGFS="$f" break fi done if [ ! -d "$OCFS2_CONFIGFS" ]; then ocf_log err "ocfs2 needs configfs mounted." exit $OCF_ERR_GENERIC fi ocfs2_set_uuid if [ -n "$OCF_RESKEY_ocfs2_cluster" ]; then OCFS2_CLUSTER=$(echo $OCF_RESKEY_ocfs2_cluster) else OCFS2_CLUSTER=$(find "$OCFS2_CONFIGFS" -maxdepth 1 -mindepth 1 -type d -printf %f 2>/dev/null) set -- $OCFS2_CLUSTER local n; n="$#" if [ $n -gt 1 ]; then ocf_log err "$OCFS2_CLUSTER: several clusters found." exit $OCF_ERR_GENERIC fi if [ $n -eq 0 ]; then ocf_log err "$OCFS2_CONFIGFS: no clusters found." exit $OCF_ERR_GENERIC fi fi OCFS2_CLUSTER_ROOT="$OCFS2_CONFIGFS/$OCFS2_CLUSTER" if [ ! -d "$OCFS2_CLUSTER_ROOT" ]; then ocf_log err "$OCFS2_CLUSTER: Cluster doesn't exist. Maybe o2cb hasn't been run?" exit $OCF_ERR_GENERIC fi OCFS2_FS_ROOT=$OCFS2_CLUSTER_ROOT/heartbeat/$OCFS2_UUID } # kernels < 2.6.26 can't handle bind remounts bind_kernel_check() { echo "$options" | grep -w ro >/dev/null 2>&1 || return uname -r | awk -F. ' $1==2 && $2==6 { sub("[^0-9].*","",$3); if ($3<26) exit(1); }' [ $? 
-ne 0 ] && ocf_log warn "kernel `uname -r` cannot handle read only bind mounts" } bind_mount() { if is_bind_mount && [ "$options" != "-o bind" ] then bind_kernel_check bind_opts=`echo $options | sed 's/bind/remount/'` $MOUNT $bind_opts $MOUNTPOINT else true # make sure to return OK fi } is_option() { echo $OCF_RESKEY_options | grep -w "$1" >/dev/null 2>&1 } is_fsck_needed() { case $OCF_RESKEY_run_fsck in force) true;; no) false;; ""|auto) case $FSTYPE in - ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs) + ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs) false;; *) true;; esac;; *) ocf_log warn "Invalid parameter value for fsck: '$OCF_RESKEY_run_fsck'; setting to 'auto'" OCF_RESKEY_run_fsck="auto" is_fsck_needed;; esac } # # START: Start up the filesystem # Filesystem_start() { if [ -n "$OCFS2_SLES10" ]; then # "start" now has the notification data available; that # we're being started means we didn't get the # pre-notification, because we weren't running, so # process the information now first. ocf_log info "$OCFS2_UUID: Faking pre-notification on start." OCF_RESKEY_CRM_meta_notify_type="pre" OCF_RESKEY_CRM_meta_notify_operation="start" Filesystem_notify fi # See if the device is already mounted. if Filesystem_status >/dev/null 2>&1 ; then ocf_log info "Filesystem $MOUNTPOINT is already mounted." return $OCF_SUCCESS fi if [ "X${HOSTOS}" != "XOpenBSD" ];then if [ -z "$FSTYPE" -o "$FSTYPE" = none ]; then : No FSTYPE specified, rely on the system has the right file-system support already else local support="$FSTYPE" # support fuse-filesystems (e.g. GlusterFS) case $FSTYPE in glusterfs) support="fuse";; esac grep -w "$support"'$' /proc/filesystems >/dev/null || $MODPROBE $support >/dev/null grep -w "$support"'$' /proc/filesystems >/dev/null if [ $? 
-ne 0 ] ; then ocf_log err "Couldn't find filesystem $FSTYPE in /proc/filesystems" return $OCF_ERR_INSTALLED fi fi fi # Check the filesystem & auto repair. # NOTE: Some filesystem types don't need this step... Please modify # accordingly if [ $blockdevice = "yes" ]; then if [ "$DEVICE" != "/dev/null" -a ! -b "$DEVICE" ] ; then ocf_log err "Couldn't find device [$DEVICE]. Expected /dev/??? to exist" exit $OCF_ERR_INSTALLED fi if is_fsck_needed; then ocf_log info "Starting filesystem check on $DEVICE" if [ -z "$FSTYPE" ]; then $FSCK -p $DEVICE else $FSCK -t $FSTYPE -p $DEVICE fi # NOTE: if any errors at all are detected, it returns non-zero # if the error is >= 4 then there is a big problem if [ $? -ge 4 ]; then ocf_log err "Couldn't sucessfully fsck filesystem for $DEVICE" return $OCF_ERR_GENERIC fi fi fi if [ ! -d "$MOUNTPOINT" ] ; then ocf_log err "Couldn't find directory [$MOUNTPOINT] to use as a mount point" exit $OCF_ERR_INSTALLED fi flushbufs $DEVICE # Mount the filesystem. case "$FSTYPE" in none) $MOUNT $options $DEVICE $MOUNTPOINT && bind_mount ;; "") $MOUNT $options $DEVICE $MOUNTPOINT ;; *) $MOUNT -t $FSTYPE $options $DEVICE $MOUNTPOINT ;; esac if [ $? -ne 0 ]; then ocf_log err "Couldn't mount filesystem $DEVICE on $MOUNTPOINT" if [ -n "$OCFS2_SLES10" ]; then ocfs2_cleanup fi return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # end of Filesystem_start Filesystem_notify() { # Process notifications; this is the essential glue level for # giving user-space membership events to a cluster-aware # filesystem. Right now, only OCFS2 is supported. # # When we get a pre-start notification, we set up all the nodes # which will be active in our membership for the filesystem. # (For the resource to be started, this happens at the time of # the actual 'start' operation.) # # At a post-start, actually there's nothing to do for us really, # but no harm done in re-syncing either. # # pre-stop is meaningless; we can't remove any node yet, it # first needs to unmount. 
# # post-stop: the node is removed from the membership of the # other nodes. # # Note that this expects that the base cluster is already # active; ie o2cb has been started and populated # $OCFS2_CLUSTER_ROOT/node/ already. This can be achieved by # simply having o2cb run on all nodes by the CRM too. This # probably ought to be mentioned somewhere in the to be written # documentation. ;-) # if [ -z "$OCFS2_SLES10" ]; then # One of the cases which shouldn't occur; it should have # been caught much earlier. Still, you know ... ocf_log err "$DEVICE: Please only enable notifications for SLES10 OCFS2 mounts." # Yes, in theory this is a configuration error, but # simply discarding them allows users to switch from the # SLES10 stack to the new one w/o downtime. # Ignoring the notifications is harmless, afterall, and # they can simply disable them in their own time. return $OCF_SUCCESS fi local n_type; n_type="$OCF_RESKEY_CRM_meta_notify_type" local n_op; n_op="$OCF_RESKEY_CRM_meta_notify_operation" local n_active; n_active="$OCF_RESKEY_CRM_meta_notify_active_uname" local n_stop; n_stop="$OCF_RESKEY_CRM_meta_notify_stop_uname" local n_start; n_start="$OCF_RESKEY_CRM_meta_notify_start_uname" ocf_log info "$OCFS2_UUID: notify: $n_type for $n_op" ocf_log info "$OCFS2_UUID: notify active: $n_active" ocf_log info "$OCFS2_UUID: notify stop: $n_stop" ocf_log info "$OCFS2_UUID: notify start: $n_start" case "$n_type" in pre) case "$n_op" in stop) ocf_log info "$OCFS2_UUID: ignoring pre-notify for stop." return $OCF_SUCCESS ;; start) # These are about to become active; prepare to # communicate with them. # Duplicate removal - start can contain nodes # already on the active list, confusing the # script later on: for UNAME in $n_active; do n_start=`echo ${n_start} | sed s/$UNAME//` done # Merge pruned lists again: n_active="$n_active $n_start" ;; esac ;; post) case "$n_op" in stop) # remove unames from notify_stop_uname; these have been # stopped and can no longer be considered active. 
for UNAME in $n_stop; do n_active=`echo ${n_active} | sed s/$UNAME//` done ;; start) if [ "$n_op" = "start" ]; then ocf_log info "$OCFS2_UUID: ignoring post-notify for start." return $OCF_SUCCESS fi ;; esac ;; esac ocf_log info "$OCFS2_UUID: post-processed active: $n_active" local n_myself; n_myself=${HA_CURHOST:-$(uname -n | tr '[A-Z]' '[a-z]')} ocf_log info "$OCFS2_UUID: I am node $n_myself." case " $n_active " in *" $n_myself "*) ;; *) ocf_log err "$OCFS2_UUID: $n_myself (local) not on active list!" return $OCF_ERR_GENERIC ;; esac if [ -d "$OCFS2_FS_ROOT" ]; then entry_prefix=$OCFS2_FS_ROOT/ for entry in $OCFS2_FS_ROOT/* ; do n_fs="${entry##$entry_prefix}" # ocf_log info "$OCFS2_UUID: Found current node $n_fs" case " $n_active " in *" $n_fs "*) # Construct a list of nodes which are present # already in the membership. n_exists="$n_exists $n_fs" ocf_log info "$OCFS2_UUID: Keeping node: $n_fs" ;; *) # Node is in the membership currently, but not on our # active list. Must be removed. if [ "$n_op" = "start" ]; then ocf_log warn "$OCFS2_UUID: Removing nodes on start" fi ocf_log info "$OCFS2_UUID: Removing dead node: $n_fs" if ! rm -f $entry ; then ocf_log err "$OCFS2_UUID: Removal of $n_fs failed!" fi ;; esac done else ocf_log info "$OCFS2_UUID: heartbeat directory doesn't exist yet, creating." mkdir -p $OCFS2_FS_ROOT fi ocf_log info "$OCFS2_UUID: Existing node list: $n_exists" # (2) for entry in $n_active ; do # ocf_log info "$OCFS2_UUID: Expected active node: $entry" case " $n_exists " in *" $entry "*) ocf_log info "$OCFS2_UUID: Already active: $entry" ;; *) if [ "$n_op" = "stop" ]; then ocf_log warn "$OCFS2_UUID: Adding nodes on stop" fi ocf_log info "$OCFS2_UUID: Activating node: $entry" if ! 
ln -s $OCFS2_CLUSTER_ROOT/node/$entry $OCFS2_FS_ROOT/$entry ; then ocf_log err "$OCFS2_CLUSTER_ROOT/node/$entry: failed to link" fi ;; esac done } signal_processes() { local dir=$1 local sig=$2 # fuser returns a non-zero return code if none of the # specified files is accessed or in case of a fatal # error. if [ "X${HOSTOS}" = "XOpenBSD" ];then PIDS=`fstat | grep $dir | awk '{print $3}'` for PID in ${PIDS};do kill -s $sig ${PID} ocf_log info "Sent signal $sig to ${PID}" done else if $FUSER -$sig -m -k $dir ; then ocf_log info "Some processes on $dir were signalled" else ocf_log info "No processes on $dir were signalled" fi fi } try_umount() { local SUB=$1 $UMOUNT $umount_force $SUB list_mounts | grep -q " $SUB " >/dev/null 2>&1 || { ocf_log info "unmounted $SUB successfully" return $OCF_SUCCESS } return $OCF_ERR_GENERIC } fs_stop() { local SUB=$1 timeout=$2 sig cnt for sig in TERM KILL; do cnt=$((timeout/2)) # try half time with TERM while [ $cnt -gt 0 ]; do try_umount $SUB && return $OCF_SUCCESS ocf_log err "Couldn't unmount $SUB; trying cleanup with $sig" signal_processes $SUB $sig cnt=$((cnt-1)) sleep 1 done done return $OCF_ERR_GENERIC } # # STOP: Unmount the filesystem # Filesystem_stop() { # See if the device is currently mounted Filesystem_status >/dev/null 2>&1 if [ $? -eq $OCF_NOT_RUNNING ]; then # Already unmounted, wonderful. rc=$OCF_SUCCESS else # Wipe the status file, but continue with a warning if # removal fails -- the file system might be read only - if [ -f "$STATUSFILE" ]; then + if [ $OCF_CHECK_LEVEL -eq 20 ]; then rm -f ${STATUSFILE} if [ $? -ne 0 ]; then ocf_log warn "Failed to remove status file ${STATUSFILE}." fi fi # Determine the real blockdevice this is mounted on (if # possible) prior to unmounting. 
determine_blockdevice # For networked filesystems, there's merit in trying -f: case "$FSTYPE" in - nfs|cifs|smbfs) umount_force="-f" ;; + nfs4|nfs|cifs|smbfs) umount_force="-f" ;; esac # Umount all sub-filesystems mounted under $MOUNTPOINT/ too. local timeout for SUB in `list_submounts $MOUNTPOINT` $MOUNTPOINT; do ocf_log info "Trying to unmount $SUB" if ocf_is_true "$FAST_STOP"; then timeout=6 else timeout=${OCF_RESKEY_CRM_meta_timeout:="20000"} timeout=$((timeout/1000)) fi fs_stop $SUB $timeout rc=$? if [ $rc -ne $OCF_SUCCESS ]; then ocf_log err "Couldn't unmount $SUB, giving up!" fi done fi flushbufs $DEVICE # Yes I know the next blob is ugly, sorry. if [ $rc -eq $OCF_SUCCESS ]; then if [ "$FSTYPE" = "ocfs2" ]; then ocfs2_init if [ -n "$OCFS2_SLES10" ]; then ocfs2_cleanup fi fi fi return $rc } # end of Filesystem_stop # # STATUS: is the filesystem mounted or not? # Filesystem_status() { if list_mounts | grep -q " $MOUNTPOINT " >/dev/null 2>&1; then rc=$OCF_SUCCESS msg="$MOUNTPOINT is mounted (running)" else rc=$OCF_NOT_RUNNING msg="$MOUNTPOINT is unmounted (stopped)" fi # TODO: For ocfs2, or other cluster filesystems, should we be # checking connectivity to other nodes here, or the IO path to # the storage? # Special case "monitor" to check whether the UUID cached and # on-disk still match? case "$OP" in status) ocf_log info "$msg";; esac return $rc } # end of Filesystem_status # Note: the read/write tests below will stall in case the # underlying block device (or in the case of a NAS mount, the # NAS server) has gone away. In that case, if I/O does not # return to normal in time, the operation hits its timeout # and it is up to the CRM to initiate appropriate recovery # actions (such as fencing the node). 
# # MONITOR 10: read the device # Filesystem_monitor_10() { if [ "$blockdevice" = "no" ] ; then ocf_log warn "$DEVICE is not a block device, monitor 10 is noop" return $OCF_SUCCESS fi dd_opts="iflag=direct bs=4k count=1" err_output=`dd if=$DEVICE $dd_opts 2>&1 >/dev/null` if [ $? -ne 0 ]; then ocf_log err "Failed to read device $DEVICE" ocf_log err "dd said: $err_output" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # MONITOR 20: write and read a status file # Filesystem_monitor_20() { if [ "$blockdevice" = "no" ] ; then # O_DIRECT not supported on cifs/smbfs dd_opts="oflag=sync bs=4k conv=fsync,sync" else # Writing to the device in O_DIRECT mode is imperative # to bypass caches. dd_opts="oflag=direct,sync bs=4k conv=fsync,sync" fi status_dir=`dirname $STATUSFILE` [ -d "$status_dir" ] || mkdir -p "$status_dir" err_output=` echo "${OCF_RESOURCE_INSTANCE}" | dd of=${STATUSFILE} $dd_opts 2>&1` if [ $? -ne 0 ]; then ocf_log err "Failed to write status file ${STATUSFILE}" ocf_log err "dd said: $err_output" return $OCF_ERR_GENERIC fi test -f ${STATUSFILE} if [ $? -ne 0 ]; then ocf_log err "Cannot stat the status file ${STATUSFILE}" return $OCF_ERR_GENERIC fi cat ${STATUSFILE} > /dev/null if [ $? -ne 0 ]; then ocf_log err "Cannot read the status file ${STATUSFILE}" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } Filesystem_monitor() { Filesystem_status rc=$? if [ $rc -ne $OCF_SUCCESS ]; then return $rc fi if [ $rc -eq $OCF_SUCCESS -a $OCF_CHECK_LEVEL -gt 0 ]; then case "$OCF_CHECK_LEVEL" in 10) Filesystem_monitor_10; rc=$?;; 20) Filesystem_monitor_20; rc=$?;; *) ocf_log err "unsupported monitor level $OCF_CHECK_LEVEL" rc=$OCF_ERR_CONFIGURED ;; esac fi return $rc } # end of Filesystem_monitor # # VALIDATE_ALL: Are the instance parameters valid? # FIXME!! The only part that's useful is the return code. # This code always returns $OCF_SUCCESS (!) # Filesystem_validate_all() { if [ -n $MOUNTPOINT -a ! 
-d $MOUNTPOINT ]; then ocf_log warn "Mountpoint $MOUNTPOINT does not exist" fi # Check if the $FSTYPE is workable # NOTE: Without inserting the $FSTYPE module, this step may be imprecise # TODO: This is Linux specific crap. if [ ! -z "$FSTYPE" -a "$FSTYPE" != none ]; then cut -f2 /proc/filesystems |grep -q ^$FSTYPE$ if [ $? -ne 0 ]; then modpath=/lib/modules/`uname -r` moddep=$modpath/modules.dep # Do we have $FSTYPE in modules.dep? cut -d' ' -f1 $moddep |grep -q "^$modpath.*$FSTYPE\.k\?o:$" if [ $? -ne 0 ]; then ocf_log info "It seems we do not have $FSTYPE support" fi fi fi # If we are supposed to do monitoring with status files, then # we need a utility to write in O_DIRECT mode. if [ $OCF_CHECK_LEVEL -gt 0 ]; then check_binary dd # Note: really old coreutils version do not support # the "oflag" option for dd. We don't check for that # here. In case dd does not support oflag, monitor is # bound to fail, with dd spewing an error message to # the logs. On such systems, we must do without status # file monitoring. fi #TODO: How to check the $options ? return $OCF_SUCCESS } # # set the blockdevice variable to "no" or "yes" # set_blockdevice_var() { blockdevice=no # these are definitely not block devices case $FSTYPE in - nfs|smbfs|cifs|none|glusterfs) return;; + nfs4|nfs|smbfs|cifs|none|glusterfs|ceph) return;; esac + if `is_option "loop"`; then + return + fi + case $DEVICE in -*) # Oh... An option to mount instead... Typically -U or -L ;; /dev/null) # Special case for BSC blockdevice=yes ;; *) if [ ! -b "$DEVICE" -a ! -d "$DEVICE" -a "X$OP" != Xstart ] ; then ocf_log warn "Couldn't find device [$DEVICE]. Expected /dev/??? to exist" fi if [ ! -d "$DEVICE" ]; then blockdevice=yes fi ;; esac } # Check the arguments passed to this script if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi # Check the OCF_RESKEY_ environment variables... DEVICE=$OCF_RESKEY_device FSTYPE=$OCF_RESKEY_fstype if [ ! 
-z "$OCF_RESKEY_options" ]; then options="-o $OCF_RESKEY_options" fi FAST_STOP=${OCF_RESKEY_fast_stop:="yes"} OP=$1 # These operations do not require instance parameters case $OP in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; esac if [ x = x"$DEVICE" ]; then ocf_log err "Please set OCF_RESKEY_device to the device to be managed" exit $OCF_ERR_CONFIGURED fi set_blockdevice_var # Normalize instance parameters: # It is possible that OCF_RESKEY_directory has one or even multiple trailing "/". # But the output of `mount` and /proc/mounts do not. if [ -z "$OCF_RESKEY_directory" ]; then if [ X$OP = "Xstart" -o $blockdevice = "no" ]; then ocf_log err "Please specify the directory" exit $OCF_ERR_CONFIGURED fi else MOUNTPOINT=$(echo $OCF_RESKEY_directory | sed 's/\/*$//') : ${MOUNTPOINT:=/} # At this stage, $MOUNTPOINT does not contain trailing "/" unless it is "/" # TODO: / mounted via Filesystem sounds dangerous. On stop, we'll # kill the whole system. Is that a good idea? fi # Check to make sure the utilites are found if [ "X${HOSTOS}" != "XOpenBSD" ];then check_binary $MODPROBE check_binary $FUSER fi check_binary $FSCK check_binary $MOUNT check_binary $UMOUNT if [ "$OP" != "monitor" ]; then ocf_log info "Running $OP for $DEVICE on $MOUNTPOINT" fi # These operations do not require the clone checking + OCFS2 # initialization. case $OP in status) Filesystem_status exit $? ;; monitor) Filesystem_monitor exit $? ;; validate-all) Filesystem_validate_all exit $? ;; stop) Filesystem_stop exit $? 
;; esac CLUSTERSAFE=0 is_option "ro" && CLUSTERSAFE=2 case $FSTYPE in ocfs2) ocfs2_init CLUSTERSAFE=1 ;; -nfs|smbfs|cifs|none|gfs2|glusterfs|ceph) CLUSTERSAFE=1 # this is kind of safe too +nfs4|nfs|smbfs|cifs|none|gfs2|glusterfs|ceph) CLUSTERSAFE=1 # this is kind of safe too ;; # add here CLUSTERSAFE=0 for all filesystems which are not # cluster aware and which, even if when mounted read-only, # could still modify parts of it such as journal/metadata ext4|ext4dev|ext3|reiserfs|reiser4|xfs|jfs) if ocf_is_true "$OCF_RESKEY_force_clones"; then CLUSTERSAFE=2 else CLUSTERSAFE=0 # these are not allowed fi ;; esac if [ -n "$OCF_RESKEY_CRM_meta_clone" ]; then case $CLUSTERSAFE in 0) ocf_log err "DANGER! $FSTYPE on $DEVICE is NOT cluster-aware!" ocf_log err "DO NOT RUN IT AS A CLONE!" ocf_log err "Politely refusing to proceed to avoid data corruption." exit $OCF_ERR_CONFIGURED ;; 2) ocf_log warn "$FSTYPE on $DEVICE is NOT cluster-aware!" if ocf_is_true "$OCF_RESKEY_force_clones"; then ocf_log warn "But we'll let it run because we trust _YOU_ verified it's safe to do so." else ocf_log warn "But we'll let it run because it is mounted read-only." ocf_log warn "Please make sure that it's meta data is read-only too!" fi ;; esac fi case $OP in start) Filesystem_start ;; notify) Filesystem_notify ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2 index a35e05838..30af45a8a 100755 --- a/heartbeat/IPaddr2 +++ b/heartbeat/IPaddr2 @@ -1,909 +1,905 @@ #!/bin/sh # # $Id: IPaddr2.in,v 1.24 2006/08/09 13:01:54 lars Exp $ # # OCF Resource Agent compliant IPaddr2 script. # # Based on work by Tuomo Soini, ported to the OCF RA API by Lars # Marowsky-Brée. Implements Cluster Alias IP functionality too. # # Cluster Alias IP cleanup, fixes and testing by Michael Schwartzkopff # # # Copyright (c) 2003 Tuomo Soini # Copyright (c) 2004-2006 SUSE LINUX AG, Lars Marowsky-Brée # All Rights Reserved. 
# # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # # TODO: # - There ought to be an ocf_run_cmd function which does all logging, # timeout handling etc for us # - Make this the standard IP address agent on Linux; the other # platforms simply should ignore the additional parameters OR can use # the legacy heartbeat resource script... # - Check LVS <-> clusterip incompatibilities. # # OCF parameters are as below # OCF_RESKEY_ip # OCF_RESKEY_broadcast # OCF_RESKEY_nic # OCF_RESKEY_cidr_netmask # OCF_RESKEY_iflabel # OCF_RESKEY_mac # OCF_RESKEY_clusterip_hash # OCF_RESKEY_arp_interval # OCF_RESKEY_arp_count # OCF_RESKEY_arp_bg # OCF_RESKEY_arp_mac # # OCF_RESKEY_CRM_meta_clone # OCF_RESKEY_CRM_meta_clone_max ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_lvs_support_default=false OCF_RESKEY_clusterip_hash_default="sourceip-sourceport" OCF_RESKEY_unique_clone_address_default=false OCF_RESKEY_arp_interval_default=200 OCF_RESKEY_arp_count_default=5 OCF_RESKEY_arp_bg_default=true OCF_RESKEY_arp_mac_default="ffffffffffff" : ${OCF_RESKEY_lvs_support=${OCF_RESKEY_lvs_support_default}} : ${OCF_RESKEY_clusterip_hash=${OCF_RESKEY_clusterip_hash_default}} : ${OCF_RESKEY_unique_clone_address=${OCF_RESKEY_unique_clone_address_default}} : ${OCF_RESKEY_arp_interval=${OCF_RESKEY_arp_interval_default}} : ${OCF_RESKEY_arp_count=${OCF_RESKEY_arp_count_default}} : ${OCF_RESKEY_arp_bg=${OCF_RESKEY_arp_bg_default}} : ${OCF_RESKEY_arp_mac=${OCF_RESKEY_arp_mac_default}} ####################################################################### SENDARP=$HA_BIN/send_arp FINDIF=$HA_BIN/findif VLDIR=$HA_RSCTMP SENDARPPIDDIR=$HA_RSCTMP CIP_lockfile=$HA_RSCTMP/IPaddr2-CIP-${OCF_RESKEY_ip} ####################################################################### meta_data() { cat < 1.0 This Linux-specific resource manages IP alias IP addresses. It can add an IP alias, or remove one. In addition, it can implement Cluster Alias IP functionality if invoked as a clone resource. If used as a clone, you should explicitly set clone-node-max >= 2, and/or clone-max < number of nodes. In case of node failure, clone instances need to be re-allocated on surviving nodes. Which would not be possible, if there is already an instance on those nodes, and clone-node-max=1 (which is the default). Manages virtual IPv4 addresses (Linux specific version) The IPv4 address to be configured in dotted quad notation, for example "192.168.1.1". IPv4 address The base network interface on which the IP address will be brought online. If left empty, the script will try and determine this from the routing table. Do NOT specify an alias interface in the form eth0:1 or anything here; rather, specify the base interface only. 
If you want a label, see the iflabel parameter. Prerequisite: There must be at least one static IP address, which is not managed by the cluster, assigned to the network interface. If you can not assign any static IP address on the interface, modify this kernel parameter: sysctl -w net.ipv4.conf.all.promote_secondaries=1 # (or per device) Network interface The netmask for the interface in CIDR format (e.g., 24 and not 255.255.255.0) If unspecified, the script will also try to determine this from the routing table. CIDR netmask Broadcast address associated with the IP. If left empty, the script will determine this from the netmask. Broadcast address You can specify an additional label for your IP address here. This label is appended to your interface name. If a label is specified in nic name, this parameter has no effect. Interface label Enable support for LVS Direct Routing configurations. In case a IP address is stopped, only move it to the loopback device to allow the local node to continue to service requests, but no longer advertise it on the network. Enable support for LVS DR Set the interface MAC address explicitly. Currently only used in case of the Cluster IP Alias. Leave empty to chose automatically. Cluster IP MAC address Specify the hashing algorithm used for the Cluster IP functionality. Cluster IP hashing function If true, add the clone ID to the supplied value of ip to create a unique address to manage Create a unique address for cloned instances Specify the interval between unsolicited ARP packets in milliseconds. ARP packet interval in ms Number of unsolicited ARP packets to send. ARP packet count Whether or not to send the arp packets in the background. ARP from background MAC address to send the ARP packets to. You really shouldn't be touching this. ARP MAC Flush the routing table on stop. This is for applications which use the cluster IP address and which run on the same physical host that the IP address lives on. 
The Linux kernel may force that application to take a shortcut to the local loopback interface, instead of the interface the address is really bound to. Under those circumstances, an application may, somewhat unexpectedly, continue to use connections for some time even after the IP address is deconfigured. Set this parameter in order to immediately disable said shortcut when the IP address goes away. Flush kernel routing table on stop END exit $OCF_SUCCESS } ip_init() { local rc if [ X`uname -s` != "XLinux" ]; then ocf_log err "IPaddr2 only supported Linux." exit $OCF_ERR_INSTALLED fi if [ X"$OCF_RESKEY_ip" = "X" ]; then ocf_log err "IP address (the ip parameter) is mandatory" exit $OCF_ERR_CONFIGURED fi if case $__OCF_ACTION in start|stop) ocf_is_root;; *) true;; esac then : YAY! else ocf_log err "You must be root for $__OCF_ACTION operation." exit $OCF_ERR_PERM fi BASEIP="$OCF_RESKEY_ip" BRDCAST="$OCF_RESKEY_broadcast" NIC="$OCF_RESKEY_nic" # Note: We had a version out there for a while which used # netmask instead of cidr_netmask. Don't remove this aliasing code! if [ ! -z "$OCF_RESKEY_netmask" -a -z "$OCF_RESKEY_cidr_netmask" ] then OCF_RESKEY_cidr_netmask=$OCF_RESKEY_netmask export OCF_RESKEY_cidr_netmask fi NETMASK="$OCF_RESKEY_cidr_netmask" IFLABEL="$OCF_RESKEY_iflabel" IF_MAC="$OCF_RESKEY_mac" IP_INC_GLOBAL=${OCF_RESKEY_CRM_meta_clone_max:-1} IP_INC_NO=`expr ${OCF_RESKEY_CRM_meta_clone:-0} + 1` if ocf_is_true ${OCF_RESKEY_lvs_support} && [ $IP_INC_GLOBAL -gt 1 ]; then ocf_log err "LVS and load sharing do not go together well" exit $OCF_ERR_CONFIGURED fi if ocf_is_decimal "$IP_INC_GLOBAL" && [ $IP_INC_GLOBAL -gt 0 ]; then : else ocf_log err "Invalid OCF_RESKEY_incarnations_max_global [$IP_INC_GLOBAL], should be positive integer" exit $OCF_ERR_CONFIGURED fi # $FINDIF takes its parameters from the environment # NICINFO=`$FINDIF -C` rc=$? 
if [ $rc -eq 0 ] then NICINFO=`echo $NICINFO | sed -e 's/netmask\ //;s/broadcast\ //'` NIC=`echo "$NICINFO" | cut -d" " -f1` NETMASK=`echo "$NICINFO" | cut -d" " -f2` BRDCAST=`echo "$NICINFO" | cut -d" " -f3` else # findif couldn't find the interface if ocf_is_probe; then ocf_log info "[$FINDIF -C] failed" exit $OCF_NOT_RUNNING elif [ "$__OCF_ACTION" = stop ]; then ocf_log warn "[$FINDIF -C] failed" exit $OCF_SUCCESS else ocf_log err "[$FINDIF -C] failed" exit $rc fi fi SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$OCF_RESKEY_ip" case $NIC in *:*) IFLABEL=$NIC NIC=`echo $NIC | sed 's/:.*//'` ;; *) if [ -n "$IFLABEL" ]; then IFLABEL=${NIC}:${IFLABEL} fi ;; esac if [ "$IP_INC_GLOBAL" -gt 1 ] && ! ocf_is_true "$OCF_RESKEY_unique_clone_address"; then IP_CIP="yes" IP_CIP_HASH="${OCF_RESKEY_clusterip_hash}" if [ -z "$IF_MAC" ]; then # Choose a MAC # 1. Concatenate some input together # 2. This doesn't need to be a cryptographically # secure hash. # 3. Drop everything after the first 6 octets (12 chars) # 4. Delimit the octets with ':' # 5. Make sure the first octet is odd, # so the result is a multicast MAC IF_MAC=`echo $OCF_RESKEY_ip $NETMASK $BRDCAST | \ md5sum | \ sed -e 's#\(............\).*#\1#' \ -e 's#..#&:#g; s#:$##' \ -e 's#^\(.\)[02468aAcCeE]#\11#'` fi IP_CIP_FILE="/proc/net/ipt_CLUSTERIP/$OCF_RESKEY_ip" fi } # # Find out which interfaces serve the given IP address and netmask. # The arguments are an IP address and a netmask. # Its output are interface names devided by spaces (e.g., "eth0 eth1"). 
#
find_interface() {
	# $1 = IPv4 address, $2 = CIDR netmask length.
	# Prints the names of all interfaces carrying $1/$2, as reported by
	# "$IP2UTIL -o -f inet addr show"; always returns 0.
	local addr="$1"
	local mask="$2"
	local matches

	# FreeS/WAN ipsecN virtual interfaces are filtered out of the result.
	matches="`$IP2UTIL -o -f inet addr show \
		| grep "\ $addr/$mask" \
		| cut -d ' ' -f2 \
		| grep -v '^ipsec[0-9][0-9]*$'`"

	echo "$matches"
	return 0
}

#
#	Delete an interface
#
# $1 = address, $2 = interface, $3 = netmask.
# Note: ipaddr, iface, netmask and CMD are deliberately NOT local; they are
# plain shell globals in this script and are left that way.
#
delete_interface () {
	ipaddr="$1"
	iface="$2"
	netmask="$3"
	CMD="$IP2UTIL -f inet addr delete $ipaddr/$netmask dev $iface"

	if ! ocf_run $CMD; then
		return $OCF_ERR_GENERIC
	fi

	# Optional: drop the kernel route cache once the address is gone.
	ocf_is_true $OCF_RESKEY_flush_routes && ocf_run $IP2UTIL route flush cache

	return $OCF_SUCCESS
}

#
#	Add an interface
#
# $1 = address, $2 = netmask, $3 = broadcast, $4 = interface,
# $5 = label (optional). Adds the address, then brings the link up.
#
add_interface () {
	local addr="$1"
	local mask="$2"
	local bcast="$3"
	local dev="$4"
	local lbl="$5"
	local run_cmd note

	run_cmd="$IP2UTIL -f inet addr add $addr/$mask brd $bcast dev $dev"
	note="Adding IPv4 address $addr/$mask with broadcast address $bcast to device $dev"
	if [ -n "$lbl" ]; then
		run_cmd="$run_cmd label $lbl"
		note="${note} (with label $lbl)"
	fi

	ocf_log info "$note"
	if ! ocf_run $run_cmd; then
		return $OCF_ERR_GENERIC
	fi

	note="Bringing device $dev up"
	run_cmd="$IP2UTIL link set $dev up"
	ocf_log info "$note"
	if ! ocf_run $run_cmd; then
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}

#
#	Delete a route
#
# $1 = route prefix, $2 = interface. The exit status is that of the
# "$IP2UTIL route delete" command itself (it is the last command run).
#
delete_route () {
	prefix="$1"
	iface="$2"
	CMD="$IP2UTIL route delete $prefix dev $iface"

	ocf_log info "$CMD"
	$CMD
}

#	On Linux systems the (hidden) loopback interface may
#	conflict with the requested IP address. If so, this
#	unoriginal code will remove the offending loopback address
#	and save it in VLDIR so it can be added back in later
#	when the IPaddr is released.
#
#	TODO: This is very ugly and should be controlled by an additional
#	instance parameter. Or even: multi-state, with the IP only being
#	"active" on the master!?
#
remove_conflicting_loopback() {
	ipaddr="$1"
	netmask="$2"
	broadcast="$3"
	ifname="$4"

	ocf_log info "Removing conflicting loopback $ifname."

	# Record what is being removed so restore_loopback can re-add it;
	# a failure to record is logged but does not stop the removal.
	if ! echo "$ipaddr $netmask $broadcast $ifname" > "$VLDIR/$ipaddr"; then
		ocf_log err "Could not save conflicting loopback $ifname." "it will not be restored."
	fi

	delete_interface "$ipaddr" "$ifname" "$netmask"
	# Forcibly remove the route (if it exists) to the loopback.
	delete_route "$ipaddr" "$ifname"
}

#
#	On Linux systems the (hidden) loopback interface may
#	need to be restored if it has been taken down previously
#	by remove_conflicting_loopback()
#
restore_loopback() {
	ipaddr="$1"

	# Nothing was saved (or the record is empty): nothing to restore.
	[ -s "$VLDIR/$ipaddr" ] || return 0

	ifinfo=`cat "$VLDIR/$ipaddr"`
	ocf_log info "Restoring loopback IP Address " "$ifinfo."
	# $ifinfo is intentionally unquoted: it expands to the saved
	# "address netmask broadcast interface" argument list.
	add_interface $ifinfo
	rm -f "$VLDIR/$ipaddr"
}

#
# Run send_arp to note peers about new mac address
#
run_send_arp() {
	local mac_arg=auto

	# For a cluster IP, announce the configured cluster MAC (colons
	# stripped) rather than letting send_arp pick one.
	if [ "x$IP_CIP" = "xyes" ]; then
		case "x$IF_MAC" in
		x)	MY_MAC=auto ;;
		*)	MY_MAC=`echo ${IF_MAC} | sed -e 's/://g'` ;;
		esac
		mac_arg=$MY_MAC
	fi

	ARGS="-i $OCF_RESKEY_arp_interval -r $OCF_RESKEY_arp_count -p $SENDARPPIDFILE $NIC $OCF_RESKEY_ip $mac_arg not_used not_used"
	ocf_log info "$SENDARP $ARGS"

	if ocf_is_true $OCF_RESKEY_arp_bg; then
		($SENDARP $ARGS || ocf_log err "Could not send gratuitous arps" &) >&2
	else
		$SENDARP $ARGS || ocf_log err "Could not send gratuitous arps"
	fi
}

#
# Run ipoibarping to note peers about new Infiniband address
#
run_send_ib_arp() {
	ARGS="-q -c $OCF_RESKEY_arp_count -U -I $NIC $OCF_RESKEY_ip"
	ocf_log info "ipoibarping $ARGS"

	if ocf_is_true $OCF_RESKEY_arp_bg; then
		(ipoibarping $ARGS || ocf_log err "Could not send gratuitous arps" &) >&2
	else
		ipoibarping $ARGS || ocf_log err "Could not send gratuitous arps"
	fi
}
# Do we already serve this IP address on the given $NIC?
# # returns: # ok = served (for CIP: + hash bucket) # partial = served and no hash bucket (CIP only) # partial2 = served and no CIP iptables rule # no = nothing # ip_served() { if [ -z "$NIC" ]; then # no nic found or specified echo "no" return 0 fi cur_nic="`find_interface $OCF_RESKEY_ip $NETMASK`" if [ -z "$cur_nic" ]; then echo "no" return 0 fi if [ -z "$IP_CIP" ]; then for i in $cur_nic; do - case $i in - lo*) - if ocf_is_true ${OCF_RESKEY_lvs_support}; then - echo "no" - return 0 - fi - ;; - $NIC) - # only mark as served when on the same interfaces as $NIC - echo "ok" - return 0 - ;; - esac + # only mark as served when on the same interfaces as $NIC + [ "$i" = "$NIC" ] || continue + echo "ok" + return 0 done + # There used to be logic here to pretend "not served", + # if ${OCF_RESKEY_lvs_support} was enabled, and the IP was + # found active on "lo*" only. With lvs_support on, you should + # have NIC != lo, so thats already filtered + # by the continue above. echo "no" return 0 fi # Special handling for the CIP: if [ ! -e $IP_CIP_FILE ]; then echo "partial2" return 0 fi if egrep -q "(^|,)${IP_INC_NO}(,|$)" $IP_CIP_FILE ; then echo "ok" return 0 else echo "partial" return 0 fi exit $OCF_ERR_GENERIC } ####################################################################### ip_usage() { cat <$IP_CIP_FILE fi if [ "$ip_status" = "no" ]; then if ocf_is_true ${OCF_RESKEY_lvs_support}; then for i in `find_interface $OCF_RESKEY_ip $NETMASK`; do case $i in lo*) remove_conflicting_loopback $OCF_RESKEY_ip 32 255.255.255.255 lo ;; esac done fi add_interface $OCF_RESKEY_ip $NETMASK $BRDCAST $NIC $IFLABEL if [ $? -ne 0 ]; then ocf_log err "$CMD failed." 
exit $OCF_ERR_GENERIC fi fi case $NIC in lo*) : no need to run send_arp on loopback ;; ib*) run_send_ib_arp ;; *) if [ -x $SENDARP ]; then run_send_arp fi ;; esac exit $OCF_SUCCESS } ip_stop() { local ip_del_if="yes" if [ -n "$IP_CIP" ]; then # Cluster IPs need special processing when the last bucket # is removed from the node... take a lock to make sure only one # process executes that code ocf_take_lock $CIP_lockfile ocf_release_lock_on_exit $CIP_lockfile fi if [ -f "$SENDARPPIDFILE" ] ; then kill `cat "$SENDARPPIDFILE"` if [ $? -ne 0 ]; then ocf_log warn "Could not kill previously running send_arp for $OCF_RESKEY_ip" else ocf_log info "killed previously running send_arp for $OCF_RESKEY_ip" rm -f "$SENDARPPIDFILE" fi fi local ip_status=`ip_served` ocf_log info "IP status = $ip_status, IP_CIP=$IP_CIP" if [ $ip_status = "no" ]; then : Requested interface not in use exit $OCF_SUCCESS fi if [ -n "$IP_CIP" ] && [ $ip_status != "partial2" ]; then if [ $ip_status = "partial" ]; then exit $OCF_SUCCESS fi echo "-$IP_INC_NO" >$IP_CIP_FILE if [ "x$(cat $IP_CIP_FILE)" = "x" ]; then ocf_log info $OCF_RESKEY_ip, $IP_CIP_HASH i=1 while [ $i -le $IP_INC_GLOBAL ]; do ocf_log info $i $IPTABLES -D INPUT -d $OCF_RESKEY_ip -i $NIC -j CLUSTERIP \ --new \ --clustermac $IF_MAC \ --total-nodes $IP_INC_GLOBAL \ --local-node $i \ --hashmode $IP_CIP_HASH i=`expr $i + 1` done else ip_del_if="no" fi fi if [ "$ip_del_if" = "yes" ]; then delete_interface $OCF_RESKEY_ip $NIC $NETMASK if [ $? -ne 0 ]; then exit $OCF_ERR_GENERIC fi if ocf_is_true ${OCF_RESKEY_lvs_support}; then restore_loopback "$OCF_RESKEY_ip" fi fi exit $OCF_SUCCESS } ip_monitor() { # TODO: Implement more elaborate monitoring like checking for # interface health maybe via a daemon like FailSafe etc... local ip_status=`ip_served` case $ip_status in ok) return $OCF_SUCCESS ;; partial|no|partial2) exit $OCF_NOT_RUNNING ;; *) # Errors on this interface? 
return $OCF_ERR_GENERIC
		;;
	esac
}

#
# ip_validate: check required binaries and instance parameters.
#
# Runs ip_init, then verifies that the binaries needed for the configured
# mode are present and that arp_interval, arp_count and (for cluster IPs)
# the hash mode and multicast MAC are acceptable.  Any violation is logged
# with ocf_log and the agent exits with $OCF_ERR_CONFIGURED.
#
ip_validate() {
	check_binary $IP2UTIL
	IP_CIP=

	ip_init

	# Infiniband interfaces announce themselves with ipoibarping
	case "$NIC" in
	ib*)
		check_binary ipoibarping
		;;
	esac

	if [ -n "$IP_CIP" ]; then
		check_binary $IPTABLES
		check_binary $MODPROBE
	fi

	# $BASEIP, $NETMASK, $NIC , $IP_INC_GLOBAL, and $BRDCAST have been checked within ip_init,
	# do not bother here.

	if ocf_is_true "$OCF_RESKEY_unique_clone_address" &&
		! ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then
		ocf_log err "unique_clone_address makes sense only with meta globally_unique set"
		exit $OCF_ERR_CONFIGURED
	fi

	if ocf_is_decimal "$OCF_RESKEY_arp_interval" && [ $OCF_RESKEY_arp_interval -gt 0 ]; then
		:
	else
		ocf_log err "Invalid OCF_RESKEY_arp_interval [$OCF_RESKEY_arp_interval]"
		exit $OCF_ERR_CONFIGURED
	fi

	if ocf_is_decimal "$OCF_RESKEY_arp_count" && [ $OCF_RESKEY_arp_count -gt 0 ]; then
		:
	else
		ocf_log err "Invalid OCF_RESKEY_arp_count [$OCF_RESKEY_arp_count]"
		exit $OCF_ERR_CONFIGURED
	fi

	if [ -n "$IP_CIP" ]; then

		local valid=1

		case $IP_CIP_HASH in
		sourceip|sourceip-sourceport|sourceip-sourceport-destport)
			;;
		*)
			ocf_log err "Invalid OCF_RESKEY_clusterip_hash [$IP_CIP_HASH]"
			exit $OCF_ERR_CONFIGURED
			;;
		esac

		if ocf_is_true ${OCF_RESKEY_lvs_support}; then
			# was "ecf_log": an unknown command, so the reason for the
			# exit below never reached the log
			ocf_log err "LVS and load sharing not advised to try"
			exit $OCF_ERR_CONFIGURED
		fi

		# The second character of the MAC must be an odd hex digit
		case $IF_MAC in
		[0-9a-zA-Z][13579bBdDfF][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z])
			;;
		*)
			valid=0
			;;
		esac

		if [ $valid -eq 0 ]; then
			ocf_log err "Invalid IF_MAC [$IF_MAC]"
			exit $OCF_ERR_CONFIGURED
		fi

	fi
}

if ocf_is_true "$OCF_RESKEY_unique_clone_address"; then
	prefix=`echo $OCF_RESKEY_ip | awk -F. '{print $1"."$2"."$3}'`
	suffix=`echo $OCF_RESKEY_ip | awk -F.
'{print $4}'` suffix=`expr ${OCF_RESKEY_CRM_meta_clone:-0} + $suffix` OCF_RESKEY_ip="$prefix.$suffix" fi case $__OCF_ACTION in meta-data) meta_data ;; usage|help) ip_usage exit $OCF_SUCCESS ;; esac ip_validate case $__OCF_ACTION in start) ip_start ;; stop) ip_stop ;; status) ip_status=`ip_served` if [ $ip_status = "ok" ]; then echo "running" exit $OCF_SUCCESS else echo "stopped" exit $OCF_NOT_RUNNING fi ;; monitor) ip_monitor ;; validate-all) ;; *) ip_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/IPv6addr.c b/heartbeat/IPv6addr.c index 52ea2882d..739f793f1 100644 --- a/heartbeat/IPv6addr.c +++ b/heartbeat/IPv6addr.c @@ -1,988 +1,988 @@ /* * This program manages IPv6 address with OCF Resource Agent standard. * * Author: Huang Zhen * Copyright (c) 2004 International Business Machines * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* * It can add an IPv6 address, or remove one. * * Usage: IPv6addr {start|stop|status|monitor|meta-data} * * The "start" arg adds an IPv6 address. * The "stop" arg removes one. * The "status" arg shows whether the IPv6 address exists * The "monitor" arg shows whether the IPv6 address can be pinged (ICMPv6 ECHO) * The "meta_data" arg shows the meta data(XML) */ /* * ipv6-address: * * currently the following forms are legal: * address * address/prefix * * E.g. 
* 3ffe:ffff:0:f101::3 * 3ffe:ffff:0:f101::3/64 * * They should be passed via environment variables: * OCF_RESKEY_ipv6addr=3ffe:ffff:0:f101::3 * OCF_RESKEY_cidr_netmask=64 * OCF_RESKEY_nic=eth0 * */ /* * start: * 1.IPv6addr will choose a proper interface for the new address. * 2.Then assign the new address to the interface. * 3.Wait until the new address is available (reply ICMPv6 ECHO packet) * 4.Send out the unsolicited advertisements. * * return 0(OCF_SUCCESS) for success * return 1(OCF_ERR_GENERIC) for failure * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) * * * stop: * remove the address from the interface. * * return 0(OCF_SUCCESS) for success * return 1(OCF_ERR_GENERIC) for failure * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) * * status: * return the status of the address. only check whether it exists. * * return 0(OCF_SUCCESS) for existing * return 7(OCF_NOT_RUNNING) for not existing * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) * * * monitor: * ping the address by ICMPv6 ECHO request. * * return 0(OCF_SUCCESS) for a correct response. * return 7(OCF_NOT_RUNNING) for no response. * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) */ #include #include #include #include #include #include #include #include #include /* for inet_pton */ #include /* for if_nametoindex */ #include #include #include #include #include #include #include #include #define PIDFILE_BASE HA_RSCTMPDIR "/IPv6addr-" /* 0 No error, action succeeded completely 1 generic or unspecified error (current practice) The "monitor" operation shall return this for a crashed, hung or otherwise non-functional resource. 2 invalid or excess argument(s) Likely error code for validate-all, if the instance parameters do not validate. Any other action is free to also return this exit status code for this case.
3 unimplemented feature (for example, "reload") 4 user had insufficient privilege 5 program is not installed 6 program is not configured 7 program is not running 8 resource is running in "master" mode and fully operational 9 resource is in "master" mode but in a failed state */ #define OCF_SUCCESS 0 #define OCF_ERR_GENERIC 1 #define OCF_ERR_ARGS 2 #define OCF_ERR_UNIMPLEMENTED 3 #define OCF_ERR_PERM 4 #define OCF_ERR_INSTALLED 5 #define OCF_ERR_CONFIGURED 6 #define OCF_NOT_RUNNING 7 const char* IF_INET6 = "/proc/net/if_inet6"; const char* APP_NAME = "IPv6addr"; const char* START_CMD = "start"; const char* STOP_CMD = "stop"; const char* STATUS_CMD = "status"; const char* MONITOR_CMD = "monitor"; const char* ADVT_CMD = "advt"; const char* RECOVER_CMD = "recover"; const char* RELOAD_CMD = "reload"; const char* META_DATA_CMD = "meta-data"; const char* VALIDATE_CMD = "validate-all"; char BCAST_ADDR[] = "ff02::1"; const int UA_REPEAT_COUNT = 5; const int QUERY_COUNT = 5; #define HWADDR_LEN 6 /* mac address length */ struct in6_ifreq { struct in6_addr ifr6_addr; uint32_t ifr6_prefixlen; unsigned int ifr6_ifindex; }; static int start_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); static int stop_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); static int status_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); static int monitor_addr6(struct in6_addr* addr6, int prefix_len); static int advt_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); static int meta_data_addr6(void); static void usage(const char* self); int write_pid_file(const char *pid_file); int create_pid_directory(const char *pid_file); static void byebye(int nsig); static char* scan_if(struct in6_addr* addr_target, int* plen_target, int use_mask, char* prov_ifname); static char* find_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname); static char* get_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname); 
static int assign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name); static int unassign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name); int is_addr6_available(struct in6_addr* addr6); static int send_ua(struct in6_addr* src_ip, char* if_name); int main(int argc, char* argv[]) { char pid_file[256]; char* ipv6addr; char* cidr_netmask; int ret; char* cp; char* prov_ifname = NULL; int prefix_len = -1; struct in6_addr addr6; /* Check the count of parameters first */ if (argc < 2) { usage(argv[0]); return OCF_ERR_ARGS; } /* set termination signal */ siginterrupt(SIGTERM, 1); signal(SIGTERM, byebye); /* open system log */ cl_log_set_entity(APP_NAME); cl_log_set_facility(LOG_DAEMON); /* the meta-data dont need any parameter */ if (0 == strncmp(META_DATA_CMD, argv[1], strlen(META_DATA_CMD))) { ret = meta_data_addr6(); return OCF_SUCCESS; } /* check the OCF_RESKEY_ipv6addr parameter, should be an IPv6 address */ ipv6addr = getenv("OCF_RESKEY_ipv6addr"); if (ipv6addr == NULL) { cl_log(LOG_ERR, "Please set OCF_RESKEY_ipv6addr to the IPv6 address you want to manage."); usage(argv[0]); return OCF_ERR_ARGS; } /* legacy option */ if ((cp = strchr(ipv6addr, '/'))) { prefix_len = atol(cp + 1); if ((prefix_len < 0) || (prefix_len > 128)) { cl_log(LOG_ERR, "Invalid prefix_len [%s], should be an integer in [0, 128]", cp+1); usage(argv[0]); return OCF_ERR_ARGS; } *cp=0; } /* get provided netmask (optional) */ cidr_netmask = getenv("OCF_RESKEY_cidr_netmask"); if (cidr_netmask != NULL) { if ((atol(cidr_netmask) < 0) || (atol(cidr_netmask) > 128)) { cl_log(LOG_ERR, "Invalid prefix_len [%s], " "should be an integer in [0, 128]", cidr_netmask); usage(argv[0]); return OCF_ERR_ARGS; } if (prefix_len != -1 && prefix_len != atol(cidr_netmask)) { cl_log(LOG_DEBUG, "prefix_len(%d) is overwritted by cidr_netmask(%s)", prefix_len, cidr_netmask); } prefix_len = atol(cidr_netmask); } else if (prefix_len == -1) { prefix_len = 0; } /* get provided interface name (optional) */ 
prov_ifname = getenv("OCF_RESKEY_nic"); if (inet_pton(AF_INET6, ipv6addr, &addr6) <= 0) { cl_log(LOG_ERR, "Invalid IPv6 address [%s]", ipv6addr); usage(argv[0]); return OCF_ERR_ARGS; } /* Check whether this system supports IPv6 */ if (access(IF_INET6, R_OK)) { cl_log(LOG_ERR, "No support for INET6 on this system."); return OCF_ERR_GENERIC; } /* create the pid file so we can make sure that only one IPv6addr * for this address is running */ if (snprintf(pid_file, sizeof(pid_file), "%s%s", PIDFILE_BASE, ipv6addr) >= (int)sizeof(pid_file)) { cl_log(LOG_ERR, "Pid file truncated"); return OCF_ERR_GENERIC; } if (write_pid_file(pid_file) < 0) { return OCF_ERR_GENERIC; } /* switch the command */ if (0 == strncmp(START_CMD,argv[1], strlen(START_CMD))) { ret = start_addr6(&addr6, prefix_len, prov_ifname); }else if (0 == strncmp(STOP_CMD,argv[1], strlen(STOP_CMD))) { ret = stop_addr6(&addr6, prefix_len, prov_ifname); }else if (0 == strncmp(STATUS_CMD,argv[1], strlen(STATUS_CMD))) { ret = status_addr6(&addr6, prefix_len, prov_ifname); }else if (0 ==strncmp(MONITOR_CMD,argv[1], strlen(MONITOR_CMD))) { ret = monitor_addr6(&addr6, prefix_len); }else if (0 ==strncmp(RELOAD_CMD,argv[1], strlen(RELOAD_CMD))) { ret = OCF_ERR_UNIMPLEMENTED; }else if (0 ==strncmp(RECOVER_CMD,argv[1], strlen(RECOVER_CMD))) { ret = OCF_ERR_UNIMPLEMENTED; }else if (0 ==strncmp(VALIDATE_CMD,argv[1], strlen(VALIDATE_CMD))) { /* ipv6addr has been validated by inet_pton, hence a valid IPv6 address */ ret = OCF_SUCCESS; }else if (0 ==strncmp(ADVT_CMD,argv[1], strlen(MONITOR_CMD))) { ret = advt_addr6(&addr6, prefix_len, prov_ifname); }else{ usage(argv[0]); ret = OCF_ERR_ARGS; } /* release the pid file */ unlink(pid_file); return ret; } int start_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) { int i; char* if_name; if(OCF_SUCCESS == status_addr6(addr6,prefix_len,prov_ifname)) { return OCF_SUCCESS; } /* we need to find a proper device to assign the address */ if_name = find_if(addr6, 
&prefix_len, prov_ifname); if (NULL == if_name) { - cl_log(LOG_ERR, "no valid mecahnisms"); + cl_log(LOG_ERR, "no valid mechanisms"); return OCF_ERR_GENERIC; } /* Assign the address */ if (0 != assign_addr6(addr6, prefix_len, if_name)) { cl_log(LOG_ERR, "failed to assign the address to %s", if_name); return OCF_ERR_GENERIC; } /* Check whether the address available */ for (i = 0; i < QUERY_COUNT; i++) { if (0 == is_addr6_available(addr6)) { break; } sleep(1); } if (i == QUERY_COUNT) { cl_log(LOG_ERR, "failed to ping the address"); return OCF_ERR_GENERIC; } /* Send unsolicited advertisement packet to neighbor */ for (i = 0; i < UA_REPEAT_COUNT; i++) { send_ua(addr6, if_name); sleep(1); } return OCF_SUCCESS; } int advt_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) { /* First, we need to find a proper device to assign the address */ char* if_name = get_if(addr6, &prefix_len, prov_ifname); int i; if (NULL == if_name) { - cl_log(LOG_ERR, "no valid mecahnisms"); + cl_log(LOG_ERR, "no valid mechanisms"); return OCF_ERR_GENERIC; } /* Send unsolicited advertisement packet to neighbor */ for (i = 0; i < UA_REPEAT_COUNT; i++) { send_ua(addr6, if_name); sleep(1); } return OCF_SUCCESS; } int stop_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) { char* if_name; if(OCF_NOT_RUNNING == status_addr6(addr6,prefix_len,prov_ifname)) { return OCF_SUCCESS; } if_name = get_if(addr6, &prefix_len, prov_ifname); if (NULL == if_name) { cl_log(LOG_ERR, "no valid mechanisms."); /* I think this should be a success exit according to LSB. 
*/ return OCF_ERR_GENERIC; } /* Unassign the address */ if (0 != unassign_addr6(addr6, prefix_len, if_name)) { cl_log(LOG_ERR, "failed to assign the address to %s", if_name); return OCF_ERR_GENERIC; } return OCF_SUCCESS; } int status_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) { char* if_name = get_if(addr6, &prefix_len, prov_ifname); if (NULL == if_name) { return OCF_NOT_RUNNING; } return OCF_SUCCESS; } int monitor_addr6(struct in6_addr* addr6, int prefix_len) { if(0 == is_addr6_available(addr6)) { return OCF_SUCCESS; } return OCF_NOT_RUNNING; } /* Send an unsolicited advertisement packet * Please refer to rfc4861 / rfc3542 */ int send_ua(struct in6_addr* src_ip, char* if_name) { int status = -1; int fd; int ifindex; int hop; struct ifreq ifr; u_int8_t *payload = NULL; int payload_size; struct nd_neighbor_advert *na; struct nd_opt_hdr *opt; struct sockaddr_in6 src_sin6; struct sockaddr_in6 dst_sin6; if ((fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6)) == 0) { cl_log(LOG_ERR, "socket(IPPROTO_ICMPV6) failed: %s", strerror(errno)); goto err; } /* set the outgoing interface */ ifindex = if_nametoindex(if_name); if (setsockopt(fd, IPPROTO_IPV6, IPV6_MULTICAST_IF, &ifindex, sizeof(ifindex)) < 0) { cl_log(LOG_ERR, "setsockopt(IPV6_MULTICAST_IF) failed: %s", strerror(errno)); goto err; } /* set the hop limit */ hop = 255; /* 255 is required. 
see rfc4861 7.1.2 */ if (setsockopt(fd, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hop, sizeof(hop)) < 0) { cl_log(LOG_ERR, "setsockopt(IPV6_MULTICAST_HOPS) failed: %s", strerror(errno)); goto err; } /* set the source address */ memset(&src_sin6, 0, sizeof(src_sin6)); src_sin6.sin6_family = AF_INET6; src_sin6.sin6_addr = *src_ip; src_sin6.sin6_port = 0; if (IN6_IS_ADDR_LINKLOCAL(&src_sin6.sin6_addr) || IN6_IS_ADDR_MC_LINKLOCAL(&src_sin6.sin6_addr)) { src_sin6.sin6_scope_id = ifindex; } if (bind(fd, (struct sockaddr *)&src_sin6, sizeof(src_sin6)) < 0) { cl_log(LOG_ERR, "bind() failed: %s", strerror(errno)); goto err; } /* get the hardware address */ memset(&ifr, 0, sizeof(ifr)); strncpy(ifr.ifr_name, if_name, sizeof(ifr.ifr_name) - 1); if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { cl_log(LOG_ERR, "ioctl(SIOCGIFHWADDR) failed: %s", strerror(errno)); goto err; } /* build a neighbor advertisement message */ payload_size = sizeof(struct nd_neighbor_advert) + sizeof(struct nd_opt_hdr) + HWADDR_LEN; payload = memalign(sysconf(_SC_PAGESIZE), payload_size); if (!payload) { cl_log(LOG_ERR, "malloc for payload failed"); goto err; } memset(payload, 0, payload_size); /* Ugly typecast from ia64 hell! 
*/ na = (struct nd_neighbor_advert *)((void *)payload); na->nd_na_type = ND_NEIGHBOR_ADVERT; na->nd_na_code = 0; na->nd_na_cksum = 0; /* calculated by kernel */ na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE; na->nd_na_target = *src_ip; /* options field; set the target link-layer address */ opt = (struct nd_opt_hdr *)(payload + sizeof(struct nd_neighbor_advert)); opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; opt->nd_opt_len = 1; /* The length of the option in units of 8 octets */ memcpy(payload + sizeof(struct nd_neighbor_advert) + sizeof(struct nd_opt_hdr), &ifr.ifr_hwaddr.sa_data, HWADDR_LEN); /* sending an unsolicited neighbor advertisement to all */ memset(&dst_sin6, 0, sizeof(dst_sin6)); dst_sin6.sin6_family = AF_INET6; inet_pton(AF_INET6, BCAST_ADDR, &dst_sin6.sin6_addr); /* should not fail */ if (sendto(fd, payload, payload_size, 0, (struct sockaddr *)&dst_sin6, sizeof(dst_sin6)) != payload_size) { cl_log(LOG_ERR, "sendto(%s) failed: %s", if_name, strerror(errno)); goto err; } status = 0; err: close(fd); free(payload); return status; } /* find the network interface associated with an address */ char* scan_if(struct in6_addr* addr_target, int* plen_target, int use_mask, char* prov_ifname) { FILE *f; static char devname[21]=""; struct in6_addr addr; struct in6_addr mask; unsigned int plen, scope, dad_status, if_idx; unsigned int addr6p[4]; /* open /proc/net/if_inet6 file */ if ((f = fopen(IF_INET6, "r")) == NULL) { return NULL; } /* Loop for each entry */ while (1) { int i; int n; int s; gboolean same = TRUE; i = fscanf(f, "%08x%08x%08x%08x %x %02x %02x %02x %20s\n", &addr6p[0], &addr6p[1], &addr6p[2], &addr6p[3], &if_idx, &plen, &scope, &dad_status, devname); if (i == EOF) { break; } else if (i != 9) { cl_log(LOG_INFO, "Error parsing %s, " "perhaps the format has changed\n", IF_INET6); break; } /* Consider link-local addresses (scope == 0x20) only when * the inerface name is provided, and global addresses * (scope == 0). Skip everything else. 
*/ if (scope != 0) { if (scope != 0x20 || prov_ifname == 0 || *prov_ifname == 0) continue; } /* If specified prefix, only same prefix entry * would be considered. */ if (*plen_target!=0 && plen != *plen_target) { continue; } /* If interface name provided, only same devname entry * would be considered */ if (prov_ifname!=0 && *prov_ifname!=0) { if (strcmp(devname, prov_ifname)) continue; } for (i = 0; i< 4; i++) { addr.s6_addr32[i] = htonl(addr6p[i]); } /* Make the mask based on prefix length */ memset(mask.s6_addr, 0xff, 16); if (use_mask && plen < 128) { n = plen / 32; memset(mask.s6_addr32 + n + 1, 0, (3 - n) * 4); s = 32 - plen % 32; if (s == 32) mask.s6_addr32[n] = 0x0; else mask.s6_addr32[n] = 0xffffffff << s; mask.s6_addr32[n] = htonl(mask.s6_addr32[n]); } /* compare addr and addr_target */ same = TRUE; for (i = 0; i < 4; i++) { if ((addr.s6_addr32[i]&mask.s6_addr32[i]) != (addr_target->s6_addr32[i]&mask.s6_addr32[i])) { same = FALSE; break; } } /* We found it! */ if (same) { fclose(f); *plen_target = plen; return devname; } } fclose(f); return NULL; } /* find a proper network interface to assign the address */ char* find_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname) { char *best_ifname = scan_if(addr_target, plen_target, 1, prov_ifname); /* use the provided ifname and prefix if the address did not match */ if (best_ifname == NULL && prov_ifname != 0 && *prov_ifname != 0 && *plen_target != 0) { cl_log(LOG_INFO, "Could not find a proper interface by the ipv6addr. 
Using the specified nic:'%s' and cidr_netmask:'%d'", prov_ifname, *plen_target); return prov_ifname; } return best_ifname; } /* get the device name and the plen_target of a special address */ char* get_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname) { return scan_if(addr_target, plen_target, 0, prov_ifname); } int assign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name) { struct in6_ifreq ifr6; /* Get socket first */ int fd; struct ifreq ifr; fd = socket(AF_INET6, SOCK_DGRAM, 0); if (fd < 0) { return 1; } /* Query the index of the if */ strcpy(ifr.ifr_name, if_name); if (ioctl(fd, SIOGIFINDEX, &ifr) < 0) { return -1; } /* Assign the address to the if */ ifr6.ifr6_addr = *addr6; ifr6.ifr6_ifindex = ifr.ifr_ifindex; ifr6.ifr6_prefixlen = prefix_len; if (ioctl(fd, SIOCSIFADDR, &ifr6) < 0) { return -1; } close (fd); return 0; } int unassign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name) { int fd; struct ifreq ifr; struct in6_ifreq ifr6; /* Get socket first */ fd = socket(AF_INET6, SOCK_DGRAM, 0); if (fd < 0) { return 1; } /* Query the index of the if */ strcpy(ifr.ifr_name, if_name); if (ioctl(fd, SIOGIFINDEX, &ifr) < 0) { return -1; } /* Unassign the address to the if */ ifr6.ifr6_addr = *addr6; ifr6.ifr6_ifindex = ifr.ifr_ifindex; ifr6.ifr6_prefixlen = prefix_len; if (ioctl(fd, SIOCDIFADDR, &ifr6) < 0) { return -1; } close (fd); return 0; } #define MINPACKSIZE 64 int is_addr6_available(struct in6_addr* addr6) { struct sockaddr_in6 addr; struct icmp6_hdr icmph; u_char outpack[MINPACKSIZE]; int icmp_sock; int ret; struct iovec iov; u_char packet[MINPACKSIZE]; struct msghdr msg; icmp_sock = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6); memset(&icmph, 0, sizeof(icmph)); icmph.icmp6_type = ICMP6_ECHO_REQUEST; icmph.icmp6_code = 0; icmph.icmp6_cksum = 0; icmph.icmp6_seq = htons(0); icmph.icmp6_id = 0; memset(&outpack, 0, sizeof(outpack)); memcpy(&outpack, &icmph, sizeof(icmph)); memset(&addr, 0, sizeof(struct sockaddr_in6)); 
addr.sin6_family = AF_INET6; addr.sin6_port = htons(IPPROTO_ICMPV6); memcpy(&addr.sin6_addr,addr6,sizeof(struct in6_addr)); /* Only the first 8 bytes of outpack are meaningful... */ ret = sendto(icmp_sock, (char *)outpack, sizeof(outpack), 0, (struct sockaddr *) &addr, sizeof(struct sockaddr_in6)); if (0 >= ret) { return -1; } iov.iov_base = (char *)packet; iov.iov_len = sizeof(packet); msg.msg_name = &addr; msg.msg_namelen = sizeof(addr); msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_controllen = 0; ret = recvmsg(icmp_sock, &msg, MSG_DONTWAIT); if (0 >= ret) { return -1; } return 0; } static void usage(const char* self) { printf("usage: %s {start|stop|status|monitor|validate-all|meta-data}\n",self); return; } /* Following code is copied from send_arp.c, linux-HA project. */ void byebye(int nsig) { (void)nsig; /* Avoid an "error exit" log message if we're killed */ exit(0); } int create_pid_directory(const char *pid_file) { int status; int return_status = -1; struct stat stat_buf; char* dir; dir = strdup(pid_file); if (!dir) { cl_log(LOG_INFO, "Memory allocation failure: %s", strerror(errno)); return -1; } dirname(dir); status = stat(dir, &stat_buf); if (status < 0 && errno != ENOENT && errno != ENOTDIR) { cl_log(LOG_INFO, "Could not stat pid-file directory " "[%s]: %s", dir, strerror(errno)); goto err; } if (!status) { if (S_ISDIR(stat_buf.st_mode)) { goto out; } cl_log(LOG_INFO, "Pid-File directory exists but is " "not a directory [%s]", dir); goto err; } if (mkdir(dir, S_IRUSR|S_IWUSR|S_IXUSR | S_IRGRP|S_IXGRP) < 0) { cl_log(LOG_INFO, "Could not create pid-file directory " "[%s]: %s", dir, strerror(errno)); goto err; } out: return_status = 0; err: free(dir); return return_status; } int write_pid_file(const char *pid_file) { int pidfilefd; char pidbuf[11]; unsigned long pid; ssize_t bytes; if (*pid_file != '/') { cl_log(LOG_INFO, "Invalid pid-file name, must begin with a " "'/' [%s]\n", pid_file); return -1; } if 
(create_pid_directory(pid_file) < 0) { return -1; } while (1) { pidfilefd = open(pid_file, O_CREAT|O_EXCL|O_RDWR, S_IRUSR|S_IWUSR); if (pidfilefd < 0) { if (errno != EEXIST) { /* Old PID file */ cl_log(LOG_INFO, "Could not open pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } } else { break; } pidfilefd = open(pid_file, O_RDONLY, S_IRUSR|S_IWUSR); if (pidfilefd < 0) { cl_log(LOG_INFO, "Could not open pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } while (1) { bytes = read(pidfilefd, pidbuf, sizeof(pidbuf)-1); if (bytes < 0) { if (errno == EINTR) { continue; } cl_log(LOG_INFO, "Could not read pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } pidbuf[bytes] = '\0'; break; } if(unlink(pid_file) < 0) { cl_log(LOG_INFO, "Could not delete pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } if (!bytes) { cl_log(LOG_INFO, "Invalid pid in pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } close(pidfilefd); pid = strtoul(pidbuf, NULL, 10); if (pid == ULONG_MAX && errno == ERANGE) { cl_log(LOG_INFO, "Invalid pid in pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } if (kill(pid, SIGKILL) < 0 && errno != ESRCH) { cl_log(LOG_INFO, "Error killing old proccess [%lu] " "from pid-file [%s]: %s", pid, pid_file, strerror(errno)); return -1; } cl_log(LOG_INFO, "Killed old send_arp process [%lu]", pid); } if (snprintf(pidbuf, sizeof(pidbuf), "%u" , getpid()) >= (int)sizeof(pidbuf)) { cl_log(LOG_INFO, "Pid too long for buffer [%u]", getpid()); return -1; } while (1) { bytes = write(pidfilefd, pidbuf, strlen(pidbuf)); if (bytes != strlen(pidbuf)) { if (bytes < 0 && errno == EINTR) { continue; } cl_log(LOG_INFO, "Could not write pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } break; } close(pidfilefd); return 0; } static int meta_data_addr6(void) { const char* meta_data= "\n" "\n" "\n" " 1.0\n" " \n" " This script manages IPv6 alias IPv6 addresses,It can add an IP6\n" " alias, or remove 
one.\n" " \n" " Manages IPv6 aliases\n" " \n" " \n" " \n" " The IPv6 address this RA will manage \n" " \n" " IPv6 address\n" " \n" " \n" " \n" " \n" " The netmask for the interface in CIDR format. (ie, 24).\n" " The value of this parameter overwrites the value of _prefix_\n" " of ipv6addr parameter.\n" " \n" " Netmask\n" " \n" " \n" " \n" " \n" " The base network interface on which the IPv6 address will\n" " be brought online.\n" " \n" " Network interface\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" "\n"; printf("%s\n",meta_data); return OCF_SUCCESS; } diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am index adfd97c68..5f383977e 100644 --- a/heartbeat/Makefile.am +++ b/heartbeat/Makefile.am @@ -1,131 +1,134 @@ # Makefile.am for OCF RAs # # Author: Sun Jing Dong # Copyright (C) 2004 IBM # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = $(ocf_SCRIPTS) $(ocfcommon_DATA) \ $(common_DATA) $(hb_DATA) $(dtd_DATA) \ README INCLUDES = -I$(top_srcdir)/include -I$(top_srcdir)/linux-ha ocfdir = $(OCF_RA_DIR_PREFIX)/heartbeat dtddir = $(datadir)/$(PACKAGE_NAME) dtd_DATA = ra-api-1.dtd if USE_IPV6ADDR ocf_PROGRAMS = IPv6addr else ocf_PROGRAMS = endif IPv6addr_SOURCES = IPv6addr.c IPv6addr_LDADD = -lplumb $(LIBNETLIBS) ocf_SCRIPTS = ClusterMon \ CTDB \ Dummy \ IPaddr \ IPaddr2 \ drbd \ anything \ AoEtarget \ apache \ asterisk \ nginx \ AudibleAlarm \ conntrackd \ db2 \ + dhcpd \ Delay \ eDir88 \ EvmsSCC \ Evmsd \ ethmonitor \ exportfs \ Filesystem \ fio \ ids \ iscsi \ ICP \ IPsrcaddr \ iSCSITarget \ iSCSILogicalUnit \ jboss \ LinuxSCSI \ LVM \ lxc \ MailTo \ ManageRAID \ ManageVE \ mysql \ mysql-proxy \ named \ nfsserver \ oracle \ oralsnr \ pingd \ portblock \ postfix \ pound \ pgsql \ proftpd \ Pure-FTPd \ Raid1 \ Route \ rsyncd \ rsyslog \ SAPDatabase \ SAPInstance \ SendArp \ ServeRAID \ slapd \ SphinxSearchDaemon \ Squid \ Stateful \ SysInfo \ scsi2reservation \ sfex \ symlink \ syslog-ng \ tomcat \ VIPArip \ VirtualDomain \ varnish \ vmware \ WAS \ WAS6 \ WinPopup \ Xen \ Xinetd ocfcommondir = $(OCF_LIB_DIR_PREFIX)/heartbeat ocfcommon_DATA = ocf-shellfuncs \ ocf-binaries \ ocf-directories \ ocf-returncodes \ + ocf-rarun \ apache-conf.sh \ http-mon.sh \ sapdb-nosha.sh \ - sapdb.sh + sapdb.sh \ + ora-common.sh # Legacy locations hbdir = $(sysconfdir)/ha.d hb_DATA = shellfuncs diff --git a/heartbeat/Raid1 b/heartbeat/Raid1 index f85f55a1b..231e05323 100755 --- a/heartbeat/Raid1 +++ b/heartbeat/Raid1 @@ -1,412 +1,456 @@ #!/bin/sh # # # License: GNU General Public License (GPL) # Support: linux-ha@lists.linux-ha.org # # Raid1 -# Description: Manages a software Raid1 device on a shared storage medium. +# Description: Manages a Linux software RAID device on a shared storage medium. # Original Author: Eric Z. 
Ayers (eric.ayers@compgen.com) # Original Release: 25 Oct 2000 # RAID patches: http://people.redhat.com/mingo/raid-patches/ # Word to the Wise: http://lwn.net/2000/0810/a/raid-faq.php3 # Sympathetic Ear: mailto:linux-raid@vger.kernel.org # # usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} # # # EXAMPLE config file /etc/raidtab.md0 # This file must exist on both machines! # # raiddev /dev/md0 # raid-level 1 # nr-raid-disks 2 # chunk-size 64k # persistent-superblock 1 # #nr-spare-disks 0 # device /dev/sda1 # raid-disk 0 # device /dev/sdb1 # raid-disk 1 # # EXAMPLE config file /etc/mdadm.conf (for more info:man mdadm.conf) # # DEVICE /dev/sdb1 /dev/sdc1 # ARRAY /dev/md0 UUID=4a865b55:ba27ef8d:29cd5701:6fb42799 ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### usage() { cat <<-EOT usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} EOT } meta_data() { cat < 1.0 -Resource script for RAID1. It manages a software Raid1 device on a shared -storage medium. +This resource agent manages Linux software RAID (MD) devices on +a shared storage medium. It uses mdadm(8) to start, stop, and +monitor the MD devices. Raidtools are supported, but deprecated. +See https://raid.wiki.kernel.org/index.php/Linux_Raid for more +information. -Manages a software RAID1 device on shared storage +Manages Linux software RAID (MD) devices on shared storage -The RAID configuration file. e.g. /etc/raidtab or /etc/mdadm.conf. +The RAID configuration file, e.g. /etc/mdadm.conf. RAID config file -The block device to use. Alternatively, set to "auto" to manage -all devices specified in raidconf. +One or more block devices to use, space separated. Alternatively, +set to "auto" to manage all devices specified in raidconf. 
block device The value for the homehost directive; this is an mdadm feature to protect RAIDs against being activated by accident. It is recommended to create RAIDs managed by the cluster with "homehost" set to a special value, so they are not accidentially auto-assembled by nodes not supposed to own them. Homehost for mdadm + + + +If processes or kernel threads are using the array, it cannot be +stopped. We will try to stop processes, first by sending TERM and +then, if that doesn't help in $PROC_CLEANUP_TIME seconds, using KILL. +The lsof(8) program is required to get the list of array users. +Of course, the kernel threads cannot be stopped this way. +If the processes are critical for data integrity, then set this +parameter to false. Note that in that case the stop operation +will fail and the node will be fenced. + +force stop processes using the array + + END } list_conf_arrays() { test -f $RAIDCONF || { ocf_log err "$RAIDCONF gone missing!" exit $OCF_ERR_GENERIC } grep ^ARRAY $RAIDCONF | awk '{print $2}' } forall() { local func=$1 local checkall=$2 local mddev rc=0 - for mddev in `list_conf_arrays`; do + for mddev in $RAIDDEVS; do $func $mddev rc=$(($rc | $?)) [ "$checkall" = all ] && continue [ $rc -ne 0 ] && return $rc done return $rc } +are_arrays_stopped() { + local rc mddev + for mddev in $RAIDDEVS; do + raid1_monitor_one $mddev + rc=$? + [ $rc -ne $OCF_NOT_RUNNING ] && break + done + test $rc -eq $OCF_NOT_RUNNING +} +md_assemble() { + local mddev=$1 + $MDADM --assemble $mddev --config=$RAIDCONF $MDADM_HOMEHOST +} # # START: Start up the RAID device # raid1_start() { + local rc raid1_monitor rc=$? if [ $rc -eq $OCF_SUCCESS ]; then # md already online, nothing to do. return $OCF_SUCCESS fi if [ $rc -ne $OCF_NOT_RUNNING ]; then # If the array is in a broken state, this agent doesn't # know how to repair that. 
- ocf_log err "$MDDEV in a broken state; cannot start (rc=$rc)" + ocf_log err "$RAIDDEVS in a broken state; cannot start (rc=$rc)" return $OCF_ERR_GENERIC fi - # Insert raid personality module - $MODPROBE raid1 - if [ $? -ne 0 ] ; then - # It is not fatal, chance is that we have raid1 builtin... - ocf_log warn "Couldn't insert RAID1 module" - fi - grep -q "^Personalities.*\[raid1\]" /proc/mdstat 2>/dev/null - if [ $? -ne 0 ] ; then - ocf_log err "We don't have RAID1 support! Exiting" - return $OCF_ERR_GENERIC - fi - if [ $HAVE_RAIDTOOLS = "true" ]; then # Run raidstart to start up the RAID array $RAIDSTART --configfile $RAIDCONF $MDDEV else - # Run mdadm - if [ "$MDDEV" = auto ]; then - $MDADM --assemble --scan --config=$RAIDCONF $MDADM_HOMEHOST - else - $MDADM --assemble $MDDEV --config=$RAIDCONF $MDADM_HOMEHOST - fi + forall md_assemble all fi raid1_monitor if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS else - ocf_log err "Couldn't start RAID for $MDDEV" + ocf_log err "Couldn't start RAID for $RAIDDEVS" return $OCF_ERR_GENERIC fi } # # STOP: stop the RAID device # mark_readonly() { local mddev=$1 local rc ocf_log info "Attempting to mark array $mddev readonly" $MDADM --readonly $mddev --config=$RAIDCONF rc=$? if [ $rc -ne 0 ]; then ocf_log err "Failed to set $mddev readonly (rc=$rc)" fi return $rc } raid1_stop_one() { ocf_log info "Stopping array $1" $MDADM --stop $1 --config=$RAIDCONF --wait-clean -W } +get_users_pids() { + local mddev=$1 + local outp l + ocf_log debug "running lsof to list $mddev users..." 
+ outp=`lsof $mddev | tail -n +2` + echo "$outp" | awk '{print $2}' | sort -u + echo "$outp" | while read l; do + ocf_log warn "$l" + done +} +stop_raid_users() { + local pids + pids=`forall get_users_pids all | sort -u` + if [ -z "$pids" ]; then + ocf_log warn "lsof reported no users holding arrays" + return 2 + else + ocf_stop_processes TERM $PROC_CLEANUP_TIME $pids + fi +} +stop_arrays() { + if [ $HAVE_RAIDTOOLS = "true" ]; then + $RAIDSTOP --configfile $RAIDCONF $MDDEV + else + forall raid1_stop_one all + fi +} raid1_stop() { local rc # See if the MD device is already cleanly stopped: - if [ "$MDDEV" != auto ]; then - raid1_monitor - if [ $? -eq $OCF_NOT_RUNNING ]; then - return $OCF_SUCCESS - fi + if are_arrays_stopped; then + return $OCF_SUCCESS fi # Turn off raid - if [ $HAVE_RAIDTOOLS = "true" ]; then - $RAIDSTOP --configfile $RAIDCONF $MDDEV - else - if [ "$MDDEV" = auto ]; then - forall raid1_stop_one all + if ! stop_arrays; then + if ocf_is_true $FORCESTOP; then + if have_binary lsof; then + stop_raid_users + case $? in + 2) false;; + *) stop_arrays;; + esac + else + ocf_log warn "install lsof(8) to list users holding the disk" + false + fi else - raid1_stop_one $MDDEV + false fi fi rc=$? if [ $rc -ne 0 ]; then - ocf_log err "Couldn't stop RAID for $MDDEV (rc=$rc)" + ocf_log warn "Couldn't stop RAID for $RAIDDEVS (rc=$rc)" if [ $HAVE_RAIDTOOLS != "true" ]; then - if [ "$MDDEV" = auto ]; then - forall mark_readonly all - else - mark_readonly $MDDEV - fi + forall mark_readonly all fi return $OCF_ERR_GENERIC fi - if [ "$MDDEV" = auto ]; then - local mddev - for mddev in `list_conf_arrays`; do - raid1_monitor_one $mddev - rc=$? - [ $rc -ne $OCF_NOT_RUNNING ] && break - done - else - raid1_monitor_one $MDDEV - rc=$? - fi - if [ $rc -eq $OCF_NOT_RUNNING ]; then + if are_arrays_stopped; then return $OCF_SUCCESS fi - - ocf_log err "RAID $MDDEV still active after stop command!" + + ocf_log err "RAID $RAIDDEVS still active after stop command!" 
return $OCF_ERR_GENERIC } # # monitor: a less noisy status # raid1_monitor_one() { local mddev=$1 local md=`echo $mddev | sed 's,/dev/,,'` local rc - TRY_READD=0 + local TRY_READD=0 # check if the md device exists first if [ ! -b $mddev ]; then ocf_log info "$mddev is not a block device" return $OCF_NOT_RUNNING fi if ! grep -e "^$md[ \t:]" /proc/mdstat >/dev/null ; then ocf_log info "$md not found in /proc/mdstat" return $OCF_NOT_RUNNING fi if [ $HAVE_RAIDTOOLS != "true" ]; then $MDADM --detail --test $mddev >/dev/null 2>&1 ; rc=$? case $rc in 0) ;; 1) ocf_log warn "$mddev has at least one failed device." TRY_READD=1 ;; 2) ocf_log err "$mddev has failed." return $OCF_ERR_GENERIC ;; 4) ocf_log err "mdadm failed on $mddev." return $OCF_ERR_GENERIC ;; *) ocf_log err "mdadm returned an unknown result ($rc)." return $OCF_ERR_GENERIC ;; esac fi if [ "$__OCF_ACTION" = "monitor" -a "$OCF_RESKEY_CRM_meta_interval" != 0 \ -a $TRY_READD -eq 1 -a $OCF_CHECK_LEVEL -gt 0 ]; then ocf_log info "Attempting recovery sequence to re-add devices on $mddev:" $MDADM $mddev --fail detached $MDADM $mddev --remove failed $MDADM $mddev --re-add missing # TODO: At this stage, there's nothing to actually do # here. Either this worked or it did not. fi if ! dd if=$mddev count=1 bs=512 of=/dev/null \ iflag=direct >/dev/null 2>&1 ; then ocf_log err "$mddev: I/O error on read" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } raid1_monitor() { - if [ "$MDDEV" = auto ]; then - forall raid1_monitor_one - else - raid1_monitor_one $MDDEV - fi + forall raid1_monitor_one } # # STATUS: is the raid device online or offline? # raid1_status() { # See if the MD device is online local rc raid1_monitor rc=$? 
if [ $rc -ne $OCF_SUCCESS ]; then echo "stopped" else echo "running" fi return $rc } raid1_validate_all() { return $OCF_SUCCESS } - + +PROC_CLEANUP_TIME=3 if ( [ $# -ne 1 ] ) then usage exit $OCF_ERR_ARGS fi case "$1" in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; *) ;; esac RAIDCONF="$OCF_RESKEY_raidconf" MDDEV="$OCF_RESKEY_raiddev" +FORCESTOP="${OCF_RESKEY_force_stop:-1}" if [ -z "$RAIDCONF" ] ; then ocf_log err "Please set OCF_RESKEY_raidconf!" exit $OCF_ERR_CONFIGURED fi if [ ! -r "$RAIDCONF" ] ; then ocf_log err "Configuration file [$RAIDCONF] does not exist, or can not be opend!" exit $OCF_ERR_INSTALLED fi if [ -z "$MDDEV" ] ; then ocf_log err "Please set OCF_RESKEY_raiddev to the Raid device you want to control!" exit $OCF_ERR_CONFIGURED fi +if ocf_is_true $FORCESTOP && ! have_binary lsof; then + ocf_log warn "Please install lsof(8), we may need it when stopping Raid device! Now continuing anyway ..." +fi + HAVE_RAIDTOOLS=false if have_binary $MDADM >/dev/null 2>&1 ; then if [ -n "$OCF_RESKEY_homehost" ]; then MDADM_HOMEHOST="--homehost=${OCF_RESKEY_homehost}" else MDADM_HOMEHOST="" fi else check_binary $RAIDSTART HAVE_RAIDTOOLS=true fi -if [ "$MDDEV" = "auto" -a $HAVE_RAIDTOOLS = true ]; then - ocf_log err "autoconf supported only with mdadm!" - exit $OCF_ERR_INSTALLED +if [ $HAVE_RAIDTOOLS = true ]; then + if [ "$MDDEV" = "auto" ]; then + ocf_log err "autoconf supported only with mdadm!" + exit $OCF_ERR_INSTALLED + elif [ `echo $MDDEV|wc -w` -gt 1 ]; then + ocf_log err "multiple devices supported only with mdadm!" 
+ exit $OCF_ERR_INSTALLED + fi +fi + +if [ "$MDDEV" = "auto" ]; then + RAIDDEVS=`list_conf_arrays` +else + RAIDDEVS="$MDDEV" fi # At this stage, # [ $HAVE_RAIDTOOLS = false ] <=> we have $MDADM, # otherwise we have raidtools (raidstart and raidstop) # Look for how we are called case "$1" in start) raid1_start ;; stop) raid1_stop ;; status) raid1_status ;; monitor) raid1_monitor ;; validate-all) raid1_validate_all ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/SendArp b/heartbeat/SendArp index 553469b36..faba7f047 100755 --- a/heartbeat/SendArp +++ b/heartbeat/SendArp @@ -1,257 +1,257 @@ #!/bin/sh # # # Copyright (c) 2006, Huang Zhen # Converting original heartbeat RA to OCF RA. # # Copyright (C) 2004 Horms # # Based on IPaddr2: Copyright (C) 2003 Tuomo Soini # # License: GNU General Public License (GPL) # Support: linux-ha@lists.linux-ha.org # # This script send out gratuitous Arp for an IP address # # It can be used _instead_ of the IPaddr2 or IPaddr resource # to send gratuitous arp for an IP address on a given interface, # without adding the address to that interface. I.e. if for # some reason you want to send gratuitous arp for addresses # managed by IPaddr2 or IPaddr on an additional interface. # # OCF parameters are as below: # OCF_RESKEY_ip # OCF_RESKEY_nic # # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. 
Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs SENDARP=$HA_BIN/send_arp SENDARPPIDDIR=${HA_RSCTMP} BASEIP="$OCF_RESKEY_ip" INTERFACE="$OCF_RESKEY_nic" RESIDUAL="" SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$BASEIP" # Set default values : ${ARP_INTERVAL_MS=200} # milliseconds between ARPs : ${ARP_REPEAT=5} # repeat count : ${ARP_BACKGROUND=yes} # no to run in foreground : ${ARP_NETMASK=ffffffffffff} # netmask for ARP ####################################################################### sendarp_meta_data() { cat < 1.0 This RA can be used _instead_ of the IPaddr2 or IPaddr RA to send gratuitous ARP for an IP address on a given interface, without adding the address to that interface. For example, if for some resaon you wanted to send gratuitous ARP for addresses managed by IPaddr2 or IPaddr on an additional interface. Broadcasts unsolicited ARP announcements The IP address for sending ARP packet. IP address The NIC for sending ARP packet. NIC END } ####################################################################### sendarp_usage() { cat < 1.0 The resource agent of Squid. This manages a Squid instance as an HA resource. Manages a Squid proxy server instance This is a required parameter. This parameter specifies squid's executable file. Executable file This is a required parameter. This parameter specifies a configuration file for a squid instance managed by this RA. Configuration file This is a required parameter. 
This parameter specifies a process id file for a squid instance managed by this RA. Pidfile This is a required parameter. This parameter specifies a port number for a squid instance managed by this RA. If plural ports are used, you must specifiy the only one of them. Port number -This is an omittable parameter. -On a stop action, a normal stop method is firstly used. -and then the confirmation of its completion is awaited for -the specified seconds by this parameter. -The default value is 10. +On stop, a squid shutdown is invoked first. If the resource +doesn't stop within this timeout, we resort to stopping +processes by sending signals and finally KILLing them. -Number of seconds to await to confirm a normal stop method +how long to wait for squid shutdown to stop the +instance before resorting to kill This is an optional parameter. This RA runs in debug mode when this parameter includes 'x' or 'v'. If 'x' is included, both of STDOUT and STDERR redirect to the logfile specified by "debug_log", and then the builtin shell option 'x' is turned on. It is similar about 'v'. Debug mode This is an optional and omittable parameter. This parameter specifies a destination file for debug logs and works only if this RA run in debug mode. Refer to "debug_mode" about debug mode. If no value is given but it's requied, it's made by the following rules: "/var/log/" as a directory part, the basename of the configuration file given by "syslog_ng_conf" as a basename part, ".log" as a suffix. A destination of the debug log END return $OCF_SUCCESS } get_pids() { SQUID_PIDS=( ) # Seek by pattern SQUID_PIDS[0]=$(pgrep -f "$PROCESS_PATTERN") # Seek by pidfile SQUID_PIDS[1]=$(awk '1{print $1}' $SQUID_PIDFILE 2>/dev/null) if [[ -n "${SQUID_PIDS[1]}" ]]; then typeset exe exe=$(ls -l "/proc/${SQUID_PIDS[1]}/exe") if [[ $? = 0 ]]; then exe=${exe##*-> } if ! 
[[ "$exe" = $SQUID_EXE ]]; then SQUID_PIDS[1]="" fi else SQUID_PIDS[1]="" fi fi # Seek by port SQUID_PIDS[2]=$( netstat -apn | - awk '/tcp.*[0-9]+\.[0-9]+\.+[0-9]+\.[0-9]+:'$SQUID_PORT' /{ + awk '/tcp.*[0-9]+\.[0-9]+\.+[0-9]+\.[0-9]+:'$SQUID_PORT' / && $7~/^[1-9]/ { sub("\\/.*", "", $7); print $7; exit}') } are_all_pids_found() { if [[ -n "${SQUID_PIDS[0]}" ]] && [[ -n "${SQUID_PIDS[1]}" ]] && [[ -n "${SQUID_PIDS[2]}" ]] then return 0 else return 1 fi } are_pids_sane() { if [[ "${SQUID_PIDS[1]}" = "${SQUID_PIDS[2]}" ]]; then return $OCF_SUCCESS else ocf_log err "$SQUID_NAME:Pid unmatch" return $OCF_ERR_GENERIC fi } is_squid_dead() { if [[ -z "${SQUID_PIDS[0]}" ]] && [[ -z "${SQUID_PIDS[2]}" ]] then return 0 else return 1 fi } monitor_squid() { typeset trialcount=0 while true; do get_pids if are_all_pids_found; then are_pids_sane return $OCF_SUCCESS fi if is_squid_dead; then return $OCF_NOT_RUNNING fi ocf_log info "$SQUID_NAME:Inconsistent processes:" \ "${SQUID_PIDS[0]},${SQUID_PIDS[1]},${SQUID_PIDS[2]}" (( trialcount = trialcount + 1 )) if (( trialcount > SQUID_CONFIRM_TRIALCOUNT )); then ocf_log err "$SQUID_NAME:Inconsistency of processes remains unsolved" return $OCF_ERR_GENERIC fi sleep 1 done } start_squid() { typeset status monitor_squid status=$? if [[ $status != $OCF_NOT_RUNNING ]]; then return $status fi set -- "$SQUID_OPTS" ocf_run $SQUID_EXE -f "$SQUID_CONF" "$@" status=$? 
if [[ $status != $OCF_SUCCESS ]]; then return $OCF_ERR_GENERIC fi while true; do get_pids if are_all_pids_found && are_pids_sane; then return $OCF_SUCCESS fi ocf_log info "$SQUID_NAME:Waiting for squid to be invoked" sleep 1 done return $OCF_ERR_GENERIC } stop_squid() { typeset lapse_sec if ocf_run $SQUID_EXE -f $SQUID_CONF -k shutdown; then lapse_sec=0 while true; do get_pids if is_squid_dead; then rm -f $SQUID_PIDFILE return $OCF_SUCCESS fi (( lapse_sec = lapse_sec + 1 )) if (( lapse_sec > SQUID_STOP_TIMEOUT )); then break fi sleep 1 ocf_log info "$SQUID_NAME:$FUNCNAME:$LINENO: " \ "stop NORM $lapse_sec/$SQUID_STOP_TIMEOUT" done fi while true; do get_pids ocf_log info "$SQUID_NAME:$FUNCNAME:$LINENO: " \ "try to stop by SIGKILL:${SQUID_PIDS[0]} ${SQUID_PIDS[2]}" kill -KILL ${SQUID_PIDS[0]} ${SQUID_PIDS[2]} sleep 1 if is_squid_dead; then rm -f $SQUID_PIDFILE return $OCF_SUCCESS fi done return $OCF_ERR_GENERIC } status_squid() { return $OCF_SUCCESS } validate_all_squid() { ocf_log info "validate_all_squid[$SQUID_NAME]" return $OCF_SUCCESS } : === Debug ${0##*/} $1 === if [[ "$1" = "meta-data" ]]; then metadata_squid exit $? fi SQUID_CONF="${OCF_RESKEY_squid_conf}" if [[ -z "$SQUID_CONF" ]]; then ocf_log err "SQUID_CONF is not defined" exit $OCF_ERR_CONFIGURED fi SQUID_NAME="${SQUID_CONF##*/}" SQUID_NAME="${SQUID_NAME%.*}" DEBUG_LOG="${OCF_RESKEY_debug_log-/var/log/squid_${SQUID_NAME}_debug}.log" DEBUG_MODE="" case $OCF_RESKEY_debug_mode in *x*) DEBUG_MODE="${DEBUG_MODE}x";; esac case $OCF_RESKEY_debug_mode in *v*) DEBUG_MODE="${DEBUG_MODE}v";; esac if [ -n "$DEBUG_MODE" ]; then PS4='\d \t \h '"${1-unknown} " export PS4 exec 1>>$DEBUG_LOG 2>&1 set -$DEBUG_MODE fi SQUID_EXE="${OCF_RESKEY_squid_exe}" if [[ -z "$SQUID_EXE" ]]; then ocf_log err "SQUID_EXE is not defined" exit $OCF_ERR_CONFIGURED fi if [[ ! 
-x "$SQUID_EXE" ]]; then ocf_log err "$SQUID_EXE is not found" exit $OCF_ERR_CONFIGURED fi SQUID_PIDFILE="${OCF_RESKEY_squid_pidfile}" if [[ -z "$SQUID_PIDFILE" ]]; then ocf_log err "SQUID_PIDFILE is not defined" exit $OCF_ERR_CONFIGURED fi SQUID_PORT="${OCF_RESKEY_squid_port}" if [[ -z "$SQUID_PORT" ]]; then ocf_log err "SQUID_PORT is not defined" exit $OCF_ERR_CONFIGURED fi SQUID_OPTS="${OCF_RESKEY_squid_opts}" SQUID_PIDS=( ) SQUID_CONFIRM_TRIALCOUNT="${OCF_RESKEY_squid_confirm_trialcount-3}" -SQUID_STOP_TIMEOUT="${OCF_RESKEY_squid_stop_timeout-5}" +SQUID_STOP_TIMEOUT="${OCF_RESKEY_squid_stop_timeout-10}" SQUID_SUSPEND_TRIALCOUNT="${OCF_RESKEY_squid_suspend_trialcount-10}" PROCESS_PATTERN="$SQUID_EXE -f $SQUID_CONF" COMMAND=$1 case "$COMMAND" in start) ocf_log debug "[$SQUID_NAME] Enter squid start" start_squid func_status=$? ocf_log debug "[$SQUID_NAME] Leave squid start $func_status" exit $func_status ;; stop) ocf_log debug "[$SQUID_NAME] Enter squid stop" stop_squid func_status=$? ocf_log debug "[$SQUID_NAME] Leave squid stop $func_status" exit $func_status ;; status) status_squid exit $? ;; monitor) #ocf_log debug "[$SQUID_NAME] Enter squid monitor" monitor_squid func_status=$? #ocf_log debug "[$SQUID_NAME] Leave squid monitor $func_status" exit $func_status ;; validate-all) validate_all_squid exit $? ;; *) usage ;; esac # vim: set sw=4 ts=4 : diff --git a/heartbeat/VIPArip b/heartbeat/VIPArip index 0e81ed82f..01c6c994f 100755 --- a/heartbeat/VIPArip +++ b/heartbeat/VIPArip @@ -1,294 +1,302 @@ #!/bin/sh # # License: GNU General Public License (GPL) # Support: linux-ha@lists.linux-ha.org # Author: Huang Zhen # Copyright (c) 2006 International Business Machines # # Virtual IP Address by RIP2 protocol. # This script manages IP alias in different subnet with quagga/ripd. # It can add an IP alias, or remove one. 
# # The quagga package should be installed to run this RA # # usage: $0 {start|stop|status|monitor|validate-all|meta-data} # # The "start" arg adds an IP alias. # Surprisingly, the "stop" arg removes one. :-) # # OCF parameters are as below # OCF_RESKEY_ip The IP address in different subnet # OCF_RESKEY_nic The nic for broadcast the route information # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs RIPDCONF=$HA_RSCTMP/VIPArip-ripd.conf ZEBRA=/usr/sbin/zebra RIPD=/usr/sbin/ripd USAGE="usage: $0 {start|stop|status|monitor|validate-all|meta-data}"; ####################################################################### meta_data() { cat < 1.0 Virtual IP Address by RIP2 protocol. This script manages IP alias in different subnet with quagga/ripd. It can add an IP alias, or remove one. Manages a virtual IP address through RIP2 The IPv4 address in different subnet, for example "192.168.1.1". The IP address in different subnet The nic for broadcast the route information. The ripd uses this nic to broadcast the route informaton to others The nic for broadcast the route information Absolute path to the zebra binary. zebra binary Absolute path to the ripd binary. 
ripd binary END exit $OCF_SUCCESS } usage() { echo $USAGE >&2 } new_config_file() { echo new_config_file $1 $2 $3 cat >$RIPDCONF < $RIPDCONF.tmp cp $RIPDCONF.tmp $RIPDCONF } add_ip() { echo add_ip $1 sed "s/ip_tag/ip_tag\naccess-list private permit $1\/32/g" $RIPDCONF > $RIPDCONF.tmp cp $RIPDCONF.tmp $RIPDCONF } del_ip() { echo del_ip $1 sed "/$1/d" $RIPDCONF > $RIPDCONF.tmp cp $RIPDCONF.tmp $RIPDCONF if $GREP "access-list private permit" $RIPDCONF>/dev/null then echo some other IP is running reload_config else stop_quagga echo remove $RIPDCONF rm $RIPDCONF fi } add_nic() { echo add_nic $1 if $GREP "network $1" $RIPDCONF >/dev/null then echo the nic is already in the config file else sed "s/nic_tag/nic_tag\n no passive-interface $1\n network $1\n distribute-list private out $1\n distribute-list private in $1/g" $RIPDCONF > $RIPDCONF.tmp cp $RIPDCONF.tmp $RIPDCONF fi } reload_config() { echo reload_config echo $RIPDCONF: cat $RIPDCONF echo killall -SIGHUP ripd killall -SIGHUP ripd } start_quagga() { echo start_quagga echo $RIPDCONF: cat $RIPDCONF echo $ZEBRA -d $ZEBRA -d echo $RIPD -d -f $RIPDCONF $RIPD -d -f $RIPDCONF } stop_quagga() { echo stop_quagga echo $RIPDCONF: cat $RIPDCONF echo killall -SIGTERM ripd killall -SIGTERM ripd echo killall -SIGTERM zebra killall -SIGTERM zebra } start_rip_ip() { echo start_rip_ip check_params if [ x"$OCF_RESKEY_nic" = x ] then echo OCF_RESKEY_nic is null, set to eth0 OCF_RESKEY_nic="eth0" fi - if $IP2UTIL addr | $GREP $OCF_RESKEY_ip >/dev/null - then - ocf_log err "Invalid OCF_RESKEY_ip [$OCF_RESKEY_ip]" - exit $OCF_ERR_ARGS - fi + status_rip_ip + case $? in + $OCF_SUCCESS) + ocf_log info "already running" + exit $OCF_SUCCESS + ;; + $OCF_NOT_RUNNING) + ;; + *) + ocf_log info "state undefined, stopping first" + stop_rip_ip + ;; + esac $IP2UTIL addr add $OCF_RESKEY_ip/32 dev lo if [ -f "$RIPDCONF" ] then # there is a config file, add new data(IP,nic,metric) # to the existing config file. 
add_ip $OCF_RESKEY_ip add_nic $OCF_RESKEY_nic set_metric 1 reload_config echo sleep 3 sleep 3 set_metric 3 reload_config else new_config_file $OCF_RESKEY_ip $OCF_RESKEY_nic 1 start_quagga echo sleep 3 sleep 3 set_metric 3 reload_config fi return $OCF_SUCCESS } stop_rip_ip() { echo stop_rip_ip check_params status_rip_ip - if [ $? = $OCF_NOT_RUNNING ] - then - ocf_log err "Invalid OCF_RESKEY_ip [$OCF_RESKEY_ip]" - exit $OCF_ERR_ARGS + if [ $? = $OCF_NOT_RUNNING ] + then + exit $OCF_SUCCESS fi $IP2UTIL addr del $OCF_RESKEY_ip dev lo echo sleep 2 sleep 2 del_ip $OCF_RESKEY_ip return $OCF_SUCCESS } status_rip_ip() { check_params if $IP2UTIL addr | $GREP $OCF_RESKEY_ip >/dev/null then if $GREP $OCF_RESKEY_ip $RIPDCONF >/dev/null then if pidof ripd >/dev/null then return $OCF_SUCCESS fi fi + return $OCF_ERR_GENERIC fi return $OCF_NOT_RUNNING } if [ $# -ne 1 ] then usage exit $OCF_ERR_ARGS fi [ x != x"$OCF_RESKEY_zebra_binary" ] && ZEBRA=$OCF_RESKEY_zebra_binary [ x != x"$OCF_RESKEY_ripd_binary" ] && RIPD=$OCF_RESKEY_ripd_binary case $1 in start) start_rip_ip;; stop) stop_rip_ip;; status) status_rip_ip;; monitor) status_rip_ip;; validate-all) check_binary $IP2UTIL exit $OCF_SUCCESS;; meta-data) meta_data;; usage) usage; exit $OCF_SUCCESS;; *) usage - exit $OCF_ERR_ARGS + exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/VirtualDomain b/heartbeat/VirtualDomain index dd8565792..ac5c13c93 100755 --- a/heartbeat/VirtualDomain +++ b/heartbeat/VirtualDomain @@ -1,556 +1,557 @@ #!/bin/sh # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # Resource Agent for domains managed by the libvirt API. # Requires a running libvirt daemon (libvirtd). 
# # (c) 2008-2010 Florian Haas, Dejan Muhamedagic, # and Linux-HA contributors # # usage: $0 {start|stop|status|monitor|migrate_to|migrate_from|meta-data|validate-all} # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_force_stop_default=0 OCF_RESKEY_hypervisor_default="$(virsh --quiet uri)" OCF_RESKEY_autoset_utilization_cpu_default="true" OCF_RESKEY_autoset_utilization_hv_memory_default="true" : ${OCF_RESKEY_force_stop=${OCF_RESKEY_force_stop_default}} : ${OCF_RESKEY_hypervisor=${OCF_RESKEY_hypervisor_default}} : ${OCF_RESKEY_autoset_utilization_cpu=${OCF_RESKEY_autoset_utilization_cpu_default}} : ${OCF_RESKEY_autoset_utilization_hv_memory=${OCF_RESKEY_autoset_utilization_hv_memory_default}} ####################################################################### ## I'd very much suggest to make this RA use bash, ## and then use magic $SECONDS. ## But for now: NOW=$(date +%s) usage() { echo "usage: $0 {start|stop|status|monitor|migrate_to|migrate_from|meta-data|validate-all}" } meta_data() { cat < 1.1 Resource agent for a virtual domain (a.k.a. domU, virtual machine, virtual environment etc., depending on context) managed by libvirtd. Manages virtual domains through the libvirt virtualization framework Absolute path to the libvirt configuration file, for this virtual domain. Virtual domain configuration file Hypervisor URI to connect to. See the libvirt documentation for details on supported URI formats. The default is system dependent. Hypervisor URI Always forcefully shut down ("destroy") the domain on stop. The default behavior is to resort to a forceful shutdown only after a graceful shutdown attempt has failed. You should only set this to true if your virtual domain (or your virtualization backend) does not support graceful shutdown. 
Always force shutdown on stop Transport used to connect to the remote hypervisor while migrating. Please refer to the libvirt documentation for details on transports available. If this parameter is omitted, the resource will use libvirt's default transport to connect to the remote hypervisor. Remote hypervisor transport Use a dedicated migration network. The migration URI is composed by adding this parameters value to the end of the node name. If the node name happens to be an FQDN (as opposed to an unqualified host name), insert the suffix immediately prior to the first period (.) in the FQDN. At the moment Qemu/KVM and Xen migration via a dedicated network is supported. Note: Be sure this composed host name is locally resolveable and the associated IP is reachable through the favored network. Migration network host name suffix To additionally monitor services within the virtual domain, add this parameter with a list of scripts to monitor. Note: when monitor scripts are used, the start and migrate_from operations will complete only when all monitor scripts have completed successfully. Be sure to set the timeout of these operations to accommodate this delay. space-separated list of monitor scripts If set true, the agent will detect the number of domainU's vCPUs from virsh, and put it into the cpu utilization of the resource when the monitor is executed. Enable auto setting the cpu utilization of the resource If set true, the agent will detect the number of *Max memory* from virsh, and put it into the hv_memory utilization of the resource when the monitor is executed. 
Enable auto setting the hv_memory utilization of the resource EOF } set_util_attr() { local attr=$1 val=$2 local cval outp cval=$(crm_resource -Q -r $OCF_RESOURCE_INSTANCE -z -g $attr 2>/dev/null) if [ "$cval" != "$val" ]; then outp=`crm_resource -r $OCF_RESOURCE_INSTANCE -z -p $attr -v $val 2>&1` || ocf_log warn "crm_resource failed to set utilization attribute $attr: $outp" fi } update_utilization() { local dom_cpu dom_mem if ocf_is_true "$OCF_RESKEY_autoset_utilization_cpu"; then dom_cpu=$(LANG=C virsh $VIRSH_OPTIONS dominfo ${DOMAIN_NAME} | awk '/CPU\(s\)/{print $2}') test -n "$dom_cpu" && set_util_attr cpu $dom_cpu fi if ocf_is_true "$OCF_RESKEY_autoset_utilization_hv_memory"; then dom_mem=$(LANG=C virsh $VIRSH_OPTIONS dominfo ${DOMAIN_NAME} | awk '/Max memory/{printf("%d", $3/1024)}') test -n "$dom_mem" && set_util_attr hv_memory "$dom_mem" fi } # Set options to be passed to virsh: VIRSH_OPTIONS="--connect=${OCF_RESKEY_hypervisor} --quiet" # A state file where we record the domain name: STATEFILE="${HA_RSCTMP}/VirtualDomain-${OCF_RESOURCE_INSTANCE}.state" VirtualDomain_Define() { local virsh_output local domain_name # Note: passing in the domain name from outside the script is # intended for testing and debugging purposes only. Don't do this # in production, instead let the script figure out the domain name # from the config file. You have been warned. if [ -z "$DOMAIN_NAME" ]; then # Spin until we have a domain name while true; do virsh_output=`virsh ${VIRSH_OPTIONS} define ${OCF_RESKEY_config}` domain_name=`echo "$virsh_output" | sed -e 's/Domain \(.*\) defined from .*$/\1/'` if [ -n "$domain_name" ]; then break; fi ocf_log debug "Domain not defined yet, probably unable to connect to hypervisor. Retrying." sleep 1 done echo "$domain_name" > $STATEFILE ocf_log info "Domain name \"$domain_name\" saved to $STATEFILE." else ocf_log warn "Domain name ${DOMAIN_NAME} already defined, overriding configuration file ${OCF_RESKEY_config}. 
You should do this for testing only." fi } VirtualDomain_Cleanup_Statefile() { rm -f $STATEFILE || ocf_log warn "Failed to remove $STATEFILE during $__OCF_ACTION." } VirtualDomain_Status() { local try=0 rc=$OCF_ERR_GENERIC status="no state" while [ "$status" = "no state" ]; do try=$(($try + 1 )) status="`virsh $VIRSH_OPTIONS domstate $DOMAIN_NAME`" case "$status" in "shut off") # shut off: domain is defined, but not started ocf_log debug "Virtual domain $DOMAIN_NAME is currently $status." rc=$OCF_NOT_RUNNING ;; - running|paused|idle|blocked) + running|paused|idle|blocked|"in shutdown") # running: domain is currently actively consuming cycles # paused: domain is paused (suspended) # idle: domain is running but idle # blocked: synonym for idle used by legacy Xen versions + # in shutdown: the domain is in process of shutting down, but has not completely shutdown or crashed. ocf_log debug "Virtual domain $DOMAIN_NAME is currently $status." rc=$OCF_SUCCESS ;; ""|"no state") # Empty string may be returned when virsh does not # receive a reply from libvirtd. # "no state" may occur when the domain is currently # being migrated (on the migration target only), or # whenever virsh can't reliably obtain the domain # state. status="no state" if [ "$__OCF_ACTION" = "stop" ] && [ $try -ge 3 ]; then # During the stop operation, we want to bail out # quickly, so as to be able to force-stop (destroy) # the domain if necessary. ocf_log error "Virtual domain $DOMAIN_NAME has no state during stop operation, bailing out." return $OCF_ERR_GENERIC; else # During all other actions, we just wait and try # again, relying on the CRM/LRM to time us out if # this takes too long. ocf_log info "Virtual domain $DOMAIN_NAME currently has no state, retrying." sleep 1 fi ;; *) # any other output is unexpected. ocf_log error "Virtual domain $DOMAIN_NAME has unknown status \"$status\"!" 
;; esac done return $rc } VirtualDomain_Start() { if VirtualDomain_Status; then ocf_log info "Virtual domain $DOMAIN_NAME already running." return $OCF_SUCCESS fi virsh $VIRSH_OPTIONS start ${DOMAIN_NAME} rc=$? if [ $rc -ne 0 ]; then ocf_log error "Failed to start virtual domain ${DOMAIN_NAME}." return $OCF_ERR_GENERIC fi while ! VirtualDomain_Monitor; do sleep 1 done return $OCF_SUCCESS } VirtualDomain_Stop() { local i local status local shutdown_timeout local out ex VirtualDomain_Status status=$? case $status in $OCF_SUCCESS) if ! ocf_is_true $OCF_RESKEY_force_stop; then # Issue a graceful shutdown request ocf_log info "Issuing graceful shutdown request for domain ${DOMAIN_NAME}." virsh $VIRSH_OPTIONS shutdown ${DOMAIN_NAME} # The "shutdown_timeout" we use here is the operation # timeout specified in the CIB, minus 5 seconds shutdown_timeout=$(( $NOW + ($OCF_RESKEY_CRM_meta_timeout/1000) -5 )) # Loop on status until we reach $shutdown_timeout while [ $NOW -lt $shutdown_timeout ]; do VirtualDomain_Status status=$? case $status in $OCF_NOT_RUNNING) # This was a graceful shutdown. Clean # up and return. VirtualDomain_Cleanup_Statefile return $OCF_SUCCESS ;; $OCF_SUCCESS) # Domain is still running, keep # waiting (until shutdown_timeout # expires) sleep 1 ;; *) # Something went wrong. Bail out and # resort to forced stop (destroy). break; esac NOW=$(date +%s) done fi ;; $OCF_NOT_RUNNING) ocf_log info "Domain $DOMAIN_NAME already stopped." return $OCF_SUCCESS esac # OK. Now if the above graceful shutdown hasn't worked, kill # off the domain with destroy. If that too does not work, # have the LRM time us out. ocf_log info "Issuing forced shutdown (destroy) request for domain ${DOMAIN_NAME}." out=$(virsh $VIRSH_OPTIONS destroy ${DOMAIN_NAME} 2>&1) ex=$? echo >&2 "$out" # unconditionally clean up. 
VirtualDomain_Cleanup_Statefile case $ex$out in *"error:"*"domain is not running"*) : ;; # unexpected path to the intended outcome, all is well [!0]*) return $OCF_ERR_GENERIC ;; 0*) while [ $status != $OCF_NOT_RUNNING ]; do VirtualDomain_Status status=$? done ;; esac return $OCF_SUCCESS } VirtualDomain_Migrate_To() { local target_node local remoteuri local transport_suffix local migrateuri local migrateport local migrate_target local hypervisor target_node="$OCF_RESKEY_CRM_meta_migrate_target" if VirtualDomain_Status; then # Find out the remote hypervisor to connect to. That is, turn # something like "qemu://foo:9999/system" into # "qemu+tcp://bar:9999/system" if [ -n "${OCF_RESKEY_migration_transport}" ]; then transport_suffix="+${OCF_RESKEY_migration_transport}" fi # A typical migration URI via a special migration network looks # like "tcp://bar-mig:49152". The port would be randomly chosen # by libvirt from the range 49152-49215 if omitted, at least since # version 0.7.4 ... if [ -n "${OCF_RESKEY_migration_network_suffix}" ]; then hypervisor="${OCF_RESKEY_hypervisor%%[+:]*}" # Hostname might be a FQDN migrate_target=$(echo ${target_node} | sed -e "s,^\([^.]\+\),\1${OCF_RESKEY_migration_network_suffix},") case $hypervisor in qemu) # For quiet ancient libvirt versions a migration port is needed # and the URI must not contain the "//". Newer versions can handle # the "bad" URI. migrateport=$(( 49152 + $(ocf_maybe_random) % 64 )) migrateuri="tcp:${migrate_target}:${migrateport}" ;; xen) migrateuri="xenmigr://${migrate_target}" ;; *) ocf_log warn "$DOMAIN_NAME: Migration via dedicated network currently not supported for ${hypervisor}." ;; esac fi # Scared of that sed expression? So am I. :-) remoteuri=$(echo ${OCF_RESKEY_hypervisor} | sed -e "s,\(.*\)://[^/:]*\(:\?[0-9]*\)/\(.*\),\1${transport_suffix}://${target_node}\2/\3,") # OK, we know where to connect to. Now do the actual migration. 
ocf_log info "$DOMAIN_NAME: Starting live migration to ${target_node} (using remote hypervisor URI ${remoteuri} ${migrateuri})." virsh ${VIRSH_OPTIONS} migrate --live $DOMAIN_NAME ${remoteuri} ${migrateuri} rc=$? if [ $rc -ne 0 ]; then ocf_log err "$DOMAIN_NAME: live migration to ${remoteuri} ${migrateuri} failed: $rc" return $OCF_ERR_GENERIC else ocf_log info "$DOMAIN_NAME: live migration to ${target_node} succeeded." VirtualDomain_Cleanup_Statefile return $OCF_SUCCESS fi else ocf_log err "$DOMAIN_NAME: migrate_to: Not active locally!" return $OCF_ERR_GENERIC fi } VirtualDomain_Migrate_From() { while ! VirtualDomain_Monitor; do sleep 1 done ocf_log info "$DOMAIN_NAME: live migration from ${OCF_RESKEY_CRM_meta_migrate_source} succeeded." return $OCF_SUCCESS } VirtualDomain_Monitor() { # First, check the domain status. If that returns anything other # than $OCF_SUCCESS, something is definitely wrong. VirtualDomain_Status rc=$? if [ ${rc} -eq ${OCF_SUCCESS} ]; then # OK, the generic status check turned out fine. Now, if we # have monitor scripts defined, run them one after another. for script in ${OCF_RESKEY_monitor_scripts}; do script_output="$($script 2>&1)" script_rc=$? if [ ${script_rc} -ne ${OCF_SUCCESS} ]; then # A monitor script returned a non-success exit # code. Stop iterating over the list of scripts, log a # warning message, and propagate $OCF_ERR_GENERIC. ocf_log warn "Monitor command \"${script}\" for domain ${DOMAIN_NAME} returned ${script_rc} with output: ${script_output}" rc=$OCF_ERR_GENERIC break else ocf_log debug "Monitor command \"${script}\" for domain ${DOMAIN_NAME} completed successfully with output: ${script_output}" fi done fi update_utilization return ${rc} } VirtualDomain_Validate_All() { # Required binaries: for binary in virsh sed; do check_binary $binary done if [ -z $OCF_RESKEY_config ]; then ocf_log error "Missing configuration parameter \"config\"." 
return $OCF_ERR_CONFIGURED fi # check if we can read the config file (otherwise we're unable to # deduce $DOMAIN_NAME from it, see below) if [ ! -r $OCF_RESKEY_config ]; then if ocf_is_probe; then ocf_log info "Configuration file $OCF_RESKEY_config not readable during probe." elif [ "$__OCF_ACTION" = "stop" ]; then ocf_log info "Configuration file $OCF_RESKEY_config not readable, resource considered stopped." else ocf_log error "Configuration file $OCF_RESKEY_config does not exist or is not readable." return $OCF_ERR_INSTALLED fi fi } if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; esac # Everything except usage and meta-data must pass the validate test VirtualDomain_Validate_All || exit $? # During a probe, it is permissible for the config file to not be # readable (it might be on shared storage not available during the # probe). In that case, VirtualDomain_Define can't work and we're # unable to get the domain name. Thus, we also can't check whether the # domain is running. The only thing we can do here is to assume that # it is not running. if [ ! -r $OCF_RESKEY_config ]; then ocf_is_probe && exit $OCF_NOT_RUNNING [ "$__OCF_ACTION" = "stop" ] && exit $OCF_SUCCESS fi # Define the domain on startup, and re-define whenever someone deleted # the state file, or touched the config. if [ ! -e $STATEFILE ] || [ $OCF_RESKEY_config -nt $STATEFILE ]; then VirtualDomain_Define fi # By now, we should definitely be able to read from the state file. # If not, something went wrong. if [ ! -r $STATEFILE ]; then ocf_log err "$STATEFILE not found or unreadable. This is unexpected. Cannot determine domain name." exit $OCF_ERR_GENERIC fi # Finally, retrieve the domain name from the state file. DOMAIN_NAME=`cat $STATEFILE 2>/dev/null` if [ -z $DOMAIN_NAME ]; then ocf_log err "$STATEFILE is empty. This is unexpected. Cannot determine domain name." 
exit $OCF_ERR_GENERIC fi case $1 in start) VirtualDomain_Start ;; stop) VirtualDomain_Stop ;; migrate_to) VirtualDomain_Migrate_To ;; migrate_from) VirtualDomain_Migrate_From ;; status) VirtualDomain_Status ;; monitor) VirtualDomain_Monitor ;; validate-all) ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/Xen b/heartbeat/Xen index 17cb27b67..986bfc47c 100755 --- a/heartbeat/Xen +++ b/heartbeat/Xen @@ -1,499 +1,499 @@ #!/bin/sh # # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # Resource Agent for the Xen Hypervisor. # Manages Xen virtual machine instances by # mapping cluster resource start and stop, # to Xen create and shutdown, respectively. # # usage: $0 {start|stop|status|monitor|meta-data} # # OCF parameters are as below: # OCF_RESKEY_xmfile # Absolute path to the Xen control file, # for this virtual machine. # OCF_RESKEY_allow_mem_management # Change memory usage on start/stop/migration # of virtual machine # OCF_RESKEY_reserved_Dom0_memory # minimum memory reserved for domain 0 # OCF_RESKEY_monitor_scripts # scripts to monitor services within the # virtual domain ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### usage() { cat <<-! usage: $0 {start|stop|status|monitor|meta-data|validate-all} ! } : ${OCF_RESKEY_xmfile=/etc/xen/vm/MyDomU} : ${OCF_RESKEY_shutdown_acpi=0} : ${OCF_RESKEY_allow_mem_management=0} : ${OCF_RESKEY_reserved_Dom0_memory=512} meta_data() { cat < 1.0 Resource Agent for the Xen Hypervisor. Manages Xen virtual machine instances by mapping cluster resource start and stop, to Xen create and shutdown, respectively. A note on names We will try to extract the name from the config file (the xmfile attribute). If you use a simple assignment statement, then you should be fine. 
Otherwise, if there's some Python acrobatics involved such as dynamically assigning names depending on other variables, and we will try to detect this, then please set the name attribute. You should also do that if there is any chance of a pathological situation where a config file might be missing, for example if it resides on shared storage. If all else fails, we finally fall back to the instance id to preserve backward compatibility. Para-virtualized guests can also be migrated by enabling the meta_attribute allow-migrate. Manages Xen unprivileged domains (DomUs) Absolute path to the Xen control file, for this virtual machine. Xen control file Name of the virtual machine. Xen DomU name The Xen agent will first try an orderly shutdown using xm shutdown. Should this not succeed within this timeout, the agent will escalate to xm destroy, forcibly killing the domain. If this is not set, it will default to two-thirds of the stop action timeout. Setting this value to 0 forces an immediate destroy. Shutdown escalation timeout Handle shutdown by simulating an ACPI power button event. Enable this to allow graceful shutdown for HVM domains without installed PV drivers. Simulate power button event on shutdown This parameter enables dynamic adjustment of memory for start and stop actions used for Dom0 and the DomUs. The default is to not adjust memory dynamically. Use dynamic memory management In case of a live migration, the system will default to using the IP address associated with the hostname via DNS or /etc/hosts. This parameter allows you to specify a node attribute that will be queried instead for the target node, overriding the IP address. This allows you to use a dedicated network for live migration traffic to a specific node. Warning: make very sure the IP address does point to the right node. Or else the live migration will end up somewhere else, greatly confusing the cluster and causing havoc.
Node attribute containing target IP address In case memory management is used, this parameter defines the minimum amount of memory to be reserved for the dom0. The default minimum memory is 512MB. Minimum Dom0 memory To additionally monitor services within the unprivileged domain, add this parameter with a list of scripts to monitor. list of space separated monitor scripts END } Xen_Status() { if have_binary xen-list; then xen-list $1 2>/dev/null | grep -qs "State.*[-r][-b][-p]--" 2>/dev/null if [ $? -ne 0 ]; then return $OCF_NOT_RUNNING else return $OCF_SUCCESS fi fi STATUS=`xm list --long $1 2>/dev/null | grep status 2>/dev/null` if [ "X${STATUS}" != "X" ]; then # we have Xen 3.0.4 or higher STATUS_NOSPACES=`echo "$STATUS" | awk '{ print $1,$2}'` if [ "$STATUS_NOSPACES" = "(status 2)" -o "$STATUS_NOSPACES" = "(status 1)" ]; then return $OCF_SUCCESS else return $OCF_NOT_RUNNING fi else # we have Xen 3.0.3 or lower STATUS=`xm list --long $1 2>/dev/null | grep state 2>/dev/null` echo "${STATUS}" | grep -qs "[-r][-b][-p]---" if [ $? 
-ne 0 ]; then return $OCF_NOT_RUNNING else return $OCF_SUCCESS fi fi } Xen_Adjust_Memory() { if ocf_is_true "${OCF_RESKEY_allow_mem_management}"; then CNTNEW=$1 RUNNING=`Xen_List_running` RUNCNT=`Xen_Count_running` MAXMEM=`Xen_Total_Memory` if [ ${RUNCNT} -eq 0 -a ${CNTNEW} -eq 0 ]; then RUNCNT=1 fi #NEWMEM=`echo "(${MAXMEM}-${OCF_RESKEY_reserved_Dom0_memory})/(${RUNCNT}+${CNTNEW})"|bc` NEWMEM=$(( (${MAXMEM} - ${OCF_RESKEY_reserved_Dom0_memory}) / (${RUNCNT} + ${CNTNEW} ) )) # do not rely on ballooning add dom0_mem=512 instead to force memory for dom0 #xm mem-set Domain-0 ${OCF_RESKEY_reserved_Dom0_memory} for DOM in ${RUNNING}; do xm mem-set ${DOM} ${NEWMEM} done ocf_log info "Adjusted memory to: $NEWMEM, for the following $RUNCNT domains: $RUNNING" fi } Xen_List_all() { xm list | grep -v -e "Name" -e "Domain-0" | awk '{print $1}' } Xen_List_running() { ALL_DOMS=`Xen_List_all` for DOM in ${ALL_DOMS}; do if Xen_Status $DOM; then echo "${DOM} " fi done } Xen_Count_running() { Xen_List_running | wc -w } Xen_Monitor() { Xen_Status ${DOMAIN_NAME} if [ $? -eq ${OCF_NOT_RUNNING} ]; then return ${OCF_NOT_RUNNING} fi if [ "X${OCF_RESKEY_monitor_scripts}" = "X" ]; then return ${OCF_SUCCESS} fi for SCRIPT in ${OCF_RESKEY_monitor_scripts}; do $SCRIPT if [ $? -ne 0 ]; then return ${OCF_ERR_GENERIC} fi done return ${OCF_SUCCESS} } Xen_Total_Memory() { xm info | grep "^total_memory" | awk '{print $3}' } Xen_Start() { if Xen_Status ${DOMAIN_NAME}; then ocf_log info "Xen domain $DOMAIN_NAME already running." return $OCF_SUCCESS fi if [ ! -f "${OCF_RESKEY_xmfile}" ]; then ocf_log err "Config file ${OCF_RESKEY_xmfile} for $DOMAIN_NAME does not exist." return $OCF_ERR_INSTALLED fi if ocf_is_true "${OCF_RESKEY_allow_mem_management}"; then Xen_Adjust_Memory 1 ocf_log info "New memory for virtual domains: ${NEWMEM}" sed -i -e "/^memory=/ s/^memory=.*/memory=${NEWMEM}/" ${OCF_RESKEY_xmfile} xm mem-set ${DOMAIN_NAME} ${NEWMEM} fi xm create ${OCF_RESKEY_xmfile} name=$DOMAIN_NAME rc=$? 
if [ $rc -ne 0 ]; then return $OCF_ERR_GENERIC else if ocf_is_true "${OCF_RESKEY_allow_mem_management}"; then xm mem-set ${DOMAIN_NAME} ${NEWMEM} fi fi while sleep 1; do Xen_Monitor && return $OCF_SUCCESS done } xen_domain_stop() { local dom=$1 local timeout if [ -n "$OCF_RESKEY_shutdown_timeout" ]; then timeout=$OCF_RESKEY_shutdown_timeout elif [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then # Allow 2/3 of the action timeout for the orderly shutdown # (The origin unit is ms, hence the conversion) timeout=$((OCF_RESKEY_CRM_meta_timeout/1500)) else timeout=60 fi if [ "$timeout" -gt 0 ]; then ocf_log info "Xen domain $dom will be stopped (timeout: ${timeout}s)" if ocf_is_true "${OCF_RESKEY_shutdown_acpi}"; then xm trigger $dom power else xm shutdown $dom fi while Xen_Status $dom && [ "$timeout" -gt 0 ]; do ocf_log debug "$dom still not stopped. Waiting..." timeout=$((timeout-1)) sleep 1 done fi if [ "$timeout" -eq 0 ]; then while Xen_Status $dom; do ocf_log warn "Xen domain $dom will be destroyed!" $xenkill $dom sleep 1 done # Note: This does not give up. stop isn't allowed to to fail. # If xm destroy fails, stop will eventually timeout. # This is the correct behaviour. fi ocf_log info "Xen domain $dom stopped." } Xen_Stop() { local vm if Xen_Status ${DOMAIN_NAME}; then vm=${DOMAIN_NAME} elif Xen_Status migrating-${DOMAIN_NAME}; then ocf_log info "Xen domain $DOMAIN_NAME is migrating" vm="migrating-${DOMAIN_NAME}" else ocf_log info "Xen domain $DOMAIN_NAME already stopped." fi if [ "$vm" ]; then xen_domain_stop $vm else # It is supposed to be gone, but there have been situations where xm # list / xen-list showed it as stopped but it was still instantiated. 
# Nuke it once more to make sure: $xenkill ${DOMAIN_NAME} fi Xen_Adjust_Memory 0 return $OCF_SUCCESS } Xen_Migrate_To() { target_node="$OCF_RESKEY_CRM_meta_migrate_target" - target_attr="$OCF_RESKEY_CRM_node_ip_attribute" + target_attr="$OCF_RESKEY_node_ip_attribute" target_addr="$target_node" if Xen_Status ${DOMAIN_NAME}; then ocf_log info "$DOMAIN_NAME: Starting xm migrate to $target_node" if [ -n "$target_attr" ]; then nodevalue=`crm_attribute --type nodes --node-uname $target_node --attr-name $target_attr --get-value -q` if [ -n "${nodevalue}" -a "${nodevalue}" != "(null)" ]; then target_addr="$nodevalue" ocf_log info "$DOMAIN_NAME: $target_node is using address $target_addr" fi fi xm migrate --live $DOMAIN_NAME $target_addr rc=$? if [ $rc -ne 0 ]; then ocf_log err "$DOMAIN_NAME: xm migrate to $target_node failed: $rc" return $OCF_ERR_GENERIC else Xen_Adjust_Memory 0 ocf_log info "$DOMAIN_NAME: xm migrate to $target_node succeeded." return $OCF_SUCCESS fi else ocf_log err "$DOMAIN_NAME: migrate_to: Not active locally!" return $OCF_ERR_GENERIC fi } Xen_Migrate_From() { if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then # Allow 2/3 of the action timeout for status to stabilize # (The origin unit is ms, hence the conversion) timeout=$((OCF_RESKEY_CRM_meta_timeout/1500)) else timeout=10 # should be plenty fi while ! Xen_Status ${DOMAIN_NAME} && [ $timeout -gt 0 ]; do ocf_log debug "$DOMAIN_NAME: Not yet active locally, waiting (timeout: ${timeout}s)" timeout=$((timeout-1)) sleep 1 done if Xen_Status ${DOMAIN_NAME}; then Xen_Adjust_Memory 0 ocf_log info "$DOMAIN_NAME: Active locally, migration successful" return $OCF_SUCCESS else ocf_log err "$DOMAIN_NAME: Not active locally, migration failed!" return $OCF_ERR_GENERIC fi } Xen_Validate_All() { return $OCF_SUCCESS } if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; esac # the name business: # # 1. use the name attribute, or # 2. 
find the name in the config file (if it exists) and use that # unless it contains funny characters such as '%' or space, or # 3. use the OCF_RESOURCE_INSTANCE if [ x"${OCF_RESKEY_name}" != x ]; then DOMAIN_NAME="${OCF_RESKEY_name}" else if [ -f "${OCF_RESKEY_xmfile}" ]; then DOMAIN_NAME=`awk '$1~/^name(=|$)/{print}' ${OCF_RESKEY_xmfile} | sed 's/.*=[[:space:]]*//' | tr -d "[\"']"` if echo "$DOMAIN_NAME" | grep -qs '[%[:space:]]'; then DOMAIN_NAME="" fi fi DOMAIN_NAME=${DOMAIN_NAME:-${OCF_RESOURCE_INSTANCE}} fi for binary in xm sed awk; do check_binary $binary done if have_binary xen-destroy ; then xenkill="xen-destroy" else xenkill="xm destroy" fi if [ -n "$OCF_RESKEY_shutdown_timeout" ]; then ocf_is_decimal "$OCF_RESKEY_shutdown_timeout" || { ocf_log err "shutdown_timeout must be a number" exit $OCF_ERR_CONFIGURED } fi case $1 in start) Xen_Start ;; stop) Xen_Stop ;; migrate_to) Xen_Migrate_To ;; migrate_from) Xen_Migrate_From ;; monitor) Xen_Monitor ;; status) Xen_Status ${DOMAIN_NAME} ;; validate-all) Xen_Validate_All ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/apache b/heartbeat/apache index 4c59e4747..d70276548 100755 --- a/heartbeat/apache +++ b/heartbeat/apache @@ -1,610 +1,547 @@ #!/bin/sh # # High-Availability Apache/IBMhttp control script # # apache (aka IBMhttpd) # # Description: starts/stops apache web servers. # # Author: Alan Robertson # Sun Jiang Dong # # Support: linux-ha@lists.linux-ha.org # # License: GNU General Public License (GPL) # # Copyright: (C) 2002-2005 International Business Machines # # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 apache::/opt/IBMHTTPServer/conf/httpd.conf # node1 10.0.0.170 IBMhttpd # # Our parsing of the Apache config files is very rudimentary. # It'll work with lots of different configurations - but not every # possible configuration. 
# # Patches are being accepted ;-) # # OCF parameters: # OCF_RESKEY_configfile # OCF_RESKEY_httpd # OCF_RESKEY_port # OCF_RESKEY_statusurl # OCF_RESKEY_options # OCF_RESKEY_testregex # OCF_RESKEY_client # OCF_RESKEY_testurl # OCF_RESKEY_testregex10 # OCF_RESKEY_testconffile # OCF_RESKEY_testname # OCF_RESKEY_envfiles : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/apache-conf.sh . ${OCF_FUNCTIONS_DIR}/http-mon.sh HA_VARRUNDIR=${HA_VARRUN} ####################################################################### # # Configuration options - usually you don't need to change these # ####################################################################### # IBMHTTPD=/opt/IBMHTTPServer/bin/httpd HTTPDLIST="/sbin/httpd2 /usr/sbin/httpd2 /usr/sbin/apache2 /sbin/httpd /usr/sbin/httpd /usr/sbin/apache $IBMHTTPD" MPM=/usr/share/apache2/find_mpm if [ -x $MPM ] then HTTPDLIST="$HTTPDLIST `$MPM 2>/dev/null`" fi LOCALHOST="http://localhost" HTTPDOPTS="-DSTATUS" DEFAULT_IBMCONFIG=/opt/IBMHTTPServer/conf/httpd.conf DEFAULT_NORMCONFIG="/etc/apache2/httpd.conf" # # You can also set # HTTPD # PORT # STATUSURL # CONFIGFILE # in this section if what we're doing doesn't work for you... # # End of Configuration options ####################################################################### CMD=`basename $0` # The config-file-pathname is the pathname to the configuration # file for this web server. Various appropriate defaults are # assumed if no config file is specified. If this command is # invoked as *IBM*, then the default config file name is # $DEFAULT_IBMCONFIG, otherwise the default config file # will be $DEFAULT_NORMCONFIG. usage() { cat <<-! usage: $0 action action: start start the web server stop stop the web server status return the status of web server, run or down monitor return TRUE if the web server appears to be working. For this to be supported you must configure mod_status and give it a server-status URL. 
You have to have installed either curl or wget for this to work. meta-data show meta data message validate-all validate the instance parameters ! - exit $1 } # # return TRUE if a process with given PID is running # ProcessRunning() { ApachePID=$1 # Use /proc if it looks like it's here... if [ -d /proc -a -d /proc/1 ] then [ -d /proc/$ApachePID ] else # This assumes we're running as root... kill -s 0 "$ApachePID" >/dev/null 2>&1 fi } - silent_status() { if [ -f $PidFile ] then ProcessRunning `cat $PidFile` else : No pid file false fi } # May be useful to add other distros in future validate_default_config() { if [ -e /etc/SuSE-release ]; then validate_default_suse_config else return 0 fi } # When using the default /etc/apache2/httpd.conf on SUSE, the file # /etc/apache2/sysconfig.d/include.conf is required to be present, # but this is only generated if you run the apache init script # (with contents derived from /etc/sysconfig/apache2). So, here, # if we're using the default system config file and it requires # that include, we run "/etc/init.d/apache2 configtest" to ensure # the relevant config is generated and valid. We're also taking # this opportunity to enable mod_status if it's not present. 
validate_default_suse_config() { if [ "$CONFIGFILE" = "$DEFAULT_NORMCONFIG" ] && \ - grep -Eq '^Include\s+/etc/apache2/sysconfig.d/include.conf' "$CONFIGFILE" + grep -Eq '^Include[[:space:]]+/etc/apache2/sysconfig.d/include.conf' "$CONFIGFILE" then [ -x "/usr/sbin/a2enmod" ] && ocf_run -q /usr/sbin/a2enmod status ocf_run -q /etc/init.d/apache2 configtest return else return 0 fi } -start_apache() { +apache_start() { if silent_status then ocf_log info "$CMD already running (pid $ApachePID)" return $OCF_SUCCESS fi validate_default_config || return $OCF_ERR_CONFIGURED # https://bugs.launchpad.net/ubuntu/+source/apache2/+bug/603211 [ -d /var/run/apache2 ] || mkdir /var/run/apache2 ocf_run $HTTPD $HTTPDOPTS $OPTIONS -f $CONFIGFILE tries=0 while : # wait until the user set timeout do - monitor_apache + apache_monitor ec=$? if [ $ec -eq $OCF_NOT_RUNNING ] then tries=`expr $tries + 1` ocf_log info "waiting for apache $CONFIGFILE to come up" sleep 1 else break fi done if [ $ec -ne 0 ] && silent_status; then - stop_apache + apache_stop fi return $ec } -stop_apache() { +apache_stop() { if silent_status then if kill $ApachePID then tries=0 while ProcessRunning $ApachePID && [ $tries -lt 10 ] do sleep 1 kill $ApachePID >/dev/null ocf_log info "Killing apache PID $ApachePID" tries=`expr $tries + 1` done else ocf_log warn "Killing apache PID $ApachePID FAILED." fi if ProcessRunning $ApachePID then ocf_log info "$CMD still running ($ApachePID)." false else ocf_log info "$CMD stopped." fi else ocf_log info "$CMD is not running." fi for sig in SIGTERM SIGHUP SIGKILL ; do if pgrep -f $HTTPD.*$CONFIGFILE >/dev/null ; then pkill -$sig -f $HTTPD.*$CONFIGFILE >/dev/null ocf_log info "apache children were signalled ($sig)" sleep 1 else break fi done } -status_apache() { - silent_status - rc=$? - if - [ $rc -eq 0 ] - then - ocf_log info "$CMD is running (pid $ApachePID)." - return $OCF_SUCCESS - else - ocf_log info "$CMD is stopped." 
- return $OCF_NOT_RUNNING - fi -} - -monitor_apache_extended() { +apache_monitor_10() { if [ "$TESTCONFFILE" ]; then readtestconf < $TESTCONFFILE else test_url="$TESTURL" test_regex="$TESTREGEX10" fi whattorun=`gethttpclient` fixtesturl is_testconf_sane || return $OCF_ERR_CONFIGURED - $whattorun "$test_url" | grep -Ei "$test_regex" > /dev/null + if $whattorun "$test_url" | grep -Ei "$test_regex" > /dev/null + then + return $OCF_SUCCESS + else + return $OCF_ERR_GENERIC + fi } -monitor_apache_basic() { - if [ -z "$STATUSURL" ]; then - ocf_log err "statusurl parameter empty" - return $OCF_ERR_CONFIGURED - elif [ -z "$ourhttpclient" ]; then - ocf_log err "could not find a http client; make sure that either wget or curl is available" - return $OCF_ERR_CONFIGURED +apache_monitor_basic() { + if ${ourhttpclient}_func "$STATUSURL" | grep -Ei "$TESTREGEX" > /dev/null + then + return $OCF_SUCCESS + else + return $OCF_ERR_GENERIC fi - ${ourhttpclient}_func "$STATUSURL" | grep -Ei "$TESTREGEX" > /dev/null } -monitor_apache() { +apache_monitor() { silent_status if [ $? -ne 0 ]; then ocf_log info "$CMD not running" return $OCF_NOT_RUNNING fi ourhttpclient=`findhttpclient` # we'll need one - monitor_apache_basic - rc=$? - [ $rc -ne 0 ] && return $rc - case "$OCF_CHECK_LEVEL" in - ""|0) true;; - 10) monitor_apache_extended;; - *) - ocf_log err "bad OCF_CHECK_LEVEL: $OCF_CHECK_LEVEL" - return $OCF_ERR_CONFIGURED - ;; + if [ -z "$ourhttpclient" ]; then + ocf_log err "could not find a http client; make sure that either wget or curl is available" + return $OCF_ERR_INSTALLED + fi + case `ocf_check_level 10` in + 0) apache_monitor_basic;; + 10) apache_monitor_10;; esac } -metadata_apache(){ +apache_meta_data(){ cat < 1.0 This is the resource agent for the Apache web server. This resource agent operates both version 1.x and version 2.x Apache servers. The start operation ends with a loop in which monitor is repeatedly called to make sure that the server started and that it is operational. 
Hence, if the monitor operation does not succeed within the start operation timeout, the apache resource will end with an error status. The monitor operation by default loads the server status page, which depends on the mod_status module and the corresponding configuration file (usually /etc/apache2/mod_status.conf). Make sure that the server status page works and that access is allowed *only* from localhost (address 127.0.0.1). See the statusurl and testregex attributes for more details. See also http://httpd.apache.org/ Manages an Apache web server instance The full pathname of the Apache configuration file. This file is parsed to provide defaults for various other resource agent parameters. configuration file path The full pathname of the httpd binary (optional). httpd binary path A port number that we can probe for status information using the statusurl. This will default to the port number found in the configuration file, or 80, if none can be found in the configuration file. httpd port The URL to monitor (the apache server status page by default). If left unspecified, it will be inferred from the apache configuration file. If you set this, make sure that it succeeds *only* from the localhost (127.0.0.1). Otherwise, it may happen that the cluster complains about the resource being active on multiple nodes. url name Regular expression to match in the output of statusurl. Case insensitive. monitor regular expression Client to use to query Apache. If not specified, the RA will try to find one on the system. Currently, wget and curl are supported. For example, you can set this parameter to "curl" if you prefer that to wget. http client URL to test. If it does not start with "http", then it's considered to be relative to the Listen address. test url Regular expression to match in the output of testurl. Case insensitive. extended monitor regular expression A file which contains test configuration.
Could be useful if you have to check more than one web application or in case sensitive info should be passed as arguments (passwords). Furthermore, using a config file is the only way to specify certain parameters. Please see README.webapps for examples and file description. test configuration file Name of the test within the test configuration file. test name Extra options to apply when starting apache. See man httpd(8). command line options Files (one or more) which contain extra environment variables. If you want to prevent script from reading the default file, set this parameter to empty string. environment settings files We will try to detect if the URL (for monitor) is IPv6, but if that doesn't work set this to true to enforce IPv6. use ipv6 with http clients END - - exit $OCF_SUCCESS + return $OCF_SUCCESS } -validate_all_apache() { - +apache_validate_all() { if CheckPort $PORT; then # We are sure to succeed here, since we forced $PORT to be valid in GetParams() : OK else ocf_log err "Port number $PORT is invalid!" - exit $OCF_ERR_ARGS + return $OCF_ERR_INSTALLED fi - if [ -z $STATUSURL ]; then - : OK to be empty - else - case $STATUSURL in - http://*) ;; - *) - ocf_log err "Invalid STATUSURL $STATUSURL" - exit $OCF_ERR_ARGS ;; - esac - fi + case $STATUSURL in + http://*) ;; + *) + ocf_log err "Invalid STATUSURL $STATUSURL" + return $OCF_ERR_CONFIGURED ;; + esac + if [ ! -x $HTTPD ]; then ocf_log err "HTTPD $HTTPD not found or is not an executable!" - exit $OCF_ERR_ARGS + return $OCF_ERR_INSTALLED fi if [ ! -f $CONFIGFILE ]; then # We are sure to succeed here, since we have parsed $CONFIGFILE before getting here ocf_log err "Configuration file $CONFIGFILE not found!" 
- exit $OCF_ERR_CONFIGURED + return $OCF_ERR_INSTALLED fi - return $OCF_SUCCESS } -if - [ $# -eq 1 ] -then - COMMAND=$1 +find_httpd_prog() { + case $0 in + *IBM*) HTTPD=$IBMHTTPD + DefaultConfig=$DEFAULT_IBMCONFIG;; + *) + HTTPD= + for h in $HTTPDLIST + do + if + [ -f $h -a -x $h ] + then + HTTPD=$h + break + fi + done + # Let the user know that the $HTTPD used is not the one (s)he specified via $OCF_RESKEY_httpd + if + [ "X$OCF_RESKEY_httpd" != X -a "X$HTTPD" != X ] + then + ocf_log info "Using $HTTPD as HTTPD" + fi + DefaultConfig=$DEFAULT_NORMCONFIG;; + esac +} + +apache_getconfig() { + # these variables are global HTTPD="$OCF_RESKEY_httpd" PORT="$OCF_RESKEY_port" STATUSURL="$OCF_RESKEY_statusurl" CONFIGFILE="$OCF_RESKEY_configfile" OPTIONS="$OCF_RESKEY_options" CLIENT=${OCF_RESKEY_client} TESTREGEX=${OCF_RESKEY_testregex:-''} TESTURL="$OCF_RESKEY_testurl" TESTREGEX10=${OCF_RESKEY_testregex10} TESTCONFFILE="$OCF_RESKEY_testconffile" TESTNAME="$OCF_RESKEY_testname" : ${OCF_RESKEY_envfiles="/etc/apache2/envvars"} source_envfiles $OCF_RESKEY_envfiles -else - usage $OCF_ERR_ARGS -fi -LSB_STATUS_STOPPED=3 -if - [ "X$HTTPD" = X -o ! -f "$HTTPD" -o ! -x "$HTTPD" ] -then - case $0 in - *IBM*) HTTPD=$IBMHTTPD - DefaultConfig=$DEFAULT_IBMCONFIG;; - *) - HTTPD= - for h in $HTTPDLIST - do - if - [ -f $h -a -x $h ] - then - HTTPD=$h - break - fi - done -# It is possible that we still do not have a valid httpd at this stage - if - [ -z "$HTTPD" ] - then - case $COMMAND in - stop) exit $OCF_SUCCESS;; - monitor) exit $OCF_NOT_RUNNING;; - status) exit $LSB_STATUS_STOPPED;; - meta-data) metadata_apache;; - esac - ocf_log err "No valid httpd found! 
Please revise your item" - exit $OCF_ERR_INSTALLED - fi -# Let the user know that the $HTTPD used is not the one (s)he specified via $OCF_RESKEY_httpd - if - [ "X$OCF_RESKEY_httpd" != X ] - then - ocf_log info "Using $HTTPD as HTTPD" - fi - DefaultConfig=$DEFAULT_NORMCONFIG;; - esac -fi -httpd_basename=`basename $HTTPD` -case $httpd_basename in - *-*) httpd_basename=`echo "$httpd_basename" | sed -e 's%\-.*%%'`;; -esac - -case "$CONFIGFILE" in - "") CONFIGFILE=$DefaultConfig;; - *) ;; -esac - -if - [ ! -f "$CONFIGFILE" ] -then - case $COMMAND in - stop) ocf_log warn "$CONFIGFILE not found - apache considered stopped" - exit $OCF_SUCCESS;; - monitor) exit $OCF_NOT_RUNNING;; - status) exit $LSB_STATUS_STOPPED;; + if + [ "X$HTTPD" = X -o ! -f "$HTTPD" -o ! -x "$HTTPD" ] + then + find_httpd_prog + fi + CONFIGFILE=${CONFIGFILE:-$DefaultConfig} + httpd_basename=`basename $HTTPD` + case $httpd_basename in + *-*) httpd_basename=`echo "$httpd_basename" | sed -e 's%\-.*%%'`;; esac -fi + GetParams $CONFIGFILE +} -if - [ "X$COMMAND" = Xmeta-data ] || GetParams $CONFIGFILE -then - : OK -else - ocf_log err "Cannot parse config file [$CONFIGFILE]" - exit $OCF_ERR_INSTALLED -fi +OCF_REQUIRED_PARAMS="" +OCF_REQUIRED_BINARIES="" +ocf_rarun $* -case $COMMAND in - start) start_apache;; - stop) stop_apache;; - status) status_apache;; - monitor) monitor_apache;; - meta-data) metadata_apache;; - validate-all) validate_all_apache;; - *) usage $OCF_ERR_UNIMPLEMENTED;; -esac +# vim:sw=2:ts=8: diff --git a/heartbeat/apache-conf.sh b/heartbeat/apache-conf.sh index c2e24fc6b..12723cb25 100644 --- a/heartbeat/apache-conf.sh +++ b/heartbeat/apache-conf.sh @@ -1,181 +1,185 @@ # # Common apache code # (sourced by apache) # # Author: Alan Robertson # Sun Jiang Dong # # Support: linux-ha@lists.linux-ha.org # # License: GNU General Public License (GPL) # # Copyright: (C) 2002-2005 International Business Machines # source_envfiles() { for f; do [ -f "$f" -a -r "$f" ] && . 
"$f" done } apachecat() { awk ' function procline() { split($0,a); if( a[1]~/^[Ii]nclude$/ ) { procinclude(a[2]); } else { if( a[1]=="ServerRoot" ) { rootdir=a[2]; gsub("\"","",rootdir); } print; } } function printfile(infile, a) { while( (getline 0 ) { procline(); } close(infile); } function allfiles(dir, cmd,f) { cmd="find -L "dir" -type f"; while( ( cmd | getline f ) > 0 ) { printfile(f); } close(cmd); } function listfiles(pattern, cmd,f) { cmd="ls "pattern" 2>/dev/null"; while( ( cmd | getline f ) > 0 ) { printfile(f); } close(cmd); } function procinclude(spec) { if( rootdir!="" && spec!~/^\// ) { spec=rootdir"/"spec; } if( isdir(spec) ) { allfiles(spec); # read all files in a directory (and subdirs) } else { listfiles(spec); # there could be jokers } } function isdir(s) { return !system("test -d \""s"\""); } { procline(); } ' $1 | sed 's/#.*//;s/[[:blank:]]*$//;s/^[[:blank:]]*//' | grep -v '^$' } # # set parameters (as shell vars) from our apache config file # get_apache_params() { configfile=$1 shift 1 vars=`echo $@ | sed 's/ /,/g'` eval ` apachecat $configfile | awk -v vars="$vars" ' BEGIN{ split(vars,v,","); for( i in v ) vl[i]=tolower(v[i]); } { for( i in v ) if( tolower($1)==vl[i] ) { print v[i]"="$2 delete vl[i] break } } '` } # # Return the location(s) that are handled by the given handler # FindLocationForHandler() { PerlScript='while (<>) { /"]+)/i && ($loc=$1); '"/SetHandler +$2"'/i && print "$loc\n"; }' apachecat $1 | perl -e "$PerlScript" } # # Check if the port is valid # CheckPort() { ocf_is_decimal "$1" && [ $1 -gt 0 ] } buildlocalurl() { [ "x$Listen" != "x" ] && echo "http://${Listen}" || echo "${LOCALHOST}:${PORT}" } # the test url may need a local prefix (as specified in the # apache Listen directive) fixtesturl() { echo $test_url | grep -qs "^http" && return test_url="`buildlocalurl`$test_url" } # # Get all the parameters we need from the Apache config file # GetParams() { ConfigFile=$1 if [ ! 
-f $ConfigFile ]; then - return 1 + return $OCF_ERR_INSTALLED fi get_apache_params $ConfigFile ServerRoot PidFile Port Listen case $PidFile in /*) ;; [[:alnum:]]*) PidFile=$ServerRoot/$PidFile;; *) PidFile=$HA_VARRUNDIR/${httpd_basename}.pid;; esac for p in "$PORT" "$Port" 80; do if CheckPort "$p"; then PORT="$p" break fi done echo $Listen | grep ':' >/dev/null || # Listen could be just port spec Listen="localhost:$Listen" # # It's difficult to figure out whether the server supports # the status operation. # (we start our server with -DSTATUS - just in case :-)) # # Typically (but not necessarily) the status URL is /server-status # # For us to think status will work, we have to have the following things: # # - The server-status handler has to be mapped to some URL somewhere # # We assume that: # # - the "main" web server at $PORT will also support it if we can find it # somewhere in the file # - it will be supported at the same URL as the one we find in the file # # If this doesn't work for you, then set the statusurl attribute. # if [ "X$STATUSURL" = "X" ] then StatusURL=`FindLocationForHandler $1 server-status | tail -1` STATUSURL="`buildlocalurl`$StatusURL" fi - test "$PidFile" + if ! test "$PidFile"; then + return $OCF_ERR_INSTALLED + else + return $OCF_SUCCESS + fi } diff --git a/heartbeat/conntrackd b/heartbeat/conntrackd index 7502f5af1..32eab6bb8 100755 --- a/heartbeat/conntrackd +++ b/heartbeat/conntrackd @@ -1,327 +1,335 @@ #!/bin/bash # # # An OCF RA for conntrackd # http://conntrack-tools.netfilter.org/ # # Copyright (c) 2011 Dominik Klein # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### OCF_RESKEY_binary_default=conntrackd OCF_RESKEY_config_default=/etc/conntrackd/conntrackd.conf -: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} + +# For users of versions prior to 1.2: +# Map renamed parameter "conntrackd" to "binary" if in use +: ${OCF_RESKEY_binary=${OCF_RESKEY_conntrackd-${OCF_RESKEY_binary_default}}} : ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} meta_data() { cat < -1.1 +1.2 Master/Slave OCF Resource Agent for conntrackd This resource agent manages conntrackd - + Name of the conntrackd executable. If conntrackd is installed and available in the default PATH, it is sufficient to configure the name of the binary For example "my-conntrackd-binary-version-0.9.14" If conntrackd is installed somewhere else, you may also give a full path For example "/packages/conntrackd-0.9.14/sbin/conntrackd" Name of the conntrackd executable Full path to the conntrackd.conf file. 
For example "/packages/conntrackd-0.9.14/etc/conntrackd/conntrackd.conf" Path to conntrackd.conf END } meta_expect() { local what=$1 whatvar=OCF_RESKEY_CRM_meta_${1//-/_} op=$2 expect=$3 local val=${!whatvar} if [[ -n $val ]]; then # [, not [[, or it won't work ;) [ $val $op $expect ] && return fi ocf_log err "meta parameter misconfigured, expected $what $op $expect, but found ${val:-unset}." exit $OCF_ERR_CONFIGURED } conntrackd_is_master() { # You can't query conntrackd whether it is master or slave. It can be both at the same time. # This RA creates a statefile during promote and enforces master-max=1 and clone-node-max=1 ha_pseudo_resource $statefile monitor } conntrackd_set_master_score() { ${HA_SBIN_DIR}/crm_master -Q -l reboot -v $1 } conntrackd_monitor() { rc=$OCF_NOT_RUNNING # It does not write a PID file, so check the socket exists after # extracting its path from the configuration file local conntrack_socket=$(awk '/^[ \t]*UNIX[ \t]*{/,/^[ \t]*}/ { if ($1 == "Path") { print $2 } }' $OCF_RESKEY_config) [ -S "$conntrack_socket" ] && rc=$OCF_SUCCESS if [ "$rc" -eq "$OCF_SUCCESS" ]; then # conntrackd is running # now see if it acceppts queries if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -s > /dev/null 2>&1; then rc=$OCF_ERR_GENERIC ocf_log err "conntrackd is running but not responding to queries" fi if conntrackd_is_master; then rc=$OCF_RUNNING_MASTER # Restore master setting on probes if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then conntrackd_set_master_score $master_score fi else # Restore master setting on probes if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then conntrackd_set_master_score $slave_score fi fi fi return $rc } conntrackd_start() { rc=$OCF_ERR_GENERIC # Keep trying to start the resource; # wait for the CRM to time us out if this fails while :; do conntrackd_monitor status=$? case "$status" in $OCF_SUCCESS) - rc=$OCF_SUCCESS conntrackd_set_master_score $slave_score + # -n = request resync from the others + if ! 
$OCF_RESKEY_binary -C $OCF_RESKEY_config -n; then + ocf_log err "$OCF_RESKEY_binary -C $OCF_RESKEY_config -n failed during start." + rc=$OCF_ERR_GENERIC + else + rc=$OCF_SUCCESS + fi break ;; $OCF_NOT_RUNNING) ocf_log info "Starting conntrackd" $OCF_RESKEY_binary -C $OCF_RESKEY_config -d ;; $OCF_RUNNING_MASTER) ocf_log warn "conntrackd already in master mode, demoting." ha_pseudo_resource $statefile stop ;; $OCF_ERR_GENERIC) ocf_log err "conntrackd start failed" rc=$OCF_ERR_GENERIC break ;; esac done return $rc } conntrackd_stop() { rc=$OCF_ERR_GENERIC # Keep trying to bring down the resource; # wait for the CRM to time us out if this fails while :; do conntrackd_monitor status=$? case "$status" in $OCF_SUCCESS|$OCF_ERR_GENERIC) ocf_log info "Stopping conntrackd" $OCF_RESKEY_binary -C $OCF_RESKEY_config -k ;; $OCF_NOT_RUNNING) rc=$OCF_SUCCESS break ;; $OCF_RUNNING_MASTER) ocf_log warn "conntrackd still master" ;; esac done return $rc } conntrackd_validate_all() { check_binary "$OCF_RESKEY_binary" if ! [ -e "$OCF_RESKEY_config" ]; then ocf_log err "Config FILE $OCF_RESKEY_config does not exist" return $OCF_ERR_INSTALLED fi meta_expect master-node-max = 1 meta_expect master-max = 1 meta_expect clone-node-max = 1 - meta_expect clone-max = 2 return $OCF_SUCCESS } conntrackd_promote() { rc=$OCF_SUCCESS if ! conntrackd_is_master; then # -c = Commit the external cache to the kernel # -f = Flush internal and external cache # -R = resync with the kernel table # -B = send a bulk update on the line for parm in c f R B; do if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm; then ocf_log err "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during promote." rc=$OCF_ERR_GENERIC break fi done ha_pseudo_resource $statefile start conntrackd_set_master_score $master_score fi return $rc } conntrackd_demote() { rc=$OCF_SUCCESS if conntrackd_is_master; then # -t = shorten kernel timers to remove zombies # -n = request a resync from the others for parm in t n; do if ! 
$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm; then ocf_log err "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during demote." rc=$OCF_ERR_GENERIC break fi done ha_pseudo_resource $statefile stop conntrackd_set_master_score $slave_score fi return $rc } conntrackd_notify() { hostname=$(hostname) # OCF_RESKEY_CRM_meta_notify_master_uname is a whitespace separated list of master hostnames for master in $OCF_RESKEY_CRM_meta_notify_master_uname; do # if we are the master and an instance was just started on another node: # send a bulk update to allow failback if [ "$hostname" = "$master" -a "$OCF_RESKEY_CRM_meta_notify_type" = "post" -a "$OCF_RESKEY_CRM_meta_notify_operation" = "start" -a "$OCF_RESKEY_CRM_meta_notify_start_uname" != "$hostname" ]; then ocf_log info "Sending bulk update in post start to peers to allow failback" $OCF_RESKEY_binary -C $OCF_RESKEY_config -B fi done for tobepromoted in $OCF_RESKEY_CRM_meta_notify_promote_uname; do # if there is a promote action to be executed on another node: # send a bulk update to allow failback if [ "$hostname" != "$tobepromoted" -a "$OCF_RESKEY_CRM_meta_notify_type" = "pre" -a "$OCF_RESKEY_CRM_meta_notify_operation" = "promote" ]; then ocf_log info "Sending bulk update in pre promote to peers to allow failback" $OCF_RESKEY_binary -C $OCF_RESKEY_config -B fi done } conntrackd_usage() { cat <, 1997-1999 +# Peter Poeml , 2000-2006 +# Marius Tomaschewski , 2006-2010 +# +# and Linux-HA contributors + +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_binary_default="dhcpd" +OCF_RESKEY_pid_default="/var/run/dhcpd.pid" +OCF_RESKEY_user_default=dhcpd +OCF_RESKEY_group_default=nogroup +OCF_RESKEY_config_default="" +OCF_RESKEY_chrooted_default="true" +OCF_RESKEY_chrooted_path_default="/var/lib/dhcp" +OCF_RESKEY_leases_default="/db/dhcpd.leases" +OCF_RESKEY_interface_default="" +OCF_RESKEY_includes_default="" + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} +: ${OCF_RESKEY_chrooted=${OCF_RESKEY_chrooted_default}} +: ${OCF_RESKEY_chrooted_path=${OCF_RESKEY_chrooted_path_default}} +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: ${OCF_RESKEY_leases=${OCF_RESKEY_leases_default}} +: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}} +: ${OCF_RESKEY_includes=${OCF_RESKEY_includes_default}} + +# To enable support for different versions of dhcp, we need +# to know what version we are being run against. +DHCP_VERSION_MAJOR=`$OCF_RESKEY_binary --version 2>&1 | awk -F- '{print $3}' | awk -F. '{print $1}' | sed s/^[a-zA-Z]//g` + +# These files are always copied by default to ensure the chroot environment works. +DEFAULT_FILE_LIST="/etc/gai.conf /etc/nsswitch.conf /etc/resolv.conf /etc/host.conf /etc/hosts /etc/localtime /dev/urandom" + +usage() { + cat < + + + 0.1 + +Manage an ISC DHCP server service in a chroot environment. + + Chrooted ISC DHCP Server resource agent. + + + + The absolute path to the DHCP server configuration file. + + Configuration file + + + + + Configure the dhcpd service to run in a chrooted or non-chrooted + mode. + + Enable chroot mode + + + + + The absolute path of the chrooted DHCP environment. + + The chrooted path + + + + + The binary for the DHCP server process. An absolute path + definition is not required, but can be used to override + environment path. 
+ + dhcpd binary + + + + + The system user the DHCP server process will run as when + it is chrooted. + + dhcpd owner + + + + + The system group the DHCP server process will run as when + it is chrooted. + + dhcpd group owner + + + + + The network interface(s) the DHCP server process will + bind to. A blank value will bind the process to all + interfaces. + + Network Interface + + + + + This parameter provides a means to copy include files + into the chrooted environment. If a dhcpd.conf file + contains a line similar to this: + + include "/etc/named.keys"; + + Then an admin also has to tell the dhcpd RA that this + file should be pulled into the chrooted environment. This + is a space delimited list. + + Include Files + + + + + The leases database file, relative to chrooted_path. + + Leases file + + + + + The path and filename of the PID file. It is relative + to chrooted_path. + + PID file + + + + + + + + + + + +EOF +} + +# Validate most critical parameters +dhcpd_validate_all() { + check_binary $OCF_RESKEY_binary + + + if ! ocf_is_probe; then + # Test for the appropriate configuration files depending on if + # chroot mode is enabled. + if ocf_is_true $OCF_RESKEY_chrooted ; then + if ! test -e "$OCF_RESKEY_chrooted_path"; then + ocf_log err "Path $OCF_RESKEY_chrooted_path does not exist." + return $OCF_ERR_INSTALLED + fi + + if test -n "$OCF_RESKEY_chrooted_path/$OCF_RESKEY_config" -a ! -r "$OCF_RESKEY_chrooted_path/$OCF_RESKEY_config"; then + ocf_log err "Configuration file $OCF_RESKEY_chrooted_path/$OCF_RESKEY_config doesn't exist" + return $OCF_ERR_INSTALLED + fi + else + if test -n "$OCF_RESKEY_config" -a ! -r "$OCF_RESKEY_config"; then + ocf_log err "Configuration file $OCF_RESKEY_config doesn't exist" + return $OCF_ERR_INSTALLED + fi + fi + + fi + + if ! getent passwd $OCF_RESKEY_user >/dev/null 2>&1; then + ocf_log err "User $OCF_RESKEY_user doesn't exist" + return $OCF_ERR_INSTALLED + fi + + return $OCF_SUCCESS +} + +# dhcpd_monitor. 
Send a request to dhcpd and check response. +dhcpd_monitor() { + # Assume chrooted mode is being used, but if not update the PIDF + # variable to point to the non-chrooted PID file. + PIDF="$OCF_RESKEY_chrooted_path/$OCF_RESKEY_pid" + + if ! ocf_is_true $OCF_RESKEY_chrooted ; then + PIDF=`dirname $OCF_RESKEY_pid`/dhcpd/`basename $OCF_RESKEY_pid` + fi + + ocf_pidfile_status $PIDF >/dev/null 2>&1 || return $OCF_NOT_RUNNING + + return $OCF_SUCCESS +} + +# Initialize Chroot +dhcpd_initialize_chroot() { + # If we are running the initialization for the first time, we need to make + # the new chrooted folder, in case we are not using the same default. + if ! [ -d $OCF_RESKEY_chrooted_path ] ; then + ocf_log info "Initializing $OCF_RESKEY_chrooted_path for use." + fi + + # Make sure all sub-paths are created if something went wrong during + # a partial run. + for i in db dev etc lib64 var/run; do + mkdir -p $OCF_RESKEY_chrooted_path/$i + done + + # If we are running version 4 of the dhcp server, we need to mount a proc partition. + if [ $DHCP_VERSION_MAJOR -ge 4 ] ; then + mkdir -p $OCF_RESKEY_chrooted_path/proc + + if ! [ -e $OCF_RESKEY_chrooted_path/proc/net/dev ] ; then + mount -t proc -o ro proc $OCF_RESKEY_chrooted_path/proc > /dev/null 2>&1 + fi + fi + + # If the folder to store the PID file does not exist, make it. + if ! [ -d "$OCF_RESKEY_chrooted_path`dirname $OCF_RESKEY_pid`" ] ; then + mkdir -p "$OCF_RESKEY_chrooted_path`dirname $OCF_RESKEY_pid`" + fi + + # Ensure all permissions are in place if the folder was re-created. + chown -R $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_chrooted_path/`dirname $OCF_RESKEY_leases` + chown -R $OCF_RESKEY_user:$OCF_RESKEY_group "$OCF_RESKEY_chrooted_path/`dirname $OCF_RESKEY_pid`" + + ## If there is no conf file, we can't initialize the chrooted + ## environment, return with "program not configured" + if ! [ -f $OCF_RESKEY_config ] ; then + ocf_log err "dhcpd has not been configured." 
+        return $OCF_ERR_CONFIGURED
+    fi
+
+    # If the leases file does not exist, create it, as this is a fresh install.
+    if [ ! -e $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases ]; then
+        touch $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases
+    fi
+
+    # Remove the random device.
+    test -e "$OCF_RESKEY_chrooted_path/dev/urandom" &&
+        rm -f $OCF_RESKEY_chrooted_path/dev/urandom
+
+    # Test for the existence of the defined include files, and append
+    # them to the list of files to be copied.
+    for i in $OCF_RESKEY_includes ; do
+        if [ -e $i ] ; then
+            DEFAULT_FILE_LIST="$DEFAULT_FILE_LIST $i"
+        else
+            ocf_log err "include file $i does not exist"
+            return $OCF_ERR_INSTALLED
+        fi
+    done
+
+    # Ensure all "modified" non-chrooted configuration files are copied into the chrooted environment.
+    for i in $OCF_RESKEY_config $DEFAULT_FILE_LIST; do
+        # First, lets make sure the directory exists within the chrooted environment.
+        if test -d "$i" ; then
+            mkdir -p $OCF_RESKEY_chrooted_path/$i
+        elif test -e "$i" ; then
+            mkdir -p "`dirname $OCF_RESKEY_chrooted_path/$i`"
+        fi
+
+        # Next, we copy the configuration file into place.
+        cp -aL "$i" "$OCF_RESKEY_chrooted_path/${i%/*}/" > /dev/null 2>&1 ||
+            { ocf_log err "could not copy $i to chroot jail"; return $OCF_ERR_GENERIC; }
+    done
+
+    # Name of the library directory inside the jail (e.g. lib64). It has to
+    # be looked up under the configured chroot path, not under the default
+    # /var/lib/dhcp, or a non-default chrooted_path yields a bogus name.
+    libdir=$(basename $(echo $OCF_RESKEY_chrooted_path/lib*))
+    if test -x /usr/bin/ldd ; then
+        # Print the libraries that "$1" links against and that live in
+        # /$libdir, leaving out libc itself.
+        get_ldd_deps()
+        {
+            ldd_wl="\/$libdir\/lib"
+            ldd_bl="\/$libdir\/libc\."
+            /usr/bin/ldd "$1" | while read a b c d ; do
+                [ -n "$c" ] || continue
+                [[ $c =~ $ldd_wl ]] || continue
+                [[ $c =~ $ldd_bl ]] && continue
+                echo $c
+            done
+        }
+    else
+        get_ldd_deps() { :; }
+    fi
+    cplibs=`for i in /$libdir/libresolv.so.* /$libdir/libnss_*.so.* /$libdir/libpthread.so.0 /$libdir/libdl.so.2
+    do
+        if [ -s "$i" ] ; then
+            echo "$i"
+            get_ldd_deps "$i"
+        fi
+    done | sort -u`
+    for i in $cplibs ; do
+        if [ -s "$i" ]; then
+            # Copy into the configured jail, matching the libdir lookup above.
+            cp -pL "$i" "$OCF_RESKEY_chrooted_path/$libdir/" ||
+                { ocf_log err "could not copy $i to chroot jail"; return $OCF_ERR_GENERIC; }
+        fi
+    done
+
+    return $OCF_SUCCESS
+}
+
+# Initialize a non-chroot environment
+dhcpd_initialize() {
+    ## If there is no conf file, we can't start a dhcp service.
+    if ! [ -f $OCF_RESKEY_config ] ; then
+        ocf_log err "dhcpd has not been configured."
+        return $OCF_ERR_CONFIGURED
+    fi
+
+    # As with the standard DHCP init script, we can still use the
+    # chrooted default path for storing the leases file. This behavior
+    # is consistent with the existing /etc/init.d/dhcpd script.
+    if ! [ -d $OCF_RESKEY_chrooted_path ] ; then
+        ocf_log info "Initializing $OCF_RESKEY_chrooted_path for use."
+    fi
+
+    # If the leases file does not exist, create it, as this is a fresh install.
+    if [ ! -e $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases ]; then
+        touch $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases
+    fi
+
+    # if the PID storage path does not exist, make it, and setup the permissions.
+
+    # NOTE: This part of the script has a potential security flaw, in that if someone
+    # puts in /var/run as the path, it will change ownership to the dhcpd user
+    # and group. However, all that would do is allow that user to view the contents
+    # of the files, which they can do now anyway. If this becomes an issue, I can work
+    # in some changes.
+ + # We need to append "dhcpd" to the path for the PID file storage folder, because + # if /var/run is used, that folders permissions can not be changed, otherwise it affects + # more then just one application. + if ! [ -d `dirname $OCF_RESKEY_pid`/dhcpd ] ; then + mkdir -p `dirname $OCF_RESKEY_pid`/dhcpd + + if [ -n "$OCF_RESKEY_user" -a "x$OCF_RESKEY_user" != "xroot" ] ; then + chown $OCF_RESKEY_user `dirname $OCF_RESKEY_pid`/dhcpd + fi + + if [ -n "$OCF_RESKEY_group" -a "x$OCF_RESKEY_group" != "xwheel" ] ; then + chgrp $OCF_RESKEY_group `dirname $OCF_RESKEY_pid`/dhcpd + fi + fi + + return $OCF_SUCCESS +} + +# Start +dhcpd_start() { + # Lets make sure we are not already running. + if dhcpd_monitor; then + ocf_log info "dhcpd already running" + return $OCF_SUCCESS + fi + + # Only initialize the chrooted path(s) if chroot mode is enabled. + if ocf_is_true $OCF_RESKEY_chrooted ; then + dhcpd_initialize_chroot || + { ocf_log err "Could not fully initialize the chroot environment." ; return $OCF_ERR_INSTALLED; } + else + dhcpd_initialize || + { ocf_log err "Could not fully initialize the runtime environment." ; return $OCF_ERR_INSTALLED; } + fi + + dhcpd_validate_all || exit + + # Define an empty string variable, to ensure it exists when needed. + DHCPD_ARGS="" + + # To ensure consistent behavior with the standard DHCPD init script, + # use the chrooted default path for storing a leases file, when not in + # a chrooted enviroment. + if ocf_is_true $OCF_RESKEY_chrooted ; then + DHCPD_ARGS="$DHCPD_ARGS -chroot $OCF_RESKEY_chrooted_path -lf $OCF_RESKEY_leases" + else + DHCPD_ARGS="$DHCPD_ARGS -lf $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases" + fi + + if [ -n "$OCF_RESKEY_user" ]; then + DHCPD_ARGS="$DHCPD_ARGS -user $OCF_RESKEY_user" + fi + + if [ -n "$OCF_RESKEY_group" ]; then + DHCPD_ARGS="$DHCPD_ARGS -group $OCF_RESKEY_group" + fi + + # If there is a pid file containing a pid, the machine might have crashed. 
pid files in
+    # /var/run are always cleaned up at boot time, but this is not the case for the pid file in
+    # the chroot jail. Therefore, an old pid file may exist. This is only a problem if it
+    # incidentally contains the pid of a running process. If this process is not a 'dhcpd',
+    # we remove the pid. (dhcpd itself only checks whether the pid is alive or not.)
+
+    PIDF="$OCF_RESKEY_chrooted_path/$OCF_RESKEY_pid"
+
+    if ocf_is_true $OCF_RESKEY_chrooted ; then
+        ocf_log info "Starting dhcpd [chroot] service."
+        DHCPD_ARGS="$DHCPD_ARGS -pf $OCF_RESKEY_pid"
+    else
+        ocf_log info "Starting dhcpd [non-chroot] service."
+        PIDF=`dirname $OCF_RESKEY_pid`/dhcpd/`basename $OCF_RESKEY_pid`
+        DHCPD_ARGS="$DHCPD_ARGS -pf $PIDF"
+    fi
+
+    test -e "$PIDF" && rm -f "$PIDF"
+
+    ocf_run $OCF_RESKEY_binary -cf $OCF_RESKEY_config $DHCPD_ARGS $OCF_RESKEY_interface ||
+        return $OCF_ERR_INSTALLED
+
+    # Do not report success before the daemon is really up: keep polling
+    # until dhcpd_monitor succeeds. The loop has no exit of its own; it
+    # relies on the caller's operation timeout if dhcpd never comes up.
+    while ! dhcpd_monitor; do
+        ocf_log info "waiting for dhcpd to start"
+        sleep 1
+    done
+
+    if ocf_is_true $OCF_RESKEY_chrooted ; then
+        ocf_log info "dhcpd [chrooted] has started."
+    else
+        ocf_log info "dhcpd [non-chrooted] has started."
+    fi
+
+    return $OCF_SUCCESS
+}
+
+# Stop
+dhcpd_stop () {
+    local timeout
+    local timewait
+    local rc
+
+    dhcpd_monitor
+    rc=$?
+
+    case "$rc" in
+        "$OCF_SUCCESS")
+            # Currently running, and is expected behaviour.
+            ;;
+        "$OCF_NOT_RUNNING")
+            # Currently not running, therefore nothing to do.
+            ocf_log info "dhcpd already stopped"
+            return $OCF_SUCCESS
+            ;;
+    esac
+
+    PIDF="$OCF_RESKEY_chrooted_path/$OCF_RESKEY_pid"
+
+    if ! ocf_is_true $OCF_RESKEY_chrooted ; then
+        PIDF=`dirname $OCF_RESKEY_pid`/dhcpd/`basename $OCF_RESKEY_pid`
+    fi
+
+    kill `cat $PIDF`
+
+    # Allow 2/3 of the action timeout for the orderly shutdown
+    # (The origin unit is ms, hence the conversion)
+    timewait=$((OCF_RESKEY_CRM_meta_timeout/1500))
+
+    sleep 0.1; timeout=0 # Sleep here for .1 sec to let dhcpd finish.
+    while dhcpd_monitor ; do
+        if [ $timeout -ge $timewait ]; then
+            break
+        else
+            sleep 1
+            timeout=`expr $timeout + 1`
+        fi
+    done
+
+    #If still up
+    if dhcpd_monitor 2>&1; then
+        ocf_log err "dhcpd is still up! Trying kill -s KILL"
+
+        # POSIX signal name (no SIG prefix), as announced in the log line.
+        kill -s KILL `cat $PIDF`
+    fi
+
+    # If we are running a dhcp server v4 or higher, unmount the proc partition.
+    if [ $DHCP_VERSION_MAJOR -ge 4 ] ; then
+        # We only want to unmount proc in a chrooted environment, else we could
+        # cause other issues.
+        if ocf_is_true $OCF_RESKEY_chrooted ; then
+            umount $OCF_RESKEY_chrooted_path/proc > /dev/null 2>&1
+        fi
+    fi
+
+    rm -f $PIDF
+
+    ocf_log info "dhcpd stopped"
+    return $OCF_SUCCESS
+}
+
+# Make sure meta-data and usage always succeed
+case $__OCF_ACTION in
+meta-data)      dhcpd_meta_data
+                exit $OCF_SUCCESS
+                ;;
+validate-all)   dhcpd_validate_all
+                # Hand the validation result to the caller instead of
+                # unconditionally reporting success.
+                exit $?
+                ;;
+usage|help)     usage
+                exit $OCF_SUCCESS
+                ;;
+esac
+
+# Translate each action into the appropriate function call
+case $__OCF_ACTION in
+start)          dhcpd_start;;
+stop)           dhcpd_stop;;
+monitor)        dhcpd_monitor;;
+*)              usage
+                exit $OCF_ERR_UNIMPLEMENTED
+                ;;
+esac
diff --git a/heartbeat/exportfs b/heartbeat/exportfs
index 8bcc1ea73..1fdaffeb0 100755
--- a/heartbeat/exportfs
+++ b/heartbeat/exportfs
@@ -1,347 +1,368 @@
 #!/bin/sh
 # exportfs
 #
 # Description: Manages nfs exported file system.
 #
 # (c) 2010 Ben Timby, Florian Haas, Dejan Muhamedagic,
 # and Linux-HA contributors
 #
 # License: GNU General Public License v2 (GPLv2) and later
 #######################################################################
 # Initialization:
 : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
 .
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_unlock_on_stop_default=0 OCF_RESKEY_wait_for_leasetime_on_stop_default=0 OCF_RESKEY_rmtab_backup_default=".rmtab" : ${OCF_RESKEY_unlock_on_stop=${OCF_RESKEY_unlock_on_stop_default}} : ${OCF_RESKEY_wait_for_leasetime_on_stop=${OCF_RESKEY_wait_for_leasetime_on_stop_default}} : ${OCF_RESKEY_rmtab_backup=${OCF_RESKEY_rmtab_backup_default}} ####################################################################### exportfs_meta_data() { cat < 1.0 Exportfs uses the exportfs command to add/remove nfs exports. It does NOT manage the nfs server daemon. It depends on Linux specific NFS implementation details, so is considered not portable to other platforms yet. Manages NFS exports The client specification allowing remote machines to mount the directory over NFS. Client ACL. The options to pass to exportfs for the exported directory. Export options. The directory which you wish to export using NFS. The directory to export. The fsid option to pass to exportfs. This can be a unique positive integer, a UUID, or the special string "root" which is functionally identical to numeric fsid of 0. 0 (root) identifies the export as the root of an NFSv4 pseudofilesystem -- avoid this setting unless you understand its special status. This value will override any fsid provided via the options parameter. Unique fsid within cluster. Relinquish NFS locks associated with this filesystem when the resource stops. Enabling this parameter is highly recommended unless the path exported by this ${__SCRIPT_NAME} resource is also exported by a different resource. Unlock filesystem on stop? When stopping (unexporting), wait out the NFSv4 lease time. Only after all leases have expired does the NFS kernel server relinquish all server-side handles on the exported filesystem. 
If this ${__SCRIPT_NAME} resource manages an export that resides on a mount point designed to fail over along with the NFS export itself, then enabling this parameter will ensure such failover is working properly. Note that when this parameter is set, your stop timeout MUST accommodate for the wait period. This parameter is safe to disable if none of your NFS clients are using NFS version 4 or later. Ride out the NFSv4 lease time on resource stop? Back up those entries from the NFS rmtab that apply to the exported directory, to the specified backup file. The filename is interpreted as relative to the exported directory. This backup is required if clients are connecting to the export via NFSv3 over TCP. Note that a configured monitor operation is required for this functionality. To disable rmtab backups, set this parameter to the special string "none". Location of the rmtab backup, relative to directory. END return $OCF_SUCCESS } backup_rmtab() { local rmtab_backup if [ ${OCF_RESKEY_rmtab_backup} != "none" ]; then rmtab_backup="${OCF_RESKEY_directory}/${OCF_RESKEY_rmtab_backup}" grep ":${OCF_RESKEY_directory}:" /var/lib/nfs/rmtab > ${rmtab_backup} fi } restore_rmtab() { local rmtab_backup if [ ${OCF_RESKEY_rmtab_backup} != "none" ]; then rmtab_backup="${OCF_RESKEY_directory}/${OCF_RESKEY_rmtab_backup}" if [ -r ${rmtab_backup} ]; then local tmpf=`mktemp` sort -u ${rmtab_backup} /var/lib/nfs/rmtab > $tmpf && install -o root -m 644 $tmpf /var/lib/nfs/rmtab rm -f $tmpf ocf_log debug "Restored `wc -l ${rmtab_backup}` rmtab entries from ${rmtab_backup}." else ocf_log warn "rmtab backup ${rmtab_backup} not found or not readable." 
fi fi } exportfs_usage() { cat < ${unlockfile}
+        ocf_log info "Unlocked NFS export ${OCF_RESKEY_directory}"
+    else
+        ocf_log warn "Unable to unlock NFS export ${OCF_RESKEY_directory}, ${unlockfile} not found or not writable"
+    fi
+}
+wait_for_leasetime() {
+    local leasetimefile
+    local sleeptime
+    leasetimefile=/proc/fs/nfsd/nfsv4leasetime
+    if [ -r ${leasetimefile} ]; then
+        sleeptime=$((`cat ${leasetimefile}`+2))
+        ocf_log info "Sleeping ${sleeptime} seconds to accommodate for NFSv4 lease expiry"
+        sleep ${sleeptime}s
+    else
+        ocf_log warn "Unable to read NFSv4 lease time from ${leasetimefile}, file not found or not readable"
+    fi
+}
+# Flush the kernel export cache until the entry carrying our fsid is gone
+# from the nfsd export cache listing.
+cleanup_export_cache() {
+    # see if the cache is blocking unexport
+    local contentfile=/proc/net/rpc/nfsd.export/content
+    local fsid_re="fsid=$OCF_RESKEY_fsid,"
+    local i=1
+    while :; do
+        fgrep -q "$fsid_re" $contentfile ||
+            break
+        ocf_log info "Cleanup export cache ... (try $i)"
+        ocf_run exportfs -f
+        sleep 0.5
+        # POSIX arithmetic expansion: this script runs under /bin/sh,
+        # where the bash-only "let" builtin may not exist.
+        i=$((i + 1))
+    done
+}
 exportfs_stop () {
     exportfs_monitor
     if [ $? -eq $OCF_NOT_RUNNING ]; then
         ocf_log debug "${OCF_RESKEY_directory} not exported"
         return $OCF_SUCCESS
     fi
     ocf_log info "Un-exporting file system ..."
     # Backup the rmtab to ensure smooth NFS-over-TCP failover
     backup_rmtab
     ocf_run exportfs -v -u ${OCF_RESKEY_clientspec}:${OCF_RESKEY_directory}
     rc=$?
if ocf_is_true ${OCF_RESKEY_unlock_on_stop}; then - local unlockfile - unlockfile=/proc/fs/nfsd/unlock_filesystem - if [ -w ${unlockfile} ]; then - echo "${OCF_RESKEY_directory}" > ${unlockfile} - ocf_log info "Unlocked NFS export ${OCF_RESKEY_directory}" - else - ocf_log warn "Unable to unlock NFS export ${OCF_RESKEY_directory}, ${unlockfile} not found or not writable" - fi + unlock_fs fi if ocf_is_true ${OCF_RESKEY_wait_for_leasetime_on_stop}; then - local leasetimefile - local sleeptime - leasetimefile=/proc/fs/nfsd/nfsv4leasetime - if [ -r ${leasetimefile} ]; then - sleeptime=$((`cat ${leasetimefile}`+2)) - ocf_log info "Sleeping ${sleeptime} seconds to accommodate for NFSv4 lease expiry" - sleep ${sleeptime}s - else - ocf_log warn "Unable to read NFSv4 lease time from ${leasetimefile}, file not found or not readable" - fi + wait_for_leasetime fi if [ $rc -eq 0 ]; then + cleanup_export_cache ocf_log info "Un-exported file system" return $OCF_SUCCESS + else + ocf_log err "Failed to un-export file system" + exit $OCF_ERR_GENERIC fi - - ocf_log err "Failed to un-export file system" - exit $OCF_ERR_GENERIC } exportfs_validate () { # Checks for required parameters if [ -z "$OCF_RESKEY_directory" ]; then ocf_log err "Missing required parameter \"directory\"" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_fsid" ]; then ocf_log err "Missing required parameter \"fsid\"" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_clientspec" ]; then ocf_log err "Missing required parameter \"clientspec\"" exit $OCF_ERR_CONFIGURED fi # Checks applicable only to non-probes if ! ocf_is_probe; then if [ ! 
-d $OCF_RESKEY_directory ]; then ocf_log err "$OCF_RESKEY_directory does not exist or is not a directory" exit $OCF_ERR_INSTALLED fi fi } if [ $# -ne 1 ]; then exportfs_usage exit $OCF_ERR_ARGS fi case $__OCF_ACTION in meta-data) exportfs_meta_data exit $OCF_SUCCESS ;; usage|help) exportfs_usage exit $OCF_SUCCESS ;; *) ;; esac exportfs_validate case $__OCF_ACTION in start) exportfs_start ;; stop) exportfs_stop ;; status|monitor) exportfs_monitor ;; validate-all) # nothing to do -- we're already validated ;; *) exportfs_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/http-mon.sh b/heartbeat/http-mon.sh index c6576fb0c..d7b618204 100644 --- a/heartbeat/http-mon.sh +++ b/heartbeat/http-mon.sh @@ -1,119 +1,119 @@ # # General http monitor code # (sourced by apache and httpmon) # # Author: Alan Robertson # Sun Jiang Dong # # Support: linux-ha@lists.linux-ha.org # # License: GNU General Public License (GPL) # # Copyright: (C) 2002-2005 International Business Machines # # default options for http clients # NB: We _always_ test a local resource, so it should be # safe to connect from the local interface. 
bind_address="127.0.0.1" curl_ipv6_opts="" if ocf_is_true "$OCF_RESKEY_use_ipv6" || echo "$STATUSURL" | grep -qs "::"; then bind_address="::1" curl_ipv6_opts="-g" fi WGETOPTS="-O- -q -L --no-proxy --bind-address=$bind_address" CURLOPTS="-o - -Ss -L --interface lo $curl_ipv6_opts" # # run the http client # curl_func() { cl_opts="$CURLOPTS $test_httpclient_opts" if [ x != "x$test_user" ]; then echo "-u $test_user:$test_password" | curl -K - $cl_opts "$1" else curl $cl_opts "$1" fi } wget_func() { auth="" cl_opts="$WGETOPTS $test_httpclient_opts" [ x != "x$test_user" ] && auth="--http-user=$test_user --http-passwd=$test_password" wget $auth $cl_opts "$1" } # # rely on whatever the user provided userdefined() { $test_httpclient $test_httpclient_opts "$1" } # # find a good http client # findhttpclient() { # prefer wget (for historical reasons) - if [ "x$CLIENT" != x ]; then + if [ "x$CLIENT" != x ] && which "$CLIENT" >/dev/null 2>&1; then echo "$CLIENT" elif which wget >/dev/null 2>&1; then echo "wget" elif which curl >/dev/null 2>&1; then echo "curl" else return 1 fi } gethttpclient() { [ -z "$test_httpclient" ] && test_httpclient=$ourhttpclient case "$test_httpclient" in curl|wget) echo ${test_httpclient}_func;; #these are supported *) echo userdefined;; esac } # test configuration good? 
is_testconf_sane() { if [ "x$test_regex" = x -o "x$test_url" = x ]; then ocf_log err "test regular expression or test url empty" return 1 fi if [ "x$test_user$test_password" != x -a \( "x$test_user" = x -o "x$test_password" = x \) ]; then ocf_log err "bad user authentication for extended test" return 1 fi return 0 } # # read the test definition from the config # readtestconf() { test_name="$1" # we look for this one or the first one if empty lcnt=0 readdef="" test_url="" test_regex="" test_user="" test_password="" test_httpclient="" test_httpclient_opts="" while read key value; do lcnt=$((lcnt+1)) if [ "$readdef" ]; then case "$key" in "url") test_url="$value" ;; "user") test_user="$value" ;; "password") test_password="$value" ;; "client") test_httpclient="$value" ;; "client_opts") test_httpclient_opts="$value" ;; "match") test_regex="$value" ;; "end") break ;; "#"*|"") ;; *) ocf_log err "$lcnt: $key: unknown keyword"; return 1 ;; esac else [ "$key" = "test" ] && [ -z "$test_name" -o "$test_name" = "$value" ] && readdef=1 fi done } diff --git a/heartbeat/iSCSILogicalUnit b/heartbeat/iSCSILogicalUnit index 25ee32e32..b55ccd0c2 100755 --- a/heartbeat/iSCSILogicalUnit +++ b/heartbeat/iSCSILogicalUnit @@ -1,506 +1,506 @@ #!/bin/bash # # # iSCSILogicalUnit OCF RA. Exports and manages iSCSI Logical Units. # # (c) 2009-2010 Florian Haas, Dejan Muhamedagic, # and Linux-HA contributors # # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. 
Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults # Set a default implementation based on software installed if have_binary ietadm; then OCF_RESKEY_implementation_default="iet" elif have_binary tgtadm; then OCF_RESKEY_implementation_default="tgt" elif have_binary lio_node; then OCF_RESKEY_implementation_default="lio" fi : ${OCF_RESKEY_implementation=${OCF_RESKEY_implementation_default}} # Use a default SCSI ID and SCSI SN that is unique across the cluster, # and persistent in the event of resource migration. # SCSI IDs are limited to 24 bytes, but only 16 bytes are known to be # supported by all iSCSI implementations this RA cares about. Thus, # for a default, use the first 16 characters of # $OCF_RESOURCE_INSTANCE. OCF_RESKEY_scsi_id_default="${OCF_RESOURCE_INSTANCE:0:16}" : ${OCF_RESKEY_scsi_id=${OCF_RESKEY_scsi_id_default}} # To have a reasonably unique default SCSI SN, use the first 8 bytes # of an MD5 hash of of $OCF_RESOURCE_INSTANCE sn=`echo -n "${OCF_RESOURCE_INSTANCE}" | openssl md5 | sed -e 's/(stdin)= //'` OCF_RESKEY_scsi_sn_default=${sn:0:8} : ${OCF_RESKEY_scsi_sn=${OCF_RESKEY_scsi_sn_default}} ####################################################################### meta_data() { cat < 0.9 Manages iSCSI Logical Unit. An iSCSI Logical unit is a subdivision of an SCSI Target, exported via a daemon that speaks the iSCSI protocol. 
Manages iSCSI Logical Units (LUs) The iSCSI target daemon implementation. Must be one of "iet", "tgt", or "lio". If unspecified, an implementation is selected based on the availability of management utilities, with "iet" being tried first, then "tgt", then "lio". iSCSI target daemon implementation The iSCSI Qualified Name (IQN) that this Logical Unit belongs to. iSCSI target IQN The Logical Unit number (LUN) exposed to initiators. Logical Unit number (LUN) The path to the block device exposed. Some implementations allow this to be a regular file, too. Block device (or file) path The SCSI ID to be configured for this Logical Unit. The default is the resource name, truncated to 24 bytes. SCSI ID The SCSI serial number to be configured for this Logical Unit. The default is a hash of the resource name, truncated to 8 bytes. SCSI serial number The SCSI vendor ID to be configured for this Logical Unit. SCSI vendor ID The SCSI product ID to be configured for this Logical Unit. SCSI product ID Additional LU parameters. A space-separated list of "name=value" pairs which will be passed through to the iSCSI daemon's management interface. The supported parameters are implementation dependent. Neither the name nor the value may contain whitespace. List of iSCSI LU parameters Allowed initiators. A space-separated list of initiators allowed to connect to this lun. Initiators may be listed in any syntax the target implementation allows. If this parameter is empty or not set, access to this lun will not be allowed from any initiator, if target is not in demo mode. This parameter is only necessary, when using LIO. List of iSCSI initiators allowed to connect to this lun. END } ####################################################################### iSCSILogicalUnit_usage() { cat < # (C) 2007 Novell Inc. All Rights Reserved. 
# # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # See usage() and meta_data() below for more details... # # OCF instance parameters: # OCF_RESKEY_portal: the iSCSI portal address or host name (required) # OCF_RESKEY_target: the iSCSI target (required) # OCF_RESKEY_iscsiadm: iscsiadm program path (optional) # OCF_RESKEY_discovery_type: discovery type (optional; default: sendtargets) +# OCF_RESKEY_try_recovery: wait for iSCSI recovery in monitor (optional; default: false) # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_udev_default="yes" OCF_RESKEY_iscsiadm_default="iscsiadm" OCF_RESKEY_discovery_type_default="sendtargets" +OCF_RESKEY_try_recovery_default="false" : ${OCF_RESKEY_udev=${OCF_RESKEY_udev_default}} : ${OCF_RESKEY_iscsiadm=${OCF_RESKEY_iscsiadm_default}} : ${OCF_RESKEY_discovery_type=${OCF_RESKEY_discovery_type_default}} usage() { methods=`iscsi_methods` methods=`echo $methods | tr ' ' '|'` cat < 1.0 OCF Resource Agent for iSCSI. 
Add (start) or remove (stop) iSCSI targets. Manages a local iSCSI initiator and its connections to iSCSI targets The iSCSI portal address in the form: {ip_address|hostname}[":"port] Portal address The iSCSI target IQN. Target IQN Target discovery type. Check the open-iscsi documentation for supported discovery types. Target discovery type open-iscsi administration utility binary. iscsiadm binary If the next resource depends on the udev creating a device then we wait until it is finished. On a normally loaded host this should be done quickly, but you may be unlucky. If you are not using udev set this to "no", otherwise we will spin in a loop until a timeout occurs. udev + + +If the iSCSI session exists but is currently inactive/broken, +which is most probably due to network problems, the iSCSI layer +will try to recover. If this parameter is set to true, we'll wait +for the recovery to succeed. In that case the monitor operation +can only time out so you should set the monitor op timeout +attribute appropriately. + +on error wait for iSCSI recovery in monitor + + + EOF } iscsi_methods() { cat <= "2.0-872" changed discovery semantics # see http://www.mail-archive.com/open-iscsi@googlegroups.com/msg04883.html # there's a new discoverydb command which should be used instead discovery open_iscsi_discovery() { local output - local severity=err local discovery_variant="discovery" local options="" local cmd local version=`$iscsiadm --version | awk '{print $3}'` ocf_version_cmp "$version" "2.0-871" if [ $? -eq 2 ]; then # newer than 2.0-871? discovery_variant="discoverydb" [ "$discovery_type" = "sendtargets" ] && options="-D" fi cmd="$iscsiadm -m $discovery_variant -p $OCF_RESKEY_portal -t $discovery_type $options" - ocf_is_probe && severity=info output=`$cmd` if [ $? 
-ne 0 -o x = "x$output" ]; then [ x != "x$output" ] && { - ocf_log $severity "$cmd FAILED" + ocf_log err "$cmd FAILED" echo "$output" } return 3 fi - portal=`echo "$output" | + PORTAL=`echo "$output" | awk -v target="$OCF_RESKEY_target" ' $NF==target{ if( NF==3 ) portal=$2; # sles compat mode else portal=$1; sub(",.*","",portal); print portal; }'` - case `echo "$portal" | wc -w` in + case `echo "$PORTAL" | wc -w` in 0) #target not found echo "$output" - ocf_log $severity "target $OCF_RESKEY_target not found at portal $OCF_RESKEY_portal" + ocf_log err "target $OCF_RESKEY_target not found at portal $OCF_RESKEY_portal" return 1 ;; 1) #we're ok return 0 ;; *) # handle multihome hosts reporting multiple portals - for p in $portal; do + for p in $PORTAL; do if [ "$OCF_RESKEY_portal" = "$p" ]; then - portal="$OCF_RESKEY_portal" + PORTAL="$OCF_RESKEY_portal" return 0 fi done echo "$output" ocf_log err "sorry, can't handle multihomed hosts unless you specify the portal exactly" return 2 ;; esac } open_iscsi_add() { $iscsiadm -m node -p $1 -T $2 -l } +open_iscsi_get_session_id() { + local target="$1" + $iscsiadm -m session 2>/dev/null | grep "$target$" | + awk '{print $2}' | tr -d '[]' +} open_iscsi_remove() { - $iscsiadm -m node -p $1 -T $2 -u + local target="$1" + local session_id + session_id=`open_iscsi_get_session_id "$target"` + if [ "$session_id" ]; then + $iscsiadm -m session -r $session_id -u + else + ocf_log err "cannot find session id for target $target" + return 1 + fi } open_iscsi_status() { - $iscsiadm -m session 2>/dev/null | grep -qs "$2$" + local target="$1" + local session_id conn_state outp + local msg_logged + local recov + + recov=${2:-$OCF_RESKEY_try_recovery} + session_id=`open_iscsi_get_session_id "$target"` + [ -z "$session_id" ] && + return 1 + while :; do + outp=`$iscsiadm -m session -r $session_id -P 1` || + return 2 + conn_state=`echo "$outp" | sed -n '/Connection State/s/.*: //p'` + # some drivers don't return connection state, in that case + # 
we'll assume that we're still connected + case "$conn_state" in + "LOGGED IN") + [ -n "$msg_logged" ] && + ocf_log info "connection state $conn_state. Session restored." + return 0;; + "Unknown"|"") # this is also probably OK + [ -n "$msg_logged" ] && + ocf_log info "connection state $conn_state. Session restored." + return 0;; + *) # failed + if [ "$__OCF_ACTION" != stop ] && ! ocf_is_probe && ocf_is_true $recov; then + if [ -z "$msg_logged" ]; then + ocf_log warning "connection state $conn_state, waiting for recovery..." + msg_logged=1 + fi + sleep 1 + else + ocf_log err "iscsiadm output: $outp" + return 2 + fi + ;; + esac + done +} + +disk_discovery() { + $discovery # discover and setup the real portal string (address) + case $? in + 0) ;; + 1|2) exit $OCF_ERR_GENERIC ;; + 3) if ! is_iscsid_running; then + [ $setup_rc -eq 1 ] && + ocf_log warning "iscsid.startup probably not correctly set in /etc/iscsi/iscsid.conf" + exit $OCF_ERR_INSTALLED + fi + exit $OCF_ERR_GENERIC + ;; + esac } # # NB: this is udev specific! # wait_for_udev() { - dev=/dev/disk/by-path/ip-$portal-iscsi-$OCF_RESKEY_target + dev=/dev/disk/by-path/ip-$PORTAL-iscsi-$OCF_RESKEY_target while :; do ls $dev* >/dev/null 2>&1 && break ocf_log warning "waiting for udev to create $dev" sleep 1 done } iscsi_status() { - if $disk_status "$portal" $OCF_RESKEY_target; then - return $OCF_SUCCESS - else - return $OCF_NOT_RUNNING - fi + $disk_status $OCF_RESKEY_target $* + case $? in + 0) return $OCF_SUCCESS;; + 1) return $OCF_NOT_RUNNING;; + 2) return $OCF_ERR_GENERIC;; + esac } iscsi_start() { - if iscsi_status; then - ocf_log info "iscsi $portal $OCF_RESKEY_target already running" + iscsi_status + case $? 
in + $OCF_SUCCESS) + ocf_log info "iscsi $PORTAL $OCF_RESKEY_target already running" return $OCF_SUCCESS - else - $add_disk $portal $OCF_RESKEY_target || + ;; + $OCF_NOT_RUNNING) + $add_disk $PORTAL $OCF_RESKEY_target || return $OCF_ERR_GENERIC - case "$udev" in + case "$OCF_RESKEY_udev" in [Yy]es) wait_for_udev || return $OCF_ERR_GENERIC ;; *) ;; esac - if iscsi_status; then - return $OCF_SUCCESS - else - return $OCF_ERR_GENERIC - fi + ;; + *) # the session exists, but it's broken + ocf_log warning "iscsi $PORTAL $OCF_RESKEY_target in failed state" + ;; + esac + iscsi_status 1 # enforce wait + if [ $? -eq $OCF_SUCCESS ]; then + return $OCF_SUCCESS + else + return $OCF_ERR_GENERIC fi } iscsi_stop() { - if iscsi_status; then - $remove_disk $portal $OCF_RESKEY_target || + iscsi_status + if [ $? -ne $OCF_NOT_RUNNING ] ; then + $remove_disk $OCF_RESKEY_target || return $OCF_ERR_GENERIC - if iscsi_status; then + iscsi_status + if [ $? -ne $OCF_NOT_RUNNING ] ; then return $OCF_ERR_GENERIC else return $OCF_SUCCESS fi else - ocf_log info "iscsi $portal $OCF_RESKEY_target already stopped" + ocf_log info "iscsi $OCF_RESKEY_target already stopped" return $OCF_SUCCESS fi } iscsi_monitor() { - if $disk_status "$portal" $OCF_RESKEY_target; then + if $disk_status $OCF_RESKEY_target; then return $OCF_SUCCESS else return $OCF_NOT_RUNNING fi } # # 'main' starts here... 
# if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi # These operations don't require OCF instance parameters to be set case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage) usage exit $OCF_SUCCESS;; methods) iscsi_methods exit $OCF_SUCCESS;; esac if [ x = "x$OCF_RESKEY_target" ]; then ocf_log err "target parameter not set" exit $OCF_ERR_CONFIGURED fi if [ x = "x$OCF_RESKEY_portal" ]; then ocf_log err "portal parameter not set" exit $OCF_ERR_CONFIGURED fi case `uname` in Linux) setup=open_iscsi_setup ;; *) ocf_log info "platform `uname` may not be supported" setup=open_iscsi_setup ;; esac LSB_STATUS_STOPPED=3 $setup setup_rc=$? if [ $setup_rc -gt 1 ]; then ocf_log info "iscsi initiator utilities not installed or not setup" case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $OCF_ERR_INSTALLED;; esac fi if [ `id -u` != 0 ]; then ocf_log err "$0 must be run as root" exit $OCF_ERR_PERM fi -discovery_type=${OCF_RESKEY_discovery_type} -udev=${OCF_RESKEY_udev} -$discovery # discover and setup the real portal string (address) -case $? in -0) ;; -1) [ "$1" = stop ] && exit $OCF_SUCCESS - [ "$1" = monitor ] && exit $OCF_NOT_RUNNING - [ "$1" = status ] && exit $LSB_STATUS_STOPPED - exit $OCF_ERR_GENERIC -;; -2) [ "$1" = stop ] && { - iscsi_monitor || exit $OCF_SUCCESS - } - ocf_is_probe && { - iscsi_monitor; exit - } - exit $OCF_ERR_GENERIC -;; -3) ocf_is_probe && exit $OCF_NOT_RUNNING - if ! is_iscsid_running; then - [ $setup_rc -eq 1 ] && - ocf_log warning "iscsid.startup probably not correctly set in /etc/iscsi/iscsid.conf" - [ "$1" = stop ] && exit $OCF_SUCCESS - exit $OCF_ERR_INSTALLED - fi - exit $OCF_ERR_GENERIC -;; -esac - # which method was invoked? case "$1" in - start) iscsi_start + start) + discovery_type=${OCF_RESKEY_discovery_type} + disk_discovery + iscsi_start ;; stop) iscsi_stop ;; - status) if iscsi_status - then + status) iscsi_status + rc=$? 
+ case $rc in + $OCF_SUCCESS) echo iscsi target $OCF_RESKEY_target running - exit $OCF_SUCCESS - else + ;; + $OCF_NOT_RUNNING) echo iscsi target $OCF_RESKEY_target stopped - exit $OCF_NOT_RUNNING - fi + ;; + *) + echo iscsi target $OCF_RESKEY_target failed + ;; + esac + exit $rc ;; monitor) iscsi_status ;; validate-all) # everything already validated # just exit successfully here. exit $OCF_SUCCESS;; *) iscsi_methods exit $OCF_ERR_UNIMPLEMENTED;; esac # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/jboss b/heartbeat/jboss index 9fd161f64..4453d86c2 100755 --- a/heartbeat/jboss +++ b/heartbeat/jboss @@ -1,434 +1,500 @@ #!/bin/sh # # Description: Manages a Jboss Server as an OCF High-Availability # resource under Heartbeat/LinuxHA control # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. # # Copyright (c) 2009 Bauer Systems KG / Stefan Schluppeck # ####################################################################### # OCF parameters: # OCF_RESKEY_resource_name - The name of the resource. Default is ${OCF_RESOURCE_INSTANCE} # why not let the RA log through lrmd? # 2009/09/09 Nakahira: # jboss_console is used to record output of the "run.sh". # The log of "Run.sh" should not be output to ha-log because it is so annoying. 
# OCF_RESKEY_console - A destination of the log of jboss run and shutdown script. Default is /var/log/${OCF_RESKEY_resource_name}.log # OCF_RESKEY_shutdown_timeout - Time-out at the time of the stop. Default is 5 # OCF_RESKEY_kill_timeout - The re-try number of times awaiting a stop. Default is 10 # OCF_RESKEY_user - A user name to start a JBoss. Default is root # OCF_RESKEY_statusurl - URL for state confirmation. Default is http://127.0.0.1:8080 # OCF_RESKEY_java_home - Home directory of the Java. Default is ${JAVA_HOME} # OCF_RESKEY_java_opts - Options for Java. # OCF_RESKEY_jboss_home - Home directory of Jboss. Default is None # is it possible to devise this string from options? I'm afraid # that allowing users to set this could be error prone. # 2009/09/09 Nakahira: # It is difficult to set it automatically because jboss_pstring # greatly depends on the environment. At any rate, system architect # should note that pstring doesn't influence other processes. # OCF_RESKEY_pstring - String Jboss will found in procceslist. Default is "java -Dprogram.name=run.sh" # OCF_RESKEY_run_opts - Options for jboss to run. Default is "-c default -l lpg4j" # OCF_RESKEY_shutdown_opts - Options for jboss to shutdonw. Default is "-s 127.0.0.1:1099" +# OCF_RESKEY_rotate_consolelog - Control console log logrotation flag. Default is false. +# OCF_RESKEY_rotate_value - console log logrotation value. Default is 86400 span(seconds). +# OCF_RESKEY_rotate_logsuffix - Control console log logrotation suffix. Default is .%F. ############################################################################### : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs usage() { cat <<-! usage: $0 action action: start start jboss stop stop the jboss status return the status of jboss, run or down monitor return TRUE if the jboss appears to be working. You have to have installed $WGETNAME for this to work. 
meta-data show meta data message validate-all validate the instance parameters ! return $OCF_ERR_ARGS } isrunning_jboss() { local rc if [ -z "$1" ];then ocf_run -q -err wget -t 1 -O /dev/null $STATUSURL else # Retry message for restraint wget -t 1 -O /dev/null $STATUSURL 2>/dev/null fi rc=$? if [ $rc -eq 0 ]; then return $OCF_SUCCESS fi # JBoss service error return $OCF_ERR_GENERIC } monitor_jboss() { if ! pgrep -f "$PSTRING" > /dev/null; then return $OCF_NOT_RUNNING fi isrunning_jboss $1 } +rotate_console() +{ + # Look for rotatelogs/rotatelogs2 + if [ -x /usr/sbin/rotatelogs ]; then + ROTATELOGS=/usr/sbin/rotatelogs + elif [ -x /usr/sbin/rotatelogs2 ]; then + ROTATELOGS=/usr/sbin/rotatelogs2 + else + ocf_log warn "rotatelogs command not found." + return 1 + fi + + # Clean up and set permissions on required files + rm -rf "$CONSOLE" + mkfifo -m700 "$CONSOLE" + chown --dereference "$JBOSS_USER" "$CONSOLE" || true + + su - -s /bin/sh $JBOSS_USER \ + -c "$ROTATELOGS -l \"$CONSOLE$ROTATELOG_SUFFIX\" $ROTATEVALUE" \ + < "$CONSOLE" > /dev/null 2>&1 & +} + start_jboss() { monitor_jboss start if [ $? = $OCF_SUCCESS ]; then ocf_log info "JBoss already running." return $OCF_SUCCESS fi + + if ocf_is_true $ROTATELOG_FLG; then + rotate_console + if [ $? = 0 ]; then + ocf_log debug "Rotate console log succeeded." + else + ocf_log warn "Rotate console log failed. Starting jboss without console log rotation." + fi + fi ocf_log info "Starting JBoss[$RESOURCE_NAME]" if [ "$JBOSS_USER" = root ]; then "$JBOSS_HOME/bin/run.sh" $RUN_OPTS \ >> "$CONSOLE" 2>&1 & else su - -s /bin/bash "$JBOSS_USER" \ -c "export JAVA_HOME=${JAVA_HOME}; \ export JAVA_OPTS=${JAVA_OPTS}; \ export JBOSS_HOME=${JBOSS_HOME}; \ $JBOSS_HOME/bin/run.sh $RUN_OPTS" \ >> "$CONSOLE" 2>&1 & fi while true; do monitor_jboss start if [ $? 
= $OCF_SUCCESS ]; then break fi ocf_log info "start_jboss[$RESOURCE_NAME]: retry monitor_jboss" sleep 3 done return $OCF_SUCCESS } stop_jboss() { ocf_log info "Stopping JBoss[$RESOURCE_NAME]" if [ "$JBOSS_USER" = root ]; then "$JBOSS_HOME/bin/shutdown.sh" $SHUTDOWN_OPTS -S \ >> "$CONSOLE" 2>&1 & else su - -s /bin/bash "$JBOSS_USER" \ -c "export JAVA_HOME=${JAVA_HOME}; \ export JBOSS_HOME=${JBOSS_HOME}; \ $JBOSS_HOME/bin/shutdown.sh $SHUTDOWN_OPTS -S" \ >> "$CONSOLE" 2>&1 & fi lapse_sec=0 while pgrep -f "$PSTRING" > /dev/null; do sleep 1 lapse_sec=`expr $lapse_sec + 1` ocf_log info "stop_jboss[$RESOURCE_NAME]: stop NORM $lapse_sec/$SHUTDOWN_TIMEOUT" if [ $lapse_sec -ge $SHUTDOWN_TIMEOUT ]; then break fi done if pgrep -f "$PSTRING" > /dev/null; then ocf_log info "stop_jboss[$RESOURCE_NAME]: output a JVM thread dump to $CONSOLE" pkill -QUIT -f "$PSTRING" lapse_sec=0 while true; do sleep 1 lapse_sec=`expr $lapse_sec + 1` ocf_log info "stop_jboss[$RESOURCE_NAME]: kill jboss by SIGTERM ($lapse_sec/$KILL_TIMEOUT)" pkill -TERM -f "$PSTRING" if pgrep -f "$PSTRING" > /dev/null; then if [ $lapse_sec -ge $KILL_TIMEOUT ]; then break fi else break fi done fi # If the JBoss process hangs, JBoss RA waits $SHUTDOWN_TIMEOUT # seconds and tries kill TERM and QUIT for $KILL_TIMEOUT seconds. # The stop timeout of RA should be # longer than $SHUTDOWN_TIMEOUT + $KILL_TIMEOUT. lapse_sec=0 while pgrep -f "$PSTRING" > /dev/null; do sleep 1 lapse_sec=`expr $lapse_sec + 1` ocf_log info "stop_jboss[$RESOURCE_NAME]: kill jboss by SIGKILL ($lapse_sec/@@@)" pkill -KILL -f "$PSTRING" done + + if ocf_is_true $ROTATELOG_FLG; then + rm -f "${CONSOLE}" + fi + return $OCF_SUCCESS } status_jboss() { if ! pgrep -f "$PSTRING" > /dev/null; then echo "JBoss process[$RESOURCE_NAME] is not running." return $OCF_NOT_RUNNING fi if isrunning_jboss; then echo "JBoss[$RESOURCE_NAME] is running." return $OCF_SUCCESS else echo "JBoss process[$RESOURCE_NAME] is running." 
echo "But, we can not access JBoss web service." return $OCF_NOT_RUNNING fi } metadata_jboss() { cat < 1.0 Resource script for Jboss. It manages a Jboss instance as an HA resource. Manages a JBoss application server instance The name of the resource. Defaults to the name of the resource instance. The name of the resource A destination of the log of jboss run and shutdown script. jboss log path Timeout for jboss bin/shutdown.sh. We wait for this timeout to expire, then send the TERM and QUIT signals. Finally, the KILL signal is used to terminate the jboss process. You should set the timeout for the stop operation to a value bigger than the sum of the timeout parameters. See also kill_timeout. shutdown timeout If bin/shutdown.sh doesn't stop the jboss process, then we send it TERM and QUIT signals, intermittently and once a second. After this timeout expires, if the process is still live, we use the KILL signal. See also shutdown_timeout. stop by signal timeout A user name to start a JBoss. A user name to start a resource. URL to test in the monitor operation. URL to test in the monitor operation. Home directory of Java. Defaults to the environment variable JAVA_HOME. If it is not set, then define this parameter. Home directory of Java. Java options. Java options. Home directory of Jboss. Home directory of Jboss. With this string heartbeat matches for the right process to kill. pkill/pgrep search string Start options to start Jboss with, defaults are from the Jboss-Doku. options for jboss run.sh Stop options to stop Jboss with. options for jboss shutdown.sh + + +Rotate console log flag. + +Rotate console log flag + + + + + +console log rotation value (default is 86400 seconds). + +console log rotation value (default is 86400 seconds) + + + + + +Rotate console log suffix. 
+ +Rotate console log suffix + + + END return $OCF_SUCCESS } validate_all_jboss() { ocf_log info "validate_all_jboss[$RESOURCE_NAME]" return $OCF_SUCCESS } COMMAND=$1 RESOURCE_NAME="${OCF_RESKEY_resource_name-${OCF_RESOURCE_INSTANCE}}" CONSOLE="${OCF_RESKEY_console-/var/log/${RESOURCE_NAME}.log}" SHUTDOWN_TIMEOUT="${OCF_RESKEY_shutdown_timeout-5}" KILL_TIMEOUT="${OCF_RESKEY_kill_timeout-10}" JBOSS_USER="${OCF_RESKEY_user-root}" STATUSURL="${OCF_RESKEY_statusurl-http://127.0.0.1:8080}" PSTRING="${OCF_RESKEY_pstring-java -Dprogram.name=run.sh}" RUN_OPTS="${OCF_RESKEY_run_opts--c default -l lpg4j}" SHUTDOWN_OPTS="${OCF_RESKEY_shutdown_opts--s 127.0.0.1:1099}" +ROTATELOG_FLG="${OCF_RESKEY_rotate_consolelog-false}" +ROTATEVALUE="${OCF_RESKEY_rotate_value-86400}" +ROTATELOG_SUFFIX="${OCF_RESKEY_rotate_logsuffix-.%F}" if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi if [ "$COMMAND" = "meta-data" ]; then metadata_jboss exit $OCF_SUCCESS fi if [ "$COMMAND" = "help" -o "$COMMAND" = "usage" ]; then usage exit $OCF_SUCCESS fi # test if these two are set and if directories exist and if the # required scripts/binaries exist; use OCF_ERR_INSTALLED JAVA_HOME="${OCF_RESKEY_java_home-${JAVA_HOME}}" JAVA_OPTS="${OCF_RESKEY_java_opts}" JBOSS_HOME="${OCF_RESKEY_jboss_home}" LSB_STATUS_STOPPED=3 if [ ! -d "$JAVA_HOME" -o ! -d "$JBOSS_HOME" ]; then case $COMMAND in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; esac ocf_log err "JAVA_HOME or JBOSS_HOME does not exist." exit $OCF_ERR_INSTALLED fi export JAVA_HOME JAVA_OPTS JBOSS_HOME JAVA=${JAVA_HOME}/bin/java if [ ! -x "$JAVA" ]; then case $COMMAND in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; esac ocf_log err "java command does not exist." exit $OCF_ERR_INSTALLED fi case "$COMMAND" in start) start_jboss func_status=$? exit $func_status ;; stop) stop_jboss func_status=$? exit $func_status ;; status) status_jboss exit $? 
;; monitor) monitor_jboss func_status=$? exit $func_status ;; validate-all) validate_all_jboss exit $? ;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/mysql b/heartbeat/mysql index 1ca9281ef..43d155de8 100755 --- a/heartbeat/mysql +++ b/heartbeat/mysql @@ -1,1084 +1,1260 @@ #!/bin/sh # # # MySQL # # Description: Manages a MySQL database as Linux-HA resource # # Authors: Alan Robertson: DB2 Script # Jakub Janczak: rewrite as MySQL # Andrew Beekhof: cleanup and import # Sebastian Reitenbach: add OpenBSD defaults, more cleanup # Narayan Newton: add Gentoo/Debian defaults # Marian Marinov, Florian Haas: add replication capability +# Yves Trudeau, Baron Schwartz: add VIP support and improve replication # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # (c) 2002-2005 International Business Machines, Inc. # 2005-2010 Linux-HA contributors # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 mysql # # See usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_binary # OCF_RESKEY_client_binary # OCF_RESKEY_config # OCF_RESKEY_datadir # OCF_RESKEY_user # OCF_RESKEY_group # OCF_RESKEY_test_table # OCF_RESKEY_test_user # OCF_RESKEY_test_passwd # OCF_RESKEY_enable_creation # OCF_RESKEY_additional_parameters # OCF_RESKEY_log # OCF_RESKEY_pid # OCF_RESKEY_socket +# OCF_RESKEY_replication_user +# OCF_RESKEY_replication_passwd +# OCF_RESKEY_replication_port +# OCF_RESKEY_max_slave_lag +# OCF_RESKEY_evict_outdated_slaves +# OCF_RESKEY_reader_attribute + ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### # Fill in some defaults if no values are specified HOSTOS=`uname` if [ "X${HOSTOS}" = "XOpenBSD" ];then OCF_RESKEY_binary_default="/usr/local/bin/mysqld_safe" OCF_RESKEY_config_default="/etc/my.cnf" OCF_RESKEY_datadir_default="/var/mysql" OCF_RESKEY_user_default="_mysql" OCF_RESKEY_group_default="_mysql" OCF_RESKEY_log_default="/var/log/mysqld.log" OCF_RESKEY_pid_default="/var/mysql/mysqld.pid" OCF_RESKEY_socket_default="/var/run/mysql/mysql.sock" else OCF_RESKEY_binary_default="/usr/bin/safe_mysqld" OCF_RESKEY_config_default="/etc/my.cnf" OCF_RESKEY_datadir_default="/var/lib/mysql" OCF_RESKEY_user_default="mysql" OCF_RESKEY_group_default="mysql" OCF_RESKEY_log_default="/var/log/mysqld.log" OCF_RESKEY_pid_default="/var/run/mysql/mysqld.pid" OCF_RESKEY_socket_default="/var/lib/mysql/mysql.sock" fi OCF_RESKEY_client_binary_default="mysql" OCF_RESKEY_test_user_default="root" OCF_RESKEY_test_table_default="mysql.user" OCF_RESKEY_test_passwd_default="" OCF_RESKEY_enable_creation_default=0 OCF_RESKEY_additional_parameters_default="" OCF_RESKEY_replication_port_default="3306" OCF_RESKEY_max_slave_lag_default="3600" OCF_RESKEY_evict_outdated_slaves_default="false" +OCF_RESKEY_reader_attribute_default="readable" : ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} MYSQL_BINDIR=`dirname ${OCF_RESKEY_binary}` : ${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}} : ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} : ${OCF_RESKEY_datadir=${OCF_RESKEY_datadir_default}} : ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} : ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} : ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} : ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} : ${OCF_RESKEY_socket=${OCF_RESKEY_socket_default}} : ${OCF_RESKEY_test_user=${OCF_RESKEY_test_user_default}} : ${OCF_RESKEY_test_table=${OCF_RESKEY_test_table_default}} : 
${OCF_RESKEY_test_passwd=${OCF_RESKEY_test_passwd_default}} : ${OCF_RESKEY_enable_creation=${OCF_RESKEY_enable_creation_default}} : ${OCF_RESKEY_additional_parameters=${OCF_RESKEY_additional_parameters_default}} : ${OCF_RESKEY_replication_user=${OCF_RESKEY_replication_user_default}} : ${OCF_RESKEY_replication_passwd=${OCF_RESKEY_replication_passwd_default}} : ${OCF_RESKEY_replication_port=${OCF_RESKEY_replication_port_default}} : ${OCF_RESKEY_max_slave_lag=${OCF_RESKEY_max_slave_lag_default}} : ${OCF_RESKEY_evict_outdated_slaves=${OCF_RESKEY_evict_outdated_slaves_default}} +: ${OCF_RESKEY_reader_attribute=${OCF_RESKEY_reader_attribute_default}} + +####################################################################### +# Convenience variables + +MYSQL=$OCF_RESKEY_client_binary +MYSQL_OPTIONS_LOCAL="-S $OCF_RESKEY_socket --connect_timeout=10" +MYSQL_OPTIONS_REPL="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_replication_user --password=$OCF_RESKEY_replication_passwd" +MYSQL_OPTIONS_TEST="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd" +MYSQL_TOO_MANY_CONN_ERR=1040 + +CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot " +HOSTNAME=`uname -n` +CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $HOSTNAME " +INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'` +CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s mysql_replication" + ####################################################################### usage() { cat < 1.0 -Resource script for MySQL. +Resource script for MySQL. May manage a standalone MySQL database, a clone set with externally managed replication, or a complete master/slave replication setup. + +While managing replication, the default behavior is to use uname -n +values in the change master to command. Other IPs can be specified +manually by adding a node attribute \${INSTANCE_ATTR_NAME}_mysql_master_IP +giving the IP to use for replication. 
For example, if the mysql primitive +you are using is p_mysql, the attribute to set will be +p_mysql_mysql_master_IP. Manages a MySQL database instance Location of the MySQL server binary MySQL server binary Location of the MySQL client binary MySQL client binary Configuration file MySQL config Directory containing databases MySQL datadir User running MySQL daemon MySQL user Group running MySQL daemon (for logfile and directory permissions) MySQL group The logfile to be used for mysqld. MySQL log file The pidfile to be used for mysqld. MySQL pid file The socket to be used for mysqld. MySQL socket Table to be tested in monitor statement (in database.table notation) MySQL test table -MySQL test user +MySQL test user, must have select privilege on test_table MySQL test user MySQL test user password MySQL test user password If the MySQL database does not exist, it will be created Create the database if it does not exist Additional parameters which are passed to the mysqld on startup. (e.g. --skip-external-locking or --skip-grant-tables) Additional parameters to pass to mysqld MySQL replication user. This user is used for starting and stopping MySQL replication, for setting and resetting the master host, and for setting and unsetting read-only mode. Because of that, this user must have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, and PROCESS -privileges on all nodes within the cluster. +privileges on all nodes within the cluster. Mandatory if you define +a master-slave resource. MySQL replication user MySQL replication password. Used for replication client and slave. +Mandatory if you define a master-slave resource. MySQL replication user password The port on which the Master MySQL instance is listening. MySQL replication port The maximum number of seconds a replication slave is allowed to lag behind its master. Do not set this to zero. What the cluster manager does in case a slave exceeds this maximum lag is determined by the evict_outdated_slaves parameter. 
Maximum time (seconds) a MySQL slave is allowed to lag behind a master If set to true, any slave which is more than max_slave_lag seconds behind the master has its MySQL instance shut down. If this parameter is set to false in a primitive or clone resource, it is simply ignored. If set to false in a master/slave resource, then exceeding the maximum slave lag will merely push down the master preference so the lagging slave is never promoted to the new master. Determines whether to shut down badly lagging slaves + + +An attribute that the RA can manage to specify whether a node +can be read from. This node attribute will be 1 if it's fine to +read from the node, and 0 otherwise (for example, when a slave +has lagged too far behind the master). + +A typical example for the use of this attribute would be to tie +a set of IP addresses to MySQL slaves that can be read from. + +This parameter is only meaningful in master/slave set configurations. + +Sets the node attribute that determines +whether a node is usable for clients to read from. + + END } -####################################################################### -# Convenience variables - -MYSQL=$OCF_RESKEY_client_binary -MYSQL_OPTIONS_LOCAL="-S $OCF_RESKEY_socket --connect_timeout=10" -MYSQL_OPTIONS_REPL="--user=$OCF_RESKEY_replication_user --password=$OCF_RESKEY_replication_passwd" - -CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot " -HOSTNAME=`uname -n` -CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $HOSTNAME -l forever" -INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'` - -####################################################################### # Convenience functions set_read_only() { # Sets or unsets read-only mode. Accepts one boolean as its # optional argument. If invoked without any arguments, defaults to # enabling read only mode. Should only be set in master/slave # setups. # Returns $OCF_SUCCESS if the operation succeeds, or # $OCF_ERR_GENERIC if it fails. 
local ro_val if ocf_is_true $1; then ro_val="on" else ro_val="off" fi - local mysql_options - mysql_options="$MYSQL_OPTIONS_LOCAL" - if [ -n $OCF_RESKEY_replication_user ]; then - mysql_options="$mysql_options $MYSQL_OPTIONS_REPL" - fi - ocf_run $MYSQL $mysql_options \ + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "SET GLOBAL read_only=${ro_val}" } get_read_only() { # Check if read-only is set - local mysql_options local read_only_state - mysql_options="$MYSQL_OPTIONS_LOCAL" - if [ -n $OCF_RESKEY_replication_user ]; then - mysql_options="$mysql_options $MYSQL_OPTIONS_REPL" - fi - - read_only_state=`$MYSQL $mysql_options \ + read_only_state=`$MYSQL $MYSQL_OPTIONS_REPL \ -e "SHOW VARIABLES" | grep read_only | awk '{print $2}'` - + if [ "$read_only_state" = "ON" ]; then return 0 else return 1 fi } is_slave() { # Determine whether the machine is currently running as a MySQL # slave, as determined per SHOW SLAVE STATUS. Returns 1 if SHOW # SLAVE STATUS creates an empty result set, 0 otherwise. local rc local tmpfile - local mysql_options # Check whether this machine should be slave if ! ocf_is_ms || ! get_read_only; then - return 1; + return 1 fi + + get_slave_info + rc=$? - tmpfile=`mktemp ${HA_RSCTMP}/is_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` - - mysql_options="$MYSQL_OPTIONS_LOCAL $MYSQL_OPTIONS_REPL" - - $MYSQL $mysql_options \ - -e 'SHOW SLAVE STATUS\G' > $tmpfile - - # "SHOW SLAVE STATUS" returns an empty set if instance is not a - # replication slave - if [ -s $tmpfile ]; then - rm -f $tmpfile - return 0 + if [ $rc -eq 0 ]; then + # show slave status is not empty + # Is there a master_log_file defined? 
(master_log_file is deleted + # by reset slave) + if [ "$master_log_file" ]; then + return 0 + else + return 1 + fi + else + # "SHOW SLAVE STATUS" returns an empty set if instance is not a + # replication slave + return 1 fi - - rm -f $tmpfile - return 1 + } parse_slave_info() { # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 } get_slave_info() { # Warning: this sets $tmpfile and LEAVE this file! You must delete it after use! local mysql_options + + if [ "$master_log_file" -a "$master_host" ]; then + # variables are already defined, get_slave_info has been run before + return $OCF_SUCCESS + else + tmpfile=`mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` - tmpfile=`mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` - - mysql_options="$MYSQL_OPTIONS_LOCAL $MYSQL_OPTIONS_REPL" - - $MYSQL $mysql_options \ + $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW SLAVE STATUS\G' > $tmpfile - if [ -s $tmpfile ]; then - master_host=`parse_slave_info Master_Host $tmpfile` - master_user=`parse_slave_info Master_User $tmpfile` - master_port=`parse_slave_info Master_Port $tmpfile` - master_log_file=`parse_slave_info Master_Log_File $tmpfile` - master_log_pos=`parse_slave_info Read_Master_Log_Pos $tmpfile` - slave_sql=`parse_slave_info Slave_SQL_Running $tmpfile` - slave_io=`parse_slave_info Slave_IO_Running $tmpfile` - last_errno=`parse_slave_info Last_Errno $tmpfile` - secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile` + if [ -s $tmpfile ]; then + master_host=`parse_slave_info Master_Host $tmpfile` + master_user=`parse_slave_info Master_User $tmpfile` + master_port=`parse_slave_info Master_Port $tmpfile` + master_log_file=`parse_slave_info Master_Log_File $tmpfile` + master_log_pos=`parse_slave_info Read_Master_Log_Pos $tmpfile` + slave_sql=`parse_slave_info Slave_SQL_Running $tmpfile` + slave_io=`parse_slave_info Slave_IO_Running $tmpfile` + last_errno=`parse_slave_info Last_Errno $tmpfile` +
secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile` + ocf_log debug "MySQL instance running as a replication slave" + else + # Instance produced an empty "SHOW SLAVE STATUS" output -- + # instance is not a slave + ocf_log err "check_slave invoked on an instance that is not a replication slave." + return $OCF_ERR_GENERIC + fi - ocf_log debug "MySQL instance running as a replication slave" - else - # Instance produced an empty "SHOW SLAVE STATUS" output -- - # instance is not a slave - ocf_log err "check_slave invoked on an instance that is not a replication slave." - return $OCF_ERR_GENERIC + return $OCF_SUCCESS fi - - return $OCF_SUCCESS } check_slave() { # Checks slave status - local rc + local rc new_master get_slave_info rc=$? if [ $rc -eq 0 ]; then - if [ $last_errno -ne 0 ]; then + # Did we receive an error other than max_connections? + if [ $last_errno -ne 0 -a $last_errno -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then # Whoa. Replication ran into an error. This slave has # diverged from its master. Make sure this resource # doesn't restart in place. ocf_log err "MySQL instance configured for replication, but replication has failed." ocf_log err "See $tmpfile for details" - exit $OCF_ERR_INSTALLED + + # Just pull the reader VIP away, killing MySQL here would be pretty evil + # on a loaded server + + set_reader_attr 0 + exit $OCF_SUCCESS + + fi + + # If we got max_connections, let's remove the vip + if [ $last_errno -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then + set_reader_attr 0 + exit $OCF_SUCCESS fi if [ "$slave_io" != 'Yes' ]; then # Not necessarily a bad thing. The master may have # temporarily shut down, and the slave may just be # reconnecting. A warning can't hurt, though. ocf_log warn "MySQL Slave IO threads currently not running." 
+ + # Sanity check, are we at least on the right master + new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1` + + if [ "$master_host" != "$new_master" ]; then + # Not pointing to the right master, not good, removing the VIPs + set_reader_attr 0 + + exit $OCF_SUCCESS + fi + fi if [ "$slave_sql" != 'Yes' ]; then # We don't have a replication SQL thread running. Not a - # good thing. Try to recoved by restarting the resource in - # place. + # good thing. Try to recover by restarting the SQL thread + # and remove reader vip. Prevent MySQL restart. ocf_log err "MySQL Slave SQL threads currently not running." ocf_log err "See $tmpfile for details" - exit $OCF_ERR_GENERIC + + # Remove reader vip + set_reader_attr 0 + + # try to restart slave + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE" + + # Return success to prevent a restart + exit $OCF_SUCCESS fi if ocf_is_true $OCF_RESKEY_evict_outdated_slaves; then # We're supposed to bail out if we lag too far # behind. Let's check our lag. if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then ocf_log err "MySQL Slave is $secs_behind seconds behind master (allowed maximum: $OCF_RESKEY_max_slave_lag)." ocf_log err "See $tmpfile for details" + + # Remove reader vip + set_reader_attr 0 + exit $OCF_ERR_INSTALLED fi - elif ocf_is_ms; then + elif ocf_is_ms; then # Even if we're not set to evict lagging slaves, we can # still use the seconds behind master value to set our # master preference.
local master_pref master_pref=$((${OCF_RESKEY_max_slave_lag}-${secs_behind})) if [ $master_pref -lt 0 ]; then # Sanitize a below-zero preference to just zero master_pref=0 fi $CRM_MASTER -v $master_pref fi + # is the slave ok to have a VIP on it + if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then + set_reader_attr 0 + else + set_reader_attr 1 + fi + ocf_log debug "MySQL instance running as a replication slave" rm -f $tmpfile else # Instance produced an empty "SHOW SLAVE STATUS" output -- # instance is not a slave + # TODO: Needs to handle when get_slave_info will return too many connections error rm -f $tmpfile ocf_log err "check_slave invoked on an instance that is not a replication slave." exit $OCF_ERR_GENERIC fi } set_master() { - local new_master_host master_log_file master_log_pos + local new_master master_log_file master_log_pos local master_params - new_master_host=$1 + new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1` # Keep replication position get_slave_info - if [ "$master_log_file" -a "$new_master_host" = "$master_host" ]; then -# master_params=", MASTER_LOG_FILE='$master_log_file', \ -# MASTER_LOG_POS=$master_log_pos" + if [ "$master_log_file" -a "$new_master" = "$master_host" ]; then + # master_params=", MASTER_LOG_FILE='$master_log_file', \ + # MASTER_LOG_POS=$master_log_pos" ocf_log info "Kept master pos for $master_host : $master_log_file:$master_log_pos" rm -f $tmpfile return else - master_log_file=`$CRM_ATTR -n $new_master_host-log-file-${INSTANCE_ATTR_NAME} -q -G` - master_log_pos=`$CRM_ATTR -n $new_master_host-log-pos-${INSTANCE_ATTR_NAME} -q -G` + master_log_file=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f2` + master_log_pos=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f3` if [ -n "$master_log_file" -a -n "$master_log_pos" ]; then master_params=", MASTER_LOG_FILE='$master_log_file', \ - MASTER_LOG_POS=$master_log_pos" - ocf_log info "Restored master pos for $new_master_host : $master_log_file:$master_log_pos" + 
MASTER_LOG_POS=$master_log_pos" + ocf_log info "Restored master pos for $new_master : $master_log_file:$master_log_pos" fi fi # Informs the MySQL server of the master to replicate # from. Accepts one mandatory argument which must contain the host # name of the new master host. The master must either be unchanged # from the laste master the slave replicated from, or freshly # reset with RESET MASTER. - ocf_run $MYSQL $MYSQL_OPTIONS_LOCAL $MYSQL_OPTIONS_REPL \ - -e "CHANGE MASTER TO MASTER_HOST='$new_master_host', \ - MASTER_USER='$OCF_RESKEY_replication_user', \ - MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' $master_params" - + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "CHANGE MASTER TO MASTER_HOST='$new_master', \ + MASTER_USER='$OCF_RESKEY_replication_user', \ + MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' $master_params" rm -f $tmpfile } unset_master(){ # Instructs the MySQL server to stop replicating from a master # host. # If we're currently not configured to be replicating from any # host, then there's nothing to do. But we do log a warning as # no-one but the CRM should be touching the MySQL master/slave # configuration. if ! 
is_slave; then ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave" return $OCF_SUCCESS fi - local mysql_options - mysql_options="$MYSQL_OPTIONS_LOCAL $MYSQL_OPTIONS_REPL" - local tmpfile tmpfile=`mktemp ${HA_RSCTMP}/unset_master.${OCF_RESOURCE_INSTANCE}.XXXXXX` - # First, stop the slave I/O thread and wait for relay log + # At this point, the master is read only so there should not be much binlogs to transfer + # Let's wait for the last bits + while true; do + $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW PROCESSLIST\G' > $tmpfile + if grep -i 'Waiting for master to send event' $tmpfile >/dev/null; then + ocf_log info "MySQL slave has finished reading master binary log" + break + fi + if grep -i 'Reconnecting after a failed master event read' $tmpfile >/dev/null; then + ocf_log info "Master is down, no more binary logs to come" + break + fi + if grep -i 'Connecting to master' $tmpfile >/dev/null; then + ocf_log info "Master is down, no more binary logs to come" + break + fi + if ! grep 'system user' $tmpfile >/dev/null; then + ocf_log info "Slave is not running - not waiting to finish" + break + fi + + sleep 1 + done + + # Now, stop the slave I/O thread and wait for relay log # processing to complete - ocf_run $MYSQL $mysql_options \ + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE IO_THREAD" if [ $? -gt 0 ]; then ocf_log err "Error stopping slave IO thread" exit $OCF_ERR_GENERIC fi while true; do - $MYSQL $mysql_options \ + $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW PROCESSLIST\G' > $tmpfile - if grep '[Hh]as read all relay log' $tmpfile >/dev/null; then + if grep -i 'Has read all relay log' $tmpfile >/dev/null; then ocf_log info "MySQL slave has finished processing relay log" break fi if ! 
grep -q 'system user' $tmpfile; then ocf_log info "Slave not runnig - not waiting to finish" break fi ocf_log info "Waiting for MySQL slave to finish processing relay log" sleep 1 - done + done rm -f $tmpfile # Now, stop all slave activity and unset the master host - ocf_run $MYSQL $mysql_options \ + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE" if [ $? -gt 0 ]; then ocf_log err "Error stopping rest slave threads" exit $OCF_ERR_GENERIC fi - #Save current state - get_slave_info - $CRM_ATTR -n $master_host-log-file-${INSTANCE_ATTR_NAME} -v $master_log_file - $CRM_ATTR -n $master_host-log-pos-${INSTANCE_ATTR_NAME} -v $master_log_pos - rm -f $tmpfile - - ocf_run $MYSQL $mysql_options \ - -e "CHANGE MASTER TO MASTER_HOST=''" + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "RESET SLAVE;" if [ $? -gt 0 ]; then - ocf_log err "Failed to set master" + ocf_log err "Failed to reset slave" exit $OCF_ERR_GENERIC fi } -# Start replication as slave. Master hostname as parameter +# Start replication as slave start_slave() { - local master_host - - master_host="$1" - - # Remove state attributes - it will be invalid after START SLAVE - $CRM_ATTR -n $master_host-log-file-${INSTANCE_ATTR_NAME} -D - $CRM_ATTR -n $master_host-log-pos-${INSTANCE_ATTR_NAME} -D - ocf_run $MYSQL $MYSQL_OPTIONS_LOCAL $MYSQL_OPTIONS_REPL \ + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "START SLAVE" } +# Set the attribute controlling the readers VIP +set_reader_attr() { + local curr_attr_value + + curr_attr_value=$(get_reader_attr) + + if [ "$curr_attr_value" -ne "$1" ]; then + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1 + fi + +} + +# get the attribute controlling the readers VIP +get_reader_attr() { + local attr_value + local rc + + attr_value=`$CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q` + rc=$? 
+ if [ "$rc" -eq "0" ]; then + echo $attr_value + else + echo -1 + fi + +} + +# Stores data for MASTER STATUS from MySQL +update_data_master_status() { + + master_status_file="${HA_RSCTMP}/master_status.${OCF_RESOURCE_INSTANCE}" + + $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW MASTER STATUS\G" > $master_status_file +} + + +# Returns the specified value from the stored copy of SHOW MASTER STATUS. +# Should be called after update_data_master_status, which writes $master_status_file +# Arguments: +# $1 The value to get. +get_master_status() { + awk -v var="$1" '$1 == var ":" {print substr($0, index($0, ":") + 2)}' "$master_status_file" +} + + +# Determines what IP address is attached to the current host. The output of the +# crm_attribute command looks like this: +# scope=nodes name=IP value=10.2.2.161 +# If the ${INSTANCE_ATTR_NAME}_mysql_master_IP node attribute is not defined, fallback is to uname -n +# The ${INSTANCE_ATTR_NAME}_mysql_master_IP is the IP address that will be used for the +# change master to command. +get_local_ip() { + local IP + IP=`$CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G` + if [ ! $? -eq 0 ]; then + uname -n + else + echo $IP + fi +} + ####################################################################### # Functions invoked by resource manager actions mysql_validate() { check_binary $OCF_RESKEY_binary check_binary $OCF_RESKEY_client_binary if [ ! -f $OCF_RESKEY_config ]; then ocf_log err "Config $OCF_RESKEY_config doesn't exist"; return $OCF_ERR_INSTALLED; fi if [ ! -d $OCF_RESKEY_datadir ]; then ocf_log err "Datadir $OCF_RESKEY_datadir doesn't exist"; return $OCF_ERR_INSTALLED; fi getent passwd $OCF_RESKEY_user >/dev/null 2>&1 if [ ! $? -eq 0 ]; then ocf_log err "User $OCF_RESKEY_user doesn't exit"; return $OCF_ERR_INSTALLED; fi getent group $OCF_RESKEY_group >/dev/null 2>&1 if [ ! $?
-eq 0 ]; then ocf_log err "Group $OCF_RESKEY_group doesn't exist"; return $OCF_ERR_INSTALLED; fi true } mysql_status() { - # Set the log level of the error message (default:err) - local loglevel - loglevel=${1:-err} - if [ ! -e $OCF_RESKEY_pid ]; then - ocf_log $loglevel "MySQL is not running" + ocf_log $1 "MySQL is not running" return $OCF_NOT_RUNNING; fi pid=`cat $OCF_RESKEY_pid`; if [ -d /proc -a -d /proc/1 ]; then [ "u$pid" != "u" -a -d /proc/$pid ] else kill -s 0 $pid >/dev/null 2>&1 fi if [ $? -eq 0 ]; then return $OCF_SUCCESS; else - ocf_log $loglevel "MySQL not running: removing old PID file" + ocf_log $1 "MySQL not running: removing old PID file" rm -f $OCF_RESKEY_pid return $OCF_NOT_RUNNING; fi } mysql_monitor() { local rc local status_loglevel="err" # Set loglevel to info during probe if ocf_is_probe; then status_loglevel="info" fi - + mysql_status $status_loglevel + rc=$? + # TODO: check max connections error + # If status returned an error, return that immediately if [ $rc -ne $OCF_SUCCESS ]; then return $rc fi if [ $OCF_CHECK_LEVEL -gt 0 -a -n "$OCF_RESKEY_test_table" ]; then # Check if this instance is configured as a slave, and if so # check slave status if is_slave; then check_slave fi - local mysql_options - mysql_options="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd" - # Check for test table - ocf_run -q $MYSQL $mysql_options \ + ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \ -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table" rc=$? if [ $rc -ne 0 ]; then - ocf_log err "Failed to select from $OCF_RESKEY_test_table"; + ocf_log err "Failed to select from $test_table"; return $OCF_ERR_GENERIC; fi fi if ocf_is_ms && ! 
get_read_only; then ocf_log debug "MySQL monitor succeeded (master)"; return $OCF_RUNNING_MASTER else ocf_log debug "MySQL monitor succeeded"; return $OCF_SUCCESS fi } mysql_start() { + if ocf_is_ms; then + # Initialize the ReaderVIP attribute, monitor will enable it + set_reader_attr 0 + fi + mysql_status info if [ $? = $OCF_SUCCESS ]; then ocf_log info "MySQL already running" return $OCF_SUCCESS fi touch $OCF_RESKEY_log chown $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_log chmod 0640 $OCF_RESKEY_log [ -x /sbin/restorecon ] && /sbin/restorecon $OCF_RESKEY_log if ocf_is_true "$OCF_RESKEY_enable_creation" && [ ! -d $OCF_RESKEY_datadir/mysql ] ; then ocf_log info "Initializing MySQL database: " $MYSQL_BINDIR/mysql_install_db --datadir=$OCF_RESKEY_datadir rc=$? if [ $rc -ne 0 ] ; then ocf_log err "Initialization failed: $rc"; exit $OCF_ERR_GENERIC fi chown -R $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_datadir fi pid_dir=`dirname $OCF_RESKEY_pid` if [ ! -d $pid_dir ] ; then ocf_log info "Creating PID dir: $pid_dir" mkdir -p $pid_dir chown $OCF_RESKEY_user:$OCF_RESKEY_group $pid_dir fi socket_dir=`dirname $OCF_RESKEY_socket` if [ ! -d $socket_dir ] ; then ocf_log info "Creating socket dir: $socket_dir" mkdir -p $socket_dir chown $OCF_RESKEY_user:$OCF_RESKEY_group $socket_dir fi # Regardless of whether we just created the directory or it # already existed, check whether it is writable by the configured # user for dir in $pid_dir $socket_dir; do - if ! su -s /bin/sh - $OCF_RESKEY_user -c "test -w $dir"; then - ocf_log err "Directory $dir is not writable by $OCF_RESKEY_user" - exit $OCF_ERR_PERM; - fi + if ! 
su -s /bin/sh - $OCF_RESKEY_user -c "test -w $dir"; then + ocf_log err "Directory $dir is not writable by $OCF_RESKEY_user" + exit $OCF_ERR_PERM; + fi done # Uncomment to perform permission clensing # - not convinced this should be enabled by default # #chmod 0755 $OCF_RESKEY_datadir #chown -R $OCF_RESKEY_user $OCF_RESKEY_datadir #chgrp -R $OCF_RESKEY_group $OCF_RESKEY_datadir mysql_extra_params= if ocf_is_ms; then mysql_extra_params="--skip-slave-start" fi ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ - --pid-file=$OCF_RESKEY_pid \ - --socket=$OCF_RESKEY_socket \ - --datadir=$OCF_RESKEY_datadir \ - --user=$OCF_RESKEY_user $OCF_RESKEY_additional_parameters \ - $mysql_extra_params >/dev/null 2>&1 & + --pid-file=$OCF_RESKEY_pid \ + --socket=$OCF_RESKEY_socket \ + --datadir=$OCF_RESKEY_datadir \ + --user=$OCF_RESKEY_user $OCF_RESKEY_additional_parameters \ + $mysql_extra_params >/dev/null 2>&1 & rc=$? if [ $rc != 0 ]; then ocf_log err "MySQL start command failed: $rc" return $rc fi # Spin waiting for the server to come up. # Let the CRM/LRM time us out if required. start_wait=1 while [ $start_wait = 1 ]; do mysql_status info rc=$? if [ $rc = $OCF_SUCCESS ]; then start_wait=0 - elif [ $rc != $OCF_NOT_RUNNING ]; then - ocf_log err "MySQL start failed: $rc" + elif [ $rc != $OCF_NOT_RUNNING ]; then + ocf_log info "MySQL start failed: $rc" return $rc fi sleep 2 done if ocf_is_ms; then # We're configured as a stateful resource. We must start as # slave by default. At this point we don't know if the CRM has # already promoted a master. So, we simply start in read only # mode. set_read_only on - + # Now, let's see whether there is a master. We might be a new # node that is just joining the cluster, and the CRM may have # promoted a master before. master_host=`echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " "` if [ "$master_host" -a "$master_host" != ${HOSTNAME} ]; then ocf_log info "Changing MySQL configuration to replicate from $master_host." 
- set_master $master_host - ocf_run $MYSQL $MYSQL_OPTIONS_LOCAL $MYSQL_OPTIONS_REPL \ - -e 'START SLAVE' - if [ $? -ne 0 ]; then - ocf_log err "Failed to start slave" - return $OCF_ERR_GENERIC - fi - else - ocf_log info "No MySQL master present, clearing replication state" - unset_master - fi - - master_host=`echo $OCF_RESKEY_CRM_meta_notify_master_uname` - if [ "$master_host" -a "$master_host" != ${HOSTNAME} ]; then - ocf_log info "Changing MySQL configuration to replicate from $master_host." - set_master $master_host - start_slave $master_host + set_master + start_slave if [ $? -ne 0 ]; then ocf_log err "Failed to start slave" return $OCF_ERR_GENERIC fi else ocf_log info "No MySQL master present - clearing replication state" unset_master fi # We also need to set a master preference, otherwise Pacemaker # won't ever promote us in the absence of any explicit # preference set by the administrator. We choose a low # greater-than-zero preference. $CRM_MASTER -v 1 + fi # Initial monitor action if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" -a -n "$OCF_RESKEY_test_passwd" ]; then OCF_CHECK_LEVEL=10 fi mysql_monitor rc=$? if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then ocf_log err "Failed initial monitor action" return $rc fi - + ocf_log info "MySQL started" return $OCF_SUCCESS } mysql_stop() { if ocf_is_ms; then # clear preference for becoming master $CRM_MASTER -D + + # Remove VIP capability + set_reader_attr 0 fi - mysql_status info - rc=$? - if [ $rc = $OCF_NOT_RUNNING ]; then - return $OCF_SUCCESS + if [ ! -f $OCF_RESKEY_pid ]; then + ocf_log info "MySQL is not running" + return $OCF_SUCCESS fi pid=`cat $OCF_RESKEY_pid 2> /dev/null ` /bin/kill $pid > /dev/null rc=$? 
if [ $rc != 0 ]; then ocf_log err "MySQL couldn't be stopped" return $OCF_ERR_GENERIC fi - # stop waiting shutdown_timeout=15 if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5)) fi count=0 - while [ $count -lt $shutdown_timeout ]; do + while [ $count -lt $shutdown_timeout ] + do mysql_status info rc=$? if [ $rc = $OCF_NOT_RUNNING ]; then break fi count=`expr $count + 1` sleep 1 - ocf_log info "MySQL still hasn't stopped yet. Waiting..." + ocf_log debug "MySQL still hasn't stopped yet. Waiting..." done mysql_status info if [ $? != $OCF_NOT_RUNNING ]; then - ocf_log warn "MySQL failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..." + ocf_log info "MySQL failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..." /bin/kill -KILL $pid > /dev/null fi ocf_log info "MySQL stopped"; rm -f /var/lock/subsys/mysqld rm -f $OCF_RESKEY_socket return $OCF_SUCCESS } mysql_promote() { - if ( ! mysql_status ); then + local master_info + + if ( ! mysql_status err ); then return $OCF_NOT_RUNNING fi - ocf_run $MYSQL $MYSQL_OPTIONS_LOCAL $MYSQL_OPTIONS_REPL \ + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE" + + # Set Master Info in CIB, cluster level attribute + update_data_master_status + master_info="$(get_local_ip)|$(get_master_status File)|$(get_master_status Position)" + ${CRM_ATTR_REPL_INFO} -v "$master_info" + rm -f $tmpfile + set_read_only off || return $OCF_ERR_GENERIC # Existing master gets a higher-than-default master preference, so # the cluster manager does not shuffle the master role around # unnecessarily $CRM_MASTER -v $((${OCF_RESKEY_max_slave_lag}+1)) + # A master can accept reads + set_reader_attr 1 + return $OCF_SUCCESS } mysql_demote() { - if ! mysql_status; then + if ! mysql_status err; then return $OCF_NOT_RUNNING fi - set_read_only on - if [ $? 
-ne 0 ]; then - ocf_log err "Failed to set read-only"; - return $OCF_ERR_GENERIC; - fi - # Return master preference to default, so the cluster manager gets # a chance to select a new master $CRM_MASTER -v 1 } mysql_notify() { # If not configured as a Stateful resource, we make no sense of # notifications. if ! ocf_is_ms; then ocf_log info "This agent makes no use of notifications unless running in master/slave mode." return $OCF_SUCCESS fi local type_op type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" ocf_log debug "Received $type_op notification." case "$type_op" in - 'pre-promote') - # A new master is about to being promoted. It's not in - # read-write mode yet (that only occurs when it actually - # executes the promote action), so we can now safely - # connect to it and wait for it to start replicating. - local master_host - local master_status - master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname` - - if ( ! mysql_status ); then - return $OCF_NOT_RUNNING - fi - if [ -z "$master_host" ]; then - ocf_log err "Unable to determine master host!" - return $OCF_ERR_GENERIC - fi + 'pre-promote') + # Nothing to do now here, new replication info not yet published - if [ $master_host = ${HOSTNAME} ]; then - ocf_log info "This will be new master" - else - ocf_log info "Changing MySQL configuration to replicate from $master_host" - set_master $master_host - if [ $? -ne 0 ]; then - return $OCF_ERR_GENERIC + ;; + 'post-promote') + # The master has completed its promotion. Now is a good + # time to check whether our replication slave is working + # correctly. + master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " "` + if [ "$master_host" = ${HOSTNAME} ]; then + ocf_log info "This will be the new master, ignoring post-promote notification." else - return $OCF_SUCCESS + ocf_log info "Resetting replication" + unset_master + if [ $? 
-ne 0 ]; then + return $OCF_ERR_GENERIC + fi + + ocf_log info "Changing MySQL configuration to replicate from $master_host" + set_master + if [ $? -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + + start_slave + if [ $? -ne 0 ]; then + ocf_log err "Failed to start slave" + return $OCF_ERR_GENERIC + fi fi - fi - ;; - 'post-promote') - # The master has completed its promotion. Now is a good - # time to check whether our replication slave is working - # correctly. - master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname` - if [ "$master_host" = ${HOSTNAME} ]; then - ocf_log info "Ignoring post-promote notification for my own promotion." return $OCF_SUCCESS - fi - start_slave $master_host - if [ $? -ne 0 ]; then - ocf_log err "Failed to start slave" - return $OCF_ERR_GENERIC - fi ;; - 'post-demote') - demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname` - if [ $demote_host = ${HOSTNAME} ]; then - ocf_log info "Ignoring post-demote notification for my own demotion." + 'pre-demote') + demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` + if [ $demote_host = ${HOSTNAME} ]; then + ocf_log info "post-demote notification for $demote_host" + set_read_only on + if [ $? -ne 0 ]; then + ocf_log err "Failed to set read-only"; + return $OCF_ERR_GENERIC; + fi + + # Must kill all existing user threads because they are still Read/write + # in order for the slaves to complete the read of binlogs + local tmpfile + tmpfile=`mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX` + $MYSQL $MYSQL_OPTIONS_REPL \ + -e "SHOW PROCESSLIST" > $tmpfile + + for thread in `awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' $tmpfile` + do + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "KILL ${thread}" + done + else + ocf_log info "Ignoring post-demote notification execpt for my own demotion." + fi return $OCF_SUCCESS - fi - ocf_log info "post-demote notification for $demote_host." 
- # The former master has just been gracefully demoted. - unset_master ;; - *) - return $OCF_SUCCESS + 'post-demote') + demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` + if [ $demote_host = ${HOSTNAME} ]; then + ocf_log info "Ignoring post-demote notification for my own demotion." + return $OCF_SUCCESS + fi + ocf_log info "post-demote notification for $demote_host." + # The former master has just been gracefully demoted. + unset_master + ;; + *) + return $OCF_SUCCESS ;; esac } ####################################################################### + +########################################################################## +# If DEBUG_LOG is set, make this resource agent easy to debug: set up the +# debug log and direct all output to it. Otherwise, redirect to /dev/null. +# The log directory must be a directory owned by root, with permissions 0700, +# and the log must be writable and not a symlink. +########################################################################## +DEBUG_LOG="/tmp/mysql.ocf.ra.debug/log" +if [ "${DEBUG_LOG}" -a -w "${DEBUG_LOG}" -a ! -L "${DEBUG_LOG}" ]; then + DEBUG_LOG_DIR="${DEBUG_LOG%/*}" + if [ -d "${DEBUG_LOG_DIR}" ]; then + exec 9>>"$DEBUG_LOG" + exec 2>&9 + date >&9 + echo "$*" >&9 + env | grep OCF_ | sort >&9 + set -x + else + exec 9>/dev/null + fi +fi + case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage|help) usage exit $OCF_SUCCESS;; esac mysql_validate rc=$? LSB_STATUS_STOPPED=3 if [ $rc -ne 0 ]; then case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $rc;; esac fi # What kind of method was invoked? 
case "$1" in start) mysql_start;; stop) mysql_stop;; - status) mysql_status;; + status) mysql_status err;; monitor) mysql_monitor;; promote) mysql_promote;; demote) mysql_demote;; notify) mysql_notify;; validate-all) exit $OCF_SUCCESS;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/mysql-proxy b/heartbeat/mysql-proxy index 018a0a21c..f4395b6eb 100755 --- a/heartbeat/mysql-proxy +++ b/heartbeat/mysql-proxy @@ -1,468 +1,688 @@ #!/bin/sh # # Resource script for MySQL Proxy # # Description: Manages MySQL Proxy as an OCF resource in # an high-availability setup. # -# Tested with mysql-proxy 0.7.0 on Debian 5.0. +# Originally tested with MySQL Proxy 0.7.0 on Debian 5.0, +# but subsequent updates have been tested with MySQL Proxy 0.8.1 and 0.8.3 on Debian 6.0 only. +# # Based on the mysql and Pure-Ftpd OCF resource agents. # # Author: Raoul Bhatia : Original Author # License: GNU General Public License (GPL) # # # usage: $0 {start|stop|reload|status|monitor|validate-all|meta-data} # # The "start" arg starts a MySQL Proxy instance # # The "stop" arg stops it. # # TODO -# * add error checking like in mysql ocf ra (e.g. 
socketdir) +# * add in-depth monitoring by querying the mysql-proxy admin port # # Test via -# */usr/sbin/ocf-tester -n mp /usr/lib/ocf/resource.d/heartbeat/mysql-proxy -# */usr/sbin/ocf-tester -n ms -o binary="/usr/sbin/mysql-proxy" -o defaults_file="" -o parameters="--proxy-skip-profiling" \ -# -o admin_address="127.0.0.1:4041" -o proxy_backend_addresses="192.168.100.200:42006" \ -# -o proxy_address="/var/run/mysqld/mysqld.sock" /usr/lib/ocf/resource.d/heartbeat/mysql-proxy +# (note: this did not work with MySQL Proxy 0.8.1 and ocf-tester from resource-agents 3.9.2 on Debian 6.0) # -# * adding two mysql-instances (mysql-proxy-tcp and mysql-proxy-socket) and killing mysql-proxy-tcp -# beware, that as of mysql-proxy 0.7.0 (and possibly later), the socket is not automatically removed +# * /usr/sbin/ocf-tester -n mp -o binary="/usr/sbin/mysql-proxy" -o defaults_file="" -o parameters="--proxy-skip-profiling" \ +# -o admin_address="127.0.0.1:4041" -o admin_username="root" -o admin_password="la" -o admin_lua_script="/usr/lib/mysql-proxy/lua/admin.lua" \ +# -o proxy_backend_addresses="192.168.100.200:42006" -o proxy_address="/var/run/mysqld/mysqld.sock" /usr/lib/ocf/resource.d/heartbeat/mysql-proxy # # # OCF parameters: # OCF_RESKEY_binary +# OCF_RESKEY_client_binary # OCF_RESKEY_defaults_file # OCF_RESKEY_proxy_backend_addresses # OCF_RESKEY_proxy_read_only_backend_addresses # OCF_RESKEY_proxy_address # OCF_RESKEY_log_level # OCF_RESKEY_keepalive +# OCF_RESKEY_plugins # OCF_RESKEY_admin_address # OCF_RESKEY_admin_username # OCF_RESKEY_admin_password # OCF_RESKEY_admin_lua_script +# OCF_RESKEY_test_table +# OCF_RESKEY_test_user +# OCF_RESKEY_test_passwd # OCF_RESKEY_parameters # OCF_RESKEY_pidfile # ########################################################################## # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs : ${OCF_RESKEY_binary="/usr/sbin/mysql-proxy"} +: ${OCF_RESKEY_client_binary="mysql"} : ${OCF_RESKEY_defaults_file=""} : ${OCF_RESKEY_proxy_backend_addresses="127.0.0.1:3306"} : ${OCF_RESKEY_proxy_read_only_backend_addresses=""} : ${OCF_RESKEY_proxy_address=":4040"} : ${OCF_RESKEY_log_level=""} : ${OCF_RESKEY_keepalive=""} +: ${OCF_RESKEY_plugins=""} : ${OCF_RESKEY_admin_address="127.0.0.1:4041"} : ${OCF_RESKEY_admin_username=""} : ${OCF_RESKEY_admin_password=""} : ${OCF_RESKEY_admin_lua_script=""} +: ${OCF_RESKEY_test_table="mysql.user"} +: ${OCF_RESKEY_test_user=""} +: ${OCF_RESKEY_test_passwd=""} : ${OCF_RESKEY_parameters=""} : ${OCF_RESKEY_pidfile="${HA_RSCTMP}/mysql-proxy-${OCF_RESOURCE_INSTANCE}.pid"} USAGE="Usage: $0 {start|stop|reload|status|monitor|validate-all|meta-data}"; ########################################################################## usage() { echo $USAGE >&2 } meta_data() { cat < 0.1 This script manages MySQL Proxy as an OCF resource in a high-availability setup. -Tested with MySQL Proxy 0.7.0 on Debian 5.0. + +The default monitor operation will verify that mysql-proxy is running. + +The level 10 monitor operation is left out intentionally for possible future enhancements in conjunction with the admin plugin. + +The level 20 monitor operation will perform a SELECT on a given table to verify that the connection to a backend-server is actually working. + +Originally tested with MySQL Proxy 0.7.0 on Debian 5.0, but subsequent updates have been tested with MySQL Proxy 0.8.1 and 0.8.3 on Debian 6.0 only. -Manages a MySQL Proxy daemon +Manages a MySQL Proxy instance Full path to the MySQL Proxy binary. For example, "/usr/sbin/mysql-proxy". Full path to MySQL Proxy binary - + + +Location of the MySQL client binary + +MySQL client binary + + + + Full path to a MySQL Proxy configuration file. For example, "/etc/mysql-proxy.conf". 
Full path to configuration file Address:port of the remote backend-servers (default: 127.0.0.1:3306). MySQL Proxy backend-servers Address:port of the remote (read only) slave-server (default: ). MySql Proxy read only backend-servers Listening address:port of the proxy-server (default: :4040). You can also specify a socket like "/tmp/mysql-proxy.sock". MySQL Proxy listening address Log all messages of level (error|warning|info|message|debug|) or higher. An empty value disables logging. MySQL Proxy log level. Try to restart the proxy if it crashed (default: ). Valid values: true or false. An empty value equals "false". Use keepalive option + + +Whitespace separated list of plugins to load (default: ). +Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter. + +MySQL Proxy plugins + + + -Listening address:port of the admin-server (default: 127.0.0.1:4041). +Listening address:port of the admin plugin (default: 127.0.0.1:4041). +Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter. -MySQL Proxy admin-server address +MySQL Proxy admin plugin listening address -Username to allow to log in (default: ). +Username for the admin plugin (default: ). +Required since MySQL Proxy 0.8.1, if the admin plugin is loaded. +Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter. -MySQL Proxy admin-server username +MySQL Proxy admin plugin username -Password to allow to log in (default: ). +Password for the admin plugin (default: ). +Required since MySQL Proxy 0.8.1, if the admin plugin is loaded. +Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter. -MySQL Proxy admin-server password +MySQL Proxy admin plugin password Script to execute by the admin plugin. +Required since MySQL Proxy 0.8.1, if the admin plugin is loaded. +Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter. 
-MySQL Proxy admin-server lua script +MySQL Proxy admin plugin lua script + + +Table to be tested in monitor statement (in database.table notation) + +MySQL test table + + + + + +MySQL test user + +MySQL test user + + + + + +MySQL test user password + +MySQL test user password + + The MySQL Proxy daemon may be called with additional parameters. Specify any of them here. MySQL Proxy additional parameters PID file PID file END } isRunning() { kill -s 0 "$1" 2>/dev/null } mysqlproxy_status() { if [ -f "${pidfile}" ]; then # MySQL Proxy is probably running PID=`head -n 1 "${pidfile}"` if [ ! -z "$PID" ] ; then isRunning "$PID" - # @TODO in-depth check: issue a query and/or check proxy-admin interface return $? fi fi # MySQL Proxy is not running false } mysqlproxy_start() { # if MySQL Proxy is running return success if mysqlproxy_status ; then ocf_log info "MySQL Proxy already running." return $OCF_SUCCESS fi PARAM_PREFIX='' + + # MySQL Proxy plugins to load + # @TODO check if the plugins are actually available? + if ocf_is_true $plugin_support; then + for p in $plugins; do + PARAM_PREFIX="$PARAM_PREFIX --plugins=$p" + done + fi + # check if the MySQL Proxy defaults-file exist if [ -f "$defaults_file" ]; then - PARAM_PREFIX="--defaults-file=$defaults_file " + PARAM_PREFIX="$PARAM_PREFIX --defaults-file=$defaults_file" fi # set log-level if [ ! -z "$log_level" ]; then - PARAM_PREFIX="$PARAM_PREFIX --log-level=$log_level " + PARAM_PREFIX="$PARAM_PREFIX --log-level=$log_level" fi # set keepalive if [ "$keepalive" = "true" ]; then - PARAM_PREFIX="$PARAM_PREFIX --keepalive " + PARAM_PREFIX="$PARAM_PREFIX --keepalive" fi # honor admin_* options if [ ! -z "$admin_username" ]; then - PARAM_PREFIX="$PARAM_PREFIX --admin-username=$admin_username " + PARAM_PREFIX="$PARAM_PREFIX --admin-username=$admin_username" fi if [ ! 
-z "$admin_password" ]; then - PARAM_PREFIX="$PARAM_PREFIX --admin-password=$admin_password " + PARAM_PREFIX="$PARAM_PREFIX --admin-password=$admin_password" fi if [ ! -z "$admin_lua_script" ]; then - PARAM_PREFIX="$PARAM_PREFIX --admin-lua-script=$admin_lua_script " + PARAM_PREFIX="$PARAM_PREFIX --admin-lua-script=$admin_lua_script" + fi + + # make sure that the pid directory exists + pid_dir=`dirname $pidfile` + if [ ! -d $pid_dir ] ; then + ocf_log info "Creating PID directory '$pid_dir'." + mkdir -p $pid_dir + #chown $OCF_RESKEY_user:$OCF_RESKEY_group $pid_dir # c/p from mysql ra; currently not needed fi # split multiple proxy-address options. # currently unsupported but let us hope for the future ;) for pa in $proxy_address; do [ -z "$pa" ] && continue OPTIONS=" $OPTIONS --proxy-address=$pa" + + # if $pa contains a slash, we are dealing with a socket + # make sure that the socket directory exists + if echo "$pa" | grep -q '/' ; then + socket_dir=`dirname $pa` + if [ ! -d $socket_dir ] ; then + ocf_log info "Creating socket directory '$socket_dir'." + mkdir -p $socket_dir + #chown $OCF_RESKEY_user:$OCF_RESKEY_group $socket_dir # c/p from mysql ra; currently not needed + fi + fi done # split multiple proxy-backend-addresses options. for pba in $proxy_backend_addresses; do [ -z "$pba" ] && continue OPTIONS=" $OPTIONS --proxy-backend-addresses=$pba" done # split multiple proxy-backend-addresses options. 
for proba in $proxy_read_only_backend_addresses; do [ -z "$proba" ] && continue OPTIONS=" $OPTIONS --proxy-read-only-backend-addresses=$proba" done - # build $OPTIONS and add addmin-address and pidfile + # build $OPTIONS and add admin-address and pidfile OPTIONS="$PARAM_PREFIX $OPTIONS --admin-address=$admin_address --pid-file=${pidfile}" + # add additional parameters + if [ -n "$parameters" ]; then + OPTIONS="$OPTIONS $parameters" + fi + # start MySQL Proxy #start-stop-daemon --start --quiet --pidfile $pidfile --make-pidfile --name mysql-proxy --startas $binary -b -- $OPTIONS $binary --daemon $OPTIONS ret=$? if [ $ret -ne 0 ]; then ocf_log err "MySQL Proxy returned error." $ret return $OCF_ERR_GENERIC fi + # @TODO add an initial monitoring action? + return $OCF_SUCCESS } mysqlproxy_stop() { if mysqlproxy_status ; then #start-stop-daemon --stop --quiet --retry 3 --exec $binary --pidfile $pidfile /bin/kill `cat "${pidfile}"` ret=$? if [ $ret -ne 0 ]; then ocf_log err "MySQL Proxy returned an error while stopping." $ret return $OCF_ERR_GENERIC fi # grant some time for shutdown and recheck sleep 1 if mysqlproxy_status ; then ocf_log err "MySQL Proxy failed to stop." return $OCF_ERR_GENERIC fi # remove dangling socketfile, if specified for pa in $proxy_address; do if [ -S "$pa" ]; then ocf_log info "Removing dangling socket file '$pa'." rm -f "$pa" fi done # remove dangling pidfile if [ -f "${pidfile}" ]; then ocf_log info "Removing dangling pidfile '${pidfile}'." rm -f "${pidfile}" fi fi return $OCF_SUCCESS } mysqlproxy_reload() { if mysqlproxy_status; then ocf_log info "Reloading MySQL Proxy." kill -s HUP `cat ${pidfile}` fi } mysqlproxy_monitor() { + local rc + local mysql_options pa + local mysql_server_parameter mysql_server_host mysql_server_port + if [ "${OCF_RESKEY_CRM_meta_interval:-0}" -eq "0" ]; then # in case of probe, monitor operation is surely treated as # under suspension. This will call start operation. 
# (c/p from ocf:heartbeat:sfex) mysqlproxy_validate_all rc=$? [ $rc -ne 0 ] && return $rc fi - if mysqlproxy_status ; then - return $OCF_SUCCESS + if ! mysqlproxy_status ; then + return $OCF_NOT_RUNNING fi - return $OCF_NOT_RUNNING + if [ $OCF_CHECK_LEVEL -eq 20 -a -n "$OCF_RESKEY_test_table" ]; then + mysql_options="--connect_timeout=10 --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd" + + # cycle each address + for pa in $proxy_address; do + # build correct connect parameter + if [ -S "$pa" ]; then + # we need to monitor a mysql socket + mysql_server_parameter="--socket=$pa" + else + # we need to monitor a host address + mysql_server_parameter="" + + # split host:port + # @TODO correctly handle IPv6 address + # @TODO correctly handle 0.0.0.0 address + mysql_server_host=`echo $pa | cut -d : -f 1` + mysql_server_port=`echo $pa | cut -d : -f 2` + + if [ -n $mysql_server_host ]; then + mysql_server_parameter="$mysql_server_parameter --host=$mysql_server_host" + fi + if [ -n $mysql_server_port ]; then + mysql_server_parameter="$mysql_server_parameter --port=$mysql_server_port" + fi + fi + + # Check for test table + ocf_run $mysql $mysql_server_parameter $mysql_options \ + -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table" + rc=$? + + if [ $rc -ne 0 ]; then + ocf_log err "Failed to select from $OCF_RESKEY_test_table"; + return $OCF_ERR_GENERIC; + fi + done + + fi + + return $OCF_SUCCESS } mysqlproxy_validate_all() { + # local variables + local config_error=0 + # check that the MySQL Proxy binary exists and can be executed - if [ ! -x "$binary" ]; then - ocf_log err "MySQL Proxy binary '$binary' does not exist or cannot be executed." - return $OCF_ERR_INSTALLED - fi + check_binary $binary + # @TODO skip the next test if OCF_CHECK_LEVEL == 0? + check_binary $mysql # check for valid log-level - echo $log_level | egrep "^(error|warning|info|message|debug|)$" >/dev/null + echo $log_level | egrep -q "^(error|warning|info|message|debug|)$" if [ $? 
-ne 0 ]; then ocf_log err "MySQL Proxy log level '$log_level' not in valid range error|warning|info|message|debug" return $OCF_ERR_CONFIGURED fi + + # if we're running MySQL Proxy > 0.8.1 and there is any admin parameter set, + # explicitly load the admin (and the proxy) plugin. + # (version 0.8.2 does not load the admin plugin by default anymore) + ocf_version_cmp "$version" "0.8.1" + ret=$? + if [ $ret -eq 2 ]; then + # simple check: concat all parameters and check if the string has non-zero length + if [ -n "$admin_username$admin_password$admin_lua_script$admin_address" ]; then + plugins="proxy admin" + has_plugin_admin=1 + else + has_plugin_admin=0 + fi + fi + + + # check for required admin_* parameters for 0.8.1 and 0.8.2 (with admin module) + # translated: if (version == 0.8.1 or (version > 0.8.1 and has_plugin_admin)) + if [ $ret -eq 1 -o \( $ret -eq 2 -a $has_plugin_admin -eq 1 \) ]; then + if [ -z "$admin_username" ]; then + ocf_log err "Missing required parameter \"admin_username\"" + config_error=1 + fi + if [ -z "$admin_password" ]; then + ocf_log err "Missing required parameter \"admin_password\"" + config_error=1 + fi + if [ -z "$admin_lua_script" ]; then + ocf_log err "Missing required parameter \"admin_lua_script\"" + config_error=1 + fi + + # check if the admin_lua_script, if specified, exists + if [ -n "$admin_lua_script" -a ! -e "$admin_lua_script" ]; then + ocf_log err "MySQL Proxy admin lua script '$admin_lua_script' does not exist or is not readable." + fi + fi + + # issue a warning during start if the user wants to load a plugin + # but this version of MySQL Proxy does not support the plugin architecture. + if [ -n "$plugins" ] && ocf_is_false "$plugin_support" && [ $__OCF_ACTION = 'start' ]; then + ocf_log warn "You are running MySQL Proxy version '$version'. This version does not support the plugin architecture. Please use version 0.7.0 or later to load the plugins '$plugins'." 
+ fi + + # exit in case we have found relevant config errors + if [ $config_error -eq 1 ]; then + exit $OCF_ERR_CONFIGURED + fi + return $OCF_SUCCESS } # # Main # if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi pidfile=$OCF_RESKEY_pidfile binary=$OCF_RESKEY_binary defaults_file=$OCF_RESKEY_defaults_file proxy_backend_addresses=$OCF_RESKEY_proxy_backend_addresses proxy_read_only_backend_addresses=$OCF_RESKEY_proxy_read_only_backend_addresses admin_address=$OCF_RESKEY_admin_address admin_username=$OCF_RESKEY_admin_username admin_password=$OCF_RESKEY_admin_password admin_lua_script=$OCF_RESKEY_admin_lua_script proxy_address=$OCF_RESKEY_proxy_address log_level=$OCF_RESKEY_log_level keepalive=$OCF_RESKEY_keepalive +plugins=`echo $OCF_RESKEY_plugins | tr "[:space:]" "\n" | sort -u` +mysql=$OCF_RESKEY_client_binary +parameters=$OCF_RESKEY_parameters +plugin_support=false +has_plugin_admin=0 # 0 because this simplifies the if statements # debugging stuff #echo OCF_RESKEY_binary=$OCF_RESKEY_binary >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_defaults_file=$OCF_RESKEY_defaults_file >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_proxy_backend_addresses=$OCF_RESKEY_proxy_backend_addresses >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_proxy_read_only_backend_addresses=$OCF_RESKEY_proxy_read_only_backend_addresses >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_proxy_address=$OCF_RESKEY_proxy_address >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_log_level=$OCF_RESKEY_log_level >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_keepalive=$OCF_RESKEY_keepalive >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_admin_address=$OCF_RESKEY_admin_address >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_admin_username=$OCF_RESKEY_admin_username >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_admin_password=$OCF_RESKEY_admin_password >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo 
OCF_RESKEY_admin_lua_script=$OCF_RESKEY_admin_lua_script >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_parameters=$OCF_RESKEY_parameters >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE #echo OCF_RESKEY_pidfile=$OCF_RESKEY_pidfile >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE +# handle some parameters before performing any additional checks +case $1 in + meta-data) meta_data + exit $? + ;; + + usage) usage + exit $OCF_SUCCESS + ;; +esac + + +# determine MySQL Proxy version +check_binary $binary +version=`$binary --version | grep ^mysql-proxy | awk '{print $NF}'` + +# version 0.7.0 (and later) support the plugin architecture and load the admin plugin by default +# version 0.8.1 loads admin plugin by default and requires the admin parameters to be set +# version 0.8.2 does not load the admin plugin by default anymore +ocf_version_cmp "$version" "0.7.0" +ret=$? +if [ $ret -eq 1 -o $ret -eq 2 ]; then + plugin_support=true + has_plugin_admin=1 +fi + + +# perform action case $1 in start) mysqlproxy_validate_all && mysqlproxy_start exit $? ;; stop) mysqlproxy_validate_all && mysqlproxy_stop exit $? ;; reload) mysqlproxy_reload exit $? ;; status) if mysqlproxy_status; then ocf_log info "MySQL Proxy is running." exit $OCF_SUCCESS else ocf_log info "MySQL Proxy is stopped." exit $OCF_NOT_RUNNING fi ;; monitor) mysqlproxy_monitor exit $? ;; validate-all) mysqlproxy_validate_all exit $? ;; - meta-data) meta_data - exit $? - ;; - - usage) usage - exit $OCF_SUCCESS - ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver index 296d6c878..6414e3a30 100755 --- a/heartbeat/nfsserver +++ b/heartbeat/nfsserver @@ -1,269 +1,296 @@ #!/bin/sh # nfsserver # # Description: Manages nfs server as OCF resource # by hxinwei@gmail.com # License: GNU General Public License v2 (GPLv2) and later if [ -n "$OCF_DEBUG_LIBRARY" ]; then . $OCF_DEBUG_LIBRARY else : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs fi DEFAULT_INIT_SCRIPT="/etc/init.d/nfsserver" DEFAULT_NOTIFY_CMD="/sbin/sm-notify" +DEFAULT_RPCPIPEFS_DIR="/var/lib/nfs/rpc_pipefs" nfsserver_meta_data() { cat < 1.0 Nfsserver helps to manage the Linux nfs server as a failover-able resource in Linux-HA. It depends on Linux specific NFS implementation details, so is considered not portable to other platforms yet. Manages an NFS server The default init script shipped with the Linux distro. The nfsserver resource agent offloads the start/stop/monitor work to the init script because the procedure to start/stop/monitor nfsserver varies on different Linux distro. Init script for nfsserver The tool to send out NSM reboot notification. Failover of nfsserver can be considered as rebooting to different machines. The nfsserver resource agent use this command to notify all clients about the happening of failover. The tool to send out notification. The nfsserver resource agent will save nfs related information in this specific directory. And this directory must be able to fail-over before nfsserver itself. Directory to store nfs server related information. Comma separated list of floating IP addresses used to access the nfs service IP addresses. + + +The mount point for the sunrpc file system. Default is $DEFAULT_RPCPIPEFS_DIR . +This script will mount(bind) nfs_shared_infodir on /var/lib/nfs/ (can not be changed), +and this script will mount the sunrpc file system on $DEFAULT_RPCPIPEFS_DIR (default, can be changed by this parameter). +If you want to move only rpc_pipefs/ (e.g. to keep rpc_pipefs/ local) from default , please set this value. + + +The mount point for the sunrpc file system. + + + + END return $OCF_SUCCESS } nfsserver_usage() { cat < $fn 2>&1 rc=$? 
ocf_log debug `cat $fn` rm -f $fn #Adapte LSB status code to OCF return code if [ $rc -eq 0 ]; then return $OCF_SUCCESS elif [ $rc -eq 3 ]; then return $OCF_NOT_RUNNING else return $OCF_ERR_GENERIC fi } prepare_directory () { [ -d "$fp" ] || mkdir -p $fp - [ -d "$fp/rpc_pipefs" ] || mkdir -p $fp/rpc_pipefs + [ -d "$rpcpipefs_make_dir" ] || mkdir -p $rpcpipefs_make_dir [ -d "$fp/sm" ] || mkdir -p $fp/sm [ -d "$fp/sm.ha" ] || mkdir -p $fp/sm.ha [ -d "$fp/sm.bak" ] || mkdir -p $fp/sm.bak [ -d "$fp/v4recovery" ] || mkdir -p $fp/v4recovery } is_bound () { mount | grep -q "$1 on $2 type none (.*bind)" return $? } bind_tree () { if is_bound $fp /var/lib/nfs; then ocf_log debug "$fp is already bound to /var/lib/nfs" return 0 fi mount --bind $fp /var/lib/nfs } unbind_tree () { - if `mount | grep -q "rpc_pipefs on /var/lib/nfs/rpc_pipefs"`; then - umount /var/lib/nfs/rpc_pipefs + if `mount | grep -q " on $rpcpipefs_umount_dir"`; then + umount -t rpc_pipefs $rpcpipefs_umount_dir fi if is_bound $fp /var/lib/nfs; then umount /var/lib/nfs fi } nfsserver_start () { + if nfsserver_monitor; then + ocf_log debug "NFS server is already started" + return $OCF_SUCCESS + fi + prepare_directory bind_tree rm -rf /var/lib/nfs/sm.ha/* > /dev/null 2>&1 cp -rf /var/lib/nfs/sm /var/lib/nfs/sm.bak /var/lib/nfs/state /var/lib/nfs/sm.ha > /dev/null 2>&1 ocf_log info "Starting NFS server ..." fn=`mktemp` ${OCF_RESKEY_nfs_init_script} start > $fn 2>&1 rc=$? 
ocf_log debug `cat $fn` rm -f $fn if [ $rc -ne 0 ]; then ocf_log err "Failed to start NFS server" return $rc fi #Notify the nfs server has been moved or rebooted #The init script do that already, but with the hostname, which may be ignored by client #we have to do it again with the nfs_ip local opts="-f -v" echo $OCF_RESKEY_nfs_notify_cmd | grep -qws rpc.statd && opts="" rm -rf /var/lib/nfs/sm.ha.save > /dev/null 2>&1 cp -rf /var/lib/nfs/sm.ha /var/lib/nfs/sm.ha.save > /dev/null 2>&1 for ip in `echo ${OCF_RESKEY_nfs_ip} | sed 's/,/ /g'`; do ${OCF_RESKEY_nfs_notify_cmd} $opts $ip -P /var/lib/nfs/sm.ha rm -rf /var/lib/nfs/sm.ha > /dev/null 2>&1 cp -rf /var/lib/nfs/sm.ha.save /var/lib/nfs/sm.ha > /dev/null 2>&1 done ocf_log info "NFS server started" return $OCF_SUCCESS } nfsserver_stop () { ocf_log info "Stopping NFS server ..." fn=`mktemp` ${OCF_RESKEY_nfs_init_script} stop > $fn 2>&1 rc=$? ocf_log debug `cat $fn` rm -f $fn if [ $rc -eq 0 ]; then unbind_tree ocf_log info "NFS server stopped" return $OCF_SUCCESS fi ocf_log err "Failed to stop NFS server" return $rc } nfsserver_validate () { check_binary ${OCF_RESKEY_nfs_init_script} check_binary ${OCF_RESKEY_nfs_notify_cmd} if [ x = x"${OCF_RESKEY_nfs_ip}" ]; then ocf_log err "nfs_ip not set" exit $OCF_ERR_CONFIGURED fi if [ x = "x$OCF_RESKEY_nfs_shared_infodir" ]; then ocf_log err "nfs_shared_infodir not set" exit $OCF_ERR_CONFIGURED fi return $OCF_SUCCESS } if [ -n "$OCF_RESKEY_CRM_meta_clone" ]; then ocf_log err "THIS RA DO NOT SUPPORT CLONE MODE!" exit $OCF_ERR_CONFIGURED fi nfsserver_validate case $__OCF_ACTION in start) nfsserver_start ;; stop) nfsserver_stop ;; monitor) nfsserver_monitor ;; - validate-all) nfsserver_validate + validate-all) exit $OCF_SUCCESS ;; *) nfsserver_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/ocf-rarun b/heartbeat/ocf-rarun new file mode 100644 index 000000000..b126d1a43 --- /dev/null +++ b/heartbeat/ocf-rarun @@ -0,0 +1,130 @@ +# +# This is the OCF RA driver. 
It should take care of all the +# boring details and leave only the parts which are really about +# the actual resource to the resource agent. +# +# The interface +# +# The RA needs to define functions for all supported actions and +# name them _. For instance, apache_start or +# apache_meta_data. +# +# The required parameters should all be listed in the +# OCF_REQUIRED_PARAMS variable. For example, "config user group". +# +# The OCF_REQUIRED_BINARIES variable should contain a list of all +# programs which are needed for the correct operation of the +# resource agent. +# +# _getconfig and _validate_all are optional. getconfig is +# where RA can read more configuration from the file system or do +# some other configuration processing. +# validate_all checks if the environment is OK. +# +# If it exists, the _probe function is invoked on probes +# (monitor with interval 0) instead of _monitor. + +is_function() { + test z"`command -v $1`" = z"$1" +} +run_function() { + is_function $1 && $1 +} +is_var_defined() { + test z != "z$(eval echo $`echo $1`)" +} +mk_action_func() { + ACTION_FUNC=${OCF_RESOURCE_TYPE}_`echo $__OCF_ACTION | tr '-' '_'` +} +validate_args() { + is_function $ACTION_FUNC || { + ocf_log err "$__OCF_ACTION: action not supported" + run_function ${OCF_RESOURCE_TYPE}_methods + exit $OCF_ERR_UNIMPLEMENTED + } +} +simple_actions() { + case $__OCF_ACTION in + meta-data|usage|methods) + $ACTION_FUNC + exit $OCF_SUCCESS + ;; + esac +} +run_probe() { + if is_function ${OCF_RESOURCE_TYPE}_probe; then + ${OCF_RESOURCE_TYPE}_probe + exit + fi +} +check_required_params() { + local v + for v in $OCF_REQUIRED_PARAMS; do + is_var_defined OCF_RESKEY_$v || { + ocf_log err "$v: required parameter not set" + exit $OCF_ERR_CONFIGURED + } + done +} +handle_invalid_env() { + local rc msg + rc=$1 + msg=${2:-"environment is invalid, resource considered stopped"} + case "$__OCF_ACTION" in + stop) + ocf_log info $msg + exit $OCF_SUCCESS + ;; + monitor) + if ocf_is_probe; then + 
ocf_log info $msg + exit $OCF_NOT_RUNNING + else + # in recurring monitor, this amounts to error + ocf_log err $msg + exit $OCF_ERR_GENERIC + fi + ;; + status) + ocf_log info $msg + exit $LSB_STATUS_STOPPED + ;; + *) + ocf_log err $msg + exit $rc + ;; + esac +} +check_required_binaries() { + local v + for v in $OCF_REQUIRED_BINARIES; do + have_binary $v || { + handle_invalid_env $OCF_ERR_INSTALLED "$v: required binary not installed" + } + done +} +validate_env() { + check_required_binaries # all binaries present? + is_function ${OCF_RESOURCE_TYPE}_validate_all || + return + local rc + LSB_STATUS_STOPPED=3 + ${OCF_RESOURCE_TYPE}_validate_all # is environment ok? + rc=$? + if [ $rc -ne 0 ]; then + handle_invalid_env $rc + fi +} + +# ocf_rarun: the main function +ocf_rarun() { + mk_action_func # create action function name + validate_args # validate command line arguments + simple_actions # run meta-data (or similar) + check_required_params # all required parameters defined? + run_function ${OCF_RESOURCE_TYPE}_getconfig # get extra configuration + validate_env # is environment ok? + ocf_is_probe && run_probe # do probe + shift 1 # skip action + $ACTION_FUNC $* # run action +} diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in index e1cd33fdf..473a40927 100644 --- a/heartbeat/ocf-shellfuncs.in +++ b/heartbeat/ocf-shellfuncs.in @@ -1,616 +1,676 @@ # # # Common helper functions for the OCF Resource Agents supplied by # heartbeat. # # Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Brée # All Rights Reserved. # # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Build version: $Format:%H$ # TODO: Some of this should probably split out into a generic OCF # library for shell scripts, but for the time being, we'll just use it # ourselves... # # TODO wish-list: # - Generic function for evaluating version numbers # - Generic function(s) to extract stuff from our own meta-data # - Logging function which automatically adds resource identifier etc # prefixes # TODO: Move more common functionality for OCF RAs here. # # This was common throughout all legacy Heartbeat agents unset LC_ALL; export LC_ALL unset LANGUAGE; export LANGUAGE __SCRIPT_NAME=`basename $0` if [ -z "$OCF_ROOT" ]; then : ${OCF_ROOT=@OCF_ROOT_DIR@} fi if [ "$OCF_FUNCTIONS_DIR" = ${OCF_ROOT}/resource.d/heartbeat ]; then # old unset OCF_FUNCTIONS_DIR fi : ${OCF_FUNCTIONS_DIR:=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-binaries . ${OCF_FUNCTIONS_DIR}/ocf-returncodes . ${OCF_FUNCTIONS_DIR}/ocf-directories +. ${OCF_FUNCTIONS_DIR}/ocf-rarun # Define OCF_RESKEY_CRM_meta_interval in case it isn't already set, # to make sure that ocf_is_probe() always works : ${OCF_RESKEY_CRM_meta_interval=0} ocf_is_root() { if [ X`id -u` = X0 ]; then true else false fi } ocf_maybe_random() { local rnd="$RANDOM" # Something sane-ish in case a shell doesn't support $RANDOM [ -n "$rnd" ] || rnd=$$ echo $rnd } # Portability comments: # o The following rely on Bourne "sh" pattern-matching, which is usually # that for filename generation (note: not regexp). # o The "*) true ;;" clause is probably unnecessary, but is included # here for completeness. # o The negation in the pattern uses "!". This seems to be common # across many OSes (whereas the alternative "^" fails on some). 
# o If an OS is encountered where this negation fails, then a possible # alternative would be to replace the function contents by (e.g.): # [ -z "`echo $1 | tr -d '[0-9]'`" ] # ocf_is_decimal() { case "$1" in ""|*[!0-9]*) # empty, or at least one non-decimal false ;; *) true ;; esac } ocf_is_true() { case "$1" in yes|true|1|YES|TRUE|ja|on|ON) true ;; *) false ;; esac } ocf_is_hex() { case "$1" in ""|*[!0-9a-fA-F]*) # empty, or at least one non-hex false ;; *) true ;; esac } ocf_is_octal() { case "$1" in ""|*[!0-7]*) # empty, or at least one non-octal false ;; *) true ;; esac } __ocf_set_defaults() { __OCF_ACTION="$1" # Return to sanity for the agents... unset LANG LC_ALL=C export LC_ALL # TODO: Review whether we really should source this. Or rewrite # to match some emerging helper function syntax...? This imports # things which no OCF RA should be using... # Strip the OCF_RESKEY_ prefix from this particular parameter if [ -z "$OCF_RESKEY_OCF_CHECK_LEVEL" ]; then : ${OCF_CHECK_LEVEL:=0} else : ${OCF_CHECK_LEVEL:=$OCF_RESKEY_OCF_CHECK_LEVEL} fi if [ ! -d "$OCF_ROOT" ]; then ha_log "ERROR: OCF_ROOT points to non-directory $OCF_ROOT." exit $OCF_ERR_GENERIC fi if [ -z "$OCF_RESOURCE_TYPE" ]; then : ${OCF_RESOURCE_TYPE:=$__SCRIPT_NAME} fi if [ -z "$OCF_RA_VERSION_MAJOR" ]; then : We are being invoked as an init script. : Fill in some things with reasonable values. : ${OCF_RESOURCE_INSTANCE:="default"} return 0 fi if [ "x$__OCF_ACTION" = "xmeta-data" ]; then OCF_RESOURCE_INSTANCE="undef" fi if [ -z "$OCF_RESOURCE_INSTANCE" ]; then ha_log "ERROR: Need to tell us our resource instance name." 
exit $OCF_ERR_ARGS fi } hadate() { date "+${HA_DATEFMT}" } set_logtag() { if [ -z "$HA_LOGTAG" ]; then if [ -n "$OCF_RESOURCE_INSTANCE" ]; then HA_LOGTAG="$__SCRIPT_NAME($OCF_RESOURCE_INSTANCE)[$$]" else HA_LOGTAG="$__SCRIPT_NAME[$$]" fi fi } ha_log() { local loglevel [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY="" # if we're connected to a tty, then output to stderr if tty >/dev/null; then if [ "x$HA_debug" = "x0" -a "x$loglevel" = xdebug ] ; then return 0 fi if [ "$HA_LOGTAG" ]; then echo "$HA_LOGTAG: $*" else echo "$*" fi >&2 return 0 fi set_logtag if [ "x${HA_LOGD}" = "xyes" ] ; then ha_logger -t "${HA_LOGTAG}" "$@" if [ "$?" -eq "0" ] ; then return 0 fi fi if [ -n "$HA_LOGFACILITY" ] then : logging through syslog # loglevel is unknown, use 'notice' for now loglevel=notice case "${*}" in *ERROR*) loglevel=err;; *WARN*) loglevel=warning;; *INFO*|info) loglevel=info;; esac logger -t "$HA_LOGTAG" -p ${HA_LOGFACILITY}.${loglevel} "${*}" fi if [ -n "$HA_LOGFILE" ] then : appending to $HA_LOGFILE echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_LOGFILE fi if [ -z "$HA_LOGFACILITY" -a -z "$HA_LOGFILE" ] then : appending to stderr echo `hadate`"${*}" >&2 fi if [ -n "$HA_DEBUGLOG" ] then : appending to $HA_DEBUGLOG echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_DEBUGLOG fi } ha_debug() { if [ "x${HA_debug}" = "x0" ] ; then return 0 fi if tty >/dev/null; then if [ "$HA_LOGTAG" ]; then echo "$HA_LOGTAG: $*" else echo "$*" fi >&2 return 0 fi set_logtag if [ "x${HA_LOGD}" = "xyes" ] ; then ha_logger -t "${HA_LOGTAG}" -D "ha-debug" "$@" if [ "$?" 
-eq "0" ] ; then return 0 fi fi [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY="" if [ -n "$HA_LOGFACILITY" ] then : logging through syslog logger -t "$HA_LOGTAG" -p "${HA_LOGFACILITY}.debug" "${*}" fi if [ -n "$HA_DEBUGLOG" ] then : appending to $HA_DEBUGLOG echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_DEBUGLOG fi if [ -z "$HA_LOGFACILITY" -a -z "$HA_DEBUGLOG" ] then : appending to stderr echo "$HA_LOGTAG: `hadate`${*}: ${HA_LOGFACILITY}" >&2 fi } ha_parameter() { local VALUE VALUE=`sed -e 's%[ ][ ]*% %' -e 's%^ %%' -e 's%#.*%%' $HA_CF | grep -i "^$1 " | sed 's%[^ ]* %%'` if [ "X$VALUE" = X ] then case $1 in keepalive) VALUE=2;; deadtime) ka=`ha_parameter keepalive` VALUE=`expr $ka '*' 2 '+' 1`;; esac fi echo $VALUE } ocf_log() { # TODO: Revisit and implement internally. if [ $# -lt 2 ] then ocf_log err "Not enough arguments [$#] to ocf_log." fi __OCF_PRIO="$1" shift __OCF_MSG="$*" case "${__OCF_PRIO}" in crit) __OCF_PRIO="CRIT";; err) __OCF_PRIO="ERROR";; warn) __OCF_PRIO="WARNING";; info) __OCF_PRIO="INFO";; debug)__OCF_PRIO="DEBUG";; *) __OCF_PRIO=`echo ${__OCF_PRIO}| tr '[a-z]' '[A-Z]'`;; esac if [ "${__OCF_PRIO}" = "DEBUG" ]; then ha_debug "${__OCF_PRIO}: $__OCF_MSG" else ha_log "${__OCF_PRIO}: $__OCF_MSG" fi } # # ocf_deprecated: Log a deprecation warning # Usage: ocf_deprecated [param-name] # Arguments: param-name optional, name of a boolean resource # parameter that can be used to suppress # the warning (default # "ignore_deprecation") ocf_deprecated() { local param param=${1:-ignore_deprecation} # don't use ${!param} here, it's a bashism if ! ocf_is_true $(eval echo \$OCF_RESKEY_$param); then ocf_log warn "This resource agent is deprecated" \ "and may be removed in a future release." \ "See the man page for details." \ "To suppress this warning, set the \"${param}\"" \ "resource parameter to true." fi } # # Ocf_run: Run a script, and log its output. 
# Usage: ocf_run [-q] [-info|-warn|-err] # -q: don't log the output of the command if it succeeds # -info|-warn|-err: log the output of the command at given # severity if it fails (defaults to err) # ocf_run() { local rc local output local verbose=1 local loglevel=err local var for var in 1 2 do case "$1" in "-q") verbose="" shift 1;; "-info"|"-warn"|"-err") loglevel=`echo $1 | sed -e s/-//g` shift 1;; *) ;; esac done output=`"$@" 2>&1` rc=$? output=`echo $output` if [ $rc -eq 0 ]; then if [ "$verbose" -a ! -z "$output" ]; then ocf_log info "$output" fi return $OCF_SUCCESS else if [ ! -z "$output" ]; then ocf_log $loglevel "$output" else ocf_log $loglevel "command failed: $*" fi return $rc fi } ocf_pidfile_status() { local pid pidfile=$1 if [ ! -e $pidfile ]; then # Not exists return 2 fi pid=`cat $pidfile` kill -0 $pid 2>&1 > /dev/null if [ $? = 0 ]; then return 0 fi # Stale return 1 } ocf_take_lock() { local lockfile=$1 local rnd=$(ocf_maybe_random) sleep 0.$rnd while ocf_pidfile_status $lockfile do ocf_log info "Sleeping until $lockfile is released..." sleep 0.$rnd done echo $$ > $lockfile } ocf_release_lock_on_exit() { local lockfile=$1 trap "rm -f $lockfile" EXIT } # returns true if the CRM is currently running a probe. A probe is # defined as a monitor operation with a monitoring interval of zero. ocf_is_probe() { [ "$__OCF_ACTION" = "monitor" -a "$OCF_RESKEY_CRM_meta_interval" = 0 ] } # returns true if the resource is configured as a clone. This is # defined as a resource where the clone-max meta attribute is present, # and set to greater than zero. ocf_is_clone() { [ ! -z "${OCF_RESKEY_CRM_meta_clone_max}" ] && [ "${OCF_RESKEY_CRM_meta_clone_max}" -gt 0 ] } # returns true if the resource is configured as a multistate # (master/slave) resource. This is defined as a resource where the # master-max meta attribute is present, and set to greater than zero. ocf_is_ms() { [ ! 
-z "${OCF_RESKEY_CRM_meta_master_max}" ] && [ "${OCF_RESKEY_CRM_meta_master_max}" -gt 0 ] } # version check functions # allow . and - to delimit version numbers # max version number is 999 # letters and such are effectively ignored # ocf_is_ver() { echo $1 | grep '^[0-9][0-9.-]*[0-9]$' >/dev/null 2>&1 } ocf_ver2num() { echo $1 | awk -F'[.-]' ' {for(i=1; i<=NF; i++) s=s*1000+$i; print s} ' } ocf_ver_level(){ echo $1 | awk -F'[.-]' '{print NF}' } ocf_ver_complete_level(){ local ver="$1" local level="$2" local i=0 while [ $i -lt $level ]; do ver=${ver}.0 i=`expr $i + 1` done echo $ver } # usage: ocf_version_cmp VER1 VER2 # version strings can contain digits, dots, and dashes # must start and end with a digit # returns: # 0: VER1 smaller (older) than VER2 # 1: versions equal # 2: VER1 greater (newer) than VER2 # 3: bad format ocf_version_cmp() { ocf_is_ver "$1" || return 3 ocf_is_ver "$2" || return 3 local v1=$1 local v2=$2 local v1_level=`ocf_ver_level $v1` local v2_level=`ocf_ver_level $v2` local level_diff if [ $v1_level -lt $v2_level ]; then level_diff=`expr $v2_level - $v1_level` v1=`ocf_ver_complete_level $v1 $level_diff` elif [ $v1_level -gt $v2_level ]; then level_diff=`expr $v1_level - $v2_level` v2=`ocf_ver_complete_level $v2 $level_diff` fi v1=`ocf_ver2num $v1` v2=`ocf_ver2num $v2` if [ $v1 -eq $v2 ]; then return 1 elif [ $v1 -lt $v2 ]; then return 0 else return 2 # -1 would look funny in shell ;-) fi } # usage: dirname DIR dirname() { local a local b [ $# = 1 ] || return 1 a="$1" while [ 1 ]; do b="${a%/}" [ "$a" = "$b" ] && break a="$b" done b=${a%/*} [ -z "$b" -o "$a" = "$b" ] && b="." echo "$b" return 0 } # # pseudo_resource status tracking function... # # This allows pseudo resources to give correct status information. As we add # resource monitoring, and better resource tracking in general, this will # become essential. # # These scripts work because ${HA_RSCTMP} is cleaned out every time # heartbeat is started. 
# # We create "resource-string" tracking files under ${HA_RSCTMP} in a # very simple way: # # Existence of "${HA_RSCTMP}/resource-string" means that we consider # the resource named by "resource-string" to be running. # # Note that "resource-string" needs to be unique. Using the resource type # plus the resource instance arguments to make up the resource string # is probably sufficient... # # usage: ha_pseudo_resource resource-string op [tracking_file] # where op is {start|stop|monitor|status|restart|reload|print} # print is a special op which just prints the tracking file location # user can override our choice of the tracking file location by # specifying it as the third arg # Note that all operations are silent... # ha_pseudo_resource() { local ha_resource_tracking_file="${3:-${HA_RSCTMP}/$1}" case $2 in start|restart|reload) touch "$ha_resource_tracking_file";; stop) rm -f "$ha_resource_tracking_file";; status|monitor) if [ -f "$ha_resource_tracking_file" ] then return 0 else case $2 in status) return 3;; *) return 7;; esac fi;; print) echo "$ha_resource_tracking_file";; *) return 3;; esac } # usage: rmtempdir TMPDIR rmtempdir() { [ $# = 1 ] || return 1 if [ -e "$1" ]; then rmdir "$1" || return 1 fi return 0 } # usage: maketempfile [-d] maketempfile() { if [ $# = 1 -a "$1" = "-d" ]; then mktemp -d return -0 elif [ $# != 0 ]; then return 1 fi mktemp return 0 } # usage: rmtempfile TMPFILE rmtempfile () { [ $# = 1 ] || return 1 if [ -e "$1" ]; then rm "$1" || return 1 fi return 0 } +# echo the first lower supported check level +# pass set of levels supported by the agent +# (in increasing order, 0 is optional) +ocf_check_level() +{ + local lvl prev + lvl=0 + prev=0 + if ocf_is_decimal "$OCF_CHECK_LEVEL"; then + # the level list should be very short + for lvl; do + if [ "$lvl" -eq "$OCF_CHECK_LEVEL" ]; then + break + elif [ "$lvl" -gt "$OCF_CHECK_LEVEL" ]; then + lvl=$prev # the previous one + break + fi + prev=$lvl + done + fi + echo $lvl +} + +# usage: 
ocf_stop_processes SIGNALS WAIT_TIME PIDS +# +# we send signals (use quotes for more than one!) in the order +# given; if one or more processes are still running we try KILL; +# the wait_time is the _total_ time we'll spend in this function +# this time may be slightly exceeded if the processes won't leave +# +# returns: +# 0: all processes left +# 1: some processes still running +# +# example: +# +# ocf_stop_processes TERM 5 $pids +# +ocf_stop_processes() { + local signals="$1" + local wait_time="$(($2/`echo $signals|wc -w`))" + shift 2 + local pids="$*" + local sig i + test -z "$pids" && + return 0 + for sig in $signals KILL; do + kill -s $sig $pids 2>/dev/null + # try to leave early, and yet leave processes time to exit + sleep 0.2 + for i in `seq $wait_time`; do + kill -s 0 $pids 2>/dev/null || + return 0 + sleep 1 + done + done + return 1 +} + __ocf_set_defaults "$@" diff --git a/heartbeat/ora-common.sh b/heartbeat/ora-common.sh new file mode 100644 index 000000000..f52dbc577 --- /dev/null +++ b/heartbeat/ora-common.sh @@ -0,0 +1,79 @@ +# ora-common.sh +# +# Description: Common code for oracle and oralsnr resource agents +# +# +# Author: Dejan Muhamedagic +# Support: linux-ha@lists.linux-ha.org +# License: GNU General Public License (GPL) +# Copyright: (C) 2012 Dejan Muhamedagic, SUSE/Attachmate +# + +# Gather up information about our oracle instance + +rmtmpfiles() { + rm -f $TMPFILES +} + +ora_common_getconfig() { + ORACLE_SID=$1 + ORACLE_HOME=$2 + ORACLE_OWNER=$3 + + # get ORACLE_HOME from /etc/oratab if not set + [ x = "x$ORACLE_HOME" ] && + ORACLE_HOME=`awk -F: "/^$ORACLE_SID:/"'{print $2}' /etc/oratab` + + # there a better way to find out ORACLE_OWNER? + [ x = "x$ORACLE_OWNER" ] && + ORACLE_OWNER=`ls -ld $ORACLE_HOME/. 
2>/dev/null | awk 'NR==1{print $3}'` + + LD_LIBRARY_PATH=$ORACLE_HOME/lib + LIBPATH=$ORACLE_HOME/lib + TNS_ADMIN=$ORACLE_HOME/network/admin + PATH=$ORACLE_HOME/bin:$ORACLE_HOME/dbs:$PATH + export ORACLE_SID ORACLE_HOME ORACLE_OWNER TNS_ADMIN + export LD_LIBRARY_PATH LIBPATH + + ORA_ENVF=`mktemp` + dumporaenv > $ORA_ENVF + chmod 644 $ORA_ENVF + TMPFILES="$ORA_ENVF" + trap "rmtmpfiles" EXIT +} + +ora_common_validate_all() { + # Let's make sure a few important things are set... + if [ x = "x$ORACLE_HOME" ]; then + ocf_log info "ORACLE_HOME not set" + return $OCF_ERR_INSTALLED + fi + if [ x = "x$ORACLE_OWNER" ]; then + ocf_log info "ORACLE_OWNER not set" + return $OCF_ERR_INSTALLED + fi + + US=`id -u -n` + if [ $US != root -a $US != $ORACLE_OWNER ] + then + ocf_log err "$0 must be run as root or $ORACLE_OWNER" + return $OCF_ERR_PERM + fi + return 0 +} + +dumporaenv() { +cat< 1.0 Resource script for oracle. Manages an Oracle Database instance as an HA resource. Manages an Oracle Database instance The Oracle SID (aka ORACLE_SID). sid The Oracle home directory (aka ORACLE_HOME). If not specified, then the SID along with its home should be listed in /etc/oratab. home The Oracle owner (aka ORACLE_OWNER). If not specified, then it is set to the owner of file \$ORACLE_HOME/dbs/*\${ORACLE_SID}.ora. If this does not work for you, just set it explicitely. user Sometimes IPC objects (shared memory segments and semaphores) belonging to an Oracle instance might be left behind which prevents the instance from starting. It is not easy to figure out which shared segments belong to which instance, in particular when more instances are running as same user. What we use here is the "oradebug" feature and its "ipc" trace utility. It is not optimal to parse the debugging information, but I am not aware of any other way to find out about the IPC information. In case the format or wording of the trace report changes, parsing might fail. 
There are some precautions, however, to prevent stepping on other peoples toes. There is also a dumpinstipc option which will make us print the IPC objects which belong to the instance. Use it to see if we parse the trace file correctly. Three settings are possible: - none: don't mess with IPC and hope for the best (beware: you'll probably be out of luck, sooner or later) - instance: try to figure out the IPC stuff which belongs to the instance and remove only those (default; should be safe) - orauser: remove all IPC belonging to the user which runs the instance (don't use this if you run more than one instance as same user or if other apps running as this user use IPC) The default setting "instance" should be safe to use, but in that case we cannot guarantee that the instance will start. In case IPC objects were already left around, because, for instance, someone mercilessly killing Oracle processes, there is no way any more to find out which IPC objects should be removed. In that case, human intervention is necessary, and probably _all_ instances running as same user will have to be stopped. The third setting, "orauser", guarantees IPC objects removal, but it does that based only on IPC objects ownership, so you should use that only if every instance runs as separate user. Please report any problems. Suggestions/fixes welcome. ipcrm The clear of the backup mode of ORACLE. clear_backupmode How to stop Oracle is a matter of taste it seems. The default method ("checkpoint/abort") is: alter system checkpoint; shutdown abort; This should be the fastest safe way bring the instance down. If you find "shutdown abort" distasteful, set this attribute to "immediate" in which case we will shutdown immediate; If you still think that there's even better way to shutdown an Oracle instance we are willing to listen. shutdown_method END } # # methods: What methods/operations do we support? # oracle_methods() { cat <<-! 
start stop status monitor dumpinstipc showdbstat cleanup validate-all methods meta-data usage ! } - -# Gather up information about our oracle instance - -ora_info() { - ORACLE_SID=$1 - ORACLE_HOME=$2 - ORACLE_OWNER=$3 - - # get ORACLE_HOME from /etc/oratab if not set - [ x = "x$ORACLE_HOME" ] && - ORACLE_HOME=`awk -F: "/^$ORACLE_SID:/"'{print $2}' /etc/oratab` - - # there a better way to find out ORACLE_OWNER? - [ x = "x$ORACLE_OWNER" ] && - ORACLE_OWNER=`ls -ld $ORACLE_HOME/. 2>/dev/null | awk 'NR==1{print $3}'` - - sqlplus=$ORACLE_HOME/bin/sqlplus - lsnrctl=$ORACLE_HOME/bin/lsnrctl - tnsping=$ORACLE_HOME/bin/tnsping -} - -testoraenv() { - # Let's make sure a few important things are set... - if [ x = "x$ORACLE_HOME" ]; then - ocf_log info "ORACLE_HOME not set" - return $OCF_ERR_CONFIGURED - fi - if [ x = "x$ORACLE_OWNER" ]; then - ocf_log info "ORACLE_OWNER not set" - return $OCF_ERR_CONFIGURED - fi - # and some important things are there - if [ ! -x "$sqlplus" ]; then - ocf_log info "$sqlplus does not exist" - return $OCF_ERR_INSTALLED - fi - if [ ! -x "$lsnrctl" ]; then - ocf_log err "$lsnrctl does not exist" - return $OCF_ERR_INSTALLED - fi - if [ ! 
-x "$tnsping" ]; then - ocf_log err "$tnsping does not exist" - return $OCF_ERR_INSTALLED - fi - return 0 -} - -setoraenv() { - LD_LIBRARY_PATH=$ORACLE_HOME/lib - LIBPATH=$ORACLE_HOME/lib - TNS_ADMIN=$ORACLE_HOME/network/admin - PATH=$ORACLE_HOME/bin:$ORACLE_HOME/dbs:$PATH - export ORACLE_SID ORACLE_HOME ORACLE_OWNER TNS_ADMIN - export LD_LIBRARY_PATH LIBPATH -} -dumporaenv() { -cat</dev/null && + return 0 + output=`dbasql mk_mon_user show_mon_user` + if echo "$output" | grep -w "^$MONUSR" >/dev/null; then + return 0 + else + ocf_log err "could not create $MONUSR oracle user" + ocf_log err "sqlplus output: $output" + return 1 + fi +} # # print the output of dbstat (for debugging) # showdbstat() { echo "Full output:" dbstat | execsql echo "Stripped output:" echo "<`dbasql dbstat`>" } # # IPC stuff: not overly complex, but quite involved :-/ # -IPC_LOCKFILE=${HA_RSCTMP}/oracle_ipc.lock # Part 1: Oracle +other_trace_junk() { + echo $1 | sed 's/trc$/trm/' +} dumpinstipc() { - local dumpdest=`dbasql_one getdumpdest` - if [ "x$dumpdest" = x -o ! -d "$dumpdest" ]; then - ocf_log warn "$dumpdest is not a directory" - return 1 - fi - ocf_take_lock $IPC_LOCKFILE - local fcount=`ls -rt $dumpdest | grep '\.trc$' | wc -l` - output=`dbasql getipc` - local lastf=`ls -rt $dumpdest | grep '\.trc$' | tail -1` - local fcount2=`ls -rt $dumpdest | grep '\.trc$' | wc -l` - rm -f $IPC_LOCKFILE - if [ $((fcount+1)) -eq $fcount2 ]; then - echo $dumpdest/$lastf + local output tracef + output=`dbasql getipc` # filename in the 2nd line + tracef=`echo "$output" | awk 'NR==2' | grep '^/.*trc$'` + if [ "$tracef" ]; then + echo $tracef else ocf_log warn "'dbasql getipc' failed: $output" return 1 fi } parseipc() { local inf=$1 if [ ! 
-f "$1" ]; then ocf_log warn "$1: no such ipc trace file" return 1 fi awk ' $3 == "Shmid" {n=1;next} n { if( $3~/^[0-9]+$/ ) print $3; n=0 } ' $inf | sort -u | sed 's/^/m:/' awk ' /Semaphore List/ {insems=1;next} insems { for( i=1; i<=NF; i++ ) if( $i~/^[0-9]+$/ ) print $i; } /system semaphore information/ {exit} ' $inf | sort -u | sed 's/^/s:/' + TMPFILES="$TMPFILES $inf `other_trace_junk $inf`" } # Part 2: OS (ipcs,ipcrm) filteroraipc() { # this portable? grep -w $ORACLE_OWNER | awk '{print $2}' } ipcdesc() { local what=$1 case $what in m) echo "shared memory segment";; s) echo "semaphore";; q) echo "message queue";; esac } rmipc() { local what=$1 id=$2 ipcs -$what | filteroraipc | grep -w $id >/dev/null 2>&1 || return ocf_log info "Removing `ipcdesc $what` $id." ipcrm -$what $id } ipcrm_orauser() { local what id for what in m s q; do for id in `ipcs -$what | filteroraipc`; do rmipc $what $id done done } ipcrm_instance() { local ipcobj for ipcobj; do rmipc `echo $ipcobj | sed 's/:/ /'` done } # # oracle_status: is the Oracle instance running? # # quick check to see if the instance is up -is_oracle_up() { +is_proc_running() { ps -ef | grep -wiqs "[^ ]*[_]pmon_${ORACLE_SID}" } # instance in OPEN state? instance_live() { - local status=`dbasql_one dbstat` + local status=`monsql_one dbstat` if [ "$status" = OPEN ]; then return 0 else ocf_log info "$ORACLE_SID instance state is not OPEN (dbstat output: $status)" return 1 fi } ora_cleanup() { #rm -fr /tmp/.oracle #??? 
rm -f `ls $ORACLE_HOME/dbs/lk* | grep -i "$ORACLE_SID\$"` #return case $IPCRM in none) ;; instance) ipcrm_instance $* ;; orauser) ipcrm_orauser $* ;; - *) - ocf_log warn "bad usage: ipcrm set to $IPCRM" - ;; esac } +oracle_getconfig() { + ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" + + clear_backupmode=${OCF_RESKEY_clear_backupmode:-"false"} + shutdown_method=${OCF_RESKEY_shutdown_method:-"checkpoint/abort"} + IPCRM=${OCF_RESKEY_ipcrm:-"instance"} +} + # # oracle_start: Start the Oracle instance # # NOTE: We handle instance in the MOUNTED and STARTED states # efficiently # We *do not* handle instance in the restricted or read-only # mode, i.e. it appears as running, but its availability is # "not for general use" # oracle_start() { local status output - if is_oracle_up; then - status="`dbasql_one dbstat`" + if is_proc_running; then + status="`monsql_one dbstat`" case "$status" in "OPEN") : nothing to be done, we can leave right now ocf_log info "Oracle instance $ORACLE_SID already running" return $OCF_SUCCESS ;; "STARTED") output=`dbasql dbmount` ;; "MOUNTED") : we proceed if mounted ;; *) # status unknown output=`dbasql dbstop dbstart_mount` ;; esac else output="`dbasql dbstart_mount`" # try to cleanup in case of # ORA-01081: cannot start already-running ORACLE - shut it down first if echo "$output" | grep ORA-01081 >/dev/null 2>&1; then ocf_log info "ORA-01081 error found, trying to cleanup oracle (dbstart_mount output: $output)" ora_cleanup output=`dbasql dbstart_mount` fi fi # oracle instance should be mounted. status="`dbasql_one dbstat`" case "$status" in "MOUNTED") ;; *) : error!! ocf_log err "oracle $ORACLE_SID can not be mounted (status: $status)" return $OCF_ERR_GENERIC ;; esac # It is examined whether mode is "online backup mode", # and if it is true, makes clear the mode. # Afterwards, DB is opened. if is_clear_backupmode_set && is_instance_in_backup_mode; then clear_backup_mode fi output=`dbasql dbopen` - if ! 
is_oracle_up; then + # check/create the monitor user + if ! check_mon_user; then + return $OCF_ERR_GENERIC + fi + + if ! is_proc_running; then ocf_log err "oracle process not running: $output" return $OCF_ERR_GENERIC elif ! instance_live; then ocf_log err "oracle instance $ORACLE_SID not started: $output" return $OCF_ERR_GENERIC else : cool, we are up and running ocf_log info "Oracle instance $ORACLE_SID started: $output" return $OCF_SUCCESS fi } # # oracle_stop: Stop the Oracle instance # oracle_stop() { local status output ipc="" - if is_oracle_up; then + if is_proc_running; then [ "$IPCRM" = "instance" ] && ipc=$(parseipc `dumpinstipc`) output=`dbasql dbstop` else ocf_log info "Oracle instance $ORACLE_SID already stopped" return $OCF_SUCCESS fi - ora_kill # kill any processes left - if is_oracle_up; then + ocf_stop_processes TERM $PROCS_CLEANUP_TIME `proc_pids` # kill the procs if they hanged + if is_proc_running; then ocf_log err "Oracle instance $ORACLE_SID not stopped: $output" return $OCF_ERR_GENERIC else ocf_log info "Oracle instance $ORACLE_SID stopped: $output" sleep 1 # give em a chance to cleanup ocf_log info "Cleaning up for $ORACLE_SID" ora_cleanup "$ipc" return $OCF_SUCCESS fi } -# kill the database processes (if any left) -# give them 30 secs to exit cleanly (6 times 5) -killprocs() { - local sig=$1 - shift 1 - # Record stderr - kill -s $sig $* >/dev/null -} -ora_kill() { - oraprocs=`eval $procs | awk '{print $1}'` - if [ -z "$oraprocs" ]; then - ocf_log debug "All oracle processes are already stopped." - return - fi - killprocs TERM $oraprocs - for i in 1 2 3 4 5; do - if [ -z "`eval $procs | awk '{print $1}'`" ]; then - ocf_log debug "All oracle processes are killed." - return - fi - sleep 5 - done - killprocs KILL `eval $procs | awk '{print $1}'` -} # # oracle_monitor: Can the Oracle instance do anything useful? # oracle_monitor() { - if ! is_oracle_up; then + if ! 
is_proc_running; then ocf_log info "oracle process not running" return $OCF_NOT_RUNNING fi if ! instance_live; then - ocf_log info "oracle instance $ORACLE_SID is down" - return $OCF_NOT_RUNNING + ocf_log err "oracle instance $ORACLE_SID is down" + return $OCF_ERR_GENERIC fi #ocf_log info "Oracle instance $ORACLE_SID is alive" return $OCF_SUCCESS } -# -# 'main' starts here... -# - -if [ $# -ne 1 ] -then - usage - exit $OCF_ERR_ARGS -fi - -# These operations don't require OCF instance parameters to be set -case "$1" in - meta-data) meta_data - exit $OCF_SUCCESS;; - - usage) usage - exit $OCF_SUCCESS;; - - methods) oracle_methods - exit $?;; - - *);; -esac - -clear_backupmode=${OCF_RESKEY_clear_backupmode:-"false"} -shutdown_method=${OCF_RESKEY_shutdown_method:-"checkpoint/abort"} - -case "${shutdown_method}" in -"immediate") ;; -"checkpoint/abort") ;; -*) ocf_log err "unsupported shutdown_method, please read meta-data" -esac - -if [ x = "x$OCF_RESKEY_sid" ] -then - ocf_log err "Please set OCF_RESKEY_sid to the Oracle SID !" - exit $OCF_ERR_ARGS -fi - -ora_info "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" - -LSB_STATUS_STOPPED=3 -testoraenv -rc=$? -if [ $rc -ne 0 ]; then - ocf_log info "Oracle environment for SID $ORACLE_SID does not exist" - case "$1" in - stop) exit $OCF_SUCCESS;; - monitor) exit $OCF_NOT_RUNNING;; - status) exit $LSB_STATUS_STOPPED;; - *) - ocf_log err "Oracle environment for SID $ORACLE_SID broken" - exit $rc - ;; - esac -fi - -setoraenv # important: set the environment for the SID -envtmpf=`mktemp` -dumporaenv > $envtmpf -chmod 644 $envtmpf -TMPFILES="$envtmpf" -rmtmpfiles() { - rm -f $TMPFILES - # our lock file? 
- if [ "$$" = "`cat $IPC_LOCKFILE 2>/dev/null`" ]; then - rm -f $IPC_LOCKFILE +# other supported actions +oracle_status() { + if is_proc_running + then + echo Oracle instance $ORACLE_SID is running + exit $OCF_SUCCESS + else + echo Oracle instance $ORACLE_SID is stopped + exit $OCF_NOT_RUNNING fi } -trap "rmtmpfiles" EXIT -procs="ps -e -o pid,args | grep -i \"[o]ra[a-zA-Z0-9_]*$ORACLE_SID\$\"" - -US=`id -u -n` -if [ $US != root -a $US != $ORACLE_OWNER ] -then - ocf_log err "$0 must be run as root or $ORACLE_OWNER" - exit $OCF_ERR_PERM -fi - -if [ x = "x$OCF_RESKEY_ipcrm" ] -then - IPCRM="instance" -else - IPCRM="$OCF_RESKEY_ipcrm" -fi - -# What kind of method was invoked? -case "$1" in - - start) oracle_start - exit $?;; - - stop) oracle_stop - exit $?;; - - status) if is_oracle_up - then - echo Oracle instance $ORACLE_SID is running - exit $OCF_SUCCESS - else - echo Oracle instance $ORACLE_SID is stopped - exit $OCF_NOT_RUNNING - fi +oracle_dumpinstipc() { + is_proc_running && parseipc `dumpinstipc` +} +oracle_showdbstat() { + showdbstat +} +oracle_cleanup() { + if [ "$IPCRM" = "instance" ]; then + ora_cleanup $(parseipc `dumpinstipc`) + else + ora_cleanup + fi +} +oracle_validate_all() { + case "${shutdown_method}" in + "immediate") ;; + "checkpoint/abort") ;; + *) ocf_log err "unsupported shutdown_method, please read meta-data" + return $OCF_ERR_CONFIGURED ;; + esac - dumpinstipc) - is_oracle_up && parseipc `dumpinstipc` - exit $?;; - - showdbstat) - showdbstat - exit $?;; - - cleanup) - if [ "$IPCRM" = "instance" ]; then - ora_cleanup $(parseipc `dumpinstipc`) - else - ora_cleanup - fi - exit $?;; + case "${IPCRM}" in + "none"|"instance"|"orauser") ;; + *) ocf_log err "unsupported ipcrm setting, please read meta-data" + return $OCF_ERR_CONFIGURED + ;; + esac - monitor) oracle_monitor - exit $?;; + ora_common_validate_all +} - validate-all) # OCF_RESKEY_sid was already checked by testoraenv(), - # just exit successfully here. 
- exit $OCF_SUCCESS;; +# used in ora-common.sh +show_procs() { + ps -e -o pid,args | grep -i "[o]ra[a-zA-Z0-9_]*$ORACLE_SID$" +} +proc_pids() { show_procs | awk '{print $1}'; } +PROCS_CLEANUP_TIME="30" - *) oracle_methods - exit $OCF_ERR_UNIMPLEMENTED;; -esac +MONUSR="OCFMON" +OCF_REQUIRED_PARAMS="sid" +OCF_REQUIRED_BINARIES="sqlplus" +ocf_rarun $* # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/oralsnr b/heartbeat/oralsnr index bc85abe01..97f236053 100755 --- a/heartbeat/oralsnr +++ b/heartbeat/oralsnr @@ -1,429 +1,269 @@ #!/bin/sh # # # oralsnr # # Description: Manages an Oracle Listener as a High-Availability # resource # # # Author: Dejan Muhamedagic # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # Copyright: (C) 2006 International Business Machines, Inc. # # This code inspired by the DB2 resource script # written by Alan Robertson # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 oralsnr::sid::home::user::listener # -# See usage() function below for more details... +# See oralsnr_usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_sid (mandatory; for the monitor op) # OCF_RESKEY_home (optional; else read it from /etc/oratab) # OCF_RESKEY_user (optional; user to run the listener) # OCF_RESKEY_listener (optional; defaults to LISTENER) # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. ${OCF_FUNCTIONS_DIR}/ora-common.sh ####################################################################### SH=/bin/sh -usage() { +oralsnr_usage() { methods=`oralsnr_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 ($methods) $0 manages an Oracle Database instance as an HA resource. The 'start' operation starts the database. The 'stop' operation stops the database. 
The 'status' operation reports whether the database is running The 'monitor' operation reports whether the database seems to be working The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } -meta_data() { +oralsnr_meta_data() { cat < 1.0 Resource script for Oracle Listener. It manages an Oracle Listener instance as an HA resource. Manages an Oracle TNS listener The Oracle SID (aka ORACLE_SID). Necessary for the monitor op, i.e. to do tnsping SID. sid The Oracle home directory (aka ORACLE_HOME). If not specified, then the SID should be listed in /etc/oratab. home Run the listener as this user. user Listener instance to be started (as defined in listener.ora). Defaults to LISTENER. listener END } # # methods: What methods/operations do we support? # oralsnr_methods() { cat <<-! start stop status monitor validate-all methods meta-data usage ! } - -# Gather up information about our oralsnr instance - -ora_info() { - ORACLE_SID=$1 - ORACLE_HOME=$2 - ORACLE_OWNER=$3 - - # get ORACLE_HOME from /etc/oratab if not set - [ x = "x$ORACLE_HOME" ] && - ORACLE_HOME=`awk -F: "/^$ORACLE_SID:/"'{print $2}' /etc/oratab` - - # there a better way to find out ORACLE_OWNER? - [ x = "x$ORACLE_OWNER" ] && - ORACLE_OWNER=`ls -ld $ORACLE_HOME/. 2>/dev/null | awk 'NR==1{print $3}'` - - sqlplus=$ORACLE_HOME/bin/sqlplus - lsnrctl=$ORACLE_HOME/bin/lsnrctl - tnsping=$ORACLE_HOME/bin/tnsping -} - -testoraenv() { - # Let's make sure a few important things are set... - if [ x = "x$ORACLE_HOME" ]; then - ocf_log info "ORACLE_HOME not set" - return $OCF_ERR_CONFIGURED - fi - if [ x = "x$ORACLE_OWNER" ]; then - ocf_log info "ORACLE_OWNER not set" - return $OCF_ERR_CONFIGURED - fi - # and some important things are there - if [ ! -x "$sqlplus" ]; then - ocf_log info "$sqlplus does not exist" - return $OCF_ERR_INSTALLED - fi - if [ ! 
-x "$lsnrctl" ]; then - ocf_log err "$lsnrctl does not exist" - return $OCF_ERR_INSTALLED - fi - if [ ! -x "$tnsping" ]; then - ocf_log err "$tnsping does not exist" - return $OCF_ERR_INSTALLED - fi - return 0 -} - -setoraenv() { - LD_LIBRARY_PATH=$ORACLE_HOME/lib - LIBPATH=$ORACLE_HOME/lib - TNS_ADMIN=$ORACLE_HOME/network/admin - PATH=$ORACLE_HOME/bin:$ORACLE_HOME/dbs:$PATH - export ORACLE_SID ORACLE_HOME ORACLE_OWNER TNS_ADMIN - export LD_LIBRARY_PATH LIBPATH -} -dumporaenv() { -cat</dev/null -} # -# is_oralsnr_up: is listener process running? -# oralsnr_status: is the listener running? +# is_proc_running: is the listener running? # -is_oralsnr_up() { - [ x != "x`eval $procs`" ] +is_proc_running() { + show_procs | grep "." > /dev/null } -oralsnr_status() { - output=`$lsnrctl status $listener` - echo "$output" | tail -1 | grep -qs 'completed successfully' - RET=$? - if [ $RET -ne 0 ]; then - ocf_log info "$listener status failed: $output" +# the following two should be run only if the process is running +test_listener() { + local output + output=`lsnrctl status $listener` + if echo "$output" | tail -1 | grep -qs 'completed successfully' + then + return $OCF_SUCCESS + else + ocf_log err "$listener status failed: $output" + return $OCF_ERR_GENERIC fi - return $RET } # and does it work? -tnsping() { - output=`$tnsping $ORACLE_SID` - echo "$output" | tail -1 | grep -qs '^OK' - RET=$? - if [ $RET -ne 0 ]; then - ocf_log info "$tnsping $ORACLE_SID failed: $output" +test_tnsping() { + local output + output=`tnsping $ORACLE_SID` + if echo "$output" | tail -1 | grep -qs '^OK'; then + return $OCF_SUCCESS + else + ocf_log err "tnsping $ORACLE_SID failed: $output" + return $OCF_ERR_GENERIC fi - return $RET } # # oralsnr_monitor: Can we connect to the listener? 
# oralsnr_monitor() { - if oralsnr_status && tnsping - then - : good - #ocf_log info "Listener $listener running" - return $OCF_SUCCESS + if is_proc_running; then + test_listener && test_tnsping else - ocf_log info "Listener $listener not running" return $OCF_NOT_RUNNING fi } -# -# 'main' starts here... -# - -if [ $# -ne 1 ] -then - usage - exit $OCF_ERR_ARGS -fi - -# These operations don't require OCF instance parameters to be set -case "$1" in - meta-data) meta_data - exit $OCF_SUCCESS;; - - usage) usage - exit $OCF_SUCCESS;; - - methods) oralsnr_methods - exit $?;; - - *);; -esac - -if [ x = "x$OCF_RESKEY_sid" ] -then - ocf_log err "Please set OCF_RESKEY_sid to the Oracle SID !" - exit $OCF_ERR_ARGS -fi - -ora_info "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" - -LSB_STATUS_STOPPED=3 -testoraenv -rc=$? -if [ $rc -ne 0 ]; then - ocf_log info "Oracle environment for SID $ORACLE_SID does not exist" - case "$1" in - stop) exit $OCF_SUCCESS;; - monitor) exit $OCF_NOT_RUNNING;; - status) exit $LSB_STATUS_STOPPED;; - *) - ocf_log err "Oracle environment for SID $ORACLE_SID broken" - exit $rc - ;; - esac -fi - -setoraenv # important: set the environment for the SID -envtmpf=`mktemp` -dumporaenv > $envtmpf -chmod 644 $envtmpf -trap "rm -f $envtmpf" EXIT - -# -# default listener is "LISTENER" -# -listener=${OCF_RESKEY_listener:-"LISTENER"} -# how to get listener processes -procs="ps -e -o pid,user,args | grep '[t]nslsnr' | grep -w $listener | grep -w $ORACLE_OWNER" - -US=`id -u -n` -if [ $US != root -a $US != $ORACLE_OWNER ] -then - ocf_log err "$0 must be run as root or $ORACLE_OWNER" - exit $OCF_ERR_PERM -fi - -# What kind of method was invoked? 
-case "$1" in - - start) oralsnr_start - exit $?;; - - stop) oralsnr_stop - exit $?;; +oralsnr_status() { + if is_proc_running + then + echo Listener $listener is running + exit $OCF_SUCCESS + else + echo Listener $listener is stopped + exit $OCF_NOT_RUNNING + fi +} - status) if oralsnr_status - then - echo Listener $listener is running - exit $OCF_SUCCESS - else - echo Listener $listener is stopped - exit $OCF_NOT_RUNNING - fi - ;; +oralsnr_getconfig() { + ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" + listener=${OCF_RESKEY_listener:-"LISTENER"} +} - monitor) oralsnr_monitor - exit $?;; +oralsnr_validate_all() { + ora_common_validate_all +} - validate-all) # OCF_RESKEY_sid was already checked by ora_info(), - # just exit successfully here. - exit $OCF_SUCCESS;; +# used in ora-common.sh +show_procs() { + ps -e -o pid,user,args | + grep '[t]nslsnr' | grep -w "$listener" | grep -w "$ORACLE_OWNER" +} +proc_pids() { show_procs | awk '{print $1}'; } +PROCS_CLEANUP_TIME="10" - *) oralsnr_methods - exit $OCF_ERR_UNIMPLEMENTED;; -esac +OCF_REQUIRED_PARAMS="sid" +OCF_REQUIRED_BINARIES="lsnrctl tnsping" +ocf_rarun $* # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/pgsql b/heartbeat/pgsql index adecd46c4..b57488de4 100755 --- a/heartbeat/pgsql +++ b/heartbeat/pgsql @@ -1,1733 +1,1724 @@ #!/bin/sh # # Description: Manages a PostgreSQL Server as an OCF High-Availability # resource # # Authors: Serge Dubrouski (sergeyfd@gmail.com) -- original RA # Florian Haas (florian@linbit.com) -- makeover # Takatoshi MATSUO (matsuo.tak@gmail.com) -- support replication # # Copyright: 2006-2012 Serge Dubrouski # and other Linux-HA contributors # License: GNU General Public License (GPL) # ############################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # # Get PostgreSQL Configuration parameter # get_pgsql_param() { - local config local param_name param_name=$1 - - #Check that config file exists - if [ -n "$OCF_RESKEY_config" ]; then - config=$OCF_RESKEY_config - else - config=$OCF_RESKEY_pgdata/postgresql.conf - fi - - check_config "$config" - [ $? -eq 0 ] || return - perl_code="if (/^\s*$param_name[\s=]+\s*(.*)$/) { \$dir=\$1; \$dir =~ s/\s*\#.*//; \$dir =~ s/^'(\S*)'/\$1/; print \$dir;}" - perl -ne "$perl_code" < $config + perl -ne "$perl_code" < $OCF_RESKEY_config } # Defaults OCF_RESKEY_pgctl_default=/usr/bin/pg_ctl OCF_RESKEY_psql_default=/usr/bin/psql OCF_RESKEY_pgdata_default=/var/lib/pgsql/data OCF_RESKEY_pgdba_default=postgres OCF_RESKEY_pghost_default="" OCF_RESKEY_pgport_default=5432 -OCF_RESKEY_config_default="" OCF_RESKEY_start_opt_default="" OCF_RESKEY_pgdb_default=template1 OCF_RESKEY_logfile_default=/dev/null OCF_RESKEY_stop_escalate_default=30 OCF_RESKEY_monitor_user_default="" OCF_RESKEY_monitor_password_default="" OCF_RESKEY_monitor_sql_default="select now();" # Defaults for replication OCF_RESKEY_rep_mode_default=none OCF_RESKEY_node_list_default="" OCF_RESKEY_restore_command_default="" +OCF_RESKEY_archive_cleanup_command_default="" +OCF_RESKEY_recovery_end_command_default="" OCF_RESKEY_master_ip_default="" OCF_RESKEY_repuser_default="postgres" OCF_RESKEY_primary_conninfo_opt_default="" OCF_RESKEY_tmpdir_default="/var/lib/pgsql/tmp" OCF_RESKEY_xlog_check_count_default="3" OCF_RESKEY_crm_attr_timeout_default="5" OCF_RESKEY_stop_escalate_in_slave_default=30 : ${OCF_RESKEY_pgctl=${OCF_RESKEY_pgctl_default}} : ${OCF_RESKEY_psql=${OCF_RESKEY_psql_default}} : ${OCF_RESKEY_pgdata=${OCF_RESKEY_pgdata_default}} : ${OCF_RESKEY_pgdba=${OCF_RESKEY_pgdba_default}} : ${OCF_RESKEY_pghost=${OCF_RESKEY_pghost_default}} : ${OCF_RESKEY_pgport=${OCF_RESKEY_pgport_default}} -: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: 
${OCF_RESKEY_config=${OCF_RESKEY_pgdata}/postgresql.conf} : ${OCF_RESKEY_start_opt=${OCF_RESKEY_start_opt_default}} : ${OCF_RESKEY_pgdb=${OCF_RESKEY_pgdb_default}} : ${OCF_RESKEY_logfile=${OCF_RESKEY_logfile_default}} : ${OCF_RESKEY_stop_escalate=${OCF_RESKEY_stop_escalate_default}} : ${OCF_RESKEY_monitor_user=${OCF_RESKEY_monitor_user_default}} : ${OCF_RESKEY_monitor_password=${OCF_RESKEY_monitor_password_default}} : ${OCF_RESKEY_monitor_sql=${OCF_RESKEY_monitor_sql_default}} + # for replication : ${OCF_RESKEY_rep_mode=${OCF_RESKEY_rep_mode_default}} : ${OCF_RESKEY_node_list=${OCF_RESKEY_node_list_default}} : ${OCF_RESKEY_restore_command=${OCF_RESKEY_restore_command_default}} +: ${OCF_RESKEY_archive_cleanup_command=${OCF_RESKEY_archive_cleanup_command_default}} +: ${OCF_RESKEY_recovery_end_command=${OCF_RESKEY_recovery_end_command_default}} : ${OCF_RESKEY_master_ip=${OCF_RESKEY_master_ip_default}} : ${OCF_RESKEY_repuser=${OCF_RESKEY_repuser_default}} : ${OCF_RESKEY_primary_conninfo_opt=${OCF_RESKEY_primary_conninfo_opt_default}} : ${OCF_RESKEY_tmpdir=${OCF_RESKEY_tmpdir_default}} : ${OCF_RESKEY_xlog_check_count=${OCF_RESKEY_xlog_check_count_default}} : ${OCF_RESKEY_crm_attr_timeout=${OCF_RESKEY_crm_attr_timeout_default}} : ${OCF_RESKEY_stop_escalate_in_slave=${OCF_RESKEY_stop_escalate_in_slave_default}} usage() { cat < 1.0 Resource script for PostgreSQL. It manages a PostgreSQL as an HA resource. Manages a PostgreSQL database instance Path to pg_ctl command. pgctl Start options (-o start_opt in pg_ctl). "-i -p 5432" for example. start_opt Additional pg_ctl options (-w, -W etc..). ctl_opt Path to psql command. psql Path to PostgreSQL data directory. pgdata User that owns PostgreSQL. pgdba Hostname/IP address where PostgreSQL is listening pghost Port where PostgreSQL is listening pgport PostgreSQL user that pgsql RA will user for monitor operations. If it's not set pgdba user will be used. monitor_user Password for monitor user. 
monitor_password SQL script that will be used for monitor operations. monitor_sql -Path to the PostgreSQL configuration file for the instance +Path to the PostgreSQL configuration file for the instance. Configuration file - + Database that will be used for monitoring. pgdb Path to PostgreSQL server log output file. logfile Unix socket directory for PostgeSQL socketdir Number of shutdown retries (using -m fast) before resorting to -m immediate stop escalation Replication mode(none(default)/async/sync). "async" and "sync" require PostgreSQL 9.1 or later. If you use async or sync, it requires node_list, master_ip, restore_command parameters, and needs setting postgresql.conf, pg_hba.conf up for replication. Please delete "include /../../rep_mode.conf" line in postgresql.conf when you switch from sync to async. rep_mode All node names. Please separate each node name with a space. This is required for replication. node list restore_command for recovery.conf. This is required for replication. restore_command + + +archive_cleanup_command for recovery.conf. +This is used for replication and is optional. + +archive_cleanup_command + + + + + +recovery_end_command for recovery.conf. +This is used for replication and is optional. + +recovery_end_command + + + Master's floating IP address to be connected from hot standby. This parameter is used for "primary_conninfo" in recovery.conf. This is required for replication. master ip User used to connect to the master server. This parameter is used for "primary_conninfo" in recovery.conf. This is required for replication. repuser primary_conninfo options of recovery.conf except host, port, user and application_name. This is optional for replication. primary_conninfo_opt Path to temporary directory. This is optional for replication. tmpdir Number of checking xlog on monitor before promote. This is optional for replication. xlog check count The timeout of crm_attribute forever update command. Default value is 5 seconds. 
This is optional for replication. The timeout of crm_attribute forever update command. Number of shutdown retries (using -m fast) before resorting to -m immediate in Slave state. This is optional for replication. stop escalation_in_slave EOF } # # Run the given command in the Resource owner environment... # runasowner() { local quietrun="" local loglevel="-err" local var for var in 1 2 do case "$1" in "-q") quietrun="-q" shift 1;; "warn"|"err") loglevel="-$1" shift 1;; *) ;; esac done ocf_run $quietrun $loglevel su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; $*" } # # Shell escape # escape_string() { echo "$*" | sed -e "s/'/'\\\\''/g" } # # methods: What methods/operations do we support? # pgsql_methods() { cat </dev/null 2>&1" return $? fi # No PID file false } # # pgsql_real_monitor # pgsql_real_monitor() { local loglevel local rc local output # Set the log level of the error message loglevel=${1:-err} if ! pgsql_status then ocf_log info "PostgreSQL is down" return $OCF_NOT_RUNNING fi if is_replication; then #Check replication state output=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ -Atc \"${CHECK_MS_SQL}\""` rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc $loglevel return $OCF_ERR_GENERIC fi case "$output" in f) ocf_log debug "PostgreSQL is running as a primary." if [ "$OCF_RESKEY_monitor_sql" = "$OCF_RESKEY_monitor_sql_default" ]; then return $OCF_RUNNING_MASTER fi ;; t) ocf_log debug "PostgreSQL is running as a hot standby." return $OCF_SUCCESS;; *) ocf_log err "$CHECK_MS_SQL output is $output" return $OCF_ERR_GENERIC;; esac fi OCF_RESKEY_monitor_sql=`escape_string "$OCF_RESKEY_monitor_sql"` runasowner -q $loglevel "$OCF_RESKEY_psql $psql_options \ -c '$OCF_RESKEY_monitor_sql'" rc=$? 
if [ $rc -ne 0 ]; then report_psql_error $rc $loglevel return $OCF_ERR_GENERIC fi if is_replication; then return $OCF_RUNNING_MASTER fi return $OCF_SUCCESS } pgsql_replication_monitor() { local rc - local rsc - local instance - local my_status - local data_status - local is_master="" rc=$1 if [ $rc -ne $OCF_SUCCESS -a $rc -ne "$OCF_RUNNING_MASTER" ]; then return $rc fi # If I am Master if [ $rc -eq $OCF_RUNNING_MASTER ]; then change_data_status "$NODENAME" "LATEST" change_pgsql_status "$NODENAME" "PRI" control_slave_status || return $OCF_ERR_GENERIC return $rc fi # I can't get master node name from $OCF_RESKEY_CRM_meta_notify_master_uname on monitor, # so I will get master node name using crm_mon -n - if output=`crm_mon -n1 | grep " Master"`; then - rsc=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1` - instance=0 - while : - do - if [ "$instance" -ge "$OCF_RESKEY_CRM_meta_clone_max" ]; then - break - fi - if echo "$output" | grep "${rsc}:${instance}"; then - is_master="yes" - break - fi - instance=`expr $instance + 1` - done - fi - - if [ ! -n "$is_master" ]; then + crm_mon -n1 | tr -d "\t" | grep -q "^${RESOURCE_NAME}:.* Master" + if [ $? -ne 0 ] ; then # If I am Slave and Master is not exist ocf_log info "Master does not exist." change_pgsql_status "$NODENAME" "HS:alone" have_master_right if [ $? -eq 0 ]; then rm -f ${XLOG_NOTE_FILE}.* fi else output=`$CRM_ATTR_FOREVER -N "$NODENAME" \ -n "$PGSQL_DATA_STATUS_ATTR" -G -q` if [ "$output" = "DISCONNECT" ]; then change_pgsql_status "$NODENAME" "HS:alone" fi fi return $rc } #pgsql_monitor: pgsql_real_monitor() wrapper for replication pgsql_monitor() { local rc pgsql_real_monitor rc=$? if ! is_replication; then return $rc else pgsql_replication_monitor $rc return $? fi } # pgsql_post_demote pgsql_post_demote() { DEMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname | sed "s/ /\n/g" | head -1` ocf_log debug "post-demote called. Demote uname is $DEMOTE_NODE" if [ "$DEMOTE_NODE" != "$NODENAME" ]; then if ! 
echo $OCF_RESKEY_CRM_meta_notify_master_uname | grep $NODENAME; then show_master_baseline change_pgsql_status "$NODENAME" "HS:alone" fi fi return $OCF_SUCCESS } pgsql_pre_promote() { local master_baseline local my_master_baseline local cmp_location local number_of_nodes # If my data is newer than new master's one, I fail my resource. PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ sed "s/ /\n/g" | head -1` - number_of_nodes=`echo $OCF_RESKEY_node_list | wc -w` + number_of_nodes=`echo $NODE_LIST | wc -w` if [ $number_of_nodes -ge 3 -a \ "$OCF_RESKEY_rep_mode" = "sync" -a \ "$PROMOTE_NODE" != "$NODENAME" ]; then master_baseline=`$CRM_ATTR_REBOOT -N "$PROMOTE_NODE" -n \ "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` if [ $? -eq 0 ]; then my_master_baseline=`$CRM_ATTR_REBOOT -N "$NODENAME" -n \ "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` # get older location cmp_location=`printf "$master_baseline\n$my_master_baseline\n" |\ sort | head -1` if [ "$cmp_location" != "$my_master_baseline" ]; then ocf_log err "My data is newer than new master's one. New master's location : $master_baseline" $CRM_FAILCOUNT -r $OCF_RESOURCE_INSTANCE -U $NODENAME -v INFINITY return $OCF_ERR_GENERIC fi fi fi return $OCF_SUCCESS } pgsql_notify() { local type="${OCF_RESKEY_CRM_meta_notify_type}" local op="${OCF_RESKEY_CRM_meta_notify_operation}" local rc if ! is_replication; then return $OCF_SUCCESS fi ocf_log debug "notify: ${type} for ${op}" case $type in pre) case $op in promote) pgsql_pre_promote return $? ;; esac ;; post) case $op in promote) delete_xlog_location PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ sed "s/ /\n/g" | head -1` if [ "$PROMOTE_NODE" != "$NODENAME" ]; then delete_master_baseline fi return $OCF_SUCCESS ;; demote) pgsql_post_demote return $? 
;; start|stop) if [ "$NODENAME " = "$OCF_RESKEY_CRM_meta_notify_master_uname" ]; then control_slave_status fi return $OCF_SUCCESS ;; esac ;; esac return $OCF_SUCCESS } control_slave_status() { local rc local data_status local target local all_data_status local tmp_data_status local node_name local number_of_nodes all_data_status=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ -Atc \"${CHECK_REPLICATION_STATE_SQL}\""` rc=$? if [ $rc -eq 0 ]; then if [ -n "$all_data_status" ]; then all_data_status=`echo $all_data_status | sed "s/\n/ /g"` fi else report_psql_error $rc warn return 1 fi - number_of_nodes=`echo $OCF_RESKEY_node_list | wc -w` - for target in $OCF_RESKEY_node_list; do + number_of_nodes=`echo $NODE_LIST | wc -w` + for target in $NODE_LIST; do if [ "$target" = "$NODENAME" ]; then continue fi data_status="DISCONNECT" if [ -n "$all_data_status" ]; then for tmp_data_status in $all_data_status; do node_name=`echo $tmp_data_status | cut -d "|" -f 1` state=`echo $tmp_data_status | cut -d "|" -f 2` sync_state=`echo $tmp_data_status | cut -d "|" -f 3` ocf_log debug "node=$node_name, state=$state, sync_state=$sync_state" if [ "$node_name" = "$target" ];then data_status="$state|$sync_state" break fi done fi case "$data_status" in "STREAMING|SYNC") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_PROMOTE" change_pgsql_status "$target" "HS:sync" ;; "STREAMING|ASYNC") change_data_status "$target" "$data_status" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then change_master_score "$target" "$CAN_NOT_PROMOTE" if ! is_sync_mode "$target"; then set_sync_mode "$target" fi else if [ $number_of_nodes -le 2 ]; then change_master_score "$target" "$CAN_PROMOTE" else # I can't determine which slave's data is newest in async mode. 
change_master_score "$target" "$CAN_NOT_PROMOTE" fi fi change_pgsql_status "$target" "HS:async" ;; "STREAMING|POTENTIAL") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" change_pgsql_status "$target" "HS:potential" ;; "DISCONNECT") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" if [ "$OCF_RESKEY_rep_mode" = "sync" ] && \ is_sync_mode "$target"; then set_async_mode "$target" fi ;; *) change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" if [ "$OCF_RESKEY_rep_mode" = "sync" ] && \ is_sync_mode "$target"; then set_async_mode "$target" fi change_pgsql_status "$target" "HS:connected" ;; esac done return 0 } have_master_right() { local old local new local output local data_status local node local mylocation local count local newestXlog local oldfile local newfile ocf_log debug "Checking if I have a master right." data_status=`$CRM_ATTR_FOREVER -N "$NODENAME" -n \ "$PGSQL_DATA_STATUS_ATTR" -G -q` if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ "$data_status" != "LATEST" ]; then ocf_log warn "My data is out-of-date. status=$data_status" return 1 fi else if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ "$data_status" != "STREAMING|ASYNC" -a \ "$data_status" != "LATEST" ]; then ocf_log warn "My data is out-of-date. status=$data_status" return 1 fi fi ocf_log info "My data status=$data_status." show_xlog_location if [ $? -ne 0 ]; then ocf_log err "Failed to show my xlog location." exit $OCF_ERR_GENERIC fi old=0 for count in `seq $OCF_RESKEY_xlog_check_count`; do if [ -f ${XLOG_NOTE_FILE}.$count ]; then old=$count continue fi break done new=`expr $old + 1` # get xlog locations of all nodes - for node in ${OCF_RESKEY_node_list}; do + for node in ${NODE_LIST}; do output=`$CRM_ATTR_REBOOT -N "$node" -n \ "$PGSQL_XLOG_LOC_NAME" -G -q 2>/dev/null` if [ $? 
-ne 0 ]; then ocf_log warn "Can't get $node xlog location." continue else ocf_log info "$node xlog location : $output" echo "$node $output" >> ${XLOG_NOTE_FILE}.${new} if [ "$node" = "$NODENAME" ]; then mylocation=$output fi fi done oldfile=`cat ${XLOG_NOTE_FILE}.${old} 2>/dev/null` newfile=`cat ${XLOG_NOTE_FILE}.${new} 2>/dev/null` if [ "$oldfile" != "$newfile" ]; then # reset counter rm -f ${XLOG_NOTE_FILE}.* printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 return 1 fi if [ "$new" -ge "$OCF_RESKEY_xlog_check_count" ]; then newestXlog=`printf "$newfile\n" | sort -t " " -k 2,3 -r | \ head -1 | cut -d " " -f 2` if [ "$newestXlog" = "$mylocation" ]; then ocf_log info "I have a master right." $CRM_MASTER -v $PROMOTE_ME return 0 fi change_data_status "$NODENAME" "DISCONNECT" ocf_log info "I don't have correct master data." # reset counter rm -f ${XLOG_NOTE_FILE}.* printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 fi return 1 } is_replication() { if [ "$OCF_RESKEY_rep_mode" != "none" ]; then return 0 fi return 1 } get_my_location() { local rc local output local replay_loc local receive_loc local output1 local output2 local log1 local log2 local newer_location output=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ -Atc \"${CHECK_XLOG_LOC_SQL}\""` rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc warn ocf_log err "Can't get my xlog location." 
return 1 fi replay_loc=`echo $output | cut -d "|" -f 1` receive_loc=`echo $output | cut -d "|" -f 2` output1=`echo "$replay_loc" | cut -d "/" -f 1` output2=`echo "$replay_loc" | cut -d "/" -f 2` log1=`printf "%08s\n" $output1 | sed "s/ /0/g"` log2=`printf "%08s\n" $output2 | sed "s/ /0/g"` replay_loc="${log1}${log2}" output1=`echo "$receive_loc" | cut -d "/" -f 1` output2=`echo "$receive_loc" | cut -d "/" -f 2` log1=`printf "%08s\n" $output1 | sed "s/ /0/g"` log2=`printf "%08s\n" $output2 | sed "s/ /0/g"` receive_loc="${log1}${log2}" newer_location=`printf "$replay_loc\n$receive_loc" | sort -r | head -1` echo "$newer_location" return 0 } show_xlog_location() { local location location=`get_my_location` || return 1 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -v "$location" } delete_xlog_location() { $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -D } show_master_baseline() { local rc local location runasowner -q err "$OCF_RESKEY_psql $psql_options \ -U $OCF_RESKEY_pgdba -c 'CHECKPOINT'" rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc warn fi location=`get_my_location` ocf_log info "My master baseline : $location." $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -v "$location" } delete_master_baseline() { $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -D } set_async_mode_all() { [ "$OCF_RESKEY_rep_mode" = "sync" ] || return 0 ocf_log info "Set all nodes into async mode." - runasowner -q err "echo "" > \"$REP_MODE_CONF\"" + runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\"" if [ $? -ne 0 ]; then ocf_log err "Can't set all nodes into async mode." return 1 fi return 0 } set_async_mode() { local sync_node_in_conf sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2` if [ -n "$sync_node_in_conf" ]; then ocf_log info "Setup $1 into async mode." 
sync_node_in_conf=`echo $sync_node_in_conf | sed "s/$1//g" |\ sed "s/^,//g" | sed "s/,,/,/g" | sed "s/,$//g"` - if [ -n $sync_node_in_conf ]; then - echo "synchronous_standby_names = '$sync_node_in_conf'" > "$REP_MODE_CONF" - else - echo "" > "$REP_MODE_CONF" - fi + echo "synchronous_standby_names = '$sync_node_in_conf'" > "$REP_MODE_CONF" else ocf_log info "$1 is already in async mode." return 0 fi ocf_log info "All synced nodes : \"$sync_node_in_conf\"" reload_conf } set_sync_mode() { local sync_node_in_conf sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2` if [ -n "$sync_node_in_conf" ]; then ocf_log info "Setup $1 into sync mode." echo "synchronous_standby_names = '$sync_node_in_conf,$1'" > "$REP_MODE_CONF" else ocf_log info "Setup $1 into sync mode." echo "synchronous_standby_names = '$1'" > "$REP_MODE_CONF" fi sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2` ocf_log info "All synced nodes : \"$sync_node_in_conf\"" reload_conf } is_sync_mode() { cat $REP_MODE_CONF | grep -q -e "[,' ]$1[,' ]" } reload_conf() { # Invoke pg_ctl runasowner "$OCF_RESKEY_pgctl -D $OCF_RESKEY_pgdata reload" if [ $? -eq 0 ]; then ocf_log info "Reload configuration file." else ocf_log err "Can't reload configuration file." return 1 fi return 0 } +user_recovery_conf() { + # put archive_cleanup_command and recovery_end_command only when defined by user + if [ -n "$OCF_RESKEY_archive_cleanup_command" ]; then + echo "archive_cleanup_command = '${OCF_RESKEY_archive_cleanup_command}'" + fi + if [ -n "$OCF_RESKEY_recovery_end_command" ]; then + echo "recovery_end_command = '${OCF_RESKEY_recovery_end_command}'" + fi +} + make_recovery_conf() { runasowner "touch $RECOVERY_CONF" if [ $? -ne 0 ]; then ocf_log err "Can't create recovery.conf." return 1 fi cat > $RECOVERY_CONF <> $RECOVERY_CONF ocf_log debug "Created recovery.conf. host=${OCF_RESKEY_master_ip}, user=${OCF_RESKEY_repuser}" return 0 } # change pgsql-status. 
# arg1:node, arg2: value change_pgsql_status() { local output if ! is_node_online $1; then return 0 fi output=`$CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -G -q 2>/dev/null` if [ "$output" != "$2" ]; then # If slave's disk is broken, RA cannot read PID file # and misjudges the PostgreSQL as down while it is running. # It causes overwriting of pgsql-status by Master because replication is still connected. if [ "$output" = "STOP" -o "$output" = "UNKNOWN" ]; then if [ "$1" != "$NODENAME" ]; then ocf_log warn "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2 by $NODENAME is prohibited." return 0 fi fi ocf_log info "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2." $CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -v "$2" if [ $? -ne 0 ]; then ocf_log err "Can't change $PGSQL_STATUS_ATTR." return 1 fi fi return 0 } # change pgsql-data-status. # arg1:node, arg2: value change_data_status() { local output if ! node_exist $1; then return 0 fi while : do output=`$CRM_ATTR_FOREVER -N "$1" -n "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null` if [ "$output" != "$2" ]; then ocf_log info "Changing $PGSQL_DATA_STATUS_ATTR on $1 : $output->$2." exec_func_with_timeout "$CRM_ATTR_FOREVER" "-N $1 -n \ $PGSQL_DATA_STATUS_ATTR -v \"$2\"" \ $OCF_RESKEY_crm_attr_timeout if [ $? -ne 0 ]; then ocf_log err "Can't change $PGSQL_DATA_STATUS_ATTR." return 1 fi else break fi done return 0 } # change master-score # arg1:node, arg2: score change_master_score() { - local rsc local instance local current_score if ! 
is_node_online $1; then return 0 fi - rsc=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1` instance=0 while : do if [ "$instance" -ge "$OCF_RESKEY_CRM_meta_clone_max" ]; then break fi - if [ "${rsc}:${instance}" = "$OCF_RESOURCE_INSTANCE" ]; then + if [ "${RESOURCE_NAME}:${instance}" = "$OCF_RESOURCE_INSTANCE" ]; then instance=`expr $instance + 1` continue fi - current_score=`$CRM_ATTR_REBOOT -N "$1" -n "master-${rsc}:${instance}" -G -q 2>/dev/null` + current_score=`$CRM_ATTR_REBOOT -N "$1" -n "master-${RESOURCE_NAME}:${instance}" -G -q 2>/dev/null` if [ -n "$current_score" -a "$current_score" != "$2" ]; then - ocf_log info "Changing ${rsc}:${instance} master score on $1 : $current_score->$2." - $CRM_ATTR_REBOOT -N "$target" -n "master-${rsc}:${instance}" -v "$2" + ocf_log info "Changing ${RESOURCE_NAME}:${instance} master score on $1 : $current_score->$2." + $CRM_ATTR_REBOOT -N "$target" -n "master-${RESOURCE_NAME}:${instance}" -v "$2" if [ $? -ne 0 ]; then ocf_log err "Can't change master score." return 1 fi fi instance=`expr $instance + 1` done return 0 } report_psql_error() { local rc local loglevel rc=$1 loglevel=${2:-err} ocf_log $loglevel "PostgreSQL $OCF_RESKEY_pgdb isn't running" if [ $rc -eq 1 ]; then ocf_log err "Fatal error (out of memory, file not found, etc.) occurred while executing the psql command." elif [ $rc -eq 2 ]; then ocf_log $loglevel "Connection error (connection to the server went bad and the session was not interactive) occurred while executing the psql command." elif [ $rc -eq 3 ]; then ocf_log err "Script error (the variable ON_ERROR_STOP was set) occurred while executing the psql command." fi } # # timeout management function # arg1 : command # arg2 : command's args # arg3 : timeout(s) # exec_func_with_timeout() { local func_pid local count local rc $1 `eval echo $2` & func_pid=$! count=0 while kill -s 0 $func_pid >/dev/null 2>&1; do sleep 1 count=`expr $count + 1` if [ $count -ge $3 ]; then ocf_log debug "Execute $1 time out." 
kill -s 9 $func_pid >/dev/null 2>&1 return 0 fi done wait $func_pid } is_node_online() { crm_mon -1 -n | grep -e "^Node $1 " -e "^Node $1:" | grep -q -v "OFFLINE" } node_exist() { crm_mon -1 -n | grep -q "^Node $1" } check_binary2() { if ! have_binary "$1"; then ocf_log err "Setup problem: couldn't find command: $1" return 1 fi return 0 } check_config() { local rc=0 if [ ! -f "$1" ]; then if ocf_is_probe; then ocf_log info "Configuration file is $1 not readable during probe." rc=1 else ocf_log err "Configuration file $1 doesn't exist" rc=2 fi fi return $rc } # Validate most critical parameters pgsql_validate_all() { local version + local check_config_rc if ! check_binary2 "$OCF_RESKEY_pgctl" || ! check_binary2 "$OCF_RESKEY_psql"; then return $OCF_ERR_INSTALLED fi - if [ -n "$OCF_RESKEY_config" -a ! -f "$OCF_RESKEY_config" ]; then - check_config "$OCF_RESKEY_config" - [ $? -eq 2 ] && return $OCF_ERR_INSTALLED - fi + check_config "$OCF_RESKEY_config" + check_config_rc=$? + [ $check_config_rc -eq 2 ] && return $OCF_ERR_INSTALLED + [ $check_config_rc -eq 0 ] && : ${OCF_RESKEY_socketdir=`get_pgsql_param unix_socket_directory`} getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1 if [ ! $? -eq 0 ]; then ocf_log err "User $OCF_RESKEY_pgdba doesn't exist"; return $OCF_ERR_INSTALLED; fi if ocf_is_probe; then ocf_log info "Don't check $OCF_RESKEY_pgdata during probe" else if ! runasowner "test -w $OCF_RESKEY_pgdata"; then ocf_log err "Directory $OCF_RESKEY_pgdata is not writable by $OCF_RESKEY_pgdba" return $OCF_ERR_PERM; fi fi if [ -n "$OCF_RESKEY_monitor_user" -a ! -n "$OCF_RESKEY_monitor_password" ] then ocf_log err "monitor password can't be empty" return $OCF_ERR_CONFIGURED fi if [ ! 
-n "$OCF_RESKEY_monitor_user" -a -n "$OCF_RESKEY_monitor_password" ] then ocf_log err "monitor_user has to be set if monitor_password is set" return $OCF_ERR_CONFIGURED fi if is_replication; then version=`cat $OCF_RESKEY_pgdata/PG_VERSION` if [ `printf "$version\n9.1" | sort -n | head -1` != "9.1" ]; then ocf_log err "Replication mode needs PostgreSQL 9.1 or higher." return $OCF_ERR_INSTALLED fi if ! ocf_is_ms; then ocf_log err "Replication requires Master/Slave configuration." return $OCF_ERR_CONFIGURED fi if [ ! "$OCF_RESKEY_rep_mode" = "sync" -a ! "$OCF_RESKEY_rep_mode" = "async" ]; then ocf_log err "Invalid rep_mode : $OCF_RESKEY_rep_mode" return $OCF_ERR_CONFIGURED fi if [ ! -n "$OCF_RESKEY_master_ip" ]; then ocf_log err "master_ip can't be empty." return $OCF_ERR_CONFIGURED fi - if [ ! -n "$OCF_RESKEY_node_list" ]; then + if [ ! -n "$NODE_LIST" ]; then ocf_log err "node_list can't be empty." return $OCF_ERR_CONFIGURED fi - if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then - if ! grep -q "include '$REP_MODE_CONF' # added by pgsql RA" $OCF_RESKEY_pgdata/postgresql.conf; then - echo "include '$REP_MODE_CONF' # added by pgsql RA" >> $OCF_RESKEY_pgdata/postgresql.conf + if [ "$OCF_RESKEY_rep_mode" = "sync" -a $check_config_rc -eq 0 ]; then + if ! grep -q "include '$REP_MODE_CONF' # added by pgsql RA" $OCF_RESKEY_config; then + echo "include '$REP_MODE_CONF' # added by pgsql RA" >> $OCF_RESKEY_config fi fi if ! mkdir -p $OCF_RESKEY_tmpdir || ! chown $OCF_RESKEY_pgdba $OCF_RESKEY_tmpdir || ! chmod 700 $OCF_RESKEY_tmpdir; then ocf_log err "Can't create directory $OCF_RESKEY_tmpdir or it is not readable by $OCF_RESKEY_pgdba" return $OCF_ERR_PERM fi fi return $OCF_SUCCESS } # # Check if we need to create a log file # check_log_file() { if [ ! -f "$1" ] then touch $1 > /dev/null 2>&1 chown $OCF_RESKEY_pgdba:`getent passwd $OCF_RESKEY_pgdba | cut -d ":" -f 4` $1 fi #Check if $OCF_RESKEY_pgdba can write to the log file if ! 
runasowner "test -w $1" then return 1 fi return 0 } # # Check socket directory # check_socket_dir() { if [ ! -d "$OCF_RESKEY_socketdir" ]; then if ! mkdir "$OCF_RESKEY_socketdir"; then ocf_log err "Can't create directory $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi if ! chown $OCF_RESKEY_pgdba:`getent passwd \ $OCF_RESKEY_pgdba | cut -d ":" -f 4` "$OCF_RESKEY_socketdir" then ocf_log err "Can't change ownership for $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi if ! chmod 2775 "$OCF_RESKEY_socketdir"; then ocf_log err "Can't change permissions for $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi else if ! runasowner "touch $OCF_RESKEY_socketdir/test.$$"; then ocf_log err "$OCF_RESKEY_pgdba can't create files in $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi rm $OCF_RESKEY_socketdir/test.$$ fi } # # 'main' starts here... # if [ $# -ne 1 ] then usage exit $OCF_ERR_GENERIC fi PIDFILE=${OCF_RESKEY_pgdata}/postmaster.pid BACKUPLABEL=${OCF_RESKEY_pgdata}/backup_label if is_replication; then RECOVERY_CONF=${OCF_RESKEY_pgdata}/recovery.conf REP_MODE_CONF=${OCF_RESKEY_tmpdir}/rep_mode.conf PGSQL_LOCK=${OCF_RESKEY_tmpdir}/PGSQL.lock XLOG_NOTE_FILE=${OCF_RESKEY_tmpdir}/xlog_note CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot" CRM_ATTR_REBOOT="${HA_SBIN_DIR}/crm_attribute -l reboot" CRM_ATTR_FOREVER="${HA_SBIN_DIR}/crm_attribute -l forever" CRM_FAILCOUNT="${HA_SBIN_DIR}/crm_failcount" CAN_NOT_PROMOTE="-INFINITY" CAN_PROMOTE="100" PROMOTE_ME="1000" CHECK_MS_SQL="select pg_is_in_recovery()" CHECK_XLOG_LOC_SQL="select pg_last_xlog_replay_location(),pg_last_xlog_receive_location()" CHECK_REPLICATION_STATE_SQL="select application_name,upper(state),upper(sync_state) from pg_stat_replication" - PGSQL_STATUS_ATTR="pgsql-status" - PGSQL_DATA_STATUS_ATTR="pgsql-data-status" - PGSQL_XLOG_LOC_NAME="pgsql-xlog-loc" - PGSQL_MASTER_BASELINE="pgsql-master-baseline" + RESOURCE_NAME=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1` + PGSQL_STATUS_ATTR="${RESOURCE_NAME}-status" + 
PGSQL_DATA_STATUS_ATTR="${RESOURCE_NAME}-data-status" + PGSQL_XLOG_LOC_NAME="${RESOURCE_NAME}-xlog-loc" + PGSQL_MASTER_BASELINE="${RESOURCE_NAME}-master-baseline" - NODENAME=`uname -n` + NODENAME=`uname -n | tr '[A-Z]' '[a-z]'` + NODE_LIST=`echo $OCF_RESKEY_node_list | tr '[A-Z]' '[a-z]'` OPERATION=$1 fi case "$1" in methods) pgsql_methods exit $?;; meta-data) meta_data exit $OCF_SUCCESS;; esac -# $OCF_RESKEY_pgdata has to be initialized at this momemnt -: ${OCF_RESKEY_socketdir=`get_pgsql_param unix_socket_directory`} - pgsql_validate_all rc=$? [ "$1" = "validate-all" ] && exit $rc if [ $rc -ne 0 ] then case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $OCF_NOT_RUNNING;; *) exit $rc;; esac fi US=`id -u -n` if [ $US != root -a $US != $OCF_RESKEY_pgdba ] then ocf_log err "$0 must be run as root or $OCF_RESKEY_pgdba" exit $OCF_ERR_GENERIC fi # make psql command options if [ -n "$OCF_RESKEY_monitor_user" ]; then PGUSER=$OCF_RESKEY_monitor_user; export PGUSER PGPASSWORD=$OCF_RESKEY_monitor_password; export PGPASSWORD psql_options="-p $OCF_RESKEY_pgport $OCF_RESKEY_pgdb" else psql_options="-p $OCF_RESKEY_pgport -U $OCF_RESKEY_pgdba $OCF_RESKEY_pgdb" fi if [ -n "$OCF_RESKEY_pghost" ]; then psql_options="$psql_options -h $OCF_RESKEY_pghost" else if [ -n "$OCF_RESKEY_socketdir" ]; then psql_options="$psql_options -h $OCF_RESKEY_socketdir" fi fi # What kind of method was invoked? 
case "$1" in status) if pgsql_status then ocf_log info "PostgreSQL is up" exit $OCF_SUCCESS else ocf_log info "PostgreSQL is down" exit $OCF_NOT_RUNNING fi;; monitor) pgsql_monitor exit $?;; start) pgsql_start exit $?;; promote) pgsql_promote exit $?;; demote) pgsql_demote exit $?;; notify) pgsql_notify exit $?;; stop) pgsql_stop exit $?;; *) exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/portblock b/heartbeat/portblock index 602c3ca59..53b35e965 100755 --- a/heartbeat/portblock +++ b/heartbeat/portblock @@ -1,477 +1,477 @@ #!/bin/sh # # portblock: iptables temporary portblocking control # # Author: Sun Jiang Dong (initial version) # Philipp Reisner (per-IP filtering) # # License: GNU General Public License (GPL) # # Copyright: (C) 2005 International Business Machines # # OCF parameters are as below: # OCF_RESKEY_protocol # OCF_RESKEY_portno # OCF_RESKEY_action # OCF_RESKEY_ip # OCF_RESKEY_tickle_dir # OCF_RESKEY_sync_script ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_ip_default="0.0.0.0/0" : ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} ####################################################################### CMD=`basename $0` TICKLETCP=$HA_BIN/tickle_tcp usage() { cat <&2 usage: $CMD {start|stop|status|monitor|meta-data|validate-all} $CMD is used to temporarily block ports using iptables. It can be used to turn off a port before bringing up an IP address, and enable it after a service is started. 
To do that for samba, the following resource line can be used: $CMD::tcp::137,138::block \\ 10.10.10.20 \\ nmbd smbd \\ $CMD::tcp::137,138::unblock This will do the follwing things: - DROP all incoming packets for TCP ports 137 and 138 - Bring up the IP alias 10.10.10.20 - start the nmbd and smbd services - Re-enable TCP ports 137 and 138 (enable normal firewall rules on those ports) This prevents clients from getting ICMP port unreachable if they try to reconnect to the service after the alias is enabled but before nmbd and smbd are running. These packets will cause some clients to give up attempting to reconnect to the server. NOTE: iptables is linux-specific... An additional feature in the portblock RA is the tickle ACK function enabled by specifying the tickle_dir parameter. The tickle ACK triggers the clients to faster reconnect their TCP connections to the fail-overed server. Please note that this feature is often used for the floating IP fail- over scenario where the long-lived TCP connections need to be tickled. It doesn't support the cluster alias IP scenario. When using the tickle ACK function, in addition to the normal usage of portblock RA, the parameter tickle_dir must be specified in the action=unblock instance of the portblock resources. For example, you may stack resources like below: portblock action=block services portblock action=unblock tickle_dir=/tickle/state/dir If you want to tickle all the TCP connections which connected to _one_ floating IP but different ports, no matter how many portblock resources you have defined, you should enable tickles for _one_ portblock resource(action=unblock) only. The tickle_dir is a location which stores the established TCP connections. It can be a shared directory(which is cluster-visible to all nodes) or a local directory. If you use the shared directory, you needn't do any other things. If you use the local directory, you must also specify the sync_script paramater. 
We recommend you to use csync2 as the sync_script. For example, if you use the local directory /tmp/tickle as tickle_dir, you could setup the csync2 as the csync2 documentation says and configure your /etc/csync2/csync2.cfg like: group ticklegroup { host node1; host node2; key /etc/csync2/ticklegroup.key; include /etc/csync2/csync2.cfg; include /tmp/tickle; auto younger; } Then specify the parameter sync_script as "csync2 -xv". END } meta_data() { cat < 1.0 Resource script for portblock. It is used to temporarily block ports using iptables. In addition, it may allow for faster TCP reconnects for clients on failover. Use that if there are long lived TCP connections to an HA service. This feature is enabled by setting the tickle_dir parameter and only in concert with action set to unblock. Note that the tickle ACK function is new as of version 3.0.2 and hasn't yet seen widespread use. Block and unblocks access to TCP and UDP ports The protocol used to be blocked/unblocked. protocol The port number used to be blocked/unblocked. portno The action (block/unblock) to be done on the protocol::portno. action The IP address used to be blocked/unblocked. ip The shared or local directory (_must_ be absolute path) which stores the established TCP connections. Tickle directory If the tickle_dir is a local directory, then the TCP connection state file has to be replicated to other nodes in the cluster. It can be csync2 (default), some wrapper of rsync, or whatever. It takes the file name as a single argument. For csync2, set it to "csync2 -xv". Connection state file synchronization script END } # # Because this is the normal usage, we consider "block" # resources to be pseudo-resources -- that is, their status can't # be reliably determined through external means. # This is because we expect an "unblock" resource to come along # and disable us -- but we're still in some sense active... 
# #active_grep_pat {udp|tcp} portno,portno active_grep_pat() { w="[ ][ ]*" any="0\\.0\\.0\\.0/0" - echo "^DROP${w}${1}${w}--${w}${any}${w}${3}${w}multiport${w}dports${w}${2} " + echo "^DROP${w}${1}${w}--${w}${any}${w}${3}${w}multiport${w}dports${w}${2}\>" } #chain_isactive {udp|tcp} portno,portno ip chain_isactive() { PAT=`active_grep_pat "$1" "$2" "$3"` $IPTABLES -n -L INPUT | grep "$PAT" >/dev/null } save_tcp_connections() { [ -z "$OCF_RESKEY_tickle_dir" ] && return statefile=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip if [ -z "$OCF_RESKEY_sync_script" ]; then netstat -tn |awk -F '[:[:space:]]+' ' $8 == "ESTABLISHED" && $4 == "'$OCF_RESKEY_ip'" \ {printf "%s:%s\t%s:%s\n", $4,$5, $6,$7}' | dd of="$statefile".new conv=fsync && mv "$statefile".new "$statefile" else netstat -tn |awk -F '[:[:space:]]+' ' $8 == "ESTABLISHED" && $4 == "'$OCF_RESKEY_ip'" \ {printf "%s:%s\t%s:%s\n", $4,$5, $6,$7}' \ > $statefile $OCF_RESKEY_sync_script $statefile > /dev/null 2>&1 & fi } run_tickle_tcp() { [ -z "$OCF_RESKEY_tickle_dir" ] && return echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip [ -f $f ] && cat $f | $TICKLETCP -n 3 } SayActive() { echo "$CMD DROP rule for INPUT chain [$*] is running (OK)" } SayConsideredActive() { echo "$CMD DROP rule for INPUT chain [$*] considered to be running (OK)" } SayInactive() { echo "$CMD DROP rule for INPUT chain [$*] is inactive" } #IptablesStatus {udp|tcp} portno,portno ip {block|unblock} IptablesStatus() { local rc rc=$OCF_ERR_GENERIC activewords="$CMD $1 $2 is running (OK)" if chain_isactive "$1" "$2" "$3"; then case $4 in block) SayActive $* rc=$OCF_SUCCESS ;; *) SayInactive $* rc=$OCF_NOT_RUNNING ;; esac else case $4 in block) if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then SayConsideredActive $* rc=$OCF_SUCCESS else SayInactive $* rc=$OCF_NOT_RUNNING fi ;; *) if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then SayActive $* #This is only run on real monitor events. 
save_tcp_connections rc=$OCF_SUCCESS else SayInactive $* rc=$OCF_NOT_RUNNING fi ;; esac fi return $rc } #IptablesBLOCK {udp|tcp} portno,portno ip IptablesBLOCK() { if chain_isactive "$1" "$2" "$3" then : OK -- chain already active else $IPTABLES -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP fi return $? } #IptablesUNBLOCK {udp|tcp} portno,portno ip IptablesUNBLOCK() { if chain_isactive "$1" "$2" "$3" then $IPTABLES -D INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP else : Chain Not active fi return $? } #IptablesStart {udp|tcp} portno,portno ip {block|unblock} IptablesStart() { ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start case $4 in block) IptablesBLOCK "$@";; unblock) IptablesUNBLOCK "$@" rc=$? run_tickle_tcp #ignore run_tickle_tcp exit code! return $rc ;; *) usage; return 1; esac return $? } #IptablesStop {udp|tcp} portno,portno ip {block|unblock} IptablesStop() { ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop case $4 in block) IptablesUNBLOCK "$@";; unblock) save_tcp_connections IptablesBLOCK "$@" ;; *) usage; return 1;; esac return $? } # # Check if the port is valid, this function code is not decent, but works # CheckPort() { # Examples of valid port: "1080", "1", "0080" # Examples of invalid port: "1080bad", "0", "0000", "" echo $1 |egrep -qx '[0-9]+(:[0-9]+)?(,[0-9]+(:[0-9]+)?)*' } IptablesValidateAll() { check_binary $IPTABLES case $protocol in tcp|udp) ;; *) ocf_log err "Invalid protocol $protocol!" exit $OCF_ERR_CONFIGURED ;; esac if CheckPort "$portno"; then : else ocf_log err "Invalid port number $portno!" exit $OCF_ERR_CONFIGURED fi if [ -n "$OCF_RESKEY_tickle_dir" ]; then if [ x"$action" != x"unblock" ]; then ocf_log err "Tickles are only useful with action=unblock!" exit $OCF_ERR_CONFIGURED fi if [ ! -d "$OCF_RESKEY_tickle_dir" ]; then ocf_log err "The tickle dir doesn't exist!" exit $OCF_ERR_INSTALLED fi fi case $action in block|unblock) ;; *) ocf_log err "Invalid action $action!" 
exit $OCF_ERR_CONFIGURED ;; esac return $OCF_SUCCESS } if ( [ $# -ne 1 ] ) then usage exit $OCF_ERR_ARGS fi case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; *) ;; esac if [ -z "$OCF_RESKEY_protocol" ]; then ocf_log err "Please set OCF_RESKEY_protocol" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_portno" ]; then ocf_log err "Please set OCF_RESKEY_portno" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_action" ]; then ocf_log err "Please set OCF_RESKEY_action" exit $OCF_ERR_CONFIGURED fi protocol=$OCF_RESKEY_protocol portno=$OCF_RESKEY_portno action=$OCF_RESKEY_action ip=$OCF_RESKEY_ip case $1 in start) IptablesStart $protocol $portno $ip $action ;; stop) IptablesStop $protocol $portno $ip $action ;; status|monitor) IptablesStatus $protocol $portno $ip $action ;; validate-all) IptablesValidateAll ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/postfix b/heartbeat/postfix index 5f67ca8eb..8619af60d 100755 --- a/heartbeat/postfix +++ b/heartbeat/postfix @@ -1,398 +1,415 @@ #!/bin/sh # # Resource script for Postfix # # Description: Manages Postfix as an OCF resource in # an high-availability setup. # # Author: Raoul Bhatia : Original Author # License: GNU General Public License (GPL) # Note: If you want to run multiple Postfix instances, please see # http://amd.co.at/adminwiki/Postfix#Adding_a_Second_Postfix_Instance_on_one_Server # http://www.postfix.org/postconf.5.html # # # usage: $0 {start|stop|reload|monitor|validate-all|meta-data} # # The "start" arg starts a Postfix instance # # The "stop" arg stops it. # # OCF parameters: # OCF_RESKEY_binary # OCF_RESKEY_config_dir # OCF_RESKEY_parameters # ########################################################################## # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs : ${OCF_RESKEY_binary="/usr/sbin/postfix"} : ${OCF_RESKEY_config_dir=""} : ${OCF_RESKEY_parameters=""} USAGE="Usage: $0 {start|stop|reload|monitor|validate-all|meta-data}"; ########################################################################## usage() { echo $USAGE >&2 } meta_data() { cat < 0.1 This script manages Postfix as an OCF resource in a high-availability setup. Manages a highly available Postfix mail server instance Full path to the Postfix binary. For example, "/usr/sbin/postfix". Full path to Postfix binary Full path to a Postfix configuration directory. For example, "/etc/postfix". Full path to configuration directory The Postfix daemon may be called with additional parameters. Specify any of them here. END } postfix_running() { + local loglevel + loglevel=${1:-err} + # run Postfix status if available if ocf_is_true $status_support; then - output=`$binary $OPTION_CONFIG_DIR status 2>&1` + $binary $OPTION_CONFIG_DIR status 2>&1 ret=$? if [ $ret -ne 0 ]; then - ocf_log err "Postfix status: '$output'." $ret + ocf_log $loglevel "Postfix status: " $ret fi return $ret fi # manually check Postfix's pid PIDFILE=${queue_dir}/pid/master.pid if [ -f $PIDFILE ]; then PID=`head -n 1 $PIDFILE` kill -s 0 $PID >/dev/null 2>&1 && [ `ps -p $PID | grep master | wc -l` -eq 1 ] return $? fi # Postfix is not running false } postfix_start() { # if Postfix is running return success - if postfix_running; then + if postfix_running info; then ocf_log info "Postfix already running." return $OCF_SUCCESS fi # start Postfix $binary $OPTIONS start >/dev/null 2>&1 ret=$? if [ $ret -ne 0 ]; then - ocf_log err "Postfix returned error." 
$ret + ocf_log err "Postfix returned error: " $ret return $OCF_ERR_GENERIC fi # grant some time for startup/forking the sub processes # and loop initial monitoring until success or timeout while true; do sleep 1 # break if postfix is up and running; log failure otherwise - postfix_running && break - ocf_log info "Postfix failed initial monitor action." $ret + postfix_running info && break + ocf_log info "Postfix failed initial monitor action: " $ret done ocf_log info "Postfix started." return $OCF_SUCCESS } postfix_stop() { # if Postfix is not running return success - if ! postfix_running; then + if ! postfix_running info; then ocf_log info "Postfix already stopped." return $OCF_SUCCESS fi # stop Postfix $binary $OPTIONS stop >/dev/null 2>&1 ret=$? if [ $ret -ne 0 ]; then - ocf_log err "Postfix returned an error while stopping." $ret + ocf_log err "Postfix returned an error while stopping: " $ret return $OCF_ERR_GENERIC fi # grant some time for shutdown and recheck 5 times for i in 1 2 3 4 5; do - if postfix_running; then + if postfix_running info; then sleep 1 + else + break fi done # escalate to abort if we did not stop by now # @TODO shall we loop here too? - if postfix_running; then + if postfix_running info; then ocf_log err "Postfix failed to stop. Escalating to 'abort'." $binary $OPTIONS abort >/dev/null 2>&1; ret=$? sleep 5 # postfix abort did not succeed if postfix_running; then ocf_log err "Postfix failed to abort." return $OCF_ERR_GENERIC fi fi ocf_log info "Postfix stopped." return $OCF_SUCCESS } postfix_reload() { if postfix_running; then ocf_log info "Reloading Postfix." 
$binary $OPTIONS reload fi } postfix_monitor() { - if postfix_running; then + local status_loglevel="err" + + # Set loglevel to info during probe + if ocf_is_probe; then + status_loglevel="info" + fi + + if postfix_running $status_loglevel; then return $OCF_SUCCESS fi return $OCF_NOT_RUNNING } postfix_validate_all() { # check that the Postfix binaries exist and can be executed check_binary "$binary" check_binary "postconf" # if true, run in-depth directory checks dir_check=true # check config_dir and alternate_config_directories parameter if [ "x$config_dir" != "x" ]; then if [ ! -d "$config_dir" ]; then if ocf_is_probe; then ocf_log info "Postfix configuration directory '$config_dir' not readable during probe." # skip in-depth directory checks if config file isn't readable during probe dir_check=false else ocf_log err "Postfix configuration directory '$config_dir' does not exist or is not readable." return $OCF_ERR_INSTALLED fi fi alternate_config_directories=`postconf -h alternate_config_directories 2>/dev/null | grep "$config_dir/\?"` if [ "x$alternate_config_directories" = "x" ]; then ocf_log err "Postfix main configuration must contain correct 'alternate_config_directories' parameter." return $OCF_ERR_INSTALLED fi fi # check spool/queue and data directories (if applicable) # this is required because "postfix check" does not catch all errors if ocf_is_true $dir_check; then if [ ! -d "$queue_dir" ]; then if ocf_is_probe; then ocf_log info "Postfix queue directory '$queue_dir' not readable during probe." else ocf_log err "Postfix queue directory '$queue_dir' does not exist or is not readable." return $OCF_ERR_INSTALLED fi fi if ocf_is_true $status_support; then data_dir=`postconf $OPTION_CONFIG_DIR -h data_directory 2>/dev/null` + data_dir_count=`echo "$data_dir" | tr ',' ' ' | wc -w` + if [ $data_dir_count -gt 1 ]; then + ocf_log err "Postfix data directory '$orig_data_dir' cannot be set to multiple directories." + return $OCF_ERR_INSTALLED + fi if [ ! 
-d "$data_dir" ]; then if ocf_is_probe; then ocf_log info "Postfix data directory '$data_dir' not readable during probe." else ocf_log err "Postfix data directory '$data_dir' does not exist or is not readable." return $OCF_ERR_INSTALLED fi fi fi # check directory permissions if ocf_is_true $status_support; then user=`postconf $OPTION_CONFIG_DIR -h mail_owner 2>/dev/null` for dir in "$data_dir"; do if ! su -s /bin/sh - $user -c "test -w $dir"; then if ocf_is_probe; then ocf_log info "Directory '$dir' is not writable by user '$user' during probe." else ocf_log err "Directory '$dir' is not writable by user '$user'." return $OCF_ERR_PERM; fi fi done fi fi # run Postfix internal check, if not probing if ! ocf_is_probe; then $binary $OPTIONS check >/dev/null 2>&1 ret=$? if [ $ret -ne 0 ]; then - ocf_log err "Postfix 'check' failed." $ret + ocf_log err "Postfix 'check' failed: " $ret return $OCF_ERR_GENERIC fi fi return $OCF_SUCCESS } # # Main # if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi binary=$OCF_RESKEY_binary config_dir=$OCF_RESKEY_config_dir parameters=$OCF_RESKEY_parameters # handle parameters case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage|help) usage exit $OCF_SUCCESS ;; esac # build Postfix options string *outside* to access from each method OPTIONS='' OPTION_CONFIG_DIR='' # check for Postfix's postconf binary check_binary "postconf" # check if the Postfix config_dir exist if [ "x$config_dir" != "x" ]; then # remove all trailing slashes to ease "postconf alternate_config_directories" match config_dir=`echo $config_dir | sed 's/\/*$//'` # reset config_dir if it equals Postfix's default config_directory postconf -h config_directory 2>/dev/null | grep -q "^$config_dir/\?$" if [ $? 
-eq 0 ]; then config_dir="" fi # set OPTIONS if config_dir is still set # save OPTION_CONFIG_DIR seperatly if [ "x$config_dir" != "x" ]; then OPTION_CONFIG_DIR="-c $config_dir" OPTIONS=$OPTION_CONFIG_DIR fi fi # add all additional parameters to options string if [ "x$parameters" != "x" ]; then OPTIONS="$OPTIONS $parameters" fi # important directories, used in different methods queue_dir=`postconf $OPTION_CONFIG_DIR -h queue_directory 2>/dev/null` # check Postfix version and status support status_support=false postfix_version=`postconf -h mail_version 2>/dev/null` ocf_version_cmp "$postfix_version" "2.5.0" ret=$? # we need Postfix 2.5.0 or greater for status/data_directory support if [ $ret -eq 1 -o $ret -eq 2 ]; then status_support=true fi postfix_validate_all ret=$? LSB_STATUS_STOPPED=3 if [ $ret -ne $OCF_SUCCESS ]; then case $1 in stop) exit $OCF_SUCCESS ;; *) exit $ret;; esac fi case $1 in monitor) postfix_monitor exit $? ;; start) postfix_start exit $? ;; stop) postfix_stop exit $? ;; reload) postfix_reload exit $? ;; validate-all) exit $OCF_SUCCESS ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/sapdb.sh b/heartbeat/sapdb.sh index 9a6ab307e..4952589da 100644 --- a/heartbeat/sapdb.sh +++ b/heartbeat/sapdb.sh @@ -1,336 +1,336 @@ # # sapdb.sh - for systems having SAPHostAgent installed # (sourced by SAPDatabase) # # Description: This code is separated from the SAPDatabase agent to # introduce new functions for systems which having # SAPHostAgent installed. # Someday it might be merged back into SAPDatabase agein. # # Author: Alexander Krauth, September 2010 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2010, 2012 Alexander Krauth # # # background_check_saphostexec : Run a request to saphostexec in a separat task, to be able to react on a hanging process # background_check_saphostexec() { timeout=600 count=0 $SAPHOSTCTRL -function ListDatabases >/dev/null 2>&1 & pid=$! 
while kill -0 $pid > /dev/null 2>&1 do sleep 0.1 count=$(( $count + 1 )) if [ $count -ge $timeout ]; then kill -9 $pid >/dev/null 2>&1 ocf_log warn "saphostexec did not respond to the method 'ListDatabases' within 60 seconds" return $OCF_ERR_GENERIC # Timeout fi done # child already has finished, now evaluate it's returncode wait $pid } # # cleanup_saphostexec : make sure to cleanup the SAPHostAgent in case of any # misbehavior # cleanup_saphostexec() { pkill -9 -f "$SAPHOSTEXEC" pkill -9 -f "$SAPHOSTSRV" oscolpid=`pgrep -f "$SAPHOSTOSCOL"` # we check saposcol pid, because it # might not run under control of # saphostexec # cleanup saposcol shared memory, otherwise it will not start again if [ -n "$oscolpid" ];then kill -9 $oscolpid oscolipc=`ipcs -m | grep "4dbe " | awk '{print $2}'` if [ -n "$oscolipc" ]; then ipcrm -m $oscolipc fi fi } # # check_saphostexec : Before using saphostctrl we make sure that the # saphostexec is running on the current node. # check_saphostexec() { chkrc=$OCF_SUCCESS running=`pgrep -f "$SAPHOSTEXEC" | wc -l` if [ $running -gt 0 ]; then if background_check_saphostexec; then return $OCF_SUCCESS else ocf_log warn "saphostexec did not respond to the method 'ListDatabases' correctly (rc=$?), it will be killed now" running=0 fi fi if [ $running -eq 0 ]; then ocf_log warn "saphostexec is not running on node `hostname`, it will be started now" cleanup_saphostexec output=`$SAPHOSTEXEC -restart 2>&1` # now make sure the daemon has been started and is able to respond srvrc=1 while [ $srvrc -ne 0 -a `pgrep -f "$SAPHOSTEXEC" | wc -l` -gt 0 ] do sleep 1 background_check_saphostexec srvrc=$? done if [ $srvrc -eq 0 ] then ocf_log info "saphostexec on node `hostname` was restarted !" chkrc=$OCF_SUCCESS else ocf_log error "saphostexec on node `hostname` could not be started! - $output" chkrc=$OCF_ERR_GENERIC fi fi return $chkrc } # # sapdatabase_start : Start the SAP database # sapdatabase_start() { check_saphostexec rc=$? 
if [ $rc -eq $OCF_SUCCESS ] then sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" DBINST="" if [ -n "$OCF_RESKEY_DBINSTANCE" ] then DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " fi FORCE="" if ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER then FORCE="-force" fi output=`$SAPHOSTCTRL -function StartDatabase -dbname $SID -dbtype $DBTYPE $DBINST $FORCE -service` sapdatabase_monitor 1 rc=$? if [ $rc -eq 0 ] then ocf_log info "SAP database $SID started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" else ocf_log err "SAP database $SID start failed: $output" rc=$OCF_ERR_GENERIC fi fi return $rc } # # sapdatabase_stop: Stop the SAP database # sapdatabase_stop() { check_saphostexec rc=$? if [ $rc -eq $OCF_SUCCESS ] then sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" DBINST="" if [ -n "$OCF_RESKEY_DBINSTANCE" ] then DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " fi output=`$SAPHOSTCTRL -function StopDatabase -dbname $SID -dbtype $DBTYPE $DBINST -force -service` if [ $? -eq 0 ] then ocf_log info "SAP database $SID stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP database $SID stop failed: $output" rc=$OCF_ERR_GENERIC fi fi sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" return $rc } # # sapdatabase_monitor: Can the given database instance do anything useful? # sapdatabase_monitor() { strict=$1 rc=$OCF_SUCCESS if ! ocf_is_true $strict then sapdatabase_status rc=$? else check_saphostexec rc=$? 
if [ $rc -eq $OCF_SUCCESS ] then count=0 DBINST="" if [ -n "$OCF_RESKEY_DBINSTANCE" ] then DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " fi output=`$SAPHOSTCTRL -function GetDatabaseStatus -dbname $SID -dbtype $DBTYPE $DBINST` # we have to parse the output, because the returncode doesn't tell anything about the instance status for SERVICE in `echo "$output" | grep -i 'Component[ ]*Name *[:=] [A-Za-z][A-Za-z0-9_]* (' | sed 's/^.*Component[ ]*Name *[:=] *\([A-Za-z][A-Za-z0-9_]*\).*$/\1/i'` do COLOR=`echo "$output" | grep -i "Component[ ]*Name *[:=] *$SERVICE (" | sed 's/^.*Status *[:=] *\([A-Za-z][A-Za-z0-9_]*\).*$/\1/i'` STATE=0 case $COLOR in Running) STATE=$OCF_SUCCESS;; *) STATE=$OCF_NOT_RUNNING;; esac SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] then if [ $STATE -eq $OCF_NOT_RUNNING ] then ocf_log err "SAP database service $SERVICE is not running with status $COLOR !" rc=$STATE fi count=1 fi done if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] then ocf_log err "The resource does not run any services which this RA could monitor!" rc=$OCF_ERR_ARGS fi if [ $rc -ne $OCF_SUCCESS ] then ocf_log err "The SAP database $SID ist not running: $output" fi fi fi return $rc } # # sapdatabase_status: Are there any database processes on this host ? 
# sapdatabase_status() { case $DBTYPE in ADA) SEARCH="$SID/db/pgm/kernel" SUSER=`grep "^SdbOwner" /etc/opt/sdb | awk -F'=' '{print $2}'` SNUM=2 ;; ORA) SEARCH="ora_[a-z][a-z][a-z][a-z]_" SUSER="ora`echo $SID | tr '[:upper:]' '[:lower:]'`" SNUM=4 ;; DB6) SEARCH="db2[a-z][a-z][a-z]" SUSER="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" SNUM=2 ;; SYB) SEARCH="dataserver" SUSER="syb`echo $SID | tr '[:upper:]' '[:lower:]'`" SNUM=1 ;; HDB) SEARCH="hdb[a-z]*server" SUSER="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" SNUM=1 ;; esac cnt=`ps -u $SUSER -o args 2> /dev/null | grep -c $SEARCH` [ $cnt -ge $SNUM ] && return $OCF_SUCCESS return $OCF_NOT_RUNNING } # # sapdatabase_recover: # sapdatabase_recover() { OCF_RESKEY_AUTOMATIC_RECOVER=1 sapdatabase_stop sapdatabase_start } # # sapdatabase_validate: Check the symantic of the input parameters # sapdatabase_validate() { rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing parameter SID: '$SID' is not a valid system ID!" rc=$OCF_ERR_ARGS fi case "$DBTYPE" in ORA|ADA|DB6|SYB|HDB) ;; *) ocf_log err "Parsing parameter DBTYPE: '$DBTYPE' is not a supported database type!" 
rc=$OCF_ERR_ARGS ;; esac return $rc } # # sapdatabase_init: initialize global variables at the beginning # sapdatabase_init() { OCF_RESKEY_AUTOMATIC_RECOVER_default=0 : ${OCF_RESKEY_AUTOMATIC_RECOVER=${OCF_RESKEY_AUTOMATIC_RECOVER_default}} if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ] then case $DBTYPE in ORA) export OCF_RESKEY_MONITOR_SERVICES="Instance|Database|Listener" ;; ADA) export OCF_RESKEY_MONITOR_SERVICES="Database" ;; DB6) db2sid="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" export OCF_RESKEY_MONITOR_SERVICES="${SID}|${db2sid}" ;; - SYB) export OCF_RESKEY_MONITOR_SERVICES="Server|Database" + SYB) export OCF_RESKEY_MONITOR_SERVICES="Server" ;; HDB) export OCF_RESKEY_MONITOR_SERVICES="hdbindexserver" ;; esac fi } diff --git a/heartbeat/slapd b/heartbeat/slapd index 1429f3c70..6661984ff 100755 --- a/heartbeat/slapd +++ b/heartbeat/slapd @@ -1,579 +1,587 @@ #!/bin/bash # # Stand-alone LDAP Daemon (slapd) # # Description: Manages Stand-alone LDAP Daemon (slapd) as an OCF resource in # an high-availability setup. # # Authors: Jeroen Koekkoek # nozawat@gmail.com # John Keith Hohm # # License: GNU General Public License (GPL) # Copyright: (C) 2011 Pagelink B.V. # # The OCF code was inspired by the Postfix resource script written by # Raoul Bhatia . # # The code for managing the slapd instance is based on the the slapd init # script found in Debian GNU/Linux 6.0. # # OCF parameters: # OCF_RESKEY_slapd # OCF_RESKEY_ldapsearch # OCF_RESKEY_config # OCF_RESKEY_pidfile # OCF_RESKEY_user # OCF_RESKEY_group # OCF_RESKEY_services # OCF_RESKEY_watch_suffix # OCF_RESKEY_ignore_suffix # OCF_RESKEY_bind_dn # OCF_RESKEY_password # OCF_RESKEY_parameters # OCF_RESKEY_stop_escalate # ################################################################################ # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs : ${OCF_RESKEY_slapd="/usr/sbin/slapd"} : ${OCF_RESKEY_ldapsearch="ldapsearch"} : ${OCF_RESKEY_config=""} : ${OCF_RESKEY_pidfile=""} : ${OCF_RESKEY_user=""} : ${OCF_RESKEY_group=""} : ${OCF_RESKEY_services="ldap:///"} : ${OCF_RESKEY_watch_suffix=""} : ${OCF_RESKEY_ignore_suffix=""} : ${OCF_RESKEY_bind_dn=""} : ${OCF_RESKEY_password=""} : ${OCF_RESKEY_parameters=""} : ${OCF_RESKEY_stop_escalate=15} USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}" ORIG_IFS=$IFS NEWLINE=' ' ################################################################################ usage() { echo $USAGE >&2 } meta_data() { cat < 0.1 Resource script for Stand-alone LDAP Daemon (slapd). It manages a slapd instance as an OCF resource. Manages a Stand-alone LDAP Daemon (slapd) instance Full path to the slapd binary. For example, "/usr/sbin/slapd". Full path to slapd binary Full path to the ldapsearch binary. For example, "/usr/bin/ldapsearch". Full path to ldapsearch binary Full path to a slapd configuration directory or a slapd configuration file. For example, "/etc/ldap/slapd.d" or "/etc/ldap/slapd.conf". Full path to configuration directory or file File to read the PID from; read from olcPidFile/pidfile in config if not set. File to read PID from User name or id slapd will run with. The group id is also changed to this user's gid, unless the group parameter is used to override. User name or id slapd will run with Group name or id slapd will run with. Group name or id slapd will run with LDAP (and other scheme) URLs slapd will serve. For example, "ldap://127.0.0.1:389 ldaps:/// ldapi:///" LDAP (and other scheme) URLs to serve Suffix (database backend) that will be monitored for availability. Multiple suffixes can be specified by providing a space seperated list. By providing one or more suffixes here, the ignore_suffix parameter is discarded. All suffixes will be monitored if left blank. Suffix that will be monitored for availability. 
Suffix (database backend) that will not be monitored for availability. Multiple suffixes can be specified by providing a space seperated list. No suffix will be excluded if left blank. Suffix that will not be monitored for availability. Distinguished Name used to bind to the LDAP directory for testing. Leave blank to bind to the LDAP directory anonymously. Distinguished Name used to bind to the LDAP directory for testing. Password used to bind to the LDAP directory for testing. Password used to bind to the LDAP directory for testing. slapd may be called with additional parameters. Specify any of them here. Any additional parameters to slapd. Number of seconds to wait for shutdown (using SIGTERM) before resorting to SIGKILL Seconds before stop escalation to KILL END } terminate() { local pid=$1 local signal=$2 local recheck=${3-0} local rc local waited=0 kill -$signal $pid >/dev/null 2>&1; rc=$? while [ \( $rc -eq 0 \) -a \( $recheck -eq 0 -o $waited -lt $recheck \) ]; do kill -0 $pid >/dev/null 2>&1; rc=$? let "waited += 1" if [ $rc -eq 0 ]; then sleep 1 fi done if [ $rc -ne 0 ]; then return 0 fi return 1 } watch_suffix() { local rc if [ -n "$OCF_RESKEY_watch_suffix" ]; then if echo "'$OCF_RESKEY_watch_suffix'" | grep "'$1'" >/dev/null 2>&1; then rc=0 else rc=1 fi else if echo "'$OCF_RESKEY_ignore_suffix'" | grep "'$1'" >/dev/null 2>&1; then rc=1 else rc=0 fi fi return $rc } slapd_pid() { local pid if [ -f "$pid_file" ]; then pid=`head -n 1 "$pid_file" 2>/dev/null` if [ "X$pid" != "X" ]; then echo "$pid" return $OCF_SUCCESS fi ocf_log err "slapd pid file '$pid_file' empty." return $OCF_ERR_GENERIC fi ocf_log info "slapd pid file '$pid_file' does not exist." return $OCF_NOT_RUNNING } slapd_status() { local pid=$1 local state=$? if [ $state -eq $OCF_SUCCESS ]; then if ! 
kill -0 $pid >/dev/null 2>&1; then return $OCF_NOT_RUNNING else return $OCF_SUCCESS fi fi return $state } slapd_start() { local options local reason local rc local state slapd_status `slapd_pid`; state=$? if [ $state -eq $OCF_SUCCESS ]; then ocf_log info "slapd already running." return $state elif [ $state -eq $OCF_ERR_GENERIC ]; then return $state fi options="-u $user -g $group" if [ -d "$config" ]; then options="$options -F $config" elif [ -f "$config" ]; then options="$options -f $config" else ocf_log err "slapd configuration '$config' does not exist." return $OCF_ERR_INSTALLED fi if [ -n "$parameters" ]; then options="$options $parameters" fi if [ -n "$services" ]; then $slapd -h "$services" $options 2>&1; rc=$? else $slapd $options 2>&1; rc=$? fi if [ $rc -ne 0 ]; then ocf_log err "slapd returned error." return $OCF_ERR_GENERIC fi while true; do slapd_monitor start if [ $? = "$OCF_SUCCESS" ]; then break fi sleep 1 done ocf_log info "slapd started." return $OCF_SUCCESS } slapd_stop() { local pid local rc local state pid=`slapd_pid`; slapd_status $pid; state=$? if [ $state -eq $OCF_NOT_RUNNING ]; then ocf_log info "slapd already stopped." return $OCF_SUCCESS elif [ $state -eq $OCF_ERR_GENERIC ]; then return $state fi terminate $pid TERM $OCF_RESKEY_stop_escalate; rc=$? if [ $rc -ne 0 ]; then ocf_log err "slapd failed to stop. Escalating to KILL." terminate $pid KILL; rc=$? fi if [ -f "$pid_file" ]; then rm -f "$pid_file" >/dev/null 2>&1 fi ocf_log info "slapd stopped." return $OCF_SUCCESS } slapd_monitor() { local options local rc local state local suffix local suffixes - local err_option="-err" + local err_option="-info" slapd_status `slapd_pid`; state=$? if [ $state -eq $OCF_NOT_RUNNING ]; then if [ -z "$1" ];then if ! ocf_is_probe; then ocf_log err "slapd process not found." fi fi return $state elif [ $state -ne $OCF_SUCCESS ]; then ocf_log err "slapd returned error." 
return $state fi if [ -d "$config" ]; then for suffix in `find "$config"/'cn=config' -type f -name olcDatabase* -exec \ sed -ne 's/^[[:space:]]*olcSuffix:[[:space:]]\+\(.\+\)/\1/p' {} \;` do suffix=${suffix#\"*} suffix=${suffix%\"*} if watch_suffix $suffix; then suffixes="$suffixes $suffix" fi done elif [ -f "$config" ]; then for suffix in `sed -ne 's/^[[:space:]]*suffix[[:space:]]\+\(.\+\)/\1/p' "$config"` do suffix=${suffix#\"*} suffix=${suffix%\"*} if watch_suffix $suffix; then suffixes="$suffixes $suffix" fi done else - ocf_log err "slapd configuration '$config' does not exist." - return $OCF_ERR_INSTALLED + if ocf_is_probe; then + ocf_log info "slapd configuration '$config' does not exist during probe." + else + ocf_log err "slapd configuration '$config' does not exist." + return $OCF_ERR_INSTALLED + fi fi options="-LLL -s base -x" if [ -n "$bind_dn" ]; then - options="$options -D '$bind_dn' -w '$password'" + options="$options -D $bind_dn -w $password" fi [ -z "$1" ] && err_option="" for suffix in $suffixes; do ocf_run -q $err_option "$ldapsearch" -H "$services" -b "$suffix" $options >/dev/null 2>&1; rc=$? case "$rc" in "0") ocf_log debug "slapd database with suffix '$suffix' reachable" ;; "49") ocf_log err "slapd database with suffix '$suffix' unreachable. Invalid credentials." return $OCF_ERR_CONFIGURED ;; *) if [ -z "$1" ] || [ -n "$1" -a $rc -ne 1 ]; then ocf_log err "slapd database with suffix '$suffix' unreachable. exit code ($rc)" - state=$OCF_ERR_GENERIC fi + state=$OCF_ERR_GENERIC ;; esac done return $state } slapd_validate_all() { check_binary "$slapd" check_binary "$ldapsearch" if [ -z "$pid_file" ]; then if [ -d "$config" ]; then pid_file=`sed -ne \ 's/^olcPidFile:[[:space:]]\+\(.\+\)[[:space:]]*/\1/p' \ "$config"/'cn=config.ldif' 2>/dev/null` elif [ -f "$config" ]; then pid_file=`sed -ne \ 's/^pidfile[[:space:]]\+\(.\+\)/\1/p' \ "$config" 2>/dev/null` else - ocf_log err "slapd configuration '$config' does not exist." 
- return $OCF_ERR_INSTALLED + if ocf_is_probe; then + ocf_log info "slapd configuration '$config' does not exist during probe." + else + ocf_log err "slapd configuration '$config' does not exist." + return $OCF_ERR_INSTALLED + fi fi fi if [ -z "$user" ]; then user=`id -nu 2>/dev/null` elif ! id "$user" >/dev/null 2>&1; then ocf_log err "slapd user '$user' does not exist" return $OCF_ERR_INSTALLED fi if [ -z "$group" ]; then group=`id -ng 2>/dev/null` elif ! grep "^$group:" /etc/group >/dev/null 2>&1; then ocf_log err "slapd group '$group' does not exist" return $OCF_ERR_INSTALLED fi pid_dir=`dirname "$pid_file"` if [ ! -d "$pid_dir" ]; then mkdir -p "$pid_dir" chown -R "$user" "$pid_dir" chgrp -R "$group" "$pid_dir" fi return $OCF_SUCCESS } # # Main # slapd=$OCF_RESKEY_slapd ldapsearch=$OCF_RESKEY_ldapsearch config=$OCF_RESKEY_config user=$OCF_RESKEY_user group=$OCF_RESKEY_group services=$OCF_RESKEY_services bind_dn=$OCF_RESKEY_bind_dn password=$OCF_RESKEY_password parameters=$OCF_RESKEY_parameters pid_file=$OCF_RESKEY_pidfile if [ -z "$config" ]; then if [ -e "/etc/ldap/slapd.d" ]; then config="/etc/ldap/slapd.d" else config="/etc/ldap/slapd.conf" fi fi if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage|help) usage exit $OCF_SUCCESS ;; esac slapd_validate_all rc=$? [ $rc -eq $OCF_SUCCESS ] || exit $rc case $1 in status) slapd_status `slapd_pid`; state=$? if [ $state -eq $OCF_SUCCESS ]; then ocf_log debug "slapd is running." elif [ $state -eq $OCF_NOT_RUNNING ]; then ocf_log debug "slapd is stopped." fi exit $state ;; start) slapd_start exit $? ;; stop) slapd_stop exit $? ;; monitor) slapd_monitor; state=$? 
exit $state ;; validate-all) exit $OCF_SUCCESS ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/tomcat b/heartbeat/tomcat index c52537585..27f71b59a 100755 --- a/heartbeat/tomcat +++ b/heartbeat/tomcat @@ -1,496 +1,626 @@ #!/bin/sh # # Description: Manages a Tomcat Server as an OCF High-Availability # resource under Heartbeat/LinuxHA control # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. # # Copyright (c) 2007 NIPPON TELEGRAPH AND TELEPHONE CORPORATION # ####################################################################### # OCF parameters: # OCF_RESKEY_tomcat_name - The name of the resource. Default is tomcat # OCF_RESKEY_script_log - A destination of the log of this script. Default /var/log/OCF_RESKEY_tomcat_name.log # OCF_RESKEY_tomcat_stop_timeout - Time-out at the time of the stop. Default is 5. DEPRECATED # OCF_RESKEY_tomcat_suspend_trialcount - The re-try number of times awaiting a stop. Default is 10. DEPRECATED # OCF_RESKEY_tomcat_user - A user name to start a resource. # OCF_RESKEY_statusurl - URL for state confirmation. Default is http://127.0.0.1:8080 +# OCF_RESKEY_max_stop_time - The max time it should take for proper shutdown. Restrictions, only Tomcat6. # OCF_RESKEY_java_home - Home directory of Java. 
Default is none # OCF_RESKEY_java_opts - Options to pass to Java JVM for start and stop. Default is none # OCF_RESKEY_catalina_home - Home directory of Tomcat. Default is none # OCF_RESKEY_catalina_base - Base directory of Tomcat. Default is OCF_RESKEY_catalina_home +# OCF_RESKEY_catalina_out - Log file name of Tomcat. Default is OCF_RESKEY_catalina_home/logs/catalina.out # OCF_RESKEY_catalina_pid - A PID file name of Tomcat. Default is OCF_RESKEY_catalina_home/logs/catalina.pid # OCF_RESKEY_tomcat_start_opts - Start options of Tomcat. Default is none. # OCF_RESKEY_catalina_opts - CATALINA_OPTS environment variable. Default is none. +# OCF_RESKEY_catalina_tmpdir - CATALINA_TMPDIR environment variable. Default is none. # OCF_RESKEY_catalina_rotate_log - Control catalina.out logrotation flag. Default is NO. # OCF_RESKEY_catalina_rotatetime - catalina.out logrotation time span(seconds). Default is 86400. +# OCF_RESKEY_java_endorsed_dirs - JAVA_ENDORSED_DIRS environment variable. Default is none. +# OCF_RESKEY_logging_config - LOGGING_CONFIG environment variable. Default is none. +# OCF_RESKEY_logging_manager - LOGGING_MANAGER environment variable. Default is none. ############################################################################### : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ############################################################################ # Usage usage() { cat <<-! usage: $0 action action: start start Tomcat stop stop Tomcat status return the status of Tomcat, up or down monitor return TRUE if Tomcat appears to be working. You have to have installed $WGETNAME for this to work. meta-data show meta data message validate-all validate the instance parameters ! } ############################################################################ # Check tomcat service availability isrunning_tomcat() { - if ! 
have_binary $WGET; then - ocf_log err "Monitoring not supported by $OCF_RESOURCE_INSTANCE" - ocf_log info "Please make sure that wget is available" - return $OCF_ERR_CONFIGURED - fi - $WGET -O /dev/null $RESOURCE_STATUSURL >/dev/null 2>&1 + $WGET --tries=20 -O /dev/null $RESOURCE_STATUSURL >/dev/null 2>&1 } ############################################################################ # isalive_tomcat() { + # As the server stops, the PID file disappears. To avoid race conditions, + # we will have remembered the PID of a running instance on script entry. + local pid=$rememberedPID + # If there is a PID file, use that if [ -f $CATALINA_PID ]; then - PID=`head -n 1 $CATALINA_PID` + ocf_log debug "Reading pid from $CATALINA_PID" + # race conditions on PID file being removed by stopping tomcat... + pid=`head -n 1 $CATALINA_PID` + fi + if [ -n "$pid" ] && [ "$pid" -gt 0 ]; then # Retry message for restraint - kill -s 0 $PID >/dev/null 2>&1 + ocf_log debug "Sending noop signal to $pid" + kill -s 0 $pid >/dev/null 2>&1 return $? fi # No PID file false } ############################################################################ # Check tomcat process and service availability monitor_tomcat() { isalive_tomcat || return $OCF_NOT_RUNNING isrunning_tomcat || - return $OCF_NOT_RUNNING + return $OCF_ERR_GENERIC return $OCF_SUCCESS } ############################################################################ # Execute catalina.out log rotation rotate_catalina_out() { # Look for rotatelogs/rotatelogs2 if [ -x /usr/sbin/rotatelogs ]; then ROTATELOGS=/usr/sbin/rotatelogs elif [ -x /usr/sbin/rotatelogs2 ]; then ROTATELOGS=/usr/sbin/rotatelogs2 else ocf_log warn "rotatelogs command not found." 
return 1 fi # Clean up and set permissions on required files - rm -rf "$CATALINA_HOME"/temp/* "$CATALINA_HOME/logs/catalina.out" - mkfifo -m700 "$CATALINA_HOME/logs/catalina.out" - chown --dereference "$RESOURCE_TOMCAT_USER" "$CATALINA_HOME/logs/catalina.out" || true + rm -rf "$CATALINA_HOME"/temp/* "$CATALINA_OUT" + mkfifo -m700 "$CATALINA_OUT" + chown --dereference "$RESOURCE_TOMCAT_USER" "$CATALINA_OUT" || true # -s is required because tomcat5.5's login shell is /bin/false su - -s /bin/sh $RESOURCE_TOMCAT_USER \ -c "$ROTATELOGS -l \"$CATALINA_HOME/logs/catalina_%F.log\" $CATALINA_ROTATETIME" \ - < "$CATALINA_HOME/logs/catalina.out" > /dev/null 2>&1 & + < "$CATALINA_OUT" > /dev/null 2>&1 & +} + +############################################################################ +# Tomcat Command +tomcatCommand() +{ +cat<<-END_TOMCAT_COMMAND + export JAVA_HOME=${JAVA_HOME} + export JAVA_OPTS="${JAVA_OPTS}" + export CATALINA_HOME=${CATALINA_HOME} + export CATALINA_BASE=${CATALINA_BASE} + export CATALINA_OUT=${CATALINA_OUT} + export CATALINA_PID=${CATALINA_PID} + export CATALINA_OPTS="${CATALINA_OPTS}" + export CATALINA_TMPDIR="${CATALINA_TMPDIR}" + export JAVA_ENDORSED_DIRS="${JAVA_ENDORSED_DIRS}" + export LOGGING_CONFIG="${LOGGING_CONFIG}" + export LOGGING_MANAGER="${LOGGING_MANAGER}" + $CATALINA_HOME/bin/catalina.sh $@ +END_TOMCAT_COMMAND +} +attemptTomcatCommand() +{ + if [ "$RESOURCE_TOMCAT_USER" = RUNASIS ]; then + "$CATALINA_HOME/bin/catalina.sh" $@ >> "$TOMCAT_CONSOLE" 2>&1 + else + tomcatCommand $@ | su - -s /bin/sh "$RESOURCE_TOMCAT_USER" >> "$TOMCAT_CONSOLE" 2>&1 + fi } ############################################################################ # Start Tomcat start_tomcat() { cd "$CATALINA_HOME/bin" + validate_all_tomcat || exit $? + monitor_tomcat if [ $? 
= $OCF_SUCCESS ]; then return $OCF_SUCCESS fi # Remove $CATALINA_PID if it exists rm -f $CATALINA_PID #ocf_log debug "catalina.out rotation FLG = ${CATALINA_ROTATE_LOG}" if [ ${CATALINA_ROTATE_LOG} = "YES" ]; then rotate_catalina_out if [ $? = 0 ]; then ocf_log debug "Rotate catalina.out succeeded." else ocf_log warn "Rotate catalina.out failed. Starting tomcat without catalina.out rotation." fi fi echo "`date "+%Y/%m/%d %T"`: start ===========================" >> "$TOMCAT_CONSOLE" ocf_log debug "CATALINA_OPTS value = ${CATALINA_OPTS}" - if [ "$RESOURCE_TOMCAT_USER" = RUNASIS ]; then - "$CATALINA_HOME/bin/catalina.sh" start $TOMCAT_START_OPTS \ - >> "$TOMCAT_CONSOLE" 2>&1 & - else - cat<<-END_TOMCAT_START | su - -s /bin/sh "$RESOURCE_TOMCAT_USER" >> "$TOMCAT_CONSOLE" 2>&1 & - export JAVA_HOME=${JAVA_HOME} - export JAVA_OPTS="${JAVA_OPTS}" - export CATALINA_HOME=${CATALINA_HOME} - export CATALINA_BASE=${CATALINA_BASE} - export CATALINA_PID=${CATALINA_PID} - export CATALINA_OPTS="${CATALINA_OPTS}" - $CATALINA_HOME/bin/catalina.sh start ${TOMCAT_START_OPTS} -END_TOMCAT_START - fi + attemptTomcatCommand start ${TOMCAT_START_OPTS} & while true; do monitor_tomcat if [ $? = $OCF_SUCCESS ]; then break fi ocf_log debug "start_tomcat[$TOMCAT_NAME]: retry monitor_tomcat" sleep 3 done return $OCF_SUCCESS } ############################################################################ # Stop Tomcat stop_tomcat() { - STOP_TIMEOUT=$((OCF_RESKEY_CRM_meta_timeout/1000-1)) + RA_TIMEOUT=$((OCF_RESKEY_CRM_meta_timeout/1000)) + STOP_TIMEOUT=$((RA_TIMEOUT-5)) + if [ -n "$MAX_STOP_TIME" ]; then + if [ $MAX_STOP_TIME -gt $RA_TIMEOUT ]; then + ocf_log warn "max_stop_timeout must be shorter than the timeout of stop operation." 
+ fi + if [ $MAX_STOP_TIME -eq 0 ]; then + STOP_TIMEOUT=$RA_TIMEOUT + else + STOP_TIMEOUT=$MAX_STOP_TIME + fi + fi cd "$CATALINA_HOME/bin" + memorize_pid # This lets monitoring continue to work reliably + echo "`date "+%Y/%m/%d %T"`: stop ###########################" >> "$TOMCAT_CONSOLE" - if [ "$RESOURCE_TOMCAT_USER" = RUNASIS ]; then - "$CATALINA_HOME/bin/catalina.sh" stop $STOP_TIMEOUT -force \ - >> "$TOMCAT_CONSOLE" 2>&1 - else - cat<<-END_TOMCAT_STOP | su - -s /bin/sh "$RESOURCE_TOMCAT_USER" >> "$TOMCAT_CONSOLE" 2>&1 - export JAVA_HOME=${JAVA_HOME} - export JAVA_OPTS="${JAVA_OPTS}" - export CATALINA_HOME=${CATALINA_HOME} - export CATALINA_BASE=${CATALINA_BASE} - export CATALINA_PID=${CATALINA_PID} - $CATALINA_HOME/bin/catalina.sh stop $STOP_TIMEOUT -force -END_TOMCAT_STOP - fi + attemptTomcatCommand stop $STOP_TIMEOUT -force lapse_sec=0 while isalive_tomcat; do sleep 1 lapse_sec=`expr $lapse_sec + 1` ocf_log debug "stop_tomcat[$TOMCAT_NAME]: stop failed, killing with SIGKILL ($lapse_sec)" - if [ -f $CATALINA_PID ]; then - PID=`head -n 1 $CATALINA_PID` - kill -KILL $PID - fi + kill -KILL $rememberedPID done if [ ${CATALINA_ROTATE_LOG} = "YES" ]; then - rm -f "$CATALINA_PID" "${CATALINA_HOME}/logs/catalina.out" + rm -f "$CATALINA_PID" "${CATALINA_OUT}" else rm -f "$CATALINA_PID" fi return $OCF_SUCCESS } metadata_tomcat() { cat < 1.0 Resource script for Tomcat. It manages a Tomcat instance as a cluster resource. Manages a Tomcat servlet environment instance to Tomcat process on start. Used to ensure process is still running and must be unique. ]]> The name of the resource Log file, used during start and stop operations. Log file Time-out for stop operation. DEPRECATED Time-out for the stop operation. DEPRECATED Maximum number of times to retry stop operation before suspending and killing Tomcat. DEPRECATED. Does not retry. Max retry count for stop operation. DEPRECATED The user who starts Tomcat. The user who starts Tomcat URL for state confirmation. 
URL for state confirmation + + +Number of seconds to wait during a stop before drastic measures +(force kill) are used on the tomcat process. +This number MUST be less than your cluster stop timeout for the resource. +The default value is five seconds before the timeout value of stop operation. +When it is over this value, it stop a process in kill commands. +This parameter is only effective on Tomcat 6 or later. + +The max time it should take for proper shutdown. + + + Home directory of Java. Home directory of Java Java JVM options used on start and stop. Java options parsed to JVM, used on start and stop. Home directory of Tomcat. Home directory of Tomcat Instance directory of Tomcat Instance directory of Tomcat, defaults to catalina_home + + +Log file name of Tomcat + +Log file name of Tomcat, defaults to catalina_home/logs/catalina.out + + + A PID file name for Tomcat. A PID file name for Tomcat Tomcat start options. Tomcat start options Catalina options, for the start operation only. Catalina options + + +Temporary directory of Tomcat + +Temporary directory of Tomcat, defaults to none + + + Rotate catalina.out flag. Rotate catalina.out flag catalina.out rotation interval (seconds). catalina.out rotation interval (seconds) + + +Java_endorsed_dirs of tomcat + +Java_endorsed_dirs of tomcat, defaults to none + + + + + +Logging_config of tomcat + +Logging_config of tomcat, defaults to none + + + + + +Logging_manager of tomcat + +Logging_manager of tomcat, defaults to none. + + + END return $OCF_SUCCESS } validate_all_tomcat() { ocf_log info "validate_all_tomcat[$TOMCAT_NAME]" + + misconfigured=0 + notinstalled=0 + wrongpermissions=0 + + check_binary $WGET + + if [ -n "$MAX_STOP_TIME" ] && [ "$MAX_STOP_TIME" -lt 0 ]; then + ocf_log err "max_stop_time must be set to a value greater than 0." 
+ misconfigured=1 + fi + + if [[ "$RESOURCE_STATUSURL" =~ :[0-9][0-9]* ]]; then + port=${RESOURCE_STATUSURL##*:} + port=${port%%/*} + ocf_log debug "Tomcat port is $port" + ocf_log debug "grep port=\"$port\" $CATALINA_HOME/conf/server.xml" + if [ "$port" -gt 0 ]; then + grep "port=\"$port\"" $CATALINA_HOME/conf/server.xml > /dev/null 2>&1 + if [ $? -ne 0 ]; then + ocf_log err "Your configured status URL specifies a port ($port), but the server does not have a connector listening to that port in $CATALINA_HOME/conf/server.xml" + notinstalled=1 + fi + fi + fi + + if [ $misconfigured -gt 0 ]; then + return $OCF_ERR_CONFIGURED + fi + + if [ $notinstalled -gt 0 ]; then + return $OCF_ERR_INSTALLED + fi + + if [ $wrongpermissions -gt 0 ]; then + return $OCF_ERR_PERM + fi + return $OCF_SUCCESS } +# As we stop tomcat, it removes it's own pid file...we still want to know what it was +memorize_pid() +{ + if [ -f $CATALINA_PID ]; then + rememberedPID=$(cat $CATALINA_PID) + fi +} + # ### tomcat RA environment variables # COMMAND=$1 TOMCAT_NAME="${OCF_RESKEY_tomcat_name-tomcat}" TOMCAT_CONSOLE="${OCF_RESKEY_script_log-/var/log/$TOMCAT_NAME.log}" RESOURCE_TOMCAT_USER="${OCF_RESKEY_tomcat_user-RUNASIS}" RESOURCE_STATUSURL="${OCF_RESKEY_statusurl-http://127.0.0.1:8080}" JAVA_HOME="${OCF_RESKEY_java_home}" JAVA_OPTS="${OCF_RESKEY_java_opts}" CATALINA_HOME="${OCF_RESKEY_catalina_home}" CATALINA_BASE="${OCF_RESKEY_catalina_base-${OCF_RESKEY_catalina_home}}" +CATALINA_OUT="${OCF_RESKEY_catalina_out-$CATALINA_HOME/logs/catalina.out}" CATALINA_PID="${OCF_RESKEY_catalina_pid-$CATALINA_HOME/logs/catalina.pid}" +MAX_STOP_TIME="${OCF_RESKEY_max_stop_time}" TOMCAT_START_OPTS="${OCF_RESKEY_tomcat_start_opts}" CATALINA_OPTS="-Dname=$TOMCAT_NAME ${OCF_RESKEY_catalina_opts}" +CATALINA_TMPDIR="${OCF_RESKEY_catalina_tmpdir}" CATALINA_ROTATE_LOG="${OCF_RESKEY_catalina_rotate_log-NO}" CATALINA_ROTATETIME="${OCF_RESKEY_catalina_rotatetime-86400}" +JAVA_ENDORSED_DIRS="${OCF_RESKEY_java_endorsed_dirs}" 
+LOGGING_CONFIG="${OCF_RESKEY_logging_config}" +LOGGING_MANAGER="${OCF_RESKEY_logging_manager}" LSB_STATUS_STOPPED=3 if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case "$COMMAND" in meta-data) metadata_tomcat; exit $OCF_SUCCESS;; help|usage) usage; exit $OCF_SUCCESS;; esac if [ ! -d "$JAVA_HOME" -o ! -d "$CATALINA_HOME" -o ! -d "$CATALINA_BASE" ]; then case $COMMAND in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; esac ocf_log err "JAVA_HOME or CATALINA_HOME or CATALINA_BASE does not exist." exit $OCF_ERR_INSTALLED fi -export JAVA_HOME JAVA_OPTS CATALINA_HOME CATALINA_BASE CATALINA_PID CATALINA_OPTS +export JAVA_HOME JAVA_OPTS CATALINA_HOME CATALINA_BASE CATALINA_OUT CATALINA_PID CATALINA_OPTS CATALINA_TMPDIR JAVA_ENDORSED_DIRS LOGGING_CONFIG LOGGING_MANAGER JAVA=${JAVA_HOME}/bin/java if [ ! -x "$JAVA" ]; then case $COMMAND in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; esac ocf_log err "java command does not exist." exit $OCF_ERR_INSTALLED fi # # ------------------ # the main script # ------------------ # case "$COMMAND" in start) ocf_log debug "[$TOMCAT_NAME] Enter tomcat start" start_tomcat func_status=$? ocf_log debug "[$TOMCAT_NAME] Leave tomcat start $func_status" exit $func_status ;; stop) ocf_log debug "[$TOMCAT_NAME] Enter tomcat stop" stop_tomcat func_status=$? ocf_log debug "[$TOMCAT_NAME] Leave tomcat stop $func_status" exit $func_status ;; status) if monitor_tomcat; then echo tomcat instance $TOMCAT_NAME is running exit $OCF_SUCCESS else echo tomcat instance $TOMCAT_NAME is stopped exit $OCF_NOT_RUNNING fi exit $? ;; monitor) #ocf_log debug "[$TOMCAT_NAME] Enter tomcat monitor" monitor_tomcat func_status=$? #ocf_log debug "[$TOMCAT_NAME] Leave tomcat monitor $func_status" exit $func_status ;; meta-data) metadata_tomcat exit $? ;; validate-all) validate_all_tomcat exit $? 
;; usage|help) usage exit $OCF_SUCCESS ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/ldirectord/ldirectord.in b/ldirectord/ldirectord.in index 12df5d072..45a420430 100644 --- a/ldirectord/ldirectord.in +++ b/ldirectord/ldirectord.in @@ -1,5216 +1,5237 @@ #!/usr/bin/perl -w ###################################################################### # ldirectord http://www.vergenet.net/linux/ldirectord/ # Linux Director Daemon - run "perldoc ldirectord" for details # # 1999-2006 (C) Jacob Rief , # Horms and others # # License: GNU General Public License (GPL) # # Note: * The original author of this software was Jacob Rief circa 1999 # * It was maintained by Jacob Rief and Horms # from November 1999 to July 2003. # * From July 2003 Horms is the maintainer # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA # 02111-1307 USA # ###################################################################### # A Brief history of versions: # # From oldest to newest # 1.1-1.144: ldirecord maintained in CVS HEAD branch # 1.145-1.186: ldirectord.in maintained in CVS HEAD BRANCH # 1.186-ha-VERSION: ldirectord.in maintained in mercurial =head1 NAME ldirectord - Linux Director Daemon Daemon to monitor remote services and control Linux Virtual Server =head1 SYNOPSIS B [B<-d|--debug>] [--] [I] B | B | B | B | B | B | B B [B<-h|-?|--help|-v|--version>] =head1 DESCRIPTION B is a daemon to monitor and administer real servers in a cluster of load balanced virtual servers. B typically is started from heartbeat but can also be run from the command line. On startup B reads the file B<@sysconfdir@/ha.d/conf/>I. After parsing the file, entries for virtual servers are created on the LVS. Now at regular intervals the specified real servers are monitored and if they are considered alive, added to a list for each virtual server. If a real server fails, it is removed from that list. Only one instance of B can be started for each configuration, but more instances of B may be started for different configurations. This helps to group clusters of services. Normally one would put an entry inside B<@sysconfdir@/ha.d/haresources> I to start ldirectord from heartbeat. =head1 OPTIONS I: This is the name for the configuration as specified in the file B<@sysconfdir@/ha.d/conf/>I B<-d|--debug> Don't start as daemon and log verbosely. B<-h|--help> Print user manual and exit. B<-v|--version> Print version and exit. B the daemon for the specified configuration. B the daemon for the specified configuration. This is the same as sending a TERM signal to the running daemon. B the daemon for the specified configuration. 
The same as stopping and starting. B the configuration file. This is only useful for modifications inside a virtual server entry. It will have no effect on adding or removing a virtual server block. This is the same as sending a HUP signal to the running daemon. B of the running daemon for the specified configuration. =head1 SYNTAX =head2 Description of how to write configuration files BI<(ip_address|hostname:portnumber|servicename)|firewall-mark> Defines a virtual service by IP-address (or hostname) and port (or servicename) or firewall-mark. A firewall-mark is an integer greater than zero. The configuration of marking packets is controlled using the C<-m> option to B(8). All real services and flags for a virtual service must follow this line immediately and be indented. BI Timeout in seconds for connect, external, external-perl and ping checks. If the timeout is exceeded then the real server is declared dead. If defined in a virtual server section then the global value is overridden. If undefined then the value of negotiatetimeout is used. negotiatetimeout is also a global value that may be overridden by a per-virtual setting. If both checktimeout and negotiatetimeout are unset, the default is used. Default: 5 seconds BI Timeout in seconds for negotiate checks. If defined in a virtual server section then the global value is overridden. If undefined then the value of connecttimeout is used. connecttimeout is also a global value that may be overridden by a per-virtual setting. If both negotiatetimeout and connecttimeout are unset, the default is used. Default: 30 seconds BI Defines the number of seconds between server checks. When fork=no this option defines the amount of time ldirectord sleeps between running all of the realserver checks in all virtual service pools. When fork=yes this option defines the amount of time each forked child sleeps per virtual service pool after running all realserver checks for that pool. 
If set in the virtual server section then the global value is overridden, but ONLY if using forking mode (BI). Default: 10 seconds BI This option is deprecated and slated for removal in a future version. Please see the 'failurecount' option. The number of times a check will be attempted before it is considered to have failed. Only works with ping checks. Note that the checktimeout/negotiatetimeout is additive, so if a connect check is used, checkcount is 3 and checktimeout is 2 seconds, then a total of 6 seconds worth of timeout will occur before the check fails. If defined in a virtual server section then the global value is overridden. Default: 1 BI The number of consecutive times a failure will have to be reported by a check before the realserver is considered to have failed. A value of 1 will have the realserver considered failed on the first failure. A successful check will reset the failure counter to 0. If defined in a virtual server section then the global value is overridden. Default: 1 BB | B Defines if should continuously check the configuration file for modification. If this is set to 'yes' and the configuration file changed on disk and its modification time (mtime) is newer than the previous version, the configuration is automatically reloaded. Default: no BIB<"> If this directive is defined, B automatically calls the executable I after the configuration file has changed on disk. This is useful to update the configuration file through B on the other heartbeated host. The first argument to the callback is the name of the configuration. This directive might also be used to restart B automatically after the configuration file changed on disk. However, if B is set to yes, the configuration is reloaded anyway. BI [B | B | B] the server onto which a webservice is redirected if all real servers are down. Typically this would be 127.0.0.1 with an emergency page. If defined in a virtual server section then the global value is overridden. 
BIB<"> If this directive is defined, the supplied script is executed whenever all real servers for a virtual service are down or when the first real server comes up again. In the first case, it is called with "start" as its first argument, in the latter with "stop". If defined in a virtual server section then the global value is overridden. BIB<">|syslog_facility An alternative logfile might be specified with this directive. If the logfile does not have a leading '/', it is assumed to be a syslog(3) facility name. Default: log directly to the file I. BI[, I]...B<"> A valid email address for sending alerts about the changed connection status to any real server defined in the virtual service. This option requires perl module MailTools to be installed. Automatically tries to send email using any of the built-in methods. See perldoc Mail::Mailer for more info on methods. Multiple addresses may be supplied, comma delimited. If defined in a virtual server section then the global value is overridden. BI A valid email address to use as the from address of the email alerts. You can use a plain email address or any RFC-compliant string for the From header in the body of an email message (such as: "ldirectord Alerts" ) Do not quote this string unless you want the quotes passed in as part of the From header. Default: unset, take system generated default (probably root@hostname) B I Delay in seconds between repeating email alerts while any given real server in the virtual service remains inaccessible. A setting of zero seconds will inhibit the repeating alerts. The email timing accuracy of this setting is dependent on the number of seconds defined in the checkinterval configuration option. If defined in a virtual server section then the global value is overridden. Default: 0 BB | B | B | B | B | B,... Comma delimited list of server states in which email alerts should be sent. B is a short-hand for "B,B,B,B". 
If B is specified, no other option may be specified, otherwise options are ored with each other. If defined in a virtual server section then the global value is overridden. Default: all BIB<"> A valid SMTP server address to use for sending email via SMTP. If defined in a virtual server section then the global value is overridden. BIB<"> Use this directive to start an instance of ldirectord for the named I. BB | B If I, then ldirectord does not go into background mode. All log-messages are redirected to stdout instead of a logfile. This is useful to run B supervised from daemontools. See http://untroubled.org/rpms/daemontools/ or http://cr.yp.to/daemontools.html for details. Default: I BB | B If I, then ldirectord will spawn a child process for every virtual server, and run checks against the real servers from them. This will speed up the reaction to changes in real server status in configurations with many virtual servers. This may also use less memory than running many separate instances of ldirectord. Child processes will be automatically restarted if they die. Default: I BB | B If I, then when real or fallback servers are determined to be down, they are not actually removed from the kernel's LVS table. Rather, their weight is set to zero which means that no new connections will be accepted. This has the side effect, that if the real server has persistent connections, new connections from any existing clients will continue to be routed to the real server, until the persistent timeout can expire. See L for more information on persistent connections. This side-effect can be avoided by running the following: echo 1 > /proc/sys/net/ipv4/vs/expire_quiescent_template If the proc file isn't present this probably means that the kernel doesn't have LVS support, LVS support isn't loaded, or the kernel is too old to have the proc file. Running ipvsadm as root should load LVS into the kernel if it is possible. 
If I, then the real or failback servers will be removed from the kernel's LVS table. The default is I. If defined in a virtual server section then the global value is overridden. Default: I +BB | B + +If I, then when real or failback servers are determined +to be down, they are readded to the kernel's LVS table with weight 0 if +they do not exist in the table. Setting the value to no, allows manually +removing the realserver to manually disable all persistent connections. BB | B If I, then when ldirectord exits it will remove all of the virtual server pools that it is managing from the kernel's LVS table. If I, then the virtual server pools it is managing and any real or failback servers listed in them at the time ldirectord exits will be left as-is. If you want to be able to stop ldirectord without having traffic to your realservers interrupted you will want to set this to I. If defined in a virtual server section then the global value is overridden. Default: I BI If this option is set ldirectord will look for a special file in the specified directory and, if found, force the status of the real server identified by the file to down, skipping the normal health check. This would be useful if you wish to force servers down for maintenance without having to modify the actual ldirectord configuration file. For example, given a realserver with IP 172.16.1.2, service on port 4444, and a resolvable reverse DNS entry pointing to "realserver2.example.com" ldirectord will check for the existence of the following files: =over =item 172.16.1.2:4444 =item 172.16.1.2 =item realserver2.example.com:4444 =item realserver2.example.com =item realserver2:4444 =item realserver2 =back If any one of those files is found then ldirectord will immediately force the status of the server to down as if the check had failed. 
Note: Since it checks for the IP/hostname without the port this means you can decide to place an entire realserver into maintenance across a large number of virtual service pools with a single file (if you were going to reboot the server, for instance) or include the port number and put just a particular service into maintenance. This option is not valid in a virtual server section. Default: disabled =head2 Section virtual The following commands must follow a B entry and must be indented with a minimum of 4 spaces or one tab. B Iip_address|hostname][:portnumber|servicename>] B | B | B [I] [B<">IB<", ">IB<">] Defines a real service by IP-address (or hostname) and port (or servicename). If the port is omitted then a 0 will be used; this is intended primarily for fwmark services where the port for real servers is ignored. Optionally a range of IPv4 addresses (or two hostnames) may be given, in which case each IPv4 address in the range will be treated as a real server using the given port. The second argument defines the forwarding method, must be B, B or B. The third argument is optional and defines the weight for that real server. If omitted then a weight of 1 will be used. The last two arguments are also optional. They define a request-receive pair to be used to check if a server is alive. They override the request-receive pair in the virtual server section. These two strings must be quoted. If the request string starts with I the IP-address and port of the real server is overridden, otherwise the IP-address and port of the real server is used. =head2 For TCP and UDP (non fwmark) virtual services, unless the forwarding method is B and the IP address of a real server is non-local (not present on an interface on the host running ldirectord) then the port of the real server will be set to that of its virtual service. That is, port-mapping is only available if the real server is another machine and the forwarding method is B. 
This is due to the way that the underlying LVS code in the kernel functions. =head2 More than one of these entries may be inside a virtual section. The checktimeout, negotiatetimeout, checkcount, fallback, emailalert, emailalertfreq and quiescent options listed above may also appear inside a virtual section, in which case the global setting is overridden. BB | B | B | B | B | B | B | BI Type of check to perform. Negotiate sends a request and matches a receive string. Connect only attempts to make a TCP/IP connection, thus the request and receive strings may be omitted. If checktype is a number then negotiate and connect is combined so that after each N connect attempts one negotiate attempt is performed. This is useful to check often if a service answers and in much longer intervals a negotiating check is done. Ping means that ICMP ping will be used to test the availability of real servers. Ping is also used as the connect check for UDP services. Off means no checking will take place and no real or fallback servers will be activated. On means no checking will take place and real servers will always be activated. Default is I. BB | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B The type of service to monitor when using checktype=negotiate. None denotes a service that will not be monitored. simpletcp sends the B string to the server and tests it against the B regexp. The other types of checks connect to the server using the specified protocol. Please see the B and B sections for protocol specific information. 
Default: =over 4 =item * Virtual server port is 21: ftp =item * Virtual server port is 25: smtp =item * Virtual server port is 53: dns =item * Virtual server port is 80: http =item * Virtual server port is 110: pop =item * Virtual server port is 119: nntp =item * Virtual server port is 143: imap =item * Virtual server port is 389: ldap =item * Virtual server port is 443: https =item * Virtual server port is 587: submission =item * Virtual server port is 993: imaps =item * Virtual server port is 995: pops =item * Virtual server port is 1521: oracle =item * Virtual server port is 1812: radius =item * Virtual server port is 3128: http_proxy =item * Virtual server port is 3306: mysql =item * Virtual server port is 5432: pgsql =item * Virtual server port is 5060: sip =item * Otherwise: none =back BIB<"> This setting is used if checktype is external or external-perl and is the command to be run to check the status of a real server. It should exit with status 0 if everything is ok, or non-zero otherwise. Four parameters are passed to the script: =over 4 =item * virtual server ip/firewall mark =item * virtual server port =item * real server ip =item * real server port =back If the checktype is external-perl then the command is assumed to be a Perl script and it is evaluated into an anonymous subroutine which is called at check time, avoiding a fork-exec. The argument signature and exit code conventions are identical to checktype external. That is, an external-perl checktype should also work as an external checktype. Default: /bin/true BI Number of port to monitor. Sometimes check port differs from service port. Default: port specified for each real server BIB<"> This object will be requested each checkinterval seconds on each real server. The string must be inside quotes. Note that this string may be overridden by an optional per real-server based request-string. 
For an HTTP/HTTPS check, this should be a relative URI, while it has to be absolute for the 'http_proxy' check type. In the latter case, this URI will be requested through the proxy backend that is being checked. For a DNS check this should be the name of an A record, or the address of a PTR record to look up. For a MySQL, Oracle or PostgreSQL check, this should be an SQL SELECT query. The data returned is not checked, only that the answer is one or more rows. This is a required setting. For a simpletcp check, this string is sent verbatim except any occurrences of \n are replaced with a new line character. BIB<"> If the requested result contains this I, the real server is declared alive. The regexp must be inside quotes. Keep in mind that regexps are not plain strings and that you need to escape the special characters if they should be treated as literals. Note that this regexp may be overridden by an optional per real-server based receive regexp. For a DNS check this should be any one of the A record's addresses or any one of the PTR record's names. For a MySQL check, the receive setting is not used. B | B Sets the HTTP method which should be used to fetch the URI specified in the request-string. GET is the method used by default if the parameter is not set. If HEAD is used, the receive-string should be unset. Default: GET BIB<"> Used when using a negotiate check with HTTP or HTTPS. Sets the host header used in the HTTP request. In the case of HTTPS this generally needs to match the common name of the SSL certificate. If not set then the host header will be derived from the request url for the real server if present. As a last resort the IP address of the real server will be used. BIB<"> For FTP, IMAP, LDAP, MySQL, Oracle, POP and PostgreSQL, the username used to log in. For Radius the login is used for the attribute User-Name. For SIP, the username is used as both the to and from address for an OPTIONS query. 
Default: =over 4 =item * FTP: Anonymous =item * MySQL, Oracle, and PostgreSQL: Must be specified in the configuration =item * SIP: ldirectord\@, hostname is derived as per the passwd option below. =item * Otherwise: empty string, which denotes that authentication will not be attempted. =back BIB<"> Password to use to log in to FTP, IMAP, LDAP, MySQL, Oracle, POP, PostgreSQL and SIP servers. For Radius the passwd is used for the attribute User-Password. Default: =over 4 =item * FTP: ldirectord\@, where hostname is the environment variable HOSTNAME evaluated at run time, or sourced from uname if unset. =item * Otherwise: empty string. In the case of LDAP, MySQL, Oracle, and PostgreSQL this means that authentication will not be performed. =back BIB<"> Database to use for MySQL, Oracle and PostgreSQL servers, this is the database that the query (set by B above) will be performed against. This is a required setting. BIB<"> Secret to use for Radius servers, this is the secret used to perform an Access-Request with the username (set by B above) and passwd (set by B above). Default: empty string B I Scheduler to be used by LVS for loadbalancing. For information on the available schedulers please see the ipvsadm(8) man page. Default: "wrr" B I Number of seconds for persistent client connections. B I | I Netmask to be used for granularity of persistent client connections. IPv4 netmask should be specified in dotted quad notation. IPv6 netmask should be specified as a prefix length between 1 and 128. B | B | B Protocol to be used. If the virtual is specified as an IP address and port then it must be one of tcp or udp. If a firewall mark then the protocol must be fwm. Default: =over 4 =item * Virtual is an IP address and port, and the port is not 53: tcp =item * Virtual is an IP address and port, and the port is 53: udp =item * Virtual is a firewall mark: fwm =back BIB<"> File to continuously log the real service checks to for this virtual service. 
This is useful for monitoring when and why real services were down or for statistics. The log format is: [timestamp|pid|real_service_id|status|message] Default: no separate logging of service checks. =head1 IPv6 Directives for IPv6 are virtual6, real6, fallback6. IPv6 addresses specified for virtual6, real6, fallback6 and a file of maintenance directory should be enclosed by brackets ([2001:db8::abcd]:80). Following checktype and service are supported. BB | B | B | B | B | B | BI BB | B | B | B | B =head1 FILES B<@sysconfdir@/ha.d/ldirectord.cf> B BIB<.pid> B =head1 SEE ALSO L, L Ldirectord Web Page: http://www.vergenet.net/linux/ldirectord/ =head1 AUTHORS Horms Jacob Rief =cut use strict; # Set defaults for configuration variables in the "set_defaults" function use vars qw( $VERSION_STR $AUTOCHECK $CHECKINTERVAL $LDIRECTORD $LDIRLOG $NEGOTIATETIMEOUT $DEFAULT_NEGOTIATETIMEOUT $RUNPID $CHECKTIMEOUT $DEFAULT_CHECKTIMEOUT $CHECKCOUNT $FAILURECOUNT $QUIESCENT + $READDQUIESCENT $FORKING $EMAILALERT $EMAILALERTFREQ $EMAILALERTSTATUS $EMAILALERTFROM $SMTP $CLEANSTOP $MAINTDIR $CALLBACK $CFGNAME $CMD $CONFIG $DEBUG $FALLBACK $FALLBACK6 $FALLBACKCOMMAND $SUPERVISED $IPVSADM $checksum $DAEMON_STATUS $DAEMON_STATUS_STARTING $DAEMON_STATUS_RUNNING $DAEMON_STATUS_STOPPING $DAEMON_STATUS_RELOADING $DAEMON_STATUS_ALL $DAEMON_TERM $DAEMON_HUP $DAEMON_CHLD $opt_d $opt_h $stattime %LD_INSTANCE @OLDVIRTUAL @REAL @VIRTUAL $HOSTNAME %EMAILSTATUS %FORK_CHILDREN $SERVICE_UP $SERVICE_DOWN %check_external_perl__funcs $CRLF ); $VERSION_STR = "Linux Director v1.186-ha"; $DAEMON_STATUS_STARTING = 0x1; $DAEMON_STATUS_RUNNING = 0x2; $DAEMON_STATUS_STOPPING = 0x4; $DAEMON_STATUS_RELOADING = 0x8; $DAEMON_STATUS_ALL = $DAEMON_STATUS_STARTING | $DAEMON_STATUS_RUNNING | $DAEMON_STATUS_STOPPING | $DAEMON_STATUS_RELOADING; $SERVICE_UP = 0; $SERVICE_DOWN =1; # default values $DAEMON_TERM = undef; $DAEMON_HUP = undef; $LDIRECTORD = ld_find_cmd("ldirectord", 1); if (! 
defined $LDIRECTORD) { $LDIRECTORD = "@sbindir@/ldirectord"; } $RUNPID = "/var/run/ldirectord"; $CRLF = "\x0d\x0a"; # Set global configuration default values: set_defaults(); use Getopt::Long; use Pod::Usage; #use English; #use Time::HiRes qw( gettimeofday tv_interval ); use Socket; use Socket6; use Sys::Hostname; use POSIX qw(setsid :sys_wait_h); use Sys::Syslog qw(:DEFAULT setlogsock); BEGIN { # wrap exit() to preserve replacability *CORE::GLOBAL::exit = sub { CORE::exit(@_ ? shift : 0); }; } # command line options my @OLD_ARGV = @ARGV; my $opt_d = ''; my $opt_h = ''; my $opt_v = ''; Getopt::Long::Configure ("bundling", "no_auto_abbrev", "require_order"); GetOptions("debug|d" => \$opt_d, "help|h|?" => \$opt_h, "version|v" => \$opt_v) or usage(); # main code $DEBUG = $opt_d ? 3 : 0; if ($opt_h) { exec_wrapper("/usr/bin/perldoc -U $LDIRECTORD"); &ld_exit(127, "Exec failed"); } if ($opt_v) { print("$VERSION_STR\n" . "1999-2006 Jacob Rief, Horms and others\n" . "\n". "\n" . "ldirectord comes with ABSOLUTELY NO WARRANTY.\n" . "This is free software, and you are welcome to redistribute it\n". "under certain conditions. " . "See the GNU General Public Licence for details.\n"); &ld_exit(0, ""); } if ($DEBUG>0 and -f "./ipvsadm") { $IPVSADM="./ipvsadm"; } else { if (-x "/sbin/ipvsadm") { $IPVSADM="/sbin/ipvsadm"; } elsif (-x "/usr/sbin/ipvsadm") { $IPVSADM="/usr/sbin/ipvsadm"; } else { die "Can not find ipvsadm"; } } # There is a memory leak in perl's socket code when # the default IO layer is used. So use "perlio" unless # something else has been explicitly set. 
# http://archive.develooper.com/perl5-porters@perl.org/msg85468.html unless(defined($ENV{'PERLIO'})) { $ENV{'PERLIO'} = "perlio"; exec_wrapper($0, @OLD_ARGV); } $DAEMON_STATUS = $DAEMON_STATUS_STARTING; ld_init(); ld_setup(); ld_start(); ld_cmd_children("start", %LD_INSTANCE); $DAEMON_STATUS = $DAEMON_STATUS_RUNNING; ld_main(); &ld_rm_file("$RUNPID.$CFGNAME.pid"); &ld_exit(0, "Reached end of \"main\""); # functions sub ld_init { # install signal handlers (this covers TERM) #require Net::LDAP; $SIG{'INT'} = \&ld_handler_term; $SIG{'QUIT'} = \&ld_handler_term; $SIG{'ILL'} = \&ld_handler_term; $SIG{'ABRT'} = \&ld_handler_term; $SIG{'FPE'} = \&ld_handler_term; $SIG{'SEGV'} = \&ld_handler_term; $SIG{'TERM'} = \&ld_handler_term; $SIG{'BUS'} = \&ld_handler_term; $SIG{'SYS'} = \&ld_handler_term; $SIG{'XCPU'} = \&ld_handler_term; $SIG{'XFSZ'} = \&ld_handler_term; $SIG{'IOT'} = \&ld_handler_term; # This used to call a signal handler, that logged a message # However, this typically goes to syslog and if syslog # is playing up a loop will occur. $SIG{'PIPE'} = "IGNORE"; # HUP is actually used $SIG{'HUP'} = \&ld_handler_hup; # Reap Children $SIG{'CHLD'} = \&ld_handler_chld; if (defined $ENV{HOSTNAME}) { $HOSTNAME = "$ENV{HOSTNAME}"; } else { use POSIX "uname"; my ($s, $n, $r, $v, $m) = uname; $HOSTNAME = $n; } # search for the correct configuration file if ( !defined $ARGV[0] ) { usage(); } if ( defined $ARGV[0] && defined $ARGV[1] ) { $CONFIG = $ARGV[0]; if ($CONFIG =~ /([^\/]+)$/) { $CFGNAME = $1; } $CMD = $ARGV[1]; } elsif ( defined $ARGV[0] ) { $CONFIG = "ldirectord.cf"; $CFGNAME = "ldirectord"; $CMD = $ARGV[0]; } if ( $CMD ne "start" and $CMD ne "stop" and $CMD ne "status" and $CMD ne "restart" and $CMD ne "try-restart" and $CMD ne "reload" and $CMD ne "force-reload") { usage(); } if ( -f "@sysconfdir@/ha.d/$CONFIG" ) { $CONFIG = "@sysconfdir@/ha.d/$CONFIG"; } elsif ( -f "@sysconfdir@/ha.d/conf/$CONFIG" ) { $CONFIG = "@sysconfdir@/ha.d/conf/$CONFIG"; } elsif ( ! 
-f "$CONFIG" ) { init_error("Config file $CONFIG not found"); } read_config(); undef @OLDVIRTUAL; { my $log_str = "Invoking ldirectord invoked as: $0 "; for my $i (@ARGV) { $log_str .= $i . " "; } ld_log($log_str); } my $oldpid; my $filepid; if (open(FILE, "<$RUNPID.$CFGNAME.pid")) { $_ = ; chomp; $filepid = $_; close(FILE); # Check to make sure this isn't a stale pid file if (open(FILE, "; if (/ldirectord/) { $oldpid = $filepid; } close(FILE); } } if (defined $oldpid) { if ($CMD eq "start") { ld_exit(0, "Exiting from ldirectord $CMD"); } elsif ($CMD eq "stop") { kill 15, $oldpid; ld_exit(0, "Exiting from ldirectord $CMD"); } elsif ($CMD eq "restart" or $CMD eq "try-restart") { kill 15, $oldpid; while (-f "$RUNPID.$CFGNAME.pid") { # wait until old pid file is removed sleep 1; } # N.B Fall through } elsif ($CMD eq "reload" or $CMD eq "force-reload") { kill 1, $oldpid; ld_exit(0, "Exiting from ldirectord $CMD"); } else { # status print STDERR "ldirectord for $CONFIG is running with pid: $oldpid\n"; ld_cmd_children("status", %LD_INSTANCE); ld_log("ldirectord for $CONFIG is running with pid: $oldpid"); ld_log("Exiting from ldirectord $CMD"); ld_exit(0, "Exiting from ldirectord $CMD"); } } else { if ($CMD eq "start" or $CMD eq "restart") { ; } elsif ($CMD eq "stop" or $CMD eq "try-restart") { ld_exit(0, "Exiting from ldirectord $CMD"); } elsif ($CMD eq "status") { my $status; if (defined $filepid) { print STDERR "ldirectord stale pid file " . "$RUNPID.$CFGNAME.pid for $CONFIG\n"; ld_log("ldirectord stale pid file " . 
"$RUNPID.$CFGNAME.pid for $CONFIG"); $status = 1; } else { $status = 3; } print "ldirectord is stopped for $CONFIG\n"; ld_exit($status, "Exiting from ldirectord $CMD"); } else { ld_log("ldirectord is stopped for $CONFIG"); ld_exit(1, "Exiting from ldirectord $CMD"); } } # Run as daemon if ($SUPERVISED eq "yes" || $opt_d) { &ld_log("Starting $VERSION_STR with pid: $$"); } else { &ld_log("Starting $VERSION_STR as daemon"); open(FILE, ">$RUNPID.$CFGNAME.pid") || init_error("Can not open $RUNPID.$CFGNAME.pid"); &ld_daemon(); print FILE "$$\n"; close(FILE); } } sub usage { pod2usage(-input => $LDIRECTORD, -exitval => -1); } sub init_error { my $msg = shift; chomp($msg); &ld_log("$msg"); unless ($opt_d) { print STDERR "$msg\n"; } ld_exit(1, "Initialisation Error"); } # ld_handler_term # If we get a signal then log it and quit sub ld_handler_term { my ($signal) = (@_); if (defined $DAEMON_TERM) { $SIG{'__DIE__'} = "IGNORE"; $SIG{"$signal"} = "IGNORE"; die("Exit Handler Repeatedly Called\n"); } $DAEMON_TERM = $signal; $DAEMON_STATUS = $DAEMON_STATUS_STOPPING; } sub ld_process_term { $DAEMON_STATUS = $DAEMON_STATUS_STOPPING; ld_cmd_children("stop", %LD_INSTANCE); ld_stop(); &ld_log("Linux Director Daemon terminated on signal: $DAEMON_TERM"); &ld_rm_file("$RUNPID.$CFGNAME.pid"); &ld_exit(0, "Linux Director Daemon terminated on signal: $DAEMON_TERM"); } sub ld_handler_hup { $DAEMON_HUP=1; } sub ld_process_hup { &ld_log("Reloading Linux Director Daemon config on signal"); $DAEMON_HUP = undef; &reread_config(); } sub ld_handler_chld { $DAEMON_CHLD=1; # NOTE: calling waitpid here would mess up $? 
} sub ld_process_chld { my $i = 0; undef $DAEMON_CHLD; while (waitpid(-1, WNOHANG) > 0) { print "child: $i\n"; $i++; } } sub check_signal { if (defined $DAEMON_TERM) { ld_process_term(); } if (defined $DAEMON_HUP) { ld_process_hup(); } if (defined $DAEMON_CHLD) { ld_process_chld(); } } sub reread_config { @OLDVIRTUAL = @VIRTUAL; @VIRTUAL = (); my %OLD_INSTANCE = %LD_INSTANCE; my %RELOAD; my %STOP; my %START; my $child; $DAEMON_STATUS = $DAEMON_STATUS_RELOADING; eval { &read_config(); foreach $child (keys %LD_INSTANCE) { if (defined $OLD_INSTANCE{$child}) { $RELOAD{$child} = 1; } else { $START{$child} = 1; } } foreach $child (keys %OLD_INSTANCE) { if (not defined $LD_INSTANCE{$child}) { $STOP{$child} = 1; } } &ld_cmd_children("stop", %STOP); &ld_cmd_children("reload_or_start", %RELOAD); &ld_cmd_children("start", %START); foreach my $vid (keys %FORK_CHILDREN) { &ld_log("Killing child $vid (PID=$FORK_CHILDREN{$vid})"); kill 15, $FORK_CHILDREN{$vid}; } &ld_setup(); &ld_start(); }; if ($@) { @VIRTUAL = @OLDVIRTUAL; %LD_INSTANCE = %OLD_INSTANCE; } $DAEMON_STATUS = $DAEMON_STATUS_RUNNING; undef @OLDVIRTUAL; } sub parse_emailalertstatus { my ($line, $arg) = (@_); my @s = split/\s*,\s*/, $arg; my $none = 0; my $status = 0; for my $i (@s) { if ($i eq "none") { $none++; } } for my $i (@s) { if ($i eq "none") { next; } elsif ($i eq "all") { $status = $DAEMON_STATUS_ALL; } elsif ($i eq "starting") { $status |= $DAEMON_STATUS_STARTING; } elsif ($i eq "stopping") { $status |= $DAEMON_STATUS_STOPPING; } elsif ($i eq "running") { $status |= $DAEMON_STATUS_RUNNING; } elsif ($i eq "reloading") { $status |= $DAEMON_STATUS_RELOADING; } else { &config_error($line, "invalid email alert status at: \"$i\"") } if ($none > 0) { &config_error($line, "invalid email alert status: " . 
"\"$i\" specified with \"none\""); } } return $status; } sub set_defaults { $AUTOCHECK = "no"; $CALLBACK = undef; $CHECKCOUNT = 1; $CHECKINTERVAL = 10; $CHECKTIMEOUT = -1; $CLEANSTOP = "yes"; $DEFAULT_CHECKTIMEOUT = 5; $DEFAULT_NEGOTIATETIMEOUT = 30; $EMAILALERT = ""; $EMAILALERTFREQ = 0; $EMAILALERTFROM = undef; $EMAILALERTSTATUS = $DAEMON_STATUS_ALL; $FAILURECOUNT = 1; $FALLBACK = undef; $FALLBACK6 = undef; $FALLBACKCOMMAND = undef; $FORKING = "no"; $LDIRLOG = "/var/log/ldirectord.log"; $MAINTDIR = undef; $NEGOTIATETIMEOUT = -1; $QUIESCENT = "no"; + $READDQUIESCENT = "no"; $SUPERVISED = "no"; $SMTP = undef; } sub read_emailalert { my ($line, $addr) = (@_); # Strip of enclosing quotes $addr =~ s/^\"([^"]*)\"$/$1/; $addr =~ /(.+)/ or &config_error($line, "no email address specified"); return $addr; } sub read_config { undef @VIRTUAL; undef @REAL; undef $CALLBACK; undef %LD_INSTANCE; undef $checksum; # Reset/set global config variables to defaults before parsing the config file. set_defaults(); $stattime = 0; my %virtual_seen; open(CFGFILE, "<$CONFIG") or &config_error(0, "can not open file $CONFIG"); my $line = 0; my $linedata; while() { $line++; $linedata = $_; outer_loop: if ($linedata =~ /^virtual(6)?\s*=\s*(.*)/) { my $af = defined($1) ? AF_INET6 : AF_INET; my $vattr = $2; my $ip_port = undef; my $fwm = undef; my $virtual_id; my $virtual_line = $line; my $virtual_port; my $fallback_line; my @rsrv_todo; if ($vattr =~ /^(\d+\.\d+\.\d+\.\d+):([0-9A-Za-z-_]+)/ && $af == AF_INET) { $ip_port = "$1:$2"; $virtual_port = $2; } elsif ($vattr =~ /^([0-9A-Za-z._+-]+):([0-9A-Za-z-_]+)/) { $ip_port = "$1:$2"; $virtual_port = $2; } elsif ($vattr =~ /^(\d+)/){ $fwm = $1; } elsif ($vattr =~ /^\[([0-9A-Fa-f:]+)\]:([0-9A-Za-z-_]+)/ && $af == AF_INET) { &config_error($line, "cannot specify an IPv6 address here. 
please use \"virtual6\" instead."); } elsif ($vattr =~ /^\[([0-9A-Fa-f:]+)\]:([0-9A-Za-z-_]+)/ && $af == AF_INET6) { my $v6addr = $1; my $v6port = $2; if (!inet_pton(AF_INET6,$v6addr)) { &config_error($line,"invalid ipv6 address for virtual server"); } $ip_port = "[$v6addr]:$v6port"; $virtual_port = $v6port; } else { &config_error($line, "invalid address for virtual server"); } my (%vsrv, @rsrv); if ($ip_port) { $vsrv{checktype} = "negotiate"; $vsrv{protocol} = "tcp"; if ($ip_port =~ /:(53|domain)$/) { $vsrv{protocol} = "udp"; } $vsrv{port} = $virtual_port; } else { $vsrv{fwm} = $fwm; $vsrv{checktype} = "negotiate"; $vsrv{protocol} = "fwm"; $vsrv{service} = "none"; $vsrv{port} = "0"; } $vsrv{addressfamily} = $af; $vsrv{real} = \@rsrv; $vsrv{scheduler} = "wrr"; $vsrv{checkcommand} = "/bin/true"; $vsrv{request} = "/"; $vsrv{receive} = ""; $vsrv{login} = ""; $vsrv{passwd} = ""; $vsrv{database} = ""; $vsrv{checktimeout} = -1; $vsrv{checkcount} = -1; $vsrv{negotiatetimeout} = -1; $vsrv{failurecount} = -1; $vsrv{num_connects} = 0; $vsrv{httpmethod} = "GET"; $vsrv{secret} = ""; push(@VIRTUAL, \%vsrv); while() { $line++; $linedata=$_; if(m/^\s*#/) { next; } s/#.*//; s/\t/ /g; unless (/^ {4,}(.+)/) { last; } my $rcmd = $1; if ($rcmd =~ /^(real(6)?)\s*=\s*(.*)/) { if ($af == AF_INET && defined($2) || $af == AF_INET6 && ! defined($2)) { &config_error($line, join("", ("cannot specify \"$1\" here. please use \"real", ($af == AF_INET) ? "" : "6", "\" instead"))); } push @rsrv_todo, [$3, $line]; } elsif ($rcmd =~ /^request\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "no request string specified"); $vsrv{request} = $1; unless($vsrv{request}=~/^\//){ $vsrv{request} = "/" . 
$vsrv{request}; } } elsif ($rcmd =~ /^receive\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid receive string"); $vsrv{receive} = $1; } elsif ($rcmd =~ /^checktype\s*=\s*(.*)/){ if ($1 =~ /(\d+)/ && $1>=0) { $vsrv{num_connects} = $1; $vsrv{checktype} = "combined"; } elsif ( $1 =~ /([\w-]+)/ && ($1 eq "connect" || $1 eq "negotiate" || $1 eq "ping" || $1 eq "off" || $1 eq "on" || $1 eq "external" || $1 eq "external-perl") ) { $vsrv{checktype} = $1; } else { &config_error($line, "checktype must be \"connect\", \"negotiate\", \"on\", \"off\", \"ping\", \"external\", \"external-perl\" or a positive number"); } } elsif ($rcmd =~ /^checkcommand\s*=\s*\"(.*)\"/ or $rcmd =~ /^checkcommand\s*=\s*(.*)/){ $1 =~ /(.+)/ or &config_error($line, "invalid check command"); $vsrv{checkcommand} = $1; } elsif ($rcmd =~ /^checktimeout\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check timeout"); $vsrv{checktimeout} = $1; } elsif ($rcmd =~ /^connecttimeout\s*=\s*(.*)/){ &config_error($line, "connecttimeout directive " . "deprecated in favour of " . "negotiatetimeout"); } elsif ($rcmd =~ /^negotiatetimeout\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid negotiate timeout"); $vsrv{negotiatetimeout} = $1; } elsif ($rcmd =~ /^checkcount\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check count"); $vsrv{checkcount} = $1; &config_warn($line, "checkcount option is deprecated and slated for removal. 
please see 'failurecount'"); } elsif ($rcmd =~ /^failurecount\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid failure count"); $vsrv{failurecount} = $1; } elsif ($rcmd =~ /^checkinterval\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid checkinterval"); $vsrv{checkinterval} = $1 } elsif ($rcmd =~ /^checkport\s*=\s*(.*)/){ $1 =~ /(\d+)/ or &config_error($line, "invalid port"); ( $1 > 0 && $1 < 65536 ) or &config_error($line, "checkport must be in range 1..65536"); $vsrv{checkport} = $1; } elsif ($rcmd =~ /^login\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid login string"); $vsrv{login} = $1; } elsif ($rcmd =~ /^passwd\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid password"); $vsrv{passwd} = $1; } elsif ($rcmd =~ /^database\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid database"); $vsrv{database} = $1; } elsif ($rcmd =~ /^secret\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid secret"); $vsrv{secret} = $1; } elsif ($rcmd =~ /^load\s*=\s*\"(.*)\"/) { $1 =~ /(\w+)/ or &config_error($line, "invalid string for load testing"); $vsrv{load} = $1; } elsif ($rcmd =~ /^scheduler\s*=\s*(.*)/) { # Intentionally ommit checking the # scheduler against a list of know # schedulers. This is because from # time to time new schedulers are # added. But ldirectord is # maintained distributed # independently of this. Thus # ldirectord needs to be manually # updated/upgraded. So just accept # any scheduler that matches # [a-z]+. I.e. is syntactically # correct (all schedulers so far # match that pattern). Ipvsadm will # report an error is a scheduler # isn't available / doesn't exist. 
$1 =~ /([a-z]+)/ or &config_error($line, "invalid scheduler, should be only lowercase letters (a-z)"); $vsrv{scheduler} = $1; } elsif ($rcmd =~ /^persistent\s*=\s*(.*)/) { $1 =~ /(\d+)/ or &config_error($line, "invalid persistent timeout"); $vsrv{persistent} = $1; } elsif ($rcmd =~ /^netmask\s*=\s*(.*)/) { my $val = $1; if ($vsrv{addressfamily} == AF_INET6) { if ($val !~ /^\d+$/ or ($val < 1 || $val > 128)) { &config_error($line, "invalid netmask: a prefix length between 1 and 128 is required"); } } else { if ($val !~ /^\d+\.\d+\.\d+\.\d+$/) { &config_error($line, "invalid netmask: dotted quad notation is required"); } } $vsrv{netmask} = $val; } elsif ($rcmd =~ /^protocol\s*=\s*(.*)/) { if ( $1 =~ /(\w+)/ ) { if ( $vsrv{protocol} eq "fwm" ) { if ($1 eq "fwm" ) { ; #Do nothing, it is already set } else { &config_error($line, "protocol must be fwm if the virtual service is a fwmark (a number)"); } } else { # tcp or udp if ($1 eq "tcp" || $1 eq "udp") { $vsrv{protocol} = $1; } else { &config_error($line, "protocol must be tcp or udp if the virtual service is an address and port"); } } } else { &config_error($line, "invalid protocol"); } } elsif ($rcmd =~ /^service\s*=\s*(.*)/) { $1 =~ /(\w+)/ && ($1 eq "dns" || $1 eq "ftp" || $1 eq "http" || $1 eq "https" || $1 eq "http_proxy" || $1 eq "imap" || $1 eq "imaps" || $1 eq "ldap" || $1 eq "nntp" || $1 eq "mysql" || $1 eq "none" || $1 eq "oracle"|| $1 eq "pop" || $1 eq "pops" || $1 eq "radius"|| $1 eq "pgsql" || $1 eq "sip" || $1 eq "smtp" || $1 eq "submission" || $1 eq "simpletcp") or &config_error($line, "service must " . "be dns, ftp, " . "http, https, " . "http_proxy, " . "imap, imaps, " . "ldap, nntp, " . "mysql, none, " . "oracle, " . "pop, pops, " . "radius, " . "pgsql, " . "simpletcp, " . "sip, smtp " . 
"or submission"); $vsrv{service} = $1; if($vsrv{service} eq "ftp" and $vsrv{login} eq "") { $vsrv{login} = "anonymous"; } elsif($vsrv{service} eq "sip" and $vsrv{login} eq "") { $vsrv{login} = "ldirectord\@$HOSTNAME"; } if($vsrv{service} eq "ftp" and $vsrv{passwd} eq "") { $vsrv{passwd} = "ldirectord\@$HOSTNAME"; } } elsif ($rcmd =~ /^httpmethod\s*=\s*(.*)/) { $1 =~ /(\w+)/ && (uc($1) eq "GET" || uc($1) eq "HEAD") or &config_error($line, "httpmethod must be GET or HEAD"); $vsrv{httpmethod} = uc($1); } elsif ($rcmd =~ /^virtualhost\s*=\s*(.*)/) { $1 =~ /\"?([^\"]*)\"?/ or &config_error($line, "invalid virtualhost"); $vsrv{virtualhost} = $1; } elsif ($rcmd =~ /^(fallback(6)?)\s*=\s*(.*)/) { # Allow specification of a virtual-specific fallback host if ($af == AF_INET && defined($2) || $af == AF_INET6 && ! defined($2)) { &config_error($line, join("", ("cannot specify \"$1\" here. please use \"fallback", ($af == AF_INET) ? "" : "6", "\" instead"))); } $fallback_line=$line; $vsrv{fallback} = parse_fallback($line, $3, \%vsrv); } elsif ($rcmd =~ /^fallbackcommand\s*=\s*\"(.*)\"/ or $rcmd =~ /^fallbackcommand\s*=\s*(.*)/) { $1 =~ /(.+)/ or &config_error($line, "invalid fallback command"); $vsrv{fallbackcommand} = $1; } elsif ($rcmd =~ /^quiescent\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "quiescent must be 'yes' or 'no'"); $vsrv{quiescent} = $1; } elsif ($rcmd =~ /^emailalert\s*=\s*(.*)/) { $vsrv{emailalert} = read_emailalert($line, $1); } elsif ($rcmd =~ /^emailalertfreq\s*=\s*(\d*)/) { $1 =~ /(\d+)/ or &config_error($line, "invalid email alert frequency"); $vsrv{emailalertfreq} = $1; } elsif ($rcmd =~ /^emailalertstatus\s*=\s*(.*)/) { $vsrv{emailalertstatus} = &parse_emailalertstatus($line, $1); } elsif ($rcmd =~ /^monitorfile\s*=\s*\"(.*)\"/ or $rcmd =~ /^monitorfile\s*=\s*(.*)/) { my $monitorfile = $1; unless (open(MONITORFILE, ">>$monitorfile") and close(MONITORFILE)) { &config_error($line, "unable to open monitorfile $monitorfile: $!"); } 
$vsrv{monitorfile} = $monitorfile; } elsif ($rcmd =~ /^cleanstop\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "cleanstop must be 'yes' or 'no'"); $vsrv{cleanstop} = $1; } elsif ($rcmd =~ /^smtp\s*=\s*(.*)/) { $1 =~ /(^([0-9A-Za-z._+-]+))/ or &config_error($line, "invalid SMTP server address"); $vsrv{smtp} = $1; } else { &config_error($line, "Unknown command \"$linedata\""); } undef $linedata; } # As the protocol needs to be known to call # getservbyname() all resolution must be # delayed until the protocol is finalised. # That is after the entire configuration # for a virtual service has been parsed. &_ld_read_config_fallback_resolve($fallback_line, $vsrv{protocol}, $vsrv{fallback}, $af); &_ld_read_config_virtual_resolve($virtual_line, \%vsrv, $ip_port, $af); &_ld_read_config_real_resolve(\%vsrv, \@rsrv_todo, $af); # Check for duplicate now we have all the # information to generate the id $virtual_id = get_virtual_id_str(\%vsrv); if (defined $virtual_seen{$virtual_id}) { &config_error($line, "duplicate virtual server"); } $virtual_seen{$virtual_id} = 1; unless(defined($linedata)) { last; } #Arggh a goto :( goto outer_loop; } next if ($linedata =~ /^\s*$/ || $linedata =~ /^\s*#/); if ($linedata =~ /^checktimeout\s*=\s*(.*)/) { ($1 =~ /(\d+)/ && $1 && $1>0) or &config_error($line, "invalid check timeout value"); $CHECKTIMEOUT = $1; } elsif ($linedata =~ /^connecttimeout\s*=\s*(.*)/) { &config_error($line, "connecttimeout directive " . "deprecated in favour of " . 
"negotiatetimeout"); } elsif ($linedata =~ /^negotiatetimeout\s*=\s*(.*)/) { ($1 =~ /(\d+)/ && $1 && $1>0) or &config_error($line, "invalid negotiate timeout value"); $NEGOTIATETIMEOUT = $1; } elsif ($linedata =~ /^checkinterval\s*=\s*(.*)/) { $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check interval value"); $CHECKINTERVAL = $1; } elsif ($linedata =~ /^checkcount\s*=\s*(.*)/) { $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check count value"); $CHECKCOUNT = $1; &config_warn($line, "checkcount option is deprecated and slated for removal. please see 'failurecount'"); } elsif ($linedata =~ /^failurecount\s*=\s*(.*)/) { $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid failure count value"); $FAILURECOUNT = $1; } elsif ($linedata =~ /^fallback(6)?\s*=\s*(.*)/) { my $af = defined($1) ? AF_INET6 : AF_INET; my $tcp = parse_fallback($line, $2, undef); my $udp = parse_fallback($line, $2, undef); &_ld_read_config_fallback_resolve($line, "tcp", $tcp, $af); &_ld_read_config_fallback_resolve($line, "udp", $udp, $af); if ($af == AF_INET) { $FALLBACK = { "tcp" => $tcp, "udp" => $udp }; } else { $FALLBACK6 = { "tcp" => $tcp, "udp" => $udp }; } } elsif ($linedata =~ /^fallbackcommand\s*=\s*(.*)/) { $1 =~ /(.+)/ or &config_error($line, "invalid fallback command"); $FALLBACKCOMMAND = $1; } elsif ($linedata =~ /^autoreload\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "autoreload must be 'yes' or 'no'"); $AUTOCHECK = $1; } elsif ($linedata =~ /^callback\s*=\s*\"(.*)\"/) { $CALLBACK = $1; } elsif ($linedata =~ /^logfile\s*=\s*\"(.*)\"/) { my $tmpLDIRLOG = $LDIRLOG; $LDIRLOG = $1; if (&ld_openlog()) { $LDIRLOG = $tmpLDIRLOG; &config_error($line, "unable to open logfile: $1"); } } elsif ($linedata =~ /^execute\s*=\s*(.*)/) { $LD_INSTANCE{$1} = 1; } elsif ($linedata =~ /^fork\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "fork must be 'yes' or 'no'"); $FORKING = $1; } elsif ($linedata =~ /^supervised/) { if (($linedata =~ 
/^supervised\s*=\s*(.*)/) and ($1 eq "yes" || $1 eq "no")) { $SUPERVISED = $1; } elsif ($linedata =~ /^supervised\s*$/) { $SUPERVISED = "yes"; &config_warn($line, "please update your config not to " . "use a bare supervised directive"); } else { &config_error($line, "supervised must be 'yes' or 'no'"); } } elsif ($linedata =~ /^quiescent\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "quiescent must be 'yes' or 'no'"); $QUIESCENT = $1; + } elsif ($linedata =~ /^readdquiescent\s*=\s*(.*)/) { + ($1 eq "yes" || $1 eq "no") + or &config_error($line, + "readdquiescent must be 'yes' or 'no'"); + $READDQUIESCENT = $1; } elsif ($linedata =~ /^emailalert\s*=\s*(.*)/) { $EMAILALERT = read_emailalert($line, $1); } elsif ($linedata =~ /^emailalertfreq\s*=\s*(\d*)/) { $1 =~ /(\d+)/ or &config_error($line, "invalid email alert frequency"); $EMAILALERTFREQ = $1; } elsif ($linedata =~ /^emailalertstatus\s*=\s*(.*)/) { $EMAILALERTSTATUS = &parse_emailalertstatus($line, $1); } elsif ($linedata =~ /^emailalertfrom\s*=\s*(.*)/) { $1 =~ /(.+)/ or &config_error($line, "no email from address specified"); $EMAILALERTFROM = $1; } elsif ($linedata =~ /^cleanstop\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "cleanstop must be 'yes' or 'no'"); $CLEANSTOP = $1; } elsif ($linedata =~ /^smtp\s*=\s*(.*)/) { $1 =~ /(^([0-9A-Za-z._+-]+))/ or &config_error($line, "invalid SMTP server address"); $SMTP = $1; } elsif ($linedata =~ /^maintenancedir\s*=\s*(.*)/) { $1 =~ /(.+)/ or &config_error($line, "maintenance directory not specified"); $MAINTDIR = $1; -d $MAINTDIR or &config_warn($line, "maintenance directory does not exist"); } else { if ($linedata =~ /^timeout\s*=\s*(.*)/) { &config_error($line, "timeout directive " . "deprecated in favour of " . "checktimeout and " . 
"negotiatetimeout"); } &config_error($line, "Unknown command $linedata "); } } close(CFGFILE); # Check for sensible use of checkinterval, warn if it is used in a virtual # service when fork=no if ($FORKING eq 'no') { foreach my $v (@VIRTUAL) { if (defined($$v{checkinterval})) { config_warn(-1, "checkinterval in virtual service ". get_virtual_id_str($v)." ignored when fork=no"); } } } return(0); } # _ld_read_config_virtual_resolve # Note: Should not need to be called directly, but won't do any damage if # you do. # Resolve the server (ip address) and port for a virtual service # pre: line: Line of configuration file fallback server was read from # Used for debugging messages # vsrv: Virtual Service to resolve server and port of # ip_port: server and port in the form # ip_address|hostname:port|service # af: Address family: AF_INET or AF_INET6 # post: Take ip_port, resolve it as per ld_gethostservbyname # and set $vsrv->{server} and $vsrv->{port} accordingly. # If $vsrv->{service} is not set, then set according to the value of # $vsrv->{port} # return: none # Debugging message will be reported and programme will exit # on error. 
sub _ld_read_config_virtual_resolve
{
	my ($line, $vsrv, $ip_port, $af) = (@_);

	# $ip_port is false for services read_config defined without an
	# address:port (firewall marks); there is nothing to resolve then.
	if ($ip_port) {
		$ip_port = &ld_gethostservbyname($ip_port, $vsrv->{protocol}, $af);
		if ($ip_port =~ /(\[[0-9A-Fa-f:]+\]):(\d+)/) {
			# Bracketed IPv6 literal: the brackets stay in {server}
			$vsrv->{server} = $1;
			$vsrv->{port} = $2;
		} elsif ($ip_port) {
			($vsrv->{server}, $vsrv->{port}) = split /:/, $ip_port;
		} else {
			&config_error($line,
				"invalid address for virtual service");
		}

		# Only guess the service from the port if the configuration
		# did not name one explicitly.
		if (!defined($vsrv->{service})) {
			$vsrv->{service} = ld_port_to_service($vsrv->{port});
		}
	}
}

# ld_port_to_service
# Resolve an ldirectord service name from its port number
# pre: port: port number of the service
# return: service name
#         "none" if the port is not associated with a known service
sub ld_port_to_service
{
	my ($port) = (@_);

	# Built inside the sub so that it is initialised regardless of where
	# in the file execution has got to when the sub is first called.
	# Keys are compared as strings, exactly like the "eq" tests this
	# table replaces.
	my %service_for_port = (
		21   => "ftp",
		25   => "smtp",
		53   => "dns",
		80   => "http",
		110  => "pop",
		119  => "nntp",
		143  => "imap",
		389  => "ldap",
		443  => "https",
		587  => "submission",
		995  => "pops",
		993  => "imaps",
		1521 => "oracle",
		1812 => "radius",
		3128 => "http_proxy",
		3306 => "mysql",
		5060 => "sip",
		5432 => "pgsql",
	);

	if (defined($port) and exists($service_for_port{$port})) {
		return $service_for_port{$port};
	}
	return "none";
}

# ld_service_to_port
# Resolve the port number from an ldirectord service name
# pre: service: name of the service
# return: port number
#         undef if the service is unknown
sub ld_service_to_port
{
	my ($service) = (@_);

	# See ld_port_to_service for why this table lives inside the sub.
	my %port_for_service = (
		"ftp"        => 21,
		"smtp"       => 25,
		"dns"        => 53,
		"http"       => 80,
		"pop"        => 110,
		"nntp"       => 119,
		"imap"       => 143,
		"ldap"       => 389,
		"https"      => 443,
		"submission" => 587,
		"imaps"      => 993,
		"pops"       => 995,
		"oracle"     => 1521,
		"radius"     => 1812,
		"http_proxy" => 3128,
		"mysql"      => 3306,
		"sip"        => 5060,
		"pgsql"      => 5432,
	);

	if (defined($service) and exists($port_for_service{$service})) {
		return $port_for_service{$service};
	}
	return undef;
}

# ld_checkport
# Resolve the port to connect to for service checks
# Note: Should only be used inside service checks,
#       as its not the same as the port of the real server
# pre: v: virtual service
#      r: real server
# return: port number, chosen in this order: the virtual service's
#         checkport, the real server's port if it is greater than zero,
#         the default port of the virtual service's service
#         undef if the service is unknown
sub ld_checkport
{
	my ($v, $r) = (@_);

	if (defined $v->{checkport}) {
		return $v->{checkport};
	}
	if ($r->{port} > 0) {
		return $r->{port};
	}
	return ld_service_to_port($v->{service});
}

# _ld_read_config_fallback_resolve
# Note: Should not need to be called directly, but won't do any damage if
#       you do.
# Resolve the address and port of a fallback server
# pre: line: Line of configuration file fallback server was read from
#            Used for debugging messages
#      protocol: Protocol passed to ld_getservbyname when resolving the port
#      fallback: Hash reference as returned by parse_fallback.
#                Nothing is done if this is false.
#      af: Address family: AF_INET or AF_INET6
# post: $fallback->{server} is replaced by the result of ld_gethostbyname.
#       If $fallback->{port} is set it is replaced by the result of
#       ld_getservbyname.
# return: none
#         Debugging message will be reported and programme will exit
#         on error.
sub _ld_read_config_fallback_resolve
{
	my ($line, $protocol, $fallback, $af) = (@_);

	my ($ipversion, $ipaddress);

	unless ($fallback) {
		return;
	}

	# Only used to make the error message below more helpful
	if ($af == AF_INET) {
		$ipversion = "IPv4";
	} elsif ($af == AF_INET6) {
		$ipversion = "IPv6";
	} else {
		$ipversion = "IP??($af)";
	}

	unless ($ipaddress = &ld_gethostbyname($fallback->{server}, $af)) {
		&config_error($line, "invalid $ipversion address or could not resolve for fallback server: " . $fallback->{server});
	}
	$fallback->{server} = $ipaddress;

	unless ($fallback->{"port"}) {
		return;
	}

	$fallback->{port} = &ld_getservbyname($fallback->{port}, $protocol) or
		&config_error($line, "invalid port for fallback server");
}

# _ld_read_config_real_resolve
# Note: Should not need to be called directly, but won't do any damage if
#       you do.
# Run through the list of real servers read in the configuration file for a
# virtual server and parse these entries
# pre: vsrv: Virtual Service to parse real servers for
#      rsrv_todo: List of real servers read from config but not parsed.
#                 List is a list of list reference. The first element in
#                 each list reference is the line read from the
#                 configuration after "real=". The second element is the
#                 line number, used for error reporting
#      af: Address family: AF_INET or AF_INET6
# post: Run through rsrv_todo and parse real servers
# return: none
#         Debugging message will be reported and programme will exit
#         on error.
sub _ld_read_config_real_resolve
{
	my ($vsrv, $rsrv_todo, $af) = (@_);

	my $i;
	my $str;
	my $line;
	my $ip1;
	my $ip2;
	my $port;
	my $resolved_ip1;
	my $resolved_ip2;
	my $resolved_port;
	my $flags;

	for $i (@$rsrv_todo) {
		($str, $line) = @$i;

		# Form: host[->host][:port] flags
		# where host is a dotted quad, a hostname or a bracketed
		# IPv6 literal made up of hex digits and colons.
		$str =~ /(\d+\.\d+\.\d+\.\d+|[A-Za-z0-9.-]+|\[[0-9A-Fa-f:]+\])(->(\d+\.\d+\.\d+\.\d+|[A-Za-z0-9.-]+|\[[0-9A-Fa-f:]+\]))?(:(\d+|[A-Za-z0-9-_]+))?\s+(.*)/
			or &config_error($line,
				"invalid address for real server" .
				" (wrong format)");

		$ip1 = $1;
		$ip2 = $3;
		if (defined($5)) {
			$port = $5;
		} else {
			$port = "0";
		}
		$flags = $6;

		$resolved_ip1 = &ld_gethostbyname($ip1, $af);
		unless (defined($resolved_ip1)) {
			&config_error($line,
				"invalid address ($ip1) for real server" .
				" (could not resolve host)");
		}

		if (defined($port)) {
			$resolved_port = &ld_getservbyname($port);
			unless (defined($resolved_port)) {
				&config_error($line,
					"invalid port ($port) for real server" .
					" (could not resolve port)");
			}
		}

		if (defined($ip2)) {
			$resolved_ip2 = &ld_gethostbyname($ip2, $af);
			unless (defined($resolved_ip2)) {
				&config_error($line,
					"invalid address ($ip2) for " .
					"real server" .
					" (could not resolve end host)");
			}
			&add_real_server_range($line, $vsrv, $resolved_ip1,
				$resolved_ip2, $resolved_port, $flags, $af);
		} else {
			&add_real_server($line, $vsrv, $resolved_ip1,
				$resolved_port, $flags);
		}
	}
}

# add_real_server_range
# Add a real server for each IP address in a range
# pre: line: line number real server was read from
#            Used for debugging information
#      vsrv: virtual server to add real server to
#      first: First IP address in range
#      last: Last IP address in range
#      port: Port of real servers
#      flags: Flags for real servers. Should be of the form
#             gate|masq|ipip [weight] ["request", "receive"]
#      af: Address family: AF_INET or AF_INET6
# post: real servers are added to virtual server
# return: none
#         Debugging message will be reported and programme will exit
#         on error.
sub add_real_server_range
{
	my ($line, $vsrv, $first, $last, $port, $flags, $af) = (@_);

	my ($first_i, $last_i, $i);

	if ($af == AF_INET) {
		if (($first_i = &ip_to_int($first)) < 0) {
			&config_error($line, "Invalid IP address: $first");
		}
		if (($last_i = &ip_to_int($last)) < 0) {
			&config_error($line, "Invalid IP address: $last");
		}

		if ($first_i > $last_i) {
			&config_error($line,
				"Invalid Range: $first-$last: First value must be " .
				"less than or equal to the second value");
		}

		# The addresses are integers at this point, so the bounds
		# must be compared numerically. A string comparison orders
		# "9" after "10" and silently produces an empty range.
		for ($i = $first_i; $i <= $last_i; $i++) {
			&add_real_server($line, $vsrv, &int_to_ip($i), $port,
				$flags);
		}
	} elsif ($af == AF_INET6) {
		# not supported yet
		&config_error($line, "Address range for IPv6 is not supported yet");
	} else {
		die "address family must be AF_INET or AF_INET6\n";
	}
}

# add_real_server
# Add a real server to a virtual
# pre: line: line number real server was read from
#            Used for debugging information
#      vsrv: virtual server to add real server to
#      ip: IP address of real server
#      port: Port of real server
#      flags: Flags for real server. Should be of the form
#             gate|masq|ipip [weight] ["request", "receive"]
# post: real server is added to virtual server and recorded in @REAL
# return: none
#         Debugging message will be reported and programme will exit
#         on error.
sub add_real_server
{
	my ($line, $vsrv, $ip, $port, $flags) = (@_);

	my $realsrv = 0;
	my $new_rsrv;
	my $rsrv;

	$new_rsrv = {"server"=>$ip, "port"=>$port};

	# Mandatory forwarding method
	$flags =~ /(\w+)(.*)/ && ($1 eq "gate" || $1 eq "masq" || $1 eq "ipip")
		or &config_error($line, "forward method must be gate, masq or ipip");
	$new_rsrv->{"forward"} = $1;
	$flags = $2;

	$rsrv = $vsrv->{"real"};

	# Optional weight, defaults to 1
	if (defined($flags) and $flags =~ /\s+(\d+)(.*)/) {
		$new_rsrv->{"weight"} = $1;
		$flags = $2;
	} else {
		$new_rsrv->{"weight"} = 1;
	}

	# Optional per-real-server request/receive pair
	if (defined($flags) and $flags =~ /\s+\"(.*)\"[, ]\s*\"(.*)\"(.*)/) {
		$new_rsrv->{"request"} = $1;
		unless ($new_rsrv->{request} =~ /^\//) {
			$new_rsrv->{request} = "/" . $new_rsrv->{request};
		}
		$new_rsrv->{"receive"} = $2;
		$flags = $3;
	}

	# Anything left over is a syntax error
	if (defined($flags) and $flags =~ /\S/) {
		&config_error($line, "Invalid real server line, around "
			. "\"$flags\"");
	}

	push(@$rsrv, $new_rsrv);

	# @REAL maps each real server id to the list of virtual services
	# that use it.
	my $real = get_real_id_str($new_rsrv, $vsrv);
	my $virtual = get_virtual_id_str($vsrv);
	for my $r (@REAL) {
		if ($r->{"real"} eq $real) {
			my $ref = $r->{"virtual"};
			push(@$ref, $virtual);
			$realsrv = 1;
			last;
		}
	}
	if ($realsrv == 0) {
		push(@REAL, { "real"=>$real, "virtual"=>[ $virtual ] });
	}
}

# parse_fallback
# Parse a fallback server
# pre: line: line number real server was read from
#      fallback: line read from configuration file
#                Should be of the form
#                ip_address|hostname[:port|:service_name] [gate|masq|ipip]
#      vsrv: virtual service the fallback belongs to, or undef.
#            If defined, its port is used when fallback names no port.
# post: fallback is parsed
# return: Reference to hash of the form
#         { server => blah, port => blah, forward => blah, weight => 1 }
#         port may be undef. forward defaults to "gate".
#         Debugging message will be reported and programme will exit
#         on error.
sub parse_fallback
{
	my ($line, $fallback, $vsrv) = (@_);

	my $parse_line;
	my $server;
	my $port;
	my $fwd;

	$parse_line = $fallback;

	if ($parse_line =~ /(\S+)(\s+(\S+))?\s*$/) {
		# get "ip:port" and a forwarding method
		$fwd = $3;
		$parse_line = $1;
	}

	if ($parse_line =~ /(:(\d+|[A-Za-z0-9-_]+))?$/) {
		# get host and port
		$port = $2;
		$parse_line =~ s/(:(\d+|[A-Za-z0-9-_]+))?$//;
		$server = $parse_line;
	}

	unless (defined($server)) {
		&config_error($line, "invalid fallback server: $fallback");
	}

	if (not defined($port) and defined($vsrv)) {
		$port = $vsrv->{"port"};
	}

	if ($fwd) {
		($fwd eq "gate" || $fwd eq "masq" || $fwd eq "ipip")
			or &config_error($line, "forward method must be gate, masq or ipip");
	} else {
		$fwd = "gate"
	}

	return({"server"=>$server, "port"=>$port, "forward"=>$fwd,
		"weight"=>1});
}

# __config_log
# Format a configuration message and emit it
# pre: line: line number of the configuration file the message relates to.
#            The file name and line are only included if this is
#            greater than zero.
#      prefix: text placed at the start of the message
#      msg: the message
# post: message is printed to STDERR if $opt_d is set or the daemon status
#       is $DAEMON_STATUS_STARTING, otherwise it is passed to ld_log
# return: none
sub __config_log
{
	my ($line, $prefix, $msg) = @_;

	# Make sure there is exactly one trailing newline
	chomp($msg);
	$msg .= "\n";

	my $msg_prefix = "$prefix [$$]";
	if ($line > 0) {
		$msg_prefix .= " reading file $CONFIG at line $line";
	}
	$msg = "$msg_prefix: $msg";

	if ($opt_d or $DAEMON_STATUS == $DAEMON_STATUS_STARTING) {
		print STDERR $msg;
	} else {
		&ld_log("$msg");
	}
}

# config_warn
# Report a configuration warning and carry on
# pre: line: line number of the configuration file, see __config_log
#      msg: the message
# return: none
sub config_warn
{
	my ($line, $msg) = @_;

	__config_log($line, "Warning", $msg);
}

# config_error
# Report a configuration error and abort
# pre: line: line number of the configuration file, see __config_log
#      msg: the message
# post: If the daemon status is $DAEMON_STATUS_STARTING the pid file is
#       removed and ld_exit is called with status 2.
#       Otherwise die is called.
# return: none
sub config_error
{
	my ($line, $msg) = @_;

	__config_log($line, "Error", $msg);
	if ($DAEMON_STATUS == $DAEMON_STATUS_STARTING) {
		&ld_rm_file("$RUNPID.$CFGNAME.pid");
		&ld_exit(2, "config_error: Configuration Error");
	} else {
		die;
	}
}

# ld_setup
# Derive run-time values for every virtual service in @VIRTUAL and for
# each of its real servers
# pre: @VIRTUAL has been populated
# post: For each virtual service: proto and flags are set, and
#       checktimeout, negotiatetimeout, checkcount and failurecount
#       that are less than zero are replaced by their defaults.
#       For each real server: forw, url, request, receive and
#       num_connects are set.
# return: none
sub ld_setup
{
	for my $v (@VIRTUAL) {
		if ($$v{protocol} eq "tcp") {
			$$v{proto} = "-t";
		} elsif ($$v{protocol} eq "udp") {
			$$v{proto} = "-u";
		} elsif ($$v{protocol} eq "fwm") {
			$$v{proto} = "-f";
		}

		$$v{flags} = "$$v{proto} " . &get_virtual_option($v) . " ";
		$$v{flags} .= "-s $$v{scheduler} " if defined ($$v{scheduler});
		if (defined $$v{persistent}) {
			$$v{flags} .= "-p $$v{persistent} ";
			# The netmask is only added for persistent services
			$$v{flags} .= "-M $$v{netmask} " if defined ($$v{netmask});
		}

		my $real = $$v{real};
		for my $r (@$real) {
			$$r{forw} = get_forward_flag($$r{forward});
			my $port = ld_checkport($v, $r);

			my $schema = $$v{service};
			if ($$v{service} eq 'http_proxy') {
				$schema = 'http';
			}

			if (defined $$r{request} && defined $$r{receive}) {
				# The real server has its own request/receive
				my $uri = $$r{request};
				$uri =~ s/^\///g;
				if ($$r{request} =~ /$schema:\/\//) {
					$$r{url} = "$uri";
				} else {
					$$r{url} = "$schema:\/\/$$r{server}:$port\/$uri";
				}
			} else {
				# Fall back to those of the virtual service
				my $uri = $$v{request};
				$uri =~ s/^\///g;
				if ($$v{service} eq 'http_proxy') {
					$$r{url} = "$uri";
				} else {
					$$r{url} = "$schema:\/\/$$r{server}:$port\/$uri";
				}
				$$r{request} = $$v{request} unless defined $$r{request};
				$$r{receive} = $$v{receive};
			}

			if ($$v{checktype} eq "combined") {
				$$r{num_connects} = 999999;
			} else {
				$$r{num_connects} = -1;
			}
		}

		# checktimeout and negotiate timeout are
		# mutual defaults for each other, so calculate
		# checktimeout in a temporary variable so as not
		# to affect the calculation of negotiatetimeout.
		my $checktimeout = $$v{checktimeout};
		if ($checktimeout < 0) {
			$checktimeout = $$v{negotiatetimeout};
		}
		if ($checktimeout < 0) {
			$checktimeout = $CHECKTIMEOUT;
		}
		if ($checktimeout < 0) {
			$checktimeout = $NEGOTIATETIMEOUT;
		}
		if ($checktimeout < 0) {
			$checktimeout = $DEFAULT_CHECKTIMEOUT;
		}

		if ($$v{negotiatetimeout} < 0) {
			$$v{negotiatetimeout} = $$v{checktimeout};
		}
		if ($$v{negotiatetimeout} < 0) {
			$$v{negotiatetimeout} = $NEGOTIATETIMEOUT;
		}
		if ($$v{negotiatetimeout} < 0) {
			$$v{negotiatetimeout} = $CHECKTIMEOUT;
		}
		if ($$v{negotiatetimeout} < 0) {
			$$v{negotiatetimeout} = $DEFAULT_NEGOTIATETIMEOUT;
		}

		$$v{checktimeout} = $checktimeout;

		if ($$v{checkcount} < 0) {
			$$v{checkcount} = $CHECKCOUNT;
		}

		if ($$v{failurecount} < 0) {
			$$v{failurecount} = $FAILURECOUNT;
		}
	}
}

# ld_readline
#
# Net::FTP seems to set the input record separator ($/) to null
# putting IO into slurp (whole file at a time, rather than line at a time)
# mode. Net::FTP does this using local $/, which should mean
# that the change doesn't affect code here, but it does. It also
# seems to be impossible to turn it off, by say setting $/ back to '\n'
# Perhaps there is more to this than meets the eye. Perhaps it's a perl bug.
# In any case, this should fix the problem.
#
# This should not affect pid or config file parsing as they are called
# before Net::FTP and as this appears to be a bit of a work around,
# I'd rather use it in as few places as possible
#
# Observed with perl v5.8.8 (Debian's perl 5.8.8-6)
# -- Horms, 17th July 2005
#
# pre: fd: file handle to read from
#      buf: reference to a list used to hold lines that have been read
#           but not yet returned. Pass the same reference on each call.
# return: next line, terminated by a newline
#         undef if there is nothing left to read
sub ld_readline
{
	my ($fd, $buf) = (@_);
	my $line;

	# Uncomment the following line to turn off this work around
	# return readline($fd);

	# Hand out lines left over from a previous read first
	$line = shift @$buf;
	if (defined $line) {
		return $line . "\n";
	}

	# One read may return many lines, split them up
	push @$buf, split /\n/, readline($fd);

	$line = shift @$buf;
	if (defined $line) {
		return $line . "\n";
	}

	return undef;
}

# ld_read_ipvsadm
# Parses the output of "ipvsadm -L -n" and puts into a structure of
# the following form:
#
# {
#   (vip_address:vport|fwmark) protocol => {
#     "scheduler" => scheduler,
#     "persistent" => timeout,    # May be omitted
#     "netmask" => netmask,       # May be omitted
#     "real" => {
#       rip_address:rport => {
#         "forward" => forwarding_mechanism,
#         "weight" => weight
#       },
#       ...
#     }
#   },
#   ...
# }
#
# where:
#   vip_address: IP address of virtual service
#   vport: Port of virtual service
#   fwmark: Firewall Mark of virtual service
#   scheduler: Scheduler for virtual service
#   timeout: Timeout for persistency. Omitted if service is not persistent.
#   netmask: Netmask for persistency. Omitted if service is not persistent.
#
#   rip_address: IP address of real server
#   rport: Port of real server
#   forwarding_mechanism: Forwarding mechanism for real server.
#                         One of: gate, ipip, masq.
#   weight: Weight of real server
#
# pre: none
# post: ipvsadm -L -n is parsed
# result: reference to structure detailed above.
sub ld_read_ipvsadm
{
	my %oldsrv;
	my $real_service;
	my $fwd;
	my $buf = [];
	my $fh;
	my $line;

	# read status of current ipvsadm -L -n
	unless (open($fh, "$IPVSADM -L -n 2>&1|")) {
		&ld_exit(1, "Could not run $IPVSADM -L -n: $!");
	}

	# Skip the first three lines
	$line = ld_readline($fh, $buf);
	$line = ld_readline($fh, $buf);
	$line = ld_readline($fh, $buf);

	while (1) {
		$line = ld_readline($fh, $buf);
		if (not defined $line) {
			last;
		}

		# Virtual service lines are tried from most to least
		# specific: with persistence and mask, with persistence
		# only, then plain.
		if ($line =~ /^(\w+)\s+(\d+\.\d+\.\d+\.\d+\:\d+|\[[0-9A-Fa-f:]+\]:\d+|\d+)( IPv6)?\s+(\w+)\s+persistent\s+(\d+)\s+mask\s+(.*)/) {
			$real_service = &gen_real_service_str($2, $1, $3);
			$oldsrv{"$real_service"} = {"real"=>{}, "scheduler"=>$4, "persistent"=>$5, "netmask"=>$6};
		} elsif ($line =~ /^(\w+)\s+(\d+\.\d+\.\d+\.\d+\:\d+|\[[0-9A-Fa-f:]+\]:\d+|\d+)( IPv6)?\s+(\w+)\s+persistent\s+(\d+)/) {
			$real_service = &gen_real_service_str($2, $1, $3);
			$oldsrv{"$real_service"} = {"real"=>{}, "scheduler"=>$4, "persistent"=>$5};
		} elsif ($line =~ /^(\w+)\s+(\d+\.\d+\.\d+\.\d+\:\d+|\[[0-9A-Fa-f:]+\]:\d+|\d+)( IPv6)?\s+(\w+)/) {
			$real_service = &gen_real_service_str($2, $1, $3);
			$oldsrv{"$real_service"} = {"real"=>{}, "scheduler"=>$4};
		} elsif ($line =~ /^\s+->\s+(\d+\.\d+\.\d+\.\d+\:\d+|\[[0-9A-Fa-f:]+\]:\d+)\s+(\w+)\s+(\d+)/) {
			# Real server line. Any amount of leading whitespace
			# before "->" is accepted rather than an exact count.
			if (not defined($real_service)) {
				&ld_debug(2, "Real server read from ipvsadm " .
					"doesn't seem to be inside a " .
					"virtual service: \"$line\"\n");
				next;
			}
			if ($2 eq "Route") {
				$fwd = "gate";
			} elsif ($2 eq "Tunnel") {
				$fwd = "ipip";
			} elsif ($2 eq "Masq") {
				$fwd = "masq";
			}
			$oldsrv{"$real_service"}->{"real"}->{"$1"} =
				{"forward"=>$fwd, "weight"=>$3};
		} else {
			&ld_debug(2, "Unknown line read from ipvsadm: " .
				"\"$line\"\n");
			next;
		}
	}
	close($fh);

	return(\%oldsrv);
}

# gen_real_service_str
# Build the key used to identify a virtual service read from ipvsadm
# pre: service_address: address:port or firewall mark of the service
#      protocol: protocol of the service, it is lower-cased
#      v6flag: "6" is appended to the key if this is defined
# return: key string
sub gen_real_service_str
{
	my ($service_address, $protocol, $v6flag) = @_;

	return "$service_address ".lc($protocol).(defined($v6flag) ? "6" : "");
}

# get_real_service_str
# Build, for a configured virtual service, the key that
# gen_real_service_str produces for the same service
# pre: v: virtual service
# return: key string. "6" is appended only for firewall mark services
#         whose address family is AF_INET6.
sub get_real_service_str
{
	my ($v) = (@_);

	if ($v->{"protocol"} eq "fwm") {
		return &get_virtual($v) . " " . $v->{protocol} .
			(($v->{addressfamily} == AF_INET6) ? "6" : "");
	} else {
		return &get_virtual($v) . " " . $v->{protocol};
	}
}

# ld_start
# Bring the ipvsadm table in line with the configuration
# pre: @VIRTUAL and @OLDVIRTUAL are populated
# post: Each virtual service in @VIRTUAL is added or edited.
#       Real servers found in ipvsadm with a non-zero weight are set up.
#       All other configured real servers are set down; when
#       $READDQUIESCENT is "no" they are set up first so that they are
#       present in the table.
#       Entries read from ipvsadm that are not accounted for are purged.
# return: none
sub ld_start
{
	my $server_down = {};

	# read status of current ipvsadm -L -n
	my $oldsrv = &ld_read_ipvsadm();

	# make sure virtual servers are up to date
	foreach my $nv (@VIRTUAL) {
		my $real_service = &get_real_service_str($nv);

		if (exists($oldsrv->{"$real_service"})) {
			# service exists, modify it
			&system_wrapper("$IPVSADM -E $$nv{flags}");
			&ld_log("Changed virtual server: " . &get_virtual($nv));
		} else {
			# no such service, create a new one
			&system_wrapper("$IPVSADM -A $$nv{flags}");
			&ld_log("Added virtual server: " . &get_virtual($nv));
		}
	}

	# make sure real servers are up to date
	foreach my $nv (@VIRTUAL) {
		my $nreal = $nv->{real};
		my $ov = $oldsrv->{&get_real_service_str($nv)};
		my $or = $ov->{real};
		my $fallback = fallback_find($nv);

		# The fallback server is handled by fallback_on below,
		# so it must not be purged as an untracked real server.
		if (defined($fallback)) {
			delete($or->{"$fallback->{server}:$fallback->{port}"});
		}

		for my $nr (@$nreal) {
			my $real_str = "$nr->{server}:$nr->{port}";
			if (! defined($or->{$real_str}) or
			    $or->{$real_str}->{weight} == 0) {
				# Deferred until all virtual services
				# have been processed
				$server_down->{$real_str} = [$nv, $nr];
			} else {
				if (defined $server_down->{$real_str}) {
					delete($server_down->{$real_str});
				}
				service_set($nv, $nr, "up", {force => 1});
			}
			delete($or->{$real_str});
		}

		# remove remaining entries for real servers
		for my $k (keys %$or) {
			purge_untracked_service($nv, $k, "start");
			delete($$or{$k});
		}

		delete($oldsrv->{&get_real_service_str($nv)});
		&fallback_on($nv);
	}

	for my $k (keys (%$server_down)) {
		my $v = $server_down->{$k};
		if ($READDQUIESCENT eq "no") {
			# Ensure that the server is initially added
			service_set($v->[0], $v->[1], "up", {force => 1});
		}
		# Remove Server
		service_set($v->[0], $v->[1], "down", {force => 1});
		delete($server_down->{$k});
	}

	# remove remaining entries for virtual servers
	foreach my $nv (@OLDVIRTUAL) {
		if (! defined($oldsrv->{&get_real_service_str($nv)})) {
			next;
		}
		purge_virtual($nv, "start");
	}
}

# ld_cmd_children
# Run a command for other ldirectord instances
# pre: cmd: command to pass to each instance. "reload_or_start" runs
#           "reload" and, if system_wrapper returns true for that,
#           "start".
#      children: hash whose keys are passed to $LDIRECTORD as the
#                instance argument
# return: none
sub ld_cmd_children
{
	my ($cmd, %children) = (@_);

	# instantiate other ldirectord, if specified
	foreach my $child (keys %children) {
		if ($cmd eq "reload_or_start") {
			if (&system_wrapper("$LDIRECTORD $child reload")) {
				&system_wrapper("$LDIRECTORD $child start");
			}
		} else {
			&system_wrapper("$LDIRECTORD $child $cmd");
		}
	}
}

# ld_stop
# Stop checking and, unless cleanstop is "no", purge services
# post: If $FORKING is 'yes', signal 15 is sent to every process in
#       %FORK_CHILDREN.
#       For each virtual service whose cleanstop setting (or $CLEANSTOP
#       if it has none) is not 'no': real servers with a defined
#       virtual_status are purged, then the virtual service is purged.
# return: none
sub ld_stop
{
	# Kill children
	if ($FORKING eq 'yes') {
		foreach my $virtual_id (keys (%FORK_CHILDREN)) {
			my $pid = $FORK_CHILDREN{$virtual_id};
			ld_log("Killing child $virtual_id PID=$pid");
			kill 15, $pid;
		}
	}

	foreach my $v (@VIRTUAL) {
		# A per-virtual cleanstop overrides the global setting
		next if ( (! defined($$v{cleanstop}) and $CLEANSTOP eq 'no') or
			  (defined($$v{cleanstop}) and $$v{cleanstop} eq 'no') );
		my $real = $$v{real};
		foreach my $r (@$real) {
			if (defined $$r{virtual_status}) {
				purge_service($v, $r, "stop");
			}
		}
		purge_virtual($v, "stop");
	}
}

sub ld_main
{
	# Main failover checking code
	while (1) {
		if ($FORKING eq 'yes') {
			foreach my $v (@VIRTUAL) {
				my $virtual_id = get_virtual_id_str($v);
				if (!exists($FORK_CHILDREN{$virtual_id})) {
					&ld_log("Child not running for $virtual_id, spawning");
					my $pid = fork;
					if (!defined($pid)) {
						&ld_log("fork failed");
					} elsif ($pid == 0) {
						run_child($v);
					} else {
						$FORK_CHILDREN{get_virtual_id_str($v)} = $pid;
						&ld_log("Spawned child $virtual_id PID=$pid");
					}
				} elsif (waitpid($FORK_CHILDREN{get_virtual_id_str($v)}, WNOHANG)) {
					delete $FORK_CHILDREN{get_virtual_id_str($v)};
				}
			}
			check_signal();
			if (!check_cfgfile()) {
				sleep 1;
			}
			check_signal();
		} else {
			my @real_checked;
			foreach my $v (@VIRTUAL) {
				my $real = $$v{real};
				my $virtual_id = get_virtual_id_str($v);
				REAL: foreach my $r (@$real) {
					my $real_id = get_real_id_str($r, $v);
					check_signal();
					foreach my $tmp_id (@real_checked) {
						if($real_id eq $tmp_id) {
							&ld_debug(3, "Already checked: real server=$real_id (virtual=$virtual_id)");
							next REAL;
						}
					}
					_check_real($v, $r);
					push(@real_checked, $real_id);
				}
			}
			check_signal();
			if
(!check_cfgfile()) { sleep $CHECKINTERVAL; } check_signal(); ld_emailalert_resend(); check_signal(); } } } sub run_child { my $v = shift; # Just exit on signals $SIG{'INT'} = "DEFAULT"; $SIG{'QUIT'} = "DEFAULT"; $SIG{'ILL'} = "DEFAULT"; $SIG{'ABRT'} = "DEFAULT"; $SIG{'FPE'} = "DEFAULT"; $SIG{'SEGV'} = "DEFAULT"; $SIG{'TERM'} = "DEFAULT"; $SIG{'BUS'} = "DEFAULT"; $SIG{'SYS'} = "DEFAULT"; $SIG{'XCPU'} = "DEFAULT"; $SIG{'XFSZ'} = "DEFAULT"; $SIG{'IOT'} = "DEFAULT"; $SIG{'PIPE'} = "IGNORE"; $SIG{'HUP'} = sub { exit 1 }; my $real = $$v{real}; my $virtual_id = get_virtual_id_str($v); my $checkinterval = $$v{checkinterval} || $CHECKINTERVAL; $0 = "ldirectord $virtual_id"; while (1) { foreach my $r (@$real) { $0 = "ldirectord $virtual_id checking $$r{server}"; _check_real($v, $r); } $0 = "ldirectord $virtual_id"; sleep $checkinterval; ld_emailalert_resend(); } } sub _check_real { my $v = shift; my $r = shift; my $real_id = get_real_id_str($r, $v); my $virtual_id = get_virtual_id_str($v); if (_check_real_for_maintenance($r)) { service_set($v, $r, "down", {do_log => 1, force => 1}, "Server in maintenance"); return; } elsif ($$v{checktype} eq "negotiate" || $$r{num_connects}>=$$v{num_connects}) { &ld_debug(2, "Checking negotiate: real server=$real_id (virtual=$virtual_id)"); if (grep $$v{service} eq $_, ("http", "https", "http_proxy")) { $$r{num_connects} = 0 if (check_http($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "pop") { $$r{num_connects} = 0 if (check_pop($v, $r, 0) == $SERVICE_UP); } elsif ($$v{service} eq "pops") { $$r{num_connects} = 0 if (check_pop($v, $r, 1) == $SERVICE_UP); } elsif ($$v{service} eq "imap") { $$r{num_connects} = 0 if (check_imap($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "imaps") { $$r{num_connects} = 0 if (check_imaps($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "smtp" or $$v{service} eq "submission") { $$r{num_connects} = 0 if (check_smtp($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "ftp") { $$r{num_connects} = 0 if 
(check_ftp($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "ldap") { $$r{num_connects} = 0 if (check_ldap($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "nntp") { $$r{num_connects} = 0 if (check_nntp($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "dns") { $$r{num_connects} = 0 if (check_dns($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "sip") { $$r{num_connects} = 0 if (check_sip($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "radius") { $$r{num_connects} = 0 if (check_radius($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "mysql") { $$r{num_connects} = 0 if (check_mysql($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "pgsql") { $$r{num_connects} = 0 if (check_pgsql($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "oracle") { $$r{num_connects} = 0 if (check_oracle($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "simpletcp") { $$r{num_connects} = 0 if (check_simpletcp($v, $r) == $SERVICE_UP); } else { $$r{num_connects} = 0 if (check_none($v, $r) == $SERVICE_UP); } } elsif ($$v{checktype} eq "connect") { if ($$v{protocol} ne "udp") { &ld_debug(2, "Checking connect: real server=$real_id (virtual=$virtual_id)"); check_connect($v, $r); } else { &ld_debug(2, "Checking connect (ping): real server=$real_id (virtual=$virtual_id)"); check_ping($v, $r); } } elsif ($$v{checktype} eq "ping") { &ld_debug(2, "Checking ping: real server=$real_id (virtual=$virtual_id)"); check_ping($v, $r); } elsif ($$v{checktype} eq "external") { &ld_debug(2, "Checking external: real server=$real_id (virtual=$virtual_id)"); check_external($v, $r); } elsif ($$v{checktype} eq "external-perl") { &ld_debug(2, "Checking external-perl: real server=$real_id (virtual=$virtual_id)"); check_external_perl($v, $r); } elsif ($$v{checktype} eq "off") { &ld_debug(2, "Checking off: No real or fallback servers to be added\n"); } elsif ($$v{checktype} eq "on") { &ld_debug(2, "Checking on: Real servers are added without any checks\n"); &service_set($v, $r, "up"); } elsif ($$v{checktype} eq 
"combined") { &ld_debug(2, "Checking combined-connect: real server=$real_id (virtual=$virtual_id)"); if (check_connect($v, $r) == $SERVICE_UP) { $$r{num_connects}++; } else { $$r{num_connects} = 999999; } } } sub _check_real_for_maintenance { my $r = shift; return undef if(!$MAINTDIR); my $servername = ld_gethostbyaddr($$r{server}); # Extract just the first component of the full name so we can match short or FQDN names $servername =~ /^([a-z][a-z0-9\-]+)\./; my $servershortname = $1; if (-e "$MAINTDIR/$$r{server}:$$r{port}") { &ld_debug(2, "Server maintenance: Found file $$r{server}:$$r{port}"); return 1; } elsif (-e "$MAINTDIR/$$r{server}") { &ld_debug(2, "Server maintenance: Found file $$r{server}"); return 1; } elsif ($servername && -e "$MAINTDIR/$servername:$$r{port}") { &ld_debug(2, "Server maintenance: Found file $servername:$$r{port}"); return 1; } elsif ($servername && -e "$MAINTDIR/$servername") { &ld_debug(2, "Server maintenance: Found file $servername"); return 1; } elsif ($servershortname && -e "$MAINTDIR/$servershortname:$$r{port}") { &ld_debug(2, "Server maintenance: Found file $servershortname:$$r{port}"); return 1; } elsif ($servershortname && -e "$MAINTDIR/$servershortname") { &ld_debug(2, "Server maintenance: Found file $servershortname"); return 1; } return undef; } sub check_http { use LWP::UserAgent; use LWP::Debug; if($DEBUG > 2) { LWP::Debug::level('+'); } my ($v, $r) = @_; $$r{url} =~ /(http|https):\/\/([^:\/]+)(:([^\/]+))?(\/.*)/; my $host = $2; #my $port = $3; my $uri = $4; my $virtualhost = (defined $$v{virtualhost} ? $$v{virtualhost} : $host); &ld_debug(2, "check_http: url=\"$$r{url}\" " . 
"virtualhost=\"$virtualhost\""); my $ua = new LWP::UserAgent(); my $h = undef; if ($$v{service} eq "http_proxy") { my $port = ld_checkport($v, $r); $ua->proxy("http", "http://$$r{server}:$port/"); } else { $h = new HTTP::Headers("Host" => $virtualhost); } my $req = new HTTP::Request("$$v{httpmethod}", "$$r{url}", $h); my $res; # LWP does not seem to honour timeouts set using $ua->timeout() # for HTTPS. So use an alarm instead. This also has the advantage # of being cumulative timeout, rather than a per send/receive # timeout. eval { # LWP makes unguarded calls to eval # which throw a fatal exception if they fail # Needless to say, this is completely stupid. # Resetting of $SIG{'__DIE__'} is also # needed now that alarm() is used. local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{negotiatetimeout}"); &ld_debug(2, "Starting Check"); alarm $$v{negotiatetimeout}; &ld_debug(2, "Starting HTTP/HTTPS"); $res = $ua->request($req); &ld_debug(2, "Finished HTTP/HTTPS"); alarm 0; # Cancel the alarm }; if (not defined $res) { &ld_debug(2, "check_http: timeout"); goto down; } if ($$v{service} eq "https") { &ld_debug(2, "SSL-Cipher: " . $res->header('Client-SSL-Cipher')); &ld_debug(2, "SSL-Cert-Subject: " . $res->header('Client-SSL-Cert-Subject')); &ld_debug(2, "SSL-Cert-Issuer: " . $res->header('Client-SSL-Cert-Issuer')); } my $recstr = $$r{receive}; if ($res->is_success && (!($recstr =~ /.+/) || $res->content =~ /$recstr/)) { service_set($v, $r, "up", {do_log => 1}, $res->status_line); &ld_debug(2, "check_http: $$r{url} is up\n"); return $SERVICE_UP; } my $log_message = $res->is_success ? $res->content : $res->status_line; service_set($v, $r, "down", {do_log => 1}, $log_message); &ld_debug(3, "Headers " . 
$res->headers->as_string); down: &ld_debug(2, "check_http: $$r{url} is down\n"); return $SERVICE_DOWN; } sub check_smtp { require Net::SMTP; my ($v, $r) = @_; my $port = ld_checkport($v, $r); &ld_debug(2, "Checking $$v{service}: server=$$r{server} port=$port"); my $smtp = new Net::SMTP($$r{server}, Port => $port, Timeout => $$v{negotiatetimeout}); if ($smtp) { $smtp->quit; service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } else { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } } sub check_pop { require Mail::POP3Client; my ($v, $r, $ssl) = @_; my $port = ld_checkport($v, $r); &ld_debug(2, "Checking pop server=$$r{server} port=$port ssl=$ssl"); my $pop = new Mail::POP3Client(USER => $$v{login}, PASSWORD => $$v{passwd}, HOST => $$r{server}, USESSL => $ssl, PORT => $port, DEBUG => 0, TIMEOUT => $$v{negotiatetimeout}); if (!$pop) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } if($$v{login} ne "") { my $authres = $pop->login(); $pop->close(); if (!$authres) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } } $pop->close(); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } sub check_imap { require Net::IMAP::Simple; my ($v, $r) = @_; my $port = ld_checkport($v, $r); &ld_debug(2, "Checking imap server=$$r{server} port=$port"); my $imap = Net::IMAP::Simple->new($$r{server}, port => $port, timeout => $$v{negotiatetimeout}); if (!$imap) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } if($$v{login} ne "") { my $authres = $imap->login($$v{login},$$v{passwd}); $imap->quit; if (!$authres) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } } $imap->quit(); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } sub check_imaps { require Net::IMAP::Simple::SSL; my ($v, $r) = @_; my $port = ld_checkport($v, $r); &ld_debug(2, "Checking imaps server=$$r{server} port=$port"); my $imaps = Net::IMAP::Simple::SSL->new($$r{server}, port => $port, 
timeout => $$v{negotiatetimeout}); if (!$imaps) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } if($$v{login} ne "") { my $authres = $imaps->login($$v{login},$$v{passwd}); $imaps->quit; if (!$authres) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } } $imaps->quit(); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } sub check_ldap { my ($v, $r) = @_; require Net::LDAP; my $port = ld_checkport($v, $r); &ld_debug(2, "Checking ldap server=$$r{server} port=$port"); my $recstr = $$r{receive}; my $ldap = Net::LDAP->new("$$r{server}", port => $port, timeout => $$v{negotiatetimeout}); if(!$ldap) { service_set($v, $r, "down", {do_log => 1}, "Connection failed"); &ld_debug(4, "Connection failed"); return $SERVICE_DOWN; } my $mesg; if ($$v{login} && $$v{passwd}) { $mesg = $ldap->bind($$v{login}, password=>$$v{passwd}) ; } else { $mesg = $ldap->bind ; } if ($mesg->is_error) { service_set($v, $r, "down", {do_log => 1}, "Bind failed"); &ld_debug(4, "Bind failed"); return $SERVICE_DOWN; } &ld_debug(4, "Base : " . substr($$r{request},1)); my $result = $ldap->search ( base => substr($$r{request},1) . "", scope => "base", filter => "(objectClass=*)" ); if($result->count != 1) { service_set($v, $r, "down", {do_log => 1}, "No answer received"); &ld_debug(2, "Count failed : " . $result->count); return $SERVICE_DOWN; } my $href = $result->as_struct; my @arrayOfDNs = keys %$href ; if (!($recstr =~ /.+/) || $arrayOfDNs[0] =~ /$recstr/) { service_set($v, $r, "up", {do_log => 1}, "Success"); return $SERVICE_UP; } else { service_set($v, $r, "down", {do_log => 1}, "Response mismatch"); &ld_debug(4,"Message differs : " . ", " . $$r{receive} . ", " . $arrayOfDNs[0] . 
"."); return $SERVICE_DOWN; } } sub check_nntp { use IO::Socket; use IO::Socket::INET6; use IO::Select; my ($v, $r) = @_; my $sock; my $s; my $buf; my $port = ld_checkport($v, $r); my $status = 1; &ld_debug(2, "Checking nntp server=$$r{server} port=$port"); unless ($sock = IO::Socket::INET6->new(PeerAddr => $$r{server}, PeerPort => $port, Proto => 'tcp', TimeOut => $$v{negotiatetimeout})) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } $s = IO::Select->new(); $s->add($sock); if (scalar($s->can_read($$v{negotiatetimeout})) == 0) { service_set($v, $r, "down", {do_log => 1}); } else { sysread($sock, $buf, 64); if ($buf =~ /^2/) { service_set($v, $r, "up", {do_log => 1}); $status = 0; } else { service_set($v, $r, "down", {do_log => 1}); } } $s->remove($sock); $sock->close; return $status; } sub check_radius { require Authen::Radius; my ($v, $r) = @_; &ld_debug(2, "Checking radius"); my $port = ld_checkport($v, $r); my $radius; my $result = ""; eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); &ld_debug(2, "Starting Check"); alarm $$v{checktimeout}; &ld_debug(2, "Starting Radius"); $radius = new Authen::Radius(Host => "$$r{server}:$port", Secret=>$$v{secret}, TimeOut=>$$v{negotiatetimeout}, Errmode=>'die'); $result = $radius->check_pwd($$v{login}, $$v{passwd}); &ld_debug(2, "Finished Radius"); alarm 0; # Cancel the alarm }; if ($result eq "") { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: $@"); &ld_debug(3, "Radius Error: ".$radius->get_error); return $SERVICE_DOWN; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return $SERVICE_UP; } } sub check_mysql { return check_sql(@_, "mysql", "database"); } sub check_pgsql { return check_sql(@_, "Pg", "dbname"); } sub check_sql_log_errstr { my ($prefix, $errstr) = (@_); for $_ (split /\n/, $errstr) { 
&ld_debug(4, "$prefix $_\n"); } } sub check_oracle { return check_sql(@_, "Oracle", "sid"); } sub check_sql { require DBI; my ($v, $r, $dbd, $dbname) = @_; my $port = ld_checkport($v, $r); my ($dbh, $sth, $query, $rows, $result); $result = $SERVICE_DOWN; $query = $$r{request}; $query =~ s#^/##; unless ($$v{login} && $query) { &ld_log("Error: Must specify a login and request string " . "for MySQL, Oracle and PostgreSQL checks. " . "Not adding $$r{server}.\n"); goto err_down; } $result=2; # Set result flag. Only ok if ends up at zero. &ld_debug(2, "Checking $$v{server} server=$$r{server} port=$port\n"); $dbh = DBI->connect("dbi:$dbd:$dbname=$$v{database};" . "host=$$r{server};port=$port", $$v{login}, $$v{passwd}); unless ($dbh) { &ld_debug(2, "Failed to bind to $$r{server} with DBI->errstr\n"); check_sql_log_errstr("Failed to bind to $$r{server} with", DBI->errstr); goto err_down; } $result--; $sth = $dbh->prepare($query); unless ($sth) { &ld_debug(2, "Error preparing statement: $dbh->errstr\n"); check_sql_log_errstr("Error preparing statement:", $dbh->errstr); goto err_disconect; } # Test to see if any errors are returned $sth->execute; if ($dbh->err) { &ld_debug(2, "Error executing statement: $dbh->errstr : $dbh->err\n"); check_sql_log_errstr("Error executing statement:", $dbh->errstr, $dbh->err); goto err_finish; } # On error "execute" will return undef. # # Assuming you're using 'SELECT' you will get the number of rows # returned from the db when running execute: the 'rows' method is # only used when doing something that is NOT a select. I cannot # imagine that you would ever want to insert or update from a # regular polling on this system, so we will assume you are using # SELECT here. 
# # Ideally you will do something like this: 'select * from # director_slave where enabled=1' This way you can have, say, a # MEMORY table in MySQL where you insert a value into a row # (enabled) that says whether or not you want to actually use this # in the pool from ldirector / ipvs, and disable them without # actually turning off your sql server. $sth->execute; if ($dbd eq "Oracle") { $sth->fetchrow_hashref() } unless ($rows = $sth->rows) { check_sql_log_errstr("Error executing statement:", $dbh->errstr, $dbh->err); goto err_finish; } # Actually look to see if there was data returned in this statement, # else disable node if($rows > 0) { goto out; } else { goto err_finish; } out: $result = $SERVICE_UP; err_finish: $sth->finish(); err_disconnect: $dbh->disconnect(); err_down: service_set($v, $r, $result == $SERVICE_UP ? "up" : "down", {do_log => 1}); return $result; } sub check_connect { my ($v, $r) = @_; my $port = ld_checkport($v, $r); eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); alarm $$v{checktimeout}; my $sock = &ld_open_socket($$r{server}, $port, $$v{protocol}); if ($sock) { close($sock); } else { alarm 0; # Cancel the alarm die("Socket Connect Failed"); } &ld_debug(3, "Connected to $$r{server} (port $port)"); alarm 0; # Cancel the alarm }; if ($@) { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: $@"); return $SERVICE_DOWN; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return $SERVICE_UP; } } sub check_external { my ($v, $r) = @_; my $v_server; if (defined $$v{server}) { $v_server = $$v{server}; } else { $v_server = $$v{fwm}; } my $result = system_timeout($$v{checktimeout}, $$v{checkcommand}, $v_server, $$v{port}, $$r{server}, $$r{port}); if ($result) { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service 
$$r{server}:$$r{port}: " . "$@ after calling $$v{checkcommand} with result " . "$result"); return 0; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return 1; } } sub check_external_perl { my ($v, $r) = @_; my $result; my $v_server; eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); alarm $$v{checktimeout}; if (defined $$v{server}) { $v_server = $$v{server}; } else { $v_server = $$v{fwm}; } my $cmdfunc = $check_external_perl__funcs{$$v{checkcommand}}; if (!defined($cmdfunc)) { open(CMDFILE, "<$$v{checkcommand}") || die "cannot open external-perl checkcommand file: $$v{checkcommand}"; $cmdfunc = eval("sub { \@ARGV=\@_; " . join("", ) . " }"); close(CMDFILE); $check_external_perl__funcs{$$v{checkcommand}} = $cmdfunc; } no warnings 'redefine'; local *CORE::GLOBAL::exit = sub { $result = shift; goto external_exit; }; $cmdfunc->($v_server, $$v{port}, $$r{server}, $$r{port}); external_exit: alarm 0; }; if ($@ or $result != 0) { &service_set($v, $r, "down"); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: " . "$@ after calling (external-perl) $$v{checkcommand} with result " . "$result"); return 0; } else { &service_set($v, $r, "up"); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return 1; } } sub check_sip { my ($v, $r) = @_; my $sip_d_port = ld_checkport($v, $r); &ld_debug(2, "Checking sip server=$$r{server} port=$sip_d_port"); eval { use Socket; local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); alarm $$v{negotiatetimeout}; my $sock = &ld_open_socket($$r{server}, $sip_d_port, $$v{protocol}); unless ($sock) { alarm 0; die("Socket Connect Failed"); } my ($sip_s_addr_str, $sip_s_port) = &ld_get_addrport($sock); &ld_debug(3, "Connected from $sip_s_addr_str:$sip_s_port to " . $$r{server} . 
":$sip_d_port"); select $sock; $|=1; select STDOUT; my $request = "OPTIONS sip:" . $$v{login} . " SIP/2.0\r\n" . "Via: SIP/2.0/UDP $sip_s_addr_str:$sip_s_port;" . "branch=z9hG4bKhjhs8ass877\r\n" . "Max-Forwards: 70\r\n" . "To: \r\n" . "From: ;tag=1928301774\r\n" . "Call-ID: " . (join "", map { unpack "H*", chr(rand(256)) } 1..8) . "\r\n" . "CSeq: 63104 OPTIONS\r\n" . "Contact: \r\n" . "Accept: application/sdp\r\n" . "Content-Length: 0\r\n\r\n"; print "Request:\n$request"; print $sock $request; my $ok; my $reply; while (<$sock>) { chomp; $/="\r"; chomp; $/="\n"; last if ($_ eq ""); if (!defined $ok) { # Check status $ok = $_; if ($ok !~ m/^SIP\/2.0 200 OK/) { alarm 0; # Cancel the alarm close($sock); die "$ok\n"; } next; } $reply .= "$_\n"; # Add more checks here as desired } alarm 0; # Cancel the alarm close($sock); if (!defined $ok) { die "No OK\n"; } print "Reply:\n$ok\n$reply\n"; }; if ($@) { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: $@"); return $SERVICE_DOWN; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return $SERVICE_UP; } } sub check_simpletcp { my ($v, $r) = @_; my $d_port = ld_checkport($v, $r); &ld_debug(2, "Checking simpletcp server=$$r{server} port=$d_port"); eval { use Socket; local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); alarm $$v{negotiatetimeout}; my $sock = &ld_open_socket($$r{server}, $d_port, $$v{protocol}); unless ($sock) { alarm 0; die("Socket Connect Failed"); } my ($s_addr_str, $s_port) = &ld_get_addrport($sock); &ld_debug(3, "Connected from $s_addr_str:$s_port to " . $$r{server} . 
":$d_port"); select $sock; $|=1; select STDOUT; my $request = substr($$r{request}, 1); $request =~ s/\\n/\n/g ; &ld_debug(2, "Checking simpletcp server=$$r{server} port=$d_port request:\n$request"); print $sock $request; shutdown($sock, SHUT_WR); my $ok; my $reply; while (<$sock>) { &ld_debug(2, "Checking simpletcp server=$$r{server} port=$d_port receive=" . $$r{receive} ." got: $_\n"); if ( $_ =~ /$$r{receive}/ ) { $ok = 1; last; } } alarm 0; # Cancel the alarm close($sock); if (!defined $ok) { die "No OK\n"; } }; if ($@) { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: $@"); return $SERVICE_DOWN; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return $SERVICE_UP; } } sub check_ftp { require Net::FTP; my ($v, $r) = @_; my $ftp; my $memory; my $debug = ($DEBUG > 2) ? 1 : 0; my $port = ld_checkport($v, $r); &ld_debug(2, "Checking ftp server=$$r{server} port=$port"); &ld_debug(4, "Timeout is $$v{negotiatetimeout}"); open(TMP,'+>', undef); # In some cases Net::FTP dies if there is a timeout eval { unless ($ftp = Net::FTP->new("$$r{server}:$port", Timeout=>$$v{negotiatetimeout}, Debug=>$debug)) { die "Could not connect\n"; } $ftp->login($$v{login}, $$v{passwd}); $ftp->cwd("/"); $ftp->binary(); $ftp->pasv(); $ftp->get("$$r{request}", *TMP); $ftp->quit(); }; if ($@) { &ld_debug(2, "Warning: $@"); } seek TMP, 0, 0; local $/; $memory = ; close TMP; if ($memory =~ /$$r{receive}/) { service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } sub check_dns { my $res; my $query; my $rr; my $request; my $server; my ($v,$r) = @_; { # Net::DNS makes unguarded calls to eval # which throw a fatal exception if they fail # Needless to say, this is completely stupid. 
local $SIG{'__DIE__'} = "DEFAULT"; # When fork=yes we need to ignore the child death local $SIG{'CHLD'} = "IGNORE"; require Net::DNS; } $res = new Net::DNS::Resolver; if($DEBUG > 2) { $res->debug(1); } $$r{"request"} =~ m/^\/?(.*)/; $request=$1; $server = &ld_strip_brackets($$r{server}); &ld_debug(2, "Checking dns: request=\"$request\" receive=\"" . $$r{"receive"} . "\"\n"); eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "timeout\n"; }; alarm($$v{negotiatetimeout}); $res->nameservers($server); if ($$v{"protocol"} eq "tcp") { $res->usevc(1); } $query = $res->search($request); alarm(0); }; if (@$ eq "timeout\n" or ! $query) { service_set($v, $r, "down", {do_log => 1}, "Connection timed out"); return $SERVICE_DOWN; } foreach $rr ($query->answer) { if (($rr->type eq "A" and $rr->address eq $$r{"receive"}) or ($rr->type eq "PTR" and $rr->ptrdname eq $$r{"receive"})) { service_set($v, $r, "up", {do_log => 1}, "Success"); return $SERVICE_UP; } } service_set($v, $r, "down", {do_log => 1}, "Response mismatch"); return $SERVICE_DOWN; } sub check_ping { use Net::Ping; my ($v,$r) = (@_); &ld_debug(2, "Checking ping: " . "host=\"" . $$r{server} . "\" checktimeout=\"" . $$v{"checktimeout"} . "\" checkcount=\"" . $$v{"checkcount"} . "\"\n"); my $p = Net::Ping->new("icmp","1","64"); for (my $attempt = 0; $attempt < $$v{"checkcount"}; $attempt++) { if ($p->ping($$r{server}, $$v{"checktimeout"})) { &ld_debug(2, "pong from $$r{server}\n"); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } &ld_debug(2, "ping to $$r{server} timed out " . "(attempt " . ($attempt + 1) . "/" . $$v{"checkcount"} . ")\n"); } service_set($v, $r, "down"); return $SERVICE_DOWN; } # check_none # Dummy function to check service if service type is none. # Just activates the real server sub check_none { my ($v, $r) = @_; &ld_debug(2, "Checking none"); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } # service_set # Used to bring up and down real servers. 
# This is the function you should call if you want to bring a real
# server up or down.
# This function is safe to call regardless of the current state of a
# real server.
# Do _not_ call _service_up or _service_down directly.
# pre: v: virtual that the real service belongs to
#         Only used to determine the protocol of the service
#      r: real server to bring up or take down
#      state: up or down
#             up to bring the real service up
#             down to bring the real service down
#      flags: hash with the following (optional) keys:
#             force => 1 - force setting of the specified state
#             do_log => 1 - log the state to the monitorfile
#                           (when called as the result of a check)
#      log_msg: optional text for the monitorfile entry, "-" if omitted
# post: The real server is brought up or down for each virtual service
#       it belongs to.
# return: none
sub service_set
{
	my ($v, $r, $state, $flags, $log_msg) = @_;

	my $now;
	if ($$flags{'do_log'}) {
		$now = localtime();
		$log_msg = "-" unless (defined($log_msg));
		# URI-escape special log characters ('|' and newlines)
		$log_msg =~ s/([%|\r\n])/sprintf("%%%.2x", ord($1))/eg;
	}

	# Find the real server in @REAL to learn which virtual services
	# it belongs to
	my $virtual;
	foreach my $entry (@REAL) {
		next unless ($entry->{"real"} eq get_real_id_str($r, $v));
		$virtual = $entry->{"virtual"};
		last;
	}
	return unless (defined($virtual));

	# Check each virtual service for the real server and make
	# changes as necessary
	foreach my $vsrv (@VIRTUAL) {
		my $virtual_id = get_virtual_id_str($vsrv);
		my $real_id = get_real_id_str($r, $vsrv);
		my $log_str = "real server=$real_id" . " (virtual=$virtual_id)";

		# Skip virtual services this real server is not part of
		next unless (grep { $virtual_id eq $_ } @$virtual);

		if ($state=~/up/i) {
			_service_up($vsrv, $r, $$flags{"force"});
			&ld_debug(2, "Enabled $log_str");
		}
		elsif ($state=~/down/i) {
			_service_down($vsrv, $r, $$flags{"force"});
			&ld_debug(2, "Disabled $log_str");
		}

		next unless ($$vsrv{"monitorfile"} and $$flags{"do_log"});

		# One monitorfile line per state report:
		# [time] [pid] <real id, ':' squeezed to ' '> [state] message
		my $real_log_msg = $real_id;
		$real_log_msg =~ tr/:/ /s;
		$real_log_msg =~ s/\\//g;
		my $written = open(CHECKLOG, ">>$$vsrv{monitorfile}");
		$written = $written &&
			print CHECKLOG "[$now] [$$] $real_log_msg [$state] $log_msg\n";
		$written = $written && close(CHECKLOG);
		unless ($written) {
			die("Error writing to monitorfile '$$vsrv{monitorfile}': $!");
		}
	}
}

# _remove_service
# Remove a real server by either making it quiescent or deleting it
# Should be called by _service_down or fallback_off
# I.e. If you want to change the state of a real server call service_set.
# If you call this function directly then ldirectord will lose track
# of the state of real servers.
# If the real server exists (which it should) make it quiescent or
# delete it, depending on the global and per virtual service quiescent
# flag. If it doesn't exist, just leave it as it will be added by the
# _service_up code as appropriate.
# pre: v: reference to virtual service to which the real server belongs
#      rservice: service to remove. Of the form server:port for a tcp or
#                udp service. Of the form fwmark for a fwm service.
#      rforw: Forwarding mechanism of service. Should be one of "-g" "-i"
#             or "-m"
#      tag: Tag to use for logging.
#           Should be either "real" or "fallback"
# post: real service is made quiescent in, or deleted from, the
#       respective virtual service
# return: none
sub _remove_service
{
	my ($v, $rservice, $rforw, $tag) = (@_);

	my $oldsrv;
	my $ov;
	my $or;
	my $ipvsadm_args;
	my $log_args;
	my $virtual_str;
	my $old_rservice;
	my $is_quiescent;

	$virtual_str = &get_virtual($v);

	$oldsrv=&ld_read_ipvsadm();
	$ov=$oldsrv->{&get_real_service_str($v)};
	if(!defined($ov)){
		return;
	}

	# Fallback servers are never made quiescent. For real servers the
	# per virtual service setting overrides the global one.
	if ($tag ne "fallback" and
	    ((defined $$v{quiescent} and $$v{quiescent} eq "yes") or
	     (!defined($$v{quiescent}) and $QUIESCENT eq "yes"))){
		$is_quiescent = "quiescent";
	}

	$or=$ov->{"real"}->{$rservice};

	# If a virtual service is a IP/port service (not fwmark)
	# and a real-servers uses a forwarding mechanism other than masq
	# then the port will always be that of the virtual service.
	# This includes real-servers that LVS has set to use
	# the local forwarding mechanism because their IP address
	# is local. Thus, if $rservice does not exist test
	# for the same ip address with the virtual servers port.
	# N.B: This could cause strange things to happen if
	# there is a clash between two real servers on different ports
	# that LVS has mapped to being the same thing.
	if(!defined($or)) {
		# Captures are taken in list context so that a failed
		# match yields undef instead of a stale $1/$2.
		my ($rhost) = ($rservice =~ /(.*):.*/);
		my ($vport) = ($virtual_str =~ /.*:(.*)/);

		# If the mapped service doesn't exist either, keep the
		# original service. Otherwise if masq and quiescence is in
		# use, the real server is not local, and it has an
		# alternate port to the virtual server, using the mapped
		# service will result in a quiescent service being created
		# on the virtual server's port, which is not wanted.
		if (defined($rhost) and defined($vport)) {
			my $mapped = "$rhost:$vport";
			if (defined($ov->{"real"}->{$mapped})) {
				$old_rservice = $rservice;
				$rservice = $mapped;
				$or = $ov->{"real"}->{$mapped};
			}
		}
	}

	# Nothing to do if the server is absent and would be deleted, or
	# is already quiescent with the requested forwarding method
	if((!defined($or) and !defined($is_quiescent)) or
	   (defined($is_quiescent) and defined($or) and
	    $or->{"weight"} eq 0 and
	    get_forward_flag($or->{"forward"}) eq $rforw)){
		return;
	}

	$ipvsadm_args = "$$v{proto} " . &get_virtual_option($v) .
			" -r $rservice";
	$log_args = "$tag server: $rservice ";
	if(defined($old_rservice)) {
		$log_args .= "mapped from $old_rservice "
	}
	$log_args .= "($virtual_str)";

	my $currenttime=time();

	if(defined($is_quiescent)) {
		if (defined($or)) {
			&system_wrapper("$IPVSADM -e " .
					"$ipvsadm_args $rforw -w 0");
			&ld_log("Quiescent $log_args (Weight set to 0)");
			&ld_emailalert_send("Quiescent $log_args (Weight set to 0)",
					    $v, $rservice, $currenttime);
		}
		elsif ($READDQUIESCENT eq "yes") {
			# The server is missing from the table: re-add it
			# with weight 0 only when readdquiescent is enabled
			&system_wrapper("$IPVSADM -a " .
					"$ipvsadm_args $rforw -w 0");
			&ld_log("Readd Quiescent $log_args (Weight set to 0)");
			&ld_emailalert_send("Quiescent $log_args (Weight set to 0)",
					    $v, $rservice, $currenttime);
		}
	}
	else {
		&system_wrapper("$IPVSADM -d $ipvsadm_args");
		&ld_log("Deleted $log_args");
		&ld_emailalert_send("Deleted $log_args",
				    $v, $rservice, $currenttime);
	}
}

# _restore_service
# Restore a real server. The opposite of _remove_service.
# Should be called by _service_up or fallback_on
# I.e. If you want to change the state of a real server call service_set.
# If you call this function directly then ldirectord will lose track
# of the state of real servers.
# If the real server exists its weight and forwarding mechanism are
# restored. If it doesn't exist it is added.
# pre: v: reference to virtual service to which the real server belongs
#      rservice: service to restore. Of the form server:port for a tcp or
#                udp service. Of the form fwmark for a fwm service.
#      rforw: Forwarding mechanism of service. Should be one of "-g" "-i"
#             or "-m"
#      rwght: Weight of service. Should be of the form "<weight>"
#             e.g. "1"
#      tag: Tag to use for logging.
Should be either "real" or "fallback" # post: real service is taken up from the respective virtual service # if it is inactive # return: none sub _restore_service { my ($v, $rservice, $rforw, $rwght, $tag) = (@_); my $oldsrv; my $ov; my $or; my $ipvsadm_args; my $log_args; $ipvsadm_args = "$$v{proto} " . &get_virtual_option($v) . " -r $rservice $rforw -w $rwght"; $log_args = "$tag server: $rservice " . "(" #. scalar(%{$v->{real_status}}) . &get_virtual($v) . ")"; #if the server exists then restore its weight # otherwise add the server $oldsrv=&ld_read_ipvsadm(); $ov=$oldsrv->{&get_real_service_str($v)}; if(defined($ov)){ $or=$ov->{"real"}->{$rservice}; } if(defined($or)){ unless($or->{"weight"} eq $rwght and get_forward_flag($or->{"forward"}) eq $rforw){ &system_wrapper("$IPVSADM -e $ipvsadm_args"); &ld_log("Restored $log_args (Weight set to $rwght)"); &ld_emailalert_send("Restored $log_args " . "(Weight set to $rwght)", $v, $rservice, 0); } } else { &system_wrapper("$IPVSADM -a $ipvsadm_args"); &ld_log("Added $log_args (Weight set to $rwght)"); &ld_emailalert_send("Added $log_args (Weight set to $rwght)", $v, $rservice, 0); } } # Check the status of a server # Should only be called from _status_up, _status_down, # _service_up, or _service_down # Returns 1 if the server is up, 0 if down sub _status_check { my ($v, $r, $is_fallback) = (@_); my $virtual_id = get_virtual_id_str($v); my $real_id = get_real_id_str($r, $v); if (defined($is_fallback)) { if (defined($v->{real_status}) or (defined($v->{fallback_status}) and $v->{fallback_status}->{"$real_id"})) { return 1; } } else { if (defined ($v->{real_status}) and $v->{real_status}->{"$real_id"}) { return 1; } } return 0; } # Set the status of a server as up # Should only be called from _service_up or _ld_start sub _status_up { my ($v, $r, $is_fallback) = (@_); my $virtual_id = get_virtual_id_str($v); my $real_id = get_real_id_str($r, $v); return undef if(_status_check($v, $r, $is_fallback)); 
$r->{virtual_status}->{"$virtual_id"} = 1; if (defined $is_fallback) { $v->{fallback_status}->{"$real_id"} = 1; } else { $v->{real_status}->{"$real_id"} = 1; } return 1; } # Set the status of a server as down # Should only be called from _service_down or ld_stop sub _status_down { my ($v, $r, $is_fallback) = (@_); my $virtual_id = get_virtual_id_str($v); my $real_id = get_real_id_str($r, $v); return undef if (!_status_check($v, $r, $is_fallback)); if (defined($is_fallback)) { delete $v->{fallback_status}->{"$real_id"}; if (! %{$v->{fallback_status}}) { $v->{fallback_status} = undef; } } else { delete $v->{real_status}->{"$real_id"}; if (! %{$v->{real_status}}) { $v->{real_status} = undef; } } delete $r->{virtual_status}->{"$virtual_id"}; if (! %{$r->{virtual_status}}) { $r->{virtual_status} = undef; } return 1; } # _service_up # Bring a real service up if it is down # Should be called by service_set only # I.e. If you want to change the state of a real server call service_set. # If you call this function directly then ldirectord will lose track # of the state of real servers. # pre: v: reference to virtual service to with the real server belongs # r: reference to the real server to take down # post: real service is taken up from the respective virtual service # if it is inactive # return: none sub _service_up { my ($v, $r, $force) = (@_); if ($r->{failcount} > 0) { ld_log("Resetting soft failure count: " . $r->{server} . ":" . $r->{port} . " (" . get_virtual_id_str($v) . ")"); } $r->{failcount} = 0; if (! _status_up($v, $r) and ! defined($force)) { return; } &_restore_service($v, $r->{server} . ":" . $r->{port}, $r->{forw}, $r->{weight}, "real"); &fallback_off($v); } # _service_down # Bring a real service down if it is up # Should be called by service_set only # I.e. if you want to change the state of a real server call service_set. # If you call this function directly then ldirectord will lose track # of the state of real servers. 
# pre: v: reference to virtual service to with the real server belongs # r: reference to the real server to take down # post: real service is taken down from the respective virtual service # if it is active # return: none sub _service_down { my ($v, $r, $force) = @_; if (!_status_check($v, $r) and !defined($force)) { return; } $r->{failcount}++; if (!defined($force) and _status_check($v, $r) and ($r->{failcount} < $v->{failurecount})) { ld_log("Soft failure real server: " . $r->{server} . ":" . $r->{port} . " (" . get_virtual_id_str($v) . ") failure " . $r->{failcount} . "/" . $v->{failurecount}); return; } _status_down($v, $r); &_remove_service($v, $r->{server} . ":" . $r->{port}, $r->{forw}, "real"); &fallback_on($v); } # fallback_on # Turn on the fallback server for a virtual service if it is inactive # pre: v: virtual to turn fallback service on for # post: fallback server is turned on if it was inactive # return: none sub fallback_on { my ($v, $force) = (@_); my $fallback=&fallback_find($v); if (defined($fallback) and (_status_up($v, $fallback, "fallback") or defined($force))) { &_restore_service($v, $fallback->{server} . ":" . $fallback->{port}, get_forward_flag($fallback->{forward}), "1", "fallback"); } if (!defined ($v->{real_status})) { &do_fallback_command($v, "start"); } } # fallback_off # Turn off the fallback server for a virtual service if it is active # pre: v: virtual to turn fallback service off for # post: fallback server is turned off if it was active # return: none sub fallback_off { my ($v, $force) = (@_); my $fallback=&fallback_find($v); if (defined($fallback) and (_status_down($v, $fallback, "fallback") or defined($force))) { &_remove_service($v, $fallback->{server} . ":" . 
$fallback->{port}, get_forward_flag($fallback->{forward}), "fallback"); } if (defined ($v->{real_status})) { &do_fallback_command($v, "stop"); } } # fallback_find # Determine the fallback for a virtual service # pre: virtual: reference to a virtual service # post: none # return: $virtual->{"fallback"} if defined # else $FALLBACK->{$virtual->{"protocol"}} if defined # else undef sub fallback_find { my ($virtual) = (@_); my($global_fallback_ptr); # fallback pointer my $ipv6p = ($virtual->{addressfamily} == AF_INET6) ? 1 : 0; if( defined $virtual->{"fallback"} ) { return($virtual->{"fallback"}); } elsif ( not defined($FALLBACK) and not $ipv6p ) { return undef; } elsif ( not defined($FALLBACK6) and $ipv6p ) { return undef; } if ($ipv6p) { # IPv6 $global_fallback_ptr = $FALLBACK6; } else { $global_fallback_ptr = $FALLBACK; } # If the global fallback has a port, it can be used as is if (defined($global_fallback_ptr->{$virtual->{"protocol"}}->{"port"})) { return $global_fallback_ptr->{$virtual->{"protocol"}}; } # Else create an anonymous fallback my %anon_fallback = %{$global_fallback_ptr->{$virtual->{"protocol"}}}; $anon_fallback{"port"} = $virtual->{"port"}; return \%anon_fallback; } # fallback_command # Execute the fallback command with the given status if it wasn't executed # with this status already for the supplied virtual service. sub do_fallback_command { my ($v, $status) = (@_); if (defined $v->{fallbackcommand_status} and $v->{fallbackcommand_status} eq $status) { return; } $v->{fallbackcommand_status} = $status; if (defined($v->{fallbackcommand})) { &system_wrapper($v->{fallbackcommand} . " " . $status); } elsif (defined($FALLBACKCOMMAND)) { &system_wrapper($FALLBACKCOMMAND . " " . $status); } } # Used during stop, start and reload to remove stale real servers from LVS sub purge_untracked_service { my ($v, $rservice, $tag) = (@_); my $log_arg = "Purged real server ($tag): $rservice (" . &get_virtual($v) . ")"; &system_wrapper("$IPVSADM -d $v->{proto} " . 
&get_virtual_option($v) . " -r $rservice"); &ld_log($log_arg); &ld_emailalert_send($log_arg, $v, $rservice, 0); } # Used during stop, start and reload to remove stale real servers from LVS sub purge_service { my ($v, $r, $tag) = (@_); purge_untracked_service($v, "$r->{server}:$r->{port}", $tag); _status_down($v, $r); } # Used during stop, start and reload to remove stale virtual services from LVS sub purge_virtual { my ($v, $tag) = (@_); &system_wrapper("$IPVSADM -D $v->{proto} " . &get_virtual_option($v)); &ld_log("Purged virtual server ($tag): " . &get_virtual($v)); } sub check_cfgfile { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime) = stat($CONFIG); my ($status); return if ($stattime==$mtime); $stattime = $mtime; use Digest::MD5 qw(md5 md5_hex); my $ctx = Digest::MD5->new; unless (open(CFGFILE, "<$CONFIG")) { &config_warn(0, "can not open file $CONFIG for checking"); return 0; } $ctx->addfile(*CFGFILE); close(CFGFILE); my $digest = $ctx->hexdigest; if (defined $checksum && $checksum ne $digest) { &ld_log("Configuration file '$CONFIG' has changed on disk"); if ($AUTOCHECK eq "yes") { &ld_log(" - reread new configuration"); &reread_config(); } else { &ld_log(" - ignore new configuration\n"); } if (defined($CALLBACK) and -x $CALLBACK) { &system_wrapper("$CALLBACK $CONFIG"); } $status = 1; } $checksum = $digest; return $status; } # ld_openlog # Open logger # make log rotation work # pre: none # post: If logger is a file, it opened and closed again as a test # If logger is syslog, it is opened so it can be used without # needing to be opened again. # Otherwise, nothing is done. # return: 0 on success # 1 on error sub ld_openlog { if ($opt_d or $SUPERVISED eq "yes") { # Instantly do nothing return(0); } if( $LDIRLOG =~ /^\/(.*)/ ) { # Open and close the file as a test. 
# We open the file each time we want to log to it unless (open(LOGFILE, ">>$LDIRLOG") and close(LOGFILE)) { return 1; } } else { # Assume LDIRLOG is a logfacility, log to syslog setlogsock( "unix" ); openlog( "ldirectord", "pid", "$LDIRLOG" ); } return(0); } # ld_log # Log a message. # pre: message: Message to write # post: message and timetsamp is written to loged # If logger is a file, it is opened and closed again as a # primitive means to make log rotation work # return: 0 on success # 1 on error sub ld_log { my ($message) = (@_); my $now = localtime(); &ld_debug(2, $message); chomp $message; if ($opt_d) { print STDERR "$message\n"; } elsif ($SUPERVISED eq "yes") { print "[$now] $message\n"; } elsif ( $LDIRLOG =~ /^\/(.*)/ ) { unless (open(LOGFILE, ">>$LDIRLOG") and print LOGFILE "[$now|$CFGNAME|$$] $message\n" and close(LOGFILE)) { print STDERR "$message\n"; return 1; } } else { # Assume LDIRLOG is a logfacility, log to syslog syslog( "info", "$message" ); } return(0); } sub daemon_status_str { if ($DAEMON_STATUS == $DAEMON_STATUS_STARTING) { return "starting"; } elsif ($DAEMON_STATUS == $DAEMON_STATUS_RUNNING) { return "running"; } elsif ($DAEMON_STATUS == $DAEMON_STATUS_STOPPING) { return "stopping"; } elsif ($DAEMON_STATUS == $DAEMON_STATUS_RELOADING) { return "reloading"; } return "UNKNOWN"; } # ld_emailalert_send # Send email alerts per virtual server # pre: message: Message to email # post: message is emailed if emailalert defined for virtualserver # return: 0 on success # 1 on error sub ld_emailalert_send { my ($subject, $v, $rserver, $currenttime) = (@_); my $status = 0; my $to_addr; my $frequency; my $virtual_str; my $id; my $statusfilter; my $smtp_server; $frequency = defined $v->{emailalertfreq} ? 
$v->{emailalertfreq} : $EMAILALERTFREQ; $virtual_str = &get_virtual($v); $id = "$rserver ($virtual_str)"; if ($currenttime == 0 or $frequency == 0) { delete $EMAILSTATUS{"$id"}; } else { $EMAILSTATUS{$id}->{v} = $v; $EMAILSTATUS{$id}->{alerttime} = $currenttime; } $statusfilter = defined $v->{emailalertstatus} ? $v->{emailalertstatus} : $EMAILALERTSTATUS; if (($DAEMON_STATUS & $statusfilter) == 0) { return 0; } $to_addr = defined $v->{emailalert} ? $v->{emailalert} : $EMAILALERT; if ($to_addr eq "") { return 0; } $smtp_server = defined $v->{smtp} ? $v->{smtp} : $SMTP; &ld_log("emailalert: $subject"); if (defined $smtp_server) { $status = &ld_emailalert_net_smtp($smtp_server, $to_addr, $subject); } else { $status = &ld_emailalert_mail_send($to_addr, $subject); } return($status); } # ld_emailalert_net_smtp # Send email alerts via SMTP server # pre: smtp: SMTP server defined # post: message is emailed if SMTP server is valid and working # return: 0 on success # 1 on error sub ld_emailalert_net_smtp { my ($smtp_server, $to_addr, $subject) = (@_); my $status = 0; use Net::SMTP; use Sys::Hostname; my $hostname = hostname; my $smtp = Net::SMTP->new($smtp_server); if ($smtp) { $smtp->mail("$ENV{USER}\@$hostname"); $smtp->to($to_addr); $smtp->data(); if($EMAILALERTFROM) { $smtp->datasend("From: $EMAILALERTFROM\n"); } else { $smtp->datasend("From: $ENV{USER}\@$hostname\n"); } $smtp->datasend("To: $to_addr\n"); $smtp->datasend("Subject: $subject\n\n"); $smtp->datasend("ldirectord host: $hostname\n" . "Log-Message: $subject\n" . "Daemon-Status: " . &daemon_status_str() . 
"\n"); $smtp->dataend(); $smtp->quit; } else { &ld_log("failed to send SMTP email message\n"); $status = 1; } return($status); } # ld_emailalert_mail_send # Send email alerts via Mail::Send # pre: smtp: SMTP server not defined # post: message is emailed if one of the Mail::Send methods works # return: 0 on success # 1 on error sub ld_emailalert_mail_send { my ($to_addr, $subject) = (@_); my $emailmsg; my $emailfh; my $status = 0; use Mail::Send; $emailmsg = new Mail::Send Subject=>$subject, To=>$to_addr; $emailmsg->set('From', $EMAILALERTFROM) if ($EMAILALERTFROM); $emailfh = $emailmsg->open; print $emailfh "ldirectord host: " . hostname() . "\n" . "Log-Message: $subject\n" . "Daemon-Status: " . &daemon_status_str() . "\n"; unless ($emailfh->close) { &ld_log("failed to send email message\n"); $status = 1; } return($status); } # ld_emailalert_resend # Resend email alerts as necessary # pre: none # post: EMAILSTATUS array is updated and alerts are sent as necessary # return: none sub ld_emailalert_resend { my $currenttime = time(); my $es; my $id; my $rserver; my $frequency; foreach $id (keys %EMAILSTATUS) { $es = $EMAILSTATUS{$id}; $frequency = defined $es->{v}->{emailalertfreq} ? $es->{v}->{emailalertfreq} : $EMAILALERTFREQ; $id =~ m/(.*) /; $rserver = $1; if ($currenttime - $es->{alerttime} < $frequency) { next; } &ld_emailalert_send("Inaccessible real server: $id", $es->{v}, $rserver, $currenttime); } } # ld_debug # Log a message to a STDOUT. # pre: priority: priority of message # message: Message to write # post: message is written to STDOUT if $DEBUG >= priority # return: none sub ld_debug { my ($priority, $message) = (@_); if ( $DEBUG >= $priority ) { chomp $message; print STDERR "DEBUG${priority}: $message\n"; } } # system_wrapper # Wrapper around system() to log errors # # WARNING: Do not use alarm() together with this function. A internal # pipe will not be reclaimed (at least with Perl 5.8.8). This can # cause ldirectord to run out of file handles. 
# # pre: LIST: arguments to pass to system() # post: system() is called and if it returns non-zero a failure # message is logged # return: return value of system() sub system_wrapper { my (@args)=(@_); my $status; &ld_log("Running system(@args)") if $DEBUG>2; $status = system(@args); if($status != 0) { &ld_log("system(@args) failed: $!"); } return($status) } # system_timeout # Emulate system() with timeout via fork(), exec(), and waitpid() and # TERMinate the child on timeout. Set an alarm() for the timeout. # # This function does not suffer the deficiencies of system_wrapper() # of leaving pipes unreclaimed. Zombies are reaped by ld_handler_chld # and the related code. # # pre: timeout: timeout in seconds # LIST: arguments to pass to exec() # return: >= 0 exit status of the child process # 127 exec failed # -1 timeout # -2 fork failed sub system_timeout { my $timeout = shift; my (@args) = (@_); my $status; &ld_log("Running system_timeout($timeout, @args)") if $DEBUG>2; my $childpid = fork(); if (!defined($childpid)) { &ld_log("fork failed: $!"); return(-2); } elsif ($childpid) { # parent eval { local $SIG{'ALRM'} = sub { die "timeout\n"; }; alarm $timeout; waitpid($childpid, 0); $status = $? >> 8; # When die()-ing in the SIGALRM handler we # will never reach this point. Child/Zombie # is left behind. The grim reaper # (ld_handler_chld + ld_process_chld) will # take care of the zombie. }; alarm 0; if ($@) { # timeout if ($@ ne "timeout\n") { # log unexpected errors &ld_log("system_timeout($timeout, @args) " . "unexpected error: $@"); } else { &ld_log("system_timeout($timeout, @args) " . 
"timed out, kill -TERM child"); } # TERMinate child kill 15, $childpid; return(-1); } else { # did not timeout return($status); } } else { # child exec(@args) or &ld_exit(127, "exec(@args) failed: $!"); die "ld_exit() broken?, stopped"; } } # exec_wrapper # Wrapper around exec() to log errors # pre: LIST: arguments to pass to exec() # post: exec() is called and if it returns non-zero a failure # message is logged # return: return value of exec() on failure # does not return on success sub exec_wrapper { my (@args)=(@_); my $status; &ld_log("Running exec(@args)") if $DEBUG>2; $status = exec(@args) or &ld_log("exec(@args) failed"); return($status) } # ld_rm_file # Remove a file, symink, or anything that isn't a directory # and exists # pre: filename: file to delete # post: If filename does not exist or is a directory an # error state is reached # Else filename is delete # If $DEBUG >=2 errors are logged # return: 0 on success # -1 on error sub ld_rm_file { my ($filename)=(@_); my ($status); if(-d "$filename"){ &ld_debug(2, "ld_rm_file: $filename is a directory, skipping"); return(-1); } if(! 
-e "$filename"){ &ld_debug(2, "ld_rm_file: $filename doesn't exist, skipping"); return(-1); } $status = unlink($filename); if($status!=1){ &ld_debug(2, "ld_rm_file: Error deleting: $filename: $!"); } return(($status==1)?0:-1) } # is_octet # See if a number is an octet, that is >=0 and <=255 # pre: alleged_octet: the octet to test # post: alleged_octet is checked to see if it is valid # return: 1 if the alleged_octet is an octet # 0 otherwise sub is_octet { my ($alleged_octet)=(@_); if($alleged_octet<0){ return 0; } if($alleged_octet>255){ return 0; } return(1); } # is_ip # Check that a given string is an IP address # pre: alleged_ip: string representing ip address # post: alleged_ip is checked to see if it is valid # return: 1 if alleged_ip is a valid ip address # 0 otherwise sub is_ip { my ($alleged_ip)=(@_); if ($alleged_ip =~ /:/) { unless(inet_pton(AF_INET6,$alleged_ip)){ return 0; } return(1); } #If we don't have four, . delimited numbers then we have no hope unless($alleged_ip=~m/^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) { return 0; } #Each octet mist be >=0 and <=255 unless(&is_octet($1)){ return 0; } unless(&is_octet($2)){ return 0; } unless(&is_octet($3)){ return 0; } unless(&is_octet($4)){ return 0; } return(1); } # ip_to_int # Turn an IP address given as a dotted quad into an integer # pre: ip_address: string representing IP address # post: post ip_address is converted to an integer # return: -1 if an error occurs # integer representation of IP address otherwise sub ip_to_int { my ($ip_address)=(@_); unless(&is_ip($ip_address)){ return(-1); } unless($ip_address=~m/^(\d+)\.(\d+)\.(\d+)\.(\d+)$/){ return(-1); } return(((((($1<<8)+$2)<<8)+$3)<<8)+$4); } # int_to_ip # Turn an IP address given as a dotted quad into an integer # pre: ip_address: string representing IP address # post: Decimal is converted to a dotted quad # return: -1 if an error occurs # integer representation of IP address otherwise sub int_to_ip { my ($ip_address)=(@_); my $result = ""; return(sprintf( 
"%d.%d.%d.%d", ($ip_address>>24)&255, ($ip_address>>16)&255, ($ip_address>>8)&255, $ip_address&255 )); } # get_virtual # Get the service for a virtual # pre: nv: virtual to get the service for # post: none # return: fwmark of service if it is a fwm service # ip_address:port otherwise sub get_virtual { my ($nv) = (@_); if ($nv->{"protocol"} eq "fwm"){ return $nv->{"fwm"}; } else { return $nv->{"server"} . ":" . $nv->{"port"}; } } # get_virtual_option # Get the ipvsadm option corresponding to a virtual service # pre: nv: virtual to get the service for # post: none # return: fwmark of service if it is a fwm service # fwmark of service + "-6" if it is a fwm service and the address family is AF_INET6 # ip_address:port otherwise sub get_virtual_option { my ($nv) = (@_); my ($cmdline) = &get_virtual($nv); if ($nv->{"protocol"} eq "fwm" && $nv->{addressfamily} == AF_INET6) { $cmdline .= " -6"; } return $cmdline; } # get_real_id_str # Get an id string for a real server # pre: r: Real service. # protocol: protocol of the real service # tcp or udp # service: type of service # post: none # return: Id string for the real server sub get_real_id_str { my ($r, $v) = (@_); my $request = ""; my $receive = ""; my $checkport = ""; my $virtualhost = ""; my $check; my $real; if(defined($r->{"request"})) { $request = $r->{"request"}; } else { $request = $v->{"request"}; } if(defined($r->{"receive"})) { $receive = $r->{"receive"}; } else { $receive = $v->{"receive"}; } if($v->{"checktype"} eq "negotiate" or $v->{"checktype"} eq "combined") { $check = $v->{"checktype"} . ":" . $v->{"service"}; } elsif($v->{"checktype"} eq "external" or $v->{"checktype"} eq "external-perl") { $check = $v->{"checktype"} . ":" . $v->{"checkcommand"}; } else { $check = $v->{"checktype"}; } if(defined($v->{"checkport"})) { $checkport = $v->{"checkport"}; } if(defined($v->{"virtualhost"})) { $virtualhost = $v->{"virtualhost"}; } $real = $check . ":" . $v->{"protocol"} . ":" . $r->{"server"} . ":" . 
$r->{"port"} . ":" . $virtualhost . ":" . $checkport . ":" . $r->{"weight"} . ":" . $r->{"forward"} . ":" . quotemeta($request) . ":" . quotemeta($receive); } # get_virtual_id_str # Get an id string for a virtual service # pre: v: Virtual service # post: none # return: Id string for the virtual service sub get_virtual_id_str { my ($v) = (@_); if ($v->{"protocol"} eq "fwm") { return $v->{"protocol"} . (($v->{addressfamily} == AF_INET6)?"6":"") . ":" . &get_virtual($v); } else { return $v->{"protocol"} . ":" . &get_virtual($v); } } # get_forward_flag # Get the ipvsadm flag corresponding to a forwarding mechanism # pre: forward: Name of forwarding mechanism. u # Should be one of ipip, masq or gate # post: none # return: ipvsadm flag corresponding to the forwarding mechanism # " " if $forward is unknown sub get_forward_flag { my ($forward) = (@_); unless(defined($forward)) { return(" "); } if ($forward eq "masq") { return("-m"); } elsif ($forward eq "gate") { return("-g"); } elsif ($forward eq "ipip") { return("-i"); } return(" "); } # ld_exit # Exit and log a message # pre: exit_status: Integer exit status to exit with # 0 will be used if parameter is omitted # message: Message to log when exiting. May be omitted # post: If exit_status is non-zero or $DEBUG>2 then # message logged. # Programme exits with exit_status # return: does not return sub ld_exit { my ($exit_status, $message)=(@_); unless(defined($exit_status)) { $exit_status=0; } unless(defined($message)) { $message=""; } if ($exit_status!=0 or $DEBUG>2) { &ld_log("Exiting with exit_status $exit_status: $message"); } exit($exit_status); } # ld_open_socket # Open a socket connection # pre: remote: IP address as a dotted quad of remote host to connect to # port: port to connect to # protocol: Protocol to use. 
Should be either "tcp" or "udp" # post: A Socket connection is opened to the remote host # return: Open socket # undef on error sub ld_open_socket { my ($remote, $port, $protocol) = @_; my ($iaddr, $paddr, $pro, $result, $pf); local *SOCK; $remote = &ld_strip_brackets($remote); if (inet_pton(AF_INET6,$remote)) { $iaddr = inet_pton(AF_INET6,$remote); $paddr = pack_sockaddr_in6($port, $iaddr); $pf = PF_INET6; } else { $iaddr = inet_aton($remote) || die "no host: $remote"; $paddr = sockaddr_in($port, $iaddr); $pf = PF_INET; } $pro = getprotobyname($protocol); if ($protocol eq "udp") { socket(SOCK, $pf, SOCK_DGRAM, $pro) || die "socket: $!"; } else { socket(SOCK, $pf, SOCK_STREAM, $pro) || die "socket: $!"; } $result = connect(SOCK, $paddr); unless ($result) { return undef; } return *SOCK; } # daemon # Close and fork to become a daemon. # # Notes from unix programmer faq # http://www.landfield.com/faqs/unix-faq/programmer/faq/ # # Almost none of this is necessary (or advisable) if your daemon is being # started by `inetd'. In that case, stdin, stdout and stderr are all set up # for you to refer to the network connection, and the `fork()'s and session # manipulation should *not* be done (to avoid confusing `inetd'). Only the # `chdir()' step remains useful. # # Gratuitously over documented, because it can be # # Written by Horms, horms@verge.net.au for an unrelated project while # working for Zip World, http://www.zipworld.com.au/, 1997-1999. sub ld_daemon { # `fork()' so the parent can exit, this returns control to the command # line or shell invoking your program. This step is required so that # the new process is guaranteed not to be a process group leader. The # next step, `setsid()', fails if you're a process group leader. &ld_daemon_become_child(); # setsid()' to become a process group and session group leader. 
Since a # controlling terminal is associated with a session, and this new # session has not yet acquired a controlling terminal our process now # has no controlling terminal, which is a Good Thing for daemons. if(POSIX::setsid()<0){ &ld_exit(1, "ld_daemon: Could not setsid"); } # fork()' again so the parent, (the session group leader), can exit. # This means that we, as a non-session group leader, can never regain a # controlling terminal. &ld_daemon_become_child(); # `chdir("/")' to ensure that our process doesn't keep any directory in # use. Failure to do this could make it so that an administrator # couldn't unmount a filesystem, because it was our current directory. if(chdir("/")<0){ &ld_exit(1, "ld_daemon: Could not chdir"); } # `close()' fds 0, 1, and 2. This releases the standard in, out, and # error we inherited from our parent process. We have no way of knowing # where these fds might have been redirected to. Note that many daemons # use `sysconf()' to determine the limit `_SC_OPEN_MAX'. `_SC_OPEN_MAX' # tells you the maximum open files/process. Then in a loop, the daemon # can close all possible file descriptors. You have to decide if you # need to do this or not. If you think that there might be # file-descriptors open you should close them, since there's a limit on # number of concurrent file descriptors. close(STDIN); close(STDOUT); close(STDERR); # Establish new open descriptors for stdin, stdout and stderr. Even if # you don't plan to use them, it is still a good idea to have them open. # The precise handling of these is a matter of taste; if you have a # logfile, for example, you might wish to open it as stdout or stderr, # and open `/dev/null' as stdin; alternatively, you could open # `/dev/console' as stderr and/or stdout, and `/dev/null' as stdin, or # any other combination that makes sense for your particular daemon. 
# # This code used to open /dev/console for STDOUT and STDERR, # but that was changed to /dev/null to stop the code hanging in # the case where /dev/console is unavailable for some reason # http://www.osdl.org/developer_bugzilla/show_bug.cgi?id=1180 if(open(STDIN, ">/dev/null")<0){ &ld_exit(-1, "ld_daemon: Could not open /dev/null"); } if(open(STDERR, ">>/dev/null")<0){ &ld_exit(-1, "ld_daemon: Could not open /dev/null"); } } # ld_daemon_become_child # Fork, kill parent and return child process # pre: none # post: process forks and parent exits # All process exit with exit status -1 if an error occurs # return: parent: exits # child: none (this is the process that returns) # Written by Horms, horms@verge.net.au for an unrelated project while # working for Zip World, http://www.zipworld.com.au/, 1997-1999. sub ld_daemon_become_child { my($status); $status = fork(); if ($status<0){ &ld_exit(-1, "ld_daemon_become_child: Could not fork: $!"); } if ($status>0){ &ld_exit(0, "ld_daemon_become_child: Parent exiting as it should"); } } # ld_gethostbyname # Wrapper to gethostbyname. Look up the/an IP address of a hostname # If an IP address is given is it returned # pre: name: Hostname of IP address to lookup # af: Address Family: AF_INET etc.. # post: gethostbyname is called to find an IP address for $name # This is converted to a string # return: IP address # undef on error sub ld_gethostbyname { my ($name, $af)=(@_); if ($name =~ /\[(.*)\]/) { $name = $1; } my @host = getaddrinfo($name, 0, $af); if (!defined($host[3])) { return undef; } my @ret = getnameinfo($host[3], NI_NUMERICHOST | NI_NUMERICSERV); if ($host[0] == AF_INET6) { return "[$ret[0]]"; } else { return $ret[0]; } } # ld_gethostbyaddr # Wrapper to gethostbyaddr. Look up the hostname from an IP address. 
# If no reverse DNS record is found, return undef # pre: ip: IP address of host to lookup # post: gethostbyaddr is called to find a hostname for IP $ip # return: hostname # undef on error sub ld_gethostbyaddr { my ($ip)=(@_); $ip = &ld_strip_brackets($ip); my @host = getaddrinfo($ip,0); if (!defined($host[3])) { return undef; } my @ret = getnameinfo($host[3], NI_NAMEREQD); return undef unless(scalar(@ret) == 2); return $ret[0]; } # ld_getservbyname # Wrapper for getservbyname. Look up the port for a service name # If a port is given it is returned. # pre: name: Port or Service name to look up # post: if $name is a number # if 0<=$name<=65536 $name is returned # else undef is returned # else getservbyname is called to look up the port for the service # return: Port # undef on error sub ld_getservbyname { my ($name, $protocol)=(@_); if($name=~/^[0-9]+$/){ return(($name>=0 and $name<65536)?$name:undef); } my @serv=getservbyname($name, $protocol); return((@serv and defined($serv[2]))?$serv[2]:undef); } # ld_getservhostbyname # Wrapper for ld_gethostbyname and ld_getservbyname. Given a server of the # form ip_address|hostname[:port|servicename] return ip_address[:port] # pre: hostserv: Servver of the form ip_address|hostname[:port|servicename] # protocol: Protocol for service. Should be either "tcp" or "udp" # af: Address Family: AF_INET etc.. 
# post: lookups performed as per ld_getservbyname and ld_gethostbyname # return: ip_address[:port] # undef on error sub ld_gethostservbyname{ my ($hostserv, $protocol, $af) = (@_); my $ip; my $port; if ($hostserv =~ /(:(\d+|[A-Za-z0-9-_]+))?$/) { $port = $2; $ip = $hostserv; $ip =~ s/(:(\d+|[A-Za-z0-9-_]+))?$//; } else { $ip = $hostserv; } $ip=&ld_gethostbyname($ip, $af) or return(undef); if(defined($port)){ $port=&ld_getservbyname($port, $protocol); if (defined($port)) { return("$ip:$port"); } else { return(undef); } } return($ip); } # ld_find_cmd_path # Find executable in path # pre: cmd: command to find # path: ':' delimited paths to check # relative: if set, allow cmd to be a relative path, # which is checked first # return: path to command # undef if not found sub ld_find_cmd_path { my ($cmd, $path, $relative) = (@_); if (defined $relative and $relative and -f "$cmd" ) { return $cmd; } if ($cmd =~ /^\// and -x "$cmd" ) { return $cmd; } if ($cmd =~ /\//) { return undef; } for my $p (split /:/, $path) { if ( -x "$p/$cmd" ) { return "$p/$cmd"; } } return undef; } # ld_find_cmd_path # Find executable in $ENV{'PATH'} # pre: cmd: command to find # relative: if set, allow cmd to be a relative path, # which is checked first # return: path to command # undef if not found sub ld_find_cmd { return ld_find_cmd_path($_[0], $ENV{'PATH'}, $_[1]); } # ld_get_addrport # Get address string and port number from a given socket. 
# pre: socket # return: (address, port) # undef if cannot get sub ld_get_addrport { my($sock) = @_; my ($s_addr_str, $s_port, $s_addr, $len); my $s_sockaddr = getsockname($sock); $len = length($s_sockaddr); if ($len == 28) { # IPv6 ($s_port, $s_addr) = unpack_sockaddr_in6($s_sockaddr); $s_addr_str = inet_ntop(AF_INET6, $s_addr); $s_addr_str = "[$s_addr_str]"; } elsif ($len == 16) { # IPv4 ($s_port, $s_addr) = unpack_sockaddr_in($s_sockaddr); $s_addr_str = inet_ntop(AF_INET, $s_addr); } else { die "unexpected length of sockaddr\n"; } return ($s_addr_str, $s_port); } # ld_strip_brackets # Strip brackets in the string # pre: string # return: string sub ld_strip_brackets { my($str) = @_; $str =~ s/[\[\]]//g; return $str; } diff --git a/rgmanager/src/resources/clusterfs.sh b/rgmanager/src/resources/clusterfs.sh index d052ffeef..cc99d5163 100755 --- a/rgmanager/src/resources/clusterfs.sh +++ b/rgmanager/src/resources/clusterfs.sh @@ -1,293 +1,330 @@ #!/bin/bash # # Cluster File System mount/umount/fsck/etc. agent # # Copyright (C) 2000 Mission Critical Linux # Copyright (C) 2002-2011 Red Hat, Inc. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # . $(dirname $0)/utils/fs-lib.sh do_metadata() { cat < 1.0 This defines a cluster file system mount (i.e. GFS) Defines a cluster file system mount. 
Symbolic name for this file system. File System Name Path in file system heirarchy to mount this file system. Mount Point Block device, file system label, or UUID of file system. Device or Label File system type. If not specified, mount(8) will attempt to determine the file system type. File system type If set, the cluster will kill all processes using this file system when the resource group is stopped. Otherwise, the unmount will fail, and the resource group will be restarted. Force Unmount If set, the file system will be checked (even if it is a journalled file system). This option is ignored for non-journalled file systems such as ext2. Mount Options If set and unmounting the file system fails, the node will immediately reboot. Generally, this is used in conjunction - with force-unmount support, but it is not required. + with force_unmount support, but it is not required. Seppuku Unmount File system ID for NFS exports. This can be overridden in individual nfsclient entries. NFS File system ID If set, the node will try to kill lockd and issue reclaims across all remaining network interface cards. This happens always, regardless of unmounting failed. Enable NFS lock workarounds + + + If set and unmounting the file system fails, the node will + try to restart nfs daemon and nfs lockd to drop all filesystem + references. Use this option as last resource. + This option requires force_unmount to be set and it is not + compatible with nfsserver resource. + + + Enable NFS daemon and lockd workaround + + + + EOT } verify_fstype() { # Auto detect? 
[ -z "$OCF_RESKEY_fstype" ] && return $OCF_SUCCESS case $OCF_RESKEY_fstype in gfs|gfs2) return $OCF_SUCCESS ;; *) ocf_log err "File system type $OCF_RESKEY_fstype not supported" return $OCF_ERR_ARGS ;; esac } verify_options() { declare -i ret=$OCF_SUCCESS # # From mount(8) # for o in `echo $OCF_RESKEY_options | sed -e s/,/\ /g`; do case $o in async|atime|auto|defaults|dev|exec|_netdev|noatime) continue ;; noauto|nodev|noexec|nosuid|nouser|ro|rw|suid|sync) continue ;; dirsync|user|users) continue ;; esac case $OCF_RESKEY_fstype in gfs) case $o in lockproto=*|locktable=*|hostdata=*) continue; ;; localcaching|localflocks|ignore_local_fs) continue; ;; num_glockd|acl|suiddir) continue ;; esac ;; gfs2) # XXX continue ;; esac ocf_log err "Option $o not supported for $OCF_RESKEY_fstype" ret=$OCF_ERR_ARGS done return $ret } do_verify() { verify_name || return $OCF_ERR_ARGS verify_fstype || return $OCF_ERR_ARGS verify_device || return $OCF_ERR_ARGS verify_mountpoint || return $OCF_ERR_ARGS verify_options || return $OCF_ERR_ARGS } do_pre_unmount() { # # Check the rgmanager-supplied reference count if one exists. # If the reference count is <= 1, we can safely proceed # if [ -n "$OCF_RESKEY_RGMANAGER_meta_refcnt" ]; then refs=$OCF_RESKEY_RGMANAGER_meta_refcnt if [ $refs -gt 0 ]; then ocf_log debug "Not unmounting $OCF_RESOURCE_INSTANCE - still in use by $refs other service(s)" return 2 fi fi if [ -z "$force_umount" ]; then ocf_log debug "Not umounting $dev (clustered file system)" return 2 fi # # Always do this hackery on clustered file systems. 
# if [ "$OCF_RESKEY_nfslock" = "yes" ] || \ [ "$OCF_RESKEY_nfslock" = "1" ]; then ocf_log warning "Dropping node-wide NFS locks" mkdir -p $mp/.clumanager/statd pkill -KILL -x lockd # Copy out the notify list; our # IPs are already torn down if notify_list_store $mp/.clumanager/statd; then notify_list_broadcast $mp/.clumanager/statd fi fi # Always invalidate buffers on clusterfs resources clubufflush -f $dev return 0 } +do_force_unmount() { + if [ "$OCF_RESKEY_nfsrestart" = "yes" ] || \ + [ "$OCF_RESKEY_nfsrestart" = "1" ]; then + ocf_log warning "Restarting nfsd/nfslock" + nfsexports=$(cat /var/lib/nfs/etab) + service nfslock stop + service nfs stop + service nfs start + service nfslock start + echo "$nfsexports" | { while read line; do + nfsexp=$(echo $line | awk '{print $1}') + nfsopts=$(echo $line | sed -e 's#.*(##g' -e 's#).*##g') + nfsacl=$(echo $line | awk '{print $2}' | sed -e 's#(.*##g') + if [ -n "$nfsopts" ]; then + exportfs -i -o "$nfsopts" "$nfsacl":$nfsexp + else + exportfs -i "$nfsacl":$nfsexp + fi + done; } + fi + return 1 +} + main $* diff --git a/rgmanager/src/resources/fs.sh.in b/rgmanager/src/resources/fs.sh.in index c43c17713..0ed934b39 100644 --- a/rgmanager/src/resources/fs.sh.in +++ b/rgmanager/src/resources/fs.sh.in @@ -1,463 +1,497 @@ #!/bin/bash # # File system (normal) mount/umount/fsck/etc. agent # # # Copyright (C) 1997-2003 Sistina Software, Inc. All rights reserved. # Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # . $(dirname $0)/utils/fs-lib.sh do_metadata() { cat < 1.0 This defines a standard file system mount (= not a clustered or otherwise shared file system). Defines a file system mount. Symbolic name for this file system. File System Name Path in file system heirarchy to mount this file system. Mount Point Block device, file system label, or UUID of file system. Device or Label File system type. If not specified, mount(8) will attempt to determine the file system type. File system type If set, the cluster will kill all processes using this file system when the resource group is stopped. Otherwise, the unmount will fail, and the resource group will be restarted. Force Unmount Use quick status checks. When set to 0 (the default), this agent behaves normally. When set to 1, this agent will not log errors incurred or perform the file system accessibility check (e.g. it will not try to read from/write to the file system). You should only set this to 1 if you have lots of file systems on your cluster or you are seeing very high load spikes as a direct result of this agent. Quick/brief status checks. If set and unmounting the file system fails, the node will immediately reboot. Generally, this is used in conjunction - with force-unmount support, but it is not required. + with force_unmount support, but it is not required. Seppuku Unmount If set and unmounting the file system fails, the node will try to kill lockd and issue reclaims across all remaining network interface cards. Enable NFS lock workarounds + + + If set and unmounting the file system fails, the node will + try to restart nfs daemon and nfs lockd to drop all filesystem + references. Use this option as last resource. 
+ This option requires force_unmount to be set and it is not + compatible with nfsserver resource. + + + Enable NFS daemon and lockd workaround + + + + File system ID for NFS exports. This can be overridden in individual nfsclient entries. NFS File system ID If set, the file system will be checked (even if it is a journalled file system). This option is ignored for non-journalled file systems such as ext2. Force fsck support Options used when the file system is mounted. These are often file-system specific. See mount(8) for supported mount options. Mount Options EOT } verify_fstype() { # Auto detect? [ -z "$OCF_RESKEY_fstype" ] && return 0 case $OCF_RESKEY_fstype in ext2|ext3|ext4|btrfs|jfs|xfs|reiserfs|vfat|tmpfs|vxfs) return 0 ;; *) echo "File system type $OCF_RESKEY_fstype not supported" return $OCF_ERR_ARGS ;; esac } verify_options() { declare -i ret=$OCF_SUCCESS declare o # # From mount(8) # for o in `echo $OCF_RESKEY_options | sed -e s/,/\ /g`; do case $o in async|atime|auto|defaults|dev|exec|_netdev|noatime) continue ;; noauto|nodev|noexec|nosuid|nouser|ro|rw|suid|sync) continue ;; dirsync|user|users) continue ;; esac do_verify_option $OCF_RESKEY_fstype "$o" case $OCF_RESKEY_fstype in ext2|ext3|ext4) case $o in bsddf|minixdf|check|check=*|nocheck|debug) continue ;; errors=*|grpid|bsdgroups|nogrpid|sysvgroups) continue ;; resgid=*|resuid=*|sb=*|grpquota|noquota) continue ;; quota|usrquota|nouid32) continue ;; esac if [ "$OCF_RESKEY_fstype" = "ext3" ] || [ "$OCF_RESKEY_fstype" = "ext4" ]; then case $0 in noload|data=*) continue ;; esac fi ;; vfat) case $o in blocksize=512|blocksize=1024|blocksize=2048) continue ;; uid=*|gid=*|umask=*|dmask=*|fmask=*) continue ;; check=r*|check=n*|check=s*|codepage=*) continue ;; conv=b*|conv=t*|conv=a*|cvf_format=*) continue ;; cvf_option=*|debug|fat=12|fat=16|fat=32) continue ;; iocharset=*|quiet) continue ;; esac ;; jfs) case $o in conv|hash=rupasov|hash=tea|hash=r5|hash=detect) continue ;; 
hashed_relocation|no_unhashed_relocation) continue ;; noborder|nolog|notail|resize=*) continue ;; esac ;; xfs) case $o in biosize=*|dmapi|xdsm|logbufs=*|logbsize=*) continue ;; logdev=*|rtdev=*|noalign|noatime) continue ;; norecovery|osyncisdsync|quota|userquota) continue ;; uqnoenforce|grpquota|gqnoenforce) continue ;; sunit=*|swidth=*) continue ;; esac ;; tmpfs) case $o in size=*|nr_blocks=*|mode=*) continue ;; esac ;; btrfs) # tbd continue ;; esac echo Option $o not supported for $OCF_RESKEY_fstype ret=$OCF_ERR_ARGS done return $ret } do_validate() { verify_name || return $OCF_ERR_ARGS verify_fstype || return $OCF_ERR_ARGS verify_device || return $OCF_ERR_ARGS verify_mountpoint || return $OCF_ERR_ARGS verify_options || return $OCF_ERR_ARGS } do_pre_mount() { declare fstype="$OCF_RESKEY_fstype" # # Check to determine if we need to fsck the filesystem. # # Note: this code should not indicate in any manner suggested # file systems to use in the cluster. Known filesystems are # listed here for correct operation. # case "$fstype" in reiserfs) typeset fsck_needed="" ;; ext3) typeset fsck_needed="" ;; ext4) typeset fsck_needed="" ;; btrfs) typeset fsck_needed="" ;; jfs) typeset fsck_needed="" ;; xfs) typeset fsck_needed="" ;; vxfs) typeset fsck_needed="" ;; ext2) typeset fsck_needed=yes ;; minix) typeset fsck_needed=yes ;; vfat) typeset fsck_needed=yes ;; msdos) typeset fsck_needed=yes ;; "") typeset fsck_needed=yes ;; # assume fsck *) typeset fsck_needed=yes # assume fsck ocf_log warn "\ Unknown file system type '$fstype' for device $dev. Assuming fsck is required." ;; esac # # Fsck the device, if needed. # if [ -n "$fsck_needed" ] || [ "${OCF_RESKEY_force_fsck}" = "yes" ] ||\ [ "${OCF_RESKEY_force_fsck}" = "1" ]; then typeset fsck_log=@LOGDIR@/$(basename $dev).fsck.log ocf_log debug "Running fsck on $dev" fsck -p $dev >> $fsck_log 2>&1 ret_val=$? 
if [ $ret_val -gt 1 ]; then ocf_log err "\ 'fsck -p $dev' failed, error=$ret_val; check $fsck_log for errors" ocf_log debug "Invalidating buffers for $dev" $INVALIDATEBUFFERS -f $dev return $OCF_ERR_GENERIC fi rm -f $fsck_log fi return 0 } do_post_mount() { # # Create this for the NFS NLM broadcast bit # if [ $NFS_TRICKS -eq 0 ]; then if [ "$OCF_RESKEY_nfslock" = "yes" ] || \ [ "$OCF_RESKEY_nfslock" = "1" ]; then mkdir -p "$mp"/.clumanager/statd notify_list_merge "$mp"/.clumanager/statd fi fi return 0 } do_force_unmount() { if [ "$OCF_RESKEY_nfslock" = "yes" ] || \ [ "$OCF_RESKEY_nfslock" = "1" ]; then ocf_log warning "Dropping node-wide NFS locks" pkill -KILL -x lockd mkdir -p "$mp"/.clumanager/statd # Copy out the notify list; our # IPs are already torn down notify_list_store "$mp"/.clumanager/statd # Save for post-umount phase export nfslock_reclaim=1 fi + if [ "$OCF_RESKEY_nfsrestart" = "yes" ] || \ + [ "$OCF_RESKEY_nfsrestart" = "1" ]; then + ocf_log warning "Restarting nfsd/nfslock" + nfsexports=$(cat /var/lib/nfs/etab) + service nfslock stop + service nfs stop + service nfs start + service nfslock start + echo "$nfsexports" | { while read line; do + nfsexp=$(echo $line | awk '{print $1}') + nfsopts=$(echo $line | sed -e 's#.*(##g' -e 's#).*##g') + nfsacl=$(echo $line | awk '{print $2}' | sed -e 's#(.*##g') + if [ -n "$nfsopts" ]; then + exportfs -i -o "$nfsopts" "$nfsacl":$nfsexp + else + exportfs -i "$nfsacl":$nfsexp + fi + done; } + fi + # Proceed with fuser -kvm... return 1 } do_post_unmount() { if [ "$nfslock_reclaim" = "1" ]; then # If we have this flag set, do a full reclaim broadcast notify_list_broadcast "$mp"/.clumanager/statd fi return 0 } main $* diff --git a/rgmanager/src/resources/ip.sh b/rgmanager/src/resources/ip.sh index 8686f0279..29d6bfc6b 100755 --- a/rgmanager/src/resources/ip.sh +++ b/rgmanager/src/resources/ip.sh @@ -1,1006 +1,1023 @@ #!/bin/bash # # IPv4/IPv6 address management using new /sbin/ifcfg instead of # ifconfig utility. 
# # # Copyright (C) 1997-2003 Sistina Software, Inc. All rights reserved. # Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # LC_ALL=C LANG=C PATH=/bin:/sbin:/usr/bin:/usr/sbin export LC_ALL LANG PATH # Grab nfs lock tricks if available export NFS_TRICKS=1 if [ -f "$(dirname $0)/svclib_nfslock" ]; then . $(dirname $0)/svclib_nfslock NFS_TRICKS=0 fi . $(dirname $0)/ocf-shellfuncs meta_data() { cat < 1.0 This is an IP address. Both IPv4 and IPv6 addresses are supported, as well as NIC link monitoring for each IP address. This is an IP address. IPv4 or IPv6 address to use as a virtual IP resource. IP Address IPv4 or IPv6 address protocol family. Family Enabling this causes the status check to fail if the link on the NIC to which this IP address is bound is not present. Monitor NIC Link If set and unmounting the file system fails, the node will try to kill lockd and issue reclaims across all remaining network interface cards. Enable NFS lock workarounds Amount of time to sleep after removing an IP address. Value is specified in seconds. Default value is 10. Amount of time (seconds) to sleep. Disable updating of routing using RDISC protocol and preserve static routes. 
Disable updating of routing using RDISC protocol + + + The network interface to which the IP address should be added. The interface must already be configured and active. This parameter should be used only when at least two active interfaces have IP addresses on the same subnet and it is desired to have the IP address added to a particular interface. + + + Network interface + + + EOT } verify_address() { # XXX TBD return 0 } verify_all() { # XXX TBD return 0 } # # Expand an IPv6 address. # ipv6_expand() { typeset addr=$1 typeset maskbits typeset -i x typeset tempaddr maskbits=${addr/*\//} if [ "$maskbits" = "$addr" ]; then maskbits="" else # chop off mask bits addr=${addr/\/*/} fi # grab each hex quad and expand it to 4 digits if it isn't already # leave doublecolon in place for expansion out to the proper number of zeros later tempaddr="" for count in `seq 1 8`; do quad=`echo $addr|awk -v count=$count -F : '{print $count}'` quadlen=${#quad} if [ $quadlen -eq 0 ]; then quad=:: elif [ $quadlen -eq 1 ]; then quad=000$quad elif [ $quadlen -eq 2 ]; then quad=00$quad elif [ $quadlen -eq 3 ]; then quad=0$quad fi tempaddr=$tempaddr$quad done addr=$tempaddr # use space as placeholder addr=${addr/::/\ } # get rid of colons addr=${addr//:/} # add in zeroes where the doublecolon was len=$((${#addr}-1)) zeroes= while [ $len -lt 32 ]; do zeroes="0$zeroes" ((len++)) done addr=${addr/\ /$zeroes} # probably a better way to do this for (( x=0; x < ${#addr} ; x++)); do naddr=$naddr${addr:x:1} if (( x < (${#addr} - 1) && x%4 == 3)); then naddr=$naddr: fi done if [ -n "$maskbits" ]; then echo "$naddr/$maskbits" return 0 fi echo "$naddr" return 0 } # # see if two ipv6 addrs are in the same subnet # ipv6_same_subnet() { declare addrl=$1 declare addrr=$2 declare m=$3 declare r x llsb rlsb if [ $# -lt 2 ]; then ocf_log err "usage: ipv6_same_subnet addr1 addr2 [mask]" return 255 fi if [ -z "$m" ]; then m=${addrl/*\//} [ -n "$m" ] || return 1 fi if [ "${addrr}" != "${addrr/*\//}" ] && [ 
"$m" != "${addrr/*\//}" ]; then return 1 fi addrl=${addrl/\/*/} if [ ${#addrl} -lt 39 ]; then addrl=$(ipv6_expand $addrl) fi addrr=${addrr/\/*/} if [ ${#addrr} -lt 39 ]; then addrr=$(ipv6_expand $addrr) fi # Calculate the amount to compare directly x=$(($m/4+$m/16-(($m%4)==0))) # and the remaining number of bits r=$(($m%4)) if [ $r -ne 0 ]; then # If we have any remaining bits, we will need to compare # them later. Get them now. llsb=`printf "%d" 0x${addrl:$x:1}` rlsb=`printf "%d" 0x${addrr:$x:1}` # One less byte to compare directly, please ((--x)) fi # direct (string comparison) to see if they are equal if [ "${addrl:0:$x}" != "${addrr:0:$x}" ]; then return 1 fi case $r in 0) return 0 ;; 1) [ $(($llsb & 8)) -eq $(($rlsb & 8)) ] return $? ;; 2) [ $(($llsb & 12)) -eq $(($rlsb & 12)) ] return $? ;; 3) [ $(($llsb & 14)) -eq $(($rlsb & 14)) ] return $? ;; esac return 1 } ipv4_same_subnet() { declare addrl=$1 declare addrr=$2 declare m=$3 declare r x llsb rlsb if [ $# -lt 2 ]; then ocf_log err "usage: ipv4_same_subnet current_addr new_addr [maskbits]" return 255 fi # # Chop the netmask off of the ipaddr: # e.g. 1.2.3.4/22 -> 22 # if [ -z "$m" ]; then m=${addrl/*\//} [ -n "$m" ] || return 1 fi # # Check to see if there was a subnet mask provided on the # new IP address. If there was one and it does not match # our expected subnet mask, we are done. # if [ "${addrr}" != "${addrr/\/*/}" ] && [ "$m" != "${addrr/*\//}" ]; then return 1 fi # # Chop off subnet bits for good. # addrl=${addrl/\/*/} addrr=${addrr/\/*/} # # Remove '.' characters from dotted decimal notation and save # in arrays. i.e. 
# # 192.168.1.163 -> array[0] = 192 # array[1] = 168 # array[2] = 1 # array[3] = 163 # let x=0 for quad in ${addrl//./\ }; do ip1[((x++))]=$quad done x=0 for quad in ${addrr//./\ }; do ip2[((x++))]=$quad done x=0 while [ $m -ge 8 ]; do ((m-=8)) if [ ${ip1[x]} -ne ${ip2[x]} ]; then return 1 fi ((x++)) done case $m in 0) return 0 ;; 1) [ $((${ip1[x]} & 128)) -eq $((${ip2[x]} & 128)) ] return $? ;; 2) [ $((${ip1[x]} & 192)) -eq $((${ip2[x]} & 192)) ] return $? ;; 3) [ $((${ip1[x]} & 224)) -eq $((${ip2[x]} & 224)) ] return $? ;; 4) [ $((${ip1[x]} & 240)) -eq $((${ip2[x]} & 240)) ] return $? ;; 5) [ $((${ip1[x]} & 248)) -eq $((${ip2[x]} & 248)) ] return $? ;; 6) [ $((${ip1[x]} & 252)) -eq $((${ip2[x]} & 252)) ] return $? ;; 7) [ $((${ip1[x]} & 254)) -eq $((${ip2[x]} & 254)) ] return $? ;; esac return 1 } ipv6_list_interfaces() { declare idx dev ifaddr declare ifaddr_exp while read idx dev ifaddr; do isSlave $dev if [ $? -ne 2 ]; then continue fi idx=${idx/:/} ifaddr_exp=$(ipv6_expand $ifaddr) echo $dev ${ifaddr_exp/\/*/} ${ifaddr_exp/*\//} done < <(/sbin/ip -o -f inet6 addr | awk '{print $1,$2,$4}') return 0 } isSlave() { declare intf=$1 declare line if [ -z "$intf" ]; then ocf_log err "usage: isSlave " return $OCF_ERR_ARGS fi line=$(/sbin/ip link list dev $intf) if [ $? -ne 0 ]; then ocf_log err "$intf not found" return $OCF_ERR_GENERIC fi if [ "$line" = "${line/<*SLAVE*>/}" ]; then return 2 fi # Yes, it is a slave device. Ignore. return 0 } # # Check if interface is in UP state # interface_up() { declare intf=$1 if [ -z "$intf" ]; then ocf_log err "usage: interface_up " return 1 fi line=$(/sbin/ip -o link show up dev $intf 2> /dev/null) [ -z "$line" ] && return 2 return 0 } ethernet_link_up() { declare linkstate=$(ethtool $1 | grep "Link detected:" |\ awk '{print $3}') [ -n "$linkstate" ] || return 0 case $linkstate in yes) return 0 ;; *) return 1 ;; esac return 1 } # # Checks the physical link status of an ethernet or bonded interface. 
# network_link_up() { declare slaves declare intf_arg=$1 declare link_up=1 # Assume link down declare intf_test if [ -z "$intf_arg" ]; then ocf_log err "usage: network_link_up " return 1 fi ethernet_link_up $intf_arg link_up=$? if [ $link_up -eq 0 ]; then ocf_log debug "Link for $intf_arg: Detected" else ocf_log warn "Link for $intf_arg: Not detected" fi return $link_up } ipv4_list_interfaces() { declare idx dev ifaddr while read idx dev ifaddr; do isSlave $dev if [ $? -ne 2 ]; then continue fi idx=${idx/:/} echo $dev ${ifaddr/\/*/} ${ifaddr/*\//} done < <(/sbin/ip -o -f inet addr | awk '{print $1,$2,$4}') return 0 } # # Add an IP address to our interface. # ipv6() { declare dev maskbits declare addr=$2 declare addr_exp=$(ipv6_expand $addr) while read dev ifaddr_exp maskbits; do if [ -z "$dev" ]; then continue fi if [ "$1" = "add" ]; then + if [ -n "$OCF_RESKEY_prefer_interface" ] && \ + [ "$OCF_RESKEY_prefer_interface" != $dev ]; then + continue + fi ipv6_same_subnet $ifaddr_exp/$maskbits $addr_exp if [ $? -ne 0 ]; then continue fi interface_up $dev if [ $? -ne 0 ]; then continue fi if [ "$OCF_RESKEY_monitor_link" = "yes" ]; then network_link_up $dev if [ $? -ne 0 ]; then continue fi fi if [ "${addr/\/*/}" = "${addr}" ]; then addr="$addr/$maskbits" fi ocf_log info "Adding IPv6 address $addr to $dev" fi if [ "$1" = "del" ]; then if [ "${addr_exp/\/*/}" != "$ifaddr_exp" ]; then continue fi addr=`/sbin/ip addr list | grep "$addr" | head -n 1 | awk '{print $2}'` ocf_log info "Removing IPv6 address $addr from $dev" fi if [ "$1" = "add" ]; then ocf_log debug "Pinging addr ${addr%%/*} from dev $dev" if ping_check inet6 ${addr%%/*} $dev; then ocf_log err "IPv6 address collision ${addr%%/*}" return 1 fi fi /sbin/ip -f inet6 addr $1 dev $dev $addr [ $? -ne 0 ] && return 1 # # NDP should take of figuring out our new address. Plus, # we do not have something (like arping) to do this for ipv6 # anyway. 
# # RFC 2461, section 7.2.6 states thusly: # # Note that because unsolicited Neighbor Advertisements do not # reliably update caches in all nodes (the advertisements might # not be received by all nodes), they should only be viewed as # a performance optimization to quickly update the caches in # most neighbors. # # Not sure if this is necessary for ipv6 either. file=$(which rdisc 2>/dev/null) if [ -f "$file" ]; then if [ "$OCF_RESKEY_disable_rdisc" != "yes" ] && \ [ "$OCF_RESKEY_disable_rdisc" != "1" ]; then killall -HUP rdisc || rdisc -fs fi fi return 0 done < <(ipv6_list_interfaces) return 1 } # # Add an IP address to our interface. # ipv4() { declare dev ifaddr maskbits declare addr=$2 while read dev ifaddr maskbits; do if [ -z "$dev" ]; then continue fi if [ "$1" = "add" ]; then + if [ -n "$OCF_RESKEY_prefer_interface" ] && \ + [ "$OCF_RESKEY_prefer_interface" != $dev ]; then + continue + fi ipv4_same_subnet $ifaddr/$maskbits $addr if [ $? -ne 0 ]; then continue fi interface_up $dev if [ $? -ne 0 ]; then continue fi if [ "$OCF_RESKEY_monitor_link" = "yes" ]; then network_link_up $dev if [ $? -ne 0 ]; then continue fi fi if [ "${addr/\/*/}" = "${addr}" ]; then addr="$addr/$maskbits" fi ocf_log info "Adding IPv4 address $addr to $dev" fi if [ "$1" = "del" ]; then if [ "${addr/\/*/}" != "$ifaddr" ]; then continue fi addr=`/sbin/ip addr list | grep "$ifaddr/" | head -n 1 | awk '{print $2}'` ocf_log info "Removing IPv4 address $addr from $dev" fi if [ "$1" = "add" ]; then ocf_log debug "Pinging addr ${addr%%/*} from dev $dev" if ping_check inet ${addr%%/*} $dev; then ocf_log err "IPv4 address collision ${addr%%/*}" return 1 fi fi /sbin/ip -f inet addr $1 dev $dev $addr [ $? 
-ne 0 ] && return 1 # # The following is needed only with ifconfig; ifcfg does it for us # if [ "$1" = "add" ]; then # do that freak arp thing hwaddr=$(/sbin/ip -o link show $dev) hwaddr=${hwaddr/*link\/ether\ /} hwaddr=${hwaddr/\ \*/} addr=${addr/\/*/} ocf_log debug "Sending gratuitous ARP: $addr $hwaddr" arping -q -c 2 -U -I $dev $addr fi file=$(which rdisc 2>/dev/null) if [ -f "$file" ]; then if [ "$OCF_RESKEY_disable_rdisc" != "yes" ] && \ [ "$OCF_RESKEY_disable_rdisc" != "1" ]; then killall -HUP rdisc || rdisc -fs fi fi return 0 done < <(ipv4_list_interfaces) return 1 } # # Usage: # ping_check
#   (usage, continued) <family> <address> [interface]
#
# Send one echo request (-c 1) with a 2 second deadline (-w 2) to
# <address>.  ping6 is used when <family> is "inet6", ping otherwise.
# When [interface] is non-empty it is handed to the ping command via -I.
#
# Returns the exit status of the ping command, i.e. 0 when the address
# answered.
#
ping_check()
{
	declare ops="-c 1 -w 2"
	declare pingcmd=""

	if [ "$1" = "inet6" ]; then
		pingcmd="ping6"
	else
		pingcmd="ping"
	fi

	if [ -n "$3" ]; then
		ops="$ops -I $3"
	fi

	# Run the ping and return its status explicitly.  The previous
	# 'return $(cmd &> /dev/null)' form only worked because the
	# substitution always expanded to nothing, making it a bare 'return'.
	# $ops is left unquoted on purpose so it splits into separate options.
	$pingcmd $ops $2 &> /dev/null
	return $?
}

#
# Usage:
# check_interface_up
#
# Find the device that currently carries the address given in $2 (any
# "/mask" suffix is ignored) for the family given in $1 ("inet6" or
# anything else for IPv4) and check that device with interface_up.
#
# Returns 1 when no device carries the address, otherwise the return
# value of interface_up for that device.
#
check_interface_up()
{
	declare dev
	declare addr=${2/\/*/}
	declare currentAddr caExpanded
	# Keep the expanded form local instead of leaking it as a global
	declare addrExpanded

	if [ "$1" == "inet6" ]; then
		# IPv6 addresses can be written in several ways, so compare
		# the fully expanded forms.
		addrExpanded=$(ipv6_expand $addr)
		for currentAddr in `/sbin/ip -f $1 -o addr|awk '{print $4}'`; do
			caExpanded=$(ipv6_expand $currentAddr)
			caExpanded=${caExpanded/\/*/}

			if [ "$addrExpanded" == "$caExpanded" ]; then
				# Match the address field exactly; a substring
				# grep would also pick up longer addresses that
				# merely start with the same text.
				dev=$(/sbin/ip -f $1 -o addr | awk -v a="$currentAddr" '$4 == a {print $2}')
				break
			fi
		done
	else
		dev=$(/sbin/ip -f $1 -o addr | grep " $addr/" | awk '{print $2}')
	fi

	if [ -z "$dev" ]; then
		return 1
	fi

	interface_up $dev
	return $?
}

#
# Usage:
# address_configured
#
# Check whether the address given in $2 (any "/mask" suffix is ignored)
# is present in the output of "/sbin/ip -f <family> -o addr", where
# <family> is $1 ("inet6" or anything else for IPv4).
#
# Returns 0 when the address is configured, 1 otherwise.
#
address_configured()
{
	declare line
	declare addr
	declare currentAddr caExpanded
	# Keep the expanded form local instead of leaking it as a global
	declare addrExpanded

	# Chop off mask bits
	addr=${2/\/*/}

	if [ "$1" == "inet6" ]; then
		# IPv6 addresses can be written in several ways, so compare
		# the fully expanded forms.
		addrExpanded=$(ipv6_expand $addr)
		for currentAddr in `/sbin/ip -f $1 -o addr|awk '{print $4}'`; do
			caExpanded=$(ipv6_expand $currentAddr)
			caExpanded=${caExpanded/\/*/}

			if [ "$addrExpanded" == "$caExpanded" ]; then
				# Exact match on the address field rather than
				# a substring grep
				line=$(/sbin/ip -f $1 -o addr | awk -v a="$currentAddr" '$4 == a');
				break
			fi
		done
	else
		line=$(/sbin/ip -f $1 -o addr | grep " $addr/")
	fi

	if [ -z "$line" ]; then
		return 1
	fi

	return 0
}

#
# Usage:
# ip_op
[quiet] # ip_op() { declare dev declare rtr declare addr=${3/\/*/} declare caExpanded currentAddr if [ "$2" = "status" ]; then ocf_log debug "Checking $3, Level $OCF_CHECK_LEVEL" if [ "$1" == "inet6" ]; then addrExpanded=$(ipv6_expand $addr) for currentAddr in `/sbin/ip -f $1 -o addr|awk '{print $4}'`; do caExpanded=$(ipv6_expand $currentAddr) caExpanded=${caExpanded/\/*/} if [ "$addrExpanded" == "$caExpanded" ]; then dev=$(/sbin/ip -f $1 -o addr | grep " ${currentAddr/\/*/}" | awk '{print $2}') break fi done else dev=$(/sbin/ip -f $1 -o addr | grep " $addr/" | awk '{print $2}') fi if [ -z "$dev" ]; then ocf_log warn "$3 is not configured" return 1 fi ocf_log debug "$3 present on $dev" if [ "$OCF_RESKEY_monitor_link" = "yes" ]; then if ! network_link_up $dev; then ocf_log warn "No link on $dev..." return 1 fi ocf_log debug "Link detected on $dev" fi [ $OCF_CHECK_LEVEL -lt 10 ] && return 0 if ! ping_check $1 $addr $dev; then ocf_log warn "Failed to ping $addr" return 1 fi ocf_log debug "Local ping to $addr succeeded" return 0 fi case $1 in inet) ipv4 $2 $3 return $? ;; inet6) if [ "$2" = "del" ]; then addrExpanded=$(ipv6_expand $addr) for currentAddr in `/sbin/ip -f $1 -o addr|awk '{print $4}'`; do caExpanded=$(ipv6_expand $currentAddr) caExpanded=${caExpanded/\/*/} if [ "$addrExpanded" == "$caExpanded" ]; then addr6=$(/sbin/ip -f $1 -o addr | grep " ${currentAddr/\/*/}" | awk '{print $4}') ipv6 $2 $addr6 return $? fi done fi ipv6 $2 $3 return $? 
;; esac return 1 } case ${OCF_RESKEY_family} in inet) ;; inet6) ;; *) if [ "${OCF_RESKEY_address//:/}" != "${OCF_RESKEY_address}" ]; then export OCF_RESKEY_family=inet6 else export OCF_RESKEY_family=inet fi ;; esac if [ -z "$OCF_CHECK_LEVEL" ]; then OCF_CHECK_LEVEL=0 fi if [ "${OCF_RESKEY_monitor_link}" = "no" ] || [ "${OCF_RESKEY_monitor_link}" = "0" ]; then OCF_RESKEY_monitor_link="no" else OCF_RESKEY_monitor_link="yes" fi case $1 in start) if address_configured ${OCF_RESKEY_family} ${OCF_RESKEY_address}; then ocf_log debug "${OCF_RESKEY_address} already configured" exit 0 fi ip_op ${OCF_RESKEY_family} add ${OCF_RESKEY_address} if [ $? -ne 0 ]; then exit $OCF_ERR_GENERIC fi if [ $NFS_TRICKS -eq 0 ]; then if [ "$OCF_RESKEY_nfslock" = "yes" ] || \ [ "$OCF_RESKEY_nfslock" = "1" ]; then notify_list_broadcast /var/lib/nfs/statd fi fi exit $? ;; stop) if address_configured ${OCF_RESKEY_family} ${OCF_RESKEY_address}; then ip_op ${OCF_RESKEY_family} del ${OCF_RESKEY_address} # Make sure it's down if address_configured ${OCF_RESKEY_family} ${OCF_RESKEY_address}; then ocf_log err "Failed to remove ${OCF_RESKEY_address}" exit 1 fi # XXX Let nfsd/lockd clear their queues; we hope to have a # way to enforce this in the future if [ -z "$OCF_RESKEY_sleeptime" ]; then sleep 10 else if [ "$OCF_RESKEY_sleeptime" -gt "0" ]; then sleep $OCF_RESKEY_sleeptime fi fi else ocf_log debug "${OCF_RESKEY_address} is not configured" fi exit 0 ;; status|monitor) ip_op ${OCF_RESKEY_family} status ${OCF_RESKEY_address} [ $? -ne 0 ] && exit $OCF_NOT_RUNNING check_interface_up ${OCF_RESKEY_family} ${OCF_RESKEY_address} exit $? ;; restart) $0 stop || exit $OCF_ERR_GENERIC $0 start || exit $OCF_ERR_GENERIC exit 0 ;; meta-data) meta_data exit 0 ;; validate-all|verify_all) verify_all exit $? 
;; *) echo "usage: $0 {start|stop|status|monitor|restart|meta-data|validate-all}" exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/rgmanager/src/resources/orainstance.sh b/rgmanager/src/resources/orainstance.sh index 93c4c68d1..3d4c12349 100644 --- a/rgmanager/src/resources/orainstance.sh +++ b/rgmanager/src/resources/orainstance.sh @@ -1,526 +1,526 @@ #!/bin/bash # # Copyright 2003-2004, 2006-2011 Red Hat, Inc. # # Author(s): # Hardy Merrill # Lon Hohberger # Michael Moon # # This program is Open Source software. You may modify and/or redistribute # it persuant to the terms of the Open Software License version 2.1, which # is available from the following URL and is included herein by reference: # # http://opensource.org/licenses/osl-2.1.php # # chkconfig: 345 99 01 # description: Service script for starting/stopping \ # Oracle(R) Database 10g on \ # Red Hat Enterprise Linux 5 # # NOTES: # # (1) You can comment out the LOCKFILE declaration below. This will prevent # the need for this script to access anything outside of the ORACLE_HOME # path. # # (2) You MUST customize ORACLE_USER, ORACLE_HOME, ORACLE_SID, and # ORACLE_HOSTNAME to match your installation if not running from within # rgmanager. # # (3) Do NOT place this script in shared storage; place it in ORACLE_USER's # home directory in non-clustered environments and /usr/share/cluster # in rgmanager/Red Hat cluster environments. # # Oracle is a registered trademark of Oracle Corporation. # Oracle9i is a trademark of Oracle Corporation. # Oracle10g is a trademark of Oracle Corporation. # All other trademarks are property of their respective owners. # # # $Id: orainstance.sh 127 2009-08-21 09:17:52Z hevirtan $ # # Original version is distributed with RHCS. The modifications include # the following minor changes: # - Meta-data moved to a dedicated file # - Support for multiple listeners # - Disabled EM # - SysV init support removed. Only usable with rgmanager # . 
/etc/init.d/functions declare SCRIPT="`basename $0`" declare SCRIPTDIR="`dirname $0`" # Required parameters from rgmanager ORACLE_USER=$OCF_RESKEY_user ORACLE_HOME=$OCF_RESKEY_home ORACLE_SID=$OCF_RESKEY_name # Optional parameters with default values LISTENERS=$OCF_RESKEY_listeners LOCKFILE="/tmp/.oracle10g-${ORACLE_SID}.lock" [ -n "$OCF_RESKEY_lockfile" ] && LOCKFILE=$OCF_RESKEY_lockfile export LISTENERS ORACLE_USER ORACLE_HOME ORACLE_SID LOCKFILE export LD_LIBRARY_PATH=$ORACLE_HOME/lib export PATH=$ORACLE_HOME/bin:$PATH declare -i RESTART_RETRIES=3 declare -r DB_PROCNAMES="pmon" declare -r LSNR_PROCNAME="tnslsnr" # # Start Oracle (database portion) # start_db() { declare tmpfile declare logfile declare -i rv tmpfile=/tmp/$SCRIPT-start.$$ logfile=/tmp/$SCRIPT-start.log.$$ # Set up our sqlplus script. Basically, we're trying to # capture output in the hopes that it's useful in the case # that something doesn't work properly. echo "startup" > $tmpfile echo "quit" >> $tmpfile sqlplus "/ as sysdba" < $tmpfile > $logfile rv=$? rm -f $tmpfile # Dump logfile to /var/log/messages initlog -q -c "cat $logfile" if [ $rv -ne 0 ]; then rm -f $logfile initlog -n $SCRIPT -q -s "sqlplus returned 1, failed" return 1 fi # If we see: # ORA-.....: failure, we failed - grep -q "failure" $logfile + grep -q "^ORA-" $logfile rv=$? rm -f $logfile if [ $rv -eq 0 ]; then initlog -n $SCRIPT -q -s "found failure in stdout, returning 1" return 1 fi return 0 } # # Stop Oracle (database portion) # stop_db() { declare tmpfile declare logfile declare -i rv tmpfile=/tmp/$SCRIPT-stop.$$ logfile=/tmp/$SCRIPT-stop.log.$$ ora_procname="ora_${DB_PROCNAMES}_${ORACLE_SID}" status $ora_procname if [ $? -ne 0 ]; then # No pmon process found, db already down return 0 fi # Setup for Stop ... echo "shutdown immediate" > $tmpfile echo "quit" >> $tmpfile sqlplus "/ as sysdba" < $tmpfile > $logfile rv=$? rm -f $tmpfile # Dump logfile to /var/log/messages initlog -q -c "cat $logfile" # sqlplus returned failure. 
We'll return failed to rhcs if [ $rv -ne 0 ]; then rm -f $logfile initlog -n $SCRIPT -q -s "sqlplus returned 1, failed" return 1 fi - grep -q failure $logfile + grep -q "^ORA-" $logfile rv=$? rm -f $logfile # If we see 'failure' in the log, we're done. if [ $rv -eq 0 ]; then initlog -n $SCRIPT -q -s "found failure in stdout, returning 1" return 1 fi return 0 } # # Destroy any remaining processes with refs to $ORACLE_SID # force_cleanup() { declare pids declare pid pids=`ps ax | grep $ORACLE_SID | grep -v grep | awk '{print $1}'` initlog -n $SCRIPT -s " Not all Oracle processes exited cleanly, killing" for pid in $pids; do kill -9 $pid if [ $? -eq 0 ]; then initlog -n $SCRIPT -s "Killed $pid" fi done return 0 } # # Wait for oracle processes to exit. Time out after 60 seconds # exit_idle() { declare -i n=0 while ps ax | grep $ORACLE_SID | grep -q -v $LSNR_PROCNAME | grep -q -v grep; do if [ $n -ge 90 ]; then force_cleanup return 0 fi sleep 1 ((n++)) done return 0 } # # Get database background process status. Restart it if it failed and # we have seen the lock file. # get_db_status() { declare -i subsys_lock=$1 declare -i i=0 declare -i rv=0 declare ora_procname for procname in $DB_PROCNAMES ; do ora_procname="ora_${procname}_${ORACLE_SID}" status $ora_procname if [ $? -eq 0 ] ; then # This one's okay; go to the next one. continue fi # We're not supposed to be running, and we are, # in fact, not running... if [ $subsys_lock -ne 0 ]; then return 3 fi for (( i=$RESTART_RETRIES ; i; i-- )) ; do # this db process is down - stop and # (re)start all ora_XXXX_$ORACLE_SID processes initlog -q -n $SCRIPT -s "Restarting Oracle Database..." stop_db start_db if [ $? 
== 0 ] ; then # ora_XXXX_$ORACLE_SID processes started # successfully, so break out of the # stop/start # 'for' loop break fi done if [ $i -eq 0 ]; then # stop/start's failed - return 1 (failure) initlog -q -n $SCRIPT -s "Restart failed, retuning 1" return 1 fi done return 0 } # # Get the status of the Oracle listener process # get_lsnr_status() { declare -i subsys_lock=$1 declare -i rv declare -r LISTENER=$3 lsnrctl status $LISTENER >& /dev/null rv=$? if [ $rv == 0 ] ; then return 0 # Listener is running fine fi # We're not supposed to be running, and we are, # in fact, not running. Return 3 if [ $subsys_lock -ne 0 ]; then return 3 fi # Listener is NOT running (but should be) - try to restart for (( i=$RESTART_RETRIES ; i; i-- )) ; do initlog -n $SCRIPT -q -s "Restarting Oracle listener ($LISTENER)" lsnrctl start $LISTENER lsnrctl status $LISTENER >& /dev/null if [ $? == 0 ] ; then break # Listener was (re)started and is running fine fi done if [ $i -eq 0 ]; then # stop/start's failed - return 1 (failure) initlog -n $SCRIPT -q -s "Listener restart failed, retuning 1" return 1 fi lsnrctl status $LISTENER >& /dev/null if [ $? != 0 ] ; then initlog -n $SCRIPT -q -s "Listener status failed, retuning 1" return 1 # Problem restarting the Listener fi return 0 # Success restarting the Listener } # # Helps us keep a running status so we know what our ultimate return # code will be. Returns 1 if the $1 and $2 are not equivalent, otherwise # returns $1. The return code is meant to be the next $1 when this is # called, so, for example: # # update_status 0 <-- returns 0 # update_status $? 0 <-- returns 0 # update_status $? 3 <-- returns 1 (values different - error condition) # update_status $? 1 <-- returns 1 (same, but happen to be error state!) # # update_status 3 # update_status $? 3 <-- returns 3 # # (and so forth...) 
# update_status() { declare -i old_status=$1 declare -i new_status=$2 if [ -z "$2" ]; then return $old_status fi if [ $old_status -ne $new_status ]; then initlog -n $SCRIPT -q -s "$old_status vs $new_status - returning 1" return 1 fi return $old_status } # # Print an error message to the user and exit. # oops() { #echo "Please configure this script ($0) to" #echo "match your installation." #echo #echo " $1 failed validation checks." initlog -n $SCRIPT -q -s "$1 failed validation checks" exit 1 } # # Do some validation on the user-configurable stuff at the beginning of the # script. # validation_checks() { # If the oracle user doesn't exist, we're done. [ -n "$ORACLE_USER" ] || oops "ORACLE_USER" id -u $ORACLE_USER > /dev/null || oops "ORACLE_USER" id -g $ORACLE_USER > /dev/null || oops "ORACLE_USER" # If the oracle home isn't a directory, we're done [ -n "$ORACLE_HOME" ] || oops ORACLE_HOME # If the oracle SID is NULL, we're done [ -n "$ORACLE_SID" ] || oops ORACLE_SID # Super user? Automatically change UID and exec as oracle user. # Oracle needs to be run as the Oracle user, not root! if [ "`id -u`" = "0" ]; then su $ORACLE_USER -c "$0 $*" exit $? fi # If we're not root and not the Oracle user, we're done. [ "`id -u`" = "`id -u $ORACLE_USER`" ] || exit 1 [ "`id -g`" = "`id -g $ORACLE_USER`" ] || exit 1 # Go home. cd $ORACLE_HOME return 0 } # # Start Oracle # start_oracle() { initlog -n $SCRIPT -q -s "Starting Oracle Database" start_db || return 1 for LISTENER in ${LISTENERS}; do logfile=/tmp/$SCRIPT-lsn-$$.log initlog -n $SCRIPT -q -s "Starting Oracle Listener $LISTENER" lsnrctl start $LISTENER > $logfile initlog -q -c "cat $logfile" rm -f $logfile done if [ -n "$LOCKFILE" ]; then touch $LOCKFILE fi return 0 } # # Stop Oracle # stop_oracle() { if ! 
[ -e "$ORACLE_HOME/bin/lsnrctl" ]; then initlog -n $SCRIPT -q -s "Oracle Listener Control is not available ($ORACLE_HOME not mounted?)" return 0 fi initlog -n $SCRIPT -q -s "Stopping Oracle Database" stop_db || return 1 for LISTENER in ${LISTENERS}; do initlog -n $SCRIPT -q -s "Stopping Oracle Listener $LISTENER" lsnrctl stop $LISTENER done initlog -n $SCRIPT -q -s "Waiting for all Oracle processes to exit" exit_idle if [ $? -ne 0 ]; then initlog -n $SCRIPT -q -s "WARNING: Not all Oracle processes exited cleanly" fi if [ -n "$LOCKFILE" ]; then rm -f $LOCKFILE fi return 0 } # # Find and display the status of iAS infrastructure. # # This has three parts: # (1) Oracle database itself # (2) Oracle listener process # (3) OPMN and OPMN-managed processes # # - If all are (cleanly) down, we return 3. In order for this to happen, # $LOCKFILE must not exist. In this case, we try and restart certain parts # of the service - as this may be running in a clustered environment. # # - If some but not all are running (and, if $LOCKFILE exists, we could not # restart the failed portions), we return 1 (ERROR) # # - If all are running, return 0. In the "all-running" case, we recreate # $LOCKFILE if it does not exist. # status_oracle() { declare -i subsys_lock=1 declare -i last declare -i depth=$1 # Check for lock file. Crude and rudimentary, but it works if [ -z "$LOCKFILE" ] || [ -f $LOCKFILE ]; then subsys_lock=0 fi # Check database status get_db_status $subsys_lock $depth update_status $? # Start last=$? # Check & report listener status for LISTENER in ${LISTENERS}; do get_lsnr_status $subsys_lock $depth $LISTENER update_status $? $last last=$? done # No lock file, but everything's running. Put the lock # file back. XXX - this kosher? if [ $last -eq 0 ] && [ $subsys_lock -ne 0 ]; then touch $LOCKFILE fi return $last } ######################## # Do some real work... 
# ######################## case $1 in meta-data) cat `echo $0 | sed 's/^\(.*\)\.sh$/\1.metadata/'` exit 0 ;; start) validation_checks $* start_oracle exit $? ;; stop) validation_checks $* stop_oracle exit $? ;; status|monitor) validation_checks $* status_oracle $OCF_CHECK_LEVEL exit $? ;; restart) $0 stop || exit $? $0 start || exit $? exit 0 ;; *) echo "usage: $SCRIPT {start|stop|restart|status|monitor|meta-data}" exit 1 ;; esac exit 0 diff --git a/rgmanager/src/resources/utils/fs-lib.sh b/rgmanager/src/resources/utils/fs-lib.sh index 3bc7cacd8..d31957356 100644 --- a/rgmanager/src/resources/utils/fs-lib.sh +++ b/rgmanager/src/resources/utils/fs-lib.sh @@ -1,1005 +1,1007 @@ #!/bin/bash # # Copyright (C) 1997-2003 Sistina Software, Inc. All rights reserved. # Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # # File system common functions # LC_ALL=C LANG=C PATH=/bin:/sbin:/usr/bin:/usr/sbin export LC_ALL LANG PATH # Private return codes FAIL=2 NO=1 YES=0 YES_STR="yes" [ -z "$OCF_RESOURCE_INSTANCE" ] && export OCF_RESOURCE_INSTANCE="filesystem:$OCF_RESKEY_name" # # Using a global to contain the return value saves # clone() operations. This is important to reduce # resource consumption during status checks. 
# # There is no way to return a string from a function # in bash without cloning the process, which is exactly # what we are trying to avoid. So, we have to resort # to using a dedicated global variable. This one is # for the real_device() function below. # declare REAL_DEVICE # # Stub ocf_log function for when we are using # quick_status, since ocf_log generally forks (and # sourcing ocf-shellfuncs forks -a lot-). # ocf_log() { echo $* } # # Assume NFS_TRICKS are not available until we are # proved otherwise. # export NFS_TRICKS=1 # # Quick status doesn't fork() or clone() when using # device files directly. (i.e. not symlinks, LABEL= or # UUID= # if [ "$1" = "status" -o "$1" = "monitor" ] && [ "$OCF_RESKEY_quick_status" = "1" ]; then echo Using Quick Status # XXX maybe we can make ocf-shellfuncs have a 'quick' mode too? export OCF_SUCCESS=0 export OCF_ERR_GENERIC=1 else # # Grab nfs lock tricks if available # if [ -f "$(dirname $0)/svclib_nfslock" ]; then . $(dirname $0)/svclib_nfslock NFS_TRICKS=0 fi . $(dirname $0)/ocf-shellfuncs fi verify_name() { if [ -z "$OCF_RESKEY_name" ]; then ocf_log err "No file system name specified." return $OCF_ERR_ARGS fi return $OCF_SUCCESS } verify_mountpoint() { if [ -z "$OCF_RESKEY_mountpoint" ]; then ocf_log err "No mount point specified." return $OCF_ERR_ARGS fi if ! [ -e "$OCF_RESKEY_mountpoint" ]; then ocf_log info "Mount point $OCF_RESKEY_mountpoint will be "\ "created at mount time." return $OCF_SUCCESS fi [ -d "$OCF_RESKEY_mountpoint" ] && return $OCF_SUCCESS ocf_log err "$OCF_RESKEY_mountpoint exists but is not a directory." return $OCF_ERR_ARGS } # # This used to be called using $(...), but doing this causes bash # to set up a pipe and clone(). So, the output of this function is # stored in the global variable REAL_DEVICE, declared previously. # real_device() { declare dev="$1" declare realdev REAL_DEVICE="" [ -z "$dev" ] && return $OCF_ERR_ARGS # Oops, we have a link. Sorry, this is going to fork. 
if [ -h "$dev" ]; then realdev=$(readlink -f $dev) if [ $? -ne 0 ]; then return $OCF_ERR_ARGS fi REAL_DEVICE="$realdev" return $OCF_SUCCESS fi # If our provided blockdev is a device, we are done if [ -b "$dev" ]; then REAL_DEVICE="$dev" return $OCF_SUCCESS fi # It's not a link, it's not a block device. If it also # does not match UUID= or LABEL=, then findfs is not # going to find anything useful, so we should quit now. if [ "${dev/UUID=/}" = "$dev" ] && [ "${dev/LABEL=/}" = "$dev" ]; then return $OCF_ERR_GENERIC fi # When using LABEL= or UUID=, we can't save a fork. realdev=$(findfs "$dev" 2> /dev/null) if [ -n "$realdev" ] && [ -b "$realdev" ]; then REAL_DEVICE="$realdev" return $OCF_SUCCESS fi return $OCF_ERR_GENERIC } verify_device() { declare realdev if [ -z "$OCF_RESKEY_device" ]; then ocf_log err "No device or label specified." return $OCF_ERR_ARGS fi real_device "$OCF_RESKEY_device" realdev="$REAL_DEVICE" if [ -n "$realdev" ]; then if [ "$realdev" != "$OCF_RESKEY_device" ]; then ocf_log info "Specified $OCF_RESKEY_device maps to $realdev" fi return $OCF_SUCCESS fi ocf_log err "Device or label \"$OCF_RESKEY_device\" not valid" return $OCF_ERR_ARGS } # # mount_in_use device mount_point # # Check to see if either the device or mount point are in use anywhere on # the system. It is not required that the device be mounted on the named # moint point, just if either are in use. # mount_in_use () { declare mp tmp_mp declare dev tmp_dev declare junka junkb junkc junkd if [ $# -ne 2 ]; then ocf_log err "Usage: mount_in_use device mount_point". return $FAIL fi dev="$1" mp="$2" while read -r tmp_dev tmp_mp junka junkb junkc junkd; do # XXX fork/clone warning XXX if [ "${tmp_dev:0:1}" != "-" ]; then tmp_dev="$(printf "$tmp_dev")" fi if [ -n "$tmp_dev" -a "$tmp_dev" = "$dev" ]; then case $OCF_RESKEY_fstype in cifs|nfs|nfs4) ;; *) return $YES ;; esac fi # Mountpoint from /proc/mounts containing spaces will # have spaces represented in octal. 
printf takes care # of this for us. tmp_mp="$(printf "$tmp_mp")" if [ -n "$tmp_mp" -a "$tmp_mp" = "$mp" ]; then return $YES fi done < /proc/mounts return $NO } # # is_mounted device mount_point # # Check to see if the device is mounted. Print a warning if its not # mounted on the directory we expect it to be mounted on. # is_mounted () { declare mp tmp_mp declare dev tmp_dev declare ret=$FAIL declare found=1 declare poss_mp if [ $# -ne 2 ]; then ocf_log err "Usage: is_mounted device mount_point" return $FAIL fi real_device "$1" dev="$REAL_DEVICE" if [ -z "$dev" ]; then ocf_log err "$OCF_RESOURCE_INSTANCE: is_mounted: Could not match $1 with a real device" return $OCF_ERR_ARGS fi if [ -h "$2" ]; then mp="$(readlink -f $2)" else mp="$2" fi ret=$NO # This bash glyph simply removes a trailing slash # if one exists. /a/b/ -> /a/b; /a/b -> /a/b. mp="${mp%/}" while read -r tmp_dev tmp_mp junk_a junk_b junk_c junk_d do # XXX fork/clone warning XXX if [ "${tmp_dev:0:1}" != "-" ]; then tmp_dev="$(printf "$tmp_dev")" fi real_device "$tmp_dev" tmp_dev="$REAL_DEVICE" # XXX fork/clone warning XXX # Mountpoint from /proc/mounts containing spaces will # have spaces represented in octal. printf takes care # of this for us. 
tmp_mp="$(printf "$tmp_mp")" if [ -n "$tmp_dev" -a "$tmp_dev" = "$dev" ]; then # # Check to see if its mounted in the right # place # if [ -n "$tmp_mp" ]; then if [ "$tmp_mp" != "$mp" ]; then poss_mp=$tmp_mp else found=0 fi fi ret=$YES fi done < /proc/mounts if [ $ret -eq $YES ] && [ $found -ne 0 ]; then case $OCF_RESKEY_fstype in cifs|nfs|nfs4) ret=$NO ;; *) ocf_log warn "Device $dev is mounted on $poss_mp instead of $mp" ;; esac fi return $ret } # # is_alive mount_point # # Check to see if mount_point is alive (testing read/write) # is_alive() { declare errcode declare mount_point="$1" declare file=".writable_test.$(hostname)" declare rw if [ $# -ne 1 ]; then ocf_log err "Usage: is_alive mount_point" return $FAIL fi [ -z "$OCF_CHECK_LEVEL" ] && export OCF_CHECK_LEVEL=0 test -d "$mount_point" if [ $? -ne 0 ]; then ocf_log err "${OCF_RESOURCE_INSTANCE}: is_alive: $mount_point is not a directory" return $FAIL fi [ $OCF_CHECK_LEVEL -lt 10 ] && return $YES # depth 10 test (read test) ls "$mount_point" > /dev/null 2> /dev/null errcode=$? if [ $errcode -ne 0 ]; then ocf_log err "${OCF_RESOURCE_INSTANCE}: is_alive: failed read test on [$mount_point]. Return code: $errcode" return $NO fi [ $OCF_CHECK_LEVEL -lt 20 ] && return $YES # depth 20 check (write test) rw=$YES for o in `echo $OCF_RESKEY_options | sed -e s/,/\ /g`; do if [ "$o" = "ro" ]; then rw=$NO fi done if [ $rw -eq $YES ]; then file="$mount_point"/$file while true; do if [ -e "$file" ]; then file=${file}_tmp continue else break fi done touch $file > /dev/null 2> /dev/null errcode=$? if [ $errcode -ne 0 ]; then ocf_log err "${OCF_RESOURCE_INSTANCE}: is_alive: failed write test on [$mount_point]. 
Return code: $errcode" return $NO fi rm -f $file > /dev/null 2> /dev/null fi return $YES } # # Decide which quota options are enabled and return a string # which we can pass to quotaon # quota_opts() { declare quotaopts="" declare opts="$1" declare mopt for mopt in `echo $opts | sed -e s/,/\ /g`; do case $mopt in quota) quotaopts="gu" break ;; usrquota) quotaopts="u$quotaopts" continue ;; grpquota) quotaopts="g$quotaopts" continue ;; noquota) quotaopts="" return 0 ;; esac done echo $quotaopts return 0 } # # Enable quotas on the mount point if the user requested them # enable_fs_quotas() { declare -i need_check=0 declare -i rv declare quotaopts="" declare mopt declare opts="$1" declare mp="$2" if ! type quotaon &> /dev/null; then ocf_log err "quotaon not found in $PATH" return $OCF_ERR_GENERIC fi quotaopts=$(quota_opts $opts) [ -z "$quotaopts" ] && return 0 ocf_log debug "quotaopts = $quotaopts" # Ok, create quota files if they don't exist for f in quota.user aquota.user quota.group aquota.group; do if ! [ -f "$mp/$f" ]; then ocf_log info "$mp/$f was missing - creating" touch "$mp/$f" chmod 600 "$mp/$f" need_check=1 fi done if [ $need_check -eq 1 ]; then ocf_log info "Checking quota info in $mp" quotacheck -$quotaopts "$mp" fi ocf_log info "Enabling Quotas on $mp" ocf_log debug "quotaon -$quotaopts \"$mp\"" quotaon -$quotaopts "$mp" rv=$? if [ $rv -ne 0 ]; then # Just a warning ocf_log warn "Unable to turn on quotas for $mp; return = $rv" fi return $rv } # Agent-specific actions to take before mounting # (if required). Typically things like fsck. do_pre_mount() { return 0 } # Default mount handler - for block devices # do_mount() { declare dev="$1" declare mp="$2" declare mount_options="" declare fstype_option="" declare fstype # # Get the filesystem type, if specified. # fstype_option="" fstype=${OCF_RESKEY_fstype} case "$fstype" in ""|"[ ]*") fstype="" ;; *) # found it fstype_option="-t $fstype" ;; esac # # Get the mount options, if they exist. 
# mount_options="" opts=${OCF_RESKEY_options} case "$opts" in ""|"[ ]*") opts="" ;; *) # found it mount_options="-o $opts" ;; esac # # Mount the device # ocf_log info "mounting $dev on $mp" ocf_log err "mount $fstype_option $mount_options $dev $mp" mount $fstype_option $mount_options "$dev" "$mp" ret_val=$? if [ $ret_val -ne 0 ]; then ocf_log err "\ 'mount $fstype_option $mount_options $dev $mp' failed, error=$ret_val" return 1 fi return 0 } # Agent-specific actions to take after mounting # (if required). do_post_mount() { return 0 } # Agent-specific actions to take before unmounting # (if required) do_pre_unmount() { return 0 } # Agent-specific actions to take after umount succeeds # (if required) do_post_unmount() { return 0 } -# Agent-specific force-unmount logic, if required +# Agent-specific force unmount logic, if required # return = nonzero if successful, or 0 if unsuccessful # (unsuccessful = try harder) do_force_unmount() { return 1 } # # start_filesystem # start_filesystem() { declare -i ret_val=$OCF_SUCCESS declare mp="${OCF_RESKEY_mountpoint}" declare dev="" # device declare fstype="" declare opts="" declare mount_options="" # # Check if mount point was specified. If not, no need to continue. # case "$mp" in ""|"[ ]*") # nothing to mount return $OCF_SUCCESS ;; /*) # found it ;; *) # invalid format ocf_log err \ "start_filesystem: Invalid mount point format (must begin with a '/'): \'$mp\'" return $OCF_ERR_ARGS ;; esac # # Get the device # real_device "$OCF_RESKEY_device" dev="$REAL_DEVICE" if [ -z "$dev" ]; then ocf_log err "\ start_filesystem: Could not match $OCF_RESKEY_device with a real device" return $OCF_ERR_ARGS fi # # Ensure we've got a valid directory # if [ -e "$mp" ]; then if ! [ -d "$mp" ]; then ocf_log err"\ start_filesystem: Mount point $mp exists but is not a directory" return $OCF_ERR_ARGS fi else ocf_log err "\ start_filesystem: Creating mount point $mp for device $dev" mkdir -p "$mp" ret_val=$? 
if [ $ret_val -ne 0 ]; then ocf_log err "\ start_filesystem: Unable to create $mp. Error code: $ret_val" return $OCF_ERR_GENERIC fi fi # # See if the device is already mounted. # is_mounted "$dev" "$mp" case $? in $YES) # already mounted ocf_log debug "$dev already mounted" return $OCF_SUCCESS ;; $NO) # not mounted, continue ;; *) return $FAIL ;; esac # # Make sure that neither the device nor the mount point are mounted # (i.e. they may be mounted in a different location). The'mount_in_use' # function checks to see if either the device or mount point are in # use somewhere else on the system. # mount_in_use "$dev" "$mp" case $? in $YES) # uh oh, someone is using the device or mount point ocf_log err "\ Cannot mount $dev on $mp, the device or mount point is already in use!" return $FAIL ;; $NO) # good, no one else is using it ;; $FAIL) return $FAIL ;; *) ocf_log err "Unknown return from mount_in_use" return $FAIL ;; esac do_pre_mount case $? in 0) ;; 1) return $OCF_ERR_GENERIC ;; 2) return $OCF_SUCCESS ;; esac do_mount "$dev" "$mp" case $? in 0) ;; 1) return $OCF_ERR_GENERIC ;; 2) return $OCF_SUCCESS ;; esac do_post_mount case $? in 0) ;; 1) return $OCF_ERR_GENERIC ;; 2) return $OCF_SUCCESS ;; esac enable_fs_quotas "$opts" "$mp" return $OCF_SUCCESS } # # stop_filesystem - unmount a file system; calls out to # stop_filesystem() { declare -i ret_val=0 declare -i try declare -i sleep_time=5 # time between each umount failure declare umount_failed="" declare force_umount="" declare self_fence="" declare quotaopts="" # # Get the mount point, if it exists. If not, no need to continue. 
# mp=${OCF_RESKEY_mountpoint} case "$mp" in ""|"[ ]*") # nothing to mount return $OCF_SUCCESS ;; /*) # found it ;; *) # invalid format ocf_log err \ "stop_filesystem: Invalid mount point format (must begin with a '/'): \'$mp\'" return $FAIL ;; esac # # Get the device # real_device "$OCF_RESKEY_device" dev="$REAL_DEVICE" if [ -z "$dev" ]; then ocf_log err "\ stop: Could not match $OCF_RESKEY_device with a real device" return $FAIL fi # # Get the force unmount setting if there is a mount point. # case ${OCF_RESKEY_force_unmount} in $YES_STR) force_umount=$YES ;; on) force_umount=$YES ;; true) force_umount=$YES ;; 1) force_umount=$YES ;; *) force_umount="" ;; esac case ${OCF_RESKEY_self_fence} in $YES_STR) self_fence=$YES ;; on) self_fence=$YES ;; true) self_fence=$YES ;; 1) self_fence=$YES ;; *) self_fence="" ;; esac do_pre_unmount case $? in 0) ;; 1) return $OCF_ERR_GENERIC ;; 2) return $OCF_SUCCESS ;; esac # # Preparations: sync, turn off quotas # sync quotaopts=$(quota_opts $OCF_RESKEY_options) if [ -n "$quotaopts" ]; then ocf_log debug "Turning off quotas for $mp" quotaoff -$quotaopts "$mp" &> /dev/null fi # # Unmount the device. # for try in 1 2 3; do if [ $try -ne 1 ]; then sleep $sleep_time fi is_mounted "$dev" "$mp" case $? in $NO) ocf_log info "$dev is not mounted" umount_failed= break ;; $YES) # fallthrough ;; *) return $FAIL ;; esac ocf_log info "unmounting $mp" umount "$mp" ret_val=$? - if [ $ret_val -eq 0 ]; then + # some versions of umount will exit with status 16 iff + # the umount(2) succeeded but /etc/mtab could not be written. + if [ $ret_val -eq 0 -o $ret_val -eq 16 ]; then umount_failed= break fi ocf_log debug "umount failed: $ret_val" umount_failed=yes if [ -z "$force_umount" ]; then continue fi # Force unmount: try #1: send SIGTERM if [ $try -eq 1 ]; then - # Try fs-specific force-unmount, if provided + # Try fs-specific force unmount, if provided do_force_unmount if [ $? 
-eq 0 ]; then # if this succeeds, we should be done continue fi ocf_log warning "Sending SIGTERM to processes on $mp" fuser -TERM -kvm "$mp" continue else ocf_log warning "Sending SIGKILL to processes on $mp" fuser -kvm "$mp" case $? in 0) ;; 1) return $OCF_ERR_GENERIC ;; 2) break ;; esac fi done # for do_post_unmount case $? in 0) ;; 1) return $OCF_ERR_GENERIC ;; 2) return $OCF_SUCCESS ;; esac if [ -n "$umount_failed" ]; then ocf_log err "'umount $mp' failed, error=$ret_val" if [ "$self_fence" ]; then ocf_log alert "umount failed - REBOOTING" sync reboot -fn fi return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } do_start() { declare tries=0 declare rv while [ $tries -lt 3 ]; do start_filesystem rv=$? if [ $rv -eq 0 ]; then return 0 fi ((tries++)) sleep 3 done return $rv } do_stop() { stop_filesystem return $? } do_monitor() { # # Get the device # real_device "$OCF_RESKEY_device" dev="$REAL_DEVICE" if [ -z "$dev" ]; then ocf_log err "\ start_filesystem: Could not match $OCF_RESKEY_device with a real device" return $OCF_NOT_RUNNING fi is_mounted "$dev" "${OCF_RESKEY_mountpoint}" if [ $? -ne $YES ]; then ocf_log err "${OCF_RESOURCE_INSTANCE}: ${OCF_RESKEY_device} is not mounted on ${OCF_RESKEY_mountpoint}" return $OCF_NOT_RUNNING fi if [ "$OCF_RESKEY_quick_status" = "1" ]; then return 0 fi is_alive "${OCF_RESKEY_mountpoint}" [ $? -eq $YES ] && return 0 ocf_log err "fs:${OCF_RESKEY_name}: Mount point is not accessible!" return $OCF_ERR_GENERIC } do_restart() { stop_filesystem if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi start_filesystem if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi return 0 } # MUST BE OVERRIDDEN do_metadata() { return 1 } do_validate() { return 1 } main() { case $1 in start) do_start exit $? ;; stop) do_stop exit $? ;; status|monitor) do_monitor exit $? ;; restart) do_restart exit $? ;; meta-data) do_metadata exit $? 
;; validate-all) do_validate ;; *) echo "usage: $0 {start|stop|status|monitor|restart|meta-data|validate-all}" exit $OCF_ERR_UNIMPLEMENTED ;; esac exit 0 } diff --git a/tools/ocft/Filesystem b/tools/ocft/Filesystem index ec77a27be..75203d73a 100644 --- a/tools/ocft/Filesystem +++ b/tools/ocft/Filesystem @@ -1,110 +1,110 @@ # Filesystem # by dejan@suse.de on # Tue Feb 15 18:50:04 CET 2011 CONFIG Agent Filesystem AgentRoot /usr/lib/ocf/resource.d/heartbeat HangTimeout 20 VARIABLE OCFT_fs=/var/run/resource-agents/ocft-Filesystem-fs OCFT_loop=/dev/loop7 OCFT_dir=/var/run/resource-agents/ocft-Filesystem-mnt SETUP-AGENT losetup $OCFT_loop 2>/dev/null && exit 1 rmdir $OCFT_dir 2>/dev/null || true mkdir $OCFT_dir dd if=/dev/zero of=$OCFT_fs bs=1 count=0 seek=16M 2>/dev/null - mke2fs -Fq $OCFT_fs + mke2fs -j -Fq -m 0 $OCFT_fs losetup $OCFT_loop $OCFT_fs CLEANUP-AGENT rmdir $OCFT_dir rm $OCFT_fs losetup -d $OCFT_loop CASE-BLOCK required_args Env OCF_RESKEY_device=$OCFT_loop - Env OCF_RESKEY_fstype=ext2 + Env OCF_RESKEY_fstype=ext3 Env OCF_RESKEY_directory=$OCFT_dir CASE-BLOCK default_status AgentRun stop CASE-BLOCK prepare Include required_args Include default_status CASE "check base env" Include prepare AgentRun start OCF_SUCCESS CASE "check base env: invalid 'OCF_RESKEY_device'" Include prepare Env OCF_RESKEY_device=/dev/no_such_device AgentRun start OCF_ERR_INSTALLED CASE "check base env: unset 'OCF_RESKEY_device'" Include prepare Unenv OCF_RESKEY_device AgentRun start OCF_ERR_CONFIGURED CASE "normal start" Include prepare AgentRun start OCF_SUCCESS CASE "normal stop" Include prepare AgentRun start AgentRun stop OCF_SUCCESS CASE "double start" Include prepare AgentRun start AgentRun start OCF_SUCCESS CASE "double stop" Include prepare AgentRun stop OCF_SUCCESS CASE "monitor when running" Include prepare AgentRun start AgentRun monitor OCF_SUCCESS CASE "monitor when not running" Include prepare AgentRun monitor OCF_NOT_RUNNING CASE "monitor depth 10 when running" Include 
prepare AgentRun start Env OCF_CHECK_LEVEL=10 AgentRun monitor OCF_SUCCESS CASE "monitor depth 20 with running" Include prepare AgentRun start Env OCF_CHECK_LEVEL=20 AgentRun monitor OCF_SUCCESS CASE "start insert failure (remove device)" Include prepare Bash losetup -d $OCFT_loop BashAtExit losetup $OCFT_loop $OCFT_fs AgentRun start OCF_ERR_GENERIC CASE "monitor depth 20 insert failure (r/o fs)" Include prepare AgentRun start Bash mount -o remount,ro $OCFT_dir BashAtExit mount -o remount,rw $OCFT_dir Env OCF_CHECK_LEVEL=20 AgentRun monitor OCF_ERR_GENERIC CASE "unimplemented command" Include prepare AgentRun no_cmd OCF_ERR_UNIMPLEMENTED diff --git a/tools/ocft/Makefile.am b/tools/ocft/Makefile.am index c91f2388a..390c6133d 100644 --- a/tools/ocft/Makefile.am +++ b/tools/ocft/Makefile.am @@ -1,48 +1,51 @@ # Author: John Shi # jshi@suse.de # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = $(ocftcfgs_DATA) $(ocft_DATA) sbin_SCRIPTS = ocft ocftcfgsdir = $(datadir)/$(PACKAGE_NAME)/ocft/configs ocftcfgs_DATA = apache \ IPaddr2 \ IPv6addr \ Filesystem \ LVM \ + Raid1 \ IPsrcaddr \ MailTo \ mysql \ + mysql-proxy \ pgsql \ db2 \ + oracle \ drbd.linbit \ nfsserver \ portblock \ iscsi \ named \ postfix \ Xinetd \ SendArp ocftdir = $(datadir)/$(PACKAGE_NAME)/ocft ocft_DATA = README \ README.zh_CN \ caselib diff --git a/tools/ocft/Raid1 b/tools/ocft/Raid1 new file mode 100644 index 000000000..1c239c8a0 --- /dev/null +++ b/tools/ocft/Raid1 @@ -0,0 +1,146 @@ +# Raid1 +# by dejan@suse.de on +# Fri Aug 24 17:01:40 CEST 2012 + +CONFIG + Agent Raid1 + AgentRoot /usr/lib/ocf/resource.d/heartbeat + InstallPackage mdadm + HangTimeout 20 + +VARIABLE + OCFT_disk0=/var/run/resource-agents/ocft-Raid1-disk0 + OCFT_disk1=/var/run/resource-agents/ocft-Raid1-disk1 + OCFT_disk2=/var/run/resource-agents/ocft-Raid1-disk2 + OCFT_disk3=/var/run/resource-agents/ocft-Raid1-disk3 + OCFT_raidconf=/var/run/resource-agents/ocft-mdadm.conf + OCFT_raiddev=/dev/md8 + OCFT_raiddev2=/dev/md9 + OCFT_loop0=/dev/loop6 + OCFT_loop1=/dev/loop7 + OCFT_loop2=/dev/loop4 + OCFT_loop3=/dev/loop5 + +SETUP-AGENT + losetup $OCFT_loop0 2>/dev/null && exit 1 + losetup $OCFT_loop1 2>/dev/null && exit 1 + losetup $OCFT_loop2 2>/dev/null && exit 1 + losetup $OCFT_loop3 2>/dev/null && exit 1 + dd if=/dev/zero of=$OCFT_disk0 bs=1 count=0 seek=16M 2>/dev/null + dd if=/dev/zero of=$OCFT_disk1 bs=1 count=0 seek=16M 2>/dev/null + dd if=/dev/zero of=$OCFT_disk2 bs=1 count=0 seek=16M 2>/dev/null + dd if=/dev/zero of=$OCFT_disk3 bs=1 count=0 seek=16M 2>/dev/null + losetup $OCFT_loop0 $OCFT_disk0 + losetup $OCFT_loop1 $OCFT_disk1 + losetup $OCFT_loop2 $OCFT_disk2 + losetup $OCFT_loop3 $OCFT_disk3 + mdadm --create $OCFT_raiddev -l 0 --raid-devices=2 $OCFT_loop0 $OCFT_loop1 + mdadm --create $OCFT_raiddev2 -l 0 --raid-devices=2 $OCFT_loop2 $OCFT_loop3 + echo DEVICE 
$OCFT_loop0 $OCFT_loop1 > $OCFT_raidconf + echo DEVICE $OCFT_loop2 $OCFT_loop3 >> $OCFT_raidconf + echo ARRAY $OCFT_raiddev devices=$OCFT_loop0,$OCFT_loop1 >> $OCFT_raidconf + echo ARRAY $OCFT_raiddev2 devices=$OCFT_loop2,$OCFT_loop3 >> $OCFT_raidconf + +CLEANUP-AGENT + mdadm --zero-superblock $OCFT_loop0 + mdadm --zero-superblock $OCFT_loop1 + mdadm --zero-superblock $OCFT_loop2 + mdadm --zero-superblock $OCFT_loop3 + mdadm --remove $OCFT_raiddev 2>/dev/null + mdadm --remove $OCFT_raiddev2 2>/dev/null + losetup -d $OCFT_loop0 + losetup -d $OCFT_loop1 + losetup -d $OCFT_loop2 + losetup -d $OCFT_loop3 + rm $OCFT_disk0 $OCFT_disk1 $OCFT_raidconf + rm $OCFT_disk2 $OCFT_disk3 + +CASE-BLOCK required_args + Env OCF_RESKEY_raidconf=$OCFT_raidconf + Env OCF_RESKEY_raiddev=$OCFT_raiddev + +CASE-BLOCK default_status + AgentRun stop + +CASE-BLOCK prepare + Include required_args + Include default_status + +CASE-BLOCK prepare_auto + Include required_args + Env OCF_RESKEY_raiddev="auto" + Include default_status + +CASE-BLOCK prepare_multiple + Include required_args + Env OCF_RESKEY_raiddev="$OCFT_raiddev $OCFT_raiddev2" + Include default_status + +CASE "check base env" + Include prepare + AgentRun start OCF_SUCCESS + +CASE "check base env: invalid 'OCF_RESKEY_raiddev'" + Include prepare + Env OCF_RESKEY_raiddev=/dev/no_such_device + AgentRun start OCF_ERR_GENERIC + +CASE "check base env: unset 'OCF_RESKEY_raiddev'" + Include prepare + Unenv OCF_RESKEY_raiddev + AgentRun start OCF_ERR_CONFIGURED + +CASE "normal start" + Include prepare + AgentRun start OCF_SUCCESS + +CASE "normal stop" + Include prepare + AgentRun start + AgentRun stop OCF_SUCCESS + +CASE "double start" + Include prepare + AgentRun start + AgentRun start OCF_SUCCESS + +CASE "double stop" + Include prepare + AgentRun stop OCF_SUCCESS + +CASE "monitor when running" + Include prepare + AgentRun start + AgentRun monitor OCF_SUCCESS + +CASE "monitor when not running" + Include prepare + AgentRun monitor 
OCF_NOT_RUNNING + +CASE "normal start (auto)" + Include prepare_auto + AgentRun start OCF_SUCCESS + AgentRun monitor OCF_SUCCESS + +CASE "normal stop (auto)" + Include prepare_auto + AgentRun start + AgentRun stop OCF_SUCCESS + AgentRun monitor OCF_NOT_RUNNING + +CASE "normal start (multiple)" + Include prepare + AgentRun start OCF_SUCCESS + AgentRun monitor OCF_SUCCESS + +CASE "normal stop (multiple)" + Include prepare + Env OCF_RESKEY_raiddev="$OCFT_raiddev $OCFT_raiddev2" + AgentRun start + AgentRun stop OCF_SUCCESS + AgentRun monitor OCF_NOT_RUNNING + +CASE "unimplemented command" + Include prepare + AgentRun no_cmd OCF_ERR_UNIMPLEMENTED + diff --git a/tools/ocft/SendArp b/tools/ocft/SendArp index 832897c43..d47b552e6 100644 --- a/tools/ocft/SendArp +++ b/tools/ocft/SendArp @@ -1,73 +1,73 @@ # SendArp CONFIG Agent SendArp AgentRoot /usr/lib/ocf/resource.d/heartbeat InstallPackage resource-agents HangTimeout 15 CASE-BLOCK required_args Env OCF_RESKEY_ip=127.0.0.1 Env OCF_RESKEY_nic=lo CASE-BLOCK default_status AgentRun stop CASE-BLOCK prepare Include required_args Include default_status CASE "check base env" Include prepare AgentRun start OCF_SUCCESS CASE "check base env: unset 'OCF_RESKEY_ip'" Include prepare Unenv OCF_RESKEY_ip - AgentRun start OCF_ERR_ARGS + AgentRun start OCF_ERR_CONFIGURE -CASE "check base env: set worng 'OCF_RESKEY_ip'" +CASE "check base env: set invalid 'OCF_RESKEY_ip'" Include prepare Env OCF_RESKEY_ip=not_ip_address - AgentRun start OCF_ERR_ARGS + AgentRun start OCF_ERR_CONFIGURE CASE "check base env: unset 'OCF_RESKEY_nic'" Include prepare Unenv OCF_RESKEY_nic - AgentRun start OCF_ERR_ARGS + AgentRun start OCF_ERR_CONFIGURE -CASE "check base env: set worng 'OCF_RESKEY_nic'" +CASE "check base env: set invalid 'OCF_RESKEY_nic'" Include prepare Env OCF_RESKEY_nic=not_nic - AgentRun start OCF_ERR_ARGS + AgentRun start OCF_ERR_CONFIGURE CASE "normal start" Include prepare AgentRun start OCF_SUCCESS CASE "normal stop" Include prepare 
AgentRun start AgentRun stop OCF_SUCCESS CASE "double start" Include prepare AgentRun start AgentRun start OCF_SUCCESS CASE "double stop" Include prepare AgentRun stop OCF_SUCCESS CASE "monitor with running" Include prepare AgentRun start AgentRun monitor OCF_SUCCESS CASE "monitor with not running" Include prepare AgentRun monitor OCF_NOT_RUNNING CASE "unimplemented command" Include prepare AgentRun no_cmd OCF_ERR_UNIMPLEMENTED diff --git a/tools/ocft/apache b/tools/ocft/apache index e93904471..797412d37 100644 --- a/tools/ocft/apache +++ b/tools/ocft/apache @@ -1,63 +1,63 @@ # apache # make sure that your apache configuration loads mod_status CONFIG Agent apache AgentRoot /usr/lib/ocf/resource.d/heartbeat InstallPackage apache2 HangTimeout 20 SETUP-AGENT /etc/init.d/apache2 start /etc/init.d/apache2 stop CASE-BLOCK default_status AgentRun stop CASE-BLOCK prepare Include default_status CASE "check base env" Include prepare AgentRun start OCF_SUCCESS CASE "check base env: set non-existing OCF_RESKEY_statusurl" Include prepare Env OCF_RESKEY_statusurl="yoyoyoyo" - AgentRun start OCF_ERR_GENERIC + AgentRun start OCF_ERR_CONFIGURED CASE "check base env: set non-existing OCF_RESKEY_configfile" Include prepare Env OCF_RESKEY_configfile="/yoyoyoyo/nosuchfile" AgentRun start OCF_ERR_INSTALLED CASE "normal start" Include prepare AgentRun start OCF_SUCCESS CASE "normal stop" Include prepare AgentRun start AgentRun stop OCF_SUCCESS CASE "double start" Include prepare AgentRun start AgentRun start OCF_SUCCESS CASE "double stop" Include prepare AgentRun stop OCF_SUCCESS CASE "running monitor" Include prepare AgentRun start AgentRun monitor OCF_SUCCESS CASE "not running monitor" Include prepare AgentRun monitor OCF_NOT_RUNNING CASE "unimplemented command" Include prepare AgentRun no_cmd OCF_ERR_UNIMPLEMENTED diff --git a/tools/ocft/mysql-proxy b/tools/ocft/mysql-proxy new file mode 100644 index 000000000..436e7a3a0 --- /dev/null +++ b/tools/ocft/mysql-proxy @@ -0,0 +1,82 @@ +# 
mysql-proxy +# by r.bhatia@ipax.at +# +# test cases (to implement): +# +# * /usr/sbin/ocf-tester -n mp -o binary="/usr/sbin/mysql-proxy" -o defaults_file="" -o parameters="--proxy-skip-profiling" \ +# -o admin_address="127.0.0.1:4041" -o admin_username="root" -o admin_password="la" -o admin_lua_script="/usr/lib/mysql-proxy/lua/admin.lua" \ +# -o proxy_backend_addresses="192.168.100.200:42006" -o proxy_address="/var/run/mysqld/mysqld.sock" /usr/lib/ocf/resource.d/heartbeat/mysql-proxy +# +# * OCF_CHECK_LEVEL 20 check + +CONFIG + AgentRoot /usr/lib/ocf/resource.d/heartbeat/ + InstallPackage mysql-proxy + HangTimeout 20 + +SETUP-AGENT + # nothing + +CASE-BLOCK crm_setting + Var OCF_RESKEY_CRM_meta_timeout=15000 + Var OCF_RESKEY_binary=/tmp/mysql-proxy + Var OCF_RESKEY_admin_username=root + Var OCF_RESKEY_admin_password=test123 + Var OCF_RESKEY_admin_lua_script=/usr/lib/mysql-proxy/lua/admin.lua + +CASE-BLOCK default_status + AgentRun stop + +CASE-BLOCK prepare + Bash [ ! -x /tmp/mysql-proxy ] && ln -s `which mysql-proxy` /tmp/mysql-proxy || true + Include crm_setting + +CASE-BLOCK teardown + AgentRun stop + BashAtExit rm -f /tmp/mysql-proxy + +CASE "check base env" + Include prepare + AgentRun start OCF_SUCCESS + Include teardown + +CASE "check base env: invalid 'OCF_RESKEY_binary'" + Include prepare + Var OCF_RESKEY_binary=no_such + AgentRun start OCF_ERR_INSTALLED + BashAtExit rm -f /tmp/mysql-proxy + +CASE "normal start" + Include prepare + AgentRun start OCF_SUCCESS + Include teardown + +CASE "normal stop" + Include prepare + AgentRun start + AgentRun stop OCF_SUCCESS + Include teardown + +CASE "double start" + Include prepare + AgentRun start + AgentRun start OCF_SUCCESS + Include teardown + +CASE "double stop" + Include prepare + AgentRun stop OCF_SUCCESS + +CASE "running monitor" + Include prepare + AgentRun start + AgentRun monitor OCF_SUCCESS + Include teardown + +CASE "not running monitor" + Include prepare + AgentRun monitor OCF_NOT_RUNNING + +CASE 
"unimplemented command" + Include prepare + AgentRun no_cmd OCF_ERR_UNIMPLEMENTED diff --git a/tools/ocft/oracle b/tools/ocft/oracle new file mode 100644 index 000000000..6f145c7d6 --- /dev/null +++ b/tools/ocft/oracle @@ -0,0 +1,81 @@ +# oracle +# (based on db2) +# +# Created on an SLE11SP2 running oracle 11g +# database sid is orcl +# adapt this in set_testenv below +# TODO: need oracle expert to break it, then test it +# + +CONFIG + Agent oracle + AgentRoot /usr/lib/ocf/resource.d/heartbeat + HangTimeout 40 + +SETUP-AGENT + # nothing + +CASE-BLOCK set_testenv + Env OCFT_sid=orcl + +CASE-BLOCK crm_setting + Env OCF_RESKEY_sid=$OCFT_sid + Env OCF_RESKEY_CRM_meta_timeout=30000 + +CASE-BLOCK default_status + AgentRun stop + +CASE-BLOCK prepare + Include set_testenv + Include crm_setting + Include default_status + +CASE "check base env" + Include prepare + AgentRun start OCF_SUCCESS + +CASE "check base env: no 'OCF_RESKEY_sid'" + Include prepare + Env OCF_RESKEY_sid= + AgentRun start OCF_ERR_CONFIGURED + +CASE "check base env: invalid 'OCF_RESKEY_home'" + Include prepare + Env OCF_RESKEY_home=/no_such + AgentRun start OCF_ERR_INSTALLED + +CASE "unimplemented command" + Include prepare + AgentRun no_cmd OCF_ERR_UNIMPLEMENTED + +CASE "normal start" + Include prepare + AgentRun start OCF_SUCCESS + +CASE "normal stop" + Include prepare + AgentRun start + AgentRun stop OCF_SUCCESS + +CASE "double start" + Include prepare + AgentRun start + AgentRun start OCF_SUCCESS + +CASE "double stop" + Include prepare + AgentRun stop OCF_SUCCESS + +CASE "started: monitor" + Include prepare + AgentRun start + AgentRun monitor OCF_SUCCESS + +CASE "not started: monitor" + Include prepare + AgentRun monitor OCF_NOT_RUNNING + +CASE "try different ipcrm method" + Include prepare + Env OCF_RESKEY_ipcrm=none + AgentRun start OCF_SUCCESS diff --git a/tools/sfex_lib.c b/tools/sfex_lib.c index 292f66c9e..47ce0d749 100644 --- a/tools/sfex_lib.c +++ b/tools/sfex_lib.c @@ -1,471 +1,474 @@ 
/*------------------------------------------------------------------------- * * Shared Disk File EXclusiveness Control Program(SF-EX) * * sfex_lib.c --- Libraries for other SF-EX modules. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA. * * Copyright (c) 2007 NIPPON TELEGRAPH AND TELEPHONE CORPORATION * * $Id$ * *-------------------------------------------------------------------------*/ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "sfex.h" #include "sfex_lib.h" static void *locked_mem; static int dev_fd; unsigned long sector_size = 0; int prepare_lock (const char *device) { + int sec_tmp = 0; + do { dev_fd = open (device, O_RDWR | O_DIRECT | O_SYNC); if (dev_fd == -1) { if (errno == EINTR || errno == EAGAIN) continue; cl_log(LOG_ERR, "can't open device %s: %s\n", device, strerror (errno)); exit (3); } break; } while (1); - ioctl(dev_fd, BLKSSZGET, §or_size); + ioctl(dev_fd, BLKSSZGET, &sec_tmp); + sector_size = (unsigned long)sec_tmp; if (sector_size == 0) { cl_log(LOG_ERR, "Get sector size failed: %s\n", strerror(errno)); exit(EXIT_FAILURE); } if (posix_memalign ((void **) (&locked_mem), SFEX_ODIRECT_ALIGNMENT, sector_size) != 0) { cl_log(LOG_ERR, "Failed to allocate aligned memory\n"); exit (3); } 
memset (locked_mem, 0, sector_size); return 0; } /* * get_progname --- a program name * * We get program name from directory path. It does not include delimiter * characters. Return value is pointer that point string of program name. * We assume delimiter is '/'. */ const char * get_progname (const char *argv0) { char *p; p = strrchr (argv0, '/'); if (p) return p + 1; else return argv0; } /* * get_nodename --- get a node name(hostname) * * We get a node name by using uname(2) and return pointer of it. * The error checks are done in this function. The caller does not have * to check return value. */ char * get_nodename (void) { struct utsname u; char *n; if (uname (&u)) { cl_log(LOG_ERR, "%s\n", strerror (errno)); exit (3); } if (strlen (u.nodename) > SFEX_MAX_NODENAME) { cl_log(LOG_ERR, "nodename %s is too long. must be less than %lu byte.\n", u.nodename, (unsigned long)SFEX_MAX_NODENAME); exit (3); } n = strdup (&u.nodename[0]); if (!n) { cl_log(LOG_ERR, "%s\n", strerror (errno)); exit (3); } return n; } /* * init_controldata --- initialize control data * * We initialize each member of sfex_controldata structure. */ void init_controldata (sfex_controldata * cdata, size_t blocksize, int numlocks) { memcpy (cdata->magic, SFEX_MAGIC, sizeof (cdata->magic)); cdata->version = SFEX_VERSION; cdata->revision = SFEX_REVISION; cdata->blocksize = blocksize; cdata->numlocks = numlocks; } /* * init_lockdata --- initialize lock data * * We initialize each member of sfex_lockdata structure. */ void init_lockdata (sfex_lockdata * ldata) { ldata->status = SFEX_STATUS_UNLOCK; ldata->count = 0; ldata->nodename[0] = 0; } /* * write_controldata --- write control data into file * * We write sfex_controldata struct into file. We open a file with * synchronization mode and write out control data. 
* * cdata --- pointer of control data * * device --- name of target file */ void write_controldata (const sfex_controldata * cdata) { sfex_controldata_ondisk *block; int fd; block = (sfex_controldata_ondisk *) (locked_mem); /* We write control data into the buffer with given format. */ /* We write the offset value of each field of the control data directly. * Because a point using this value is limited to two places, we do not * use macro. If you change the following offset values, you must change * values in the read_controldata() function. */ memset (block, 0, cdata->blocksize); memcpy (block->magic, cdata->magic, sizeof (block->magic)); snprintf ((char *) (block->version), sizeof (block->version), "%d", cdata->version); snprintf ((char *) (block->revision), sizeof (block->revision), "%d", cdata->revision); snprintf ((char *) (block->blocksize), sizeof (block->blocksize), "%u", (unsigned)cdata->blocksize); snprintf ((char *) (block->numlocks), sizeof (block->numlocks), "%d", cdata->numlocks); fd = dev_fd; if (lseek (fd, 0, SEEK_SET) == -1) { cl_log(LOG_ERR, "can't seek file pointer: %s\n", strerror (errno)); exit (3); } /* write buffer into a file */ do { ssize_t s = write (fd, block, cdata->blocksize); if (s == -1) { if (errno == EINTR || errno == EAGAIN) continue; cl_log(LOG_ERR, "can't write meta-data: %s\n", strerror (errno)); exit (3); } else break; } while (1); } /* * write_lockdata --- write lock data into file * * We write sfex_lockdata into file and seek file pointer to the given * position of lock data. * * cdata --- pointer for control data * * ldata --- pointer for lock data * * device --- file name for write * * index --- index number for lock data. 1 origine. 
*/ int write_lockdata (const sfex_controldata * cdata, const sfex_lockdata * ldata, int index) { sfex_lockdata_ondisk *block; int fd; block = (sfex_lockdata_ondisk *) locked_mem; /* We write lock data into buffer with given format */ /* We write the offset value of each field of the control data directly. * Because a point using this value is limited to two places, we do not * use macro. If you chage the following offset values, you must change * values in the read_lockdata() function. */ memset (block, 0, cdata->blocksize); block->status = ldata->status; snprintf ((char *) (block->count), sizeof (block->count), "%d", ldata->count); snprintf ((char *) (block->nodename), sizeof (block->nodename), "%s", ldata->nodename); fd = dev_fd; /* seek a file pointer to given position */ if (lseek (fd, cdata->blocksize * index, SEEK_SET) == -1) { cl_log(LOG_ERR, "can't seek file pointer: %s\n", strerror (errno)); return -1; } /* write buffer into file */ do { ssize_t s = write (fd, block, cdata->blocksize); if (s == -1) { if (errno == EINTR || errno == EAGAIN) continue; cl_log(LOG_ERR, "can't write meta-data: %s\n", strerror (errno)); return -1; } else if (s != cdata->blocksize) { /* if writing atomically failed, this process is error */ cl_log(LOG_ERR, "can't write meta-data atomically.\n"); return -1; } break; } while (1); return 0; } /* * read_controldata --- read control data from file * * read sfex_controldata structure from file. 
* * cdata --- pointer for control data * * device --- file name for reading */ int read_controldata (sfex_controldata * cdata) { sfex_controldata_ondisk *block; block = (sfex_controldata_ondisk *) (locked_mem); if (lseek (dev_fd, 0, SEEK_SET) == -1) { cl_log(LOG_ERR, "can't seek file pointer: %s\n", strerror (errno)); return -1; } /* read data from file */ do { ssize_t s = read (dev_fd, block, sector_size); if (s == -1) { if (errno == EINTR || errno == EAGAIN) continue; cl_log(LOG_ERR, "can't read controldata meta-data: %s\n", strerror (errno)); return -1; } else break; } while (1); /* read control data from buffer */ /* 1. check the magic number. 2. check null terminator of each field 3. check the version number. 4. Unmuch of revision number is allowed */ /* We write the offset value of each field of the control data directly. * Because a point using this value is limited to two places, we do not * use macro. If you chage the following offset values, you must change * values in the write_controldata() function. */ memcpy (cdata->magic, block->magic, 4); if (memcmp (cdata->magic, SFEX_MAGIC, sizeof (cdata->magic))) { cl_log(LOG_ERR, "magic number mismatched. %c%c%c%c <-> %s\n", block->magic[0], block->magic[1], block->magic[2], block->magic[3], SFEX_MAGIC); return -1; } if (block->version[sizeof (block->version)-1] || block->revision[sizeof (block->revision)-1] || block->blocksize[sizeof (block->blocksize)-1] || block->numlocks[sizeof (block->numlocks)-1]) { cl_log(LOG_ERR, "control data format error.\n"); return -1; } cdata->version = atoi ((char *) (block->version)); if (cdata->version != SFEX_VERSION) { cl_log(LOG_ERR, "version number mismatched. 
program is %d, data is %d.\n", SFEX_VERSION, cdata->version); return -1; } cdata->revision = atoi ((char *) (block->revision)); cdata->blocksize = atoi ((char *) (block->blocksize)); cdata->numlocks = atoi ((char *) (block->numlocks)); return 0; } /* * read_lockdata --- read lock data from file * * read sfex_lockdata from file and seek file pointer to head position of the * file. * * cdata --- pointer for control data * * ldata --- pointer for lock data. Read lock data are stored into this * pointed area. * * device --- file name of source file * * index --- index number. 1 origin. */ int read_lockdata (const sfex_controldata * cdata, sfex_lockdata * ldata, int index) { sfex_lockdata_ondisk *block; int fd; block = (sfex_lockdata_ondisk *) (locked_mem); fd = dev_fd; /* seek a file pointer to given position */ if (lseek (fd, cdata->blocksize * index, SEEK_SET) == -1) { cl_log(LOG_ERR, "can't seek file pointer: %s\n", strerror (errno)); return -1; } /* read from file */ do { ssize_t s = read (fd, block, cdata->blocksize); if (s == -1) { if (errno == EINTR || errno == EAGAIN) continue; cl_log(LOG_ERR, "can't read lockdata meta-data: %s\n", strerror (errno)); return -1; } else if (s != cdata->blocksize) { cl_log(LOG_ERR, "can't read meta-data atomically.\n"); return -1; } break; } while (1); /* read control data form buffer */ /* 1. check null terminator of each field 2. check the status */ /* We write the offset value of each field of the control data directly. * Because a point using this value is limited to two places, we do not * use macro. If you chage the following offset values, you must change * values in the write_lockdata() function. 
*/ if (block->count[sizeof(block->count)-1] || block->nodename[sizeof(block->nodename)-1]) { cl_log(LOG_ERR, "lock data format error.\n"); return -1; } ldata->status = block->status; if (ldata->status != SFEX_STATUS_UNLOCK && ldata->status != SFEX_STATUS_LOCK) { cl_log(LOG_ERR, "lock data format error.\n"); return -1; } ldata->count = atoi ((char *) (block->count)); strncpy ((char *) (ldata->nodename), (const char *) (block->nodename), sizeof(block->nodename)); #ifdef SFEX_DEBUG cl_log(LOG_INFO, "status: %c\n", ldata->status); cl_log(LOG_INFO, "count: %d\n", ldata->count); cl_log(LOG_INFO, "nodename: %s\n", ldata->nodename); #endif return 0; } /* * lock_index_check --- check the value of index * * The lock_index_check function checks whether the value of index exceeds * the number of lock data on the shared disk. * * cdata --- pointer for control data * * index --- index number */ int lock_index_check(sfex_controldata * cdata, int index) { if (read_controldata(cdata) == -1) { cl_log(LOG_ERR, "%s\n", "read_controldata failed in lock_index_check"); return -1; } #ifdef SFEX_DEBUG cl_log(LOG_INFO, "version: %d\n", cdata->version); cl_log(LOG_INFO, "revision: %d\n", cdata->revision); cl_log(LOG_INFO, "blocksize: %d\n", cdata->blocksize); cl_log(LOG_INFO, "numlocks: %d\n", cdata->numlocks); #endif if (index > cdata->numlocks) { cl_log(LOG_ERR, "index %d is too large. %d locks are stored.\n", index, cdata->numlocks); return -1; } if (cdata->blocksize != sector_size) { cl_log(LOG_ERR, "sector_size is not the same as the blocksize.\n"); return -1; } return 0; }